diff --git a/README.md b/README.md index 971e899..f9f17ed 100644 --- a/README.md +++ b/README.md @@ -15,14 +15,14 @@ The goal of model-based optimization is to find an input **x** that maximizes an Design-Bench can be installed with the complete set of benchmarks via our pip package. ```bash -pip install design-bench[all]==2.0.16 +pip install design-bench[all]==2.0.17 pip install morphing-agents==1.5.1 ``` Alternatively, if you do not have MuJoCo, you may opt for a minimal install. ```bash -pip install design-bench==2.0.16 +pip install design-bench==2.0.17 ``` ## Available Tasks diff --git a/design_bench/__init__.py b/design_bench/__init__.py index 1a27ff6..adbad6f 100644 --- a/design_bench/__init__.py +++ b/design_bench/__init__.py @@ -734,475 +734,447 @@ is_absolute=None))) -register('ChEMBLMorganFingerprint-GP-v0', - 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', - 'design_bench.oracles.sklearn:GaussianProcessOracle', - - # keyword arguments for building the dataset - dataset_kwargs=dict( - max_samples=None, - distribution=None, - max_percentile=40, - min_percentile=0, - assay_chembl_id="CHEMBL1794345", - standard_type="Potency"), - - # keyword arguments for building GP oracle - oracle_kwargs=dict( - noise_std=0.0, - max_samples=2000, - distribution=None, - max_percentile=53, - min_percentile=0, - - # process the data into morgan fingerprints - feature_extractor=MorganFingerprintFeatures(dtype=np.int32), - - # parameters used for building the model - model_kwargs=dict(kernel=DefaultSequenceKernel(size=2), - alpha=0.01), - - # parameters used for building the validation set - split_kwargs=dict(val_fraction=0.5, - subset=None, - shard_size=50000, - to_disk=True, - disk_target="chembl-Potency-CHEMBL1794345/split", - is_absolute=False))) - - -register('ChEMBL-GP-v0', - 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', - 'design_bench.oracles.sklearn:GaussianProcessOracle', - - # keyword arguments for building the dataset - dataset_kwargs=dict( - max_samples=None, - distribution=None, - max_percentile=40, - min_percentile=0, - assay_chembl_id="CHEMBL1794345", - standard_type="Potency"), - - # keyword arguments for building GP oracle - oracle_kwargs=dict( - noise_std=0.0, - max_samples=2000, - distribution=None, - max_percentile=53, - min_percentile=0, - - # parameters used for building the model - model_kwargs=dict(kernel=DefaultSequenceKernel(size=591), - alpha=0.01), - - # parameters used for building the validation set - split_kwargs=dict(val_fraction=0.5, - subset=None, - shard_size=50000, - to_disk=True, - disk_target="chembl-Potency-CHEMBL1794345/split", - is_absolute=False))) - - -register('ChEMBLMorganFingerprint-RandomForest-v0', - 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', - 'design_bench.oracles.sklearn:RandomForestOracle', - - # keyword arguments for building the dataset - dataset_kwargs=dict( - max_samples=None, - distribution=None, - max_percentile=40, - min_percentile=0, - assay_chembl_id="CHEMBL1794345", - standard_type="Potency"), - - # keyword arguments for building RandomForest oracle - oracle_kwargs=dict( - noise_std=0.0, - max_samples=2000, - distribution=None, - max_percentile=53, - min_percentile=0, - - # process the data into morgan fingerprints - override_input_spec=True, - feature_extractor=MorganFingerprintFeatures(dtype=np.float32), - - # parameters used for building the model - model_kwargs=dict(n_estimators=100, - max_depth=100, - max_features="auto"), - - # parameters used for building the validation set - split_kwargs=dict(val_fraction=0.5, - subset=None, - shard_size=50000, - to_disk=True, - disk_target="chembl-Potency-CHEMBL1794345/split", - is_absolute=False))) - - -register('ChEMBL-RandomForest-v0', - 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', - 'design_bench.oracles.sklearn:RandomForestOracle', - - # keyword arguments for building the dataset - dataset_kwargs=dict( - max_samples=None, - distribution=None, - max_percentile=40, - min_percentile=0, - assay_chembl_id="CHEMBL1794345", - standard_type="Potency"), - - # keyword arguments for building RandomForest oracle - oracle_kwargs=dict( - noise_std=0.0, - max_samples=2000, - distribution=None, - max_percentile=53, - min_percentile=0, - - # parameters used for building the model - model_kwargs=dict(n_estimators=100, - max_depth=100, - max_features="auto"), - - # parameters used for building the validation set - split_kwargs=dict(val_fraction=0.5, - subset=None, - shard_size=50000, - to_disk=True, - disk_target="chembl-Potency-CHEMBL1794345/split", - is_absolute=False))) - - -register('ChEMBLMorganFingerprint-FullyConnected-v0', - 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', - 'design_bench.oracles.tensorflow:FullyConnectedOracle', - - # keyword arguments for building the dataset - dataset_kwargs=dict( - max_samples=None, - distribution=None, - max_percentile=40, - min_percentile=0, - assay_chembl_id="CHEMBL1794345", - standard_type="Potency"), - - # keyword arguments for training FullyConnected oracle - oracle_kwargs=dict( - noise_std=0.0, - max_samples=None, - distribution=None, - max_percentile=53, - min_percentile=0, - - # process the data into morgan fingerprints - feature_extractor=MorganFingerprintFeatures(dtype=np.float32), - - # parameters used for building the model - model_kwargs=dict(embedding_size=32, - hidden_size=512, - activation='relu', - num_layers=2, - epochs=5, - shuffle_buffer=5000, - learning_rate=0.0001), - - # parameters used for building the validation set - split_kwargs=dict(val_fraction=0.1, - subset=None, - shard_size=50000, - to_disk=True, - disk_target="chembl-Potency-CHEMBL1794345/split", - is_absolute=False))) - - -register('ChEMBL-FullyConnected-v0', - 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', - 'design_bench.oracles.tensorflow:FullyConnectedOracle', - - # keyword arguments for building the dataset - dataset_kwargs=dict( - max_samples=None, - distribution=None, - max_percentile=40, - min_percentile=0, - assay_chembl_id="CHEMBL1794345", - standard_type="Potency"), - - # keyword arguments for training FullyConnected oracle - oracle_kwargs=dict( - noise_std=0.0, - max_samples=None, - distribution=None, - max_percentile=53, - min_percentile=0, - - # parameters used for building the model - model_kwargs=dict(embedding_size=32, - hidden_size=512, - activation='relu', - num_layers=2, - epochs=20, - shuffle_buffer=5000, - learning_rate=0.0001), - - # parameters used for building the validation set - split_kwargs=dict(val_fraction=0.1, - subset=None, - shard_size=50000, - to_disk=True, - disk_target="chembl-Potency-CHEMBL1794345/split", - is_absolute=False))) - - -register('ChEMBLMorganFingerprint-LSTM-v0', - 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', - 'design_bench.oracles.tensorflow:LSTMOracle', - - # keyword arguments for building the dataset - dataset_kwargs=dict( - max_samples=None, - distribution=None, - max_percentile=40, - min_percentile=0, - assay_chembl_id="CHEMBL1794345", - standard_type="Potency"), - - # keyword arguments for training LSTM oracle - oracle_kwargs=dict( - noise_std=0.0, - max_samples=None, - distribution=None, - max_percentile=53, - min_percentile=0, - - # process the data into morgan fingerprints - feature_extractor=MorganFingerprintFeatures(dtype=np.int32), - - # parameters used for building the model - model_kwargs=dict(hidden_size=64, - num_layers=2, - epochs=20, - shuffle_buffer=5000, - learning_rate=0.001), - - # parameters used for building the validation set - split_kwargs=dict(val_fraction=0.1, - subset=None, - shard_size=50000, - to_disk=True, - disk_target="chembl-Potency-CHEMBL1794345/split", - is_absolute=False))) - - -register('ChEMBL-LSTM-v0', - 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', - 'design_bench.oracles.tensorflow:LSTMOracle', - - # keyword arguments for building the dataset - dataset_kwargs=dict( - max_samples=None, - distribution=None, - max_percentile=40, - min_percentile=0, - assay_chembl_id="CHEMBL1794345", - standard_type="Potency"), - - # keyword arguments for training LSTM oracle - oracle_kwargs=dict( - noise_std=0.0, - max_samples=None, - distribution=None, - max_percentile=53, - min_percentile=0, - - # parameters used for building the model - model_kwargs=dict(hidden_size=64, - num_layers=2, - epochs=20, - shuffle_buffer=5000, - learning_rate=0.001), - - # parameters used for building the validation set - split_kwargs=dict(val_fraction=0.1, - subset=None, - shard_size=50000, - to_disk=True, - disk_target="chembl-Potency-CHEMBL1794345/split", - is_absolute=False))) - - -register('ChEMBLMorganFingerprint-ResNet-v0', - 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', - 'design_bench.oracles.tensorflow:ResNetOracle', - - # keyword arguments for building the dataset - dataset_kwargs=dict( - max_samples=None, - distribution=None, - max_percentile=40, - min_percentile=0, - assay_chembl_id="CHEMBL1794345", - standard_type="Potency"), - - # keyword arguments for training ResNet oracle - oracle_kwargs=dict( - noise_std=0.0, - max_samples=None, - distribution=None, - max_percentile=53, - min_percentile=0, - - # process the data into morgan fingerprints - feature_extractor=MorganFingerprintFeatures(dtype=np.int32), - - # parameters used for building the model - model_kwargs=dict(hidden_size=64, - activation='relu', - kernel_size=3, - num_blocks=4, - epochs=20, - shuffle_buffer=5000, - learning_rate=0.001), - - # parameters used for building the validation set - split_kwargs=dict(val_fraction=0.1, - subset=None, - shard_size=50000, - to_disk=True, - disk_target="chembl-Potency-CHEMBL1794345/split", - is_absolute=False))) - - -register('ChEMBL-ResNet-v0', - 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', - 'design_bench.oracles.tensorflow:ResNetOracle', - - # keyword arguments for building the dataset - dataset_kwargs=dict( - max_samples=None, - distribution=None, - max_percentile=40, - min_percentile=0, - assay_chembl_id="CHEMBL1794345", - standard_type="Potency"), - - # keyword arguments for training ResNet oracle - oracle_kwargs=dict( - noise_std=0.0, - max_samples=None, - distribution=None, - max_percentile=53, - min_percentile=0, - - # parameters used for building the model - model_kwargs=dict(hidden_size=64, - activation='relu', - kernel_size=3, - num_blocks=4, - epochs=20, - shuffle_buffer=5000, - learning_rate=0.001), - - # parameters used for building the validation set - split_kwargs=dict(val_fraction=0.1, - subset=None, - shard_size=50000, - to_disk=True, - disk_target="chembl-Potency-CHEMBL1794345/split", - is_absolute=False))) - - -register('ChEMBLMorganFingerprint-Transformer-v0', - 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', - 'design_bench.oracles.tensorflow:TransformerOracle', - - # keyword arguments for building the dataset - dataset_kwargs=dict( - max_samples=None, - distribution=None, - max_percentile=40, - min_percentile=0, - assay_chembl_id="CHEMBL1794345", - standard_type="Potency"), - - # keyword arguments for training Transformer oracle - oracle_kwargs=dict( - noise_std=0.0, - internal_batch_size=32, - max_samples=None, - distribution=None, - max_percentile=53, - min_percentile=0, - - # process the data into morgan fingerprints - feature_extractor=MorganFingerprintFeatures(dtype=np.int32), - - # parameters used for building the model - model_kwargs=dict(hidden_size=128, - feed_forward_size=512, - activation='relu', - num_heads=4, - num_blocks=4, - epochs=20, - shuffle_buffer=20000, - learning_rate=0.0001, - dropout_rate=0.2), - - # parameters used for building the validation set - split_kwargs=dict(val_fraction=0.1, - subset=None, - shard_size=50000, - to_disk=True, - disk_target="chembl-Potency-CHEMBL1794345/split", - is_absolute=False))) - - -register('ChEMBL-Transformer-v0', - 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', - 'design_bench.oracles.tensorflow:TransformerOracle', - - # keyword arguments for building the dataset - dataset_kwargs=dict( - max_samples=None, - distribution=None, - max_percentile=40, - min_percentile=0, - assay_chembl_id="CHEMBL1794345", - standard_type="Potency"), - - # keyword arguments for training Transformer oracle - oracle_kwargs=dict( - noise_std=0.0, - internal_batch_size=32, - max_samples=None, - distribution=None, - max_percentile=53, - min_percentile=0, - - # parameters used for building the model - model_kwargs=dict(hidden_size=128, - feed_forward_size=512, - activation='relu', - num_heads=4, - num_blocks=4, - epochs=20, - shuffle_buffer=20000, - learning_rate=0.0001, - dropout_rate=0.2), - - # parameters used for building the validation set - split_kwargs=dict(val_fraction=0.1, - subset=None, - shard_size=50000, - to_disk=True, - disk_target="chembl-Potency-CHEMBL1794345/split", - is_absolute=False))) +for standard_type, assay_chembl_id in [('MCHC', 'CHEMBL3885882'), + ('CHLORIDE', 'CHEMBL3885882'), + ('MCH', 'CHEMBL3885882'), + ('CREAT', 'CHEMBL3885882'), + ('PHOS', 'CHEMBL3885882'), + ('SODIUM', 'CHEMBL3885882')]: + + register(f'ChEMBL_{standard_type}_{assay_chembl_id}_MorganFingerprint-GP-v0', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', + 'design_bench.oracles.sklearn:GaussianProcessOracle', + + # keyword arguments for building the dataset + dataset_kwargs=dict( + max_samples=None, + distribution=None, + max_percentile=40, + min_percentile=0, + assay_chembl_id=assay_chembl_id, + standard_type=standard_type), + + # keyword arguments for building GP oracle + oracle_kwargs=dict( + noise_std=0.0, + max_samples=2000, + distribution=None, + max_percentile=100, + min_percentile=0, + + # process the data into morgan fingerprints + feature_extractor=MorganFingerprintFeatures(dtype=np.int32), + + # parameters used for building the model + model_kwargs=dict(kernel=DefaultSequenceKernel(size=2), + alpha=0.01), + + # parameters used for building the validation set + split_kwargs=dict(val_fraction=0.5, + subset=None, + shard_size=50000, + to_disk=False))) + + register(f'ChEMBL_{standard_type}_{assay_chembl_id}-GP-v0', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', + 'design_bench.oracles.sklearn:GaussianProcessOracle', + + # keyword arguments for building the dataset + dataset_kwargs=dict( + max_samples=None, + distribution=None, + max_percentile=40, + min_percentile=0, + assay_chembl_id=assay_chembl_id, + standard_type=standard_type), + + # keyword arguments for building GP oracle + oracle_kwargs=dict( + noise_std=0.0, + max_samples=2000, + distribution=None, + max_percentile=100, + min_percentile=0, + + # parameters used for building the model + model_kwargs=dict(kernel=DefaultSequenceKernel(size=591), + alpha=0.01), + + # parameters used for building the validation set + split_kwargs=dict(val_fraction=0.5, + subset=None, + shard_size=50000, + to_disk=False))) + + register(f'ChEMBL_{standard_type}_{assay_chembl_id}_MorganFingerprint-RandomForest-v0', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', + 'design_bench.oracles.sklearn:RandomForestOracle', + + # keyword arguments for building the dataset + dataset_kwargs=dict( + max_samples=None, + distribution=None, + max_percentile=40, + min_percentile=0, + assay_chembl_id=assay_chembl_id, + standard_type=standard_type), + + # keyword arguments for building RandomForest oracle + oracle_kwargs=dict( + noise_std=0.0, + max_samples=2000, + distribution=None, + max_percentile=100, + min_percentile=0, + + # process the data into morgan fingerprints + override_input_spec=True, + feature_extractor=MorganFingerprintFeatures(dtype=np.float32), + + # parameters used for building the model + model_kwargs=dict(n_estimators=100, + max_depth=100, + max_features="auto"), + + # parameters used for building the validation set + split_kwargs=dict(val_fraction=0.5, + subset=None, + shard_size=50000, + to_disk=False))) + + register(f'ChEMBL_{standard_type}_{assay_chembl_id}-RandomForest-v0', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', + 'design_bench.oracles.sklearn:RandomForestOracle', + + # keyword arguments for building the dataset + dataset_kwargs=dict( + max_samples=None, + distribution=None, + max_percentile=40, + min_percentile=0, + assay_chembl_id=assay_chembl_id, + standard_type=standard_type), + + # keyword arguments for building RandomForest oracle + oracle_kwargs=dict( + noise_std=0.0, + max_samples=2000, + distribution=None, + max_percentile=100, + min_percentile=0, + + # parameters used for building the model + model_kwargs=dict(n_estimators=100, + max_depth=100, + max_features="auto"), + + # parameters used for building the validation set + split_kwargs=dict(val_fraction=0.5, + subset=None, + shard_size=50000, + to_disk=False))) + + register(f'ChEMBL_{standard_type}_{assay_chembl_id}_MorganFingerprint-FullyConnected-v0', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', + 'design_bench.oracles.tensorflow:FullyConnectedOracle', + + # keyword arguments for building the dataset + dataset_kwargs=dict( + max_samples=None, + distribution=None, + max_percentile=40, + min_percentile=0, + assay_chembl_id=assay_chembl_id, + standard_type=standard_type), + + # keyword arguments for training FullyConnected oracle + oracle_kwargs=dict( + noise_std=0.0, + max_samples=None, + distribution=None, + max_percentile=100, + min_percentile=0, + + # process the data into morgan fingerprints + feature_extractor=MorganFingerprintFeatures(dtype=np.float32), + + # parameters used for building the model + model_kwargs=dict(embedding_size=32, + hidden_size=512, + activation='relu', + num_layers=2, + epochs=5, + shuffle_buffer=5000, + learning_rate=0.0001), + + # parameters used for building the validation set + split_kwargs=dict(val_fraction=0.1, + subset=None, + shard_size=50000, + to_disk=False))) + + register(f'ChEMBL_{standard_type}_{assay_chembl_id}-FullyConnected-v0', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', + 'design_bench.oracles.tensorflow:FullyConnectedOracle', + + # keyword arguments for building the dataset + dataset_kwargs=dict( + max_samples=None, + distribution=None, + max_percentile=40, + min_percentile=0, + assay_chembl_id=assay_chembl_id, + standard_type=standard_type), + + # keyword arguments for training FullyConnected oracle + oracle_kwargs=dict( + noise_std=0.0, + max_samples=None, + distribution=None, + max_percentile=100, + min_percentile=0, + + # parameters used for building the model + model_kwargs=dict(embedding_size=32, + hidden_size=512, + activation='relu', + num_layers=2, + epochs=20, + shuffle_buffer=5000, + learning_rate=0.0001), + + # parameters used for building the validation set + split_kwargs=dict(val_fraction=0.1, + subset=None, + shard_size=50000, + to_disk=False))) + + register(f'ChEMBL_{standard_type}_{assay_chembl_id}_MorganFingerprint-LSTM-v0', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', + 'design_bench.oracles.tensorflow:LSTMOracle', + + # keyword arguments for building the dataset + dataset_kwargs=dict( + max_samples=None, + distribution=None, + max_percentile=40, + min_percentile=0, + assay_chembl_id=assay_chembl_id, + standard_type=standard_type), + + # keyword arguments for training LSTM oracle + oracle_kwargs=dict( + noise_std=0.0, + max_samples=None, + distribution=None, + max_percentile=100, + min_percentile=0, + + # process the data into morgan fingerprints + feature_extractor=MorganFingerprintFeatures(dtype=np.int32), + + # parameters used for building the model + model_kwargs=dict(hidden_size=64, + num_layers=2, + epochs=20, + shuffle_buffer=5000, + learning_rate=0.001), + + # parameters used for building the validation set + split_kwargs=dict(val_fraction=0.1, + subset=None, + shard_size=50000, + to_disk=False))) + + register(f'ChEMBL_{standard_type}_{assay_chembl_id}-LSTM-v0', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', + 'design_bench.oracles.tensorflow:LSTMOracle', + + # keyword arguments for building the dataset + dataset_kwargs=dict( + max_samples=None, + distribution=None, + max_percentile=40, + min_percentile=0, + assay_chembl_id=assay_chembl_id, + standard_type=standard_type), + + # keyword arguments for training LSTM oracle + oracle_kwargs=dict( + noise_std=0.0, + max_samples=None, + distribution=None, + max_percentile=100, + min_percentile=0, + + # parameters used for building the model + model_kwargs=dict(hidden_size=64, + num_layers=2, + epochs=20, + shuffle_buffer=5000, + learning_rate=0.001), + + # parameters used for building the validation set + split_kwargs=dict(val_fraction=0.1, + subset=None, + shard_size=50000, + to_disk=False))) + + register(f'ChEMBL_{standard_type}_{assay_chembl_id}_MorganFingerprint-ResNet-v0', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', + 'design_bench.oracles.tensorflow:ResNetOracle', + + # keyword arguments for building the dataset + dataset_kwargs=dict( + max_samples=None, + distribution=None, + max_percentile=40, + min_percentile=0, + assay_chembl_id=assay_chembl_id, + standard_type=standard_type), + + # keyword arguments for training ResNet oracle + oracle_kwargs=dict( + noise_std=0.0, + max_samples=None, + distribution=None, + max_percentile=100, + min_percentile=0, + + # process the data into morgan fingerprints + feature_extractor=MorganFingerprintFeatures(dtype=np.int32), + + # parameters used for building the model + model_kwargs=dict(hidden_size=64, + activation='relu', + kernel_size=3, + num_blocks=4, + epochs=20, + shuffle_buffer=5000, + learning_rate=0.001), + + # parameters used for building the validation set + split_kwargs=dict(val_fraction=0.1, + subset=None, + shard_size=50000, + to_disk=False))) + + register(f'ChEMBL_{standard_type}_{assay_chembl_id}-ResNet-v0', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', + 'design_bench.oracles.tensorflow:ResNetOracle', + + # keyword arguments for building the dataset + dataset_kwargs=dict( + max_samples=None, + distribution=None, + max_percentile=40, + min_percentile=0, + assay_chembl_id=assay_chembl_id, + standard_type=standard_type), + + # keyword arguments for training ResNet oracle + oracle_kwargs=dict( + noise_std=0.0, + max_samples=None, + distribution=None, + max_percentile=100, + min_percentile=0, + + # parameters used for building the model + model_kwargs=dict(hidden_size=64, + activation='relu', + kernel_size=3, + num_blocks=4, + epochs=20, + shuffle_buffer=5000, + learning_rate=0.001), + + # parameters used for building the validation set + split_kwargs=dict(val_fraction=0.1, + subset=None, + shard_size=50000, + to_disk=False))) + + register(f'ChEMBL_{standard_type}_{assay_chembl_id}_MorganFingerprint-Transformer-v0', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', + 'design_bench.oracles.tensorflow:TransformerOracle', + + # keyword arguments for building the dataset + dataset_kwargs=dict( + max_samples=None, + distribution=None, + max_percentile=40, + min_percentile=0, + assay_chembl_id=assay_chembl_id, + standard_type=standard_type), + + # keyword arguments for training Transformer oracle + oracle_kwargs=dict( + noise_std=0.0, + internal_batch_size=32, + max_samples=None, + distribution=None, + max_percentile=100, + min_percentile=0, + + # process the data into morgan fingerprints + feature_extractor=MorganFingerprintFeatures(dtype=np.int32), + + # parameters used for building the model + model_kwargs=dict(hidden_size=128, + feed_forward_size=512, + activation='relu', + num_heads=4, + num_blocks=4, + epochs=20, + shuffle_buffer=20000, + learning_rate=0.0001, + dropout_rate=0.2), + + # parameters used for building the validation set + split_kwargs=dict(val_fraction=0.1, + subset=None, + shard_size=50000, + to_disk=False))) + + register(f'ChEMBL_{standard_type}_{assay_chembl_id}-Transformer-v0', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', + 'design_bench.oracles.tensorflow:TransformerOracle', + + # keyword arguments for building the dataset + dataset_kwargs=dict( + max_samples=None, + distribution=None, + max_percentile=40, + min_percentile=0, + assay_chembl_id=assay_chembl_id, + standard_type=standard_type), + + # keyword arguments for training Transformer oracle + oracle_kwargs=dict( + noise_std=0.0, + internal_batch_size=32, + max_samples=None, + distribution=None, + max_percentile=100, + min_percentile=0, + + # parameters used for building the model + model_kwargs=dict(hidden_size=128, + feed_forward_size=512, + activation='relu', + num_heads=4, + num_blocks=4, + epochs=20, + shuffle_buffer=20000, + learning_rate=0.0001, + dropout_rate=0.2), + + # parameters used for building the validation set + split_kwargs=dict(val_fraction=0.1, + subset=None, + shard_size=50000, + to_disk=False))) register('ToyContinuous-Exact-v0', diff --git a/process_results.py b/process_results.py new file mode 100644 index 0000000..54dc288 --- /dev/null +++ b/process_results.py @@ -0,0 +1,66 @@ +import pickle as pkl +import numpy as np +from design_bench.datasets.discrete.chembl_dataset import ChEMBLDataset + + +if __name__ == "__main__": + + with open('type_assay_pairs.pkl', 'rb') as f: + type_assay_pairs = pkl.load(f) + + with open('results.txt', 'r') as f: + all_results = f.readlines() + + type_assay_pairs_to_y_lr = dict() + type_assay_pairs_to_y_rf = dict() + + type_assay_pairs_to_log_y_lr = dict() + type_assay_pairs_to_log_y_rf = dict() + + descriptors = [] + values = [] + + for row in all_results: + row = row.split(',') + if len(row) == 4: + idx, log_y, lr_corr, rf_corr = row + + idx = int(idx.replace("Index: ", "").strip()) + log_y = log_y.replace("log_y: ", "").strip() == "True" + lr_corr = float(lr_corr.replace("lr_corr: ", "").strip()) + rf_corr = float(rf_corr.replace("rf_corr: ", "").strip()) + + standard_type, assay = type_assay_pairs[idx] + dataset = ChEMBLDataset(assay_chembl_id=assay, standard_type=standard_type) + size = dataset.dataset_size + + print(standard_type, assay, size, log_y, lr_corr, rf_corr) + descriptors.append((standard_type, assay, size, log_y, 'lr_corr')) + descriptors.append((standard_type, assay, size, log_y, 'rf_corr')) + values.append(lr_corr) + values.append(rf_corr) + + if log_y: + type_assay_pairs_to_log_y_lr[(standard_type, assay)] = lr_corr + type_assay_pairs_to_log_y_rf[(standard_type, assay)] = rf_corr + + else: + type_assay_pairs_to_y_lr[(standard_type, assay)] = lr_corr + type_assay_pairs_to_y_rf[(standard_type, assay)] = rf_corr + + with open('type_assay_pairs_to_y_lr.pkl', 'wb') as f: + pkl.dump(type_assay_pairs_to_y_lr, f) + + with open('type_assay_pairs_to_y_rf.pkl', 'wb') as f: + pkl.dump(type_assay_pairs_to_y_rf, f) + + with open('type_assay_pairs_to_log_y_lr.pkl', 'wb') as f: + pkl.dump(type_assay_pairs_to_log_y_lr, f) + + with open('type_assay_pairs_to_log_y_rf.pkl', 'wb') as f: + pkl.dump(type_assay_pairs_to_log_y_rf, f) + + top_idx = np.argsort(values)[::-1] + for idx in top_idx: + print(descriptors[idx], values[idx]) + diff --git a/setup.py b/setup.py index 0498a52..e156c56 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ LONG_DESCRIPTION = readme.read() -setup(name='design-bench', version='2.0.16', license='MIT', +setup(name='design-bench', version='2.0.17', license='MIT', packages=find_packages(include=['design_bench', 'design_bench.*']), description='Design-Bench: Benchmarks for ' 'Data-Driven Offline Model-Based Optimization', @@ -17,7 +17,7 @@ author_email='brandon@btrabucco.com', url='https://github.com/brandontrabucco/design-bench', download_url='https://github.com/' - 'brandontrabucco/design-bench/archive/v2_0_16.tar.gz', + 'brandontrabucco/design-bench/archive/v2_0_17.tar.gz', keywords=['Deep Learning', 'Neural Networks', 'Benchmark', 'Model-Based Optimization'], extras_require={'all': ['gym[mujoco]'], 'cma': ['cma']},