diff --git a/README.md b/README.md index 2e83291..e7591de 100644 --- a/README.md +++ b/README.md @@ -15,14 +15,14 @@ The goal of model-based optimization is to find an input **x** that maximizes an Design-Bench can be installed with the complete set of benchmarks via our pip package. ```bash -pip install design-bench[all]>=2.0.7 +pip install design-bench[all]>=2.0.8 pip install morphing-agents==1.4 ``` Alternatively, if you do not have MuJoCo, you may opt for a minimal install. ```bash -pip install design-bench>=2.0.7 +pip install design-bench>=2.0.8 ``` ## Available Tasks diff --git a/design_bench/__init__.py b/design_bench/__init__.py index 0243bbf..ad996b5 100644 --- a/design_bench/__init__.py +++ b/design_bench/__init__.py @@ -4,8 +4,23 @@ from sklearn.gaussian_process.kernels import ConstantKernel, RBF +register('ToyDiscrete-Exact-v0', + 'design_bench.datasets.discrete.toy_discrete_dataset:ToyDiscreteDataset', + 'design_bench.oracles.exact:ToyDiscreteOracle', + + # keyword arguments for building the dataset + dataset_kwargs=dict( + max_samples=None, + max_percentile=40, + min_percentile=0), + + # keyword arguments for building the exact oracle + oracle_kwargs=dict( + noise_std=0.0)) + + register('GFP-GP-v0', - 'design_bench.datasets.discrete:GFPDataset', + 'design_bench.datasets.discrete.gfp_dataset:GFPDataset', 'design_bench.oracles.sklearn:GaussianProcessOracle', # keyword arguments for building the dataset @@ -34,7 +49,7 @@ register('GFP-RandomForest-v0', - 'design_bench.datasets.discrete:GFPDataset', + 'design_bench.datasets.discrete.gfp_dataset:GFPDataset', 'design_bench.oracles.sklearn:RandomForestOracle', # keyword arguments for building the dataset @@ -65,7 +80,7 @@ register('GFP-FullyConnected-v0', - 'design_bench.datasets.discrete:GFPDataset', + 'design_bench.datasets.discrete.gfp_dataset:GFPDataset', 'design_bench.oracles.tensorflow:FullyConnectedOracle', # keyword arguments for building the dataset @@ -100,7 +115,7 @@ register('GFP-LSTM-v0', - 'design_bench.datasets.discrete:GFPDataset', + 'design_bench.datasets.discrete.gfp_dataset:GFPDataset', 'design_bench.oracles.tensorflow:LSTMOracle', # keyword arguments for building the dataset @@ -133,7 +148,7 @@ register('GFP-ResNet-v0', - 'design_bench.datasets.discrete:GFPDataset', + 'design_bench.datasets.discrete.gfp_dataset:GFPDataset', 'design_bench.oracles.tensorflow:ResNetOracle', # keyword arguments for building the dataset @@ -168,7 +183,7 @@ register('GFP-Transformer-v0', - 'design_bench.datasets.discrete:GFPDataset', + 'design_bench.datasets.discrete.gfp_dataset:GFPDataset', 'design_bench.oracles.tensorflow:TransformerOracle', # keyword arguments for building the dataset @@ -206,7 +221,7 @@ register('TFBind8-Exact-v0', - 'design_bench.datasets.discrete:TFBind8Dataset', + 'design_bench.datasets.discrete.tf_bind_8_dataset:TFBind8Dataset', 'design_bench.oracles.exact:TFBind8Oracle', # keyword arguments for building the dataset @@ -221,7 +236,7 @@ register('TFBind8-GP-v0', - 'design_bench.datasets.discrete:TFBind8Dataset', + 'design_bench.datasets.discrete.tf_bind_8_dataset:TFBind8Dataset', 'design_bench.oracles.sklearn:GaussianProcessOracle', # keyword arguments for building the dataset @@ -251,7 +266,7 @@ register('TFBind8-RandomForest-v0', - 'design_bench.datasets.discrete:TFBind8Dataset', + 'design_bench.datasets.discrete.tf_bind_8_dataset:TFBind8Dataset', 'design_bench.oracles.sklearn:RandomForestOracle', # keyword arguments for building the dataset @@ -283,7 +298,7 @@ register('TFBind8-FullyConnected-v0', - 
'design_bench.datasets.discrete:TFBind8Dataset', + 'design_bench.datasets.discrete.tf_bind_8_dataset:TFBind8Dataset', 'design_bench.oracles.tensorflow:FullyConnectedOracle', # keyword arguments for building the dataset @@ -319,7 +334,7 @@ register('TFBind8-LSTM-v0', - 'design_bench.datasets.discrete:TFBind8Dataset', + 'design_bench.datasets.discrete.tf_bind_8_dataset:TFBind8Dataset', 'design_bench.oracles.tensorflow:LSTMOracle', # keyword arguments for building the dataset @@ -353,7 +368,7 @@ register('TFBind8-ResNet-v0', - 'design_bench.datasets.discrete:TFBind8Dataset', + 'design_bench.datasets.discrete.tf_bind_8_dataset:TFBind8Dataset', 'design_bench.oracles.tensorflow:ResNetOracle', # keyword arguments for building the dataset @@ -389,7 +404,7 @@ register('TFBind8-Transformer-v0', - 'design_bench.datasets.discrete:TFBind8Dataset', + 'design_bench.datasets.discrete.tf_bind_8_dataset:TFBind8Dataset', 'design_bench.oracles.tensorflow:TransformerOracle', # keyword arguments for building the dataset @@ -428,7 +443,7 @@ register('TFBind10-Exact-v0', - 'design_bench.datasets.discrete:TFBind10Dataset', + 'design_bench.datasets.discrete.tf_bind_10_dataset:TFBind10Dataset', 'design_bench.oracles.exact:TFBind10Oracle', # keyword arguments for building the dataset @@ -444,7 +459,7 @@ register('NASBench-Exact-v0', - 'design_bench.datasets.discrete:NASBenchDataset', + 'design_bench.datasets.discrete.nas_bench_dataset:NASBenchDataset', 'design_bench.oracles.exact:NASBenchOracle', # keyword arguments for building the dataset @@ -459,7 +474,7 @@ register('UTR-GP-v0', - 'design_bench.datasets.discrete:UTRDataset', + 'design_bench.datasets.discrete.utr_dataset:UTRDataset', 'design_bench.oracles.sklearn:GaussianProcessOracle', # keyword arguments for building the dataset @@ -488,7 +503,7 @@ register('UTR-RandomForest-v0', - 'design_bench.datasets.discrete:UTRDataset', + 'design_bench.datasets.discrete.utr_dataset:UTRDataset', 'design_bench.oracles.sklearn:RandomForestOracle', # keyword arguments for building the dataset @@ -519,7 +534,7 @@ register('UTR-FullyConnected-v0', - 'design_bench.datasets.discrete:UTRDataset', + 'design_bench.datasets.discrete.utr_dataset:UTRDataset', 'design_bench.oracles.tensorflow:FullyConnectedOracle', # keyword arguments for building the dataset @@ -554,7 +569,7 @@ register('UTR-LSTM-v0', - 'design_bench.datasets.discrete:UTRDataset', + 'design_bench.datasets.discrete.utr_dataset:UTRDataset', 'design_bench.oracles.tensorflow:LSTMOracle', # keyword arguments for building the dataset @@ -587,7 +602,7 @@ register('UTR-ResNet-v0', - 'design_bench.datasets.discrete:UTRDataset', + 'design_bench.datasets.discrete.utr_dataset:UTRDataset', 'design_bench.oracles.tensorflow:ResNetOracle', # keyword arguments for building the dataset @@ -622,7 +637,7 @@ register('UTR-Transformer-v0', - 'design_bench.datasets.discrete:UTRDataset', + 'design_bench.datasets.discrete.utr_dataset:UTRDataset', 'design_bench.oracles.tensorflow:TransformerOracle', # keyword arguments for building the dataset @@ -660,7 +675,7 @@ register('ChEMBL-GP-v0', - 'design_bench.datasets.discrete:ChEMBLDataset', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', 'design_bench.oracles.sklearn:GaussianProcessOracle', # keyword arguments for building the dataset @@ -692,7 +707,7 @@ register('ChEMBL-RandomForest-v0', - 'design_bench.datasets.discrete:ChEMBLDataset', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', 'design_bench.oracles.sklearn:RandomForestOracle', # keyword arguments for building 
the dataset @@ -725,7 +740,7 @@ register('ChEMBL-FullyConnected-v0', - 'design_bench.datasets.discrete:ChEMBLDataset', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', 'design_bench.oracles.tensorflow:FullyConnectedOracle', # keyword arguments for building the dataset @@ -762,7 +777,7 @@ register('ChEMBL-LSTM-v0', - 'design_bench.datasets.discrete:ChEMBLDataset', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', 'design_bench.oracles.tensorflow:LSTMOracle', # keyword arguments for building the dataset @@ -797,7 +812,7 @@ register('ChEMBL-ResNet-v0', - 'design_bench.datasets.discrete:ChEMBLDataset', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', 'design_bench.oracles.tensorflow:ResNetOracle', # keyword arguments for building the dataset @@ -834,7 +849,7 @@ register('ChEMBL-Transformer-v0', - 'design_bench.datasets.discrete:ChEMBLDataset', + 'design_bench.datasets.discrete.chembl_dataset:ChEMBLDataset', 'design_bench.oracles.tensorflow:TransformerOracle', # keyword arguments for building the dataset @@ -873,8 +888,23 @@ is_absolute=False))) +register('ToyContinuous-Exact-v0', + 'design_bench.datasets.continuous.toy_continuous_dataset:ToyContinuousDataset', + 'design_bench.oracles.exact:ToyContinuousOracle', + + # keyword arguments for building the dataset + dataset_kwargs=dict( + max_samples=None, + max_percentile=40, + min_percentile=0), + + # keyword arguments for building the exact oracle + oracle_kwargs=dict( + noise_std=0.0)) + + register('HopperController-Exact-v0', - 'design_bench.datasets.continuous:HopperControllerDataset', + 'design_bench.datasets.continuous.hopper_controller_dataset:HopperControllerDataset', 'design_bench.oracles.exact:HopperControllerOracle', # keyword arguments for building the dataset @@ -889,7 +919,7 @@ register('HopperController-GP-v0', - 'design_bench.datasets.continuous:HopperControllerDataset', + 'design_bench.datasets.continuous.hopper_controller_dataset:HopperControllerDataset', 'design_bench.oracles.sklearn:GaussianProcessOracle', # keyword arguments for building the dataset @@ -921,7 +951,7 @@ register('HopperController-RandomForest-v0', - 'design_bench.datasets.continuous:HopperControllerDataset', + 'design_bench.datasets.continuous.hopper_controller_dataset:HopperControllerDataset', 'design_bench.oracles.sklearn:RandomForestOracle', # keyword arguments for building the dataset @@ -952,7 +982,7 @@ register('HopperController-FullyConnected-v0', - 'design_bench.datasets.continuous:HopperControllerDataset', + 'design_bench.datasets.continuous.hopper_controller_dataset:HopperControllerDataset', 'design_bench.oracles.tensorflow:FullyConnectedOracle', # keyword arguments for building the dataset @@ -986,7 +1016,7 @@ register('Superconductor-GP-v0', - 'design_bench.datasets.continuous:SuperconductorDataset', + 'design_bench.datasets.continuous.superconductor_dataset:SuperconductorDataset', 'design_bench.oracles.sklearn:GaussianProcessOracle', # keyword arguments for building the dataset @@ -1018,7 +1048,7 @@ register('Superconductor-RandomForest-v0', - 'design_bench.datasets.continuous:SuperconductorDataset', + 'design_bench.datasets.continuous.superconductor_dataset:SuperconductorDataset', 'design_bench.oracles.sklearn:RandomForestOracle', # keyword arguments for building the dataset @@ -1049,7 +1079,7 @@ register('Superconductor-FullyConnected-v0', - 'design_bench.datasets.continuous:SuperconductorDataset', + 'design_bench.datasets.continuous.superconductor_dataset:SuperconductorDataset', 
'design_bench.oracles.tensorflow:FullyConnectedOracle', # keyword arguments for building the dataset @@ -1083,7 +1113,7 @@ register('AntMorphology-Exact-v0', - 'design_bench.datasets.continuous:AntMorphologyDataset', + 'design_bench.datasets.continuous.ant_morphology_dataset:AntMorphologyDataset', 'design_bench.oracles.exact:AntMorphologyOracle', # keyword arguments for building the dataset @@ -1098,7 +1128,7 @@ register('AntMorphology-GP-v0', - 'design_bench.datasets.continuous:AntMorphologyDataset', + 'design_bench.datasets.continuous.ant_morphology_dataset:AntMorphologyDataset', 'design_bench.oracles.sklearn:GaussianProcessOracle', # keyword arguments for building the dataset @@ -1130,7 +1160,7 @@ register('AntMorphology-RandomForest-v0', - 'design_bench.datasets.continuous:AntMorphologyDataset', + 'design_bench.datasets.continuous.ant_morphology_dataset:AntMorphologyDataset', 'design_bench.oracles.sklearn:RandomForestOracle', # keyword arguments for building the dataset @@ -1161,7 +1191,7 @@ register('AntMorphology-FullyConnected-v0', - 'design_bench.datasets.continuous:AntMorphologyDataset', + 'design_bench.datasets.continuous.ant_morphology_dataset:AntMorphologyDataset', 'design_bench.oracles.tensorflow:FullyConnectedOracle', # keyword arguments for building the dataset @@ -1195,7 +1225,7 @@ register('DKittyMorphology-Exact-v0', - 'design_bench.datasets.continuous:DKittyMorphologyDataset', + 'design_bench.datasets.continuous.dkitty_morphology_dataset:DKittyMorphologyDataset', 'design_bench.oracles.exact:DKittyMorphologyOracle', # keyword arguments for building the dataset @@ -1210,7 +1240,7 @@ register('DKittyMorphology-GP-v0', - 'design_bench.datasets.continuous:DKittyMorphologyDataset', + 'design_bench.datasets.continuous.dkitty_morphology_dataset:DKittyMorphologyDataset', 'design_bench.oracles.sklearn:GaussianProcessOracle', # keyword arguments for building the dataset @@ -1242,7 +1272,7 @@ register('DKittyMorphology-RandomForest-v0', - 'design_bench.datasets.continuous:DKittyMorphologyDataset', + 'design_bench.datasets.continuous.dkitty_morphology_dataset:DKittyMorphologyDataset', 'design_bench.oracles.sklearn:RandomForestOracle', # keyword arguments for building the dataset @@ -1273,7 +1303,7 @@ register('DKittyMorphology-FullyConnected-v0', - 'design_bench.datasets.continuous:DKittyMorphologyDataset', + 'design_bench.datasets.continuous.dkitty_morphology_dataset:DKittyMorphologyDataset', 'design_bench.oracles.tensorflow:FullyConnectedOracle', # keyword arguments for building the dataset diff --git a/design_bench/datasets/continuous/__init__.py b/design_bench/datasets/continuous/__init__.py index 1d91bab..8b13789 100644 --- a/design_bench/datasets/continuous/__init__.py +++ b/design_bench/datasets/continuous/__init__.py @@ -1,4 +1 @@ -from .hopper_controller_dataset import HopperControllerDataset -from .superconductor_dataset import SuperconductorDataset -from .ant_morphology_dataset import AntMorphologyDataset -from .dkitty_morphology_dataset import DKittyMorphologyDataset + diff --git a/design_bench/datasets/continuous/toy_continuous_dataset.py b/design_bench/datasets/continuous/toy_continuous_dataset.py new file mode 100644 index 0000000..7e1eff1 --- /dev/null +++ b/design_bench/datasets/continuous/toy_continuous_dataset.py @@ -0,0 +1,249 @@ +from design_bench.datasets.continuous_dataset import ContinuousDataset +from design_bench.disk_resource import DiskResource + + +TOY_CONTINUOUS_FILES = ["toy_continuous/toy_continuous-x-0.npy", + "toy_continuous/toy_continuous-x-1.npy", 
+ "toy_continuous/toy_continuous-x-2.npy", + "toy_continuous/toy_continuous-x-3.npy", + "toy_continuous/toy_continuous-x-4.npy", + "toy_continuous/toy_continuous-x-5.npy", + "toy_continuous/toy_continuous-x-6.npy", + "toy_continuous/toy_continuous-x-7.npy", + "toy_continuous/toy_continuous-x-8.npy", + "toy_continuous/toy_continuous-x-9.npy", + "toy_continuous/toy_continuous-x-10.npy", + "toy_continuous/toy_continuous-x-11.npy", + "toy_continuous/toy_continuous-x-12.npy", + "toy_continuous/toy_continuous-x-13.npy"] + + +class ToyContinuousDataset(ContinuousDataset): + """A toy dataset that defines a common set of functions + and attributes for a model-based optimization dataset, where the + goal is to find a design 'x' that maximizes a prediction 'y': + + max_x { y = f(x) } + + Public Attributes: + + name: str + An attribute that specifies the name of a model-based optimization + dataset, which might be used when labelling plots in a diagram of + performance in a research paper using design-bench + x_name: str + An attribute that specifies the name of designs in a model-based + optimization dataset, which might be used when labelling plots + in a visualization of performance in a research paper + y_name: str + An attribute that specifies the name of predictions in a model-based + optimization dataset, which might be used when labelling plots + in a visualization of performance in a research paper + + x: np.ndarray + the design values 'x' for a model-based optimization problem + represented as a numpy array of arbitrary type + input_shape: Tuple[int] + the shape of a single design values 'x', represented as a list of + integers similar to calling np.ndarray.shape + input_size: int + the total number of components in the design values 'x', represented + as a single integer, the product of its shape entries + input_dtype: np.dtype + the data type of the design values 'x', which is typically either + floating point or integer (np.float32 or np.int32) + + y: np.ndarray + the prediction values 'y' for a model-based optimization problem + represented by a scalar floating point value per 'x' + output_shape: Tuple[int] + the shape of a single prediction value 'y', represented as a list of + integers similar to calling np.ndarray.shape + output_size: int + the total number of components in the prediction values 'y', + represented as a single integer, the product of its shape entries + output_dtype: np.dtype + the data type of the prediction values 'y', which is typically a + type of floating point (np.float32 or np.float16) + + dataset_size: int + the total number of paired design values 'x' and prediction values + 'y' in the dataset, represented as a single integer + dataset_max_percentile: float + the percentile between 0 and 100 of prediction values 'y' above + which are hidden from access by members outside the class + dataset_min_percentile: float + the percentile between 0 and 100 of prediction values 'y' below + which are hidden from access by members outside the class + dataset_max_output: float + the specific cutoff threshold for prediction values 'y' above + which are hidden from access by members outside the class + dataset_min_output: float + the specific cutoff threshold for prediction values 'y' below + which are hidden from access by members outside the class + + internal_batch_size: int + the integer number of samples per batch that is used internally + when processing the dataset and generating samples + freeze_statistics: bool + a boolean indicator that when set to true prevents 
methods from + changing the normalization and subsampling statistics + + is_normalized_x: bool + a boolean indicator that specifies whether the design values + in the dataset are being normalized + x_mean: np.ndarray + a numpy array that is automatically calculated to be the mean + of visible design values in the dataset + x_standard_dev: np.ndarray + a numpy array that is automatically calculated to be the standard + deviation of visible design values in the dataset + + is_normalized_y: bool + a boolean indicator that specifies whether the prediction values + in the dataset are being normalized + y_mean: np.ndarray + a numpy array that is automatically calculated to be the mean + of visible prediction values in the dataset + y_standard_dev: np.ndarray + a numpy array that is automatically calculated to be the standard + deviation of visible prediction values in the dataset + + Public Methods: + + iterate_batches(batch_size: int, return_x: bool, + return_y: bool, drop_remainder: bool) + -> Iterable[Tuple[np.ndarray, np.ndarray]]: + Returns an object that supports iterations, which yields tuples of + design values 'x' and prediction values 'y' from a model-based + optimization data set for training a model + iterate_samples(return_x: bool, return_y: bool): + -> Iterable[Tuple[np.ndarray, np.ndarray]]: + Returns an object that supports iterations, which yields tuples of + design values 'x' and prediction values 'y' from a model-based + optimization data set for training a model + + subsample(max_samples: int, + max_percentile: float, + min_percentile: float): + a function that exposes a subsampled version of a much larger + model-based optimization dataset containing design values 'x' + whose prediction values 'y' are skewed + relabel(relabel_function: + Callable[[np.ndarray, np.ndarray], np.ndarray]): + a function that accepts a function that maps from a dataset of + design values 'x' and prediction values 'y' to a new set of + prediction values 'y' and relabels the model-based optimization dataset + + clone(subset: set, shard_size: int, + to_disk: bool, disk_target: str, is_absolute: bool): + Generate a cloned copy of a model-based optimization dataset + using the provided name and shard generation settings; useful + when relabelling a dataset buffer from the disk + split(fraction: float, subset: set, shard_size: int, + to_disk: bool, disk_target: str, is_absolute: bool): + split a model-based optimization data set into a training set and + a validation set allocating 'fraction' of the data set to the + validation set and the rest to the training set + + normalize_x(new_x: np.ndarray) -> np.ndarray: + a helper function that accepts floating point design values 'x' + as input and standardizes them so that they have zero + empirical mean and unit empirical variance + denormalize_x(new_x: np.ndarray) -> np.ndarray: + a helper function that accepts floating point design values 'x' + as input and undoes standardization so that they have their + original empirical mean and variance + normalize_y(new_x: np.ndarray) -> np.ndarray: + a helper function that accepts floating point prediction values 'y' + as input and standardizes them so that they have zero + empirical mean and unit empirical variance + denormalize_y(new_x: np.ndarray) -> np.ndarray: + a helper function that accepts floating point prediction values 'y' + as input and undoes standardization so that they have their + original empirical mean and variance + + map_normalize_x(): + a destructive function that standardizes the design values 
'x' + in the class dataset in-place so that they have zero empirical + mean and unit variance + map_denormalize_x(): + a destructive function that undoes standardization of the + design values 'x' in the class dataset in-place which are expected + to have zero empirical mean and unit variance + map_normalize_y(): + a destructive function that standardizes the prediction values 'y' + in the class dataset in-place so that they have zero empirical + mean and unit variance + map_denormalize_y(): + a destructive function that undoes standardization of the + prediction values 'y' in the class dataset in-place which are + expected to have zero empirical mean and unit variance + + """ + + name = "toy_continuous/toy_continuous" + x_name = "toy_design" + y_name = "toy_prediction" + + @staticmethod + def register_x_shards(): + """Registers a remote file for download that contains design values + in a format compatible with the dataset builder class; + these files are downloaded all at once in the dataset initialization + + Returns: + + resources: list of RemoteResource + a list of RemoteResource objects specific to this dataset, which + will be automatically downloaded while the dataset is built + and may serve as shards if the dataset is large + + """ + + return [DiskResource( + file, is_absolute=False, + download_target=f"https://design-bench." + f"s3-us-west-1.amazonaws.com/{file}", + download_method="direct") for file in TOY_CONTINUOUS_FILES] + + @staticmethod + def register_y_shards(): + """Registers a remote file for download that contains prediction + values in a format compatible with the dataset builder class; + these files are downloaded all at once in the dataset initialization + + Returns: + + resources: list of RemoteResource + a list of RemoteResource objects specific to this dataset, which + will be automatically downloaded while the dataset is built + and may serve as shards if the dataset is large + + """ + + return [DiskResource( + file.replace("-x-", "-y-"), is_absolute=False, + download_target=f"https://design-bench." 
+ f"s3-us-west-1.amazonaws.com/" + f"{file.replace('-x-', '-y-')}", + download_method="direct") for file in TOY_CONTINUOUS_FILES] + + def __init__(self, **kwargs): + """Initialize a model-based optimization dataset and prepare + that dataset by loading that dataset from disk and modifying + its distribution + + Arguments: + + **kwargs: dict + additional keyword arguments which are used to parameterize the + data set generation process, including which shard files are used + if multiple sets of data set shard files can be loaded + + """ + + # initialize the dataset using the method in the base class + super(ToyContinuousDataset, self).__init__( + self.register_x_shards(), + self.register_y_shards(), **kwargs) diff --git a/design_bench/datasets/discrete/__init__.py b/design_bench/datasets/discrete/__init__.py index af89ce3..8b13789 100644 --- a/design_bench/datasets/discrete/__init__.py +++ b/design_bench/datasets/discrete/__init__.py @@ -1,6 +1 @@ -from .chembl_dataset import ChEMBLDataset -from .gfp_dataset import GFPDataset -from .nas_bench_dataset import NASBenchDataset -from .tf_bind_8_dataset import TFBind8Dataset -from .tf_bind_10_dataset import TFBind10Dataset -from .utr_dataset import UTRDataset + diff --git a/design_bench/datasets/discrete/toy_discrete_dataset.py b/design_bench/datasets/discrete/toy_discrete_dataset.py new file mode 100644 index 0000000..900c04b --- /dev/null +++ b/design_bench/datasets/discrete/toy_discrete_dataset.py @@ -0,0 +1,281 @@ +from design_bench.datasets.discrete_dataset import DiscreteDataset +from design_bench.disk_resource import DiskResource + + +TOY_DISCRETE_FILES = ["toy_discrete/toy_discrete-x-0.npy", + "toy_discrete/toy_discrete-x-1.npy", + "toy_discrete/toy_discrete-x-2.npy", + "toy_discrete/toy_discrete-x-3.npy", + "toy_discrete/toy_discrete-x-4.npy", + "toy_discrete/toy_discrete-x-5.npy", + "toy_discrete/toy_discrete-x-6.npy", + "toy_discrete/toy_discrete-x-7.npy", + "toy_discrete/toy_discrete-x-8.npy", + "toy_discrete/toy_discrete-x-9.npy", + "toy_discrete/toy_discrete-x-10.npy", + "toy_discrete/toy_discrete-x-11.npy", + "toy_discrete/toy_discrete-x-12.npy", + "toy_discrete/toy_discrete-x-13.npy"] + + +class ToyDiscreteDataset(DiscreteDataset): + """A toy dataset that defines a common set of functions + and attributes for a model-based optimization dataset, where the + goal is to find a design 'x' that maximizes a prediction 'y': + + max_x { y = f(x) } + + Public Attributes: + + name: str + An attribute that specifies the name of a model-based optimization + dataset, which might be used when labelling plots in a diagram of + performance in a research paper using design-bench + x_name: str + An attribute that specifies the name of designs in a model-based + optimization dataset, which might be used when labelling plots + in a visualization of performance in a research paper + y_name: str + An attribute that specifies the name of predictions in a model-based + optimization dataset, which might be used when labelling plots + in a visualization of performance in a research paper + + x: np.ndarray + the design values 'x' for a model-based optimization problem + represented as a numpy array of arbitrary type + input_shape: Tuple[int] + the shape of a single design values 'x', represented as a list of + integers similar to calling np.ndarray.shape + input_size: int + the total number of components in the design values 'x', represented + as a single integer, the product of its shape entries + input_dtype: np.dtype + the data type of the design values 
'x', which is typically either + floating point or integer (np.float32 or np.int32) + + y: np.ndarray + the prediction values 'y' for a model-based optimization problem + represented by a scalar floating point value per 'x' + output_shape: Tuple[int] + the shape of a single prediction value 'y', represented as a list of + integers similar to calling np.ndarray.shape + output_size: int + the total number of components in the prediction values 'y', + represented as a single integer, the product of its shape entries + output_dtype: np.dtype + the data type of the prediction values 'y', which is typically a + type of floating point (np.float32 or np.float16) + + dataset_size: int + the total number of paired design values 'x' and prediction values + 'y' in the dataset, represented as a single integer + dataset_max_percentile: float + the percentile between 0 and 100 of prediction values 'y' above + which are hidden from access by members outside the class + dataset_min_percentile: float + the percentile between 0 and 100 of prediction values 'y' below + which are hidden from access by members outside the class + dataset_max_output: float + the specific cutoff threshold for prediction values 'y' above + which are hidden from access by members outside the class + dataset_min_output: float + the specific cutoff threshold for prediction values 'y' below + which are hidden from access by members outside the class + + internal_batch_size: int + the integer number of samples per batch that is used internally + when processing the dataset and generating samples + freeze_statistics: bool + a boolean indicator that when set to true prevents methods from + changing the normalization and sub sampling statistics + + is_normalized_x: bool + a boolean indicator that specifies whether the design values + in the dataset are being normalized + x_mean: np.ndarray + a numpy array that is automatically calculated to be the mean + of visible design values in the dataset + x_standard_dev: np.ndarray + a numpy array that is automatically calculated to be the standard + deviation of visible design values in the dataset + + is_normalized_y: bool + a boolean indicator that specifies whether the prediction values + in the dataset are being normalized + y_mean: np.ndarray + a numpy array that is automatically calculated to be the mean + of visible prediction values in the dataset + y_standard_dev: np.ndarray + a numpy array that is automatically calculated to be the standard + deviation of visible prediction values in the dataset + + is_logits: bool (only supported for a DiscreteDataset) + a value that indicates whether the design values contained in the + model-based optimization dataset have already been converted to + logits and need not be converted again + + Public Methods: + + iterate_batches(batch_size: int, return_x: bool, + return_y: bool, drop_remainder: bool) + -> Iterable[Tuple[np.ndarray, np.ndarray]]: + Returns an object that supports iterations, which yields tuples of + design values 'x' and prediction values 'y' from a model-based + optimization data set for training a model + iterate_samples(return_x: bool, return_y: bool): + -> Iterable[Tuple[np.ndarray, np.ndarray]]: + Returns an object that supports iterations, which yields tuples of + design values 'x' and prediction values 'y' from a model-based + optimization data set for training a model + + subsample(max_samples: int, + max_percentile: float, + min_percentile: float): + a function that exposes a subsampled version of a much larger + model-based 
optimization dataset containing design values 'x' + whose prediction values 'y' are skewed + relabel(relabel_function: + Callable[[np.ndarray, np.ndarray], np.ndarray]): + a function that accepts a function that maps from a dataset of + design values 'x' and prediction values 'y' to a new set of + prediction values 'y' and relabels the model-based optimization dataset + + clone(subset: set, shard_size: int, + to_disk: bool, disk_target: str, is_absolute: bool): + Generate a cloned copy of a model-based optimization dataset + using the provided name and shard generation settings; useful + when relabelling a dataset buffer from the disk + split(fraction: float, subset: set, shard_size: int, + to_disk: bool, disk_target: str, is_absolute: bool): + split a model-based optimization data set into a training set and + a validation set allocating 'fraction' of the data set to the + validation set and the rest to the training set + + normalize_x(new_x: np.ndarray) -> np.ndarray: + a helper function that accepts floating point design values 'x' + as input and standardizes them so that they have zero + empirical mean and unit empirical variance + denormalize_x(new_x: np.ndarray) -> np.ndarray: + a helper function that accepts floating point design values 'x' + as input and undoes standardization so that they have their + original empirical mean and variance + normalize_y(new_x: np.ndarray) -> np.ndarray: + a helper function that accepts floating point prediction values 'y' + as input and standardizes them so that they have zero + empirical mean and unit empirical variance + denormalize_y(new_x: np.ndarray) -> np.ndarray: + a helper function that accepts floating point prediction values 'y' + as input and undoes standardization so that they have their + original empirical mean and variance + + map_normalize_x(): + a destructive function that standardizes the design values 'x' + in the class dataset in-place so that they have zero empirical + mean and unit variance + map_denormalize_x(): + a destructive function that undoes standardization of the + design values 'x' in the class dataset in-place which are expected + to have zero empirical mean and unit variance + map_normalize_y(): + a destructive function that standardizes the prediction values 'y' + in the class dataset in-place so that they have zero empirical + mean and unit variance + map_denormalize_y(): + a destructive function that undoes standardization of the + prediction values 'y' in the class dataset in-place which are + expected to have zero empirical mean and unit variance + + --- for discrete tasks only + + to_logits(np.ndarray) -> np.ndarray: + A helper function that accepts design values represented as a numpy + array of integers as input and converts them to floating point + logits of a certain probability distribution + to_integers(np.ndarray) -> np.ndarray: + A helper function that accepts design values represented as a numpy + array of floating point logits as input and converts them to integers + representing the max of the distribution + + map_to_logits(): + a function that processes the dataset corresponding to this + model-based optimization problem, and converts integers to a + floating point representation as logits + map_to_integers(): + a function that processes the dataset corresponding to this + model-based optimization problem, and converts a floating point + representation as logits to integers + + """ + + name = "toy_discrete/toy_discrete" + x_name = "toy_design" + y_name = "toy_prediction" + + @staticmethod + def 
register_x_shards(): + """Registers a remote file for download that contains design values + in a format compatible with the dataset builder class; + these files are downloaded all at once in the dataset initialization + + Returns: + + resources: list of RemoteResource + a list of RemoteResource objects specific to this dataset, which + will be automatically downloaded while the dataset is built + and may serve as shards if the dataset is large + + """ + + return [DiskResource( + file, is_absolute=False, + download_target=f"https://design-bench." + f"s3-us-west-1.amazonaws.com/{file}", + download_method="direct") for file in TOY_DISCRETE_FILES] + + @staticmethod + def register_y_shards(): + """Registers a remote file for download that contains prediction + values in a format compatible with the dataset builder class; + these files are downloaded all at once in the dataset initialization + + Returns: + + resources: list of RemoteResource + a list of RemoteResource objects specific to this dataset, which + will be automatically downloaded while the dataset is built + and may serve as shards if the dataset is large + + """ + + return [DiskResource( + file.replace("-x-", "-y-"), is_absolute=False, + download_target=f"https://design-bench." + f"s3-us-west-1.amazonaws.com/" + f"{file.replace('-x-', '-y-')}", + download_method="direct") for file in TOY_DISCRETE_FILES] + + def __init__(self, soft_interpolation=0.6, **kwargs): + """Initialize a model-based optimization dataset and prepare + that dataset by loading that dataset from disk and modifying + its distribution + + Arguments: + + soft_interpolation: float + floating point hyper parameter used when converting design values + from integers to a floating point representation as logits, which + interpolates between a uniform and dirac distribution + 1.0 = dirac, 0.0 -> uniform + **kwargs: dict + additional keyword arguments which are used to parameterize the + data set generation process, including which shard files are used + if multiple sets of data set shard files can be loaded + + """ + + # initialize the dataset using the method in the base class + super(ToyDiscreteDataset, self).__init__( + self.register_x_shards(), + self.register_y_shards(), + is_logits=False, num_classes=4, + soft_interpolation=soft_interpolation, **kwargs) diff --git a/design_bench/oracles/exact/__init__.py b/design_bench/oracles/exact/__init__.py index 13c9343..cf023cf 100644 --- a/design_bench/oracles/exact/__init__.py +++ b/design_bench/oracles/exact/__init__.py @@ -1,6 +1,8 @@ from .hopper_controller_oracle import HopperControllerOracle from .ant_morphology_oracle import AntMorphologyOracle from .dkitty_morphology_oracle import DKittyMorphologyOracle +from .toy_continuous_oracle import ToyContinuousOracle from .nas_bench_oracle import NASBenchOracle from .tf_bind_8_oracle import TFBind8Oracle from .tf_bind_10_oracle import TFBind10Oracle +from .toy_discrete_oracle import ToyDiscreteOracle diff --git a/design_bench/oracles/exact/ant_morphology_oracle.py b/design_bench/oracles/exact/ant_morphology_oracle.py index bfd9dd8..56b68fd 100644 --- a/design_bench/oracles/exact/ant_morphology_oracle.py +++ b/design_bench/oracles/exact/ant_morphology_oracle.py @@ -4,7 +4,7 @@ from morphing_agents.mujoco.ant.elements import LEG_UPPER_BOUND from design_bench.oracles.exact_oracle import ExactOracle from design_bench.datasets.continuous_dataset import ContinuousDataset -from design_bench.datasets.continuous import AntMorphologyDataset +from 
design_bench.datasets.continuous.ant_morphology_dataset import AntMorphologyDataset from design_bench.disk_resource import DiskResource import numpy as np import pickle as pkl diff --git a/design_bench/oracles/exact/dkitty_morphology_oracle.py b/design_bench/oracles/exact/dkitty_morphology_oracle.py index 5081624..1931039 100644 --- a/design_bench/oracles/exact/dkitty_morphology_oracle.py +++ b/design_bench/oracles/exact/dkitty_morphology_oracle.py @@ -4,7 +4,7 @@ from morphing_agents.mujoco.dkitty.elements import LEG_UPPER_BOUND from design_bench.oracles.exact_oracle import ExactOracle from design_bench.datasets.continuous_dataset import ContinuousDataset -from design_bench.datasets.continuous import DKittyMorphologyDataset +from design_bench.datasets.continuous.dkitty_morphology_dataset import DKittyMorphologyDataset from design_bench.disk_resource import DiskResource import numpy as np import pickle as pkl diff --git a/design_bench/oracles/exact/toy_continuous_oracle.py b/design_bench/oracles/exact/toy_continuous_oracle.py new file mode 100644 index 0000000..f5fcf15 --- /dev/null +++ b/design_bench/oracles/exact/toy_continuous_oracle.py @@ -0,0 +1,162 @@ +from design_bench.oracles.exact_oracle import ExactOracle +from design_bench.datasets.continuous_dataset import ContinuousDataset +from design_bench.datasets.continuous.toy_continuous_dataset import ToyContinuousDataset +from design_bench.disk_resource import DiskResource +import numpy as np + + +class ToyContinuousOracle(ExactOracle): + """A class for managing the ground truth score function f(x) + for model-based optimization problems, where the + goal is to find a design 'x' that maximizes a prediction 'y': + + max_x { y = f(x) } + + Public Attributes: + + external_dataset: DatasetBuilder + an instance of a subclass of the DatasetBuilder class which points to + the mutable task dataset for a model-based optimization problem + + internal_dataset: DatasetBuilder + an instance of a subclass of the DatasetBuilder class which has frozen + statistics and is used for training the oracle + + is_batched: bool + a boolean variable that indicates whether the evaluation function + implemented for a particular oracle is batched, which affects + the scaling coefficient of its computational cost + + internal_batch_size: int + an integer representing the number of design values to process + internally at the same time, if None defaults to the entire + tensor given to the self.score method + internal_measurements: int + an integer representing the number of independent measurements of + the prediction made by the oracle, which are subsequently + averaged, and is useful when the oracle is stochastic + + noise_std: float + the standard deviation of gaussian noise added to the prediction + values 'y' coming out of the ground truth score function f(x) + in order to make the optimization problem difficult + + expect_normalized_y: bool + a boolean indicator that specifies whether the outputs of the oracle + score function are expected to be normalized + expect_normalized_x: bool + a boolean indicator that specifies whether the inputs to the oracle + score function are expected to be normalized + expect_logits: bool + a boolean that specifies whether the oracle score function is + expecting logits when the dataset is discrete + + Public Methods: + + predict(np.ndarray) -> np.ndarray: + a function that accepts a batch of design values 'x' as input and for + each design computes a prediction value 'y' which corresponds + to the score in a 
model-based optimization problem + + check_input_format(DatasetBuilder) -> bool: + a function that accepts a dataset builder as input and returns true + when design values 'x' with the shape used by that dataset are + compatible with this class of approximate oracle + + """ + + name = "toy_prediction" + + @classmethod + def supported_datasets(cls): + """An attribute that defines the set of dataset classes which this + oracle can be applied to, forming a valid ground truth score + function for a model-based optimization problem + + """ + + return {ToyContinuousDataset} + + @classmethod + def fully_characterized(cls): + """An attribute that defines whether all possible inputs to the + model-based optimization problem have been evaluated and + are returned via lookup in self.predict + + """ + + return False + + @classmethod + def is_simulated(cls): + """An attribute that defines whether the values returned by the oracle + were obtained by running a computer simulation rather than + performing physical experiments with real data + + """ + + return True + + def protected_predict(self, x): + """Score function to be implemented by oracle subclasses, where x is + either a batch of designs if self.is_batched is True or is a + single design when self._is_batched is False + + Arguments: + + x_batch: np.ndarray + a batch or single design 'x' that will be given as input to the + oracle model in order to obtain a prediction value 'y' for + each 'x' which is then returned + + Returns: + + y_batch: np.ndarray + a batch or single prediction 'y' made by the oracle model, + corresponding to the ground truth score for each design + value 'x' in a model-based optimization problem + + """ + + return np.square(x - + self.optimum).sum(keepdims=True).astype(np.float32) + + def __init__(self, dataset: ContinuousDataset, **kwargs): + """Initialize the ground truth score function f(x) for a model-based + optimization problem, which involves loading the parameters of an + oracle model and estimating its computational cost + + Arguments: + + dataset: ContinuousDataset + an instance of a subclass of the DatasetBuilder class which has + a set of design values 'x' and prediction values 'y', and defines + batching and sampling methods for those attributes + noise_std: float + the standard deviation of gaussian noise added to the prediction + values 'y' coming out of the ground truth score function f(x) + in order to make the optimization problem difficult + internal_measurements: int + an integer representing the number of independent measurements of + the prediction made by the oracle, which are subsequently + averaged, and is useful when the oracle is stochastic + + """ + + # ensure optimum has been downloaded + optimum = "toy_continuous/optimum.npy" + optimum = DiskResource( + optimum, is_absolute=False, download_method="direct", + download_target=f"https://design-bench." 
+ f"s3-us-west-1.amazonaws.com/{optimum}") + if not optimum.is_downloaded and not optimum.download(): + raise ValueError("unable to download optimum for toy example") + + # load optimum used to calculate y values + self.optimum = np.load(optimum.disk_target) + + # initialize the oracle using the super class + super(ToyContinuousOracle, self).__init__( + dataset, internal_batch_size=1, is_batched=False, + expect_normalized_y=False, + expect_normalized_x=False, expect_logits=None, **kwargs) diff --git a/design_bench/oracles/exact/toy_discrete_oracle.py b/design_bench/oracles/exact/toy_discrete_oracle.py new file mode 100644 index 0000000..f83aa56 --- /dev/null +++ b/design_bench/oracles/exact/toy_discrete_oracle.py @@ -0,0 +1,162 @@ +from design_bench.oracles.exact_oracle import ExactOracle +from design_bench.datasets.discrete_dataset import DiscreteDataset +from design_bench.datasets.discrete.toy_discrete_dataset import ToyDiscreteDataset +from design_bench.disk_resource import DiskResource +import numpy as np + + +class ToyDiscreteOracle(ExactOracle): + """An abstract class for managing the ground truth score functions f(x) + for model-based optimization problems, where the + goal is to find a design 'x' that maximizes a prediction 'y': + + max_x { y = f(x) } + + Public Attributes: + + external_dataset: DatasetBuilder + an instance of a subclass of the DatasetBuilder class which points to + the mutable task dataset for a model-based optimization problem + + internal_dataset: DatasetBuilder + an instance of a subclass of the DatasetBuilder class which has frozen + statistics and is used for training the oracle + + is_batched: bool + a boolean variable that indicates whether the evaluation function + implemented for a particular oracle is batched, which effects + the scaling coefficient of its computational cost + + internal_batch_size: int + an integer representing the number of design values to process + internally at the same time, if None defaults to the entire + tensor given to the self.score method + internal_measurements: int + an integer representing the number of independent measurements of + the prediction made by the oracle, which are subsequently + averaged, and is useful when the oracle is stochastic + + noise_std: float + the standard deviation of gaussian noise added to the prediction + values 'y' coming out of the ground truth score function f(x) + in order to make the optimization problem difficult + + expect_normalized_y: bool + a boolean indicator that specifies whether the inputs to the oracle + score function are expected to be normalized + expect_normalized_x: bool + a boolean indicator that specifies whether the outputs of the oracle + score function are expected to be normalized + expect_logits: bool + a boolean that specifies whether the oracle score function is + expecting logits when the dataset is discrete + + Public Methods: + + predict(np.ndarray) -> np.ndarray: + a function that accepts a batch of design values 'x' as input and for + each design computes a prediction value 'y' which corresponds + to the score in a model-based optimization problem + + check_input_format(DatasetBuilder) -> bool: + a function that accepts a list of integers as input and returns true + when design values 'x' with the shape specified by that list are + compatible with this class of approximate oracle + + """ + + name = "toy_prediction" + + @classmethod + def supported_datasets(cls): + """An attribute the defines the set of dataset classes which this + oracle can be applied to 
forming a valid ground truth score + function for a model-based optimization problem + + """ + + return {ToyDiscreteDataset} + + @classmethod + def fully_characterized(cls): + """An attribute the defines whether all possible inputs to the + model-based optimization problem have been evaluated and + are are returned via lookup in self.predict + + """ + + return False + + @classmethod + def is_simulated(cls): + """An attribute the defines whether the values returned by the oracle + were obtained by running a computer simulation rather than + performing physical experiments with real data + + """ + + return True + + def protected_predict(self, x): + """Score function to be implemented by oracle subclasses, where x is + either a batch of designs if self.is_batched is True or is a + single design when self._is_batched is False + + Arguments: + + x_batch: np.ndarray + a batch or single design 'x' that will be given as input to the + oracle model in order to obtain a prediction value 'y' for + each 'x' which is then returned + + Returns: + + y_batch: np.ndarray + a batch or single prediction 'y' made by the oracle model, + corresponding to the ground truth score for each design + value 'x' in a model-based optimization problem + + """ + + return np.square(x.astype(np.float32) - + self.optimum).sum(keepdims=True).astype(np.float32) + + def __init__(self, dataset: DiscreteDataset, **kwargs): + """Initialize the ground truth score function f(x) for a model-based + optimization problem, which involves loading the parameters of an + oracle model and estimating its computational cost + + Arguments: + + dataset: DiscreteDataset + an instance of a subclass of the DatasetBuilder class which has + a set of design values 'x' and prediction values 'y', and defines + batching and sampling methods for those attributes + noise_std: float + the standard deviation of gaussian noise added to the prediction + values 'y' coming out of the ground truth score function f(x) + in order to make the optimization problem difficult + internal_measurements: int + an integer representing the number of independent measurements of + the prediction made by the oracle, which are subsequently + averaged, and is useful when the oracle is stochastic + + """ + + # ensure optimum has been downloaded + optimum = "toy_discrete/optimum.npy" + optimum = DiskResource( + optimum, is_absolute=False, download_method="direct", + download_target=f"https://design-bench." 
+ f"s3-us-west-1.amazonaws.com/{optimum}") + if not optimum.is_downloaded and not optimum.download(): + raise ValueError("unable to download optimum for toy example") + + # load optimum used to calculate y values + self.optimum = np.load(optimum.disk_target) + + # initialize the oracle using the super class + super(ToyDiscreteOracle, self).__init__( + dataset, internal_batch_size=1, is_batched=False, + expect_normalized_y=False, + expect_normalized_x=False, expect_logits=False, **kwargs) diff --git a/process/process_raw_toy_continuous.py b/process/process_raw_toy_continuous.py new file mode 100644 index 0000000..c6e9c03 --- /dev/null +++ b/process/process_raw_toy_continuous.py @@ -0,0 +1,78 @@ +from design_bench.disk_resource import DATA_DIR +from design_bench.disk_resource import google_drive_download +from deepchem.feat.smiles_tokenizer import SmilesTokenizer +import pandas as pd +import numpy as np +import argparse +import glob +import os +import math +import itertools + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser("Process Toy Continuous Dataset") + parser.add_argument("--shard-folder", type=str, default="./") + parser.add_argument("--seq-length", type=int, default=8) + parser.add_argument("--options", type=int, default=4) + parser.add_argument("--samples-per-shard", type=int, default=5000) + args = parser.parse_args() + + optimum = np.random.uniform(0, 1, size=(args.seq_length,)).astype(np.float32) + os.makedirs(os.path.join( + args.shard_folder, f"toy_continuous/"), exist_ok=True) + np.save(os.path.join(args.shard_folder, + "toy_continuous/", "optimum.npy"), optimum) + + xs = [] + ys = [] + files_list = [] + shard_id = 0 + + options = list(range(args.options)) + list_options = [options for i in range(args.seq_length)] + for sample in itertools.product(*list_options): + + x = np.array(sample, dtype=np.int32).astype(np.float32) + x = (x + np.random.uniform(0., 1., size=x.shape)) / args.options + y = np.square(x - optimum).sum(keepdims=True).astype(np.float32) + + xs.append(x) + ys.append(y) + + if len(xs) == args.samples_per_shard: + + np.save(os.path.join( + args.shard_folder, + f"toy_continuous/" + f"toy_continuous-x-{shard_id}.npy"), xs) + + np.save(os.path.join( + args.shard_folder, + f"toy_continuous/" + f"toy_continuous-y-{shard_id}.npy"), ys) + + xs = [] + ys = [] + files_list.append(f"toy_continuous/" + f"toy_continuous-x-{shard_id}.npy") + shard_id += 1 + + if len(xs) > 0: + + np.save(os.path.join( + args.shard_folder, + f"toy_continuous/" + f"toy_continuous-x-{shard_id}.npy"), xs) + + np.save(os.path.join( + args.shard_folder, + f"toy_continuous/" + f"toy_continuous-y-{shard_id}.npy"), ys) + + xs = [] + ys = [] + files_list.append(f"toy_continuous/" + f"toy_continuous-x-{shard_id}.npy") + shard_id += 1 diff --git a/process/process_raw_toy_discrete.py b/process/process_raw_toy_discrete.py new file mode 100644 index 0000000..371d231 --- /dev/null +++ b/process/process_raw_toy_discrete.py @@ -0,0 +1,73 @@ +import numpy as np +import argparse +import os +import itertools + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser("Process Toy Discrete Dataset") + parser.add_argument("--shard-folder", type=str, default="./") + parser.add_argument("--seq-length", type=int, default=8) + parser.add_argument("--options", type=int, default=4) + parser.add_argument("--samples-per-shard", type=int, default=5000) + args = parser.parse_args() + + optimum = np.random.randint(args.options, + size=(args.seq_length,)).astype(np.float32) + 
os.makedirs(os.path.join( + args.shard_folder, f"toy_discrete/"), exist_ok=True) + np.save(os.path.join(args.shard_folder, + "toy_discrete/", "optimum.npy"), optimum) + + xs = [] + ys = [] + files_list = [] + shard_id = 0 + + options = list(range(args.options)) + list_options = [options for i in range(args.seq_length)] + for sample in itertools.product(*list_options): + + x = np.array(sample, dtype=np.int32) + y = np.square(x.astype(np.float32) - + optimum).sum(keepdims=True).astype(np.float32) + + xs.append(x) + ys.append(y) + + if len(xs) == args.samples_per_shard: + + np.save(os.path.join( + args.shard_folder, + f"toy_discrete/" + f"toy_discrete-x-{shard_id}.npy"), xs) + + np.save(os.path.join( + args.shard_folder, + f"toy_discrete/" + f"toy_discrete-y-{shard_id}.npy"), ys) + + xs = [] + ys = [] + files_list.append(f"toy_discrete/" + f"toy_discrete-x-{shard_id}.npy") + shard_id += 1 + + if len(xs) > 0: + + np.save(os.path.join( + args.shard_folder, + f"toy_discrete/" + f"toy_discrete-x-{shard_id}.npy"), xs) + + np.save(os.path.join( + args.shard_folder, + f"toy_discrete/" + f"toy_discrete-y-{shard_id}.npy"), ys) + + xs = [] + ys = [] + files_list.append(f"toy_discrete/" + f"toy_discrete-x-{shard_id}.npy") + shard_id += 1 diff --git a/setup.py b/setup.py index 2884996..422bf65 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ LONG_DESCRIPTION = readme.read() -setup(name='design-bench', version='2.0.7', license='MIT', +setup(name='design-bench', version='2.0.8', license='MIT', packages=find_packages(include=['design_bench', 'design_bench.*']), description='Design-Bench: Benchmarks for ' 'Data-Driven Offline Model-Based Optimization', @@ -17,7 +17,7 @@ author_email='brandon@btrabucco.com', url='https://github.com/brandontrabucco/design-bench', download_url='https://github.com/' - 'brandontrabucco/design-bench/archive/v2_0.tar.gz', + 'brandontrabucco/design-bench/archive/v2_0_8.tar.gz', keywords=['Deep Learning', 'Neural Networks', 'Benchmark', 'Model-Based Optimization'], extras_require={'all': ['gym[mujoco]']},
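For a quick smoke test of the two new toy tasks, the sketch below exercises them through the public task API. This is a minimal example, assuming the standard `design_bench.make` entry point and the `task.x` / `task.predict` attributes, and that the `toy_continuous` / `toy_discrete` shards and `optimum.npy` files are reachable at the S3 bucket referenced above:

```python
import design_bench

# build the two toy tasks registered in design_bench/__init__.py;
# shard files and the optimum are downloaded on first use
for name in ['ToyContinuous-Exact-v0', 'ToyDiscrete-Exact-v0']:
    task = design_bench.make(name)

    # score a small batch of dataset designs with the exact oracle,
    # which returns the squared distance to the hidden optimum
    x = task.x[:16]
    y = task.predict(x)
    print(name, x.shape, y.shape)
```

Because both oracles are registered with `is_batched=False` and `internal_batch_size=1`, `predict` scores designs one at a time; with the default `--seq-length 8` and `--options 4` there are only 4^8 = 65,536 possible designs, so even exhaustive evaluation stays cheap.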