adding two additional fixed-length discrete MBO tasks

brandontrabucco · Mar 25, 2021 · 1702ad5 · 1702ad5
1 parent 92fc874
commit 1702ad5
Show file tree

Hide file tree

Showing 5 changed files with 232 additions and 9 deletions.
diff --git a/README.md b/README.md
@@ -7,6 +7,8 @@ This repository contains several design benchmarks for model-based optimization.
 Current model-based design benchmarks (circa 2020) typically vary from paper to paper. For example, tasks employed by biologists differ strongly from those of interest to roboticists. We provide a common interface for tasks that span a wide range of disciplines, from materials science to reinforcement learning. We list these tasks below.
 
 * __Biology__: Protein Fluorescence: `design_bench.make('GFP-v0')`
+* __Biology__: Transcription Factor Binding Affinity: `design_bench.make('TfBind8-v0')`
+* __Biology__: Gene Expression: `design_bench.make('UTRExpression-v0')`
 * __Chemistry__: Molecule Activity: `design_bench.make('MoleculeActivity-v0')`
 * __Materials Science__: Superconductor Critical Temperature: `design_bench.make('Superconductor-v0')`
 * __Robotics__: Hopper Controller: `design_bench.make('HopperController-v0')`

diff --git a/design_bench/__init__.py b/design_bench/__init__.py
@@ -75,19 +75,32 @@ def save_response(response, destination):
     kwargs=dict(split_percentile=20,
                 ys_noise=0.0))
 
-register(
-    'Superconductor-v0',
-    'design_bench.tasks.superconductor:SuperconductorTask',
-    kwargs=dict(split_percentile=80,
-                ys_noise=0.0))
-
 register(
     'MoleculeActivity-v0',
     'design_bench.tasks.molecule_activity_v0:MoleculeActivityV0Task',
     kwargs=dict(target_assay=600885,
                 split_percentile=80,
                 ys_noise=0.0))
 
+register(
+    'TfBind8-v0',
+    'design_bench.tasks.tfbind8_v0:TfBind8V0Task',
+    kwargs=dict(split_percentile=20,  # keep scores at or below the 20th percentile
+                transcription_factor='SIX6_REF_R1',  # loads SIX6_REF_R1_8mers.txt
+                ys_noise=0.0))  # zero noise scale: training labels stay unperturbed
+
+register(
+    'UTRExpression-v0',
+    'design_bench.tasks.utr_expression_v0:UTRExpressionV0Task',
+    kwargs=dict(split_percentile=20,  # keep scores at or below the 20th percentile
+                ys_noise=0.0))  # zero noise scale: training labels stay unperturbed
+
+register(
+    'Superconductor-v0',
+    'design_bench.tasks.superconductor:SuperconductorTask',
+    kwargs=dict(split_percentile=80,
+                ys_noise=0.0))
+
 try:
 
     import mujoco_py  # test that MuJoCo is installed

diff --git a/design_bench/tasks/tfbind8_v0.py b/design_bench/tasks/tfbind8_v0.py
@@ -0,0 +1,110 @@
+from design_bench import DATA_DIR
+from design_bench import maybe_download
+from design_bench.task import Task
+import pandas as pd
+import numpy as np
+import os
+
+
+def onehottify(x, n=None, dtype=float):
+    """Turn integer ids in x into one-hot rows of width n (max(x) + 1 if n is None)."""
+    ids = np.asarray(x)
+    depth = n if n is not None else np.max(ids) + 1
+    return np.eye(depth, dtype=dtype)[ids]
+
+
+ID_TO_LETTER = ['a', 't', 'c', 'g']  # token id -> letter
+LETTER_TO_ID = {letter: i for i, letter in enumerate(ID_TO_LETTER)}  # inverse map
+
+
+class TfBind8V0Task(Task):
+
+    def score(self, x):
+        # placeholder only: __init__ rebinds self.score to the lookup oracle
+        return NotImplemented
+
+    def __init__(self,
+                 split_percentile=20,
+                 transcription_factor='SIX6_REF_R1',
+                 ys_noise=0.0):
+        """Load a fully experimentally characterized dataset of transcription
+        factor binding affinity for all possible 8-mers.
+        Inspired by: https://github.com/samsinai/FLEXS/blob/
+        41595eb6901eb2b17d30793c457c107cbb8dc488/
+        flexs/landscapes/tf_binding.py
+
+        Args:
+
+        split_percentile: int
+            the percentile (out of 100) to split the data set by and only
+            include samples with score at or below this percentile
+        transcription_factor: str
+            selects the landscape file '<transcription_factor>_8mers.txt'
+        ys_noise: float
+            the number of standard deviations of noise to add to
+            the static training dataset y values accompanying this task
+        """
+
+        maybe_download('1xS6N5qSwyFLC-ZPTADYrxZuPHjBkZCrj',
+                       os.path.join(DATA_DIR, 'TF_binding_landscapes.zip'))
+
+        # load the static dataset
+        tf_dir = os.path.join(DATA_DIR, 'TF_binding_landscapes', 'landscapes')
+        data = pd.read_csv(os.path.join(
+            tf_dir, f'{transcription_factor}_8mers.txt'), sep="\t")
+
+        # each row lists two 8-mers that share a single E-score
+        seq0 = np.char.lower(data["8-mer"].tolist())
+        seq1 = np.char.lower(data["8-mer.1"].tolist())
+
+        # map every letter to its integer token id
+        x0 = np.array([[LETTER_TO_ID[c] for c in s] for s in seq0])
+        x1 = np.array([[LETTER_TO_ID[c] for c in s] for s in seq1])
+
+        # convert the token ids to one-hot representations
+        x = np.concatenate([x0, x1], axis=0)
+        x = onehottify(x, n=4, dtype=np.float32)
+
+        y0 = data["E-score"].to_numpy()  # "E-score" is enrichment score
+        y0 = (y0[:, np.newaxis] - y0.min()) / (y0.max() - y0.min())
+        y = np.concatenate([y0, y0], axis=0).astype(np.float32)
+
+        # oracle table: every 8-mer with its own score, before filtering/noise
+        self.sequences = dict(zip(seq0, y0))
+        self.sequences.update(zip(seq1, y0))
+
+        # keep only the low-scoring designs as the static training set
+        ind = np.where(y <= np.percentile(y[:, 0], split_percentile))[0]
+        x = x[ind]
+        y = y[ind]
+
+        mean_y = np.mean(y, axis=0, keepdims=True)
+        st_y = np.std(y - mean_y, axis=0, keepdims=True)
+        y = y + np.random.normal(0.0, 1.0, y.shape) * st_y * ys_noise
+
+        # expose the designs
+        self.x = x
+        self.y = y
+        self.score = np.vectorize(self.scalar_score,
+                                  signature='(n,4)->(1)')
+
+    def scalar_score(self,
+                     x: np.ndarray) -> np.ndarray:
+        """Calculates a score for a single design x using a ground truth
+        oracle function (the goal of the task is to maximize this)
+
+        Args:
+
+        x: np.ndarray
+            one sampled design, a one-hot matrix with one row per letter,
+            that will be evaluated by an oracle score function
+
+        Returns:
+
+        scores: np.ndarray
+            the float32 score that corresponds to the x value provided
+        """
+
+        # lookup the score of a single 8-mer
+        word = ''.join(ID_TO_LETTER[token] for token in np.argmax(x, axis=1))
+        return np.asarray(self.sequences[word], dtype=np.float32)
diff --git a/design_bench/tasks/utr_expression_v0.py b/design_bench/tasks/utr_expression_v0.py
@@ -0,0 +1,97 @@
+from design_bench import DATA_DIR
+from design_bench import maybe_download
+from design_bench.task import Task
+import pandas as pd
+import numpy as np
+import os
+import keras
+
+
+def onehottify(x, n=None, dtype=float):
+    """One-hot encode the integer ids in x using n classes (max(x) + 1 if n is None)."""
+    indices = np.asarray(x)
+    num_classes = np.max(indices) + 1 if n is None else n
+    return np.eye(num_classes, dtype=dtype)[indices]
+
+
+ID_TO_LETTER = ['a', 'c', 'g', 't']  # token id -> letter
+LETTER_TO_ID = {letter: i for i, letter in enumerate(ID_TO_LETTER)}  # inverse map
+
+
+class UTRExpressionV0Task(Task):
+
+    def __init__(self,
+                 split_percentile=20,
+                 ys_noise=0.0):
+        """Build a task from 5'UTR DNA sequences whose expression levels
+        are assigned by a trained model that is loaded with the data
+        Inspired by: https://github.com/pjsample/human_5utr_modeling
+
+        Args:
+
+        split_percentile: int
+            percentile (out of 100) used as a cut-off; only samples whose
+            score is at or below this percentile are kept
+        ys_noise: float
+            how many standard deviations of noise are added to
+            the y values of the static training dataset
+        """
+
+        archive = os.path.join(DATA_DIR, 'utr.zip')
+        maybe_download('1pRypiGVYl-kmJZaMhVbuA1PEvqauWBBM', archive)
+        utr_dir = os.path.join(DATA_DIR, 'utr')
+
+        # read the table and keep the 280000 rows with the most reads
+        table = pd.read_csv(os.path.join(utr_dir, 'egfp_unmod_1.csv'))
+        table = table.sort_values('total_reads', ascending=False)
+        table = table.reset_index(drop=True).iloc[:280000]
+
+        # lower-case the sequences found in the "utr" column
+        utrs = np.char.lower(table["utr"].tolist())
+
+        # turn letters into integer token ids, then into one-hot vectors
+        tokens = np.array(
+            [[LETTER_TO_ID[c] for c in s.lower()] for s in utrs])
+        designs = onehottify(tokens, n=4, dtype=np.float32)
+
+        # label the data set using the trained model
+        self.model = keras.models.load_model(
+            os.path.join(utr_dir, 'main_MRL_model.hdf5'))
+        scores = self.model.predict(designs)
+
+        # keep the samples at or below the percentile threshold
+        threshold = np.percentile(scores[:, 0], split_percentile)
+        keep = np.where(scores <= threshold)[0]
+        designs = designs[keep]
+        scores = scores[keep]
+
+        # perturb the labels with noise scaled by their spread
+        centered = scores - np.mean(scores, axis=0, keepdims=True)
+        spread = np.std(centered, axis=0, keepdims=True)
+        noise = np.random.normal(0.0, 1.0, scores.shape)
+        scores = scores + noise * spread * ys_noise
+
+        # expose the designs
+        self.x = designs
+        self.y = scores
+
+    def score(self,
+              x: np.ndarray) -> np.ndarray:
+        """Evaluate designs with the oracle, which here is the trained
+        model (the goal of the task is to maximize this score)
+
+        Args:
+
+        x: np.ndarray
+            a batch of sampled designs that will be evaluated by
+            an oracle score function
+
+        Returns:
+
+        scores: np.ndarray
+            a batch of scores that correspond to the x values provided
+            in the function argument
+        """
+
+        # use the trained model to predict the score
+        return self.model.predict(x)
diff --git a/setup.py b/setup.py
@@ -7,7 +7,8 @@
     LONG_DESCRIPTION = readme.read()
 
 
-INSTALL_REQUIRES = ['numpy', 'pandas', 'requests', 'scikit-learn', 'tape_proteins']
+INSTALL_REQUIRES = ['numpy', 'pandas', 'requests', 'scikit-learn',
+                    'tape_proteins', 'keras', 'tensorflow>=2.2']
 EXTRA_REQUIRES = ['gym[mujoco]', 'morphing-agents']
 
 
@@ -31,15 +32,15 @@
 setup(
     name='design-bench',
     packages=find_packages(include=['design_bench', 'design_bench.*']),
-    version='1.4',
+    version='1.5',
     license='MIT',
     description='Design-Bench: Benchmarks for Data-Driven Offline Model-Based Optimization',
     long_description=LONG_DESCRIPTION,
     long_description_content_type='text/markdown',
     author='Brandon Trabucco',
     author_email='brandon@btrabucco.com',
     url='https://github.com/brandontrabucco/design-bench',
-    download_url='https://github.com/brandontrabucco/design-bench/archive/v1_4.tar.gz',
+    download_url='https://github.com/brandontrabucco/design-bench/archive/v1_5.tar.gz',
     keywords=['Offline', 'Benchmark', 'Model-Based Optimization'],
     install_requires=INSTALL_REQUIRES,
     extras_require={'all': EXTRA_REQUIRES},