Skip to content

Commit

Permalink
adding two additional fixed-length discrete MBO tasks
Browse files Browse the repository at this point in the history
  • Loading branch information
brandontrabucco committed Mar 25, 2021
1 parent 92fc874 commit 1702ad5
Show file tree
Hide file tree
Showing 5 changed files with 232 additions and 9 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ This repository contains several design benchmarks for model-based optimization.
Current model-based design benchmarks (circa 2020) typically vary from paper to paper. For example, tasks employed by biologists differ strongly from those of interest to roboticists. We provide a common interface for tasks that span a wide range of disciplines, from materials science to reinforcement learning. We list these tasks below.

* __Biology__: Protein Fluorescence: `design_bench.make('GFP-v0')`
* __Biology__: Transcription Factor Binding Affinity: `design_bench.make('TfBind8-v0')`
* __Biology__: Gene Expression: `design_bench.make('UTRExpression-v0')`
* __Chemistry__: Molecule Activity: `design_bench.make('MoleculeActivity-v0')`
* __Materials Science__: Superconductor Critical Temperature: `design_bench.make('Superconductor-v0')`
* __Robotics__: Hopper Controller: `design_bench.make('HopperController-v0')`
Expand Down
25 changes: 19 additions & 6 deletions design_bench/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,19 +75,32 @@ def save_response(response, destination):
kwargs=dict(split_percentile=20,
ys_noise=0.0))

# Task registrations: each call binds a task id to a 'module:ClassName'
# entry point together with the default kwargs for that task.
# NOTE(review): 'Superconductor-v0' is registered both here and at the end of
# this run of calls with identical arguments. This text looks like a rendered
# diff, so one copy is presumably the pre-move position of the call -- confirm
# against the real file and keep only one.
register(
'Superconductor-v0',
'design_bench.tasks.superconductor:SuperconductorTask',
kwargs=dict(split_percentile=80,
ys_noise=0.0))

# presumably target_assay selects which assay's measurements are used --
# the task class is not visible here, verify against molecule_activity_v0
register(
'MoleculeActivity-v0',
'design_bench.tasks.molecule_activity_v0:MoleculeActivityV0Task',
kwargs=dict(target_assay=600885,
split_percentile=80,
ys_noise=0.0))

# transcription factor binding affinity over DNA 8-mers; the kwargs mirror
# the defaults of TfBind8V0Task.__init__
register(
'TfBind8-v0',
'design_bench.tasks.tfbind8_v0:TfBind8V0Task',
kwargs=dict(split_percentile=20,
transcription_factor='SIX6_REF_R1',
ys_noise=0.0))

# 5'UTR gene expression; the kwargs mirror the defaults of
# UTRExpressionV0Task.__init__
register(
'UTRExpression-v0',
'design_bench.tasks.utr_expression_v0:UTRExpressionV0Task',
kwargs=dict(split_percentile=20,
ys_noise=0.0))

# second occurrence of the 'Superconductor-v0' registration (see note above)
register(
'Superconductor-v0',
'design_bench.tasks.superconductor:SuperconductorTask',
kwargs=dict(split_percentile=80,
ys_noise=0.0))

try:

import mujoco_py # test that MuJoCo is installed
Expand Down
110 changes: 110 additions & 0 deletions design_bench/tasks/tfbind8_v0.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
from design_bench import DATA_DIR
from design_bench import maybe_download
from design_bench.task import Task
import pandas as pd
import numpy as np
import os


def onehottify(x, n=None, dtype=float):
    """Return a one-hot encoding of the integer ids in x.

    Args:
        x: array-like of integer class ids, of any shape
        n: the number of classes, which becomes the size of the new
            trailing axis; when None it is taken as max(x) + 1
        dtype: the dtype of the returned array

    Returns:
        an array with the shape of x plus one trailing axis of size n
    """
    indices = np.asarray(x)
    if n is None:
        # infer the number of classes from the largest id present
        n = np.max(indices) + 1
    # each id selects the matching row of an n x n identity matrix
    identity = np.eye(n, dtype=dtype)
    return identity[indices]


# nucleotide vocabulary used by this module: the position of a letter in
# ID_TO_LETTER is its integer token id, and LETTER_TO_ID is the inverse map
ID_TO_LETTER = ['a', 't', 'c', 'g']
LETTER_TO_ID = {letter: token for token, letter in enumerate(ID_TO_LETTER)}


class TfBind8V0Task(Task):
    """Offline optimization task over one-hot encoded DNA 8-mers.

    The static dataset (self.x, self.y) holds only the sequences whose
    normalized enrichment score falls at or below a percentile threshold,
    while the oracle (self.score) looks designs up in the full landscape.
    """

    def score(self, x):
        # class-level placeholder: __init__ shadows this attribute on every
        # instance with a vectorized wrapper around scalar_score
        return NotImplemented

    def __init__(self,
                 split_percentile=20,
                 transcription_factor='SIX6_REF_R1',
                 ys_noise=0.0):
        """Load a fully experimentally characterized dataset of transcription
        factor binding affinity for all possible 8-mers.

        Inspired by: https://github.com/samsinai/FLEXS/blob/
            41595eb6901eb2b17d30793c457c107cbb8dc488/
            flexs/landscapes/tf_binding.py

        Args:
            split_percentile: int
                the percentile (out of 100) to split the data set by and only
                include samples with score at or below this percentile
            transcription_factor: str
                the prefix of the landscape file to load, which is read
                from '<transcription_factor>_8mers.txt'
            ys_noise: float
                the number of standard deviations of noise to add to
                the static training dataset y values accompanying this task
        """

        maybe_download('1xS6N5qSwyFLC-ZPTADYrxZuPHjBkZCrj',
                       os.path.join(DATA_DIR, 'TF_binding_landscapes.zip'))

        # load the static dataset
        tf_dir = os.path.join(
            DATA_DIR, 'TF_binding_landscapes', 'landscapes')
        data = pd.read_csv(os.path.join(
            tf_dir, f'{transcription_factor}_8mers.txt'), sep="\t")

        # the two sequence columns share one score per row; the second
        # column is presumably the reverse complement -- TODO confirm
        seq0 = np.char.lower(data["8-mer"].tolist()).tolist()
        seq1 = np.char.lower(data["8-mer.1"].tolist()).tolist()

        # map every letter of every 8-mer to its integer token id
        x0 = np.array([[LETTER_TO_ID[c] for c in s] for s in seq0])
        x1 = np.array([[LETTER_TO_ID[c] for c in s] for s in seq1])

        # convert the token ids to one-hot representations
        x = np.concatenate([x0, x1], axis=0)
        x = onehottify(x, n=4, dtype=np.float32)

        # "E-score" is the enrichment score, min-max normalized to [0, 1]
        y0 = data["E-score"].to_numpy()
        y0 = (y0[:, np.newaxis] - y0.min()) / (y0.max() - y0.min())
        y0 = y0.astype(np.float32)
        y = np.concatenate([y0, y0], axis=0)

        # build the oracle lookup table from the FULL, noise-free landscape:
        # it has to be created before the percentile split below, because
        # zipping the sequences with the filtered y would pair them with the
        # wrong scores and silently drop most of the 8-mers
        self.sequences = dict(zip(seq0, y0))
        self.sequences.update(zip(seq1, y0))

        # keep only the sequences scoring at or below the percentile
        ind = np.where(y <= np.percentile(y[:, 0], split_percentile))[0]
        x = x[ind]
        y = y[ind]

        # optionally perturb the training labels (never the oracle) with
        # gaussian noise scaled by the standard deviation of the labels
        mean_y = np.mean(y, axis=0, keepdims=True)
        st_y = np.std(y - mean_y, axis=0, keepdims=True)
        y = y + np.random.normal(0.0, 1.0, y.shape) * st_y * ys_noise

        # expose the designs
        self.x = x
        self.y = y

        # apply scalar_score to every design of an arbitrarily batched input
        self.score = np.vectorize(self.scalar_score,
                                  signature='(n,4)->(1)')

    def scalar_score(self,
                     x: np.ndarray) -> np.ndarray:
        """Calculates a score for a single design x using a ground truth
        oracle function (the goal of the task is to maximize this)

        Args:
            x: np.ndarray
                a single one-hot (or per-position score) encoded sequence
                with one row per position and one column per letter

        Returns:
            score: np.ndarray
                the float32 score stored for the decoded sequence

        Raises:
            KeyError: if the decoded sequence is not in the lookup table
        """

        # decode the most likely letter at every position, then look up
        # the score of the resulting 8-mer
        word = ''.join(map(lambda token:
                           ID_TO_LETTER[token], np.argmax(x, axis=1)))
        return np.asarray(self.sequences[word], dtype=np.float32)
97 changes: 97 additions & 0 deletions design_bench/tasks/utr_expression_v0.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from design_bench import DATA_DIR
from design_bench import maybe_download
from design_bench.task import Task
import pandas as pd
import numpy as np
import os
import keras


def onehottify(x, n=None, dtype=float):
    """One-hot encode integer ids along a new trailing axis.

    Args:
        x: array-like of integer class ids, of any shape
        n: the number of classes (the length of every one-hot vector);
            computed from the data as max(x) + 1 when None
        dtype: the dtype of the returned array

    Returns:
        an array with the shape of x plus one trailing axis of size n
    """
    ids = np.asarray(x)
    num_classes = n if n is not None else np.max(ids) + 1
    # indexing the identity matrix by id yields one one-hot row per id
    return np.eye(num_classes, dtype=dtype)[ids]


# nucleotide vocabulary used by this module: LETTER_TO_ID maps a letter to
# its integer token id and ID_TO_LETTER maps a token id back to its letter
LETTER_TO_ID = {'a': 0, 'c': 1, 'g': 2, 't': 3}
ID_TO_LETTER = list('acgt')


class UTRExpressionV0Task(Task):

    def __init__(self,
                 split_percentile=20,
                 ys_noise=0.0):
        """Build a dataset of 5'UTR DNA sequences labelled with the
        expression level predicted for them by a pre-trained keras model

        Inspired by: https://github.com/pjsample/human_5utr_modeling

        Args:
            split_percentile: int
                the percentile (out of 100) to split the data set by and only
                include samples with score at or below this percentile
            ys_noise: float
                the number of standard deviations of noise to add to
                the static training dataset y values accompanying this task
        """

        archive_path = os.path.join(DATA_DIR, 'utr.zip')
        maybe_download('1pRypiGVYl-kmJZaMhVbuA1PEvqauWBBM', archive_path)
        utr_dir = os.path.join(DATA_DIR, 'utr')

        # read the table and keep the 280000 rows with the most reads
        table = pd.read_csv(os.path.join(utr_dir, 'egfp_unmod_1.csv'))
        table = table.sort_values('total_reads', ascending=False)
        table = table.reset_index(drop=True)
        table = table.iloc[:280000]

        # lower-case the UTR sequences and map each letter to a token id
        sequences = np.char.lower(table["utr"].tolist())
        tokens = np.array([[LETTER_TO_ID[letter] for letter in s.lower()]
                           for s in sequences])

        # convert the token ids to one-hot representations
        designs = onehottify(tokens, n=4, dtype=np.float32)

        # the trained model both labels the data set and serves as the oracle
        model_path = os.path.join(utr_dir, 'main_MRL_model.hdf5')
        self.model = keras.models.load_model(model_path)
        labels = self.model.predict(designs)

        # keep only the sequences scoring at or below the percentile
        threshold = np.percentile(labels[:, 0], split_percentile)
        keep = np.where(labels <= threshold)[0]
        designs = designs[keep]
        labels = labels[keep]

        # optionally perturb the labels with gaussian noise scaled by the
        # standard deviation of the retained labels
        center = np.mean(labels, axis=0, keepdims=True)
        spread = np.std(labels - center, axis=0, keepdims=True)
        noise = np.random.normal(0.0, 1.0, labels.shape)
        labels = labels + noise * spread * ys_noise

        # expose the designs
        self.x = designs
        self.y = labels

    def score(self,
              x: np.ndarray) -> np.ndarray:
        """Evaluates the designs in x with the ground truth oracle, which
        is the trained model (the goal of the task is to maximize this)

        Args:
            x: np.ndarray
                a batch of sampled designs that will be evaluated by
                an oracle score function

        Returns:
            scores: np.ndarray
                a batch of scores that correspond to the x values provided
                in the function argument
        """

        # the oracle is the same model that labelled the static data set
        predictions = self.model.predict(x)
        return predictions
7 changes: 4 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
LONG_DESCRIPTION = readme.read()


# NOTE(review): this first assignment is overwritten by the one right below
# it; the text looks like a rendered diff, so it is presumably the pre-change
# line shown next to its replacement -- confirm against the real setup.py.
INSTALL_REQUIRES = ['numpy', 'pandas', 'requests', 'scikit-learn', 'tape_proteins']
# effective requirement list: the same packages plus 'keras' and
# 'tensorflow>=2.2' (keras is imported by the UTR expression task module)
INSTALL_REQUIRES = ['numpy', 'pandas', 'requests', 'scikit-learn',
'tape_proteins', 'keras', 'tensorflow>=2.2']
# optional packages, exposed through the 'all' extra in the setup() call
EXTRA_REQUIRES = ['gym[mujoco]', 'morphing-agents']


Expand All @@ -31,15 +32,15 @@
setup(
name='design-bench',
packages=find_packages(include=['design_bench', 'design_bench.*']),
version='1.4',
version='1.5',
license='MIT',
description='Design-Bench: Benchmarks for Data-Driven Offline Model-Based Optimization',
long_description=LONG_DESCRIPTION,
long_description_content_type='text/markdown',
author='Brandon Trabucco',
author_email='brandon@btrabucco.com',
url='https://github.com/brandontrabucco/design-bench',
download_url='https://github.com/brandontrabucco/design-bench/archive/v1_4.tar.gz',
download_url='https://github.com/brandontrabucco/design-bench/archive/v1_5.tar.gz',
keywords=['Offline', 'Benchmark', 'Model-Based Optimization'],
install_requires=INSTALL_REQUIRES,
extras_require={'all': EXTRA_REQUIRES},
Expand Down

0 comments on commit 1702ad5

Please sign in to comment.