Skip to content

Commit

Permalink
update design bench GFP oracle
Browse files Browse the repository at this point in the history
  • Loading branch information
brandontrabucco committed Nov 16, 2020
1 parent 7386cf1 commit 148b2ab
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 5 deletions.
6 changes: 5 additions & 1 deletion design_bench/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,11 @@ def save_response(response, destination):

# Register the GFP tasks under their task ids; each call maps an id to a
# 'module:Class' entry point plus the keyword arguments for the task.
# NOTE(review): this is a rendered diff, so the 'GFP-v0' call below lists two
# entry points. 'design_bench.tasks.gfp:GFPTask' appears to be the line the
# commit removes and 'design_bench.tasks.gfp_v0:GFPV0Task' its replacement
# (gfp.py is renamed to gfp_v0.py in the same commit) -- confirm against the
# real file before relying on either.
register(
'GFP-v0',
'design_bench.tasks.gfp:GFPTask',
'design_bench.tasks.gfp_v0:GFPV0Task',
kwargs=dict(split_percentile=100))
# 'GFP-v1' is the task added by this commit (see design_bench/tasks/gfp_v1.py).
register(
'GFP-v1',
'design_bench.tasks.gfp_v1:GFPV1Task',
kwargs=dict(split_percentile=100))

register(
Expand Down
2 changes: 1 addition & 1 deletion design_bench/tasks/gfp.py → design_bench/tasks/gfp_v0.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import os


class GFPTask(Task):
class GFPV0Task(Task):

def __init__(self,
seed=0,
Expand Down
120 changes: 120 additions & 0 deletions design_bench/tasks/gfp_v1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
from design_bench import DATA_DIR
from design_bench import maybe_download
from design_bench.task import Task
from tape import TAPETokenizer
from tape import ProteinBertForValuePrediction
import torch
import pandas as pd
import numpy as np
import os


def onehottify(x, n=None, dtype=float):
    """One-hot encode an array of integer class ids.

    Args:

    x: array-like of int
        class ids of any shape; every id must be a valid row index into
        an n x n identity matrix
    n: int or None
        the number of classes (the one-hot depth); when None it is
        computed from the data as max(x) + 1, or 0 if x is empty
    dtype: numpy dtype
        the dtype of the returned array

    Returns:

    one_hot: np.ndarray
        an array of shape x.shape + (n,) whose last axis is the one-hot
        encoding of the corresponding entry of x
    """
    x = np.asarray(x)

    if x.size == 0:
        # an empty list becomes a float64 array, which numpy refuses to
        # use as an index, and np.max raises on empty input; handle both
        x = x.astype(np.intp)
        n = 0 if n is None else n
    elif n is None:
        n = int(np.max(x)) + 1

    # indexing the identity matrix by x picks one one-hot row per id
    return np.eye(n, dtype=dtype)[x]


class GFPV1Task(Task):
    """GFP design task whose oracle is a pretrained TAPE protein model.

    Designs are exposed as one-hot encoded token sequences in self.x with
    their measured fluorescence in self.y; score() evaluates new designs
    with the pretrained ProteinBertForValuePrediction model.
    """

    def __init__(self,
                 split_percentile=100,
                 internal_batch_size=1,
                 use_cuda=True):
        """Load the GFP data set which includes maps from discrete
        protein designs to fluorescence scores

        Args:

        split_percentile: int
            the percentile (out of 100) to split the data set by and only
            include samples with score at or below this percentile
        internal_batch_size: int
            the number of designs passed through the oracle model in a
            single forward pass when calling score
        use_cuda: bool
            whether the oracle model and its inputs are moved to the GPU
        """

        maybe_download('1_jcPkQ-M1FRhkEONoE57WEbp_Rivkho2',
                       os.path.join(DATA_DIR, 'gfp_data.csv'))
        # NOTE(review): a .zip is downloaded here but the model is loaded
        # from the extracted directory name below; presumably
        # maybe_download unpacks the archive -- confirm
        maybe_download('1R2UaplzHjMaWwsu-bT-kcOAx7GS1HYRu',
                       os.path.join(DATA_DIR,
                                    'gfp_transformer_pretrained.zip'))
        self.batch_size = internal_batch_size
        self.use_cuda = use_cuda

        # load the static dataset
        df = pd.read_csv(os.path.join(DATA_DIR, 'gfp_data.csv'))

        # keep only proteins with fluorescence strictly above the mean
        df = df.loc[df['medianBrightness'] > df['medianBrightness'].mean()]

        # remove all proteins with a stop marker
        df = df.loc[~df['aaSequence'].str.contains('!')]

        # load the tokenizer and pretrained protein model
        self.tokenizer = TAPETokenizer(vocab='iupac')
        self.model = ProteinBertForValuePrediction.from_pretrained(
            os.path.join(DATA_DIR, 'gfp_transformer_pretrained'))
        if self.use_cuda:
            self.model = self.model.cuda()

        # encode the entire dataset using the TAPE tokenizer
        x = np.array([self.tokenizer.encode(seq.upper())
                      for seq in df['aaSequence']])

        # convert the token ids to one-hot representations; 30 is
        # presumably the size of the TAPE 'iupac' vocabulary -- confirm
        x = onehottify(x, n=30, dtype=np.float32)

        # format the fluorescence values as a column vector
        y = df['medianBrightness']\
            .to_numpy().astype(np.float32).reshape([-1, 1])

        # split the remaining proteins with a threshold
        ind = np.where(y <= np.percentile(
            y[:, 0], split_percentile))[0]

        # expose the designs
        self.x = x[ind]
        self.y = y[ind]

    def score(self,
              x: np.ndarray) -> np.ndarray:
        """Calculates a score for the provided tensor x using a ground truth
        oracle function (the goal of the task is to maximize this)

        Args:

        x: np.ndarray
            a batch of sampled designs that will be evaluated by
            an oracle score function; the last axis is reduced with
            argmax to recover token ids

        Returns:

        scores: np.ndarray
            a batch of scores that correspond to the x values provided
            in the function argument, with one row per design
        """

        # np.concatenate raises on an empty list, so answer an empty
        # batch directly (float32 presumably matches the model output)
        if x.shape[0] == 0:
            return np.zeros([0, 1], dtype=np.float32)

        scores = []
        with torch.no_grad():

            # step through x in contiguous chunks of batch_size; the
            # final chunk is simply shorter when the sizes do not divide
            for start in range(0, x.shape[0], self.batch_size):

                # run each batch through the torch model
                x_ids = torch.tensor(np.argmax(
                    x[start:start + self.batch_size], axis=-1))
                if self.use_cuda:
                    x_ids = x_ids.cuda()
                y = self.model(x_ids)[0].cpu()
                scores.append(y.numpy().reshape([-1, 1]))

        # merge together all batches into a single numpy tensor
        return np.concatenate(scores, axis=0)
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
LONG_DESCRIPTION = readme.read()


# Packages required at install time.
# NOTE(review): this is a rendered diff, so INSTALL_REQUIRES is shown twice;
# the first assignment appears to be the line the commit removes and the
# second its replacement, which adds 'tape_proteins' (presumably the
# distribution providing the `tape` module imported by gfp_v1.py -- confirm).
INSTALL_REQUIRES = ['numpy', 'pandas', 'requests', 'scikit-learn']
INSTALL_REQUIRES = ['numpy', 'pandas', 'requests', 'scikit-learn', 'tape_proteins']
# Optional packages, exposed through the 'all' extra in the setup() call.
EXTRA_REQUIRES = ['gym[mujoco]', 'morphing-agents']


Expand All @@ -31,15 +31,15 @@
setup(
name='design-bench',
packages=find_packages(include=['design_bench', 'design_bench.*']),
version='1.1',
version='1.2',
license='MIT',
description='Design-Bench: Benchmarks for Data-Driven Offline Model-Based Optimization',
long_description=LONG_DESCRIPTION,
long_description_content_type='text/markdown',
author='Brandon Trabucco',
author_email='brandon@btrabucco.com',
url='https://github.com/brandontrabucco/design-bench',
download_url='https://github.com/brandontrabucco/design-bench/archive/v1_1.tar.gz',
download_url='https://github.com/brandontrabucco/design-bench/archive/v1_2.tar.gz',
keywords=['Offline', 'Benchmark', 'Model-Based Optimization'],
install_requires=INSTALL_REQUIRES,
extras_require={'all': EXTRA_REQUIRES},
Expand Down

0 comments on commit 148b2ab

Please sign in to comment.