Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Set random seeds #101

Merged
merged 14 commits into from
Jan 8, 2020
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# songbird changelog

## Version 1.0.2-dev
Added the ability to set a random seed for the CLI and set fixed random seeds for qiime2 [#101](https://github.com/biocore/songbird/pull/101)

Correcting matching between metadata and biom table and clarifying the min-feature-count parameter [#99](https://github.com/biocore/songbird/pull/99)

Added Tensorboard's HParams functionality to standalone [#95](https://github.com/biocore/songbird/pull/95)
Expand Down
20 changes: 19 additions & 1 deletion scripts/songbird
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,13 @@ def songbird():
show_default=True,
help=DESCS["summary-dir"],
)
@click.option(
"--random-seed",
default=DEFAULTS["random-seed"],
show_default=True,
help=DESCS["random-seed"],
type=int,
)
def multinomial(
input_biom,
metadata_file,
Expand All @@ -115,6 +122,7 @@ def multinomial(
checkpoint_interval,
summary_interval,
summary_dir,
random_seed,
):
# load metadata and tables
metadata = read_metadata(metadata_file)
Expand All @@ -139,16 +147,22 @@ def multinomial(
'min_sample_count': min_sample_count,
'min_feature_count': min_feature_count,
}
if random_seed is not None:
hparams.update({
'random_seed': random_seed,
})

# split up training and testing
trainX, testX, trainY, testY = split_training(
dense_table,
metadata,
design,
training_column,
num_random_test_examples,
seed=random_seed,
)

# split up training and testing
# initialize and train the model
model = MultRegression(
learning_rate=learning_rate,
clipnorm=clipnorm,
Expand All @@ -157,6 +171,10 @@ def multinomial(
save_path=summary_dir,
)
with tf.Graph().as_default(), tf.Session() as session:
# set the tf random seed
if random_seed is not None:
tf.set_random_seed(random_seed)

model(session, trainX, trainY, testX, testY)

model.fit(
Expand Down
46 changes: 45 additions & 1 deletion scripts/test_songbird_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def setUp(self) -> None:
def tearDown(self) -> None:
shutil.rmtree(self.path)

def test_cli(self):
def test_cli_no_seed_set(self):
runner = CliRunner()
test_args = ['--input-biom', 'data/redsea/redsea.biom',
'--metadata-file', 'data/redsea/redsea_metadata.txt',
Expand All @@ -42,6 +42,50 @@ def test_cli(self):
error = Exception('Command failed with non-zero exit code')
raise error .with_traceback(ex.__traceback__)

def test_cli_set_set_seed_int(self):
    """Run the ``multinomial`` command with an explicit integer seed.

    Invokes the CLI on the Red Sea example data with ``--random-seed 42``
    and asserts that the command exits with status 0. If it does not, an
    exception is raised that carries the traceback of the exception
    captured by the click test runner, so the real failure is visible.
    """
    runner = CliRunner()
    # Command-line tokens are strings; the option's ``type=int`` performs
    # the conversion, so the seed is passed as '42' rather than the int 42.
    test_args = ['--input-biom', 'data/redsea/redsea.biom',
                 '--metadata-file', 'data/redsea/redsea_metadata.txt',
                 '--formula',
                 'Depth+Temperature+Salinity+Oxygen+Fluorescence'
                 '+Nitrate',
                 '--epochs', '100',
                 '--differential-prior', '0.5',
                 '--summary-interval', '1',
                 '--summary-dir', self.path,
                 '--random-seed', '42',
                 ]

    result = runner.invoke(songbird.multinomial, test_args)
    try:
        self.assertEqual(0, result.exit_code)
    except AssertionError:
        # Surface the traceback of the exception raised inside the command
        # instead of the uninformative assertion failure.
        ex = result.exception
        error = Exception('Command failed with non-zero exit code')
        raise error.with_traceback(ex.__traceback__)

def test_cli_set_random_seed_None(self):
    """Run the ``multinomial`` command with ``None`` given as the seed.

    Invokes the CLI on the Red Sea example data and asserts that the
    command exits with status 0; on failure, re-raises with the traceback
    of the exception captured by the click test runner.
    """
    # NOTE(review): ``None`` is not a string argv token. Presumably click
    # treats a None value as "not provided" and falls back to the option
    # default -- confirm this really exercises the unseeded code path.
    formula = ('Depth+Temperature+Salinity+Oxygen+Fluorescence'
               '+Nitrate')
    cli_args = [
        '--input-biom', 'data/redsea/redsea.biom',
        '--metadata-file', 'data/redsea/redsea_metadata.txt',
        '--formula', formula,
        '--epochs', '100',
        '--differential-prior', '0.5',
        '--summary-interval', '1',
        '--summary-dir', self.path,
        '--random-seed', None,
    ]

    outcome = CliRunner().invoke(songbird.multinomial, cli_args)
    try:
        self.assertEqual(0, outcome.exit_code)
    except AssertionError:
        # Report the command's own failure rather than the bare assertion.
        cause = outcome.exception
        failure = Exception('Command failed with non-zero exit code')
        raise failure.with_traceback(cause.__traceback__)


if __name__ == '__main__':
unittest.main()
5 changes: 5 additions & 0 deletions songbird/parameter_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@
'to summaries that can be loaded into Tensorboard and '
'checkpoints for recovering parameters during runtime.'
),
"random-seed": (
'The number to used to receive consistent results for the random '
'processes in the fitting procedure.'
),
}

DEFAULTS = {
Expand All @@ -60,4 +64,5 @@
"checkpoint-interval": 3600,
"summary-interval": 10,
"summary-dir": "summarydir",
"random-seed": 0,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"random-seed": 0,
"random-seed": None,

I'm still leaning towards having the default be None; otherwise the option to not specify a random seed will no longer exist. For example, if you have multiple random runs, you can do things like take averages or take the best fit among them. If this option is not available, the user will have no way to leave the random seed unspecified.

Copy link
Member Author

@gwarmstrong gwarmstrong Jan 8, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! And I appreciate the feedback on the PR.

I think it is important to mandate a seed for Q2. For software whose mission is reproducibility, you should not be able to create artifacts that you cannot reproduce. If someone were to give me an artifact created by songbird with the seed set to None, I’d never be able to reproduce it exactly.

I’m also not sure why you can’t achieve your use case by setting the random seed differently, multiple times? This use case is actually the reason I had two different seeds originally, so you could keep the same train/test split, but change the model fitting. Though this behavior could be achieved by this PR by setting a training column and then varying the seed.

Also, as you mentioned, setting a None seed is still available via the CLI.

}
4 changes: 3 additions & 1 deletion songbird/q2/_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,16 @@ def multinomial(table: biom.Table,
# split up training and testing
trainX, testX, trainY, testY = split_training(
dense_table, metadata, design,
training_column, num_random_test_examples
training_column, num_random_test_examples,
seed=0,
)

model = MultRegression(learning_rate=learning_rate, clipnorm=clipnorm,
beta_mean=differential_prior,
batch_size=batch_size,
save_path=None)
with tf.Graph().as_default(), tf.Session() as session:
tf.set_random_seed(0)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
tf.set_random_seed(0)
tf.set_random_seed(seed)

this won't work - the seed will always be set to zero otherwise.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably will want to have the seed=None option here as well.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the QIIME plugin, I thought it might be better to always make the seed 0, so as to avoid exposing a seed argument to the QIIME user. Given this assumption, the code is fine as it stands (tf.set_random_seed(seed) would actually throw a NameError). The thought is that this could cut down on instances of comparing a model run with many different seeds to a baseline that was run only with the default seed, or something like that. It basically enforces the "right" behavior.

If you want me to expose the seed to the user via QIIME, that is fine too, and I can make the corresponding change.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't want the seed to be always set to zero in the qiime2 side. Thanks.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay. I will rewrite so that default seed for QIIME is 0, but can be set via a parameter.

I'm not sure I understand where seed=None factors into this interface. I think we want the default behavior of the QIIME plugin to be that the results of two commands on the same data are directly comparable. Then you can change the seed if you need to. That shouldn't require a seed=None anywhere for this, should it?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We'll still want to have an option to not specify a seed, namely

if seed is not None:
    tf.set_random_seed(seed)

model(session, trainX, trainY, testX, testY)

loss, cv, its = model.fit(
Expand Down
24 changes: 24 additions & 0 deletions songbird/q2/tests/test_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,30 @@ def test_fit(self):
npt.assert_allclose(exp_beta, res_beta.T, atol=0.6, rtol=0.6)
self.assertGreater(len(res_stats.to_dataframe().index), 1)

def test_fit_consistency(self):
    """Check that two identical ``multinomial`` runs give equal results.

    Calls ``multinomial`` twice with the same table, metadata and
    hyperparameters, then asserts exact equality of the differentials,
    the final row of the regression stats, and the biplot's eigenvalues,
    sample coordinates and feature coordinates.
    """
    raw_md = self.md
    raw_md.name = 'sampleid'
    metadata = qiime2.Metadata(raw_md)

    # Both fits receive exactly the same keyword arguments.
    fit_kwargs = dict(
        table=self.table, metadata=metadata,
        min_sample_count=0, min_feature_count=0,
        formula="X", epochs=1000,
    )
    beta_a, stats_a, biplot_a = multinomial(**fit_kwargs)
    beta_b, stats_b, biplot_b = multinomial(**fit_kwargs)

    npt.assert_array_equal(beta_a, beta_b)

    # Only the last recorded stats row of each run is compared.
    last_row_a = stats_a.to_dataframe().iloc[-1]
    last_row_b = stats_b.to_dataframe().iloc[-1]
    npt.assert_array_equal(last_row_a, last_row_b)

    for attr in ('eigvals', 'samples', 'features'):
        npt.assert_array_equal(getattr(biplot_a, attr),
                               getattr(biplot_b, attr))


if __name__ == "__main__":
unittest.main()
3 changes: 2 additions & 1 deletion songbird/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,9 +176,10 @@ def design_filter(val, id_, md):


def split_training(dense_table, metadata, design, training_column=None,
num_random_test_examples=10):
num_random_test_examples=10, seed=None):
mortonjt marked this conversation as resolved.
Show resolved Hide resolved

if training_column is None:
np.random.seed(seed)
idx = np.random.random(design.shape[0])
i = np.argsort(idx)[num_random_test_examples]

Expand Down