Skip to content

Commit

Permalink
ENH: Remove empty features and add tests (#171)
Browse files Browse the repository at this point in the history
This has apparently thrown off the Byrd test, since it looks like
there were a bunch of empty features in that dataset. Need to update
the testing utilities to allow for empty features to be removed.

It also looks like the JSONs for the q2-moving-pictures and sleep apnea
integration tests are different -- need to figure out why, and whether
that's due to a bug or just due to something else. Once that's done, this
issue will be done.
  • Loading branch information
fedarko committed Jul 5, 2019
1 parent 841c586 commit 1ab7925
Show file tree
Hide file tree
Showing 4 changed files with 171 additions and 97 deletions.
67 changes: 51 additions & 16 deletions qurro/_df_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,41 +135,76 @@ def biom_table_to_sparse_df(table, min_row_ct=2, min_col_ct=1):
return table_sdf


def remove_empty_samples_and_features(
    table_sdf, sample_metadata_df, feature_ranks_df
):
    """Removes empty samples and features from the table, sample metadata, and
    feature ranks DataFrames.

    A sample (a column of the table) or a feature (a row of the table) is
    considered "empty" if every one of its cells in the table is equal to 0.

    This should be called *after* matching the table with the sample
    metadata and feature ranks -- we assume that the columns of the
    table DataFrame are equivalent to the indices of the sample metadata
    DataFrame, and that the indices (rows) of the table are also equivalent
    to the indices of the feature ranks DataFrame.

    Parameters
    ----------
    table_sdf: DataFrame
        Count table with features as rows and samples as columns.
    sample_metadata_df: DataFrame
        Sample metadata, indexed by sample ID.
    feature_ranks_df: DataFrame
        Feature ranks, indexed by feature ID.

    Returns
    -------
    (filtered_table, filtered_metadata, filtered_ranks)
        The table without its empty samples and features, the sample
        metadata restricted to the remaining samples, and the feature ranks
        restricted to the remaining features.

    Raises
    ------
    ValueError
        If the input table is empty (i.e. all samples/features would be
        removed).
    """
    logging.debug("Attempting to remove empty samples and features.")

    # Convert each cell in the table to a bool (True if != 0, False if == 0).
    # A sample (column) or feature (row) is non-empty if it has at least one
    # True value. Dropping all-zero columns can never change whether a row
    # contains a nonzero value, so both masks can be computed from the
    # original table in a single pass.
    neq_zero = table_sdf != 0
    sample_is_nonempty = neq_zero.any(axis=0)
    feature_is_nonempty = neq_zero.any(axis=1)

    # If the table only contains zeros, then attempting to drop all empty
    # samples and/or features would result in a 0x0 DataFrame. Therefore, we
    # just raise a ValueError in this case.
    if not sample_is_nonempty.any():
        raise ValueError("The table is empty.")

    nonempty_samples = list(table_sdf.columns[sample_is_nonempty.values])
    nonempty_features = list(table_sdf.index[feature_is_nonempty.values])

    # Filter out empty samples (columns), then empty features (rows).
    filtered_table = table_sdf.filter(
        items=nonempty_samples, axis="columns"
    ).filter(items=nonempty_features, axis="index")
    filtered_metadata = sample_metadata_df.filter(
        items=nonempty_samples, axis="index"
    )
    filtered_ranks = feature_ranks_df.filter(
        items=nonempty_features, axis="index"
    )

    # Let user know about which samples/features may have been dropped, if any.
    sample_diff = len(table_sdf.columns) - len(filtered_table.columns)
    if sample_diff > 0:
        logging.debug("Removed {} empty sample(s).".format(sample_diff))
    else:
        logging.debug("Couldn't find any empty samples.")

    feature_diff = len(table_sdf.index) - len(filtered_table.index)
    if feature_diff > 0:
        logging.debug("Removed {} empty feature(s).".format(feature_diff))
    else:
        logging.debug("Couldn't find any empty features.")

    return filtered_table, filtered_metadata, filtered_ranks


def match_table_and_data(table, feature_ranks, sample_metadata):
Expand Down
26 changes: 13 additions & 13 deletions qurro/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
replace_nan,
validate_df,
biom_table_to_sparse_df,
remove_empty_samples,
remove_empty_samples_and_features,
match_table_and_data,
merge_feature_metadata,
)
Expand Down Expand Up @@ -80,8 +80,8 @@ def process_input(
5. Calls filter_unextreme_features() using the provided
extreme_feature_count. (If it's None, then nothing will be done.)
6. Calls remove_empty_samples() to filter samples without any counts for
any features. This is purposefully done *after*
6. Calls remove_empty_samples_and_features() to filter empty samples
(and features). This is purposefully done *after*
filter_unextreme_features() is called.
7. Calls merge_feature_metadata() on the feature ranks and feature
Expand All @@ -93,15 +93,15 @@ def process_input(
Sample metadata, but matched with the table and with empty samples
removed.
filtered_ranks: pd.DataFrame
output_ranks: pd.DataFrame
Feature ranks, post-filtering and with feature metadata columns
added in.
ranking_ids
The ranking columns' names in filtered_ranks.
The ranking columns' names in output_ranks.
feature_metadata_cols: list
The feature metadata columns' names in filtered_ranks.
The feature metadata columns' names in output_ranks.
output_table: pd.SparseDataFrame
The BIOM table, post matching with the feature ranks and sample
Expand Down Expand Up @@ -144,23 +144,23 @@ def process_input(
m_table, feature_ranks, extreme_feature_count
)

# Filter now-empty samples from the BIOM table.
output_table, output_metadata = remove_empty_samples(
filtered_table, m_sample_metadata
# Filter now-empty samples (and empty features) from the BIOM table.
output_table, output_metadata, u_ranks = remove_empty_samples_and_features(
filtered_table, m_sample_metadata, filtered_ranks
)

# Save a list of ranking IDs (before we add in feature metadata)
# TODO: just have merge_feature_metadata() give us this?
ranking_ids = filtered_ranks.columns
ranking_ids = u_ranks.columns

filtered_ranks, feature_metadata_cols = merge_feature_metadata(
filtered_ranks, feature_metadata
output_ranks, feature_metadata_cols = merge_feature_metadata(
u_ranks, feature_metadata
)

logging.debug("Finished input processing.")
return (
output_metadata,
filtered_ranks,
output_ranks,
ranking_ids,
feature_metadata_cols,
output_table,
Expand Down
122 changes: 107 additions & 15 deletions qurro/tests/test_df_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
ensure_df_headers_unique,
validate_df,
replace_nan,
remove_empty_samples,
remove_empty_samples_and_features,
merge_feature_metadata,
)

Expand Down Expand Up @@ -170,9 +170,10 @@ def test_replace_nan():


def get_test_data():
"""Returns a test table and metadata DataFrame.
"""Returns test table, metadata, and ranks DataFrames.
Mostly copied from get_test_data() in test_filter_unextreme_features.
Mostly based on/copied from get_test_data() in
test_filter_unextreme_features.
"""
feature_ids = ["F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8"]
table = DataFrame(
Expand All @@ -193,27 +194,39 @@ def get_test_data():
},
index=list(table.columns)[:],
)
return table, metadata
ranks = DataFrame(
{
"Rank 0": [1, 2, 3, 4, 5, 6, 7, 8],
"Rank 1": [8, 7, 6, 5, 4, 3, 2, 1],
},
index=list(table.index)[:],
)
return table, metadata, ranks


def test_remove_empty_samples_basic():
"""Tests remove_empty_samples() in the simple cases of removing 0, 1, and 2
empty sample(s).
def test_remove_empty_samples_and_features_samples():
"""Tests remove_empty_samples_and_features() in the simple cases of
removing 0, 1, and 2 empty sample(s).
"""

# TRY REMOVING 0 SAMPLES
table, metadata = get_test_data()
table, metadata, ranks = get_test_data()
# Check that, when none of the samples are empty, nothing is changed.
ftable, fmetadata = remove_empty_samples(table, metadata)
ftable, fmetadata, franks = remove_empty_samples_and_features(
table, metadata, ranks
)
assert_frame_equal(ftable, table)
assert_frame_equal(fmetadata, metadata)
assert_frame_equal(franks, ranks)

# TRY REMOVING 1 SAMPLE
# Zero out Sample3 (it only has one count, for F1)
table["Sample3"]["F1"] = 0
# Check that just the one empty sample (Sample3) was removed, from both the
# table and the sample metadata.
ftable, fmetadata = remove_empty_samples(table, metadata)
ftable, fmetadata, franks = remove_empty_samples_and_features(
table, metadata, ranks
)
assert_series_equal(ftable["Sample1"], table["Sample1"])
assert_series_equal(ftable["Sample2"], table["Sample2"])
assert_series_equal(ftable["Sample4"], table["Sample4"])
Expand All @@ -228,10 +241,14 @@ def test_remove_empty_samples_basic():
assert len(fmetadata.index) == 3
assert len(fmetadata.columns) == len(metadata.columns) == 4

assert_frame_equal(franks, ranks)

# TRY REMOVING 2 SAMPLES
# Now, zero out Sample4 (it only has one count in F4)
table["Sample4"]["F4"] = 0
ftable, fmetadata = remove_empty_samples(table, metadata)
ftable, fmetadata, franks = remove_empty_samples_and_features(
table, metadata, ranks
)
assert_series_equal(ftable["Sample1"], table["Sample1"])
assert_series_equal(ftable["Sample2"], table["Sample2"])
assert "Sample3" not in ftable.columns
Expand All @@ -245,17 +262,92 @@ def test_remove_empty_samples_basic():
assert "Sample4" not in fmetadata.index
assert len(fmetadata.index) == 2

assert_frame_equal(franks, ranks)


def test_remove_empty_samples_and_features_features():
    """Checks that remove_empty_samples_and_features() drops empty features.

    First a single feature (F8) is zeroed out, and then a second one (F6);
    after each step the sample metadata should be untouched, while the
    zeroed-out feature(s) should be gone from both the table and the ranks.
    """
    table, metadata, ranks = get_test_data()

    # Step 1: make F8 empty.
    table.loc["F8"] = 0
    ftable, fmetadata, franks = remove_empty_samples_and_features(
        table, metadata, ranks
    )
    # No samples were emptied, so the metadata should be unchanged.
    assert_frame_equal(fmetadata, metadata)
    # F8 is the last of the eight features, so the first seven rows of the
    # table and of the ranks should be all that remains.
    for filtered, original in ((ftable, table), (franks, ranks)):
        assert_frame_equal(filtered.iloc[:7], original.iloc[:7])
        assert "F8" not in filtered.index
        assert len(filtered.index) == 7

    # Step 2: make F6 empty as well.
    table.loc["F6"] = 0
    ftable, fmetadata, franks = remove_empty_samples_and_features(
        table, metadata, ranks
    )
    assert_frame_equal(fmetadata, metadata)
    # F1 through F5 (the first five rows) and F7 should be the same as before.
    for filtered, original in ((ftable, table), (franks, ranks)):
        assert_frame_equal(filtered.iloc[:5], original.iloc[:5])
        assert_series_equal(filtered.loc["F7"], original.loc["F7"])
        assert "F6" not in filtered.index
        assert len(filtered.index) == 6


def test_remove_empty_samples_and_features_both():
    """Tests remove_empty_samples_and_features() when both samples and features
    are empty.
    """

    table, metadata, ranks = get_test_data()
    # Zero out F8 and F7
    table.loc["F8"] = 0
    table.loc["F7"] = 0
    # Zero out Sample2 and Sample4
    table["Sample2"] = 0
    table["Sample4"] = 0
    ftable, fmetadata, franks = remove_empty_samples_and_features(
        table, metadata, ranks
    )

    for empty_feature in ("F8", "F7"):
        assert empty_feature not in ftable.index
        assert empty_feature not in franks.index
    for empty_sample in ("Sample2", "Sample4"):
        assert empty_sample not in ftable.columns
        assert empty_sample not in fmetadata.index

    # Use a list (not a set) to select the remaining samples: pandas
    # deprecated set indexers in 1.5 and raises a TypeError for them in 2.0+.
    # check_like=True means the order of the rows/columns is ignored anyway.
    remaining_samples = ["Sample1", "Sample3"]
    assert_frame_equal(
        ftable, table[remaining_samples].iloc[0:6], check_like=True
    )
    assert_frame_equal(
        fmetadata, metadata.loc[remaining_samples], check_like=True
    )
    # F7 and F8 are the last two features, so the first six rows of the ranks
    # should be all that remains.
    assert_frame_equal(franks, ranks.iloc[0:6])


def test_remove_empty_samples_and_features_allempty():
    """Tests remove_empty_samples_and_features() on an empty table."""

    table, metadata, ranks = get_test_data()
    # Zero out every sample, so that the table contains nothing but zeros.
    for sample in ("Sample1", "Sample2", "Sample3", "Sample4"):
        table[sample] = np.zeros(len(table.index))
    # Pin the message so that an unrelated ValueError can't make this test
    # pass by accident.
    with pytest.raises(ValueError, match="The table is empty."):
        remove_empty_samples_and_features(table, metadata, ranks)


def test_merge_feature_metadata():
Expand Down
Loading

0 comments on commit 1ab7925

Please sign in to comment.