ENH: Remove empty features and add tests (#171)

This has apparently thrown off the Byrd test, since it looks like there were a bunch of empty features in that dataset. We need to update the testing utilities to allow for empty features to be removed. It also looks like the JSONs for the q2-moving-pictures and sleep apnea integration tests are different -- we need to figure out why, and whether that's due to a bug or just due to something else. Once that's done, this issue will be done.
biocore · Jul 5, 2019 · 1ab7925 · 1ab7925
1 parent 841c586
commit 1ab7925
Show file tree

Hide file tree

Showing 4 changed files with 171 additions and 97 deletions.
diff --git a/qurro/_df_utils.py b/qurro/_df_utils.py
@@ -135,41 +135,76 @@ def biom_table_to_sparse_df(table, min_row_ct=2, min_col_ct=1):
     return table_sdf
 
 
-def remove_empty_samples(table_sdf, sample_metadata_df):
-    """Removes samples with 0 counts for every feature from the table and
-       sample metadata DataFrame.
+def remove_empty_samples_and_features(
+    table_sdf, sample_metadata_df, feature_ranks_df
+):
+    """Removes empty samples and features from the table, sample metadata, and
+       feature ranks DataFrames.
 
        This should be called *after* matching the table with the sample
-       metadata -- we assume that the columns of the table DataFrame are
-       equivalent to the indices of the sample metadata DataFrame.
+       metadata and feature ranks -- we assume that the columns of the
+       table DataFrame are equivalent to the indices of the sample metadata
+       DataFrame, and that the indices (rows) of the table are also equivalent
+       to the indices of the feature ranks DataFrame.
 
-       This will raise a ValueError if, after removing empty samples, either
-       the table's columns or the metadata's indices are empty (this will
-       happen in the case where all of the samples in these DataFrames are
-       empty).
+       This will raise a ValueError if the input table is empty (i.e. all
+       samples/features would be removed).
     """
-    logging.debug("Attempting to remove empty samples.")
-    table_df_equal_to_zero = table_sdf == 0
+    logging.debug("Attempting to remove empty samples and features.")
+
+    # If the table only contains zeros, then attempting to drop all empty
+    # samples and/or features would result in a 0x0 DataFrame. Therefore, we
+    # just raise a ValueError in this case.
+    if (table_sdf == 0).all().all():
+        raise ValueError("The table is empty.")
+
+    # Filter out empty samples
+    # Basically, we convert each cell in the table to a bool (True if !=
+    # 0, False if == 0). Then we just find all the columns (samples) with at
+    # least one True value, and filter the table to just those columns.
+    neq_zero = table_sdf != 0
     nonempty_samples = []
     for sample in table_sdf.columns:
-        if not table_df_equal_to_zero[sample].all():
+        if neq_zero[sample].any():
             nonempty_samples.append(sample)
 
-    filtered_table = table_sdf.filter(items=nonempty_samples, axis="columns")
+    samplefiltered_table = table_sdf.filter(
+        items=nonempty_samples, axis="columns"
+    )
     filtered_metadata = sample_metadata_df.filter(
         items=nonempty_samples, axis="index"
     )
 
-    if len(filtered_table.columns) < 1 or len(filtered_metadata.index) < 1:
-        raise ValueError("Found all empty samples with current features.")
+    # Filter out empty features
+    # Same method as above, but operating on rows (features) instead of on
+    # columns (samples).
+    neq_zero = samplefiltered_table != 0
+    nonempty_features = []
+    for feature in samplefiltered_table.index:
+        if neq_zero.loc[feature].any():
+            nonempty_features.append(feature)
+
+    filtered_table = samplefiltered_table.filter(
+        items=nonempty_features, axis="index"
+    )
+    filtered_ranks = feature_ranks_df.filter(
+        items=nonempty_features, axis="index"
+    )
 
+    # Let user know about which samples/features may have been dropped, if any.
     sample_diff = len(table_sdf.columns) - len(filtered_table.columns)
     if sample_diff > 0:
         logging.debug("Removed {} empty sample(s).".format(sample_diff))
     else:
         logging.debug("Couldn't find any empty samples.")
 
-    return filtered_table, filtered_metadata
+    feature_diff = len(table_sdf.index) - len(filtered_table.index)
+    if feature_diff > 0:
+        logging.debug("Removed {} empty feature(s).".format(feature_diff))
+    else:
+        logging.debug("Couldn't find any empty features.")
+
+    return filtered_table, filtered_metadata, filtered_ranks
 
 
 def match_table_and_data(table, feature_ranks, sample_metadata):

diff --git a/qurro/generate.py b/qurro/generate.py
@@ -26,7 +26,7 @@
     replace_nan,
     validate_df,
     biom_table_to_sparse_df,
-    remove_empty_samples,
+    remove_empty_samples_and_features,
     match_table_and_data,
     merge_feature_metadata,
 )
@@ -80,8 +80,8 @@ def process_input(
        5. Calls filter_unextreme_features() using the provided
           extreme_feature_count. (If it's None, then nothing will be done.)
 
-       6. Calls remove_empty_samples() to filter samples without any counts for
-          any features. This is purposefully done *after*
+       6. Calls remove_empty_samples_and_features() to filter empty samples
+          (and features). This is purposefully done *after*
           filter_unextreme_features() is called.
 
        7. Calls merge_feature_metadata() on the feature ranks and feature
@@ -93,15 +93,15 @@ def process_input(
             Sample metadata, but matched with the table and with empty samples
             removed.
 
-       filtered_ranks: pd.DataFrame
+       output_ranks: pd.DataFrame
             Feature ranks, post-filtering and with feature metadata columns
             added in.
 
        ranking_ids
-            The ranking columns' names in filtered_ranks.
+            The ranking columns' names in output_ranks.
 
        feature_metadata_cols: list
-            The feature metadata columns' names in filtered_ranks.
+            The feature metadata columns' names in output_ranks.
 
        output_table: pd.SparseDataFrame
             The BIOM table, post matching with the feature ranks and sample
@@ -144,23 +144,23 @@ def process_input(
         m_table, feature_ranks, extreme_feature_count
     )
 
-    # Filter now-empty samples from the BIOM table.
-    output_table, output_metadata = remove_empty_samples(
-        filtered_table, m_sample_metadata
+    # Filter now-empty samples (and empty features) from the BIOM table.
+    output_table, output_metadata, u_ranks = remove_empty_samples_and_features(
+        filtered_table, m_sample_metadata, filtered_ranks
     )
 
     # Save a list of ranking IDs (before we add in feature metadata)
     # TODO: just have merge_feature_metadata() give us this?
-    ranking_ids = filtered_ranks.columns
+    ranking_ids = u_ranks.columns
 
-    filtered_ranks, feature_metadata_cols = merge_feature_metadata(
-        filtered_ranks, feature_metadata
+    output_ranks, feature_metadata_cols = merge_feature_metadata(
+        u_ranks, feature_metadata
     )
 
     logging.debug("Finished input processing.")
     return (
         output_metadata,
-        filtered_ranks,
+        output_ranks,
         ranking_ids,
         feature_metadata_cols,
         output_table,

diff --git a/qurro/tests/test_df_utils.py b/qurro/tests/test_df_utils.py
@@ -6,7 +6,7 @@
     ensure_df_headers_unique,
     validate_df,
     replace_nan,
-    remove_empty_samples,
+    remove_empty_samples_and_features,
     merge_feature_metadata,
 )
 
@@ -170,9 +170,10 @@ def test_replace_nan():
 
 
 def get_test_data():
-    """Returns a test table and metadata DataFrame.
+    """Returns test table, metadata, and ranks DataFrames.
 
-       Mostly copied from get_test_data() in test_filter_unextreme_features.
+       Mostly based on/copied from get_test_data() in
+       test_filter_unextreme_features.
     """
     feature_ids = ["F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8"]
     table = DataFrame(
@@ -193,27 +194,39 @@ def get_test_data():
         },
         index=list(table.columns)[:],
     )
-    return table, metadata
+    ranks = DataFrame(
+        {
+            "Rank 0": [1, 2, 3, 4, 5, 6, 7, 8],
+            "Rank 1": [8, 7, 6, 5, 4, 3, 2, 1],
+        },
+        index=list(table.index)[:],
+    )
+    return table, metadata, ranks
 
 
-def test_remove_empty_samples_basic():
-    """Tests remove_empty_samples() in the simple cases of removing 0, 1, and 2
-       empty sample(s).
+def test_remove_empty_samples_and_features_samples():
+    """Tests remove_empty_samples_and_features() in the simple cases of
+       removing 0, 1, and 2 empty sample(s).
     """
 
     # TRY REMOVING 0 SAMPLES
-    table, metadata = get_test_data()
+    table, metadata, ranks = get_test_data()
     # Check that, when none of the samples are empty, nothing is changed.
-    ftable, fmetadata = remove_empty_samples(table, metadata)
+    ftable, fmetadata, franks = remove_empty_samples_and_features(
+        table, metadata, ranks
+    )
     assert_frame_equal(ftable, table)
     assert_frame_equal(fmetadata, metadata)
+    assert_frame_equal(franks, ranks)
 
     # TRY REMOVING 1 SAMPLE
     # Zero out Sample3 (it only has one count, for F1)
     table["Sample3"]["F1"] = 0
     # Check that just the one empty sample (Sample3) was removed, from both the
     # table and the sample metadata.
-    ftable, fmetadata = remove_empty_samples(table, metadata)
+    ftable, fmetadata, franks = remove_empty_samples_and_features(
+        table, metadata, ranks
+    )
     assert_series_equal(ftable["Sample1"], table["Sample1"])
     assert_series_equal(ftable["Sample2"], table["Sample2"])
     assert_series_equal(ftable["Sample4"], table["Sample4"])
@@ -228,10 +241,14 @@ def test_remove_empty_samples_basic():
     assert len(fmetadata.index) == 3
     assert len(fmetadata.columns) == len(metadata.columns) == 4
 
+    assert_frame_equal(franks, ranks)
+
     # TRY REMOVING 2 SAMPLES
     # Now, zero out Sample4 (it only has one count in F4)
     table["Sample4"]["F4"] = 0
-    ftable, fmetadata = remove_empty_samples(table, metadata)
+    ftable, fmetadata, franks = remove_empty_samples_and_features(
+        table, metadata, ranks
+    )
     assert_series_equal(ftable["Sample1"], table["Sample1"])
     assert_series_equal(ftable["Sample2"], table["Sample2"])
     assert "Sample3" not in ftable.columns
@@ -245,17 +262,92 @@ def test_remove_empty_samples_basic():
     assert "Sample4" not in fmetadata.index
     assert len(fmetadata.index) == 2
 
+    assert_frame_equal(franks, ranks)
+
+
+def test_remove_empty_samples_and_features_features():
+    """Tests remove_empty_samples_and_features() in the simple cases of
+       removing 1 and then 2 empty feature(s).
+    """
+
+    table, metadata, ranks = get_test_data()
+    # Zero out F8
+    table.loc["F8"] = 0
+    ftable, fmetadata, franks = remove_empty_samples_and_features(
+        table, metadata, ranks
+    )
+    assert_frame_equal(fmetadata, metadata)
+    # Check that F8 was removed from the table and ranks
+
+    def check(new, old):
+        assert_frame_equal(new.iloc[0:7], old.iloc[0:7])
+        assert "F8" not in new.index
+        assert len(new.index) == 7
+
+    check(ftable, table)
+    check(franks, ranks)
+
+    # Zero out F6, also
+    table.loc["F6"] = 0
+    ftable, fmetadata, franks = remove_empty_samples_and_features(
+        table, metadata, ranks
+    )
+    assert_frame_equal(fmetadata, metadata)
+    # Check that F1 through F5 (and F7) are still the same
+
+    def check2(new, old):
+        assert_frame_equal(new.iloc[0:5], old.iloc[0:5])
+        assert_series_equal(new.loc["F7"], old.loc["F7"])
+        assert "F6" not in new.index
+        assert len(new.index) == 6
+
+    check2(ftable, table)
+    check2(franks, ranks)
+
+
+def test_remove_empty_samples_and_features_both():
+    """Tests remove_empty_samples_and_features() when both samples and features
+       are empty.
+    """
+
+    table, metadata, ranks = get_test_data()
+    # Zero out F8 and F7
+    table.loc["F8"] = 0
+    table.loc["F7"] = 0
+    # Zero out Sample2 and Sample4
+    table["Sample2"] = 0
+    table["Sample4"] = 0
+    ftable, fmetadata, franks = remove_empty_samples_and_features(
+        table, metadata, ranks
+    )
+
+    assert "F8" not in ftable.index
+    assert "F7" not in ftable.index
+    assert "F8" not in franks.index
+    assert "F7" not in franks.index
+    assert "Sample2" not in ftable.columns
+    assert "Sample4" not in ftable.columns
+    assert "Sample2" not in fmetadata.index
+    assert "Sample4" not in fmetadata.index
+    assert_frame_equal(
+        ftable, table[set(["Sample1", "Sample3"])].iloc[0:6], check_like=True
+    )
+    assert_frame_equal(
+        fmetadata, metadata.loc[set(["Sample1", "Sample3"])], check_like=True
+    )
+    assert_frame_equal(franks, ranks.iloc[0:6])
+
 
-def test_remove_empty_samples_allempty():
-    """Tests remove_empty_samples() when all samples in the table are empty."""
+def test_remove_empty_samples_and_features_allempty():
+    """Tests remove_empty_samples_and_features() on an empty table."""
 
-    table, metadata = get_test_data()
+    table, metadata, ranks = get_test_data()
     table["Sample1"] = np.zeros(len(table.index))
     table["Sample2"] = np.zeros(len(table.index))
     table["Sample3"] = np.zeros(len(table.index))
     table["Sample4"] = np.zeros(len(table.index))
     with pytest.raises(ValueError):
-        ftable, fmetadata = remove_empty_samples(table, metadata)
+        remove_empty_samples_and_features(table, metadata, ranks)
 
 
 def test_merge_feature_metadata():