BUG: Add check_json_dataset_name(): close #190

biocore · Jul 15, 2019 · b270b37 · b270b37
1 parent 2f6e859
commit b270b37
Show file tree

Hide file tree

Showing 4 changed files with 89 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -29,6 +29,12 @@
   ([#183](https://github.com/biocore/qurro/issues/183))
   Before, input datasets with this column name would have caused either the
   loss of this column of data or confusing errors.
+- Previously, there was a small (either almost or entirely impossible) chance
+  that all of the main data within the rank or sample plot JSON could get
+  overwritten due to a collision in the dataset name. Although this should
+  basically never happen anyway, Qurro's code now checks for this scenario and
+  raises an error accordingly.
+  ([#190](https://github.com/biocore/qurro/issues/190))
 ### Performance enhancements
 ### Miscellaneous
 - A clear error is now raised if Qurro is trying to parse a GNPS feature

diff --git a/qurro/_json_utils.py b/qurro/_json_utils.py
@@ -360,3 +360,25 @@ def replace_js_json_definitions(
     # If this was called on a JS test then this perfectly normal, but if this
     # was called during the execution of Qurro then this is a problem.
     return 1
+
+
+def check_json_dataset_names(json_dict, *restricted_names):
+    """Checks that certain dataset names aren't present in a Vega-Lite JSON.
+
+       This should never really fail, since all of the special dataset names we
+       use internally start with "qurro_" (and Altair generates hashes of the
+       data somehow to use as dataset names). However, this function is here
+       just in the exceedingly rare case that this conflict does actually come
+       up (if so, we can figure out a way around it).
+    """
+
+    if "datasets" not in json_dict:
+        # Should never happen normally but might as well check for this
+        raise ValueError("Input JSON doesn't have any datasets defined")
+
+    intersection = set(json_dict["datasets"].keys()) & set(restricted_names)
+    if len(intersection) > 0:
+        raise ValueError(
+            "Found the following disallowed dataset name(s) in a JSON: "
+            "{}".format(intersection)
+        )
diff --git a/qurro/generate.py b/qurro/generate.py
@@ -21,7 +21,10 @@
 import pandas as pd
 import altair as alt
 from qurro._rank_utils import filter_unextreme_features
-from qurro._json_utils import replace_js_json_definitions
+from qurro._json_utils import (
+    replace_js_json_definitions,
+    check_json_dataset_names,
+)
 from qurro._df_utils import (
     replace_nan,
     validate_df,
@@ -290,6 +293,8 @@ def gen_rank_plot(V, ranking_ids, feature_metadata_cols):
     rank_chart_json = rank_chart.to_dict()
     rank_ordering = "qurro_rank_ordering"
     fm_col_ordering = "qurro_feature_metadata_ordering"
+    check_json_dataset_names(rank_chart_json, rank_ordering, fm_col_ordering)
+
     # Note we don't use rank_data.columns for setting the rank ordering. This
     # is because rank_data's columns now include both the ranking IDs and the
     # "Feature ID" and "qurro_classification" columns (as well as any feature
@@ -373,6 +378,8 @@ def gen_sample_plot(metadata):
     sample_chart_dict = sample_chart.to_dict()
     sample_chart_dict["mark"] = {"type": "circle"}
 
+    sm_fields = "qurro_sample_metadata_fields"
+    check_json_dataset_names(sample_chart_dict, sm_fields)
     # Specify an alphabetical ordering for the sample metadata field names.
     # This will be used for populating the x-axis / color field selectors in
     # Qurro's sample plot controls.
@@ -388,9 +395,7 @@ def gen_sample_plot(metadata):
     # y-axis of the sample plot automatically.)
     sorted_md_cols = list(sorted(sample_metadata.columns, key=str.lower))
     sorted_md_cols.remove("qurro_balance")
-    sample_chart_dict["datasets"][
-        "qurro_sample_metadata_fields"
-    ] = sorted_md_cols
+    sample_chart_dict["datasets"][sm_fields] = sorted_md_cols
     return sample_chart_dict
 
 

diff --git a/qurro/tests/test_json_utils.py b/qurro/tests/test_json_utils.py
@@ -14,6 +14,7 @@
     plot_jsons_equal,
     try_to_replace_line_json,
     replace_js_json_definitions,
+    check_json_dataset_names,
 )
 
 
@@ -206,6 +207,57 @@ def test_plot_jsons_equal():
     assert plot_jsons_equal(a, b)
 
 
+def test_check_json_dataset_names():
+    def assert_in_e_info(e_info, *should_be_in):
+        s_val = str(e_info.value)
+        for thing in should_be_in:
+            assert thing in s_val
+
+    # Test weird case where no datasets are present
+    with pytest.raises(ValueError) as exception_info:
+        check_json_dataset_names({})
+    assert_in_e_info(exception_info, "JSON doesn't have any datasets defined")
+
+    # Should succeed if no conflict between restricted_names and the keys
+    # within a["datasets"]
+    a = {"a": "b", "datasets": {"asdf": {1: 2}}}
+    check_json_dataset_names(a, "abc", "ASDF", "a")
+
+    # Should fail if there is a conflict!
+    with pytest.raises(ValueError) as exception_info:
+        check_json_dataset_names(a, "lol", "ok", "but", "asdf")
+    assert_in_e_info(
+        exception_info,
+        "Found the following disallowed dataset name(s)",
+        "asdf",
+    )
+
+    # Also works properly if there are multiple datasets already defined
+    # (shouldn't ever be the case but might as well check)
+    a = {"a": "b", "datasets": {"asdf": {1: 2}, "thing2": {3: 4}}}
+    # (should succeed)
+    check_json_dataset_names(a, "abc", "ASDF", "a", "THING2")
+    # (should fail)
+    with pytest.raises(ValueError) as exception_info:
+        check_json_dataset_names(a, "lol", "thing2", "asdf")
+    assert_in_e_info(
+        exception_info,
+        "Found the following disallowed dataset name(s)",
+        "asdf",
+        "thing2",
+    )
+    # (should also fail)
+    with pytest.raises(ValueError) as exception_info:
+        check_json_dataset_names(a, "lol", "thing2")
+    assert_in_e_info(
+        exception_info,
+        "Found the following disallowed dataset name(s)",
+        "thing2",
+    )
+    # Only the intersection should be reported
+    assert "asdf" not in str(exception_info.value)
+
+
 def test_try_to_replace_line_json():
     # Test various cases where we expect a replacement
     good_rank_line = "  var rankPlotJSON = {};\n"