ENH: Make Qurro use sparse count JSON: close #58

There will be other ways to optimize Qurro, but I think this is sufficient to close #58 for the time being. Now I want to try benchmarking this, to see how much more of the EMP we can fit into a browser :)
biocore · Jul 7, 2019 · bc0f97d · bc0f97d
1 parent cea1037
commit bc0f97d
Show file tree

Hide file tree

Showing 20 changed files with 323 additions and 125 deletions.
diff --git a/docs/demos/byrd/js/display.js b/docs/demos/byrd/js/display.js
@@ -51,32 +51,32 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
             this.botFeatures = undefined;
 
             // Used when looking up a feature's count.
-            this.feature_cts = countJSON;
+            this.featureCts = countJSON;
             // Used when searching through features.
-            this.feature_ids = Object.keys(this.feature_cts);
+            // Since we filtered out empty features in the python side of
+            // things, we know that every feature should be represented in the
+            // count JSON's keys.
+            this.featureIDs = Object.keys(this.featureCts);
 
             // Just a list of all sample IDs.
-            var sampleIDs = Object.keys(this.feature_cts[this.feature_ids[0]]);
+            this.sampleIDs = RRVDisplay.identifySampleIDs(samplePlotJSON);
             // Used when letting the user know how many samples are present in
             // the sample plot.
-            // Note that we need to use Object.keys() in order to be able to
-            // figure out how many entries are in the sampleIDs list; see
-            // https://stackoverflow.com/a/6700/10730311
-            this.sampleCount = sampleIDs.length;
+            this.sampleCount = this.sampleIDs.length;
 
             // a mapping from "reason" (i.e. "balance", "xAxis", "color") to
             // list of dropped sample IDs.
             //
-            // "balance" maps to sampleIDs right now because all samples have a
-            // null balance starting off.
+            // "balance" maps to this.sampleIDs right now because all samples
+            // have a null balance starting off.
             //
             // NOTE: xAxis and color might already exclude some samples from
             // being shown in the default categorical encoding. Their
             // corresponding lists will be updated in makeSamplePlot(), before
             // dom_utils.updateMainSampleShownDiv() will be called (so the
             // nulls will be replaced with actual lists).
             this.droppedSamples = {
-                balance: sampleIDs,
+                balance: this.sampleIDs,
                 xAxis: null,
                 color: null
             };
@@ -208,7 +208,7 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
                 // view/select.
                 // TODO: make this a separate func so we can unit-test it
                 if (
-                    this.feature_ids.length <=
+                    this.featureIDs.length <=
                     this.rankPlotJSON.config.view.width
                 ) {
                     document.getElementById("barSize").value = "fit";
@@ -411,7 +411,7 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
                 // Not 100% sure this is optimal.
                 newBarSize =
                     this.rankPlotJSON.config.view.width /
-                    this.feature_ids.length;
+                    this.featureIDs.length;
             } else {
                 newBarSize = parseInt(newSizeType);
             }
@@ -964,6 +964,20 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
             }
         }
 
+        static identifySampleIDs(samplePlotSpec) {
+            // Like identifyMetadataColumns(), but just finds sample IDs.
+            var sampleIDs = [];
+            var dataName = samplePlotSpec.data.name;
+            var sid;
+            for (var s = 0; s < samplePlotSpec.datasets[dataName].length; s++) {
+                sid = samplePlotSpec.datasets[dataName][s]["Sample ID"]
+                if (sid !== undefined) {
+                    sampleIDs.push(sid);
+                }
+            }
+            return sampleIDs;
+        }
+
         static identifyMetadataColumns(samplePlotSpec) {
             // Given a Vega-Lite sample plot specification, find all the metadata cols.
             // Just uses whatever the first available sample's keys are as a
@@ -995,11 +1009,33 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
          * python side of things.)
          */
         validateSampleID(sampleID) {
-            if (this.feature_cts[this.feature_ids[0]][sampleID] === undefined) {
+            if (!this.sampleIDs.includes(sampleID)) {
                 throw new Error("Invalid sample ID: " + sampleID);
             }
         }
 
+        /* Gets count data from the featureCts object. This uses a sparse
+         * storage method, so only samples with a nonzero count for a given
+         * feature are contained in that feature's entry.
+         *
+         * So, if the "entry" for a sample for a feature in featureCts is falsy
+         * (i.e. undefined, but could also be false, 0, "", null, or NaN --
+         * none of these should ever occur in the count JSON but if they do
+         * that should also be indicative of a zero count [1]), then we
+         * consider that sample's count to be 0. Otherwise, we just return the
+         * entry.
+         *
+         * [1] See https://developer.mozilla.org/en-US/docs/Glossary/Truthy
+         */
+        getCount(featureID, sampleID) {
+            var putativeCount = this.featureCts[featureID][sampleID];
+            if (putativeCount) {
+                return putativeCount;
+            } else {
+                return 0;
+            }
+        }
+
         /* Given a "row" of the sample plot's JSON for a sample, and given an array of
          * features, return the sum of the sample's abundances for those particular features.
          * TODO: add option to do log geometric means
@@ -1009,9 +1045,7 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
             this.validateSampleID(sampleID);
             var abundance = 0;
             for (var t = 0; t < features.length; t++) {
-                abundance += this.feature_cts[features[t]["Feature ID"]][
-                    sampleID
-                ];
+                abundance += this.getCount(features[t]["Feature ID"], sampleID);
             }
             return abundance;
         }
@@ -1025,12 +1059,8 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
         updateBalanceSingle(sampleRow) {
             var sampleID = sampleRow["Sample ID"];
             this.validateSampleID(sampleID);
-            var topCt = this.feature_cts[this.newFeatureHigh["Feature ID"]][
-                sampleID
-            ];
-            var botCt = this.feature_cts[this.newFeatureLow["Feature ID"]][
-                sampleID
-            ];
+            var topCt = this.getCount(this.newFeatureHigh["Feature ID"], sampleID);
+            var botCt = this.getCount(this.newFeatureLow["Feature ID"], sampleID);
             return feature_computation.computeBalance(topCt, botCt);
         }
 

diff --git a/docs/demos/byrd/main.js b/docs/demos/byrd/main.js
diff --git a/docs/demos/q2_moving_pictures/js/display.js b/docs/demos/q2_moving_pictures/js/display.js
@@ -51,32 +51,32 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
             this.botFeatures = undefined;
 
             // Used when looking up a feature's count.
-            this.feature_cts = countJSON;
+            this.featureCts = countJSON;
             // Used when searching through features.
-            this.feature_ids = Object.keys(this.feature_cts);
+            // Since we filtered out empty features in the python side of
+            // things, we know that every feature should be represented in the
+            // count JSON's keys.
+            this.featureIDs = Object.keys(this.featureCts);
 
             // Just a list of all sample IDs.
-            var sampleIDs = Object.keys(this.feature_cts[this.feature_ids[0]]);
+            this.sampleIDs = RRVDisplay.identifySampleIDs(samplePlotJSON);
             // Used when letting the user know how many samples are present in
             // the sample plot.
-            // Note that we need to use Object.keys() in order to be able to
-            // figure out how many entries are in the sampleIDs list; see
-            // https://stackoverflow.com/a/6700/10730311
-            this.sampleCount = sampleIDs.length;
+            this.sampleCount = this.sampleIDs.length;
 
             // a mapping from "reason" (i.e. "balance", "xAxis", "color") to
             // list of dropped sample IDs.
             //
-            // "balance" maps to sampleIDs right now because all samples have a
-            // null balance starting off.
+            // "balance" maps to this.sampleIDs right now because all samples
+            // have a null balance starting off.
             //
             // NOTE: xAxis and color might already exclude some samples from
             // being shown in the default categorical encoding. Their
             // corresponding lists will be updated in makeSamplePlot(), before
             // dom_utils.updateMainSampleShownDiv() will be called (so the
             // nulls will be replaced with actual lists).
             this.droppedSamples = {
-                balance: sampleIDs,
+                balance: this.sampleIDs,
                 xAxis: null,
                 color: null
             };
@@ -208,7 +208,7 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
                 // view/select.
                 // TODO: make this a separate func so we can unit-test it
                 if (
-                    this.feature_ids.length <=
+                    this.featureIDs.length <=
                     this.rankPlotJSON.config.view.width
                 ) {
                     document.getElementById("barSize").value = "fit";
@@ -411,7 +411,7 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
                 // Not 100% sure this is optimal.
                 newBarSize =
                     this.rankPlotJSON.config.view.width /
-                    this.feature_ids.length;
+                    this.featureIDs.length;
             } else {
                 newBarSize = parseInt(newSizeType);
             }
@@ -964,6 +964,20 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
             }
         }
 
+        static identifySampleIDs(samplePlotSpec) {
+            // Like identifyMetadataColumns(), but just finds sample IDs.
+            var sampleIDs = [];
+            var dataName = samplePlotSpec.data.name;
+            var sid;
+            for (var s = 0; s < samplePlotSpec.datasets[dataName].length; s++) {
+                sid = samplePlotSpec.datasets[dataName][s]["Sample ID"]
+                if (sid !== undefined) {
+                    sampleIDs.push(sid);
+                }
+            }
+            return sampleIDs;
+        }
+
         static identifyMetadataColumns(samplePlotSpec) {
             // Given a Vega-Lite sample plot specification, find all the metadata cols.
             // Just uses whatever the first available sample's keys are as a
@@ -995,11 +1009,33 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
          * python side of things.)
          */
         validateSampleID(sampleID) {
-            if (this.feature_cts[this.feature_ids[0]][sampleID] === undefined) {
+            if (!this.sampleIDs.includes(sampleID)) {
                 throw new Error("Invalid sample ID: " + sampleID);
             }
         }
 
+        /* Gets count data from the featureCts object. This uses a sparse
+         * storage method, so only samples with a nonzero count for a given
+         * feature are contained in that feature's entry.
+         *
+         * So, if the "entry" for a sample for a feature in featureCts is falsy
+         * (i.e. undefined, but could also be false, 0, "", null, or NaN --
+         * none of these should ever occur in the count JSON but if they do
+         * that should also be indicative of a zero count [1]), then we
+         * consider that sample's count to be 0. Otherwise, we just return the
+         * entry.
+         *
+         * [1] See https://developer.mozilla.org/en-US/docs/Glossary/Truthy
+         */
+        getCount(featureID, sampleID) {
+            var putativeCount = this.featureCts[featureID][sampleID];
+            if (putativeCount) {
+                return putativeCount;
+            } else {
+                return 0;
+            }
+        }
+
         /* Given a "row" of the sample plot's JSON for a sample, and given an array of
          * features, return the sum of the sample's abundances for those particular features.
          * TODO: add option to do log geometric means
@@ -1009,9 +1045,7 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
             this.validateSampleID(sampleID);
             var abundance = 0;
             for (var t = 0; t < features.length; t++) {
-                abundance += this.feature_cts[features[t]["Feature ID"]][
-                    sampleID
-                ];
+                abundance += this.getCount(features[t]["Feature ID"], sampleID);
             }
             return abundance;
         }
@@ -1025,12 +1059,8 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
         updateBalanceSingle(sampleRow) {
             var sampleID = sampleRow["Sample ID"];
             this.validateSampleID(sampleID);
-            var topCt = this.feature_cts[this.newFeatureHigh["Feature ID"]][
-                sampleID
-            ];
-            var botCt = this.feature_cts[this.newFeatureLow["Feature ID"]][
-                sampleID
-            ];
+            var topCt = this.getCount(this.newFeatureHigh["Feature ID"], sampleID);
+            var botCt = this.getCount(this.newFeatureLow["Feature ID"], sampleID);
             return feature_computation.computeBalance(topCt, botCt);
         }
 

diff --git a/docs/demos/q2_moving_pictures/main.js b/docs/demos/q2_moving_pictures/main.js