fix: add datasource.changed_on to cache_key (#8901)
* Add datasource.changed_on to cache_key and add+fix related unit tests

* Add note to UPDATING.md

* Remove redundant comment about metric names
villebro authored and mistercrunch committed Jan 16, 2020
1 parent 2b0c7a1 commit c087a48
Showing 6 changed files with 73 additions and 40 deletions.
14 changes: 9 additions & 5 deletions UPDATING.md
@@ -22,8 +22,12 @@ This file documents any backwards-incompatible changes in Superset and
assists people when migrating to a new version.

## Next
* [8901](https://github.com/apache/incubator-superset/pull/8901): The datasource's update
timestamp has been added to the query object's cache key to ensure that updates to
datasources are always reflected in associated query results. As a consequence, all
previously cached results will be invalidated when updating to the next version
(a minimal sketch of the idea follows this list of entries).

* [8732](https://github.com/apache/incubator-superset/pull/8732): Swagger user interface is now enabled by default.
A new permission `show on SwaggerView` is created by `superset init` and given to the `Admin` Role. To disable the UI,
set `FAB_API_SWAGGER_UI = False` in your config.
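
For illustration, here is a minimal sketch of how a datasource's update timestamp can be folded into a cache key, mirroring the `cache_dict` pattern used in `superset/viz.py` further down; `compute_cache_key` and its parameters are hypothetical helpers, not part of Superset's API:

```python
import hashlib
import json
from datetime import datetime
from typing import Any, Dict


def compute_cache_key(
    query_dict: Dict[str, Any], datasource_uid: str, changed_on: datetime
) -> str:
    """Hash the query together with the datasource's last-modified timestamp."""
    cache_dict = dict(query_dict)
    cache_dict["datasource"] = datasource_uid
    # Folding changed_on into the hashed payload means any datasource edit
    # produces a new key, so stale cached results are simply never hit again.
    cache_dict["changed_on"] = changed_on
    json_data = json.dumps(cache_dict, sort_keys=True, default=str)
    return hashlib.md5(json_data.encode("utf-8")).hexdigest()
```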

@@ -90,17 +94,17 @@ which adds missing non-nullable fields to the `datasources` table. Depending on
the integrity of the data, manual intervention may be required.

* [5452](https://github.com/apache/incubator-superset/pull/5452): a change
which adds missing non-nullable fields and uniqueness constraints (which may be
case insensitive depending on your database configuration) to the `columns` and
`table_columns` tables. Depending on the integrity of the data, manual
intervention may be required.
* The `fabmanager` command line is deprecated since Flask-AppBuilder 2.0.0; use
the new `flask fab <command>` commands integrated with the *Flask CLI* instead.
* The `SUPERSET_UPDATE_PERMS` environment variable was replaced by the
`FAB_UPDATE_PERMS` boolean config key. To disable automatic
creation of permissions, set `FAB_UPDATE_PERMS = False` in your config
(see the config sketch after this list).
* [5453](https://github.com/apache/incubator-superset/pull/5453): a change
which adds missing non-nullable fields and uniqueness constraints (which may be
case insensitive depending on your database configuration) to the `metrics`
and `sql_metrics` tables. Depending on the integrity of the data, manual
intervention may be required.
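
As referenced in the `SUPERSET_UPDATE_PERMS` item above, a hedged sketch of the corresponding setting in a `superset_config.py` override file; the file name follows the usual Superset config-override convention and is an assumption here, not something prescribed by this commit:

```python
# superset_config.py -- illustrative sketch only, adapt to your deployment.

# Replaces the removed SUPERSET_UPDATE_PERMS environment variable;
# False disables automatic creation of permissions.
FAB_UPDATE_PERMS = False
```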
2 changes: 1 addition & 1 deletion setup.cfg
@@ -45,7 +45,7 @@ combine_as_imports = true
include_trailing_comma = true
line_length = 88
known_first_party = superset
known_third_party =alembic,backoff,bleach,celery,click,colorama,contextlib2,croniter,dateutil,flask,flask_appbuilder,flask_babel,flask_caching,flask_compress,flask_login,flask_migrate,flask_sqlalchemy,flask_talisman,flask_testing,flask_wtf,geohash,geopy,humanize,isodate,jinja2,markdown,markupsafe,marshmallow,msgpack,numpy,pandas,parsedatetime,pathlib2,polyline,prison,psycopg2,pyarrow,pyhive,pytz,retry,selenium,setuptools,simplejson,sphinx_rtd_theme,sqlalchemy,sqlalchemy_utils,sqlparse,werkzeug,wtforms,wtforms_json,yaml
known_third_party =alembic,backoff,bleach,celery,click,colorama,contextlib2,croniter,dateutil,flask,flask_appbuilder,flask_babel,flask_caching,flask_compress,flask_login,flask_migrate,flask_sqlalchemy,flask_talisman,flask_testing,flask_wtf,geohash,geopy,humanize,isodate,jinja2,markdown,markupsafe,marshmallow,msgpack,numpy,pandas,parsedatetime,pathlib2,polyline,prison,pyarrow,pyhive,pytz,retry,selenium,setuptools,simplejson,sphinx_rtd_theme,sqlalchemy,sqlalchemy_utils,sqlparse,werkzeug,wtforms,wtforms_json,yaml
multi_line_output = 3
order_by_type = false

13 changes: 9 additions & 4 deletions superset/common/query_context.py
@@ -156,20 +156,25 @@ def cache_timeout(self) -> int:
            return self.datasource.database.cache_timeout
        return config["CACHE_DEFAULT_TIMEOUT"]

    def get_df_payload(  # pylint: disable=too-many-locals,too-many-statements
        self, query_obj: QueryObject, **kwargs
    ) -> Dict[str, Any]:
        """Handles caching around the df payload retrieval"""
    def cache_key(self, query_obj: QueryObject, **kwargs) -> Optional[str]:
        extra_cache_keys = self.datasource.get_extra_cache_keys(query_obj.to_dict())
        cache_key = (
            query_obj.cache_key(
                datasource=self.datasource.uid,
                extra_cache_keys=extra_cache_keys,
                changed_on=self.datasource.changed_on,
                **kwargs
            )
            if query_obj
            else None
        )
        return cache_key

    def get_df_payload(  # pylint: disable=too-many-locals,too-many-statements
        self, query_obj: QueryObject, **kwargs
    ) -> Dict[str, Any]:
        """Handles caching around the df payload retrieval"""
        cache_key = self.cache_key(query_obj, **kwargs)
        logging.info("Cache key: %s", cache_key)
        is_loaded = False
        stacktrace = None
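For orientation, a rough usage sketch of the extracted `cache_key` method, assuming a running Superset app context and an existing datasource; the ids and query fields below are placeholders modelled on the test added further down:

```python
from superset.common.query_context import QueryContext

# Placeholder payload; the ids and columns must exist in your metadata database.
qc_dict = {
    "datasource": {"id": 1, "type": "table"},
    "queries": [
        {
            "granularity": "ds",
            "groupby": ["name"],
            "metrics": ["sum__num"],
            "filters": [],
            "row_limit": 100,
        }
    ],
}

query_context = QueryContext(**qc_dict)
query_object = query_context.queries[0]
# The key now also reflects datasource.changed_on, so editing the datasource
# yields a different key and bypasses any previously cached result.
cache_key = query_context.cache_key(query_object)
```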
7 changes: 1 addition & 6 deletions superset/common/query_object.py
@@ -81,12 +81,7 @@ def __init__(
        self.time_shift = utils.parse_human_timedelta(time_shift)
        self.groupby = groupby or []

        # Temporal solution for backward compatability issue
        # due the new format of non-ad-hoc metric.
        self.metrics = [
            metric if "expressionType" in metric else metric["label"]  # type: ignore
            for metric in metrics
        ]
        self.metrics = [utils.get_metric_name(metric) for metric in metrics]
        self.row_limit = row_limit
        self.filter = filters or []
        self.timeseries_limit = timeseries_limit
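The replaced list comprehension above is folded into a helper; a hedged sketch of what `utils.get_metric_name` presumably does (the real implementation lives in Superset's utils module and may handle more cases):

```python
from typing import Any, Dict, Union

Metric = Union[str, Dict[str, Any]]


def get_metric_name(metric: Metric) -> str:
    # Metrics may arrive as plain names (strings) or as dicts that carry a
    # user-facing "label" (e.g. ad-hoc metrics); return a flat name either way.
    if isinstance(metric, dict):
        return metric["label"]
    return metric
```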
1 change: 1 addition & 0 deletions superset/viz.py
@@ -372,6 +372,7 @@ def cache_key(self, query_obj, **extra):
cache_dict["time_range"] = self.form_data.get("time_range")
cache_dict["datasource"] = self.datasource.uid
cache_dict["extra_cache_keys"] = self.datasource.get_extra_cache_keys(query_obj)
cache_dict["changed_on"] = self.datasource.changed_on
json_data = self.json_dumps(cache_dict, sort_keys=True)
return hashlib.md5(json_data.encode("utf-8")).hexdigest()

76 changes: 52 additions & 24 deletions tests/core_tests.py
@@ -28,15 +28,17 @@
import random
import re
import string
from typing import Any, Dict
import unittest
from unittest import mock

import pandas as pd
import psycopg2
import sqlalchemy as sqla

from tests.test_app import app
from superset import dataframe, db, jinja_context, security_manager, sql_lab
from superset.common.query_context import QueryContext
from superset.connectors.connector_registry import ConnectorRegistry
from superset.connectors.sqla.models import SqlaTable
from superset.db_engine_specs.base import BaseEngineSpec
from superset.db_engine_specs.mssql import MssqlEngineSpec
@@ -98,7 +100,23 @@ def test_slice_endpoint(self):
        resp = self.client.get("/superset/slice/-1/")
        assert resp.status_code == 404

    def test_cache_key(self):
    def _get_query_context_dict(self) -> Dict[str, Any]:
        self.login(username="admin")
        slc = self.get_slice("Girl Name Cloud", db.session)
        return {
            "datasource": {"id": slc.datasource_id, "type": slc.datasource_type},
            "queries": [
                {
                    "granularity": "ds",
                    "groupby": ["name"],
                    "metrics": ["sum__num"],
                    "filters": [],
                    "row_limit": 100,
                }
            ],
        }

    def test_viz_cache_key(self):
        self.login(username="admin")
        slc = self.get_slice("Girls", db.session)

@@ -110,30 +128,40 @@ def test_cache_key(self):
qobj["groupby"] = []
self.assertNotEqual(cache_key, viz.cache_key(qobj))

def test_cache_key_changes_when_datasource_is_updated(self):
qc_dict = self._get_query_context_dict()

# construct baseline cache_key
query_context = QueryContext(**qc_dict)
query_object = query_context.queries[0]
cache_key_original = query_context.cache_key(query_object)

# make temporary change and revert it to refresh the changed_on property
datasource = ConnectorRegistry.get_datasource(
datasource_type=qc_dict["datasource"]["type"],
datasource_id=qc_dict["datasource"]["id"],
session=db.session,
)
description_original = datasource.description
datasource.description = "temporary description"
db.session.commit()
datasource.description = description_original
db.session.commit()

# create new QueryContext with unchanged attributes and extract new cache_key
query_context = QueryContext(**qc_dict)
query_object = query_context.queries[0]
cache_key_new = query_context.cache_key(query_object)

# the new cache_key should be different due to updated datasource
self.assertNotEqual(cache_key_original, cache_key_new)

    def test_api_v1_query_endpoint(self):
        self.login(username="admin")
        slc = self.get_slice("Girl Name Cloud", db.session)
        form_data = slc.form_data
        data = json.dumps(
            {
                "datasource": {"id": slc.datasource_id, "type": slc.datasource_type},
                "queries": [
                    {
                        "granularity": "ds",
                        "groupby": ["name"],
                        "metrics": ["sum__num"],
                        "filters": [],
                        "time_range": "{} : {}".format(
                            form_data.get("since"), form_data.get("until")
                        ),
                        "limit": 100,
                    }
                ],
            }
        )
        # TODO: update once get_data is implemented for QueryObject
        with self.assertRaises(Exception):
            self.get_resp("/api/v1/query/", {"query_context": data})
        qc_dict = self._get_query_context_dict()
        data = json.dumps(qc_dict)
        resp = json.loads(self.get_resp("/api/v1/query/", {"query_context": data}))
        self.assertEqual(resp[0]["rowcount"], 100)

    def test_old_slice_json_endpoint(self):
        self.login(username="admin")
