Simplify pd.read_json() and pd.read_csv() for example data
EugeneTorap committed Aug 16, 2022
1 parent c67d43c commit 213bf79
Showing 12 changed files with 38 additions and 51 deletions.
6 changes: 3 additions & 3 deletions superset/examples/bart_lines.py
@@ -23,7 +23,7 @@
 from superset import db
 
 from ..utils.database import get_example_database
-from .helpers import get_example_data, get_table_connector_registry
+from .helpers import get_example_url, get_table_connector_registry
 
 
 def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None:
@@ -34,8 +34,8 @@ def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        content = get_example_data("bart-lines.json.gz").decode("utf-8")
-        df = pd.read_json(content, encoding="latin-1")
+        url = get_example_url("bart-lines.json.gz")
+        df = pd.read_json(url, encoding="latin-1", compression="gzip")
         df["path_json"] = df.path.map(json.dumps)
         df["polyline"] = df.path.map(polyline.encode)
         del df["path"]
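Worth noting about the new pattern (an observation, not something stated in the diff): the explicit compression="gzip" is needed because pandas' default compression="infer" keys off the filename extension, and the ?raw=true query string means the URL no longer ends in ".gz". A minimal sketch with a hypothetical URL:

```python
import pandas as pd

# Hypothetical URL shaped like the ones get_example_url() returns.
url = "https://example.com/data/bart-lines.json.gz?raw=true"

# compression="infer" (the default) inspects the trailing extension; the
# "?raw=true" suffix hides ".gz", so the gzip layer must be named explicitly.
df = pd.read_json(url, encoding="latin-1", compression="gzip")
```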
5 changes: 3 additions & 2 deletions superset/examples/birth_names.py
@@ -33,7 +33,7 @@
 
 from ..utils.database import get_example_database
 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
@@ -66,7 +66,8 @@ def gen_filter(
 
 
 def load_data(tbl_name: str, database: Database, sample: bool = False) -> None:
-    pdf = pd.read_json(get_example_data("birth_names2.json.gz").decode("utf-8"))
+    url = get_example_url("birth_names2.json.gz")
+    pdf = pd.read_json(url, compression="gzip")
     # TODO(bkyryliuk): move load examples data into the pytest fixture
     if database.backend == "presto":
         pdf.ds = pd.to_datetime(pdf.ds, unit="ms")
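In the Presto branch, ds is serialized as integer epoch milliseconds, and pd.to_datetime(..., unit="ms") converts it back to timestamps. A standalone illustration (the sample value is made up):

```python
import pandas as pd

# ds as epoch milliseconds, mimicking the birth_names fixture.
pdf = pd.DataFrame({"ds": [1565913600000]})
pdf.ds = pd.to_datetime(pdf.ds, unit="ms")
print(pdf.ds.iloc[0])  # 2019-08-16 00:00:00
```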
8 changes: 3 additions & 5 deletions superset/examples/country_map.py
@@ -27,7 +27,7 @@
 from superset.utils.core import DatasourceType
 
 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
@@ -44,10 +44,8 @@ def load_country_map_data(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        csv_bytes = get_example_data(
-            "birth_france_data_for_country_map.csv", is_gzip=False, make_bytes=True
-        )
-        data = pd.read_csv(csv_bytes, encoding="utf-8")
+        url = get_example_url("birth_france_data_for_country_map.csv")
+        data = pd.read_csv(url, encoding="utf-8")
         data["dttm"] = datetime.datetime.now().date()
         data.to_sql(
             tbl_name,
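Note that this is the one uncompressed file in the set: the plain .csv has no gzip layer, so unlike the other callers, pd.read_csv(url, encoding="utf-8") needs no compression argument.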
6 changes: 3 additions & 3 deletions superset/examples/energy.py
@@ -28,7 +28,7 @@
 from superset.utils.core import DatasourceType
 
 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_table_connector_registry,
     merge_slice,
     misc_dash_slices,
@@ -46,8 +46,8 @@ def load_energy(
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        data = get_example_data("energy.json.gz").decode("utf-8")
-        pdf = pd.read_json(data)
+        url = get_example_url("energy.json.gz")
+        pdf = pd.read_json(url, compression="gzip")
         pdf = pdf.head(100) if sample else pdf
         pdf.to_sql(
             tbl_name,
10 changes: 5 additions & 5 deletions superset/examples/flights.py
@@ -20,7 +20,7 @@
 import superset.utils.database as database_utils
 from superset import db
 
-from .helpers import get_example_data, get_table_connector_registry
+from .helpers import get_example_url, get_table_connector_registry
 
 
 def load_flights(only_metadata: bool = False, force: bool = False) -> None:
@@ -32,12 +32,12 @@ def load_flights(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        data = get_example_data("flight_data.csv.gz", make_bytes=True)
-        pdf = pd.read_csv(data, encoding="latin-1")
+        flight_data_url = get_example_url("flight_data.csv.gz")
+        pdf = pd.read_csv(flight_data_url, encoding="latin-1", compression="gzip")
 
         # Loading airports info to join and get lat/long
-        airports_bytes = get_example_data("airports.csv.gz", make_bytes=True)
-        airports = pd.read_csv(airports_bytes, encoding="latin-1")
+        airports_url = get_example_url("airports.csv.gz")
+        airports = pd.read_csv(airports_url, encoding="latin-1", compression="gzip")
         airports = airports.set_index("IATA_CODE")
 
         pdf[  # pylint: disable=unsupported-assignment-operation,useless-suppression
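The airports frame is indexed by IATA_CODE so each flight row can be joined against it to pick up latitude/longitude; the join itself falls outside this hunk. A hedged sketch of the general pattern, with illustrative column names and values:

```python
import pandas as pd

# Toy stand-ins for the real CSVs; these column names are illustrative.
pdf = pd.DataFrame({"ORIGIN_AIRPORT": ["SFO", "JFK"]})
airports = pd.DataFrame(
    {
        "IATA_CODE": ["SFO", "JFK"],
        "LATITUDE": [37.62, 40.64],
        "LONGITUDE": [-122.37, -73.78],
    }
).set_index("IATA_CODE")

# join() matches the ORIGIN_AIRPORT column against the airports index.
pdf = pdf.join(airports, on="ORIGIN_AIRPORT")
```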
18 changes: 3 additions & 15 deletions superset/examples/helpers.py
@@ -17,10 +17,7 @@
 """Loads datasets, dashboards and slices in a new superset instance"""
 import json
 import os
-import zlib
-from io import BytesIO
-from typing import Union, Any, Dict, List, Set
-from urllib import request
+from typing import Any, Dict, List, Set
 
 from superset import app, db
 from superset.connectors.sqla.models import SqlaTable
@@ -73,14 +70,5 @@ def get_slice_json(defaults: Dict[Any, Any], **kwargs: Any) -> str:
     return json.dumps(defaults_copy, indent=4, sort_keys=True)
 
 
-def get_example_data(
-    filepath: str, is_gzip: bool = True, make_bytes: bool = False
-) -> Union[bytes, BytesIO]:
-    content = request.urlopen(  # pylint: disable=consider-using-with
-        f"{BASE_URL}{filepath}?raw=true"
-    ).read()
-    if is_gzip:
-        content = zlib.decompress(content, zlib.MAX_WBITS | 16)
-    if make_bytes:
-        content = BytesIO(content)
-    return content
+def get_example_url(filepath: str) -> str:
+    return f"{BASE_URL}{filepath}?raw=true"
6 changes: 3 additions & 3 deletions superset/examples/long_lat.py
@@ -27,7 +27,7 @@
 from superset.utils.core import DatasourceType
 
 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
@@ -44,8 +44,8 @@ def load_long_lat_data(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        data = get_example_data("san_francisco.csv.gz", make_bytes=True)
-        pdf = pd.read_csv(data, encoding="utf-8")
+        url = get_example_url("san_francisco.csv.gz")
+        pdf = pd.read_csv(url, encoding="utf-8", compression="gzip")
         start = datetime.datetime.now().replace(
             hour=0, minute=0, second=0, microsecond=0
         )
6 changes: 3 additions & 3 deletions superset/examples/multiformat_time_series.py
@@ -25,7 +25,7 @@
 
 from ..utils.database import get_example_database
 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
@@ -44,8 +44,8 @@ def load_multiformat_time_series(  # pylint: disable=too-many-locals
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        data = get_example_data("multiformat_time_series.json.gz").decode("utf-8")
-        pdf = pd.read_json(data)
+        url = get_example_url("multiformat_time_series.json.gz")
+        pdf = pd.read_json(url, compression="gzip")
         # TODO(bkyryliuk): move load examples data into the pytest fixture
         if database.backend == "presto":
             pdf.ds = pd.to_datetime(pdf.ds, unit="s")
6 changes: 3 additions & 3 deletions superset/examples/paris.py
@@ -22,7 +22,7 @@
 import superset.utils.database as database_utils
 from superset import db
 
-from .helpers import get_example_data, get_table_connector_registry
+from .helpers import get_example_url, get_table_connector_registry
 
 
 def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) -> None:
@@ -33,8 +33,8 @@ def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        data = get_example_data("paris_iris.json.gz").decode("utf-8")
-        df = pd.read_json(data)
+        url = get_example_url("paris_iris.json.gz")
+        df = pd.read_json(url, compression="gzip")
         df["features"] = df.features.map(json.dumps)
 
         df.to_sql(
6 changes: 3 additions & 3 deletions superset/examples/random_time_series.py
@@ -24,7 +24,7 @@
 from superset.utils.core import DatasourceType
 
 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
@@ -42,8 +42,8 @@ def load_random_time_series_data(
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        data = get_example_data("random_time_series.json.gz").decode("utf-8")
-        pdf = pd.read_json(data)
+        url = get_example_url("random_time_series.json.gz")
+        pdf = pd.read_json(url, compression="gzip")
         if database.backend == "presto":
            pdf.ds = pd.to_datetime(pdf.ds, unit="s")
            pdf.ds = pdf.ds.dt.strftime("%Y-%m-%d %H:%M:%S")
6 changes: 3 additions & 3 deletions superset/examples/sf_population_polygons.py
@@ -22,7 +22,7 @@
 import superset.utils.database as database_utils
 from superset import db
 
-from .helpers import get_example_data, get_table_connector_registry
+from .helpers import get_example_url, get_table_connector_registry
 
 
 def load_sf_population_polygons(
@@ -35,8 +35,8 @@ def load_sf_population_polygons(
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        data = get_example_data("sf_population.json.gz").decode("utf-8")
-        df = pd.read_json(data)
+        url = get_example_url("sf_population.json.gz")
+        df = pd.read_json(url, compression="gzip")
         df["contour"] = df.contour.map(json.dumps)
 
         df.to_sql(
6 changes: 3 additions & 3 deletions superset/examples/world_bank.py
@@ -33,7 +33,7 @@
 
 from ..connectors.base.models import BaseDatasource
 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_examples_folder,
     get_slice_json,
     get_table_connector_registry,
@@ -56,8 +56,8 @@ def load_world_bank_health_n_pop(  # pylint: disable=too-many-locals, too-many-statements
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        data = get_example_data("countries.json.gz").decode("utf-8")
-        pdf = pd.read_json(data)
+        url = get_example_url("countries.json.gz")
+        pdf = pd.read_json(url, compression="gzip")
         pdf.columns = [col.replace(".", "_") for col in pdf.columns]
         if database.backend == "presto":
             pdf.year = pd.to_datetime(pdf.year)
