Simplify pd.read_json() and pd.read_csv() for example data
EugeneTorap committed Aug 16, 2022
1 parent c67d43c commit 213bf79
Showing 12 changed files with 38 additions and 51 deletions.
6 changes: 3 additions & 3 deletions superset/examples/bart_lines.py
@@ -23,7 +23,7 @@
 from superset import db
 
 from ..utils.database import get_example_database
-from .helpers import get_example_data, get_table_connector_registry
+from .helpers import get_example_url, get_table_connector_registry
 
 
 def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None:
@@ -34,8 +34,8 @@ def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        content = get_example_data("bart-lines.json.gz").decode("utf-8")
-        df = pd.read_json(content, encoding="latin-1")
+        url = get_example_url("bart-lines.json.gz")
+        df = pd.read_json(url, encoding="latin-1", compression="gzip")
         df["path_json"] = df.path.map(json.dumps)
         df["polyline"] = df.path.map(polyline.encode)
         del df["path"]
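Worth noting about the new pattern (an observation, not something stated in the diff): the explicit compression="gzip" is needed because pandas' default compression="infer" keys off the filename extension, and the ?raw=true query string means the URL no longer ends in ".gz". A minimal sketch with a hypothetical URL:

```python
import pandas as pd

# Hypothetical URL shaped like the ones get_example_url() returns.
url = "https://example.com/data/bart-lines.json.gz?raw=true"

# compression="infer" (the default) inspects the trailing extension; the
# "?raw=true" suffix hides ".gz", so the gzip layer must be named explicitly.
df = pd.read_json(url, encoding="latin-1", compression="gzip")
```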
5 changes: 3 additions & 2 deletions superset/examples/birth_names.py
@@ -33,7 +33,7 @@
 
 from ..utils.database import get_example_database
 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
@@ -66,7 +66,8 @@ def gen_filter(
 
 
 def load_data(tbl_name: str, database: Database, sample: bool = False) -> None:
-    pdf = pd.read_json(get_example_data("birth_names2.json.gz").decode("utf-8"))
+    url = get_example_url("birth_names2.json.gz")
+    pdf = pd.read_json(url, compression="gzip")
     # TODO(bkyryliuk): move load examples data into the pytest fixture
     if database.backend == "presto":
         pdf.ds = pd.to_datetime(pdf.ds, unit="ms")
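In the Presto branch, ds is serialized as integer epoch milliseconds, and pd.to_datetime(..., unit="ms") converts it back to timestamps. A standalone illustration (the sample value is made up):

```python
import pandas as pd

# ds as epoch milliseconds, mimicking the birth_names fixture.
pdf = pd.DataFrame({"ds": [1565913600000]})
pdf.ds = pd.to_datetime(pdf.ds, unit="ms")
print(pdf.ds.iloc[0])  # 2019-08-16 00:00:00
```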
8 changes: 3 additions & 5 deletions superset/examples/country_map.py
@@ -27,7 +27,7 @@
 from superset.utils.core import DatasourceType
 
 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
@@ -44,10 +44,8 @@ def load_country_map_data(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        csv_bytes = get_example_data(
-            "birth_france_data_for_country_map.csv", is_gzip=False, make_bytes=True
-        )
-        data = pd.read_csv(csv_bytes, encoding="utf-8")
+        url = get_example_url("birth_france_data_for_country_map.csv")
+        data = pd.read_csv(url, encoding="utf-8")
         data["dttm"] = datetime.datetime.now().date()
         data.to_sql(
             tbl_name,
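Note that this is the one uncompressed file in the set: the plain .csv has no gzip layer, so unlike the other callers, pd.read_csv(url, encoding="utf-8") needs no compression argument.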
6 changes: 3 additions & 3 deletions superset/examples/energy.py
@@ -28,7 +28,7 @@
 from superset.utils.core import DatasourceType
 
 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_table_connector_registry,
     merge_slice,
     misc_dash_slices,
@@ -46,8 +46,8 @@ def load_energy(
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        data = get_example_data("energy.json.gz").decode("utf-8")
-        pdf = pd.read_json(data)
+        url = get_example_url("energy.json.gz")
+        pdf = pd.read_json(url, compression="gzip")
         pdf = pdf.head(100) if sample else pdf
         pdf.to_sql(
             tbl_name,
10 changes: 5 additions & 5 deletions superset/examples/flights.py
@@ -20,7 +20,7 @@
 import superset.utils.database as database_utils
 from superset import db
 
-from .helpers import get_example_data, get_table_connector_registry
+from .helpers import get_example_url, get_table_connector_registry
 
 
 def load_flights(only_metadata: bool = False, force: bool = False) -> None:
@@ -32,12 +32,12 @@ def load_flights(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        data = get_example_data("flight_data.csv.gz", make_bytes=True)
-        pdf = pd.read_csv(data, encoding="latin-1")
+        flight_data_url = get_example_url("flight_data.csv.gz")
+        pdf = pd.read_csv(flight_data_url, encoding="latin-1", compression="gzip")
 
         # Loading airports info to join and get lat/long
-        airports_bytes = get_example_data("airports.csv.gz", make_bytes=True)
-        airports = pd.read_csv(airports_bytes, encoding="latin-1")
+        airports_url = get_example_url("airports.csv.gz")
+        airports = pd.read_csv(airports_url, encoding="latin-1", compression="gzip")
         airports = airports.set_index("IATA_CODE")
 
         pdf[  # pylint: disable=unsupported-assignment-operation,useless-suppression
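The airports frame is indexed by IATA_CODE so each flight row can be joined against it to pick up latitude/longitude; the join itself falls outside this hunk. A hedged sketch of the general pattern, with illustrative column names and values:

```python
import pandas as pd

# Toy stand-ins for the real CSVs; these column names are illustrative.
pdf = pd.DataFrame({"ORIGIN_AIRPORT": ["SFO", "JFK"]})
airports = pd.DataFrame(
    {
        "IATA_CODE": ["SFO", "JFK"],
        "LATITUDE": [37.62, 40.64],
        "LONGITUDE": [-122.37, -73.78],
    }
).set_index("IATA_CODE")

# join() matches the ORIGIN_AIRPORT column against the airports index.
pdf = pdf.join(airports, on="ORIGIN_AIRPORT")
```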
18 changes: 3 additions & 15 deletions superset/examples/helpers.py
@@ -17,10 +17,7 @@
 """Loads datasets, dashboards and slices in a new superset instance"""
 import json
 import os
-import zlib
-from io import BytesIO
-from typing import Union, Any, Dict, List, Set
-from urllib import request
+from typing import Any, Dict, List, Set
 
 from superset import app, db
 from superset.connectors.sqla.models import SqlaTable
@@ -73,14 +70,5 @@ def get_slice_json(defaults: Dict[Any, Any], **kwargs: Any) -> str:
     return json.dumps(defaults_copy, indent=4, sort_keys=True)
 
 
-def get_example_data(
-    filepath: str, is_gzip: bool = True, make_bytes: bool = False
-) -> Union[bytes, BytesIO]:
-    content = request.urlopen(  # pylint: disable=consider-using-with
-        f"{BASE_URL}{filepath}?raw=true"
-    ).read()
-    if is_gzip:
-        content = zlib.decompress(content, zlib.MAX_WBITS | 16)
-    if make_bytes:
-        content = BytesIO(content)
-    return content
+def get_example_url(filepath: str) -> str:
+    return f"{BASE_URL}{filepath}?raw=true"
6 changes: 3 additions & 3 deletions superset/examples/long_lat.py
@@ -27,7 +27,7 @@
 from superset.utils.core import DatasourceType
 
 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
@@ -44,8 +44,8 @@ def load_long_lat_data(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        data = get_example_data("san_francisco.csv.gz", make_bytes=True)
-        pdf = pd.read_csv(data, encoding="utf-8")
+        url = get_example_url("san_francisco.csv.gz")
+        pdf = pd.read_csv(url, encoding="utf-8", compression="gzip")
         start = datetime.datetime.now().replace(
             hour=0, minute=0, second=0, microsecond=0
         )
6 changes: 3 additions & 3 deletions superset/examples/multiformat_time_series.py
@@ -25,7 +25,7 @@
 
 from ..utils.database import get_example_database
 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
@@ -44,8 +44,8 @@ def load_multiformat_time_series(  # pylint: disable=too-many-locals
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        data = get_example_data("multiformat_time_series.json.gz").decode("utf-8")
-        pdf = pd.read_json(data)
+        url = get_example_url("multiformat_time_series.json.gz")
+        pdf = pd.read_json(url, compression="gzip")
         # TODO(bkyryliuk): move load examples data into the pytest fixture
         if database.backend == "presto":
             pdf.ds = pd.to_datetime(pdf.ds, unit="s")
6 changes: 3 additions & 3 deletions superset/examples/paris.py
@@ -22,7 +22,7 @@
 import superset.utils.database as database_utils
 from superset import db
 
-from .helpers import get_example_data, get_table_connector_registry
+from .helpers import get_example_url, get_table_connector_registry
 
 
 def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) -> None:
@@ -33,8 +33,8 @@ def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        data = get_example_data("paris_iris.json.gz").decode("utf-8")
-        df = pd.read_json(data)
+        url = get_example_url("paris_iris.json.gz")
+        df = pd.read_json(url, compression="gzip")
         df["features"] = df.features.map(json.dumps)
 
         df.to_sql(
6 changes: 3 additions & 3 deletions superset/examples/random_time_series.py
@@ -24,7 +24,7 @@
 from superset.utils.core import DatasourceType
 
 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
@@ -42,8 +42,8 @@ def load_random_time_series_data(
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        data = get_example_data("random_time_series.json.gz").decode("utf-8")
-        pdf = pd.read_json(data)
+        url = get_example_url("random_time_series.json.gz")
+        pdf = pd.read_json(url, compression="gzip")
         if database.backend == "presto":
            pdf.ds = pd.to_datetime(pdf.ds, unit="s")
            pdf.ds = pdf.ds.dt.strftime("%Y-%m-%d %H:%M:%S")
6 changes: 3 additions & 3 deletions superset/examples/sf_population_polygons.py
@@ -22,7 +22,7 @@
 import superset.utils.database as database_utils
 from superset import db
 
-from .helpers import get_example_data, get_table_connector_registry
+from .helpers import get_example_url, get_table_connector_registry
 
 
 def load_sf_population_polygons(
@@ -35,8 +35,8 @@ def load_sf_population_polygons(
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        data = get_example_data("sf_population.json.gz").decode("utf-8")
-        df = pd.read_json(data)
+        url = get_example_url("sf_population.json.gz")
+        df = pd.read_json(url, compression="gzip")
         df["contour"] = df.contour.map(json.dumps)
 
         df.to_sql(
6 changes: 3 additions & 3 deletions superset/examples/world_bank.py
@@ -33,7 +33,7 @@
 
 from ..connectors.base.models import BaseDatasource
 from .helpers import (
-    get_example_data,
+    get_example_url,
     get_examples_folder,
     get_slice_json,
     get_table_connector_registry,
@@ -56,8 +56,8 @@ def load_world_bank_health_n_pop(  # pylint: disable=too-many-locals, too-many-statements
     table_exists = database.has_table_by_name(tbl_name)
 
     if not only_metadata and (not table_exists or force):
-        data = get_example_data("countries.json.gz").decode("utf-8")
-        pdf = pd.read_json(data)
+        url = get_example_url("countries.json.gz")
+        pdf = pd.read_json(url, compression="gzip")
         pdf.columns = [col.replace(".", "_") for col in pdf.columns]
         if database.backend == "presto":
             pdf.year = pd.to_datetime(pdf.year)
