Skip to content

Commit

Permalink
Add ensure BZ2 (#55)
Browse files Browse the repository at this point in the history
* Add bz2 json support

* Update docs and interface

* Update test_module.py
  • Loading branch information
cthoyt committed Oct 31, 2022
1 parent d84f24d commit ebdc314
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 4 deletions.
2 changes: 2 additions & 0 deletions src/pystow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
ensure_from_s3,
ensure_gunzip,
ensure_json,
ensure_json_bz2,
ensure_open,
ensure_open_bz2,
ensure_open_gz,
ensure_open_lzma,
ensure_open_sqlite,
Expand Down
99 changes: 99 additions & 0 deletions src/pystow/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
# Downloader + opener functions
"ensure_open",
"ensure_open_gz",
"ensure_open_bz2",
"ensure_open_lzma",
"ensure_open_tarfile",
"ensure_open_zip",
Expand All @@ -56,6 +57,7 @@
"ensure_csv",
"ensure_custom",
"ensure_json",
"ensure_json_bz2",
"ensure_pickle",
"ensure_pickle_gz",
"ensure_excel",
Expand Down Expand Up @@ -573,6 +575,53 @@ def ensure_open_gz(
yield yv


@contextmanager
def ensure_open_bz2(
    key: str,
    *subkeys: str,
    url: str,
    name: Optional[str] = None,
    force: bool = False,
    download_kwargs: Optional[Mapping[str, Any]] = None,
    mode: str = "rb",
    open_kwargs: Optional[Mapping[str, Any]] = None,
) -> Opener:
    """Ensure a BZ2-compressed file is downloaded, then open it.

    This is the functional counterpart of the ``ensure_open_bz2`` method on
    the module object looked up from ``key``; all work is delegated to it.

    :param key:
        The name of the module. No funny characters. The envvar
        `<key>_HOME` where key is uppercased is checked first before using
        the default home directory.
    :param subkeys:
        A sequence of additional strings to join. If none are given,
        returns the directory for this module.
    :param url:
        The URL to download.
    :param name:
        Overrides the name of the file at the end of the URL, if given. Also
        useful for URLs that don't have proper filenames with extensions.
    :param force:
        Should the download be done again, even if the path already exists?
        Defaults to false.
    :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.
    :param mode: The read mode, passed to :func:`bz2.open`
    :param open_kwargs: Additional keyword arguments passed to :func:`bz2.open`

    :yields: An open file object
    """
    module = Module.from_key(key, ensure_exists=True)
    opener = module.ensure_open_bz2(
        *subkeys,
        url=url,
        name=name,
        force=force,
        download_kwargs=download_kwargs,
        mode=mode,
        open_kwargs=open_kwargs,
    )
    # Re-yield the handle so it is closed when the caller's ``with`` block exits.
    with opener as file:
        yield file


def ensure_csv(
key: str,
*subkeys: str,
Expand Down Expand Up @@ -702,6 +751,7 @@ def ensure_json(
name: Optional[str] = None,
force: bool = False,
download_kwargs: Optional[Mapping[str, Any]] = None,
open_kwargs: Optional[Mapping[str, Any]] = None,
json_load_kwargs: Optional[Mapping[str, Any]] = None,
) -> JSON:
"""Download JSON and open with :mod:`json`.
Expand All @@ -719,6 +769,7 @@ def ensure_json(
Should the download be done again, even if the path already exists?
Defaults to false.
:param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.
:param open_kwargs: Additional keyword arguments passed to :func:`open`
:param json_load_kwargs: Keyword arguments to pass through to :func:`json.load`.
:returns: A JSON object (list, dict, etc.)
Expand All @@ -735,6 +786,54 @@ def ensure_json(
name=name,
force=force,
download_kwargs=download_kwargs,
open_kwargs=open_kwargs,
json_load_kwargs=json_load_kwargs,
)


def ensure_json_bz2(
    key: str,
    *subkeys: str,
    url: str,
    name: Optional[str] = None,
    force: bool = False,
    download_kwargs: Optional[Mapping[str, Any]] = None,
    open_kwargs: Optional[Mapping[str, Any]] = None,
    json_load_kwargs: Optional[Mapping[str, Any]] = None,
) -> JSON:
    """Download BZ2-compressed JSON and open with :mod:`json`.

    All work is delegated to the ``ensure_json_bz2`` method of the module
    object looked up from ``key``.

    :param key: The module name
    :param subkeys:
        A sequence of additional strings to join. If none are given,
        returns the directory for this module.
    :param url:
        The URL to download.
    :param name:
        Overrides the name of the file at the end of the URL, if given. Also
        useful for URLs that don't have proper filenames with extensions.
    :param force:
        Should the download be done again, even if the path already exists?
        Defaults to false.
    :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.
    :param open_kwargs: Additional keyword arguments passed to :func:`bz2.open`
    :param json_load_kwargs: Keyword arguments to pass through to :func:`json.load`.
    :returns: A JSON object (list, dict, etc.)

    Example usage::

        >>> import pystow
        >>> url = 'https://github.com/hetio/hetionet/raw/master/hetnet/json/hetionet-v1.0.json.bz2'
        >>> hetionet = pystow.ensure_json_bz2('bio', 'hetionet', '1.0', url=url)
    """
    module = Module.from_key(key, ensure_exists=True)
    # Collect the keyword-only options once and forward them unchanged.
    forwarded = {
        "url": url,
        "name": name,
        "force": force,
        "download_kwargs": download_kwargs,
        "open_kwargs": open_kwargs,
        "json_load_kwargs": json_load_kwargs,
    }
    return module.ensure_json_bz2(*subkeys, **forwarded)

Expand Down
86 changes: 85 additions & 1 deletion src/pystow/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

"""Module implementation."""

import bz2
import gzip
import json
import logging
Expand Down Expand Up @@ -564,6 +565,44 @@ def ensure_open_gz(
with gzip.open(path, **open_kwargs) as file:
yield file

@contextmanager
def ensure_open_bz2(
    self,
    *subkeys: str,
    url: str,
    name: Optional[str] = None,
    force: bool = False,
    download_kwargs: Optional[Mapping[str, Any]] = None,
    mode: str = "rb",
    open_kwargs: Optional[Mapping[str, Any]] = None,
) -> Opener:
    """Ensure a BZ2-compressed file is downloaded, then open it with :func:`bz2.open`.

    :param subkeys:
        A sequence of additional strings to join. If none are given,
        returns the directory for this module.
    :param url:
        The URL to download.
    :param name:
        Overrides the name of the file at the end of the URL, if given. Also
        useful for URLs that don't have proper filenames with extensions.
    :param force:
        Should the download be done again, even if the path already exists?
        Defaults to false.
    :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.
    :param mode:
        The read mode, passed to :func:`bz2.open`. Only used when
        ``open_kwargs`` does not already contain a ``"mode"`` entry.
    :param open_kwargs: Additional keyword arguments passed to :func:`bz2.open`

    :yields: An open file object
    """
    local_path = self.ensure(
        *subkeys,
        url=url,
        name=name,
        force=force,
        download_kwargs=download_kwargs,
    )
    # Work on a copy so the caller's mapping is never mutated.
    bz2_kwargs = dict(open_kwargs) if open_kwargs is not None else {}
    # An explicit "mode" inside ``open_kwargs`` wins over the ``mode`` argument.
    if "mode" not in bz2_kwargs:
        bz2_kwargs["mode"] = mode
    with bz2.open(local_path, **bz2_kwargs) as bz2_file:
        yield bz2_file

def ensure_csv(
self,
*subkeys: str,
Expand Down Expand Up @@ -656,6 +695,7 @@ def ensure_json(
name: Optional[str] = None,
force: bool = False,
download_kwargs: Optional[Mapping[str, Any]] = None,
open_kwargs: Optional[Mapping[str, Any]] = None,
json_load_kwargs: Optional[Mapping[str, Any]] = None,
) -> JSON:
"""Download JSON and open with :mod:`json`.
Expand All @@ -672,11 +712,55 @@ def ensure_json(
Should the download be done again, even if the path already exists?
Defaults to false.
:param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.
:param open_kwargs: Additional keyword arguments passed to :func:`open`
:param json_load_kwargs: Keyword arguments to pass through to :func:`json.load`.
:returns: A JSON object (list, dict, etc.)
"""
with self.ensure_open(
*subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs
*subkeys,
url=url,
name=name,
force=force,
download_kwargs=download_kwargs,
open_kwargs=open_kwargs,
) as file:
return json.load(file, **(json_load_kwargs or {}))

def ensure_json_bz2(
    self,
    *subkeys: str,
    url: str,
    name: Optional[str] = None,
    force: bool = False,
    download_kwargs: Optional[Mapping[str, Any]] = None,
    open_kwargs: Optional[Mapping[str, Any]] = None,
    json_load_kwargs: Optional[Mapping[str, Any]] = None,
) -> JSON:
    """Download BZ2-compressed JSON and open with :mod:`json`.

    The file is opened through :meth:`ensure_open_bz2` (using that method's
    default mode unless ``open_kwargs`` supplies one) and the resulting
    handle is parsed with :func:`json.load`.

    :param subkeys:
        A sequence of additional strings to join. If none are given,
        returns the directory for this module.
    :param url:
        The URL to download.
    :param name:
        Overrides the name of the file at the end of the URL, if given. Also
        useful for URLs that don't have proper filenames with extensions.
    :param force:
        Should the download be done again, even if the path already exists?
        Defaults to false.
    :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.
    :param open_kwargs: Additional keyword arguments passed to :func:`bz2.open`
    :param json_load_kwargs: Keyword arguments to pass through to :func:`json.load`.
    :returns: A JSON object (list, dict, etc.)
    """
    with self.ensure_open_bz2(
        *subkeys,
        url=url,
        name=name,
        force=force,
        download_kwargs=download_kwargs,
        open_kwargs=open_kwargs,
    ) as file:
        # ``json_load_kwargs`` may be None; fall back to no extra options.
        return json.load(file, **(json_load_kwargs or {}))

Expand Down
26 changes: 23 additions & 3 deletions tests/test_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

"""Tests for PyStow."""

import bz2
import contextlib
import itertools as itt
import json
import lzma
import os
import pickle
Expand Down Expand Up @@ -44,6 +46,7 @@

JSON_NAME = "test_1.json"
JSON_URL = f"{n()}/{JSON_NAME}"
JSON_PATH = RESOURCES / JSON_NAME

PICKLE_NAME = "test_1.pkl"
PICKLE_URL = f"{n()}/{PICKLE_NAME}"
Expand All @@ -53,9 +56,15 @@
PICKLE_GZ_URL = f"{n()}/{PICKLE_GZ_NAME}"
PICKLE_GZ_PATH = RESOURCES / PICKLE_GZ_NAME

JSON_BZ2_NAME = "test_1.json.bz2"
JSON_BZ2_URL = f"{n()}/{JSON_BZ2_NAME}"
JSON_BZ2_PATH = RESOURCES / JSON_BZ2_NAME


MOCK_FILES: Mapping[str, Path] = {
TSV_URL: RESOURCES / TSV_NAME,
JSON_URL: RESOURCES / JSON_NAME,
JSON_URL: JSON_PATH,
JSON_BZ2_URL: JSON_BZ2_PATH,
PICKLE_URL: PICKLE_PATH,
PICKLE_GZ_URL: PICKLE_GZ_PATH,
SQLITE_URL: SQLITE_PATH,
Expand All @@ -67,6 +76,7 @@
("v2_1", "v2_2", "v2_3"),
]
TEST_DF = pd.DataFrame(TEST_TSV_ROWS)
TEST_JSON = {"key": "value"}

# Make the pickle file
if not PICKLE_PATH.is_file():
Expand All @@ -75,6 +85,13 @@
if not SQLITE_PATH.is_file():
write_sql(TEST_DF, name=SQLITE_TABLE, path=SQLITE_PATH, index=False)

# Create the plain JSON fixture on first use so it need not be checked in.
if not JSON_PATH.is_file():
    with JSON_PATH.open("w") as json_file:
        json.dump(TEST_JSON, json_file)

# Create the BZ2-compressed JSON fixture on first use, written in text mode.
if not JSON_BZ2_PATH.is_file():
    with bz2.open(JSON_BZ2_PATH, mode="wt") as bz2_file:
        bz2_file.write(json.dumps(TEST_JSON, indent=2))


class TestMocks(unittest.TestCase):
"""Tests for :mod:`pystow` mocks and context managers."""
Expand Down Expand Up @@ -191,8 +208,7 @@ def test_ensure(self):

with self.subTest(type="json"):
j = pystow.ensure_json("test", url=JSON_URL)
self.assertIn("key", j)
self.assertEqual("value", j["key"])
self.assertEqual(TEST_JSON, j)

j2 = pystow.load_json("test", name=JSON_NAME)
self.assertEqual(j, j2)
Expand All @@ -211,6 +227,10 @@ def test_ensure(self):
p2 = pystow.load_pickle_gz("test", name=PICKLE_GZ_NAME)
self.assertEqual(p, p2)

with self.subTest(type="json_bz2"):
p = pystow.ensure_json_bz2("test", url=JSON_BZ2_URL)
self.assertEqual(TEST_JSON, p)

def test_open_fail(self):
"""Test opening a missing file."""
with self.assertRaises(FileNotFoundError):
Expand Down

0 comments on commit ebdc314

Please sign in to comment.