Skip to content

Commit

Permalink
Adds ensure gunzip command for single files (#47)
Browse files Browse the repository at this point in the history
Closes #45

Co-authored-by: Max Berrendorf <berrendorf@dbs.ifi.lmu.de>
  • Loading branch information
cthoyt and mberr committed Jul 25, 2022
1 parent aa60314 commit 91da88f
Show file tree
Hide file tree
Showing 4 changed files with 212 additions and 2 deletions.
2 changes: 2 additions & 0 deletions src/pystow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@
ensure_excel,
ensure_from_google,
ensure_from_s3,
ensure_gunzip,
ensure_json,
ensure_open,
ensure_open_gz,
ensure_open_lzma,
ensure_open_sqlite,
ensure_open_sqlite_gz,
ensure_open_tarfile,
ensure_open_zip,
ensure_pickle,
Expand Down
92 changes: 91 additions & 1 deletion src/pystow/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,15 @@
"ensure_from_google",
# Downloader functions with postprocessing
"ensure_untar",
"ensure_gunzip",
# Downloader + opener functions
"ensure_open",
"ensure_open_gz",
"ensure_open_lzma",
"ensure_open_tarfile",
"ensure_open_zip",
"ensure_open_sqlite",
"ensure_open_sqlite_gz",
# Processors
"ensure_csv",
"ensure_custom",
Expand Down Expand Up @@ -285,6 +287,49 @@ def ensure_untar(
)


def ensure_gunzip(
    key: str,
    *subkeys: str,
    url: str,
    name: Optional[str] = None,
    force: bool = False,
    autoclean: bool = True,
    download_kwargs: Optional[Mapping[str, Any]] = None,
) -> Path:
    """Download a gzipped file into a module's directory and gunzip it.

    This is the functional counterpart of the ``ensure_gunzip`` method on
    :class:`Module`: it looks up the module for ``key`` and delegates to it.

    :param key:
        The name of the module. No funny characters. The envvar
        ``<key>_HOME`` where key is uppercased is checked first before using
        the default home directory.
    :param subkeys:
        A sequence of additional strings to join. If none are given,
        the module's own directory is used.
    :param url:
        The URL to download.
    :param name:
        Overrides the name of the file at the end of the URL, if given. Also
        useful for URLs that don't have proper filenames with extensions.
    :param force:
        Should the download be done again, even if the path already exists?
        Defaults to false.
    :param autoclean: Should the gzipped file be deleted after decompression?
    :param download_kwargs:
        Keyword arguments to pass through to :func:`pystow.utils.download`.
    :return:
        Whatever the module's ``ensure_gunzip`` returns, i.e. the path to
        the decompressed file.
    """
    module = Module.from_key(key, ensure_exists=True)
    # Collect the keyword-only options once so the delegation stays on one line.
    options = dict(
        url=url,
        name=name,
        force=force,
        autoclean=autoclean,
        download_kwargs=download_kwargs,
    )
    return module.ensure_gunzip(*subkeys, **options)


@contextmanager
def ensure_open(
key: str,
Expand Down Expand Up @@ -1458,11 +1503,56 @@ def ensure_open_sqlite(
>>> import pystow
>>> import pandas as pd
>>> url = "https://s3.amazonaws.com/bbop-sqlite/hp.db"
>>> sql = "SELECT * FROM entailed_edge LIMIT 10"
>>> with pystow.ensure_open_sqlite("test", url=url) as conn:
>>> df = pd.read_sql(" <query> ", conn)
>>> df = pd.read_sql(sql, conn)
"""
_module = Module.from_key(key, ensure_exists=True)
with _module.ensure_open_sqlite(
*subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs
) as yv:
yield yv


@contextmanager
def ensure_open_sqlite_gz(
    key: str,
    *subkeys: str,
    url: str,
    name: Optional[str] = None,
    force: bool = False,
    download_kwargs: Optional[Mapping[str, Any]] = None,
):
    """Ensure a gzipped SQLite database is available and open a connection to it.

    The module for ``key`` is looked up and its ``ensure_open_sqlite_gz``
    context manager is entered; the object it yields is yielded unchanged.

    :param key:
        The name of the module. No funny characters. The envvar
        `<key>_HOME` where key is uppercased is checked first before using
        the default home directory.
    :param subkeys:
        A sequence of additional strings to join. If none are given,
        the module's own directory is used.
    :param url:
        The URL to download.
    :param name:
        Overrides the name of the file at the end of the URL, if given. Also
        useful for URLs that don't have proper filenames with extensions.
    :param force:
        Should the download be done again, even if the path already exists?
        Defaults to false.
    :param download_kwargs:
        Keyword arguments to pass through to :func:`pystow.utils.download`.
    :yields: An instance of :class:`sqlite3.Connection` from :func:`sqlite3.connect`

    Example usage:

    >>> import pystow
    >>> import pandas as pd
    >>> url = "https://s3.amazonaws.com/bbop-sqlite/hp.db.gz"
    >>> sql = "SELECT * FROM entailed_edge LIMIT 10"
    >>> with pystow.ensure_open_sqlite_gz("test", url=url) as conn:
    >>>     df = pd.read_sql(sql, conn)
    """
    module = Module.from_key(key, ensure_exists=True)
    connection_manager = module.ensure_open_sqlite_gz(
        *subkeys,
        url=url,
        name=name,
        force=force,
        download_kwargs=download_kwargs,
    )
    # Stay inside the inner context for as long as the caller uses the connection.
    with connection_manager as connection:
        yield connection
97 changes: 96 additions & 1 deletion src/pystow/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@
from . import utils
from .constants import JSON, Opener, Provider
from .utils import (
base_from_gzip_name,
download_from_google,
download_from_s3,
get_base,
gunzip,
mkdir,
name_from_s3_key,
name_from_url,
Expand Down Expand Up @@ -263,6 +265,53 @@ def ensure_untar(
tar_file.extractall(unzipped_path, **(extract_kwargs or {}))
return unzipped_path

def ensure_gunzip(
    self,
    *subkeys: str,
    url: str,
    name: Optional[str] = None,
    force: bool = False,
    autoclean: bool = True,
    download_kwargs: Optional[Mapping[str, Any]] = None,
) -> Path:
    """Ensure a gzipped file is downloaded and gunzipped.

    If the decompressed file is already present and ``force`` is false, its
    path is returned immediately and nothing is downloaded.

    :param subkeys:
        A sequence of additional strings to join. If none are given,
        the directory for this module is used.
    :param url:
        The URL to download.
    :param name:
        Overrides the name of the file at the end of the URL, if given. Also
        useful for URLs that don't have proper filenames with extensions.
        The name is passed to ``base_from_gzip_name`` to derive the name of
        the decompressed file.
    :param force:
        Should the download be done again, even if the path already exists?
        Defaults to false.
    :param autoclean: Should the gzipped file be deleted after decompression?
    :param download_kwargs:
        Keyword arguments to pass through to :func:`pystow.utils.download`.
    :return: The path of the decompressed (gunzipped) file
    """
    archive_name = name_from_url(url) if name is None else name
    target_name = base_from_gzip_name(archive_name)
    target = self.join(*subkeys, name=target_name, ensure_exists=True)

    # Cache hit: the decompressed file is already there and no refresh was requested.
    if not force and target.is_file():
        return target

    archive = self.ensure(
        *subkeys,
        url=url,
        name=archive_name,
        force=force,
        download_kwargs=download_kwargs,
    )
    gunzip(archive, target)

    if autoclean:
        logger.info("removing original gzipped file %s", archive)
        archive.unlink()
    return target

@contextmanager
def ensure_open(
self,
Expand Down Expand Up @@ -1303,9 +1352,10 @@ def ensure_open_sqlite(
>>> import pystow
>>> import pandas as pd
>>> url = "https://s3.amazonaws.com/bbop-sqlite/hp.db"
>>> sql = "SELECT * FROM entailed_edge LIMIT 10"
>>> module = pystow.module("test")
>>> with module.ensure_open_sqlite(url=url) as conn:
>>> df = pd.read_sql(" <query> ", conn)
>>> df = pd.read_sql(sql, conn)
"""
import sqlite3

Expand All @@ -1315,6 +1365,51 @@ def ensure_open_sqlite(
with closing(sqlite3.connect(path.as_posix())) as conn:
yield conn

@contextmanager
def ensure_open_sqlite_gz(
    self,
    *subkeys: str,
    url: str,
    name: Optional[str] = None,
    force: bool = False,
    download_kwargs: Optional[Mapping[str, Any]] = None,
):
    """Ensure and connect to a SQLite database that's gzipped.

    Unfortunately, it's a paid feature to directly read gzipped sqlite files,
    so this automatically gunzips it first (via ``ensure_gunzip``) and then
    connects to the decompressed file. The connection is closed when the
    context exits.

    :param subkeys:
        A sequence of additional strings to join. If none are given,
        the directory for this module is used.
    :param url:
        The URL to download.
    :param name:
        Overrides the name of the file at the end of the URL, if given. Also
        useful for URLs that don't have proper filenames with extensions.
    :param force:
        Should the download be done again, even if the path already exists?
        Defaults to false.
    :param download_kwargs:
        Keyword arguments to pass through to :func:`pystow.utils.download`.
    :yields: An instance of :class:`sqlite3.Connection` from :func:`sqlite3.connect`

    Example usage:

    >>> import pystow
    >>> import pandas as pd
    >>> url = "https://s3.amazonaws.com/bbop-sqlite/hp.db.gz"
    >>> module = pystow.module("test")
    >>> sql = "SELECT * FROM entailed_edge LIMIT 10"
    >>> with module.ensure_open_sqlite_gz(url=url) as conn:
    >>>     df = pd.read_sql(sql, conn)
    """
    import sqlite3

    database_path = self.ensure_gunzip(
        *subkeys,
        url=url,
        name=name,
        force=force,
        download_kwargs=download_kwargs,
    )
    connection = sqlite3.connect(database_path.as_posix())
    # closing() guarantees conn.close() even if the caller's block raises.
    with closing(connection) as conn:
        yield conn


def _clean_csv_kwargs(read_csv_kwargs):
read_csv_kwargs = {} if read_csv_kwargs is None else dict(read_csv_kwargs)
Expand Down
23 changes: 23 additions & 0 deletions src/pystow/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
"get_np_io",
# LZMA utilities
"write_lzma_csv",
"gunzip",
# Zipfile utilities
"write_zipfile_csv",
"read_zipfile_csv",
Expand Down Expand Up @@ -351,6 +352,18 @@ def name_from_url(url: str) -> str:
return name


def base_from_gzip_name(name: str) -> str:
    """Strip the trailing ``.gz`` from a gzipped file's name.

    :param name: The name of the gz file
    :returns: The name with the final ``.gz`` removed
    :raises ValueError: if the name does not end with ".gz"
    """
    suffix = ".gz"
    if name.endswith(suffix):
        # Drop exactly one suffix from the end, e.g. "a.tsv.gz" -> "a.tsv".
        return name[: len(name) - len(suffix)]
    raise ValueError(f"Name does not end with .gz: {name}")


def name_from_s3_key(key: str) -> str:
"""Get the filename from the S3 key.
Expand Down Expand Up @@ -921,3 +934,13 @@ def path_to_sqlite(path: Union[str, Path]) -> str:
"""
path = Path(path).expanduser().resolve()
return f"sqlite:///{path.as_posix()}"


def gunzip(source: Union[str, Path], target: Union[str, Path]) -> None:
    """Decompress the gzipped file at ``source`` into ``target``.

    The data is first written to a temporary sibling of ``target`` (same
    directory, ``.part`` appended to the name) and moved into place only
    after decompression has finished. If reading ``source`` fails (e.g. it
    is missing, truncated, or not gzip data), the exception propagates and
    no file is created or modified at ``target``, so a failed run never
    leaves a truncated file that could be mistaken for a complete one.

    :param source: The path to an input (gzipped) file
    :param target: The path to an output (decompressed) file
    """
    target = Path(target)
    partial = target.parent / (target.name + ".part")
    try:
        with gzip.open(source, "rb") as in_file, partial.open("wb") as out_file:
            shutil.copyfileobj(in_file, out_file)
        # Same-directory rename; overwrites an existing target like open(..., "wb") did.
        partial.replace(target)
    finally:
        # Only still present when decompression or the rename failed.
        if partial.exists():
            partial.unlink()

0 comments on commit 91da88f

Please sign in to comment.