cthoyt · cthoyt · Jul 25, 2022 · Jul 22, 2022 · Jul 25, 2022 · Jul 25, 2022
diff --git a/src/pystow/__init__.py b/src/pystow/__init__.py
@@ -14,11 +14,13 @@
     ensure_excel,
     ensure_from_google,
     ensure_from_s3,
+    ensure_gunzip,
     ensure_json,
     ensure_open,
     ensure_open_gz,
     ensure_open_lzma,
     ensure_open_sqlite,
+    ensure_open_sqlite_gz,
     ensure_open_tarfile,
     ensure_open_zip,
     ensure_pickle,

diff --git a/src/pystow/api.py b/src/pystow/api.py
@@ -43,13 +43,15 @@
     "ensure_from_google",
     # Downloader functions with postprocessing
     "ensure_untar",
+    "ensure_gunzip",
     # Downloader + opener functions
     "ensure_open",
     "ensure_open_gz",
     "ensure_open_lzma",
     "ensure_open_tarfile",
     "ensure_open_zip",
     "ensure_open_sqlite",
+    "ensure_open_sqlite_gz",
     # Processors
     "ensure_csv",
     "ensure_custom",
@@ -285,6 +287,49 @@ def ensure_untar(
     )
 
 
+def ensure_gunzip(
+    key: str,
+    *subkeys: str,
+    url: str,
+    name: Optional[str] = None,
+    force: bool = False,
+    autoclean: bool = True,
+    download_kwargs: Optional[Mapping[str, Any]] = None,
+) -> Path:
+    """Ensure a gzipped file is downloaded and decompressed.
+
+    :param key:
+        The name of the module. No funny characters. The envvar
+        `<key>_HOME` where key is uppercased is checked first before using
+        the default home directory.
+    :param subkeys:
+        A sequence of additional strings to join to the module's
+        directory when building the path of the file.
+    :param url:
+        The URL of the gzipped file to download.
+    :param name:
+        Overrides the name of the file at the end of the URL, if given. Also
+        useful for URLs that don't have proper filenames with extensions.
+    :param force:
+        Should the download be done again, even if the path already exists?
+        Defaults to false.
+    :param autoclean: Should the gzipped file be deleted after decompressing?
+    :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.
+    :return:
+        The path of the decompressed (gunzipped) file.
+    """
+    module = Module.from_key(key, ensure_exists=True)
+    gunzipped_path = module.ensure_gunzip(
+        *subkeys,
+        url=url,
+        name=name,
+        force=force,
+        autoclean=autoclean,
+        download_kwargs=download_kwargs,
+    )
+    return gunzipped_path
+
+
 @contextmanager
 def ensure_open(
     key: str,
@@ -1458,11 +1503,56 @@ def ensure_open_sqlite(
     >>> import pystow
     >>> import pandas as pd
     >>> url = "https://s3.amazonaws.com/bbop-sqlite/hp.db"
+    >>> sql = "SELECT * FROM entailed_edge LIMIT 10"
     >>> with pystow.ensure_open_sqlite("test", url=url) as conn:
-    >>>     df = pd.read_sql(" <query> ", conn)
+    >>>     df = pd.read_sql(sql, conn)
     """
     _module = Module.from_key(key, ensure_exists=True)
     with _module.ensure_open_sqlite(
         *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs
     ) as yv:
         yield yv
+
+
+@contextmanager
+def ensure_open_sqlite_gz(
+    key: str,
+    *subkeys: str,
+    url: str,
+    name: Optional[str] = None,
+    force: bool = False,
+    download_kwargs: Optional[Mapping[str, Any]] = None,
+):
+    """Ensure a gzipped SQLite database is downloaded, then connect to it.
+
+    :param key:
+        The name of the module. No funny characters. The envvar
+        `<key>_HOME` where key is uppercased is checked first before using
+        the default home directory.
+    :param subkeys:
+        A sequence of additional strings to join to the module's
+        directory when building the path of the database file.
+    :param url: The URL of the gzipped SQLite database to download.
+    :param name:
+        Overrides the name of the file at the end of the URL, if given. Also
+        useful for URLs that don't have proper filenames with extensions.
+    :param force:
+        Should the download be done again, even if the path already exists?
+        Defaults to false.
+    :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.
+    :yields: An instance of :class:`sqlite3.Connection` from :func:`sqlite3.connect`
+
+    Example usage:
+    >>> import pystow
+    >>> import pandas as pd
+    >>> url = "https://s3.amazonaws.com/bbop-sqlite/hp.db.gz"
+    >>> sql = "SELECT * FROM entailed_edge LIMIT 10"
+    >>> with pystow.ensure_open_sqlite_gz("test", url=url) as conn:
+    >>>     df = pd.read_sql(sql, conn)
+    """
+    module = Module.from_key(key, ensure_exists=True)
+    connection_manager = module.ensure_open_sqlite_gz(
+        *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs
+    )
+    with connection_manager as connection:
+        yield connection
diff --git a/src/pystow/impl.py b/src/pystow/impl.py
@@ -16,9 +16,11 @@
 from . import utils
 from .constants import JSON, Opener, Provider
 from .utils import (
+    base_from_gzip_name,
     download_from_google,
     download_from_s3,
     get_base,
+    gunzip,
     mkdir,
     name_from_s3_key,
     name_from_url,
@@ -263,6 +265,53 @@ def ensure_untar(
             tar_file.extractall(unzipped_path, **(extract_kwargs or {}))
         return unzipped_path
 
+    def ensure_gunzip(
+        self,
+        *subkeys: str,
+        url: str,
+        name: Optional[str] = None,
+        force: bool = False,
+        autoclean: bool = True,
+        download_kwargs: Optional[Mapping[str, Any]] = None,
+    ) -> Path:
+        """Ensure a gzipped file is downloaded and decompressed.
+
+        :param subkeys:
+            A sequence of additional strings to join to the module's
+            directory when building the path of the file.
+        :param url:
+            The URL of the gzipped file to download.
+        :param name:
+            Overrides the name of the file at the end of the URL, if given. Also
+            useful for URLs that don't have proper filenames with extensions.
+            The name (given or from the URL) has to end with ``.gz``.
+        :param force:
+            Should the download and decompression be done again, even if the
+            decompressed file already exists? Defaults to false.
+        :param autoclean:
+            Should the gzipped file be deleted after it has been decompressed?
+        :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.
+        :return:
+            The path of the decompressed file, whose name is the name of the
+            gzipped file without the ``.gz`` ending.
+        """
+        gz_name = name_from_url(url) if name is None else name
+        target = self.join(*subkeys, name=base_from_gzip_name(gz_name), ensure_exists=True)
+        # Reuse a previously decompressed file unless a fresh download was requested
+        if not target.is_file() or force:
+            gz_path = self.ensure(
+                *subkeys,
+                url=url,
+                name=gz_name,
+                force=force,
+                download_kwargs=download_kwargs,
+            )
+            gunzip(gz_path, target)
+            if autoclean:
+                logger.info("removing original gzipped file %s", gz_path)
+                gz_path.unlink()
+        return target
+
     @contextmanager
     def ensure_open(
         self,
@@ -1303,9 +1352,10 @@ def ensure_open_sqlite(
         >>> import pystow
         >>> import pandas as pd
         >>> url = "https://s3.amazonaws.com/bbop-sqlite/hp.db"
+        >>> sql = "SELECT * FROM entailed_edge LIMIT 10"
         >>> module = pystow.module("test")
         >>> with module.ensure_open_sqlite(url=url) as conn:
-        >>>     df = pd.read_sql(" <query> ", conn)
+        >>>     df = pd.read_sql(sql, conn)
         """
         import sqlite3
 
@@ -1315,6 +1365,51 @@ def ensure_open_sqlite(
         with closing(sqlite3.connect(path.as_posix())) as conn:
             yield conn
 
+    @contextmanager
+    def ensure_open_sqlite_gz(
+        self,
+        *subkeys: str,
+        url: str,
+        name: Optional[str] = None,
+        force: bool = False,
+        download_kwargs: Optional[Mapping[str, Any]] = None,
+    ):
+        """Ensure a gzipped SQLite database is downloaded, then connect to it.
+
+        Unfortunately, directly reading gzipped SQLite files is a paid feature,
+        so the download is gunzipped on disk first and the result is opened.
+
+        :param subkeys:
+            A sequence of additional strings to join to the module's
+            directory when building the path of the database file.
+        :param url: The URL of the gzipped SQLite database to download.
+        :param name:
+            Overrides the name of the file at the end of the URL, if given. Also
+            useful for URLs that don't have proper filenames with extensions.
+        :param force:
+            Should the download be done again, even if the path already exists?
+            Defaults to false.
+        :param download_kwargs: Keyword arguments to pass through to :func:`pystow.utils.download`.
+        :yields: An instance of :class:`sqlite3.Connection` from :func:`sqlite3.connect`
+
+        Example usage:
+        >>> import pystow
+        >>> import pandas as pd
+        >>> url = "https://s3.amazonaws.com/bbop-sqlite/hp.db.gz"
+        >>> module = pystow.module("test")
+        >>> sql = "SELECT * FROM entailed_edge LIMIT 10"
+        >>> with module.ensure_open_sqlite_gz(url=url) as conn:
+        >>>     df = pd.read_sql(sql, conn)
+        """
+        import sqlite3
+
+        database_path = self.ensure_gunzip(
+            *subkeys, url=url, name=name, force=force, download_kwargs=download_kwargs
+        )
+        connection = sqlite3.connect(database_path.as_posix())
+        with closing(connection) as conn:
+            yield conn
+
 
 def _clean_csv_kwargs(read_csv_kwargs):
     read_csv_kwargs = {} if read_csv_kwargs is None else dict(read_csv_kwargs)

diff --git a/src/pystow/utils.py b/src/pystow/utils.py
@@ -57,6 +57,7 @@
     "get_np_io",
     # LZMA utilities
     "write_lzma_csv",
+    "gunzip",
     # Zipfile utilities
     "write_zipfile_csv",
     "read_zipfile_csv",
@@ -351,6 +352,18 @@ def name_from_url(url: str) -> str:
     return name
 
 
+def base_from_gzip_name(name: str) -> str:
+    """Strip the trailing ``.gz`` from the name of a gzipped file.
+
+    :param name: The name of the gzipped file, e.g., ``hp.db.gz``
+    :returns: The name without its final ``.gz``, e.g., ``hp.db``
+    :raises ValueError: if the name does not end with ".gz"
+    """
+    if name.endswith(".gz"):
+        return name[: -len(".gz")]
+    raise ValueError(f"Name does not end with .gz: {name}")
+
+
 def name_from_s3_key(key: str) -> str:
     """Get the filename from the S3 key.
 
@@ -921,3 +934,13 @@ def path_to_sqlite(path: Union[str, Path]) -> str:
     """
     path = Path(path).expanduser().resolve()
     return f"sqlite:///{path.as_posix()}"
+
+
+def gunzip(source: Union[str, Path], target: Union[str, Path]) -> None:
+    """Decompress the gzipped file at the source path into the target path.
+
+    :param source: The path to the gzipped input file
+    :param target: The path to the output file, which is overwritten if it already exists
+    """
+    with gzip.open(source, "rb") as compressed, open(target, "wb") as decompressed:
+        shutil.copyfileobj(compressed, decompressed)