✨ PyogrioReaderIterDataPipe for reading vector OGR files (#19)

An iterable-style DataPipe for vector data! Also added Python 3.8 job to CI build matrix which doesn't include 'vector' dependencies. That job is also skipped when PR is in draft mode. * ➕ Add pyogrio[geopandas] Vectorized vector I/O using OGR! * ✨ PyogrioReaderIterDataPipe for reading vector OGR files An iterable-style DataPipe for vector data! Uses pyogrio with geopandas for the I/O. Included a doctest and unit test, added a new section in the API docs and some more intersphinx mappings. * 👷 Run tests with optional packages on Python 3.9 Making a proper build matrix now! Minimal tests (no optional dependencies) run on Python 3.8, while full tests (with all dependencies) run on Python 3.9. Wanted to do Python 3.10 for full tests, but need to wait for rasterio 1.3.0 to come out of beta first. * 🚩 Skip Ubuntu Python 3.8 CI tests for draft PRs Conserve GitHub Actions Continuous Integration resources when a Pull Request is in draft mode.
weiji14 · Jun 9, 2022 · f1f7652 · f1f7652
1 parent 7f28f95
commit f1f7652
Show file tree

Hide file tree

Showing 8 changed files with 372 additions and 3 deletions.
diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
@@ -7,6 +7,7 @@ on:
   push:
     branches: [ "main" ]
   pull_request:
+    types: [opened, ready_for_review, reopened, synchronize]
     branches: [ "main" ]
 
 permissions:
@@ -19,8 +20,20 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9"]
+        python-version: ["3.8", "3.9"]
         os: [ubuntu-22.04]
+        # Is it a draft Pull Request (true or false)?
+        isDraft:
+          - ${{ github.event.pull_request.draft }}
+        # Exclude Ubuntu + Python 3.8 job for draft PRs
+        exclude:
+          - python-version: '3.8'
+            isDraft: true
+        # Only install optional packages on Ubuntu-22.04/Python 3.9
+        include:
+          - os: 'ubuntu-22.04'
+            python-version: '3.9'
+            extra-packages: '--extras vector'
 
     steps:
       # Checkout current git repository
@@ -37,7 +50,7 @@ jobs:
       - name: Install Poetry python dependencies
         run: |
           pip install poetry==1.2.0b2
-          poetry install
+          poetry install ${{ matrix.extra-packages }}
           poetry self add poetry-dynamic-versioning-plugin
           poetry show
 

diff --git a/docs/_config.yml b/docs/_config.yml
@@ -32,6 +32,12 @@ sphinx:
     autodoc_typehints: 'description'
     html_show_copyright: false
     intersphinx_mapping:
+      geopandas:
+        - 'https://geopandas.org/en/latest/'
+        - null
+      pyogrio:
+        - 'https://pyogrio.readthedocs.io/en/latest/'
+        - null
       python:
         - 'https://docs.python.org/3/'
         - null

diff --git a/docs/api.md b/docs/api.md
@@ -15,3 +15,12 @@
 .. autoclass:: zen3geo.datapipes.rioxarray.RioXarrayReaderIterDataPipe
     :show-inheritance:
 ```
+
+### Pyogrio
+
+```{eval-rst}
+.. automodule:: zen3geo.datapipes.pyogrio
+.. autoclass:: zen3geo.datapipes.PyogrioReader
+.. autoclass:: zen3geo.datapipes.pyogrio.PyogrioReaderIterDataPipe
+    :show-inheritance:
+```
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -21,6 +21,7 @@ classifiers = [
 python = "^3.8"
 rioxarray = ">=0.10.0"
 torchdata = ">=0.3.0"
+pyogrio = {version = ">=0.4.0a1", extras = ["geopandas"], optional = true}
 # Docs
 jupyter-book = {version="*", optional=true}
 planetary-computer = {version="*", optional=true}
@@ -36,6 +37,7 @@ docs = [
     "planetary-computer",
     "pystac"
 ]
+vector = ["pyogrio"]
 
 [tool.poetry-dynamic-versioning]
 bump = true

diff --git a/zen3geo/datapipes/__init__.py b/zen3geo/datapipes/__init__.py
@@ -2,4 +2,5 @@
 Iterable-style DataPipes for geospatial raster 🌈 and vector 🚏 data.
 """
 
+from zen3geo.datapipes.pyogrio import PyogrioReaderIterDataPipe as PyogrioReader
 from zen3geo.datapipes.rioxarray import RioXarrayReaderIterDataPipe as RioXarrayReader
diff --git a/zen3geo/datapipes/pyogrio.py b/zen3geo/datapipes/pyogrio.py
@@ -0,0 +1,99 @@
+"""
+DataPipes for :doc:`pyogrio <pyogrio:index>`.
+"""
+from typing import Any, Dict, Iterator, Optional, Tuple
+
+# pyogrio is an optional dependency (installed via the 'vector' extra), so a
+# failed import is tolerated here and reported lazily when the DataPipe is
+# instantiated. Only ImportError (which includes ModuleNotFoundError) is
+# caught, so that KeyboardInterrupt/SystemExit and unrelated errors propagate.
+try:
+    import pyogrio
+except ImportError:
+    pyogrio = None
+from torchdata.datapipes import functional_datapipe
+from torchdata.datapipes.iter import IterDataPipe
+from torchdata.datapipes.utils import StreamWrapper
+
+
+@functional_datapipe("read_from_pyogrio")
+class PyogrioReaderIterDataPipe(IterDataPipe[Tuple[str, StreamWrapper]]):
+    """
+    Takes vector files (e.g. FlatGeoBuf, GeoPackage, GeoJSON) from local disk
+    or URLs (as long as they can be read by pyogrio) and yields tuples of
+    filename and :py:class:`geopandas.GeoDataFrame` objects
+    (functional name: ``read_from_pyogrio``).
+
+    Based on
+    https://github.com/pytorch/data/blob/v0.3.0/torchdata/datapipes/iter/load/iopath.py#L37-L83
+
+    Parameters
+    ----------
+    source_datapipe : IterDataPipe[str]
+        A DataPipe that contains filepaths or URL links to vector files such as
+        FlatGeoBuf, GeoPackage, GeoJSON, etc.
+
+    kwargs : Optional
+        Extra keyword arguments to pass to
+        `pyogrio.read_dataframe <https://pyogrio.readthedocs.io/en/latest/api.html#geopandas-integration>`_.
+
+    Yields
+    ------
+    stream_obj : Tuple[str, geopandas.GeoDataFrame]
+        A tuple consisting of the filename that was passed in, and a
+        :py:class:`geopandas.GeoDataFrame` object containing the vector data.
+        The GeoDataFrame is wrapped in a
+        ``torchdata.datapipes.utils.StreamWrapper`` (see the ``StreamWrapper<``
+        prefix in the example output below).
+
+    Raises
+    ------
+    ModuleNotFoundError
+        If ``pyogrio`` is not installed. See
+        :doc:`install instructions for pyogrio <pyogrio:install>`, and ensure
+        that ``geopandas`` is installed too (e.g. via
+        ``pip install pyogrio[geopandas]``) before using this class.
+
+    Example
+    -------
+    >>> import pytest
+    >>> pyogrio = pytest.importorskip("pyogrio")
+    ...
+    >>> from torchdata.datapipes.iter import IterableWrapper
+    >>> from zen3geo.datapipes import PyogrioReader
+    ...
+    >>> # Read in vector (GeoPackage) data using DataPipe
+    >>> file_url: str = "https://github.com/geopandas/pyogrio/raw/v0.4.0a1/pyogrio/tests/fixtures/test_gpkg_nulls.gpkg"
+    >>> dp = IterableWrapper(iterable=[file_url])
+    >>> dp_pyogrio = dp.read_from_pyogrio()
+    ...
+    >>> # Loop or iterate over the DataPipe stream
+    >>> it = iter(dp_pyogrio)
+    >>> filename, geodataframe = next(it)
+    >>> filename
+    'https://github.com/geopandas/pyogrio/raw/v0.4.0a1/pyogrio/tests/fixtures/test_gpkg_nulls.gpkg'
+    >>> geodataframe
+    StreamWrapper<   col_bool  col_int8  ...  col_float64                 geometry
+    0       1.0       1.0  ...          1.5  POINT (0.00000 0.00000)
+    1       0.0       2.0  ...          2.5  POINT (1.00000 1.00000)
+    2       1.0       3.0  ...          3.5  POINT (2.00000 2.00000)
+    3       NaN       NaN  ...          NaN  POINT (4.00000 4.00000)
+    <BLANKLINE>
+    [4 rows x 12 columns]>
+    """
+
+    def __init__(self, source_datapipe: IterDataPipe[str], **kwargs: Any) -> None:
+        # The module-level import guard sets `pyogrio` to None when the
+        # optional dependency is missing; fail early with install hints.
+        if pyogrio is None:
+            raise ModuleNotFoundError(
+                "Package `pyogrio` is required to be installed to use this datapipe. "
+                "Please use `pip install pyogrio[geopandas]` or "
+                "`conda install -c conda-forge pyogrio` "
+                "to install the package"
+            )
+        self.source_datapipe: IterDataPipe[str] = source_datapipe
+        # Stored as-is and forwarded to every `pyogrio.read_dataframe` call.
+        self.kwargs = kwargs
+
+    def __iter__(self) -> Iterator[Tuple[str, StreamWrapper]]:
+        # Files are read lazily, one per iteration step of the source DataPipe.
+        for filename in self.source_datapipe:
+            yield (
+                filename,
+                StreamWrapper(pyogrio.read_dataframe(filename, **self.kwargs)),
+            )
+
+    def __len__(self) -> int:
+        # One output item per input filename, so defer to the source DataPipe.
+        return len(self.source_datapipe)
diff --git a/zen3geo/tests/test_datapipes_pyogrio.py b/zen3geo/tests/test_datapipes_pyogrio.py
@@ -0,0 +1,32 @@
+"""
+Tests for pyogrio datapipes.
+"""
+import pytest
+from torchdata.datapipes.iter import IterableWrapper
+
+from zen3geo.datapipes import PyogrioReader
+
+pyogrio = pytest.importorskip("pyogrio")
+
+# %%
+def test_pyogrio_reader():
+    """
+    Ensure that PyogrioReader works to read in a GeoPackage file and outputs a
+    tuple made up of a filename and a geopandas.GeoDataFrame object (wrapped
+    in a StreamWrapper).
+    """
+    file_url: str = "https://github.com/geopandas/pyogrio/raw/v0.4.0a1/pyogrio/tests/fixtures/test_gpkg_nulls.gpkg"
+    dp = IterableWrapper(iterable=[file_url])
+
+    # Using class constructors
+    dp_pyogrio = PyogrioReader(source_datapipe=dp)
+    # Using functional form (recommended)
+    dp_pyogrio = dp.read_from_pyogrio()
+
+    assert len(dp_pyogrio) == 1
+    it = iter(dp_pyogrio)
+    filename, geodataframe = next(it)
+
+    assert isinstance(filename, str)
+    assert geodataframe.shape == (4, 12)
+    # Reduce over both axes: the builtin `any(df.isna())` would iterate over
+    # the column labels (always truthy) rather than the null mask itself.
+    assert geodataframe.isna().any().any()
+    assert (geodataframe.geom_type == "Point").all()