Skip to content

Commit

Permalink
✨ PyogrioReaderIterDataPipe for reading vector OGR files (#19)
Browse files Browse the repository at this point in the history
An iterable-style DataPipe for vector data! Also added Python 3.8 job to CI build matrix which doesn't include 'vector' dependencies. That job is also skipped when PR is in draft mode.

* ➕ Add pyogrio[geopandas]

Vectorized vector I/O using OGR!

* ✨ PyogrioReaderIterDataPipe for reading vector OGR files

An iterable-style DataPipe for vector data! Uses pyogrio with geopandas for the I/O. Included a doctest and unit test, added a new section in the API docs and some more intersphinx mappings.

* 👷 Run tests with optional packages on Python 3.9

Making a proper build matrix now! Minimal tests (no optional dependencies) run on Python 3.8, while full tests (with all dependencies) run on Python 3.9.

Wanted to do Python 3.10 for full tests, but need to wait for rasterio 1.3.0 to come out of beta first.

* 🚩 Skip Ubuntu Python 3.8 CI tests for draft PRs

Conserve GitHub Actions Continuous Integration resources when a Pull Request is in draft mode.
  • Loading branch information
weiji14 committed Jun 9, 2022
1 parent 7f28f95 commit f1f7652
Show file tree
Hide file tree
Showing 8 changed files with 372 additions and 3 deletions.
17 changes: 15 additions & 2 deletions .github/workflows/ci-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ on:
push:
branches: [ "main" ]
pull_request:
types: [opened, ready_for_review, reopened, synchronize]
branches: [ "main" ]

permissions:
Expand All @@ -19,8 +20,20 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.9"]
python-version: ["3.8", "3.9"]
os: [ubuntu-22.04]
# Is it a draft Pull Request (true or false)?
isDraft:
- ${{ github.event.pull_request.draft }}
# Exclude Ubuntu + Python 3.8 job for draft PRs
exclude:
- python-version: '3.8'
isDraft: true
# Only install optional packages on Ubuntu-22.04/Python 3.9
include:
- os: 'ubuntu-22.04'
python-version: '3.9'
extra-packages: '--extras vector'

steps:
# Checkout current git repository
Expand All @@ -37,7 +50,7 @@ jobs:
- name: Install Poetry python dependencies
run: |
pip install poetry==1.2.0b2
poetry install
poetry install ${{ matrix.extra-packages }}
poetry self add poetry-dynamic-versioning-plugin
poetry show
Expand Down
6 changes: 6 additions & 0 deletions docs/_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ sphinx:
autodoc_typehints: 'description'
html_show_copyright: false
intersphinx_mapping:
geopandas:
- 'https://geopandas.org/en/latest/'
- null
pyogrio:
- 'https://pyogrio.readthedocs.io/en/latest/'
- null
python:
- 'https://docs.python.org/3/'
- null
Expand Down
9 changes: 9 additions & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,12 @@
.. autoclass:: zen3geo.datapipes.rioxarray.RioXarrayReaderIterDataPipe
:show-inheritance:
```

### Pyogrio

```{eval-rst}
.. automodule:: zen3geo.datapipes.pyogrio
.. autoclass:: zen3geo.datapipes.PyogrioReader
.. autoclass:: zen3geo.datapipes.pyogrio.PyogrioReaderIterDataPipe
:show-inheritance:
```
209 changes: 208 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ classifiers = [
python = "^3.8"
rioxarray = ">=0.10.0"
torchdata = ">=0.3.0"
pyogrio = {version = ">=0.4.0a1", extras = ["geopandas"], optional = true}
# Docs
jupyter-book = {version="*", optional=true}
planetary-computer = {version="*", optional=true}
Expand All @@ -36,6 +37,7 @@ docs = [
"planetary-computer",
"pystac"
]
vector = ["pyogrio"]

[tool.poetry-dynamic-versioning]
bump = true
Expand Down
1 change: 1 addition & 0 deletions zen3geo/datapipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
Iterable-style DataPipes for geospatial raster 🌈 and vector 🚏 data.
"""

from zen3geo.datapipes.pyogrio import PyogrioReaderIterDataPipe as PyogrioReader
from zen3geo.datapipes.rioxarray import RioXarrayReaderIterDataPipe as RioXarrayReader
99 changes: 99 additions & 0 deletions zen3geo/datapipes/pyogrio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""
DataPipes for :doc:`pyogrio <pyogrio:index>`.
"""
from typing import Any, Dict, Iterator, Optional, Tuple

# pyogrio is an optional dependency (the 'vector' extra). Fall back to None
# when it cannot be imported so that this module still imports cleanly; the
# datapipe constructor raises a helpful error in that case.
try:
    import pyogrio
except ImportError:
    # Only import failures are expected here. A bare ``except:`` would also
    # hide KeyboardInterrupt/SystemExit and unrelated bugs raised at import.
    pyogrio = None
from torchdata.datapipes import functional_datapipe
from torchdata.datapipes.iter import IterDataPipe
from torchdata.datapipes.utils import StreamWrapper


@functional_datapipe("read_from_pyogrio")
class PyogrioReaderIterDataPipe(IterDataPipe[Tuple[str, StreamWrapper]]):
    """
    Takes vector files (e.g. FlatGeobuf, GeoPackage, GeoJSON) from local disk
    or URLs (as long as they can be read by pyogrio) and yields tuples of
    filename and :py:class:`geopandas.GeoDataFrame` objects
    (functional name: ``read_from_pyogrio``).

    Based on
    https://github.com/pytorch/data/blob/v0.3.0/torchdata/datapipes/iter/load/iopath.py#L37-L83

    Parameters
    ----------
    source_datapipe : IterDataPipe[str]
        A DataPipe that contains filepaths or URL links to vector files such as
        FlatGeobuf, GeoPackage, GeoJSON, etc.

    kwargs : Optional
        Extra keyword arguments to pass to
        `pyogrio.read_dataframe <https://pyogrio.readthedocs.io/en/latest/api.html#geopandas-integration>`_.

    Yields
    ------
    stream_obj : Tuple[str, StreamWrapper]
        A tuple consisting of the filename that was passed in, and a
        :py:class:`geopandas.GeoDataFrame` object containing the vector data,
        wrapped in a ``StreamWrapper``.

    Raises
    ------
    ModuleNotFoundError
        If ``pyogrio`` is not installed. See
        :doc:`install instructions for pyogrio <pyogrio:install>`, and ensure
        that ``geopandas`` is installed too (e.g. via
        ``pip install pyogrio[geopandas]``) before using this class.

    Example
    -------
    >>> import pytest
    >>> pyogrio = pytest.importorskip("pyogrio")
    ...
    >>> from torchdata.datapipes.iter import IterableWrapper
    >>> from zen3geo.datapipes import PyogrioReader
    ...
    >>> # Read in GeoPackage vector data using DataPipe
    >>> file_url: str = "https://github.com/geopandas/pyogrio/raw/v0.4.0a1/pyogrio/tests/fixtures/test_gpkg_nulls.gpkg"
    >>> dp = IterableWrapper(iterable=[file_url])
    >>> dp_pyogrio = dp.read_from_pyogrio()
    ...
    >>> # Loop or iterate over the DataPipe stream
    >>> it = iter(dp_pyogrio)
    >>> filename, geodataframe = next(it)
    >>> filename
    'https://github.com/geopandas/pyogrio/raw/v0.4.0a1/pyogrio/tests/fixtures/test_gpkg_nulls.gpkg'
    >>> geodataframe
    StreamWrapper< col_bool col_int8 ... col_float64 geometry
    0 1.0 1.0 ... 1.5 POINT (0.00000 0.00000)
    1 0.0 2.0 ... 2.5 POINT (1.00000 1.00000)
    2 1.0 3.0 ... 3.5 POINT (2.00000 2.00000)
    3 NaN NaN ... NaN POINT (4.00000 4.00000)
    <BLANKLINE>
    [4 rows x 12 columns]>
    """

    # NOTE: a ``**kwargs`` annotation describes the type of each *value*, not
    # the mapping itself, hence ``Any`` rather than ``Optional[Dict[str, Any]]``.
    def __init__(self, source_datapipe: IterDataPipe[str], **kwargs: Any) -> None:
        # The module-level import sets ``pyogrio`` to None when the optional
        # dependency is unavailable; fail early with install instructions.
        if pyogrio is None:
            raise ModuleNotFoundError(
                "Package `pyogrio` is required to be installed to use this datapipe. "
                "Please use `pip install pyogrio[geopandas]` or "
                "`conda install -c conda-forge pyogrio` "
                "to install the package"
            )
        self.source_datapipe: IterDataPipe[str] = source_datapipe
        # Forwarded verbatim to ``pyogrio.read_dataframe`` for every file.
        self.kwargs = kwargs

    def __iter__(self) -> Iterator[Tuple[str, StreamWrapper]]:
        """
        Read each file from the source datapipe with
        ``pyogrio.read_dataframe`` and yield ``(filename, StreamWrapper)``
        tuples, one per input filename.
        """
        for filename in self.source_datapipe:
            yield (
                filename,
                StreamWrapper(pyogrio.read_dataframe(filename, **self.kwargs)),
            )

    def __len__(self) -> int:
        """Return the length of the source datapipe (one output per input)."""
        return len(self.source_datapipe)
32 changes: 32 additions & 0 deletions zen3geo/tests/test_datapipes_pyogrio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""
Tests for pyogrio datapipes.
"""
import pytest
from torchdata.datapipes.iter import IterableWrapper

from zen3geo.datapipes import PyogrioReader

pyogrio = pytest.importorskip("pyogrio")

# %%
def test_pyogrio_reader():
    """
    Ensure that PyogrioReader works to read in a GeoPackage file and outputs a
    tuple made up of a filename and a geopandas.GeoDataFrame object.
    """
    file_url: str = "https://github.com/geopandas/pyogrio/raw/v0.4.0a1/pyogrio/tests/fixtures/test_gpkg_nulls.gpkg"
    dp = IterableWrapper(iterable=[file_url])

    # Using class constructors
    dp_pyogrio = PyogrioReader(source_datapipe=dp)
    # Using functional form (recommended)
    dp_pyogrio = dp.read_from_pyogrio()

    assert len(dp_pyogrio) == 1
    it = iter(dp_pyogrio)
    filename, geodataframe = next(it)

    assert isinstance(filename, str)
    assert geodataframe.shape == (4, 12)
    # Reduce over both axes: the builtin ``any()`` on a DataFrame iterates the
    # column labels (always-truthy strings) and would never inspect the data.
    assert geodataframe.isna().any(axis=None)
    assert (geodataframe.geom_type == "Point").all()

0 comments on commit f1f7652

Please sign in to comment.