Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

open_datatree performance improvement on NetCDF, H5, and Zarr files #9014

Merged
merged 38 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
14aaf56
open_datatree performance improvement on NetCDF files
aladinor May 7, 2024
3a5edb4
fixing issue with forward slashes
aladinor May 7, 2024
72d7660
Merge branch 'main' into datatree-zarr
aladinor May 7, 2024
d9dde29
fixing issue with pytest
aladinor May 7, 2024
2bc5e73
fixing issue with pytest
aladinor May 7, 2024
89fb4fb
Merge branch 'main' into datatree-zarr
aladinor May 8, 2024
0343f10
open datatree in zarr format improvement
aladinor May 10, 2024
93e1d59
Merge branch 'main' into datatree-zarr
aladinor May 10, 2024
ac11b3e
fixing incompatibility in returned object
aladinor May 10, 2024
6d0ee13
Merge branch 'datatree-zarr' of https://github.com/aladinor/xarray in…
aladinor May 10, 2024
91c5f0a
Merge branch 'main' into datatree-zarr
aladinor May 12, 2024
3363e91
Merge branch 'main' into datatree-zarr
aladinor May 18, 2024
7bba52c
passing group parameter to opendatatree method and reducing duplicate…
aladinor May 18, 2024
725aed7
Merge branch 'datatree-zarr' of https://github.com/aladinor/xarray in…
aladinor May 18, 2024
903effd
passing group parameter to opendatatree method - NetCDF
aladinor May 18, 2024
d468478
Merge branch 'main' into datatree-zarr
aladinor May 19, 2024
51da175
Update xarray/backends/netCDF4_.py
aladinor May 28, 2024
24881bd
Merge branch 'main' into datatree-zarr
aladinor May 28, 2024
5f4bff1
renaming variables
aladinor May 28, 2024
41ceb4f
renaming variables
aladinor May 28, 2024
f18ead6
renaming group_store variable
aladinor May 29, 2024
33d9769
removing _open_datatree_netcdf function not used anymore in open_data…
aladinor May 29, 2024
3345b92
improving performance of open_datatree method
aladinor May 29, 2024
3cb131c
renaming 'i' variable within list comprehension in open_store method …
aladinor May 29, 2024
6a759c0
using the default generator instead of loading zarr groups in memory
aladinor May 29, 2024
6c00641
fixing issue with group path to avoid using group[1:] notation. Addin…
aladinor May 29, 2024
189b497
fixing issue with group path to avoid using group[1:] notation and ad…
aladinor May 29, 2024
a9c306d
fixing issue with group path to avoid using group[1:] notation and ad…
aladinor May 29, 2024
fad0e76
Merge branch 'main' into datatree-zarr
aladinor Jun 3, 2024
792f9c7
Merge branch 'main' into datatree-zarr
aladinor Jun 4, 2024
8c5796f
adding 'mode' parameter to open_datatree method
aladinor Jun 4, 2024
728b374
adding 'mode' parameter to H5NetCDFStore.open method
aladinor Jun 4, 2024
74b9a7c
Merge branch 'main' into datatree-zarr
kmuehlbauer Jun 5, 2024
e298ac4
Merge branch 'main' into datatree-zarr
aladinor Jun 12, 2024
833c978
Merge branch 'main' into datatree-zarr
aladinor Jun 12, 2024
4ff6035
adding new entry related to open_datatree performance improvement
aladinor Jun 12, 2024
3844dea
adding new entry related to open_datatree performance improvement
aladinor Jun 12, 2024
456ce29
Getting rid of unnecessary parameters for 'open_datatree' method for …
aladinor Jun 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ Performance
By `Deepak Cherian <https://github.com/dcherian>`_.
- Small optimizations to help reduce indexing speed of datasets (:pull:`9002`).
By `Mark Harfouche <https://github.com/hmaarrfk>`_.
- Performance improvement in ``open_datatree`` method for Zarr, netCDF4 and h5netcdf backends (:issue:`8994`, :pull:`9014`).
By `Alfonso Ladino <https://github.com/aladinor>`_.


Breaking changes
Expand Down
30 changes: 0 additions & 30 deletions xarray/backends/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@
if TYPE_CHECKING:
from io import BufferedIOBase

from h5netcdf.legacyapi import Dataset as ncDatasetLegacyH5
from netCDF4 import Dataset as ncDataset

from xarray.core.dataset import Dataset
from xarray.core.datatree import DataTree
from xarray.core.types import NestedSequence
Expand Down Expand Up @@ -131,33 +128,6 @@ def _decode_variable_name(name):
return name


def _open_datatree_netcdf(
    ncDataset: ncDataset | ncDatasetLegacyH5,
    filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
    **kwargs,
) -> DataTree:
    """Assemble a ``DataTree`` by opening each group with ``open_dataset``.

    The root group is opened first and becomes the ``"/"`` node. The file is
    then opened a second time through ``ncDataset`` (read-only) purely to
    enumerate its groups via ``_iter_nc_groups``; each enumerated group is
    loaded with its own ``open_dataset`` call and attached to the tree.

    Parameters
    ----------
    ncDataset
        Callable used as ``ncDataset(filename_or_obj, mode="r")``; its result
        is used as a context manager and handed to ``_iter_nc_groups``.
    filename_or_obj
        Target passed both to ``open_dataset`` and to ``ncDataset``.
    **kwargs
        Forwarded unchanged to every ``open_dataset`` call.

    Returns
    -------
    DataTree
        Tree rooted at ``"/"`` with one node per enumerated group.
    """
    from xarray.backends.api import open_dataset
    from xarray.core.datatree import DataTree
    from xarray.core.treenode import NodePath

    root_ds = open_dataset(filename_or_obj, **kwargs)
    tree_root = DataTree.from_dict({"/": root_ds})

    with ncDataset(filename_or_obj, mode="r") as nc_file:
        for group_path in _iter_nc_groups(nc_file):
            group_ds = open_dataset(filename_or_obj, group=group_path, **kwargs)

            # TODO refactor to use __setitem__ once creation of new nodes by
            # assigning Dataset works again
            child = DataTree(name=NodePath(group_path).name, data=group_ds)
            tree_root._set_item(
                group_path,
                child,
                allow_overwrite=False,
                new_nodes_along_path=True,
            )

    return tree_root


def _iter_nc_groups(root, parent="/"):
from xarray.core.treenode import NodePath

Expand Down
54 changes: 50 additions & 4 deletions xarray/backends/h5netcdf_.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@
import functools
import io
import os
from collections.abc import Iterable
from collections.abc import Callable, Iterable
from typing import TYPE_CHECKING, Any

from xarray.backends.common import (
BACKEND_ENTRYPOINTS,
BackendEntrypoint,
WritableCFDataStore,
_normalize_path,
_open_datatree_netcdf,
find_root_and_group,
)
from xarray.backends.file_manager import CachingFileManager, DummyFileManager
Expand Down Expand Up @@ -431,11 +430,58 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti
def open_datatree(
    self,
    filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
    *,
    mask_and_scale=True,
    decode_times=True,
    concat_characters=True,
    decode_coords=True,
    drop_variables: str | Iterable[str] | None = None,
    use_cftime=None,
    decode_timedelta=None,
    group: str | Iterable[str] | Callable | None = None,
    **kwargs,
) -> DataTree:
    """Build a ``DataTree`` from an h5netcdf file using one shared file manager.

    The file is opened once through ``H5NetCDFStore.open``. Every subgroup
    yielded by ``_iter_nc_groups`` is then read through a new
    ``H5NetCDFStore`` that reuses the root store's ``_manager``.

    Parameters
    ----------
    filename_or_obj
        Target to open; normalised with ``_normalize_path`` before use.
    mask_and_scale, decode_times, concat_characters, decode_coords, \
drop_variables, use_cftime, decode_timedelta
        Decoding options. They are applied to the root group and to every
        subgroup alike.
    group
        Optional group used as the root of the tree. When falsy, ``"/"`` is
        the root.
    **kwargs
        Forwarded to ``open_dataset`` for the root group and to the
        ``H5NetCDFStore`` constructor for each subgroup.

    Returns
    -------
    DataTree
        Tree whose root is keyed by the parent path and which holds one node
        per group found below it.
    """
    from xarray.backends.api import open_dataset
    from xarray.backends.common import _iter_nc_groups
    from xarray.core.datatree import DataTree
    from xarray.core.treenode import NodePath
    from xarray.core.utils import close_on_error

    filename_or_obj = _normalize_path(filename_or_obj)
    store = H5NetCDFStore.open(
        filename_or_obj,
        group=group,
    )
    if group:
        parent = NodePath("/") / NodePath(group)
    else:
        parent = NodePath("/")

    # Collected once so the root group and the subgroups are decoded with the
    # same options; previously the root group silently used the defaults.
    decoders = dict(
        mask_and_scale=mask_and_scale,
        decode_times=decode_times,
        concat_characters=concat_characters,
        decode_coords=decode_coords,
        drop_variables=drop_variables,
        use_cftime=use_cftime,
        decode_timedelta=decode_timedelta,
    )

    manager = store._manager
    ds = open_dataset(store, **decoders, **kwargs)
    tree_root = DataTree.from_dict({str(parent): ds})

    # The entrypoint does not depend on the group, so create it once.
    store_entrypoint = StoreBackendEntrypoint()
    for path_group in _iter_nc_groups(store.ds, parent=parent):
        # NOTE(review): **kwargs is handed to the store constructor here but
        # not to H5NetCDFStore.open above -- confirm which keys are intended.
        group_store = H5NetCDFStore(manager, group=path_group, **kwargs)
        with close_on_error(group_store):
            ds = store_entrypoint.open_dataset(group_store, **decoders)
        new_node: DataTree = DataTree(name=NodePath(path_group).name, data=ds)
        tree_root._set_item(
            path_group,
            new_node,
            allow_overwrite=False,
            new_nodes_along_path=True,
        )
    return tree_root


# Register the h5netcdf entrypoint class in the shared BACKEND_ENTRYPOINTS
# mapping under the "h5netcdf" key.
BACKEND_ENTRYPOINTS["h5netcdf"] = ("h5netcdf", H5netcdfBackendEntrypoint)
53 changes: 49 additions & 4 deletions xarray/backends/netCDF4_.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import functools
import operator
import os
from collections.abc import Iterable
from collections.abc import Callable, Iterable
from contextlib import suppress
from typing import TYPE_CHECKING, Any

Expand All @@ -16,7 +16,6 @@
BackendEntrypoint,
WritableCFDataStore,
_normalize_path,
_open_datatree_netcdf,
flamingbear marked this conversation as resolved.
Show resolved Hide resolved
find_root_and_group,
robust_getitem,
)
Expand Down Expand Up @@ -672,11 +671,57 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti
def open_datatree(
    self,
    filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
    *,
    mask_and_scale=True,
    decode_times=True,
    concat_characters=True,
    decode_coords=True,
    drop_variables: str | Iterable[str] | None = None,
    use_cftime=None,
    decode_timedelta=None,
    group: str | Iterable[str] | Callable | None = None,
    **kwargs,
) -> DataTree:
    """Build a ``DataTree`` from a netCDF4 file using one shared file manager.

    The file is opened once through ``NetCDF4DataStore.open``. Every subgroup
    yielded by ``_iter_nc_groups`` is then read through a new
    ``NetCDF4DataStore`` that reuses the root store's ``_manager``.

    Parameters
    ----------
    filename_or_obj
        Target to open; normalised with ``_normalize_path`` before use.
    mask_and_scale, decode_times, concat_characters, decode_coords, \
drop_variables, use_cftime, decode_timedelta
        Decoding options. They are applied to the root group and to every
        subgroup alike.
    group
        Optional group used as the root of the tree. When falsy, ``"/"`` is
        the root.
    **kwargs
        Forwarded to ``open_dataset`` for the root group and to the
        ``NetCDF4DataStore`` constructor for each subgroup.

    Returns
    -------
    DataTree
        Tree whose root is keyed by the parent path and which holds one node
        per group found below it.
    """
    from xarray.backends.api import open_dataset
    from xarray.backends.common import _iter_nc_groups
    from xarray.core.datatree import DataTree
    from xarray.core.treenode import NodePath
    from xarray.core.utils import close_on_error

    filename_or_obj = _normalize_path(filename_or_obj)
    store = NetCDF4DataStore.open(
        filename_or_obj,
        group=group,
    )
    if group:
        parent = NodePath("/") / NodePath(group)
    else:
        parent = NodePath("/")

    # Collected once so the root group and the subgroups are decoded with the
    # same options; previously the root group silently used the defaults.
    decoders = dict(
        mask_and_scale=mask_and_scale,
        decode_times=decode_times,
        concat_characters=concat_characters,
        decode_coords=decode_coords,
        drop_variables=drop_variables,
        use_cftime=use_cftime,
        decode_timedelta=decode_timedelta,
    )

    manager = store._manager
    ds = open_dataset(store, **decoders, **kwargs)
    tree_root = DataTree.from_dict({str(parent): ds})

    # The entrypoint does not depend on the group, so create it once.
    store_entrypoint = StoreBackendEntrypoint()
    for path_group in _iter_nc_groups(store.ds, parent=parent):
        # NOTE(review): **kwargs is handed to the store constructor here but
        # not to NetCDF4DataStore.open above -- confirm which keys are intended.
        group_store = NetCDF4DataStore(manager, group=path_group, **kwargs)
        with close_on_error(group_store):
            ds = store_entrypoint.open_dataset(group_store, **decoders)
        new_node: DataTree = DataTree(name=NodePath(path_group).name, data=ds)
        tree_root._set_item(
            path_group,
            new_node,
            allow_overwrite=False,
            new_nodes_along_path=True,
        )
    return tree_root


# Register the netCDF4 entrypoint class in the shared BACKEND_ENTRYPOINTS
# mapping under the "netcdf4" key.
BACKEND_ENTRYPOINTS["netcdf4"] = ("netCDF4", NetCDF4BackendEntrypoint)
Loading
Loading