Skip to content

Commit

Permalink
Add mode='a-': Do not overwrite coordinates when appending to Zarr wi…
Browse files Browse the repository at this point in the history
…th `append_dim` (#8428)

Co-authored-by: Deepak Cherian <deepak@cherian.net>
Co-authored-by: Anderson Banihirwe <13301940+andersy005@users.noreply.github.com>
  • Loading branch information
3 people committed Dec 1, 2023
1 parent 1715ed3 commit c93b31a
Show file tree
Hide file tree
Showing 7 changed files with 78 additions and 27 deletions.
4 changes: 2 additions & 2 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ New Features

- Use a concise format when plotting datetime arrays. (:pull:`8449`).
By `Jimmy Westling <https://github.com/illviljan>`_.


- Avoid overwriting unchanged existing coordinate variables when appending by setting ``mode='a-'``.
By `Ryan Abernathey <https://github.com/rabernat>`_ and `Deepak Cherian <https://github.com/dcherian>`_.
- :py:meth:`~xarray.DataArray.rank` now operates on dask-backed arrays, assuming
the core dim has exactly one chunk. (:pull:`8475`).
By `Maximilian Roos <https://github.com/max-sixty>`_.
Expand Down
22 changes: 12 additions & 10 deletions xarray/backends/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from xarray.core.dataset import Dataset, _get_chunk, _maybe_chunk
from xarray.core.indexes import Index
from xarray.core.parallelcompat import guess_chunkmanager
from xarray.core.types import ZarrWriteModes
from xarray.core.utils import is_remote_uri

if TYPE_CHECKING:
Expand Down Expand Up @@ -69,7 +70,6 @@
"NETCDF4", "NETCDF4_CLASSIC", "NETCDF3_64BIT", "NETCDF3_CLASSIC"
]


DATAARRAY_NAME = "__xarray_dataarray_name__"
DATAARRAY_VARIABLE = "__xarray_dataarray_variable__"

Expand Down Expand Up @@ -1577,7 +1577,7 @@ def to_zarr(
dataset: Dataset,
store: MutableMapping | str | os.PathLike[str] | None = None,
chunk_store: MutableMapping | str | os.PathLike | None = None,
mode: Literal["w", "w-", "a", "r+", None] = None,
mode: ZarrWriteModes | None = None,
synchronizer=None,
group: str | None = None,
encoding: Mapping | None = None,
Expand All @@ -1601,7 +1601,7 @@ def to_zarr(
dataset: Dataset,
store: MutableMapping | str | os.PathLike[str] | None = None,
chunk_store: MutableMapping | str | os.PathLike | None = None,
mode: Literal["w", "w-", "a", "r+", None] = None,
mode: ZarrWriteModes | None = None,
synchronizer=None,
group: str | None = None,
encoding: Mapping | None = None,
Expand All @@ -1623,7 +1623,7 @@ def to_zarr(
dataset: Dataset,
store: MutableMapping | str | os.PathLike[str] | None = None,
chunk_store: MutableMapping | str | os.PathLike | None = None,
mode: Literal["w", "w-", "a", "r+", None] = None,
mode: ZarrWriteModes | None = None,
synchronizer=None,
group: str | None = None,
encoding: Mapping | None = None,
Expand Down Expand Up @@ -1680,16 +1680,18 @@ def to_zarr(
else:
mode = "w-"

if mode != "a" and append_dim is not None:
if mode not in ["a", "a-"] and append_dim is not None:
raise ValueError("cannot set append_dim unless mode='a' or mode=None")

if mode not in ["a", "r+"] and region is not None:
raise ValueError("cannot set region unless mode='a', mode='r+' or mode=None")
if mode not in ["a", "a-", "r+"] and region is not None:
raise ValueError(
"cannot set region unless mode='a', mode='a-', mode='r+' or mode=None"
)

if mode not in ["w", "w-", "a", "r+"]:
if mode not in ["w", "w-", "a", "a-", "r+"]:
raise ValueError(
"The only supported options for mode are 'w', "
f"'w-', 'a' and 'r+', but mode={mode!r}"
f"'w-', 'a', 'a-', and 'r+', but mode={mode!r}"
)

# validate Dataset keys, DataArray names
Expand Down Expand Up @@ -1745,7 +1747,7 @@ def to_zarr(
write_empty=write_empty_chunks,
)

if mode in ["a", "r+"]:
if mode in ["a", "a-", "r+"]:
_validate_datatypes_for_zarr_append(zstore, dataset)
if append_dim is not None:
existing_dims = zstore.get_dimensions()
Expand Down
21 changes: 18 additions & 3 deletions xarray/backends/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from xarray.core import indexing
from xarray.core.parallelcompat import guess_chunkmanager
from xarray.core.pycompat import integer_types
from xarray.core.types import ZarrWriteModes
from xarray.core.utils import (
FrozenDict,
HiddenKeyDict,
Expand Down Expand Up @@ -385,7 +386,7 @@ class ZarrStore(AbstractWritableDataStore):
def open_group(
cls,
store,
mode="r",
mode: ZarrWriteModes = "r",
synchronizer=None,
group=None,
consolidated=False,
Expand All @@ -410,7 +411,8 @@ def open_group(
zarr_version = getattr(store, "_store_version", 2)

open_kwargs = dict(
mode=mode,
# mode='a-' is a handcrafted xarray specialty
mode="a" if mode == "a-" else mode,
synchronizer=synchronizer,
path=group,
)
Expand Down Expand Up @@ -639,8 +641,21 @@ def store(
self.set_attributes(attributes)
self.set_dimensions(variables_encoded, unlimited_dims=unlimited_dims)

# if we are appending to an append_dim, only write either
# - new variables not already present, OR
# - variables with the append_dim in their dimensions
# We do NOT overwrite other variables.
if self._mode == "a-" and self._append_dim is not None:
variables_to_set = {
k: v
for k, v in variables_encoded.items()
if (k not in existing_variable_names) or (self._append_dim in v.dims)
}
else:
variables_to_set = variables_encoded

self.set_variables(
variables_encoded, check_encoding_set, writer, unlimited_dims=unlimited_dims
variables_to_set, check_encoding_set, writer, unlimited_dims=unlimited_dims
)
if self._consolidate_on_close:
zarr.consolidate_metadata(self.zarr_group.store)
Expand Down
18 changes: 12 additions & 6 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,12 @@
from xarray.core.indexing import is_fancy_indexer, map_index_queries
from xarray.core.merge import PANDAS_TYPES, MergeError
from xarray.core.options import OPTIONS, _get_keep_attrs
from xarray.core.types import DaCompatible, T_DataArray, T_DataArrayOrSet
from xarray.core.types import (
DaCompatible,
T_DataArray,
T_DataArrayOrSet,
ZarrWriteModes,
)
from xarray.core.utils import (
Default,
HybridMappingProxy,
Expand Down Expand Up @@ -4074,7 +4079,7 @@ def to_zarr(
self,
store: MutableMapping | str | PathLike[str] | None = None,
chunk_store: MutableMapping | str | PathLike | None = None,
mode: Literal["w", "w-", "a", "r+", None] = None,
mode: ZarrWriteModes | None = None,
synchronizer=None,
group: str | None = None,
*,
Expand All @@ -4095,7 +4100,7 @@ def to_zarr(
self,
store: MutableMapping | str | PathLike[str] | None = None,
chunk_store: MutableMapping | str | PathLike | None = None,
mode: Literal["w", "w-", "a", "r+", None] = None,
mode: ZarrWriteModes | None = None,
synchronizer=None,
group: str | None = None,
encoding: Mapping | None = None,
Expand All @@ -4114,7 +4119,7 @@ def to_zarr(
self,
store: MutableMapping | str | PathLike[str] | None = None,
chunk_store: MutableMapping | str | PathLike | None = None,
mode: Literal["w", "w-", "a", "r+", None] = None,
mode: ZarrWriteModes | None = None,
synchronizer=None,
group: str | None = None,
encoding: Mapping | None = None,
Expand Down Expand Up @@ -4150,10 +4155,11 @@ def to_zarr(
chunk_store : MutableMapping, str or path-like, optional
Store or path to directory in local or remote file system only for Zarr
array chunks. Requires zarr-python v2.4.0 or later.
mode : {"w", "w-", "a", "r+", None}, optional
mode : {"w", "w-", "a", "a-", r+", None}, optional
Persistence mode: "w" means create (overwrite if exists);
"w-" means create (fail if exists);
"a" means override existing variables (create if does not exist);
"a" means override all existing variables including dimension coordinates (create if does not exist);
"a-" means only append those variables that have ``append_dim``.
"r+" means modify existing array *values* only (raise an error if
any metadata or shapes would change).
The default mode is "a" if ``append_dim`` is set. Otherwise, it is
Expand Down
12 changes: 7 additions & 5 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@
T_Chunks,
T_DataArrayOrSet,
T_Dataset,
ZarrWriteModes,
)
from xarray.core.utils import (
Default,
Expand Down Expand Up @@ -2305,7 +2306,7 @@ def to_zarr(
self,
store: MutableMapping | str | PathLike[str] | None = None,
chunk_store: MutableMapping | str | PathLike | None = None,
mode: Literal["w", "w-", "a", "r+", None] = None,
mode: ZarrWriteModes | None = None,
synchronizer=None,
group: str | None = None,
encoding: Mapping | None = None,
Expand All @@ -2328,7 +2329,7 @@ def to_zarr(
self,
store: MutableMapping | str | PathLike[str] | None = None,
chunk_store: MutableMapping | str | PathLike | None = None,
mode: Literal["w", "w-", "a", "r+", None] = None,
mode: ZarrWriteModes | None = None,
synchronizer=None,
group: str | None = None,
encoding: Mapping | None = None,
Expand All @@ -2349,7 +2350,7 @@ def to_zarr(
self,
store: MutableMapping | str | PathLike[str] | None = None,
chunk_store: MutableMapping | str | PathLike | None = None,
mode: Literal["w", "w-", "a", "r+", None] = None,
mode: ZarrWriteModes | None = None,
synchronizer=None,
group: str | None = None,
encoding: Mapping | None = None,
Expand Down Expand Up @@ -2387,10 +2388,11 @@ def to_zarr(
chunk_store : MutableMapping, str or path-like, optional
Store or path to directory in local or remote file system only for Zarr
array chunks. Requires zarr-python v2.4.0 or later.
mode : {"w", "w-", "a", "r+", None}, optional
mode : {"w", "w-", "a", "a-", r+", None}, optional
Persistence mode: "w" means create (overwrite if exists);
"w-" means create (fail if exists);
"a" means override existing variables (create if does not exist);
"a" means override all existing variables including dimension coordinates (create if does not exist);
"a-" means only append those variables that have ``append_dim``.
"r+" means modify existing array *values* only (raise an error if
any metadata or shapes would change).
The default mode is "a" if ``append_dim`` is set. Otherwise, it is
Expand Down
3 changes: 3 additions & 0 deletions xarray/core/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,3 +282,6 @@ def copy(
"midpoint",
"nearest",
]


ZarrWriteModes = Literal["w", "w-", "a", "a-", "r+", "r"]
25 changes: 24 additions & 1 deletion xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -2390,6 +2390,29 @@ def test_append_with_new_variable(self) -> None:
xr.open_dataset(store_target, engine="zarr", **self.version_kwargs),
)

def test_append_with_append_dim_no_overwrite(self) -> None:
ds, ds_to_append, _ = create_append_test_data()
with self.create_zarr_target() as store_target:
ds.to_zarr(store_target, mode="w", **self.version_kwargs)
original = xr.concat([ds, ds_to_append], dim="time")
original2 = xr.concat([original, ds_to_append], dim="time")

# overwrite a coordinate;
# for mode='a-', this will not get written to the store
# because it does not have the append_dim as a dim
ds_to_append.lon.data[:] = -999
ds_to_append.to_zarr(
store_target, mode="a-", append_dim="time", **self.version_kwargs
)
actual = xr.open_dataset(store_target, engine="zarr", **self.version_kwargs)
assert_identical(original, actual)

# by default, mode="a" will overwrite all coordinates.
ds_to_append.to_zarr(store_target, append_dim="time", **self.version_kwargs)
actual = xr.open_dataset(store_target, engine="zarr", **self.version_kwargs)
original2.lon.data[:] = -999
assert_identical(original2, actual)

@requires_dask
def test_to_zarr_compute_false_roundtrip(self) -> None:
from dask.delayed import Delayed
Expand Down Expand Up @@ -2586,7 +2609,7 @@ def setup_and_verify_store(expected=data):
with pytest.raises(
ValueError,
match=re.escape(
"cannot set region unless mode='a', mode='r+' or mode=None"
"cannot set region unless mode='a', mode='a-', mode='r+' or mode=None"
),
):
data.to_zarr(
Expand Down

0 comments on commit c93b31a

Please sign in to comment.