Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle missing attributes key from metadata, and other fixes #2058

Merged
merged 5 commits into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/zarr/abc/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def to_dict(self) -> JSON:
elif isinstance(value, str):
out_dict[key] = value
elif isinstance(value, Sequence):
out_dict[key] = [v.to_dict() if isinstance(v, Metadata) else v for v in value]
out_dict[key] = tuple(v.to_dict() if isinstance(v, Metadata) else v for v in value)
else:
out_dict[key] = value

Expand Down
2 changes: 1 addition & 1 deletion src/zarr/chunk_grids.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def _from_dict(cls, data: dict[str, JSON]) -> Self:
return cls(**configuration_parsed) # type: ignore[arg-type]

def to_dict(self) -> dict[str, JSON]:
return {"name": "regular", "configuration": {"chunk_shape": list(self.chunk_shape)}}
return {"name": "regular", "configuration": {"chunk_shape": tuple(self.chunk_shape)}}

def all_chunk_coords(self, array_shape: ChunkCoords) -> Iterator[ChunkCoords]:
return itertools.product(
Expand Down
8 changes: 7 additions & 1 deletion src/zarr/chunk_key_encodings.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,13 @@ def from_dict(cls, data: dict[str, JSON] | ChunkKeyEncoding) -> ChunkKeyEncoding
if isinstance(data, ChunkKeyEncoding):
return data

name_parsed, configuration_parsed = parse_named_configuration(data)
# configuration is optional for chunk key encodings
name_parsed, configuration_parsed = parse_named_configuration(
data, require_configuration=False
)
# normalize missing configuration to the default "/" separator.
d-v-b marked this conversation as resolved.
Show resolved Hide resolved
if configuration_parsed is None:
configuration_parsed = {"separator": "/"}
if name_parsed == "default":
return DefaultChunkKeyEncoding(**configuration_parsed) # type: ignore[arg-type]
if name_parsed == "v2":
Expand Down
18 changes: 11 additions & 7 deletions src/zarr/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,15 +283,19 @@ def _json_convert(o: Any) -> Any:

@classmethod
def from_dict(cls, data: dict[str, JSON]) -> ArrayV3Metadata:
# make a copy because we are modifying the dict
_data = data.copy()
# TODO: Remove the type: ignores[] comments below and use a TypedDict to type `data`
# check that the zarr_format attribute is correct
_ = parse_zarr_format_v3(data.pop("zarr_format")) # type: ignore[arg-type]
_ = parse_zarr_format_v3(_data.pop("zarr_format")) # type: ignore[arg-type]
# check that the node_type attribute is correct
_ = parse_node_type_array(data.pop("node_type")) # type: ignore[arg-type]
_ = parse_node_type_array(_data.pop("node_type")) # type: ignore[arg-type]

data["dimension_names"] = data.pop("dimension_names", None)

return cls(**data) # type: ignore[arg-type]
# dimension_names key is optional, normalize missing to `None`
_data["dimension_names"] = _data.pop("dimension_names", None)
# attributes key is optional, normalize missing to `None`
_data["attributes"] = _data.pop("attributes", None)
return cls(**_data) # type: ignore[arg-type]

def to_dict(self) -> dict[str, Any]:
out_dict = super().to_dict()
Expand Down Expand Up @@ -446,10 +450,10 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self:
return replace(self, attributes=attributes)


def parse_dimension_names(data: None | Iterable[str]) -> tuple[str, ...] | None:
def parse_dimension_names(data: None | Iterable[str | None]) -> tuple[str | None, ...] | None:
if data is None:
return data
elif all(isinstance(x, str) for x in data):
elif all(isinstance(x, type(None) | str) for x in data):
return tuple(data)
else:
msg = f"Expected either None or a iterable of str, got {type(data)}"
Expand Down
77 changes: 75 additions & 2 deletions tests/v3/test_metadata/test_v3.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
from __future__ import annotations

import re
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Literal

from zarr.abc.codec import Codec
from zarr.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding
from zarr.codecs.bytes import BytesCodec

if TYPE_CHECKING:
from typing import Any
Expand All @@ -11,7 +15,7 @@
import numpy as np
import pytest

from zarr.metadata import parse_dimension_names
from zarr.metadata import ArrayV3Metadata, parse_dimension_names
from zarr.metadata import parse_fill_value_v3 as parse_fill_value
from zarr.metadata import parse_zarr_format_v3 as parse_zarr_format

Expand Down Expand Up @@ -157,3 +161,72 @@ def test_parse_fill_value_invalid_type_sequence(fill_value: Any, dtype_str: str)
match = f"Cannot parse non-string sequence {fill_value} as a scalar with type {dtype}"
with pytest.raises(TypeError, match=re.escape(match)):
parse_fill_value(fill_value, dtype)


@pytest.mark.parametrize("chunk_grid", ["regular"])
@pytest.mark.parametrize("attributes", [None, {"foo": "bar"}])
@pytest.mark.parametrize("codecs", [[BytesCodec()]])
@pytest.mark.parametrize("fill_value", [0, 1])
@pytest.mark.parametrize("chunk_key_encoding", ["v2", "default"])
@pytest.mark.parametrize("dimension_separator", [".", "/", None])
@pytest.mark.parametrize("dimension_names", ["nones", "strings", "missing"])
def test_metadata_to_dict(
    chunk_grid: str,
    codecs: list[Codec],
    fill_value: Any,
    chunk_key_encoding: Literal["v2", "default"],
    dimension_separator: Literal[".", "/"] | None,
    dimension_names: Literal["nones", "strings", "missing"],
    attributes: None | dict[str, Any],
) -> None:
    """Round-trip a v3 array metadata document through from_dict / to_dict.

    The input document is assembled from the parametrized pieces, leaving out
    the keys this test treats as optional: ``attributes``, ``dimension_names``
    and the chunk key encoding ``configuration``. The output of ``to_dict`` is
    then compared against the input. Keys that were left out of the input are
    checked against the value the test expects them to be normalized to
    (``{}`` for attributes, a ``"/"`` separator for the chunk key encoding)
    and removed before the final equality check.
    """
    shape = (1, 2, 3)
    data_type = "uint8"

    # Only the "regular" grid is parametrized; any other name leaves
    # `grid_spec` unbound.
    if chunk_grid == "regular":
        grid_spec = {"name": "regular", "configuration": {"chunk_shape": (1, 1, 1)}}

    # With no separator requested, the "configuration" key is left out of the
    # chunk key encoding entirely.
    key_encoding: dict[str, Any] = {"name": chunk_key_encoding}
    if dimension_separator is not None:
        key_encoding = {
            **key_encoding,
            "configuration": {"separator": dimension_separator},
        }

    names: tuple[str | None, ...] | None
    if dimension_names == "missing":
        names = None
    elif dimension_names == "nones":
        names = (None,) * len(shape)
    elif dimension_names == "strings":
        names = tuple(str(axis) for axis in range(len(shape)))

    metadata_dict: dict[str, Any] = {
        "zarr_format": 3,
        "node_type": "array",
        "shape": shape,
        "chunk_grid": grid_spec,
        "data_type": data_type,
        "chunk_key_encoding": key_encoding,
        "codecs": tuple(codec.to_dict() for codec in codecs),
        "fill_value": fill_value,
    }

    # Optional keys are added only when the parametrization supplies a value.
    if attributes is not None:
        metadata_dict["attributes"] = attributes
    if names is not None:
        metadata_dict["dimension_names"] = names

    observed = ArrayV3Metadata.from_dict(metadata_dict).to_dict()
    # `expected` aliases the input document; the pop below mutates it in place.
    expected = metadata_dict

    if attributes is None:
        # Attributes were absent from the input and are expected back as {}.
        assert observed["attributes"] == {}
        del observed["attributes"]

    if dimension_separator is None:
        # A missing configuration is expected to come back with a "/"
        # separator for both encodings.
        if chunk_key_encoding == "default":
            encoding_cls = DefaultChunkKeyEncoding
        else:
            encoding_cls = V2ChunkKeyEncoding
        expected_cke_dict = encoding_cls(separator="/").to_dict()
        assert observed["chunk_key_encoding"] == expected_cke_dict
        # The encoding was checked above, so drop it from both sides.
        del observed["chunk_key_encoding"]
        del expected["chunk_key_encoding"]

    assert observed == expected