diff --git a/changes/3963.removal.md b/changes/3963.removal.md new file mode 100644 index 0000000000..a83fd53853 --- /dev/null +++ b/changes/3963.removal.md @@ -0,0 +1,6 @@ +The ``BloscShuffle`` and ``BloscCname`` enums (``zarr.codecs.BloscShuffle``, +``zarr.codecs.BloscCname``) are now deprecated. Pass the equivalent literal +string (e.g. ``"zstd"``, ``"bitshuffle"``) when constructing a ``BloscCodec``. +The enum classes remain importable but emit ``DeprecationWarning`` on member +access, and will be removed in a future release. ``BloscCodec.cname`` and +``BloscCodec.shuffle`` are now plain strings rather than enum members. diff --git a/docs/quick-start.md b/docs/quick-start.md index bb7a556b96..27dc8e6045 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -58,7 +58,7 @@ z = zarr.create_array( compressors=zarr.codecs.BloscCodec( cname="zstd", clevel=3, - shuffle=zarr.codecs.BloscShuffle.shuffle + shuffle="shuffle" ) ) diff --git a/docs/user-guide/arrays.md b/docs/user-guide/arrays.md index 4b52629645..14122003c0 100644 --- a/docs/user-guide/arrays.md +++ b/docs/user-guide/arrays.md @@ -201,7 +201,7 @@ Different compressors can be provided via the `compressors` keyword argument accepted by all array creation functions. 
For example: ```python exec="true" session="arrays" source="above" result="ansi" -compressors = zarr.codecs.BloscCodec(cname='zstd', clevel=3, shuffle=zarr.codecs.BloscShuffle.bitshuffle) +compressors = zarr.codecs.BloscCodec(cname='zstd', clevel=3, shuffle='bitshuffle') data = np.arange(100000000, dtype='int32').reshape(10000, 10000) z = zarr.create_array(store='data/example-5.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors) z[:] = data @@ -298,7 +298,7 @@ Here is an example using a delta filter with the Blosc compressor: from zarr.codecs.numcodecs import Delta filters = [Delta(dtype='int32')] -compressors = zarr.codecs.BloscCodec(cname='zstd', clevel=1, shuffle=zarr.codecs.BloscShuffle.shuffle) +compressors = zarr.codecs.BloscCodec(cname='zstd', clevel=1, shuffle='shuffle') data = np.arange(100000000, dtype='int32').reshape(10000, 10000) z = zarr.create_array(store='data/example-9.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), filters=filters, compressors=compressors) print(z.info_complete()) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 62ceff7659..8a20282060 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -1,10 +1,11 @@ from __future__ import annotations import asyncio +import warnings from dataclasses import dataclass, field, replace from enum import Enum from functools import cached_property -from typing import TYPE_CHECKING, Final, Literal, NotRequired, TypedDict +from typing import TYPE_CHECKING, ClassVar, Final, Literal, NotRequired, TypedDict import numcodecs from numcodecs.blosc import Blosc @@ -12,7 +13,7 @@ from zarr.abc.codec import BytesBytesCodec from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, NamedRequiredConfig, parse_enum, parse_named_configuration +from zarr.core.common import JSON, NamedRequiredConfig, parse_named_configuration from zarr.core.dtype.common import HasItemSize if TYPE_CHECKING: @@ 
-21,19 +22,21 @@ from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer -Shuffle = Literal["noshuffle", "shuffle", "bitshuffle"] +BloscShuffleLiteral = Literal["noshuffle", "shuffle", "bitshuffle"] """The shuffle values permitted for the blosc codec""" -SHUFFLE: Final = ("noshuffle", "shuffle", "bitshuffle") +BLOSC_SHUFFLE: Final = ("noshuffle", "shuffle", "bitshuffle") -CName = Literal["lz4", "lz4hc", "blosclz", "snappy", "zlib", "zstd"] -"""The codec identifiers used in the blosc codec """ +BloscCnameLiteral = Literal["lz4", "lz4hc", "blosclz", "snappy", "zlib", "zstd"] +"""The codec identifiers used in the blosc codec""" + +BLOSC_CNAME: Final = ("lz4", "lz4hc", "blosclz", "snappy", "zlib", "zstd") class BloscConfigV2(TypedDict): """Configuration for the V2 Blosc codec""" - cname: CName + cname: BloscCnameLiteral clevel: int shuffle: int blocksize: int @@ -43,9 +46,9 @@ class BloscConfigV2(TypedDict): class BloscConfigV3(TypedDict): """Configuration for the V3 Blosc codec""" - cname: CName + cname: BloscCnameLiteral clevel: int - shuffle: Shuffle + shuffle: BloscShuffleLiteral blocksize: int typesize: int @@ -56,38 +59,66 @@ class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]): """ -class BloscShuffle(Enum): +class _DeprecatedStrEnumMeta(type): """ - Enum for shuffle filter used by blosc. + Metaclass for the legacy `BloscShuffle` / `BloscCname` classes. Accessing + a member name (e.g. `BloscShuffle.bitshuffle`) emits a `DeprecationWarning` + and returns the equivalent string. 
""" - noshuffle = "noshuffle" - shuffle = "shuffle" - bitshuffle = "bitshuffle" + _members: dict[str, str] - @classmethod - def from_int(cls, num: int) -> BloscShuffle: - blosc_shuffle_int_to_str = { + def __getattr__(cls, name: str) -> str: + members: dict[str, str] = type.__getattribute__(cls, "_members") + if name in members: + warnings.warn( + f"{cls.__name__}.{name} is deprecated; pass the string {members[name]!r} instead.", + DeprecationWarning, + stacklevel=2, + ) + return members[name] + raise AttributeError(name) + + +class BloscShuffle(metaclass=_DeprecatedStrEnumMeta): + """ + Deprecated. Pass a literal string (`"noshuffle"`, `"shuffle"`, or + `"bitshuffle"`) directly to `BloscCodec` instead. + """ + + _members: ClassVar[dict[str, str]] = { + "noshuffle": "noshuffle", + "shuffle": "shuffle", + "bitshuffle": "bitshuffle", + } + + @staticmethod + def from_int(num: int) -> BloscShuffleLiteral: + mapping: dict[int, BloscShuffleLiteral] = { 0: "noshuffle", 1: "shuffle", 2: "bitshuffle", } - if num not in blosc_shuffle_int_to_str: + if num not in mapping: raise ValueError(f"Value must be between 0 and 2. Got {num}.") - return BloscShuffle[blosc_shuffle_int_to_str[num]] + return mapping[num] -class BloscCname(Enum): +class BloscCname(metaclass=_DeprecatedStrEnumMeta): """ - Enum for compression library used by blosc. + Deprecated. Pass a literal string (one of `"lz4"`, `"lz4hc"`, + `"blosclz"`, `"snappy"`, `"zlib"`, `"zstd"`) directly to + `BloscCodec` instead. """ - lz4 = "lz4" - lz4hc = "lz4hc" - blosclz = "blosclz" - zstd = "zstd" - snappy = "snappy" - zlib = "zlib" + _members: ClassVar[dict[str, str]] = { + "lz4": "lz4", + "lz4hc": "lz4hc", + "blosclz": "blosclz", + "snappy": "snappy", + "zstd": "zstd", + "zlib": "zlib", + } # See https://zarr.readthedocs.io/en/stable/user-guide/performance.html#configuring-blosc @@ -118,6 +149,34 @@ def parse_blocksize(data: JSON) -> int: raise TypeError(f"Value should be an int. 
Got {type(data)} instead.") +def _coerce_enum_input(value: object, param_name: str) -> object: + """ + If `value` is a real `enum.Enum` instance, emit a deprecation warning + and return `value.value`. Otherwise return `value` unchanged. + """ + if isinstance(value, Enum): + warnings.warn( + f"Passing an enum to BloscCodec(..., {param_name}=...) is deprecated; " + "pass the equivalent literal string instead.", + DeprecationWarning, + stacklevel=3, + ) + return value.value + return value + + +def _parse_cname(data: object) -> BloscCnameLiteral: + if isinstance(data, str) and data in BLOSC_CNAME: + return data # type: ignore[return-value] + raise ValueError(f"cname must be one of {list(BLOSC_CNAME)!r}. Got {data!r}.") + + +def _parse_shuffle(data: object) -> BloscShuffleLiteral: + if isinstance(data, str) and data in BLOSC_SHUFFLE: + return data # type: ignore[return-value] + raise ValueError(f"shuffle must be one of {list(BLOSC_SHUFFLE)!r}. Got {data!r}.") + + @dataclass(frozen=True) class BloscCodec(BytesBytesCodec): """ @@ -133,12 +192,14 @@ class BloscCodec(BytesBytesCodec): Always False for Blosc codec, as compression produces variable-sized output. typesize : int The data type size in bytes used for shuffle filtering. - cname : BloscCname - The compression algorithm being used (lz4, lz4hc, blosclz, snappy, zlib, or zstd). + cname : BloscCnameLiteral + The compression algorithm being used; one of "lz4", "lz4hc", + "blosclz", "snappy", "zlib", or "zstd". clevel : int The compression level (0-9). - shuffle : BloscShuffle - The shuffle filter mode (noshuffle, shuffle, or bitshuffle). + shuffle : BloscShuffleLiteral + The shuffle filter mode; one of "noshuffle", "shuffle", or + "bitshuffle". blocksize : int The size of compressed blocks in bytes (0 for automatic). @@ -148,13 +209,16 @@ class BloscCodec(BytesBytesCodec): The data type size in bytes. This affects how the shuffle filter processes the data. If None, defaults to 1 and the attribute is marked as tunable. 
Default: 1. - cname : BloscCname or {'lz4', 'lz4hc', 'blosclz', 'snappy', 'zlib', 'zstd'}, optional - The compression algorithm to use. Default: 'zstd'. + cname : BloscCnameLiteral, optional + The compression algorithm to use; one of "lz4", "lz4hc", "blosclz", + "snappy", "zlib", or "zstd". Default is "zstd". Passing a `BloscCname` + enum is deprecated. clevel : int, optional The compression level, from 0 (no compression) to 9 (maximum compression). Higher values provide better compression at the cost of speed. Default: 5. - shuffle : BloscShuffle or {'noshuffle', 'shuffle', 'bitshuffle'}, optional - The shuffle filter to apply before compression: + shuffle : BloscShuffleLiteral or None, optional + The shuffle filter to apply before compression; one of "noshuffle", + "shuffle", or "bitshuffle": - 'noshuffle': No shuffling - 'shuffle': Byte shuffling (better for typesize > 1) @@ -183,18 +247,13 @@ class BloscCodec(BytesBytesCodec): >>> codec.typesize 1 >>> codec.shuffle - <BloscShuffle.bitshuffle: 'bitshuffle'> + 'bitshuffle' Create a codec with specific compression settings: >>> codec = BloscCodec(cname='zstd', clevel=9, shuffle='shuffle') >>> codec.cname - <BloscCname.zstd: 'zstd'> - - See Also - -------- - BloscShuffle : Enum for shuffle filter options - BloscCname : Enum for compression algorithm options + 'zstd' """ # This attribute tracks parameters were set to None at init time, and thus tunable @@ -202,38 +261,37 @@ class BloscCodec(BytesBytesCodec): is_fixed_size = False typesize: int - cname: BloscCname + cname: BloscCnameLiteral clevel: int - shuffle: BloscShuffle + shuffle: BloscShuffleLiteral blocksize: int def __init__( self, *, typesize: int | None = None, - cname: BloscCname | CName = BloscCname.zstd, + cname: BloscCname | BloscCnameLiteral = "zstd", clevel: int = 5, - shuffle: BloscShuffle | Shuffle | None = None, + shuffle: BloscShuffle | BloscShuffleLiteral | None = None, blocksize: int = 0, ) -> None: object.__setattr__(self, "_tunable_attrs", set()) - # If typesize was set to None, replace it with a valid 
typesize - # and flag the typesize attribute as safe to replace later if typesize is None: typesize = 1 self._tunable_attrs.update({"typesize"}) - # If shuffle was set to None, replace it with a valid shuffle - # and flag the shuffle attribute as safe to replace later if shuffle is None: - shuffle = BloscShuffle.bitshuffle + shuffle = "bitshuffle" self._tunable_attrs.update({"shuffle"}) + cname = _coerce_enum_input(cname, "cname") # type: ignore[assignment] + shuffle = _coerce_enum_input(shuffle, "shuffle") # type: ignore[assignment] + typesize_parsed = parse_typesize(typesize) - cname_parsed = parse_enum(cname, BloscCname) + cname_parsed = _parse_cname(cname) clevel_parsed = parse_clevel(clevel) - shuffle_parsed = parse_enum(shuffle, BloscShuffle) + shuffle_parsed = _parse_shuffle(shuffle) blocksize_parsed = parse_blocksize(blocksize) object.__setattr__(self, "typesize", typesize_parsed) @@ -252,9 +310,9 @@ def to_dict(self) -> dict[str, JSON]: "name": "blosc", "configuration": { "typesize": self.typesize, - "cname": self.cname.value, + "cname": self.cname, "clevel": self.clevel, - "shuffle": self.shuffle.value, + "shuffle": self.shuffle, "blocksize": self.blocksize, }, } @@ -276,20 +334,20 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: if "shuffle" in self._tunable_attrs: new_codec = replace( new_codec, - shuffle=(BloscShuffle.bitshuffle if item_size == 1 else BloscShuffle.shuffle), + shuffle=("bitshuffle" if item_size == 1 else "shuffle"), ) return new_codec @cached_property def _blosc_codec(self) -> Blosc: - map_shuffle_str_to_int = { - BloscShuffle.noshuffle: 0, - BloscShuffle.shuffle: 1, - BloscShuffle.bitshuffle: 2, + map_shuffle_str_to_int: dict[BloscShuffleLiteral, int] = { + "noshuffle": 0, + "shuffle": 1, + "bitshuffle": 2, } config_dict: BloscConfigV2 = { - "cname": self.cname.name, # type: ignore[typeddict-item] + "cname": self.cname, "clevel": self.clevel, "shuffle": map_shuffle_str_to_int[self.shuffle], "blocksize": 
self.blocksize, diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index 0201beb8de..f5f13f4d05 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -1,4 +1,7 @@ +import enum import json +import warnings +from typing import Any, cast import numcodecs import numpy as np @@ -8,7 +11,14 @@ import zarr from zarr.abc.codec import SupportsSyncCodec from zarr.codecs import BloscCodec -from zarr.codecs.blosc import BloscShuffle, Shuffle +from zarr.codecs.blosc import ( + BLOSC_CNAME, + BLOSC_SHUFFLE, + BloscCname, + BloscCnameLiteral, + BloscShuffle, + BloscShuffleLiteral, +) from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.buffer import default_buffer_prototype from zarr.core.dtype import UInt16, get_data_type_from_native_dtype @@ -61,16 +71,26 @@ async def test_blosc_evolve(dtype: str) -> None: assert blosc_configuration_json["shuffle"] == "shuffle" -@pytest.mark.parametrize("shuffle", [None, "bitshuffle", BloscShuffle.shuffle]) +@pytest.mark.parametrize("shuffle", [None, "bitshuffle", "legacy-enum"]) @pytest.mark.parametrize("typesize", [None, 1, 2]) -def test_tunable_attrs_param(shuffle: None | Shuffle | BloscShuffle, typesize: None | int) -> None: +def test_tunable_attrs_param( + shuffle: None | BloscShuffleLiteral | str, typesize: None | int +) -> None: """ - Test that the tunable_attrs parameter is set as expected when creating a BloscCodec, + Test that the tunable_attrs parameter is set as expected when creating a BloscCodec. """ - codec = BloscCodec(typesize=typesize, shuffle=shuffle) + # Materialize BloscShuffle.shuffle via the deprecation shim without + # contaminating the BloscCodec construction below with that warning. 
+ if shuffle == "legacy-enum": + with pytest.warns(DeprecationWarning, match="BloscShuffle.shuffle"): + shuffle_arg: None | BloscShuffleLiteral | str = BloscShuffle.shuffle + else: + shuffle_arg = shuffle + + codec = BloscCodec(typesize=typesize, shuffle=cast(BloscShuffleLiteral | None, shuffle_arg)) - if shuffle is None: - assert codec.shuffle == BloscShuffle.bitshuffle # default shuffle + if shuffle_arg is None: + assert codec.shuffle == "bitshuffle" # default shuffle assert "shuffle" in codec._tunable_attrs if typesize is None: assert codec.typesize == 1 # default typesize @@ -82,7 +102,7 @@ def test_tunable_attrs_param(shuffle: None | Shuffle | BloscShuffle, typesize: N dtype=new_dtype, fill_value=1, prototype=default_buffer_prototype(), - config={}, # type: ignore[arg-type] + config=cast(ArrayConfig, {}), ) evolved_codec = codec.evolve_from_array_spec(array_spec=array_spec) @@ -90,8 +110,8 @@ def test_tunable_attrs_param(shuffle: None | Shuffle | BloscShuffle, typesize: N assert evolved_codec.typesize == new_dtype.item_size else: assert evolved_codec.typesize == codec.typesize - if shuffle is None: - assert evolved_codec.shuffle == BloscShuffle.shuffle + if shuffle_arg is None: + assert evolved_codec.shuffle == "shuffle" else: assert evolved_codec.shuffle == codec.shuffle @@ -135,3 +155,121 @@ def test_blosc_codec_sync_roundtrip() -> None: decoded = codec._decode_sync(encoded, spec) result = np.frombuffer(decoded.as_numpy_array(), dtype="float64") np.testing.assert_array_equal(arr, result) + + +@pytest.mark.parametrize("cname", BLOSC_CNAME) +def test_blosc_codec_accepts_all_cnames(cname: BloscCnameLiteral) -> None: + """ + Every compressor name in BLOSC_CNAME is accepted by BloscCodec and round-trips + to the same value on the stored attribute. Adding a new value to the + BloscCnameLiteral type alias without also adding it to BLOSC_CNAME (or vice + versa) is caught here. 
+ """ + codec = BloscCodec(cname=cname) + assert codec.cname == cname + + +@pytest.mark.parametrize("shuffle", BLOSC_SHUFFLE) +def test_blosc_codec_accepts_all_shuffles(shuffle: BloscShuffleLiteral) -> None: + """ + Every shuffle mode in BLOSC_SHUFFLE is accepted by BloscCodec and round-trips + to the same value on the stored attribute. Adding a new value to the + BloscShuffleLiteral type alias without also adding it to BLOSC_SHUFFLE (or + vice versa) is caught here. + """ + codec = BloscCodec(shuffle=shuffle) + assert codec.shuffle == shuffle + + +@pytest.mark.parametrize("shuffle", BLOSC_SHUFFLE) +@pytest.mark.parametrize("cname", BLOSC_CNAME) +def test_blosc_codec_json_roundtrip(cname: BloscCnameLiteral, shuffle: BloscShuffleLiteral) -> None: + """ + JSON serialization (to_dict / from_dict) preserves every (cname, shuffle) + pair drawn from BLOSC_CNAME x BLOSC_SHUFFLE. Guards against drift in the + codec's V3 JSON form for any combination of compressor and shuffle option. + + The non-varied fields are fully specified so the codec has no tunable + attributes; tunability is not part of the JSON form and would otherwise + cause spurious round-trip mismatches. + """ + codec = BloscCodec(typesize=1, cname=cname, clevel=5, shuffle=shuffle, blocksize=0) + restored = BloscCodec.from_dict(codec.to_dict()) + assert restored == codec + + +@pytest.mark.parametrize( + ("enum_cls", "member", "expected"), + [ + (BloscShuffle, "shuffle", "shuffle"), + (BloscCname, "zstd", "zstd"), + ], +) +def test_blosc_enum_member_access_warns(enum_cls: type, member: str, expected: str) -> None: + """ + Accessing a member on the deprecated BloscShuffle / BloscCname classes + emits a DeprecationWarning and resolves to the equivalent literal string. 
+ """ + match = f"{enum_cls.__name__}.{member}" + with pytest.warns(DeprecationWarning, match=match): + value = getattr(enum_cls, member) + assert value == expected + + +def test_blosc_enum_classes_import_silently() -> None: + """ + Importing the deprecated enum classes by name must not emit a warning; + only member access does. This guards against the blosc module accidentally + triggering its own deprecation warnings when it (or zarr) is imported. + """ + with warnings.catch_warnings(): + warnings.simplefilter("error") + from zarr.codecs.blosc import BloscCname as _BloscCname # noqa: F401 + from zarr.codecs.blosc import BloscShuffle as _BloscShuffle # noqa: F401 + + +def test_blosc_codec_init_with_enum_instance_warns() -> None: + """ + Passing a real `enum.Enum` instance to BloscCodec.__init__ (e.g. an + instance materialized before the deprecation shim was introduced) must + trigger the init-level deprecation warning and still normalize the value + to the corresponding literal string. + """ + + class LegacyShuffle(enum.Enum): + bitshuffle = "bitshuffle" + + class LegacyCname(enum.Enum): + zstd = "zstd" + + with pytest.warns(DeprecationWarning, match="enum"): + codec = BloscCodec( + cname=cast(BloscCname, LegacyCname.zstd), + shuffle=cast(BloscShuffle, LegacyShuffle.bitshuffle), + ) + assert codec.cname == "zstd" + assert codec.shuffle == "bitshuffle" + + +@pytest.mark.parametrize("param", ["cname", "shuffle"]) +def test_blosc_codec_rejects_unknown(param: str) -> None: + """ + BloscCodec.__init__ raises ValueError when given a string outside the + allowed set for `cname` or `shuffle`, and the error message names the + offending parameter. 
+ """ + kwargs: dict[str, Any] = {param: f"not-a-{param}"} + with pytest.raises(ValueError, match=f"{param} must be one of"): + BloscCodec(**kwargs) + + +@pytest.mark.parametrize("enum_cls", [BloscShuffle, BloscCname]) +def test_blosc_enum_attribute_error_for_unknown_member(enum_cls: type) -> None: + """ + Attribute access for a name that is not a known member of the deprecated + enum classes falls through to AttributeError, matching the behavior of a + regular class. + """ + unknown_name = "not_a_member" + with pytest.raises(AttributeError): + getattr(enum_cls, unknown_name)