Skip to content

Commit 4e2a157

Browse files
committed
put string dtypes in the strings module
1 parent e67d4dc commit 4e2a157

File tree

5 files changed

+171
-166
lines changed

5 files changed

+171
-166
lines changed

src/zarr/core/dtype/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,7 @@
88
from zarr.core.dtype.npy.float import Float16, Float32, Float64
99
from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64
1010
from zarr.core.dtype.npy.sized import (
11-
FixedLengthASCII,
1211
FixedLengthBytes,
13-
FixedLengthUTF32,
1412
Structured,
1513
)
1614
from zarr.core.dtype.npy.time import DateTime64, TimeDelta64
@@ -24,6 +22,8 @@
2422
from zarr.core.common import JSON
2523
from zarr.core.dtype.npy.string import (
2624
_NUMPY_SUPPORTS_VLEN_STRING,
25+
FixedLengthASCII,
26+
FixedLengthUTF32,
2727
VariableLengthString,
2828
)
2929
from zarr.core.dtype.registry import DataTypeRegistry

src/zarr/core/dtype/npy/sized.py

Lines changed: 1 addition & 157 deletions
Original file line numberDiff line numberDiff line change
@@ -2,100 +2,25 @@
22
import re
33
from collections.abc import Sequence
44
from dataclasses import dataclass
5-
from typing import Any, ClassVar, Self, TypeGuard, cast
5+
from typing import Any, Self, TypeGuard, cast
66

77
import numpy as np
88

99
from zarr.core.common import JSON, ZarrFormat
1010
from zarr.core.dtype.common import (
1111
DataTypeValidationError,
12-
HasEndianness,
1312
HasItemSize,
1413
HasLength,
1514
v3_unstable_dtype_warning,
1615
)
1716
from zarr.core.dtype.npy.common import (
18-
EndiannessNumpy,
1917
bytes_from_json,
2018
bytes_to_json,
2119
check_json_str,
22-
endianness_from_numpy_str,
23-
endianness_to_numpy_str,
2420
)
2521
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType
2622

2723

28-
@dataclass(frozen=True, kw_only=True)
29-
class FixedLengthASCII(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize):
30-
dtype_cls = np.dtypes.BytesDType
31-
_zarr_v3_name = "numpy.fixed_length_ascii"
32-
33-
@classmethod
34-
def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self:
35-
return cls(length=dtype.itemsize)
36-
37-
def to_dtype(self) -> np.dtypes.BytesDType[int]:
38-
return self.dtype_cls(self.length)
39-
40-
@classmethod
41-
def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]:
42-
"""
43-
Check that the input is a valid JSON representation of a numpy S dtype.
44-
"""
45-
if zarr_format == 2:
46-
# match |S1, |S2, etc
47-
return isinstance(data, str) and re.match(r"^\|S\d+$", data) is not None
48-
elif zarr_format == 3:
49-
return (
50-
isinstance(data, dict)
51-
and set(data.keys()) == {"name", "configuration"}
52-
and data["name"] == cls._zarr_v3_name
53-
and isinstance(data["configuration"], dict)
54-
and "length_bytes" in data["configuration"]
55-
)
56-
raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover
57-
58-
def to_json(self, zarr_format: ZarrFormat) -> JSON:
59-
if zarr_format == 2:
60-
return self.to_dtype().str
61-
elif zarr_format == 3:
62-
return {
63-
"name": self._zarr_v3_name,
64-
"configuration": {"length_bytes": self.length},
65-
}
66-
raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover
67-
68-
@classmethod
69-
def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self:
70-
if zarr_format == 2:
71-
return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type]
72-
elif zarr_format == 3:
73-
return cls(length=data["configuration"]["length_bytes"]) # type: ignore[arg-type, index, call-overload]
74-
raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover
75-
76-
def default_value(self) -> np.bytes_:
77-
return np.bytes_(b"")
78-
79-
def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str:
80-
return base64.standard_b64encode(data).decode("ascii") # type: ignore[arg-type]
81-
82-
def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_:
83-
if check_json_str(data):
84-
return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii")))
85-
raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover
86-
87-
def check_value(self, data: object) -> bool:
88-
# this is generous for backwards compatibility
89-
return isinstance(data, np.bytes_ | str | bytes | int)
90-
91-
def _cast_value_unsafe(self, value: object) -> np.bytes_:
92-
return self.to_dtype().type(value)
93-
94-
@property
95-
def item_size(self) -> int:
96-
return self.length
97-
98-
9924
@dataclass(frozen=True, kw_only=True)
10025
class FixedLengthBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize):
10126
# np.dtypes.VoidDType is specified in an odd way in numpy
@@ -190,87 +115,6 @@ def item_size(self) -> int:
190115
return self.length
191116

192117

193-
@dataclass(frozen=True, kw_only=True)
194-
class FixedLengthUTF32(
195-
ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize
196-
):
197-
dtype_cls = np.dtypes.StrDType
198-
_zarr_v3_name = "numpy.fixed_length_utf32"
199-
code_point_bytes: ClassVar[int] = 4 # utf32 is 4 bytes per code point
200-
201-
@classmethod
202-
def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self:
203-
byte_order = cast("EndiannessNumpy", dtype.byteorder)
204-
return cls(
205-
length=dtype.itemsize // (cls.code_point_bytes),
206-
endianness=endianness_from_numpy_str(byte_order),
207-
)
208-
209-
def to_dtype(self) -> np.dtypes.StrDType[int]:
210-
byte_order = endianness_to_numpy_str(self.endianness)
211-
return self.dtype_cls(self.length).newbyteorder(byte_order)
212-
213-
@classmethod
214-
def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]:
215-
"""
216-
Check that the input is a valid JSON representation of a numpy S dtype.
217-
"""
218-
if zarr_format == 2:
219-
# match >U1, <U2, etc
220-
return isinstance(data, str) and re.match(r"^[><]U\d+$", data) is not None
221-
elif zarr_format == 3:
222-
return (
223-
isinstance(data, dict)
224-
and set(data.keys()) == {"name", "configuration"}
225-
and data["name"] == cls._zarr_v3_name
226-
and "configuration" in data
227-
and isinstance(data["configuration"], dict)
228-
and set(data["configuration"].keys()) == {"length_bytes"}
229-
and isinstance(data["configuration"]["length_bytes"], int)
230-
)
231-
raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover
232-
233-
def to_json(self, zarr_format: ZarrFormat) -> JSON:
234-
if zarr_format == 2:
235-
return self.to_dtype().str
236-
elif zarr_format == 3:
237-
return {
238-
"name": self._zarr_v3_name,
239-
"configuration": {"length_bytes": self.length * self.code_point_bytes},
240-
}
241-
raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover
242-
243-
@classmethod
244-
def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self:
245-
if zarr_format == 2:
246-
return cls.from_dtype(np.dtype(data)) # type: ignore[arg-type]
247-
elif zarr_format == 3:
248-
return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes) # type: ignore[arg-type, index, call-overload, operator]
249-
raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover
250-
251-
def default_value(self) -> np.str_:
252-
return np.str_("")
253-
254-
def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str:
255-
return str(data)
256-
257-
def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_:
258-
if check_json_str(data):
259-
return self.to_dtype().type(data)
260-
raise TypeError(f"Invalid type: {data}. Expected a string.") # pragma: no cover
261-
262-
def check_value(self, data: object) -> bool:
263-
# this is generous for backwards compatibility
264-
return isinstance(data, str | np.str_ | bytes | int)
265-
266-
def _cast_value_unsafe(self, data: object) -> np.str_:
267-
return self.to_dtype().type(data)
268-
269-
@property
270-
def item_size(self) -> int:
271-
return self.length * self.code_point_bytes
272-
273-
274118
@dataclass(frozen=True, kw_only=True)
275119
class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize):
276120
dtype_cls = np.dtypes.VoidDType # type: ignore[assignment]

src/zarr/core/dtype/npy/string.py

Lines changed: 162 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,19 @@
11
from __future__ import annotations
22

3+
import base64
4+
import re
35
from dataclasses import dataclass
4-
from typing import TYPE_CHECKING, Self, TypeGuard
6+
from typing import TYPE_CHECKING, ClassVar, Self, TypeGuard, cast
57

68
import numpy as np
79

8-
from zarr.core.dtype.npy.common import check_json_str
10+
from zarr.core.dtype.common import HasEndianness, HasItemSize, HasLength
11+
from zarr.core.dtype.npy.common import (
12+
EndiannessNumpy,
13+
check_json_str,
14+
endianness_from_numpy_str,
15+
endianness_to_numpy_str,
16+
)
917
from zarr.core.dtype.wrapper import ZDType
1018

1119
if TYPE_CHECKING:
@@ -15,6 +23,158 @@
1523
_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType")
1624

1725

26+
@dataclass(frozen=True, kw_only=True)
class FixedLengthASCII(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize):
    """
    Zarr data type wrapping numpy's fixed-length bytes dtype (``np.dtypes.BytesDType``).

    Instances are built with a ``length`` keyword, which is used as the number of
    bytes per element (it is read from ``dtype.itemsize`` and returned by ``item_size``).

    JSON forms handled by this class:

    * Zarr v2: the numpy dtype string, e.g. ``"|S3"``.
    * Zarr v3: ``{"name": "numpy.fixed_length_ascii", "configuration": {"length_bytes": 3}}``.

    Scalars are written to JSON as base64-encoded ASCII strings.
    """

    dtype_cls = np.dtypes.BytesDType
    _zarr_v3_name = "numpy.fixed_length_ascii"

    @classmethod
    def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self:
        """Build an instance from a numpy dtype, using its ``itemsize`` as the length."""
        return cls(length=dtype.itemsize)

    def to_dtype(self) -> np.dtypes.BytesDType[int]:
        """Return the numpy bytes dtype with ``self.length`` bytes per element."""
        return self.dtype_cls(self.length)

    @classmethod
    def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]:
        """
        Check that the input is a valid JSON representation of a numpy S dtype.

        For Zarr v2 the input must be a string of the form ``|S<digits>``. For Zarr v3
        it must be a dict with exactly the keys ``name`` and ``configuration``, where
        ``name`` equals this class's v3 name and ``configuration`` is a dict holding an
        integer ``length_bytes``.

        Raises
        ------
        ValueError
            If ``zarr_format`` is neither 2 nor 3.
        """
        if zarr_format == 2:
            # match |S1, |S2, etc; fullmatch (unlike match + "$") also rejects a
            # trailing newline such as "|S3\n"
            return isinstance(data, str) and re.fullmatch(r"\|S\d+", data) is not None
        elif zarr_format == 3:
            return (
                isinstance(data, dict)
                and set(data.keys()) == {"name", "configuration"}
                and data["name"] == cls._zarr_v3_name
                and isinstance(data["configuration"], dict)
                # a missing or non-integer length cannot describe a dtype, so reject it
                # here rather than accept the document and fail later
                and isinstance(data["configuration"].get("length_bytes"), int)
            )
        raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}")  # pragma: no cover

    def to_json(self, zarr_format: ZarrFormat) -> JSON:
        """
        Serialize this data type to its Zarr v2 (dtype string) or v3 (named dict) JSON form.

        Raises
        ------
        ValueError
            If ``zarr_format`` is neither 2 nor 3.
        """
        if zarr_format == 2:
            return self.to_dtype().str
        elif zarr_format == 3:
            return {
                "name": self._zarr_v3_name,
                "configuration": {"length_bytes": self.length},
            }
        raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}")  # pragma: no cover

    @classmethod
    def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self:
        """
        Build an instance from JSON without validating it first.

        The v2 form is parsed by numpy; the v3 form reads ``configuration.length_bytes``.

        Raises
        ------
        ValueError
            If ``zarr_format`` is neither 2 nor 3.
        """
        if zarr_format == 2:
            return cls.from_dtype(np.dtype(data))  # type: ignore[arg-type]
        elif zarr_format == 3:
            return cls(length=data["configuration"]["length_bytes"])  # type: ignore[arg-type, index, call-overload]
        raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}")  # pragma: no cover

    def default_value(self) -> np.bytes_:
        """Return the default scalar: an empty ``np.bytes_``."""
        return np.bytes_(b"")

    def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str:
        """
        Encode a scalar as a base64 ASCII string.

        ``data`` is passed to ``base64.standard_b64encode`` unchanged, so it must be
        bytes-like. ``zarr_format`` is not consulted.
        """
        return base64.standard_b64encode(data).decode("ascii")  # type: ignore[arg-type]

    def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_:
        """
        Decode a base64 string produced by ``to_json_value`` into a numpy bytes scalar.

        Raises
        ------
        TypeError
            If ``check_json_str`` rejects ``data``.
        """
        if check_json_str(data):
            return self.to_dtype().type(base64.standard_b64decode(data.encode("ascii")))
        raise TypeError(f"Invalid type: {data}. Expected a string.")  # pragma: no cover

    def check_value(self, data: object) -> bool:
        """Return True if ``data`` is an instance of ``np.bytes_``, ``str``, ``bytes`` or ``int``."""
        # this is generous for backwards compatibility
        return isinstance(data, np.bytes_ | str | bytes | int)

    def _cast_value_unsafe(self, value: object) -> np.bytes_:
        """Convert ``value`` with the scalar type of ``to_dtype()``, without checking it first."""
        return self.to_dtype().type(value)

    @property
    def item_size(self) -> int:
        """Size of one element in bytes; equal to ``length``."""
        return self.length
95+
96+
97+
@dataclass(frozen=True, kw_only=True)
class FixedLengthUTF32(
    ZDType[np.dtypes.StrDType[int], np.str_], HasEndianness, HasLength, HasItemSize
):
    """
    Zarr data type wrapping numpy's fixed-length unicode dtype (``np.dtypes.StrDType``).

    ``length`` counts code points, not bytes: it is derived from
    ``dtype.itemsize // code_point_bytes`` and ``item_size`` is
    ``length * code_point_bytes``.

    JSON forms handled by this class:

    * Zarr v2: the numpy dtype string, e.g. ``"<U3"`` or ``">U3"``.
    * Zarr v3: ``{"name": "numpy.fixed_length_utf32", "configuration": {"length_bytes": 12}}``,
      where ``length_bytes`` is the size in bytes (``length * 4``).

    NOTE(review): the v3 form written here carries no byte order, and reading it passes
    only ``length`` to the constructor, so endianness comes from the field default
    declared outside this block -- confirm that is intended.
    """

    dtype_cls = np.dtypes.StrDType
    _zarr_v3_name = "numpy.fixed_length_utf32"
    code_point_bytes: ClassVar[int] = 4  # utf32 is 4 bytes per code point

    @classmethod
    def _from_dtype_unsafe(cls, dtype: TBaseDType) -> Self:
        """Build an instance from a numpy dtype, taking length and byte order from it."""
        byte_order = cast("EndiannessNumpy", dtype.byteorder)
        return cls(
            # itemsize is in bytes; length is stored in code points
            length=dtype.itemsize // (cls.code_point_bytes),
            endianness=endianness_from_numpy_str(byte_order),
        )

    def to_dtype(self) -> np.dtypes.StrDType[int]:
        """Return the numpy unicode dtype with ``self.length`` code points and this byte order."""
        byte_order = endianness_to_numpy_str(self.endianness)
        return self.dtype_cls(self.length).newbyteorder(byte_order)

    @classmethod
    def check_json(cls, data: JSON, zarr_format: ZarrFormat) -> TypeGuard[JSON]:
        """
        Check that the input is a valid JSON representation of a numpy U dtype.

        For Zarr v2 the input must be a string of the form ``<U<digits>`` or
        ``>U<digits>``. For Zarr v3 it must be a dict with exactly the keys ``name`` and
        ``configuration``, where ``name`` equals this class's v3 name and
        ``configuration`` holds exactly one key, ``length_bytes``, whose value is an
        integer multiple of ``code_point_bytes``.

        Raises
        ------
        ValueError
            If ``zarr_format`` is neither 2 nor 3.
        """
        if zarr_format == 2:
            # match >U1, <U2, etc; fullmatch (unlike match + "$") also rejects a
            # trailing newline such as "<U3\n"
            return isinstance(data, str) and re.fullmatch(r"[><]U\d+", data) is not None
        elif zarr_format == 3:
            return (
                isinstance(data, dict)
                and set(data.keys()) == {"name", "configuration"}
                and data["name"] == cls._zarr_v3_name
                and isinstance(data["configuration"], dict)
                and set(data["configuration"].keys()) == {"length_bytes"}
                and isinstance(data["configuration"]["length_bytes"], int)
                # _from_json_unsafe floor-divides by code_point_bytes; a byte count that
                # is not a whole number of code points would be silently truncated
                and data["configuration"]["length_bytes"] % cls.code_point_bytes == 0
            )
        raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}")  # pragma: no cover

    def to_json(self, zarr_format: ZarrFormat) -> JSON:
        """
        Serialize this data type to its Zarr v2 (dtype string) or v3 (named dict) JSON form.

        Raises
        ------
        ValueError
            If ``zarr_format`` is neither 2 nor 3.
        """
        if zarr_format == 2:
            return self.to_dtype().str
        elif zarr_format == 3:
            return {
                "name": self._zarr_v3_name,
                # v3 metadata records the size in bytes, not in code points
                "configuration": {"length_bytes": self.length * self.code_point_bytes},
            }
        raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}")  # pragma: no cover

    @classmethod
    def _from_json_unsafe(cls, data: JSON, zarr_format: ZarrFormat) -> Self:
        """
        Build an instance from JSON without validating it first.

        The v2 form is parsed by numpy; the v3 form converts ``configuration.length_bytes``
        to a code-point count by floor division with ``code_point_bytes``.

        Raises
        ------
        ValueError
            If ``zarr_format`` is neither 2 nor 3.
        """
        if zarr_format == 2:
            return cls.from_dtype(np.dtype(data))  # type: ignore[arg-type]
        elif zarr_format == 3:
            return cls(length=data["configuration"]["length_bytes"] // cls.code_point_bytes)  # type: ignore[arg-type, index, call-overload, operator]
        raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}")  # pragma: no cover

    def default_value(self) -> np.str_:
        """Return the default scalar: an empty ``np.str_``."""
        return np.str_("")

    def to_json_value(self, data: object, *, zarr_format: ZarrFormat) -> str:
        """Encode a scalar as ``str(data)``. ``zarr_format`` is not consulted."""
        return str(data)

    def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_:
        """
        Convert a JSON string into a numpy unicode scalar.

        Raises
        ------
        TypeError
            If ``check_json_str`` rejects ``data``.
        """
        if check_json_str(data):
            return self.to_dtype().type(data)
        raise TypeError(f"Invalid type: {data}. Expected a string.")  # pragma: no cover

    def check_value(self, data: object) -> bool:
        """Return True if ``data`` is an instance of ``str``, ``np.str_``, ``bytes`` or ``int``."""
        # this is generous for backwards compatibility
        return isinstance(data, str | np.str_ | bytes | int)

    def _cast_value_unsafe(self, data: object) -> np.str_:
        """Convert ``data`` with the scalar type of ``to_dtype()``, without checking it first."""
        return self.to_dtype().type(data)

    @property
    def item_size(self) -> int:
        """Size of one element in bytes: ``length`` code points times ``code_point_bytes``."""
        return self.length * self.code_point_bytes
176+
177+
18178
if _NUMPY_SUPPORTS_VLEN_STRING:
19179

20180
@dataclass(frozen=True, kw_only=True)

0 commit comments

Comments
 (0)