diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 745b972e..2eb79553 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,12 +17,12 @@ jobs: - uses: actions/checkout@master - uses: actions/setup-python@master with: - python-version: 3.12 + python-version: 3.13 - run: pip install -r requirements-build-3_12.txt - run: python setup.py sdist - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: - name: dist + name: dist-sdist path: dist matrix_config: @@ -48,7 +48,6 @@ jobs: matrix: os: ${{ fromJson(needs.matrix_config.outputs.matrix_os) }} python: - - {minor: 9, req_build: 'requirements-build-3_11.txt', req_test: 'requirements-dev-3_11.txt'} - {minor: 10, req_build: 'requirements-build-3_11.txt', req_test: 'requirements-dev-3_11.txt'} - {minor: 11, req_build: 'requirements-build-3_11.txt', req_test: 'requirements-dev-3_11.txt'} - {minor: 12, req_build: 'requirements-build-3_12.txt', req_test: 'requirements-dev-3_12.txt'} @@ -87,10 +86,10 @@ jobs: CIBW_BEFORE_TEST: pip install -r {project}/${{ matrix.python.req_test }} CIBW_TEST_COMMAND: pytest {project}/test - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: - name: dist - path: dist + name: dist-wheels-${{ matrix.os }}-py3${{ matrix.python.minor }} # Unique artifact name + path: dist/* upload: name: Publish @@ -98,9 +97,14 @@ jobs: needs: [tar_gz, wheels] runs-on: ubuntu-22.04 steps: - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 + with: + name: dist-sdist + path: dist + - uses: actions/download-artifact@v4 with: - name: dist + pattern: dist-wheels-* path: dist + merge-multiple: true - uses: pypa/gh-action-pypi-publish@master with: diff --git a/README.rst b/README.rst index 2618650e..962f160e 100644 --- a/README.rst +++ b/README.rst @@ -29,7 +29,7 @@ Dependencies ArrayKit requires the following: -- Python>=3.9 +- Python>=3.10 - numpy>=1.19.5 diff --git a/performance/auto_map/fixtures.py 
b/performance/auto_map/fixtures.py new file mode 100644 index 00000000..e098a37e --- /dev/null +++ b/performance/auto_map/fixtures.py @@ -0,0 +1,273 @@ +import typing as tp + + +import numpy as np + +from arraymap import AutoMap +from arraymap import FrozenAutoMap + + +class PayLoad: + def __init__(self, array: np.ndarray): + self.array = array + self.list = list(array) + self.faml = FrozenAutoMap(self.list) + self.fama = FrozenAutoMap(self.array) + self.ama = AutoMap(self.array) + self.d = dict(zip(self.list, range(len(self.list)))) + self.sel_array = array[(np.arange(len(array)) % 2) == 0] + self.sel_scalar = list(self.sel_array) + + +# ------------------------------------------------------------------------------- +INT_START = 500 # avoid cached ints starting at 256 + + +class FixtureFactory: + NAME = "" + SORT = 0 + CACHE = {} # can be shared for all classes + + @staticmethod + def get_array(size: int) -> np.ndarray: + raise NotImplementedError() + + @classmethod + def get_label_array(cls, size: int) -> tp.Tuple[str, PayLoad]: + key = (cls, size) + if key not in cls.CACHE: + pl = PayLoad(cls.get_array(size)) + cls.CACHE[key] = pl + return cls.NAME, cls.CACHE[key] + + +class FFInt64(FixtureFactory): + NAME = "int64" + SORT = 0 + + @staticmethod + def get_array(size: int) -> np.ndarray: + array = np.arange(INT_START, INT_START + size, dtype=np.int64) + array.flags.writeable = False + return array + + +class FFInt32(FixtureFactory): + NAME = "int32" + SORT = 1 + + @staticmethod + def get_array(size: int) -> np.ndarray: + array = np.arange(INT_START, INT_START + size, dtype=np.int32) + array.flags.writeable = False + return array + + +class FFUInt64(FixtureFactory): + NAME = "uint64" + SORT = 2 + + @staticmethod + def get_array(size: int) -> np.ndarray: + array = np.arange(INT_START, INT_START + size, dtype=np.uint64) + array.flags.writeable = False + return array + + +class FFUInt32(FixtureFactory): + NAME = "uint32" + SORT = 3 + + @staticmethod + def 
get_array(size: int) -> np.ndarray: + array = np.arange(INT_START, INT_START + size, dtype=np.uint32) + array.flags.writeable = False + return array + + +class FFFloat64(FixtureFactory): + NAME = "float64" + SORT = 4 + + @staticmethod + def get_array(size: int) -> np.ndarray: + array = (np.arange(INT_START, INT_START + size) * 0.5).astype(np.float64) + array.flags.writeable = False + return array + + +class FFFloat32(FixtureFactory): + NAME = "float32" + SORT = 5 + + @staticmethod + def get_array(size: int) -> np.ndarray: + array = (np.arange(INT_START, INT_START + size) * 0.5).astype(np.float32) + array.flags.writeable = False + return array + + +def get_string_array(size: int, char_count: int, kind: str) -> np.ndarray: + fmt = f"-<{char_count}" + array = np.array( + [ + format(hex(e) * (char_count // 8), fmt) + for e in range(INT_START, INT_START + size) + ], + dtype=f"{kind}{char_count}", + ) + array.flags.writeable = False + return array + + +class FFU8(FixtureFactory): + NAME = "U8" + SORT = 6 + + @staticmethod + def get_array(size: int) -> np.ndarray: + return get_string_array(size, 8, "U") + + +class FFU16(FixtureFactory): + NAME = "U16" + SORT = 7 + + @staticmethod + def get_array(size: int) -> np.ndarray: + return get_string_array(size, 16, "U") + + +class FFU32(FixtureFactory): + NAME = "U32" + SORT = 8 + + @staticmethod + def get_array(size: int) -> np.ndarray: + return get_string_array(size, 32, "U") + + +class FFU64(FixtureFactory): + NAME = "U64" + SORT = 9 + + @staticmethod + def get_array(size: int) -> np.ndarray: + return get_string_array(size, 64, "U") + + +class FFU128(FixtureFactory): + NAME = "U128" + SORT = 10 + + @staticmethod + def get_array(size: int) -> np.ndarray: + return get_string_array(size, 128, "U") + + +class FFS8(FixtureFactory): + NAME = "S8" + SORT = 11 + + @staticmethod + def get_array(size: int) -> np.ndarray: + return get_string_array(size, 8, "S") + + +class FFS16(FixtureFactory): + NAME = "S16" + SORT = 12 + + @staticmethod + 
def get_array(size: int) -> np.ndarray: + return get_string_array(size, 16, "S") + + +class FFS32(FixtureFactory): + NAME = "S32" + SORT = 13 + + @staticmethod + def get_array(size: int) -> np.ndarray: + return get_string_array(size, 32, "S") + + +class FFS64(FixtureFactory): + NAME = "S64" + SORT = 14 + + @staticmethod + def get_array(size: int) -> np.ndarray: + return get_string_array(size, 64, "S") + + +class FFS128(FixtureFactory): + NAME = "S128" + SORT = 15 + + @staticmethod + def get_array(size: int) -> np.ndarray: + return get_string_array(size, 128, "S") + + +class FFDTY(FixtureFactory): + NAME = "dt[Y]" + SORT = 20 + + @staticmethod + def get_array(size: int) -> np.ndarray: + array = np.arange(INT_START, INT_START + size, dtype="datetime64[Y]") + array.flags.writeable = False + return array + + +class FFDTD(FixtureFactory): + NAME = "dt[D]" + SORT = 21 + + @staticmethod + def get_array(size: int) -> np.ndarray: + array = np.arange(INT_START, INT_START + size, dtype="datetime64[D]") + array.flags.writeable = False + return array + + +class FFDTs(FixtureFactory): + NAME = "dt[s]" + SORT = 22 + + @staticmethod + def get_array(size: int) -> np.ndarray: + array = np.arange(INT_START, INT_START + size, dtype="datetime64[s]") + array.flags.writeable = False + return array + + +class FFDTns(FixtureFactory): + NAME = "dt[ns]" + SORT = 23 + + @staticmethod + def get_array(size: int) -> np.ndarray: + array = np.arange(INT_START, INT_START + size, dtype="datetime64[ns]") + array.flags.writeable = False + return array + + +class FFObject(FixtureFactory): + NAME = "object" + SORT = 16 + + @staticmethod + def get_array(size: int) -> np.ndarray: + ints = np.arange(INT_START, INT_START + size) + array = ints.astype(object) + + target = 1 == ints % 3 + array[target] = ints[target] * 0.5 + + target = 2 == ints % 3 + array[target] = np.array([hex(e) for e in ints[target]]) + + array.flags.writeable = False + return array diff --git a/performance/auto_map/get-all-any.py 
b/performance/auto_map/get-all-any.py new file mode 100644 index 00000000..bc6d48b6 --- /dev/null +++ b/performance/auto_map/get-all-any.py @@ -0,0 +1,294 @@ +import os +import sys +import timeit +import typing as tp +from typing import NamedTuple +from itertools import repeat + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +import arraymap +from arraymap import AutoMap +from arraymap import FrozenAutoMap + +sys.path.append(os.getcwd()) + +from fixtures import ( + PayLoad, + FFInt64, + FFInt32, + FFUInt64, + FFFloat64, + FFU8, + FFU16, + FFS8, + FFS16, + FFDTY, + FFDTD, + FFDTs, + FFDTns, +) + + +# class PayLoad: +# def __init__(self, array: np.ndarray): +# self.array = array +# self.fama = FrozenAutoMap(self.array) +# self.sel_array = array[(np.arange(len(array)) % 2) == 0] +# self.sel_scalar = list(self.sel_array) +# self.sel_obj = self.sel_array.tolist() + + +class MapProcessor: + NAME = "" + SORT = -1 + + def __init__(self, pl: PayLoad): + self.array = pl.array + self.fama = pl.fama + self.sel_array = pl.sel_array + self.sel_scalar = pl.sel_scalar + + +# ------------------------------------------------------------------------------- +class ListCompAllScalar(MapProcessor): + NAME = "all: list comp, lookup by scalar" + SORT = 0 + + def __call__(self): + post = [self.fama[e] for e in self.sel_array] + assert len(post) == len(self.fama) // 2 + + +# class GetAllListObj(MapProcessor): +# NAME = "all: get all, lookup by obj list" +# SORT = 0 + +# def __call__(self): +# post = self.fama.get_all(self.sel_obj) +# assert len(post) == len(self.fama) // 2 + + +class GetAllListScalar(MapProcessor): + NAME = "all: get all, lookup by scalar list" + SORT = 0 + + def __call__(self): + post = self.fama.get_all(self.sel_scalar) + assert len(post) == len(self.fama) // 2 + + +class GetAllArray(MapProcessor): + NAME = "all: get all, lookup by array" + SORT = 0 + + def __call__(self): + post = self.fama.get_all(self.sel_array) + assert len(post) == 
len(self.fama) // 2 + + +# ------------------------------------------------------------------------------- +class ListCompAnyScalar(MapProcessor): + NAME = "any: list comp, lookup by scalar" + SORT = 0 + + def __call__(self): + post = [self.fama[e] for e in self.sel_array if e in self.fama] + assert len(post) == len(self.fama) // 2 + + +# class GetAnyListObj(MapProcessor): +# NAME = "any: get any, lookup by obj list" +# SORT = 0 + +# def __call__(self): +# post = self.fama.get_any(self.sel_obj) +# assert len(post) == len(self.fama) // 2 + + +class GetAnyListScalar(MapProcessor): + NAME = "any: get any, lookup by scalar list" + SORT = 0 + + def __call__(self): + post = self.fama.get_any(self.sel_scalar) + assert len(post) == len(self.fama) // 2 + + +class GetAnyArray(MapProcessor): + NAME = "any: get any, lookup by array" + SORT = 0 + + def __call__(self): + post = self.fama.get_any(self.sel_array) + assert len(post) == len(self.fama) // 2 + + +# ------------------------------------------------------------------------------- + + +def get_versions() -> str: + import platform + + return f"OS: {platform.system()} / ArrayMap: {arraymap.__version__} / NumPy: {np.__version__}\n" + + +CLS_FF = ( + FFInt64, + FFUInt64, + # FFFloat64, + FFU16, + FFS16, + FFDTY, + FFDTD, + FFDTs, + FFDTns, +) +FF_ORDER = [f.NAME for f in sorted(CLS_FF, key=lambda ff: ff.SORT)] + +# ------------------------------------------------------------------------------- +NUMBER = 20 + +from itertools import product + + +def seconds_to_display(seconds: float) -> str: + seconds /= NUMBER + if seconds < 1e-4: + return f"{seconds * 1e6: .1f} (µs)" + if seconds < 1e-1: + return f"{seconds * 1e3: .1f} (ms)" + return f"{seconds: .1f} (s)" + + +def plot_performance(frame, suffix: str = ""): + fixture_total = len(frame["fixture"].unique()) + cat_total = len(frame["size"].unique()) + processor_total = len(frame["cls_processor"].unique()) + fig, axes = plt.subplots(cat_total, fixture_total) + + # cmap = 
plt.get_cmap('terrain') + cmap = plt.get_cmap("plasma") + color = cmap(np.arange(processor_total) / processor_total) + + # category is the size of the array + for cat_count, (cat_label, cat) in enumerate(frame.groupby("size")): + + # fixture is the data type fixture + fixture_data = {fix_label: fix for fix_label, fix in cat.groupby("fixture")} + for fixture_count, fixture_label in enumerate(FF_ORDER): + fixture = fixture_data[fixture_label] + ax = axes[cat_count][fixture_count] + + # set order by cls_processor, i.e., the type of test being done + fixture["sort"] = [f.SORT for f in fixture["cls_processor"]] + fixture = fixture.sort_values("sort") + + results = fixture["time"].values.tolist() + names = [cls.NAME for cls in fixture["cls_processor"]] + # x = np.arange(len(results)) + names_display = names + post = ax.bar(names_display, results, color=color) + + # density, position = fixture_label.split('-') + # cat_label is the size of the array + title = f"{cat_label:.0e}\n{fixture_label}" + + ax.set_title(title, fontsize=6) + ax.set_box_aspect(0.8) + time_max = fixture["time"].max() + time_min = fixture["time"].min() + y_ticks = [0, time_min, time_max * 0.5, time_max] + y_labels = [ + "", + seconds_to_display(time_min), + seconds_to_display(time_max * 0.5), + seconds_to_display(time_max), + ] + if time_min > time_max * 0.25: + # remove the min if it is greater than quarter + y_ticks.pop(1) + y_labels.pop(1) + + ax.set_yticks(y_ticks) + ax.set_yticklabels(y_labels, fontsize=4) + # ax.set_xticks(x, names_display, rotation='vertical') + ax.tick_params( + axis="x", + bottom=False, + labelbottom=False, + ) + ax.tick_params( + axis="y", + length=2, + width=0.5, + pad=1, + ) + + fig.set_size_inches(9, 4) # width, height + fig.legend(post, names_display, loc="center right", fontsize=6) + # horizontal, vertical + fig.text(0.05, 0.96, f"AutoMap {suffix.title()}: {NUMBER} Iterations", fontsize=10) + fig.text(0.05, 0.90, get_versions(), fontsize=6) + + fp = 
f"/tmp/arraymap-{suffix}.png" + plt.subplots_adjust( + left=0.075, + bottom=0.05, + right=0.8, + top=0.80, + wspace=0.8, # width + hspace=0.2, + ) + # plt.rcParams.update({'font.size': 22}) + print(fp) + plt.savefig(fp, dpi=300) + + if sys.platform.startswith("linux"): + os.system(f"eog {fp}&") + else: + os.system(f"open {fp}") + + +def run_test(processors, suffix): + records = [] + for size in (10_000, 100_000, 1_000_000): + for ff in CLS_FF: + fixture_label, fixture = ff.get_label_array(size) + for cls in processors: + runner = cls(fixture) + + record = [cls, NUMBER, fixture_label, size] + print(record) + try: + result = timeit.timeit(f"runner()", globals=locals(), number=NUMBER) + except OSError: + result = np.nan + finally: + pass + record.append(result) + records.append(record) + + f = pd.DataFrame.from_records( + records, columns=("cls_processor", "number", "fixture", "size", "time") + ) + print(f) + plot_performance(f, suffix) + + +if __name__ == "__main__": + + cls_instantiate = ( + ListCompAllScalar, + # GetAllListObj, + GetAllListScalar, + GetAllArray, + ListCompAnyScalar, + # GetAnyListObj, + GetAnyListScalar, + GetAnyArray, + ) + + run_test(cls_instantiate, "get-all-any") diff --git a/performance/auto_map/npy-opt.py b/performance/auto_map/npy-opt.py new file mode 100644 index 00000000..b7c152f0 --- /dev/null +++ b/performance/auto_map/npy-opt.py @@ -0,0 +1,455 @@ +import os +import sys +import timeit +from typing import NamedTuple +from itertools import repeat + +import arraymap +from arraymap import AutoMap +from arraymap import FrozenAutoMap + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +sys.path.append(os.getcwd()) + +from fixtures import ( + PayLoad, + FFInt64, + FFInt32, + FFUInt64, + FFU8, + FFU16, + FFDTD, + FFDTY, + FFDTns, + FFDTs, +) + + +class MapProcessor: + NAME = "" + SORT = -1 + + def __init__(self, pl: PayLoad): + self.array = pl.array + self.list = pl.list + self.faml = pl.faml + self.fama = pl.fama + 
self.ama = pl.ama + self.d = pl.d + + +# ------------------------------------------------------------------------------- +class FAMLInstantiate(MapProcessor): + NAME = "FAM(L): instantiate" + SORT = 0 + + def __call__(self): + fam = FrozenAutoMap(self.list) + assert len(fam) == len(self.list) + + +class AMAInstantiate(MapProcessor): + NAME = "AM(A): instantiate" + SORT = 0 + + def __call__(self): + fam = AutoMap(self.array) + assert len(fam) == len(self.list) + + +class FAMAInstantiate(MapProcessor): + NAME = "FAM(A): instantiate" + SORT = 0 + + def __call__(self): + fam = FrozenAutoMap(self.array) + assert len(fam) == len(self.list) + + +class FAMAtolistInstantiate(MapProcessor): + NAME = "FAM(Atolist): instantiate" + SORT = 0 + + def __call__(self): + fam = FrozenAutoMap(self.array.tolist()) + assert len(fam) == len(self.list) + + +class DictInstantiate(MapProcessor): + NAME = "Dict: instantiate" + SORT = 0 + + def __call__(self): + d = dict(zip(self.list, range(len(self.list)))) + assert len(d) == len(self.list) + + +# ------------------------------------------------------------------------------- +class FAMLLookup(MapProcessor): + NAME = "FAM(L): lookup" + SORT = 0 + + def __call__(self): + m = self.faml + for k in self.list: + _ = m[k] + + +class FAMALookup(MapProcessor): + NAME = "FAM(A): lookup" + SORT = 0 + + def __call__(self): + m = self.fama + for k in self.list: + _ = m[k] + + +class DictLookup(MapProcessor): + NAME = "Dict: lookup" + SORT = 0 + + def __call__(self): + m = self.d + for k in self.list: + _ = m[k] + + +# ------------------------------------------------------------------------------- +class FAMLLookupScalar(MapProcessor): + NAME = "FAM(L): lookup scalar" + SORT = 0 + + def __call__(self): + m = self.faml + for k in self.array: + _ = m[k] + + +class FAMALookupScalar(MapProcessor): + NAME = "FAM(A): lookup scalar" + SORT = 0 + + def __call__(self): + m = self.fama + for k in self.array: + _ = m[k] + + +class AMALookupScalar(MapProcessor): + 
NAME = "AM(A): lookup scalar" + SORT = 0 + + def __call__(self): + m = self.ama + for k in self.array: + _ = m[k] + + +class DictLookupScalar(MapProcessor): + NAME = "Dict: lookup scalar" + SORT = 0 + + def __call__(self): + m = self.d + for k in self.array: + _ = m[k] + + +# ------------------------------------------------------------------------------- +class FAMLNotIn(MapProcessor): + NAME = "FAM(L): not in" + SORT = 0 + + def __call__(self): + m = self.faml + for _ in self.list: + assert None not in m + + +class FAMANotIn(MapProcessor): + NAME = "FAM(A): not in" + SORT = 0 + + def __call__(self): + m = self.fama + for _ in self.list: + assert None not in m + + +class AMANotIn(MapProcessor): + NAME = "AM(A): not in" + SORT = 0 + + def __call__(self): + m = self.ama + for _ in self.array: + assert None not in m + + +class DictNotIn(MapProcessor): + NAME = "Dict: not in" + SORT = 0 + + def __call__(self): + m = self.d + for _ in self.list: + assert None not in m + + +# ------------------------------------------------------------------------------- +class FAMLKeys(MapProcessor): + NAME = "FAM(L): keys" + SORT = 0 + + def __call__(self): + for v in self.faml.keys(): + pass + + +class FAMAKeys(MapProcessor): + NAME = "FAM(A): keys" + SORT = 0 + + def __call__(self): + for v in self.fama.keys(): + pass + + +class DictKeys(MapProcessor): + NAME = "Dict: keys" + SORT = 0 + + def __call__(self): + for v in self.d.keys(): + pass + + +# ------------------------------------------------------------------------------- +class FAMLItems(MapProcessor): + NAME = "FAM(L): items" + SORT = 0 + + def __call__(self): + for k, v in self.faml.items(): + pass + + +class FAMAItems(MapProcessor): + NAME = "FAM(A): items" + SORT = 0 + + def __call__(self): + for k, v in self.fama.items(): + pass + + +class DictItems(MapProcessor): + NAME = "Dict: items" + SORT = 0 + + def __call__(self): + for k, v in self.d.items(): + pass + + +def get_versions() -> str: + import platform + + return f"OS: 
{platform.system()} / ArrayMap: {arraymap.__version__} / NumPy: {np.__version__}\n" + + +CLS_FF = ( + # FFInt32, + FFInt64, + # FFUInt32, + # FFUInt64, + # FFFloat64, + # FFU8, + # FFU16, + # FFU32, + # FFU64, + # FFU128, + # FFS8, + # FFS16, + # FFS32, + # FFS64, + # FFS128, + # FFObject, + FFDTY, + FFDTD, + FFDTs, + FFDTns, +) +FF_ORDER = [f.NAME for f in sorted(CLS_FF, key=lambda ff: ff.SORT)] + +# ------------------------------------------------------------------------------- +NUMBER = 2 + + +def seconds_to_display(seconds: float) -> str: + seconds /= NUMBER + if seconds < 1e-4: + return f"{seconds * 1e6: .1f} (µs)" + if seconds < 1e-1: + return f"{seconds * 1e3: .1f} (ms)" + return f"{seconds: .1f} (s)" + + +def plot_performance(frame, suffix: str = ""): + fixture_total = len(frame["fixture"].unique()) + cat_total = len(frame["size"].unique()) + processor_total = len(frame["cls_processor"].unique()) + fig, axes = plt.subplots(cat_total, fixture_total) + + # cmap = plt.get_cmap('terrain') + cmap = plt.get_cmap("plasma") + color = cmap(np.arange(processor_total) / processor_total) + + # category is the size of the array + for cat_count, (cat_label, cat) in enumerate(frame.groupby("size")): + + # fixture is the data type fixture + fixture_data = {fix_label: fix for fix_label, fix in cat.groupby("fixture")} + for fixture_count, fixture_label in enumerate(FF_ORDER): + fixture = fixture_data[fixture_label] + ax = axes[cat_count][fixture_count] + + # set order by cls_processor, i.e., the type of test being done + fixture["sort"] = [f.SORT for f in fixture["cls_processor"]] + fixture = fixture.sort_values("sort") + + results = fixture["time"].values.tolist() + names = [cls.NAME for cls in fixture["cls_processor"]] + # x = np.arange(len(results)) + names_display = names + post = ax.bar(names_display, results, color=color) + + # density, position = fixture_label.split('-') + # cat_label is the size of the array + title = f"{cat_label:.0e}\n{fixture_label}" + + 
ax.set_title(title, fontsize=6) + ax.set_box_aspect(0.8) + time_max = fixture["time"].max() + time_min = fixture["time"].min() + y_ticks = [0, time_min, time_max * 0.5, time_max] + y_labels = [ + "", + seconds_to_display(time_min), + seconds_to_display(time_max * 0.5), + seconds_to_display(time_max), + ] + if time_min > time_max * 0.25: + # remove the min if it is greater than quarter + y_ticks.pop(1) + y_labels.pop(1) + + ax.set_yticks(y_ticks) + ax.set_yticklabels(y_labels, fontsize=4) + # ax.set_xticks(x, names_display, rotation='vertical') + ax.tick_params( + axis="x", + bottom=False, + labelbottom=False, + ) + ax.tick_params( + axis="y", + length=2, + width=0.5, + pad=1, + ) + fig.set_size_inches(9, 3) # width, height + fig.legend(post, names_display, loc="center right", fontsize=6) + # horizontal, vertical + fig.text(0.05, 0.96, f"AutoMap {suffix.title()}: {NUMBER} Iterations", fontsize=10) + fig.text(0.05, 0.90, get_versions(), fontsize=6) + + fp = f"/tmp/arraymap-{suffix}.png" + plt.subplots_adjust( + left=0.075, + bottom=0.05, + right=0.85, + top=0.80, + wspace=1.0, # width + hspace=0.5, + ) + # plt.rcParams.update({'font.size': 22}) + plt.savefig(fp, dpi=300) + + if sys.platform.startswith("linux"): + os.system(f"eog {fp}&") + else: + os.system(f"open {fp}") + + +def run_test(processors, suffix): + records = [] + for size in (10_000, 100_000, 1_000_000): + for ff in CLS_FF: + fixture_label, fixture = ff.get_label_array(size) + for cls in processors: + runner = cls(fixture) + + record = [cls, NUMBER, fixture_label, size] + print(record) + try: + result = timeit.timeit(f"runner()", globals=locals(), number=NUMBER) + except OSError: + result = np.nan + finally: + pass + record.append(result) + records.append(record) + + f = pd.DataFrame.from_records( + records, columns=("cls_processor", "number", "fixture", "size", "time") + ) + print(f) + plot_performance(f, suffix) + + +if __name__ == "__main__": + + CLS_PROCESSOR = ( + FAMLInstantiate, + FAMAInstantiate, 
+ AMAInstantiate, + DictInstantiate, + # FAMLLookup, + # FAMALookup, + # DictLookup, + # FAMLLookupScalar, + # FAMALookupScalar, + # DictLookupScalar, + # FAMLNotIn, + # FAMANotIn, + # DictNotIn, + # FAMLKeys, + # FAMAKeys, + # DictKeys, + ) + + cls_instantiate = ( + FAMLInstantiate, + FAMAInstantiate, + AMAInstantiate, + DictInstantiate, + ) + + cls_lookup = ( + FAMLLookupScalar, + FAMALookupScalar, + AMALookupScalar, + DictLookupScalar, + # FAMLNotIn, + # FAMANotIn, + # AMANotIn, + # DictNotIn, + ) + + run_test(cls_instantiate, "instantiate") + run_test(cls_lookup, "lookup") diff --git a/performance/auto_map/npy-opt.txt b/performance/auto_map/npy-opt.txt new file mode 100644 index 00000000..1b3911dd --- /dev/null +++ b/performance/auto_map/npy-opt.txt @@ -0,0 +1,143 @@ + +These changes integrate direct support of NumPy arrays given as keys to `AutoMap`s and `FrozenAutoMap`s, optimizing their usage. + +Improvements are made in `AutoMap` initialization, whereby an array is converted to a list using optimal array methods; that list is then held as the keys. + +Improvements are made in `FrozenAutoMap` initialization, whereby an immutable array (for integer, floating point, and flexible dtypes), when given as keys, is held as a reference without copying to a list. Further, hashing and lookup make use of C types, avoiding any management of PyObjects. + +For array dtypes not explicitly handled, or for non-array keys, `FrozenAutoMap` operation is unchanged. In all cases, hash table layout and scanning, and management of the PyObject integer cache, are the same as before. + +A key change is that, on initialization (`fam_init`), a `KeysArrayType` enum value is assigned to the `keys_array_type` attribute of `FAMObject`. This is used for branching in all places where divergent behavior is required between keys stored as lists (as was done previously) or as keys stored as typed arrays. 
+ +Performance panels compare FAM(L), FAM(A), AM(A), and Dict (`FrozenAutoMap` created from a list, `FrozenAutoMap` created from an array, `AutoMap` created from an array, and a dictionary implementing an `AutoMap` mapping). Key indicators are the performance of instantiation and lookup. + +The relevant comparison for StaticFrame usage is between FAM(A) and AM(A), the latter approximating what StaticFrame does presently when creating `FrozenAutoMap`s. (FAM(L) is not available to StaticFrame as AutoMaps are always created from an array, not a list of Python objects.) + +Across all supported types, FAM(A) initialization is more than twice as fast as AM(A). FAM(A) lookup performance varies greatly by type, but always out-performs AM(A), in some cases more than twice as fast as AM(A). In all tests, we see signs that out-performance grows with scale. + +Independent of performance time, all `FrozenAutoMap` usage of arrays reduces memory usage: no new `PyObject`s are created and the passed array simply has its reference count incremented. + + + +Key Changes + +Split the old `fam_new()` into `fam_new()` and `fam_init()`, implemented `__setstate__()`, `__getstate__()`: + To support pickling a FAM with a NumPy array, `__setstate__()` must re-set the `writeable` flag of an array to False. + To integrate `__setstate__()`, the old `fam_new()` had to be divided into a `fam_new()` and a `fam_init()`. + +Based on `keys_array_type`, `fam_init` calls type-specific insert routines, which use type-specific hash routines to add entries to the hash table. + +On lookup, type-specific lookup routines are called based on `keys_array_type`. These routines identify PyObjects as PyArray scalars or native PyObject types, extract the appropriate C-type, compute a hash, and use type-specific lookup routines to discover the position in the keys array. + + +Split `copy()` into `copy()` and `copy_to_new()`. 
+ Due to now having `fam_new()` and `fam_init()`, copy allocation and copy setting needed to be split into two methods. Now, in `fam_init`, if a `FAMType` is identified as the keys argument, `copy_to_new()` is used to transfer values from the argument to the new instance. The `copy()` function remains, now using `fam_new()` and `copy_to_new()`. + +Additions to the `FAMObject` struct: + `keys_array_type`: A `KeysArrayType` enum specifying a list or array dtype. As a list is assigned zero (and all other array dtypes as non-zero), the value can be used to branch on non-array versus array processing. + + `keys_size`: As determining size must branch on `keys_array_type`, this attribute is used to track size, avoiding having to go to the underlying keys container. + + `key_buffer`: For Unicode arrays, this is a dynamically allocated buffer of size equal to one more than the array's max number of characters. This buffer is given to `PyUnicode_AsUCS4`, which will add a NULL and is why the size of the buffer is one more than max characters. This is only used for a FAM with Unicode array keys; all other usage keeps this as NULL. + +The type of `key_count_global` is now a platform independent 64 bit signed integer. Previously, it was a `Py_ssize_t`, which is 32 bits on 32 bit systems and could overflow in scenarios when many indices are created. + +Extended property tests for FAMs with arrays + A custom Hypothesis strategy has been implemented to deliver both contiguous and non-contiguous arrays. + + New Hypothesis tests for array-initialized `FrozenAutoMap`s now cover all features previously tested by Hypothesis. + + + + + + + +At the core of every StaticFrame Index is an AutoMap (or the immutable FrozenAutoMap), a custom dictionary-like container implemented as a C-extension. This container meets the special needs of Index objects: given a sequence of hashable labels, permit obtaining their position with near constant-time performance. 
This can be done with standard dicts: given a sequence of hashables (seq) we could do the following: {k: v for v, k in enumerate(seq)}. This is inefficient, however, in that the values are always just contiguous integers; duplicating them for every dictionary is unnecessary. The first implementation optimized this aspect by using a shared cache of contiguous integers. In addition, that first implementation used a novel hash-table collision scanning approach to take advantage of contiguous memory. While this permitted creating AutoMaps in about half the time as normal dicts, that performance was only available when starting with a list of Python objects. As every StaticFrame Index stores an immutable array of labels, converting that array to a list was costly and significantly reduced the performance advantage. Even more, the opportunity of just holding a reference to the immutable array was not taken, missing out on performance and memory savings. Over the past few weeks I have extended FrozenAutoMap to work directly with immutable arrays, holding on to them without copy, and using their C-types directly for hashing and lookup comparison. In most cases, particularly at scale, FrozenAutoMap, given an immutable array of non-object types, now outperforms any AutoMap created from a list. This offers a potentially significant general performance improvement throughout StaticFrame. + + + +AutoMap + +A specialized dictionary where keys and their ordering are the only thing that matters + +A key points to an integer; we want constant-time lookup from a key to its ordered position. 
+ +Original AutoMap: + + Took arbitrary iterables and loaded them into an internally managed List + + The hash "table" is an array of structs, where each item is a pair of hash code and the index into keys list to find the key (struct TableElement) + + Hashing used PyObject_Hash (lookup()) + + Comparison to other objects used PyObject_RichCompareBool (lookup_hash()) + + On insert, we compute a hash, find a hash table index, and look up the found hash; we search the table until we find an empty position. + + On lookup, we compute a hash and map it to a table_pos; fetching this table position, we see if the hash matches; if so, we fetch the key and compare it; if not a match (or if hashes do not match), we continue searching the table. + + Brandt's original implementation used a sequential scan when possible to optimize performance. + + +Problems with Arrays. + + For every SF index, there is an immutable NumPy array of labels. + + When creating a FrozenAutoMap from an array, those values would be extracted as Scalars into a list. + + It was discovered that for most types, calling `tolist()` first was faster than letting FAM iterate the array and create a list. This created an intermediary list that was thrown away. + + The goal of using immutable arrays is no-copy reuse. How can we use this in a FAM? + + For AutoMap (mutable), we can continue to use a list + + +NumPy Arrays + + A PyObject that wraps (sometimes) contiguous byte data interpreted (with a dtype and strides) as N-dimensional array data. 
+ + There are two ways to get elements out of an array: + as a PyObject + as a Scalar + Calling `tolist()` creates PyObjects + Selection & iteration result in Scalars + + The performance advantage of NumPy is using the byte data and C-types directly, avoiding PyObjects + + +First Approach + + Store immutable array instead of keys list (no-copy) + + For insertion and hashing, use PyArray_GETITEM to get a PyObject + + For lookup, use PyArray_GETITEM to get a PyObject, then use PyObject_RichCompare + + Performance was not improved (memory might have been) + + Using PyArray_ToScalar was slower. + + Problem: still have to create a PyObject for every element on initialization and lookup. + + +Second Approach + + Identify arrays on initialization + + On insert, read C-type from byte-data and pass it to type-specific insertion function. + + If array is contiguous, can do sequential access after extracting PyArray_DATA + + If not, can use PyArray_GETPTR1 + + Type-specific insertion can use type-specific hash function (no PyObjects). + + Type-specific lookup can compare a foreign key (as a C type) to value in the array (extracted with PyArray_GETPTR1) + + Generic lookup routines know the type of the stored array: can reject foreign keys by type, before hashing + + Given Scalars, can extract data. + + Given PyObjects, can convert to C-types for type-specific lookup. 
+ diff --git a/setup.py b/setup.py index bcfce559..b20d8123 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,7 @@ def get_ext_dir(*components: tp.Iterable[str]) -> tp.Sequence[str]: 'src/delimited_to_arrays.c', 'src/methods.c', 'src/tri_map.c', + 'src/auto_map.c', ], include_dirs=get_ext_dir('numpy', '_core', 'include') + ['src'], library_dirs=get_ext_dir('numpy', '_core', 'lib'), @@ -68,7 +69,6 @@ def get_ext_dir(*components: tp.Iterable[str]) -> tp.Sequence[str]: 'Operating System :: MacOS :: MacOS X', 'Operating System :: Microsoft :: Windows', 'Operating System :: POSIX', - 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', diff --git a/src/__init__.py b/src/__init__.py index 8c85b5e5..dc4ed2e8 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -31,3 +31,7 @@ from ._arraykit import array_to_tuple_array as array_to_tuple_array from ._arraykit import array_to_tuple_iter as array_to_tuple_iter from ._arraykit import nonzero_1d as nonzero_1d + +from ._arraykit import AutoMap as AutoMap +from ._arraykit import FrozenAutoMap as FrozenAutoMap +from ._arraykit import NonUniqueError as NonUniqueError diff --git a/src/__init__.pyi b/src/__init__.pyi index 25a763c7..6304f89d 100644 --- a/src/__init__.pyi +++ b/src/__init__.pyi @@ -108,6 +108,39 @@ class BlockIndex: ) -> tp.Iterator[tp.Tuple[int, tp.Union[slice, int]]]: ... def iter_block(self) -> tp.Iterator[tp.Tuple[int, slice]]: ... + +class FrozenAutoMap: + def __init__(self, labels: tp.Iterable[_TLabel] | np.ndarray, /,) -> None: ... + def get(self, __key: _TLabel, /,) -> int: ... + def keys(self) -> tp.Iterator[_TLabel]: ... + def items(self) -> tp.Iterator[tuple[_TLabel, int]]: ... + def values(self) -> tp.Iterator[int]: ... + def get_all(self, __key: list[_TLabel] | np.ndarray) -> np.ndarray: ... + def get_any(self, __key: list[_TLabel] | np.ndarray) -> list[int]: ... 
+ + def __iter__(self) -> tp.Iterator[_TLabel]: ... + def __getitem__(self, __key: tp.Any) -> int: ... + def __contains__(self, __key: tp.Any) -> bool: ... + def __getnewargs__(self) -> tp.Any: ... + def __reversed__(self) -> tp.Any: ... + def __sizeof__(self) -> int: ... + def __getstate__(self) -> tp.Any: ... + def __setstate__(self, __state: tp.Any) -> None: ... + def __len__(self) -> int: ... + + def __or__(self) -> tp.Any: ... + def __ror__(self) -> tp.Any: ... + + +class AutoMap(FrozenAutoMap): + def __init__(self, labels: tp.Iterable[_TLabel] | np.ndarray, /,) -> None: ... + def __ior__(self) -> tp.Any: ... + def add(self, __key: int) -> None: ... + def update(self, __keys: tp.Iterable[_TLabel] | np.ndarray) -> None: ... + + + + def iterable_str_to_array_1d( iterable: tp.Iterable[str], *, diff --git a/src/_arraykit.c b/src/_arraykit.c index 5ceacac0..1646a0ca 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -11,6 +11,7 @@ # include "delimited_to_arrays.h" # include "methods.h" # include "tri_map.h" +# include "auto_map.h" static PyMethodDef arraykit_methods[] = { {"immutable_filter", immutable_filter, METH_O, NULL}, @@ -85,6 +86,15 @@ PyInit__arraykit(void) return NULL; } + NonUniqueError = PyErr_NewExceptionWithDoc( + "arraykit.NonUniqueError", + "ValueError for non-unique values.", + PyExc_ValueError, + NULL); + if (NonUniqueError == NULL) { + return NULL; + } + PyObject *copy = PyImport_ImportModule("copy"); if (copy == NULL) { return NULL; @@ -107,12 +117,19 @@ PyInit__arraykit(void) PyType_Ready(&BIIterBlockType) || PyType_Ready(&TriMapType) || PyType_Ready(&ArrayGOType) || + PyType_Ready(&AMType) || + PyType_Ready(&FAMIType) || + PyType_Ready(&FAMVType) || + PyType_Ready(&FAMType) || PyModule_AddObject(m, "BlockIndex", (PyObject *) &BlockIndexType) || PyModule_AddObject(m, "TriMap", (PyObject *) &TriMapType) || PyModule_AddObject(m, "ArrayGO", (PyObject *) &ArrayGOType) || PyModule_AddObject(m, "deepcopy", deepcopy) || - PyModule_AddObject(m, 
"ErrorInitTypeBlocks", ErrorInitTypeBlocks) - ){ + PyModule_AddObject(m, "ErrorInitTypeBlocks", ErrorInitTypeBlocks) || + PyModule_AddObject(m, "AutoMap", (PyObject *)&AMType) || + PyModule_AddObject(m, "FrozenAutoMap", (PyObject *)&FAMType) || + PyModule_AddObject(m, "NonUniqueError", NonUniqueError) +){ Py_DECREF(deepcopy); Py_XDECREF(m); return NULL; diff --git a/src/auto_map.c b/src/auto_map.c new file mode 100644 index 00000000..879c5833 --- /dev/null +++ b/src/auto_map.c @@ -0,0 +1,2714 @@ +// For background on the hashtable design first implemented in AutoMap, see the following: +// https://github.com/brandtbucher/automap/blob/b787199d38d6bfa1b55484e5ea1e89b31cc1fa72/automap.c#L12 +# include +# include "Python.h" +# include "stdbool.h" + +# define PY_SSIZE_T_CLEAN + +# define NO_IMPORT_ARRAY +# define PY_ARRAY_UNIQUE_SYMBOL AK_ARRAY_API +# define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION + +# include "auto_map.h" +# include "numpy/arrayobject.h" +# include "numpy/arrayscalars.h" +# include "numpy/halffloat.h" + +# define DEBUG_MSG_OBJ(msg, obj) \ + fprintf(stderr, "--- %s: %i: %s: ", __FILE__, __LINE__, __FUNCTION__); \ + fprintf(stderr, #msg " "); \ + PyObject_Print(obj, stderr, 0); \ + fprintf(stderr, "\n"); \ + fflush(stderr); \ + +//------------------------------------------------------------------------------ +// Common + +// static PyTypeObject AMType; +// static PyTypeObject FAMIType; +// static PyTypeObject FAMVType; +// static PyTypeObject FAMType; +PyObject *NonUniqueError; + +// The main storage "table" is an array of TableElement +typedef struct TableElement{ + Py_ssize_t keys_pos; + Py_hash_t hash; +} TableElement; + +// Table configuration; experimentation shows that these values work well: +# define LOAD 0.9 +# define SCAN 16 + +const static size_t UCS4_SIZE = sizeof(Py_UCS4); + +// Partial, two-argument version of PyUnicode_FromKindAndData for consistent templating with bytes version. 
+static inline PyObject* +PyUnicode_FromUCS4AndData(const void *buffer, Py_ssize_t size) { + return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buffer, size); +} + +typedef enum KeysArrayType{ + KAT_LIST = 0, // must be falsy + + KAT_INT8, // order matters as ranges of size are used in selection + KAT_INT16, + KAT_INT32, + KAT_INT64, + + KAT_UINT8, + KAT_UINT16, + KAT_UINT32, + KAT_UINT64, + + KAT_FLOAT16, + KAT_FLOAT32, + KAT_FLOAT64, + + KAT_UNICODE, + KAT_STRING, + + KAT_DTY, + KAT_DTM, + KAT_DTW, + KAT_DTD, + + KAT_DTh, + KAT_DTm, + KAT_DTs, + KAT_DTms, + KAT_DTus, + KAT_DTns, + KAT_DTps, + KAT_DTfs, + KAT_DTas, +} KeysArrayType; + +NPY_DATETIMEUNIT +dt_unit_from_array(PyArrayObject* a) { + // This is based on get_datetime_metadata_from_dtype in the NumPy source, but that function is private. This does not check that the dytpe is of the appropriate type. + PyArray_Descr* dt = PyArray_DESCR(a); // borrowed ref + PyArray_DatetimeMetaData* dma = &(((PyArray_DatetimeDTypeMetaData *)PyDataType_C_METADATA(dt))->meta); + return dma->base; +} + +NPY_DATETIMEUNIT +dt_unit_from_scalar(PyDatetimeScalarObject* dts) { + // Based on convert_pyobject_to_datetime and related usage in datetime.c + PyArray_DatetimeMetaData* dma = &(dts->obmeta); + return dma->base; +} + +KeysArrayType +at_to_kat(int array_t, PyArrayObject* a) { + switch (array_t) { + case NPY_INT64: + return KAT_INT64; + case NPY_INT32: + return KAT_INT32; + case NPY_INT16: + return KAT_INT16; + case NPY_INT8: + return KAT_INT8; + + case NPY_UINT64: + return KAT_UINT64; + case NPY_UINT32: + return KAT_UINT32; + case NPY_UINT16: + return KAT_UINT16; + case NPY_UINT8: + return KAT_UINT8; + + case NPY_FLOAT64: + return KAT_FLOAT64; + case NPY_FLOAT32: + return KAT_FLOAT32; + case NPY_FLOAT16: + return KAT_FLOAT16; + + case NPY_UNICODE: + return KAT_UNICODE; + case NPY_STRING: + return KAT_STRING; + + case NPY_DATETIME: { + NPY_DATETIMEUNIT dtu = dt_unit_from_array(a); + switch (dtu) { + case NPY_FR_Y: + return 
KAT_DTY; + case NPY_FR_M: + return KAT_DTM; + case NPY_FR_W: + return KAT_DTW; + case NPY_FR_D: + return KAT_DTD; + case NPY_FR_h: + return KAT_DTh; + case NPY_FR_m: + return KAT_DTm; + case NPY_FR_s: + return KAT_DTs; + case NPY_FR_ms: + return KAT_DTms; + case NPY_FR_us: + return KAT_DTus; + case NPY_FR_ns: + return KAT_DTns; + case NPY_FR_ps: + return KAT_DTps; + case NPY_FR_fs: + return KAT_DTfs; + case NPY_FR_as: + return KAT_DTas; + case NPY_FR_ERROR: + case NPY_FR_GENERIC: + return KAT_LIST; // fall back to list + } + } + default: + return KAT_LIST; + } +} + +// To determine when we can use direct array lookups, this function return 1 if we match, 0 if we do not match. Given a keys array type and the kind of lookup key, return true only for the largest KAT types. +int +kat_is_kind(KeysArrayType kat, char kind) { + switch (kat) { + case KAT_INT64: + // case KAT_INT32: + // case KAT_INT16: + // case KAT_INT8: + return kind == 'i'; + + case KAT_UINT64: + // case KAT_UINT32: + // case KAT_UINT16: + // case KAT_UINT8: + return kind == 'u'; + + case KAT_FLOAT64: + // case KAT_FLOAT32: + // case KAT_FLOAT16: + return kind == 'f'; + + case KAT_UNICODE: + return kind == 'U'; + case KAT_STRING: + return kind == 'S'; + + case KAT_DTY: + case KAT_DTM: + case KAT_DTW: + case KAT_DTD: + case KAT_DTh: + case KAT_DTm: + case KAT_DTs: + case KAT_DTms: + case KAT_DTus: + case KAT_DTns: + case KAT_DTps: + case KAT_DTfs: + case KAT_DTas: + return kind == 'M'; + + default: + return 0; + } +} + +// Given a KAT, determine if it matches a NumPy dt64 unit. 
+bool +kat_is_datetime_unit(KeysArrayType kat, NPY_DATETIMEUNIT unit) { + switch (kat) { + case KAT_DTY: + if (unit == NPY_FR_Y ) {return true;} + break; + case KAT_DTM: + if (unit == NPY_FR_M ) {return true;} + break; + case KAT_DTW: + if (unit == NPY_FR_W ) {return true;} + break; + case KAT_DTD: + if (unit == NPY_FR_D ) {return true;} + break; + case KAT_DTh: + if (unit == NPY_FR_h ) {return true;} + break; + case KAT_DTm: + if (unit == NPY_FR_m ) {return true;} + break; + case KAT_DTs: + if (unit == NPY_FR_s ) {return true;} + break; + case KAT_DTms: + if (unit == NPY_FR_ms) {return true;} + break; + case KAT_DTus: + if (unit == NPY_FR_us) {return true;} + break; + case KAT_DTns: + if (unit == NPY_FR_ns) {return true;} + break; + case KAT_DTps: + if (unit == NPY_FR_ps) {return true;} + break; + case KAT_DTfs: + if (unit == NPY_FR_fs) {return true;} + break; + case KAT_DTas: + if (unit == NPY_FR_as) {return true;} + break; + default: // non dt64 KATs + return false; + } + return false; +} + +typedef struct FAMObject{ + PyObject_HEAD + Py_ssize_t table_size; + TableElement *table; // an array of TableElement structs + PyObject *keys; + KeysArrayType keys_array_type; + Py_ssize_t keys_size; + Py_UCS4* key_buffer; +} FAMObject; + +typedef enum ViewKind{ + ITEMS, + KEYS, + VALUES, +} ViewKind; + +// Return the end pointer, or the pointer to the location after the last valid character. The end pointer minus the start pointer is the number of characters. For an empty string, all characters are NULL, and the start pointer and end pointer should be equal. NOTE: would like to use strchr(str, '\0') instead of this routine, but some buffers might not have a null terminator and stread by full to the the dt_size. 
+static inline Py_UCS4* +ucs4_get_end_p(Py_UCS4* p_start, Py_ssize_t dt_size) { + for (Py_UCS4* p = p_start + dt_size - 1; p >= p_start; p--) { + if (*p != '\0') { + return p + 1; // 1 after first non-null + } + } + return p_start; +} + +static inline char* +char_get_end_p(char* p_start, Py_ssize_t dt_size) { + for (char* p = p_start + dt_size - 1; p >= p_start; p--) { + if (*p != '\0') { + return p + 1; // 1 after first non-null + } + } + return p_start; +} + +// This masks the input with INT64_MAX, which removes the MSB; we then cast to an int64; the range is now between 0 and INT64_MAX. We then use the MSB of the original value; if set, we negate the number, producing negative values for the upper half of the uint64 range. Note that we only need to check for hash -1 in this branch. +static inline Py_hash_t +uint_to_hash(npy_uint64 v) { + Py_hash_t hash = (Py_hash_t)(v & INT64_MAX); + if (v >> 63) { + hash = -hash; + } + if (hash == -1) { // might happen due to overflow on 32 bit systems + return -2; + } + return hash; +} + +static inline Py_hash_t +int_to_hash(npy_int64 v) { + Py_hash_t hash = (Py_hash_t)v; + if (hash == -1) { + return -2; + } + return hash; +} + +// This is a adapted from https://github.com/python/cpython/blob/ba65a065cf07a7a9f53be61057a090f7311a5ad7/Python/pyhash.c#L92 +#define HASH_MODULUS (((size_t)1 << 61) - 1) +#define HASH_BITS 61 +static inline Py_hash_t +double_to_hash(double v) +{ + int e, sign; + double m; + Py_uhash_t x, y; + + if (isinf(v)) { + return v > 0 ? 314159 : -314159; + } + if (isnan(v)) { + return 0; + } + m = frexp(v, &e); + sign = 1; + if (m < 0) { + sign = -1; + m = -m; + } + x = 0; + while (m) { + x = ((x << 28) & HASH_MODULUS) | x >> (HASH_BITS - 28); + m *= 268435456.0; /* 2**28 */ + e -= 28; + y = (Py_uhash_t)m; /* pull out integer part */ + m -= y; + x += y; + if (x >= HASH_MODULUS) + x -= HASH_MODULUS; + } + e = e >= 0 ? 
e % HASH_BITS : HASH_BITS-1-((-1-e) % HASH_BITS); + x = ((x << e) & HASH_MODULUS) | x >> (HASH_BITS - e); + x = x * sign; + if (x == (Py_uhash_t)-1) + x = (Py_uhash_t)-2; + return (Py_hash_t)x; +} + +// The `str` arg is a pointer to a C-array of Py_UCS4; we will only read `len` characters from this. This is a "djb2" hash algorithm. +static inline Py_hash_t +unicode_to_hash(Py_UCS4 *str, Py_ssize_t len) { + Py_UCS4* p = str; + Py_UCS4* p_end = str + len; + Py_hash_t hash = 5381; + while (p < p_end) { + hash = ((hash << 5) + hash) + *p++; + } + if (hash == -1) { + return -2; + } + return hash; +} + +static inline Py_hash_t +string_to_hash(char *str, Py_ssize_t len) { + char* p = str; + char* p_end = str + len; + Py_hash_t hash = 5381; + while (p < p_end) { + hash = ((hash << 5) + hash) + *p++; + } + if (hash == -1) { + return -2; + } + return hash; +} + +//------------------------------------------------------------------------------ +// the global int_cache is shared among all instances + +static PyObject *int_cache = NULL; + +// NOTE: this used to be a Py_ssize_t, which can be 32 bits on some machines and might easily overflow with a few very large indices. Using an explicit 64-bit int seems safer +static npy_int64 key_count_global = 0; + +// Fill the int_cache up to size_needed with PyObject ints; `size` is not the key_count_global. +static int +int_cache_fill(Py_ssize_t size_needed) +{ + PyObject *item; + if (!int_cache) { + int_cache = PyList_New(0); + if (!int_cache) { + return -1; + } + } + for (Py_ssize_t i = PyList_GET_SIZE(int_cache); i < size_needed; i++) { + item = PyLong_FromSsize_t(i); + if (!item) { + return -1; + } + if (PyList_Append(int_cache, item)) { + Py_DECREF(item); + return -1; + } + Py_DECREF(item); + } + return 0; +} + +// Given the current key_count_global, remove cache elements only if the key_count is less than the the current size of the int_cache. 
+void +int_cache_remove(Py_ssize_t key_count) +{ + if (!key_count) { + Py_CLEAR(int_cache); + } + else if (key_count < PyList_GET_SIZE(int_cache)) { + // del int_cache[key_count:] + PyList_SetSlice(int_cache, key_count, PyList_GET_SIZE(int_cache), NULL); + } +} + +//------------------------------------------------------------------------------ +// FrozenAutoMapIterator functions + +typedef struct FAMIObject { + PyObject_HEAD + FAMObject *fam; + PyArrayObject* keys_array; + ViewKind kind; + bool reversed; + Py_ssize_t index; // current index state, mutated in-place +} FAMIObject; + +static void +fami_dealloc(FAMIObject *self) +{ + Py_DECREF(self->fam); + PyObject_Del((PyObject *)self); +} + +static FAMIObject * +fami_iter(FAMIObject *self) +{ + Py_INCREF(self); + return self; +} + +// For a FAMI, Return appropriate PyObject for items, keys, and values. When values are needed they are retrieved from the int_cache. For consistency with NumPy array iteration, arrays use PyArray_ToScalar instead of PyArray_GETITEM. 
+static PyObject * +fami_iternext(FAMIObject *self) +{ + Py_ssize_t index; + if (self->reversed) { + index = self->fam->keys_size - ++self->index; + if (index < 0) { + return NULL; + } + } + else { + index = self->index++; + } + if (self->fam->keys_size <= index) { + return NULL; + } + switch (self->kind) { + case ITEMS: { + if (self->fam->keys_array_type) { + return PyTuple_Pack( + 2, + PyArray_ToScalar(PyArray_GETPTR1(self->keys_array, index), self->keys_array), + PyList_GET_ITEM(int_cache, index) + ); + } + else { + return PyTuple_Pack( + 2, + PyList_GET_ITEM(self->fam->keys, index), + PyList_GET_ITEM(int_cache, index) + ); + } + } + case KEYS: { + if (self->fam->keys_array_type) { + return PyArray_ToScalar(PyArray_GETPTR1(self->keys_array, index), self->keys_array); + } + else { + PyObject* yield = PyList_GET_ITEM(self->fam->keys, index); + Py_INCREF(yield); + return yield; + } + } + case VALUES: { + PyObject *yield = PyList_GET_ITEM(int_cache, index); + Py_INCREF(yield); + return yield; + } + } + Py_UNREACHABLE(); +} + +static PyObject * +fami_length_hint(FAMIObject *self) +{ + Py_ssize_t len = Py_MAX(0, self->fam->keys_size - self->index); + return PyLong_FromSsize_t(len); +} + +static PyObject *fami_new(FAMObject *, ViewKind, bool); + +static PyObject * +fami_reversed(FAMIObject *self) +{ + return fami_new(self->fam, self->kind, !self->reversed); +} + +static PyMethodDef fami_methods[] = { + {"__length_hint__", (PyCFunction)fami_length_hint, METH_NOARGS, NULL}, + {"__reversed__", (PyCFunction)fami_reversed, METH_NOARGS, NULL}, + {NULL}, +}; + +PyTypeObject FAMIType = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_basicsize = sizeof(FAMIObject), + .tp_dealloc = (destructor) fami_dealloc, + .tp_iter = (getiterfunc) fami_iter, + .tp_iternext = (iternextfunc) fami_iternext, + .tp_methods = fami_methods, + .tp_name = "arraykit.FrozenAutoMapIterator", +}; + +static PyObject * +fami_new(FAMObject *fam, ViewKind kind, bool reversed) +{ + FAMIObject *fami = 
PyObject_New(FAMIObject, &FAMIType); + if (!fami) { + return NULL; + } + Py_INCREF(fam); + fami->fam = fam; + if (fam->keys_array_type) { + fami->keys_array = (PyArrayObject *)fam->keys; + } + else { + fami->keys_array = NULL; + } + fami->kind = kind; + fami->reversed = reversed; + fami->index = 0; + return (PyObject *)fami; +} + +//------------------------------------------------------------------------------ +// FrozenAutoMapView functions + +// A FAMVObject contains a reference to the FAM from which it was derived +typedef struct FAMVObject{ + PyObject_HEAD + FAMObject *fam; + ViewKind kind; +} FAMVObject; + +# define FAMV_SET_OP(name, op) \ +static PyObject * \ +name(PyObject *left, PyObject *right) \ +{ \ + left = PySet_New(left); \ + if (!left) { \ + return NULL; \ + } \ + right = PySet_New(right); \ + if (!right) { \ + Py_DECREF(left); \ + return NULL; \ + } \ + PyObject *result = PyNumber_InPlace##op(left, right); \ + Py_DECREF(left); \ + Py_DECREF(right); \ + return result; \ +} + +FAMV_SET_OP(famv_and, And) +FAMV_SET_OP(famv_or, Or) +FAMV_SET_OP(famv_subtract, Subtract) +FAMV_SET_OP(famv_xor, Xor) + +# undef FAMV_SET_OP + +static PyNumberMethods famv_as_number = { + .nb_and = (binaryfunc) famv_and, + .nb_or = (binaryfunc) famv_or, + .nb_subtract = (binaryfunc) famv_subtract, + .nb_xor = (binaryfunc) famv_xor, +}; + +static int fam_contains(FAMObject *, PyObject *); +static PyObject *famv_fami_new(FAMVObject *); + +static int +famv_contains(FAMVObject *self, PyObject *other) +{ + if (self->kind == KEYS) { + return fam_contains(self->fam, other); + } + PyObject *iterator = famv_fami_new(self); + if (!iterator) { + return -1; + } + int result = PySequence_Contains(iterator, other); + Py_DECREF(iterator); + return result; +} + +static PySequenceMethods famv_as_sequence = { + .sq_contains = (objobjproc) famv_contains, +}; + +static void +famv_dealloc(FAMVObject *self) +{ + Py_DECREF(self->fam); + PyObject_Del((PyObject *)self); +} + +static PyObject * 
+famv_fami_new(FAMVObject *self) +{ + return fami_new(self->fam, self->kind, false); +} + +static PyObject * +famv_length_hint(FAMVObject *self) +{ + return PyLong_FromSsize_t(self->fam->keys_size); +} + +static PyObject * +famv_reversed(FAMVObject *self) +{ + return fami_new(self->fam, self->kind, true); +} + +static PyObject * +famv_isdisjoint(FAMVObject *self, PyObject *other) +{ + PyObject *intersection = famv_and((PyObject *)self, other); + if (!intersection) { + return NULL; + } + Py_ssize_t result = PySet_GET_SIZE(intersection); + Py_DECREF(intersection); + return PyBool_FromLong(result); +} + +static PyObject * +famv_richcompare(FAMVObject *self, PyObject *other, int op) +{ + PyObject *left = PySet_New((PyObject *)self); + if (!left) { + return NULL; + } + PyObject *right = PySet_New(other); + if (!right) { + Py_DECREF(left); + return NULL; + } + PyObject *result = PyObject_RichCompare(left, right, op); + Py_DECREF(left); + Py_DECREF(right); + return result; +} + +static PyMethodDef famv_methods[] = { + {"__length_hint__", (PyCFunction) famv_length_hint, METH_NOARGS, NULL}, + {"__reversed__", (PyCFunction) famv_reversed, METH_NOARGS, NULL}, + {"isdisjoint", (PyCFunction) famv_isdisjoint, METH_O, NULL}, + {NULL}, +}; + +PyTypeObject FAMVType = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_as_number = &famv_as_number, + .tp_as_sequence = &famv_as_sequence, + .tp_basicsize = sizeof(FAMVObject), + .tp_dealloc = (destructor) famv_dealloc, + .tp_iter = (getiterfunc) famv_fami_new, + .tp_methods = famv_methods, + .tp_name = "arraykit.FrozenAutoMapView", + .tp_richcompare = (richcmpfunc) famv_richcompare, +}; + +static PyObject * +famv_new(FAMObject *fam, ViewKind kind) +{ + FAMVObject *famv = (FAMVObject *)PyObject_New(FAMVObject, &FAMVType); + if (!famv) { + return NULL; + } + famv->kind = kind; + famv->fam = fam; + Py_INCREF(fam); + return (PyObject *)famv; +} + +//------------------------------------------------------------------------------ +// FrozenAutoMap 
functions + +// Given a key and a computed hash, return the table_pos if that hash and key are found, or if not, the first table position that has not been assigned. Return -1 on error. +static Py_ssize_t +lookup_hash_obj(FAMObject *self, PyObject *key, Py_hash_t hash) +{ + TableElement *table = self->table; + Py_ssize_t mask = self->table_size - 1; + Py_hash_t mixin = Py_ABS(hash); + Py_ssize_t table_pos = hash & mask; + + PyObject *guess = NULL; + PyObject *keys = self->keys; + int result = -1; + Py_hash_t h = 0; + + while (1) { + for (Py_ssize_t i = 0; i < SCAN; i++) { + h = table[table_pos].hash; + if (h == -1) { // Miss. Found a position that can be used for insertion. + return table_pos; + } + if (h != hash) { // Collision. + table_pos++; + continue; + } + guess = PyList_GET_ITEM(keys, table[table_pos].keys_pos); + if (guess == key) { // Hit. Object ID comparison + return table_pos; + } + result = PyObject_RichCompareBool(guess, key, Py_EQ); + if (result < 0) { // Error. + return -1; + } + if (result) { // Hit. + return table_pos; + } + table_pos++; + } + table_pos = (5 * (table_pos - SCAN) + (mixin >>= 1) + 1) & mask; + } +} + + +// Used for both integer and datetime types; for this reason kat is passed in separately. +static Py_ssize_t +lookup_hash_int(FAMObject *self, npy_int64 key, Py_hash_t hash, KeysArrayType kat) +{ + TableElement *table = self->table; + Py_ssize_t mask = self->table_size - 1; + Py_hash_t mixin = Py_ABS(hash); + Py_ssize_t table_pos = hash & mask; // taking the modulo + + PyArrayObject *a = (PyArrayObject *)self->keys; + npy_int64 k = 0; + Py_hash_t h = 0; + + while (1) { + for (Py_ssize_t i = 0; i < SCAN; i++) { + h = table[table_pos].hash; + if (h == -1) { // Miss. Position that can be used for insertion. 
+ return table_pos; + } + if (h != hash) { + table_pos++; + continue; + } + switch (kat) { + case KAT_INT64: + k = *(npy_int64*)PyArray_GETPTR1(a, table[table_pos].keys_pos); + break; + case KAT_INT32: + k = *(npy_int32*)PyArray_GETPTR1(a, table[table_pos].keys_pos); + break; + case KAT_INT16: + k = *(npy_int16*)PyArray_GETPTR1(a, table[table_pos].keys_pos); + break; + case KAT_INT8: + k = *(npy_int8*)PyArray_GETPTR1(a, table[table_pos].keys_pos); + break; + default: + return -1; + } + if (key == k) { + return table_pos; + } + table_pos++; + } + table_pos = (5 * (table_pos - SCAN) + (mixin >>= 1) + 1) & mask; + } +} + + +// NOTE: kat is passed in separately to match the interface of lookup_hash_int. +static Py_ssize_t +lookup_hash_uint(FAMObject *self, npy_uint64 key, Py_hash_t hash, KeysArrayType kat) +{ + TableElement *table = self->table; + Py_ssize_t mask = self->table_size - 1; + Py_hash_t mixin = Py_ABS(hash); + Py_ssize_t table_pos = hash & mask; + + PyArrayObject *a = (PyArrayObject *)self->keys; + npy_uint64 k = 0; + Py_hash_t h = 0; + + while (1) { + for (Py_ssize_t i = 0; i < SCAN; i++) { + h = table[table_pos].hash; + if (h == -1) { + return table_pos; + } + if (h != hash) { + table_pos++; + continue; + } + switch (kat) { + case KAT_UINT64: + k = *(npy_uint64*)PyArray_GETPTR1(a, table[table_pos].keys_pos); + break; + case KAT_UINT32: + k = *(npy_uint32*)PyArray_GETPTR1(a, table[table_pos].keys_pos); + break; + case KAT_UINT16: + k = *(npy_uint16*)PyArray_GETPTR1(a, table[table_pos].keys_pos); + break; + case KAT_UINT8: + k = *(npy_uint8*)PyArray_GETPTR1(a, table[table_pos].keys_pos); + break; + default: + return -1; + } + if (key == k) { + return table_pos; + } + table_pos++; + } + table_pos = (5 * (table_pos - SCAN) + (mixin >>= 1) + 1) & mask; + } +} + + +// NOTE: kat is passed in separately to match the interface of lookup_hash_int +static Py_ssize_t +lookup_hash_double(FAMObject *self, npy_double key, Py_hash_t hash, KeysArrayType kat) +{ + 
TableElement *table = self->table; + Py_ssize_t mask = self->table_size - 1; + Py_hash_t mixin = Py_ABS(hash); + Py_ssize_t table_pos = hash & mask; + + PyArrayObject *a = (PyArrayObject *)self->keys; + npy_double k = 0; + Py_hash_t h = 0; + + while (1) { + for (Py_ssize_t i = 0; i < SCAN; i++) { + h = table[table_pos].hash; + if (h == -1) { + return table_pos; + } + if (h != hash) { + table_pos++; + continue; + } + switch (kat) { + case KAT_FLOAT64: + k = *(npy_double*)PyArray_GETPTR1(a, table[table_pos].keys_pos); + break; + case KAT_FLOAT32: + k = *(npy_float*)PyArray_GETPTR1(a, table[table_pos].keys_pos); + break; + case KAT_FLOAT16: + k = npy_half_to_double(*(npy_half*)PyArray_GETPTR1(a, table[table_pos].keys_pos)); + break; + default: + return -1; + } + if (key == k) { + return table_pos; + } + table_pos++; + } + table_pos = (5 * (table_pos - SCAN) + (mixin >>= 1) + 1) & mask; + } +} + + +// Compare a passed Py_UCS4 array to stored keys. This does not use any dynamic memory. Returns -1 on error. +static Py_ssize_t +lookup_hash_unicode( + FAMObject *self, + Py_UCS4* key, + Py_ssize_t key_size, + Py_hash_t hash) +{ + TableElement *table = self->table; + Py_ssize_t mask = self->table_size - 1; + Py_hash_t mixin = Py_ABS(hash); + Py_ssize_t table_pos = hash & mask; + + PyArrayObject *a = (PyArrayObject *)self->keys; + Py_ssize_t dt_size = PyArray_ITEMSIZE(a) / UCS4_SIZE; + Py_ssize_t cmp_bytes = Py_MIN(key_size, dt_size) * UCS4_SIZE; + + Py_hash_t h = 0; + Py_UCS4* p_start = NULL; + + while (1) { + for (Py_ssize_t i = 0; i < SCAN; i++) { + h = table[table_pos].hash; + if (h == -1) { + return table_pos; + } + if (h != hash) { + table_pos++; + continue; + } + p_start = (Py_UCS4*)PyArray_GETPTR1(a, table[table_pos].keys_pos); + // memcmp returns 0 on match + if (!memcmp(p_start, key, cmp_bytes)) { + return table_pos; + } + table_pos++; + } + table_pos = (5 * (table_pos - SCAN) + (mixin >>= 1) + 1) & mask; + } +} + + +// Compare a passed char array to stored keys. 
This does not use any dynamic memory. Returns -1 on error. +static Py_ssize_t +lookup_hash_string( + FAMObject *self, + char* key, + Py_ssize_t key_size, + Py_hash_t hash) +{ + TableElement *table = self->table; + Py_ssize_t mask = self->table_size - 1; + Py_hash_t mixin = Py_ABS(hash); + Py_ssize_t table_pos = hash & mask; + + PyArrayObject *a = (PyArrayObject *)self->keys; + Py_ssize_t dt_size = PyArray_ITEMSIZE(a); + Py_ssize_t cmp_bytes = Py_MIN(key_size, dt_size); + + Py_hash_t h = 0; + char* p_start = NULL; + + while (1) { + for (Py_ssize_t i = 0; i < SCAN; i++) { + h = table[table_pos].hash; + if (h == -1) { + return table_pos; + } + if (h != hash) { + table_pos++; + continue; + } + p_start = (char*)PyArray_GETPTR1(a, table[table_pos].keys_pos); + // memcmp returns 0 on match + if (!memcmp(p_start, key, cmp_bytes)) { + return table_pos; + } + table_pos++; + } + table_pos = (5 * (table_pos - SCAN) + (mixin >>= 1) + 1) & mask; + } +} + + +static Py_ssize_t +lookup_int(FAMObject *self, PyObject* key) { + npy_int64 v = 0; + // NOTE: we handle PyArray Scalar Byte, Short, UByte, UShort with PyNumber_Check, below, saving four branches here + if (PyArray_IsScalar(key, LongLong)) { + v = (npy_int64)PyArrayScalar_VAL(key, LongLong); + } + else if (PyArray_IsScalar(key, Long)) { + v = (npy_int64)PyArrayScalar_VAL(key, Long); + } + else if (PyLong_Check(key)) { + v = PyLong_AsLongLong(key); + if (v == -1 && PyErr_Occurred()) { + PyErr_Clear(); + return -1; + } + } + else if (PyArray_IsScalar(key, Double)) { + double dv = PyArrayScalar_VAL(key, Double); + if (floor(dv) != dv) { + return -1; + } + v = (npy_int64)dv; + } + else if (PyFloat_Check(key)) { + double dv = PyFloat_AsDouble(key); + if (dv == -1.0 && PyErr_Occurred()) { + PyErr_Clear(); + return -1; + } + v = (npy_int64)dv; // truncate to integer + if (v != dv) { + return -1; + } + } + else if (PyArray_IsScalar(key, ULongLong)) { + v = (npy_int64)PyArrayScalar_VAL(key, ULongLong); + } + else if 
(PyArray_IsScalar(key, ULong)) { + v = (npy_int64)PyArrayScalar_VAL(key, ULong); + } + else if (PyArray_IsScalar(key, Int)) { + v = (npy_int64)PyArrayScalar_VAL(key, Int); + } + else if (PyArray_IsScalar(key, UInt)) { + v = (npy_int64)PyArrayScalar_VAL(key, UInt); + } + else if (PyArray_IsScalar(key, Float)) { + double dv = (double)PyArrayScalar_VAL(key, Float); + if (floor(dv) != dv) { + return -1; + } + v = (npy_int64)dv; + } + else if (PyArray_IsScalar(key, Half)) { + double dv = npy_half_to_double(PyArrayScalar_VAL(key, Half)); + if (floor(dv) != dv) { + return -1; + } + v = (npy_int64)dv; + } + else if (PyBool_Check(key)) { + v = PyObject_IsTrue(key); + } + else if (PyNumber_Check(key)) { + // NOTE: this returns a Py_ssize_t, which might be 32 bit. This can be used for PyArray_Scalars <= ssize_t. + v = (npy_int64)PyNumber_AsSsize_t(key, PyExc_OverflowError); + if (v == -1 && PyErr_Occurred()) { + return -1; + } + } + else { + return -1; + } + Py_hash_t hash = int_to_hash(v); + return lookup_hash_int(self, v, hash, self->keys_array_type); +} + + +// In current usage as an AM, np.datetime64 will match to any numpy Scalar that is at or greater than the resolution of the values stored here. No matches are made to other numerics or Python datetime objects. For AM to be consistent with FAM, we will do the same for now. 
+static Py_ssize_t
+lookup_datetime(FAMObject *self, PyObject* key) {
+    npy_int64 v = 0; // int64
+    if (PyArray_IsScalar(key, Datetime)) {
+        v = (npy_int64)PyArrayScalar_VAL(key, Datetime);
+        // if we observe a NAT, we skip unit checks
+        if (v != NPY_DATETIME_NAT) {
+            NPY_DATETIMEUNIT key_unit = dt_unit_from_scalar(
+                    (PyDatetimeScalarObject *)key);
+            if (!kat_is_datetime_unit(self->keys_array_type, key_unit)) {
+                return -1;
+            }
+        }
+        // DEBUG_MSG_OBJ("dt64 value", PyLong_FromLongLong(v));
+    }
+    else {
+        return -1;
+    }
+    Py_hash_t hash = int_to_hash(v);
+    return lookup_hash_int(self, v, hash, KAT_INT64);
+}
+
+
+// Coerce `key` to an npy_uint64 and look it up in an unsigned-integer table. Negative or non-integral keys return -1 without an exception set.
+static Py_ssize_t
+lookup_uint(FAMObject *self, PyObject* key) {
+    npy_uint64 v = 0;
+
+    // NOTE: we handle PyArray Scalar Byte, Short, UByte, UShort with PyNumber_Check, below, saving four branches here
+    if (PyArray_IsScalar(key, ULongLong)) {
+        v = (npy_uint64)PyArrayScalar_VAL(key, ULongLong);
+    }
+    else if (PyArray_IsScalar(key, ULong)) {
+        v = (npy_uint64)PyArrayScalar_VAL(key, ULong);
+    }
+    else if (PyArray_IsScalar(key, LongLong)) {
+        npy_int64 si = (npy_int64)PyArrayScalar_VAL(key, LongLong);
+        if (si < 0) {
+            return -1;
+        }
+        v = (npy_uint64)si;
+    }
+    else if (PyArray_IsScalar(key, Long)) {
+        npy_int64 si = (npy_int64)PyArrayScalar_VAL(key, Long);
+        if (si < 0) {
+            return -1;
+        }
+        v = (npy_uint64)si;
+    }
+    else if (PyLong_Check(key)) {
+        v = PyLong_AsUnsignedLongLong(key);
+        if (v == (unsigned long long)-1 && PyErr_Occurred()) {
+            PyErr_Clear();
+            return -1;
+        }
+    }
+    else if (PyArray_IsScalar(key, Double)) {
+        double dv = PyArrayScalar_VAL(key, Double);
+        if (dv < 0 || floor(dv) != dv) {
+            return -1;
+        }
+        v = (npy_uint64)dv;
+    }
+    else if (PyFloat_Check(key)) {
+        double dv = PyFloat_AsDouble(key);
+        if (dv == -1.0 && PyErr_Occurred()) {
+            PyErr_Clear();
+            return -1;
+        }
+        if (dv < 0) {
+            return -1;
+        }
+        v = (npy_uint64)dv; // truncate to integer
+        if (v != dv) {
+            return -1;
+        }
+    }
+    else if (PyArray_IsScalar(key, Int)) {
+        npy_int64 si = (npy_int64)PyArrayScalar_VAL(key, Int);
+        if (si < 0) {
+            return -1;
+        }
+        v = (npy_uint64)si;
+    }
+    else if (PyArray_IsScalar(key, UInt)) {
+        v = (npy_uint64)PyArrayScalar_VAL(key, UInt);
+    }
+    else if (PyArray_IsScalar(key, Float)) {
+        double dv = (double)PyArrayScalar_VAL(key, Float);
+        if (dv < 0 || floor(dv) != dv) {
+            return -1;
+        }
+        v = (npy_uint64)dv;
+    }
+    else if (PyArray_IsScalar(key, Half)) {
+        double dv = npy_half_to_double(PyArrayScalar_VAL(key, Half));
+        if (dv < 0 || floor(dv) != dv) {
+            return -1;
+        }
+        v = (npy_uint64)dv;
+    }
+    else if (PyBool_Check(key)) {
+        v = PyObject_IsTrue(key);
+    }
+    else if (PyNumber_Check(key)) {
+        // NOTE: this returns a Py_ssize_t, which might be 32 bit. This can be used for PyArray_Scalars <= ssize_t.
+        npy_int64 si = PyNumber_AsSsize_t(key, PyExc_OverflowError);
+        if (si == -1 && PyErr_Occurred()) {
+            PyErr_Clear();
+            return -1;
+        }
+        if (si < 0) {
+            return -1;
+        }
+        v = (npy_uint64)si;
+    }
+    else {
+        return -1;
+    }
+    return lookup_hash_uint(self, v, uint_to_hash(v), self->keys_array_type);
+}
+
+
+// Coerce `key` to a double and look it up in a floating-point table. Keys that are not numeric return -1 without an exception set.
+static Py_ssize_t
+lookup_double(FAMObject *self, PyObject* key) {
+    double v = 0;
+    if (PyArray_IsScalar(key, Double)) {
+        v = PyArrayScalar_VAL(key, Double);
+    }
+    else if (PyFloat_Check(key)) {
+        v = PyFloat_AsDouble(key);
+        if (v == -1.0 && PyErr_Occurred()) {
+            PyErr_Clear();
+            return -1;
+        }
+    }
+    else if (PyLong_Check(key)) {
+        v = (double)PyLong_AsLongLong(key);
+        if (v == -1 && PyErr_Occurred()) {
+            PyErr_Clear();
+            return -1;
+        }
+    }
+    // NOTE: we handle PyArray Scalar Byte, Short with PyNumber_Check, below, saving four branches here
+    else if (PyArray_IsScalar(key, LongLong)) {
+        v = (double)PyArrayScalar_VAL(key, LongLong);
+    }
+    else if (PyArray_IsScalar(key, Long)) {
+        v = (double)PyArrayScalar_VAL(key, Long);
+    }
+    else if (PyArray_IsScalar(key, Int)) {
+        v = (double)PyArrayScalar_VAL(key, Int);
+    }
+    else if (PyArray_IsScalar(key, ULongLong)) {
+        v = (double)PyArrayScalar_VAL(key, ULongLong);
+    }
+    else if (PyArray_IsScalar(key, ULong)) {
+        v = (double)PyArrayScalar_VAL(key, ULong);
+    }
+    else if (PyArray_IsScalar(key, UInt)) {
+        v = (double)PyArrayScalar_VAL(key, UInt);
+    }
+    else if (PyArray_IsScalar(key, Float)) {
+        v = (double)PyArrayScalar_VAL(key, Float);
+    }
+    else if (PyArray_IsScalar(key, Half)) {
+        v = npy_half_to_double(PyArrayScalar_VAL(key, Half));
+    }
+    else if (PyBool_Check(key)) {
+        v = PyObject_IsTrue(key);
+    }
+    else if (PyNumber_Check(key)) {
+        // NOTE: this returns a Py_ssize_t, which might be 32 bit. This can be used for PyArray_Scalars <= ssize_t.
+        npy_int64 si = PyNumber_AsSsize_t(key, PyExc_OverflowError);
+        if (si == -1 && PyErr_Occurred()) {
+            PyErr_Clear();
+            return -1;
+        }
+        v = (double)si;
+    }
+    else {
+        return -1;
+    }
+    return lookup_hash_double(self, v, double_to_hash(v), self->keys_array_type);
+}
+
+
+// Look up a Python str in a unicode-array table, using self->key_buffer as scratch space for the UCS4 copy of the key.
+static Py_ssize_t
+lookup_unicode(FAMObject *self, PyObject* key) {
+    // NOTE: while we can identify and use PyArray_IsScalar(key, Unicode), this did not improve performance and fails on Windows.
+    if (!PyUnicode_Check(key)) {
+        return -1;
+    }
+    PyArrayObject *a = (PyArrayObject *)self->keys;
+    Py_ssize_t dt_size = PyArray_ITEMSIZE(a) / UCS4_SIZE;
+    // if the key_size is greater than the dtype size of the array, we know there cannot be a match
+    Py_ssize_t k_size = PyUnicode_GetLength(key);
+    if (k_size > dt_size) {
+        return -1;
+    }
+    // The buffer will have dt_size + 1 storage. We copy a NULL character so do not have to clear the buffer, but instead can reuse it and still discover the lookup
+    if (!PyUnicode_AsUCS4(key, self->key_buffer, dt_size+1, 1)) {
+        return -1; // exception will be set
+    }
+    Py_hash_t hash = unicode_to_hash(self->key_buffer, k_size);
+    return lookup_hash_unicode(self, self->key_buffer, k_size, hash);
+}
+
+
+// Look up a Python bytes object in a byte-string-array table.
+static Py_ssize_t
+lookup_string(FAMObject *self, PyObject* key) {
+    if (!PyBytes_Check(key)) {
+        return -1;
+    }
+    PyArrayObject *a = (PyArrayObject *)self->keys;
+    Py_ssize_t dt_size = PyArray_ITEMSIZE(a);
+    Py_ssize_t k_size = PyBytes_GET_SIZE(key);
+    if (k_size > dt_size) {
+        return -1;
+    }
+    char* k = PyBytes_AS_STRING(key);
+    Py_hash_t hash = string_to_hash(k, k_size);
+    return lookup_hash_string(self, k, k_size, hash);
+}
+
+
+// Given a key as a PyObject, return the Py_ssize_t keys_pos value stored in the TableElement. Return -1 on key not found (without setting an exception) and -1 on error (with setting an exception).
+static Py_ssize_t
+lookup(FAMObject *self, PyObject *key) {
+    Py_ssize_t table_pos = -1;
+
+    switch (self->keys_array_type) {
+        case KAT_INT64:
+        case KAT_INT32:
+        case KAT_INT16:
+        case KAT_INT8:
+            table_pos = lookup_int(self, key);
+            break;
+        case KAT_UINT64:
+        case KAT_UINT32:
+        case KAT_UINT16:
+        case KAT_UINT8:
+            table_pos = lookup_uint(self, key);
+            break;
+        case KAT_FLOAT64:
+        case KAT_FLOAT32:
+        case KAT_FLOAT16:
+            table_pos = lookup_double(self, key);
+            break;
+        case KAT_UNICODE:
+            table_pos = lookup_unicode(self, key);
+            break;
+        case KAT_STRING:
+            table_pos = lookup_string(self, key);
+            break;
+        case KAT_DTY:
+        case KAT_DTM:
+        case KAT_DTW:
+        case KAT_DTD:
+        case KAT_DTh:
+        case KAT_DTm:
+        case KAT_DTs:
+        case KAT_DTms:
+        case KAT_DTus:
+        case KAT_DTns:
+        case KAT_DTps:
+        case KAT_DTfs:
+        case KAT_DTas:
+            table_pos = lookup_datetime(self, key);
+            break;
+        case KAT_LIST: {
+            Py_hash_t hash = PyObject_Hash(key);
+            if (hash == -1) {
+                return -1;
+            }
+            table_pos = lookup_hash_obj(self, key, hash);
+            break;
+        }
+    }
+    // A -1 hash is an unused storage location
+    if ((table_pos < 0) || (self->table[table_pos].hash == -1)) {
+        return -1;
+    }
+    return self->table[table_pos].keys_pos;
+}
+
+// Insert a key_pos, hash pair into the table. Assumes table already has appropriate size. When inserting a new item, `hash` is -1, forcing a fresh hash to be computed here. Return 0 on success, -1 on error.
+static int
+insert_obj(
+        FAMObject *self,
+        PyObject *key,
+        Py_ssize_t keys_pos,
+        Py_hash_t hash)
+{
+    if (hash == -1) {
+        hash = PyObject_Hash(key);
+        if (hash == -1) {
+            return -1;
+        }
+    }
+    // table position is not dependent on keys_pos
+    Py_ssize_t table_pos = lookup_hash_obj(self, key, hash);
+
+    if (table_pos < 0) {
+        return -1;
+    }
+    // We expect, on insertion, to get back a table_pos that points to an unassigned hash value (-1); if we get anything else, we have found a match to an already-existing key, and thus raise a NonUniqueError error.
+    if (self->table[table_pos].hash != -1) {
+        PyErr_SetObject(NonUniqueError, key);
+        return -1;
+    }
+    self->table[table_pos].keys_pos = keys_pos;
+    self->table[table_pos].hash = hash;
+    return 0;
+}
+
+
+// Insert a signed integer key; raises NonUniqueError if the key is already present. Return 0 on success, -1 on error.
+static int
+insert_int(
+        FAMObject *self,
+        npy_int64 key,
+        Py_ssize_t keys_pos,
+        Py_hash_t hash,
+        KeysArrayType kat)
+{
+    if (hash == -1) {
+        hash = int_to_hash(key);
+    }
+    // table position is not dependent on keys_pos
+    Py_ssize_t table_pos = lookup_hash_int(self, key, hash, kat);
+    if (table_pos < 0) {
+        return -1;
+    }
+    if (self->table[table_pos].hash != -1) {
+        PyObject* er = PyLong_FromLongLong(key); // for error reporting
+        if (er == NULL) {
+            return -1;
+        }
+        PyErr_SetObject(NonUniqueError, er);
+        Py_DECREF(er);
+        return -1;
+    }
+    self->table[table_pos].keys_pos = keys_pos;
+    self->table[table_pos].hash = hash; // key is the hash
+    return 0;
+}
+
+
+// Insert an unsigned integer key; raises NonUniqueError if the key is already present. Return 0 on success, -1 on error.
+static int
+insert_uint(
+        FAMObject *self,
+        npy_uint64 key,
+        Py_ssize_t keys_pos,
+        Py_hash_t hash,
+        KeysArrayType kat)
+{
+    if (hash == -1) {
+        hash = uint_to_hash(key);
+    }
+    Py_ssize_t table_pos = lookup_hash_uint(self, key, hash, kat);
+
+    if (table_pos < 0) {
+        return -1;
+    }
+    if (self->table[table_pos].hash != -1) {
+        PyObject* er = PyLong_FromUnsignedLongLong(key);
+        if (er == NULL) {
+            return -1;
+        }
+        PyErr_SetObject(NonUniqueError, er);
+        Py_DECREF(er);
+        return -1;
+    }
+    self->table[table_pos].keys_pos = keys_pos;
+    self->table[table_pos].hash = hash;
+    return 0;
+}
+
+
+// Insert a floating-point key; raises NonUniqueError if the key is already present. Return 0 on success, -1 on error.
+static int
+insert_double(
+        FAMObject *self,
+        npy_double key,
+        Py_ssize_t keys_pos,
+        Py_hash_t hash,
+        KeysArrayType kat)
+{
+    if (hash == -1) {
+        hash = double_to_hash(key);
+    }
+    // table position is not dependent on keys_pos
+    Py_ssize_t table_pos = lookup_hash_double(self, key, hash, kat);
+
+    if (table_pos < 0) {
+        return -1;
+    }
+    if (self->table[table_pos].hash != -1) {
+        PyObject* er = PyFloat_FromDouble(key);
+        if (er == NULL) {
+            return -1;
+        }
+        PyErr_SetObject(NonUniqueError, er);
+        Py_DECREF(er);
+        return -1;
+    }
+    self->table[table_pos].keys_pos = keys_pos;
+    self->table[table_pos].hash = hash;
+    return 0;
+}
+
+
+// Insert a UCS4 key of `key_size` characters; raises NonUniqueError if the key is already present. Return 0 on success, -1 on error.
+static int
+insert_unicode(
+        FAMObject *self,
+        Py_UCS4* key,
+        Py_ssize_t key_size,
+        Py_ssize_t keys_pos,
+        Py_hash_t hash)
+{
+    if (hash == -1) {
+        hash = unicode_to_hash(key, key_size);
+    }
+    // table position is not dependent on keys_pos
+    Py_ssize_t table_pos = lookup_hash_unicode(self, key, key_size, hash);
+    if (table_pos < 0) {
+        return -1;
+    }
+    if (self->table[table_pos].hash != -1) {
+        PyObject* er = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, key, key_size);
+        if (er == NULL) {
+            return -1;
+        }
+        PyErr_SetObject(NonUniqueError, er);
+        Py_DECREF(er);
+        return -1;
+    }
+    self->table[table_pos].keys_pos = keys_pos;
+    self->table[table_pos].hash = hash;
+    return 0;
+}
+
+
+// Insert a byte-string key of `key_size` bytes; raises NonUniqueError if the key is already present. Return 0 on success, -1 on error.
+static int
+insert_string(
+        FAMObject *self,
+        char* key,
+        Py_ssize_t key_size,
+        Py_ssize_t keys_pos,
+        Py_hash_t hash)
+{
+    if (hash == -1) {
+        hash = string_to_hash(key, key_size);
+    }
+    // table position is not dependent on keys_pos
+    Py_ssize_t table_pos = lookup_hash_string(self, key, key_size, hash);
+    if (table_pos < 0) {
+        return -1;
+    }
+    if (self->table[table_pos].hash != -1) {
+        PyObject* er = PyBytes_FromStringAndSize(key, key_size);
+        if (er == NULL) {
+            return -1;
+        }
+        PyErr_SetObject(NonUniqueError, er);
+        Py_DECREF(er);
+        return -1;
+    }
+    self->table[table_pos].keys_pos = keys_pos;
+    self->table[table_pos].hash = hash;
+    return 0;
+}
+
+
+//------------------------------------------------------------------------------
+
+// Called in fam_new(), extend(), append(), with the size of observed keys. This table is updated only when appending or extending. Only if there is an old table will keys be accessed. Returns 0 on success, -1 on failure (with an exception set).
+static int
+grow_table(FAMObject *self, Py_ssize_t keys_size)
+{
+    // NOTE: this is the only place int_cache_fill is called; it is not called with key_count_global, but with the max value needed
+    if (int_cache_fill(keys_size)) {
+        return -1;
+    }
+    Py_ssize_t keys_load = keys_size / LOAD;
+    Py_ssize_t size_old = self->table_size;
+    if (keys_load < size_old) {
+        return 0;
+    }
+
+    // get the next power of 2 greater than current keys_load
+    Py_ssize_t size_new = 1;
+    while (size_new <= keys_load) {
+        size_new <<= 1;
+    }
+    // size_new > keys_load; we know that keys_load >= size_old, so size_new must be > size_old
+    TableElement *table_old = self->table;
+    TableElement *table_new = PyMem_New(TableElement, size_new + SCAN - 1);
+    if (!table_new) {
+        // PyMem_New does not set an exception itself
+        PyErr_NoMemory();
+        return -1;
+    }
+
+    // initialize all hash and keys_pos values to -1
+    Py_ssize_t table_pos;
+    for (table_pos = 0; table_pos < size_new + SCAN - 1; table_pos++) {
+        table_new[table_pos].hash = -1;
+        table_new[table_pos].keys_pos = -1;
+    }
+    self->table = table_new;
+    self->table_size = size_new;
+
+    // if we have an old table, move them into the new table
+    if (size_old) {
+        if (self->keys_array_type) {
+            PyErr_SetString(PyExc_NotImplementedError, "Cannot grow table for array keys");
+            goto restore;
+        }
+        Py_ssize_t i;
+        Py_hash_t h;
+        for (table_pos = 0; table_pos < size_old + SCAN - 1; table_pos++) {
+            i = table_old[table_pos].keys_pos;
+            h = table_old[table_pos].hash;
+            if ((h != -1) && insert_obj(self, PyList_GET_ITEM(self->keys, i), i, h))
+            {
+                goto restore;
+            }
+        }
+    }
+    PyMem_Del(table_old);
+    return 0;
+restore:
+    PyMem_Del(self->table);
+    self->table = table_old;
+    self->table_size = size_old;
+    return -1;
+}
+
+
+// Given a new, possibly un-initialized FAMObject, copy attrs from self to new. Return 0 on success, -1 on error (with an exception set).
+int
+copy_to_new(PyTypeObject *cls, FAMObject *self, FAMObject *new)
+{
+    if (self->keys_array_type) {
+        new->keys = self->keys;
+        Py_INCREF(new->keys);
+    }
+    else {
+        new->keys = PySequence_List(self->keys);
+        if (!new->keys) {
+            return -1;
+        }
+    }
+    key_count_global += self->keys_size;
+
+    new->table_size = self->table_size;
+    new->keys_array_type = self->keys_array_type;
+    new->keys_size = self->keys_size;
+
+    new->key_buffer = NULL;
+    if (new->keys_array_type == KAT_UNICODE) {
+        PyArrayObject *a = (PyArrayObject *)new->keys;
+        Py_ssize_t dt_size = PyArray_ITEMSIZE(a) / UCS4_SIZE;
+        new->key_buffer = (Py_UCS4*)PyMem_Malloc((dt_size+1) * UCS4_SIZE);
+        if (!new->key_buffer) {
+            // lookup_unicode() writes into this buffer, so it must exist
+            PyErr_NoMemory();
+            return -1;
+        }
+    }
+
+    Py_ssize_t table_size_alloc = new->table_size + SCAN - 1;
+    new->table = PyMem_New(TableElement, table_size_alloc);
+    if (!new->table) {
+        // Py_DECREF(new->keys); // assume this will get cleaned up
+        PyErr_NoMemory();
+        return -1;
+    }
+    memcpy(new->table, self->table, table_size_alloc * sizeof(TableElement));
+    return 0;
+}
+
+
+static PyObject *
+fam_new(PyTypeObject *cls, PyObject *args, PyObject *kwargs);
+
+
+// Create a copy of self. Used in `fam_or()`. Returns a new FAMObject on success, NULL on error.
+static FAMObject *
+copy(PyTypeObject *cls, FAMObject *self)
+{
+    // an immutable map copied as an immutable type can be shared: return a new reference to self
+    if (!PyType_IsSubtype(cls, &AMType) && !PyObject_TypeCheck(self, &AMType)) {
+        Py_INCREF(self);
+        return self;
+    }
+    // fam_new to allocate and fill struct attrs
+    FAMObject *new = (FAMObject*)fam_new(cls, NULL, NULL);
+    if (!new) {
+        return NULL;
+    }
+    if (copy_to_new(cls, self, new)) {
+        Py_DECREF(new); // assume this will decref any partially set attrs of new
+        // `new` has been released; it must not be handed back to the caller
+        return NULL;
+    }
+    return new;
+}
+
+
+
+// Add every key of the iterable `keys` to a list-backed map. On failure the size counters are reset to the number of keys actually stored, so len() stays correct. Returns -1 on error, 0 on success.
+static int
+extend(FAMObject *self, PyObject *keys)
+{
+    if (self->keys_array_type) {
+        PyErr_SetString(PyExc_NotImplementedError, "Not supported for array keys");
+        return -1;
+    }
+    // this should fail for self->keys types that are not a list
+    keys = PySequence_Fast(keys, "expected an iterable of keys");
+    if (!keys) {
+        return -1;
+    }
+    Py_ssize_t size_extend = PySequence_Fast_GET_SIZE(keys);
+    key_count_global += size_extend;
+    self->keys_size += size_extend;
+
+    if (grow_table(self, self->keys_size)) {
+        goto error;
+    }
+
+    PyObject **keys_fi = PySequence_Fast_ITEMS(keys);
+
+    for (Py_ssize_t index = 0; index < size_extend; index++) {
+        // get the new keys_size after each append
+        if (insert_obj(self, keys_fi[index], PyList_GET_SIZE(self->keys), -1) ||
+                PyList_Append(self->keys, keys_fi[index]))
+        {
+            goto error;
+        }
+    }
+    Py_DECREF(keys);
+    return 0;
+error:
+    {
+        // the counters were advanced for all of `keys` up front; drop the part that was never stored
+        Py_ssize_t stored = PyList_GET_SIZE(self->keys);
+        key_count_global -= self->keys_size - stored;
+        self->keys_size = stored;
+    }
+    Py_DECREF(keys);
+    return -1;
+}
+
+
+// Returns -1 on error, 0 on success.
+static int
+append(FAMObject *self, PyObject *key)
+{
+    if (self->keys_array_type) {
+        PyErr_SetString(PyExc_NotImplementedError, "Not supported for array keys");
+        return -1;
+    }
+    key_count_global++;
+    self->keys_size++;
+
+    // keys_size is already incremented; provide last index
+    if (grow_table(self, self->keys_size) ||
+            insert_obj(self, key, self->keys_size - 1, -1) ||
+            PyList_Append(self->keys, key))
+    {
+        // nothing was stored: undo the count so len() matches the stored keys
+        key_count_global--;
+        self->keys_size--;
+        return -1;
+    }
+    return 0;
+}
+
+
+// Return the number of keys stored.
+static Py_ssize_t
+fam_length(FAMObject *self)
+{
+    return self->keys_size;
+}
+
+
+// Given a key for a FAM, return the Python integer (via the int_cache) associated with that key. Utility function used in both fam_subscript() and fam_get(). If the key is not found, return a new reference to `missing` when given, else raise KeyError.
+static PyObject *
+get(FAMObject *self, PyObject *key, PyObject *missing) {
+    Py_ssize_t keys_pos = lookup(self, key);
+    if (keys_pos < 0) {
+        if (PyErr_Occurred()) {
+            return NULL;
+        }
+        if (missing) {
+            Py_INCREF(missing);
+            return missing;
+        }
+        PyErr_SetObject(PyExc_KeyError, key);
+        return NULL;
+    }
+    // use a C-integer to fetch the Python integer
+    PyObject *index = PyList_GET_ITEM(int_cache, keys_pos);
+    Py_INCREF(index);
+    return index;
+}
+
+
+// Give an array of the same kind as KAT, lookup and load all keys_pos. Depends on self, key_size, key_array, table_pos, i, k, b
+# define GET_ALL_SCALARS(npy_type_src, npy_type_dst, kat, lookup_func, hash_func, to_obj_func, post_deref) \
+{                                                                      \
+    npy_type_dst v;                                                    \
+    for (; i < key_size; i++) {                                        \
+        v = post_deref(*(npy_type_src*)PyArray_GETPTR1(key_array, i)); \
+        table_pos = lookup_func(self, v, hash_func(v), kat);           \
+        if (table_pos < 0 || (self->table[table_pos].hash == -1)) {    \
+            Py_DECREF(array);                                          \
+            if (PyErr_Occurred()) {                                    \
+                return NULL;                                           \
+            }                                                          \
+            k = to_obj_func(v);                                        \
+            if (k == NULL) {                                           \
+                return NULL;                                           \
+            }                                                          \
+            PyErr_SetObject(PyExc_KeyError, k);                        \
+            Py_DECREF(k);                                              \
+            return NULL;                                               \
+        }                                                              \
+        b[i] = (npy_int64)self->table[table_pos].keys_pos;             \
+    }                                                                  \
+}                                                                      \
+
+# define GET_ALL_DT64(npy_type_src, npy_type_dst, kat, lookup_func, hash_func) \
+{                                                                      \
+    npy_type_dst v;                                                    \
+    for (; i < key_size; i++) {                                        \
+        v = *(npy_type_src*)PyArray_GETPTR1(key_array, i);             \
+        table_pos = lookup_func(self, v, hash_func(v), kat);           \
+        if (table_pos < 0 || (self->table[table_pos].hash == -1)) {    \
+            Py_DECREF(array);                                          \
+            if (PyErr_Occurred()) {                                    \
+                return NULL;                                           \
+            }                                                          \
+            k = PyArray_ToScalar(&v, key_array);                       \
+            if (k == NULL) {                                           \
+                return NULL;                                           \
+            }                                                          \
+            PyErr_SetObject(PyExc_KeyError, k);                        \
+            Py_DECREF(k);                                              \
+            return NULL;                                               \
+        }                                                              \
+        b[i] = (npy_int64)self->table[table_pos].keys_pos;             \
+    }                                                                  \
+}                                                                      \
+
+# define GET_ALL_FLEXIBLE(char_type, get_end_func, lookup_func, hash_func, to_obj_func) \
+{                                                                      \
+    char_type* v;                                                      \
+    Py_ssize_t dt_size = PyArray_ITEMSIZE(key_array) / sizeof(char_type);\
+    Py_ssize_t k_size;                                                 \
+    for (; i < key_size; i++) {                                        \
+        v = (char_type*)PyArray_GETPTR1(key_array, i);                 \
+        k_size = get_end_func(v, dt_size) - v;                         \
+        table_pos = lookup_func(self, v, k_size, hash_func(v, k_size)); \
+        if (table_pos < 0 || (self->table[table_pos].hash == -1)) {    \
+            Py_DECREF(array);                                          \
+            if (PyErr_Occurred()) {                                    \
+                return NULL;                                           \
+            }                                                          \
+            k = to_obj_func(v, k_size);                                \
+            if (k == NULL) {                                           \
+                return NULL;                                           \
+            }                                                          \
+            PyErr_SetObject(PyExc_KeyError, k);                        \
+            Py_DECREF(k);                                              \
+            return NULL;                                               \
+        }                                                              \
+        b[i] = (npy_int64)self->table[table_pos].keys_pos;             \
+    }                                                                  \
+}                                                                      \
+
+// Given a list or array of keys, return an array of the lookup-up integer values. If any unmatched keys are found, a KeyError will raise. An immutable array is always returned.
+static PyObject *
+fam_get_all(FAMObject *self, PyObject *key) {
+    Py_ssize_t key_size = 0;
+    Py_ssize_t keys_pos = -1;
+    PyObject* k = NULL;
+    PyObject *array = NULL;
+    Py_ssize_t i = 0;
+
+    int key_is_list;
+    if (PyList_CheckExact(key)) {
+        key_is_list = 1;
+        key_size = PyList_GET_SIZE(key);
+    }
+    else if (PyArray_Check(key)) {
+        key_is_list = 0;
+        key_size = PyArray_SIZE((PyArrayObject *)key);
+    }
+    else {
+        PyErr_SetString(PyExc_TypeError, "Must provide a list or array.");
+        return NULL;
+    }
+
+    // construct array to be returned; this is a little expensive if we do not yet know if we can use it
+    npy_intp dims[] = {key_size};
+    array = PyArray_EMPTY(1, dims, NPY_INT64, 0);
+    if (array == NULL) {
+        return NULL;
+    }
+    npy_int64* b = (npy_int64*)PyArray_DATA((PyArrayObject*)array);
+
+    if (key_is_list) {
+        for (; i < key_size; i++) {
+            k = PyList_GET_ITEM(key, i); // borrow
+            keys_pos = lookup(self, k);
+            if (keys_pos < 0) {
+                Py_DECREF(array);
+                if (PyErr_Occurred()) {
+                    return NULL;
+                }
+                PyErr_SetObject(PyExc_KeyError, k);
+                return NULL;
+            }
+            b[i] = (npy_int64)keys_pos;
+        }
+    }
+    else { // key is an array
+        PyArrayObject* key_array = (PyArrayObject *)key;
+        // if key is an np array of the same kind as this FAMs keys, we can do optimized lookups; otherwise, we have to go through scalar to do full branching and coercion into lookup
+        int key_array_t = PyArray_TYPE(key_array);
+
+        // NOTE: we only match numeric kinds of the KAT is 64 bit; we could support, for each key_array_t, a switch for every KAT, but the size of that code is huge and the performance benefit is not massive
+        int optimized = kat_is_kind(self->keys_array_type, PyArray_DESCR(key_array)->kind) ? 1 : 0;
+        if (optimized) {
+            Py_ssize_t table_pos;
+            switch (key_array_t) { // type of passed in array
+                case NPY_INT64:
+                    GET_ALL_SCALARS(npy_int64, npy_int64, KAT_INT64, lookup_hash_int, int_to_hash, PyLong_FromLongLong,);
+                    break;
+                case NPY_INT32:
+                    GET_ALL_SCALARS(npy_int32, npy_int64, KAT_INT32, lookup_hash_int, int_to_hash, PyLong_FromLongLong,);
+                    break;
+                case NPY_INT16:
+                    GET_ALL_SCALARS(npy_int16, npy_int64, KAT_INT16, lookup_hash_int, int_to_hash, PyLong_FromLongLong,);
+                    break;
+                case NPY_INT8:
+                    GET_ALL_SCALARS(npy_int8, npy_int64, KAT_INT8, lookup_hash_int, int_to_hash, PyLong_FromLongLong,);
+                    break;
+                case NPY_UINT64:
+                    GET_ALL_SCALARS(npy_uint64, npy_uint64, KAT_UINT64, lookup_hash_uint, uint_to_hash, PyLong_FromUnsignedLongLong,);
+                    break;
+                case NPY_UINT32:
+                    GET_ALL_SCALARS(npy_uint32, npy_uint64, KAT_UINT32, lookup_hash_uint, uint_to_hash, PyLong_FromUnsignedLongLong,);
+                    break;
+                case NPY_UINT16:
+                    GET_ALL_SCALARS(npy_uint16, npy_uint64, KAT_UINT16, lookup_hash_uint, uint_to_hash, PyLong_FromUnsignedLongLong,);
+                    break;
+                case NPY_UINT8:
+                    GET_ALL_SCALARS(npy_uint8, npy_uint64, KAT_UINT8, lookup_hash_uint, uint_to_hash, PyLong_FromUnsignedLongLong,);
+                    break;
+                case NPY_FLOAT64:
+                    GET_ALL_SCALARS(npy_double, npy_double, KAT_FLOAT64, lookup_hash_double, double_to_hash, PyFloat_FromDouble,);
+                    break;
+                case NPY_FLOAT32:
+                    GET_ALL_SCALARS(npy_float, npy_double, KAT_FLOAT32, lookup_hash_double, double_to_hash, PyFloat_FromDouble,);
+                    break;
+                case NPY_FLOAT16:
+                    GET_ALL_SCALARS(npy_half, npy_double, KAT_FLOAT16, lookup_hash_double, double_to_hash, PyFloat_FromDouble, npy_half_to_double);
+                    break;
+                case NPY_UNICODE:
+                    GET_ALL_FLEXIBLE(Py_UCS4, ucs4_get_end_p, lookup_hash_unicode, unicode_to_hash, PyUnicode_FromUCS4AndData);
+                    break;
+                case NPY_STRING:
+                    GET_ALL_FLEXIBLE(char, char_get_end_p, lookup_hash_string, string_to_hash, PyBytes_FromStringAndSize);
+                    break;
+                case NPY_DATETIME: {
+                    NPY_DATETIMEUNIT key_unit = dt_unit_from_array(key_array);
+                    if (!kat_is_datetime_unit(self->keys_array_type, key_unit)) {
+                        PyErr_SetString(PyExc_KeyError, "datetime64 units do not match");
+                        Py_DECREF(array);
+                        return NULL;
+                    }
+                    GET_ALL_DT64(npy_int64, npy_int64, KAT_INT64, lookup_hash_int, int_to_hash);
+                    break;
+                }
+                default:
+                    // same kind but a type number with no fast path (e.g. a second C integer type of the same width, or long double): use the scalar path below rather than returning an unfilled array
+                    optimized = 0;
+                    break;
+            }
+        }
+        if (!optimized) {
+            for (; i < key_size; i++) {
+                k = PyArray_ToScalar(PyArray_GETPTR1(key_array, i), key_array);
+                if (k == NULL) {
+                    Py_DECREF(array);
+                    return NULL;
+                }
+                keys_pos = lookup(self, k);
+                if (keys_pos < 0) {
+                    Py_DECREF(array);
+                    if (PyErr_Occurred()) {
+                        Py_DECREF(k);
+                        return NULL;
+                    }
+                    PyErr_SetObject(PyExc_KeyError, k);
+                    Py_DECREF(k);
+                    return NULL;
+                }
+                Py_DECREF(k);
+                b[i] = (npy_int64)keys_pos;
+            }
+        }
+    }
+
+    PyArray_CLEARFLAGS((PyArrayObject *)array, NPY_ARRAY_WRITEABLE);
+    return array;
+
+}
+
+
+# undef GET_ALL_SCALARS
+# undef GET_ALL_DT64
+# undef GET_ALL_FLEXIBLE
+
+
+// Give an array of the same kind as KAT, lookup and load any keys_pos. Depends on self, key_size, key_array, table_pos, i, k, values
+# define GET_ANY_SCALARS(npy_type_src, npy_type_dst, kat, lookup_func, hash_func, post_deref) \
+{                                                                      \
+    npy_type_dst v;                                                    \
+    for (; i < key_size; i++) {                                        \
+        v = post_deref(*(npy_type_src*)PyArray_GETPTR1(key_array, i)); \
+        table_pos = lookup_func(self, v, hash_func(v), kat);           \
+        if (table_pos < 0 || (self->table[table_pos].hash == -1)) {    \
+            if (PyErr_Occurred()) {                                    \
+                Py_DECREF(values);                                     \
+                return NULL;                                           \
+            }                                                          \
+            continue;                                                  \
+        }                                                              \
+        keys_pos = self->table[table_pos].keys_pos;                    \
+        if (PyList_Append(values, PyList_GET_ITEM(int_cache, keys_pos))) { \
+            Py_DECREF(values);                                         \
+            return NULL;                                               \
+        }                                                              \
+    }                                                                  \
+}                                                                      \
+
+# define GET_ANY_FLEXIBLE(char_type, get_end_func, lookup_func, hash_func) \
+{                                                                      \
+    char_type* v;                                                      \
+    Py_ssize_t dt_size = PyArray_ITEMSIZE(key_array) / sizeof(char_type);\
+    Py_ssize_t k_size;                                                 \
+    for (; i < key_size; i++) {                                        \
+        v = (char_type*)PyArray_GETPTR1(key_array, i);                 \
+        k_size = get_end_func(v, dt_size) - v;                         \
+        table_pos = lookup_func(self, v, k_size, hash_func(v, k_size)); \
+        if (table_pos < 0 || (self->table[table_pos].hash == -1)) {    \
+            if (PyErr_Occurred()) {                                    \
+                Py_DECREF(values);                                     \
+                return NULL;                                           \
+            }                                                          \
+            continue;                                                  \
+        }                                                              \
+        keys_pos = self->table[table_pos].keys_pos;                    \
+        if (PyList_Append(values, PyList_GET_ITEM(int_cache, keys_pos))) { \
+            Py_DECREF(values);                                         \
+            return NULL;                                               \
+        }                                                              \
+    }                                                                  \
+}                                                                      \
+
+// Given a list or array of keys, return a list of the lookup-up integer values. If any unmatched keys are found, they are ignored. A list is always returned.
+static PyObject *
+fam_get_any(FAMObject *self, PyObject *key) {
+    Py_ssize_t key_size = 0;
+    Py_ssize_t keys_pos = -1;
+    Py_ssize_t i = 0;
+    PyObject* k = NULL;
+    PyObject* values = NULL;
+
+    int key_is_list;
+    if (PyList_CheckExact(key)) {
+        key_is_list = 1;
+        key_size = PyList_GET_SIZE(key);
+    }
+    else if (PyArray_Check(key)) {
+        key_is_list = 0;
+        key_size = PyArray_SIZE((PyArrayObject *)key);
+    }
+    else {
+        PyErr_SetString(PyExc_TypeError, "Must provide a list or array.");
+        return NULL;
+    }
+
+    values = PyList_New(0);
+    if (!values) {
+        return NULL;
+    }
+
+    if (key_is_list) {
+        for (; i < key_size; i++) {
+            k = PyList_GET_ITEM(key, i); // borrow
+            keys_pos = lookup(self, k);
+            if (keys_pos < 0) {
+                if (PyErr_Occurred()) { // only exit if exception set
+                    Py_DECREF(values);
+                    return NULL;
+                }
+                continue;
+            }
+            if (PyList_Append(values, PyList_GET_ITEM(int_cache, keys_pos))) {
+                Py_DECREF(values);
+                return NULL;
+            }
+        }
+    }
+    else {
+        PyArrayObject* key_array = (PyArrayObject *)key;
+        // if key is an np array of the same kind as this FAMs keys, we can do optimized lookups; otherwise, we have to go through scalar to do full branching and coercion into lookup
+        int key_array_t = PyArray_TYPE(key_array);
+
+        int optimized = kat_is_kind(self->keys_array_type, PyArray_DESCR(key_array)->kind) ? 1 : 0;
+        if (optimized) {
+            Py_ssize_t table_pos;
+            switch (key_array_t) {
+                case NPY_INT64:
+                    GET_ANY_SCALARS(npy_int64, npy_int64, KAT_INT64, lookup_hash_int, int_to_hash,);
+                    break;
+                case NPY_INT32:
+                    GET_ANY_SCALARS(npy_int32, npy_int64, KAT_INT32, lookup_hash_int, int_to_hash,);
+                    break;
+                case NPY_INT16:
+                    GET_ANY_SCALARS(npy_int16, npy_int64, KAT_INT16, lookup_hash_int, int_to_hash,);
+                    break;
+                case NPY_INT8:
+                    GET_ANY_SCALARS(npy_int8, npy_int64, KAT_INT8, lookup_hash_int, int_to_hash,);
+                    break;
+                case NPY_UINT64:
+                    GET_ANY_SCALARS(npy_uint64, npy_uint64, KAT_UINT64, lookup_hash_uint, uint_to_hash,);
+                    break;
+                case NPY_UINT32:
+                    GET_ANY_SCALARS(npy_uint32, npy_uint64, KAT_UINT32, lookup_hash_uint, uint_to_hash,);
+                    break;
+                case NPY_UINT16:
+                    GET_ANY_SCALARS(npy_uint16, npy_uint64, KAT_UINT16, lookup_hash_uint, uint_to_hash,);
+                    break;
+                case NPY_UINT8:
+                    GET_ANY_SCALARS(npy_uint8, npy_uint64, KAT_UINT8, lookup_hash_uint, uint_to_hash,);
+                    break;
+                case NPY_FLOAT64:
+                    GET_ANY_SCALARS(npy_double, npy_double, KAT_FLOAT64, lookup_hash_double, double_to_hash,);
+                    break;
+                case NPY_FLOAT32:
+                    GET_ANY_SCALARS(npy_float, npy_double, KAT_FLOAT32, lookup_hash_double, double_to_hash,);
+                    break;
+                case NPY_FLOAT16:
+                    GET_ANY_SCALARS(npy_half, npy_double, KAT_FLOAT16, lookup_hash_double, double_to_hash, npy_half_to_double);
+                    break;
+                case NPY_UNICODE:
+                    GET_ANY_FLEXIBLE(Py_UCS4, ucs4_get_end_p, lookup_hash_unicode, unicode_to_hash);
+                    break;
+                case NPY_STRING:
+                    GET_ANY_FLEXIBLE(char, char_get_end_p, lookup_hash_string, string_to_hash);
+                    break;
+                case NPY_DATETIME: {
+                    NPY_DATETIMEUNIT key_unit = dt_unit_from_array(key_array);
+                    if (!kat_is_datetime_unit(self->keys_array_type, key_unit)) {
+                        return values;
+                    }
+                    GET_ANY_SCALARS(npy_int64, npy_int64, KAT_INT64, lookup_hash_int, int_to_hash,);
+                    break;
+                }
+                default:
+                    // same kind but a type number with no fast path: use the scalar path below rather than silently matching nothing
+                    optimized = 0;
+                    break;
+            }
+        }
+        if (!optimized) {
+            for (; i < key_size; i++) {
+                k = PyArray_ToScalar(PyArray_GETPTR1(key_array, i), key_array);
+                if (k == NULL) {
+                    Py_DECREF(values);
+                    return NULL;
+                }
+                keys_pos = lookup(self, k);
+                Py_DECREF(k);
+                if (keys_pos < 0) {
+                    if (PyErr_Occurred()) { // only exit if exception set
+                        Py_DECREF(values);
+                        return NULL;
+                    }
+                    continue; // do not raise
+                }
+                if (PyList_Append(values, PyList_GET_ITEM(int_cache, keys_pos))) {
+                    Py_DECREF(values);
+                    return NULL;
+                }
+            }
+        }
+    }
+    return values; // might be empty
+}
+
+
+# undef GET_ANY_SCALARS
+# undef GET_ANY_FLEXIBLE
+
+
+// Implements `fam[key]`; raises KeyError when the key is missing.
+static PyObject *
+fam_subscript(FAMObject *self, PyObject *key)
+{
+    return get(self, key, NULL);
+}
+
+
+static PyMappingMethods fam_as_mapping = {
+    .mp_length = (lenfunc) fam_length,
+    .mp_subscript = (binaryfunc) fam_subscript,
+};
+
+
+// Implements `left | right`: a copy of `left` extended with the keys of `right`.
+static PyObject *
+fam_or(PyObject *left, PyObject *right)
+{
+    if (!PyObject_TypeCheck(left, &FAMType) ||
+            !PyObject_TypeCheck(right, &FAMType)
+            ) {
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+    FAMObject *updated = copy(Py_TYPE(left), (FAMObject *)left);
+    if (!updated) {
+        return NULL;
+    }
+    if (extend(updated, ((FAMObject *)right)->keys)) {
+        Py_DECREF(updated);
+        return NULL;
+    }
+    return (PyObject *)updated;
+}
+
+
+static PyNumberMethods fam_as_number = {
+    .nb_or = (binaryfunc) fam_or,
+};
+
+
+// Implements `key in fam`. Returns 1 if found, 0 if not, -1 on error.
+static int
+fam_contains(FAMObject *self, PyObject *key)
+{
+    if (lookup(self, key) < 0) {
+        if (PyErr_Occurred()) {
+            return -1;
+        }
+        return 0;
+    }
+    return 1;
+}
+
+
+static PySequenceMethods fam_as_sequence = {
+    .sq_contains = (objobjproc) fam_contains,
+};
+
+
+// Release the table, the unicode key buffer and the keys, then shrink the shared int_cache to the remaining global key count.
+static void
+fam_dealloc(FAMObject *self)
+{
+    if (self->table) {
+        PyMem_Free(self->table);
+    }
+    if (self->key_buffer) {
+        PyMem_Free(self->key_buffer);
+    }
+    if (self->keys) {
+        Py_DECREF(self->keys);
+    }
+
+    key_count_global -= self->keys_size;
+
+    Py_TYPE(self)->tp_free((PyObject *)self);
+    int_cache_remove(key_count_global);
+}
+
+
+// Return a hash integer for an entire FAM by combining all stored hashes
+static Py_hash_t
+fam_hash(FAMObject *self)
+{
+    // accumulate unsigned: wrap-around is intended, and signed overflow is undefined in C
+    Py_uhash_t hash = 0;
+    for (Py_ssize_t i = 0; i < self->table_size; i++) {
+        hash = hash * 3 + (Py_uhash_t)self->table[i].hash;
+    }
+    if (hash == (Py_uhash_t)-1) { // must not return -1
+        return 0;
+    }
+    return (Py_hash_t)hash;
+}
+
+
+static PyObject *
+fam_iter(FAMObject *self)
+{
+    return fami_new(self, KEYS, false);
+}
+
+
+// Arguments for __new__ when pickling or copying: the keys alone rebuild the map.
+static PyObject *
+fam_getnewargs(FAMObject *self)
+{
+    return PyTuple_Pack(1, self->keys);
+}
+
+
+static PyObject *
+fam_reversed(FAMObject *self)
+{
+    return fami_new(self, KEYS, true);
+}
+
+
+// Implements __sizeof__: the basic object size plus the keys container and the hash table allocation.
+static PyObject *
+fam_sizeof(FAMObject *self)
+{
+    PyObject *listsizeof = PyObject_CallMethod(self->keys, "__sizeof__", NULL);
+    if (!listsizeof) {
+        return NULL;
+    }
+    Py_ssize_t listbytes = PyLong_AsSsize_t(listsizeof);
+    Py_DECREF(listsizeof);
+    if (listbytes == -1 && PyErr_Occurred()) {
+        return NULL;
+    }
+    return PyLong_FromSsize_t(
+            Py_TYPE(self)->tp_basicsize
+            + listbytes
+            + (self->table_size + SCAN - 1) * sizeof(TableElement)
+    );
+}
+
+
+// Implements `fam.get(key, missing=None)`.
+static PyObject *
+fam_get(FAMObject *self, PyObject *args)
+{
+    PyObject *key, *missing = Py_None;
+    if (!PyArg_UnpackTuple(args, Py_TYPE(self)->tp_name, 1, 2, &key, &missing))
+    {
+        return NULL;
+    }
+    return get(self, key, missing);
+}
+
+
+static PyObject *
+fam_items(FAMObject *self)
+{
+    return famv_new(self, ITEMS);
+}
+
+
+static PyObject *
+fam_keys(FAMObject *self)
+{
+    return famv_new(self, KEYS);
+}
+
+
+static PyObject *
+fam_values(FAMObject *self)
+{
+    return famv_new(self, VALUES);
+}
+
+
+// Allocate a FAMObject with its pointers cleared; the keys and table are filled in later by fam_init() or copy_to_new().
+static PyObject *
+fam_new(PyTypeObject *cls, PyObject *args, PyObject *kwargs)
+{
+    // NOTE: The original fam_new used to be able to provide a same reference back if a fam was in the args; this is tricky now that we have fam_init
+    FAMObject *self = (FAMObject *)cls->tp_alloc(cls, 0);
+    if (!self) {
+        return NULL;
+    }
+    self->table = NULL;
+    self->keys = NULL;
+    self->key_buffer = NULL;
+    self->keys_size = 0;
+    return (PyObject*)self;
+}
+
+
+// This macro can be used with integer and floating point NumPy types, given an `npy_type` and a specialized `insert_func`. Uses context of `fam_init` to get `fam`, `contiguous`, `a`, `keys_size`, and `i`. An optional `post_deref` function can be supplied to transform extracted values before calling the appropriate insert function.
+# define INSERT_SCALARS(npy_type, insert_func, kat, post_deref) \ +{ \ + if (contiguous) { \ + npy_type* b = (npy_type*)PyArray_DATA(a); \ + npy_type* b_end = b + keys_size; \ + while (b < b_end) { \ + if (insert_func(fam, post_deref(*b), i, -1, kat)) { \ + goto error; \ + } \ + b++; \ + i++; \ + } \ + } \ + else { \ + for (; i < keys_size; i++) { \ + if (insert_func(fam, \ + post_deref(*(npy_type*)PyArray_GETPTR1(a, i)),\ + i, \ + -1, \ + kat)) { \ + goto error; \ + } \ + } \ + } \ +} \ + +// This macro is for inserting flexible-sized types, Unicode (Py_UCS4) or strings (char). Uses context of `fam_init`. +# define INSERT_FLEXIBLE(char_type, insert_func, get_end_func) \ +{ \ + char_type* p = NULL; \ + if (contiguous) { \ + char_type *b = (char_type*)PyArray_DATA(a); \ + char_type *b_end = b + keys_size * dt_size; \ + while (b < b_end) { \ + p = get_end_func(b, dt_size); \ + if (insert_func(fam, b, p-b, i, -1)) { \ + goto error; \ + } \ + b += dt_size; \ + i++; \ + } \ + } \ + else { \ + for (; i < keys_size; i++) { \ + char_type* v = (char_type*)PyArray_GETPTR1(a, i); \ + p = get_end_func(v, dt_size); \ + if (insert_func(fam, v, p-v, i, -1)) { \ + goto error; \ + } \ + } \ + } \ +} \ + +// Initialize an allocated FAMObject. Returns 0 on success, -1 on error. +int +fam_init(PyObject *self, PyObject *args, PyObject *kwargs) +{ + PyTypeObject* cls = Py_TYPE(self); // borrowed ref + const char *name = cls->tp_name; + FAMObject* fam = (FAMObject*)self; + + if (kwargs) { + PyErr_Format(PyExc_TypeError, "%s takes no keyword arguments", name); + return -1; + } + + KeysArrayType keys_array_type = KAT_LIST; // default, will override if necessary + + PyObject *keys = NULL; + Py_ssize_t keys_size = 0; + + if (!PyArg_UnpackTuple(args, name, 0, 1, &keys)) { + return -1; + } + + if (!keys) { + keys = PyList_New(0); + if (!keys) { + return -1; + } + } + else if (PyObject_TypeCheck(keys, &FAMType)) { + // Use `keys` as old, `self` as new, and fill from old to new. 
This returns the same error codes as this function. + return copy_to_new(cls, (FAMObject*)keys, fam); + } + else if (PyArray_Check(keys)) { + PyArrayObject *a = (PyArrayObject *)keys; + if (PyArray_NDIM(a) != 1) { + PyErr_SetString(PyExc_TypeError, "Arrays must be 1-dimensional"); + return -1; + } + + int array_t = PyArray_TYPE(a); + keys_size = PyArray_SIZE(a); + + if (cls != &AMType && + (PyTypeNum_ISINTEGER(array_t) // signed and unsigned + || PyTypeNum_ISFLOAT(array_t) + || PyTypeNum_ISFLEXIBLE(array_t) + || array_t == NPY_DATETIME )) + { + if ((PyArray_FLAGS(a) & NPY_ARRAY_WRITEABLE)) { + PyErr_Format(PyExc_TypeError, "Arrays must be immutable when given to a %s", name); + return -1; + } + // NOTE: this might return 0 (list) given a dt64 array without a unit + keys_array_type = at_to_kat(array_t, a); + } + + if (keys_array_type) { // we have a usable array + Py_INCREF(keys); + } + else { + // DEBUG_MSG_OBJ("got KAT", PyLong_FromLongLong(keys_array_type)); + // if an AutoMap or an array that we do not handle, create a list + if (array_t == NPY_DATETIME || array_t == NPY_TIMEDELTA){ + keys = PySequence_List(keys); // force scalars + } + else { + keys = PyArray_ToList(a); // converts to objs + } + if (!keys) { + return -1; + } + } + } + else { // assume an arbitrary iterable + keys = PySequence_List(keys); + if (!keys) { + return -1; + } + keys_size = PyList_GET_SIZE(keys); + } + + fam->keys = keys; + fam->keys_array_type = keys_array_type; + fam->keys_size = keys_size; + fam->key_buffer = NULL; + key_count_global += keys_size; + + // NOTE: on itialization, grow_table() does not use keys + if (grow_table(fam, keys_size)) { + return -1; + } + Py_ssize_t i = 0; + if (keys_array_type) { + PyArrayObject *a = (PyArrayObject *)fam->keys; + int contiguous = PyArray_IS_C_CONTIGUOUS(a); + switch (keys_array_type) { + case KAT_INT64: + INSERT_SCALARS(npy_int64, insert_int, keys_array_type,); + break; + case KAT_INT32: + INSERT_SCALARS(npy_int32, insert_int, 
keys_array_type,); + break; + case KAT_INT16: + INSERT_SCALARS(npy_int16, insert_int, keys_array_type,); + break; + case KAT_INT8: + INSERT_SCALARS(npy_int8, insert_int, keys_array_type,); + break; + case KAT_UINT64: + INSERT_SCALARS(npy_uint64, insert_uint, keys_array_type,); + break; + case KAT_UINT32: + INSERT_SCALARS(npy_uint32, insert_uint, keys_array_type,); + break; + case KAT_UINT16: + INSERT_SCALARS(npy_uint16, insert_uint, keys_array_type,); + break; + case KAT_UINT8: + INSERT_SCALARS(npy_uint8, insert_uint, keys_array_type,); + break; + case KAT_FLOAT64: + INSERT_SCALARS(npy_double, insert_double, keys_array_type,); + break; + case KAT_FLOAT32: + INSERT_SCALARS(npy_float, insert_double, keys_array_type,); + break; + case KAT_FLOAT16: + INSERT_SCALARS(npy_half, insert_double, keys_array_type, npy_half_to_double); + break; + case KAT_UNICODE: { + // Over allocate buffer by 1 so there is room for null at end. This buffer is only used in lookup(); + Py_ssize_t dt_size = PyArray_ITEMSIZE(a) / UCS4_SIZE; + fam->key_buffer = (Py_UCS4*)PyMem_Malloc((dt_size+1) * UCS4_SIZE); + INSERT_FLEXIBLE(Py_UCS4, insert_unicode, ucs4_get_end_p); + break; + } + case KAT_STRING: { + Py_ssize_t dt_size = PyArray_ITEMSIZE(a); + INSERT_FLEXIBLE(char, insert_string, char_get_end_p); + break; + } + case KAT_DTY: + case KAT_DTM: + case KAT_DTW: + case KAT_DTD: + case KAT_DTh: + case KAT_DTm: + case KAT_DTs: + case KAT_DTms: + case KAT_DTus: + case KAT_DTns: + case KAT_DTps: + case KAT_DTfs: + case KAT_DTas: + INSERT_SCALARS(npy_int64, insert_int, KAT_INT64,); + break; + default: + return -1; + } + } + else { + for (; i < keys_size; i++) { + if (insert_obj(fam, PyList_GET_ITEM(keys, i), i, -1)) { + goto error; + } + } + } + return 0; +error: + // assume all dynamic memory assigned to struct attrs that will be cleaned + return -1; +} + + +# undef INSERT_SCALARS +# undef INSERT_FLEXIBLE + + +static PyObject * +fam_repr(FAMObject *self) +{ + return PyUnicode_FromFormat("%s(%R)", 
Py_TYPE(self)->tp_name, self->keys); +} + + +static PyObject * +fam_richcompare(FAMObject *self, PyObject *other, int op) +{ + if (!PyObject_TypeCheck(other, &FAMType)) { + Py_RETURN_NOTIMPLEMENTED; + } + return PyObject_RichCompare(self->keys, ((FAMObject *)other)->keys, op); +} + + +static PyObject* +fam_getstate(FAMObject *self) +{ + PyObject* state = PyTuple_Pack(1, self->keys); + return state; +} + + +// State returned here is a tuple of keys, suitable for usage as an `args` argument. +static PyObject* +fam_setstate(FAMObject *self, PyObject *state) +{ + if (!PyTuple_CheckExact(state) || !PyTuple_GET_SIZE(state)) { + PyErr_SetString(PyExc_ValueError, "Unexpected pickled object."); + return NULL; + } + PyObject *keys = PyTuple_GetItem(state, 0); + if (PyArray_Check(keys)) { + // if we an array, make it immutable + PyArray_CLEARFLAGS((PyArrayObject*)keys, NPY_ARRAY_WRITEABLE); + } + fam_init((PyObject*)self, state, NULL); + Py_RETURN_NONE; +} + + +static PyMethodDef fam_methods[] = { + {"__getnewargs__", (PyCFunction) fam_getnewargs, METH_NOARGS, NULL}, + {"__reversed__", (PyCFunction) fam_reversed, METH_NOARGS, NULL}, + {"__sizeof__", (PyCFunction) fam_sizeof, METH_NOARGS, NULL}, + {"__getstate__", (PyCFunction) fam_getstate, METH_NOARGS, NULL}, + {"__setstate__", (PyCFunction) fam_setstate, METH_O, NULL}, + {"get", (PyCFunction) fam_get, METH_VARARGS, NULL}, + {"items", (PyCFunction) fam_items, METH_NOARGS, NULL}, + {"keys", (PyCFunction) fam_keys, METH_NOARGS, NULL}, + {"values", (PyCFunction) fam_values, METH_NOARGS, NULL}, + {"get_all", (PyCFunction) fam_get_all, METH_O, NULL}, + {"get_any", (PyCFunction) fam_get_any, METH_O, NULL}, + {NULL}, +}; + + +PyTypeObject FAMType = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_as_mapping = &fam_as_mapping, + .tp_as_number = &fam_as_number, + .tp_as_sequence = &fam_as_sequence, + .tp_basicsize = sizeof(FAMObject), + .tp_dealloc = (destructor) fam_dealloc, + .tp_doc = "An immutable auto-incremented integer-valued 
mapping.", + .tp_hash = (hashfunc) fam_hash, + .tp_iter = (getiterfunc) fam_iter, + .tp_methods = fam_methods, + .tp_name = "arraykit.FrozenAutoMap", + .tp_new = fam_new, + .tp_init = fam_init, + .tp_repr = (reprfunc) fam_repr, + .tp_richcompare = (richcmpfunc) fam_richcompare, +}; + + +//------------------------------------------------------------------------------ +// AutoMap subclass + +static PyObject * +am_inplace_or(FAMObject *self, PyObject *other) +{ + if (PyObject_TypeCheck(other, &FAMType)) { + other = ((FAMObject *)other)->keys; + } + if (extend(self, other)) { + return NULL; + } + Py_INCREF(self); + return (PyObject *)self; +} + + +static PyNumberMethods am_as_number = { + .nb_inplace_or = (binaryfunc) am_inplace_or, +}; + + +static PyObject * +am_add(FAMObject *self, PyObject *other) +{ + if (append(self, other)) { + return NULL; + } + Py_RETURN_NONE; +} + + +static PyObject * +am_update(FAMObject *self, PyObject *other) +{ + if (PyObject_TypeCheck(other, &FAMType)) { + other = ((FAMObject *)other)->keys; + } + if (extend(self, other)) { + return NULL; + } + Py_RETURN_NONE; +} + + +static PyMethodDef am_methods[] = { + {"add", (PyCFunction) am_add, METH_O, NULL}, + {"update", (PyCFunction) am_update, METH_O, NULL}, + {NULL}, +}; + +PyTypeObject AMType = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_as_number = &am_as_number, + .tp_base = &FAMType, + .tp_doc = "A grow-only autoincremented integer-valued mapping.", + .tp_methods = am_methods, + .tp_name = "arraykit.AutoMap", + .tp_richcompare = (richcmpfunc) fam_richcompare, +}; + diff --git a/src/auto_map.h b/src/auto_map.h new file mode 100644 index 00000000..fe9a03b5 --- /dev/null +++ b/src/auto_map.h @@ -0,0 +1,14 @@ +# ifndef ARRAYKIT_SRC_AUTO_MAP_H_ +# define ARRAYKIT_SRC_AUTO_MAP_H_ + +# include "Python.h" + +// extern PyTypeObject TriMapType; +extern PyTypeObject AMType; +extern PyTypeObject FAMIType; +extern PyTypeObject FAMVType; +extern PyTypeObject FAMType; +extern PyObject *NonUniqueError; + + +# 
endif /* ARRAYKIT_SRC_AUTO_MAP_H_ */ diff --git a/test/test_auto_map.py b/test/test_auto_map.py new file mode 100644 index 00000000..220ec3a8 --- /dev/null +++ b/test/test_auto_map.py @@ -0,0 +1,994 @@ +import pickle +import pytest +import numpy as np + +from arraykit import AutoMap +from arraykit import FrozenAutoMap +from arraykit import NonUniqueError + + +# ------------------------------------------------------------------------------ + + +def test_am_extend(): + am1 = AutoMap(("a", "b")) + am2 = am1 | AutoMap(("c", "d")) + assert list(am2.keys()) == ["a", "b", "c", "d"] + + +def test_am_add(): + a = AutoMap() + for l, key in enumerate(["a", "b", "c", "d"]): + assert a.add(key) is None + assert len(a) == l + 1 + assert a[key] == l + + +def test_fam_contains(): + x = [] + fam = FrozenAutoMap(("a", "b", "c")) + assert (x in fam.values()) == False + # NOTE: exercise x to force seg fault + assert len(x) == 0 + + +# ------------------------------------------------------------------------------ + + +def test_fam_constructor_a(): + with pytest.raises(ZeroDivisionError): + fam = FrozenAutoMap((x / 0 for x in range(3))) + + +def test_fam_constructor_b(): + fam1 = FrozenAutoMap(range(3)) + fam2 = FrozenAutoMap(fam1) + assert list(fam2), [0, 1, 2] + + +# ------------------------------------------------------------------------------ + + +def test_fam_constructor_array_int_a1(): + a1 = np.array((10, 20, 30), dtype=np.int64) + with pytest.raises(TypeError): + fam = FrozenAutoMap(a1) + + +def test_fam_constructor_array_int_a2(): + a1 = np.array((10, 20, 30), dtype=np.int32) + with pytest.raises(TypeError): + fam = FrozenAutoMap(a1) + + +def test_fam_constructor_array_int_b(): + a1 = np.array((10, 20, 30, 40), dtype=np.int64).reshape(2, 2) + a1.flags.writeable = False + with pytest.raises(TypeError): + fam = FrozenAutoMap(a1) + + +def test_fam_constructor_array_int_c(): + a1 = np.array((10, 20, 30), dtype=np.int8) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + for k 
in a1: + assert k in fam + + +def test_fam_constructor_array_int_d(): + a1 = np.array((-2, -1, 1, 2), dtype=np.int8) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + for k in a1: + assert k in fam + + +def test_fam_constructor_array_int_e(): + # https://github.com/static-frame/arraymap/issues/12 + a1 = np.array((0, 0, 1, 1, 2, 2), dtype=int) + a2 = a1[[0, 2, 4]] + a2.flags.writeable = False + fam1 = FrozenAutoMap(a2) + assert list(fam1) == [0, 1, 2] + + d1 = {i: int(i) for i in a2} + fam2 = FrozenAutoMap(d1) + assert list(fam2) == [0, 1, 2] + + d2 = {0: 0, 3: 1} + fam3 = FrozenAutoMap(d2) + assert list(fam3) == [0, 3] + + +# ------------------------------------------------------------------------------ + + +def test_fam_constructor_array_float_a(): + a1 = np.array((1.2, 8.8, 1.2)) + a1.flags.writeable = False + with pytest.raises(NonUniqueError): + fam = FrozenAutoMap(a1) + + +# ------------------------------------------------------------------------------ + + +def test_fam_constructor_array_dt64_a(): + a1 = np.array(("1970-01", "2023-05"), dtype=np.datetime64) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + assert fam[np.datetime64("2023-05")] == 1 + assert fam[np.datetime64("1970-01")] == 0 + + with pytest.raises(KeyError): + fam[np.datetime64("nat")] + + with pytest.raises(KeyError): + fam[np.datetime64("1970")] + + +def test_fam_constructor_array_dt64_b(): + a1 = np.array(("1542", "nat"), dtype=np.datetime64) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + assert fam[np.datetime64("nat")] == 1 + assert fam[np.datetime64("nat", "D")] == 1 + assert fam[np.datetime64("nat", "ns")] == 1 + assert fam[np.datetime64("1542")] == 0 + + +def test_fam_constructor_array_dt64_c(): + a1 = np.array(("nat", "nat"), dtype=np.datetime64) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + # when we get "generic" dt64 units, we load scalars in a list, and can thus support multiple NaNs + assert len(fam) == 2 + + +def 
test_fam_constructor_array_dt64_d(): + a1 = np.array(("2023-05", "2023-05"), dtype=np.datetime64) + a1.flags.writeable = False + with pytest.raises(NonUniqueError): + fam = FrozenAutoMap(a1) + + +# ------------------------------------------------------------------------------ + + +def test_fam_constructor_array_unicode_a(): + a1 = np.array(("a", "b", "a")) + a1.flags.writeable = False + with pytest.raises(NonUniqueError): + fam = FrozenAutoMap(a1) + + +def test_fam_constructor_array_unicode_b(): + a1 = np.array(("a", "bb", "ccc")) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + for k in a1: + assert k in fam + + +def test_fam_constructor_array_unicode_c(): + a1 = np.array(("z0Ct", "z0DS", "z0E9")) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + +# NOTE +# >>> u = "\x000\x00" +# >>> len(u) +# 3 +# >>> a1 = np.array(['', ''], dtype='U4') +# >>> a1[0] = u +# >>> a1 +# array(['\x000', ''], dtype='>> len(a1[0]) +# 2 + + +def test_fam_constructor_array_unicode_d1(): + a1 = np.array(["", "\x000"], dtype="U2") + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + assert len(fam) == 2 + assert list(fam) == ["", "\x000"] + assert "" in fam + assert "\x000" in fam + + +def test_fam_constructor_array_unicode_d2(): + a1 = np.array(["", "\x000\x00"], dtype="U3") + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + assert len(fam) == 2 + assert list(fam) == ["", "\x000"] # we lost the last null + assert "" in fam + assert "\x000" in fam + + +def test_fam_copy_array_unicode_a(): + a1 = np.array(("a", "ccc", "bb")) + a1.flags.writeable = False + fam1 = FrozenAutoMap(a1) + fam2 = FrozenAutoMap(fam1) + assert fam2["a"] == 0 + assert fam2["ccc"] == 1 + assert fam2["bb"] == 2 + + +# ------------------------------------------------------------------------------ + + +def test_fam_constructor_array_bytes_a(): + a1 = np.array((b"a", b"b", b"c")) + with pytest.raises(TypeError): + fam = FrozenAutoMap(a1) + + +def test_fam_constructor_array_bytes_b(): + a1 = 
np.array((b"aaa", b"b", b"aaa")) + a1.flags.writeable = False + with pytest.raises(NonUniqueError): + fam = FrozenAutoMap(a1) + + +def test_fam_constructor_array_bytes_c(): + a1 = np.array((b"aaa", b"b", b"cc")) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + assert fam[b"aaa"] == 0 + assert fam[b"b"] == 1 + assert fam[b"cc"] == 2 + + +def test_fam_copy_array_bytes_a(): + a1 = np.array((b"a", b"ccc", b"bb")) + a1.flags.writeable = False + fam1 = FrozenAutoMap(a1) + fam2 = FrozenAutoMap(fam1) + assert fam2[b"a"] == 0 + assert fam2[b"ccc"] == 1 + assert fam2[b"bb"] == 2 + + +# ------------------------------------------------------------------------------ + + +def test_fam_array_bytes_get_a(): + a1 = np.array((b"", b" ", b" ", b" ")) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + assert fam.get(b"") == 0 + assert fam.get(b" ") == None + assert fam.get(b" ") == 2 + assert fam.get(b" ") == 3 + + +# ------------------------------------------------------------------------------ + + +def test_fam_array_len_a(): + a1 = np.array((10, 20, 30, 40), dtype=np.int64) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + assert len(fam) == 4 + + +def test_fam_array_len_b(): + a1 = np.array((10, 20, 30, 40), dtype=np.int64) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + assert fam[10] == 0 + assert fam[20] == 1 + assert fam[30] == 2 + assert fam[40] == 3 + + +# ------------------------------------------------------------------------------ + + +def test_fam_array_int_get_a(): + a1 = np.array((1, 100, 300, 4000), dtype=np.int64) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + assert fam.get("f") is None + assert fam.get(1) == 0 + assert fam.get(True) == 0 + assert fam.get(a1[2]) == 2 + assert fam.get(1.0) == 0 + + +def test_fam_array_int_get_b(): + a1 = np.array((1, 100, 300, 4000), dtype=np.int32) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + assert fam.get("f") is None + assert fam.get(1) == 0 + assert fam.get(True) == 0 + 
assert fam.get(a1[2]) == 2 + assert fam.get(1.0) == 0 + assert fam.get(1.1) is None + + +def test_fam_array_int_get_c1(): + a1 = np.array((1, 5, 10, 20), dtype=np.int16) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + assert fam.get("f") is None + assert fam.get(1) == 0 + assert fam.get(True) == 0 + assert fam.get(a1[2]) == 2 + assert fam.get(20.0) == 3 + + +def test_fam_array_int_get_c2(): + a1 = np.array((1,), dtype=np.int16) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + for k in a1: + assert k in fam + + +def test_fam_array_int_get_c3(): + a1 = np.array((19037,), dtype=np.int16) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + for k in a1: + assert k in fam + + +def test_fam_array_int_get_d(): + a1 = np.array((1, 5, 10, 20), dtype=np.int8) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + assert fam.get("f") is None + assert fam.get(1) == 0 + assert fam.get(True) == 0 + assert fam.get(a1[2]) == 2 + assert fam.get(20.0) == 3 + assert fam.get(20.1) is None + + +def test_fam_array_int_get_e(): + a1 = np.array([2147483648], dtype=np.int64) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + assert fam.get("f") is None + assert fam.get(2147483648) == 0 + assert fam.get(a1[0]) == 0 + + +def test_fam_array_int_get_f1(): + ctype = np.int64 + a1 = np.array([np.iinfo(ctype).min, np.iinfo(ctype).max], dtype=ctype) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + assert list(fam.keys()) == [np.iinfo(ctype).min, np.iinfo(ctype).max] + + +def test_fam_array_int_get_f2(): + ctype = np.int64 + a1 = np.array([np.iinfo(ctype).min, np.iinfo(ctype).max], dtype=ctype) + a1.flags.writeable = False + + fam = FrozenAutoMap(a1) + assert fam.get(np.iinfo(ctype).min) == 0 + assert fam.get(np.iinfo(ctype).max) == 1 + + +def test_fam_array_int_get_d(): + a1 = np.array((8, 2, 4, 0, 1), dtype=np.int64) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + for ctype in ( + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + 
np.uint16, + np.uint32, + np.uint64, + np.float16, + np.float32, + np.float64, + ): + a2 = a1.astype(ctype) + for k in a2: + assert k in fam, f"{type(k)}" + assert 2.0 in fam + assert 2.1 not in fam + assert True in fam + assert False in fam + assert 4 in fam + + +def test_fam_array_int_get_e(): + a1 = np.array((1,), dtype=np.int16) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + assert a1[0] in fam + assert 1 in fam + + +# ------------------------------------------------------------------------------ + + +def test_fam_array_uint_get_a(): + a1 = np.array((1, 100, 300, 4000), dtype=np.uint64) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + assert fam.get("f") is None + assert fam.get(1) == 0 + assert fam.get(True) == 0 + assert fam.get(a1[2]) == 2 + assert fam.get(1.0) == 0 + + for k in a1: + assert k in fam + + +def test_fam_array_uint_get_b(): + a1 = np.arange(0, 100, dtype=np.uint32) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + assert fam.get("f") is None + assert fam.get(1) == 1 + assert fam.get(True) == 1 + assert fam.get(a1[2]) == 2 + assert fam.get(1.0) == 1 + + for k in a1: + assert k in fam + + +def test_fam_array_uint_get_c(): + a1 = np.arange(0, 100, dtype=np.uint16) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + assert fam.get("f") is None + assert fam.get(1) == 1 + assert fam.get(True) == 1 + assert fam.get(a1[2]) == 2 + assert fam.get(1.0) == 1 + + for k in a1: + assert k in fam + + +def test_fam_array_uint_get_d(): + a1 = np.arange(0, 100, dtype=np.uint8) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + assert fam.get("f") is None + assert fam.get(1) == 1 + assert fam.get(True) == 1 + assert fam.get(a1[2]) == 2 + assert fam.get(1.0) == 1 + + for k in a1: + assert k in fam + + +def test_fam_array_uint_get_e(): + a1 = np.array((1,), dtype=np.uint16) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + for k in a1: + assert k in fam + + +def test_fam_array_uint_get_f(): + a1 = np.array((8, 2, 
4, 1), dtype=np.uint64) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + for ctype in ( + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.float16, + np.float32, + np.float64, + ): + a2 = a1.astype(ctype) + for k in a2: + assert k in fam, f"{type(k)}" + a3 = -a2 + for k in a3: + assert k not in fam, f"{type(k)}" + + assert True in fam + assert 4.0 in fam + assert 4.1 not in fam + assert 8 in fam + assert -8 not in fam + + +# ------------------------------------------------------------------------------ + + +def test_fam_array_float_get_a(): + a1 = np.array((1.5, 10.2, 8.8), dtype=np.float64) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + assert fam.get("f") is None + assert fam.get(1.5) == 0 + assert fam.get(10.2) == 1 + assert fam.get(a1[1]) == 1 + assert fam.get(8.8) == 2 + + +def test_fam_array_float_get_b(): + a1 = np.array((1.5, 10.2, 8.8), dtype=np.float32) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + assert fam.get("f") is None + # assert fam.get(1.5) == 0 + assert fam.get(a1[0]) == 0 + assert fam.get(a1[1]) == 1 + assert fam.get(a1[2]) == 2 + + +def test_fam_array_float_get_c1(): + a1 = np.array((1.5, 10.2, 8.8), dtype=np.float16) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + assert fam.get("f") is None + assert fam.get(a1[0]) == 0 + assert fam.get(a1[1]) == 1 + assert fam.get(a1[2]) == 2 + + +def test_fam_array_float_get_c2(): + a1 = np.array((0.0,), dtype=np.float16) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + +def test_fam_array_float_get_d(): + a1 = np.array((8, 2, 4, 1), dtype=np.float64) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + for ctype in ( + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.float16, + np.float32, + np.float64, + ): + a2 = a1.astype(ctype) + for k in a2: + assert k in fam, f"{type(k)}" + a3 = -a2 + for k in a3: + assert k not in fam, f"{type(k)}" + + assert 
True in fam + assert 4.0 in fam + assert 4.1 not in fam + assert 8 in fam + assert -8 not in fam + + +# ------------------------------------------------------------------------------ + + +def test_fam_array_unicode_get_a(): + a1 = np.array(("bb", "a", "ccc")) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + assert fam.get("a") == 1 + assert fam.get("bb") == 0 + assert fam.get("ccc") == 2 + assert fam.get(None) is None + assert fam.get(3.2) is None + assert fam.get("cc") is None + assert fam.get("cccc") is None + + +def test_fam_array_unicode_get_b(): + a1 = np.array(("", " ", " ", " ")) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + assert fam.get("") == 0 + assert fam.get(" ") == None + assert fam.get(" ") == 2 + assert fam.get(" ") == 3 + + +# ------------------------------------------------------------------------------ + + +def test_fam_array_values_a(): + a1 = np.array((10, 20, 30, 40), dtype=np.int64) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + assert list(fam.values()) == [0, 1, 2, 3] + + +def test_fam_array_keys_a(): + a1 = np.array((10, 20, 30, 40), dtype=np.int64) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + assert list(fam.keys()) == [10, 20, 30, 40] + + +def test_fam_array_keys_a(): + a1 = np.array((10, 20, 30, 40), dtype=np.int8) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + e = next(iter(fam)) + assert isinstance(e, np.int8) + + +def test_fam_array_items_a(): + a1 = np.array((10, 20, 30, 40), dtype=np.int64) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + assert list(fam.items()) == [(10, 0), (20, 1), (30, 2), (40, 3)] + + +def test_fam_array_values_b(): + a1 = np.array(("a", "b", "c", "d")) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + assert list(fam.values()) == [0, 1, 2, 3] + + +def test_fam_array_keys_b(): + a1 = np.array(("a", "b", "c", "d")) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + assert list(fam.keys()) == ["a", "b", "c", "d"] + + +def 
test_fam_array_items_b(): + a1 = np.array(("a", "b", "c", "d")) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + assert list(fam.items()) == [("a", 0), ("b", 1), ("c", 2), ("d", 3)] + + +def test_fam_array_items_c(): + a1 = np.array(("a", "b", "c")) + a1.flags.writeable = False + fam1 = FrozenAutoMap(a1) + + fam2 = FrozenAutoMap(fam1) + assert list(fam2.items()) == [("a", 0), ("b", 1), ("c", 2)] + assert list(fam1.items()) == [("a", 0), ("b", 1), ("c", 2)] + + +# ------------------------------------------------------------------------------ + + +def test_am_array_constructor_a(): + a1 = np.array(("a", "b", "c")) + a1.flags.writeable = False + am1 = AutoMap(a1) + + +def test_am_array_constructor_b(): + a1 = np.array(("2022-01", "2023-05"), dtype=np.datetime64) + a1.flags.writeable = False + am1 = AutoMap(a1) + assert am1[np.datetime64("2023-05")] == 1 + + +def test_am_array_constructor_c(): + a1 = np.array((10, 20, 30, 40), dtype=np.int64) + a1.flags.writeable = False + am = AutoMap(a1) + am.update((60, 80)) + am.add(90) + assert list(am.keys()) == [10, 20, 30, 40, 60, 80, 90] + + +# ------------------------------------------------------------------------------ + + +def test_fam_array_pickle_a(): + a1 = np.array(("a", "b", "c", "d")) + a1.flags.writeable = False + fam1 = FrozenAutoMap(a1) + fam2 = pickle.loads(pickle.dumps(fam1)) + assert list(fam1.values()) == list(fam2.values()) + + +# ------------------------------------------------------------------------------ + + +def test_fam_array_get_all_a(): + a1 = np.array((1, 100, 300, 4000)) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + with pytest.raises(TypeError): + fam.get_all((3, 3)) + + with pytest.raises(TypeError): + fam.get_all("a") + + with pytest.raises(TypeError): + fam.get_all(None) + + +def test_fam_array_get_all_b(): + a1 = np.array((1, 100, 300, 4000)) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + post1 = fam.get_all([300, 100]) + assert post1.tolist() == [2, 1] + x = [y 
for y in post1] + del x + del post1 + post2 = fam.get_all([4000, 4000, 4000]) + assert post2.tolist() == [3, 3, 3] + x = [y for y in post2] + del x + + +def test_fam_array_get_all_c(): + a1 = np.array(("a", "bb", "ccc")) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + with pytest.raises(KeyError): + fam.get_all(["bb", "c"]) + + +def test_fam_array_get_all_d1(): + a1 = np.array(("a", "bb", "ccc")) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + post1 = fam.get_all(np.array(("bb", "a", "ccc", "a", "bb"))) + assert post1.tolist() == [1, 0, 2, 0, 1] + assert post1.flags.writeable == False + + +def test_fam_array_get_all_d2(): + a1 = np.array(("a", "bb", "ccc")) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + with pytest.raises(KeyError): + fam.get_all(np.array(("bb", "a", "ccc", "aa"))) + + +def test_fam_array_get_all_e(): + a1 = np.array((2,), dtype=np.uint64) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + assert fam.get_all([2]) == [0] + assert fam.get_all(a1) == [0] + + +def test_fam_array_get_all_f1(): + a1 = np.array(("a", "bb", "ccc", "dd")) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + post = fam.get_all(np.array(["ccc", "dd", "bb", "bb"])) + assert post.tolist() == [2, 3, 1, 1] + + +def test_fam_array_get_all_f2(): + a1 = np.array(("a", "bb", "ccc", "dd")) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + with pytest.raises(KeyError): + fam.get_all(np.array(["bb", "c"])) + + +def test_fam_array_get_all_g1(): + a1 = np.array((b"a", b"bb", b"ccc", b"dd")) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + post = fam.get_all(np.array([b"ccc", b"dd", b"bb", b"bb"])) + assert post.tolist() == [2, 3, 1, 1] + + +def test_fam_array_get_all_g2(): + a1 = np.array((b"a", b"bb", b"ccc", b"dd")) + a1.flags.writeable = False + fam = FrozenAutoMap(a1) + + with pytest.raises(KeyError): + fam.get_all(np.array([b"dd", b"x"])) + + +def test_fam_array_get_all_h(): + a1 = np.array((b"a", b"")) + a1.flags.writeable = 
# NOTE(review): this chunk opens part-way through a test whose first lines precede it.
# That tail is preserved verbatim here (not reconstructed) so nothing is lost:
#     False
#     fam = FrozenAutoMap(a1)
#
#     post = fam.get_all(np.array([b"", b"", b"a"]))
#     assert post.tolist() == [1, 1, 0]


def test_fam_array_get_all_i():
    """get_all raises KeyError for byte strings that are only prefixes of keys, or empty."""
    a1 = np.array((b"foo", b"bar"))
    a1.flags.writeable = False
    fam = FrozenAutoMap(a1)
    with pytest.raises(KeyError):
        _ = fam.get_all(np.array([b"fo", b"ba"]))

    with pytest.raises(KeyError):
        _ = fam.get_all(np.array([b"", b""]))


def test_fam_array_get_all_j():
    """get_all rejects partial string matches and resolves repeated exact matches."""
    a1 = np.array(("aaaaa", "bb", "ccc", "dd"))
    a1.flags.writeable = False
    fam = FrozenAutoMap(a1)

    with pytest.raises(KeyError):
        _ = fam.get_all(np.array(["a", "b"]))

    assert fam.get_all(np.array(("bb", "dd", "bb", "dd"))).tolist() == [1, 3, 1, 3]


def test_fam_array_get_all_k1():
    """get_all maps datetime64 keys, including repeated ones, to their positions."""
    a1 = np.array(("2023-01-05", "1854-05-02"), np.datetime64)
    a1.flags.writeable = False
    fam = FrozenAutoMap(a1)

    post = fam.get_all(
        np.array(["1854-05-02", "2023-01-05", "2023-01-05"], np.datetime64)
    )
    assert post.tolist() == [1, 0, 0]


def test_fam_array_get_all_k2():
    """get_all raises KeyError when one of the datetime64 keys is absent."""
    a1 = np.array(("2023-01-05", "1854-05-02"), np.datetime64)
    a1.flags.writeable = False
    fam = FrozenAutoMap(a1)

    with pytest.raises(KeyError):
        # The result is never inspected; use the throwaway name like the sibling tests.
        _ = fam.get_all(
            np.array(["1854-05-02", "2023-01-05", "2020-01-05"], np.datetime64)
        )


def test_fam_array_get_all_l():
    """get_all raises KeyError when full dates are queried with year-month values."""
    a1 = np.array(("2023-01-05", "1854-05-02", "1988-01-01"), np.datetime64)
    a1.flags.writeable = False
    fam = FrozenAutoMap(a1)
    with pytest.raises(KeyError):
        _ = fam.get_all(np.array(["2022-01", "2023-01", "1988-01"], np.datetime64))


def test_fam_array_get_all_m1():
    """An int32-backed map can be queried with an int64 array."""
    # NOTE: smaller than 64-bit arrays in FAMs do not get optimal array lookup performance
    a1 = np.array((1, 100, 300), dtype=np.int32)
    a1.flags.writeable = False
    fam = FrozenAutoMap(a1)
    post1 = fam.get_all(np.array([300, 100], dtype=np.int64))
    assert post1.tolist() == [2, 1]


def test_fam_array_get_all_m2():
    """An int16-backed map can be queried with an int64 array."""
    a1 = np.array((1, 100, 300), dtype=np.int16)
    a1.flags.writeable = False
    fam = FrozenAutoMap(a1)
    post1 = fam.get_all(np.array([300, 100], dtype=np.int64))
    assert post1.tolist() == [2, 1]


def test_fam_array_get_all_m3():
    """An int8-backed map can be queried with both int64 and int8 arrays."""
    a1 = np.array((1, 100, 30), dtype=np.int8)
    a1.flags.writeable = False
    fam = FrozenAutoMap(a1)
    post1 = fam.get_all(np.array([30, 100], dtype=np.int64))
    assert post1.tolist() == [2, 1]

    post2 = fam.get_all(np.array([30, 100], dtype=np.int8))
    assert post2.tolist() == [2, 1]


# -------------------------------------------------------------------------------


def test_fam_array_get_any_a1():
    """get_any on a list returns positions of found keys in query order, skipping misses."""
    a1 = np.array(("a", "bb", "ccc"))
    a1.flags.writeable = False
    fam = FrozenAutoMap(a1)
    post1 = fam.get_any(["bbb", "ccc", "a", "bbb"])
    assert post1 == [2, 0]

    post2 = fam.get_any(["bbb", "bbb"])
    assert post2 == []


def test_fam_array_get_any_a2():
    """get_any on a string array skips keys that are absent from the map."""
    a1 = np.array(("a", "bb", "ccc"))
    a1.flags.writeable = False
    fam = FrozenAutoMap(a1)
    post1 = fam.get_any(np.array(("bbb", "a", "ccc", "aa", "bbb")))
    assert post1 == [0, 2]


def test_fam_array_get_any_a3():
    """get_any on a string array matches the list-based results, including no hits."""
    a1 = np.array(("a", "bb", "ccc"))
    a1.flags.writeable = False
    fam = FrozenAutoMap(a1)
    post1 = fam.get_any(np.array(["bbb", "ccc", "a", "bbb"]))
    assert post1 == [2, 0]

    post2 = fam.get_any(np.array(["bbb", "bbb"]))
    assert post2 == []


def test_fam_array_get_any_b():
    """The largest uint32 value is found as a NumPy scalar, a Python int, and via get_any."""
    a1 = np.array([4294967295], dtype=np.uint32)
    a1.flags.writeable = False
    a1_list = list(a1)
    fam = FrozenAutoMap(a1)
    assert a1[0] in fam
    assert 4294967295 in fam

    post1 = fam.get_any(a1_list)
    assert post1 == list(fam.values())


def test_fam_array_get_any_c1():
    """get_any skips NaT entries and reports repeated datetime64 hits each time."""
    a1 = np.array(("2023-01-05", "1854-05-02"), np.datetime64)
    a1.flags.writeable = False
    fam = FrozenAutoMap(a1)

    post = fam.get_any(
        np.array(
            ["1854-05-02", "nat", "1854-05-02", "2023-01-05", "nat"], np.datetime64
        )
    )
    assert post == [1, 1, 0]


def test_fam_array_get_any_c2():
    """get_any drops a datetime64 key that is not in the map."""
    a1 = np.array(("2023-01-05", "1854-05-02"), np.datetime64)
    a1.flags.writeable = False
    fam = FrozenAutoMap(a1)

    post = fam.get_any(
        np.array(["1854-05-02", "2023-01-05", "2020-01-05"], np.datetime64)
    )
    assert post == [1, 0]


def test_fam_array_get_any_d():
    """get_any finds nothing when full dates are queried with year-month values."""
    a1 = np.array(("2023-01-05", "1854-05-02", "1988-01-01"), np.datetime64)
    a1.flags.writeable = False
    fam = FrozenAutoMap(a1)

    post = fam.get_any(np.array(["2022-01", "2023-01", "1988-01"], np.datetime64))
    assert post == []


# -------------------------------------------------------------------------------
# Patch boundary: everything below belongs to a new file added by the same patch.
#   diff --git a/test/test_auto_map_property.py b/test/test_auto_map_property.py
#   new file mode 100644
#   index 00000000..a95fbeeb
#   --- /dev/null
#   +++ b/test/test_auto_map_property.py
#   @@ -0,0 +1,283 @@
import pickle
import typing as tp
from functools import partial
import sys
import warnings

import numpy as np
import hypothesis
from hypothesis.extra.numpy import arrays
from hypothesis.extra.numpy import scalar_dtypes
from hypothesis import strategies as st
from hypothesis import given

import pytest

from arraykit import AutoMap
from arraykit import FrozenAutoMap
from arraykit import NonUniqueError

# Key type used by the `hypothesis.infer` tests; Hypothesis builds sets from this annotation.
Keys = tp.Set[tp.Union[int, str, float, bool, bytes, tp.Tuple[int, ...]]]

NATIVE_BYTE_ORDER = "<" if sys.byteorder == "little" else ">"
VALID_BYTE_ORDERS = ("=", NATIVE_BYTE_ORDER)


def get_array() -> st.SearchStrategy:
    """
    Labels are suitable for creating non-date Indices (though they might include dates); these labels might force an object array result.

    Returns a strategy that yields read-only 1D arrays of unique scalars, drawn from
    either a contiguous or a strided (non-contiguous) variant.
    """

    def proc(a: np.ndarray, contiguous: bool) -> np.ndarray:
        """Drop NaN/NaT, force native byte order, optionally stride, and freeze the array."""
        if a.dtype.kind in ("f", "c"):
            a = a[~np.isnan(a)]
        elif a.dtype.kind in ("m", "M"):
            a = a[~np.isnat(a)]

        if a.dtype.byteorder not in VALID_BYTE_ORDERS:
            a = a.astype(a.dtype.newbyteorder(NATIVE_BYTE_ORDER))

        if not contiguous:
            # View every other element so the result is not contiguous in memory.
            a = np.lib.stride_tricks.as_strided(
                a,
                shape=(len(a) // 2,),
                strides=(a.dtype.itemsize * 2,),
            )

        a.flags.writeable = False
        return a

    def strategy(contiguous: bool):
        """Build the array strategy and post-process each draw with `proc`."""
        return arrays(
            shape=1, unique=True, fill=st.nothing(), dtype=scalar_dtypes()
        ).map(partial(proc, contiguous=contiguous))

    return st.one_of(
        strategy(contiguous=True),
        strategy(contiguous=False),
    )


# -------------------------------------------------------------------------------
@given(keys=hypothesis.infer)
def test_am___len__(keys: Keys) -> None:
    """len() of an AutoMap equals the number of keys it was built from."""
    assert len(AutoMap(keys)) == len(keys)


@given(keys=get_array())
def test_fam_array___len__(keys: np.ndarray) -> None:
    """len() of an array-backed FrozenAutoMap equals the array length."""
    assert len(FrozenAutoMap(keys)) == len(keys)


# -------------------------------------------------------------------------------
@given(keys=hypothesis.infer, others=hypothesis.infer)
def test_am___contains__(keys: Keys, others: Keys) -> None:
    """Every key is contained; values drawn only in `others` are not."""
    a = AutoMap(keys)
    for key in keys:
        assert key in a
    others -= keys
    for key in others:
        assert key not in a


@given(keys=get_array())
def test_fam_array___contains__(keys: np.ndarray) -> None:
    """Every element of the source array is contained in the FrozenAutoMap."""
    fam = FrozenAutoMap(keys)
    for key in keys:
        assert key in fam


# -------------------------------------------------------------------------------
@given(keys=hypothesis.infer, others=hypothesis.infer)
def test_am___getitem__(keys: Keys, others: Keys) -> None:
    """Keys map to their iteration position; unknown keys raise KeyError."""
    a = AutoMap(keys)
    for index, key in enumerate(keys):
        assert a[key] == index
    others -= keys
    for key in others:
        with pytest.raises(KeyError):
            a[key]


@given(keys=get_array())
def test_fam_array___getitem__(keys: np.ndarray) -> None:
    """Each array element maps to its position in the array."""
    a = FrozenAutoMap(keys)
    for index, key in enumerate(keys):
        assert a[key] == index


# -------------------------------------------------------------------------------
@given(keys=hypothesis.infer)
def test_am___hash__(keys: Keys) -> None:
    """Two FrozenAutoMaps built from the same keys hash equally."""
    assert hash(FrozenAutoMap(keys)) == hash(FrozenAutoMap(keys))


@given(keys=get_array())
def test_fam_array___hash__(keys: np.ndarray) -> None:
    """Two FrozenAutoMaps built from the same array hash equally."""
    assert hash(FrozenAutoMap(keys)) == hash(FrozenAutoMap(keys))


# -------------------------------------------------------------------------------
@given(keys=hypothesis.infer)
def test_am___iter__(keys: Keys) -> None:
    """Iterating an AutoMap yields the keys in their original order."""
    assert [*AutoMap(keys)] == [*keys]


# Uses get_array() like every other test_fam_array_* case, so the array-backed
# iteration path is what is actually exercised here.
@given(keys=get_array())
def test_fam_array___iter__(keys: np.ndarray) -> None:
    """Iterating an array-backed FrozenAutoMap yields the array elements in order."""
    assert [*FrozenAutoMap(keys)] == [*keys]


# -------------------------------------------------------------------------------
@given(keys=hypothesis.infer)
def test_am___reversed__(keys: Keys) -> None:
    """reversed() on an AutoMap yields the keys in reverse order."""
    assert [*reversed(AutoMap(keys))] == [*reversed([*keys])]


@given(keys=get_array())
def test_fam_array___reversed__(keys: np.ndarray) -> None:
    """reversed() on an array-backed FrozenAutoMap yields the elements in reverse order."""
    assert [*reversed(FrozenAutoMap(keys))] == [*reversed([*keys])]


# -------------------------------------------------------------------------------
@given(keys=hypothesis.infer)
def test_am_add(keys: Keys) -> None:
    """add() returns None, grows the map by one, and assigns the next position."""
    a = AutoMap()
    for position, key in enumerate(keys):
        assert a.add(key) is None
        assert len(a) == position + 1
        assert a[key] == position


# -------------------------------------------------------------------------------
@given(keys=hypothesis.infer)
def test_am_pickle(keys: Keys) -> None:
    """An AutoMap survives a pickle round trip whenever its keys do."""
    try:
        # Only keep examples whose keys themselves round-trip through pickle.
        hypothesis.assume(pickle.loads(pickle.dumps(keys)) == keys)
    except (TypeError, pickle.PicklingError):
        hypothesis.assume(False)
    a = AutoMap(keys)
    assert pickle.loads(pickle.dumps(a)) == a


@given(keys=get_array())
def test_fam_array_pickle(keys: np.ndarray) -> None:
    """An array-backed FrozenAutoMap keeps the same keys after a pickle round trip."""
    a = FrozenAutoMap(keys)
    assert list(pickle.loads(pickle.dumps(a))) == list(a)
# -------------------------------------------------------------------------------
@given(keys=hypothesis.infer)
def test_issue_3(keys: Keys) -> None:
    """In-place union accepts a new key once and raises ValueError the second time."""
    hypothesis.assume(keys)
    key = keys.pop()
    a = AutoMap(keys)
    a |= (key,)
    with pytest.raises(ValueError):
        a |= (key,)


@given(keys=hypothesis.infer)
def test_am_non_unique_exception(keys: Keys):
    """Building an AutoMap with a repeated key raises NonUniqueError, also caught as ValueError."""
    hypothesis.assume(keys)
    duplicate = next(iter(keys))

    with pytest.raises(ValueError):
        AutoMap([*keys, duplicate])

    with pytest.raises(NonUniqueError):
        AutoMap([*keys, duplicate])


@given(keys=get_array())
def test_fam_array_non_unique_exception(keys: np.ndarray):
    """Building a FrozenAutoMap with a repeated key raises NonUniqueError, also caught as ValueError."""
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Require at least one element explicitly. Truth-testing the array would
        # also discard a single falsy element (0, False, 0.0), and truth-testing
        # an empty array is deprecated and an error on newer NumPy releases.
        hypothesis.assume(len(keys) > 0)
        duplicate = next(iter(keys))

        with pytest.raises(ValueError):
            FrozenAutoMap([*keys, duplicate])

        with pytest.raises(NonUniqueError):
            FrozenAutoMap([*keys, duplicate])


# -------------------------------------------------------------------------------
@given(keys=get_array())
def test_fam_array_get_all(keys: np.ndarray) -> None:
    """get_all on an array-backed FrozenAutoMap returns every position, for list and array queries."""
    fam = FrozenAutoMap(keys)
    keys_list = list(keys)

    post1 = fam.get_all(keys_list)
    assert list(post1) == list(fam.values())

    post2 = fam.get_all(keys)
    assert list(post2) == list(fam.values())


@given(keys=get_array())
def test_fam_array_get_any(keys: np.ndarray) -> None:
    """get_any on an array-backed FrozenAutoMap returns every position, for list and array queries."""
    fam = FrozenAutoMap(keys)
    keys_list = list(keys)

    post1 = fam.get_any(keys_list)
    assert post1 == list(fam.values())

    post2 = fam.get_any(keys)
    assert post2 == list(fam.values())


# -------------------------------------------------------------------------------


@given(keys=get_array())
def test_am_array_get_all(keys: np.ndarray) -> None:
    """get_all on an AutoMap built from an array returns every position, for list and array queries."""
    fam = AutoMap(keys)
    keys_list = list(keys)

    post1 = fam.get_all(keys_list)
    assert list(post1) == list(fam.values())

    post2 = fam.get_all(keys)
    assert list(post2) == list(fam.values())


@given(keys=get_array())
def test_am_array_get_any(keys: np.ndarray) -> None:
    """get_any on an AutoMap built from an array returns every position, for list and array queries."""
    fam = AutoMap(keys)
    keys_list = list(keys)

    post1 = fam.get_any(keys_list)
    assert post1 == list(fam.values())

    post2 = fam.get_any(keys)
    assert post2 == list(fam.values())


# -------------------------------------------------------------------------------


@given(keys=get_array())
def test_am_get_all(keys: np.ndarray) -> None:
    """get_all on an AutoMap built from a list of the array's elements returns every position."""
    keys_list = list(keys)
    fam = AutoMap(keys_list)

    post1 = fam.get_all(keys_list)
    assert list(post1) == list(fam.values())

    # NOTE(review): this repeats the list lookup above; the array-built variants
    # query with `keys` here instead. Confirm whether the repeat is intended.
    post2 = fam.get_all(keys_list)
    assert list(post2) == list(fam.values())


@given(keys=get_array())
def test_am_get_any(keys: np.ndarray) -> None:
    """get_any on an AutoMap built from a list of the array's elements returns every position."""
    keys_list = list(keys)
    fam = AutoMap(keys_list)

    post1 = fam.get_any(keys_list)
    assert post1 == list(fam.values())

    # NOTE(review): this repeats the list lookup above; the array-built variants
    # query with `keys` here instead. Confirm whether the repeat is intended.
    post2 = fam.get_any(keys_list)
    assert post2 == list(fam.values())


# -------------------------------------------------------------------------------
# Patch boundary: the rest of this chunk is a partial hunk for another file. It only
# shows fragments of definitions, so it is preserved verbatim below (one patch line
# per comment line; indentation was already lost in the source) and not rewritten.
#   diff --git a/test/test_pyi.py b/test/test_pyi.py
#   index 8e39516a..86fc76a8 100644
#   --- a/test/test_pyi.py
#   +++ b/test/test_pyi.py
#   @@ -30,7 +30,7 @@ def from_module(cls, module):
#    continue
#    obj = getattr(module, name)
#    if isinstance(obj, type): # a class
#   - if name == ak.ErrorInitTypeBlocks.__name__:
#   + if name in (ak.ErrorInitTypeBlocks.__name__, ak.NonUniqueError.__name__):
#    # skip as there is Python version variability
#    continue
#    classes[name] = []
#   @@ -58,11 +58,22 @@ def test_interface(self) -> None:
#    spec = spec_from_loader('', loader=None)
#    pyi_mod = module_from_spec(spec)
#   +
#    exec(msg, pyi_mod.__dict__)
#    ak_content = Interface.from_module(ak)
#    pyi_content = Interface.from_module(pyi_mod)
#   - self.assertEqual(ak_content, pyi_content)
#   +
#   + self.assertEqual(ak_content.functions, pyi_content.functions)
#   +
#   + for name in ak_content.classes.keys():
#   + ak_class = ak_content.classes[name]
#   + pyi_class = pyi_content.classes[name]
#   +
#   + if '__hash__' in ak_class: ak_class.remove('__hash__')
#   + if '__hash__' in pyi_class: pyi_class.remove('__hash__')
#   +
#   + self.assertEqual(ak_class, pyi_class)
#
#
#    if __name__ == '__main__':