diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index b947cfb..8f97497 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -2,9 +2,9 @@ name: tests on: push: - branches: [ "master" ] + branches: [ "*" ] pull_request: - branches: [ "master" ] + branches: [ "*" ] jobs: build: @@ -24,7 +24,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip wheel setuptools pytest + python -m pip install --upgrade pip wheel setuptools pytest packaging python -m pip install -e . - name: Test with pytest run: | diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 79cf27f..dd5c71b 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -25,14 +25,14 @@ jobs: python-version: '3.10' - name: Install cibuildwheel - run: python -m pip install --upgrade pip wheel setuptools cibuildwheel + run: python -m pip install --upgrade pip wheel setuptools packaging cibuildwheel - name: Build wheels run: python -m cibuildwheel --output-dir wheelhouse # Options (https://cibuildwheel.readthedocs.io/en/stable/options/) env: CIBW_SKIP: cp36-* cp37-* cp38-* *pp37-* pp38-* *musllinux* - CIBW_BEFORE_BUILD_MACOS: python3 -m pip install --upgrade setuptools wheel cffi + CIBW_BEFORE_BUILD_MACOS: python3 -m pip install --upgrade setuptools wheel cffi packaging CIBW_ARCHS_MACOS: "x86_64 arm64" CIBW_ARCHS_WINDOWS: "AMD64" CIBW_ARCHS_LINUX: "x86_64" diff --git a/.gitignore b/.gitignore index b0fcd31..94e6234 100644 --- a/.gitignore +++ b/.gitignore @@ -60,6 +60,9 @@ console/ # Virtual environments venv p35*/ +p312 +p313 +p314 # Installer logs pip-log.txt diff --git a/MANIFEST.in b/MANIFEST.in index e331ac2..04027fb 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,5 +5,7 @@ prune test exclude src/icegrams/*.o exclude src/icegrams/*.so exclude src/icegrams/*.pyd +exclude src/icegrams/*.DS_Store +exclude 
src/icegrams/resources/*.DS_Store prune src/icegrams/resources include src/icegrams/resources/trigrams.bin diff --git a/README.rst b/README.rst index 9dc7bf7..95044e9 100644 --- a/README.rst +++ b/README.rst @@ -9,7 +9,7 @@ Icegrams: A fast, compact trigram library for Icelandic Overview ******** -**Icegrams** is an MIT-licensed Python 3 (>= 3.7) package that encapsulates a +**Icegrams** is an MIT-licensed Python 3 (>=3.9) package that encapsulates a **large trigram library for Icelandic**. (A trigram is a tuple of three consecutive words or tokens that appear in real-world text.) @@ -319,8 +319,8 @@ to placeholder strings, see the Prerequisites ************* -This package runs on CPython 3.7 or newer, and on PyPy 3.7 or newer. It -has been tested on Linux (gcc on x86-64 and ARMhf), MacOS (clang) and +This package runs on CPython 3.9 or newer, and on PyPy 3.9 or newer. It +has been tested on Linux (gcc on x86-64 and ARMhf), macOS (clang) and Windows (MSVC). If a binary wheel package isn't available on `PyPI `_ @@ -363,6 +363,7 @@ virtualenv), then run:: Changelog ********* +* Version 1.1.3: Minor tweaks. Support for Python 3.13. Now requires Python 3.9+. (2024-08-27) * Version 1.1.2: Minor bug fixes. Cross-platform wheels provided. Now requires Python 3.7+. (2022-12-14) * Version 1.1.0: Python 3.5 support dropped; macOS builds fixed; PyPy wheels generated @@ -375,7 +376,7 @@ Changelog Copyright and licensing *********************** -Icegrams is Copyright © 2024 `Miðeind ehf. `__. +Icegrams is Copyright © 2020-2024 `Miðeind ehf. `__. The original author of this software is *Vilhjálmur Þorsteinsson*. 
This software is licensed under the **MIT License**: diff --git a/test.py b/old/test.py similarity index 100% rename from test.py rename to old/test.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..67595da --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,67 @@ +[project] +name = "icegrams" +version = "1.1.3" +description = "Trigram statistics for Icelandic" +authors = [{ name = "Miðeind ehf.", email = "mideind@mideind.is" }] +maintainers = [{ name = "Miðeind ehf.", email = "mideind@mideind.is" }] +readme = { file = "README.rst", content-type = "text/x-rst" } +license = { text = "MIT" } +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Operating System :: POSIX", + "Operating System :: Unix", + "Operating System :: MacOS", + "Operating System :: Microsoft :: Windows", + "Natural Language :: Icelandic", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Utilities", + "Topic :: Text Processing :: Linguistic", +] +keywords = ["nlp", "trigram", "ngram", "trigrams", "ngrams", "icelandic"] +requires-python = ">=3.9" +dependencies = ["cffi>=1.15.1"] + +[project.urls] +Repository = "https://github.com/mideind/Icegrams" + +[project.optional-dependencies] +# dev dependencies +dev = ["pytest"] + +# *** Configuration of tools *** + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +where = ["src"] + +[tool.pytest.ini_options] 
+filterwarnings = [ + # Ignore deprecation warnings in libraries, their problem not ours + # "ignore::DeprecationWarning", +] + +[tool.ruff] +line-length = 88 + +[tool.black] +line-length = 88 + +[tool.isort] +# This forces these imports to be placed at the top +known_future_library = ["__future__", "typing", "typing_extensions"] +profile = "black" +line_length = 88 diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index 7eb3531..198a7b4 --- a/setup.py +++ b/setup.py @@ -1,115 +1,17 @@ #!/usr/bin/env python3 -""" - - Icegrams: A trigrams library for Icelandic - - setup.py - - Copyright (C) 2024 Miðeind ehf. - Author: Vilhjálmur Þorsteinsson - - This software is licensed under the MIT License: - - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation - files (the "Software"), to deal in the Software without restriction, - including without limitation the rights to use, copy, modify, merge, - publish, distribute, sublicense, and/or sell copies of the Software, - and to permit persons to whom the Software is furnished to do so, - subject to the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - This module sets up the icegrams package. 
It uses the cffi_modules - parameter, available in recent versions of setuptools, to - automatically compile the trie.cpp module to trie.*.so/.pyd - and build the required CFFI Python wrapper via trie_build.py. - - Note that installing under PyPy >= 3.7 is supported. - -""" - -from typing import Any - -import io -import re -import sys from glob import glob -from os.path import basename, dirname, join, splitext +from os.path import basename, splitext from setuptools import find_packages, setup - -if sys.version_info < (3, 7): - print("Icegrams requires Python >= 3.7") - sys.exit(1) - - -def read(*names: Any, **kwargs: Any): - try: - return io.open( - join(dirname(__file__), *names), encoding=kwargs.get("encoding", "utf8") - ).read() - except (IOError, OSError): - return "" - - setup( - name="icegrams", - version="1.1.2", # Also update in src/icegrams/__init__.py - license="MIT", - description="Trigram statistics for Icelandic", - long_description="{0}\n{1}".format( - re.compile("^.. start-badges.*^.. 
end-badges", re.M | re.S).sub( - "", read("README.rst") - ), - re.sub(":[a-z]+:`~?(.*?)`", r"``\1``", read("CHANGELOG.rst")), - ), - author="Miðeind ehf", - author_email="mideind@mideind.is", - url="https://github.com/mideind/Icegrams", packages=find_packages("src"), package_dir={"": "src"}, py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")], package_data={"icegrams": ["py.typed"]}, include_package_data=True, zip_safe=False, - classifiers=[ - # complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", - "Operating System :: Unix", - "Operating System :: POSIX", - "Operating System :: Microsoft :: Windows", - "Operating System :: MacOS", - "Natural Language :: Icelandic", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", - "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: Utilities", - "Topic :: Text Processing :: Linguistic", - ], - keywords=["nlp", "trigram", "ngram", "trigrams", "ngrams", "icelandic"], setup_requires=["cffi>=1.15.1"], install_requires=["cffi>=1.15.1"], cffi_modules=["src/icegrams/trie_build.py:ffibuilder"], diff --git a/src/icegrams/__init__.py b/src/icegrams/__init__.py index 16614ac..8fd9855 100644 --- a/src/icegrams/__init__.py +++ b/src/icegrams/__init__.py @@ -33,10 +33,11 @@ """ -# Expose the icegrams API - -from .ngrams import Ngrams, MAX_ORDER # type: ignore +import importlib.metadata __author__ = "Miðeind ehf." 
-__copyright__ = "(C) 2024 Miðeind ehf." -__version__ = "1.1.2" +__copyright__ = "(C) 2020-2024 Miðeind ehf." +__version__ = importlib.metadata.version("icegrams") + +# Expose the icegrams API +from .ngrams import Ngrams, MAX_ORDER diff --git a/src/icegrams/ngrams.py b/src/icegrams/ngrams.py index 7f54e96..6f8614a 100644 --- a/src/icegrams/ngrams.py +++ b/src/icegrams/ngrams.py @@ -143,7 +143,7 @@ # Import the CFFI wrapper for the trie.cpp C++ module # (see also trie.py and build_trie.py) -elif __name__ == "__main__": +if __name__ == "__main__": # Running as a main program from _trie import lib as trie_cffi, ffi # type: ignore # pylint: disable=import-error from trie import Trie @@ -155,12 +155,10 @@ # Make sure that the trigrams.bin file is # unpacked and ready for use - import pkg_resources + import importlib.resources as importlib_resources - # Note: the resource path below should NOT use os.path.join() - BINARY_FILENAME = pkg_resources.resource_filename( # type: ignore - __name__, "resources/trigrams.bin" - ) + ref = importlib_resources.files("icegrams").joinpath("resources", "trigrams.bin") + BINARY_FILENAME = str(ref) ffi: Any = cast(Any, ffi) # type: ignore trie_cffi: Any = cast(Any, trie_cffi) # type: ignore @@ -384,8 +382,9 @@ def __init__(self, b: Optional[bytes] = None) -> None: # (usually a memoryview() object) self.b = b self.ffi_b: Optional[bytes] = ( - None if b is None else - ffi.cast("uint8_t*", ffi.from_buffer(b)) # type: ignore + None + if b is None + else ffi.cast("uint8_t*", ffi.from_buffer(b)) # type: ignore ) self.n = 0 self.u = 0 @@ -533,7 +532,9 @@ def search(self, p1: int, p2: int, i: int) -> Optional[int]: """Look for i in the range [p1, p2> within the list""" if self.ffi_b is None: raise ValueError("Search not allowed in uncompressed list") - r = cast(int, trie_cffi.searchMonotonic(self.ffi_b, self.QUANTUM_SIZE, p1, p2, i)) + r = cast( + int, trie_cffi.searchMonotonic(self.ffi_b, self.QUANTUM_SIZE, p1, p2, i) + ) return None if r == 
0xFFFFFFFF else r def search_prefix(self, p1: int, p2: int, i: int) -> Optional[int]: @@ -864,8 +865,8 @@ def bigram_frequency(self, i0: Optional[int], i1: Optional[int]) -> int: if i0 is None or i1 is None: return 0 # Check degenerate case - #if not (i0 or i1): - # return 0 + if not (i0 or i1): + return 0 assert self._unigram_ptrs_ml is not None p1, p2 = self._unigram_ptrs_ml.lookup_pair(i0) # Then, look for id i1 within the level 2 ids delimited by [p1, p2> @@ -1000,7 +1001,9 @@ def unigram_succ(self, n: int, i0: Optional[int]) -> List[Tuple[str, float]]: result = sorted(result, key=lambda e: e[1], reverse=True)[0:n] return [(self.id_to_word(j), lp) for j, lp in result] - def bigram_succ(self, n: int, i0: Optional[int], i1: Optional[int]) -> List[Tuple[str, float]]: + def bigram_succ( + self, n: int, i0: Optional[int], i1: Optional[int] + ) -> List[Tuple[str, float]]: """Return successors to the bigram (i0, i1)""" if i0 is None or i1 is None: return [] @@ -1084,9 +1087,7 @@ def read_tsv(self, fname: str, *, add_all_bigrams: bool = False) -> None: vocab[to_bytes(w2)] += 1 # Trie that maps unigrams to integer identifiers using_empty = b"" in vocab - trie: Any = Trie( - reserve_zero_for_empty=using_empty - ) + trie: Any = Trie(reserve_zero_for_empty=using_empty) # Dict to map words to integer ids ids = {b"": 0} if using_empty else {} # Build the trie in decreasing order of occurrences, ensuring that diff --git a/src/icegrams/trie_build.py b/src/icegrams/trie_build.py index 76142b4..66810f9 100644 --- a/src/icegrams/trie_build.py +++ b/src/icegrams/trie_build.py @@ -44,9 +44,9 @@ # change it in setup.py as well ffibuilder = cffi.FFI() -_PATH = os.path.dirname(__file__) or "." 
WINDOWS = platform.system() == "Windows" MACOS = platform.system() == "Darwin" +IMPLEMENTATION = platform.python_implementation() # What follows is the actual Python-wrapped C interface to trie.*.so # It must be kept in sync with trie.h @@ -100,12 +100,18 @@ extra_compile_args = ["/Zc:offsetof-"] elif MACOS: os.environ["CFLAGS"] = "-stdlib=libc++" # Fixes PyPy build on macOS 10.15.6+ + os.environ["MACOSX_DEPLOYMENT_TARGET"] = "10.9" extra_compile_args = ["-mmacosx-version-min=10.7", "-stdlib=libc++"] else: # Adding -O3 to the compiler arguments doesn't seem to make # any discernible difference in lookup speed extra_compile_args = ["-std=c++11"] +# On some systems, the linker needs to be told to use the C++ compiler +# under PyPy due to changes in the default behaviour of distutils. +if IMPLEMENTATION == "PyPy": + os.environ["LDCXXSHARED"] = "c++ -shared" + ffibuilder.set_source( "icegrams._trie", # trie.cpp is written in C++ but must export a pure C interface. @@ -120,4 +126,3 @@ if __name__ == "__main__": ffibuilder.compile(verbose=False) -