From 82510c4d5cd9d0488249fc4149345b5d357febfd Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Mon, 26 Aug 2024 23:15:38 +0000 Subject: [PATCH 1/7] Updated CFFI build config --- .github/workflows/python-package.yml | 4 +- README.md | 404 +++++++++++++++++++++++++++ src/icegrams/trie_build.py | 9 +- 3 files changed, 413 insertions(+), 4 deletions(-) create mode 100644 README.md diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index b947cfb..0058679 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -2,9 +2,9 @@ name: tests on: push: - branches: [ "master" ] + branches: [ "*" ] pull_request: - branches: [ "master" ] + branches: [ "*" ] jobs: build: diff --git a/README.md b/README.md new file mode 100644 index 0000000..c7db1c4 --- /dev/null +++ b/README.md @@ -0,0 +1,404 @@ +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) +[![Python 3.7](https://img.shields.io/badge/python-3.7-blue.svg)](https://www.python.org/downloads/release/python-370/) +![Release](https://shields.io/github/v/release/mideind/Icegrams?display_name=tag) +![PyPI](https://img.shields.io/pypi/v/icegrams) +![Build](https://github.com/mideind/Icegrams/actions/workflows/python-package.yml/badge.svg) + +# Icegrams + +**A fast, compact trigram library for Icelandic** + +## Overview + +**Icegrams** is an MIT-licensed Python 3 (>= 3.7) package that encapsulates a +**large trigram library for Icelandic**. (A trigram is a tuple of +three consecutive words or tokens that appear in real-world text.) + +14 million unique trigrams and their frequency counts are heavily compressed +using radix tries and `quasi-succinct indexes `_ +employing Elias-Fano encoding. This enables the ~43 megabyte compressed trigram file +to be mapped directly into memory, with no *ex ante* decompression, for fast queries +(typically ~10 microseconds per lookup). 
+ +The Icegrams library is implemented in Python and C/C++, glued together via +`CFFI `_. + +The trigram storage approach is based on a +`2017 paper by Pibiri and Venturini `_, +also referring to +`Ottaviano and Venturini `_ +(2014) regarding partitioned Elias-Fano indexes. + +You can use Icegrams to obtain probabilities (relative frequencies) of +over a million different **unigrams** (single words or tokens), or of +**bigrams** (pairs of two words or tokens), or of **trigrams**. You can also +ask it to return the N most likely successors to any unigram or bigram. + +Icegrams is useful for instance in spelling correction, predictive typing, +to help disabled people write text faster, and for various text generation, +statistics and modelling tasks. + +The Icegrams trigram corpus is built from the 2017 edition of the +Icelandic Gigaword Corpus +(`Risamálheild `_), +which is collected and maintained by *The Árni Magnússon Institute* +*for Icelandic Studies*. A mixed, manually vetted subset consisting of 157 +documents from the corpus was used as the source of the token stream, +yielding over 100 million tokens. Trigrams that only occurred +once or twice in the stream were eliminated before creating the +compressed Icegrams database. The creation process is further +`described here `_. 
+ +## Example + +>>> from icegrams import Ngrams +>>> ng = Ngrams() + +>>> # Obtain the frequency of the unigram 'Ísland' + +>>> ng.freq("Ísland") +42018 + +>>> # Obtain the probability of the unigram 'Ísland', as a fraction + +>>> # of the frequency of all unigrams in the database + +>>> ng.prob("Ísland") +0.0003979926900206475 + +>>> # Obtain the log probability (base e) of the unigram 'Ísland' + +>>> ng.logprob("Ísland") +-7.8290769196308005 + +>>> # Obtain the frequency of the bigram 'Katrín Jakobsdóttir' + +>>> ng.freq("Katrín", "Jakobsdóttir") +3517 + +>>> # Obtain the probability of 'Jakobsdóttir' given 'Katrín' + +>>> ng.prob("Katrín", "Jakobsdóttir") +0.23298013245033142 + +>>> # Obtain the probability of 'Júlíusdóttir' given 'Katrín' + +>>> ng.prob("Katrín", "Júlíusdóttir") +0.013642384105960274 + +>>> # Obtain the frequency of 'velta fyrirtækisins er' + +>>> ng.freq("velta", "fyrirtækisins", "er") +4 + +>>> # adj_freq returns adjusted frequencies, i.e. incremented by 1 + +>>> ng.adj_freq("xxx", "yyy", "zzz") +1 + +>>> # Obtain the N most likely successors of a given unigram or bigram + +>>> # in descending order by log probability of each successor + +>>> ng.succ(10, "stjórnarskrá", "lýðveldisins") +[('Íslands', -1.3708244393477589), ('.', -2.2427905461504567), + (',', -3.313814878299737), ('og', -3.4920631097060557), ('sem', -4.566577846795106), + ('er', -4.720728526622363), ('að', -4.807739903611993), ('um', -5.0084105990741445), + ('en', -5.0084105990741445), ('á', -5.25972502735505)] + +## Reference + +### Initializing Icegrams + +After installing the ``icegrams`` package, use the following code to +import it and initialize an instance of the ``Ngrams`` class:: + + from icegrams import Ngrams + ng = Ngrams() + +Now you can use the ``ng`` instance to query for unigram, bigram +and trigram frequencies and probabilities. + +### The Ngrams class + +* ``__init__(self)`` + + Initializes the ``Ngrams`` instance. 
+ +* ``freq(self, *args) -> int`` + + Returns the frequency of a unigram, bigram or trigram. + + * ``str[] *args`` A parameter sequence of consecutive unigrams + to query the frequency for. + * **returns** An integer with the frequency of the unigram, + bigram or trigram. + + To query for the frequency of a unigram in the text, call + ``ng.freq("unigram1")``. This returns the number of times that + the unigram appears in the database. The unigram is + queried as-is, i.e. with no string stripping or lowercasing. + + To query for the frequency of a bigram in the text, call + ``ng.freq("unigram1", "unigram2")``. + + To query for the frequency of a trigram in the text, call + ``ng.freq("unigram1", "unigram2", "unigram3")``. + + If you pass more than 3 arguments to ``ng.freq()``, only the + last 3 are significant, and the query will be treated + as a trigram query. + + Examples:: + + >>>> ng.freq("stjórnarskrá") + 2973 + >>>> ng.freq("stjórnarskrá", "lýðveldisins") + 39 + >>>> ng.freq("stjórnarskrá", "lýðveldisins", "Íslands") + 12 + >>>> ng.freq("xxx", "yyy", "zzz") + 0 + +* ``adj_freq(self, *args) -> int`` + + Returns the adjusted frequency of a unigram, bigram or trigram. + + * ``str[] *args`` A parameter sequence of consecutive unigrams + to query the frequency for. + * **returns** An integer with the adjusted frequency of the unigram, + bigram or trigram. The adjusted frequency is the actual + frequency plus 1. The method thus never returns 0. + + To query for the frequency of a unigram in the text, call + ``ng.adj_freq("unigram1")``. This returns the number of times that + the unigram appears in the database, plus 1. The unigram is + queried as-is, i.e. with no string stripping or lowercasing. + + To query for the frequency of a bigram in the text, call + ``ng.adj_freq("unigram1", "unigram2")``. + + To query for the frequency of a trigram in the text, call + ``ng.adj_freq("unigram1", "unigram2", "unigram3")``. 
+ + If you pass more than 3 arguments to ``ng.adj_freq()``, only the + last 3 are significant, and the query will be treated + as a trigram query. + + Examples:: + + >>>> ng.adj_freq("stjórnarskrá") + 2974 + >>>> ng.adj_freq("stjórnarskrá", "lýðveldisins") + 40 + >>>> ng.adj_freq("stjórnarskrá", "lýðveldisins", "Íslands") + 13 + >>>> ng.adj_freq("xxx", "yyy", "zzz") + 1 + +* ``prob(self, *args) -> float`` + + Returns the probability of a unigram, bigram or trigram. + + * ``str[] *args`` A parameter sequence of consecutive unigrams + to query the probability for. + * **returns** A float with the probability of the given unigram, + bigram or trigram. + + The probability of a *unigram* is + the adjusted frequency of the unigram divided by the sum of the + frequencies of all unigrams in the database. + + The probability of a *bigram* ``(u1, u2)`` is the adjusted frequency + of the bigram divided by the adjusted frequency of the unigram ``u1``, + i.e. how likely ``u2`` is to succeed ``u1``. + + The probability of a trigram ``(u1, u2, u3)`` is the adjusted frequency + of the trigram divided by the adjusted frequency of the bigram ``(u1, u2)``, + i.e. how likely ``u3`` is to succeed ``u1 u2``. + + If you pass more than 3 arguments to ``ng.prob()``, only the + last 3 are significant, and the query will be treated + as a trigram probability query. + + Examples:: + + >>>> ng.prob("stjórnarskrá") + 2.8168929772755334e-05 + >>>> ng.prob("stjórnarskrá", "lýðveldisins") + 0.01344989912575655 + >>>> ng.prob("stjórnarskrá", "lýðveldisins", "Íslands") + 0.325 + +* ``logprob(self, *args) -> float`` + + Returns the log probability of a unigram, bigram or trigram. + + * ``str[] *args`` A parameter sequence of consecutive unigrams + to query the log probability for. + * **returns** A float with the natural logarithm (base *e*) of the + probability of the given unigram, bigram or trigram. 
+ + The probability of a *unigram* is + the adjusted frequency of the unigram divided by the sum of the + frequencies of all unigrams in the database. + + The probability of a *bigram* ``(u1, u2)`` is the adjusted frequency + of the bigram divided by the adjusted frequency of the unigram ``u1``, + i.e. how likely ``u2`` is to succeed ``u1``. + + The probability of a trigram ``(u1, u2, u3)`` is the adjusted frequency + of the trigram divided by the adjusted frequency of the bigram ``(u1, u2)``, + i.e. how likely ``u3`` is to succeed ``u1 u2``. + + If you pass more than 3 arguments to ``ng.logprob()``, only the + last 3 are significant, and the query will be treated + as a trigram probability query. + + Examples:: + + >>>> ng.logprob("stjórnarskrá") + -10.477290968535172 + >>>> ng.logprob("stjórnarskrá", "lýðveldisins") + -4.308783672906165 + >>>> ng.logprob("stjórnarskrá", "lýðveldisins", "Íslands") + -1.1239300966523995 + +* ``succ(self, n, *args) -> list[tuple]`` + + Returns the *N* most probable successors of a unigram or bigram. + + * ``int n`` A positive integer specifying how many successors, + at a maximum, should be returned. + * ``str[] *args`` One or two string parameters containing the + unigram or bigram to query the successors for. + * **returns** A list of tuples of (successor unigram, log probability), + in descending order of probability. + + If you pass more than 2 string arguments to ``ng.succ()``, only the + last 2 are significant, and the query will be treated + as a bigram successor query. 
+ + Examples:: + + >>>> ng.succ(2, "stjórnarskrá") + [('.', -1.8259625296091855), ('landsins', -2.223111581475692)] + >>>> ng.succ(2, "stjórnarskrá", "lýðveldisins") + [('Íslands', -1.1239300966523995), ('og', -1.3862943611198904)] + + >>>> # The following is equivalent to ng.succ(2, "lýðveldisins", "Íslands") + + >>>> ng.succ(2, "stjórnarskrá", "lýðveldisins", "Íslands") + [('.', -1.3862943611198908), (',', -1.6545583477145702)] + +## Notes + +Icegrams is built with a sliding window over the source text. This means that +a sentence such as ``"Maðurinn borðaði ísinn."`` results in the following +trigrams being added to the database:: + + ("", "", "Maðurinn") + ("", "Maðurinn", "borðaði") + ("Maðurinn", "borðaði", "ísinn") + ("borðaði", "ísinn", ".") + ("ísinn", ".", "") + (".", "", "") + +The same sliding window strategy is applied for bigrams, so the following +bigrams would be recorded for the same sentence:: + + ("", "Maðurinn") + ("Maðurinn", "borðaði") + ("borðaði", "ísinn") + ("ísinn", ".") + (".", "") + +You can thus obtain the N unigrams that most often start +a sentence by asking for ``ng.succ(N, "")``. + +And, of course, four unigrams are also added, one for each token in the +sentence. + +The tokenization of the source text into unigrams is done with the +`Tokenizer package `_ and +uses the rules documented there. Importantly, tokens other than words, +abbreviations, entity names, person names and punctuation are +**replaced by placeholders**. This means that all numbers are represented by the token +``[NUMBER]``, amounts by ``[AMOUNT]``, dates by ``[DATEABS]`` and ``[DATEREL]``, +e-mail addresses by ``[EMAIL]``, etc. For the complete mapping of token types +to placeholder strings, see the +`documentation for the Tokenizer package `_. + +## Prerequisites + +This package runs on CPython 3.7 or newer, and on PyPy 3.7 or newer. It +has been tested on Linux (gcc on x86-64 and ARMhf), macOS (clang) and +Windows (MSVC). 
+ +If a binary wheel package isn't available on `PyPI `_ +for your system, you may need to have the ``python3-dev`` package +(or its Windows equivalent) installed on your system to set up +Icegrams successfully. This is because a source distribution +install requires a C++ compiler and linker:: + + # Debian or Ubuntu: + sudo apt-get install python3-dev + +## Installation + +To install this package:: + + pip install icegrams + +If you want to be able to edit the source, do the following (assuming you have **git** installed):: + + git clone https://github.com/mideind/Icegrams + cd Icegrams + # [ Activate your virtualenv here if you have one ] + python setup.py develop + +The package source code is now in ``./src/icegrams``. + +## Tests + +To run the built-in tests, install `pytest `_, +``cd`` to your ``Icegrams`` subdirectory (and optionally activate your +virtualenv), then run:: + + python -m pytest + +## Version History + +* Version 1.1.2: Minor bug fixes. Cross-platform wheels provided. Now requires Python 3.7+. (2022-12-14) +* Version 1.1.0: Python 3.5 support dropped; macOS builds fixed; PyPy wheels + generated +* Version 1.0.0: New trigram database sourced from the Icelandic Gigaword Corpus + (Risamálheild) with improved tokenization. Replaced GNU GPLv3 with MIT license. +* Version 0.6.0: Python type annotations added +* Version 0.5.0: Trigrams corpus has been spell-checked + +## License + +Icegrams is Copyright © 2022 [Miðeind ehf.](https://mideind.is) +The original author of this software is *Vilhjálmur Þorsteinsson*. 
+ +This software is licensed under the **MIT License**: + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without restriction, +including without limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of the Software, +and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/src/icegrams/trie_build.py b/src/icegrams/trie_build.py index 50d889e..7285993 100644 --- a/src/icegrams/trie_build.py +++ b/src/icegrams/trie_build.py @@ -44,9 +44,9 @@ # change it in setup.py as well ffibuilder = cffi.FFI() -_PATH = os.path.dirname(__file__) or "." 
WINDOWS = platform.system() == "Windows" MACOS = platform.system() == "Darwin" +IMPLEMENTATION = platform.python_implementation() # What follows is the actual Python-wrapped C interface to trie.*.so # It must be kept in sync with trie.h @@ -100,12 +100,18 @@ extra_compile_args = ["/Zc:offsetof-"] elif MACOS: os.environ["CFLAGS"] = "-stdlib=libc++" # Fixes PyPy build on macOS 10.15.6+ + os.environ["MACOSX_DEPLOYMENT_TARGET"] = "10.9" extra_compile_args = ["-mmacosx-version-min=10.7", "-stdlib=libc++"] else: # Adding -O3 to the compiler arguments doesn't seem to make # any discernible difference in lookup speed extra_compile_args = ["-std=c++11"] +# On some systems, the linker needs to be told to use the C++ compiler +# under PyPy due to changes in the default behaviour of distutils. +if IMPLEMENTATION == "PyPy": + os.environ["LDCXXSHARED"] = "c++ -shared" + ffibuilder.set_source( "icegrams._trie", # trie.cpp is written in C++ but must export a pure C interface. @@ -120,4 +126,3 @@ if __name__ == "__main__": ffibuilder.compile(verbose=False) - From 7be5386dfb40e2fcd3be9a140df75256de07e8a8 Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Tue, 27 Aug 2024 13:58:26 +0000 Subject: [PATCH 2/7] Migrated from pkg_resources over to importlib + use importlib for version export --- src/icegrams/__init__.py | 11 +- src/icegrams/ngrams.py | 436 ++++++++++++++++++++------------------- 2 files changed, 231 insertions(+), 216 deletions(-) diff --git a/src/icegrams/__init__.py b/src/icegrams/__init__.py index e35e796..9697f0b 100644 --- a/src/icegrams/__init__.py +++ b/src/icegrams/__init__.py @@ -33,10 +33,11 @@ """ -# Expose the icegrams API - -from .ngrams import Ngrams, MAX_ORDER +import importlib.metadata __author__ = "Miðeind ehf." -__copyright__ = "(C) 2020 Miðeind ehf." -__version__ = "1.1.2" +__copyright__ = "(C) 2020-2024 Miðeind ehf." 
+__version__ = importlib.metadata.version("icegrams") + +# Expose the icegrams API +from .ngrams import Ngrams, MAX_ORDER diff --git a/src/icegrams/ngrams.py b/src/icegrams/ngrams.py index d910cd0..5011d59 100644 --- a/src/icegrams/ngrams.py +++ b/src/icegrams/ngrams.py @@ -110,8 +110,18 @@ """ from typing import ( - List, Dict, Tuple, Set, Sized, Iterable, Optional, Any, Callable, IO, - cast, TYPE_CHECKING + List, + Dict, + Tuple, + Set, + Sized, + Iterable, + Optional, + Any, + Callable, + IO, + cast, + TYPE_CHECKING, ) import time from collections import defaultdict @@ -137,15 +147,20 @@ # Running as a main program from _trie import lib as trie_cffi, ffi # type: ignore # pylint: disable=import-error from trie import Trie # type: ignore + BINARY_FILENAME = os.path.join(_PATH, "resources", "trigrams.bin") else: # Imported as a package from ._trie import lib as trie_cffi, ffi # type: ignore # pylint: disable=import-error,no-name-in-module + # Make sure that the trigrams.bin file is # unpacked and ready for use - import pkg_resources + import importlib.resources as importlib_resources + + ref = importlib_resources.files("icegrams").joinpath("resources/trigrams.bin") + # Note: the resource path below should NOT use os.path.join() - BINARY_FILENAME = pkg_resources.resource_filename(__name__, "resources/trigrams.bin") + BINARY_FILENAME = str(ref) UINT32 = struct.Struct(" bytes: - """ Convert string from normal Python representation to - a bytes string containing indices into the alphabet. - The indices are offset by 1 since 0 is not a valid - byte value. """ + """Convert string from normal Python representation to + a bytes string containing indices into the alphabet. + The indices are offset by 1 since 0 is not a valid + byte value.""" return bytes(ALPHABET.index(ch) + 1 for ch in s) def to_str(by: Iterable[int]) -> str: - """ Convert a sequence of byte indices into a normal Python string. 
- The byte indices are decremented by 1 before the conversion, - since 0 is not a valid byte index. """ + """Convert a sequence of byte indices into a normal Python string. + The byte indices are decremented by 1 before the conversion, + since 0 is not a valid byte index.""" return "".join(ALPHABET[b - 1] for b in by) class Ngrams: - - """ A wrapper class around the n-gram store, allowing - queries for n-gram frequencies and probabilities. - The current n-gram store contains unigrams, bigrams and - trigrams. """ + """A wrapper class around the n-gram store, allowing + queries for n-gram frequencies and probabilities. + The current n-gram store contains unigrams, bigrams and + trigrams.""" def __init__(self) -> None: self.ngrams = NgramStorage() self.ngrams.load(BINARY_FILENAME) def __contains__(self, word: str) -> bool: - """ Return True if the word exists as a unigram """ + """Return True if the word exists as a unigram""" return bool(word) and (self.ngrams.word_to_id(word) is not None) def freq(self, *args: str) -> int: - """ Return the frequency of the n-gram given in *args, where - 1 <= n <= 3 """ + """Return the frequency of the n-gram given in *args, where + 1 <= n <= 3""" if not args: raise ValueError("Must provide at least one string argument") return self.ngrams.freq(*args) def adj_freq(self, *args: str) -> int: - """ Return the frequency of the n-gram given in *args, where - 1 <= n <= 3. The frequency is adjusted so that n-grams - that do not occur in the database have frequency 1, and all - others have their actual frequency incremented by one. """ + """Return the frequency of the n-gram given in *args, where + 1 <= n <= 3. 
The frequency is adjusted so that n-grams + that do not occur in the database have frequency 1, and all + others have their actual frequency incremented by one.""" if not args: raise ValueError("Must provide at least one string argument") return self.ngrams.freq(*args) + 1 def logprob(self, *args: str) -> float: - """ Return the log of the approximate probability - of word w(n) given its predecessors w(1)..w(n-1), - for 1 <= n <= 3 (i.e. unigram, bigram or trigram) """ + """Return the log of the approximate probability + of word w(n) given its predecessors w(1)..w(n-1), + for 1 <= n <= 3 (i.e. unigram, bigram or trigram)""" if not args: raise ValueError("Must provide at least one string argument") return self.ngrams.logprob(*args) def prob(self, *args: str) -> float: - """ Return the approximate probability (in the range (0.0..1.0], - note that it is never zero) of word w(n) given its - predecessors w(1)..w(n-1), for 1 <= n <= 3 (i.e. unigram, - bigram or trigram) """ + """Return the approximate probability (in the range (0.0..1.0], + note that it is never zero) of word w(n) given its + predecessors w(1)..w(n-1), for 1 <= n <= 3 (i.e. unigram, + bigram or trigram)""" if not args: raise ValueError("Must provide at least one string argument") return math.exp(self.logprob(*args)) def succ(self, n, *args: str) -> List[Tuple[str, float]]: - """ Returns a sorted list of length <= n with the most likely - successors to the words given, in descending order of - log probability. The list consists of tuples of - (word, log probability). """ + """Returns a sorted list of length <= n with the most likely + successors to the words given, in descending order of + log probability. 
The list consists of tuples of + (word, log probability).""" if not isinstance(n, int) or n < 1: raise TypeError("Expected positive integer for parameter n") if not args: @@ -239,18 +253,17 @@ def succ(self, n, *args: str) -> List[Tuple[str, float]]: return self.ngrams.succ(n, *args) def close(self) -> None: - """ Close the underlying storage and its memory map """ + """Close the underlying storage and its memory map""" self.ngrams.close() self.ngrams = None # type: ignore class BitArray: - - """ BitArray implements a compressed array of bits. - Bits are indexed starting from the least significant - bit of each byte. Bit 0 is thus the lowest bit of - the first byte of the array and bit 7 is the highest - bit of that byte. """ + """BitArray implements a compressed array of bits. + Bits are indexed starting from the least significant + bit of each byte. Bit 0 is thus the lowest bit of + the first byte of the array and bit 7 is the highest + bit of that byte.""" def __init__(self) -> None: # Accumulator for completed bytes @@ -263,13 +276,13 @@ def __init__(self) -> None: self.length = None # type: Optional[int] def num_bits(self) -> int: - """ Return the total number of bits written to the byte array """ + """Return the total number of bits written to the byte array""" return len(self.b) * 8 + self.bits def append(self, val: int, bits: int) -> None: - """ Append the given value to the BitArray, using the indicated - number of bits. The value is masked by this function before - adding it to the array. """ + """Append the given value to the BitArray, using the indicated + number of bits. 
The value is masked by this function before + adding it to the array.""" assert self.length is None if bits <= 0: raise ValueError("Bits parameter must be > 0") @@ -285,8 +298,8 @@ def append(self, val: int, bits: int) -> None: self.bits -= 8 def finish(self) -> None: - """ Optionally call this to complete writing any still - buffered bits to the byte array """ + """Optionally call this to complete writing any still + buffered bits to the byte array""" assert self.length is None self.length = len(self.b) * 8 + self.bits if self.bits: @@ -297,8 +310,8 @@ def finish(self) -> None: self.bits = 0 def get(self, index: int, bits: int) -> int: - """ Obtain the value stored at the given bit index, using - the indicated number of bits """ + """Obtain the value stored at the given bit index, using + the indicated number of bits""" if bits <= 0: raise ValueError("Bits parameter must be > 0") # Finish writing to the byte buffer @@ -331,40 +344,39 @@ def get(self, index: int, bits: int) -> int: return buf & ((1 << bits) - 1) def to_bytes(self) -> bytes: - """ Finish the byte array and return it as a bytes object """ + """Finish the byte array and return it as a bytes object""" if self.length is None: self.finish() return bytes(self.b) def __len__(self) -> int: - """ Return the length of this BitArray, in bytes """ + """Return the length of this BitArray, in bytes""" return len(self.b) + (1 if self.bits else 0) class BaseList: def lookup(self, ix: int) -> int: - """ Should always be overridden in derived classes """ + """Should always be overridden in derived classes""" raise NotImplementedError def __getitem__(self, ix: int) -> int: - """ Returns the integer at index ix within the sequence """ + """Returns the integer at index ix within the sequence""" return self.lookup(ix) def lookup_pair(self, ix: int) -> Tuple[int, int]: - """ Return the pair of values at [ix] and [ix+1] """ + """Return the pair of values at [ix] and [ix+1]""" raise NotImplementedError class 
MonotonicList(BaseList): - - """ A MonotonicList stores a presorted, monotonically increasing - list of integers in a compact byte buffer using Elias-Fano - encoding. """ + """A MonotonicList stores a presorted, monotonically increasing + list of integers in a compact byte buffer using Elias-Fano + encoding.""" QUANTUM_SIZE = 128 - def __init__(self, b: Optional[bytes]=None) -> None: + def __init__(self, b: Optional[bytes] = None) -> None: # If b is given, it should be a byte buffer of some sort # (usually a memoryview() object) self.b = b @@ -374,13 +386,15 @@ def __init__(self, b: Optional[bytes]=None) -> None: self.low_bits = 0 self.high_bits = 0 - def compress(self, int_list: List[int], vocab_size: Optional[int]=None) -> None: - """ Compress a presorted, monotonically increasing list of integers - in int_list, all of them <= vocab_size (if given), to a bytes() object - and return it """ + def compress(self, int_list: List[int], vocab_size: Optional[int] = None) -> None: + """Compress a presorted, monotonically increasing list of integers + in int_list, all of them <= vocab_size (if given), to a bytes() object + and return it""" self.n = n = len(int_list) - if n == 0 or n >= 2 ** 32: - raise ValueError("List must have more than zero and less than 2**32 elements") + if n == 0 or n >= 2**32: + raise ValueError( + "List must have more than zero and less than 2**32 elements" + ) # Get vocabulary size if vocab_size is None: @@ -464,9 +478,10 @@ def compress(self, int_list: List[int], vocab_size: Optional[int]=None) -> None: # of low and high bits, which is all we need for decompression parts = [ UINT32.pack(self.n), - UINT16.pack(low_bits), UINT16.pack(high_bits), + UINT16.pack(low_bits), + UINT16.pack(high_bits), bytes(hbuf_index), - bytes(buf + hbuf) + bytes(buf + hbuf), ] # Align the byte block to a DWORD (32-bit) boundary frag = sum(len(p) for p in parts) & 3 @@ -476,51 +491,47 @@ def compress(self, int_list: List[int], vocab_size: Optional[int]=None) -> None: 
self.ffi_b = ffi.cast("uint8_t*", ffi.from_buffer(self.b)) def to_bytes(self) -> bytes: - """ Return a bytes object containing the compressed list """ + """Return a bytes object containing the compressed list""" assert self.b is not None return self.b def __str__(self) -> str: s = "MonotonicList: u is {0:,}, n is {1:,}\n".format(self.u, self.n) - s += ( - "low_bits is {0}, high_bits is {1}, total range {2:,}\n" - .format(self.low_bits, self.high_bits, 2**(self.low_bits + self.high_bits) - 1) + s += "low_bits is {0}, high_bits is {1}, total range {2:,}\n".format( + self.low_bits, self.high_bits, 2 ** (self.low_bits + self.high_bits) - 1 ) - s += ( - "size in bytes is {0:,} instead of straightforward {1:,}" - .format( - 0 if self.b is None else len(self.b), - (self.n * int(math.log(self.u, 2) + 1.0) + 7) // 8 - ) + s += "size in bytes is {0:,} instead of straightforward {1:,}".format( + 0 if self.b is None else len(self.b), + (self.n * int(math.log(self.u, 2) + 1.0) + 7) // 8, ) return s def __len__(self) -> int: - """ Return the number of elements in the list """ + """Return the number of elements in the list""" return self.n def lookup(self, ix: int) -> int: - """ Returns the integer at index ix within the sequence """ + """Returns the integer at index ix within the sequence""" if self.ffi_b is None: raise ValueError("Lookup not allowed from uncompressed list") return trie_cffi.lookupMonotonic(self.ffi_b, self.QUANTUM_SIZE, ix) - def lookup_pair(self, ix:int) -> Tuple[int, int]: - """ Return the pair of values at [ix] and [ix+1] """ + def lookup_pair(self, ix: int) -> Tuple[int, int]: + """Return the pair of values at [ix] and [ix+1]""" p1 = ffi.new("uint64_t*") p2 = ffi.new("uint64_t*") trie_cffi.lookupPairMonotonic(self.ffi_b, self.QUANTUM_SIZE, ix, p1, p2) return p1[0], p2[0] def search(self, p1: int, p2: int, i: int) -> Optional[int]: - """ Look for i in the range [p1, p2> within the list """ + """Look for i in the range [p1, p2> within the list""" if 
self.ffi_b is None: raise ValueError("Search not allowed in uncompressed list") r = trie_cffi.searchMonotonic(self.ffi_b, self.QUANTUM_SIZE, p1, p2, i) return None if r == 0xFFFFFFFF else r def search_prefix(self, p1: int, p2: int, i: int) -> Optional[int]: - """ Look for i in the range [p1, p2> within the list """ + """Look for i in the range [p1, p2> within the list""" if self.ffi_b is None: raise ValueError("Search not allowed in uncompressed list") r = trie_cffi.searchMonotonicPrefix(self.ffi_b, self.QUANTUM_SIZE, p1, p2, i) @@ -528,26 +539,25 @@ def search_prefix(self, p1: int, p2: int, i: int) -> Optional[int]: class PartitionedMonotonicList(BaseList): - - """ A PartitionedMonotonicList consists of a list - of Elias-Fano lists, with the trick being that - each sublist is encoded with its own item - sequence, after subtracting the value of the - first item of the list (which is stored in - the first level list). """ + """A PartitionedMonotonicList consists of a list + of Elias-Fano lists, with the trick being that + each sublist is encoded with its own item + sequence, after subtracting the value of the + first item of the list (which is stored in + the first level list).""" QUANTUM_SIZE = 1 << 11 - def __init__(self, b: Optional[bytes]=None): + def __init__(self, b: Optional[bytes] = None): self.b = b self.ffi_b = None if b is None else ffi.cast("uint8_t*", ffi.from_buffer(b)) def compress(self, int_list: List[int]) -> None: - """ Compress int_list into a two-level partitioned - Elias-Fano list, where the lower level consists - of sublists of length <= QUANTUM_SIZE, and the - upper level consists of a list of the values of - the first items of the sublists. 
""" + """Compress int_list into a two-level partitioned + Elias-Fano list, where the lower level consists + of sublists of length <= QUANTUM_SIZE, and the + upper level consists of a list of the values of + the first items of the sublists.""" # The upper level list chunks = [] @@ -611,7 +621,7 @@ def compress(self, int_list: List[int]) -> None: UINT32.pack(len(chunk_index)), b"".join(UINT32.pack(pos + offset) for pos in chunk_index), chunk_bytes, - merged_buf + merged_buf, ] # Align the byte block to a DWORD (32-bit) boundary frag = sum(len(p) for p in parts) & 3 @@ -621,16 +631,16 @@ def compress(self, int_list: List[int]) -> None: self.ffi_b = ffi.cast("uint8_t*", ffi.from_buffer(self.b)) def to_bytes(self) -> bytes: - """ Return the byte buffer containing the compressed list """ + """Return the byte buffer containing the compressed list""" assert self.b is not None return self.b def __len__(self) -> int: - """ Return the compressed list size in bytes """ + """Return the compressed list size in bytes""" return 0 if self.b is None else len(self.b) def lookup(self, ix: int) -> int: - """ Lookup a value from the compressed list, by index """ + """Lookup a value from the compressed list, by index""" if self.ffi_b is None: raise ValueError("Lookup not allowed from uncompressed list") return trie_cffi.lookupPartition( @@ -638,7 +648,7 @@ def lookup(self, ix: int) -> int: ) def lookup_pair(self, ix: int) -> Tuple[int, int]: - """ Return the pair of values at [ix] and [ix+1] """ + """Return the pair of values at [ix] and [ix+1]""" p1 = ffi.new("uint64_t*") p2 = ffi.new("uint64_t*") trie_cffi.lookupPairPartition( @@ -650,8 +660,7 @@ def search(self, p1: int, p2: int, i: int) -> Optional[int]: if self.ffi_b is None: raise ValueError("Search not allowed in uncompressed list") r = trie_cffi.searchPartition( - self.ffi_b, self.QUANTUM_SIZE, MonotonicList.QUANTUM_SIZE, - p1, p2, i + self.ffi_b, self.QUANTUM_SIZE, MonotonicList.QUANTUM_SIZE, p1, p2, i ) return None if r == 
0xFFFFFFFF else r @@ -659,15 +668,13 @@ def search_prefix(self, p1: int, p2: int, i: int) -> Optional[int]: if self.ffi_b is None: raise ValueError("Search not allowed in uncompressed list") r = trie_cffi.searchPartitionPrefix( - self.ffi_b, self.QUANTUM_SIZE, MonotonicList.QUANTUM_SIZE, - p1, p2, i + self.ffi_b, self.QUANTUM_SIZE, MonotonicList.QUANTUM_SIZE, p1, p2, i ) return None if r == 0xFFFFFFFF else r class _Level: - - """ A level within a trigram tree structure """ + """A level within a trigram tree structure""" __slots__ = ("cnt", "d") @@ -686,9 +693,8 @@ def reset(self, depth: int) -> None: class NgramStorage: - - """ NgramStorage wraps the compressed binary representation of - the trigram store """ + """NgramStorage wraps the compressed binary representation of + the trigram store""" # We store an index position in the frequency array once # every FREQ_QUANTUM_SIZE frequency values @@ -703,7 +709,7 @@ class NgramStorage: # this makes lookup faster for the most-used words. VOCAB_INDEX_CUTOFF = 1024 - VERSION = b'Reynir 001.00.00' + VERSION = b"Reynir 001.00.00" assert len(VERSION) == 16 # Note that the trie offset must be the first header @@ -746,28 +752,26 @@ def __init__(self) -> None: self._trigram_freqs = bytes() self._vocab = bytes() - def compress( - self, tsv_filename: str, binary_filename: str, *, add_all_bigrams: bool=False + self, tsv_filename: str, binary_filename: str, *, add_all_bigrams: bool = False ): - """ Create a new compressed binary file from a trigram text (.tsv) file. - If add_all_bigrams is True, then for each input trigram (w0, w1, w2) - we add both (w0, w1) and (w1, w2) as bigrams. Otherwise, we add only - (w0, w1) - and assume that (w1, w2, w3) is also present as a trigram - causing (w1, w2) to be implicitly added. """ + """Create a new compressed binary file from a trigram text (.tsv) file. + If add_all_bigrams is True, then for each input trigram (w0, w1, w2) + we add both (w0, w1) and (w1, w2) as bigrams. 
Otherwise, we add only + (w0, w1) - and assume that (w1, w2, w3) is also present as a trigram + causing (w1, w2) to be implicitly added.""" self.read_tsv(tsv_filename, add_all_bigrams=add_all_bigrams) self.write_binary(binary_filename) def word_to_id(self, word: str) -> Optional[int]: - """ Obtain the unigram id for the given word by - calling the C++ mapping() function from - trie.cpp that has been wrapped using CFFI """ + """Obtain the unigram id for the given word by + calling the C++ mapping() function from + trie.cpp that has been wrapped using CFFI""" if word == "": return 0 try: m = trie_cffi.mapping( - ffi.cast("uint8_t*", self._mmap_buffer), - to_bytes(word) + ffi.cast("uint8_t*", self._mmap_buffer), to_bytes(word) ) except ValueError: # The word contains a character that is not in our alphabet @@ -775,7 +779,7 @@ def word_to_id(self, word: str) -> Optional[int]: return None if m == 0xFFFFFFFF else m def id_to_word(self, n: int) -> str: - """ Convert a vocabulary index back to the original unigram text """ + """Convert a vocabulary index back to the original unigram text""" if n < self.VOCAB_INDEX_CUTOFF: # For low ids, we have an index entry for every id q, r = n, 0 @@ -805,13 +809,13 @@ def id_to_word(self, n: int) -> str: return to_str(self._compressed_vocab[start:end]) def indices(self, *args: str) -> Tuple[Optional[int], ...]: - """ Convert word strings to vocabulary indices, or None - if the word is not found in the vocabulary """ + """Convert word strings to vocabulary indices, or None + if the word is not found in the vocabulary""" return tuple(self.word_to_id(w) for w in args) def lookup_frequency(self, level: int, b: bytes, index: Optional[int]) -> int: - """ Look up the frequency with the given index, - stored in the byte buffer b """ + """Look up the frequency with the given index, + stored in the byte buffer b""" if index is None: return 0 buf = ffi.from_buffer(b) @@ -823,19 +827,19 @@ def lookup_frequency(self, level: int, b: bytes, index: 
Optional[int]) -> int: return self.freqs[level][rank] def unigram_frequency(self, i0: Optional[int]) -> int: - """ Return the frequency of the unigram i0, - specified as a vocabulary index. """ + """Return the frequency of the unigram i0, + specified as a vocabulary index.""" return self.lookup_frequency(1, self._unigram_freqs, i0) def unigram_logprob(self, i0: Optional[int]) -> float: - """ Return the log of the probability of the unigram - given by vocabulary index i0, relative to the entire - unigram frequency count """ + """Return the log of the probability of the unigram + given by vocabulary index i0, relative to the entire + unigram frequency count""" return math.log(self.unigram_frequency(i0) + 1) - self.log_ucnt def bigram_frequency(self, i0: Optional[int], i1: Optional[int]) -> int: - """ Return the frequency of the bigram (i0, i1), - given as vocabulary indices. """ + """Return the frequency of the bigram (i0, i1), + given as vocabulary indices.""" # Look up the pointer range for i0 in the unigram pointers if i0 is None or i1 is None: return 0 @@ -848,19 +852,18 @@ def bigram_frequency(self, i0: Optional[int], i1: Optional[int]) -> int: return self.lookup_frequency(2, self._bigram_freqs, i) def bigram_logprob(self, i0: Optional[int], i1: Optional[int]) -> float: - """ Return the log of the probability of the bigram - consisting of vocabulary indices i0 and i1, - relative to the unigram frequency of i0 """ - return ( - math.log(self.bigram_frequency(i0, i1) + 1) - - math.log(self.unigram_frequency(i0) + 1) + """Return the log of the probability of the bigram + consisting of vocabulary indices i0 and i1, + relative to the unigram frequency of i0""" + return math.log(self.bigram_frequency(i0, i1) + 1) - math.log( + self.unigram_frequency(i0) + 1 ) def trigram_frequency( self, i0: Optional[int], i1: Optional[int], i2: Optional[int] ) -> int: - """ Return the frequency of the trigram (i0, i1, i2), - given as vocabulary indices. 
""" + """Return the frequency of the trigram (i0, i1, i2), + given as vocabulary indices.""" # Look up the pointer range for i0 in the unigram pointers if i0 is None or i1 is None or i2 is None: return 0 @@ -897,23 +900,24 @@ def trigram_frequency( def trigram_logprob( self, i0: Optional[int], i1: Optional[int], i2: Optional[int] ) -> float: - """ Return the log of the probability of the trigram - consisting of vocabulary indices i0, i1 and i2, - relative to the bigram of i0 and i1 """ - return ( - math.log(self.trigram_frequency(i0, i1, i2) + 1) - - math.log(self.bigram_frequency(i0, i1) + 1) + """Return the log of the probability of the trigram + consisting of vocabulary indices i0, i1 and i2, + relative to the bigram of i0 and i1""" + return math.log(self.trigram_frequency(i0, i1, i2) + 1) - math.log( + self.bigram_frequency(i0, i1) + 1 ) _FREQ_DISPATCH = { - 1: unigram_frequency, 2: bigram_frequency, 3: trigram_frequency + 1: unigram_frequency, + 2: bigram_frequency, + 3: trigram_frequency, } # type: Dict[int, Callable[..., int]] def freq(self, *args: str) -> int: - """ Return the frequency of the n-gram given in *args, where - 1 <= n <= 3. The frequency is adjusted so that n-grams - that do not occur in the database have frequency 1, and all - others have their actual frequency incremented by one. """ + """Return the frequency of the n-gram given in *args, where + 1 <= n <= 3. 
The frequency is adjusted so that n-grams + that do not occur in the database have frequency 1, and all + others have their actual frequency incremented by one.""" if len(args) > MAX_ORDER: # Allow more than 3 arguments, but then we only return the # trigram probability of the last 3 @@ -921,13 +925,15 @@ def freq(self, *args: str) -> int: return self._FREQ_DISPATCH[len(args)](self, *self.indices(*args)) _PROB_DISPATCH = { - 1: unigram_logprob, 2: bigram_logprob, 3: trigram_logprob + 1: unigram_logprob, + 2: bigram_logprob, + 3: trigram_logprob, } # type: Dict[int, Callable[..., float]] def logprob(self, *args: str) -> float: - """ Return the log of the approximate probability - of word w(n) given its predecessors w(1)..w(n-1), - for 1 <= n <= 3 (i.e. unigram, bigram or trigram) """ + """Return the log of the approximate probability + of word w(n) given its predecessors w(1)..w(n-1), + for 1 <= n <= 3 (i.e. unigram, bigram or trigram)""" if len(args) > MAX_ORDER: # Allow more than 3 arguments, but then we only return the # trigram probability of the last 3 @@ -935,7 +941,7 @@ def logprob(self, *args: str) -> float: return self._PROB_DISPATCH[len(args)](self, *self.indices(*args)) def unigram_succ(self, n: int, i0: int) -> List[Tuple[str, float]]: - """ Return successors to the unigram whose id is in i0 """ + """Return successors to the unigram whose id is in i0""" if i0 is None: return [] p1, p2 = self._unigram_ptrs_ml.lookup_pair(i0) @@ -948,11 +954,11 @@ def unigram_succ(self, n: int, i0: int) -> List[Tuple[str, float]]: j = self._bigram_pl.lookup(i) - prefix_sum lpi = math.log(self.lookup_frequency(2, self._bigram_freqs, i) + 1) result.append((j, lpi - lp0)) - result = sorted(result, key=lambda e:e[1], reverse=True)[0:n] + result = sorted(result, key=lambda e: e[1], reverse=True)[0:n] return [(self.id_to_word(j), lp) for j, lp in result] def bigram_succ(self, n: int, i0: int, i1: int) -> List[Tuple[str, float]]: - """ Return successors to the bigram (i0, i1) """ 
+ """Return successors to the bigram (i0, i1)""" if i0 is None or i1 is None: return [] p1, p2 = self._unigram_ptrs_ml.lookup_pair(i0) @@ -979,24 +985,25 @@ def bigram_succ(self, n: int, i0: int, i1: int) -> List[Tuple[str, float]]: j = self._bigram_pl.lookup(q1 + remapped_id) - prefix_sum_bi lpi = math.log(self.lookup_frequency(3, self._trigram_freqs, i) + 1) result.append((j, lpi - lp0)) - result = sorted(result, key=lambda e:e[1], reverse=True)[0:n] + result = sorted(result, key=lambda e: e[1], reverse=True)[0:n] return [(self.id_to_word(j), lp) for j, lp in result] _SUCC_DISPATCH = { - 1: unigram_succ, 2: bigram_succ + 1: unigram_succ, + 2: bigram_succ, } # type: Dict[int, Callable[..., List[Tuple[str, float]]]] def succ(self, n: int, *args: str) -> List[Tuple[str, float]]: - """ Return a list of likely successors to the words - in *args, of length <= n. The list consists of - tuples of (word, log probability), in descending - order of log probability. """ + """Return a list of likely successors to the words + in *args, of length <= n. 
The list consists of + tuples of (word, log probability), in descending + order of log probability.""" if len(args) >= MAX_ORDER: - args = args[-(MAX_ORDER - 1):] + args = args[-(MAX_ORDER - 1) :] return self._SUCC_DISPATCH[len(args)](self, n, *self.indices(*args)) - def read_tsv(self, fname: str, *, add_all_bigrams: bool=False) -> None: - """ Populate the trigram database from a tab-separated (.tsv) file """ + def read_tsv(self, fname: str, *, add_all_bigrams: bool = False) -> None: + """Populate the trigram database from a tab-separated (.tsv) file""" print("Reading {0}, first pass...".format(fname), flush=True) t0 = time.time() @@ -1030,17 +1037,17 @@ def read_tsv(self, fname: str, *, add_all_bigrams: bool=False) -> None: vocab[to_bytes(w2)] += 1 # Trie that maps unigrams to integer identifiers using_empty = b"" in vocab - trie = Trie(reserve_zero_for_empty=using_empty) # pylint: disable=used-before-assignment + trie = Trie( + reserve_zero_for_empty=using_empty + ) # pylint: disable=used-before-assignment # Dict to map words to integer ids - ids = { b"": 0 } if using_empty else {} + ids = {b"": 0} if using_empty else {} # Build the trie in decreasing order of occurrences, ensuring that # the most common unigrams get the lowest indices if using_empty: # Hack to make sure that the blank entry goes to the front of the list vocab[b""] = 10**50 - vocab_list = sorted( - vocab.items(), key=lambda item: item[1], reverse=True - ) + vocab_list = sorted(vocab.items(), key=lambda item: item[1], reverse=True) assert not using_empty or vocab_list[0][0] == b"" del vocab for unigram_id, (w, c) in enumerate(vocab_list): @@ -1061,22 +1068,20 @@ def read_tsv(self, fname: str, *, add_all_bigrams: bool=False) -> None: # The index consists of w1...w1023 followed by w1024,w1088,... 
for ix, (w, _) in enumerate(vocab_list): if ix and ( - ix % self.VOCAB_QUANTUM_SIZE == 0 - or ix < self.VOCAB_INDEX_CUTOFF + ix % self.VOCAB_QUANTUM_SIZE == 0 or ix < self.VOCAB_INDEX_CUTOFF ): compressed_index.extend(UINT32.pack(len(compressed_vocab))) compressed_vocab.extend(w + b"\x00") parts = [ UINT32.pack(len(compressed_index) // 4), compressed_index, - gzip.compress(compressed_vocab) + gzip.compress(compressed_vocab), ] self.compressed_vocab = b"".join(parts) print( "Compressed vocabulary including index is {0:,} bytes, " - "{1:,} uncompressed, {2:,} index" - .format(len(self.compressed_vocab), len(compressed_vocab), - len(compressed_index) + "{1:,} uncompressed, {2:,} index".format( + len(self.compressed_vocab), len(compressed_vocab), len(compressed_index) ) ) del vocab_list @@ -1209,21 +1214,21 @@ def count_level(depth, level): print("Level {0}: Frequency buckets are {1}".format(k, len(v))) # For each level, create a dict of indices into an ascending list of frequencies self.fbuckets = { - k: { f: ix for ix, f in enumerate(sorted(list(v))) } - for k, v in freqs.items() + k: {f: ix for ix, f in enumerate(sorted(list(v)))} for k, v in freqs.items() } t1 = time.time() print( "Done in {3:.1f} sec, trigram count is {0:,}, " - "voc size is {1:,}, unigram count {2:,}" - .format(cnt, len(trie), ucnt, t1 - t0) + "voc size is {1:,}, unigram count {2:,}".format( + cnt, len(trie), ucnt, t1 - t0 + ) ) def write_unigram_pointers(self, f: IO[Any]) -> None: - """ Unigram sequence: we write pointers to the next level - for every unigram id. Some ids may not have an associated - next level, in which case their range is zero. """ + """Unigram sequence: we write pointers to the next level + for every unigram id. 
Some ids may not have an associated + next level, in which case their range is zero.""" level = self.level0 assert level is not None # Initialize the pointer list, which always starts with a 0 @@ -1249,7 +1254,7 @@ def write_unigram_pointers(self, f: IO[Any]) -> None: print("Uni-pointers: {0}\n".format(ml)) def write_unigram_frequencies(self, f: IO[Any]) -> None: - """ Write the unigram frequency data """ + """Write the unigram frequency data""" if self.trie is None: len_trie = 0 else: @@ -1261,13 +1266,11 @@ def write_unigram_frequencies(self, f: IO[Any]) -> None: d = self.level0.d assert d is not None pos = f.tell() - self.write_frequencies( - f, [freqs[d[i].cnt] for i in range(len_trie)] - ) + self.write_frequencies(f, [freqs[d[i].cnt] for i in range(len_trie)]) print("Uni-frequencies occupy {0:,} bytes.".format(f.tell() - pos)) def write_bigram_and_trigram_levels(self, f: IO[Any]) -> Tuple[int, int, int, int]: - """ Write the bigram and trigram levels to the binary file """ + """Write the bigram and trigram levels to the binary file""" level0 = self.level0 assert level0 is not None assert self.fbuckets is not None @@ -1364,7 +1367,9 @@ def sorted_child_ids(w0: int) -> List[int]: print("\nBi-ids are {0:,}".format(len(bi_ids))) pl.compress(bi_ids) f.write(pl.to_bytes()) - print("Bi_ids compressed with partitions: {0:,} bytes".format(len(pl.to_bytes()))) + print( + "Bi_ids compressed with partitions: {0:,} bytes".format(len(pl.to_bytes())) + ) print("Bi-pointers are {0:,}".format(len(ptrs))) ml.compress(ptrs) @@ -1378,7 +1383,9 @@ def sorted_child_ids(w0: int) -> List[int]: pl.compress(tri_ids) tri_id_loc = f.tell() f.write(pl.to_bytes()) - print("Tri_ids compressed with partitions: {0:,} bytes".format(len(pl.to_bytes()))) + print( + "Tri_ids compressed with partitions: {0:,} bytes".format(len(pl.to_bytes())) + ) del pl del ml @@ -1394,7 +1401,7 @@ def sorted_child_ids(w0: int) -> List[int]: return bi_fq_loc, tri_fq_loc, bi_ptr_loc, tri_id_loc def 
write_frequencies(self, f: IO[Any], freq_ranks: List[int]) -> None: - """ Write an array containing frequency ranks in a minimal number of bits """ + """Write an array containing frequency ranks in a minimal number of bits""" # Create a dictionary of code words for each frequency rank, # using the fewest bits for the most frequent ranks codebook = dict() # type: Dict[int, Tuple[int, int]] @@ -1403,7 +1410,7 @@ def write_frequencies(self, f: IO[Any], freq_ranks: List[int]) -> None: for fqr in freq_ranks: cnt[fqr] += 1 # Sort the frequency ranks in descending order by how common they are - sorted_freq_ranks = sorted(cnt.items(), key=lambda e:e[1], reverse=True) + sorted_freq_ranks = sorted(cnt.items(), key=lambda e: e[1], reverse=True) # Allocate code words to ranks in descending order of frequency for ix, (rank, _) in enumerate(sorted_freq_ranks): # Number of bits for this code word @@ -1452,7 +1459,7 @@ def write_frequencies(self, f: IO[Any], freq_ranks: List[int]) -> None: f.write(startbits.to_bytes()) def write_binary(self, fname: str) -> None: - """ Write a compressed form of the trigram database to a file """ + """Write a compressed form of the trigram database to a file""" print("Writing file '{0}'...".format(fname)) # Create a byte buffer stream f = io.BytesIO() @@ -1475,8 +1482,8 @@ class Headers: f.write(UINT32.pack(0)) def write_padded(b: bytes, n: int) -> None: - """ Write bytes to the file f with padding - so that they align to n """ + """Write bytes to the file f with padding + so that they align to n""" # Align to 4 bytes first pos = f.tell() & 3 if pos: @@ -1484,9 +1491,9 @@ def write_padded(b: bytes, n: int) -> None: assert len(b) <= n f.write(b + b"\x00" * (n - len(b))) - def fixup(ptr: int, loc: Optional[int]=None) -> None: - """ Go back and fix up a previous pointer to point at the - current offset in the stream """ + def fixup(ptr: int, loc: Optional[int] = None) -> None: + """Go back and fix up a previous pointer to point at the + current offset 
in the stream""" fix = f.tell() if loc is None else loc f.seek(ptr) f.write(UINT32.pack(fix)) @@ -1535,14 +1542,20 @@ def fixup(ptr: int, loc: Optional[int]=None) -> None: # Write the bigram and trigram levels fixup(h.bigrams_offset) - bi_fq_loc, tri_fq_loc, bi_ptr_loc, tri_id_loc = self.write_bigram_and_trigram_levels(f) + bi_fq_loc, tri_fq_loc, bi_ptr_loc, tri_id_loc = ( + self.write_bigram_and_trigram_levels(f) + ) fixup(h.bigram_freqs_offset, bi_fq_loc) fixup(h.trigram_freqs_offset, tri_fq_loc) fixup(h.bigram_ptrs_offset, bi_ptr_loc) fixup(h.trigrams_offset, tri_id_loc) f.seek(0, io.SEEK_END) - print("Bigram and trigram levels take a total of {0:,} bytes.".format(f.tell() - pos)) + print( + "Bigram and trigram levels take a total of {0:,} bytes.".format( + f.tell() - pos + ) + ) # Write vocabulary write_padded(b"[vocab]", 16) @@ -1555,7 +1568,7 @@ def fixup(ptr: int, loc: Optional[int]=None) -> None: stream.write(f.getvalue()) def load(self, fname: str) -> None: - """ Open a compressed trigram file and map it into memory """ + """Open a compressed trigram file and map it into memory""" with open(fname, "rb") as stream: self._b = mmap.mmap(stream.fileno(), 0, access=mmap.ACCESS_READ) @@ -1570,9 +1583,8 @@ def load(self, fname: str) -> None: for hdr, val in zip( self._HEADERS, struct.unpack( - "<" + "I" * self._NUM_HEADERS, - mb[16:16 + 4 * self._NUM_HEADERS] - ) + "<" + "I" * self._NUM_HEADERS, mb[16 : 16 + 4 * self._NUM_HEADERS] + ), ): # Assign the file sections to attributes # of the self object @@ -1595,7 +1607,9 @@ def load(self, fname: str) -> None: # Load the vocabulary buffer num_compressed_index = UINT32.unpack_from(self._vocab[0:4], 0)[0] - self._compressed_vocab = gzip.decompress(self._vocab[4 + 4 * num_compressed_index:]) + self._compressed_vocab = gzip.decompress( + self._vocab[4 + 4 * num_compressed_index :] + ) # Load the freqs rank list into memory self.freqs = [] @@ -1614,7 +1628,7 @@ def load(self, fname: str) -> None: self.log_ucnt = 
math.log(ucnt + 1) def close(self) -> None: - """ Close the memory map and destroy all references to it """ + """Close the memory map and destroy all references to it""" if self._b is not None: for hdr in self._HEADERS: setattr(self, hdr, None) From 587ede0728301f9ff9158c36dad7707e179797cd Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Tue, 27 Aug 2024 13:59:55 +0000 Subject: [PATCH 3/7] Cleanup, rm unused files --- .gitignore | 3 + MANIFEST.in | 2 + README.md | 404 ----------------------------------------- test.py => old/test.py | 0 4 files changed, 5 insertions(+), 404 deletions(-) delete mode 100644 README.md rename test.py => old/test.py (100%) diff --git a/.gitignore b/.gitignore index b0fcd31..94e6234 100644 --- a/.gitignore +++ b/.gitignore @@ -60,6 +60,9 @@ console/ # Virtual environments venv p35*/ +p312 +p313 +p314 # Installer logs pip-log.txt diff --git a/MANIFEST.in b/MANIFEST.in index e331ac2..04027fb 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,5 +5,7 @@ prune test exclude src/icegrams/*.o exclude src/icegrams/*.so exclude src/icegrams/*.pyd +exclude src/icegrams/*.DS_Store +exclude src/icegrams/resources/*.DS_Store prune src/icegrams/resources include src/icegrams/resources/trigrams.bin diff --git a/README.md b/README.md deleted file mode 100644 index c7db1c4..0000000 --- a/README.md +++ /dev/null @@ -1,404 +0,0 @@ -[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -[![Python 3.7](https://img.shields.io/badge/python-3.7-blue.svg)](https://www.python.org/downloads/release/python-370/) -![Release](https://shields.io/github/v/release/mideind/Icegrams?display_name=tag) -![PyPI](https://img.shields.io/pypi/v/icegrams) -![Build](https://github.com/mideind/Icegrams/actions/workflows/python-package.yml/badge.svg) - -# Icegrams - -**A fast, compact trigram library for Icelandic** - -## Overview - -**Icegrams** is an MIT-licensed Python 3 (>= 3.7) package that encapsulates a -**large 
trigram library for Icelandic**. (A trigram is a tuple of -three consecutive words or tokens that appear in real-world text.) - -14 million unique trigrams and their frequency counts are heavily compressed -using radix tries and `quasi-succinct indexes `_ -employing Elias-Fano encoding. This enables the ~43 megabyte compressed trigram file -to be mapped directly into memory, with no *ex ante* decompression, for fast queries -(typically ~10 microseconds per lookup). - -The Icegrams library is implemented in Python and C/C++, glued together via -`CFFI `_. - -The trigram storage approach is based on a -`2017 paper by Pibiri and Venturini `_, -also referring to -`Ottaviano and Venturini `_ -(2014) regarding partitioned Elias-Fano indexes. - -You can use Icegrams to obtain probabilities (relative frequencies) of -over a million different **unigrams** (single words or tokens), or of -**bigrams** (pairs of two words or tokens), or of **trigrams**. You can also -ask it to return the N most likely successors to any unigram or bigram. - -Icegrams is useful for instance in spelling correction, predictive typing, -to help disabled people write text faster, and for various text generation, -statistics and modelling tasks. - -The Icegrams trigram corpus is built from the 2017 edition of the -Icelandic Gigaword Corpus -(`Risamálheild `_), -which is collected and maintained by *The Árni Magnússon Institute* -*for Icelandic Studies*. A mixed, manually vetted subset consisting of 157 -documents from the corpus was used as the source of the token stream, -yielding over 100 million tokens. Trigrams that only occurred -once or twice in the stream were eliminated before creating the -compressed Icegrams database. The creation process is further -`described here `_. 
- -## Example - ->>> from icegrams import Ngrams ->>> ng = Ngrams() - ->>> # Obtain the frequency of the unigram 'Ísland' - ->>> ng.freq("Ísland") -42018 - ->>> # Obtain the probability of the unigram 'Ísland', as a fraction - ->>> # of the frequency of all unigrams in the database - ->>> ng.prob("Ísland") -0.0003979926900206475 - ->>> # Obtain the log probability (base e) of the unigram 'Ísland' - ->>> ng.logprob("Ísland") --7.8290769196308005 - ->>> # Obtain the frequency of the bigram 'Katrín Jakobsdóttir' - ->>> ng.freq("Katrín", "Jakobsdóttir") -3517 - ->>> # Obtain the probability of 'Jakobsdóttir' given 'Katrín' - ->>> ng.prob("Katrín", "Jakobsdóttir") -0.23298013245033142 - ->>> # Obtain the probability of 'Júlíusdóttir' given 'Katrín' - ->>> ng.prob("Katrín", "Júlíusdóttir") -0.013642384105960274 - ->>> # Obtain the frequency of 'velta fyrirtækisins er' - ->>> ng.freq("velta", "fyrirtækisins", "er") -4 - ->>> # adj_freq returns adjusted frequencies, i.e incremented by 1 - ->>> ng.adj_freq("xxx", "yyy", "zzz") -1 - ->>> # Obtain the N most likely successors of a given unigram or bigram - ->>> # in descending order by log probability of each successor - ->>> ng.succ(10, "stjórnarskrá", "lýðveldisins") -[('Íslands', -1.3708244393477589), ('.', -2.2427905461504567), - (',', -3.313814878299737), ('og', -3.4920631097060557), ('sem', -4.566577846795106), - ('er', -4.720728526622363), ('að', -4.807739903611993), ('um', -5.0084105990741445), - ('en', -5.0084105990741445), ('á', -5.25972502735505)] - -## Reference - -### Initializing Icegrams - -After installing the ``icegrams`` package, use the following code to -import it and initialize an instance of the ``Ngrams`` class:: - - from icegrams import Ngrams - ng = Ngrams() - -Now you can use the ``ng`` instance to query for unigram, bigram -and trigram frequencies and probabilities. - -### The Ngrams class - -* ``__init__(self)`` - - Initializes the ``Ngrams`` instance. 
- -* ``freq(self, *args) -> int`` - - Returns the frequency of a unigram, bigram or trigram. - - * ``str[] *args`` A parameter sequence of consecutive unigrams - to query the frequency for. - * **returns** An integer with the frequency of the unigram, - bigram or trigram. - - To query for the frequency of a unigram in the text, call - ``ng.freq("unigram1")``. This returns the number of times that - the unigram appears in the database. The unigram is - queried as-is, i.e. with no string stripping or lowercasing. - - To query for the frequency of a bigram in the text, call - ``ng.freq("unigram1", "unigram2")``. - - To query for the frequency of a trigram in the text, call - ``ng.freq("unigram1", "unigram2", "unigram3")``. - - If you pass more than 3 arguments to ``ng.freq()``, only the - last 3 are significant, and the query will be treated - as a trigram query. - - Examples:: - - >>>> ng.freq("stjórnarskrá") - 2973 - >>>> ng.freq("stjórnarskrá", "lýðveldisins") - 39 - >>>> ng.freq("stjórnarskrá", "lýðveldisins", "Íslands") - 12 - >>>> ng.freq("xxx", "yyy", "zzz") - 0 - -* ``adj_freq(self, *args) -> int`` - - Returns the adjusted frequency of a unigram, bigram or trigram. - - * ``str[] *args`` A parameter sequence of consecutive unigrams - to query the frequency for. - * **returns** An integer with the adjusted frequency of the unigram, - bigram or trigram. The adjusted frequency is the actual - frequency plus 1. The method thus never returns 0. - - To query for the frequency of a unigram in the text, call - ``ng.adj_freq("unigram1")``. This returns the number of times that - the unigram appears in the database, plus 1. The unigram is - queried as-is, i.e. with no string stripping or lowercasing. - - To query for the frequency of a bigram in the text, call - ``ng.adj_freq("unigram1", "unigram2")``. - - To query for the frequency of a trigram in the text, call - ``ng.adj_freq("unigram1", "unigram2", "unigram3")``. 
- - If you pass more than 3 arguments to ``ng.adj_freq()``, only the - last 3 are significant, and the query will be treated - as a trigram query. - - Examples:: - - >>>> ng.adj_freq("stjórnarskrá") - 2974 - >>>> ng.adj_freq("stjórnarskrá", "lýðveldisins") - 40 - >>>> ng.adj_freq("stjórnarskrá", "lýðveldisins", "Íslands") - 13 - >>>> ng.adj_freq("xxx", "yyy", "zzz") - 1 - -* ``prob(self, *args) -> float`` - - Returns the probability of a unigram, bigram or trigram. - - * ``str[] *args`` A parameter sequence of consecutive unigrams - to query the probability for. - * **returns** A float with the probability of the given unigram, - bigram or trigram. - - The probability of a *unigram* is - the frequency of the unigram divided by the sum of the - frequencies of all unigrams in the database. - - The probability of a *bigram* ``(u1, u2)`` is the frequency - of the bigram divided by the frequency of the unigram ``u1``, - i.e. how likely ``u2`` is to succeed ``u1``. - - The probability of a trigram ``(u1, u2, u3)`` is the frequency - of the trigram divided by the frequency of the bigram ``(u1, u2)``, - i.e. how likely ``u3`` is to succeed ``u1 u2``. - - If you pass more than 3 arguments to ``ng.prob()``, only the - last 3 are significant, and the query will be treated - as a trigram probability query. - - Examples:: - - >>>> ng.prob("stjórnarskrá") - 2.8168929772755334e-05 - >>>> ng.prob("stjórnarskrá", "lýðveldisins") - 0.01344989912575655 - >>>> ng.prob("stjórnarskrá", "lýðveldisins", "Íslands") - 0.325 - -* ``logprob(self, *args) -> float`` - - Returns the log probability of a unigram, bigram or trigram. - - * ``str[] *args`` A parameter sequence of consecutive unigrams - to query the log probability for. - * **returns** A float with the natural logarithm (base *e*) of the - probability of the given unigram, bigram or trigram. 
- - The probability of a *unigram* is - the adjusted frequency of the unigram divided by the sum of the - frequencies of all unigrams in the database. - - The probability of a *bigram* ``(u1, u2)`` is the adjusted frequency - of the bigram divided by the adjusted frequency of the unigram ``u1``, - i.e. how likely ``u2`` is to succeed ``u1``. - - The probability of a trigram ``(u1, u2, u3)`` is the adjusted frequency - of the trigram divided by the adjusted frequency of the bigram ``(u1, u2)``, - i.e. how likely ``u3`` is to succeed ``u1 u2``. - - If you pass more than 3 arguments to ``ng.logprob()``, only the - last 3 are significant, and the query will be treated - as a trigram probability query. - - Examples:: - - >>>> ng.logprob("stjórnarskrá") - -10.477290968535172 - >>>> ng.logprob("stjórnarskrá", "lýðveldisins") - -4.308783672906165 - >>>> ng.logprob("stjórnarskrá", "lýðveldisins", "Íslands") - -1.1239300966523995 - -* ``succ(self, n, *args) -> list[tuple]`` - - Returns the *N* most probable successors of a unigram or bigram. - - * ``int n`` A positive integer specifying how many successors, - at a maximum, should be returned. - * ``str[] *args`` One or two string parameters containing the - unigram or bigram to query the successors for. - * **returns** A list of tuples of (successor unigram, log probability), - in descending order of probability. - - If you pass more than 2 string arguments to ``ng.succ()``, only the - last 2 are significant, and the query will be treated - as a bigram successor query. 
- - Examples:: - - >>>> ng.succ(2, "stjórnarskrá") - [('.', -1.8259625296091855), ('landsins', -2.223111581475692)] - >>>> ng.succ(2, "stjórnarskrá", "lýðveldisins") - [('Íslands', -1.1239300966523995), ('og', -1.3862943611198904)] - - >>>> # The following is equivalent to ng.succ(2, "lýðveldisins", "Íslands") - - >>>> ng.succ(2, "stjórnarskrá", "lýðveldisins", "Íslands") - [('.', -1.3862943611198908), (',', -1.6545583477145702)] - -## Notes - -Icegrams is built with a sliding window over the source text. This means that -a sentence such as ``"Maðurinn borðaði ísinn."`` results in the following -trigrams being added to the database:: - - ("", "", "Maðurinn") - ("", "Maðurinn", "borðaði") - ("Maðurinn", "borðaði", "ísinn") - ("borðaði", "ísinn", ".") - ("ísinn", ".", "") - (".", "", "") - -The same sliding window strategy is applied for bigrams, so the following -bigrams would be recorded for the same sentence:: - - ("", "Maðurinn") - ("Maðurinn", "borðaði") - ("borðaði", "ísinn") - ("ísinn", ".") - (".", "") - -You can thus obtain the N unigrams that most often start -a sentence by asking for ``ng.succ(N, "")``. - -And, of course, four unigrams are also added, one for each token in the -sentence. - -The tokenization of the source text into unigrams is done with the -`Tokenizer package `_and -uses the rules documented there. Importantly, tokens other than words, -abbreviations, entity names, person names and punctuation are -**replaced by placeholders**. This means that all numbers are represented by the token -``[NUMBER]``, amounts by ``[AMOUNT]``, dates by ``[DATEABS]`` and ``[DATEREL]``, -e-mail addresses by ``[EMAIL]``, etc. For the complete mapping of token types -to placeholder strings, see the -`documentation for the Tokenizer package `_. - -## Prerequisites - -This package runs on CPython 3.6 or newer, and on PyPy 3.6 or newer. It -has been tested on Linux (gcc on x86-64 and ARMhf), MacOS (clang) and -Windows (MSVC). 
- -If a binary wheel package isn't available on `PyPI `_ -for your system, you may need to have the ``python3-dev`` package -(or its Windows equivalent) installed on your system to set up -Icegrams successfully. This is because a source distribution -install requires a C++ compiler and linker:: - - # Debian or Ubuntu: - sudo apt-get install python3-dev - -## Installation - -To install this package:: - - pip install icegrams - -If you want to be able to edit the source, do like so (assuming you have **git** installed):: - - git clone https://github.com/mideind/Icegrams - cd Icegrams - # [ Activate your virtualenv here if you have one ] - python setup.py develop - -The package source code is now in ``./src/icegrams``. - -## Tests - -To run the built-in tests, install `pytest `_, -``cd`` to your ``Icegrams`` subdirectory (and optionally activate your -virtualenv), then run:: - - python -m pytest - -## Version History - -* Version 1.1.2: Minor bug fixes. Cross-platform wheels provided. Now requires Python 3.7+. (2022-12-14) -* Version 1.1.0: Python 3.5 support dropped; macOS builds fixed; PyPy wheels - generated -* Version 1.0.0: New trigram database sourced from the Icelandic Gigaword Corpus - (Risamálheild) with improved tokenization. Replaced GNU GPLv3 with MIT license. -* Version 0.6.0: Python type annotations added -* Version 0.5.0: Trigrams corpus has been spell-checked - -## License - -Icegrams is Copyright © 2022 [Miðeind ehf.](https://mideind.is) -The original author of this software is *Vilhjálmur Þorsteinsson*. 
- -This software is licensed under the **MIT License**: - -Permission is hereby granted, free of charge, to any person -obtaining a copy of this software and associated documentation -files (the "Software"), to deal in the Software without restriction, -including without limitation the rights to use, copy, modify, merge, -publish, distribute, sublicense, and/or sell copies of the Software, -and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/test.py b/old/test.py similarity index 100% rename from test.py rename to old/test.py From 4120b0e4d2440e6eeee0b668ecb8512d1b602207 Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Tue, 27 Aug 2024 14:00:32 +0000 Subject: [PATCH 4/7] Updated setup.py + CI config and docs --- .github/workflows/python-package.yml | 2 +- .github/workflows/wheels.yml | 4 +- README.rst | 9 ++-- setup.py | 61 ++++------------------------ 4 files changed, 15 insertions(+), 61 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 0058679..8f97497 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -24,7 +24,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip wheel setuptools pytest + python -m pip install --upgrade pip wheel setuptools pytest packaging python -m pip install -e . - name: Test with pytest run: | diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 79cf27f..dd5c71b 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -25,14 +25,14 @@ jobs: python-version: '3.10' - name: Install cibuildwheel - run: python -m pip install --upgrade pip wheel setuptools cibuildwheel + run: python -m pip install --upgrade pip wheel setuptools packaging cibuildwheel - name: Build wheels run: python -m cibuildwheel --output-dir wheelhouse # Options (https://cibuildwheel.readthedocs.io/en/stable/options/) env: CIBW_SKIP: cp36-* cp37-* cp38-* *pp37-* pp38-* *musllinux* - CIBW_BEFORE_BUILD_MACOS: python3 -m pip install --upgrade setuptools wheel cffi + CIBW_BEFORE_BUILD_MACOS: python3 -m pip install --upgrade setuptools wheel cffi packaging CIBW_ARCHS_MACOS: "x86_64 arm64" CIBW_ARCHS_WINDOWS: "AMD64" CIBW_ARCHS_LINUX: "x86_64" diff --git a/README.rst b/README.rst index 069eafd..0cd1bcb 100644 --- a/README.rst +++ b/README.rst @@ -9,7 +9,7 @@ Icegrams: A 
fast, compact trigram library for Icelandic Overview ******** -**Icegrams** is an MIT-licensed Python 3 (>= 3.7) package that encapsulates a +**Icegrams** is an MIT-licensed Python 3 (>=3.9) package that encapsulates a **large trigram library for Icelandic**. (A trigram is a tuple of three consecutive words or tokens that appear in real-world text.) @@ -319,8 +319,8 @@ to placeholder strings, see the Prerequisites ************* -This package runs on CPython 3.6 or newer, and on PyPy 3.6 or newer. It -has been tested on Linux (gcc on x86-64 and ARMhf), MacOS (clang) and +This package runs on CPython 3.9 or newer, and on PyPy 3.9 or newer. It +has been tested on Linux (gcc on x86-64 and ARMhf), macOS (clang) and Windows (MSVC). If a binary wheel package isn't available on `PyPI `_ @@ -363,6 +363,7 @@ virtualenv), then run:: Changelog ********* +* Version 1.1.3: Minor tweaks. Support for Python 3.13. Now requires Python 3.9+. (2024-08-27) * Version 1.1.2: Minor bug fixes. Cross-platform wheels provided. Now requires Python 3.7+. (2022-12-14) * Version 1.1.0: Python 3.5 support dropped; macOS builds fixed; PyPy wheels generated @@ -375,7 +376,7 @@ Changelog Copyright and licensing *********************** -Icegrams is Copyright © 2022 `Miðeind ehf. `__. +Icegrams is Copyright © 2020-2024 `Miðeind ehf. `__. The original author of this software is *Vilhjálmur Þorsteinsson*. This software is licensed under the **MIT License**: diff --git a/setup.py b/setup.py index f9f4982..a2a80e1 100644 --- a/setup.py +++ b/setup.py @@ -1,59 +1,13 @@ #!/usr/bin/env python3 -""" - Icegrams: A trigrams library for Icelandic - - setup.py - - Copyright (C) 2020 Miðeind ehf. 
- Author: Vilhjálmur Þorsteinsson - - This software is licensed under the MIT License: - - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation - files (the "Software"), to deal in the Software without restriction, - including without limitation the rights to use, copy, modify, merge, - publish, distribute, sublicense, and/or sell copies of the Software, - and to permit persons to whom the Software is furnished to do so, - subject to the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - - This module sets up the icegrams package. It uses the cffi_modules - parameter, available in recent versions of setuptools, to - automatically compile the trie.cpp module to trie.*.so/.pyd - and build the required CFFI Python wrapper via trie_build.py. - - Note that installing under PyPy >= 3.7 is supported. 
- -""" - -import io import re -import sys - +import io from glob import glob -from os.path import basename, dirname, join, splitext +from os.path import basename, splitext, dirname, join from setuptools import find_packages, setup # type: ignore -if sys.version_info < (3, 7): - print("Icegrams requires Python >= 3.7") - sys.exit(1) - - def read(*names, **kwargs): try: return io.open( @@ -65,7 +19,7 @@ def read(*names, **kwargs): setup( name="icegrams", - version="1.1.2", # Also update in src/icegrams/__init__.py + version="1.1.3", license="MIT", description="Trigram statistics for Icelandic", long_description="{0}\n{1}".format( @@ -84,23 +38,22 @@ def read(*names, **kwargs): include_package_data=True, zip_safe=False, classifiers=[ - # complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", - "Operating System :: Unix", "Operating System :: POSIX", - "Operating System :: Microsoft :: Windows", + "Operating System :: Unix", "Operating System :: MacOS", + "Operating System :: Microsoft :: Windows", "Natural Language :: Icelandic", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", "Topic :: Software Development :: Libraries :: Python Modules", From 83bba3d8baafd266dfcbdbeba1ac12d17d630f79 Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Wed, 28 Aug 2024 14:50:08 +0000 Subject: [PATCH 5/7] Re-applied migration to importlib_resources --- 
src/icegrams/ngrams.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/icegrams/ngrams.py b/src/icegrams/ngrams.py index ab5e081..29303d8 100644 --- a/src/icegrams/ngrams.py +++ b/src/icegrams/ngrams.py @@ -155,12 +155,10 @@ # Make sure that the trigrams.bin file is # unpacked and ready for use - import pkg_resources + import importlib.resources as importlib_resources - # Note: the resource path below should NOT use os.path.join() - BINARY_FILENAME = pkg_resources.resource_filename( # type: ignore - __name__, "resources/trigrams.bin" - ) + ref = importlib_resources.files("icegrams").joinpath("resources/trigrams.bin") + BINARY_FILENAME = str(ref) ffi: Any = cast(Any, ffi) # type: ignore trie_cffi: Any = cast(Any, trie_cffi) # type: ignore From b163751c47867ce0ab4b7c8944f267ae9d624b7e Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Wed, 28 Aug 2024 15:04:39 +0000 Subject: [PATCH 6/7] Migrated package metadata over to pyproject.toml --- pyproject.toml | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 54 +--------------------------------------- 2 files changed, 68 insertions(+), 53 deletions(-) create mode 100644 pyproject.toml mode change 100644 => 100755 setup.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..67595da --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,67 @@ +[project] +name = "icegrams" +version = "1.1.3" +description = "Trigram statistics for Icelandic" +authors = [{ name = "Miðeind ehf.", email = "mideind@mideind.is" }] +maintainers = [{ name = "Miðeind ehf.", email = "mideind@mideind.is" }] +readme = { file = "README.rst", content-type = "text/x-rst" } +license = { text = "MIT" } +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Operating System :: POSIX", + "Operating System :: Unix", + "Operating System :: MacOS", + 
"Operating System :: Microsoft :: Windows", + "Natural Language :: Icelandic", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Utilities", + "Topic :: Text Processing :: Linguistic", +] +keywords = ["nlp", "trigram", "ngram", "trigrams", "ngrams", "icelandic"] +requires-python = ">=3.9" +dependencies = ["cffi>=1.15.1"] + +[project.urls] +Repository = "https://github.com/mideind/Icegrams" + +[project.optional-dependencies] +# dev dependencies +dev = ["pytest"] + +# *** Configuration of tools *** + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +where = ["src"] + +[tool.pytest.ini_options] +filterwarnings = [ + # Ignore deprecation warnings in libraries, their problem not ours + # "ignore::DeprecationWarning", +] + +[tool.ruff] +line-length = 88 + +[tool.black] +line-length = 88 + +[tool.isort] +# This forces these imports to be placed at the top +known_future_library = ["__future__", "typing", "typing_extensions"] +profile = "black" +line_length = 88 diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index 5b1d2ac..198a7b4 --- a/setup.py +++ b/setup.py @@ -1,69 +1,17 @@ #!/usr/bin/env python3 - -from typing import Any - -import re -import io from glob import glob -from os.path import basename, splitext, dirname, join +from os.path import basename, splitext from setuptools import find_packages, setup - -def read(*names, **kwargs): - try: - return io.open( - join(dirname(__file__), *names), encoding=kwargs.get("encoding", "utf8") - ).read() - except (IOError, OSError): - 
return "" - - setup( - name="icegrams", - version="1.1.3", - license="MIT", - description="Trigram statistics for Icelandic", - long_description="{0}\n{1}".format( - re.compile("^.. start-badges.*^.. end-badges", re.M | re.S).sub( - "", read("README.rst") - ), - re.sub(":[a-z]+:`~?(.*?)`", r"``\1``", read("CHANGELOG.rst")), - ), - author="Miðeind ehf", - author_email="mideind@mideind.is", - url="https://github.com/mideind/Icegrams", packages=find_packages("src"), package_dir={"": "src"}, py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")], package_data={"icegrams": ["py.typed"]}, include_package_data=True, zip_safe=False, - classifiers=[ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", - "Operating System :: POSIX", - "Operating System :: Unix", - "Operating System :: MacOS", - "Operating System :: Microsoft :: Windows", - "Natural Language :: Icelandic", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", - "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: Utilities", - "Topic :: Text Processing :: Linguistic", - ], - keywords=["nlp", "trigram", "ngram", "trigrams", "ngrams", "icelandic"], setup_requires=["cffi>=1.15.1"], install_requires=["cffi>=1.15.1"], cffi_modules=["src/icegrams/trie_build.py:ffibuilder"], From 182768c7adab1c0f2a5e44d4c133d6674f2abec1 Mon Sep 17 00:00:00 2001 From: Sveinbjorn Thordarson Date: Wed, 28 Aug 2024 16:42:30 +0000 Subject: [PATCH 7/7] Fixes as per code review --- src/icegrams/ngrams.py | 8 ++++---- 1 
file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/icegrams/ngrams.py b/src/icegrams/ngrams.py index 29303d8..6f8614a 100644 --- a/src/icegrams/ngrams.py +++ b/src/icegrams/ngrams.py @@ -143,7 +143,7 @@ # Import the CFFI wrapper for the trie.cpp C++ module # (see also trie.py and build_trie.py) -elif __name__ == "__main__": +if __name__ == "__main__": # Running as a main program from _trie import lib as trie_cffi, ffi # type: ignore # pylint: disable=import-error from trie import Trie @@ -157,7 +157,7 @@ # unpacked and ready for use import importlib.resources as importlib_resources - ref = importlib_resources.files("icegrams").joinpath("resources/trigrams.bin") + ref = importlib_resources.files("icegrams").joinpath("resources", "trigrams.bin") BINARY_FILENAME = str(ref) ffi: Any = cast(Any, ffi) # type: ignore @@ -865,8 +865,8 @@ def bigram_frequency(self, i0: Optional[int], i1: Optional[int]) -> int: if i0 is None or i1 is None: return 0 # Check degenerate case - # if not (i0 or i1): - # return 0 + if not (i0 or i1): + return 0 assert self._unigram_ptrs_ml is not None p1, p2 = self._unigram_ptrs_ml.lookup_pair(i0) # Then, look for id i1 within the level 2 ids delimited by [p1, p2>