From 82510c4d5cd9d0488249fc4149345b5d357febfd Mon Sep 17 00:00:00 2001
From: Sveinbjorn Thordarson <sveinbjorn@sveinbjorn.org>
Date: Mon, 26 Aug 2024 23:15:38 +0000
Subject: [PATCH 1/7] Updated CFFI build config

---
 .github/workflows/python-package.yml |   4 +-
 README.md                            | 404 +++++++++++++++++++++++++++
 src/icegrams/trie_build.py           |   9 +-
 3 files changed, 413 insertions(+), 4 deletions(-)
 create mode 100644 README.md

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index b947cfb..0058679 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -2,9 +2,9 @@ name: tests
 
 on:
   push:
-    branches: [ "master" ]
+    branches: [ "*" ]
   pull_request:
-    branches: [ "master" ]
+    branches: [ "*" ]
 
 jobs:
   build:
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c7db1c4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,404 @@
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+[![Python 3.7](https://img.shields.io/badge/python-3.7-blue.svg)](https://www.python.org/downloads/release/python-370/)
+![Release](https://shields.io/github/v/release/mideind/Icegrams?display_name=tag)
+![PyPI](https://img.shields.io/pypi/v/icegrams)
+![Build](https://github.com/mideind/Icegrams/actions/workflows/python-package.yml/badge.svg)
+
+# Icegrams
+
+**A fast, compact trigram library for Icelandic**
+
+## Overview
+
+**Icegrams** is an MIT-licensed Python 3 (>= 3.7) package that encapsulates a
+**large trigram library for Icelandic**. (A trigram is a tuple of
+three consecutive words or tokens that appear in real-world text.)
+
+14 million unique trigrams and their frequency counts are heavily compressed
+using radix tries and [quasi-succinct indexes](https://arxiv.org/abs/1206.4300)
+employing Elias-Fano encoding. This enables the ~43 megabyte compressed trigram file
+to be mapped directly into memory, with no *ex ante* decompression, for fast queries
+(typically ~10 microseconds per lookup).
+
+The Icegrams library is implemented in Python and C/C++, glued together via
+[CFFI](https://cffi.readthedocs.io/en/latest/).
+
+The trigram storage approach is based on a
+[2017 paper by Pibiri and Venturini](http://pages.di.unipi.it/pibiri/papers/SIGIR17.pdf),
+also referring to
+[Ottaviano and Venturini](http://www.di.unipi.it/~ottavian/files/elias_fano_sigir14.pdf)
+(2014) regarding partitioned Elias-Fano indexes.
+
+You can use Icegrams to obtain probabilities (relative frequencies) of
+over a million different **unigrams** (single words or tokens), or of
+**bigrams** (pairs of two words or tokens), or of **trigrams**. You can also
+ask it to return the N most likely successors to any unigram or bigram.
+
+Icegrams is useful for instance in spelling correction, predictive typing,
+to help disabled people write text faster, and for various text generation,
+statistics and modelling tasks.
+
+The Icegrams trigram corpus is built from the 2017 edition of the
+Icelandic Gigaword Corpus
+([Risamálheild](https://malheildir.arnastofnun.is/?mode=rmh2017)),
+which is collected and maintained by *The Árni Magnússon Institute*
+*for Icelandic Studies*. A mixed, manually vetted subset consisting of 157
+documents from the corpus was used as the source of the token stream,
+yielding over 100 million tokens. Trigrams that only occurred
+once or twice in the stream were eliminated before creating the
+compressed Icegrams database. The creation process is further
+[described here](https://github.com/mideind/Icegrams/blob/master/doc/overview.md).
+
+## Example
+```python
+>>> from icegrams import Ngrams
+>>> ng = Ngrams()
+
+>>> # Obtain the frequency of the unigram 'Ísland'
+
+>>> ng.freq("Ísland")
+42018
+
+>>> # Obtain the probability of the unigram 'Ísland', as a fraction
+
+>>> # of the frequency of all unigrams in the database
+
+>>> ng.prob("Ísland")
+0.0003979926900206475
+
+>>> # Obtain the log probability (base e) of the unigram 'Ísland'
+
+>>> ng.logprob("Ísland")
+-7.8290769196308005
+
+>>> # Obtain the frequency of the bigram 'Katrín Jakobsdóttir'
+
+>>> ng.freq("Katrín", "Jakobsdóttir")
+3517
+
+>>> # Obtain the probability of 'Jakobsdóttir' given 'Katrín'
+
+>>> ng.prob("Katrín", "Jakobsdóttir")
+0.23298013245033142
+
+>>> # Obtain the probability of 'Júlíusdóttir' given 'Katrín'
+
+>>> ng.prob("Katrín", "Júlíusdóttir")
+0.013642384105960274
+
+>>> # Obtain the frequency of 'velta fyrirtækisins er'
+
+>>> ng.freq("velta", "fyrirtækisins", "er")
+4
+
+>>> # adj_freq returns adjusted frequencies, i.e. incremented by 1
+
+>>> ng.adj_freq("xxx", "yyy", "zzz")
+1
+
+>>> # Obtain the N most likely successors of a given unigram or bigram
+
+>>> # in descending order by log probability of each successor
+
+>>> ng.succ(10, "stjórnarskrá", "lýðveldisins")
+[('Íslands', -1.3708244393477589), ('.', -2.2427905461504567),
+    (',', -3.313814878299737), ('og', -3.4920631097060557), ('sem', -4.566577846795106),
+    ('er', -4.720728526622363), ('að', -4.807739903611993), ('um', -5.0084105990741445),
+    ('en', -5.0084105990741445), ('á', -5.25972502735505)]
+```
+## Reference
+
+### Initializing Icegrams
+
+After installing the ``icegrams`` package, use the following code to
+import it and initialize an instance of the ``Ngrams`` class::
+
+    from icegrams import Ngrams
+    ng = Ngrams()
+
+Now you can use the ``ng`` instance to query for unigram, bigram
+and trigram frequencies and probabilities.
+
+### The Ngrams class
+
+* ``__init__(self)``
+
+  Initializes the ``Ngrams`` instance.
+
+* ``freq(self, *args) -> int``
+
+  Returns the frequency of a unigram, bigram or trigram.
+
+  * ``str[] *args`` A parameter sequence of consecutive unigrams
+    to query the frequency for.
+  * **returns** An integer with the frequency of the unigram,
+    bigram or trigram.
+
+  To query for the frequency of a unigram in the text, call
+  ``ng.freq("unigram1")``. This returns the number of times that
+  the unigram appears in the database. The unigram is
+  queried as-is, i.e. with no string stripping or lowercasing.
+
+  To query for the frequency of a bigram in the text, call
+  ``ng.freq("unigram1", "unigram2")``.
+
+  To query for the frequency of a trigram in the text, call
+  ``ng.freq("unigram1", "unigram2", "unigram3")``.
+
+  If you pass more than 3 arguments to ``ng.freq()``, only the
+  last 3 are significant, and the query will be treated
+  as a trigram query.
+
+  Examples::
+
+    >>>> ng.freq("stjórnarskrá")
+    2973
+    >>>> ng.freq("stjórnarskrá", "lýðveldisins")
+    39
+    >>>> ng.freq("stjórnarskrá", "lýðveldisins", "Íslands")
+    12
+    >>>> ng.freq("xxx", "yyy", "zzz")
+    0
+
+* ``adj_freq(self, *args) -> int``
+
+  Returns the adjusted frequency of a unigram, bigram or trigram.
+
+  * ``str[] *args`` A parameter sequence of consecutive unigrams
+    to query the frequency for.
+  * **returns** An integer with the adjusted frequency of the unigram,
+    bigram or trigram. The adjusted frequency is the actual
+    frequency plus 1. The method thus never returns 0.
+
+  To query for the frequency of a unigram in the text, call
+  ``ng.adj_freq("unigram1")``. This returns the number of times that
+  the unigram appears in the database, plus 1. The unigram is
+  queried as-is, i.e. with no string stripping or lowercasing.
+
+  To query for the frequency of a bigram in the text, call
+  ``ng.adj_freq("unigram1", "unigram2")``.
+
+  To query for the frequency of a trigram in the text, call
+  ``ng.adj_freq("unigram1", "unigram2", "unigram3")``.
+
+  If you pass more than 3 arguments to ``ng.adj_freq()``, only the
+  last 3 are significant, and the query will be treated
+  as a trigram query.
+
+  Examples::
+
+    >>>> ng.adj_freq("stjórnarskrá")
+    2974
+    >>>> ng.adj_freq("stjórnarskrá", "lýðveldisins")
+    40
+    >>>> ng.adj_freq("stjórnarskrá", "lýðveldisins", "Íslands")
+    13
+    >>>> ng.adj_freq("xxx", "yyy", "zzz")
+    1
+
+* ``prob(self, *args) -> float``
+
+  Returns the probability of a unigram, bigram or trigram.
+
+  * ``str[] *args`` A parameter sequence of consecutive unigrams
+    to query the probability for.
+  * **returns** A float with the probability of the given unigram,
+    bigram or trigram.
+
+  The probability of a *unigram* is
+  the frequency of the unigram divided by the sum of the
+  frequencies of all unigrams in the database.
+
+  The probability of a *bigram* ``(u1, u2)`` is the frequency
+  of the bigram divided by the frequency of the unigram ``u1``,
+  i.e. how likely ``u2`` is to succeed ``u1``.
+
+  The probability of a trigram ``(u1, u2, u3)`` is the frequency
+  of the trigram divided by the frequency of the bigram ``(u1, u2)``,
+  i.e. how likely ``u3`` is to succeed ``u1 u2``.
+
+  If you pass more than 3 arguments to ``ng.prob()``, only the
+  last 3 are significant, and the query will be treated
+  as a trigram probability query.
+
+  Examples::
+
+    >>>> ng.prob("stjórnarskrá")
+    2.8168929772755334e-05
+    >>>> ng.prob("stjórnarskrá", "lýðveldisins")
+    0.01344989912575655
+    >>>> ng.prob("stjórnarskrá", "lýðveldisins", "Íslands")
+    0.325
+
+* ``logprob(self, *args) -> float``
+
+  Returns the log probability of a unigram, bigram or trigram.
+
+  * ``str[] *args`` A parameter sequence of consecutive unigrams
+    to query the log probability for.
+  * **returns** A float with the natural logarithm (base *e*) of the
+    probability of the given unigram, bigram or trigram.
+
+  The probability of a *unigram* is
+  the adjusted frequency of the unigram divided by the sum of the
+  frequencies of all unigrams in the database.
+
+  The probability of a *bigram* ``(u1, u2)`` is the adjusted frequency
+  of the bigram divided by the adjusted frequency of the unigram ``u1``,
+  i.e. how likely ``u2`` is to succeed ``u1``.
+
+  The probability of a trigram ``(u1, u2, u3)`` is the adjusted frequency
+  of the trigram divided by the adjusted frequency of the bigram ``(u1, u2)``,
+  i.e. how likely ``u3`` is to succeed ``u1 u2``.
+
+  If you pass more than 3 arguments to ``ng.logprob()``, only the
+  last 3 are significant, and the query will be treated
+  as a trigram probability query.
+
+  Examples::
+
+    >>>> ng.logprob("stjórnarskrá")
+    -10.477290968535172
+    >>>> ng.logprob("stjórnarskrá", "lýðveldisins")
+    -4.308783672906165
+    >>>> ng.logprob("stjórnarskrá", "lýðveldisins", "Íslands")
+    -1.1239300966523995
+
+* ``succ(self, n, *args) -> list[tuple]``
+
+  Returns the *N* most probable successors of a unigram or bigram.
+
+  * ``int n`` A positive integer specifying how many successors,
+    at a maximum, should be returned.
+  * ``str[] *args`` One or two string parameters containing the
+    unigram or bigram to query the successors for.
+  * **returns** A list of tuples of (successor unigram, log probability),
+    in descending order of probability.
+
+  If you pass more than 2 string arguments to ``ng.succ()``, only the
+  last 2 are significant, and the query will be treated
+  as a bigram successor query.
+
+  Examples::
+
+    >>>> ng.succ(2, "stjórnarskrá")
+    [('.', -1.8259625296091855), ('landsins', -2.223111581475692)]
+    >>>> ng.succ(2, "stjórnarskrá", "lýðveldisins")
+    [('Íslands', -1.1239300966523995), ('og', -1.3862943611198904)]
+
+    >>>> # The following is equivalent to ng.succ(2, "lýðveldisins", "Íslands")
+
+    >>>> ng.succ(2, "stjórnarskrá", "lýðveldisins", "Íslands")
+    [('.', -1.3862943611198908), (',', -1.6545583477145702)]
+
+## Notes
+
+Icegrams is built with a sliding window over the source text. This means that
+a sentence such as ``"Maðurinn borðaði ísinn."`` results in the following
+trigrams being added to the database::
+
+   ("", "", "Maðurinn")
+   ("", "Maðurinn", "borðaði")
+   ("Maðurinn", "borðaði", "ísinn")
+   ("borðaði", "ísinn", ".")
+   ("ísinn", ".", "")
+   (".", "", "")
+
+The same sliding window strategy is applied for bigrams, so the following
+bigrams would be recorded for the same sentence::
+
+   ("", "Maðurinn")
+   ("Maðurinn", "borðaði")
+   ("borðaði", "ísinn")
+   ("ísinn", ".")
+   (".", "")
+
+You can thus obtain the N unigrams that most often start
+a sentence by asking for ``ng.succ(N, "")``.
+
+And, of course, four unigrams are also added, one for each token in the
+sentence.
+
+The tokenization of the source text into unigrams is done with the
+[Tokenizer package](https://pypi.org/project/tokenizer) and
+uses the rules documented there. Importantly, tokens other than words,
+abbreviations, entity names, person names and punctuation are
+**replaced by placeholders**. This means that all numbers are represented by the token
+``[NUMBER]``, amounts by ``[AMOUNT]``, dates by ``[DATEABS]`` and ``[DATEREL]``,
+e-mail addresses by ``[EMAIL]``, etc. For the complete mapping of token types
+to placeholder strings, see the
+[documentation for the Tokenizer package](https://github.com/mideind/Tokenizer/blob/master/README.rst).
+
+## Prerequisites
+
+This package runs on CPython 3.7 or newer, and on PyPy 3.7 or newer. It
+has been tested on Linux (gcc on x86-64 and ARMhf), MacOS (clang) and
+Windows (MSVC).
+
+If a binary wheel package isn't available on [PyPI](https://pypi.org)
+for your system, you may need to have the ``python3-dev`` package
+(or its Windows equivalent) installed on your system to set up
+Icegrams successfully. This is because a source distribution
+install requires a C++ compiler and linker::
+
+    # Debian or Ubuntu:
+    sudo apt-get install python3-dev
+
+## Installation
+
+To install this package::
+
+    pip install icegrams
+
+If you want to be able to edit the source, do like so (assuming you have **git** installed)::
+
+    git clone https://github.com/mideind/Icegrams
+    cd Icegrams
+    # [ Activate your virtualenv here if you have one ]
+    python setup.py develop
+
+The package source code is now in ``./src/icegrams``.
+
+## Tests
+
+To run the built-in tests, install [pytest](https://docs.pytest.org/en/latest/),
+``cd`` to your ``Icegrams`` subdirectory (and optionally activate your
+virtualenv), then run::
+
+    python -m pytest
+
+## Version History
+
+* Version 1.1.2: Minor bug fixes. Cross-platform wheels provided. Now requires Python 3.7+. (2022-12-14)
+* Version 1.1.0: Python 3.5 support dropped; macOS builds fixed; PyPy wheels
+  generated
+* Version 1.0.0: New trigram database sourced from the Icelandic Gigaword Corpus
+  (Risamálheild) with improved tokenization. Replaced GNU GPLv3 with MIT license.
+* Version 0.6.0: Python type annotations added
+* Version 0.5.0: Trigrams corpus has been spell-checked
+
+## License
+
+Icegrams is Copyright © 2022 [Miðeind ehf.](https://mideind.is)  
+The original author of this software is *Vilhjálmur Þorsteinsson*.
+
+This software is licensed under the **MIT License**:
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of the Software,
+and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/src/icegrams/trie_build.py b/src/icegrams/trie_build.py
index 50d889e..7285993 100644
--- a/src/icegrams/trie_build.py
+++ b/src/icegrams/trie_build.py
@@ -44,9 +44,9 @@
 # change it in setup.py as well
 ffibuilder = cffi.FFI()
 
-_PATH = os.path.dirname(__file__) or "."
 WINDOWS = platform.system() == "Windows"
 MACOS = platform.system() == "Darwin"
+IMPLEMENTATION = platform.python_implementation()
 
 # What follows is the actual Python-wrapped C interface to trie.*.so
 # It must be kept in sync with trie.h
@@ -100,12 +100,18 @@
     extra_compile_args = ["/Zc:offsetof-"]
 elif MACOS:
     os.environ["CFLAGS"] = "-stdlib=libc++"  # Fixes PyPy build on macOS 10.15.6+
+    os.environ["MACOSX_DEPLOYMENT_TARGET"] = "10.9"
     extra_compile_args = ["-mmacosx-version-min=10.7", "-stdlib=libc++"]
 else:
     # Adding -O3 to the compiler arguments doesn't seem to make
     # any discernible difference in lookup speed
     extra_compile_args = ["-std=c++11"]
 
+# On some systems, the linker needs to be told to use the C++ compiler
+# under PyPy due to changes in the default behaviour of distutils.
+if IMPLEMENTATION == "PyPy":
+    os.environ["LDCXXSHARED"] = "c++ -shared"
+
 ffibuilder.set_source(
     "icegrams._trie",
     # trie.cpp is written in C++ but must export a pure C interface.
@@ -120,4 +126,3 @@
 
 if __name__ == "__main__":
     ffibuilder.compile(verbose=False)
-

From 7be5386dfb40e2fcd3be9a140df75256de07e8a8 Mon Sep 17 00:00:00 2001
From: Sveinbjorn Thordarson <sveinbjorn@sveinbjorn.org>
Date: Tue, 27 Aug 2024 13:58:26 +0000
Subject: [PATCH 2/7] Migrated from pkg_resources over to importlib + use
 importlib for version export

---
 src/icegrams/__init__.py |  11 +-
 src/icegrams/ngrams.py   | 436 ++++++++++++++++++++-------------------
 2 files changed, 231 insertions(+), 216 deletions(-)

diff --git a/src/icegrams/__init__.py b/src/icegrams/__init__.py
index e35e796..9697f0b 100644
--- a/src/icegrams/__init__.py
+++ b/src/icegrams/__init__.py
@@ -33,10 +33,11 @@
 
 """
 
-# Expose the icegrams API
-
-from .ngrams import Ngrams, MAX_ORDER
+import importlib.metadata
 
 __author__ = "Miðeind ehf."
-__copyright__ = "(C) 2020 Miðeind ehf."
-__version__ = "1.1.2"
+__copyright__ = "(C) 2020-2024 Miðeind ehf."
+__version__ = importlib.metadata.version("icegrams")
+
+# Expose the icegrams API
+from .ngrams import Ngrams, MAX_ORDER
diff --git a/src/icegrams/ngrams.py b/src/icegrams/ngrams.py
index d910cd0..5011d59 100644
--- a/src/icegrams/ngrams.py
+++ b/src/icegrams/ngrams.py
@@ -110,8 +110,18 @@
 """
 
 from typing import (
-    List, Dict, Tuple, Set, Sized, Iterable, Optional, Any, Callable, IO,
-    cast, TYPE_CHECKING
+    List,
+    Dict,
+    Tuple,
+    Set,
+    Sized,
+    Iterable,
+    Optional,
+    Any,
+    Callable,
+    IO,
+    cast,
+    TYPE_CHECKING,
 )
 import time
 from collections import defaultdict
@@ -137,15 +147,20 @@
     # Running as a main program
     from _trie import lib as trie_cffi, ffi  # type: ignore  # pylint: disable=import-error
     from trie import Trie  # type: ignore
+
     BINARY_FILENAME = os.path.join(_PATH, "resources", "trigrams.bin")
 else:
     # Imported as a package
     from ._trie import lib as trie_cffi, ffi  # type: ignore  # pylint: disable=import-error,no-name-in-module
+
     # Make sure that the trigrams.bin file is
     # unpacked and ready for use
-    import pkg_resources
+    import importlib.resources as importlib_resources
+
+    ref = importlib_resources.files("icegrams").joinpath("resources/trigrams.bin")
+
     # Note: the resource path below should NOT use os.path.join()
-    BINARY_FILENAME = pkg_resources.resource_filename(__name__, "resources/trigrams.bin")
+    BINARY_FILENAME = str(ref)
 
 UINT32 = struct.Struct("<I")
 UINT16 = struct.Struct("<H")
@@ -165,73 +180,72 @@
 
 
 def to_bytes(s: str) -> bytes:
-    """ Convert string from normal Python representation to
-        a bytes string containing indices into the alphabet.
-        The indices are offset by 1 since 0 is not a valid
-        byte value. """
+    """Convert string from normal Python representation to
+    a bytes string containing indices into the alphabet.
+    The indices are offset by 1 since 0 is not a valid
+    byte value."""
     return bytes(ALPHABET.index(ch) + 1 for ch in s)
 
 
 def to_str(by: Iterable[int]) -> str:
-    """ Convert a sequence of byte indices into a normal Python string.
-        The byte indices are decremented by 1 before the conversion,
-        since 0 is not a valid byte index. """
+    """Convert a sequence of byte indices into a normal Python string.
+    The byte indices are decremented by 1 before the conversion,
+    since 0 is not a valid byte index."""
     return "".join(ALPHABET[b - 1] for b in by)
 
 
 class Ngrams:
-
-    """ A wrapper class around the n-gram store, allowing
-        queries for n-gram frequencies and probabilities.
-        The current n-gram store contains unigrams, bigrams and
-        trigrams. """
+    """A wrapper class around the n-gram store, allowing
+    queries for n-gram frequencies and probabilities.
+    The current n-gram store contains unigrams, bigrams and
+    trigrams."""
 
     def __init__(self) -> None:
         self.ngrams = NgramStorage()
         self.ngrams.load(BINARY_FILENAME)
 
     def __contains__(self, word: str) -> bool:
-        """ Return True if the word exists as a unigram """
+        """Return True if the word exists as a unigram"""
         return bool(word) and (self.ngrams.word_to_id(word) is not None)
 
     def freq(self, *args: str) -> int:
-        """ Return the frequency of the n-gram given in *args, where
-            1 <= n <= 3 """
+        """Return the frequency of the n-gram given in *args, where
+        1 <= n <= 3"""
         if not args:
             raise ValueError("Must provide at least one string argument")
         return self.ngrams.freq(*args)
 
     def adj_freq(self, *args: str) -> int:
-        """ Return the frequency of the n-gram given in *args, where
-            1 <= n <= 3. The frequency is adjusted so that n-grams
-            that do not occur in the database have frequency 1, and all
-            others have their actual frequency incremented by one. """
+        """Return the frequency of the n-gram given in *args, where
+        1 <= n <= 3. The frequency is adjusted so that n-grams
+        that do not occur in the database have frequency 1, and all
+        others have their actual frequency incremented by one."""
         if not args:
             raise ValueError("Must provide at least one string argument")
         return self.ngrams.freq(*args) + 1
 
     def logprob(self, *args: str) -> float:
-        """ Return the log of the approximate probability
-            of word w(n) given its predecessors w(1)..w(n-1),
-            for 1 <= n <= 3 (i.e. unigram, bigram or trigram) """
+        """Return the log of the approximate probability
+        of word w(n) given its predecessors w(1)..w(n-1),
+        for 1 <= n <= 3 (i.e. unigram, bigram or trigram)"""
         if not args:
             raise ValueError("Must provide at least one string argument")
         return self.ngrams.logprob(*args)
 
     def prob(self, *args: str) -> float:
-        """ Return the approximate probability (in the range (0.0..1.0],
-            note that it is never zero) of word w(n) given its
-            predecessors w(1)..w(n-1), for 1 <= n <= 3 (i.e. unigram,
-            bigram or trigram) """
+        """Return the approximate probability (in the range (0.0..1.0],
+        note that it is never zero) of word w(n) given its
+        predecessors w(1)..w(n-1), for 1 <= n <= 3 (i.e. unigram,
+        bigram or trigram)"""
         if not args:
             raise ValueError("Must provide at least one string argument")
         return math.exp(self.logprob(*args))
 
     def succ(self, n, *args: str) -> List[Tuple[str, float]]:
-        """ Returns a sorted list of length <= n with the most likely
-            successors to the words given, in descending order of
-            log probability. The list consists of tuples of
-            (word, log probability). """
+        """Returns a sorted list of length <= n with the most likely
+        successors to the words given, in descending order of
+        log probability. The list consists of tuples of
+        (word, log probability)."""
         if not isinstance(n, int) or n < 1:
             raise TypeError("Expected positive integer for parameter n")
         if not args:
@@ -239,18 +253,17 @@ def succ(self, n, *args: str) -> List[Tuple[str, float]]:
         return self.ngrams.succ(n, *args)
 
     def close(self) -> None:
-        """ Close the underlying storage and its memory map """
+        """Close the underlying storage and its memory map"""
         self.ngrams.close()
         self.ngrams = None  # type: ignore
 
 
 class BitArray:
-
-    """ BitArray implements a compressed array of bits.
-        Bits are indexed starting from the least significant
-        bit of each byte. Bit 0 is thus the lowest bit of
-        the first byte of the array and bit 7 is the highest
-        bit of that byte. """
+    """BitArray implements a compressed array of bits.
+    Bits are indexed starting from the least significant
+    bit of each byte. Bit 0 is thus the lowest bit of
+    the first byte of the array and bit 7 is the highest
+    bit of that byte."""
 
     def __init__(self) -> None:
         # Accumulator for completed bytes
@@ -263,13 +276,13 @@ def __init__(self) -> None:
         self.length = None  # type: Optional[int]
 
     def num_bits(self) -> int:
-        """ Return the total number of bits written to the byte array """
+        """Return the total number of bits written to the byte array"""
         return len(self.b) * 8 + self.bits
 
     def append(self, val: int, bits: int) -> None:
-        """ Append the given value to the BitArray, using the indicated
-            number of bits. The value is masked by this function before
-            adding it to the array. """
+        """Append the given value to the BitArray, using the indicated
+        number of bits. The value is masked by this function before
+        adding it to the array."""
         assert self.length is None
         if bits <= 0:
             raise ValueError("Bits parameter must be > 0")
@@ -285,8 +298,8 @@ def append(self, val: int, bits: int) -> None:
             self.bits -= 8
 
     def finish(self) -> None:
-        """ Optionally call this to complete writing any still
-            buffered bits to the byte array """
+        """Optionally call this to complete writing any still
+        buffered bits to the byte array"""
         assert self.length is None
         self.length = len(self.b) * 8 + self.bits
         if self.bits:
@@ -297,8 +310,8 @@ def finish(self) -> None:
             self.bits = 0
 
     def get(self, index: int, bits: int) -> int:
-        """ Obtain the value stored at the given bit index, using
-            the indicated number of bits """
+        """Obtain the value stored at the given bit index, using
+        the indicated number of bits"""
         if bits <= 0:
             raise ValueError("Bits parameter must be > 0")
         # Finish writing to the byte buffer
@@ -331,40 +344,39 @@ def get(self, index: int, bits: int) -> int:
         return buf & ((1 << bits) - 1)
 
     def to_bytes(self) -> bytes:
-        """ Finish the byte array and return it as a bytes object """
+        """Finish the byte array and return it as a bytes object"""
         if self.length is None:
             self.finish()
         return bytes(self.b)
 
     def __len__(self) -> int:
-        """ Return the length of this BitArray, in bytes """
+        """Return the length of this BitArray, in bytes"""
         return len(self.b) + (1 if self.bits else 0)
 
 
 class BaseList:
 
     def lookup(self, ix: int) -> int:
-        """ Should always be overridden in derived classes """
+        """Should always be overridden in derived classes"""
         raise NotImplementedError
 
     def __getitem__(self, ix: int) -> int:
-        """ Returns the integer at index ix within the sequence """
+        """Returns the integer at index ix within the sequence"""
         return self.lookup(ix)
 
     def lookup_pair(self, ix: int) -> Tuple[int, int]:
-        """ Return the pair of values at [ix] and [ix+1] """
+        """Return the pair of values at [ix] and [ix+1]"""
         raise NotImplementedError
 
 
 class MonotonicList(BaseList):
-
-    """ A MonotonicList stores a presorted, monotonically increasing
-        list of integers in a compact byte buffer using Elias-Fano
-        encoding. """
+    """A MonotonicList stores a presorted, monotonically increasing
+    list of integers in a compact byte buffer using Elias-Fano
+    encoding."""
 
     QUANTUM_SIZE = 128
 
-    def __init__(self, b: Optional[bytes]=None) -> None:
+    def __init__(self, b: Optional[bytes] = None) -> None:
         # If b is given, it should be a byte buffer of some sort
         # (usually a memoryview() object)
         self.b = b
@@ -374,13 +386,15 @@ def __init__(self, b: Optional[bytes]=None) -> None:
         self.low_bits = 0
         self.high_bits = 0
 
-    def compress(self, int_list: List[int], vocab_size: Optional[int]=None) -> None:
-        """ Compress a presorted, monotonically increasing list of integers
-            in int_list, all of them <= vocab_size (if given), to a bytes() object
-            and return it """
+    def compress(self, int_list: List[int], vocab_size: Optional[int] = None) -> None:
+        """Compress a presorted, monotonically increasing list of integers
+        in int_list, all of them <= vocab_size (if given), to a bytes() object
+        and return it"""
         self.n = n = len(int_list)
-        if n == 0 or n >= 2 ** 32:
-            raise ValueError("List must have more than zero and less than 2**32 elements")
+        if n == 0 or n >= 2**32:
+            raise ValueError(
+                "List must have more than zero and less than 2**32 elements"
+            )
 
         # Get vocabulary size
         if vocab_size is None:
@@ -464,9 +478,10 @@ def compress(self, int_list: List[int], vocab_size: Optional[int]=None) -> None:
         # of low and high bits, which is all we need for decompression
         parts = [
             UINT32.pack(self.n),
-            UINT16.pack(low_bits), UINT16.pack(high_bits),
+            UINT16.pack(low_bits),
+            UINT16.pack(high_bits),
             bytes(hbuf_index),
-            bytes(buf + hbuf)
+            bytes(buf + hbuf),
         ]
         # Align the byte block to a DWORD (32-bit) boundary
         frag = sum(len(p) for p in parts) & 3
@@ -476,51 +491,47 @@ def compress(self, int_list: List[int], vocab_size: Optional[int]=None) -> None:
         self.ffi_b = ffi.cast("uint8_t*", ffi.from_buffer(self.b))
 
     def to_bytes(self) -> bytes:
-        """ Return a bytes object containing the compressed list """
+        """Return a bytes object containing the compressed list"""
         assert self.b is not None
         return self.b
 
     def __str__(self) -> str:
         s = "MonotonicList: u is {0:,}, n is {1:,}\n".format(self.u, self.n)
-        s += (
-            "low_bits is {0}, high_bits is {1}, total range {2:,}\n"
-            .format(self.low_bits, self.high_bits, 2**(self.low_bits + self.high_bits) - 1)
+        s += "low_bits is {0}, high_bits is {1}, total range {2:,}\n".format(
+            self.low_bits, self.high_bits, 2 ** (self.low_bits + self.high_bits) - 1
         )
-        s += (
-            "size in bytes is {0:,} instead of straightforward {1:,}"
-            .format(
-                0 if self.b is None else len(self.b),
-                (self.n * int(math.log(self.u, 2) + 1.0) + 7) // 8
-            )
+        s += "size in bytes is {0:,} instead of straightforward {1:,}".format(
+            0 if self.b is None else len(self.b),
+            (self.n * int(math.log(self.u, 2) + 1.0) + 7) // 8,
         )
         return s
 
     def __len__(self) -> int:
-        """ Return the number of elements in the list """
+        """Return the number of elements in the list"""
         return self.n
 
     def lookup(self, ix: int) -> int:
-        """ Returns the integer at index ix within the sequence """
+        """Returns the integer at index ix within the sequence"""
         if self.ffi_b is None:
             raise ValueError("Lookup not allowed from uncompressed list")
         return trie_cffi.lookupMonotonic(self.ffi_b, self.QUANTUM_SIZE, ix)
 
-    def lookup_pair(self, ix:int) -> Tuple[int, int]:
-        """ Return the pair of values at [ix] and [ix+1] """
+    def lookup_pair(self, ix: int) -> Tuple[int, int]:
+        """Return the pair of values at [ix] and [ix+1]"""
         p1 = ffi.new("uint64_t*")
         p2 = ffi.new("uint64_t*")
         trie_cffi.lookupPairMonotonic(self.ffi_b, self.QUANTUM_SIZE, ix, p1, p2)
         return p1[0], p2[0]
 
     def search(self, p1: int, p2: int, i: int) -> Optional[int]:
-        """ Look for i in the range [p1, p2> within the list """
+        """Look for i in the range [p1, p2> within the list"""
         if self.ffi_b is None:
             raise ValueError("Search not allowed in uncompressed list")
         r = trie_cffi.searchMonotonic(self.ffi_b, self.QUANTUM_SIZE, p1, p2, i)
         return None if r == 0xFFFFFFFF else r
 
     def search_prefix(self, p1: int, p2: int, i: int) -> Optional[int]:
-        """ Look for i in the range [p1, p2> within the list """
+        """Look for i in the range [p1, p2> within the list"""
         if self.ffi_b is None:
             raise ValueError("Search not allowed in uncompressed list")
         r = trie_cffi.searchMonotonicPrefix(self.ffi_b, self.QUANTUM_SIZE, p1, p2, i)
@@ -528,26 +539,25 @@ def search_prefix(self, p1: int, p2: int, i: int) -> Optional[int]:
 
 
 class PartitionedMonotonicList(BaseList):
-
-    """ A PartitionedMonotonicList consists of a list
-        of Elias-Fano lists, with the trick being that
-        each sublist is encoded with its own item
-        sequence, after subtracting the value of the
-        first item of the list (which is stored in
-        the first level list). """
+    """A PartitionedMonotonicList consists of a list
+    of Elias-Fano lists, with the trick being that
+    each sublist is encoded with its own item
+    sequence, after subtracting the value of the
+    first item of the list (which is stored in
+    the first level list)."""
 
     QUANTUM_SIZE = 1 << 11
 
-    def __init__(self, b: Optional[bytes]=None):
+    def __init__(self, b: Optional[bytes] = None):
         self.b = b
         self.ffi_b = None if b is None else ffi.cast("uint8_t*", ffi.from_buffer(b))
 
     def compress(self, int_list: List[int]) -> None:
-        """ Compress int_list into a two-level partitioned
-            Elias-Fano list, where the lower level consists
-            of sublists of length <= QUANTUM_SIZE, and the
-            upper level consists of a list of the values of
-            the first items of the sublists. """
+        """Compress int_list into a two-level partitioned
+        Elias-Fano list, where the lower level consists
+        of sublists of length <= QUANTUM_SIZE, and the
+        upper level consists of a list of the values of
+        the first items of the sublists."""
 
         # The upper level list
         chunks = []
@@ -611,7 +621,7 @@ def compress(self, int_list: List[int]) -> None:
             UINT32.pack(len(chunk_index)),
             b"".join(UINT32.pack(pos + offset) for pos in chunk_index),
             chunk_bytes,
-            merged_buf
+            merged_buf,
         ]
         # Align the byte block to a DWORD (32-bit) boundary
         frag = sum(len(p) for p in parts) & 3
@@ -621,16 +631,16 @@ def compress(self, int_list: List[int]) -> None:
         self.ffi_b = ffi.cast("uint8_t*", ffi.from_buffer(self.b))
 
     def to_bytes(self) -> bytes:
-        """ Return the byte buffer containing the compressed list """
+        """Return the byte buffer containing the compressed list"""
         assert self.b is not None
         return self.b
 
     def __len__(self) -> int:
-        """ Return the compressed list size in bytes """
+        """Return the compressed list size in bytes"""
         return 0 if self.b is None else len(self.b)
 
     def lookup(self, ix: int) -> int:
-        """ Lookup a value from the compressed list, by index """
+        """Lookup a value from the compressed list, by index"""
         if self.ffi_b is None:
             raise ValueError("Lookup not allowed from uncompressed list")
         return trie_cffi.lookupPartition(
@@ -638,7 +648,7 @@ def lookup(self, ix: int) -> int:
         )
 
     def lookup_pair(self, ix: int) -> Tuple[int, int]:
-        """ Return the pair of values at [ix] and [ix+1] """
+        """Return the pair of values at [ix] and [ix+1]"""
         p1 = ffi.new("uint64_t*")
         p2 = ffi.new("uint64_t*")
         trie_cffi.lookupPairPartition(
@@ -650,8 +660,7 @@ def search(self, p1: int, p2: int, i: int) -> Optional[int]:
         if self.ffi_b is None:
             raise ValueError("Search not allowed in uncompressed list")
         r = trie_cffi.searchPartition(
-            self.ffi_b, self.QUANTUM_SIZE, MonotonicList.QUANTUM_SIZE,
-            p1, p2, i
+            self.ffi_b, self.QUANTUM_SIZE, MonotonicList.QUANTUM_SIZE, p1, p2, i
         )
         return None if r == 0xFFFFFFFF else r
 
@@ -659,15 +668,13 @@ def search_prefix(self, p1: int, p2: int, i: int) -> Optional[int]:
         if self.ffi_b is None:
             raise ValueError("Search not allowed in uncompressed list")
         r = trie_cffi.searchPartitionPrefix(
-            self.ffi_b, self.QUANTUM_SIZE, MonotonicList.QUANTUM_SIZE,
-            p1, p2, i
+            self.ffi_b, self.QUANTUM_SIZE, MonotonicList.QUANTUM_SIZE, p1, p2, i
         )
         return None if r == 0xFFFFFFFF else r
 
 
 class _Level:
-
-    """ A level within a trigram tree structure """
+    """A level within a trigram tree structure"""
 
     __slots__ = ("cnt", "d")
 
@@ -686,9 +693,8 @@ def reset(self, depth: int) -> None:
 
 
 class NgramStorage:
-
-    """ NgramStorage wraps the compressed binary representation of
-        the trigram store """
+    """NgramStorage wraps the compressed binary representation of
+    the trigram store"""
 
     # We store an index position in the frequency array once
     # every FREQ_QUANTUM_SIZE frequency values
@@ -703,7 +709,7 @@ class NgramStorage:
     # this makes lookup faster for the most-used words.
     VOCAB_INDEX_CUTOFF = 1024
 
-    VERSION = b'Reynir 001.00.00'
+    VERSION = b"Reynir 001.00.00"
     assert len(VERSION) == 16
 
     # Note that the trie offset must be the first header
@@ -746,28 +752,26 @@ def __init__(self) -> None:
         self._trigram_freqs = bytes()
         self._vocab = bytes()
 
-
     def compress(
-        self, tsv_filename: str, binary_filename: str, *, add_all_bigrams: bool=False
+        self, tsv_filename: str, binary_filename: str, *, add_all_bigrams: bool = False
     ):
-        """ Create a new compressed binary file from a trigram text (.tsv) file.
-            If add_all_bigrams is True, then for each input trigram (w0, w1, w2)
-            we add both (w0, w1) and (w1, w2) as bigrams. Otherwise, we add only
-            (w0, w1) - and assume that (w1, w2, w3) is also present as a trigram
-            causing (w1, w2) to be implicitly added. """
+        """Create a new compressed binary file from a trigram text (.tsv) file.
+        If add_all_bigrams is True, then for each input trigram (w0, w1, w2)
+        we add both (w0, w1) and (w1, w2) as bigrams. Otherwise, we add only
+        (w0, w1) - and assume that (w1, w2, w3) is also present as a trigram
+        causing (w1, w2) to be implicitly added."""
         self.read_tsv(tsv_filename, add_all_bigrams=add_all_bigrams)
         self.write_binary(binary_filename)
 
     def word_to_id(self, word: str) -> Optional[int]:
-        """ Obtain the unigram id for the given word by
-            calling the C++ mapping() function from
-            trie.cpp that has been wrapped using CFFI """
+        """Obtain the unigram id for the given word by
+        calling the C++ mapping() function from
+        trie.cpp that has been wrapped using CFFI"""
         if word == "":
             return 0
         try:
             m = trie_cffi.mapping(
-                ffi.cast("uint8_t*", self._mmap_buffer),
-                to_bytes(word)
+                ffi.cast("uint8_t*", self._mmap_buffer), to_bytes(word)
             )
         except ValueError:
             # The word contains a character that is not in our alphabet
@@ -775,7 +779,7 @@ def word_to_id(self, word: str) -> Optional[int]:
         return None if m == 0xFFFFFFFF else m
 
     def id_to_word(self, n: int) -> str:
-        """ Convert a vocabulary index back to the original unigram text """
+        """Convert a vocabulary index back to the original unigram text"""
         if n < self.VOCAB_INDEX_CUTOFF:
             # For low ids, we have an index entry for every id
             q, r = n, 0
@@ -805,13 +809,13 @@ def id_to_word(self, n: int) -> str:
         return to_str(self._compressed_vocab[start:end])
 
     def indices(self, *args: str) -> Tuple[Optional[int], ...]:
-        """ Convert word strings to vocabulary indices, or None
-            if the word is not found in the vocabulary """
+        """Convert word strings to vocabulary indices, or None
+        if the word is not found in the vocabulary"""
         return tuple(self.word_to_id(w) for w in args)
 
     def lookup_frequency(self, level: int, b: bytes, index: Optional[int]) -> int:
-        """ Look up the frequency with the given index,
-            stored in the byte buffer b """
+        """Look up the frequency with the given index,
+        stored in the byte buffer b"""
         if index is None:
             return 0
         buf = ffi.from_buffer(b)
@@ -823,19 +827,19 @@ def lookup_frequency(self, level: int, b: bytes, index: Optional[int]) -> int:
         return self.freqs[level][rank]
 
     def unigram_frequency(self, i0: Optional[int]) -> int:
-        """ Return the frequency of the unigram i0,
-            specified as a vocabulary index. """
+        """Return the frequency of the unigram i0,
+        specified as a vocabulary index."""
         return self.lookup_frequency(1, self._unigram_freqs, i0)
 
     def unigram_logprob(self, i0: Optional[int]) -> float:
-        """ Return the log of the probability of the unigram
-            given by vocabulary index i0, relative to the entire
-            unigram frequency count """
+        """Return the log of the probability of the unigram
+        given by vocabulary index i0, relative to the entire
+        unigram frequency count"""
         return math.log(self.unigram_frequency(i0) + 1) - self.log_ucnt
 
     def bigram_frequency(self, i0: Optional[int], i1: Optional[int]) -> int:
-        """ Return the frequency of the bigram (i0, i1),
-            given as vocabulary indices. """
+        """Return the frequency of the bigram (i0, i1),
+        given as vocabulary indices."""
         # Look up the pointer range for i0 in the unigram pointers
         if i0 is None or i1 is None:
             return 0
@@ -848,19 +852,18 @@ def bigram_frequency(self, i0: Optional[int], i1: Optional[int]) -> int:
         return self.lookup_frequency(2, self._bigram_freqs, i)
 
     def bigram_logprob(self, i0: Optional[int], i1: Optional[int]) -> float:
-        """ Return the log of the probability of the bigram
-            consisting of vocabulary indices i0 and i1,
-            relative to the unigram frequency of i0 """
-        return (
-            math.log(self.bigram_frequency(i0, i1) + 1)
-            - math.log(self.unigram_frequency(i0) + 1)
+        """Return the log of the probability of the bigram
+        consisting of vocabulary indices i0 and i1,
+        relative to the unigram frequency of i0"""
+        return math.log(self.bigram_frequency(i0, i1) + 1) - math.log(
+            self.unigram_frequency(i0) + 1
         )
 
     def trigram_frequency(
         self, i0: Optional[int], i1: Optional[int], i2: Optional[int]
     ) -> int:
-        """ Return the frequency of the trigram (i0, i1, i2),
-            given as vocabulary indices. """
+        """Return the frequency of the trigram (i0, i1, i2),
+        given as vocabulary indices."""
         # Look up the pointer range for i0 in the unigram pointers
         if i0 is None or i1 is None or i2 is None:
             return 0
@@ -897,23 +900,24 @@ def trigram_frequency(
     def trigram_logprob(
         self, i0: Optional[int], i1: Optional[int], i2: Optional[int]
     ) -> float:
-        """ Return the log of the probability of the trigram
-            consisting of vocabulary indices i0, i1 and i2,
-            relative to the bigram of i0 and i1 """
-        return (
-            math.log(self.trigram_frequency(i0, i1, i2) + 1)
-            - math.log(self.bigram_frequency(i0, i1) + 1)
+        """Return the log of the probability of the trigram
+        consisting of vocabulary indices i0, i1 and i2,
+        relative to the bigram of i0 and i1"""
+        return math.log(self.trigram_frequency(i0, i1, i2) + 1) - math.log(
+            self.bigram_frequency(i0, i1) + 1
         )
 
     _FREQ_DISPATCH = {
-        1: unigram_frequency, 2: bigram_frequency, 3: trigram_frequency
+        1: unigram_frequency,
+        2: bigram_frequency,
+        3: trigram_frequency,
     }  # type: Dict[int, Callable[..., int]]
 
     def freq(self, *args: str) -> int:
-        """ Return the frequency of the n-gram given in *args, where
-            1 <= n <= 3. The frequency is adjusted so that n-grams
-            that do not occur in the database have frequency 1, and all
-            others have their actual frequency incremented by one. """
+        """Return the frequency of the n-gram given in *args, where
+        1 <= n <= 3. The frequency is adjusted so that n-grams
+        that do not occur in the database have frequency 1, and all
+        others have their actual frequency incremented by one."""
         if len(args) > MAX_ORDER:
             # Allow more than 3 arguments, but then we only return the
             # trigram probability of the last 3
@@ -921,13 +925,15 @@ def freq(self, *args: str) -> int:
         return self._FREQ_DISPATCH[len(args)](self, *self.indices(*args))
 
     _PROB_DISPATCH = {
-        1: unigram_logprob, 2: bigram_logprob, 3: trigram_logprob
+        1: unigram_logprob,
+        2: bigram_logprob,
+        3: trigram_logprob,
     }  # type: Dict[int, Callable[..., float]]
 
     def logprob(self, *args: str) -> float:
-        """ Return the log of the approximate probability
-            of word w(n) given its predecessors w(1)..w(n-1),
-            for 1 <= n <= 3 (i.e. unigram, bigram or trigram) """
+        """Return the log of the approximate probability
+        of word w(n) given its predecessors w(1)..w(n-1),
+        for 1 <= n <= 3 (i.e. unigram, bigram or trigram)"""
         if len(args) > MAX_ORDER:
             # Allow more than 3 arguments, but then we only return the
             # trigram probability of the last 3
@@ -935,7 +941,7 @@ def logprob(self, *args: str) -> float:
         return self._PROB_DISPATCH[len(args)](self, *self.indices(*args))
 
     def unigram_succ(self, n: int, i0: int) -> List[Tuple[str, float]]:
-        """ Return successors to the unigram whose id is in i0 """
+        """Return successors to the unigram whose id is in i0"""
         if i0 is None:
             return []
         p1, p2 = self._unigram_ptrs_ml.lookup_pair(i0)
@@ -948,11 +954,11 @@ def unigram_succ(self, n: int, i0: int) -> List[Tuple[str, float]]:
             j = self._bigram_pl.lookup(i) - prefix_sum
             lpi = math.log(self.lookup_frequency(2, self._bigram_freqs, i) + 1)
             result.append((j, lpi - lp0))
-        result = sorted(result, key=lambda e:e[1], reverse=True)[0:n]
+        result = sorted(result, key=lambda e: e[1], reverse=True)[0:n]
         return [(self.id_to_word(j), lp) for j, lp in result]
 
     def bigram_succ(self, n: int, i0: int, i1: int) -> List[Tuple[str, float]]:
-        """ Return successors to the bigram (i0, i1) """
+        """Return successors to the bigram (i0, i1)"""
         if i0 is None or i1 is None:
             return []
         p1, p2 = self._unigram_ptrs_ml.lookup_pair(i0)
@@ -979,24 +985,25 @@ def bigram_succ(self, n: int, i0: int, i1: int) -> List[Tuple[str, float]]:
             j = self._bigram_pl.lookup(q1 + remapped_id) - prefix_sum_bi
             lpi = math.log(self.lookup_frequency(3, self._trigram_freqs, i) + 1)
             result.append((j, lpi - lp0))
-        result = sorted(result, key=lambda e:e[1], reverse=True)[0:n]
+        result = sorted(result, key=lambda e: e[1], reverse=True)[0:n]
         return [(self.id_to_word(j), lp) for j, lp in result]
 
     _SUCC_DISPATCH = {
-        1: unigram_succ, 2: bigram_succ
+        1: unigram_succ,
+        2: bigram_succ,
     }  # type: Dict[int, Callable[..., List[Tuple[str, float]]]]
 
     def succ(self, n: int, *args: str) -> List[Tuple[str, float]]:
-        """ Return a list of likely successors to the words
-            in *args, of length <= n. The list consists of
-            tuples of (word, log probability), in descending
-            order of log probability. """
+        """Return a list of likely successors to the words
+        in *args, of length <= n. The list consists of
+        tuples of (word, log probability), in descending
+        order of log probability."""
         if len(args) >= MAX_ORDER:
-            args = args[-(MAX_ORDER - 1):]
+            args = args[-(MAX_ORDER - 1) :]
         return self._SUCC_DISPATCH[len(args)](self, n, *self.indices(*args))
 
-    def read_tsv(self, fname: str, *, add_all_bigrams: bool=False) -> None:
-        """ Populate the trigram database from a tab-separated (.tsv) file """
+    def read_tsv(self, fname: str, *, add_all_bigrams: bool = False) -> None:
+        """Populate the trigram database from a tab-separated (.tsv) file"""
         print("Reading {0}, first pass...".format(fname), flush=True)
         t0 = time.time()
 
@@ -1030,17 +1037,17 @@ def read_tsv(self, fname: str, *, add_all_bigrams: bool=False) -> None:
                     vocab[to_bytes(w2)] += 1
         # Trie that maps unigrams to integer identifiers
         using_empty = b"" in vocab
-        trie = Trie(reserve_zero_for_empty=using_empty)  # pylint: disable=used-before-assignment
+        trie = Trie(
+            reserve_zero_for_empty=using_empty
+        )  # pylint: disable=used-before-assignment
         # Dict to map words to integer ids
-        ids = { b"": 0 } if using_empty else {}
+        ids = {b"": 0} if using_empty else {}
         # Build the trie in decreasing order of occurrences, ensuring that
         # the most common unigrams get the lowest indices
         if using_empty:
             # Hack to make sure that the blank entry goes to the front of the list
             vocab[b""] = 10**50
-        vocab_list = sorted(
-            vocab.items(), key=lambda item: item[1], reverse=True
-        )
+        vocab_list = sorted(vocab.items(), key=lambda item: item[1], reverse=True)
         assert not using_empty or vocab_list[0][0] == b""
         del vocab
         for unigram_id, (w, c) in enumerate(vocab_list):
@@ -1061,22 +1068,20 @@ def read_tsv(self, fname: str, *, add_all_bigrams: bool=False) -> None:
         # The index consists of w1...w1023 followed by w1024,w1088,...
         for ix, (w, _) in enumerate(vocab_list):
             if ix and (
-                ix % self.VOCAB_QUANTUM_SIZE == 0
-                or ix < self.VOCAB_INDEX_CUTOFF
+                ix % self.VOCAB_QUANTUM_SIZE == 0 or ix < self.VOCAB_INDEX_CUTOFF
             ):
                 compressed_index.extend(UINT32.pack(len(compressed_vocab)))
             compressed_vocab.extend(w + b"\x00")
         parts = [
             UINT32.pack(len(compressed_index) // 4),
             compressed_index,
-            gzip.compress(compressed_vocab)
+            gzip.compress(compressed_vocab),
         ]
         self.compressed_vocab = b"".join(parts)
         print(
             "Compressed vocabulary including index is {0:,} bytes, "
-            "{1:,} uncompressed, {2:,} index"
-            .format(len(self.compressed_vocab), len(compressed_vocab),
-                len(compressed_index)
+            "{1:,} uncompressed, {2:,} index".format(
+                len(self.compressed_vocab), len(compressed_vocab), len(compressed_index)
             )
         )
         del vocab_list
@@ -1209,21 +1214,21 @@ def count_level(depth, level):
             print("Level {0}: Frequency buckets are {1}".format(k, len(v)))
         # For each level, create a dict of indices into an ascending list of frequencies
         self.fbuckets = {
-            k: { f: ix for ix, f in enumerate(sorted(list(v))) }
-            for k, v in freqs.items()
+            k: {f: ix for ix, f in enumerate(sorted(list(v)))} for k, v in freqs.items()
         }
 
         t1 = time.time()
         print(
             "Done in {3:.1f} sec, trigram count is {0:,}, "
-            "voc size is {1:,}, unigram count {2:,}"
-            .format(cnt, len(trie), ucnt, t1 - t0)
+            "voc size is {1:,}, unigram count {2:,}".format(
+                cnt, len(trie), ucnt, t1 - t0
+            )
         )
 
     def write_unigram_pointers(self, f: IO[Any]) -> None:
-        """ Unigram sequence: we write pointers to the next level
-            for every unigram id. Some ids may not have an associated
-            next level, in which case their range is zero. """
+        """Unigram sequence: we write pointers to the next level
+        for every unigram id. Some ids may not have an associated
+        next level, in which case their range is zero."""
         level = self.level0
         assert level is not None
         # Initialize the pointer list, which always starts with a 0
@@ -1249,7 +1254,7 @@ def write_unigram_pointers(self, f: IO[Any]) -> None:
         print("Uni-pointers: {0}\n".format(ml))
 
     def write_unigram_frequencies(self, f: IO[Any]) -> None:
-        """ Write the unigram frequency data """
+        """Write the unigram frequency data"""
         if self.trie is None:
             len_trie = 0
         else:
@@ -1261,13 +1266,11 @@ def write_unigram_frequencies(self, f: IO[Any]) -> None:
         d = self.level0.d
         assert d is not None
         pos = f.tell()
-        self.write_frequencies(
-            f, [freqs[d[i].cnt] for i in range(len_trie)]
-        )
+        self.write_frequencies(f, [freqs[d[i].cnt] for i in range(len_trie)])
         print("Uni-frequencies occupy {0:,} bytes.".format(f.tell() - pos))
 
     def write_bigram_and_trigram_levels(self, f: IO[Any]) -> Tuple[int, int, int, int]:
-        """ Write the bigram and trigram levels to the binary file """
+        """Write the bigram and trigram levels to the binary file"""
         level0 = self.level0
         assert level0 is not None
         assert self.fbuckets is not None
@@ -1364,7 +1367,9 @@ def sorted_child_ids(w0: int) -> List[int]:
         print("\nBi-ids are {0:,}".format(len(bi_ids)))
         pl.compress(bi_ids)
         f.write(pl.to_bytes())
-        print("Bi_ids compressed with partitions: {0:,} bytes".format(len(pl.to_bytes())))
+        print(
+            "Bi_ids compressed with partitions: {0:,} bytes".format(len(pl.to_bytes()))
+        )
 
         print("Bi-pointers are {0:,}".format(len(ptrs)))
         ml.compress(ptrs)
@@ -1378,7 +1383,9 @@ def sorted_child_ids(w0: int) -> List[int]:
         pl.compress(tri_ids)
         tri_id_loc = f.tell()
         f.write(pl.to_bytes())
-        print("Tri_ids compressed with partitions: {0:,} bytes".format(len(pl.to_bytes())))
+        print(
+            "Tri_ids compressed with partitions: {0:,} bytes".format(len(pl.to_bytes()))
+        )
 
         del pl
         del ml
@@ -1394,7 +1401,7 @@ def sorted_child_ids(w0: int) -> List[int]:
         return bi_fq_loc, tri_fq_loc, bi_ptr_loc, tri_id_loc
 
     def write_frequencies(self, f: IO[Any], freq_ranks: List[int]) -> None:
-        """ Write an array containing frequency ranks in a minimal number of bits """
+        """Write an array containing frequency ranks in a minimal number of bits"""
         # Create a dictionary of code words for each frequency rank,
         # using the fewest bits for the most frequent ranks
         codebook = dict()  # type: Dict[int, Tuple[int, int]]
@@ -1403,7 +1410,7 @@ def write_frequencies(self, f: IO[Any], freq_ranks: List[int]) -> None:
         for fqr in freq_ranks:
             cnt[fqr] += 1
         # Sort the frequency ranks in descending order by how common they are
-        sorted_freq_ranks = sorted(cnt.items(), key=lambda e:e[1], reverse=True)
+        sorted_freq_ranks = sorted(cnt.items(), key=lambda e: e[1], reverse=True)
         # Allocate code words to ranks in descending order of frequency
         for ix, (rank, _) in enumerate(sorted_freq_ranks):
             # Number of bits for this code word
@@ -1452,7 +1459,7 @@ def write_frequencies(self, f: IO[Any], freq_ranks: List[int]) -> None:
         f.write(startbits.to_bytes())
 
     def write_binary(self, fname: str) -> None:
-        """ Write a compressed form of the trigram database to a file """
+        """Write a compressed form of the trigram database to a file"""
         print("Writing file '{0}'...".format(fname))
         # Create a byte buffer stream
         f = io.BytesIO()
@@ -1475,8 +1482,8 @@ class Headers:
             f.write(UINT32.pack(0))
 
         def write_padded(b: bytes, n: int) -> None:
-            """ Write bytes to the file f with padding
-                so that they align to n """
+            """Write bytes to the file f with padding
+            so that they align to n"""
             # Align to 4 bytes first
             pos = f.tell() & 3
             if pos:
@@ -1484,9 +1491,9 @@ def write_padded(b: bytes, n: int) -> None:
             assert len(b) <= n
             f.write(b + b"\x00" * (n - len(b)))
 
-        def fixup(ptr: int, loc: Optional[int]=None) -> None:
-            """ Go back and fix up a previous pointer to point at the
-                current offset in the stream """
+        def fixup(ptr: int, loc: Optional[int] = None) -> None:
+            """Go back and fix up a previous pointer to point at the
+            current offset in the stream"""
             fix = f.tell() if loc is None else loc
             f.seek(ptr)
             f.write(UINT32.pack(fix))
@@ -1535,14 +1542,20 @@ def fixup(ptr: int, loc: Optional[int]=None) -> None:
 
         # Write the bigram and trigram levels
         fixup(h.bigrams_offset)
-        bi_fq_loc, tri_fq_loc, bi_ptr_loc, tri_id_loc = self.write_bigram_and_trigram_levels(f)
+        bi_fq_loc, tri_fq_loc, bi_ptr_loc, tri_id_loc = (
+            self.write_bigram_and_trigram_levels(f)
+        )
         fixup(h.bigram_freqs_offset, bi_fq_loc)
         fixup(h.trigram_freqs_offset, tri_fq_loc)
         fixup(h.bigram_ptrs_offset, bi_ptr_loc)
         fixup(h.trigrams_offset, tri_id_loc)
         f.seek(0, io.SEEK_END)
 
-        print("Bigram and trigram levels take a total of {0:,} bytes.".format(f.tell() - pos))
+        print(
+            "Bigram and trigram levels take a total of {0:,} bytes.".format(
+                f.tell() - pos
+            )
+        )
 
         # Write vocabulary
         write_padded(b"[vocab]", 16)
@@ -1555,7 +1568,7 @@ def fixup(ptr: int, loc: Optional[int]=None) -> None:
             stream.write(f.getvalue())
 
     def load(self, fname: str) -> None:
-        """ Open a compressed trigram file and map it into memory """
+        """Open a compressed trigram file and map it into memory"""
         with open(fname, "rb") as stream:
             self._b = mmap.mmap(stream.fileno(), 0, access=mmap.ACCESS_READ)
 
@@ -1570,9 +1583,8 @@ def load(self, fname: str) -> None:
         for hdr, val in zip(
             self._HEADERS,
             struct.unpack(
-                "<" + "I" * self._NUM_HEADERS,
-                mb[16:16 + 4 * self._NUM_HEADERS]
-            )
+                "<" + "I" * self._NUM_HEADERS, mb[16 : 16 + 4 * self._NUM_HEADERS]
+            ),
         ):
             # Assign the file sections to attributes
             # of the self object
@@ -1595,7 +1607,9 @@ def load(self, fname: str) -> None:
 
         # Load the vocabulary buffer
         num_compressed_index = UINT32.unpack_from(self._vocab[0:4], 0)[0]
-        self._compressed_vocab = gzip.decompress(self._vocab[4 + 4 * num_compressed_index:])
+        self._compressed_vocab = gzip.decompress(
+            self._vocab[4 + 4 * num_compressed_index :]
+        )
 
         # Load the freqs rank list into memory
         self.freqs = []
@@ -1614,7 +1628,7 @@ def load(self, fname: str) -> None:
         self.log_ucnt = math.log(ucnt + 1)
 
     def close(self) -> None:
-        """ Close the memory map and destroy all references to it """
+        """Close the memory map and destroy all references to it"""
         if self._b is not None:
             for hdr in self._HEADERS:
                 setattr(self, hdr, None)

From 587ede0728301f9ff9158c36dad7707e179797cd Mon Sep 17 00:00:00 2001
From: Sveinbjorn Thordarson <sveinbjorn@sveinbjorn.org>
Date: Tue, 27 Aug 2024 13:59:55 +0000
Subject: [PATCH 3/7] Cleanup, rm unused files

---
 .gitignore             |   3 +
 MANIFEST.in            |   2 +
 README.md              | 404 -----------------------------------------
 test.py => old/test.py |   0
 4 files changed, 5 insertions(+), 404 deletions(-)
 delete mode 100644 README.md
 rename test.py => old/test.py (100%)

diff --git a/.gitignore b/.gitignore
index b0fcd31..94e6234 100644
--- a/.gitignore
+++ b/.gitignore
@@ -60,6 +60,9 @@ console/
 # Virtual environments
 venv
 p35*/
+p312
+p313
+p314
 
 # Installer logs
 pip-log.txt
diff --git a/MANIFEST.in b/MANIFEST.in
index e331ac2..04027fb 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -5,5 +5,7 @@ prune test
 exclude src/icegrams/*.o
 exclude src/icegrams/*.so
 exclude src/icegrams/*.pyd
+exclude src/icegrams/*.DS_Store
+exclude src/icegrams/resources/*.DS_Store
 prune src/icegrams/resources
 include src/icegrams/resources/trigrams.bin
diff --git a/README.md b/README.md
deleted file mode 100644
index c7db1c4..0000000
--- a/README.md
+++ /dev/null
@@ -1,404 +0,0 @@
-[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
-[![Python 3.7](https://img.shields.io/badge/python-3.7-blue.svg)](https://www.python.org/downloads/release/python-370/)
-![Release](https://shields.io/github/v/release/mideind/Icegrams?display_name=tag)
-![PyPI](https://img.shields.io/pypi/v/icegrams)
-![Build](https://github.com/mideind/Icegrams/actions/workflows/python-package.yml/badge.svg)
-
-# Icegrams
-
-**A fast, compact trigram library for Icelandic**
-
-## Overview
-
-**Icegrams** is an MIT-licensed Python 3 (>= 3.7) package that encapsulates a
-**large trigram library for Icelandic**. (A trigram is a tuple of
-three consecutive words or tokens that appear in real-world text.)
-
-14 million unique trigrams and their frequency counts are heavily compressed
-using radix tries and `quasi-succinct indexes <https://arxiv.org/abs/1206.4300>`_
-employing Elias-Fano encoding. This enables the ~43 megabyte compressed trigram file
-to be mapped directly into memory, with no *ex ante* decompression, for fast queries
-(typically ~10 microseconds per lookup).
-
-The Icegrams library is implemented in Python and C/C++, glued together via
-`CFFI <https://cffi.readthedocs.io/en/latest/>`_.
-
-The trigram storage approach is based on a
-`2017 paper by Pibiri and Venturini <http://pages.di.unipi.it/pibiri/papers/SIGIR17.pdf>`_,
-also referring to
-`Ottaviano and Venturini <http://www.di.unipi.it/~ottavian/files/elias_fano_sigir14.pdf>`_
-(2014) regarding partitioned Elias-Fano indexes.
-
-You can use Icegrams to obtain probabilities (relative frequencies) of
-over a million different **unigrams** (single words or tokens), or of
-**bigrams** (pairs of two words or tokens), or of **trigrams**. You can also
-ask it to return the N most likely successors to any unigram or bigram.
-
-Icegrams is useful for instance in spelling correction, predictive typing,
-to help disabled people write text faster, and for various text generation,
-statistics and modelling tasks.
-
-The Icegrams trigram corpus is built from the 2017 edition of the
-Icelandic Gigaword Corpus
-(`Risamálheild <https://malheildir.arnastofnun.is/?mode=rmh2017>`_),
-which is collected and maintained by *The Árni Magnússon Institute*
-*for Icelandic Studies*. A mixed, manually vetted subset consisting of 157
-documents from the corpus was used as the source of the token stream,
-yielding over 100 million tokens. Trigrams that only occurred
-once or twice in the stream were eliminated before creating the
-compressed Icegrams database. The creation process is further
-`described here <https://github.com/mideind/Icegrams/blob/master/doc/overview.md>`_.
-
-## Example
-
->>> from icegrams import Ngrams
->>> ng = Ngrams()
-
->>> # Obtain the frequency of the unigram 'Ísland'
-
->>> ng.freq("Ísland")
-42018
-
->>> # Obtain the probability of the unigram 'Ísland', as a fraction
-
->>> # of the frequency of all unigrams in the database
-
->>> ng.prob("Ísland")
-0.0003979926900206475
-
->>> # Obtain the log probability (base e) of the unigram 'Ísland'
-
->>> ng.logprob("Ísland")
--7.8290769196308005
-
->>> # Obtain the frequency of the bigram 'Katrín Jakobsdóttir'
-
->>> ng.freq("Katrín", "Jakobsdóttir")
-3517
-
->>> # Obtain the probability of 'Jakobsdóttir' given 'Katrín'
-
->>> ng.prob("Katrín", "Jakobsdóttir")
-0.23298013245033142
-
->>> # Obtain the probability of 'Júlíusdóttir' given 'Katrín'
-
->>> ng.prob("Katrín", "Júlíusdóttir")
-0.013642384105960274
-
->>> # Obtain the frequency of 'velta fyrirtækisins er'
-
->>> ng.freq("velta", "fyrirtækisins", "er")
-4
-
->>> # adj_freq returns adjusted frequencies, i.e incremented by 1
-
->>> ng.adj_freq("xxx", "yyy", "zzz")
-1
-
->>> # Obtain the N most likely successors of a given unigram or bigram
-
->>> # in descending order by log probability of each successor
-
->>> ng.succ(10, "stjórnarskrá", "lýðveldisins")
-[('Íslands', -1.3708244393477589), ('.', -2.2427905461504567),
-    (',', -3.313814878299737), ('og', -3.4920631097060557), ('sem', -4.566577846795106),
-    ('er', -4.720728526622363), ('að', -4.807739903611993), ('um', -5.0084105990741445),
-    ('en', -5.0084105990741445), ('á', -5.25972502735505)]
-
-## Reference
-
-### Initializing Icegrams
-
-After installing the ``icegrams`` package, use the following code to
-import it and initialize an instance of the ``Ngrams`` class::
-
-    from icegrams import Ngrams
-    ng = Ngrams()
-
-Now you can use the ``ng`` instance to query for unigram, bigram
-and trigram frequencies and probabilities.
-
-### The Ngrams class
-
-* ``__init__(self)``
-
-  Initializes the ``Ngrams`` instance.
-
-* ``freq(self, *args) -> int``
-
-  Returns the frequency of a unigram, bigram or trigram.
-
-  * ``str[] *args`` A parameter sequence of consecutive unigrams
-    to query the frequency for.
-  * **returns** An integer with the frequency of the unigram,
-    bigram or trigram.
-
-  To query for the frequency of a unigram in the text, call
-  ``ng.freq("unigram1")``. This returns the number of times that
-  the unigram appears in the database. The unigram is
-  queried as-is, i.e. with no string stripping or lowercasing.
-
-  To query for the frequency of a bigram in the text, call
-  ``ng.freq("unigram1", "unigram2")``.
-
-  To query for the frequency of a trigram in the text, call
-  ``ng.freq("unigram1", "unigram2", "unigram3")``.
-
-  If you pass more than 3 arguments to ``ng.freq()``, only the
-  last 3 are significant, and the query will be treated
-  as a trigram query.
-
-  Examples::
-
-    >>>> ng.freq("stjórnarskrá")
-    2973
-    >>>> ng.freq("stjórnarskrá", "lýðveldisins")
-    39
-    >>>> ng.freq("stjórnarskrá", "lýðveldisins", "Íslands")
-    12
-    >>>> ng.freq("xxx", "yyy", "zzz")
-    0
-
-* ``adj_freq(self, *args) -> int``
-
-  Returns the adjusted frequency of a unigram, bigram or trigram.
-
-  * ``str[] *args`` A parameter sequence of consecutive unigrams
-    to query the frequency for.
-  * **returns** An integer with the adjusted frequency of the unigram,
-    bigram or trigram. The adjusted frequency is the actual
-    frequency plus 1. The method thus never returns 0.
-
-  To query for the frequency of a unigram in the text, call
-  ``ng.adj_freq("unigram1")``. This returns the number of times that
-  the unigram appears in the database, plus 1. The unigram is
-  queried as-is, i.e. with no string stripping or lowercasing.
-
-  To query for the frequency of a bigram in the text, call
-  ``ng.adj_freq("unigram1", "unigram2")``.
-
-  To query for the frequency of a trigram in the text, call
-  ``ng.adj_freq("unigram1", "unigram2", "unigram3")``.
-
-  If you pass more than 3 arguments to ``ng.adj_freq()``, only the
-  last 3 are significant, and the query will be treated
-  as a trigram query.
-
-  Examples::
-
-    >>>> ng.adj_freq("stjórnarskrá")
-    2974
-    >>>> ng.adj_freq("stjórnarskrá", "lýðveldisins")
-    40
-    >>>> ng.adj_freq("stjórnarskrá", "lýðveldisins", "Íslands")
-    13
-    >>>> ng.adj_freq("xxx", "yyy", "zzz")
-    1
-
-* ``prob(self, *args) -> float``
-
-  Returns the probability of a unigram, bigram or trigram.
-
-  * ``str[] *args`` A parameter sequence of consecutive unigrams
-    to query the probability for.
-  * **returns** A float with the probability of the given unigram,
-    bigram or trigram.
-
-  The probability of a *unigram* is
-  the frequency of the unigram divided by the sum of the
-  frequencies of all unigrams in the database.
-
-  The probability of a *bigram* ``(u1, u2)`` is the frequency
-  of the bigram divided by the frequency of the unigram ``u1``,
-  i.e. how likely ``u2`` is to succeed ``u1``.
-
-  The probability of a trigram ``(u1, u2, u3)`` is the frequency
-  of the trigram divided by the frequency of the bigram ``(u1, u2)``,
-  i.e. how likely ``u3`` is to succeed ``u1 u2``.
-
-  If you pass more than 3 arguments to ``ng.prob()``, only the
-  last 3 are significant, and the query will be treated
-  as a trigram probability query.
-
-  Examples::
-
-    >>>> ng.prob("stjórnarskrá")
-    2.8168929772755334e-05
-    >>>> ng.prob("stjórnarskrá", "lýðveldisins")
-    0.01344989912575655
-    >>>> ng.prob("stjórnarskrá", "lýðveldisins", "Íslands")
-    0.325
-
-* ``logprob(self, *args) -> float``
-
-  Returns the log probability of a unigram, bigram or trigram.
-
-  * ``str[] *args`` A parameter sequence of consecutive unigrams
-    to query the log probability for.
-  * **returns** A float with the natural logarithm (base *e*) of the
-    probability of the given unigram, bigram or trigram.
-
-  The probability of a *unigram* is
-  the adjusted frequency of the unigram divided by the sum of the
-  frequencies of all unigrams in the database.
-
-  The probability of a *bigram* ``(u1, u2)`` is the adjusted frequency
-  of the bigram divided by the adjusted frequency of the unigram ``u1``,
-  i.e. how likely ``u2`` is to succeed ``u1``.
-
-  The probability of a trigram ``(u1, u2, u3)`` is the adjusted frequency
-  of the trigram divided by the adjusted frequency of the bigram ``(u1, u2)``,
-  i.e. how likely ``u3`` is to succeed ``u1 u2``.
-
-  If you pass more than 3 arguments to ``ng.logprob()``, only the
-  last 3 are significant, and the query will be treated
-  as a trigram probability query.
-
-  Examples::
-
-    >>>> ng.logprob("stjórnarskrá")
-    -10.477290968535172
-    >>>> ng.logprob("stjórnarskrá", "lýðveldisins")
-    -4.308783672906165
-    >>>> ng.logprob("stjórnarskrá", "lýðveldisins", "Íslands")
-    -1.1239300966523995
-
-* ``succ(self, n, *args) -> list[tuple]``
-
-  Returns the *N* most probable successors of a unigram or bigram.
-
-  * ``int n`` A positive integer specifying how many successors,
-    at a maximum, should be returned.
-  * ``str[] *args`` One or two string parameters containing the
-    unigram or bigram to query the successors for.
-  * **returns** A list of tuples of (successor unigram, log probability),
-    in descending order of probability.
-
-  If you pass more than 2 string arguments to ``ng.succ()``, only the
-  last 2 are significant, and the query will be treated
-  as a bigram successor query.
-
-  Examples::
-
-    >>>> ng.succ(2, "stjórnarskrá")
-    [('.', -1.8259625296091855), ('landsins', -2.223111581475692)]
-    >>>> ng.succ(2, "stjórnarskrá", "lýðveldisins")
-    [('Íslands', -1.1239300966523995), ('og', -1.3862943611198904)]
-
-    >>>> # The following is equivalent to ng.succ(2, "lýðveldisins", "Íslands")
-
-    >>>> ng.succ(2, "stjórnarskrá", "lýðveldisins", "Íslands")
-    [('.', -1.3862943611198908), (',', -1.6545583477145702)]
-
-## Notes
-
-Icegrams is built with a sliding window over the source text. This means that
-a sentence such as ``"Maðurinn borðaði ísinn."`` results in the following
-trigrams being added to the database::
-
-   ("", "", "Maðurinn")
-   ("", "Maðurinn", "borðaði")
-   ("Maðurinn", "borðaði", "ísinn")
-   ("borðaði", "ísinn", ".")
-   ("ísinn", ".", "")
-   (".", "", "")
-
-The same sliding window strategy is applied for bigrams, so the following
-bigrams would be recorded for the same sentence::
-
-   ("", "Maðurinn")
-   ("Maðurinn", "borðaði")
-   ("borðaði", "ísinn")
-   ("ísinn", ".")
-   (".", "")
-
-You can thus obtain the N unigrams that most often start
-a sentence by asking for ``ng.succ(N, "")``.
-
-And, of course, four unigrams are also added, one for each token in the
-sentence.
-
-The tokenization of the source text into unigrams is done with the
-`Tokenizer package <https://pypi.org/project/tokenizer>`_and
-uses the rules documented there. Importantly, tokens other than words,
-abbreviations, entity names, person names and punctuation are
-**replaced by placeholders**. This means that all numbers are represented by the token
-``[NUMBER]``, amounts by ``[AMOUNT]``, dates by ``[DATEABS]`` and ``[DATEREL]``,
-e-mail addresses by ``[EMAIL]``, etc. For the complete mapping of token types
-to placeholder strings, see the
-`documentation for the Tokenizer package <https://github.com/mideind/Tokenizer/blob/master/README.rst>`_.
-
-## Prerequisites
-
-This package runs on CPython 3.6 or newer, and on PyPy 3.6 or newer. It
-has been tested on Linux (gcc on x86-64 and ARMhf), MacOS (clang) and
-Windows (MSVC).
-
-If a binary wheel package isn't available on `PyPI <https://pypi.org>`_
-for your system, you may need to have the ``python3-dev`` package
-(or its Windows equivalent) installed on your system to set up
-Icegrams successfully. This is because a source distribution
-install requires a C++ compiler and linker::
-
-    # Debian or Ubuntu:
-    sudo apt-get install python3-dev
-
-## Installation
-
-To install this package::
-
-    pip install icegrams
-
-If you want to be able to edit the source, do like so (assuming you have **git** installed)::
-
-    git clone https://github.com/mideind/Icegrams
-    cd Icegrams
-    # [ Activate your virtualenv here if you have one ]
-    python setup.py develop
-
-The package source code is now in ``./src/icegrams``.
-
-## Tests
-
-To run the built-in tests, install `pytest <https://docs.pytest.org/en/latest/>`_,
-``cd`` to your ``Icegrams`` subdirectory (and optionally activate your
-virtualenv), then run::
-
-    python -m pytest
-
-## Version History
-
-* Version 1.1.2: Minor bug fixes. Cross-platform wheels provided. Now requires Python 3.7+. (2022-12-14)
-* Version 1.1.0: Python 3.5 support dropped; macOS builds fixed; PyPy wheels
-  generated
-* Version 1.0.0: New trigram database sourced from the Icelandic Gigaword Corpus
-  (Risamálheild) with improved tokenization. Replaced GNU GPLv3 with MIT license.
-* Version 0.6.0: Python type annotations added
-* Version 0.5.0: Trigrams corpus has been spell-checked
-
-## License
-
-Icegrams is Copyright © 2022 [Miðeind ehf.](https://mideind.is)  
-The original author of this software is *Vilhjálmur Þorsteinsson*.
-
-This software is licensed under the **MIT License**:
-
-Permission is hereby granted, free of charge, to any person
-obtaining a copy of this software and associated documentation
-files (the "Software"), to deal in the Software without restriction,
-including without limitation the rights to use, copy, modify, merge,
-publish, distribute, sublicense, and/or sell copies of the Software,
-and to permit persons to whom the Software is furnished to do so,
-subject to the following conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/test.py b/old/test.py
similarity index 100%
rename from test.py
rename to old/test.py

From 4120b0e4d2440e6eeee0b668ecb8512d1b602207 Mon Sep 17 00:00:00 2001
From: Sveinbjorn Thordarson <sveinbjorn@sveinbjorn.org>
Date: Tue, 27 Aug 2024 14:00:32 +0000
Subject: [PATCH 4/7] Updated setup.py + CI config and docs

---
 .github/workflows/python-package.yml |  2 +-
 .github/workflows/wheels.yml         |  4 +-
 README.rst                           |  9 ++--
 setup.py                             | 61 ++++------------------------
 4 files changed, 15 insertions(+), 61 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 0058679..8f97497 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -24,7 +24,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip wheel setuptools pytest
+        python -m pip install --upgrade pip wheel setuptools pytest packaging
         python -m pip install -e .
     - name: Test with pytest
       run: |
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 79cf27f..dd5c71b 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -25,14 +25,14 @@ jobs:
           python-version: '3.10'
 
       - name: Install cibuildwheel
-        run: python -m pip install --upgrade pip wheel setuptools cibuildwheel
+        run: python -m pip install --upgrade pip wheel setuptools packaging cibuildwheel
 
       - name: Build wheels
         run: python -m cibuildwheel --output-dir wheelhouse
         # Options (https://cibuildwheel.readthedocs.io/en/stable/options/)
         env:
           CIBW_SKIP: cp36-* cp37-* cp38-* *pp37-* pp38-* *musllinux*
-          CIBW_BEFORE_BUILD_MACOS: python3 -m pip install --upgrade setuptools wheel cffi
+          CIBW_BEFORE_BUILD_MACOS: python3 -m pip install --upgrade setuptools wheel cffi packaging
           CIBW_ARCHS_MACOS: "x86_64 arm64"
           CIBW_ARCHS_WINDOWS: "AMD64"
           CIBW_ARCHS_LINUX: "x86_64"
diff --git a/README.rst b/README.rst
index 069eafd..0cd1bcb 100644
--- a/README.rst
+++ b/README.rst
@@ -9,7 +9,7 @@ Icegrams: A fast, compact trigram library for Icelandic
 Overview
 ********
 
-**Icegrams** is an MIT-licensed Python 3 (>= 3.7) package that encapsulates a
+**Icegrams** is an MIT-licensed Python 3 (>=3.9) package that encapsulates a
 **large trigram library for Icelandic**. (A trigram is a tuple of
 three consecutive words or tokens that appear in real-world text.)
 
@@ -319,8 +319,8 @@ to placeholder strings, see the
 Prerequisites
 *************
 
-This package runs on CPython 3.6 or newer, and on PyPy 3.6 or newer. It
-has been tested on Linux (gcc on x86-64 and ARMhf), MacOS (clang) and
+This package runs on CPython 3.9 or newer, and on PyPy 3.9 or newer. It
+has been tested on Linux (gcc on x86-64 and ARMhf), macOS (clang) and
 Windows (MSVC).
 
 If a binary wheel package isn't available on `PyPI <https://pypi.org>`_
@@ -363,6 +363,7 @@ virtualenv), then run::
 Changelog
 *********
 
+* Version 1.1.3: Minor tweaks. Support for Python 3.13. Now requires Python 3.9+. (2024-08-27)
 * Version 1.1.2: Minor bug fixes. Cross-platform wheels provided. Now requires Python 3.7+. (2022-12-14)
 * Version 1.1.0: Python 3.5 support dropped; macOS builds fixed; PyPy wheels
   generated
@@ -375,7 +376,7 @@ Changelog
 Copyright and licensing
 ***********************
 
-Icegrams is Copyright © 2022 `Miðeind ehf. <https://mideind.is>`__.
+Icegrams is Copyright © 2020-2024 `Miðeind ehf. <https://mideind.is>`__.
 The original author of this software is *Vilhjálmur Þorsteinsson*.
 
 This software is licensed under the **MIT License**:
diff --git a/setup.py b/setup.py
index f9f4982..a2a80e1 100644
--- a/setup.py
+++ b/setup.py
@@ -1,59 +1,13 @@
 #!/usr/bin/env python3
-"""
 
-    Icegrams: A trigrams library for Icelandic
-
-    setup.py
-
-    Copyright (C) 2020 Miðeind ehf.
-    Author: Vilhjálmur Þorsteinsson
-
-    This software is licensed under the MIT License:
-
-        Permission is hereby granted, free of charge, to any person
-        obtaining a copy of this software and associated documentation
-        files (the "Software"), to deal in the Software without restriction,
-        including without limitation the rights to use, copy, modify, merge,
-        publish, distribute, sublicense, and/or sell copies of the Software,
-        and to permit persons to whom the Software is furnished to do so,
-        subject to the following conditions:
-
-        The above copyright notice and this permission notice shall be
-        included in all copies or substantial portions of the Software.
-
-        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-        EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-        MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-        IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-        CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-        TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-        SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
-    This module sets up the icegrams package. It uses the cffi_modules
-    parameter, available in recent versions of setuptools, to
-    automatically compile the trie.cpp module to trie.*.so/.pyd
-    and build the required CFFI Python wrapper via trie_build.py.
-
-    Note that installing under PyPy >= 3.7 is supported.
-
-"""
-
-import io
 import re
-import sys
-
+import io
 from glob import glob
-from os.path import basename, dirname, join, splitext
+from os.path import basename, splitext, dirname, join
 
 from setuptools import find_packages, setup  # type: ignore
 
 
-if sys.version_info < (3, 7):
-    print("Icegrams requires Python >= 3.7")
-    sys.exit(1)
-
-
 def read(*names, **kwargs):
     try:
         return io.open(
@@ -65,7 +19,7 @@ def read(*names, **kwargs):
 
 setup(
     name="icegrams",
-    version="1.1.2",  # Also update in src/icegrams/__init__.py
+    version="1.1.3",
     license="MIT",
     description="Trigram statistics for Icelandic",
     long_description="{0}\n{1}".format(
@@ -84,23 +38,22 @@ def read(*names, **kwargs):
     include_package_data=True,
     zip_safe=False,
     classifiers=[
-        # complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers
         "Development Status :: 5 - Production/Stable",
         "Intended Audience :: Developers",
         "Intended Audience :: Science/Research",
         "License :: OSI Approved :: MIT License",
-        "Operating System :: Unix",
         "Operating System :: POSIX",
-        "Operating System :: Microsoft :: Windows",
+        "Operating System :: Unix",
         "Operating System :: MacOS",
+        "Operating System :: Microsoft :: Windows",
         "Natural Language :: Icelandic",
         "Programming Language :: Python",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.7",
-        "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
+        "Programming Language :: Python :: 3.13",
         "Programming Language :: Python :: Implementation :: CPython",
         "Programming Language :: Python :: Implementation :: PyPy",
         "Topic :: Software Development :: Libraries :: Python Modules",

From 83bba3d8baafd266dfcbdbeba1ac12d17d630f79 Mon Sep 17 00:00:00 2001
From: Sveinbjorn Thordarson <sveinbjorn@sveinbjorn.org>
Date: Wed, 28 Aug 2024 14:50:08 +0000
Subject: [PATCH 5/7] Re-applied migration to importlib_resources

---
 src/icegrams/ngrams.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/icegrams/ngrams.py b/src/icegrams/ngrams.py
index ab5e081..29303d8 100644
--- a/src/icegrams/ngrams.py
+++ b/src/icegrams/ngrams.py
@@ -155,12 +155,10 @@
 
     # Make sure that the trigrams.bin file is
     # unpacked and ready for use
-    import pkg_resources
+    import importlib.resources as importlib_resources
 
-    # Note: the resource path below should NOT use os.path.join()
-    BINARY_FILENAME = pkg_resources.resource_filename(  # type: ignore
-        __name__, "resources/trigrams.bin"
-    )
+    ref = importlib_resources.files("icegrams").joinpath("resources/trigrams.bin")
+    BINARY_FILENAME = str(ref)
 
 ffi: Any = cast(Any, ffi)  # type: ignore
 trie_cffi: Any = cast(Any, trie_cffi)  # type: ignore

From b163751c47867ce0ab4b7c8944f267ae9d624b7e Mon Sep 17 00:00:00 2001
From: Sveinbjorn Thordarson <sveinbjorn@sveinbjorn.org>
Date: Wed, 28 Aug 2024 15:04:39 +0000
Subject: [PATCH 6/7] Migrated package metadata over to pyproject.toml

---
 pyproject.toml | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++
 setup.py       | 54 +---------------------------------------
 2 files changed, 68 insertions(+), 53 deletions(-)
 create mode 100644 pyproject.toml
 mode change 100644 => 100755 setup.py

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..67595da
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,67 @@
+[project]
+name = "icegrams"
+version = "1.1.3"
+description = "Trigram statistics for Icelandic"
+authors = [{ name = "Miðeind ehf.", email = "mideind@mideind.is" }]
+maintainers = [{ name = "Miðeind ehf.", email = "mideind@mideind.is" }]
+readme = { file = "README.rst", content-type = "text/x-rst" }
+license = { text = "MIT" }
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: POSIX",
+    "Operating System :: Unix",
+    "Operating System :: MacOS",
+    "Operating System :: Microsoft :: Windows",
+    "Natural Language :: Icelandic",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: Implementation :: CPython",
+    "Programming Language :: Python :: Implementation :: PyPy",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+    "Topic :: Utilities",
+    "Topic :: Text Processing :: Linguistic",
+]
+keywords = ["nlp", "trigram", "ngram", "trigrams", "ngrams", "icelandic"]
+requires-python = ">=3.9"
+dependencies = ["cffi>=1.15.1"]
+
+[project.urls]
+Repository = "https://github.com/mideind/Icegrams"
+
+[project.optional-dependencies]
+# dev dependencies
+dev = ["pytest"]
+
+# *** Configuration of tools ***
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools.package-data]
+where = ["src"]
+
+[tool.pytest.ini_options]
+filterwarnings = [
+    # Ignore deprecation warnings in libraries, their problem not ours
+    # "ignore::DeprecationWarning",
+]
+
+[tool.ruff]
+line-length = 88
+
+[tool.black]
+line-length = 88
+
+[tool.isort]
+# This forces these imports to be placed at the top
+known_future_library = ["__future__", "typing", "typing_extensions"]
+profile = "black"
+line_length = 88
diff --git a/setup.py b/setup.py
old mode 100644
new mode 100755
index 5b1d2ac..198a7b4
--- a/setup.py
+++ b/setup.py
@@ -1,69 +1,17 @@
 #!/usr/bin/env python3
 
-
-from typing import Any
-
-import re
-import io
 from glob import glob
-from os.path import basename, splitext, dirname, join
+from os.path import basename, splitext
 
 from setuptools import find_packages, setup
 
-
-def read(*names, **kwargs):
-    try:
-        return io.open(
-            join(dirname(__file__), *names), encoding=kwargs.get("encoding", "utf8")
-        ).read()
-    except (IOError, OSError):
-        return ""
-
-
 setup(
-    name="icegrams",
-    version="1.1.3",
-    license="MIT",
-    description="Trigram statistics for Icelandic",
-    long_description="{0}\n{1}".format(
-        re.compile("^.. start-badges.*^.. end-badges", re.M | re.S).sub(
-            "", read("README.rst")
-        ),
-        re.sub(":[a-z]+:`~?(.*?)`", r"``\1``", read("CHANGELOG.rst")),
-    ),
-    author="Miðeind ehf",
-    author_email="mideind@mideind.is",
-    url="https://github.com/mideind/Icegrams",
     packages=find_packages("src"),
     package_dir={"": "src"},
     py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")],
     package_data={"icegrams": ["py.typed"]},
     include_package_data=True,
     zip_safe=False,
-    classifiers=[
-        "Development Status :: 5 - Production/Stable",
-        "Intended Audience :: Developers",
-        "Intended Audience :: Science/Research",
-        "License :: OSI Approved :: MIT License",
-        "Operating System :: POSIX",
-        "Operating System :: Unix",
-        "Operating System :: MacOS",
-        "Operating System :: Microsoft :: Windows",
-        "Natural Language :: Icelandic",
-        "Programming Language :: Python",
-        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.9",
-        "Programming Language :: Python :: 3.10",
-        "Programming Language :: Python :: 3.11",
-        "Programming Language :: Python :: 3.12",
-        "Programming Language :: Python :: 3.13",
-        "Programming Language :: Python :: Implementation :: CPython",
-        "Programming Language :: Python :: Implementation :: PyPy",
-        "Topic :: Software Development :: Libraries :: Python Modules",
-        "Topic :: Utilities",
-        "Topic :: Text Processing :: Linguistic",
-    ],
-    keywords=["nlp", "trigram", "ngram", "trigrams", "ngrams", "icelandic"],
     setup_requires=["cffi>=1.15.1"],
     install_requires=["cffi>=1.15.1"],
     cffi_modules=["src/icegrams/trie_build.py:ffibuilder"],

From 182768c7adab1c0f2a5e44d4c133d6674f2abec1 Mon Sep 17 00:00:00 2001
From: Sveinbjorn Thordarson <sveinbjorn@sveinbjorn.org>
Date: Wed, 28 Aug 2024 16:42:30 +0000
Subject: [PATCH 7/7] Fixes as per code review

---
 src/icegrams/ngrams.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/icegrams/ngrams.py b/src/icegrams/ngrams.py
index 29303d8..6f8614a 100644
--- a/src/icegrams/ngrams.py
+++ b/src/icegrams/ngrams.py
@@ -143,7 +143,7 @@
 
 # Import the CFFI wrapper for the trie.cpp C++ module
 # (see also trie.py and build_trie.py)
-elif __name__ == "__main__":
+if __name__ == "__main__":
     # Running as a main program
     from _trie import lib as trie_cffi, ffi  # type: ignore  # pylint: disable=import-error
     from trie import Trie
@@ -157,7 +157,7 @@
     # unpacked and ready for use
     import importlib.resources as importlib_resources
 
-    ref = importlib_resources.files("icegrams").joinpath("resources/trigrams.bin")
+    ref = importlib_resources.files("icegrams").joinpath("resources", "trigrams.bin")
     BINARY_FILENAME = str(ref)
 
 ffi: Any = cast(Any, ffi)  # type: ignore
@@ -865,8 +865,8 @@ def bigram_frequency(self, i0: Optional[int], i1: Optional[int]) -> int:
         if i0 is None or i1 is None:
             return 0
         # Check degenerate case
-        # if not (i0 or i1):
-        #    return 0
+        if not (i0 or i1):
+            return 0
         assert self._unigram_ptrs_ml is not None
         p1, p2 = self._unigram_ptrs_ml.lookup_pair(i0)
         # Then, look for id i1 within the level 2 ids delimited by [p1, p2>