From c24ed35aef294ff831c62c7c2232fe499bf68e6b Mon Sep 17 00:00:00 2001
From: John Franey <1728528+johnfraney@users.noreply.github.com>
Date: Mon, 20 Jan 2025 10:21:09 -0400
Subject: [PATCH] typing: add initial types

---
 pyproject.toml                   |   6 +-
 src/textblob/_text.py            |   5 +-
 src/textblob/base.py             |  30 +++++----
 src/textblob/blob.py             |   6 +-
 src/textblob/decorators.py       |  13 +++-
 src/textblob/en/inflect.py       | 103 +++++++++++++++++--------------
 src/textblob/en/np_extractors.py |  18 +++---
 src/textblob/formats.py          |   7 ++-
 src/textblob/mixins.py           |  10 ++-
 src/textblob/utils.py            |  24 +++++--
 10 files changed, 139 insertions(+), 83 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2f1712c5..e52d64bd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ Source = "https://github.com/sloria/TextBlob"
 [project.optional-dependencies]
 docs = ["sphinx==8.1.3", "sphinx-issues==5.0.0", "PyYAML==6.0.2"]
 tests = ["pytest", "numpy"]
-dev = ["textblob[tests]", "tox", "pre-commit~=3.5"]
+dev = ["textblob[tests]", "tox", "pre-commit~=3.5", "pyright", "ruff"]
 
 [build-system]
 requires = ["flit_core<4"]
@@ -96,3 +96,7 @@ markers = [
   "slow: marks tests as slow (deselect with '-m \"not slow\"')",
   "numpy: marks tests that require numpy",
 ]
+
+[tool.pyright]
+include = ["src/**"]
+exclude = ["tests/**"]
diff --git a/src/textblob/_text.py b/src/textblob/_text.py
index 62cc0d55..606346e3 100644
--- a/src/textblob/_text.py
+++ b/src/textblob/_text.py
@@ -1570,9 +1570,9 @@ def parse(
 
 TOKENS = "tokens"
 
 
 class TaggedString(str):
-    def __new__(self, string, tags=None, language=None):
+    def __new__(cls, string, tags=None, language=None):
         """Unicode string with tags and language attributes.
         For example: TaggedString("cat/NN/NP", tags=["word", "pos", "chunk"]).
         """
@@ -1588,7 +1588,7 @@ def __new__(self, string, tags=None, language=None):
                 for s in string
             ]
             string = "\n".join(" ".join("/".join(token) for token in s) for s in string)
-        s = str.__new__(self, string)
+        s = str.__new__(cls, string)
         s.tags = list(tags)
         s.language = language
         return s
diff --git a/src/textblob/base.py b/src/textblob/base.py
index 2c726073..dd20d517 100644
--- a/src/textblob/base.py
+++ b/src/textblob/base.py
@@ -5,10 +5,16 @@
     All base classes are defined in the same module, ``textblob.base``.
 """
 
+from __future__ import annotations
+
 from abc import ABCMeta, abstractmethod
+from typing import TYPE_CHECKING
 
 import nltk
 
+if TYPE_CHECKING:
+    from typing import Any, AnyStr
+
 ##### POS TAGGERS #####
 
 
@@ -19,11 +25,11 @@ class BaseTagger(metaclass=ABCMeta):
     """
 
     @abstractmethod
-    def tag(self, text, tokenize=True):
+    def tag(self, text: str, tokenize=True) -> list[tuple[str, str]]:
         """Return a list of tuples of the form (word, tag)
         for a given set of text or BaseBlob instance.
         """
-        return
+        raise NotImplementedError("Subclass must implement a tag method")
 
 
 ##### NOUN PHRASE EXTRACTORS #####
@@ -36,29 +42,29 @@ class BaseNPExtractor(metaclass=ABCMeta):
     """
 
     @abstractmethod
-    def extract(self, text):
+    def extract(self, text: str) -> list[str]:
         """Return a list of noun phrases (strings) for a body of text."""
-        return
+        raise NotImplementedError("Subclass must implement an extract method")
 
 
 ##### TOKENIZERS #####
 
 
-class BaseTokenizer(nltk.tokenize.api.TokenizerI, metaclass=ABCMeta):
+class BaseTokenizer(nltk.tokenize.api.TokenizerI, metaclass=ABCMeta):  # pyright: ignore
     """Abstract base class from which all Tokenizer classes inherit.
     Descendant classes must implement a ``tokenize(text)`` method
     that returns a list of noun phrases as strings.
     """
 
     @abstractmethod
-    def tokenize(self, text):
+    def tokenize(self, text: str) -> list[str]:
         """Return a list of tokens (strings) for a body of text.
 
         :rtype: list
         """
-        return
+        raise NotImplementedError("Subclass must implement a tokenize method")
 
-    def itokenize(self, text, *args, **kwargs):
+    def itokenize(self, text: str, *args, **kwargs):
         """Return a generator that generates tokens "on-demand".
 
         .. versionadded:: 0.6.0
@@ -81,6 +87,8 @@ class BaseSentimentAnalyzer(metaclass=ABCMeta):
     results of analysis.
     """
 
+    _trained: bool
+
     kind = DISCRETE
 
     def __init__(self):
@@ -91,7 +99,7 @@ def train(self):
         self._trained = True
 
     @abstractmethod
-    def analyze(self, text):
+    def analyze(self, text) -> Any:
         """Return the result of of analysis. Typically returns either a
         tuple, float, or dictionary.
         """
@@ -111,6 +119,6 @@ class BaseParser(metaclass=ABCMeta):
     """
 
     @abstractmethod
-    def parse(self, text):
+    def parse(self, text: AnyStr):
         """Parses the text."""
-        return
+        raise NotImplementedError("Subclass must implement a parse method")
diff --git a/src/textblob/blob.py b/src/textblob/blob.py
index d26e2f0b..eeaa28d0 100644
--- a/src/textblob/blob.py
+++ b/src/textblob/blob.py
@@ -138,9 +138,9 @@ def lemmatize(self, pos=None):
         lemmatizer = nltk.stem.WordNetLemmatizer()
         return lemmatizer.lemmatize(self.string, tag)
 
-    PorterStemmer = nltk.stem.porter.PorterStemmer()
-    LancasterStemmer = nltk.stem.lancaster.LancasterStemmer()
-    SnowballStemmer = nltk.stem.snowball.SnowballStemmer("english")
+    PorterStemmer = nltk.stem.PorterStemmer()
+    LancasterStemmer = nltk.stem.LancasterStemmer()
+    SnowballStemmer = nltk.stem.SnowballStemmer("english")
 
     # added 'stemmer' on lines of lemmatizer
     # based on nltk
diff --git a/src/textblob/decorators.py b/src/textblob/decorators.py
index 9b91ce87..ef5ace04 100644
--- a/src/textblob/decorators.py
+++ b/src/textblob/decorators.py
@@ -1,9 +1,18 @@
 """Custom decorators."""
 
+from __future__ import annotations
+
 from functools import wraps
+from typing import TYPE_CHECKING
 
 from textblob.exceptions import MissingCorpusError
 
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from typing import TypeVar
+
+    ReturnType = TypeVar("ReturnType")
+
 
 class cached_property:
     """A property that is only computed once per instance and then replaces
@@ -24,7 +33,9 @@ def __get__(self, obj, cls):
         return value
 
 
-def requires_nltk_corpus(func):
+def requires_nltk_corpus(
+    func: Callable[..., ReturnType],
+) -> Callable[..., ReturnType]:
     """Wraps a function that requires an NLTK corpus. If the corpus isn't found,
     raise a :exc:`MissingCorpusError`.
     """
diff --git a/src/textblob/en/inflect.py b/src/textblob/en/inflect.py
index 3d4ba244..5c6b13df 100644
--- a/src/textblob/en/inflect.py
+++ b/src/textblob/en/inflect.py
@@ -4,7 +4,15 @@
 See here https://github.com/clips/pattern/blob/master/LICENSE.txt for
 complete license information.
 """
+
+from __future__ import annotations
+
 import re
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+    from typing import AnyStr
 
 VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB"
 
@@ -523,7 +531,7 @@
 }
 
 
-def pluralize(word, pos=NOUN, custom=None, classical=True):
+def pluralize(word: str, pos=NOUN, custom=None, classical=True) -> str:
     """Returns the plural of a given word.
     For example: child -> children.
     Handles nouns and adjectives, using classical inflection by default
@@ -584,6 +592,7 @@ def pluralize(word, pos=NOUN, custom=None, classical=True):
                 ):
                     if suffix.search(word) is not None:
                         return suffix.sub(inflection, word)
+    return word
 
 
 #### SINGULARIZE ###################################################################################
@@ -607,55 +616,57 @@ def pluralize(word, pos=NOUN, custom=None, classical=True):
 # THIS SOFTWARE.
 
 singular_rules = [
-    ["(?i)(.)ae$", "\\1a"],
-    ["(?i)(.)itis$", "\\1itis"],
-    ["(?i)(.)eaux$", "\\1eau"],
-    ["(?i)(quiz)zes$", "\\1"],
-    ["(?i)(matr)ices$", "\\1ix"],
-    ["(?i)(ap|vert|ind)ices$", "\\1ex"],
-    ["(?i)^(ox)en", "\\1"],
-    ["(?i)(alias|status)es$", "\\1"],
-    ["(?i)([octop|vir])i$", "\\1us"],
-    ["(?i)(cris|ax|test)es$", "\\1is"],
-    ["(?i)(shoe)s$", "\\1"],
-    ["(?i)(o)es$", "\\1"],
-    ["(?i)(bus)es$", "\\1"],
-    ["(?i)([m|l])ice$", "\\1ouse"],
-    ["(?i)(x|ch|ss|sh)es$", "\\1"],
-    ["(?i)(m)ovies$", "\\1ovie"],
-    ["(?i)(.)ombies$", "\\1ombie"],
-    ["(?i)(s)eries$", "\\1eries"],
-    ["(?i)([^aeiouy]|qu)ies$", "\\1y"],
+    (re.compile("(?i)(.)ae$"), "\\1a"),
+    (re.compile("(?i)(.)itis$"), "\\1itis"),
+    (re.compile("(?i)(.)eaux$"), "\\1eau"),
+    (re.compile("(?i)(quiz)zes$"), "\\1"),
+    (re.compile("(?i)(matr)ices$"), "\\1ix"),
+    (re.compile("(?i)(ap|vert|ind)ices$"), "\\1ex"),
+    (re.compile("(?i)^(ox)en"), "\\1"),
+    (re.compile("(?i)(alias|status)es$"), "\\1"),
+    (re.compile("(?i)([octop|vir])i$"), "\\1us"),
+    (re.compile("(?i)(cris|ax|test)es$"), "\\1is"),
+    (re.compile("(?i)(shoe)s$"), "\\1"),
+    (re.compile("(?i)(o)es$"), "\\1"),
+    (re.compile("(?i)(bus)es$"), "\\1"),
+    (re.compile("(?i)([m|l])ice$"), "\\1ouse"),
+    (re.compile("(?i)(x|ch|ss|sh)es$"), "\\1"),
+    (re.compile("(?i)(m)ovies$"), "\\1ovie"),
+    (re.compile("(?i)(.)ombies$"), "\\1ombie"),
+    (re.compile("(?i)(s)eries$"), "\\1eries"),
+    (re.compile("(?i)([^aeiouy]|qu)ies$"), "\\1y"),
     # Certain words ending in -f or -fe take -ves in the plural (lives, wolves).
-    ["([aeo]l)ves$", "\\1f"],
-    ["([^d]ea)ves$", "\\1f"],
-    ["arves$", "arf"],
-    ["erves$", "erve"],
-    ["([nlw]i)ves$", "\\1fe"],
-    ["(?i)([lr])ves$", "\\1f"],
-    ["([aeo])ves$", "\\1ve"],
-    ["(?i)(sive)s$", "\\1"],
-    ["(?i)(tive)s$", "\\1"],
-    ["(?i)(hive)s$", "\\1"],
-    ["(?i)([^f])ves$", "\\1fe"],
+    (re.compile("([aeo]l)ves$"), "\\1f"),
+    (re.compile("([^d]ea)ves$"), "\\1f"),
+    (re.compile("arves$"), "arf"),
+    (re.compile("erves$"), "erve"),
+    (re.compile("([nlw]i)ves$"), "\\1fe"),
+    (re.compile("(?i)([lr])ves$"), "\\1f"),
+    (re.compile("([aeo])ves$"), "\\1ve"),
+    (re.compile("(?i)(sive)s$"), "\\1"),
+    (re.compile("(?i)(tive)s$"), "\\1"),
+    (re.compile("(?i)(hive)s$"), "\\1"),
+    (re.compile("(?i)([^f])ves$"), "\\1fe"),
     # -es suffix.
-    ["(?i)(^analy)ses$", "\\1sis"],
-    ["(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "\\1\\2sis"],
-    ["(?i)(.)opses$", "\\1opsis"],
-    ["(?i)(.)yses$", "\\1ysis"],
-    ["(?i)(h|d|r|o|n|b|cl|p)oses$", "\\1ose"],
-    ["(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$", "\\1ose"],
-    ["(?i)(.)oses$", "\\1osis"],
+    (re.compile("(?i)(^analy)ses$"), "\\1sis"),
+    (
+        re.compile("(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$"),
+        "\\1\\2sis",
+    ),
+    (re.compile("(?i)(.)opses$"), "\\1opsis"),
+    (re.compile("(?i)(.)yses$"), "\\1ysis"),
+    (re.compile("(?i)(h|d|r|o|n|b|cl|p)oses$"), "\\1ose"),
+    (
+        re.compile("(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$"),
+        "\\1ose",
+    ),
+    (re.compile("(?i)(.)oses$"), "\\1osis"),
     # -a
-    ["(?i)([ti])a$", "\\1um"],
-    ["(?i)(n)ews$", "\\1ews"],
-    ["(?i)s$", ""],
+    (re.compile("(?i)([ti])a$"), "\\1um"),
+    (re.compile("(?i)(n)ews$"), "\\1ews"),
+    (re.compile("(?i)s$"), ""),
 ]
 
-# For performance, compile the regular expressions only once:
-for rule in singular_rules:
-    rule[0] = re.compile(rule[0])
-
 singular_uninflected = [
     "aircraft",
     "antelope",
@@ -833,7 +844,7 @@ def pluralize(word, pos=NOUN, custom=None, classical=True):
 }
 
 
-def singularize(word, pos=NOUN, custom=None):
+def singularize(word: str, pos=NOUN, custom: MutableMapping[str, str] | None = None):
     if custom is None:
         custom = {}
     if word in list(custom.keys()):
diff --git a/src/textblob/en/np_extractors.py b/src/textblob/en/np_extractors.py
index 489d6da9..6653e6fe 100644
--- a/src/textblob/en/np_extractors.py
+++ b/src/textblob/en/np_extractors.py
@@ -9,6 +9,8 @@
 
 
 class ChunkParser(nltk.ChunkParserI):
+    _trained: bool
+
     def __init__(self):
         self._trained = False
 
@@ -25,22 +27,21 @@ def train(self):
         self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger)
         self._trained = True
 
-    def parse(self, sentence):
+    def parse(self, tokens):
         """Return the parse tree for the sentence."""
         if not self._trained:
             self.train()
-        pos_tags = [pos for (word, pos) in sentence]
+        pos_tags = [pos for (_, pos) in tokens]
         tagged_pos_tags = self.tagger.tag(pos_tags)
-        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
+        chunktags = [chunktag for (_, chunktag) in tagged_pos_tags]
         conlltags = [
             (word, pos, chunktag)
-            for ((word, pos), chunktag) in zip(sentence, chunktags)
+            for ((word, pos), chunktag) in zip(tokens, chunktags)
         ]
         return nltk.chunk.util.conlltags2tree(conlltags)
 
 
 class ConllExtractor(BaseNPExtractor):
-
     """A noun phrase extractor that uses chunk parsing trained with the
     ConLL-2000 training corpus.
     """
@@ -89,7 +90,6 @@ def _parse_sentence(self, sentence):
 
 
 class FastNPExtractor(BaseNPExtractor):
-
     """A fast and simple noun phrase extractor.
 
     Credit to Shlomi Babluk. Link to original blog post:
@@ -97,6 +97,8 @@ class FastNPExtractor(BaseNPExtractor):
         http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
     """
 
+    _trained: bool
+
     CFG = {
         ("NNP", "NNP"): "NNP",
         ("NN", "NN"): "NNI",
@@ -137,11 +139,11 @@ def _tokenize_sentence(self, sentence):
         tokens = nltk.word_tokenize(sentence)
         return tokens
 
-    def extract(self, sentence):
+    def extract(self, text):
         """Return a list of noun phrases (strings) for body of text."""
         if not self._trained:
             self.train()
-        tokens = self._tokenize_sentence(sentence)
+        tokens = self._tokenize_sentence(text)
         tagged = self.tagger.tag(tokens)
         tags = _normalize_tags(tagged)
         merge = True
diff --git a/src/textblob/formats.py b/src/textblob/formats.py
index cff7c7a4..8b4e40ab 100644
--- a/src/textblob/formats.py
+++ b/src/textblob/formats.py
@@ -21,6 +21,8 @@ class PipeDelimitedFormat(formats.DelimitedFormat):
         cl = NaiveBayesAnalyzer(fp, format="psv")
 """
 
+from __future__ import annotations
+
 import csv
 import json
 from collections import OrderedDict
@@ -48,7 +50,7 @@ def to_iterable(self):
         raise NotImplementedError('Must implement a "to_iterable" method.')
 
     @classmethod
-    def detect(cls, stream):
+    def detect(cls, stream: str):
         """Detect the file format given a filename.
         Return True if a stream is this file format.
 
@@ -61,6 +63,7 @@ def detect(cls, stream):
 class DelimitedFormat(BaseFormat):
     """A general character-delimited format."""
 
+    data: list[list[str]]
     delimiter = ","
 
     def __init__(self, fp, **kwargs):
@@ -121,7 +124,7 @@ def to_iterable(self):
         return [(d["text"], d["label"]) for d in self.dict]
 
     @classmethod
-    def detect(cls, stream):
+    def detect(cls, stream: str | bytes | bytearray):
         """Return True if stream is valid JSON."""
         try:
             json.loads(stream)
diff --git a/src/textblob/mixins.py b/src/textblob/mixins.py
index 447171a5..65dff4ac 100644
--- a/src/textblob/mixins.py
+++ b/src/textblob/mixins.py
@@ -4,6 +4,9 @@
 class ComparableMixin:
     """Implements rich operators for an object."""
 
+    def _cmpkey(self):
+        raise NotImplementedError("Class must implement _cmpkey method")
+
     def _compare(self, other, method):
         try:
             return method(self._cmpkey(), other._cmpkey())
@@ -49,6 +52,9 @@ class StringlikeMixin:
     of __str__ ensures consistent behavior between Python 2 and 3.
     """
 
+    def _strkey(self) -> str:
+        raise NotImplementedError("Class must implement _strkey method")
+
     def __repr__(self):
         """Returns a string representation for debugging."""
         class_name = self.__class__.__name__
@@ -94,7 +100,7 @@ def find(self, sub, start=0, end=sys.maxsize):
 
     def rfind(self, sub, start=0, end=sys.maxsize):
         """Behaves like the built-in str.rfind() method. Returns an integer,
-        the index of he last (right-most) occurence of the substring argument
+        the index of the last (right-most) occurrence of the substring argument
         sub in the sub-sequence given by [start:end].
         """
         return self._strkey().rfind(sub, start, end)
@@ -161,7 +167,7 @@ def join(self, iterable):
         return self.__class__(self._strkey().join(iterable))
 
     def replace(self, old, new, count=sys.maxsize):
-        """Return a new blob object with all the occurence of `old` replaced
+        """Return a new blob object with all occurrences of `old` replaced
         by `new`.
         """
         return self.__class__(self._strkey().replace(old, new, count))
diff --git a/src/textblob/utils.py b/src/textblob/utils.py
index 7be12c9e..43883f23 100644
--- a/src/textblob/utils.py
+++ b/src/textblob/utils.py
@@ -1,10 +1,16 @@
+from __future__ import annotations
+
 import re
 import string
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
 
 PUNCTUATION_REGEX = re.compile(f"[{re.escape(string.punctuation)}]")
 
 
-def strip_punc(s, all=False):
+def strip_punc(s: str, all=False):
     """Removes punctuation from a string.
 
     :param s: The string.
@@ -17,7 +23,7 @@ def strip_punc(s, all=False):
         return s.strip().strip(string.punctuation)
 
 
-def lowerstrip(s, all=False):
+def lowerstrip(s: str, all=False):
     """Makes text all lowercase and strips punctuation and whitespace.
 
     :param s: The string.
@@ -33,12 +39,14 @@ def tree2str(tree, concat=" "):
     For example:
         (NP a/DT beautiful/JJ new/JJ dashboard/NN) -> "a beautiful dashboard"
     """
-    return concat.join([word for (word, tag) in tree])
+    return concat.join([word for (word, _) in tree])
 
 
-def filter_insignificant(chunk, tag_suffixes=("DT", "CC", "PRP$", "PRP")):
+def filter_insignificant(
+    chunk, tag_suffixes: Iterable[str] = ("DT", "CC", "PRP$", "PRP")
+):
     """Filter out insignificant (word, tag) tuples from a chunk of text."""
-    good = []
+    good: list[tuple[str, str]] = []
     for word, tag in chunk:
         ok = True
         for suffix in tag_suffixes:
@@ -52,4 +60,8 @@ def filter_insignificant(chunk, tag_suffixes=("DT", "CC", "PRP$", "PRP")):
 
 def is_filelike(obj):
     """Return whether ``obj`` is a file-like object."""
-    return hasattr(obj, "read")
+    if not hasattr(obj, "read"):
+        return False
+    if not callable(obj.read):
+        return False
+    return True