From c24ed35aef294ff831c62c7c2232fe499bf68e6b Mon Sep 17 00:00:00 2001 From: John Franey <1728528+johnfraney@users.noreply.github.com> Date: Mon, 20 Jan 2025 10:21:09 -0400 Subject: [PATCH] typing: add initial types --- pyproject.toml | 6 +- src/textblob/_text.py | 5 +- src/textblob/base.py | 30 +++++---- src/textblob/blob.py | 6 +- src/textblob/decorators.py | 13 +++- src/textblob/en/inflect.py | 103 +++++++++++++++++-------------- src/textblob/en/np_extractors.py | 18 +++--- src/textblob/formats.py | 7 ++- src/textblob/mixins.py | 10 ++- src/textblob/utils.py | 24 +++++-- 10 files changed, 139 insertions(+), 83 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2f1712c5..e52d64bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ Source = "https://github.com/sloria/TextBlob" [project.optional-dependencies] docs = ["sphinx==8.1.3", "sphinx-issues==5.0.0", "PyYAML==6.0.2"] tests = ["pytest", "numpy"] -dev = ["textblob[tests]", "tox", "pre-commit~=3.5"] +dev = ["textblob[tests]", "tox", "pre-commit~=3.5", "pyright", "ruff"] [build-system] requires = ["flit_core<4"] @@ -96,3 +96,7 @@ markers = [ "slow: marks tests as slow (deselect with '-m \"not slow\"')", "numpy: marks tests that require numpy", ] + +[tool.pyright] +include = ["src/**"] +exclude = ["tests/**"] diff --git a/src/textblob/_text.py b/src/textblob/_text.py index 62cc0d55..606346e3 100644 --- a/src/textblob/_text.py +++ b/src/textblob/_text.py @@ -1570,9 +1570,8 @@ def parse( TOKENS = "tokens" - class TaggedString(str): - def __new__(self, string, tags=None, language=None): + def __new__(cls, string, tags=None, language=None): """Unicode string with tags and language attributes. For example: TaggedString("cat/NN/NP", tags=["word", "pos", "chunk"]). """ @@ -1588,7 +1587,7 @@ def __new__(self, string, tags=None, language=None): for s in string ] string = "\n".join(" ".join("/".join(token) for token in s) for s in string) - s = str.__new__(self, string) + s = str.__new__(cls, string) s.tags = list(tags) s.language = language return s diff --git a/src/textblob/base.py b/src/textblob/base.py index 2c726073..dd20d517 100644 --- a/src/textblob/base.py +++ b/src/textblob/base.py @@ -5,10 +5,16 @@ All base classes are defined in the same module, ``textblob.base``. """ +from __future__ import annotations + from abc import ABCMeta, abstractmethod +from typing import TYPE_CHECKING import nltk +if TYPE_CHECKING: + from typing import Any, AnyStr + ##### POS TAGGERS ##### @@ -19,11 +25,11 @@ class BaseTagger(metaclass=ABCMeta): """ @abstractmethod - def tag(self, text, tokenize=True): + def tag(self, text: str, tokenize=True) -> list[tuple[str, str]]: """Return a list of tuples of the form (word, tag) for a given set of text or BaseBlob instance. """ - return + raise NotImplementedError("Subclass must implement a tag method") ##### NOUN PHRASE EXTRACTORS ##### @@ -36,29 +42,29 @@ class BaseNPExtractor(metaclass=ABCMeta): """ @abstractmethod - def extract(self, text): + def extract(self, text: str) -> list[str]: """Return a list of noun phrases (strings) for a body of text.""" - return + raise NotImplementedError("Subclass must implement an extract method") ##### TOKENIZERS ##### -class BaseTokenizer(nltk.tokenize.api.TokenizerI, metaclass=ABCMeta): +class BaseTokenizer(nltk.tokenize.api.TokenizerI, metaclass=ABCMeta): # pyright: ignore """Abstract base class from which all Tokenizer classes inherit. Descendant classes must implement a ``tokenize(text)`` method that returns a list of noun phrases as strings. """ @abstractmethod - def tokenize(self, text): + def tokenize(self, text: str) -> list[str]: """Return a list of tokens (strings) for a body of text. :rtype: list """ - return + return NotImplemented - def itokenize(self, text, *args, **kwargs): + def itokenize(self, text: str, *args, **kwargs): """Return a generator that generates tokens "on-demand". .. versionadded:: 0.6.0 @@ -81,6 +87,8 @@ class BaseSentimentAnalyzer(metaclass=ABCMeta): results of analysis. """ + _trained: bool + kind = DISCRETE def __init__(self): @@ -91,7 +99,7 @@ def train(self): self._trained = True @abstractmethod - def analyze(self, text): + def analyze(self, text) -> Any: """Return the result of of analysis. Typically returns either a tuple, float, or dictionary. """ @@ -111,6 +119,6 @@ class BaseParser(metaclass=ABCMeta): """ @abstractmethod - def parse(self, text): + def parse(self, text: AnyStr): """Parses the text.""" - return + raise NotImplementedError("Subclass must implement a parse method") diff --git a/src/textblob/blob.py b/src/textblob/blob.py index d26e2f0b..eeaa28d0 100644 --- a/src/textblob/blob.py +++ b/src/textblob/blob.py @@ -138,9 +138,9 @@ def lemmatize(self, pos=None): lemmatizer = nltk.stem.WordNetLemmatizer() return lemmatizer.lemmatize(self.string, tag) - PorterStemmer = nltk.stem.porter.PorterStemmer() - LancasterStemmer = nltk.stem.lancaster.LancasterStemmer() - SnowballStemmer = nltk.stem.snowball.SnowballStemmer("english") + PorterStemmer = nltk.stem.PorterStemmer() + LancasterStemmer = nltk.stem.LancasterStemmer() + SnowballStemmer = nltk.stem.SnowballStemmer("english") # added 'stemmer' on lines of lemmatizer # based on nltk diff --git a/src/textblob/decorators.py b/src/textblob/decorators.py index 9b91ce87..ef5ace04 100644 --- a/src/textblob/decorators.py +++ b/src/textblob/decorators.py @@ -1,9 +1,18 @@ """Custom decorators.""" +from __future__ import annotations + from functools import wraps +from typing import TYPE_CHECKING from textblob.exceptions import MissingCorpusError +if TYPE_CHECKING: + from collections.abc import Callable + from typing import TypeVar + + ReturnType = TypeVar("ReturnType") + class cached_property: """A property that is only computed once per instance and then replaces @@ -24,7 +33,9 @@ def __get__(self, obj, cls): return value -def requires_nltk_corpus(func): +def requires_nltk_corpus( + func: Callable[..., ReturnType], +) -> Callable[..., ReturnType]: """Wraps a function that requires an NLTK corpus. If the corpus isn't found, raise a :exc:`MissingCorpusError`. """ diff --git a/src/textblob/en/inflect.py b/src/textblob/en/inflect.py index 3d4ba244..5c6b13df 100644 --- a/src/textblob/en/inflect.py +++ b/src/textblob/en/inflect.py @@ -4,7 +4,15 @@ See here https://github.com/clips/pattern/blob/master/LICENSE.txt for complete license information. """ + +from __future__ import annotations +from collections.abc import MutableMapping import re +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from typing import AnyStr + VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB" @@ -523,7 +531,7 @@ } -def pluralize(word, pos=NOUN, custom=None, classical=True): +def pluralize(word: str, pos=NOUN, custom=None, classical=True) -> str: """Returns the plural of a given word. For example: child -> children. Handles nouns and adjectives, using classical inflection by default @@ -584,6 +592,7 @@ def pluralize(word, pos=NOUN, custom=None, classical=True): ): if suffix.search(word) is not None: return suffix.sub(inflection, word) + return word #### SINGULARIZE ################################################################################### @@ -607,55 +616,57 @@ def pluralize(word, pos=NOUN, custom=None, classical=True): # THIS SOFTWARE. singular_rules = [ - ["(?i)(.)ae$", "\\1a"], - ["(?i)(.)itis$", "\\1itis"], - ["(?i)(.)eaux$", "\\1eau"], - ["(?i)(quiz)zes$", "\\1"], - ["(?i)(matr)ices$", "\\1ix"], - ["(?i)(ap|vert|ind)ices$", "\\1ex"], - ["(?i)^(ox)en", "\\1"], - ["(?i)(alias|status)es$", "\\1"], - ["(?i)([octop|vir])i$", "\\1us"], - ["(?i)(cris|ax|test)es$", "\\1is"], - ["(?i)(shoe)s$", "\\1"], - ["(?i)(o)es$", "\\1"], - ["(?i)(bus)es$", "\\1"], - ["(?i)([m|l])ice$", "\\1ouse"], - ["(?i)(x|ch|ss|sh)es$", "\\1"], - ["(?i)(m)ovies$", "\\1ovie"], - ["(?i)(.)ombies$", "\\1ombie"], - ["(?i)(s)eries$", "\\1eries"], - ["(?i)([^aeiouy]|qu)ies$", "\\1y"], + (re.compile("(?i)(.)ae$"), "\\1a"), + (re.compile("(?i)(.)itis$"), "\\1itis"), + (re.compile("(?i)(.)eaux$"), "\\1eau"), + (re.compile("(?i)(quiz)zes$"), "\\1"), + (re.compile("(?i)(matr)ices$"), "\\1ix"), + (re.compile("(?i)(ap|vert|ind)ices$"), "\\1ex"), + (re.compile("(?i)^(ox)en"), "\\1"), + (re.compile("(?i)(alias|status)es$"), "\\1"), + (re.compile("(?i)([octop|vir])i$"), "\\1us"), + (re.compile("(?i)(cris|ax|test)es$"), "\\1is"), + (re.compile("(?i)(shoe)s$"), "\\1"), + (re.compile("(?i)(o)es$"), "\\1"), + (re.compile("(?i)(bus)es$"), "\\1"), + (re.compile("(?i)([m|l])ice$"), "\\1ouse"), + (re.compile("(?i)(x|ch|ss|sh)es$"), "\\1"), + (re.compile("(?i)(m)ovies$"), "\\1ovie"), + (re.compile("(?i)(.)ombies$"), "\\1ombie"), + (re.compile("(?i)(s)eries$"), "\\1eries"), + (re.compile("(?i)([^aeiouy]|qu)ies$"), "\\1y"), # Certain words ending in -f or -fe take -ves in the plural (lives, wolves). - ["([aeo]l)ves$", "\\1f"], - ["([^d]ea)ves$", "\\1f"], - ["arves$", "arf"], - ["erves$", "erve"], - ["([nlw]i)ves$", "\\1fe"], - ["(?i)([lr])ves$", "\\1f"], - ["([aeo])ves$", "\\1ve"], - ["(?i)(sive)s$", "\\1"], - ["(?i)(tive)s$", "\\1"], - ["(?i)(hive)s$", "\\1"], - ["(?i)([^f])ves$", "\\1fe"], + (re.compile("([aeo]l)ves$"), "\\1f"), + (re.compile("([^d]ea)ves$"), "\\1f"), + (re.compile("arves$"), "arf"), + (re.compile("erves$"), "erve"), + (re.compile("([nlw]i)ves$"), "\\1fe"), + (re.compile("(?i)([lr])ves$"), "\\1f"), + (re.compile("([aeo])ves$"), "\\1ve"), + (re.compile("(?i)(sive)s$"), "\\1"), + (re.compile("(?i)(tive)s$"), "\\1"), + (re.compile("(?i)(hive)s$"), "\\1"), + (re.compile("(?i)([^f])ves$"), "\\1fe"), # -es suffix. - ["(?i)(^analy)ses$", "\\1sis"], - ["(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "\\1\\2sis"], - ["(?i)(.)opses$", "\\1opsis"], - ["(?i)(.)yses$", "\\1ysis"], - ["(?i)(h|d|r|o|n|b|cl|p)oses$", "\\1ose"], - ["(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$", "\\1ose"], - ["(?i)(.)oses$", "\\1osis"], + (re.compile("(?i)(^analy)ses$"), "\\1sis"), + ( + re.compile("(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$"), + "\\1\\2sis", + ), + (re.compile("(?i)(.)opses$"), "\\1opsis"), + (re.compile("(?i)(.)yses$"), "\\1ysis"), + (re.compile("(?i)(h|d|r|o|n|b|cl|p)oses$"), "\\1ose"), + ( + re.compile("(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$"), + "\\1ose", + ), + (re.compile("(?i)(.)oses$"), "\\1osis"), # -a - ["(?i)([ti])a$", "\\1um"], - ["(?i)(n)ews$", "\\1ews"], - ["(?i)s$", ""], + (re.compile("(?i)([ti])a$"), "\\1um"), + (re.compile("(?i)(n)ews$"), "\\1ews"), + (re.compile("(?i)s$"), ""), ] -# For performance, compile the regular expressions only once: -for rule in singular_rules: - rule[0] = re.compile(rule[0]) - singular_uninflected = [ "aircraft", "antelope", @@ -833,7 +844,7 @@ def pluralize(word, pos=NOUN, custom=None, classical=True): } -def singularize(word, pos=NOUN, custom=None): +def singularize(word: str, pos=NOUN, custom: MutableMapping[str, str] | None = None): if custom is None: custom = {} if word in list(custom.keys()): diff --git a/src/textblob/en/np_extractors.py b/src/textblob/en/np_extractors.py index 489d6da9..6653e6fe 100644 --- a/src/textblob/en/np_extractors.py +++ b/src/textblob/en/np_extractors.py @@ -9,6 +9,8 @@ class ChunkParser(nltk.ChunkParserI): + _trained: bool + def __init__(self): self._trained = False @@ -25,22 +27,21 @@ def train(self): self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger) self._trained = True - def parse(self, sentence): + def parse(self, tokens): """Return the parse tree for the sentence.""" if not self._trained: self.train() - pos_tags = [pos for (word, pos) in sentence] + pos_tags = [pos for (_, pos) in tokens] tagged_pos_tags = self.tagger.tag(pos_tags) - chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags] + chunktags = [chunktag for (_, chunktag) in tagged_pos_tags] conlltags = [ (word, pos, chunktag) - for ((word, pos), chunktag) in zip(sentence, chunktags) + for ((word, pos), chunktag) in zip(tokens, chunktags) ] return nltk.chunk.util.conlltags2tree(conlltags) class ConllExtractor(BaseNPExtractor): - """A noun phrase extractor that uses chunk parsing trained with the ConLL-2000 training corpus. """ @@ -89,7 +90,6 @@ def _parse_sentence(self, sentence): class FastNPExtractor(BaseNPExtractor): - """A fast and simple noun phrase extractor. Credit to Shlomi Babluk. Link to original blog post: @@ -97,6 +97,8 @@ class FastNPExtractor(BaseNPExtractor): http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/ """ + _trained: bool + CFG = { ("NNP", "NNP"): "NNP", ("NN", "NN"): "NNI", @@ -137,11 +139,11 @@ def _tokenize_sentence(self, sentence): tokens = nltk.word_tokenize(sentence) return tokens - def extract(self, sentence): + def extract(self, text): """Return a list of noun phrases (strings) for body of text.""" if not self._trained: self.train() - tokens = self._tokenize_sentence(sentence) + tokens = self._tokenize_sentence(text) tagged = self.tagger.tag(tokens) tags = _normalize_tags(tagged) merge = True diff --git a/src/textblob/formats.py b/src/textblob/formats.py index cff7c7a4..8b4e40ab 100644 --- a/src/textblob/formats.py +++ b/src/textblob/formats.py @@ -21,6 +21,8 @@ class PipeDelimitedFormat(formats.DelimitedFormat): cl = NaiveBayesAnalyzer(fp, format="psv") """ +from __future__ import annotations + import csv import json from collections import OrderedDict @@ -48,7 +50,7 @@ def to_iterable(self): raise NotImplementedError('Must implement a "to_iterable" method.') @classmethod - def detect(cls, stream): + def detect(cls, stream: str): """Detect the file format given a filename. Return True if a stream is this file format. @@ -61,6 +63,7 @@ def detect(cls, stream): class DelimitedFormat(BaseFormat): """A general character-delimited format.""" + data: list[list[str]] delimiter = "," def __init__(self, fp, **kwargs): @@ -121,7 +124,7 @@ def to_iterable(self): return [(d["text"], d["label"]) for d in self.dict] @classmethod - def detect(cls, stream): + def detect(cls, stream: str | bytes | bytearray): """Return True if stream is valid JSON.""" try: json.loads(stream) diff --git a/src/textblob/mixins.py b/src/textblob/mixins.py index 447171a5..65dff4ac 100644 --- a/src/textblob/mixins.py +++ b/src/textblob/mixins.py @@ -4,6 +4,9 @@ class ComparableMixin: """Implements rich operators for an object.""" + def _cmpkey(self): + raise NotImplementedError("Class must implement _cmpkey method") + def _compare(self, other, method): try: return method(self._cmpkey(), other._cmpkey()) @@ -49,6 +52,9 @@ class StringlikeMixin: of __str__ ensures consistent behavior between Python 2 and 3. """ + def _strkey(self) -> str: + raise NotImplementedError("Class must implement _strkey method") + def __repr__(self): """Returns a string representation for debugging.""" class_name = self.__class__.__name__ @@ -94,7 +100,7 @@ def find(self, sub, start=0, end=sys.maxsize): def rfind(self, sub, start=0, end=sys.maxsize): """Behaves like the built-in str.rfind() method. Returns an integer, - the index of he last (right-most) occurence of the substring argument + the index of the last (right-most) occurrence of the substring argument sub in the sub-sequence given by [start:end]. """ return self._strkey().rfind(sub, start, end) @@ -161,7 +167,7 @@ def join(self, iterable): return self.__class__(self._strkey().join(iterable)) def replace(self, old, new, count=sys.maxsize): - """Return a new blob object with all the occurence of `old` replaced + """Return a new blob object with all occurrences of `old` replaced by `new`. """ return self.__class__(self._strkey().replace(old, new, count)) diff --git a/src/textblob/utils.py b/src/textblob/utils.py index 7be12c9e..43883f23 100644 --- a/src/textblob/utils.py +++ b/src/textblob/utils.py @@ -1,10 +1,16 @@ +from __future__ import annotations + import re import string +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Iterable PUNCTUATION_REGEX = re.compile(f"[{re.escape(string.punctuation)}]") -def strip_punc(s, all=False): +def strip_punc(s: str, all=False): """Removes punctuation from a string. :param s: The string. @@ -17,7 +23,7 @@ def strip_punc(s, all=False): return s.strip().strip(string.punctuation) -def lowerstrip(s, all=False): +def lowerstrip(s: str, all=False): """Makes text all lowercase and strips punctuation and whitespace. :param s: The string. @@ -33,12 +39,14 @@ def tree2str(tree, concat=" "): For example: (NP a/DT beautiful/JJ new/JJ dashboard/NN) -> "a beautiful dashboard" """ - return concat.join([word for (word, tag) in tree]) + return concat.join([word for (word, _) in tree]) -def filter_insignificant(chunk, tag_suffixes=("DT", "CC", "PRP$", "PRP")): +def filter_insignificant( + chunk, tag_suffixes: Iterable[str] = ("DT", "CC", "PRP$", "PRP") +): """Filter out insignificant (word, tag) tuples from a chunk of text.""" - good = [] + good: list[tuple[str, str]] = [] for word, tag in chunk: ok = True for suffix in tag_suffixes: @@ -52,4 +60,8 @@ def filter_insignificant(chunk, tag_suffixes=("DT", "CC", "PRP$", "PRP")): def is_filelike(obj): """Return whether ``obj`` is a file-like object.""" - return hasattr(obj, "read") + if not hasattr(obj, "read"): + return False + if not callable(obj.read): + return False + return True