typing: add initial types
johnfraney committed Jan 20, 2025
1 parent 1a4f357 commit 813a787
Showing 11 changed files with 151 additions and 95 deletions.
6 changes: 5 additions & 1 deletion pyproject.toml
@@ -28,7 +28,7 @@ Source = "https://github.com/sloria/TextBlob"
[project.optional-dependencies]
docs = ["sphinx==8.1.3", "sphinx-issues==5.0.0", "PyYAML==6.0.2"]
tests = ["pytest", "numpy"]
dev = ["textblob[tests]", "tox", "pre-commit~=3.5"]
dev = ["textblob[tests]", "tox", "pre-commit~=3.5", "pyright", "ruff"]

[build-system]
requires = ["flit_core<4"]
@@ -96,3 +96,7 @@ markers = [
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
"numpy: marks tests that require numpy",
]

[tool.pyright]
include = ["src/**"]
exclude = ["tests/**"]
23 changes: 11 additions & 12 deletions src/textblob/_text.py
@@ -124,7 +124,7 @@ def keys(self):
def values(self):
return self._lazy("values")

def update(self, *args):
def update(self, *args, **kwargs):
return self._lazy("update", *args)

def pop(self, *args):
@@ -324,10 +324,10 @@ def penntreebank2universal(token, tag):
("cry", -1.00): set((":'(", ":'''(", ";'(")),
}

RE_EMOTICONS = [
TEMP_RE_EMOTICONS = [
r" ?".join([re.escape(each) for each in e]) for v in EMOTICONS.values() for e in v
]
RE_EMOTICONS = re.compile(r"(%s)($|\s)" % "|".join(RE_EMOTICONS))
RE_EMOTICONS = re.compile(r"(%s)($|\s)" % "|".join(TEMP_RE_EMOTICONS))

# Handle sarcasm punctuation (!).
RE_SARCASM = re.compile(r"\( ?\! ?\)")
@@ -490,9 +490,9 @@ class Lexicon(lazydict):
def __init__(
self,
path="",
morphology=None,
context=None,
entities=None,
morphology="",
context="",
entities="",
NNP="NNP",
language=None,
):
@@ -724,7 +724,7 @@ def apply(self, tokens):
t[i] = [t[i][0], r[1]]
return t[len(o) : -len(o)]

def insert(self, i, tag1, tag2, cmd="prevtag", x=None, y=None):
def insert(self, i, tag1, tag2, cmd="prevtag", x=None, y=None, *args):
"""Inserts a new rule that updates words with tag1 to tag2,
given constraints x and y, e.g., Context.append("TO < NN", "VB")
"""
@@ -739,7 +739,7 @@ def insert(self, i, tag1, tag2, cmd="prevtag", x=None, y=None):
def append(self, *args, **kwargs):
self.insert(len(self) - 1, *args, **kwargs)

def extend(self, rules=None):
def extend(self, rules=None, *args):
if rules is None:
rules = []
for r in rules:
@@ -1570,9 +1570,8 @@ def parse(

TOKENS = "tokens"


class TaggedString(str):
def __new__(self, string, tags=None, language=None):
def __new__(cls, string, tags=None, language=None):
"""Unicode string with tags and language attributes.
For example: TaggedString("cat/NN/NP", tags=["word", "pos", "chunk"]).
"""
@@ -1588,7 +1587,7 @@ def __new__(self, string, tags=None, language=None):
for s in string
]
string = "\n".join(" ".join("/".join(token) for token in s) for s in string)
s = str.__new__(self, string)
s = str.__new__(cls, string)
s.tags = list(tags)
s.language = language
return s
@@ -1634,7 +1633,7 @@ def language(self):
return self._language

@classmethod
def train(self, s, path="spelling.txt"):
def train(cls, s, path="spelling.txt"):
"""Counts the words in the given string and saves the probabilities at the given path.
This can be used to generate a new model for the Spelling() constructor.
"""
30 changes: 19 additions & 11 deletions src/textblob/base.py
@@ -5,10 +5,16 @@
All base classes are defined in the same module, ``textblob.base``.
"""

from __future__ import annotations

from abc import ABCMeta, abstractmethod
from typing import TYPE_CHECKING

import nltk

if TYPE_CHECKING:
from typing import Any, AnyStr

##### POS TAGGERS #####


@@ -19,11 +25,11 @@ class BaseTagger(metaclass=ABCMeta):
"""

@abstractmethod
def tag(self, text, tokenize=True):
def tag(self, text: str, tokenize=True) -> list[tuple[str, str]]:
"""Return a list of tuples of the form (word, tag)
for a given set of text or BaseBlob instance.
"""
return
raise NotImplementedError("Subclass must implement a tag method")


##### NOUN PHRASE EXTRACTORS #####
@@ -36,29 +42,29 @@ class BaseNPExtractor(metaclass=ABCMeta):
"""

@abstractmethod
def extract(self, text):
def extract(self, text: str) -> list[str]:
"""Return a list of noun phrases (strings) for a body of text."""
return
raise NotImplementedError("Subclass must implement an extract method")


##### TOKENIZERS #####


class BaseTokenizer(nltk.tokenize.api.TokenizerI, metaclass=ABCMeta):
class BaseTokenizer(nltk.tokenize.api.TokenizerI, metaclass=ABCMeta): # pyright: ignore
"""Abstract base class from which all Tokenizer classes inherit.
Descendant classes must implement a ``tokenize(text)`` method
that returns a list of noun phrases as strings.
"""

@abstractmethod
def tokenize(self, text):
def tokenize(self, text: str) -> list[str]:
"""Return a list of tokens (strings) for a body of text.
:rtype: list
"""
return
raise NotImplementedError("Subclasss must implement tokenize method")

def itokenize(self, text, *args, **kwargs):
def itokenize(self, text: str, *args, **kwargs):
"""Return a generator that generates tokens "on-demand".
.. versionadded:: 0.6.0
@@ -81,6 +87,8 @@ class BaseSentimentAnalyzer(metaclass=ABCMeta):
results of analysis.
"""

_trained: bool

kind = DISCRETE

def __init__(self):
@@ -91,7 +99,7 @@ def train(self):
self._trained = True

@abstractmethod
def analyze(self, text):
def analyze(self, text) -> Any:
"""Return the result of of analysis. Typically returns either a
tuple, float, or dictionary.
"""
@@ -111,6 +119,6 @@ class BaseParser(metaclass=ABCMeta):
"""

@abstractmethod
def parse(self, text):
def parse(self, text: AnyStr):
"""Parses the text."""
return
raise NotImplementedError("Subclass must implement a parse method")
6 changes: 3 additions & 3 deletions src/textblob/blob.py
@@ -138,9 +138,9 @@ def lemmatize(self, pos=None):
lemmatizer = nltk.stem.WordNetLemmatizer()
return lemmatizer.lemmatize(self.string, tag)

PorterStemmer = nltk.stem.porter.PorterStemmer()
LancasterStemmer = nltk.stem.lancaster.LancasterStemmer()
SnowballStemmer = nltk.stem.snowball.SnowballStemmer("english")
PorterStemmer = nltk.stem.PorterStemmer()
LancasterStemmer = nltk.stem.LancasterStemmer()
SnowballStemmer = nltk.stem.SnowballStemmer("english")

# added 'stemmer' on lines of lemmatizer
# based on nltk
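
The stemmers are now referenced through the nltk.stem package rather than its implementation submodules; the objects are the same either way. A quick usage sketch (assumes nltk is installed):

import nltk

stemmer = nltk.stem.PorterStemmer()
print(stemmer.stem("running"))  # "run"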
4 changes: 2 additions & 2 deletions src/textblob/classifiers.py
@@ -510,8 +510,8 @@ def update(


class MaxEntClassifier(NLTKClassifier):
__doc__ = nltk.classify.maxent.MaxentClassifier.__doc__
nltk_class = nltk.classify.maxent.MaxentClassifier
__doc__ = nltk.classify.MaxentClassifier.__doc__
nltk_class = nltk.classify.MaxentClassifier

def prob_classify(self, text):
"""Return the label probability distribution for classifying a string
13 changes: 12 additions & 1 deletion src/textblob/decorators.py
@@ -1,9 +1,18 @@
"""Custom decorators."""

from __future__ import annotations

from functools import wraps
from typing import TYPE_CHECKING

from textblob.exceptions import MissingCorpusError

if TYPE_CHECKING:
from collections.abc import Callable
from typing import TypeVar

ReturnType = TypeVar("ReturnType")


class cached_property:
"""A property that is only computed once per instance and then replaces
@@ -24,7 +33,9 @@ def __get__(self, obj, cls):
return value


def requires_nltk_corpus(func):
def requires_nltk_corpus(
func: Callable[..., ReturnType],
) -> Callable[..., ReturnType]:
"""Wraps a function that requires an NLTK corpus. If the corpus isn't found,
raise a :exc:`MissingCorpusError`.
"""
103 changes: 57 additions & 46 deletions src/textblob/en/inflect.py
@@ -4,7 +4,15 @@
See here https://github.com/clips/pattern/blob/master/LICENSE.txt for
complete license information.
"""

from __future__ import annotations
from collections.abc import MutableMapping
import re
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from typing import AnyStr


VERB, NOUN, ADJECTIVE, ADVERB = "VB", "NN", "JJ", "RB"

@@ -523,7 +531,7 @@
}


def pluralize(word, pos=NOUN, custom=None, classical=True):
def pluralize(word: str, pos=NOUN, custom=None, classical=True) -> str:
"""Returns the plural of a given word.
For example: child -> children.
Handles nouns and adjectives, using classical inflection by default
@@ -584,6 +592,7 @@ def pluralize(word, pos=NOUN, custom=None, classical=True):
):
if suffix.search(word) is not None:
return suffix.sub(inflection, word)
return word


#### SINGULARIZE ###################################################################################
@@ -607,55 +616,57 @@ def pluralize(word, pos=NOUN, custom=None, classical=True):
# THIS SOFTWARE.

singular_rules = [
["(?i)(.)ae$", "\\1a"],
["(?i)(.)itis$", "\\1itis"],
["(?i)(.)eaux$", "\\1eau"],
["(?i)(quiz)zes$", "\\1"],
["(?i)(matr)ices$", "\\1ix"],
["(?i)(ap|vert|ind)ices$", "\\1ex"],
["(?i)^(ox)en", "\\1"],
["(?i)(alias|status)es$", "\\1"],
["(?i)([octop|vir])i$", "\\1us"],
["(?i)(cris|ax|test)es$", "\\1is"],
["(?i)(shoe)s$", "\\1"],
["(?i)(o)es$", "\\1"],
["(?i)(bus)es$", "\\1"],
["(?i)([m|l])ice$", "\\1ouse"],
["(?i)(x|ch|ss|sh)es$", "\\1"],
["(?i)(m)ovies$", "\\1ovie"],
["(?i)(.)ombies$", "\\1ombie"],
["(?i)(s)eries$", "\\1eries"],
["(?i)([^aeiouy]|qu)ies$", "\\1y"],
(re.compile("(?i)(.)ae$"), "\\1a"),
(re.compile("(?i)(.)itis$"), "\\1itis"),
(re.compile("(?i)(.)eaux$"), "\\1eau"),
(re.compile("(?i)(quiz)zes$"), "\\1"),
(re.compile("(?i)(matr)ices$"), "\\1ix"),
(re.compile("(?i)(ap|vert|ind)ices$"), "\\1ex"),
(re.compile("(?i)^(ox)en"), "\\1"),
(re.compile("(?i)(alias|status)es$"), "\\1"),
(re.compile("(?i)([octop|vir])i$"), "\\1us"),
(re.compile("(?i)(cris|ax|test)es$"), "\\1is"),
(re.compile("(?i)(shoe)s$"), "\\1"),
(re.compile("(?i)(o)es$"), "\\1"),
(re.compile("(?i)(bus)es$"), "\\1"),
(re.compile("(?i)([m|l])ice$"), "\\1ouse"),
(re.compile("(?i)(x|ch|ss|sh)es$"), "\\1"),
(re.compile("(?i)(m)ovies$"), "\\1ovie"),
(re.compile("(?i)(.)ombies$"), "\\1ombie"),
(re.compile("(?i)(s)eries$"), "\\1eries"),
(re.compile("(?i)([^aeiouy]|qu)ies$"), "\\1y"),
# Certain words ending in -f or -fe take -ves in the plural (lives, wolves).
["([aeo]l)ves$", "\\1f"],
["([^d]ea)ves$", "\\1f"],
["arves$", "arf"],
["erves$", "erve"],
["([nlw]i)ves$", "\\1fe"],
["(?i)([lr])ves$", "\\1f"],
["([aeo])ves$", "\\1ve"],
["(?i)(sive)s$", "\\1"],
["(?i)(tive)s$", "\\1"],
["(?i)(hive)s$", "\\1"],
["(?i)([^f])ves$", "\\1fe"],
(re.compile("([aeo]l)ves$"), "\\1f"),
(re.compile("([^d]ea)ves$"), "\\1f"),
(re.compile("arves$"), "arf"),
(re.compile("erves$"), "erve"),
(re.compile("([nlw]i)ves$"), "\\1fe"),
(re.compile("(?i)([lr])ves$"), "\\1f"),
(re.compile("([aeo])ves$"), "\\1ve"),
(re.compile("(?i)(sive)s$"), "\\1"),
(re.compile("(?i)(tive)s$"), "\\1"),
(re.compile("(?i)(hive)s$"), "\\1"),
(re.compile("(?i)([^f])ves$"), "\\1fe"),
# -es suffix.
["(?i)(^analy)ses$", "\\1sis"],
["(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "\\1\\2sis"],
["(?i)(.)opses$", "\\1opsis"],
["(?i)(.)yses$", "\\1ysis"],
["(?i)(h|d|r|o|n|b|cl|p)oses$", "\\1ose"],
["(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$", "\\1ose"],
["(?i)(.)oses$", "\\1osis"],
(re.compile("(?i)(^analy)ses$"), "\\1sis"),
(
re.compile("(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$"),
"\\1\\2sis",
),
(re.compile("(?i)(.)opses$"), "\\1opsis"),
(re.compile("(?i)(.)yses$"), "\\1ysis"),
(re.compile("(?i)(h|d|r|o|n|b|cl|p)oses$"), "\\1ose"),
(
re.compile("(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$"),
"\\1ose",
),
(re.compile("(?i)(.)oses$"), "\\1osis"),
# -a
["(?i)([ti])a$", "\\1um"],
["(?i)(n)ews$", "\\1ews"],
["(?i)s$", ""],
(re.compile("(?i)([ti])a$"), "\\1um"),
(re.compile("(?i)(n)ews$"), "\\1ews"),
(re.compile("(?i)s$"), ""),
]

# For performance, compile the regular expressions only once:
for rule in singular_rules:
rule[0] = re.compile(rule[0])

singular_uninflected = [
"aircraft",
"antelope",
@@ -833,7 +844,7 @@ def pluralize(word, pos=NOUN, custom=None, classical=True):
}


def singularize(word, pos=NOUN, custom=None):
def singularize(word: str, pos=NOUN, custom: MutableMapping[str, str] | None = None):
if custom is None:
custom = {}
if word in list(custom.keys()):
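
With singular_rules stored as (compiled pattern, replacement) tuples, the loop that previously compiled the patterns in place is no longer needed and each rule can be applied directly. A small sketch of how such a rule table is typically consumed (simplified, not the library's singularize logic):

import re

rules = [
    (re.compile("(?i)(matr)ices$"), "\\1ix"),
    (re.compile("(?i)([^aeiouy]|qu)ies$"), "\\1y"),
    (re.compile("(?i)s$"), ""),
]

def apply_rules(word: str) -> str:
    # apply the first rule whose pattern matches, otherwise return the word unchanged
    for pattern, replacement in rules:
        if pattern.search(word) is not None:
            return pattern.sub(replacement, word)
    return word

print(apply_rules("matrices"))  # "matrix"
print(apply_rules("berries"))   # "berry"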
20 changes: 11 additions & 9 deletions src/textblob/en/np_extractors.py
@@ -9,6 +9,8 @@


class ChunkParser(nltk.ChunkParserI):
_trained: bool

def __init__(self):
self._trained = False

@@ -25,22 +27,21 @@ def train(self):
self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger)
self._trained = True

def parse(self, sentence):
def parse(self, tokens):
"""Return the parse tree for the sentence."""
if not self._trained:
self.train()
pos_tags = [pos for (word, pos) in sentence]
pos_tags = [pos for (_, pos) in tokens]
tagged_pos_tags = self.tagger.tag(pos_tags)
chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
chunktags = [chunktag for (_, chunktag) in tagged_pos_tags]
conlltags = [
(word, pos, chunktag)
for ((word, pos), chunktag) in zip(sentence, chunktags)
for ((word, pos), chunktag) in zip(tokens, chunktags)
]
return nltk.chunk.util.conlltags2tree(conlltags)
return nltk.chunk.conlltags2tree(conlltags)


class ConllExtractor(BaseNPExtractor):

"""A noun phrase extractor that uses chunk parsing trained with the
ConLL-2000 training corpus.
"""
@@ -89,14 +90,15 @@ def _parse_sentence(self, sentence):


class FastNPExtractor(BaseNPExtractor):

"""A fast and simple noun phrase extractor.
Credit to Shlomi Babluk. Link to original blog post:
http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
"""

_trained: bool

CFG = {
("NNP", "NNP"): "NNP",
("NN", "NN"): "NNI",
@@ -137,11 +139,11 @@ def _tokenize_sentence(self, sentence):
tokens = nltk.word_tokenize(sentence)
return tokens

def extract(self, sentence):
def extract(self, text):
"""Return a list of noun phrases (strings) for body of text."""
if not self._trained:
self.train()
tokens = self._tokenize_sentence(sentence)
tokens = self._tokenize_sentence(text)
tagged = self.tagger.tag(tokens)
tags = _normalize_tags(tagged)
merge = True
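
Renaming the overriding parameters (sentence to tokens, sentence to text) keeps these signatures consistent with the methods they override, which type checkers verify. A tiny illustration of the idea (generic classes, not the nltk or textblob API):

class Extractor:
    def extract(self, text: str) -> list[str]:
        raise NotImplementedError


class WhitespaceExtractor(Extractor):
    # same parameter name and types as the base method, so an override check passes
    def extract(self, text: str) -> list[str]:
        return text.split()

print(WhitespaceExtractor().extract("a small example"))  # ['a', 'small', 'example']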
7 changes: 5 additions & 2 deletions src/textblob/formats.py
@@ -21,6 +21,8 @@ class PipeDelimitedFormat(formats.DelimitedFormat):
cl = NaiveBayesAnalyzer(fp, format="psv")
"""

from __future__ import annotations

import csv
import json
from collections import OrderedDict
@@ -48,7 +50,7 @@ def to_iterable(self):
raise NotImplementedError('Must implement a "to_iterable" method.')

@classmethod
def detect(cls, stream):
def detect(cls, stream: str):
"""Detect the file format given a filename.
Return True if a stream is this file format.
@@ -61,6 +63,7 @@ def detect(cls, stream):
class DelimitedFormat(BaseFormat):
"""A general character-delimited format."""

data: list[list[str]]
delimiter = ","

def __init__(self, fp, **kwargs):
@@ -121,7 +124,7 @@ def to_iterable(self):
return [(d["text"], d["label"]) for d in self.dict]

@classmethod
def detect(cls, stream):
def detect(cls, stream: str | bytes | bytearray):
"""Return True if stream is valid JSON."""
try:
json.loads(stream)
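
The broadened detect signature mirrors what json.loads itself accepts (str, bytes, or bytearray). A quick sketch of the detection idea (simplified, not the library's class):

from __future__ import annotations

import json

def looks_like_json(stream: str | bytes | bytearray) -> bool:
    try:
        json.loads(stream)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return False
    return True

print(looks_like_json('[{"text": "good", "label": "pos"}]'))  # True
print(looks_like_json("text|label"))                          # False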
10 changes: 8 additions & 2 deletions src/textblob/mixins.py
@@ -4,6 +4,9 @@
class ComparableMixin:
"""Implements rich operators for an object."""

def _cmpkey(self):
raise NotImplementedError("Class must implement _cmpkey method")

def _compare(self, other, method):
try:
return method(self._cmpkey(), other._cmpkey())
@@ -49,6 +52,9 @@ class StringlikeMixin:
of __str__ ensures consistent behavior between Python 2 and 3.
"""

def _strkey(self) -> str:
raise NotImplementedError("Class must implement _strkey method")

def __repr__(self):
"""Returns a string representation for debugging."""
class_name = self.__class__.__name__
@@ -94,7 +100,7 @@ def find(self, sub, start=0, end=sys.maxsize):

def rfind(self, sub, start=0, end=sys.maxsize):
"""Behaves like the built-in str.rfind() method. Returns an integer,
the index of he last (right-most) occurence of the substring argument
the index of the last (right-most) occurrence of the substring argument
sub in the sub-sequence given by [start:end].
"""
return self._strkey().rfind(sub, start, end)
@@ -161,7 +167,7 @@ def join(self, iterable):
return self.__class__(self._strkey().join(iterable))

def replace(self, old, new, count=sys.maxsize):
"""Return a new blob object with all the occurence of `old` replaced
"""Return a new blob object with all occurrences of `old` replaced
by `new`.
"""
return self.__class__(self._strkey().replace(old, new, count))
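
Declaring _cmpkey and _strkey on the mixins gives type checkers something to resolve while still forcing concrete classes to supply their own implementations. A compact sketch of the same mixin pattern (illustrative, not the library's code):

class StringlikeMixin:
    def _strkey(self) -> str:
        raise NotImplementedError("Class must implement _strkey method")

    def __len__(self) -> int:
        return len(self._strkey())  # resolvable for type checkers thanks to the stub above


class Blob(StringlikeMixin):
    def __init__(self, text: str) -> None:
        self._text = text

    def _strkey(self) -> str:
        return self._text

print(len(Blob("hello")))  # 5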
24 changes: 18 additions & 6 deletions src/textblob/utils.py
@@ -1,10 +1,16 @@
from __future__ import annotations

import re
import string
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from collections.abc import Iterable

PUNCTUATION_REGEX = re.compile(f"[{re.escape(string.punctuation)}]")


def strip_punc(s, all=False):
def strip_punc(s: str, all=False):
"""Removes punctuation from a string.
:param s: The string.
@@ -17,7 +23,7 @@ def strip_punc(s, all=False):
return s.strip().strip(string.punctuation)


def lowerstrip(s, all=False):
def lowerstrip(s: str, all=False):
"""Makes text all lowercase and strips punctuation and whitespace.
:param s: The string.
@@ -33,12 +39,14 @@ def tree2str(tree, concat=" "):
For example:
(NP a/DT beautiful/JJ new/JJ dashboard/NN) -> "a beautiful dashboard"
"""
return concat.join([word for (word, tag) in tree])
return concat.join([word for (word, _) in tree])


def filter_insignificant(chunk, tag_suffixes=("DT", "CC", "PRP$", "PRP")):
def filter_insignificant(
chunk, tag_suffixes: Iterable[str] = ("DT", "CC", "PRP$", "PRP")
):
"""Filter out insignificant (word, tag) tuples from a chunk of text."""
good = []
good: list[tuple[str, str]] = []
for word, tag in chunk:
ok = True
for suffix in tag_suffixes:
@@ -52,4 +60,8 @@ def filter_insignificant(chunk, tag_suffixes=("DT", "CC", "PRP$", "PRP")):

def is_filelike(obj):
"""Return whether ``obj`` is a file-like object."""
return hasattr(obj, "read")
if not hasattr(obj, "read"):
return False
if not callable(obj.read):
return False
return True
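
The expanded is_filelike check confirms that a read attribute is actually callable rather than merely present. A quick usage sketch of the same check in isolation:

import io

def is_filelike(obj) -> bool:
    if not hasattr(obj, "read"):
        return False
    return callable(obj.read)

print(is_filelike(io.StringIO("text,label")))  # True
print(is_filelike("just a string"))            # False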
