From b7be90d9cf5655d1a46ac4f9313f11aaa2787d1b Mon Sep 17 00:00:00 2001 From: pgambhir888 Date: Sun, 4 Sep 2022 14:13:47 -0400 Subject: [PATCH] Update example.py --- example.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/example.py b/example.py index 41f292b..53e095f 100755 --- a/example.py +++ b/example.py @@ -1,11 +1,20 @@ import metapy def tokens_lowercase(doc): - #Write a token stream that tokenizes with ICUTokenizer (use the argument "suppress_tags=True"), + #Write a token stream that tokenizes with ICUTokenizer, #lowercases, removes words with less than 2 and more than 5 characters #performs stemming and creates trigrams (name the final call to ana.analyze as "trigrams") '''Place your code here''' + metapy.log_to_stderr() + tok = metapy.analyzers.ICUTokenizer(suppress_tags=True) + tok = metapy.analyzers.LowercaseFilter(tok) + tok = metapy.analyzers.LengthFilter(tok, min=2, max=5) + tok = metapy.analyzers.Porter2Filter(tok) + ana = metapy.analyzers.NGramWordAnalyzer(3, tok) + trigrams = ana.analyze(doc) + return trigrams + #leave the rest of the code as is tok.set_content(doc.content()) tokens, counts = [], []