-
Notifications
You must be signed in to change notification settings - Fork 1.2k
/
Copy pathnp_extractors.py
208 lines (173 loc) · 6.61 KB
/
np_extractors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# -*- coding: utf-8 -*-
'''Various noun phrase extractors.'''
from __future__ import unicode_literals, absolute_import
import nltk
from textblob.taggers import PatternTagger
from textblob.decorators import requires_nltk_corpus
from textblob.utils import tree2str, filter_insignificant
from textblob.base import BaseNPExtractor
class ChunkParser(nltk.ChunkParserI):
    '''An NP chunker backed by a bigram tagger (with unigram backoff)
    trained lazily, on first use, from the CoNLL-2000 corpus.
    '''

    def __init__(self):
        # Training is deferred until the first parse() call.
        self._trained = False

    @requires_nltk_corpus
    def train(self):
        '''Train the Chunker on the ConLL-2000 corpus.'''
        chunked = nltk.corpus.conll2000.chunked_sents('train.txt',
                                                      chunk_types=['NP'])
        # Reduce each chunked sentence to (POS tag, IOB chunk tag) pairs;
        # the words themselves are not used for training.
        training = [[(pos, chunk) for _, pos, chunk in
                     nltk.chunk.tree2conlltags(tree)]
                    for tree in chunked]
        unigram = nltk.UnigramTagger(training)
        self.tagger = nltk.BigramTagger(training, backoff=unigram)
        self._trained = True

    def parse(self, sentence):
        '''Return the parse tree for the sentence.'''
        if not self._trained:
            self.train()
        # Predict an IOB chunk tag from the sequence of POS tags alone.
        pos_sequence = [pos for _, pos in sentence]
        predictions = self.tagger.tag(pos_sequence)
        iob_triples = [(word, pos, chunk)
                       for (word, pos), (_, chunk) in
                       zip(sentence, predictions)]
        return nltk.chunk.util.conlltags2tree(iob_triples)
class ConllExtractor(BaseNPExtractor):
    '''A noun phrase extractor that uses chunk parsing trained with the
    ConLL-2000 training corpus.
    '''

    POS_TAGGER = PatternTagger()

    # The context-free grammar with which to filter the noun phrases
    CFG = {
        ('NNP', 'NNP'): 'NNP',
        ('NN', 'NN'): 'NNI',
        ('NNI', 'NN'): 'NNI',
        ('JJ', 'JJ'): 'JJ',
        ('JJ', 'NN'): 'NNI',
    }

    # POS suffixes that will be ignored
    INSIGNIFICANT_SUFFIXES = ['DT', 'CC', 'PRP$', 'PRP']

    def __init__(self, parser=None):
        self.parser = parser if parser else ChunkParser()

    def extract(self, text):
        '''Return a list of noun phrases (strings) for body of text.'''
        noun_phrases = []
        for sentence in nltk.tokenize.sent_tokenize(text):
            parsed = self._parse_sentence(sentence)
            # Keep only NP subtrees that still have content after dropping
            # insignificant tags and that match the CFG above.
            for subtree in parsed:
                if not (isinstance(subtree, nltk.tree.Tree)
                        and subtree.label() == 'NP'
                        and len(filter_insignificant(subtree)) >= 1
                        and _is_match(subtree, cfg=self.CFG)):
                    continue
                cleaned = _normalize_tags(
                    filter_insignificant(subtree, self.INSIGNIFICANT_SUFFIXES))
                noun_phrases.append(tree2str(cleaned))
        return noun_phrases

    def _parse_sentence(self, sentence):
        '''Tag and parse a sentence (a plain, untagged string).'''
        return self.parser.parse(self.POS_TAGGER.tag(sentence))
class FastNPExtractor(BaseNPExtractor):
    '''A fast and simple noun phrase extractor.

    Credit to Shlomi Babluk. Link to original blog post:
    http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
    '''

    # Merge rules: an adjacent (left tag, right tag) pair collapses into
    # the value tag, e.g. two NNs become a compound-noun NNI.
    CFG = {
        ('NNP', 'NNP'): 'NNP',
        ('NN', 'NN'): 'NNI',
        ('NNI', 'NN'): 'NNI',
        ('JJ', 'JJ'): 'JJ',
        ('JJ', 'NN'): 'NNI',
    }

    def __init__(self):
        # Training is deferred until the first extract() call.
        self._trained = False

    @requires_nltk_corpus
    def train(self):
        '''Train a bigram tagger on the Brown corpus (news category),
        backed off to a unigram tagger and finally to a regexp tagger for
        out-of-vocabulary words.
        '''
        train_data = nltk.corpus.brown.tagged_sents(categories='news')
        regexp_tagger = nltk.RegexpTagger([
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
            (r'(-|:|;)$', ':'),
            # BUG FIX: the previous pattern r'\'*$' matched *every* token
            # ("zero or more apostrophes at end of string" always succeeds),
            # so all unknown words were tagged 'MD' and every rule below was
            # unreachable. Anchor the rule to apostrophe-initial clitic
            # tokens such as "'ll" / "'d" (presumably the original intent).
            (r"^'[a-z]*$", 'MD'),
            (r'(The|the|A|a|An|an)$', 'AT'),
            (r'.*able$', 'JJ'),
            (r'^[A-Z].*$', 'NNP'),
            (r'.*ness$', 'NN'),
            (r'.*ly$', 'RB'),
            (r'.*s$', 'NNS'),
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            (r'.*', 'NN'),  # default: unknown words are nouns
        ])
        unigram_tagger = nltk.UnigramTagger(train_data, backoff=regexp_tagger)
        self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger)
        self._trained = True
        return None

    def _tokenize_sentence(self, sentence):
        '''Split the sentence into single words/tokens'''
        tokens = nltk.word_tokenize(sentence)
        return tokens

    def extract(self, sentence):
        '''Return a list of noun phrases (strings) for body of text.'''
        if not self._trained:
            self.train()
        tokens = self._tokenize_sentence(sentence)
        tagged = self.tagger.tag(tokens)
        tags = _normalize_tags(tagged)
        # Repeatedly merge adjacent token pairs per CFG until no rule fires.
        merge = True
        while merge:
            merge = False
            x = 0
            while x < len(tags) - 1:
                t1 = tags[x]
                t2 = tags[x + 1]
                key = t1[1], t2[1]
                value = self.CFG.get(key, '')
                if value:
                    merge = True
                    # Replace the pair with a single merged token.
                    tags.pop(x)
                    tags.pop(x)
                    match = '%s %s' % (t1[0], t2[0])
                    tags.insert(x, (match, value))
                x += 1
        # Only (possibly merged) proper nouns and compound nouns survive.
        return [t[0] for t in tags if t[1] in ['NNP', 'NNI']]
### Utility methods ###
def _normalize_tags(chunk):
'''Normalize the corpus tags.
("NN", "NN-PL", "NNS") -> "NN"
'''
ret = []
for word, tag in chunk:
if tag == 'NP-TL' or tag == 'NP':
ret.append((word, 'NNP'))
continue
if tag.endswith('-TL'):
ret.append((word, tag[:-3]))
continue
if tag.endswith('S'):
ret.append((word, tag[:-1]))
continue
ret.append((word, tag))
return ret
def _is_match(tagged_phrase, cfg):
'''Return whether or not a tagged phrases matches a context-free grammar.
'''
copy = list(tagged_phrase) # A copy of the list
merge = True
while merge:
merge = False
i = 0
while i < len(copy) - 1:
first, second = copy[i], copy[i + 1]
key = first[1], second[1] # Tuple of tags e.g. ('NN', 'JJ')
value = cfg.get(key, None)
if value:
merge = True
copy.pop(i)
copy.pop(i)
match = '{0} {1}'.format(first[0], second[0])
pos = value
copy.insert(i, (match, pos))
i = i + 1
match = any([t[1] in ('NNP', 'NNI') for t in copy])
return match