This release has a simpler file structure and is 100% Python 3. It includes legal-topic-specific improvements and adds a way to save the results of the Relevance Filter, so that stage of the program can be reused more efficiently.
1 parent 36cf2e9 · commit b1be6e2 · 2,986 changed files with 765,345 additions and 3,622,398 deletions.
@@ -0,0 +1,33 @@
import os

from nltk import FreqDist

import NPParser, Filter, Section, Settings


class Document:
    """A Document object contains: filename - the file location, and
    counts - a FreqDist with terms as keys and term counts as values."""

    def __init__(self, filename=None, overwrite=False):
        self.filename = filename
        # REMINDER: include metatag data (section, wordclass, etc.)
        #self.positions = {}
        # If a filename is given, read in the file
        if filename:
            if not os.path.isfile(filename):
                raise OSError(2, 'No such file', filename)
            # get terms from noun groups
            parser = NPParser.NPParser()
            # expand abbreviations, stem phrase endings, etc.
            #filters = ['abbreviation', 'isWord', 'case', 'stops', 'stem']
            filters = Settings.getDocumentFilters()
            self.counts = parser.getTerms(filename, filters, overwrite=overwrite)
            # get individual word counts -- for tokenized measures
            self.token_counts = FreqDist()
            for kw in self.counts:
                for w in kw.split():
                    self.token_counts[w] += 1
            # section data as raw text blocks
            #self.sections = Section.getSections(filename)
        else:
            self.counts = FreqDist()
            self.token_counts = FreqDist()
            #self.sections = []
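
A minimal usage sketch (the file path is made up; this assumes the class above lives in Document.py alongside the NPParser, Filter, Section, and Settings modules):

from Document import Document

# Parse a file into noun-phrase term counts (hypothetical path)
doc = Document('data/sample_patent.txt')

# counts is an nltk FreqDist, so Counter-style queries work
for term, n in doc.counts.most_common(5):
    print(term, n)

# token_counts tallies the individual words inside those terms
print(doc.token_counts.most_common(5))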
@@ -0,0 +1,187 @@
import re, pickle, logging, os

from nltk.stem import PorterStemmer as Stemmer  # NLTK's license, Apache
from nltk.corpus import stopwords  # not encumbered by license, see stopwords.readme()


abbreviations = {}
stops = []
stemmer = None
stemdict = {}    # stemming dictionary: phrase -> stem
unstemdict = {}  # reverse stemming dictionary: stem -> [original phrases]
dir_name = os.path.dirname(os.path.realpath(__file__)) + os.sep
logger = logging.getLogger()

def _get_abbreviations(filename='./jargon.out'):
    """Import abbreviations from the jargon file."""
    with open(filename) as f:
        for line in f:
            temp = line.split('|||')
            fullword = temp[0]
            shortwords = temp[1].split('||')
            for w in shortwords:
                abbreviations[w] = fullword

def _get_stops(filename=dir_name + 'patentstops.txt'):
    """Import stop words, either from a text file or from the stopwords corpus."""
    global stops
    if filename:
        with open(filename) as f:
            for line in f:
                stops += line.split()
    else:
        stops = stopwords.words('english')

def _get_stemdict(filename):
    """Load the pickled (stemdict, unstemdict) pair; pickle needs binary mode in Python 3."""
    logger.debug('Loading stemming dictionary...')
    global stemdict, unstemdict
    with open(filename, 'rb') as f:
        stemdict, unstemdict = pickle.load(f)

def _save_stemdict(filename):
    """Pickle the (stemdict, unstemdict) pair; pickle needs binary mode in Python 3."""
    logger.debug('Saving stemming dictionary...')
    with open(filename, 'wb') as f:
        pickle.dump((stemdict, unstemdict), f)

def _reGlue(words):
    """Helper function to turn a list of words into a string."""
    return " ".join(words).strip()

def expand(string):
    """Expand abbreviations within string."""
    if not abbreviations:
        _get_abbreviations()
    words = string.split()
    for i in range(len(words)):
        temp = abbreviations.get(words[i])
        if temp:
            words[i] = temp
    return _reGlue(words)

def removeStops(string):  # NOT NEEDED, AS NP EXTRACTION REMOVES THEM
    """Strip stop words off the beginning and end of a phrase."""
    global stops
    if not stops:
        _get_stops()
    # entire phrase is in stops
    if string in stops:
        return ""
    words = string.split()
    if not words:
        return ""
    # leading stops (loses the case of multiword stops)
    while words[0] in stops:
        words.pop(0)
        if not words:
            return ""
    # trailing stops (loses the case of multiword stops)
    while words[-1] in stops:
        words.pop()  # was pop(0), which dropped words from the wrong end
        if not words:
            return ""
    return _reGlue(words)

def bad_unicode(string):
    """Return True if the string contains any non-ASCII character."""
    for char in string:
        if ord(char) > 127:
            print(char)
            return True
    return False

def remove_non_unicode(string):
    """Replace non-ASCII characters with spaces and strip the result."""
    output = ''
    for char in string:
        if ord(char) > 127:
            output = output + ' '
        else:
            output = output + char
    return output.strip(' ')

def stem(string):
    """Stem a phrase, caching results in stemdict and unstemdict."""
    global stemmer
    if not stemmer:
        stemmer = Stemmer()
    if string not in stemdict:
        if bad_unicode(string):
            # added A. Meyers 8/28/15
            temp = stemmer.stem(remove_non_unicode(string))
        else:
            temp = stemmer.stem(string)
        if temp:
            stemdict[string] = temp
            # record the reverse mapping so stemming can be undone later
            if temp not in unstemdict:
                unstemdict[temp] = [string]
            elif string not in unstemdict[temp]:
                unstemdict[temp].append(string)
    else:
        temp = stemdict[string]
    return temp

def unstem(string):
    """Undo stemming of a phrase; return the list of known original phrases."""
    if string in unstemdict:
        return unstemdict[string]
    else:
        return [string]
def lowercase(string):
    """Return an all-lowercase representation of a string."""
    return string.lower()

def isWord(string):
    """Test the legitimacy of the proposed phrase. Taken from Shasha's implementation."""
    # reject phrases containing markup, punctuation, or a stray 'and'
    pattern = re.compile(r"""
        (
        <
        |%
        |/
        |\\
        |&\ lt
        |\)
        |\(
        |\.
        |\+
        |and\ 
        |\ and
        )
        """, re.I | re.VERBOSE | re.UNICODE)
    if len(string) < 2:
        return ''
    elif re.findall(pattern, string):
        return ''
    # must contain at least one letter
    for ch in string:
        if ch.isalpha():
            return string
    return ''

# available filters:
criteria = {'abbreviation': expand,
            'stops': removeStops,
            'stem': stem,
            'case': lowercase,
            'isWord': isWord}
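
For illustration, a minimal sketch of chaining these filters over a candidate phrase, assuming the module above is saved as Filter.py and that patentstops.txt sits next to it as the defaults expect ('abbreviation' is omitted because it needs jargon.out on disk; the phrase and filter order are made up):

import Filter

phrase = 'The Relevance Filters'
# Apply filters in sequence; isWord and removeStops signal
# rejection by returning the empty string.
for name in ['isWord', 'case', 'stops', 'stem']:
    phrase = Filter.criteria[name](phrase)
    if not phrase:
        break
print(phrase)                 # lowercased, edge stops stripped, stemmed
print(Filter.unstem(phrase))  # original phrases recorded for this stem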