New Version:
This release has a simpler file structure and is 100% Python 3. It includes legal-topic-specific improvements. It also provides a way to save the results of the Relevance Filter so that this part of the program can be used more efficiently.
AdamMeyers authored and Adam Meyers committed Apr 11, 2017
1 parent 36cf2e9 commit b1be6e2
Showing 2,986 changed files with 765,345 additions and 3,622,398 deletions.
33 changes: 33 additions & 0 deletions Document.py
@@ -0,0 +1,33 @@
import os, pickle
from nltk import FreqDist
import NPParser, Filter, Section, Settings

class Document:
"""A Document object contains: filename - file location, and \
words - a dict with words as keys and word counts as values."""
    def __init__(self, filename=None, overwrite=False):
self.filename = filename
#REMINDER, INCLUDE METATAG DATA (SECTION, WORDCLASS, ETC.)
#self.positions = {}
# If filename given, input the file
if filename:
if not os.path.isfile(filename):
raise OSError(2, 'No such file', filename)
# get terms from noun groups
parser = NPParser.NPParser()
            # expand abbreviations, stem phrase endings, etc.
#filters = ['abbreviation', 'isWord', 'case', 'stops', 'stem']
filters = Settings.getDocumentFilters()
            self.counts = parser.getTerms(filename, filters, overwrite=overwrite)
# get individual word counts -- for tokenized measures
self.token_counts = FreqDist()
for kw in list(self.counts.keys()):
words = kw.split()
for w in words:
self.token_counts[w] += 1
            # on to section data as raw text blocks
#self.sections = Section.getSections(filename)
else:
self.counts = FreqDist()
self.token_counts = FreqDist()
#self.sections = []
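
For reference, a minimal usage sketch for the Document class above (illustrative only, not part of this commit): it assumes the repository's NPParser, Filter, Section, and Settings modules are importable, that getTerms returns a dict-like mapping of phrases to counts as the class docstring describes, and that 'sample_patent.txt' is a hypothetical plain-text input file.

import Document

# build a Document from a hypothetical plain-text file
doc = Document.Document('sample_patent.txt')

# phrase-level counts produced by the noun-group parser
# (counts behaves like a dict of phrase -> count, per the class docstring)
for phrase, count in doc.counts.items():
    print(phrase, count)

# token-level counts derived from those phrases (an nltk FreqDist)
print(doc.token_counts.most_common(10))
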
187 changes: 187 additions & 0 deletions Filter.py
@@ -0,0 +1,187 @@
import re, pickle, logging, os
from nltk.stem import PorterStemmer as Stemmer # NLTK's license, Apache
from nltk.corpus import stopwords # not encumbered by license, see stopwords.readme()

abbreviations={}
stops = []
stemmer = None
stemdict = {} # stemming dictionary
unstemdict = {} # reverse stemming dictionary
dir_name = os.path.dirname(os.path.realpath(__file__)) + os.sep
logger = logging.getLogger()

def _get_abbreviations(filename='./jargon.out'):
"""Import abbreviations from jargon file"""
f = open(filename)
for line in f:
temp = line.split('|||')
fullword = temp[0]
shortwords = temp[1].split('||')
for w in shortwords:
abbreviations[w] = fullword
f.close()
def _get_stops(filename=dir_name + 'patentstops.txt'):
"""Import stop words either from a text file or stopwords corpus"""
global stops
if filename:
f = open(filename)
for line in f:
stops += line.split()
f.close()
else:
stops = stopwords.words('english')
def _get_stemdict(filename):
    logger.debug('Loading stemming dictionary...')
    f = open(filename, 'rb')  # pickled data must be read in binary mode
    global stemdict
    global unstemdict
    stemdict, unstemdict = pickle.load(f)
    f.close()
def _save_stemdict(filename):
    logger.debug('Saving stemming dictionary...')
    f = open(filename, 'wb')  # pickled data must be written in binary mode
    global stemdict
    global unstemdict
    pickle.dump((stemdict, unstemdict), f)
    f.close()
def _reGlue(words):
"""Helper function to turn a list of words into a string"""
ret = ""
for i in range(len(words)):
ret += words[i] + " "
ret = ret.strip()
return ret
def expand(string):
"""Expand abbreviations within string"""
global abbreviations
if not abbreviations:
_get_abbreviations()
words = string.split()
for i in range(len(words)):
temp = abbreviations.get(words[i])
if temp:
words[i] = temp
string = _reGlue(words)
return string
def removeStops(string): #NOT NEEDED AS NP EXTRACTING REMOVES THEM
"""Strip stop words off the beginning and ending of a phrase"""
global stops
if not stops:
_get_stops()
# entire phrase in stops
if string in stops:
return ""
words = string.split()
if not words:
return ""
# beginning stops (loses case of multiword stops)
while words[0] in stops:
words.pop(0)
if not words:
return ""
# ending stops (loses case of multiword stops)
    while words[-1] in stops:
        words.pop()  # drop the trailing stop word
if not words:
return ""
string = _reGlue(words)
return string

def bad_unicode(string):
    """Return True if the string contains a non-ASCII character."""
    for char in string:
        if ord(char) > 127:
            print(char)
            return True
    return False

def remove_non_unicode(string):
    """Replace non-ASCII characters with spaces and strip the result."""
    output = ''
    for char in string:
        if ord(char) > 127:
            output = output + ' '
        else:
            output = output + char
    output = output.strip(' ')
    return output

def stem(string):
"""Stem a phrase"""
global stemmer
if not stemmer:
stemmer = Stemmer()
#words = string.split()
#for i in range(len(words)):
# words[i] = self.stemmer.stem(words[i])
# stemming last word only
#string = self._reGlue(words)
#
#string2 = stemmer.stem(string)
#if string2 not in stemdict:
# stemdict[string2] = string
# FIX ME
if string not in stemdict:
if bad_unicode(string):
## added A. Meyers 8/28/15
temp = stemmer.stem(remove_non_unicode(string))
else:
temp = stemmer.stem(string)
if temp:
stemdict[string] = temp
if not temp:
pass
elif temp not in unstemdict:
unstemdict[temp] = [string]
elif string not in unstemdict[temp]:
unstemdict[temp].append(string)
else:
temp = stemdict[string]
return temp

def unstem(string):
"""Undo stemming of a phrase"""
global stemdict
#if string in stemdict:
# return stemdict[string]
#else:
# return string
if string in unstemdict:
return unstemdict[string]
else:
return [string]
def lowercase(string):
"""Return an all lowercase representation of a string"""
return string.lower()
def isWord(string):
"""Test the legitimacy of the proposed phrase. Taken from Shasha's implementation"""
#criteria:
    # literal spaces are written as [ ] so they stay visible in VERBOSE mode
    pattern = re.compile(r"""
    (
    &lt
    |%
    |/
    |\\
    |&[ ]lt
    |\)
    |\(
    |\.
    |\+
    |and[ ]
    |[ ]and
    |[ ]and[ ]
    )
    """, re.I | re.VERBOSE | re.UNICODE)
if len(string) < 2:
return ''
elif re.findall(pattern, string):
return ''
#must contain at least one letter
for i in range(len(string)):
if string[i].isalpha():
return string
return ''

# available filters:
criteria={'abbreviation': expand,
'stops': removeStops,
'stem': stem,
'case': lowercase,
'isWord': isWord}
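
To illustrate how the criteria table above can drive filtering, a sketch (not part of this commit): the candidate phrase and the list of filter names are hypothetical, though the names mirror those referenced in Document.py, and patentstops.txt is assumed to sit alongside Filter.py as in this repository.

import Filter

phrase = 'The Selected Embodiments'  # hypothetical candidate phrase
for name in ['case', 'stops', 'isWord', 'stem']:
    phrase = Filter.criteria[name](phrase)
    if not phrase:
        break  # removeStops and isWord return '' for rejected phrases
print(phrase)
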