1) Fixed bug (found by N. Vintilla): replaced the string 'NOUN_OOV' with the list ['NOUN_OOV'].

2) Corrected function calls in filter_term_output.py for the filtering stage to better accommodate the websearch filter.
3) Split make_termolator_fact_txt_files.py into two files to make it easier to combine this code with other processes.
4) In filter_terms, made accommodations for missing abbreviation dictionaries.
5) Corrected the run_termolator.sh script to properly handle the new optimizations involving caching of background stats and caching of search results.
Adam Meyers committed Aug 15, 2017
1 parent 19f5829 commit 2ac3be8
Showing 6 changed files with 181 additions and 153 deletions.
10 changes: 6 additions & 4 deletions filter_term_output.py
@@ -14,18 +14,20 @@ def main(args):
    elif args[3].lower() in ['false','f']:
        use_web_score = False
    else:
-       print('You set the webscore flag to', args[2], 'but it must be either "True" or "False".')
+       print('You set the webscore flag to', args[3], 'but it must be either "True" or "False".')
        print('Use "True" if you want the system to use the webscore function and the system will run slowly and be more accurate.')
        print('Use "False" otherwise.')
    max_term_number = int(args[4])
-   if (len(args)>5) and (args[5].lower() != 'false'):
-       special_domains.extend(args[5].split('+'))
+   abbr_file_list = args[5]
+   if (len(args)>6) and (args[6].lower() != 'false'):
+       special_domains.extend(args[6].split('+'))
    initialize_utilities()
    input_file = file_prefix + ".all_terms"
    output_file = file_prefix + ".scored_output"
    abbr_full_file = file_prefix + ".dict_abbr_to_full"
    full_abbr_file = file_prefix + ".dict_full_to_abbr"
    reject_file = file_prefix + ".rejected-terms"
-   filter_terms(input_file,output_file,abbr_full_file,full_abbr_file,use_web_score,numeric_cutoff=max_term_number,reject_file=reject_file,web_score_dict_file=web_score_dict_file)
+   print('***',abbr_file_list)
+   filter_terms(input_file,output_file,abbr_full_file,full_abbr_file,use_web_score=use_web_score,numeric_cutoff=max_term_number,reject_file=reject_file,web_score_dict_file=web_score_dict_file,abbr_files=abbr_file_list)

if __name__ == '__main__': sys.exit(main(sys.argv))
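For reference, a minimal sketch of the argument layout the revised main() now expects, pieced together from this hunk and the filter_term_output.py calls in run_termolator.sh below; the sample values and the positions of args[1]/args[2] are assumptions, since those lines are not shown in the hunk:

#!/usr/bin/env python3
# Hypothetical invocation:
#   filter_term_output.py <prefix> <web_score_file> <webscore_flag> <max_terms> <abbr_file_list> [special_domains]
import sys

args = ['filter_term_output.py', 'my_run', 'my_run.webscore',
        'False', '5000', 'my_run.internal_abbr_list', 'legal+medical']
file_prefix = args[1]               # assumed position, per the shell script's "$4"
web_score_dict_file = args[2]       # assumed position, per "$4.outputweb.score"
use_web_score = args[3].lower() in ['true', 't']    # args[3], the corrected index
max_term_number = int(args[4])
abbr_file_list = args[5]                            # new in this commit
special_domains = []
if (len(args) > 6) and (args[6].lower() != 'false'):
    special_domains.extend(args[6].split('+'))      # shifted from args[5] to args[6]
print(file_prefix, use_web_score, max_term_number, abbr_file_list, special_domains)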
12 changes: 10 additions & 2 deletions filter_terms.py
@@ -898,6 +898,7 @@ def filter_terms (infile, \
                  outfile,\
                  abbr_full_file,\
                  full_abbr_file, \
+                 abbr_files = False, \
                  use_web_score = True, \
                  ranking_pref_cutoff = .001, \
                  percent_cutoff=.3, \
@@ -908,7 +909,11 @@
                  ):
    ## it is possible that some people may want to allow NPs as well as noun groups as terms
    if abbr_full_file and full_abbr_file:
-       read_in_abbrev_dicts_from_files(abbr_full_file,full_abbr_file)
+       if os.path.isfile(abbr_full_file) and os.path.isfile(full_abbr_file):
+           read_in_abbrev_dicts_from_files(abbr_full_file,full_abbr_file)
+       elif abbr_files:
+           make_abbr_dicts_from_abbr(abbr_files,full_abbr_file,abbr_full_file)
+           ## this creates abbr dicts and loads them
    if use_web_score and web_score_dict_file:
        load_web_score_dict_file(web_score_dict_file)
        use_web_score_dict = True
@@ -956,7 +961,10 @@
    output.sort()
    output.reverse()
    confidence_position = min(round(len(output)*percent_cutoff),numeric_cutoff)
-   confidence_cutoff = output[confidence_position][0]
+   if len(output)>0:
+       confidence_cutoff = output[confidence_position][0]
+   else:
+       confidence_cutoff = 0
    no_more_good_ones = False
    for out in output:
        confidence,term,keep,classification,rating,well_formedness_score,rank_score = out
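The third hunk guards the cutoff computation against an empty candidate list; a minimal standalone reproduction of the failure it prevents (illustrative values, not Termolator code):

# With no surviving candidate terms, the old line raised IndexError.
output = []                      # no (confidence, term, ...) tuples survived filtering
percent_cutoff, numeric_cutoff = .3, 5000
confidence_position = min(round(len(output) * percent_cutoff), numeric_cutoff)  # 0
# Old behavior: output[confidence_position][0] -> IndexError: list index out of range
confidence_cutoff = output[confidence_position][0] if len(output) > 0 else 0
print(confidence_cutoff)         # 0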
4 changes: 2 additions & 2 deletions inline_terms.py
@@ -637,12 +637,12 @@ def get_topic_terms(text,offset,filter_off=False):
        if result[4]:
            ARG2_start = paren_pat.start(2)+offset
            ARG2_end = ARG2_start+len(abbreviation)-1
-           if filter_off or (topic_term_ok_boolean([result[2]],'NOUN_OOV',result[2]) and topic_term_ok_boolean([abbreviation[1:]],'NOUN_OOV',abbreviation[1:])):
+           if filter_off or (topic_term_ok_boolean([result[2]],['NOUN_OOV'],result[2]) and topic_term_ok_boolean([abbreviation[1:]],'NOUN_OOV',abbreviation[1:])):
                topic_terms.extend([[ARG1_start,ARG1_end,result[2]],[ARG2_start,ARG2_end,abbreviation[1:]]])
        else:
            ARG2_start = paren_pat.start(2)+offset
            ARG2_end = ARG2_start+len(abbreviation)
-           if filter_off or (topic_term_ok_boolean([result[2]],'NOUN_OOV',result[2]) and topic_term_ok_boolean([abbreviation],'NOUN_OOV',abbreviation)):
+           if filter_off or (topic_term_ok_boolean([result[2]],['NOUN_OOV'],result[2]) and topic_term_ok_boolean([abbreviation],'NOUN_OOV',abbreviation)):
                topic_terms.extend([[ARG1_start,ARG1_end,result[2]],[ARG2_start,ARG2_end,abbreviation]])
        pieces.append([start,text[start:paren_pat.start()]])
        if txt_markup_match and (txt_markup_match.start()>start) and (txt_markup_match.end()<paren_pat.end()):
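Why the list matters, assuming topic_term_ok_boolean treats its second argument as a collection of labels (its body is not shown in this diff): a bare string misbehaves under membership tests and iteration. Note also that the second topic_term_ok_boolean call on each changed line still passes the bare string 'NOUN_OOV'.

# Membership against a string is substring matching; against a list it compares whole labels.
labels_as_string = 'NOUN_OOV'
labels_as_list = ['NOUN_OOV']

print('OOV' in labels_as_string)           # True  -- accidental substring hit
print('OOV' in labels_as_list)             # False -- exact label match required

# Iterating a string yields characters, not labels.
print([x for x in labels_as_string][:4])   # ['N', 'O', 'U', 'N']
print([x for x in labels_as_list])         # ['NOUN_OOV']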
142 changes: 1 addition & 141 deletions make_termolator_fact_txt_files.py
@@ -1,146 +1,6 @@
#!/usr/bin/env python3

-import os
import sys
-from term_utilities import *
-initialize_utilities()
-
-def modify_paragraph_delimiters(paragraph_starts,paragraph_ends,paragraph_non_starts,paragraph_non_ends):
-    matched_outstarts = []
-    matched_outends = []
-    next_start = 'Empty'
-    while (len(paragraph_starts)>0):
-        next_start = paragraph_starts.pop(0)
-        matched_outstarts.append(next_start)
-        if (len(paragraph_ends)>0):
-            if (len(paragraph_starts)>0) and (paragraph_starts[0] < paragraph_ends[0]):
-                matched_outends.append(paragraph_starts[0])
-            else:
-                matched_outends.append(paragraph_ends.pop(0))
-        elif len(paragraph_starts)>0:
-            matched_outends.append(paragraph_starts[0])
-    paragraph_starts = matched_outstarts
-    paragraph_ends = matched_outends
-    out_starts = []
-    out_ends = []
-    current_start,current_end,current_non_start,current_non_end = 'Empty','Empty','Empty','Empty'
-    ## First step, use paragraph starts and ends to block unprintable sections (as per paragraph_non_starts,paragraph_non_ends)
-    ## 'Empty' means no value -- cannot use 0 because 0 is a possible file position; cannot use False, because False == 0 in Python
-    if ((paragraph_non_ends)==0) or (len(paragraph_non_starts)==0):
-        out_starts = paragraph_starts
-        out_ends = paragraph_ends
-    else:
-        while (len(paragraph_ends)>0) or (current_end !='Empty'):
-            if (current_start=='Empty') and (len(paragraph_starts)>0):
-                current_start = paragraph_starts.pop(0)
-            if (current_end=='Empty'):
-                current_end = paragraph_ends.pop(0)
-            if (current_non_start=='Empty') and (len(paragraph_non_starts)>0):
-                current_non_start = paragraph_non_starts.pop(0)
-            if (current_non_end=='Empty') and (len(paragraph_non_ends)>0):
-                current_non_end = paragraph_non_ends.pop(0)
-            if (current_non_start != 'Empty') and (current_non_start <= current_start) and (current_non_end >= current_end):
-                current_non_start = current_end
-                current_start = 'Empty'
-                current_end = 'Empty'
-            if (current_non_end != 'Empty') and (current_start != 'Empty') and (current_non_end <= current_start):
-                current_non_start = 'Empty'
-                current_non_end = 'Empty'
-            if (current_non_start != 'Empty') and (current_start != 'Empty') and (current_non_end != 'Empty') and (current_non_start <= current_start) \
-               and (current_non_end >= current_start) and (current_non_end <= current_end):
-                current_non_start = current_start
-            if (current_start == 'Empty'):
-                pass
-            elif (current_non_start == 'Empty') or ((current_end != 'Empty') and (current_non_start >= current_end)) or \
-                 ((current_non_end != 'Empty') and (current_non_end <= current_start)):
-                if current_start !='Empty':
-                    out_starts.append(current_start)
-                    out_ends.append(current_end)
-                    current_start = 'Empty'
-                    current_end = 'Empty'
-            elif (current_non_start != 'Empty') and (current_non_end != 'Empty') and (current_non_start <= current_start) and (current_end != 'Empty') and (current_non_end >= current_end):
-                current_start = 'Empty'
-                current_end = 'Empty'
-            elif (current_end != 'Empty') and (current_non_start <= current_end) and (current_non_start>=current_start):
-                out_starts.append(current_start)
-                out_ends.append(current_non_start)
-                last_start = current_start
-                current_start = 'Empty'
-                current_non_start = 'Empty'
-                if (current_non_end != 'Empty') and (current_non_end <= current_end):
-                    if current_non_end>=last_start:
-                        current_start = current_non_end
-                    current_non_end = 'Empty'
-                elif (len(paragraph_starts)>0) and (current_non_end != 'Empty') and (paragraph_starts[0]<=current_non_end):
-                    current_end = 'Empty'
-                    num = 0
-                    while (num < len(paragraph_starts)) and (paragraph_starts[num]<=current_non_end):
-                        paragraph_starts[num] = current_non_end
-                        num = 1 + num
-                    num = 0
-                    while (num < len(paragraph_ends)) and (paragraph_ends[num]<=current_non_end):
-                        paragraph_ends[num] = current_non_end
-                        num = 1 + num
-                    current_end = 'Empty'
-                    current_non_end = 'Empty'
-                else:
-                    current_end = 'Empty'
-                    current_non_end = 'Empty'
-            elif (current_non_end >= current_start) and (current_end != 'Empty') and (current_non_end<=current_end):
-                current_start = current_non_end
-                current_non_end = 'Empty'
-                current_non_start = 'Empty'
-    return(out_starts,out_ends)
-
-def create_termolotator_fact_txt_files(input_file,txt2_file,txt3_file,fact_file):
-    global paragraph_starts
-    global paragraph_ends
-    paragraph_starts = [0]
-    paragraph_ends = []
-    nonprint_starts = []
-    nonprint_ends = []
-    bad_chars = []
-    inlinelist = get_my_string_list(input_file)
-    with open(txt2_file,'w') as txt2_stream,open(txt3_file,'w') as txt3_stream:
-        start = 0
-        length = 0
-        for line in merge_multiline_and_fix_xml(inlinelist):
-            string2,starts1,ends1,nonprint_starts1,nonprint_ends1 = remove_xml_spit_out_paragraph_start_end(line,start)
-            string3, bad1 = replace_less_than_with_positions(string2,start)
-            if (len(paragraph_ends) == 0) and (len(starts1)>0) and (len(paragraph_ends) == 0):
-                hypothetical_end = (starts1[0]-1)
-                if not hypothetical_end in ends1:
-                    ends1.append(hypothetical_end)
-                    ends1.sort()
-                    ## balances the addition of 0 as a start
-            length = length+len(string2)
-            start = start+len(string2)
-            txt2_stream.write(string2)
-            txt3_stream.write(string3)
-            paragraph_starts.extend(starts1)
-            paragraph_ends.extend(ends1)
-            nonprint_starts.extend(nonprint_starts1)
-            nonprint_ends.extend(nonprint_ends1)
-            bad_chars.extend(bad1)
-    if len(paragraph_ends)>0:
-        paragraph_starts.append(1 + paragraph_ends[-1])
-        paragraph_ends.append(length)
-    paragraph_starts,paragraph_ends=modify_paragraph_delimiters(paragraph_starts,paragraph_ends,nonprint_starts,nonprint_ends)
-    with open(fact_file,'w') as factstream:
-        if len(paragraph_starts) == len(paragraph_ends):
-            for item_num in range(len(paragraph_starts)):
-                factstream.write('STRUCTURE TYPE="TEXT" START='+str(paragraph_starts[item_num])+' END='+str(paragraph_ends[item_num])+os.linesep)
-        elif (len(paragraph_starts)>1) and len(paragraph_ends) == 1:
-            last_start = 0
-            for start in paragraph_starts:
-                if start != 0:
-                    factstream.write('STRUCTURE TYPE="TEXT" START='+str(last_start)+' END='+str(start)+os.linesep)
-                last_start = start
-            factstream.write('STRUCTURE TYPE="TEXT" START='+str(last_start)+' END='+str(paragraph_ends[0])+os.linesep)
-        else:
-            factstream.write('STRUCTURE TYPE="TEXT" START=0 END='+str(paragraph_ends[0])+os.linesep)
-        for bad_char in bad_chars:
-            factstream.write('BAD_CHARACTER START='+str(bad_char[0])+' END='+str(bad_char[1])+' STRING="<"'+os.linesep)
+from termolator_fact_txt import *

def main(args):
    ## infile is the output file from distributional term extraction
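The moved code's main product is the .fact file; here is a self-contained sketch of the record format it writes, using hypothetical offsets (the write calls mirror the deleted lines above):

import os

paragraph_starts = [0, 120]      # hypothetical paragraph spans
paragraph_ends = [118, 340]
bad_chars = [(57, 58)]           # hypothetical escaped-'<' positions

with open('example.fact', 'w') as factstream:
    for start, end in zip(paragraph_starts, paragraph_ends):
        factstream.write('STRUCTURE TYPE="TEXT" START=' + str(start) + ' END=' + str(end) + os.linesep)
    for bad_char in bad_chars:
        factstream.write('BAD_CHARACTER START=' + str(bad_char[0]) + ' END=' + str(bad_char[1]) + ' STRING="<"' + os.linesep)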
9 changes: 5 additions & 4 deletions run_termolator.sh
@@ -35,7 +35,7 @@ echo "max number of terms? $7"
echo "keep terms? $8"
echo "termolator path $9"
echo "dedicated webscore file ${12}"
-echo "use previous saved pickel file for terms ${13}"
+echo "use previous saved pickle file for terms ${13}"

## Step 1: Finding inline terms for foreground files
echo "Running Step 1: finding inline terms for foreground files"
@@ -50,6 +50,7 @@ if [ "${11}" = "False" ]; then
    $TERMOLATOR/make_io_file.py $1 $4.internal_pos_terms_abbr_list .pos .terms .abbr
fi

+$TERMOLATOR/make_io_file.py $1 $4.internal_abbr_list .abbr
$TERMOLATOR/make_io_file.py $1 $4.internal_foreground_tchunk_list .tchunk
$TERMOLATOR/make_io_file.py $2 $4.internal_background_tchunk_list .tchunk

@@ -107,10 +108,10 @@ fi

if [ "${12}" = "False" ]; then
    echo "calling filter_term_output.py with filter_term_output.py $4 $4.outputweb.score $6 $7 ${10}"
-   $TERMOLATOR/filter_term_output.py $4 $4.outputweb.score $6 $7 ${10}
+   $TERMOLATOR/filter_term_output.py $4 $4.outputweb.score $6 $7 $4.internal_abbr_list ${10}
else
    echo "calling filter_term_output.py with filter_term_output.py $4 ${12} $6 $7 ${10}"
-   $TERMOLATOR/filter_term_output.py $4 ${12} $6 $7 ${10}
+   $TERMOLATOR/filter_term_output.py $4 ${12} $6 $7 $4.internal_abbr_list ${10}
fi

echo "Final terms can be found in $4.out_term_list from the scored file in $4.scored_output"
@@ -119,5 +120,5 @@ head -$8 $4.scored_output | cut -f 1 > $4.out_term_list
echo "Cleaning up files"
rm -f $4.internal_prefix_list $4.internal_pos_list $4.internal_txt_fact_list $4.internal_fact_pos_list
rm -f $4.internal_txt_fact_pos_list $4.internal_pos_terms_abbr_list $4.internal_foreground_tchunk_list
-rm -f $4.internal_background_tchunk_list
+rm -f $4.internal_background_tchunk_list $4.internal_abbr_list
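The new $4.internal_abbr_list behaves like the other .internal_* manifests. Assuming make_io_file.py maps each foreground document to a derived output path, one per line (an inference from its other calls above, not confirmed by this diff), its contents could be generated like this:

import glob, os

foreground_dir = 'foreground_docs'                   # stands in for $1
with open('my_run.internal_abbr_list', 'w') as out:  # stands in for $4.internal_abbr_list
    for doc in sorted(glob.glob(os.path.join(foreground_dir, '*'))):
        out.write(os.path.splitext(doc)[0] + '.abbr' + os.linesep)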

termolator_fact_txt.py (new file; diff not loaded on this page — by the totals above, it accounts for the remaining 157 additions, i.e. the code moved out of make_termolator_fact_txt_files.py)
