1) Fixed bug (found by N. Vintilla): replaced the string 'NOUN_OOV' with the list ['NOUN_OOV'].

2) Corrected function calls in filter_term_output.py for the filtering stage to better accommodate the websearch filter.
3) Split make_termolator_fact_txt_files.py into two files to make it easier to combine this code with other processes.
4) In filter_terms, made accommodations for missing abbreviation dictionaries.
5) Corrected the run_termolator.sh script to properly handle the new optimizations involving caching of background stats and caching of search results.
Adam Meyers committed Aug 15, 2017
1 parent 19f5829 commit 2ac3be8
Showing 6 changed files with 181 additions and 153 deletions.
10 changes: 6 additions & 4 deletions filter_term_output.py
@@ -14,18 +14,20 @@ def main(args):
    elif args[3].lower() in ['false','f']:
        use_web_score = False
    else:
-       print('You set the webscore flag to', args[2], 'but it must be either "True" or "False".')
+       print('You set the webscore flag to', args[3], 'but it must be either "True" or "False".')
        print('Use "True" if you want the system to use the webscore function and the system will run slowly and be more accurate.')
        print('Use "False" otherwise.')
    max_term_number = int(args[4])
-   if (len(args)>5) and (args[5].lower() != 'false'):
-       special_domains.extend(args[5].split('+'))
+   abbr_file_list = args[5]
+   if (len(args)>6) and (args[6].lower() != 'false'):
+       special_domains.extend(args[6].split('+'))
    initialize_utilities()
    input_file = file_prefix + ".all_terms"
    output_file = file_prefix + ".scored_output"
    abbr_full_file = file_prefix + ".dict_abbr_to_full"
    full_abbr_file = file_prefix + ".dict_full_to_abbr"
    reject_file = file_prefix + ".rejected-terms"
-   filter_terms(input_file,output_file,abbr_full_file,full_abbr_file,use_web_score,numeric_cutoff=max_term_number,reject_file=reject_file,web_score_dict_file=web_score_dict_file)
+   print('***',abbr_file_list)
+   filter_terms(input_file,output_file,abbr_full_file,full_abbr_file,use_web_score=use_web_score,numeric_cutoff=max_term_number,reject_file=reject_file,web_score_dict_file=web_score_dict_file,abbr_files=abbr_file_list)

if __name__ == '__main__': sys.exit(main(sys.argv))
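For reference, a minimal sketch of the argument layout the revised main() now expects, pieced together from this hunk and the filter_term_output.py calls in run_termolator.sh below; the sample values and the positions of args[1]/args[2] are assumptions, since those lines are not shown in the hunk:

#!/usr/bin/env python3
# Hypothetical invocation:
#   filter_term_output.py <prefix> <web_score_file> <webscore_flag> <max_terms> <abbr_file_list> [special_domains]
import sys

args = ['filter_term_output.py', 'my_run', 'my_run.webscore',
        'False', '5000', 'my_run.internal_abbr_list', 'legal+medical']
file_prefix = args[1]               # assumed position, per the shell script's "$4"
web_score_dict_file = args[2]       # assumed position, per "$4.outputweb.score"
use_web_score = args[3].lower() in ['true', 't']    # args[3], the corrected index
max_term_number = int(args[4])
abbr_file_list = args[5]                            # new in this commit
special_domains = []
if (len(args) > 6) and (args[6].lower() != 'false'):
    special_domains.extend(args[6].split('+'))      # shifted from args[5] to args[6]
print(file_prefix, use_web_score, max_term_number, abbr_file_list, special_domains)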
12 changes: 10 additions & 2 deletions filter_terms.py
@@ -898,6 +898,7 @@ def filter_terms (infile, \
                  outfile,\
                  abbr_full_file,\
                  full_abbr_file, \
+                 abbr_files = False, \
                  use_web_score = True, \
                  ranking_pref_cutoff = .001, \
                  percent_cutoff=.3, \
@@ -908,7 +909,11 @@
                  ):
    ## it is possible that some people may want to allow NPs as well as noun groups as terms
    if abbr_full_file and full_abbr_file:
-       read_in_abbrev_dicts_from_files(abbr_full_file,full_abbr_file)
+       if os.path.isfile(abbr_full_file) and os.path.isfile(full_abbr_file):
+           read_in_abbrev_dicts_from_files(abbr_full_file,full_abbr_file)
+       elif abbr_files:
+           make_abbr_dicts_from_abbr(abbr_files,full_abbr_file,abbr_full_file)
+           ## this creates abbr dicts and loads them
    if use_web_score and web_score_dict_file:
        load_web_score_dict_file(web_score_dict_file)
        use_web_score_dict = True
@@ -956,7 +961,10 @@
    output.sort()
    output.reverse()
    confidence_position = min(round(len(output)*percent_cutoff),numeric_cutoff)
-   confidence_cutoff = output[confidence_position][0]
+   if len(output)>0:
+       confidence_cutoff = output[confidence_position][0]
+   else:
+       confidence_cutoff = 0
    no_more_good_ones = False
    for out in output:
        confidence,term,keep,classification,rating,well_formedness_score,rank_score = out
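The third hunk guards the cutoff computation against an empty candidate list; a minimal standalone reproduction of the failure it prevents (illustrative values, not Termolator code):

# With no surviving candidate terms, the old line raised IndexError.
output = []                      # no (confidence, term, ...) tuples survived filtering
percent_cutoff, numeric_cutoff = .3, 5000
confidence_position = min(round(len(output) * percent_cutoff), numeric_cutoff)  # 0
# Old behavior: output[confidence_position][0] -> IndexError: list index out of range
confidence_cutoff = output[confidence_position][0] if len(output) > 0 else 0
print(confidence_cutoff)         # 0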
4 changes: 2 additions & 2 deletions inline_terms.py
@@ -637,12 +637,12 @@ def get_topic_terms(text,offset,filter_off=False):
        if result[4]:
            ARG2_start = paren_pat.start(2)+offset
            ARG2_end = ARG2_start+len(abbreviation)-1
-           if filter_off or (topic_term_ok_boolean([result[2]],'NOUN_OOV',result[2]) and topic_term_ok_boolean([abbreviation[1:]],'NOUN_OOV',abbreviation[1:])):
+           if filter_off or (topic_term_ok_boolean([result[2]],['NOUN_OOV'],result[2]) and topic_term_ok_boolean([abbreviation[1:]],'NOUN_OOV',abbreviation[1:])):
                topic_terms.extend([[ARG1_start,ARG1_end,result[2]],[ARG2_start,ARG2_end,abbreviation[1:]]])
        else:
            ARG2_start = paren_pat.start(2)+offset
            ARG2_end = ARG2_start+len(abbreviation)
-           if filter_off or (topic_term_ok_boolean([result[2]],'NOUN_OOV',result[2]) and topic_term_ok_boolean([abbreviation],'NOUN_OOV',abbreviation)):
+           if filter_off or (topic_term_ok_boolean([result[2]],['NOUN_OOV'],result[2]) and topic_term_ok_boolean([abbreviation],'NOUN_OOV',abbreviation)):
                topic_terms.extend([[ARG1_start,ARG1_end,result[2]],[ARG2_start,ARG2_end,abbreviation]])
        pieces.append([start,text[start:paren_pat.start()]])
        if txt_markup_match and (txt_markup_match.start()>start) and (txt_markup_match.end()<paren_pat.end()):
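Why the list matters, assuming topic_term_ok_boolean treats its second argument as a collection of labels (its body is not shown in this diff): a bare string misbehaves under membership tests and iteration. Note also that the second topic_term_ok_boolean call on each changed line still passes the bare string 'NOUN_OOV'.

# Membership against a string is substring matching; against a list it compares whole labels.
labels_as_string = 'NOUN_OOV'
labels_as_list = ['NOUN_OOV']

print('OOV' in labels_as_string)           # True  -- accidental substring hit
print('OOV' in labels_as_list)             # False -- exact label match required

# Iterating a string yields characters, not labels.
print([x for x in labels_as_string][:4])   # ['N', 'O', 'U', 'N']
print([x for x in labels_as_list])         # ['NOUN_OOV']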
142 changes: 1 addition & 141 deletions make_termolator_fact_txt_files.py
@@ -1,146 +1,6 @@
#!/usr/bin/env python3

-import os
import sys
-from term_utilities import *
-initialize_utilities()
-
-def modify_paragraph_delimiters(paragraph_starts,paragraph_ends,paragraph_non_starts,paragraph_non_ends):
-    matched_outstarts = []
-    matched_outends = []
-    next_start = 'Empty'
-    while (len(paragraph_starts)>0):
-        next_start = paragraph_starts.pop(0)
-        matched_outstarts.append(next_start)
-        if (len(paragraph_ends)>0):
-            if (len(paragraph_starts)>0) and (paragraph_starts[0] < paragraph_ends[0]):
-                matched_outends.append(paragraph_starts[0])
-            else:
-                matched_outends.append(paragraph_ends.pop(0))
-        elif len(paragraph_starts)>0:
-            matched_outends.append(paragraph_starts[0])
-    paragraph_starts = matched_outstarts
-    paragraph_ends = matched_outends
-    out_starts = []
-    out_ends = []
-    current_start,current_end,current_non_start,current_non_end = 'Empty','Empty','Empty','Empty'
-    ## First step, use paragraph starts and ends to block unprintable sections (as per paragraph_non_starts,paragraph_non_ends)
-    ## 'Empty' means no value -- cannot use 0 because 0 is a possible file position; cannot use False, because False == 0 in Python
-    if ((paragraph_non_ends)==0) or (len(paragraph_non_starts)==0):
-        out_starts = paragraph_starts
-        out_ends = paragraph_ends
-    else:
-        while (len(paragraph_ends)>0) or (current_end !='Empty'):
-            if (current_start=='Empty') and (len(paragraph_starts)>0):
-                current_start = paragraph_starts.pop(0)
-            if (current_end=='Empty'):
-                current_end = paragraph_ends.pop(0)
-            if (current_non_start=='Empty') and (len(paragraph_non_starts)>0):
-                current_non_start = paragraph_non_starts.pop(0)
-            if (current_non_end=='Empty') and (len(paragraph_non_ends)>0):
-                current_non_end = paragraph_non_ends.pop(0)
-            if (current_non_start != 'Empty') and (current_non_start <= current_start) and (current_non_end >= current_end):
-                current_non_start = current_end
-                current_start = 'Empty'
-                current_end = 'Empty'
-            if (current_non_end != 'Empty') and (current_start != 'Empty') and (current_non_end <= current_start):
-                current_non_start = 'Empty'
-                current_non_end = 'Empty'
-            if (current_non_start != 'Empty') and (current_start != 'Empty') and (current_non_end != 'Empty') and (current_non_start <= current_start) \
-               and (current_non_end >= current_start) and (current_non_end <= current_end):
-                current_non_start = current_start
-            if (current_start == 'Empty'):
-                pass
-            elif (current_non_start == 'Empty') or ((current_end != 'Empty') and (current_non_start >= current_end)) or \
-                 ((current_non_end != 'Empty') and (current_non_end <= current_start)):
-                if current_start !='Empty':
-                    out_starts.append(current_start)
-                    out_ends.append(current_end)
-                    current_start = 'Empty'
-                    current_end = 'Empty'
-            elif (current_non_start != 'Empty') and (current_non_end != 'Empty') and (current_non_start <= current_start) and (current_end != 'Empty') and (current_non_end >= current_end):
-                current_start = 'Empty'
-                current_end = 'Empty'
-            elif (current_end != 'Empty') and (current_non_start <= current_end) and (current_non_start>=current_start):
-                out_starts.append(current_start)
-                out_ends.append(current_non_start)
-                last_start = current_start
-                current_start = 'Empty'
-                current_non_start = 'Empty'
-                if (current_non_end != 'Empty') and (current_non_end <= current_end):
-                    if current_non_end>=last_start:
-                        current_start = current_non_end
-                    current_non_end = 'Empty'
-                elif (len(paragraph_starts)>0) and (current_non_end != 'Empty') and (paragraph_starts[0]<=current_non_end):
-                    current_end = 'Empty'
-                    num = 0
-                    while (num < len(paragraph_starts)) and (paragraph_starts[num]<=current_non_end):
-                        paragraph_starts[num] = current_non_end
-                        num = 1 + num
-                    num = 0
-                    while (num < len(paragraph_ends)) and (paragraph_ends[num]<=current_non_end):
-                        paragraph_ends[num] = current_non_end
-                        num = 1 + num
-                    current_end = 'Empty'
-                    current_non_end = 'Empty'
-                else:
-                    current_end = 'Empty'
-                    current_non_end = 'Empty'
-            elif (current_non_end >= current_start) and (current_end != 'Empty') and (current_non_end<=current_end):
-                current_start = current_non_end
-                current_non_end = 'Empty'
-                current_non_start = 'Empty'
-    return(out_starts,out_ends)
-
-def create_termolotator_fact_txt_files(input_file,txt2_file,txt3_file,fact_file):
-    global paragraph_starts
-    global paragraph_ends
-    paragraph_starts = [0]
-    paragraph_ends = []
-    nonprint_starts = []
-    nonprint_ends = []
-    bad_chars = []
-    inlinelist = get_my_string_list(input_file)
-    with open(txt2_file,'w') as txt2_stream,open(txt3_file,'w') as txt3_stream:
-        start = 0
-        length = 0
-        for line in merge_multiline_and_fix_xml(inlinelist):
-            string2,starts1,ends1,nonprint_starts1,nonprint_ends1 = remove_xml_spit_out_paragraph_start_end(line,start)
-            string3, bad1 = replace_less_than_with_positions(string2,start)
-            if (len(paragraph_ends) == 0) and (len(starts1)>0) and (len(paragraph_ends) == 0):
-                hypothetical_end = (starts1[0]-1)
-                if not hypothetical_end in ends1:
-                    ends1.append(hypothetical_end)
-                    ends1.sort()
-                    ## balances the addition of 0 as a start
-            length = length+len(string2)
-            start = start+len(string2)
-            txt2_stream.write(string2)
-            txt3_stream.write(string3)
-            paragraph_starts.extend(starts1)
-            paragraph_ends.extend(ends1)
-            nonprint_starts.extend(nonprint_starts1)
-            nonprint_ends.extend(nonprint_ends1)
-            bad_chars.extend(bad1)
-    if len(paragraph_ends)>0:
-        paragraph_starts.append(1 + paragraph_ends[-1])
-        paragraph_ends.append(length)
-    paragraph_starts,paragraph_ends=modify_paragraph_delimiters(paragraph_starts,paragraph_ends,nonprint_starts,nonprint_ends)
-    with open(fact_file,'w') as factstream:
-        if len(paragraph_starts) == len(paragraph_ends):
-            for item_num in range(len(paragraph_starts)):
-                factstream.write('STRUCTURE TYPE="TEXT" START='+str(paragraph_starts[item_num])+' END='+str(paragraph_ends[item_num])+os.linesep)
-        elif (len(paragraph_starts)>1) and len(paragraph_ends) == 1:
-            last_start = 0
-            for start in paragraph_starts:
-                if start != 0:
-                    factstream.write('STRUCTURE TYPE="TEXT" START='+str(last_start)+' END='+str(start)+os.linesep)
-                last_start = start
-            factstream.write('STRUCTURE TYPE="TEXT" START='+str(last_start)+' END='+str(paragraph_ends[0])+os.linesep)
-        else:
-            factstream.write('STRUCTURE TYPE="TEXT" START=0 END='+str(paragraph_ends[0])+os.linesep)
-        for bad_char in bad_chars:
-            factstream.write('BAD_CHARACTER START='+str(bad_char[0])+' END='+str(bad_char[1])+' STRING="<"'+os.linesep)
+from termolator_fact_txt import *

def main(args):
    ## infile is the output file from distributional term extraction
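The moved code's main product is the .fact file; here is a self-contained sketch of the record format it writes, using hypothetical offsets (the write calls mirror the deleted lines above):

import os

paragraph_starts = [0, 120]      # hypothetical paragraph spans
paragraph_ends = [118, 340]
bad_chars = [(57, 58)]           # hypothetical escaped-'<' positions

with open('example.fact', 'w') as factstream:
    for start, end in zip(paragraph_starts, paragraph_ends):
        factstream.write('STRUCTURE TYPE="TEXT" START=' + str(start) + ' END=' + str(end) + os.linesep)
    for bad_char in bad_chars:
        factstream.write('BAD_CHARACTER START=' + str(bad_char[0]) + ' END=' + str(bad_char[1]) + ' STRING="<"' + os.linesep)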
9 changes: 5 additions & 4 deletions run_termolator.sh
@@ -35,7 +35,7 @@ echo "max number of terms? $7"
echo "keep terms? $8"
echo "termolator path $9"
echo "dedicated webscore file ${12}"
-echo "use previous saved pickel file for terms ${13}"
+echo "use previous saved pickle file for terms ${13}"

## Step 1: Finding inline terms for foreground files
echo "Running Step 1: finding inline terms for foreground files"
@@ -50,6 +50,7 @@ if [ "${11}" = "False" ]; then
    $TERMOLATOR/make_io_file.py $1 $4.internal_pos_terms_abbr_list .pos .terms .abbr
fi

+$TERMOLATOR/make_io_file.py $1 $4.internal_abbr_list .abbr
$TERMOLATOR/make_io_file.py $1 $4.internal_foreground_tchunk_list .tchunk
$TERMOLATOR/make_io_file.py $2 $4.internal_background_tchunk_list .tchunk

@@ -107,10 +108,10 @@ fi

if [ "${12}" = "False" ]; then
    echo "calling filter_term_output.py with filter_term_output.py $4 $4.outputweb.score $6 $7 ${10}"
-   $TERMOLATOR/filter_term_output.py $4 $4.outputweb.score $6 $7 ${10}
+   $TERMOLATOR/filter_term_output.py $4 $4.outputweb.score $6 $7 $4.internal_abbr_list ${10}
else
    echo "calling filter_term_output.py with filter_term_output.py $4 ${12} $6 $7 ${10}"
-   $TERMOLATOR/filter_term_output.py $4 ${12} $6 $7 ${10}
+   $TERMOLATOR/filter_term_output.py $4 ${12} $6 $7 $4.internal_abbr_list ${10}
fi

echo "Final terms can be found in $4.out_term_list from the scored file in $4.scored_output"
@@ -119,5 +120,5 @@ head -$8 $4.scored_output | cut -f 1 > $4.out_term_list
echo "Cleaning up files"
rm -f $4.internal_prefix_list $4.internal_pos_list $4.internal_txt_fact_list $4.internal_fact_pos_list
rm -f $4.internal_txt_fact_pos_list $4.internal_pos_terms_abbr_list $4.internal_foreground_tchunk_list
-rm -f $4.internal_background_tchunk_list
+rm -f $4.internal_background_tchunk_list $4.internal_abbr_list
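The new $4.internal_abbr_list behaves like the other .internal_* manifests. Assuming make_io_file.py maps each foreground document to a derived output path, one per line (an inference from its other calls above, not confirmed by this diff), its contents could be generated like this:

import glob, os

foreground_dir = 'foreground_docs'                   # stands in for $1
with open('my_run.internal_abbr_list', 'w') as out:  # stands in for $4.internal_abbr_list
    for doc in sorted(glob.glob(os.path.join(foreground_dir, '*'))):
        out.write(os.path.splitext(doc)[0] + '.abbr' + os.linesep)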

termolator_fact_txt.py (new file; diff not loaded on this page — by the totals above, it accounts for the remaining 157 additions, i.e. the code moved out of make_termolator_fact_txt_files.py)
