@@ -35,56 +35,71 @@ def process_word(word, ratio):
3535def process_document (doc_path , ratio ):
3636 # Check if the file path is for a .docx file
3737 if doc_path .endswith ('.docx' ):
38- print ("already in docx format" )
38+ print ("Already in docx format" )
3939 elif doc_path .endswith ('.pdf' ):
4040 docx_path = doc_path .replace ('.pdf' , '.docx' )
41- # Run the conversion script as a subprocess with the same Python interpreter
42- subprocess .run ([sys .executable , 'converter.py' , '--pdf' , doc_path , '--docx' , docx_path ])
43- # Replace '.pdf' with '.docx' in the file path
44- doc_path = docx_path
41+ try :
42+ # Run the conversion script as a subprocess
43+ subprocess .run ([sys .executable , 'converter.py' , '--pdf' , doc_path , '--docx' , docx_path ])
44+ # Replace '.pdf' with '.docx' in the file path
45+ doc_path = docx_path
46+ except subprocess .CalledProcessError as e :
47+ print ("Error converting PDF:" , e )
48+ sys .exit (1 )
4549 else :
46- print ("files of this format are not supported yet" )
47- print ("please use either .pdf or .docx files" )
50+ print ("Files of this format are not supported yet" )
51+ print ("Please use either .pdf or .docx files" )
4852 sys .exit ()
49-
50- # Load the spacy model for word recognition
51- nlp = spacy .load ('en_core_web_sm' )
52-
53- # Open the .docx file
54- word_doc = Document (doc_path )
55-
56- for paragraph in word_doc .paragraphs :
57- for run in paragraph .runs :
58- # Skip if the run is already bold
59- if run .bold :
60- continue
61-
62- # Split the run text into words
63- # words = run.text.split()
64- # words = [word + ' ' for word in run.text.split()]
65- # words = re.findall(r'\s*\S+', run.text)
66- # words = re.findall(r'(?:^|\s)\S+', run.text)
67- words = run .text .split (' ' )
68- words = [' ' + word if i != 0 else word for i , word in enumerate (words )]
69-
70- # Process each word
71- new_runs = []
72- for word in words :
73- # Use spacy to recognize the words
74- doc = nlp (word )
75- for token in doc :
76- # Bolden a ratio of the characters in the word
77- runs = process_word (token .text , ratio )
78- new_runs .extend (runs )
79-
80- # Clear the original run
81- run .text = ''
82-
83- # Add new runs with the appropriate formatting
84- for text , is_bold in new_runs :
85- new_run = paragraph .add_run (text )
86- new_run .bold = is_bold
8753
54+ # Load the spacy model for word recognition (wrap in try-except)
55+ try :
56+ nlp = spacy .load ('en_core_web_sm' )
57+ except OSError as e :
58+ print ("Error loading spaCy model:" , e )
59+ sys .exit (1 )
60+
61+ try :
62+ # Open the .docx file
63+ word_doc = Document (doc_path )
64+
65+ for paragraph in word_doc .paragraphs :
66+ for run in paragraph .runs :
67+ # Skip if the run is already bold
68+ if run .bold :
69+ continue
70+
71+ # Split the run text into words
72+ words = run .text .split (' ' )
73+ words = [' ' + word if i != 0 else word for i , word in enumerate (words )]
74+
75+ # Process each word
76+ new_runs = []
77+ for word in words :
78+ # Use spacy to recognize the words
79+ doc = nlp (word )
80+ for token in doc :
81+ # Bolden a ratio of the characters in the word
82+ runs = process_word (token .text , ratio )
83+ new_runs .extend (runs )
84+
85+ # Clear the original run
86+ run .text = ''
87+
88+ # Add new runs with the appropriate formatting
89+ for text , is_bold in new_runs :
90+ new_run = paragraph .add_run (text )
91+ new_run .bold = is_bold
92+
93+ # Save the document (wrap in try-except)
94+ try :
95+ word_doc .save (output_path )
96+ except PermissionError as e :
97+ print ("Error saving document:" , e )
98+ sys .exit (1 )
99+
100+ except Exception as e : # Catch any other unexpected errors
101+ print ("Unexpected error processing document:" , e )
102+ sys .exit (1 )
88103
89104 # Get the directory and filename from the input path
90105 dir_name , file_name = os .path .split (doc_path )
@@ -96,8 +111,6 @@ def process_document(doc_path, ratio):
96111 output_path = os .path .splitext (doc_path )[0 ] + '_modified.docx'
97112 print (output_path )
98113
99- word_doc .save (output_path )
100-
101114
102115def main ():
103116 parser = argparse .ArgumentParser ()
0 commit comments