import datetime
import logging
import pickle
import random
from os import listdir
from os.path import isfile, join

import spacy
import srsly

logger = logging.getLogger("Deprecated")
"""
All functions in here are not being used. Mind that references to other functions / files may not be right.
"""


def train_all_files_in_folder(
    self,
    training_files_path: str,
    n_iter: int,
    model_path: str,
    entities: list,
    best_model_path: str,
    max_losses: float,
):
    """
    Given the path to a dataturks .json format input file directory, a list
    of entities and a path to an existing spaCy model, trains that model for
    `n_iter` iterations with the given entities. Whenever a best model is
    found it is written to disk at the given output path.

    :param training_files_path: A string representing the path to the input
        files directory.
    :param n_iter: An integer representing the number of training iterations.
    :param model_path: A string representing the path to the model to train.
    :param entities: A list of strings representing the entities to consider
        during training.
    :param best_model_path: A string representing the path to write the best
        trained model to.
    :param max_losses: A float representing the maximum NER losses value
        to consider before starting to write best-model output.
    """
    begin_time = datetime.datetime.now()
    onlyfiles = [f for f in listdir(training_files_path) if isfile(join(training_files_path, f))]
    random.shuffle(onlyfiles)
    self.add_new_entity_to_model(entities, model_path)
    best_losses = max_losses
    processed_docs = 0
    for file_name in onlyfiles:
        logger.info(f'Started processing file at "{file_name}"...')
        best_losses = self.train_model(
            join(training_files_path, file_name),
            n_iter,
            model_path,
            entities,
            best_model_path,
            best_losses,
        )
        processed_docs += 1
        logger.info(f'Maximum considerable losses is "{best_losses}".')
        logger.info(f"Processed {processed_docs} out of {len(onlyfiles)} documents.")
    diff = datetime.datetime.now() - begin_time
    logger.info(f"Lasted {diff} to process {len(onlyfiles)} documents.")


def convert_dataturks_to_training_cli(self, input_files_path: str, entities: list, output_file_path: str):
    """
    Given an input directory and a list of entities, converts every .json
    file in the directory into a single .json file recognisable by spaCy and
    writes it out to the given output path.

    :param input_files_path: Directory pointing to dataturks .json files to
        be converted.
    :param entities: A list of entities to be considered during annotations
        extraction from each dataturks batch.
    :param output_file_path: The path and name of the output file.
    """
    nlp = spacy.load("es_core_news_lg", disable=["ner"])
    TRAIN_DATA = []
    begin_time = datetime.datetime.now()
    input_files = [f for f in listdir(input_files_path) if isfile(join(input_files_path, f))]
    for input_file in input_files:
        logger.info(f'Extracting raw data and occurrences from file: "{input_file}"...')
        extracted_data = convert_dataturks_to_spacy(f"{input_files_path}/{input_file}", entities)
        TRAIN_DATA = TRAIN_DATA + extracted_data
        logger.info(f'Finished extracting data from file "{input_file}".')
    diff = datetime.datetime.now() - begin_time
    logger.info(f"Lasted {diff} to extract dataturks data from {len(input_files)} documents.")
    logger.info(
        f"Converting {len(TRAIN_DATA)} documents with occurrences extracted from "
        f"{len(input_files)} files into a spaCy supported format..."
    )
    docs = []
    for text, annot in TRAIN_DATA:
        doc = nlp(text)
        new_ents = []
        for start_idx, end_idx, label in annot["entities"]:
            span = doc.char_span(start_idx, end_idx, label=label)
            if span is None:
                conflicted_entity = {
                    "file_dir": input_files_path,
                    "label": label,
                    "start_index": start_idx,
                    "end_index": end_idx,
                    "matches_text": text[start_idx:end_idx],
                }
                logger.critical(
                    f"Conflicted entity: could not save an entity because its character"
                    f' offsets do not align with token boundaries in the document. Output: "{conflicted_entity}".'
                )
            else:
                new_ents.append(span)
        doc.ents = new_ents
        docs.append(doc)
    diff = datetime.datetime.now() - begin_time
    logger.info(f"Finished converting {len(TRAIN_DATA)} spaCy documents into trainable data. Lasted: {diff}")

    def docs_to_json(doc):
        # Pass-through placeholder: a real implementation would convert the
        # Docs into spaCy's JSON training format (e.g. spacy.gold.docs_to_json
        # in spaCy 2.x) so the output is actually JSON-serialisable.
        return doc

    try:
        logger.info(f'💾 Writing final output at "{output_file_path}"...')
        srsly.write_json(output_file_path, [docs_to_json(docs)])
        logger.info("💾 Done.")
    except Exception:
        logger.exception(f'An error occurred writing the output file at "{output_file_path}".')
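
# Hypothetical usage sketch (illustrative names and paths; assumes a bound
# `trainer` instance and a working module-level `convert_dataturks_to_spacy`):
#
#     trainer.convert_dataturks_to_training_cli(
#         input_files_path="data/dataturks/",
#         entities=["PERSON", "ORG"],
#         output_file_path="data/train.json",
#     )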


def convert_dataturks_to_spacy(self, input_file_path: str, output_file_path: str, entities: list):
    """
    Given a dataturks format .json file, an output path and a list of
    entities, converts the input data into a spaCy recognisable format and
    pickle dumps it at the given output path.

    :param input_file_path: A string representing the path to a dataturks
        .json format input file.
    :param output_file_path: A string representing the output path.
    :param entities: A list of strings representing the entities to recognise
        during data conversion.
    """
    logger.info(f'Starts converting data from "{input_file_path}"...')
    # Note: this call refers to the module-level converter, not to this method
    # (see the module docstring about possibly outdated references).
    training_data = [convert_dataturks_to_spacy(input_file_path, entities)]
    with open(output_file_path, "wb") as output:
        pickle.dump(training_data, output, pickle.HIGHEST_PROTOCOL)
    logger.info(f'Successfully converted data at "{output_file_path}".')
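
# Hypothetical usage sketch (illustrative paths):
#
#     trainer.convert_dataturks_to_spacy(
#         input_file_path="data/dataturks/batch_1.json",
#         output_file_path="data/train.pickle",
#         entities=["PERSON", "ORG"],
#     )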


def count_examples(self, files_path: str, entities: str):
    """
    Given the path to a dataturks .json format input file directory and a
    comma-separated string of entities, logs the total number of examples
    per label.

    :param files_path: Directory pointing to dataturks .json files to be
        counted.
    :param entities: A comma-separated string of entities to be considered
        in the final output.
    """
    entities = entities.split(",")
    input_files_dir_path = files_path
    onlyfiles = [f for f in listdir(input_files_dir_path) if isfile(join(input_files_dir_path, f))]
    all_entities = {}
    for entity in entities:
        entity_length = 0
        for file_ in onlyfiles:
            validation_data = convert_dataturks_to_spacy(join(input_files_dir_path, file_), [entity])
            for _, annotations in validation_data:
                occurrences = annotations.get("entities")
                entity_length = entity_length + len(occurrences)
        all_entities[entity] = entity_length
    logger.info(f"Total entities output: {all_entities}")