tokenizers_encoder.py

import os
import argparse
import time
import multiprocessing as mp
import fairseq
from transformers import PreTrainedTokenizerFast

# Run this file on slurm via tokenizers_encoder_run.sh
parser = argparse.ArgumentParser()
parser.add_argument("-f", "--filename", type=str)
parser.add_argument(
    "data_folder",
    nargs="?",
    type=str,
    default="/ceph/hpc/home/eufatonr/data/text/kb_bart_data/split",
)
parser.add_argument(
    "-d",
    "--dest_folder",
    nargs="?",
    type=str,
    default="/ceph/hpc/home/eufatonr/data/text/kb_bart_data/tokenized",
)
args = parser.parse_args()
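
# Example invocation (illustrative only; "shard_00.txt" is a placeholder shard name,
# and the input/output folders fall back to the defaults declared above):
#   python tokenizers_encoder.py -f shard_00.txt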

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    mask_token="<mask>",
    pad_token="<pad>",
)


def per_document(iterator, is_delimiter=lambda x: x.isspace()):
    """
    Read a text file where sentences are separated by newlines and documents by
    empty lines, yielding one list of sentences per document.
    https://stackoverflow.com/questions/25226871/splitting-textfile-into-section-with-special-delimiter-line-python/25226944#25226944
    """
    sentences = []
    for line in iterator:
        if is_delimiter(line):
            if sentences:
                yield sentences  # OR ''.join(sentences)
            sentences = []
        else:
            sentences.append(line.rstrip())  # OR sentences.append(line)
    if sentences:
        yield sentences
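
# Illustrative behavior on a hypothetical two-document input (not part of the script):
#   list(per_document(["First sentence.\n", "Second sentence.\n", "\n", "Third sentence.\n"]))
#   -> [["First sentence.", "Second sentence."], ["Third sentence."]]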


def tokenize_text(document):
    """
    Document is a list of sentence strings: [sentence1, sentence2, ...].
    Returns one string of space-separated subword tokens per sentence.
    """
    tokenized_sentences = []
    for sentence in document:
        tokenized_sentence = tokenizer.tokenize(sentence)
        tokenized_sentence = " ".join(tokenized_sentence)
        tokenized_sentences.append(tokenized_sentence)
    return tokenized_sentences
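
# The exact token strings depend on the vocabulary in tokenizer.json; a subword
# tokenizer might, for example, split a rare word into pieces:
#   tokenize_text(["Detta är en mening."])
#   -> ["Detta är en men ing ."]   # hypothetical segmentation, for illustration only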


def split_long_docs(doc, max_len=1022):
    """
    Split documents longer than max_len (default 1022) tokens into chunks,
    yielding one list of sentences per chunk.
    """
    new_doc = []
    doc_len = 0
    for sen in doc:
        sen_len = len(sen.split())  # word split
        if doc_len + sen_len < max_len:
            new_doc.append(sen)
            doc_len += sen_len
        else:
            yield new_doc
            new_doc = [sen]
            doc_len = sen_len
    yield new_doc
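
# Chunking example with hypothetical sentences of 600, 500 and 400 tokens and the
# default max_len of 1022: the first chunk holds only the 600-token sentence
# (600 + 500 >= 1022), the second holds the remaining two (500 + 400 < 1022):
#   list(split_long_docs([s600, s500, s400]))  ->  [[s600], [s500, s400]]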


def preprocess_text(document, max_sequence_length=1022):
    """
    Tokenize a document and, if it exceeds max_sequence_length tokens, split it
    into chunks. Always returns a list of documents, each a list of sentence strings.
    """
    tokenized_document = tokenize_text(document)
    # Count tokens per sentence (each sentence is a space-separated token string).
    total_doc_length = sum(len(sentence.split()) for sentence in tokenized_document)
    if total_doc_length > max_sequence_length:
        tokenized_document_splits = split_long_docs(tokenized_document, max_sequence_length)
        return list(tokenized_document_splits)
    else:
        return [tokenized_document]


# data_folder = "/ceph/hpc/home/eufatonr/data/text/kb_bart_data/split"
text_shard_file = os.path.join(args.data_folder, args.filename)

t0 = time.time()
with open(text_shard_file) as f:
    documents = list(per_document(f))  # default delimiter
t1 = time.time()
print(f"Reading sentences from file {text_shard_file}. Completed in {t1 - t0} seconds.")
t0 = time.time()
pool = mp.Pool(processes=20)
tokenized_sentences = pool.map(preprocess_text, documents)
pool.close()
t1 = time.time()
# Unnest the inner lists in tokenized_sentences
flat_list = [item for sublist in tokenized_sentences for item in sublist]
flat_list = [" ".join(sen) for sen in flat_list] # join list of sentences to doc
output_filename = os.path.basename(text_shard_file) + ".docs.token"
output_path = os.path.join(args.dest_folder, output_filename)
# Use regular file write line by line to avoid quoting/escaping issues of pandas.
with open(output_path, "w") as wf:
    for line in flat_list:
        wf.write(line)
        wf.write("\n")
print(f"{os.path.basename(text_shard_file)} has been tokenized and saved to {output_path}.")
print(f"Time to tokenize sentences in shard: {t1 - t0} seconds.")