index_data.py

import time
from pathlib import Path
import elasticsearch
import nltk.data
from sentence_transformers import SentenceTransformer
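
# Sentence embedding model and the punkt sentence splitter (requires the
# nltk "punkt" data to have been downloaded).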
model = SentenceTransformer("msmarco-distilroberta-base-v2")
sent_detector = nltk.data.load("tokenizers/punkt/english.pickle")
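
# Local cluster with default credentials; certificate verification is
# disabled, which is only appropriate for local development.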
es = elasticsearch.Elasticsearch(
    "https://admin:admin@localhost:9200",
    verify_certs=False,
    ssl_show_warn=False,
)


def read_sentences(text: str, doc_name: str):
    # Normalize the raw text before sentence splitting: join wrapped lines
    # with spaces, drop "=" heading markers, and straighten curly apostrophes.
    text = text.replace("\n", " ")
    text = text.replace("=", "")
    text = text.replace("’", "'")
    sentences = sent_detector.tokenize(text.strip())
    # .tolist() converts the numpy embedding into a JSON-serializable list
    # for the bulk API.
    encoded_sents = [
        {
            "document_name": doc_name,
            "sentence_emb": model.encode(s).tolist(),
            "sentence_text": s,
        }
        for s in sentences
    ]
    return encoded_sents
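

# Walk the data directory, embed every sentence, and index in bulk batches.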
def read_data(data_path: str):
    file_paths = [p for p in Path(data_path).glob("**/*") if p.is_file()]
    docs = []
    for path in file_paths:
        if path.suffix.lower() == ".txt":
            with open(path, encoding="utf-8") as doc:
                text = doc.read()
            sentences = read_sentences(text, path.name)
            for sent in sentences:
                # Bulk format: an action line, then the document source.
                docs.append({"index": {}})
                docs.append(sent)
            # Flush in batches so each bulk request stays a manageable size.
            if len(docs) >= 10000:
                es.bulk(docs, index="got")
                time.sleep(1)
                docs = []
            # Simple progress output: sentences processed for this file.
            print(len(sentences))
        else:
            raise Exception(
                f"Indexing of {path.suffix} files is not supported."
            )
    # Index whatever is left over from the final partial batch.
    if docs:
        es.bulk(docs, index="got")


if __name__ == "__main__":
    read_data("data/article_txt_got")
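
Note: the script assumes the got index already exists with a vector mapping for sentence_emb. Below is a minimal, hypothetical setup sketch (not part of the original script); the 768 dims value assumes the output size of msmarco-distilroberta-base-v2, and an OpenSearch cluster (which the default admin:admin credentials above suggest) would use its knn_vector type instead of dense_vector.

# create_index.py (hypothetical helper, not part of the original script)
import elasticsearch

es = elasticsearch.Elasticsearch(
    "https://admin:admin@localhost:9200",
    verify_certs=False,
    ssl_show_warn=False,
)

# dense_vector is the Elasticsearch field type for embeddings; dims must
# match the sentence-transformer output (768 assumed here for
# msmarco-distilroberta-base-v2).
es.indices.create(
    index="got",
    body={
        "mappings": {
            "properties": {
                "document_name": {"type": "keyword"},
                "sentence_text": {"type": "text"},
                "sentence_emb": {"type": "dense_vector", "dims": 768},
            }
        }
    },
)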