ingest_data.py
from pathlib import Path
from tqdm import tqdm
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from pdf2image import convert_from_path
import pytesseract
import faiss
import pickle
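
# Note: pdf2image requires the Poppler utilities and pytesseract requires the
# Tesseract OCR binary to be installed on the system. The ./docs folder should
# contain the source PDFs, and the ./out directory must already exist.
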
data = list()
sources = list()
docs = list(Path("./docs").glob("*.pdf"))
# Iterate over all PDFs in the docs folder.
for pdf in tqdm(docs):
    # Convert the PDF pages to images.
    pages = convert_from_path(pdf)
    with open(f"./out/{pdf.stem}.txt", "w", encoding="UTF-8") as f:
        for i, page in enumerate(pages):
            # Read the text from the page using OCR.
            text = pytesseract.image_to_string(page)
            f.write(text)
            # Split the page text into smaller chunks. Needed due to the context limits of LLMs.
            text_splitter = CharacterTextSplitter(chunk_size=1000, separator="\n")
            splits = text_splitter.split_text(text)
            data.extend(splits)
            # Record the source document and page number for every chunk.
            sources.extend([{"source": f"{pdf.stem} (p.{i+1})"}] * len(splits))

# Create a vector store from the chunks and save it to disk. The raw FAISS
# index is written separately because the index object is not picklable.
store = FAISS.from_texts(data, HuggingFaceEmbeddings(), metadatas=sources)
faiss.write_index(store.index, "docs.index")
store.index = None
with open("faiss_store.pkl", "wb") as f:
    pickle.dump(store, f)
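
For reference, a minimal sketch of the matching load path (not part of this file): it reads the raw index back with faiss.read_index, unpickles the store, and reattaches the index before querying. It assumes the same LangChain FAISS wrapper used above, which exposes similarity_search; the query string and k value are illustrative placeholders.

import faiss
import pickle

# Load the raw FAISS index and the pickled store, then reattach the
# index that was detached before pickling in ingest_data.py.
index = faiss.read_index("docs.index")
with open("faiss_store.pkl", "rb") as f:
    store = pickle.load(f)
store.index = index

# Illustrative query: fetch the 4 chunks most similar to the question.
results = store.similarity_search("What is this document about?", k=4)
for doc in results:
    print(doc.metadata["source"], doc.page_content[:100])

Detaching the index and writing it with faiss.write_index, as done above, is the common workaround for FAISS index objects not surviving pickling on their own.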