-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
121 lines (94 loc) · 3.66 KB
/
utils.py
File metadata and controls
121 lines (94 loc) · 3.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import docx
import PyPDF2
import os
def read_text_file(file_path: str):
    """Return the full contents of a UTF-8 text file as a string."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
def read_pdf_file(file_path: str):
    """Read content from a PDF file.

    Extracts the text of every page and joins them with newlines.

    Args:
        file_path: Path to a PDF file.

    Returns:
        The concatenated page text, one "\n" after each page.
    """
    text = ""
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            # extract_text() may return None (e.g. image-only pages);
            # guard so the concatenation never raises TypeError.
            text += (page.extract_text() or "") + "\n"
    return text
def read_docx_file(file_path: str):
    """Return the text of a Word document, one paragraph per line."""
    parts = []
    for paragraph in docx.Document(file_path).paragraphs:
        parts.append(paragraph.text)
    return "\n".join(parts)
def read_document(file_path: str):
    """Dispatch to the reader matching the file's extension.

    Supports .txt, .pdf and .docx; raises ValueError for anything else.
    """
    extension = os.path.splitext(file_path)[1].lower()
    readers = {
        '.txt': read_text_file,
        '.pdf': read_pdf_file,
        '.docx': read_docx_file,
    }
    reader = readers.get(extension)
    if reader is None:
        raise ValueError(f"Unsupported file format: {extension}")
    return reader(file_path)
def split_text(text: str, chunk_size: int = 500):
    """Split text into chunks while preserving sentence boundaries.

    Sentences are detected by splitting on '. '; consecutive sentences are
    packed into a chunk until adding one more would exceed ``chunk_size``
    characters (a single over-long sentence still becomes its own chunk).

    Args:
        text: The text to split; newlines are treated as spaces.
        chunk_size: Soft maximum chunk length in characters.

    Returns:
        A list of chunk strings (empty list for empty/whitespace input).
    """
    sentences = text.replace('\n', ' ').split('. ')
    chunks = []
    current_chunk = []
    current_size = 0
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        # Restore the period removed by split('. '), but don't tack one
        # onto sentences that already end in terminal punctuation
        # (previously "Stop!" became "Stop!.").
        if not sentence.endswith(('.', '!', '?')):
            sentence += '.'
        sentence_size = len(sentence)
        # Start a new chunk when this sentence would overflow the current one.
        if current_size + sentence_size > chunk_size and current_chunk:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_size = sentence_size
        else:
            current_chunk.append(sentence)
            current_size += sentence_size
    # Flush the trailing chunk, if any.
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
def process_document(file_path: str):
    """Read a document, chunk it, and build the (ids, chunks, metadatas)
    triple expected by ChromaDB. On any failure, log and return three
    empty lists so callers can continue with other files.
    """
    try:
        chunks = split_text(read_document(file_path))
        file_name = os.path.basename(file_path)
        ids = []
        metadatas = []
        for index in range(len(chunks)):
            ids.append(f"{file_name}_chunk_{index}")
            metadatas.append({"source": file_name, "chunk": index})
        return ids, chunks, metadatas
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return [], [], []
def add_to_collection(collection, ids, texts, metadatas):
    """Push documents into the collection in batches of up to 100.

    A no-op when ``texts`` is empty; ids/texts/metadatas are sliced in
    lockstep, so they must be the same length.
    """
    if not texts:
        return
    batch = 100
    total = len(texts)
    start = 0
    while start < total:
        stop = min(start + batch, total)
        collection.add(
            documents=texts[start:stop],
            metadatas=metadatas[start:stop],
            ids=ids[start:stop],
        )
        start = stop
def process_and_add_documents(collection, folder_path: str):
    """Process every regular file in ``folder_path`` and add its chunks
    to the collection, reporting progress per file."""
    for entry in os.listdir(folder_path):
        file_path = os.path.join(folder_path, entry)
        # Skip subdirectories and other non-file entries.
        if not os.path.isfile(file_path):
            continue
        print(f"Processing {os.path.basename(file_path)}...")
        ids, texts, metadatas = process_document(file_path)
        add_to_collection(collection, ids, texts, metadatas)
        print(f"Added {len(texts)} chunks to collection")