Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,3 +181,7 @@ If you use this package in your research, please cite our technical report:
url = {https://research.trychroma.com/evaluating-chunking},
}
```

## Contributions
We welcome contributions and are excited you'd like to get involved!
Make sure your pull request targets the dev branch. We will test it there and later merge it into main.
26 changes: 17 additions & 9 deletions chunking_evaluation/evaluation_framework/base_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,17 +117,26 @@ def _load_questions_df(self):
self.corpus_list = self.questions_df['corpus_id'].unique().tolist()

def _get_chunks_and_metadata(self, splitter):
# Warning: metadata will be incorrect if a chunk is repeated since we use .find() to find the start index. This isn't practically an issue for chunks over 1000 characters.
# Warning: metadata will be incorrect if a chunk is repeated since we use .find() to find the start index.
# This isn't practically an issue for chunks over 1000 characters.
documents = []
metadatas = []
for corpus_id in self.corpus_list:
corpus_path = corpus_id
if self.corpora_id_paths is not None:
corpus_path = self.corpora_id_paths[corpus_id]

with open(corpus_path, 'r') as file:
corpus = file.read()


# Check the operating system and use UTF-8 encoding on Windows
# This prevents UnicodeDecodeError when reading files with non-ASCII characters
import platform
if platform.system() == 'Windows':
with open(corpus_path, 'r', encoding='utf-8') as file:
corpus = file.read()
else:
# Use default encoding on other systems
with open(corpus_path, 'r') as file:
corpus = file.read()

current_documents = splitter.split_text(corpus)
current_metadatas = []
for document in current_documents:
Expand All @@ -136,7 +145,6 @@ def _get_chunks_and_metadata(self, splitter):
except:
print(f"Error in finding {document} in {corpus_id}")
raise Exception(f"Error in finding {document} in {corpus_id}")
# start_index, end_index = find_target_in_document(corpus, document)
current_metadatas.append({"start_index": start_index, "end_index": end_index, "corpus_id": corpus_id})
documents.extend(current_documents)
metadatas.extend(current_metadatas)
Expand Down Expand Up @@ -286,7 +294,7 @@ def _chunker_to_collection(self, chunker, embedding_function, chroma_db_path:str
if collection is None:
try:
self.chroma_client.delete_collection(collection_name)
except ValueError as e:
except Exception as e:
pass
collection = self.chroma_client.create_collection(collection_name, embedding_function=embedding_function, metadata={"hnsw:search_ef":50})

Expand Down Expand Up @@ -373,7 +381,7 @@ def run(self, chunker, embedding_function=None, retrieve:int = 5, db_to_save_chu
# print("FAILED TO LOAD GENERAL EVALUATION")
try:
self.chroma_client.delete_collection("auto_questions")
except ValueError as e:
except Exception as e:
pass
question_collection = self.chroma_client.create_collection("auto_questions", embedding_function=embedding_function, metadata={"hnsw:search_ef":50})
question_collection.add(
Expand Down Expand Up @@ -454,4 +462,4 @@ def run(self, chunker, embedding_function=None, retrieve:int = 5, db_to_save_chu
"precision_omega_std": brute_iou_std,
"precision_mean": precision_mean,
"precision_std": precision_std
}
}