diff --git a/README.md b/README.md index cd24f46..4db5836 100644 --- a/README.md +++ b/README.md @@ -181,3 +181,7 @@ If you use this package in your research, please cite our technical report: url = {https://research.trychroma.com/evaluating-chunking}, } ``` + +## Contributions +We welcome contributions and are excited you'd like to get involved! +Make sure your pull request goes to the dev branch. We will test it and then later merge it to main. diff --git a/chunking_evaluation/evaluation_framework/base_evaluation.py b/chunking_evaluation/evaluation_framework/base_evaluation.py index a5fb1b8..0a39a39 100644 --- a/chunking_evaluation/evaluation_framework/base_evaluation.py +++ b/chunking_evaluation/evaluation_framework/base_evaluation.py @@ -117,17 +117,26 @@ def _load_questions_df(self): self.corpus_list = self.questions_df['corpus_id'].unique().tolist() def _get_chunks_and_metadata(self, splitter): - # Warning: metadata will be incorrect if a chunk is repeated since we use .find() to find the start index. This isn't pratically an issue for chunks over 1000 characters. + # Warning: metadata will be incorrect if a chunk is repeated since we use .find() to find the start index. + # This isn't pratically an issue for chunks over 1000 characters. documents = [] metadatas = [] for corpus_id in self.corpus_list: corpus_path = corpus_id if self.corpora_id_paths is not None: corpus_path = self.corpora_id_paths[corpus_id] - - with open(corpus_path, 'r') as file: - corpus = file.read() - + + # Check the operating system and use UTF-8 encoding on Windows + # This prevents UnicodeDecodeError when reading files with non-ASCII characters + import platform + if platform.system() == 'Windows': + with open(corpus_path, 'r', encoding='utf-8') as file: + corpus = file.read() + else: + # Use default encoding on other systems + with open(corpus_path, 'r') as file: + corpus = file.read() + current_documents = splitter.split_text(corpus) current_metadatas = [] for document in current_documents: @@ -136,7 +145,6 @@ def _get_chunks_and_metadata(self, splitter): except: print(f"Error in finding {document} in {corpus_id}") raise Exception(f"Error in finding {document} in {corpus_id}") - # start_index, end_index = find_target_in_document(corpus, document) current_metadatas.append({"start_index": start_index, "end_index": end_index, "corpus_id": corpus_id}) documents.extend(current_documents) metadatas.extend(current_metadatas) @@ -286,7 +294,7 @@ def _chunker_to_collection(self, chunker, embedding_function, chroma_db_path:str if collection is None: try: self.chroma_client.delete_collection(collection_name) - except ValueError as e: + except Exception as e: pass collection = self.chroma_client.create_collection(collection_name, embedding_function=embedding_function, metadata={"hnsw:search_ef":50}) @@ -373,7 +381,7 @@ def run(self, chunker, embedding_function=None, retrieve:int = 5, db_to_save_chu # print("FAILED TO LOAD GENERAL EVALUATION") try: self.chroma_client.delete_collection("auto_questions") - except ValueError as e: + except Exception as e: pass question_collection = self.chroma_client.create_collection("auto_questions", embedding_function=embedding_function, metadata={"hnsw:search_ef":50}) question_collection.add( @@ -454,4 +462,4 @@ def run(self, chunker, embedding_function=None, retrieve:int = 5, db_to_save_chu "precision_omega_std": brute_iou_std, "precision_mean": precision_mean, "precision_std": precision_std - } \ No newline at end of file + }