Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,3 +181,7 @@ If you use this package in your research, please cite our technical report:
url = {https://research.trychroma.com/evaluating-chunking},
}
```

## Contributions
We welcome contributions and are excited you'd like to get involved!
Make sure your pull request targets the dev branch. We will test it there and later merge it into main.
26 changes: 17 additions & 9 deletions chunking_evaluation/evaluation_framework/base_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,17 +117,26 @@ def _load_questions_df(self):
self.corpus_list = self.questions_df['corpus_id'].unique().tolist()

def _get_chunks_and_metadata(self, splitter):
# Warning: metadata will be incorrect if a chunk is repeated since we use .find() to find the start index. This isn't practically an issue for chunks over 1000 characters.
# Warning: metadata will be incorrect if a chunk is repeated since we use .find() to find the start index.
# This isn't practically an issue for chunks over 1000 characters.
documents = []
metadatas = []
for corpus_id in self.corpus_list:
corpus_path = corpus_id
if self.corpora_id_paths is not None:
corpus_path = self.corpora_id_paths[corpus_id]

with open(corpus_path, 'r') as file:
corpus = file.read()


# Check the operating system and use UTF-8 encoding on Windows
# This prevents UnicodeDecodeError when reading files with non-ASCII characters
import platform
if platform.system() == 'Windows':
with open(corpus_path, 'r', encoding='utf-8') as file:
corpus = file.read()
else:
# Use default encoding on other systems
with open(corpus_path, 'r') as file:
corpus = file.read()

current_documents = splitter.split_text(corpus)
current_metadatas = []
for document in current_documents:
Expand All @@ -136,7 +145,6 @@ def _get_chunks_and_metadata(self, splitter):
except:
print(f"Error in finding {document} in {corpus_id}")
raise Exception(f"Error in finding {document} in {corpus_id}")
# start_index, end_index = find_target_in_document(corpus, document)
current_metadatas.append({"start_index": start_index, "end_index": end_index, "corpus_id": corpus_id})
documents.extend(current_documents)
metadatas.extend(current_metadatas)
Expand Down Expand Up @@ -286,7 +294,7 @@ def _chunker_to_collection(self, chunker, embedding_function, chroma_db_path:str
if collection is None:
try:
self.chroma_client.delete_collection(collection_name)
except ValueError as e:
except Exception as e:
pass
collection = self.chroma_client.create_collection(collection_name, embedding_function=embedding_function, metadata={"hnsw:search_ef":50})

Expand Down Expand Up @@ -373,7 +381,7 @@ def run(self, chunker, embedding_function=None, retrieve:int = 5, db_to_save_chu
# print("FAILED TO LOAD GENERAL EVALUATION")
try:
self.chroma_client.delete_collection("auto_questions")
except ValueError as e:
except Exception as e:
pass
question_collection = self.chroma_client.create_collection("auto_questions", embedding_function=embedding_function, metadata={"hnsw:search_ef":50})
question_collection.add(
Expand Down Expand Up @@ -454,4 +462,4 @@ def run(self, chunker, embedding_function=None, retrieve:int = 5, db_to_save_chu
"precision_omega_std": brute_iou_std,
"precision_mean": precision_mean,
"precision_std": precision_std
}
}