Skip to content

Commit

Permalink
linting
Browse files (browse the repository at this point in the history)
  • Loading branch information
ashraq1455 committed May 21, 2024
1 parent 6d055ab commit 283a9b0
Showing 1 changed file with 7 additions and 5 deletions.
semantic_chunkers/chunkers/statistical.py: 12 changes (7 additions & 5 deletions)
Original file line number | Diff line number | Diff line change
Expand Up @@ -78,7 +78,7 @@ def __call__(self, docs: List[str], batch_size: int = 500) -> List[List[Chunk]]:
all_chunks = []

new_docs = []
- # Split the docs that already exceed max_split_tokens to smaller chunks
+ # Split the docs that already exceed max_split_tokens to smaller chunks
for doc in docs:
token_count = tiktoken_length(doc)
if token_count > self.max_split_tokens:
Expand All @@ -92,11 +92,13 @@ def __call__(self, docs: List[str], batch_size: int = 500) -> List[List[Chunk]]:
else:
new_docs.append(doc)

- docs = [doc for doc in new_docs if doc and doc.strip()]
+ docs = [doc for doc in new_docs if doc and doc.strip()]

last_split = None
- for i in tqdm(range(0, len(docs), batch_size), desc="Processing document batches"):
- batch_docs = docs[i:i + batch_size]
+ for i in tqdm(
+ range(0, len(docs), batch_size), desc="Processing document batches"
+ ):
+ batch_docs = docs[i : i + batch_size]
if last_split is not None:
batch_docs = last_split.splits + batch_docs

Expand All @@ -120,7 +122,7 @@ def __call__(self, docs: List[str], batch_size: int = 500) -> List[List[Chunk]]:

if self.enable_statistics:
print(self.statistics)

if last_split:
all_chunks.append(last_split)

Expand Down

0 comments on commit 283a9b0

Please sign in to comment.