Skip to content

bugs for long texts? #19

@jackfsuia

Description

@jackfsuia

When I run the following code:

class semChunker(BaseChunker):
    """Adapter that exposes a semchunk-built splitter through the BaseChunker API."""

    def __init__(self, chunk_size):
        """Create the underlying semchunk splitter for the given token budget.

        NOTE(review): the original carried a stray trailing comment
        ("0.5054 0.50543") that looked like pasted benchmark numbers; dropped.
        """
        super().__init__()
        self.chunker = semchunk.chunkerify('isaacus/kanon-tokenizer', chunk_size)

    def split_text(self, text):
        """Return the list of chunks the wrapped semchunk splitter produces."""
        return self.chunker(text)


semchunker = semChunker(chunk_size=800)
results = evaluation.run(semchunker, default_ef)

I ran into this error:

Token indices sequence length is longer than the specified maximum sequence length for this model (750 > 512). Running this sequence through the model will result in indexing errors
Traceback (most recent call last):
File "D:\miniconda\envs\d2l\lib\site-packages\chromadb\api\models\CollectionCommon.py", line 90, in wrapper
return func(self, *args, **kwargs)
File "D:\miniconda\envs\d2l\lib\site-packages\chromadb\api\models\CollectionCommon.py", line 213, in _validate_and_prepare_add_request
add_embeddings = self._embed_record_set(record_set=add_records)
File "D:\miniconda\envs\d2l\lib\site-packages\chromadb\api\models\CollectionCommon.py", line 526, in _embed_record_set
return self._embed(input=record_set[field]) # type: ignore[literal-required]
File "D:\miniconda\envs\d2l\lib\site-packages\chromadb\api\models\CollectionCommon.py", line 539, in _embed
return self._embedding_function(input=input)
File "D:\miniconda\envs\d2l\lib\site-packages\chromadb\api\types.py", line 466, in call
result = call(self, input)
File "D:\miniconda\envs\d2l\lib\site-packages\chromadb\utils\embedding_functions\openai_embedding_function.py", line 113, in call
embeddings = self._client.create(
File "D:\miniconda\envs\d2l\lib\site-packages\openai\resources\embeddings.py", line 125, in create
return self._post(
File "D:\miniconda\envs\d2l\lib\site-packages\openai_base_client.py", line 1283, in post
return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
File "D:\miniconda\envs\d2l\lib\site-packages\openai_base_client.py", line 960, in request
return self._request(
File "D:\miniconda\envs\d2l\lib\site-packages\openai_base_client.py", line 1064, in _request
raise self._make_status_error_from_response(err.response) from None
openai.BadRequestError: Error code: 400 - {'error': {'message': 'Requested 345414 tokens, max 300000 tokens per request', 'type': 'max_tokens_per_request', 'param': None, 'code': 'max_tokens_per_request
'}}
During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "D:\github-my-project\eval2.py", line 342, in
results = evaluation.run(semchunker, default_ef)
File "D:\miniconda\envs\d2l\lib\site-packages\chunking_evaluation\evaluation_framework\base_evaluation.py", line 358, in run
collection = self._chunker_to_collection(chunker, embedding_function)
File "D:\miniconda\envs\d2l\lib\site-packages\chunking_evaluation\evaluation_framework\base_evaluation.py", line 308, in _chunker_to_collection
collection.add(
File "D:\miniconda\envs\d2l\lib\site-packages\chromadb\api\models\Collection.py", line 82, in add
add_request = self._validate_and_prepare_add_request(
File "D:\miniconda\envs\d2l\lib\site-packages\chromadb\api\models\CollectionCommon.py", line 93, in wrapper
raise type(e)(msg).with_traceback(e.traceback)
TypeError: init() missing 2 required keyword-only arguments: 'response' and 'body'

However, when I set chunk_size=400, it runs perfectly. I think this is a bug — what should I do? Thanks for your work!

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions