When I run the following code:
import semchunk
from chunking_evaluation import BaseChunker

class semChunker(BaseChunker):
    def __init__(self, chunk_size):
        super().__init__()
        # Split text into chunks of at most `chunk_size` tokens,
        # as counted by the isaacus/kanon-tokenizer.
        self.chunker = semchunk.chunkerify('isaacus/kanon-tokenizer', chunk_size)

    def split_text(self, text):
        return self.chunker(text)

semchunker = semChunker(chunk_size=800)
results = evaluation.run(semchunker, default_ef)
I come across this error:
Token indices sequence length is longer than the specified maximum sequence length for this model (750 > 512). Running this sequence through the model will result in indexing errors
Traceback (most recent call last):
File "D:\miniconda\envs\d2l\lib\site-packages\chromadb\api\models\CollectionCommon.py", line 90, in wrapper
return func(self, *args, **kwargs)
File "D:\miniconda\envs\d2l\lib\site-packages\chromadb\api\models\CollectionCommon.py", line 213, in _validate_and_prepare_add_request
add_embeddings = self._embed_record_set(record_set=add_records)
File "D:\miniconda\envs\d2l\lib\site-packages\chromadb\api\models\CollectionCommon.py", line 526, in _embed_record_set
return self._embed(input=record_set[field]) # type: ignore[literal-required]
File "D:\miniconda\envs\d2l\lib\site-packages\chromadb\api\models\CollectionCommon.py", line 539, in _embed
return self._embedding_function(input=input)
File "D:\miniconda\envs\d2l\lib\site-packages\chromadb\api\types.py", line 466, in call
result = call(self, input)
File "D:\miniconda\envs\d2l\lib\site-packages\chromadb\utils\embedding_functions\openai_embedding_function.py", line 113, in call
embeddings = self._client.create(
File "D:\miniconda\envs\d2l\lib\site-packages\openai\resources\embeddings.py", line 125, in create
return self._post(
File "D:\miniconda\envs\d2l\lib\site-packages\openai_base_client.py", line 1283, in post
return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
File "D:\miniconda\envs\d2l\lib\site-packages\openai_base_client.py", line 960, in request
return self._request(
File "D:\miniconda\envs\d2l\lib\site-packages\openai_base_client.py", line 1064, in _request
raise self._make_status_error_from_response(err.response) from None
openai.BadRequestError: Error code: 400 - {'error': {'message': 'Requested 345414 tokens, max 300000 tokens per request', 'type': 'max_tokens_per_request', 'param': None, 'code': 'max_tokens_per_request'}}
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\github-my-project\eval2.py", line 342, in
results = evaluation.run(semchunker, default_ef)
File "D:\miniconda\envs\d2l\lib\site-packages\chunking_evaluation\evaluation_framework\base_evaluation.py", line 358, in run
collection = self._chunker_to_collection(chunker, embedding_function)
File "D:\miniconda\envs\d2l\lib\site-packages\chunking_evaluation\evaluation_framework\base_evaluation.py", line 308, in _chunker_to_collection
collection.add(
File "D:\miniconda\envs\d2l\lib\site-packages\chromadb\api\models\Collection.py", line 82, in add
add_request = self._validate_and_prepare_add_request(
File "D:\miniconda\envs\d2l\lib\site-packages\chromadb\api\models\CollectionCommon.py", line 93, in wrapper
raise type(e)(msg).with_traceback(e.__traceback__)
TypeError: __init__() missing 2 required keyword-only arguments: 'response' and 'body'
but when I set chunk_size=400, it runs perfectly. I think this is a bug. What should I do? Thanks for your work!
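My guess is that the framework sends each batch of chunks to the OpenAI embeddings endpoint in a single request, so with 800-token chunks one request exceeds OpenAI's 300,000-tokens-per-request cap, while 400-token chunks stay under it. As a workaround I'm thinking of wrapping the embedding function so it sub-batches each call; a minimal sketch (BatchedEmbeddingFunction is my own helper, not part of chromadb or chunking_evaluation, and it assumes the function is only ever called with a list of documents):

    class BatchedEmbeddingFunction:
        def __init__(self, ef, batch_size=100):
            self._ef = ef                  # the wrapped embedding function
            self._batch_size = batch_size  # documents per underlying request

        def __call__(self, input):
            embeddings = []
            # Embed the documents in small slices instead of one oversized request.
            for i in range(0, len(input), self._batch_size):
                embeddings.extend(self._ef(input=input[i:i + self._batch_size]))
            return embeddings

    results = evaluation.run(semchunker, BatchedEmbeddingFunction(default_ef))

Would something like this be reasonable, or is there a supported way to control the embedding batch size?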