Skip to content

Commit 2929529

Browse files
Merge pull request #11 from ShoaibMajidDar/main
Added an encoding parameter to the whisper and whisper_retrieve functions in LLMWhisperClient
2 parents a7a58d4 + be02721 commit 2929529

File tree

1 file changed

+6
-1
lines changed

1 file changed

+6
-1
lines changed

src/unstract/llmwhisperer/client.py

+6 -1
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ def whisper(
169169
ocr_provider: str = "advanced",
170170
line_splitter_tolerance: float = 0.4,
171171
horizontal_stretch_factor: float = 1.0,
172+
encoding: str = "utf-8"
172173
) -> dict:
173174
"""
174175
Sends a request to the LLMWhisperer API to process a document.
@@ -190,6 +191,7 @@ def whisper(
190191
ocr_provider (str, optional): The OCR provider. Can be "advanced" or "basic". Defaults to "advanced".
191192
line_splitter_tolerance (float, optional): The line splitter tolerance. Defaults to 0.4.
192193
horizontal_stretch_factor (float, optional): The horizontal stretch factor. Defaults to 1.0.
194+
encoding (str, optional): The character encoding used to decode the text of the API response. Defaults to "utf-8".
193195
194196
Returns:
195197
dict: The response from the API as a dictionary.
@@ -268,6 +270,7 @@ def generate():
268270
prepared = req.prepare()
269271
s = requests.Session()
270272
response = s.send(prepared, timeout=self.api_timeout, stream=should_stream)
273+
response.encoding = encoding
271274
if response.status_code != 200 and response.status_code != 202:
272275
message = json.loads(response.text)
273276
message["status_code"] = response.status_code
@@ -318,7 +321,7 @@ def whisper_status(self, whisper_hash: str) -> dict:
318321
message["status_code"] = response.status_code
319322
return message
320323

321-
def whisper_retrieve(self, whisper_hash: str) -> dict:
324+
def whisper_retrieve(self, whisper_hash: str, encoding: str = "utf-8") -> dict:
322325
"""Retrieves the result of the whisper operation from the LLMWhisperer
323326
API.
324327
@@ -329,6 +332,7 @@ def whisper_retrieve(self, whisper_hash: str) -> dict:
329332
330333
Args:
331334
whisper_hash (str): The hash of the whisper operation.
335+
encoding (str, optional): The character encoding used to decode the text of the API response. Defaults to "utf-8".
332336
333337
Returns:
334338
dict: A dictionary containing the status code and the extracted text from the whisper operation.
@@ -345,6 +349,7 @@ def whisper_retrieve(self, whisper_hash: str) -> dict:
345349
prepared = req.prepare()
346350
s = requests.Session()
347351
response = s.send(prepared, timeout=self.api_timeout)
352+
response.encoding = encoding
348353
if response.status_code != 200:
349354
err = json.loads(response.text)
350355
err["status_code"] = response.status_code

0 commit comments

Comments
 (0)