From c90c41c391de4366db554d7a71ce9a35467c981e Mon Sep 17 00:00:00 2001
From: guglie
Date: Tue, 3 Dec 2024 11:21:18 +0100
Subject: [PATCH] fix: ParserError EOF inside string (#470) (#472)

Signed-off-by: guglie
---
 docling/models/tesseract_ocr_cli_model.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py
index ababe670..16e1629d 100644
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -1,3 +1,4 @@
+import csv
 import io
 import logging
 import os
@@ -96,7 +97,7 @@ def _run_tesseract(self, ifilename: str):
         # _log.info(decoded_data)
 
         # Read the TSV file generated by Tesseract
-        df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
+        df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
 
         # Display the dataframe (optional)
         # _log.info("df: ", df.head())