Skip to content

Working with PDF and OCR

Somkiat Puisungnoen edited this page Jul 13, 2025 · 4 revisions

Working with PDF and OCR

1. OpenAI API

# Ref: https://platform.openai.com/docs/guides/pdf-files?api-mode=chat&lang=python
from openai import OpenAI
# No API key is passed explicitly; presumably the SDK picks up credentials
# from the environment -- confirm against the OpenAI SDK documentation.
client = OpenAI()

file_path = "./data/doc-scan.pdf"

# Upload the PDF so it can be referenced by id in the chat request below.
# The file is opened in a `with` block so the handle is closed as soon as the
# upload finishes (or fails) instead of being leaked for the life of the script.
with open(file_path, "rb") as pdf_file:
    file = client.files.create(
        file=pdf_file,
        purpose="user_data",
    )

# Ask the model to extract the text, attaching the uploaded file by its id
# alongside the plain-text instruction in a single user message.
completion = client.chat.completions.create(
    model="gpt-4.1",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "file",
                    "file": {
                        "file_id": file.id,
                    },
                },
                {
                    "type": "text",
                    "text": "Extract the text from the PDF file",
                },
            ],
        }
    ],
)

# Print the content of the first returned choice.
print(completion.choices[0].message.content)

2. Mistral OCR

import base64
import os
from mistralai import Mistral

def encode_pdf(pdf_path):
    """Return the contents of the file at ``pdf_path`` as a base64 string.

    The file is read in binary mode and the base64 bytes are decoded to a
    UTF-8 ``str``. Any failure is reported with ``print`` and signalled to the
    caller by returning ``None`` rather than by raising.
    """
    encoded = None
    try:
        with open(pdf_path, "rb") as handle:
            raw_bytes = handle.read()
        encoded = base64.b64encode(raw_bytes).decode('utf-8')
    except FileNotFoundError:
        # Missing file gets its own, more specific message.
        print(f"Error: The file {pdf_path} was not found.")
    except Exception as e:
        # Any other failure is reported with the exception text.
        print(f"Error: {e}")
    return encoded

# Create a markdown file from the OCR response
def create_markdown_file(ocr_response, output_filename="output.md"):
    """Write the markdown of every page in ``ocr_response`` to a file.

    Args:
        ocr_response: Object with a ``pages`` iterable whose items each expose
            a ``markdown`` string attribute.
        output_filename: Path of the file to create or overwrite.

    The pages are written back to back, in order, with no separator added.
    """
    # Explicit UTF-8: without it the platform's locale encoding is used, which
    # can fail or corrupt output when the OCR text contains non-ASCII
    # characters.
    with open(output_filename, "wt", encoding="utf-8") as f:
        f.writelines(page.markdown for page in ocr_response.pages)

if __name__ == "__main__":
    # Look the key up once and fail fast when it is missing or empty.
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        print("Error: MISTRAL_API_KEY environment variable is not set.")
        # SystemExit instead of exit(): exit() is a helper added by the
        # `site` module and is not guaranteed to exist in every interpreter
        # setup; raising SystemExit yields the same exit status of 1.
        raise SystemExit(1)

    # Path to your pdf
    pdf_path = "doc-scan.pdf"

    # Client initialization
    client = Mistral(api_key=api_key)

    # Getting the base64 string; encode_pdf reports its own error and
    # returns None on failure.
    base64_pdf = encode_pdf(pdf_path)
    if base64_pdf is None:
        print("Error: Failed to encode the PDF file.")
        raise SystemExit(1)

    # Process the OCR request, sending the PDF inline as a base64 data URL.
    ocr_response = client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{base64_pdf}",
        },
        include_image_base64=True,
    )
    # Print the OCR response
    print("OCR Response:", ocr_response)

    # Create a markdown file from the OCR response
    create_markdown_file(ocr_response)
    print("OCR processing complete. Markdown file created.")

3. Typhoon OCR

from typhoon_ocr import ocr_document
import os

# Requires the TYPHOON_API_KEY or OPENAI_API_KEY environment variable to be
# set before ocr_document is called.

# The input here is a scanned PDF document.
document_path = "./data/doc-scan.pdf"

# Run OCR on the document and print whatever ocr_document returns
# (presumably the extracted text as markdown -- confirm against the library).
ocr_result = ocr_document(document_path)
print(ocr_result)

Clone this wiki locally