Working with PDF and OCR

1. OpenAI API

# Ref: https://platform.openai.com/docs/guides/pdf-files?api-mode=chat&lang=python
# Upload a PDF to the OpenAI Files API, then ask a chat model to extract its text.
from openai import OpenAI

# No credentials are passed explicitly; presumably the SDK reads OPENAI_API_KEY
# from the environment -- confirm against the SDK documentation.
client = OpenAI()

file_path = "./data/doc-scan.pdf"

# Open the PDF in a context manager so the handle is closed as soon as the
# upload call returns, instead of being left open for the rest of the script.
with open(file_path, "rb") as pdf_file:
    file = client.files.create(
        file=pdf_file,
        purpose="user_data",
    )

# Reference the uploaded file by id and pair it with a plain-text instruction
# in the same user message.
completion = client.chat.completions.create(
    model="gpt-4.1",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "file",
                    "file": {
                        "file_id": file.id,
                    },
                },
                {
                    "type": "text",
                    "text": "Extract the text from the PDF file",
                },
            ],
        }
    ],
)

# Print the content of the first returned choice.
print(completion.choices[0].message.content)

2. Mistral OCR

import base64
import os
from mistralai import Mistral

def encode_pdf(pdf_path):
    """Read the file at ``pdf_path`` and return its bytes as base64 text.

    Returns:
        The base64 encoding of the file contents as a ``str``, or ``None``
        when the file is missing or any other error occurs. In the failure
        case a message describing the problem is printed.
    """
    try:
        with open(pdf_path, "rb") as handle:
            raw_bytes = handle.read()
        encoded = base64.b64encode(raw_bytes)
        return encoded.decode("utf-8")
    except FileNotFoundError:
        # Missing file gets its own, more specific message.
        print(f"Error: The file {pdf_path} was not found.")
    except Exception as err:
        # Best-effort: report anything else and fall through to None.
        print(f"Error: {err}")
    return None

# Create a markdown file from the OCR response
def create_markdown_file(ocr_response, output_filename="output.md"):
    """Write the markdown of every page in ``ocr_response`` to one file.

    Args:
        ocr_response: Object exposing a ``pages`` iterable whose items each
            have a ``markdown`` string attribute.
        output_filename: Path of the file to create or overwrite.

    The file is always written as UTF-8 so that non-ASCII OCR text does not
    depend on the platform's default encoding. Pages are separated by a
    blank line so the last line of one page cannot fuse with the first line
    of the next. An empty ``pages`` iterable produces an empty file.
    """
    with open(output_filename, "wt", encoding="utf-8") as f:
        f.write("\n\n".join(page.markdown for page in ocr_response.pages))

if __name__ == "__main__":
    # Script entry point: OCR a local PDF with Mistral and save the result
    # as markdown.

    # Read the key once; a missing or empty value is treated as "not set".
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        print("Error: MISTRAL_API_KEY environment variable is not set.")
        # Raise SystemExit directly: the bare exit() helper is installed by
        # the site module for interactive use and is not meant for programs.
        raise SystemExit(1)

    # Path to your pdf
    pdf_path = "doc-scan.pdf"

    # Client initialization
    client = Mistral(api_key=api_key)

    # Getting the base64 string; encode_pdf returns None on failure
    base64_pdf = encode_pdf(pdf_path)
    if base64_pdf is None:
        print("Error: Failed to encode the PDF file.")
        raise SystemExit(1)

    # Process the OCR request; the PDF is sent inline as a base64 data URL
    ocr_response = client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{base64_pdf}",
        },
        include_image_base64=True,
    )
    # Print the OCR response
    print("OCR Response:", ocr_response)

    # Create a markdown file from the OCR response
    create_markdown_file(ocr_response)
    print("OCR processing complete. Markdown file created.")

3. Typhoon OCR

Ref: https://docs.opentyphoon.ai/en/ocr/

from typhoon_ocr import ocr_document
import os

# Run Typhoon OCR on a local document and print the result.
# Please set the TYPHOON_API_KEY or OPENAI_API_KEY environment variable
# before using this function.

# Despite the variable name, the path here points to a PDF file.
image_path = "./data/doc-scan.pdf"

# The value returned by ocr_document is printed as-is.
markdown = ocr_document(image_path)
print(markdown)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Working with PDF and OCR

Working with PDF and OCR

1. OpenAI API

2. Mistral OCR

3. Typhoon OCR

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Clone this wiki locally