vision-llm-extraction/invoice_analyzer.py at main · learnwithparam/vision-llm-extraction · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any
import logging
import json
from utils.llm_provider import LLMProvider

logger = logging.getLogger(__name__)

class InvoiceItem(BaseModel):
    """Structured data for a single item in an invoice"""
    # Free-text label of the billed product or service; no default, so it must be supplied.
    description: str = Field(description="Description of the product or service")
    # Optional: defaults to None when the quantity is not provided.
    quantity: Optional[float] = Field(None, description="Quantity of the item")
    # Optional: defaults to None when the per-unit price is not provided.
    unit_price: Optional[float] = Field(None, description="Price per unit")
    # Line total; unlike quantity/unit_price it has no default, so it must be supplied.
    amount: float = Field(description="Total amount for this line item")

class InvoiceData(BaseModel):
    """Structured data extracted from an invoice"""
    # The only field without a default: every instance must state whether the
    # document was recognised as an invoice.
    is_invoice: bool = Field(description="Whether the document is actually an invoice")
    # Header fields; each defaults to None when not supplied.
    vendor_name: Optional[str] = Field(None, description="Name of the company providing the invoice")
    invoice_number: Optional[str] = Field(None, description="The unique identifier for the invoice")
    # Dates are kept as plain strings; this model does not parse them or
    # enforce a particular date format.
    invoice_date: Optional[str] = Field(None, description="Date the invoice was issued")
    due_date: Optional[str] = Field(None, description="Date the payment is due")
    # default_factory gives each instance its own empty list when no items are supplied.
    items: List[InvoiceItem] = Field(default_factory=list, description="List of items or services billed")
    # Monetary totals; each defaults to None when not supplied.
    subtotal: Optional[float] = Field(None, description="Total before taxes")
    tax_amount: Optional[float] = Field(None, description="Amount of tax applied")
    total_amount: Optional[float] = Field(None, description="The final total amount due")
    # Falls back to "USD" when no currency is supplied.
    currency: Optional[str] = Field("USD", description="Currency of the invoice (e.g., USD, EUR, GBP)")

class InvoiceAnalyzer:
    """Extract structured invoice data from an image or PDF via a multimodal LLM.

    The analyzer sends a fixed extraction prompt together with the
    base64-encoded document to ``llm_provider.generate_text`` and validates
    the JSON reply into an ``InvoiceData`` model.
    """

    def __init__(self, llm_provider: "LLMProvider"):
        """Store the provider used for every completion request.

        Args:
            llm_provider: Object exposing an awaitable
                ``generate_text(content)`` and, optionally, a
                ``provider_name`` attribute used for the PDF capability check.
        """
        self.llm_provider = llm_provider

    @staticmethod
    def _extract_json_text(response_text: Optional[str]) -> str:
        """Return the JSON object embedded in a raw model reply.

        Models frequently wrap their answer in a Markdown fence (```json,
        a bare ```, or any other language tag) or add a sentence before or
        after it. Slicing from the first ``{`` to the last ``}`` handles all
        of those uniformly.

        Args:
            response_text: Raw text returned by the provider; ``None`` is
                treated as an empty reply.

        Returns:
            The substring spanning the outermost braces, or the stripped
            reply unchanged when it contains no brace pair (so the JSON
            parser reports the failure).
        """
        text = (response_text or "").strip()
        start = text.find("{")
        end = text.rfind("}")
        if start != -1 and end > start:
            return text[start:end + 1]
        return text

    async def analyze_invoice(self, base64_image: str, mime_type: str) -> "InvoiceData":
        """
        Analyze an invoice image/PDF and extract structured data.

        Args:
            base64_image: Base64 encoded file content
            mime_type: Mime type of the file

        Returns:
            Structured InvoiceData object. When a PDF is submitted to a
            provider whose name does not contain "gemini", no request is made
            and an InvoiceData with ``is_invoice=False`` and an explanatory
            ``vendor_name`` is returned instead.

        Raises:
            json.JSONDecodeError: The model reply contains no parseable JSON.
            TypeError: The reply parsed as JSON but is not a JSON object.
            Exception: Whatever ``InvoiceData`` raises when the parsed fields
                fail validation.
        """
        prompt = """
        You are an expert invoice processing agent. Your task is to extract structured data from the provided document.

        1. First, determine if this document is an invoice, a receipt, or a similar billing document.
        2. If it IS an invoice, extract all relevant fields accurately.
        3. If it IS NOT an invoice, set `is_invoice` to false and return empty values for other fields.

        Extract the following fields in JSON format:
        - is_invoice: boolean
        - vendor_name: string
        - invoice_number: string
        - invoice_date: string (YYYY-MM-DD format if possible)
        - due_date: string (YYYY-MM-DD format if possible)
        - items: list of objects with (description, quantity, unit_price, amount)
        - subtotal: number
        - tax_amount: number
        - total_amount: number
        - currency: string (3-letter code, default USD)

        Return ONLY the JSON object.
        """

        # PDFs are only forwarded to Gemini. OpenAI-compatible providers
        # (Fireworks, OpenRouter) typically don't accept PDFs in image_url,
        # and no PDF-to-image conversion is performed here.
        # `or ""` covers a provider whose provider_name attribute is None.
        provider_name = str(getattr(self.llm_provider, "provider_name", "") or "")
        # Compare only the media type itself: ignore case and any parameters
        # such as "; name=invoice.pdf".
        media_type = (mime_type or "").split(";", 1)[0].strip().lower()
        if media_type == "application/pdf" and "gemini" not in provider_name.lower():
            logger.warning(
                "PDF input is not supported for provider '%s'. Please use an image (PNG/JPG).",
                provider_name,
            )
            # Return a valid, empty InvoiceData so the frontend does not crash;
            # the explanation travels in vendor_name.
            return InvoiceData(
                is_invoice=False,
                vendor_name="Error: PDF not supported with this provider. Please use an image."
            )

        # Build multimodal content for litellm: the prompt plus the document
        # embedded as a data URL.
        content = [
            {"type": "text", "text": prompt},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{base64_image}"
                }
            }
        ]

        # Using the llm_provider directly for multimodal completion
        response_text = await self.llm_provider.generate_text(content)

        payload = self._extract_json_text(response_text)
        try:
            data = json.loads(payload)
        except json.JSONDecodeError:
            # Log only the size: the reply may contain sensitive invoice data.
            logger.error(
                "LLM reply could not be parsed as JSON (%d characters received)",
                len(response_text or ""),
            )
            raise

        if not isinstance(data, dict):
            # Same exception type that `InvoiceData(**data)` would raise for a
            # non-mapping, but with an actionable message.
            raise TypeError(
                f"Expected a JSON object from the model, got {type(data).__name__}"
            )
        return InvoiceData(**data)


def get_invoice_analyzer(llm_provider: LLMProvider):
    """Factory: build an InvoiceAnalyzer bound to the given LLM provider."""
    analyzer = InvoiceAnalyzer(llm_provider)
    return analyzer