vas3k · 0xxy0 · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026
diff --git a/.env.example b/.env.example
@@ -15,6 +15,16 @@ GOOGLE_API_KEY=""
 MISTRAL_MODEL_NAME="mistral-medium-latest"
 MISTRAL_API_KEY=""
 
+# Optional: command for local Python AI/ML enrichment post-processor.
+# TaxHacker writes JSON to STDIN and expects JSON on STDOUT.
+# Keep this fast because it runs in the request path.
+# Example:
+# TAXHACKER_PYTHON_ENRICHER_CMD="python3"
+# TAXHACKER_PYTHON_ENRICHER_ARGS='["/app/python/enricher.py"]'
+TAXHACKER_PYTHON_ENRICHER_CMD=""
+TAXHACKER_PYTHON_ENRICHER_ARGS=""
+TAXHACKER_PYTHON_ENRICHER_TIMEOUT_MS="1200"
+
 # Auth Config
 BETTER_AUTH_SECRET="random-secret-key"  # please use any long random string here
 

diff --git a/README.md b/README.md
@@ -165,6 +165,28 @@ You can also configure LLM provider settings in the application or via environme
 - **OpenAI**: `OPENAI_MODEL_NAME` and `OPENAI_API_KEY`
 - **Google Gemini**: `GOOGLE_MODEL_NAME` and `GOOGLE_API_KEY`
 - **Mistral**: `MISTRAL_MODEL_NAME` and `MISTRAL_API_KEY`
+- **Optional Python post-enrichment hook**: `TAXHACKER_PYTHON_ENRICHER_CMD`
+
+### Optional Python AI/ML post-enrichment
+
+For teams that want lightweight Python-based heuristics or ML enrichment without changing the core extraction flow, TaxHacker can call an optional local Python command after LLM extraction.
+
+- Configure:
+  - `TAXHACKER_PYTHON_ENRICHER_CMD` (for example: `python3`)
+  - `TAXHACKER_PYTHON_ENRICHER_ARGS` (JSON array of args, for example: `["/app/python/enricher.py"]`)
+  - `TAXHACKER_PYTHON_ENRICHER_TIMEOUT_MS` (optional, default `1200`, clamped to 100..5000)
+- TaxHacker sends a JSON payload to `stdin`:
+  - `output`: extracted fields
+  - `warnings`: current warnings list
+  - `confidence`: current confidence score (0..1)
+- The Python command may return JSON on `stdout`:
+  - `output` (partial field overrides/normalization)
+  - `warnings` (additional warnings)
+  - `confidenceDelta` (number to add to confidence)
+
+If the command is not configured, fails, times out, or returns invalid JSON, TaxHacker safely ignores it and continues with standard extraction behavior.
+
+> For security and reliability, the command is executed with the shell disabled. These variables should be set only by trusted administrators.
 
 ## ⌨️ Local Development
 

diff --git a/ai/analyze.ts b/ai/analyze.ts
@@ -4,11 +4,15 @@ import { ActionState } from "@/lib/actions"
 import { updateFile } from "@/models/files"
 import { getLLMSettings, getSettings } from "@/models/settings"
 import { AnalyzeAttachment } from "./attachments"
+import { enrichAnalysisOutput } from "./enrichment"
 import { requestLLM } from "./providers/llmProvider"
 
 export type AnalysisResult = {
   output: Record<string, string>
   tokensUsed: number
+  confidence: number
+  warnings: string[]
+  usedPythonEnricher: boolean
 }
 
 export async function analyzeTransaction(
@@ -32,19 +36,23 @@ export async function analyzeTransaction(
       throw new Error(response.error)
     }
 
-    const result = response.output
+    const result = response.output as Record<string, unknown>
     const tokensUsed = response.tokensUsed || 0
+    const enriched = await enrichAnalysisOutput(result)
 
-    console.log("LLM response:", result)
+    console.log("LLM response:", enriched.output)
     console.log("LLM tokens used:", tokensUsed)
 
-    await updateFile(fileId, userId, { cachedParseResult: result })
+    await updateFile(fileId, userId, { cachedParseResult: enriched.output })
 
     return {
       success: true,
       data: {
-        output: result,
+        output: enriched.output as Record<string, string>,
         tokensUsed: tokensUsed,
+        confidence: enriched.confidence,
+        warnings: enriched.warnings,
+        usedPythonEnricher: enriched.usedPythonEnricher,
       },
     }
   } catch (error) {

diff --git a/ai/enrichment.ts b/ai/enrichment.ts
@@ -0,0 +1,285 @@
+"use server"
+
+import { spawn } from "child_process"
+
+export type AnalysisEnrichmentResult = {
+  output: Record<string, unknown>
+  warnings: string[]
+  confidence: number
+  usedPythonEnricher: boolean
+}
+
+type PythonEnricherResponse = {
+  output?: Record<string, unknown>
+  warnings?: string[]
+  confidenceDelta?: number
+}
+
+// Keep 3-5 symbols to support common fiat (ISO-4217) and user-defined/crypto tickers used in the app.
+const CURRENCY_CODE_REGEX = /^[A-Z]{3,5}$/
+const DEFAULT_PYTHON_ENRICHER_TIMEOUT_MS = 1200
+const SIGTERM_TO_SIGKILL_DELAY_MS = 100
+const MAX_STDOUT_BUFFER_SIZE = 20_000
+const MAX_STDERR_BUFFER_SIZE = 4_000
+
+export async function enrichAnalysisOutput(output: Record<string, unknown>): Promise<AnalysisEnrichmentResult> {
+  const normalized = normalizeOutput(output)
+  let warnings = [...normalized.warnings]
+  let confidence = estimateConfidence(normalized.output, warnings)
+  let usedPythonEnricher = false
+
+  const pythonEnriched = await runPythonEnricher({
+    output: normalized.output,
+    warnings,
+    confidence,
+  })
+
+  if (pythonEnriched) {
+    usedPythonEnricher = true
+    if (pythonEnriched.output && typeof pythonEnriched.output === "object") {
+      normalized.output = {
+        ...normalized.output,
+        ...pythonEnriched.output,
+      }
+    }
+    if (Array.isArray(pythonEnriched.warnings) && pythonEnriched.warnings.length > 0) {
+      warnings = [...warnings, ...pythonEnriched.warnings.filter((warning) => typeof warning === "string")]
+    }
+    if (typeof pythonEnriched.confidenceDelta === "number") {
+      confidence = clampConfidence(confidence + pythonEnriched.confidenceDelta)
+    } else {
+      confidence = estimateConfidence(normalized.output, warnings)
+    }
+  }
+
+  return {
+    output: normalized.output,
+    warnings: Array.from(new Set(warnings)),
+    confidence,
+    usedPythonEnricher,
+  }
+}
+
+function normalizeOutput(output: Record<string, unknown>): { output: Record<string, unknown>; warnings: string[] } {
+  const normalized: Record<string, unknown> = { ...output }
+  const warnings: string[] = []
+
+  normalized.currencyCode = normalizeCurrencyCode(normalized.currencyCode, "currencyCode", warnings)
+  normalized.convertedCurrencyCode = normalizeCurrencyCode(
+    normalized.convertedCurrencyCode,
+    "convertedCurrencyCode",
+    warnings
+  )
+
+  normalized.total = normalizeMoneyValue(normalized.total)
+  normalized.convertedTotal = normalizeMoneyValue(normalized.convertedTotal)
+
+  if (normalized.issuedAt) {
+    const parsedDate = new Date(String(normalized.issuedAt))
+    if (!Number.isNaN(parsedDate.getTime())) {
+      normalized.issuedAt = parsedDate.toISOString().split("T")[0]
+      if (parsedDate.getTime() > Date.now()) {
+        warnings.push("Detected date is in the future and may need manual correction.")
+      }
+    } else {
+      warnings.push("Detected date is invalid and may need manual correction.")
+    }
+  }
+
+  if (Array.isArray(normalized.items)) {
+    normalized.items = normalized.items.map((item) => {
+      if (!item || typeof item !== "object") {
+        return item
+      }
+      const itemRecord = item as Record<string, unknown>
+      return {
+        ...itemRecord,
+        total: normalizeMoneyValue(itemRecord.total),
+        currencyCode: normalizeCurrencyCode(itemRecord.currencyCode, "items.currencyCode", warnings),
+      }
+    })
+  }
+
+  return { output: normalized, warnings }
+}
+
+function normalizeMoneyValue(value: unknown): unknown {
+  if (value == null || value === "") return value
+  if (typeof value === "number") return Number(value.toFixed(2))
+  const rawValue = String(value).trim().replace(/\s+/g, "")
+  let normalized = rawValue
+
+  if (/^[+-]?\d{1,3}(,\d{3})+(\.\d+)?$/.test(rawValue)) {
+    normalized = rawValue.replace(/,/g, "")
+  } else if (/^[+-]?\d{1,3}(\.\d{3})+(,\d+)?$/.test(rawValue)) {
+    normalized = rawValue.replace(/\./g, "").replace(",", ".")
+  } else if (rawValue.includes(",") && !rawValue.includes(".")) {
+    normalized = rawValue.replace(",", ".")
+  }
+
+  const parsed = Number.parseFloat(normalized)
+  if (Number.isNaN(parsed)) return value
+  return Number(parsed.toFixed(2))
+}
+
+function normalizeCurrencyCode(value: unknown, fieldName: string, warnings: string[]): unknown {
+  if (typeof value !== "string") return value
+  const normalized = value.trim().toUpperCase()
+  if (!normalized) return value
+  if (!CURRENCY_CODE_REGEX.test(normalized)) {
+    warnings.push(`Detected ${fieldName} looks unusual: ${value}.`)
+  }
+  return normalized
+}
+
+function estimateConfidence(output: Record<string, unknown>, warnings: string[]): number {
+  let confidence = 1
+  const hasName = typeof output.name === "string" && output.name.trim().length > 1
+  const hasMerchant = typeof output.merchant === "string" && output.merchant.trim().length > 1
+  const hasIssuedAt = typeof output.issuedAt === "string" && output.issuedAt.trim().length > 0
+  const totalValue = typeof output.total === "number" ? output.total : Number.parseFloat(String(output.total || ""))
+
+  if (!hasName && !hasMerchant) confidence -= 0.25
+  if (!hasIssuedAt) confidence -= 0.1
+  if (Number.isNaN(totalValue)) confidence -= 0.2
+  else if (totalValue <= 0) confidence -= 0.15
+
+  const currencyCode =
+    typeof output.currencyCode === "string" ? output.currencyCode.trim().toUpperCase() : String(output.currencyCode || "")
+  if (!currencyCode || !CURRENCY_CODE_REGEX.test(currencyCode)) confidence -= 0.1
+
+  confidence -= Math.min(warnings.length * 0.05, 0.25)
+  return clampConfidence(confidence)
+}
+
+function clampConfidence(value: number): number {
+  return Math.max(0, Math.min(1, Number(value.toFixed(2))))
+}
+
+async function runPythonEnricher(payload: {
+  output: Record<string, unknown>
+  warnings: string[]
+  confidence: number
+}): Promise<PythonEnricherResponse | null> {
+  const commandText = process.env.TAXHACKER_PYTHON_ENRICHER_CMD?.trim()
+  const argsText = process.env.TAXHACKER_PYTHON_ENRICHER_ARGS?.trim()
+  if (!commandText) return null
+
+  const parsedCommand = parseCommand(commandText, argsText)
+  const [command, ...args] = parsedCommand
+  if (!command) return null
+
+  const timeoutMs = getPythonEnricherTimeoutMs()
+
+  return new Promise((resolve) => {
+    const child = spawn(command, args, {
+      stdio: ["pipe", "pipe", "pipe"],
+      // Keep shell disabled to prevent shell injection from command/args configuration.
+      shell: false,
+    })
+    let didResolve = false
+
+    const safeResolve = (value: PythonEnricherResponse | null) => {
+      if (didResolve) return
+      didResolve = true
+      resolve(value)
+    }
+
+    const timeout = setTimeout(() => {
+      child.kill("SIGTERM")
+      setTimeout(() => child.kill("SIGKILL"), SIGTERM_TO_SIGKILL_DELAY_MS)
+      console.warn("Python enricher timed out and was terminated")
+      safeResolve(null)
+    }, timeoutMs)
+
+    const stdoutChunks: string[] = []
+    const stderrChunks: string[] = []
+    let stdoutSize = 0
+    let stderrSize = 0
+
+    child.stdout.on("data", (chunk: Buffer | string) => {
+      if (stdoutSize < MAX_STDOUT_BUFFER_SIZE) {
+        const chunkText = chunk.toString()
+        stdoutChunks.push(chunkText)
+        stdoutSize += chunkText.length
+      }
+    })
+
+    child.stderr.on("data", (chunk: Buffer | string) => {
+      if (stderrSize < MAX_STDERR_BUFFER_SIZE) {
+        const chunkText = chunk.toString()
+        stderrChunks.push(chunkText)
+        stderrSize += chunkText.length
+      }
+    })
+
+    child.on("error", (error: Error) => {
+      clearTimeout(timeout)
+      console.warn("Python enricher failed to start:", error.message)
+      safeResolve(null)
+    })
+
+    child.on("close", (code) => {
+      clearTimeout(timeout)
+      const stdout = stdoutChunks.join("")
+      const stderr = stderrChunks.join("")
+      if (code !== 0 || !stdout.trim()) {
+        if (stderr) console.warn("Python enricher stderr:", stderr)
+        safeResolve(null)
+        return
+      }
+      try {
+        const parsed = JSON.parse(stdout) as PythonEnricherResponse
+        safeResolve(parsed)
+      } catch {
+        safeResolve(null)
+      }
+    })
+
+    try {
+      if (!child.stdin.writable) {
+        safeResolve(null)
+        return
+      }
+      child.stdin.write(JSON.stringify(payload))
+      child.stdin.end()
+    } catch {
+      safeResolve(null)
+    }
+  })
+}
+
+function parseCommand(commandText: string, argsText?: string): string[] {
+  if (!argsText) {
+    const legacyTokens = commandText.split(/\s+/).filter(Boolean)
+    if (legacyTokens.length > 1) {
+      console.warn(
+        "Legacy TAXHACKER_PYTHON_ENRICHER_CMD with inline args is deprecated and will be removed in v0.7.0; use TAXHACKER_PYTHON_ENRICHER_ARGS JSON array"
+      )
+      return legacyTokens
+    }
+    return [commandText]
+  }
+
+  try {
+    const parsedArgs = JSON.parse(argsText)
+    if (Array.isArray(parsedArgs) && parsedArgs.every((arg) => typeof arg === "string")) {
+      return [commandText, ...parsedArgs]
+    }
+  } catch {
+    // ignore malformed args config
+  }
+
+  return [commandText]
+}
+
+function getPythonEnricherTimeoutMs(): number {
+  const timeoutRaw = process.env.TAXHACKER_PYTHON_ENRICHER_TIMEOUT_MS?.trim()
+  if (!timeoutRaw) return DEFAULT_PYTHON_ENRICHER_TIMEOUT_MS
+
+  const parsed = Number.parseInt(timeoutRaw, 10)
+  if (Number.isNaN(parsed)) return DEFAULT_PYTHON_ENRICHER_TIMEOUT_MS
+  if (parsed < 100) return 100
+  if (parsed > 5000) return 5000
+  return parsed
+}
diff --git a/components/forms/warning.tsx b/components/forms/warning.tsx
@@ -0,0 +1,18 @@
+import { cn } from "@/lib/utils"
+import { AlertTriangle } from "lucide-react"
+
+export function FormWarning({ children, className }: { children: React.ReactNode; className?: string }) {
+  return (
+    <div
+      role="alert"
+      aria-live="polite"
+      className={cn(
+        "inline-flex items-center gap-2 px-3 py-2 rounded-md bg-amber-50 text-amber-900 border border-amber-300",
+        className
+      )}
+    >
+      <AlertTriangle className="w-4 h-4 flex-shrink-0" />
+      <p className="text-sm">{children}</p>
+    </div>
+  )
+}