# vectorless-rag/main.py — learnwithparam/vectorless-rag (GitHub)
"""
main.py
-------
Vectorless RAG — LangGraph Agent + PDF Tree (no PageIndex)

Flow:
  1. Download Bigtable PDF
  2. Parse PDF → DocumentTree  (one-time, cached to JSON)
  3. For each question: agent traverses the tree → retrieves sections → generates answer

Install:
  pip install PyMuPDF openai langgraph pydantic python-dotenv

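.env:
  OPENROUTER_API_KEY=sk-or-...   (recommended; LLM_API_KEY or OPENAI_API_KEY are also accepted)
  LLM_BASE_URL=...               (optional, defaults to https://openrouter.ai/api/v1)
  LLM_MODEL=...                  (optional, defaults to google/gemini-2.5-flash-lite)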
"""

import json
import os
import urllib.request
from pathlib import Path
from dataclasses import asdict

from dotenv import load_dotenv
from openai import OpenAI

from questions import QUESTIONS
from retriever import retrieve, generate_workflow_png
from tree import parse_pdf, TreeNode

# Load settings from a local .env file (see the module docstring) before any
# of the environment lookups below run.
load_dotenv()

# ── Config ────────────────────────────────────────────────────────────────────
# Remote location of the paper and the local file it is saved to.
PDF_URL = (
    "https://static.googleusercontent.com/media/research.google.com"
    "/en//archive/bigtable-osdi06.pdf"
)
PDF_PATH = Path("bigtable-osdi06.pdf")
# JSON cache of the parsed document tree.
TREE_CACHE_PATH = Path("results") / "document_tree.json"

# LLM endpoint and model; both can be overridden through the environment.
LLM_BASE_URL = os.getenv("LLM_BASE_URL", "https://openrouter.ai/api/v1")
MODEL = os.getenv("LLM_MODEL", "google/gemini-2.5-flash-lite")
# The first non-empty variable wins, checked in this order.
LLM_API_KEY = (
    os.getenv("LLM_API_KEY")
    or os.getenv("OPENROUTER_API_KEY")
    or os.getenv("OPENAI_API_KEY")
)

if not LLM_API_KEY:
    # Stop immediately: the client below cannot be built without a key.
    raise SystemExit(
        "No LLM API key set.\n"
        "Create a .env file with OPENROUTER_API_KEY=sk-or-... (recommended) or OPENAI_API_KEY=sk-..."
    )

# Client instance that ask() hands to retrieve() for every question.
client = OpenAI(api_key=LLM_API_KEY, base_url=LLM_BASE_URL)


# ── Step 1: Download PDF ──────────────────────────────────────────────────────
def download_pdf() -> None:
    """Download the paper from PDF_URL to PDF_PATH unless it already exists.

    The download is written to a temporary sibling file and only renamed to
    PDF_PATH once it has completed. An interrupted or failed transfer
    therefore never leaves a partial file at PDF_PATH that a later run would
    mistake for a finished download.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on network or
        filesystem errors; the temporary file is removed before the
        exception propagates.
    """
    if PDF_PATH.exists():
        print(f"[✓] PDF already present: {PDF_PATH}")
        return

    print("[↓] Downloading Bigtable paper …")
    partial_path = PDF_PATH.with_name(PDF_PATH.name + ".part")
    try:
        urllib.request.urlretrieve(PDF_URL, partial_path)
        # Rename within the same directory so the final file appears atomically.
        partial_path.replace(PDF_PATH)
    finally:
        # No-op after a successful rename; cleans up after a failed download.
        partial_path.unlink(missing_ok=True)
    print(f"[✓] Saved → {PDF_PATH}")


# ── Step 2: Build / load tree ─────────────────────────────────────────────────
def dict_to_treenode(data: dict) -> TreeNode:
    """Rebuild a TreeNode, including all of its descendants, from a plain dict.

    Every entry under the optional ``"children"`` key is converted first;
    the keys of *data* are then passed to ``TreeNode`` as keyword arguments
    with ``"children"`` replaced by the converted list. *data* itself is not
    modified.
    """
    rebuilt_children = list(map(dict_to_treenode, data.get("children", [])))
    # Merge into a fresh dict so the caller's mapping keeps its raw child dicts.
    node_kwargs = {**data, "children": rebuilt_children}
    return TreeNode(**node_kwargs)


def get_tree() -> TreeNode:
    """Return the document tree, loading it from the JSON cache when possible.

    If TREE_CACHE_PATH exists and holds valid JSON, the tree is rebuilt from
    it with ``dict_to_treenode``. A cache wrapped in a ``"root"`` key and a
    bare node dict are both accepted. Otherwise the PDF at PDF_PATH is parsed
    with ``parse_pdf`` and the result is written to the cache.

    The cache is written to a temporary sibling file and renamed into place,
    so an interrupted run cannot leave a truncated cache behind. A cache that
    cannot be decoded is reported and rebuilt instead of aborting the run.
    """
    TREE_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)

    if TREE_CACHE_PATH.exists():
        print(f"[✓] Loading cached tree from {TREE_CACHE_PATH}")
        try:
            with open(TREE_CACHE_PATH, encoding="utf-8") as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            # A damaged cache is not fatal: fall through and rebuild it.
            print(f"[!] Cached tree is unreadable ({e}); rebuilding …")
        else:
            # Reconstruct TreeNode from cached dict (extract root from DocumentTree)
            tree = dict_to_treenode(data.get("root", data))
            print(f"    {len(tree.children)} sections loaded")
            return tree

    print("[~] Building tree (first run — takes ~10–30 sec with PyMuPDF4LLM) …")
    tree = parse_pdf(str(PDF_PATH))

    tmp_path = TREE_CACHE_PATH.with_name(TREE_CACHE_PATH.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            # NOTE: default=str stringifies any value json cannot encode, so
            # such fields come back from the cache as plain strings.
            json.dump(asdict(tree), f, indent=2, default=str)
        # Rename within the same directory so the cache appears atomically.
        tmp_path.replace(TREE_CACHE_PATH)
    finally:
        # No-op after a successful rename; removes a half-written temp file.
        tmp_path.unlink(missing_ok=True)
    print(f"[✓] Tree cached → {TREE_CACHE_PATH}")
    return tree


# ── Step 3: Ask a question ────────────────────────────────────────────────────
def ask(question: str, tree: TreeNode) -> dict:
    """Run retrieval for one question, print a report, and return the result.

    Args:
        question: The question text to answer.
        tree: The document tree passed on to ``retrieve``.

    Returns:
        The dict returned by ``retrieve``. This function reads its
        ``"reasoning"``, ``"confidence"``, ``"path"``, ``"sources"`` and
        ``"answer"`` entries for the printed report.
    """
    rule = "─" * 70
    print(f"\n{rule}")
    print(f"  Q: {question}")
    print(rule)

    # Pass the LLM client to retrieve function
    result = retrieve(question, tree, client)

    print(f"\n  [Reasoning]  {result['reasoning']}")
    print(f"  [Confidence] {result['confidence']:.0%}")
    print(f"  [Path]       {' → '.join(result['path'])}")

    if result["sources"]:
        print("\n  [Sources]")
        for src in result["sources"][:2]:
            # Show only the header line. An empty source has no lines at all,
            # so guard the lookup instead of failing the whole question.
            src_lines = src.splitlines()
            header = src_lines[0] if src_lines else ""
            print(f"    {header}")

    print(f"\n  [Answer]\n{result['answer']}")
    return result


# ── Main ──────────────────────────────────────────────────────────────────────
def main() -> None:
    """Run the full pipeline: fetch the PDF, get the tree, answer QUESTIONS.

    Also writes the workflow diagram next to the tree cache. A failure while
    answering one question is printed and recorded, and the loop continues
    with the next question; a success count is printed at the end.
    """
    download_pdf()

    # Load or build the tree
    tree = get_tree()

    # Generate workflow visualization alongside the tree cache
    output_dir = TREE_CACHE_PATH.parent
    output_dir.mkdir(parents=True, exist_ok=True)
    workflow_png_path = output_dir / "workflow.png"
    generate_workflow_png(output_path=str(workflow_png_path))
    print(f"[✓] Workflow diagram saved → {workflow_png_path}")

    banner = "═" * 70
    print(f"\n{banner}")
    print("  Vectorless RAG — Google Bigtable (no PageIndex)")
    print(banner)

    outcomes = []
    for number, question in enumerate(QUESTIONS, start=1):
        print(f"\n[{number}/{len(QUESTIONS)}]")
        try:
            answer = ask(question, tree)
        except Exception as exc:
            # Keep going: one failed question must not stop the batch.
            print(f"  [ERROR] {exc}")
            outcomes.append({"question": question, "error": str(exc), "ok": False})
        else:
            outcomes.append({"question": question, "result": answer, "ok": True})

    succeeded = sum(1 for outcome in outcomes if outcome["ok"])
    print(f"\n{banner}")
    print(f"  Done: {succeeded}/{len(outcomes)} questions answered successfully")
    print(f"{banner}\n")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()