-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
153 lines (120 loc) · 5.46 KB
/
Copy pathmain.py
File metadata and controls
153 lines (120 loc) · 5.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""
main.py
-------
Vectorless RAG — LangGraph Agent + PDF Tree (no PageIndex)
Flow:
1. Download Bigtable PDF
2. Parse PDF → DocumentTree (one-time, cached to JSON)
3. For each question: agent traverses the tree → retrieves sections → generates answer
Install:
pip install PyMuPDF openai langgraph pydantic python-dotenv
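.env:
OPENROUTER_API_KEY=sk-or-...   (recommended; LLM_API_KEY takes precedence if set, OPENAI_API_KEY is the last fallback)
LLM_BASE_URL=...               (optional; defaults to https://openrouter.ai/api/v1)
LLM_MODEL=...                  (optional; defaults to google/gemini-2.5-flash-lite)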
"""
import json
import os
import urllib.request
from pathlib import Path
from dataclasses import asdict
from dotenv import load_dotenv
from openai import OpenAI
from questions import QUESTIONS
from retriever import retrieve, generate_workflow_png
from tree import parse_pdf, TreeNode
load_dotenv()
# ── Config ────────────────────────────────────────────────────────────────────
# Remote location of the Bigtable paper and the local file it is saved to.
PDF_URL = (
    "https://static.googleusercontent.com/media/research.google.com"
    "/en//archive/bigtable-osdi06.pdf"
)
PDF_PATH = Path("bigtable-osdi06.pdf")
# JSON cache of the parsed document tree; its parent directory also receives
# the workflow diagram written by main().
TREE_CACHE_PATH = Path("results/document_tree.json")
# Endpoint handed to the OpenAI client; overridable via the LLM_BASE_URL
# environment variable.
LLM_BASE_URL = os.environ.get("LLM_BASE_URL", "https://openrouter.ai/api/v1")
# First non-empty value among LLM_API_KEY, OPENROUTER_API_KEY, OPENAI_API_KEY.
LLM_API_KEY = os.environ.get("LLM_API_KEY") or os.environ.get("OPENROUTER_API_KEY") or os.environ.get("OPENAI_API_KEY")
# Model identifier; overridable via the LLM_MODEL environment variable.
# NOTE(review): MODEL is not referenced elsewhere in this file — presumably
# read by another module; confirm.
MODEL = os.environ.get("LLM_MODEL", "google/gemini-2.5-flash-lite")
# Fail fast at import time when no API key is configured.
if not LLM_API_KEY:
    raise SystemExit(
        "No LLM API key set.\n"
        "Create a .env file with OPENROUTER_API_KEY=sk-or-... (recommended) or OPENAI_API_KEY=sk-..."
    )
# Shared client instance; ask() passes it to retrieve().
client = OpenAI(api_key=LLM_API_KEY, base_url=LLM_BASE_URL)
# ── Step 1: Download PDF ──────────────────────────────────────────────────────
def download_pdf() -> None:
    """Download the Bigtable paper to ``PDF_PATH`` unless it already exists.

    The transfer is written to a sibling ``.part`` file and renamed to
    ``PDF_PATH`` only after it has completed. An interrupted or failed
    download therefore never leaves a truncated file behind that a later run
    would mistake for a valid, already-present PDF.

    Raises:
        urllib.error.URLError: If the download fails (propagated from
            ``urllib.request.urlretrieve``).
        OSError: If the temporary file cannot be written or renamed.
    """
    if PDF_PATH.exists():
        print(f"[✓] PDF already present: {PDF_PATH}")
        return

    print("[↓] Downloading Bigtable paper …")
    partial_path = PDF_PATH.with_name(PDF_PATH.name + ".part")
    try:
        urllib.request.urlretrieve(PDF_URL, partial_path)
        # Rename within the same directory so the final file appears only
        # once it is complete.
        partial_path.replace(PDF_PATH)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial_path.unlink(missing_ok=True)
    print(f"[✓] Saved → {PDF_PATH}")
# ── Step 2: Build / load tree ─────────────────────────────────────────────────
def dict_to_treenode(data: dict) -> TreeNode:
    """Rebuild a TreeNode (and all of its descendants) from a plain dict.

    Every entry under the ``"children"`` key is converted recursively; a
    missing key is treated as "no children". All remaining keys are passed
    to the ``TreeNode`` constructor unchanged as keyword arguments. The
    input dict is not modified.
    """
    rebuilt_children = [
        dict_to_treenode(child_dict)
        for child_dict in data.get("children", [])
    ]
    # Shallow-merge so the caller's dict is left untouched.
    node_fields = {**data, "children": rebuilt_children}
    return TreeNode(**node_fields)
def get_tree() -> TreeNode:
    """Load the cached document tree from JSON, or build and cache it fresh.

    If ``TREE_CACHE_PATH`` exists, it is parsed and converted back into a
    ``TreeNode`` via ``dict_to_treenode``. A cache that cannot be decoded or
    does not match the node schema is reported and rebuilt instead of
    aborting the run. A fresh build parses ``PDF_PATH`` with ``parse_pdf``
    and writes the cache through a temporary file, so an interrupted write
    cannot leave a half-written cache behind.

    Returns:
        The tree loaded from the cache, or the object returned by
        ``parse_pdf`` on a fresh build.
    """
    TREE_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)

    if TREE_CACHE_PATH.exists():
        print(f"[✓] Loading cached tree from {TREE_CACHE_PATH}")
        try:
            with open(TREE_CACHE_PATH, encoding="utf-8") as f:
                data = json.load(f)
            # Reconstruct TreeNode from cached dict (extract root from DocumentTree)
            tree = dict_to_treenode(data.get("root", data))
        except (ValueError, TypeError, AttributeError) as exc:
            # ValueError covers json.JSONDecodeError and bad encodings;
            # TypeError/AttributeError cover a cache whose shape no longer
            # matches what TreeNode expects. Fall through and rebuild.
            print(f"[!] Cached tree is unusable ({exc}); rebuilding …")
        else:
            print(f" {len(tree.children)} sections loaded")
            return tree

    print("[~] Building tree (first run — takes ~10–30 sec with PyMuPDF4LLM) …")
    # NOTE(review): the cached branch unwraps an optional "root" key, which
    # suggests parse_pdf may return a wrapper object rather than the root
    # TreeNode — confirm both branches return the same type.
    tree = parse_pdf(str(PDF_PATH))

    tmp_path = TREE_CACHE_PATH.with_name(TREE_CACHE_PATH.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(asdict(tree), f, indent=2, default=str)
        # Publish the cache only once it has been written completely.
        tmp_path.replace(TREE_CACHE_PATH)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        tmp_path.unlink(missing_ok=True)
    print(f"[✓] Tree cached → {TREE_CACHE_PATH}")
    return tree
# ── Step 3: Ask a question ────────────────────────────────────────────────────
def ask(question: str, tree: TreeNode) -> dict:
    """Answer one question against the document tree and print a report.

    Calls ``retrieve`` with the module-level LLM ``client`` and prints the
    reasoning, confidence, traversal path, the first line of up to two
    sources, and the answer.

    Args:
        question: The question text to answer.
        tree: The document tree handed to ``retrieve``.

    Returns:
        The dict returned by ``retrieve``, unchanged. This function reads
        its ``reasoning``, ``confidence``, ``path``, ``sources`` and
        ``answer`` keys.

    Raises:
        KeyError: If the result lacks one of the keys listed above.
    """
    print(f"\n{'─'*70}")
    print(f" Q: {question}")
    print(f"{'─'*70}")

    # Pass the LLM client to retrieve function
    result = retrieve(question, tree, client)

    print(f"\n [Reasoning] {result['reasoning']}")
    print(f" [Confidence] {result['confidence']:.0%}")
    print(f" [Path] {' → '.join(result['path'])}")

    if result["sources"]:
        print("\n [Sources]")
        for src in result["sources"][:2]:
            # Just the header line. An empty source has no lines at all, so
            # guard the index instead of failing an otherwise good answer.
            source_lines = src.splitlines()
            print(f" {source_lines[0] if source_lines else ''}")

    print(f"\n [Answer]\n{result['answer']}")
    return result
# ── Main ──────────────────────────────────────────────────────────────────────
def main() -> None:
    """Run the whole pipeline once and print a summary.

    Ensures the PDF is present, loads or builds the document tree, writes
    the workflow diagram next to the tree cache, then asks every entry of
    ``QUESTIONS``. A question that raises is reported and recorded as
    failed without stopping the remaining questions.
    """
    download_pdf()
    tree = get_tree()

    # The workflow diagram lives in the same directory as the tree cache.
    output_dir = TREE_CACHE_PATH.parent
    output_dir.mkdir(parents=True, exist_ok=True)
    workflow_png_path = output_dir / "workflow.png"
    generate_workflow_png(output_path=str(workflow_png_path))
    print(f"[✓] Workflow diagram saved → {workflow_png_path}")

    print(f"\n{'═'*70}")
    print(" Vectorless RAG — Google Bigtable (no PageIndex)")
    print(f"{'═'*70}")

    results = []
    for i, question in enumerate(QUESTIONS, start=1):
        print(f"\n[{i}/{len(QUESTIONS)}]")
        try:
            record = {"question": question, "result": ask(question, tree), "ok": True}
        except Exception as e:
            # Per-question boundary: report the failure and keep going.
            print(f" [ERROR] {e}")
            record = {"question": question, "error": str(e), "ok": False}
        results.append(record)

    ok = sum(1 for entry in results if entry["ok"])
    print(f"\n{'═'*70}")
    print(f" Done: {ok}/{len(results)} questions answered successfully")
    print(f"{'═'*70}\n")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()