Skip to content

Commit 996bbca

Browse files
Fix some cells in Ingest pdf with PDF Plumber notebook (#152)
* fix two cells * add cell * address comments * address comments
1 parent b4026ba commit 996bbca

File tree

1 file changed

+75
-47
lines changed

1 file changed

+75
-47
lines changed

notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb

Lines changed: 75 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
"metadata": {},
3737
"outputs": [],
3838
"source": [
39-
"!pip install pdfplumber"
39+
"!pip install pdfplumber==0.11.0"
4040
]
4141
},
4242
{
@@ -53,12 +53,12 @@
5353
{
5454
"cell_type": "code",
5555
"execution_count": 2,
56+
"id": "87c6c286",
5657
"metadata": {},
5758
"outputs": [],
5859
"source": [
59-
"!pip install \"openai\""
60-
],
61-
"id": "87c6c286"
60+
"!pip install \"openai==0.28.1\""
61+
]
6262
},
6363
{
6464
"cell_type": "code",
@@ -94,14 +94,14 @@
9494
{
9595
"cell_type": "code",
9696
"execution_count": 4,
97+
"id": "9dbe989a",
9798
"metadata": {},
9899
"outputs": [],
99100
"source": [
100101
"import os\n",
101102
"from getpass import getpass\n",
102103
"os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")"
103-
],
104-
"id": "9dbe989a"
104+
]
105105
},
106106
{
107107
"cell_type": "code",
@@ -135,12 +135,13 @@
135135
"\n",
136136
"\n",
137137
"References:\n",
138-
"- pdfplumber: https://github.com/jsvine/pdfplumber\n",
139-
"- PyMuPDF (optional alternative): https://pymupdf.readthedocs.io/en/latest/"
138+
"- pdfplumber: [https://github.com/jsvine/pdfplumber](https://github.com/jsvine/pdfplumber)\n",
139+
"- PyMuPDF (optional alternative): [https://pymupdf.readthedocs.io/en/latest/](https://pymupdf.readthedocs.io/en/latest/)"
140140
]
141141
},
142142
{
143143
"cell_type": "markdown",
144+
"id": "050463a1",
144145
"metadata": {},
145146
"source": [
146147
"## Uploading PDF File to Stage\n",
@@ -149,19 +150,18 @@
149150
"\n",
150151
"References:\n",
151152
"- [Stage documentation](https://docs.singlestore.com/cloud/load-data/load-data-from-files/stage/)"
152-
],
153-
"id": "050463a1"
153+
]
154154
},
155155
{
156156
"cell_type": "code",
157157
"execution_count": 6,
158+
"id": "91b47930",
158159
"metadata": {},
159160
"outputs": [],
160161
"source": [
161162
"%%sql\n",
162-
"DOWNLOAD STAGE FILE 'Employee-Handbook.pdf' TO 'Employee-Handbook.pdf'OVERWRITE"
163-
],
164-
"id": "91b47930"
163+
"DOWNLOAD STAGE FILE 'Employee-Handbook.pdf' TO 'Employee-Handbook.pdf' OVERWRITE"
164+
]
165165
},
166166
{
167167
"cell_type": "code",
@@ -176,6 +176,7 @@
176176
{
177177
"cell_type": "code",
178178
"execution_count": 8,
179+
"id": "53fc1109",
179180
"metadata": {},
180181
"outputs": [],
181182
"source": [
@@ -229,8 +230,7 @@
229230
" })\n",
230231
"\n",
231232
"print(f\"Chunking produced {len(chunks)} chunks.\")"
232-
],
233-
"id": "53fc1109"
233+
]
234234
},
235235
{
236236
"attachments": {},
@@ -358,57 +358,85 @@
358358
{
359359
"cell_type": "code",
360360
"execution_count": 13,
361-
"id": "00b7c77b",
361+
"id": "2a82d48e",
362362
"metadata": {},
363363
"outputs": [],
364364
"source": [
365-
"import time, os\n",
366-
"\n",
367-
"# Ensure API key is set (fallback to environment if not already assigned)\n",
368-
"if not getattr(openai, 'api_key', None):\n",
369-
" env_key = os.getenv('OPENAI_API_KEY')\n",
370-
" if env_key:\n",
371-
" openai.api_key = env_key.strip()\n",
372-
" print('Hydrated openai.api_key from environment variable.')\n",
373-
" else:\n",
374-
" raise ValueError('OpenAI API key not set. Set OPENAI_API_KEY env or rerun key input cell.')\n",
375-
"\n",
376-
"# Re-initialize new SDK client if available and was None\n",
377-
"if _use_new and _openai_client is None:\n",
365+
"import os, time, json\n",
366+
"DEFAULT_EMBED_MODEL = \"text-embedding-3-small\"\n",
367+
"DEFAULT_CHAT_MODEL = \"gpt-4o-mini\"\n",
368+
"\n",
369+
"_openai_client = None\n",
370+
"_use_new = False\n",
371+
"\n",
372+
"def _ensure_key():\n",
373+
" key = os.getenv(\"OPENAI_API_KEY\")\n",
374+
" if key and not getattr(openai, 'api_key', None):\n",
375+
" openai.api_key = key.strip()\n",
376+
" if not getattr(openai, 'api_key', None):\n",
377+
" raise ValueError(\"OpenAI API key not set. Set OPENAI_API_KEY env or run the key input cell.\")\n",
378+
"\n",
379+
"def _init_client():\n",
380+
" global _openai_client, _use_new\n",
381+
" if _openai_client is not None:\n",
382+
" return\n",
378383
" try:\n",
379384
" from openai import OpenAI\n",
380385
" _openai_client = OpenAI(api_key=openai.api_key)\n",
381-
" print('Reinitialized OpenAI client.')\n",
382-
" except Exception as e:\n",
383-
" print(f'Failed to reinitialize OpenAI client: {e}')\n",
386+
" _use_new = True\n",
387+
" except Exception:\n",
388+
" _openai_client = None\n",
384389
" _use_new = False\n",
385390
"\n",
391+
"def embed_texts(text_list, model=DEFAULT_EMBED_MODEL):\n",
392+
" _ensure_key(); _init_client()\n",
393+
" if _use_new and _openai_client is not None:\n",
394+
" resp = _openai_client.embeddings.create(model=model, input=text_list)\n",
395+
" return [d.embedding for d in resp.data]\n",
396+
" else:\n",
397+
" resp = openai.Embedding.create(model=model, input=text_list)\n",
398+
" return [d['embedding'] for d in resp['data']]\n",
399+
"\n",
400+
"def embed_text(text, model=DEFAULT_EMBED_MODEL):\n",
401+
" return embed_texts([text], model=model)[0]\n",
402+
"\n",
403+
"def chat_completion(messages, model=DEFAULT_CHAT_MODEL, temperature=0):\n",
404+
" _ensure_key(); _init_client()\n",
405+
" if _use_new and _openai_client is not None:\n",
406+
" resp = _openai_client.chat.completions.create(model=model, messages=messages, temperature=temperature)\n",
407+
" return resp.choices[0].message.content\n",
408+
" else:\n",
409+
" resp = openai.ChatCompletion.create(model=model, messages=messages, temperature=temperature)\n",
410+
" return resp['choices'][0]['message']['content']"
411+
]
412+
},
413+
{
414+
"cell_type": "code",
415+
"execution_count": 14,
416+
"id": "00b7c77b",
417+
"metadata": {},
418+
"outputs": [],
419+
"source": [
420+
"import json, time\n",
421+
"\n",
422+
"EMBED_MODEL = \"text-embedding-3-small\"\n",
386423
"BATCH_SIZE = 10\n",
387-
"MODEL = EMBED_MODEL\n",
388424
"MAX_RETRIES = 3\n",
389425
"\n",
426+
"# Fetch rows needing embeddings\n",
390427
"s2_cur.execute(\"SELECT element_id, text FROM unstructured_data WHERE text_embedding IS NULL OR text_embedding = '';\")\n",
391428
"rows = s2_cur.fetchall()\n",
392429
"print(f\"Rows needing embeddings: {len(rows)}\")\n",
393430
"\n",
394-
"use_new = _use_new\n",
395-
"\n",
396-
"def embed_batch(text_list):\n",
397-
" if use_new and _openai_client is not None:\n",
398-
" resp = _openai_client.embeddings.create(model=MODEL, input=text_list)\n",
399-
" return [item.embedding for item in resp.data]\n",
400-
" else:\n",
401-
" resp = openai.Embedding.create(model=MODEL, input=text_list)\n",
402-
" return [item['embedding'] for item in resp['data']]\n",
403-
"\n",
404431
"for i in range(0, len(rows), BATCH_SIZE):\n",
405432
" batch = rows[i:i+BATCH_SIZE]\n",
406433
" texts = [t for _, t in batch]\n",
407434
" attempt = 0\n",
408435
" while True:\n",
409436
" try:\n",
410-
" embeddings = embed_batch(texts)\n",
437+
" embeddings = embed_texts(texts, model=EMBED_MODEL)\n",
411438
" break\n",
439+
"\n",
412440
" except Exception as e:\n",
413441
" attempt += 1\n",
414442
" if attempt >= MAX_RETRIES:\n",
@@ -440,7 +468,7 @@
440468
},
441469
{
442470
"cell_type": "code",
443-
"execution_count": 14,
471+
"execution_count": 15,
444472
"id": "35e10fa7",
445473
"metadata": {},
446474
"outputs": [],
@@ -452,7 +480,7 @@
452480
},
453481
{
454482
"cell_type": "code",
455-
"execution_count": 15,
483+
"execution_count": 16,
456484
"id": "876a636b",
457485
"metadata": {},
458486
"outputs": [],
@@ -491,7 +519,7 @@
491519
},
492520
{
493521
"cell_type": "code",
494-
"execution_count": 16,
522+
"execution_count": 17,
495523
"id": "8a57d965",
496524
"metadata": {},
497525
"outputs": [],

0 commit comments

Comments
 (0)