|
36 | 36 | "metadata": {}, |
37 | 37 | "outputs": [], |
38 | 38 | "source": [ |
39 | | - "!pip install pdfplumber" |
| 39 | + "!pip install pdfplumber==0.11.0" |
40 | 40 | ] |
41 | 41 | }, |
42 | 42 | { |
|
53 | 53 | { |
54 | 54 | "cell_type": "code", |
55 | 55 | "execution_count": 2, |
| 56 | + "id": "87c6c286", |
56 | 57 | "metadata": {}, |
57 | 58 | "outputs": [], |
58 | 59 | "source": [ |
59 | | - "!pip install \"openai\"" |
60 | | - ], |
61 | | - "id": "87c6c286" |
| 60 | + "!pip install \"openai==0.28.1\"" |
| 61 | + ] |
62 | 62 | }, |
63 | 63 | { |
64 | 64 | "cell_type": "code", |
|
94 | 94 | { |
95 | 95 | "cell_type": "code", |
96 | 96 | "execution_count": 4, |
| 97 | + "id": "9dbe989a", |
97 | 98 | "metadata": {}, |
98 | 99 | "outputs": [], |
99 | 100 | "source": [ |
100 | 101 | "import os\n", |
101 | 102 | "from getpass import getpass\n", |
102 | 103 | "os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")" |
103 | | - ], |
104 | | - "id": "9dbe989a" |
| 104 | + ] |
105 | 105 | }, |
106 | 106 | { |
107 | 107 | "cell_type": "code", |
|
135 | 135 | "\n", |
136 | 136 | "\n", |
137 | 137 | "References:\n", |
138 | | - "- pdfplumber: https://github.com/jsvine/pdfplumber\n", |
139 | | - "- PyMuPDF (optional alternative): https://pymupdf.readthedocs.io/en/latest/" |
| 138 | + "- pdfplumber: <https://github.com/jsvine/pdfplumber>\n", |
| 139 | + "- PyMuPDF (optional alternative): <https://pymupdf.readthedocs.io/en/latest/>" |
140 | 140 | ] |
141 | 141 | }, |
142 | 142 | { |
143 | 143 | "cell_type": "markdown", |
| 144 | + "id": "050463a1", |
144 | 145 | "metadata": {}, |
145 | 146 | "source": [ |
146 | 147 | "## Uploading PDF File to Stage\n", |
|
149 | 150 | "\n", |
150 | 151 | "References:\n", |
151 | 152 | "- [Stage documentation](https://docs.singlestore.com/cloud/load-data/load-data-from-files/stage/)" |
152 | | - ], |
153 | | - "id": "050463a1" |
| 153 | + ] |
154 | 154 | }, |
155 | 155 | { |
156 | 156 | "cell_type": "code", |
157 | 157 | "execution_count": 6, |
| 158 | + "id": "91b47930", |
158 | 159 | "metadata": {}, |
159 | 160 | "outputs": [], |
160 | 161 | "source": [ |
161 | 162 | "%%sql\n", |
162 | | - "DOWNLOAD STAGE FILE 'Employee-Handbook.pdf' TO 'Employee-Handbook.pdf'OVERWRITE" |
163 | | - ], |
164 | | - "id": "91b47930" |
| 163 | + "DOWNLOAD STAGE FILE 'Employee-Handbook.pdf' TO 'Employee-Handbook.pdf' OVERWRITE" |
| 164 | + ] |
165 | 165 | }, |
166 | 166 | { |
167 | 167 | "cell_type": "code", |
|
176 | 176 | { |
177 | 177 | "cell_type": "code", |
178 | 178 | "execution_count": 8, |
| 179 | + "id": "53fc1109", |
179 | 180 | "metadata": {}, |
180 | 181 | "outputs": [], |
181 | 182 | "source": [ |
|
229 | 230 | " })\n", |
230 | 231 | "\n", |
231 | 232 | "print(f\"Chunking produced {len(chunks)} chunks.\")" |
232 | | - ], |
233 | | - "id": "53fc1109" |
| 233 | + ] |
234 | 234 | }, |
235 | 235 | { |
236 | 236 | "attachments": {}, |
|
358 | 358 | { |
359 | 359 | "cell_type": "code", |
360 | 360 | "execution_count": 13, |
361 | | - "id": "00b7c77b", |
| 361 | + "id": "2a82d48e", |
362 | 362 | "metadata": {}, |
363 | 363 | "outputs": [], |
364 | 364 | "source": [ |
365 | | - "import time, os\n", |
366 | | - "\n", |
367 | | - "# Ensure API key is set (fallback to environment if not already assigned)\n", |
368 | | - "if not getattr(openai, 'api_key', None):\n", |
369 | | - " env_key = os.getenv('OPENAI_API_KEY')\n", |
370 | | - " if env_key:\n", |
371 | | - " openai.api_key = env_key.strip()\n", |
372 | | - " print('Hydrated openai.api_key from environment variable.')\n", |
373 | | - " else:\n", |
374 | | - " raise ValueError('OpenAI API key not set. Set OPENAI_API_KEY env or rerun key input cell.')\n", |
375 | | - "\n", |
376 | | - "# Re-initialize new SDK client if available and was None\n", |
377 | | - "if _use_new and _openai_client is None:\n", |
| 365 | + "import os, time, json\n", |
| 366 | + "DEFAULT_EMBED_MODEL = \"text-embedding-3-small\"\n", |
| 367 | + "DEFAULT_CHAT_MODEL = \"gpt-4o-mini\"\n", |
| 368 | + "\n", |
| 369 | + "_openai_client = None\n", |
| 370 | + "_use_new = False\n", |
| 371 | + "\n", |
| 372 | + "def _ensure_key():\n", |
| 373 | + " key = os.getenv(\"OPENAI_API_KEY\")\n", |
| 374 | + " if key and not getattr(openai, 'api_key', None):\n", |
| 375 | + " openai.api_key = key.strip()\n", |
| 376 | + " if not getattr(openai, 'api_key', None):\n", |
| 377 | + " raise ValueError(\"OpenAI API key not set. Set OPENAI_API_KEY env or run the key input cell.\")\n", |
| 378 | + "\n", |
| 379 | + "def _init_client():\n", |
| 380 | + " global _openai_client, _use_new\n", |
| 381 | + " if _openai_client is not None:\n", |
| 382 | + " return\n", |
378 | 383 | " try:\n", |
379 | 384 | " from openai import OpenAI\n", |
380 | 385 | " _openai_client = OpenAI(api_key=openai.api_key)\n", |
381 | | - " print('Reinitialized OpenAI client.')\n", |
382 | | - " except Exception as e:\n", |
383 | | - " print(f'Failed to reinitialize OpenAI client: {e}')\n", |
| 386 | + " _use_new = True\n", |
| 387 | + " except Exception:\n", |
| 388 | + " _openai_client = None\n", |
384 | 389 | " _use_new = False\n", |
385 | 390 | "\n", |
| 391 | + "def embed_texts(text_list, model=DEFAULT_EMBED_MODEL):\n", |
| 392 | + " _ensure_key(); _init_client()\n", |
| 393 | + " if _use_new and _openai_client is not None:\n", |
| 394 | + " resp = _openai_client.embeddings.create(model=model, input=text_list)\n", |
| 395 | + " return [d.embedding for d in resp.data]\n", |
| 396 | + " else:\n", |
| 397 | + " resp = openai.Embedding.create(model=model, input=text_list)\n", |
| 398 | + " return [d['embedding'] for d in resp['data']]\n", |
| 399 | + "\n", |
| 400 | + "def embed_text(text, model=DEFAULT_EMBED_MODEL):\n", |
| 401 | + " return embed_texts([text], model=model)[0]\n", |
| 402 | + "\n", |
| 403 | + "def chat_completion(messages, model=DEFAULT_CHAT_MODEL, temperature=0):\n", |
| 404 | + " _ensure_key(); _init_client()\n", |
| 405 | + " if _use_new and _openai_client is not None:\n", |
| 406 | + " resp = _openai_client.chat.completions.create(model=model, messages=messages, temperature=temperature)\n", |
| 407 | + " return resp.choices[0].message.content\n", |
| 408 | + " else:\n", |
| 409 | + " resp = openai.ChatCompletion.create(model=model, messages=messages, temperature=temperature)\n", |
| 410 | + " return resp['choices'][0]['message']['content']" |
| 411 | + ] |
| 412 | + }, |
| 413 | + { |
| 414 | + "cell_type": "code", |
| 415 | + "execution_count": 14, |
| 416 | + "id": "00b7c77b", |
| 417 | + "metadata": {}, |
| 418 | + "outputs": [], |
| 419 | + "source": [ |
| 420 | + "import json, time\n", |
| 421 | + "\n", |
| 422 | + "EMBED_MODEL = \"text-embedding-3-small\"\n", |
386 | 423 | "BATCH_SIZE = 10\n", |
387 | | - "MODEL = EMBED_MODEL\n", |
388 | 424 | "MAX_RETRIES = 3\n", |
389 | 425 | "\n", |
| 426 | + "# Fetch rows needing embeddings\n", |
390 | 427 | "s2_cur.execute(\"SELECT element_id, text FROM unstructured_data WHERE text_embedding IS NULL OR text_embedding = '';\")\n", |
391 | 428 | "rows = s2_cur.fetchall()\n", |
392 | 429 | "print(f\"Rows needing embeddings: {len(rows)}\")\n", |
393 | 430 | "\n", |
394 | | - "use_new = _use_new\n", |
395 | | - "\n", |
396 | | - "def embed_batch(text_list):\n", |
397 | | - " if use_new and _openai_client is not None:\n", |
398 | | - " resp = _openai_client.embeddings.create(model=MODEL, input=text_list)\n", |
399 | | - " return [item.embedding for item in resp.data]\n", |
400 | | - " else:\n", |
401 | | - " resp = openai.Embedding.create(model=MODEL, input=text_list)\n", |
402 | | - " return [item['embedding'] for item in resp['data']]\n", |
403 | | - "\n", |
404 | 431 | "for i in range(0, len(rows), BATCH_SIZE):\n", |
405 | 432 | " batch = rows[i:i+BATCH_SIZE]\n", |
406 | 433 | " texts = [t for _, t in batch]\n", |
407 | 434 | " attempt = 0\n", |
408 | 435 | " while True:\n", |
409 | 436 | " try:\n", |
410 | | - " embeddings = embed_batch(texts)\n", |
| 437 | + " embeddings = embed_texts(texts, model=EMBED_MODEL)\n", |
411 | 438 | " break\n", |
| 439 | + "\n", |
412 | 440 | " except Exception as e:\n", |
413 | 441 | " attempt += 1\n", |
414 | 442 | " if attempt >= MAX_RETRIES:\n", |
|
440 | 468 | }, |
441 | 469 | { |
442 | 470 | "cell_type": "code", |
443 | | - "execution_count": 14, |
| 471 | + "execution_count": 15, |
444 | 472 | "id": "35e10fa7", |
445 | 473 | "metadata": {}, |
446 | 474 | "outputs": [], |
|
452 | 480 | }, |
453 | 481 | { |
454 | 482 | "cell_type": "code", |
455 | | - "execution_count": 15, |
| 483 | + "execution_count": 16, |
456 | 484 | "id": "876a636b", |
457 | 485 | "metadata": {}, |
458 | 486 | "outputs": [], |
|
491 | 519 | }, |
492 | 520 | { |
493 | 521 | "cell_type": "code", |
494 | | - "execution_count": 16, |
| 522 | + "execution_count": 17, |
495 | 523 | "id": "8a57d965", |
496 | 524 | "metadata": {}, |
497 | 525 | "outputs": [], |
|
0 commit comments