Fix some cells in the "Ingest PDFs with pdfplumber" notebook (#152)

lsingh4634426 · web-flow · commit 996bbca083a2 · 2025-11-19T09:55:44.000-06:00
* fix two cells

* add cell

* address comments

* address comments
diff --git a/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb b/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb
@@ -36,7 +36,7 @@
       "metadata": {},
       "outputs": [],
       "source": [
-        "!pip install pdfplumber"
+        "!pip install pdfplumber==0.11.0"
       ]
     },
     {
@@ -53,12 +53,12 @@
     {
       "cell_type": "code",
       "execution_count": 2,
+      "id": "87c6c286",
       "metadata": {},
       "outputs": [],
       "source": [
-        "!pip install \"openai\""
-      ],
-      "id": "87c6c286"
+        "!pip install \"openai==0.28.1\""
+      ]
     },
     {
       "cell_type": "code",
@@ -94,14 +94,14 @@
     {
       "cell_type": "code",
       "execution_count": 4,
+      "id": "9dbe989a",
       "metadata": {},
       "outputs": [],
       "source": [
         "import os\n",
         "from getpass import getpass\n",
         "os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")"
-      ],
-      "id": "9dbe989a"
+      ]
     },
     {
       "cell_type": "code",
@@ -135,12 +135,13 @@
         "\n",
         "\n",
         "References:\n",
-        "- pdfplumber: https://github.com/jsvine/pdfplumber\n",
-        "- PyMuPDF (optional alternative): https://pymupdf.readthedocs.io/en/latest/"
+        "- pdfplumber: <https://github.com/jsvine/pdfplumber>\n",
+        "- PyMuPDF (optional alternative): <https://pymupdf.readthedocs.io/en/latest/>"
       ]
     },
     {
       "cell_type": "markdown",
+      "id": "050463a1",
       "metadata": {},
       "source": [
         "## Uploading PDF File to Stage\n",
@@ -149,19 +150,18 @@
         "\n",
         "References:\n",
         "- [Stage documentation](https://docs.singlestore.com/cloud/load-data/load-data-from-files/stage/)"
-      ],
-      "id": "050463a1"
+      ]
     },
     {
       "cell_type": "code",
       "execution_count": 6,
+      "id": "91b47930",
       "metadata": {},
       "outputs": [],
       "source": [
         "%%sql\n",
-        "DOWNLOAD STAGE FILE 'Employee-Handbook.pdf' TO 'Employee-Handbook.pdf'OVERWRITE"
-      ],
-      "id": "91b47930"
+        "DOWNLOAD STAGE FILE 'Employee-Handbook.pdf' TO 'Employee-Handbook.pdf' OVERWRITE"
+      ]
     },
     {
       "cell_type": "code",
@@ -176,6 +176,7 @@
     {
       "cell_type": "code",
       "execution_count": 8,
+      "id": "53fc1109",
       "metadata": {},
       "outputs": [],
       "source": [
@@ -229,8 +230,7 @@
         "    })\n",
         "\n",
         "print(f\"Chunking produced {len(chunks)} chunks.\")"
-      ],
-      "id": "53fc1109"
+      ]
     },
     {
       "attachments": {},
@@ -358,57 +358,85 @@
     {
       "cell_type": "code",
       "execution_count": 13,
-      "id": "00b7c77b",
+      "id": "2a82d48e",
       "metadata": {},
       "outputs": [],
       "source": [
-        "import time, os\n",
-        "\n",
-        "# Ensure API key is set (fallback to environment if not already assigned)\n",
-        "if not getattr(openai, 'api_key', None):\n",
-        "    env_key = os.getenv('OPENAI_API_KEY')\n",
-        "    if env_key:\n",
-        "        openai.api_key = env_key.strip()\n",
-        "        print('Hydrated openai.api_key from environment variable.')\n",
-        "    else:\n",
-        "        raise ValueError('OpenAI API key not set. Set OPENAI_API_KEY env or rerun key input cell.')\n",
-        "\n",
-        "# Re-initialize new SDK client if available and was None\n",
-        "if _use_new and _openai_client is None:\n",
+        "import os, time, json\n",
+        "DEFAULT_EMBED_MODEL = \"text-embedding-3-small\"\n",
+        "DEFAULT_CHAT_MODEL = \"gpt-4o-mini\"\n",
+        "\n",
+        "_openai_client = None\n",
+        "_use_new = False\n",
+        "\n",
+        "def _ensure_key():\n",
+        "    key = os.getenv(\"OPENAI_API_KEY\")\n",
+        "    if key and not getattr(openai, 'api_key', None):\n",
+        "        openai.api_key = key.strip()\n",
+        "    if not getattr(openai, 'api_key', None):\n",
+        "        raise ValueError(\"OpenAI API key not set. Set OPENAI_API_KEY env or run the key input cell.\")\n",
+        "\n",
+        "def _init_client():\n",
+        "    global _openai_client, _use_new\n",
+        "    if _openai_client is not None:\n",
+        "        return\n",
         "    try:\n",
         "        from openai import OpenAI\n",
         "        _openai_client = OpenAI(api_key=openai.api_key)\n",
-        "        print('Reinitialized OpenAI client.')\n",
-        "    except Exception as e:\n",
-        "        print(f'Failed to reinitialize OpenAI client: {e}')\n",
+        "        _use_new = True\n",
+        "    except Exception:\n",
+        "        _openai_client = None\n",
         "        _use_new = False\n",
         "\n",
+        "def embed_texts(text_list, model=DEFAULT_EMBED_MODEL):\n",
+        "    _ensure_key(); _init_client()\n",
+        "    if _use_new and _openai_client is not None:\n",
+        "        resp = _openai_client.embeddings.create(model=model, input=text_list)\n",
+        "        return [d.embedding for d in resp.data]\n",
+        "    else:\n",
+        "        resp = openai.Embedding.create(model=model, input=text_list)\n",
+        "        return [d['embedding'] for d in resp['data']]\n",
+        "\n",
+        "def embed_text(text, model=DEFAULT_EMBED_MODEL):\n",
+        "    return embed_texts([text], model=model)[0]\n",
+        "\n",
+        "def chat_completion(messages, model=DEFAULT_CHAT_MODEL, temperature=0):\n",
+        "    _ensure_key(); _init_client()\n",
+        "    if _use_new and _openai_client is not None:\n",
+        "        resp = _openai_client.chat.completions.create(model=model, messages=messages, temperature=temperature)\n",
+        "        return resp.choices[0].message.content\n",
+        "    else:\n",
+        "        resp = openai.ChatCompletion.create(model=model, messages=messages, temperature=temperature)\n",
+        "        return resp['choices'][0]['message']['content']"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 14,
+      "id": "00b7c77b",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import json, time\n",
+        "\n",
+        "EMBED_MODEL = \"text-embedding-3-small\"\n",
         "BATCH_SIZE = 10\n",
-        "MODEL = EMBED_MODEL\n",
         "MAX_RETRIES = 3\n",
         "\n",
+        "# Fetch rows needing embeddings\n",
         "s2_cur.execute(\"SELECT element_id, text FROM unstructured_data WHERE text_embedding IS NULL OR text_embedding = '';\")\n",
         "rows = s2_cur.fetchall()\n",
         "print(f\"Rows needing embeddings: {len(rows)}\")\n",
         "\n",
-        "use_new = _use_new\n",
-        "\n",
-        "def embed_batch(text_list):\n",
-        "    if use_new and _openai_client is not None:\n",
-        "        resp = _openai_client.embeddings.create(model=MODEL, input=text_list)\n",
-        "        return [item.embedding for item in resp.data]\n",
-        "    else:\n",
-        "        resp = openai.Embedding.create(model=MODEL, input=text_list)\n",
-        "        return [item['embedding'] for item in resp['data']]\n",
-        "\n",
         "for i in range(0, len(rows), BATCH_SIZE):\n",
         "    batch = rows[i:i+BATCH_SIZE]\n",
         "    texts = [t for _, t in batch]\n",
         "    attempt = 0\n",
         "    while True:\n",
         "        try:\n",
-        "            embeddings = embed_batch(texts)\n",
+        "            embeddings = embed_texts(texts, model=EMBED_MODEL)\n",
         "            break\n",
+        "\n",
         "        except Exception as e:\n",
         "            attempt += 1\n",
         "            if attempt >= MAX_RETRIES:\n",
@@ -440,7 +468,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 14,
+      "execution_count": 15,
       "id": "35e10fa7",
       "metadata": {},
       "outputs": [],
@@ -452,7 +480,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 15,
+      "execution_count": 16,
       "id": "876a636b",
       "metadata": {},
       "outputs": [],
@@ -491,7 +519,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 16,
+      "execution_count": 17,
       "id": "8a57d965",
       "metadata": {},
       "outputs": [],