From c21d4c484d051123bb3d493a3898679d7df4a627 Mon Sep 17 00:00:00 2001
From: Roberto Rodriguez <9653181+Cyb3rWard0g@users.noreply.github.com>
Date: Thu, 23 Jan 2025 01:32:27 -0500
Subject: [PATCH] added persistence for sentencetransformer models

---
 ...entencetransformers_all-MiniLM-L6-v2.ipynb | 193 ++++++++++++++----
 src/floki/document/embedder/sentence.py       |  27 ++-
 2 files changed, 182 insertions(+), 38 deletions(-)

diff --git a/cookbook/vectorstores/chroma_sentencetransformers_all-MiniLM-L6-v2.ipynb b/cookbook/vectorstores/chroma_sentencetransformers_all-MiniLM-L6-v2.ipynb
index e31bc5d..c44e0c4 100644
--- a/cookbook/vectorstores/chroma_sentencetransformers_all-MiniLM-L6-v2.ipynb
+++ b/cookbook/vectorstores/chroma_sentencetransformers_all-MiniLM-L6-v2.ipynb
@@ -34,6 +34,24 @@
     "!pip install floki-ai chromadb"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Enable Logging"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import logging\n",
+    "\n",
+    "logging.basicConfig(level=logging.INFO)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -45,7 +63,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -54,7 +72,7 @@
        "True"
       ]
      },
-     "execution_count": 1,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -75,14 +93,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:datasets:PyTorch version 2.5.1 available.\n",
+      "INFO:floki.document.embedder.sentence:Loading SentenceTransformer model from local path: model\n",
+      "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: model\n",
+      "INFO:floki.document.embedder.sentence:Model loaded successfully.\n"
+     ]
+    }
+   ],
    "source": [
     "from floki.document.embedder import SentenceTransformerEmbedder\n",
     "\n",
     "embedding_function = SentenceTransformerEmbedder(\n",
-    "    model=\"all-MiniLM-L6-v2\"\n",
+    "    model=\"all-MiniLM-L6-v2\",\n",
+    "    cache_dir=\"model\"\n",
     ")"
    ]
   },
@@ -97,9 +127,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:floki.storage.vectorstores.chroma:ChromaVectorStore initialized with collection: example_collection\n"
+     ]
+    }
+   ],
    "source": [
     "from floki.storage import ChromaVectorStore\n",
     "\n",
@@ -130,7 +168,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -190,9 +228,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:floki.document.embedder.sentence:Generating embeddings for 10 input(s).\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f2de0ae5fbe84c838b47b2cf393ca7ef",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches: 0%| | 0/1 [00:00]}"
      ]
     },
-     "execution_count": 13,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -418,16 +541,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "['example_collection']"
+       "[Collection(name=example_collection)]"
       ]
      },
-     "execution_count": 14,
"execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -438,7 +561,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -448,7 +571,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -457,7 +580,7 @@ "[]" ] }, - "execution_count": 16, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } diff --git a/src/floki/document/embedder/sentence.py b/src/floki/document/embedder/sentence.py index 10b48b9..3a0fa06 100644 --- a/src/floki/document/embedder/sentence.py +++ b/src/floki/document/embedder/sentence.py @@ -2,6 +2,7 @@ from typing import List, Any, Optional, Union, Literal from pydantic import Field import logging +import os logger = logging.getLogger(__name__) @@ -15,6 +16,7 @@ class SentenceTransformerEmbedder(EmbedderBase): device: Literal["cpu", "cuda", "mps", "npu"] = Field(default="cpu", description="Device for computation.") normalize_embeddings: bool = Field(default=False, description="Whether to normalize embeddings.") multi_process: bool = Field(default=False, description="Whether to use multi-process encoding.") + cache_dir: Optional[str] = Field(default=None, description="Directory to cache or load the model.") client: Optional[Any] = Field(default=None, init=False, description="Loaded SentenceTransformer model.") @@ -32,9 +34,28 @@ def model_post_init(self, __context: Any) -> None: "Install it using `pip install sentence-transformers`." ) - logger.info(f"Loading SentenceTransformer model: {self.model}") - self.client: SentenceTransformer = SentenceTransformer(model_name_or_path=self.model, device=self.device) - logger.info("Model loaded successfully.") + # Determine whether to load from cache or download + model_path = self.cache_dir if self.cache_dir and os.path.exists(self.cache_dir) else self.model + + # Attempt to load the model + try: + if os.path.exists(model_path): + logger.info(f"Loading SentenceTransformer model from local path: {model_path}") + else: + logger.info(f"Downloading SentenceTransformer model: {self.model}") + if self.cache_dir: + logger.info(f"Model will be cached to: {self.cache_dir}") + + self.client: SentenceTransformer = SentenceTransformer(model_name_or_path=model_path, device=self.device) + logger.info("Model loaded successfully.") + except Exception as e: + logger.error(f"Failed to load SentenceTransformer model: {e}") + raise + + # Save to cache directory if downloaded + if model_path == self.model and self.cache_dir and not os.path.exists(self.cache_dir): + logger.info(f"Saving the downloaded model to: {self.cache_dir}") + self.client.save(self.cache_dir) def embed(self, input: Union[str, List[str]]) -> Union[List[float], List[List[float]]]: """