7 changes: 7 additions & 0 deletions examples/cookbook/smolagents/.env.example
@@ -0,0 +1,7 @@
# Moss credentials — get these from https://moss.dev
MOSS_PROJECT_ID=your_project_id
MOSS_PROJECT_KEY=your_project_key
MOSS_INDEX_NAME=your_index_name

# HuggingFace token — required for gated models (e.g. Llama-3)
HUGGING_FACE_HUB_TOKEN=your_hf_token
100 changes: 100 additions & 0 deletions examples/cookbook/smolagents/README.md
@@ -0,0 +1,100 @@
# Moss + Smolagents Cookbook

Use [Moss](https://moss.dev) semantic search as a retrieval tool for [Smolagents](https://huggingface.co/docs/smolagents) agents.

## Why Moss with Smolagents?

Traditional vector databases add 200–500 ms per retrieval hop. Moss loads the index and model weights directly into your application process, delivering **sub-10 ms** search — fast enough that the retrieval step effectively disappears from the agent's latency budget.

## Installation

We recommend [uv](https://docs.astral.sh/uv/) for dependency management.

```bash
uv sync
```

Or install dependencies directly:

```bash
uv pip install smolagents moss python-dotenv
```

## Setup

Copy the example env file and fill in your credentials:

```bash
cp .env.example .env
```

Required variables:

```env
MOSS_PROJECT_ID=your_project_id
MOSS_PROJECT_KEY=your_project_key
MOSS_INDEX_NAME=your_index_name
```

`HUGGING_FACE_HUB_TOKEN` is only needed if the chosen model is gated (e.g. Llama-3).

## Files

| File | Purpose |
|------|---------|
| `tool.py` | `MossRetrievalTool` — the reusable smolagents `Tool` subclass |
| `moss_smol_agent_demo.py` | End-to-end demo: load index, build agent, run a question |
| `test_integration.py` | Unit tests with mocked Moss client |

## Running the demo

```bash
uv run moss_smol_agent_demo.py
```

## How it works

### Loading the index

The index must be pulled into local memory **once** before the agent starts. This is the step that switches retrieval from cloud-round-trip speed to local speed:

```python
asyncio.run(client.load_index("my-index"))
```

Call this in your setup/startup code, not inside the tool, so the cost is paid once.
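To see why the placement matters, here is the load-once contract sketched with a stand-in client (`FakeIndexHost` is illustrative only, not the Moss API):

```python
import asyncio


class FakeIndexHost:
    """Stand-in client illustrating the load-once contract."""

    def __init__(self):
        self.loaded = set()
        self.load_calls = 0

    async def load_index(self, name: str) -> None:
        # The expensive step: pull index + model weights into process memory.
        self.load_calls += 1
        self.loaded.add(name)

    async def query(self, name: str, text: str) -> list[str]:
        if name not in self.loaded:
            raise RuntimeError("load_index() must run before the first query")
        return [f"hit for {text!r}"]


host = FakeIndexHost()
asyncio.run(host.load_index("my-index"))  # startup: paid once

# Every agent step afterwards hits only the in-memory index.
for q in ("refunds", "shipping"):
    asyncio.run(host.query("my-index", q))

print(host.load_calls)  # → 1
```

If `load_index` lived inside the tool instead, every agent step would repeat the expensive pull.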

### Async / sync bridge

Smolagents' `Tool.forward()` is synchronous, but `MossClient` is async. The tool solves this with a **persistent event loop running in a daemon thread**, started in `__init__`:

```python
self._loop = asyncio.new_event_loop()
self._thread = threading.Thread(target=self._loop.run_forever, daemon=True)
self._thread.start()
```

Each `forward()` call submits the coroutine to that loop and blocks until it completes:

```python
asyncio.run_coroutine_threadsafe(coro, self._loop).result()
```

This approach is better than `asyncio.run()` for two reasons:
- **No per-call overhead** — creating and tearing down an event loop on every search would add latency, defeating the purpose of local retrieval.
- **Works in Jupyter / async frameworks** — `asyncio.run()` raises `RuntimeError` when called from an already-running loop; `run_coroutine_threadsafe` does not.
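The same bridge in isolation, stripped of the Moss specifics (`AsyncBridge` and `fake_search` are illustrative names, not part of either library):

```python
import asyncio
import threading


class AsyncBridge:
    """Run coroutines from synchronous code via one persistent background loop."""

    def __init__(self):
        self._loop = asyncio.new_event_loop()
        self._thread = threading.Thread(target=self._loop.run_forever, daemon=True)
        self._thread.start()

    def run(self, coro):
        # Submit to the background loop; block the calling thread until done.
        return asyncio.run_coroutine_threadsafe(coro, self._loop).result()


async def fake_search(query: str) -> str:
    await asyncio.sleep(0)  # stand-in for an async Moss query
    return f"results for {query!r}"


bridge = AsyncBridge()
print(bridge.run(fake_search("refund policy")))  # → results for 'refund policy'
```

One caveat: `run()` must be called from a thread other than the loop's own daemon thread, otherwise `.result()` would deadlock waiting on a loop that is blocked.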

### Metadata filtering

Pass structured filters using the Moss filter DSL:

```python
metadata_filter = {
    "$and": [
        {"field": "category", "condition": {"$eq": "refunds"}},
        {"field": "price", "condition": {"$lt": 50}},
    ]
}
```

Available operators: `$eq`, `$ne`, `$gt`, `$gte`, `$lt`, `$lte`, `$in`, `$and`, `$or`.
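For instance, combining `$in` with a numeric bound (a sketch following the DSL shape above; confirm supported operator combinations against your Moss version):

```python
metadata_filter = {
    "$and": [
        {"field": "category", "condition": {"$in": ["refunds", "billing"]}},
        {"field": "price", "condition": {"$lt": 50}},
    ]
}

# The agent (or you, directly) passes it through the tool call:
# tool.forward("refund policy for digital goods", top_k=5, metadata_filter=metadata_filter)
```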
50 changes: 50 additions & 0 deletions examples/cookbook/smolagents/moss_smol_agent_demo.py
@@ -0,0 +1,50 @@
import asyncio
import os

from dotenv import load_dotenv
from smolagents import CodeAgent, InferenceClientModel

from moss import MossClient
from tool import MossRetrievalTool

load_dotenv()


def main():
    project_id = os.getenv("MOSS_PROJECT_ID")
    project_key = os.getenv("MOSS_PROJECT_KEY")
    index_name = os.getenv("MOSS_INDEX_NAME")

    if not all([project_id, project_key, index_name]):
        raise EnvironmentError(
            "Please set MOSS_PROJECT_ID, MOSS_PROJECT_KEY, and MOSS_INDEX_NAME "
            "in your environment or .env file."
        )

    client = MossClient(project_id, project_key)

    # Load the index into local memory before the agent runs.
    # This one-time setup is what enables sub-10ms retrieval inside the agent loop.
    print(f"Loading index '{index_name}' into local memory...")
    asyncio.run(client.load_index(index_name))
    print("Index loaded.\n")

    retrieval_tool = MossRetrievalTool(client, index_name)

    # InferenceClientModel uses the HuggingFace Inference API.
    # Set HUGGING_FACE_HUB_TOKEN in your .env if the model requires authentication.
    model = InferenceClientModel("meta-llama/Llama-3.3-70B-Instruct")
    agent = CodeAgent(tools=[retrieval_tool], model=model, add_base_tools=True)

    question = "What is the policy for processing refunds for digital goods?"
    print(f"Question: {question}")
    print("-" * 50)

    response = agent.run(question)

    print("\n--- Agent Response ---")
    print(response)


if __name__ == "__main__":
    main()
30 changes: 30 additions & 0 deletions examples/cookbook/smolagents/pyproject.toml
@@ -0,0 +1,30 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "smolagents-moss"
version = "0.1.0"
description = "Smolagents integration for Moss semantic search"
readme = "README.md"
requires-python = ">=3.11"
license = { text = "BSD-2-Clause" }
authors = [
    { name = "InferEdge Inc.", email = "contact@moss.dev" }
]
dependencies = [
    "smolagents",
    "moss>=1.0.0",
    "python-dotenv",
]

[tool.hatch.build.targets.wheel]
packages = ["tool.py"]

[tool.hatch.build.targets.sdist]
include = [
    "README.md",
    "tool.py",
    "moss_smol_agent_demo.py",
    ".env.example",
]
64 changes: 64 additions & 0 deletions examples/cookbook/smolagents/test_integration.py
@@ -0,0 +1,64 @@
import unittest
from unittest.mock import MagicMock, patch

from tool import MossRetrievalTool


class TestMossRetrievalTool(unittest.TestCase):
    def setUp(self):
        self.mock_client = MagicMock()
        self.tool = MossRetrievalTool(self.mock_client, "test-index")

    def tearDown(self):
        self.tool._loop.call_soon_threadsafe(self.tool._loop.stop)
        self.tool._thread.join(timeout=1)

    @patch.object(MossRetrievalTool, "_run_async")
    def test_forward_formats_results(self, mock_run_async):
        mock_docs = [
            MagicMock(id="d1", text="First result content", score=0.9),
            MagicMock(id="d2", text="Second result content", score=0.8),
        ]
        mock_run_async.return_value = MagicMock(docs=mock_docs)

        result = self.tool.forward("test query", top_k=2)

        self.assertIn("Result ID: d1", result)
        self.assertIn("First result content", result)
        self.assertIn("Score: 0.900", result)
        self.assertIn("Result ID: d2", result)
        self.assertIn("Second result content", result)
        self.assertIn("Score: 0.800", result)

    @patch.object(MossRetrievalTool, "_run_async")
    def test_forward_empty_results(self, mock_run_async):
        mock_run_async.return_value = MagicMock(docs=[])
        result = self.tool.forward("empty query")
        self.assertEqual(result, "")

    @patch.object(MossRetrievalTool, "_run_async")
    def test_forward_passes_metadata_filter(self, mock_run_async):
        mock_run_async.return_value = MagicMock(docs=[])
        filt = {"$and": [{"field": "category", "condition": {"$eq": "refunds"}}]}
        self.tool.forward("query", top_k=3, metadata_filter=filt)

        # _run_async receives an already-built coroutine, so the filter is not
        # inspectable from call_args here; verify the search call happened.
        self.assertTrue(mock_run_async.called)

    @patch.object(MossRetrievalTool, "_run_async")
    def test_forward_propagates_errors(self, mock_run_async):
        mock_run_async.side_effect = RuntimeError("connection failed")
        with self.assertRaisesRegex(RuntimeError, "connection failed"):
            self.tool.forward("error query")

    def test_tool_schema(self):
        self.assertEqual(self.tool.name, "moss_retrieval")
        self.assertIn("query", self.tool.inputs)
        self.assertIn("top_k", self.tool.inputs)
        self.assertIn("metadata_filter", self.tool.inputs)
        self.assertEqual(self.tool.output_type, "string")


if __name__ == "__main__":
    unittest.main()
65 changes: 65 additions & 0 deletions examples/cookbook/smolagents/tool.py
@@ -0,0 +1,65 @@
import asyncio
import threading
from typing import Any, Dict, Optional

from smolagents import Tool
from moss import MossClient, QueryOptions


class MossRetrievalTool(Tool):
    """Smolagents Tool that runs semantic search against a locally loaded Moss index."""

    name = "moss_retrieval"
    description = (
        "Finds relevant information from a knowledge base using semantic search. "
        "Use this when the answer is likely contained in indexed documents."
    )
    inputs = {
        "query": {
            "type": "string",
            "description": "The search query string.",
        },
        "top_k": {
            "type": "integer",
            "description": "Number of results to return (default: 5).",
            "nullable": True,
            "default": 5,
        },
        "metadata_filter": {
            "type": "object",
            "description": (
                "Optional filter using the Moss filter DSL. "
                "Example: {'$and': [{'field': 'category', 'condition': {'$eq': 'refunds'}}]}"
            ),
            "nullable": True,
        },
    }
    output_type = "string"

    def __init__(self, client: MossClient, index_name: str):
        super().__init__()
        self.client = client
        self.index_name = index_name
        # A persistent event loop in a daemon thread avoids two problems:
        # 1. Per-call loop creation/teardown overhead (kills sub-10ms latency).
        # 2. RuntimeError when forward() is called from an already-running loop
        #    (Jupyter notebooks, async frameworks).
        self._loop = asyncio.new_event_loop()
        self._thread = threading.Thread(target=self._loop.run_forever, daemon=True)
        self._thread.start()

    def _run_async(self, coro) -> Any:
        return asyncio.run_coroutine_threadsafe(coro, self._loop).result()

    def forward(
        self,
        query: str,
        top_k: int = 5,
        metadata_filter: Optional[Dict[str, Any]] = None,
    ) -> str:
        options = QueryOptions(top_k=top_k, filter=metadata_filter)
        results = self._run_async(self.client.query(self.index_name, query, options))
        return "\n\n".join(
            f"--- Result ID: {doc.id} (Score: {doc.score:.3f}) ---\n{doc.text}"
            for doc in results.docs
        )