diff --git a/.gitignore b/.gitignore index b18528112..c8b7382aa 100644 --- a/.gitignore +++ b/.gitignore @@ -2,10 +2,21 @@ logs tmp wandb +make_test_log.xml +runs/ +runs_old/ +runs_v0/ +output/ + +#Test gaia +wb/ +pdb5wb7.ent +downloads_folder/ +model_performance_comparison.png +langfuse_test.py # Data data -outputs data/ # Apple @@ -148,8 +159,11 @@ interpreter_workspace/ # Archive archive/ savedir/ -output/ +#output/ tool_output/ # Gradio runtime -.gradio/ \ No newline at end of file +.gradio/ + +#Other cache +.ruff_cache/ \ No newline at end of file diff --git a/Makefile b/Makefile index 602248712..98cc19564 100644 --- a/Makefile +++ b/Makefile @@ -14,4 +14,4 @@ style: # Run smolagents tests test: - pytest ./tests/ \ No newline at end of file + pytest ./tests/ --junitxml=make_test_log.xml \ No newline at end of file diff --git a/README.md b/README.md index dac23c04b..61e58c643 100644 --- a/README.md +++ b/README.md @@ -30,56 +30,98 @@ limitations under the License. -`smolagents` is a library that enables you to run powerful agents in a few lines of code. It offers: +`smolagents` is a lightweight library for building autonomous agents that solve tasks using code and tools. Agents run as independent processes and communicate directly via message queues using the `SendMessageTool` and `ReceiveMessagesTool`. Each agent maintains its own queue in a shared dictionary for decentralized task processing. -✨ **Simplicity**: the logic for agents fits in ~1,000 lines of code (see [agents.py](https://github.com/huggingface/smolagents/blob/main/src/smolagents/agents.py)). We kept abstractions to their minimal shape above raw code! -πŸ§‘β€πŸ’» **First-class support for Code Agents**. Our [`CodeAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.CodeAgent) writes its actions in code (as opposed to "agents being used to write code"). 
To make it secure, we support executing in sandboxed environments via [E2B](https://e2b.dev/), [Modal](https://modal.com/), Docker, or Pyodide+Deno WebAssembly sandbox. +✨ **Simplicity**: The core logic fits in ~1,000 lines of code (see [agents.py](https://github.com/huggingface/smolagents/blob/main/src/smolagents/agents.py)), keeping abstractions minimal. +πŸ§‘β€πŸ’» **First-class support for Code Agents**. Our [`CodeAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.CodeAgent) writes its actions in code (as opposed to "agents being used to write code"). To make it secure, we support executing in sandboxed environments via [E2B](https://e2b.dev/), Docker, or Pyodide+Deno WebAssembly sandbox. -πŸ€— **Hub integrations**: you can [share/pull tools or agents to/from the Hub](https://huggingface.co/docs/smolagents/reference/tools#smolagents.Tool.from_hub) for instant sharing of the most efficient agents! -🌐 **Model-agnostic**: smolagents supports any LLM. It can be a local `transformers` or `ollama` model, one of [many providers on the Hub](https://huggingface.co/blog/inference-providers), or any model from OpenAI, Anthropic and many others via our [LiteLLM](https://www.litellm.ai/) integration. +πŸ€— **Hub integrations**: Share or pull tools and agents to/from the Hub for instant collaboration (see [Tool.from_hub](https://huggingface.co/docs/smolagents/reference/tools#smolagents.Tool.from_hub)). -πŸ‘οΈ **Modality-agnostic**: Agents support text, vision, video, even audio inputs! Cf [this tutorial](https://huggingface.co/docs/smolagents/examples/web_browser) for vision. +🌐 **Model-agnostic**: Supports any LLM, including local `transformers` or `ollama` models, [HF inference providers](https://huggingface.co/blog/inference-providers), or models from OpenAI, Anthropic, and others via [LiteLLM](https://www.litellm.ai/). 
-πŸ› οΈ **Tool-agnostic**: you can use tools from any [MCP server](https://huggingface.co/docs/smolagents/reference/tools#smolagents.ToolCollection.from_mcp), from [LangChain](https://huggingface.co/docs/smolagents/reference/tools#smolagents.Tool.from_langchain), you can even use a [Hub Space](https://huggingface.co/docs/smolagents/reference/tools#smolagents.Tool.from_space) as a tool. +πŸ‘οΈ **Modality-agnostic**: Agents handle text, vision, video, and audio inputs (see [vision tutorial](https://huggingface.co/docs/smolagents/examples/web_browser)). -Full documentation can be found [here](https://huggingface.co/docs/smolagents/index). +πŸ› οΈ **Tool-agnostic**: Use tools from [MCP servers](https://huggingface.co/docs/smolagents/reference/tools#smolagents.ToolCollection.from_mcp), [LangChain](https://huggingface.co/docs/smolagents/reference/tools#smolagents.Tool.from_langchain), or [Hub Spaces](https://huggingface.co/docs/smolagents/reference/tools#smolagents.Tool.from_space). + +Full documentation is available [here](https://huggingface.co/docs/smolagents/index). > [!NOTE] -> Check the our [launch blog post](https://huggingface.co/blog/smolagents) to learn more about `smolagents`! +> Check our [launch blog post](https://huggingface.co/blog/smolagents) to learn more about `smolagents`! + +## Quick Demo -## Quick demo +Install the package with default tools: -First install the package with a default set of tools: ```bash pip install "smolagents[toolkit]" ``` -Then define your agent, give it the tools it needs and run it! 
-```py -from smolagents import CodeAgent, WebSearchTool, InferenceClientModel -model = InferenceClientModel() -agent = CodeAgent(tools=[WebSearchTool()], model=model, stream_outputs=True) +Set your Hugging Face API key: -agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?") +```bash +export HF_TOKEN=your_huggingface_api_key_here ``` -https://github.com/user-attachments/assets/84b149b4-246c-40c9-a48d-ba013b08e600 +Run multiple agents to solve a task collaboratively: -You can even share your agent to the Hub, as a Space repository: -```py -agent.push_to_hub("m-ric/my_agent") +```python +from multiprocessing import Manager, Process +from smolagents import ( + CodeAgent, + InferenceClientModel, + WebSearchTool, + SendMessageTool, + ReceiveMessagesTool, +) +def start_agent(agent_id, queue_dict, task=None): + model = InferenceClientModel() + tools = [ + WebSearchTool(), + SendMessageTool(queue_dict, agent_id), + ReceiveMessagesTool(queue_dict, agent_id), + ] + agent = CodeAgent( + tools=tools, + model=model, + additional_authorized_imports=["numpy", "pandas"], + ) + agent.run(task=task) + +if __name__ == "__main__": + task = "How many seconds would it take for a leopard at full speed to run through Pont des Arts?" + num_agents = 2 + with Manager() as manager: + queue_dict = manager.dict() + for i in range(num_agents): + queue_dict[i] = manager.Queue() + processes = [ + Process(target=start_agent, args=(i, queue_dict, task if i == 0 else None)) + for i in range(num_agents) + ] + for p in processes: + p.start() + for p in processes: + p.join() +``` + +This launches two `CodeAgent` instances. Agent 0 processes the task and may send subtasks (e.g., code or search results) to Agent 1 via message queues using `SendMessageTool` and `ReceiveMessagesTool`. Agents use tools like `web_search` or `python_interpreter` and return results with `final_answer()`. 
+ +You can share your agent to the Hub as a Space repository: + +```python +agent.push_to_hub("m-ric/my_agent") # agent.from_hub("m-ric/my_agent") to load an agent from Hub ``` -Our library is LLM-agnostic: you could switch the example above to any inference provider. +`smolagents` is LLM-agnostic. Switch the model as needed:
- InferenceClientModel, gateway for all inference providers supported on HF + InferenceClientModel (HF inference providers) -```py +```python from smolagents import InferenceClientModel model = InferenceClientModel( @@ -89,9 +131,9 @@ model = InferenceClientModel( ```
- LiteLLM to access 100+ LLMs + LiteLLM (100+ LLMs) -```py +```python from smolagents import LiteLLMModel model = LiteLLMModel( @@ -104,36 +146,35 @@ model = LiteLLMModel(
OpenAI-compatible servers: Together AI -```py +```python import os from smolagents import OpenAIModel model = OpenAIModel( model_id="deepseek-ai/DeepSeek-R1", - api_base="https://api.together.xyz/v1/", # Leave this blank to query OpenAI servers. - api_key=os.environ["TOGETHER_API_KEY"], # Switch to the API key for the server you're targeting. + api_base="https://api.together.xyz/v1/", + api_key=os.environ["TOGETHER_API_KEY"], ) ```
OpenAI-compatible servers: OpenRouter -```py +```python import os from smolagents import OpenAIModel model = OpenAIModel( model_id="openai/gpt-4o", - api_base="https://openrouter.ai/api/v1", # Leave this blank to query OpenAI servers. - api_key=os.environ["OPENROUTER_API_KEY"], # Switch to the API key for the server you're targeting. + api_base="https://openrouter.ai/api/v1", + api_key=os.environ["OPENROUTER_API_KEY"], ) ``` -
Local `transformers` model -```py +```python from smolagents import TransformersModel model = TransformersModel( @@ -146,126 +187,96 @@ model = TransformersModel(
Azure models -```py +```python import os from smolagents import AzureOpenAIModel + model = AzureOpenAIModel( model_id = os.environ.get("AZURE_OPENAI_MODEL"), + azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"), api_key=os.environ.get("AZURE_OPENAI_API_KEY"), - api_version=os.environ.get("OPENAI_API_VERSION") + api_version=os.environ.get("OPENAI_API_VERSION") ) ```
Amazon Bedrock models -```py +```python import os from smolagents import AmazonBedrockModel + model = AmazonBedrockModel( model_id = os.environ.get("AMAZON_BEDROCK_MODEL_ID") + ) ```
## CLI -You can run agents from CLI using two commands: `smolagent` and `webagent`. +Run agents from the CLI using `smolagent` or `webagent`. -`smolagent` is a generalist command to run a multi-step `CodeAgent` that can be equipped with various tools. +`smolagent` runs multiple `CodeAgent` or `ToolCallingAgent` instances that collaborate via message queues: ```bash + smolagent "Plan a trip to Tokyo, Kyoto and Osaka between Mar 28 and Apr 7." --model-type "InferenceClientModel" --model-id "Qwen/Qwen3-Next-80B-A3B-Thinking" --imports pandas numpy --tools web_search ``` -Meanwhile `webagent`Β is a specific web-browsing agent using [helium](https://github.com/mherrmann/helium) (read more [here](https://github.com/huggingface/smolagents/blob/main/src/smolagents/vision_web_browser.py)). +`webagent` is a specific web-browsing agent using [helium](https://github.com/mherrmann/helium) (read more [here](https://github.com/huggingface/smolagents/blob/main/src/smolagents/vision_web_browser.py)): -For instance: ```bash + webagent "go to xyz.com/men, get to sale section, click the first clothing item you see. Get the product details, and the price, return them. note that I'm shopping from France" --model-type "LiteLLMModel" --model-id "gpt-5" ``` -## How do Code agents work? - -Our [`CodeAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.CodeAgent) works mostly like classical ReAct agents - the exception being that the LLM engine writes its actions as Python code snippets. 
- -```mermaid -flowchart TB - Task[User Task] - Memory[agent.memory] - Generate[Generate from agent.model] - Execute[Execute Code action - Tool calls are written as functions] - Answer[Return the argument given to 'final_answer'] - - Task -->|Add task to agent.memory| Memory - - subgraph ReAct[ReAct loop] - Memory -->|Memory as chat messages| Generate - Generate -->|Parse output to extract code action| Execute - Execute -->|No call to 'final_answer' tool => Store execution logs in memory and keep running| Memory - end - - Execute -->|Call to 'final_answer' tool| Answer - - %% Styling - classDef default fill:#d4b702,stroke:#8b7701,color:#ffffff - classDef io fill:#4a5568,stroke:#2d3748,color:#ffffff - - class Task,Answer io -``` +## How Do Agents Work? -Actions are now Python code snippets. Hence, tool calls will be performed as Python function calls. For instance, here is how the agent can perform web search over several websites in one single action: -```py -requests_to_search = ["gulf of mexico america", "greenland denmark", "tariffs"] -for request in requests_to_search: - print(f"Here are the search results for {request}:", web_search(request)) -``` +Agents in `smolagents` run as independent processes. Each agent has a queue inside a shared dictionary, and they communicate by sending messages with `SendMessageTool` and retrieving them with `ReceiveMessagesTool`. This decentralized approach eliminates the need for a centralized ReAct loop. -Writing actions as code snippets is demonstrated to work better than the current industry practice of letting the LLM output a dictionary of the tools it wants to call: [uses 30% fewer steps](https://huggingface.co/papers/2402.01030) (thus 30% fewer LLM calls) and [reaches higher performance on difficult benchmarks](https://huggingface.co/papers/2411.01747). Head to [our high-level intro to agents](https://huggingface.co/docs/smolagents/conceptual_guides/intro_agents) to learn more on that. 
+[`CodeAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.CodeAgent) writes actions as Python code snippets, executed securely in sandboxed environments (e.g., [E2B](https://e2b.dev/) or Docker). Code-based actions [use 30% fewer steps](https://huggingface.co/papers/2402.01030) and [achieve higher performance](https://huggingface.co/papers/2411.01747) compared to traditional tool-calling methods. -Especially, since code execution can be a security concern (arbitrary code execution!), we provide options at runtime: - - a secure python interpreter to run code more safely in your environment (more secure than raw code execution but still risky) - - a sandboxed environment using [E2B](https://e2b.dev/) or Docker (removes the risk to your own system). +[`ToolCallingAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.ToolCallingAgent) writes actions as JSON blobs, suitable for tasks requiring structured tool calls. Both agent types support collaborative workflows via message queues. -Alongside [`CodeAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.CodeAgent), we also provide the standard [`ToolCallingAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.ToolCallingAgent) which writes actions as JSON/text blobs. You can pick whichever style best suits your use case. +Example workflow for two agents solving "Compute 5 + 3": -## How smol is this library? +- **Agent 0**: Generates code (`result = 5 + 3; print(result)`) and sends it to Agent 1. +- **Agent 1**: Receives the code, executes it using `python_interpreter`, and returns the result with `final_answer()`. -We strived to keep abstractions to a strict minimum: the main code in `agents.py` has <1,000 lines of code. -Still, we implement several types of agents: `CodeAgent` writes its actions as Python code snippets, and the more classic `ToolCallingAgent` leverages built-in tool calling methods. 
We also have multi-agent hierarchies, import from tool collections, remote code execution, vision models... +See [our intro to agents](https://huggingface.co/docs/smolagents/conceptual_guides/intro_agents) for more details. -By the way, why use a framework at all? Well, because a big part of this stuff is non-trivial. For instance, the code agent has to keep a consistent format for code throughout its system prompt, its parser, the execution. So our framework handles this complexity for you. But of course we still encourage you to hack into the source code and use only the bits that you need, to the exclusion of everything else! +## How Smol Is This Library? -## How strong are open models for agentic workflows? +The core logic in `agents.py` is <1,000 lines, minimizing abstractions. We support `CodeAgent` (Python code actions), `ToolCallingAgent` (JSON actions), multi-agent collaboration, tool collections, remote code execution, and vision models. The framework handles complex tasks like consistent code formatting, parsing, and secure execution, but you can hack into the source code to use only what you need. -We've created [`CodeAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.CodeAgent) instances with some leading models, and compared them on [this benchmark](https://huggingface.co/datasets/m-ric/agents_medium_benchmark_2) that gathers questions from a few different benchmarks to propose a varied blend of challenges. +## How Strong Are Open Models for Agentic Workflows? -[Find the benchmarking code here](https://github.com/huggingface/smolagents/blob/main/examples/smolagents_benchmark/run.py) for more detail on the agentic setup used, and see a comparison of using LLMs code agents compared to vanilla (spoilers: code agents works better). 
+We've benchmarked [`CodeAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.CodeAgent) with leading models on [this benchmark](https://huggingface.co/datasets/m-ric/agents_medium_benchmark_2), which combines a varied blend of challenges. [See the benchmarking code](https://github.com/huggingface/smolagents/blob/main/examples/smolagents_benchmark/run.py) for details. Open-source models like DeepSeek-R1 often outperform closed-source models in agentic tasks.

benchmark of different models on agentic workflows. Open model DeepSeek-R1 beats closed-source models.

-This comparison shows that open-source models can now take on the best closed models! - ## Security + Security is a critical consideration when working with code-executing agents. Our library provides: - Sandboxed execution options using [E2B](https://e2b.dev/), [Modal](https://modal.com/), Docker, or Pyodide+Deno WebAssembly sandbox - Best practices for running agent code securely -For security policies, vulnerability reporting, and more information on secure agent execution, please see our [Security Policy](SECURITY.md). +See our [Security Policy](SECURITY.md) for vulnerability reporting and secure execution guidelines. ## Contribute -Everyone is welcome to contribute, get started with our [contribution guide](https://github.com/huggingface/smolagents/blob/main/CONTRIBUTING.md). +Everyone is welcome to contribute. See our [contribution guide](https://github.com/huggingface/smolagents/blob/main/CONTRIBUTING.md). ## Cite smolagents -If you use `smolagents` in your publication, please cite it by using the following BibTeX entry. +If you use `smolagents` in your publication, please cite it: ```bibtex @Misc{smolagents, @@ -274,4 +285,4 @@ If you use `smolagents` in your publication, please cite it by using the followi howpublished = {\url{https://github.com/huggingface/smolagents}}, year = {2025} } -``` +``` \ No newline at end of file diff --git a/examples/decentralized_smolagents_benchmark/README.md b/examples/decentralized_smolagents_benchmark/README.md new file mode 100644 index 000000000..3b021f877 --- /dev/null +++ b/examples/decentralized_smolagents_benchmark/README.md @@ -0,0 +1,186 @@ +# Decentralized smolagents Benchmark + +This folder contains a decentralized multi-agent system implementation for benchmarking against the smolagents benchmark dataset. The system coordinates multiple specialized agents working collaboratively to solve complex problems. 
+ +## Overview + +The decentralized approach distributes problem-solving across multiple specialized agents that communicate and coordinate through a message-passing system with consensus mechanisms. This contrasts with the centralized approach where a single agent has access to all tools. + +### Architecture + +The system consists of: + +- **4 Specialized Agents**: + - **CodeAgent**: Handles code execution and computational tasks + - **WebSearchAgent**: Performs web searches and information retrieval + - **DeepResearchAgent**: Conducts in-depth research using web browsing + - **DocumentReaderAgent**: Reads and analyzes various document formats + +- **Message Store**: Central communication hub for agent coordination +- **Consensus Protocol**: Voting mechanism for final answer agreement + +## Files + +- **`decentralized_agent.py`**: Main entry point for running a single question through the decentralized agent team +- **`run.py`**: Benchmark runner that evaluates the decentralized system across the entire benchmark dataset +- **`run_centralized.py`**: Comparison implementation using a centralized agent approach +- **`requirements.txt`**: Python dependencies required for the project +- **`scripts/`**: Supporting modules for agents, tools, communication, and utilities + +### Key Scripts + +- `scripts/agents.py`: Agent definitions and team coordination logic +- `scripts/message_store.py`: Message-passing infrastructure for agent communication +- `scripts/consensus_protocol.py`: Voting mechanism for reaching consensus on answers +- `scripts/decentralized_tools.py`: Custom tools for decentralized agent communication +- `scripts/text_web_browser.py`: Text-based web browsing tools +- `scripts/text_inspector_tool.py`: Document reading and analysis tools +- `scripts/visual_qa.py`: Visual question answering capabilities +- `scripts/html_renderer.py`: HTML visualization of agent runs +- `scripts/convert_messages_to_html.py`: Convert message logs to HTML format +- 
`scripts/gaia_scorer.py`: Scoring utilities for GAIA benchmark format + +## Installation + +1. Install the required dependencies: +```bash +pip install -r requirements.txt +``` + +2. Set up your environment variables in a `.env` file: +```bash +# API Keys +OPENAI_API_KEY=your_openai_key # Or the API key for whichever model provider you use +ANTHROPIC_API_KEY=your_anthropic_key # Or the API key for whichever model provider you use +SERPAPI_API_KEY=your_serpapi_key # For web search functionality +LANGFUSE_PUBLIC_KEY=your_langfuse_public_key # Optional: for tracing +LANGFUSE_SECRET_KEY=your_langfuse_secret_key # Optional: for tracing +LANGFUSE_HOST=your_langfuse_host # Optional: for tracing +``` + +## Usage + +### Running a Single Question + +Use `decentralized_agent.py` to run a single question through the decentralized team (replace the `--model-id` and `--provider` values with any other model and provider you want to use): + +```bash +python decentralized_agent.py \ + --model-type LiteLLMModel \ + --model-id gpt-4o \ + --provider openai \ + "What is the half of the speed of a Leopard?" 
+``` + +**Arguments:** +- `--model-type`: Model type to use (required; e.g., `LiteLLMModel`) +- `--model-id`: Specific model identifier (required; e.g., `gpt-4o`, `claude-3-5-sonnet-20241022`) +- `--provider`: Model provider (optional; e.g., `openai`, `anthropic`, `hf-inference`) +- `question`: The question to answer (positional argument) + +**Output:** +- Creates a `runs/{run_id}/` directory with: + - `run.log`: JSON-formatted execution logs + - Agent interaction traces and message history + +### Running the Full Benchmark + +Use `run.py` to evaluate across the entire benchmark dataset (replace the `--model-id` and `--provider` values with any other model and provider you want to use): + +```bash +python run.py \ + --model-type LiteLLMModel \ + --model-id gpt-4o \ + --provider openai \ + --parallel-workers 4 +``` + +**Arguments:** +- `--date`: Date string for the evaluation (default: current date) +- `--eval-dataset`: Dataset to evaluate on (default: `smolagents/benchmark-v1`) +- `--model-type`: Model type to use (required) +- `--model-id`: Specific model identifier (required) +- `--provider`: Model provider (default: `hf-inference`) +- `--parallel-workers`: Number of concurrent benchmark runs (default: 4) +- `--num-examples`: Limit examples per task for testing (optional) +- `--push-answers-to-hub`: Push results to the Hugging Face Hub +- `--answers-dataset`: Dataset name for answers (default: `smolagents/answers`) + +**Output:** +- `output/results_{date}_{model_id}.csv`: Benchmark results +- `output/answers_{date}_{model_id}.json`: Generated answers +- Individual run directories under `runs/` + +### Running the Centralized Baseline + +For comparison, run the centralized agent (again, replace the `--model-id` and `--provider` values as needed): + +```bash +python run_centralized.py \ + --model-type LiteLLMModel \ + --model-id gpt-4o \ + --provider openai \ + --parallel-workers 4 +``` + +This script uses the same arguments as `run.py`. 
+ +## Features + +### Decentralized Coordination + +- **Message-Based Communication**: Agents communicate through a shared message store +- **Consensus Protocol**: Multiple agents must agree on the final answer through voting +- **Specialized Roles**: Each agent has specific capabilities and responsibilities +- **Parallel Execution**: Agents can work concurrently on different aspects of the problem + +### Monitoring & Observability + +- **Langfuse Integration**: Optional tracing and monitoring of agent interactions +- **JSON Logging**: Structured logs for debugging and analysis +- **HTML Visualization**: Convert message logs to interactive HTML reports +- **Run Tracking**: Unique run IDs for tracking individual executions + +### Tool Capabilities + +The agents have access to various tools: +- Python code execution +- Google search +- Web browsing (text-based) +- Document reading (PDF, DOCX, PPTX, etc.) +- Visual question answering +- File downloads +- Archive searching + +## Project Structure + +``` +decentralized_smolagents_benchmark/ +├── decentralized_agent.py # Single question entry point +├── run.py # Benchmark runner (decentralized) +├── run_centralized.py # Benchmark runner (centralized baseline) +├── requirements.txt # Dependencies +├── scripts/ # Supporting modules +│ ├── agents.py # Agent definitions +│ ├── message_store.py # Communication infrastructure +│ ├── consensus_protocol.py # Voting mechanism +│ ├── decentralized_tools.py # Communication tools +│ ├── text_web_browser.py # Web browsing tools +│ ├── text_inspector_tool.py # Document tools +│ ├── visual_qa.py # Visual QA +│ ├── html_renderer.py # HTML visualization +│ └── ... 
# Other utilities +├── runs/ # Individual run outputs (created at runtime) +└── output/ # Benchmark results (created at runtime) +``` + +## Contributing + +When contributing to this project, please follow the guidelines in the root-level `AGENTS.md`: +- Follow OOP principles +- Be Pythonic: follow Python best practices and idiomatic patterns +- Write unit tests for new functionality + +## License + +This project is part of the smolagents repository. Please refer to the root LICENSE file for licensing information. diff --git a/examples/decentralized_smolagents_benchmark/decentralized_agent.py b/examples/decentralized_smolagents_benchmark/decentralized_agent.py new file mode 100644 index 000000000..64d6affcd --- /dev/null +++ b/examples/decentralized_smolagents_benchmark/decentralized_agent.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python +# Example run: python examples/decentralized_smolagents_benchmark/decentralized_agent.py --model-type LiteLLMModel --model-id gpt-4o --provider openai "What is the half of the speed of a Leopard?" +"""Entry point for decentralized agent team execution.""" + +import argparse +import json +import logging +import sys +import uuid +from pathlib import Path + +from scripts.agents import DecentralizedAgents +from scripts.message_store import MessageStore + + +QUESTION_ADDON = """It is critical to respect the format of the answer when it is asked. 
""" + + +# Langfuse instrumentation setup +try: + from dotenv import load_dotenv + + load_dotenv() + + from langfuse import Langfuse + from openinference.instrumentation.smolagents import SmolagentsInstrumentor + + # Initialize Langfuse client + langfuse_client = Langfuse() + if langfuse_client.auth_check(): + print("βœ… Langfuse client authenticated successfully") + SmolagentsInstrumentor().instrument() + print("βœ… SmolagentsInstrumentor enabled") + else: + print("⚠️ Langfuse authentication failed - tracing disabled") + langfuse_client = None +except ImportError as e: + print(f"⚠️ Langfuse not available: {e}") + langfuse_client = None +except Exception as e: + print(f"⚠️ Langfuse setup error: {e}") + langfuse_client = None + + +def setup_logging(run_dir: Path) -> None: + """Setup JSON logging to file.""" + log_file = run_dir / "run.log" + + # Clear existing handlers to avoid duplication + logger = logging.getLogger() + logger.handlers.clear() + + # Create formatters + json_formatter = logging.Formatter('{"timestamp":"%(asctime)s", "level":"%(levelname)s", "message":%(message)s}') + # console_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + + # File handler with JSON format + file_handler = logging.FileHandler(log_file, encoding="utf-8") + file_handler.setLevel(logging.INFO) + file_handler.setFormatter(json_formatter) + + # Console handler with readable format (optional, for debugging) + # Uncomment the next 4 lines if you want console logging too + # console_handler = logging.StreamHandler(sys.stdout) + # console_handler.setLevel(logging.INFO) + # console_handler.setFormatter(console_formatter) + # logger.addHandler(console_handler) + + # Add handlers to logger + logger.addHandler(file_handler) + logger.setLevel(logging.INFO) + + +def main(args: argparse.Namespace) -> int: + """Main entry point - simplified execution.""" + print(f"πŸš€ Starting decentralized agent team for: {args.question}") + + # Create message store with proper agent 
names for correct voting thresholds + run_id = str(uuid.uuid4())[:8] # Short run ID + agent_names = ["CodeAgent", "WebSearchAgent", "DeepResearchAgent", "DocumentReaderAgent"] + message_store = MessageStore(run_id, agent_names=agent_names) + + # Handle the case where __file__ might not be defined + try: + script_dir = Path(__file__).parent + except NameError: + # Fallback if __file__ is not defined + script_dir = Path(sys.argv[0]).parent.absolute() if sys.argv[0] else Path.cwd() + + run_dir = script_dir / "runs" / run_id + run_dir.mkdir(parents=True, exist_ok=True) + + # Setup logging + setup_logging(run_dir) + logging.info(json.dumps({"event": "run_started", "run_id": run_id, "args": vars(args)})) + + try: + # Create the decentralized agent team + logging.info(json.dumps({"event": "creating_team", "run_id": run_id})) + decentralized_team = DecentralizedAgents( + message_store=message_store, + model_type=args.model_type, + model_id=args.model_id, + provider=args.provider, + run_id=run_id, + ) + + # Run the team on the task with enhanced collaboration instructions + enhanced_task = f"{args.question}" + logging.info(json.dumps({"event": "starting_execution", "run_id": run_id, "question": args.question})) + result = decentralized_team.run(enhanced_task) + + logging.info( + json.dumps( + { + "event": "execution_completed", + "run_id": run_id, + "status": result.get("status", "unknown"), + "has_answer": "answer" in result, + } + ) + ) + + # Output the result + if result["status"] in ["success", "success_early", "success_fallback"]: + print(json.dumps({"answer": result["answer"]})) + return 0 + else: + error_msg = result.get("error", "No valid results") + logging.error( + json.dumps({"event": "execution_failed", "run_id": run_id, "error": error_msg, "result": result}) + ) + print(f"\n❌ Team execution failed: {error_msg}") + return 1 + + except Exception as e: + # Catch any unexpected errors and log them with full context + logging.error( + json.dumps( + { + "event": 
"unexpected_error", + "run_id": run_id, + "error_type": type(e).__name__, + "error_message": str(e), + "question": args.question, + } + ) + ) + + # Also log the full stack trace for debugging + import traceback + + logging.error(json.dumps({"event": "error_traceback", "run_id": run_id, "traceback": traceback.format_exc()})) + + print(f"\n❌ Unexpected error: {e}") + return 1 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run decentralized agent team") + parser.add_argument("--model-type", required=True, help="Model type to use") + parser.add_argument("--model-id", required=True, help="Model ID to use") + parser.add_argument("--provider", help="Model provider") + parser.add_argument("question", help="Question to answer") + + args = parser.parse_args() + sys.exit(main(args)) diff --git a/examples/decentralized_smolagents_benchmark/model_performance_comparison.png b/examples/decentralized_smolagents_benchmark/model_performance_comparison.png new file mode 100644 index 000000000..30cb23506 Binary files /dev/null and b/examples/decentralized_smolagents_benchmark/model_performance_comparison.png differ diff --git a/examples/decentralized_smolagents_benchmark/requirements.txt b/examples/decentralized_smolagents_benchmark/requirements.txt new file mode 100644 index 000000000..29c868a34 --- /dev/null +++ b/examples/decentralized_smolagents_benchmark/requirements.txt @@ -0,0 +1,38 @@ +anthropic>=0.37.1 +audioop-lts<1.0; python_version >= "3.13" # required to use pydub in Python >=3.13; LTS port of the removed Python builtin module audioop +beautifulsoup4>=4.12.3 +datasets>=2.21.0 +google_search_results>=2.4.2 +huggingface_hub>=0.23.4 +mammoth>=1.8.0 +markdownify>=0.13.1 +numexpr>=2.10.1 +numpy>=2.1.2 +openai>=1.52.2 +openpyxl +pandas>=2.2.3 +pathvalidate>=3.2.1 +pdfminer.six>=20240706 +Pillow>=11.0.0 +puremagic>=1.28 +pypdf>=5.1.0 +python-dotenv>=1.0.1 +python_pptx>=1.0.2 +Requests>=2.32.3 +tqdm>=4.66.4 +torch>=2.2.2 +torchvision>=0.17.2 
+transformers>=4.46.0 +youtube_transcript_api>=0.6.2 +chess +sympy +pubchempy +Bio +scikit-learn +scipy +pydub +PyPDF2 +python-pptx +torch +xlrd +SpeechRecognition diff --git a/examples/decentralized_smolagents_benchmark/run.py b/examples/decentralized_smolagents_benchmark/run.py new file mode 100644 index 000000000..4c586fc8c --- /dev/null +++ b/examples/decentralized_smolagents_benchmark/run.py @@ -0,0 +1,520 @@ +#!/usr/bin/env python +# Example usage: python run.py --model-type LiteLLMModel --model-id gpt-4o --provider openai +"""Benchmarking script for decentralized agent implementation.""" + +import argparse +import datetime +import json +import os +import re +import subprocess +import threading +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path + +import datasets +import pandas as pd +from dotenv import load_dotenv +from tqdm import tqdm + + +# Langfuse instrumentation setup +try: + from dotenv import load_dotenv + + load_dotenv() + + from langfuse import Langfuse + from openinference.instrumentation.smolagents import SmolagentsInstrumentor + + # Initialize Langfuse client + langfuse_client = Langfuse() + if langfuse_client.auth_check(): + print("βœ… Langfuse client authenticated successfully") + SmolagentsInstrumentor().instrument() + print("βœ… SmolagentsInstrumentor enabled") + else: + print("⚠️ Langfuse authentication failed - tracing disabled") + langfuse_client = None +except ImportError as e: + print(f"⚠️ Langfuse not available: {e}") + langfuse_client = None +except Exception as e: + print(f"⚠️ Langfuse setup error: {e}") + langfuse_client = None + + +script_dir = Path(__file__).parent +output_path = script_dir / "output" +os.makedirs(output_path, exist_ok=True) + + +APPEND_ANSWER_LOCK = threading.Lock() + + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Runs decentralized agent team on smolagent benchmark.") + parser.add_argument( + "--date", + type=str, + default=None, + 
help="The date for the evaluation.", + ) + parser.add_argument( + "--eval-dataset", + type=str, + default="smolagents/benchmark-v1", + ) + parser.add_argument( + "--model-type", + type=str, + required=True, + help="The model type to use for the decentralized agents", + ) + parser.add_argument( + "--model-id", + type=str, + required=True, + help="The model ID to use for the specified model type", + ) + parser.add_argument( + "--provider", + type=str, + default="hf-inference", + help="The provider for the model", + ) + parser.add_argument( + "--parallel-workers", + type=int, + default=4, + help="The number of concurrent benchmark runs", + ) + parser.add_argument( + "--num-examples", + type=int, + default=None, + help="Limit the number of examples per task (useful for testing)", + ) + parser.add_argument( + "--push-answers-to-hub", + action="store_true", + help="Push the answers to the hub", + ) + parser.add_argument( + "--answers-dataset", + type=str, + default="smolagents/answers", + ) + return parser.parse_args() + + +def load_eval_dataset(eval_dataset, num_examples=None): + """Load the evaluation dataset.""" + # Get all available tasks + tasks = datasets.get_dataset_config_names(eval_dataset) + print(f"Available tasks: {tasks}") + + # Load each task's dataset + all_data = [] + for task in tasks: + dataset = datasets.load_dataset(eval_dataset, task, split="test") + data_list = list(dataset) + print(f"Loaded {len(data_list)} examples for {task}") + + # Add task information to each item and create unique IDs + for i, item in enumerate(data_list): + item["task"] = task + item["source"] = task # Add source field to match original format + item["id"] = f"{task}_{i}" # Create unique ID for each item + + all_data.extend(data_list) + + df = pd.DataFrame(all_data) + + if num_examples is not None: + # Sample num_examples from each task + df = ( + df.groupby("task", group_keys=False) + .apply(lambda x: x.sample(n=min(num_examples, len(x)), random_state=42)) + 
def _context_or_none(row):
    """Return the row's context, or None when it is absent, empty, or NaN."""
    context = row.get("context")
    # NaN is truthy, so a plain truthiness test would render "Context: nan".
    # NOTE(review): NaN presumably shows up when the combined DataFrame holds
    # tasks that have no "context" column - confirm against the dataset.
    if context is None or (isinstance(context, float) and context != context):
        return None
    return context if context else None


def _extract_run_id(stderr_text):
    """Return the first run_id found in the agent's log output, or None."""
    for raw_line in stderr_text.splitlines():
        line = raw_line.strip()
        if '"run_id"' not in line or '"event"' not in line:
            continue
        try:
            parsed = json.loads(line)
        except json.JSONDecodeError:
            # The line is not pure JSON (e.g. it carries a log prefix):
            # scrape the field directly instead.
            match = re.search(r'"run_id":\s*"([^"]+)"', line)
            if match:
                return match.group(1)
            continue
        if not isinstance(parsed, dict):
            continue
        message = parsed.get("message")
        if isinstance(message, dict) and "run_id" in message:
            return message["run_id"]
        # NOTE(review): the log formatter is not visible from here; also accept
        # a record whose run_id sits at the top level of the JSON object.
        if "run_id" in parsed:
            return parsed["run_id"]
    return None


def _extract_answer(stdout_text):
    """Return the "answer" value of the first JSON-object line that has one, else None."""
    for raw_line in stdout_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        try:
            parsed = json.loads(line)
        except json.JSONDecodeError:
            continue
        # Only JSON objects can carry the answer. A bare number, string or
        # `null` printed on stdout must be skipped, not tested with `in`.
        if isinstance(parsed, dict) and "answer" in parsed:
            return parsed["answer"]
    return None


def _append_jsonl(path, record):
    """Append *record* to *path* as one JSON line, creating parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "a", encoding="utf-8") as f:
        json.dump(record, f)
        f.write("\n")


def run_decentralized_agent(row, args):
    """Run the decentralized agent team on one benchmark example and record the outcome.

    The team is launched as a subprocess running ``decentralized_agent.py``
    (expected next to this file). Its stdout is scanned for a JSON line with an
    ``"answer"`` key and its stderr for a ``run_id``.

    Args:
        row: Mapping-like benchmark example providing ``question``, ``task``,
            ``id`` and optionally ``context`` / ``true_answer``.
        args: Parsed CLI namespace providing ``model_type``, ``model_id``,
            ``provider`` and optionally ``date`` and ``timeout`` (seconds).

    Returns:
        dict: The general result record. It is also appended to
        ``results_<date>.jsonl``, and a scoring record is appended to the
        task-specific JSONL file, both under ``output_path``.

    Failures of the subprocess or of answer extraction are not raised; they are
    reported through ``success=False`` and ``error`` in the returned record.
    """
    # Local import: this module's import block does not provide `sys`.
    import sys

    start_time = time.time()
    date = getattr(args, "date", None) or datetime.datetime.now().strftime("%Y-%m-%d")

    final_answer = None
    run_id = None
    success = False
    error = None

    try:
        question = row["question"]
        context = _context_or_none(row)
        if context is not None:
            question = f"Context: {context}\n\nQuestion: {question}"

        script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "decentralized_agent.py")
        cmd = [
            # Use the interpreter running this benchmark so the child sees the
            # same environment; "python" on PATH may be a different install.
            sys.executable or "python",
            script_path,
            "--model-type",
            args.model_type,
            "--model-id",
            args.model_id,
            "--provider",
            args.provider,
            # "--" ends option parsing so a question that starts with "-" is
            # still read as the positional argument.
            "--",
            question,
        ]

        completed = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            check=True,
            # No CLI flag sets this today; None keeps the original "wait forever".
            timeout=getattr(args, "timeout", None),
        )

        run_id = _extract_run_id(completed.stderr or "")
        final_answer = _extract_answer(completed.stdout or "")

        # 0 and False are legitimate answers; only a missing or empty one fails.
        if final_answer is None or final_answer == "":
            raise RuntimeError(completed.stderr.strip() if completed.stderr else "No final answer reached")

        success = True

    except subprocess.TimeoutExpired as e:
        error = f"Process timed out after duration {time.time() - start_time}, error: {str(e)}"
    except subprocess.CalledProcessError as e:
        error_output = e.stderr.strip() if e.stderr else "No stderr output"
        stdout_output = e.stdout.strip() if e.stdout else "No stdout output"
        error = f"Process failed with exit code {e.returncode}. Stderr: {error_output}. Stdout: {stdout_output}"
    except Exception as e:
        # Boundary handler: one bad example must not abort the whole benchmark.
        error = str(e)

    duration = time.time() - start_time
    if not success:
        # Failed runs report neither an answer nor a run_id.
        final_answer = None
        run_id = None

    model_id = f"decentralized-{args.model_type}-{args.model_id}"
    action_type = "decentralized-consensus"

    result = {
        "task": row["task"],
        "question_id": row["id"],
        "success": success,
        "error": error,
        "duration": duration,
        "answer": final_answer,
        "model_type": args.model_type,
        "model_id": args.model_id,
        "provider": args.provider,
        "run_id": run_id,
        "timestamp": datetime.datetime.now().isoformat(),
    }

    # Record in the layout read back by the scoring helpers.
    scoring_result = {
        "model_id": model_id,
        "agent_action_type": action_type,
        "question": row["question"],
        "original_question": row["question"],
        "answer": final_answer,
        "true_answer": row.get("true_answer", ""),
        "source": row["task"],
        "start_time": start_time,
        "end_time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "duration": duration,
        "run_id": run_id,
    }

    output_file = Path(output_path) / f"results_{date}.jsonl"
    task_output_file = Path(output_path) / f"{model_id.replace('/', '__')}__{action_type}__{row['task']}__{date}.jsonl"

    # Worker threads share these files; serialize the appends.
    with APPEND_ANSWER_LOCK:
        _append_jsonl(output_file, result)
        _append_jsonl(task_output_file, scoring_result)

    return result
def calculate_benchmark_scores(jsonl_file_path):
    """Calculate exact-match and contains scores for one benchmark result file.

    Args:
        jsonl_file_path: Path to a JSONL file whose records carry ``answer``
            and ``true_answer`` fields.

    Returns:
        dict: ``total_questions``, ``exact_match_score``, ``contains_score``,
        ``exact_matches`` and ``contains_matches``. If the file does not exist
        or holds no usable record, ``{"error": <reason>}`` is returned instead.
    """
    if not os.path.exists(jsonl_file_path):
        return {"error": "File not found"}

    total_questions = 0
    exact_matches = 0
    contains_matches = 0

    with open(jsonl_file_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                data = json.loads(line.strip())
            except json.JSONDecodeError:
                # Blank or corrupt line: skip it rather than abort scoring.
                continue

            # A line can be valid JSON without being a record (list, string,
            # number); those have no .get() and must be skipped as well.
            if not isinstance(data, dict) or not data:
                continue

            predicted = data.get("answer", "")
            true_answer = data.get("true_answer", "")

            total_questions += 1
            exact_matches += calculate_exact_match_score(predicted, true_answer)
            contains_matches += calculate_contains_score(predicted, true_answer)

    if total_questions == 0:
        return {"error": "No valid questions found"}

    return {
        "total_questions": total_questions,
        "exact_match_score": exact_matches / total_questions,
        "contains_score": contains_matches / total_questions,
        "exact_matches": exact_matches,
        "contains_matches": contains_matches,
    }
def _format_rate(successes, total):
    """Return successes/total as a percentage string, or "n/a" when total is zero."""
    return f"{successes / total:.2%}" if total else "n/a"


def main():
    """Run the benchmark end to end: load data, fan out agent runs, summarize, score.

    Reads its configuration from the command line via ``parse_arguments``,
    runs ``run_decentralized_agent`` for every example on a thread pool, prints
    per-task and overall success statistics, and finally calls
    ``save_benchmark_scores``.
    """
    args = parse_arguments()

    if args.date is None:
        args.date = datetime.datetime.now().strftime("%Y-%m-%d")

    print(f"Loading dataset {args.eval_dataset}...")
    # load_eval_dataset already limits each task to --num-examples rows, so no
    # second sampling pass is done here.
    df = load_eval_dataset(args.eval_dataset, args.num_examples)

    tasks = df["task"].unique()

    print(f"\nLoaded {len(df)} examples total:")
    for task in tasks:
        task_count = len(df[df["task"] == task])
        print(f"- {task}: {task_count} examples")

    results = []
    with ThreadPoolExecutor(max_workers=args.parallel_workers) as executor:
        futures = []

        # Submit task by task so the start banner for each task is printed once.
        for task in tasks:
            task_df = df[df["task"] == task]
            print(f"\n🚀 Starting benchmark for {task} with {len(task_df)} examples...")
            for _, row in task_df.iterrows():
                futures.append(executor.submit(run_decentralized_agent, row, args))

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing examples"):
            try:
                results.append(future.result())
            except Exception as e:
                # A worker that raised produces no result record; report and go on.
                print(f"Error processing example: {str(e)}")

    print("\n📊 Results Summary:")
    for task in tasks:
        task_results = [r for r in results if r["task"] == task]
        task_success = sum(1 for r in task_results if r["success"])
        task_duration = sum(r["duration"] for r in task_results) / len(task_results) if task_results else 0

        print(f"\n{task}:")
        print(f"  Total examples: {len(task_results)}")
        # A task can end up with no results if every worker raised.
        print(f"  Success rate: {_format_rate(task_success, len(task_results))}")
        print(f"  Average duration: {task_duration:.2f}s")

    total_success = sum(1 for r in results if r["success"])
    avg_duration = sum(r["duration"] for r in results) / len(results) if results else 0

    print("\n📈 Overall Statistics:")
    print(f"  Total examples: {len(results)}")
    print(f"  Success rate: {_format_rate(total_success, len(results))}")
    print(f"  Average duration: {avg_duration:.2f}s")

    if args.push_answers_to_hub:
        print("\n🚀 Pushing results to hub not yet implemented")

    # Must match the names run_decentralized_agent used for the per-task files.
    model_id = f"decentralized-{args.model_type}-{args.model_id}"
    action_type = "decentralized-consensus"

    save_benchmark_scores(output_path, model_id, action_type, args.date, tasks)
from langfuse import Langfuse + from openinference.instrumentation.smolagents import SmolagentsInstrumentor + + # Initialize Langfuse client + langfuse_client = Langfuse() + if langfuse_client.auth_check(): + print("βœ… Langfuse client authenticated successfully") + SmolagentsInstrumentor().instrument() + print("βœ… SmolagentsInstrumentor enabled") + else: + print("⚠️ Langfuse authentication failed - tracing disabled") + langfuse_client = None +except ImportError as e: + print(f"⚠️ Langfuse not available: {e}") + langfuse_client = None +except Exception as e: + print(f"⚠️ Langfuse setup error: {e}") + langfuse_client = None + + +script_dir = Path(__file__).parent +output_path = script_dir / "output" +os.makedirs(output_path, exist_ok=True) + +custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"} + + +user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0" + +BROWSER_CONFIG = { + "viewport_size": 1024 * 5, + "downloads_folder": "downloads_folder", + "request_kwargs": { + "headers": {"User-Agent": user_agent}, + "timeout": 300, + }, + "serpapi_key": os.getenv("SERPAPI_API_KEY"), +} + +os.makedirs(f"./{BROWSER_CONFIG['downloads_folder']}", exist_ok=True) + +# Base prompt for team charter +TEAM_CHARTER = """You are part of an expert AI agent team working collaboratively to solve complex problems. 
def parse_arguments(argv=None):
    """Parse the command-line options of the centralized benchmark runner.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what a call without
            arguments has always done.

    Returns:
        argparse.Namespace: Attributes ``date``, ``eval_dataset``,
        ``model_type``, ``model_id``, ``provider``, ``parallel_workers``,
        ``num_examples``, ``push_answers_to_hub`` and ``answers_dataset``.
    """
    parser = argparse.ArgumentParser(description="Runs centralized agent team on smolagent benchmark.")
    parser.add_argument(
        "--date",
        type=str,
        default=None,
        help="The date for the evaluation.",
    )
    parser.add_argument(
        "--eval-dataset",
        type=str,
        default="smolagents/benchmark-v1",
        help="The evaluation dataset to load the benchmark tasks from",
    )
    parser.add_argument(
        "--model-type",
        type=str,
        required=True,
        help="The model type to use for centralized multi-agent system",
    )
    parser.add_argument(
        "--model-id",
        type=str,
        required=True,
        help="The model ID to use for the specified model type",
    )
    parser.add_argument(
        "--provider",
        type=str,
        default="hf-inference",
        help="The provider for the model",
    )
    parser.add_argument(
        "--parallel-workers",
        type=int,
        default=4,
        help="The number of concurrent benchmark runs",
    )
    parser.add_argument(
        "--num-examples",
        type=int,
        default=None,
        help="Limit the number of examples per task (useful for testing)",
    )
    parser.add_argument(
        "--push-answers-to-hub",
        action="store_true",
        help="Push the answers to the hub",
    )
    parser.add_argument(
        "--answers-dataset",
        type=str,
        default="smolagents/answers",
        help="The name of the answers dataset",
    )
    return parser.parse_args(argv)
dataset + all_data = [] + for task in tasks: + dataset = datasets.load_dataset(eval_dataset, task, split="test") + data_list = list(dataset) + print(f"Loaded {len(data_list)} examples for {task}") + + # Add task information to each item and create unique IDs + for i, item in enumerate(data_list): + item["task"] = task + item["source"] = task # Add source field to match original format + item["id"] = f"{task}_{i}" # Create unique ID for each item + + all_data.extend(data_list) + + df = pd.DataFrame(all_data) + + if num_examples is not None: + # Sample num_examples from each task + df = ( + df.groupby("task", group_keys=False) + .apply(lambda x: x.sample(n=min(num_examples, len(x)), random_state=42)) + .reset_index(drop=True) + ) + + return df + + +def run_centralized_agents(row, args, model): + """Run centralized agent on a single benchmark example.""" + start_time = time.time() + + # Get the date for file naming + date = args.date if hasattr(args, "date") and args.date else datetime.datetime.now().strftime("%Y-%m-%d") + + try: + # Create the centralized agent + agent = create_agent_team(model) + + # Prepare question from the dataset row + question = row["question"] + if row.get("context"): + question = f"Context: {row['context']}\n\nQuestion: {question}" + + # Run the agent + print(f"Running agent on question: {question[:100]}...") + result = agent.run(question) + print(f"Agent result type: {type(result)}") + print(f"Agent result: {str(result)[:200]}") + + # Extract final answer from result + if hasattr(result, "content"): + final_answer = result.content + elif hasattr(result, "final_answer"): + final_answer = result.final_answer + elif isinstance(result, str): + final_answer = result + else: + final_answer = str(result) + + print(f"Final answer: {final_answer}") + + # Calculate metrics + duration = time.time() - start_time + success = True + error = None + run_id = f"centralized_{int(time.time() * 1000)}" # Generate a simple run ID + + except Exception as e: + 
def normalize_answer(answer):
    """Normalize an answer for comparison.

    Converts the value to a lowercase string, collapses every run of
    whitespace to a single space, and removes trailing punctuation
    (``.``, ``!``, ``?``, ``;``, ``,``) together with any whitespace mixed
    into that trailing run.

    Args:
        answer: The value to normalize; ``None`` is treated as empty.

    Returns:
        str: The normalized text ("" for ``None``).
    """
    if answer is None:
        return ""
    text = str(answer).strip().lower()
    text = re.sub(r"\s+", " ", text)
    # Strip whitespace along with the punctuation: "paris ." must normalize to
    # "paris", not "paris " with a dangling space that defeats exact matching.
    return re.sub(r"[\s.!?;,]+$", "", text)
and save scores for all benchmarks.""" + scores_file = f"{output_dir}/benchmark_scores_{model_id.replace('/', '__')}__{action_type}__{date}.json" + + all_scores = { + "model_id": model_id, + "action_type": action_type, + "date": date, + "timestamp": datetime.datetime.now().isoformat(), + "benchmarks": {}, + } + + total_questions_all = 0 + total_exact_matches_all = 0 + total_contains_matches_all = 0 + + print("\nπŸ“Š Calculating benchmark scores...") + + for task in eval_ds: + jsonl_file = f"{output_dir}/{model_id.replace('/', '__')}__{action_type}__{task}__{date}.jsonl" + scores = calculate_benchmark_scores(jsonl_file) + + if "error" not in scores: + all_scores["benchmarks"][task] = scores + total_questions_all += scores["total_questions"] + total_exact_matches_all += scores["exact_matches"] + total_contains_matches_all += scores["contains_matches"] + + print(f" πŸ“ˆ {task.upper()}:") + print(f" Questions: {scores['total_questions']}") + print( + f" Exact Match: {scores['exact_match_score']:.1%} ({scores['exact_matches']}/{scores['total_questions']})" + ) + print( + f" Contains: {scores['contains_score']:.1%} ({scores['contains_matches']}/{scores['total_questions']})" + ) + else: + print(f" ❌ {task.upper()}: {scores['error']}") + + # Overall scores + if total_questions_all > 0: + all_scores["overall"] = { + "total_questions": total_questions_all, + "exact_match_score": total_exact_matches_all / total_questions_all, + "contains_score": total_contains_matches_all / total_questions_all, + "exact_matches": total_exact_matches_all, + "contains_matches": total_contains_matches_all, + } + + print("\n🎯 OVERALL SCORES:") + print(f" Questions: {total_questions_all}") + print( + f" Exact Match: {all_scores['overall']['exact_match_score']:.1%} ({total_exact_matches_all}/{total_questions_all})" + ) + print( + f" Contains: {all_scores['overall']['contains_score']:.1%} ({total_contains_matches_all}/{total_questions_all})" + ) + + # Save scores to file + with open(scores_file, 
"w", encoding="utf-8") as f: + json.dump(all_scores, f, indent=2, ensure_ascii=False) + + print(f"\nπŸ’Ύ Scores saved to: {scores_file}") + return all_scores + + +def create_agent_team(model: Model): + """Create a centralized multi-agent system with a manager and 4 specialized agents.""" + text_limit = 100000 + ti_tool = TextInspectorTool(model, text_limit) + + browser = SimpleTextBrowser(**BROWSER_CONFIG) + + # Create base tools that are shared between agents + # shared_tools = [visualizer, ti_tool] + shared_tools = [] + + # Define tool sets for different agent types + web_tools: List[Tool] = [ + GoogleSearchTool(provider="serper"), + VisitTool(browser), + DownloadTool(browser), + PageUpTool(browser), + PageDownTool(browser), + FinderTool(browser), + FindNextTool(browser), + ArchiveSearchTool(browser), + ] + + code_tools: List[Tool] = [PythonInterpreterTool()] + reader_tools: List[Tool] = [FileReaderTool(), visualizer, ti_tool] + + # Add shared tools to each tool set + web_tools.extend(shared_tools) + code_tools.extend(shared_tools) + reader_tools.extend(shared_tools) + + # Create 4 specialized agents with detailed descriptions + + # Agent 1: Python Code Execution and Algorithm Implementation Specialist + code_agent = CodeAgent( + model=model, + tools=code_tools, + max_steps=15, + verbosity_level=1, + additional_authorized_imports=["*"], + planning_interval=3, + name="CodeAgent", + description="""Python Code Execution and Algorithm Implementation Specialist. 
+ +Responsibilities: +- Write, test, and execute Python code with proper error handling and validation +- Create modular, testable functions with comprehensive docstrings and documentation +- Implement mathematical algorithms, computational solutions, and numerical analysis +- Run validation tests and smoke tests on code changes to ensure reliability +- Optimize code performance, debug complex issues, and handle edge cases +- Handle numerical computations, data processing tasks, and statistical analysis +- Extract precise numerical results for mathematical problems (e.g., "7", "12.5", "3/4") +- Validate results through multiple calculation methods when possible + +Special focus on mathematical problem solving: +- Use Python to calculate exact results with proper precision +- Show all calculations, code, and reasoning processes +- Follow specific format requirements (decimal, fraction, etc.) +- Provide executable demonstrations of solutions""", + ) + + # Agent 2: Fast Web Research and Information Gathering Specialist + web_research_agent = ToolCallingAgent( + model=model, + tools=web_tools, + max_steps=15, + verbosity_level=1, + name="WebResearchAgent", + description="""Fast Web Research and Information Gathering Specialist. 
+ +Responsibilities: +- Conduct rapid, targeted web searches for relevant and up-to-date information +- Evaluate source credibility, authority, and reliability of information sources +- Perform thorough fact-checking and information verification from multiple sources +- Gather real-time data, current information, and recent developments +- Identify trending topics, breaking news, and emerging information patterns +- Extract key facts, summarize findings concisely, and present structured results +- Cross-reference findings against multiple independent sources for accuracy +- Provide source URLs, publication dates, and credibility assessments + +Research methodology: +- Always verify information from multiple independent, authoritative sources +- Prioritize recent, peer-reviewed, and official sources over outdated information +- Include comprehensive source attribution with URLs and publication dates +- Flag conflicting information and present different perspectives objectively +- Focus on factual accuracy over speed when sources present contradictory claims""", + ) + + # Agent 3: Document Analysis and Technical Specification Specialist + document_agent = ToolCallingAgent( + model=model, + tools=reader_tools, + max_steps=10, + verbosity_level=1, + name="DocumentAgent", + description="""Document Analysis and Technical Specification Specialist. 
+ +Responsibilities: +- Analyze and extract critical information from technical documents, PDFs, and specifications +- Maintain precise citations with page numbers, section references, and source attribution +- Structure complex documentation into digestible, actionable summaries and insights +- Validate technical specifications against implementation requirements and standards +- Cross-reference multiple documents for consistency, completeness, and accuracy +- Identify critical details, potential implementation considerations, and technical constraints +- Extract both explicit information and implied requirements from documentation +- Highlight contradictions, ambiguities, and gaps found in documents + +Documentation standards: +- Always include precise citations with page numbers or section references +- Maintain clear separation between documented facts and personal interpretations +- Focus on actionable information that directly impacts problem-solving and decision-making +- Cross-reference claims against other available documentation and external sources +- Present structured analysis with clear organization and logical flow""", + ) + + # Agent 4: Comprehensive Analysis and Advanced Research Specialist + analysis_tools = list(set(web_tools + code_tools)) + analysis_agent = CodeAgent( + model=model, + tools=analysis_tools, + max_steps=20, + verbosity_level=1, + additional_authorized_imports=["*"], + planning_interval=4, + name="AnalysisAgent", + description="""Comprehensive Analysis and Advanced Research Specialist. 
+ +Responsibilities: +- Conduct thorough, multi-layered investigations and comprehensive analysis of complex problems +- Develop, test, and rigorously validate complex hypotheses and theoretical frameworks +- Synthesize information from diverse sources into coherent, actionable insights and conclusions +- Perform advanced reasoning, logical validation, and critical evaluation of findings +- Design and execute comprehensive research methodologies with systematic approaches +- Validate findings through multiple analytical approaches and cross-verification methods +- Integrate quantitative computational analysis with qualitative research insights +- Challenge assumptions, test edge cases, and explore alternative explanations + +Advanced analytical framework: +- Apply systematic analytical frameworks and methodologies to complex, multi-faceted problems +- Present multiple perspectives and consider alternative explanations and interpretations +- Use both web research and code execution capabilities to validate hypotheses comprehensively +- Document reasoning processes, analytical methodologies, and decision-making criteria clearly +- Integrate diverse analytical approaches including statistical, computational, and theoretical methods +- Maintain objectivity while exploring creative and innovative solution approaches""", + ) + + # Create the manager agent with comprehensive coordination capabilities + manager_agent = CodeAgent( + model=model, + tools=shared_tools, # Manager has minimal tools, delegates to agents + max_steps=25, + verbosity_level=2, + additional_authorized_imports=["*"], + planning_interval=5, + managed_agents=[code_agent, web_research_agent, document_agent, analysis_agent], + name="ManagerAgent", + description="""Centralized Manager and Team Coordination Specialist. 
def main():
    """Run the centralized multi-agent benchmark from the command line.

    Steps, in order:
      1. Parse CLI arguments and default ``args.date`` to today's date.
      2. Build the ``LiteLLMModel`` for the requested provider/model id.
      3. Load the evaluation dataset and, when ``args.num_examples`` is set,
         sample at most that many rows per task (fixed seed for repeatability).
      4. Run every example through ``run_centralized_agents`` on a thread pool.
      5. Print per-task and overall statistics and save the benchmark scores.

    Raises:
        ValueError: If ``args.model_type`` is not ``"LiteLLMModel"``.
    """
    args = parse_arguments()

    # Set date if not provided
    if args.date is None:
        args.date = datetime.datetime.now().strftime("%Y-%m-%d")

    # Initialize the model
    if args.model_type != "LiteLLMModel":
        raise ValueError(f"Unsupported model type: {args.model_type}")
    if args.provider == "openai":
        # For OpenAI, use the model_id directly
        model = LiteLLMModel(model_id=args.model_id)
    else:
        # For other providers, include provider in model_id
        model_id = f"{args.provider}/{args.model_id}" if args.provider != "hf-inference" else args.model_id
        model = LiteLLMModel(model_id=model_id)

    # Create downloads folder for browser
    os.makedirs(BROWSER_CONFIG["downloads_folder"], exist_ok=True)

    # Load dataset
    print(f"Loading dataset {args.eval_dataset}...")
    df = load_eval_dataset(args.eval_dataset, args.num_examples)

    # If num_examples is specified, sample from each task
    if args.num_examples is not None:
        df = (
            df.groupby("task", group_keys=False)
            .apply(lambda x: x.sample(n=min(args.num_examples, len(x)), random_state=42))
            .reset_index(drop=True)
        )

    tasks = df["task"].unique()

    print(f"\nLoaded {len(df)} examples total:")
    for task in tasks:
        task_count = len(df[df["task"] == task])
        print(f"- {task}: {task_count} examples")

    # Run benchmark
    results = []
    with ThreadPoolExecutor(max_workers=args.parallel_workers) as executor:
        futures = []

        # Group examples by task for better progress tracking
        for task in tasks:
            task_df = df[df["task"] == task]
            print(f"\n🚀 Starting benchmark for {task} with {len(task_df)} examples...")

            for _, row in task_df.iterrows():
                futures.append(executor.submit(run_centralized_agents, row, args, model))

        # Process results with progress bar
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing examples"):
            try:
                results.append(future.result())
            except Exception as e:
                # Best effort: one failing example must not abort the whole benchmark.
                print(f"Error processing example: {str(e)}")

    # Calculate and print summary statistics per task
    print("\n📊 Results Summary:")
    for task in tasks:
        task_results = [r for r in results if r["task"] == task]
        task_total = len(task_results)
        task_success = sum(1 for r in task_results if r["success"])
        # Guard both ratios: a task whose examples all failed has no results at all.
        task_success_rate = task_success / task_total if task_total else 0.0
        task_duration = sum(r["duration"] for r in task_results) / task_total if task_total else 0

        print(f"\n{task}:")
        print(f" Total examples: {task_total}")
        print(f" Success rate: {task_success_rate:.2%}")
        print(f" Average duration: {task_duration:.2f}s")

    # Overall statistics
    total_results = len(results)
    total_success = sum(1 for r in results if r["success"])
    # Same guard as above: ``results`` is empty when every example raised.
    overall_success_rate = total_success / total_results if total_results else 0.0
    avg_duration = sum(r["duration"] for r in results) / total_results if total_results else 0

    print("\n📈 Overall Statistics:")
    print(f" Total examples: {total_results}")
    print(f" Success rate: {overall_success_rate:.2%}")
    print(f" Average duration: {avg_duration:.2f}s")

    # Push results to hub if requested
    if args.push_answers_to_hub:
        print("\n🚀 Pushing results to hub not yet implemented")

    # Calculate and save benchmark scores with proper variables
    output_dir = output_path
    model_id = f"centralized-{args.model_type}-{args.model_id}"
    action_type = "centralized-agents"
    date = args.date
    eval_ds = tasks

    save_benchmark_scores(output_dir, model_id, action_type, date, eval_ds)
TextInspectorTool +from scripts.text_web_browser import ( + ArchiveSearchTool, + DownloadTool, + FinderTool, + FindNextTool, + PageDownTool, + PageUpTool, + SimpleTextBrowser, + VisitTool, +) +from scripts.visual_qa import visualizer +from smolagents import CodeAgent, GoogleSearchTool, LiteLLMModel, ToolCallingAgent +from smolagents.default_tools import PythonInterpreterTool +from smolagents.tools import Tool + +from .decentralized_tools import create_decentralized_tools +from .message_store import MessageStore + + +# evaluation roles +custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"} + +script_dir = Path(__file__).parent.parent + +# Browser configuration +user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0" +BROWSER_CONFIG = { + "viewport_size": 1024 * 5, + "downloads_folder": script_dir / "downloads_folder", + "request_kwargs": { + "headers": {"User-Agent": user_agent}, + "timeout": 300, + }, + "serpapi_key": os.getenv("SERPAPI_API_KEY"), +} + +# ------------------------ Prompts (Team Charter) --------------------------- + +TEAM_CHARTER = """ +You are a specialized agent working as part of a decentralized team composed of 4 agents collaborating to solve problems. +CRITICAL: You must regularly use the communication tools provided to stay connected with your team. Check messages and notifications frequently! + +COMMUNICATION TOOLS AVAILABLE: +1. send_message_to_agent: Send private messages to specific agents +2. send_message_to_channel: Post messages to team channels/threads +3. read_messages: Check all your messages and mentions +4. read_notifications: Check notifications including polls needing votes +5. search_messages: Search through message history +6. create_general_poll: Create polls for team decisions +7. create_final_answer_poll: Propose final answers for consensus +8. vote_on_poll: Vote on active polls +9. 
view_active_polls: See what polls are currently active + +FILE HANDLING WORKFLOW: +CRITICAL: When working with files from URLs, follow this exact sequence: +1. For .docx, .xlsx, .pptx, .wav, .mp3, .m4a, .png files from URLs: + - FIRST: Use 'download_file' tool to download the file locally + - THEN: Use 'inspect_file_as_text' with the downloaded file path for analysis + - For .png files: Use visualizer tool after downloading + +2. For .pdf, .txt, .htm files from URLs: + - Use 'visit_page' tool directly (do NOT use download_file) + +3. For already local files (existing paths): + - Use 'inspect_file_as_text' or 'file_reader' directly + +NEVER try to use text inspection tools on URLs directly - this will cause errors! + +COMMUNICATION CHANNELS: +1. Public Messages (Message Store): + - Post in the main thread or create topic-specific threads (#topic) + - Mention others with @name to get their attention + - Use @all for team-wide announcements + +2. Direct Messages: + - Use send_message tool for private agent-to-agent communication. You are encouraged to communicate with other agents to get help from them. + - Valid target_ids: 0=CodeAgent, 1=WebSearchAgent, 2=DeepResearchAgent, 3=DocumentReaderAgent + - Use receive_messages tool to check your private messages + - Useful for detailed discussions or sharing sensitive information + +3. Threads: + - Default thread is #main + - Create new threads for subtopics (#analysis, #research, etc.) + - Follow relevant threads to stay updated + +COLLABORATION PROTOCOL: +1. When receiving a task: + - FIRST: Check notifications and messages to see team status + - Share your initial thoughts in #main channel + - Break down complex problems into subtasks + - Create relevant threads for different aspects + +2. 
During Investigation: + - Check messages frequently to see team updates + - Share findings and progress regularly via messaging + - Ask questions and request help when needed + - Combine private and public messages strategically + - Offer assistance to teammates + - Respond to messages and mentions promptly + +3. For Proposals: + - Use create_general_poll to propose intermediate solutions + - Use create_final_answer_poll to propose final answers that would be sent to the user + - Multiple polls can be active simultaneously - you can create additional polls if needed + - When voting, specify the poll_id if multiple polls are active + - Polls with 2+ NO votes are automatically deleted to allow new proposals + - Always include evidence and reasoning in proposals + - Required votes for consensus: floor(N/2)+1 = 3 + +4. Voting Guidelines: + - Check notifications regularly for polls needing votes + - Vote honestly based on your expertise using vote_on_poll + - When multiple polls are active, specify poll_id parameter when voting + - Include confidence level (scoring from 0.0 to 1.0) and rationale + - Suggest improvements if voting NO + +5. Best Practices: + - Keep messages concise and cite sources + - Use thread-specific discussions + - Be proactive in seeking consensus. Therefore, answer to polls promptly. + - Use tools deliberately; include only minimal outputs and sources. + - Use messaging tools proactively - the team relies on communication! + - Combine expertise with other agents through discussion + - Be responsive to team communications + - Never leak secrets or tokens. Sandbox untrusted code/data. 
+ - CRITICAL: Always check for and vote on active polls using read_notifications and vote_on_poll + - You can create multiple polls and vote on all relevant polls + +COMMUNICATION APPROACH: +- Use send_message_to_channel to share deep analysis in #analysis threads +- Use send_message_to_agent for complex hypothesis discussions +- Collaborate with all agents to develop comprehensive understanding +- Lead consensus-building through thoughtful poll creation and voting +- Start each task by checking your notifications with read_notifications +- Use read_messages regularly to stay updated on team discussions, after having looked at notifications +- Send messages to collaborate: use send_message_to_agent for private discussions, send_message_to_channel for team communication (with context and more than one agent, but not cessarely all the agents) +- When you have findings or need input, share them through messaging +- Before making decisions, you can create polls using create_general_poll +- Vote on polls promptly when you see them in notifications +- For final answers, use create_final_answer_poll to get team consensus + +KEY COMMUNICATION PATTERNS: +1. Use read_notifications to gather context from other agents' work +2. Share comprehensive analysis with send_message_to_channel in #analysis +3. Use send_message_to_agent for deep technical and hypothesis discussions +4. Create well-reasoned polls with create_general_poll for major findings +5. Vote thoughtfully on polls using vote_on_poll with detailed rationale + +Remember: A solution is only accepted when a majority agrees (3/4 agents). Work together to reach consensus! + +CRITICAL: make sure that you respect the required format of answer, when you propose one. +MUCH CRITICAL: Never call the FinalAnswerTool, use create_final_answer_poll instead.to propose final answers. 
+ +ANSWER FORMAT REQUIREMENTS: +- For MATH problems: final_answer must be ONLY the number, expression, or result (e.g., "7", "12.5", "$100", "x = 3") +- For FACTUAL questions: final_answer must be ONLY the specific fact requested (e.g., "1925", "John Smith", "Paris", "Blue") +- For YES/NO questions: final_answer must be ONLY "Yes" or "No" +- Do NOT include phrases like "The answer is...", "approximately...", "roughly..." in final_answer +- Do NOT include units unless specifically requested (e.g., if asked "how many", answer "5" not "5 items") +- Put ALL explanations, reasoning, and context in supporting_evidence, NOT in final_answer +- The final_answer field will be returned directly to the user, so keep it clean and minimal + +Your success depends on active communication. Use the messaging and notification tools regularly to coordinate with your team! +""".strip() + + +def _generate_agent_addon(agent_config: dict, all_agents: List[dict]) -> str: + """Generate dynamic agent addon based on configuration and team composition.""" + + # Get other agents for collaboration patterns + other_agents = [a for a in all_agents if a["name"] != agent_config["name"]] + + collaboration_patterns = [] + for i, other_agent in enumerate(other_agents, 1): + pattern = f"{i}. With {other_agent['full_role']} (@{other_agent['name']}):\n" + pattern += f" - {other_agent['collaboration_with'][agent_config['name']]}" + collaboration_patterns.append(pattern) + + addon = f""" +ROLE: {agent_config["full_role"]} +Primary Responsibilities: +{chr(10).join(f"- {resp}" for resp in agent_config["responsibilities"])} + +COLLABORATION PATTERNS: +{chr(10).join(collaboration_patterns)} + +COMMUNICATION: +{chr(10).join(f"- {comm}" for comm in agent_config["communication_patterns"])} + +{agent_config.get("special_instructions", "")} +""".strip() + + return addon + + +# TODO: mettre les descriptions des tools comme dans chacun des autres tools dans default_tools. 
# Add notifications.
# Expose posting and sending messages as tools.
# Or create conversation ids and send to a conversation with SendMessageTools. Two tools: SendMessageToChannel (list of agent ids + main channel + channel title) and SendMessageToAgent.
# Use a dedicated poll tool for polls instead of inheriting from FinalAnswerTool. Stop using it and stop the agents once they are done. The goal is also to keep a complete trace.

# ---------------End of prompts----------------


class DecentralizedToolCallingAgent(ToolCallingAgent):
    """ToolCallingAgent that carries the shared message store and its agent name.

    Intended for agents that propose final answers through polls; this class
    itself only stores the two extra attributes and forwards everything else
    to ``ToolCallingAgent``.
    """

    def __init__(self, message_store: MessageStore, agent_name: str, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.message_store = message_store
        self.agent_name = agent_name


class DecentralizedCodeAgent(CodeAgent):
    """CodeAgent that carries the shared message store and its agent name.

    Intended for agents that propose final answers through polls; this class
    itself only stores the two extra attributes and forwards everything else
    to ``CodeAgent``.
    """

    def __init__(self, message_store: MessageStore, agent_name: str, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.message_store = message_store
        self.agent_name = agent_name


@dataclass
class AgentConfig:
    """Static configuration used to build one team member."""

    name: str  # Agent identifier, e.g. "CodeAgent"
    role: str  # Short human-readable role description
    tools: List[Tool]  # Tools handed to the underlying smolagents agent
    system_prompt: str  # Passed to the agent as its ``instructions``
    model: Any  # Model instance
    max_turns: int = 20  # Step budget forwarded as ``max_steps``
    keywords: List[str] = field(default_factory=list)
    agent_type: str = "tool_calling"  # "tool_calling" or "code"


class DecentralizedAgent:
    """Minimal wrapper around vanilla smolagents that provides decentralized tools."""

    def __init__(
        self,
        config: "AgentConfig",
        message_store: MessageStore,
        model_type: str,
        model_id: str,
        provider: Optional[str] = None,
    ):
        """Create the wrapped smolagents agent described by ``config``.

        Args:
            config: Agent configuration; ``agent_type == "code"`` selects a
                ``CodeAgent``, anything else a ``ToolCallingAgent``.
            message_store: Shared message store, kept on the instance.
            model_type: Accepted for interface compatibility; not used here
                because the model instance comes from ``config.model``.
            model_id: Accepted for interface compatibility; not used here.
            provider: Accepted for interface compatibility; not used here.
        """
        self.config = config
        self.message_store = message_store

        # Create the appropriate vanilla smolagents agent.
        # The LLM will decide when and how to use the communication tools.
        agent_cls = CodeAgent if config.agent_type == "code" else ToolCallingAgent
        self.agent = agent_cls(
            tools=config.tools,
            model=config.model,
            instructions=config.system_prompt,
            # Honor the configured step budget (defaults to 20, the value that
            # was previously hard-coded here).
            max_steps=config.max_turns,
        )

    def run(self, task: str, **kwargs):
        """Run the agent on a task - delegate to the vanilla smolagents."""
        return self.agent.run(task, **kwargs)

    def step(self) -> List[Dict[str, Any]]:
        """Single step interface for compatibility; always returns an empty list."""
        return []
"model_id": self.model_id, + "custom_role_conversions": custom_role_conversions, + } + + model_params["max_tokens"] = 8192 + + model = LiteLLMModel(**model_params) + + ti_tool = TextInspectorTool(model, text_limit) + + # Create directory for browser downloads if needed + os.makedirs(BROWSER_CONFIG["downloads_folder"], exist_ok=True) + + # Create base tools that are shared between code and web tools + shared_tools = [visualizer, ti_tool] + + # Base web tools + web_tools = [ + GoogleSearchTool(provider="serper"), + VisitTool(browser), + DownloadTool(browser), + PageUpTool(browser), + PageDownTool(browser), + FinderTool(browser), + FindNextTool(browser), + ArchiveSearchTool(browser), + ] + + # Base code tools + code_tools: List[Tool] = [PythonInterpreterTool()] + + # Use simple file reader tool + reader_tools: List[Tool] = [FileReaderTool()] + + # Add shared tools to both collections + code_tools.extend(shared_tools) + web_tools.extend(shared_tools) + reader_tools.extend(shared_tools) + + # Define agent configurations with enhanced role descriptions + agent_definitions = [ + { + "name": "CodeAgent", + "full_role": "Python Code Execution and Algorithm Implementation Specialist", + "short_role": "Python code execution specialist", + "responsibilities": [ + "Write, test, and execute Python code with proper error handling", + "Create modular, testable functions with comprehensive docstrings", + "Implement mathematical algorithms and computational solutions", + "Run validation tests and smoke tests on code changes", + "Optimize code performance and debug complex issues", + "Handle numerical computations and data processing tasks", + ], + "communication_patterns": [ + "Create #implementation threads for coding tasks and technical discussions", + "Use private messages for detailed algorithmic discussions", + "Share code execution results and outputs in public threads for team review", + "Tag @all for major implementation decisions requiring team input", + "Provide code 
examples and executable demonstrations of solutions", + ], + "special_instructions": """MATH PROBLEM FORMAT: +When solving math problems and creating final answer polls: +- Use Python to calculate exact results with proper precision +- Extract ONLY the numerical result for final_answer (e.g., "7", "12.5", "3/4") +- Do NOT include "The answer is..." or explanations in final_answer +- Show all calculations, code, and reasoning in supporting_evidence +- Follow the specific format requested by the question (decimal, fraction, etc.) +- Validate results through multiple calculation methods when possible""", + "tools": code_tools, + "agent_type": "code", + "keywords": ["code", "python", "execution", "algorithm", "computation"], + }, + { + "name": "WebSearchAgent", + "full_role": "Fast Web Research and Information Gathering Specialist", + "short_role": "Fast web research specialist", + "responsibilities": [ + "Conduct rapid, targeted web searches for relevant information", + "Evaluate source credibility and cross-reference findings", + "Perform initial fact-checking and information triage", + "Gather real-time data and current information from multiple sources", + "Identify trending topics and recent developments", + "Extract key facts and summarize findings concisely", + ], + "communication_patterns": [ + "Create #research threads for new investigation topics", + "Share quick findings and preliminary results in #main channel", + "Use private messages for hypothesis formation and validation discussions", + "Tag @all for significant discoveries that impact the entire team", + "Provide source links and credibility assessments with all findings", + ], + "special_instructions": """RESEARCH METHODOLOGY: +- Always verify information from multiple independent sources +- Prioritize recent, authoritative sources over outdated information +- Include source URLs and publication dates in all research findings +- Flag conflicting information and present different perspectives +- Focus 
on factual accuracy over speed when sources conflict""", + "tools": web_tools, + "agent_type": "tool_calling", + "keywords": ["research", "web", "search", "facts", "verification"], + }, + { + "name": "DeepResearchAgent", + "full_role": "Comprehensive Analysis and Advanced Research Specialist", + "short_role": "Deep analysis and advanced research specialist", + "responsibilities": [ + "Conduct thorough, multi-layered investigations and analysis", + "Develop and rigorously test complex hypotheses and theories", + "Synthesize information from diverse sources into coherent insights", + "Perform advanced reasoning and logical validation of conclusions", + "Design and execute comprehensive research methodologies", + "Validate findings through multiple analytical approaches", + ], + "communication_patterns": [ + "Maintain #analysis thread for deep analytical discussions", + "Create hypothesis-specific threads for focused investigation", + "Use private messages for complex theoretical discussions", + "Tag @all for major research breakthroughs and validated findings", + "Present comprehensive analysis with supporting evidence and methodology", + ], + "special_instructions": """ANALYSIS FRAMEWORK: +- Apply systematic analytical frameworks to complex problems +- Present multiple perspectives and consider alternative explanations +- Use both web research and code execution to validate hypotheses +- Document reasoning processes and analytical methodologies clearly +- Integrate quantitative and qualitative analysis approaches +- Challenge assumptions and test edge cases thoroughly""", + "tools": list(set(web_tools + code_tools)), + "agent_type": "code", + "keywords": ["research", "deep", "analysis", "hypothesis", "validation", "synthesis"], + }, + { + "name": "DocumentReaderAgent", + "full_role": "Document Analysis and Technical Specification Specialist", + "short_role": "Document analysis and technical specification specialist", + "responsibilities": [ + "Analyze and extract 
key information from technical documents", + "Maintain precise citations and track information sources", + "Structure complex documentation into digestible summaries", + "Validate technical specifications against implementation requirements", + "Cross-reference multiple documents for consistency and completeness", + "Identify critical details and potential implementation considerations", + ], + "communication_patterns": [ + "Maintain #documentation thread for document-related discussions", + "Create topic-specific threads for major document analyses", + "Use private messages for detailed technical specification reviews", + "Tag @all for critical documentation updates affecting team decisions", + "Provide structured summaries with precise page/section references", + ], + "special_instructions": """DOCUMENTATION STANDARDS: +- Always include precise citations with page numbers or section references +- Highlight contradictions or ambiguities found in documents +- Extract both explicit information and implied requirements +- Cross-reference claims against other available documentation +- Focus on actionable information that impacts problem-solving +- Maintain clear separation between documented facts and interpretations""", + "tools": reader_tools, + "agent_type": "tool_calling", + "keywords": ["document", "pdf", "extract", "page", "specification", "analysis"], + }, + ] + + # Define collaboration patterns between agents + collaboration_matrix = { + "CodeAgent": { + "WebSearchAgent": "Receive algorithm suggestions, implementation requirements, and real-world examples to guide development", + "DocumentReaderAgent": "Get technical specifications, API documentation, and implementation patterns to follow standards", + "DeepResearchAgent": "Implement complex algorithms, run validation experiments, and execute computational analyses", + }, + "WebSearchAgent": { + "CodeAgent": "Share algorithm ideas, provide implementation examples, and verify technical information through 
search", + "DocumentReaderAgent": "Cross-reference web findings with official documentation and validate external claims", + "DeepResearchAgent": "Provide initial research foundation for deeper analysis and hypothesis development", + }, + "DeepResearchAgent": { + "CodeAgent": "Design computational experiments, request algorithm implementations, and validate results mathematically", + "WebSearchAgent": "Expand on initial findings, request targeted searches, and cross-validate information sources", + "DocumentReaderAgent": "Deep dive into technical documentation, analyze architectural decisions, and validate against specifications", + }, + "DocumentReaderAgent": { + "CodeAgent": "Share technical specifications, validate implementations against documentation, and track API requirements", + "WebSearchAgent": "Compare documentation with external sources, validate technical claims, and share relevant sections", + "DeepResearchAgent": "Provide detailed technical background, support hypothesis validation with documented evidence", + }, + } + + # Add collaboration information to agent definitions + for agent_def in agent_definitions: + agent_def["collaboration_with"] = collaboration_matrix[agent_def["name"]] + + # Add decentralized tools to each agent's tool set and create configs + configs = [] + for agent_def in agent_definitions: + # Add decentralized communication tools + agent_tools = agent_def["tools"] + create_decentralized_tools(self.message_store, agent_def["name"]) + + # Generate dynamic addon + agent_addon = _generate_agent_addon(agent_def, agent_definitions) + + config = AgentConfig( + name=agent_def["name"], + role=agent_def["short_role"], + tools=agent_tools, + model=model, + system_prompt=base_prompt + "\n" + agent_addon, + keywords=agent_def["keywords"], + agent_type=agent_def["agent_type"], + ) + configs.append(config) + + return [ + DecentralizedAgent( + config=config, + message_store=self.message_store, + model_type=self.model_type, + 
def run(self, task: str) -> Dict[str, Any]:
    """Run the entire decentralized agent team on a task.

    The task is posted to the shared message store, every agent is started
    on its own worker thread, and ``_check_for_consensus`` is consulted each
    time an agent finishes (once at least two agents have reported).  If no
    consensus is found while agents complete, one final check is made and,
    failing that, ``_create_fallback_result`` builds the result.

    An empty team is tolerated: no agent is run and the fallback result is
    returned instead of ``ThreadPoolExecutor`` raising ``ValueError`` for
    ``max_workers=0``.

    Args:
        task: The task/question to solve.

    Returns:
        Dict containing the result with status, answer, and metadata, as
        produced by ``_create_result_summary`` or ``_create_fallback_result``.
    """
    print(f"🚀 Starting decentralized agent team for: {task}")
    logging.info(json.dumps({"event": "team_execution_started", "run_id": self.run_id, "task": task}))

    # Post initial task to message store
    initial_msg = self.message_store.append_message(
        sender="system", content=task, thread_id="main", msg_type="task"
    )
    print("📝 Posted initial task to message store")
    logging.info(
        json.dumps(
            {"event": "task_posted", "run_id": self.run_id, "message_id": initial_msg.get("id"), "task": task}
        )
    )

    # Print team info
    print(f"👥 Created team of {len(self.agents)} agents:")
    for agent in self.agents:
        print(f"  - {agent.config.name} ({agent.config.role})")

    print("\n💬 Running agents in parallel with collaborative approach...")

    # Prepare collaborative task prompt with enhanced instructions
    QUESTION_ADDON = """
IMPORTANT: Before answering, please:
1. Use read_notifications to check if there are any ongoing team discussions or polls
2. Use view_active_polls to see if there are any polls you should vote on
3. Use read_messages to see what other agents have contributed
4. If there's an active poll about the final answer, vote on it using vote_on_poll
5. If no poll exists yet and you're confident in an answer, create a final answer poll using create_final_answer_poll

Work collaboratively with your team!"""

    collaborative_task = f"{task}\n\n{QUESTION_ADDON}"

    # Run all agents in parallel
    self.results = []
    start_time = time.time()

    # One worker per agent; max(1, ...) because ThreadPoolExecutor rejects
    # max_workers=0, which would otherwise crash on an empty team.
    worker_count = max(1, len(self.agents))

    # NOTE(review): leaving this ``with`` block waits for every running
    # worker, and Future.cancel() cannot stop a future that already started.
    # With one worker per agent all futures start immediately, so the
    # "early" return below still blocks until the remaining agents finish.
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        # Submit all agent tasks
        future_to_agent = {
            executor.submit(self._run_single_agent, agent, collaborative_task, i): agent
            for i, agent in enumerate(self.agents)
        }

        # Process completed agents and check for early consensus
        for future in as_completed(future_to_agent):
            agent = future_to_agent[future]
            try:
                agent_result = future.result()
                self.results.append(agent_result)

                print(f"✅ {agent.config.name} completed ({len(self.results)}/{len(self.agents)} done)")

                # Need at least 2 agents for meaningful consensus
                if len(self.results) < 2:
                    continue

                consensus_result = self._check_for_consensus()
                if not consensus_result:
                    continue

                print(f"\n🎯 Consensus reached early after {agent.config.name}: {consensus_result}")
                logging.info(
                    json.dumps(
                        {
                            "event": "consensus_reached_early",
                            "run_id": self.run_id,
                            "after_agent": agent.config.name,
                            "completed_agents": len(self.results),
                            "final_answer": str(consensus_result)[:200],
                        }
                    )
                )

                # Cancel remaining tasks (best effort)
                for remaining_future in future_to_agent:
                    if not remaining_future.done():
                        remaining_future.cancel()

                self.consensus_reached = True
                self.final_answer = consensus_result
                total_duration = time.time() - start_time
                return self._create_result_summary("success_early", total_duration)

            except Exception as e:
                print(f"❌ {agent.config.name} failed: {e}")
                logging.error(
                    json.dumps(
                        {
                            "event": "agent_failed_in_parallel",
                            "run_id": self.run_id,
                            "agent_name": agent.config.name,
                            "error": str(e),
                        }
                    )
                )
                # Continue with other agents

    # Final consensus check
    logging.info(json.dumps({"event": "final_consensus_check_started", "run_id": self.run_id}))
    final_consensus = self._check_for_consensus()

    total_duration = time.time() - start_time

    if final_consensus:
        print(f"\n🎯 Final consensus reached: {final_consensus}")
        self.consensus_reached = True
        self.final_answer = final_consensus
        return self._create_result_summary("success", total_duration)

    # Fallback to last valid result
    return self._create_fallback_result(total_duration)
task[:300] + "..." if len(task) > 300 else task, + } + ) + ) + print(f"πŸ” FOUND TYPE ERROR in {agent.config.name}: {error_str}") + else: + logging.error( + json.dumps( + { + "event": "agent_failed", + "run_id": self.run_id, + "agent_name": agent.config.name, + "error": error_str, + "error_type": type(e).__name__, + "duration_seconds": round(duration, 2), + } + ) + ) + + print(f"❌ {agent.config.name} failed: {e}") + + return { + "agent_name": agent.config.name, + "status": "error", + "result": f"Error: {e}", + "duration": duration, + "error": str(e), + } + + def _check_for_consensus(self) -> Optional[str]: + """Check if agents reached consensus through polling. + + IMPORTANT: This method ensures that when multiple polls reach consensus, + the answer from the FIRST poll (chronologically) is returned, not just + any successful poll. This maintains consistency with the FinalAnswerTool + behavior and ensures reproducible results. + + Processing order: + 1. First check for any existing final_answer messages (from previously finalized polls) + 2. Then check active polls in chronological order (oldest created first) + 3. 
Return the answer from the first successful poll encountered + + Returns: + str: The final answer from the first successful poll, or None if no consensus + """ + logging.info(json.dumps({"event": "consensus_check_started", "run_id": self.run_id})) + + try: + # First, look for any existing finalized polls with final answers + # These are final answers from polls that already achieved voting threshold + all_messages = list(self.message_store._iter_messages()) + logging.info( + json.dumps({"event": "messages_retrieved", "run_id": self.run_id, "message_count": len(all_messages)}) + ) + + # Look for the first final_answer message (first poll that achieved threshold) + for msg in all_messages: + if isinstance(msg, dict): + content = msg.get("content", {}) + if isinstance(content, dict) and content.get("type") == "final_answer": + answer = content.get("answer", "") + poll_id = content.get("poll_id", "unknown") + logging.info( + json.dumps( + { + "event": "existing_final_answer_found", + "run_id": self.run_id, + "message_id": msg.get("id"), + "poll_id": poll_id, + "answer": str(answer)[:200], + "note": "first_finalized_poll_in_message_history", + } + ) + ) + print(f"βœ… Using answer from first finalized poll: {answer}") + return answer + + # Check for active polls that might need finalization + # IMPORTANT: We check all active polls to see which one FIRST achieves + # the voting threshold (N//2+1), regardless of creation order + active_polls = self.message_store.get_active_polls() + + print(f"πŸ” Found {len(active_polls)} active polls") + logging.info( + json.dumps( + { + "event": "active_polls_check", + "run_id": self.run_id, + "poll_count": len(active_polls), + "processing_strategy": "first_to_achieve_threshold", + } + ) + ) + + first_successful_answer = None + first_successful_poll_id = None + + for poll in active_polls: + if isinstance(poll, dict): + poll_id = poll.get("poll_id") + print(f"πŸ—³οΈ Checking if poll {poll_id} has achieved voting threshold") + 
logging.info( + json.dumps( + { + "event": "poll_threshold_check", + "run_id": self.run_id, + "poll_id": poll_id, + "poll_question": poll.get("question", ""), + } + ) + ) + + if poll_id: + # Check if this poll has achieved the voting threshold + result = self.message_store.finalize_poll_if_ready(poll_id) + print(f"πŸ“Š Poll {poll_id} finalization result: {result}") + logging.info( + json.dumps( + { + "event": "poll_finalization_attempted", + "run_id": self.run_id, + "poll_id": poll_id, + "result": str(result)[:200] if result else None, + } + ) + ) + + if result and not result.get("deleted"): + answer_content = result.get("content", {}) + if isinstance(answer_content, dict): + answer = answer_content.get("answer", "") + if first_successful_answer is None: + # This is the FIRST poll to achieve the voting threshold + first_successful_answer = answer + first_successful_poll_id = poll_id + print(f"βœ… First poll to achieve consensus: {poll_id} -> {answer}") + logging.info( + json.dumps( + { + "event": "first_poll_achieved_threshold", + "run_id": self.run_id, + "poll_id": poll_id, + "answer": str(answer)[:200], + "strategy": "first_to_reach_vote_threshold", + } + ) + ) + else: + # Another poll also achieved threshold, but we keep the first one + print( + f"ℹ️ Poll {poll_id} also achieved consensus, but {first_successful_poll_id} was first" + ) + logging.info( + json.dumps( + { + "event": "subsequent_poll_achieved_threshold", + "run_id": self.run_id, + "poll_id": poll_id, + "first_successful_poll": first_successful_poll_id, + "answer_ignored": str(answer)[:200], + } + ) + ) + elif result and result.get("deleted"): + print(f"πŸ—‘οΈ Poll {poll_id} was deleted due to insufficient support") + logging.info( + json.dumps( + { + "event": "poll_deleted", + "run_id": self.run_id, + "poll_id": poll_id, + "reason": "insufficient_support", + } + ) + ) + + # Return the answer from the first poll that achieved the voting threshold + if first_successful_answer is not None: + 
logging.info( + json.dumps( + { + "event": "returning_first_threshold_answer", + "run_id": self.run_id, + "winning_poll_id": first_successful_poll_id, + "answer": str(first_successful_answer)[:200], + } + ) + ) + return first_successful_answer + + except Exception as e: + print(f"⚠️ Error checking consensus: {e}") + logging.error( + json.dumps( + { + "event": "consensus_check_error", + "run_id": self.run_id, + "error": str(e), + "error_type": type(e).__name__, + } + ) + ) + + logging.info( + json.dumps({"event": "consensus_check_completed", "run_id": self.run_id, "result": "no_consensus"}) + ) + return None + + def _create_result_summary(self, status: str, duration: float) -> Dict[str, Any]: + """Create a comprehensive result summary.""" + logging.info( + json.dumps( + { + "event": "run_completed", + "run_id": self.run_id, + "status": status, + "duration_seconds": round(duration, 2), + } + ) + ) + + return { + "status": status, + "answer": self.final_answer, + "consensus_reached": self.consensus_reached, + "total_duration": round(duration, 2), + "agent_results": self.results, + "run_id": self.run_id, + "agent_count": len(self.agents), + "successful_agents": len([r for r in self.results if r.get("status") == "success"]), + "failed_agents": len([r for r in self.results if r.get("status") == "error"]), + } + + def _create_fallback_result(self, duration: float) -> Dict[str, Any]: + """Create result when no consensus is reached.""" + # Fallback to last valid result + valid_results = [r for r in self.results if r.get("status") == "success"] + + if valid_results: + answer = valid_results[-1]["result"] # Use the last valid result + print(f"\nπŸ“ Final result (fallback): {answer}") + logging.info( + json.dumps( + { + "event": "fallback_result_used", + "run_id": self.run_id, + "result": str(answer)[:200], + "valid_result_count": len(valid_results), + } + ) + ) + + self.final_answer = str(answer) + return self._create_result_summary("success_fallback", duration) + else: + 
class ConsensusProtocol:
    """Detect an emerging consensus among answer proposals in a message log.

    Proposals (``type == "final_answer_proposal"``) collect yes/no votes
    (``type == "vote"``).  Each proposal with at least 50% "yes" votes is
    scored from its agreement ratio, the highest confidence reported by a
    voter, and how well its text matches the formatting shared by the
    proposals.  The best-scoring proposal is the consensus if its score
    exceeds ``confidence_threshold``.
    """

    # An integer or a decimal number.  The fractional part needs at least one
    # digit so a sentence-ending period ("the answer is 5.") is not mistaken
    # for a decimal point.
    _NUMBER_RE = re.compile(r"\d+(?:\.\d+)?")

    def __init__(self):
        self.local_state = {}
        self.confidence_threshold = 0.7
        self.adaptation_rate = 0.2

    def analyze_conversation(self, messages: List[Dict]) -> Tuple[bool, Optional[str], float]:
        """Analyze conversation to detect emerging consensus.

        Args:
            messages: Message dicts in conversation order.  A vote only
                counts if its proposal appeared earlier in the list.

        Returns:
            ``(reached, content, score)``: ``content`` is the winning
            proposal's content when ``reached`` is true, otherwise None.
            ``score`` is the best candidate's score, or 0 when there is no
            candidate.  Among equally scored candidates the earliest
            proposal wins.
        """
        # proposal_id -> {content, votes, confidence, timestamp}
        proposals = {}

        for msg in messages:
            msg_type = msg.get("type")
            if msg_type == "final_answer_proposal":
                proposals[msg["id"]] = {
                    "content": msg["content"],
                    "votes": {"yes": 0, "no": 0},
                    "confidence": 0,
                    "timestamp": msg["timestamp"],
                }
            elif msg_type == "vote":
                content = msg.get("content")
                if not isinstance(content, dict):
                    continue  # malformed vote: nothing to count
                pid = content.get("proposal_id")
                if pid not in proposals:
                    continue
                vote = content.get("vote")
                if vote not in ("yes", "no"):
                    continue
                proposal = proposals[pid]
                proposal["votes"][vote] += 1
                confidence = content.get("confidence", 0)
                # Ignore a missing/None/non-numeric confidence instead of
                # letting max() fail on it.
                if isinstance(confidence, (int, float)):
                    proposal["confidence"] = max(proposal["confidence"], confidence)

        if not proposals:
            return False, None, 0

        # Analyze emerging patterns
        patterns = self._detect_patterns([p["content"] for p in proposals.values()])

        # Pick the strongest candidate by score alone.  Contents are never
        # compared, so ties keep the earliest proposal.
        best_score = None
        best_content = None
        for prop in proposals.values():
            votes = prop["votes"]
            total_votes = votes["yes"] + votes["no"]
            if total_votes == 0:
                continue

            agreement = votes["yes"] / total_votes
            if agreement < 0.5:  # Need at least 50% agreement
                continue

            score = self._calculate_consensus_score(
                agreement, prop["confidence"], self._matches_pattern(prop["content"], patterns), prop["timestamp"]
            )
            if best_score is None or score > best_score:
                best_score, best_content = score, prop["content"]

        if best_score is None:
            return False, None, 0

        # Need a minimum consensus strength
        if best_score > self.confidence_threshold:
            return True, best_content, best_score

        return False, None, best_score

    def _detect_patterns(self, proposals: List[str]) -> List[Dict]:
        """Detect emerging patterns in proposals.

        Returns at most two pattern dicts: a ``"number"`` pattern saying
        whether most numbers in the proposals are decimals or integers, and
        a ``"unit"`` pattern holding the most common text after the first
        space-separated token.
        """
        from collections import Counter

        patterns = []

        # Number format pattern
        number_formats = []
        for p in proposals:
            number_formats.extend(self._NUMBER_RE.findall(p))

        if number_formats:
            # Check if numbers tend to be integers or decimals
            decimals = sum(1 for n in number_formats if "." in n)
            number_format = "decimal" if decimals > len(number_formats) / 2 else "integer"
            patterns.append({"type": "number", "format": number_format})

        # Unit format pattern: everything after the first token
        unit_formats = []
        for p in proposals:
            parts = p.strip().split(" ")
            if len(parts) > 1:
                unit_formats.append(" ".join(parts[1:]))

        if unit_formats:
            # Find most common unit format
            common_unit = Counter(unit_formats).most_common(1)[0][0]
            patterns.append({"type": "unit", "format": common_unit})

        return patterns

    def _matches_pattern(self, content: str, patterns: List[Dict]) -> float:
        """Calculate how well content matches detected patterns.

        Returns the fraction of ``patterns`` that ``content`` satisfies, or
        0.5 (neutral) when there are no patterns.
        """
        if not patterns:
            return 0.5  # Neutral when no patterns

        matches = 0
        for pattern in patterns:
            if pattern["type"] == "number":
                numbers = self._NUMBER_RE.findall(content)
                if numbers:
                    # Only the first number in the content is considered
                    is_decimal = "." in numbers[0]
                    if pattern["format"] == "decimal" and is_decimal:
                        matches += 1
                    elif pattern["format"] == "integer" and not is_decimal:
                        matches += 1

            elif pattern["type"] == "unit":
                parts = content.strip().split(" ")
                if len(parts) > 1 and " ".join(parts[1:]) == pattern["format"]:
                    matches += 1

        return matches / len(patterns)

    def _calculate_consensus_score(
        self, agreement: float, confidence: float, pattern_match: float, timestamp: str
    ) -> float:
        """Calculate overall consensus score combining multiple factors.

        ``timestamp`` is accepted but not used yet: recency is a constant.
        """
        recency = 1.0  # Could factor in timestamp if needed

        # Combine factors with weights
        weights = {"agreement": 0.4, "confidence": 0.3, "pattern_match": 0.2, "recency": 0.1}

        return (
            agreement * weights["agreement"]
            + confidence * weights["confidence"]
            + pattern_match * weights["pattern_match"]
            + recency * weights["recency"]
        )
def main() -> None:
    """Render an ``index.html`` next to every ``<run>/messages.jsonl``.

    Command-line arguments:
        --runs-dir: directory whose direct subfolders contain
            ``messages.jsonl`` (required).
        --title-prefix: prefix for each page title; the run folder name is
            appended (default ``"Run"``).
        --force: overwrite ``index.html`` files that already exist.

    Runs are processed in sorted path order so the output is reproducible.
    The final summary distinguishes "nothing found" from "everything was
    skipped because index.html already exists".

    Raises:
        SystemExit: if ``--runs-dir`` does not exist or is not a directory
            (argparse also exits on invalid arguments).
    """
    parser = argparse.ArgumentParser(description="Convert all run messages.jsonl to HTML")
    parser.add_argument(
        "--runs-dir",
        required=True,
        help="Path to runs directory containing subfolders with messages.jsonl",
    )
    parser.add_argument(
        "--title-prefix",
        default="Run",
        help="Prefix for HTML title; run folder name is appended",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Overwrite existing index.html files if present",
    )
    args = parser.parse_args()

    runs_dir = Path(args.runs_dir)
    # is_dir() is already False for a path that does not exist
    if not runs_dir.is_dir():
        raise SystemExit(f"Runs dir not found or not a directory: {runs_dir}")

    renderer = MessagesHtmlRenderer()
    found = 0
    converted = 0
    for messages_file in sorted(runs_dir.glob("*/messages.jsonl")):
        found += 1
        run_folder = messages_file.parent
        out_file = run_folder / "index.html"
        if out_file.exists() and not args.force:
            print(f"Skipping {run_folder.name}: index.html exists (use --force to overwrite)")
            continue
        # The renderer is reused; only its title changes per run
        renderer.title = f"{args.title_prefix} {run_folder.name}"
        html = renderer.render_file(messages_file)
        out_file.write_text(html, encoding="utf-8")
        print(f"Wrote {out_file}")
        converted += 1

    if found == 0:
        print("No messages.jsonl files found.")
    elif converted == 0:
        print(f"Nothing converted: all {found} run(s) already have index.html (use --force to overwrite).")
    else:
        print(f"Converted {converted} run(s).")
def main() -> None:
    """Command-line entry point: render one ``messages.jsonl`` file as HTML.

    Reads ``--input``, ``--output`` and ``--title`` from the command line,
    renders the input with ``MessagesHtmlRenderer`` and writes the page as
    UTF-8, creating the output file's parent directories when missing.
    """
    cli = argparse.ArgumentParser(description="Convert messages.jsonl to HTML")
    option_table = (
        ("--input", {"required": True, "help": "Path to messages.jsonl"}),
        ("--output", {"required": True, "help": "Path to write HTML file"}),
        ("--title", {"default": "Run Messages", "help": "HTML title"}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    opts = cli.parse_args()

    # Render first: nothing is created on disk if rendering fails
    page = MessagesHtmlRenderer(title=opts.title).render_file(opts.input)

    destination = Path(opts.output)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(page, encoding="utf-8")
    print(f"Wrote HTML to {destination}")
"session_logininfo=AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0%3AQUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753004444.745411, + "hostOnly": False, + "httpOnly": True, + "name": "__Secure-YEC", + "path": "/", + "sameSite": "lax", + "secure": True, + "session": False, + "storeId": None, + "value": "CgtRVnI5LW1zRHlQVSjbtNCzBjIhCgJGUhIbEhcSFRMLFBUWFwwYGRobHB0eHw4PIBAREiAk", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753434620.050824, + "hostOnly": False, + "httpOnly": True, + "name": "__Secure-3PSID", + "path": "/", + "sameSite": "no_restriction", + "secure": True, + "session": False, + "storeId": None, + "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB4ezJ_bdWu46a7YwObVn44wACgYKAakSARQSFQHGX2MicJcTzecTKH6bHzqU6TMbTxoVAUF8yKqQYK-MoI6Ql3vI2oYTB3E-0076", + }, + { + "domain": ".youtube.com", + "expirationDate": 1750420959.974642, + "hostOnly": False, + "httpOnly": False, + "name": "SIDCC", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "AKEyXzWQZauHKOo8t87zoEcjaVNIYUX54ohoWXT-tX4aAhEuZzIIptxZAcNkHuG2oDXYL6t-lw", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753434620.050652, + "hostOnly": False, + "httpOnly": False, + "name": "SID", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB6VHrZcC3gBAsFPbCQ0gF5AACgYKAYkSARQSFQHGX2Mi9kt0gHg5CxCYSkLQGHWaeBoVAUF8yKre_V6r3jZVak6JV4o2Q0FL0076", + }, + { + "domain": ".youtube.com", + "expirationDate": 1750420958.397534, + "hostOnly": False, + "httpOnly": True, + "name": "__Secure-1PSIDTS", + "path": "/", + "sameSite": None, + "secure": True, + 
"session": False, + "storeId": None, + "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753433494.44729, + "hostOnly": False, + "httpOnly": False, + "name": "_ga_M0180HEFCY", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "GS1.1.1718871908.1.0.1718873494.0.0.0", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753434620.050933, + "hostOnly": False, + "httpOnly": False, + "name": "SAPISID", + "path": "/", + "sameSite": None, + "secure": True, + "session": False, + "storeId": None, + "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6", + }, + { + "domain": ".youtube.com", + "expirationDate": 1750420959.974764, + "hostOnly": False, + "httpOnly": True, + "name": "__Secure-1PSIDCC", + "path": "/", + "sameSite": None, + "secure": True, + "session": False, + "storeId": None, + "value": "AKEyXzWHDSoXGCZpZhPxRrnC7B1s8zGIUjeMVyvgtQfsm1fs92lXPtFEI_td9LBUyqVUe0xK", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753434620.050881, + "hostOnly": False, + "httpOnly": True, + "name": "SSID", + "path": "/", + "sameSite": None, + "secure": True, + "session": False, + "storeId": None, + "value": "AmlwXHnQvOQ10LVd-", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753434620.050959, + "hostOnly": False, + "httpOnly": False, + "name": "__Secure-1PAPISID", + "path": "/", + "sameSite": None, + "secure": True, + "session": False, + "storeId": None, + "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753434620.050795, + "hostOnly": False, + "httpOnly": True, + "name": "__Secure-1PSID", + "path": "/", + "sameSite": None, + "secure": True, + "session": False, + "storeId": None, + "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBBrlk7lRpKQGywAHEon7WGQAACgYKAQsSARQSFQHGX2MirAmnSRdZl6GPG6KLd4hOihoVAUF8yKoV17Tcj1a_OenIOkf2wBjO0076", + }, + { + 
"domain": ".youtube.com", + "expirationDate": 1753434620.050993, + "hostOnly": False, + "httpOnly": False, + "name": "__Secure-3PAPISID", + "path": "/", + "sameSite": "no_restriction", + "secure": True, + "session": False, + "storeId": None, + "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6", + }, + { + "domain": ".youtube.com", + "expirationDate": 1750420959.974815, + "hostOnly": False, + "httpOnly": True, + "name": "__Secure-3PSIDCC", + "path": "/", + "sameSite": "no_restriction", + "secure": True, + "session": False, + "storeId": None, + "value": "AKEyXzXM5UjKUEXwSHVmRAIo6hGHA4G63adj3EE1VdNriD0f38jZQbsUKiD4LQbA3BValmTFDg", + }, + { + "domain": ".youtube.com", + "expirationDate": 1750420958.397647, + "hostOnly": False, + "httpOnly": True, + "name": "__Secure-3PSIDTS", + "path": "/", + "sameSite": "no_restriction", + "secure": True, + "session": False, + "storeId": None, + "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753434620.050908, + "hostOnly": False, + "httpOnly": False, + "name": "APISID", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "IlQWLPjdNqziwCrV/ANG7Z4x5FF-IBxbZk", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753434620.050855, + "hostOnly": False, + "httpOnly": True, + "name": "HSID", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "AasA7hmRuTFv7vjoq", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753435873.577793, + "hostOnly": False, + "httpOnly": True, + "name": "LOGIN_INFO", + "path": "/", + "sameSite": "no_restriction", + "secure": True, + "session": False, + "storeId": None, + "value": 
"AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0:QUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753444956.555608, + "hostOnly": False, + "httpOnly": False, + "name": "PREF", + "path": "/", + "sameSite": None, + "secure": True, + "session": False, + "storeId": None, + "value": "f4=4000000&f6=40000000&tz=Europe.Paris&f5=30000&f7=100", + }, +] + +COOKIES_LIST += [ + { + "domain": ".www.researchgate.net", + "hostOnly": False, + "httpOnly": True, + "name": "isInstIp", + "path": "/", + "sameSite": None, + "secure": True, + "session": True, + "storeId": None, + "value": "False", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1734423981, + "hostOnly": False, + "httpOnly": False, + "name": "__eoi", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "ID=c26f752377373146:T=1718871981:RT=1718884914:S=AA-AfjZw-T_OOX2kW2LLaFzXImgc", + }, + { + "domain": ".www.researchgate.net", + "expirationDate": 1753444909.646103, + "hostOnly": False, + "httpOnly": True, + "name": "ptc", + "path": "/", + "sameSite": None, + "secure": True, + "session": False, + "storeId": None, + "value": "RG1.8947708639250500550.1718872043", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1750507578, + "hostOnly": False, + "httpOnly": False, + "name": "euconsent-v2-didomi", + "path": "/", + "sameSite": "lax", + "secure": True, + "session": False, + "storeId": None, + "value": 
"CQAgmoAQAgmoAAHABBENA5EsAP_gAEPgAAYgJ2pB5G5UTWlBIG53YMskIAUFhFBoQEAgAACAAwIBSBIAIIwEAGAAIAgAICACAAIAIBIAIABAGAAAAAAAYIAAIAAIAAAQIAAKIAAAAAAAAgBQAAgIAgggEAAAgEBEABAAgAAAEIIAQNgACgAAACCAAAAAAAABAAAAAAAAQAAAAAAAYCQAAAJIAAAAACAIABAIAAAAAAAAAAAAAAAABBAAIJ2wPIAFAAXABQAFQALgAcAA8ACAAEgALwAZAA0ACIAEcAJgAUgAqgBcADEAGgAPQAfgBEACOAE4AMMAZYA0QBsgDkAHOAO4AfsBBwEIAItARwBHQC6gHUAO2Ae0A_4CHQEXgJ2AUOAo8BT4CpQFqALYAXmAwQBkgDLAGXANjAhCBG8CbAE3gJ1gTtAA.f_wACHwAAAAA", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1718885236, + "hostOnly": False, + "httpOnly": False, + "name": "_gat", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "1", + }, + { + "domain": "www.researchgate.net", + "expirationDate": 1721477183, + "hostOnly": True, + "httpOnly": False, + "name": "_pbjs_userid_consent_data", + "path": "/", + "sameSite": "lax", + "secure": False, + "session": False, + "storeId": None, + "value": "3524755945110770", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1752567981, + "hostOnly": False, + "httpOnly": False, + "name": "__gads", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "ID=eca2adb88969c830:T=1718871981:RT=1718884914:S=ALNI_MY2qZchynrhWX6hWMlaI87Pcj9riQ", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1718886709.646173, + "hostOnly": False, + "httpOnly": True, + "name": "__cf_bm", + "path": "/", + "sameSite": "no_restriction", + "secure": True, + "session": False, + "storeId": None, + "value": "IkQ_J4ciBzKQduRvjqsfSmQu8UygDWbHeROO5JVccfo-1718884909-1.0.1.1-qvNGEdbfI0HfhFP6kwe7R7mkTqODNhFuKhs72lLly6K2BOPMG3kbahpQFGvPK0U8FUfkznkq65gngd1sWj7sDA", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1752567981, + "hostOnly": False, + "httpOnly": False, + "name": "__gpi", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": 
"UID=00000e4e9aa2e6f2:T=1718871981:RT=1718884914:S=ALNI_MYFNrgzkKn7K6Bd2y8hC6GJCvDiSg", + }, + { + "domain": ".researchgate.net", + "hostOnly": False, + "httpOnly": True, + "name": "_cfuvid", + "path": "/", + "sameSite": "no_restriction", + "secure": True, + "session": True, + "storeId": None, + "value": "_GPmGZkBymiH3UiqTqzakEpi98br3nfFUWC2_u_wqkc-1718884909785-0.0.1.1-604800000", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1753445177.271667, + "hostOnly": False, + "httpOnly": False, + "name": "_ga", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "GA1.1.1525244793.1718885177", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1753445177.271482, + "hostOnly": False, + "httpOnly": False, + "name": "_ga_4P31SJ70EJ", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "GS1.1.1718885177.1.0.1718885177.0.0.0", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1718971576, + "hostOnly": False, + "httpOnly": False, + "name": "_gid", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "GA1.2.854907463.1718885177", + }, + { + "domain": ".www.researchgate.net", + "expirationDate": 1750407982.506505, + "hostOnly": False, + "httpOnly": True, + "name": "did", + "path": "/", + "sameSite": None, + "secure": True, + "session": False, + "storeId": None, + "value": "1dWLO3C6am8l667Q4VUlBo0O1LI49Qi2Vw21SJEXHavBDYT56DI9007W5rYGVFVH", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1750507578, + "hostOnly": False, + "httpOnly": False, + "name": "didomi_token", + "path": "/", + "sameSite": "lax", + "secure": True, + "session": False, + "storeId": None, + "value": 
"eyJ1c2VyX2lkIjoiMTkwMzU4YTUtNWU2My02Y2UzLWJlNzAtZGFjNzVmYjdiY2ExIiwiY3JlYXRlZCI6IjIwMjQtMDYtMjBUMTI6MDY6MTYuODA2WiIsInVwZGF0ZWQiOiIyMDI0LTA2LTIwVDEyOjA2OjE4Ljc4MVoiLCJ2ZW5kb3JzIjp7ImVuYWJsZWQiOlsidHdpdHRlciIsImdvb2dsZSIsImM6bGlua2VkaW4tbWFya2V0aW5nLXNvbHV0aW9ucyIsImM6b3duZXJpcSIsImM6b21uaXR1cmUtYWRvYmUtYW5hbHl0aWNzIiwiYzp0ZWNobm9yYXRpLW1lZGlhIiwiYzppbnRlcmNvbSIsImM6aW50ZW50LWlxIiwiYzppcHJvbSIsImM6bGlua2VkaW4iLCJjOmFtYXpvbmFkdi16Y1hGTEI2WCIsImM6bWVkaWFuZXQtY1V3YUtFNnoiLCJjOmluZGV4ZXhjaC1OWkNRTTY4UCIsImM6emVvdGFwZ21iLWQ3YndtdGp3IiwiYzp0cmlwbGVsaWYtZGRKSDM0clkiLCJjOnJ0YmhvdXNlLWI4Y2RIOHRNIiwiYzptZHByaW1pcy1lYU4yOVdjUCIsImM6bG9vcG1lbGktVGRhWXRCUHEiLCJjOm1hZ25pdGVpbi05d1RZTHFSRCIsImM6Ymlkc3dpdGNoLWQ2N0V3N1c5IiwiYzpvcmFjbGVhZHYtcUhlREptQUwiLCJjOmdvb2dsZWFuYS00VFhuSmlnUiIsImM6bG90YW1lc29sLURIaTdMUmpNIiwiYzpuZXh0bWlsbGUtR0pyZlg4VWMiLCJjOm5yaWNodGVjLXFVVlEyUlFxIiwiYzpicml0ZXBvb2wtQldWeVdHeVUiLCJjOnRhcGFkaW5jLXFxY2tVN1BXIiwiYzppZDV0ZWNobi16Tk1KNGR3ZiIsImM6bWljcm9zb2Z0IiwiYzpwZXJtdXRpdmUtSjdpaHJlTWsiLCJjOm9wZXJhc29mdC1CY1hjRFZKTSIsImM6cG9zdGhvZy1Cakp4RmRGOSJdfSwicHVycG9zZXMiOnsiZW5hYmxlZCI6WyJnZW9sb2NhdGlvbl9kYXRhIiwiZGV2aWNlX2NoYXJhY3RlcmlzdGljcyJdfSwidmVuZG9yc19saSI6eyJlbmFibGVkIjpbImdvb2dsZSIsImM6b3BlcmFzb2Z0LUJjWGNEVkpNIl19LCJ2ZXJzaW9uIjoyLCJhYyI6IkRIU0FvQUZrQWNnQTVnSHFnUUhBeGdCNndEMTRJR0FRTkFqMEJJd0NTY0VyQUtCd1YtZ3MxQmgwREc0R09nQUEuREhTQW9BRmtBY2dBNWdIcWdRSEF4Z0I2d0QxNElHQVFOQWowQkl3Q1NjRXJBS0J3Vi1nczFCaDBERzRHT2dBQSJ9", + }, + { + "domain": ".www.researchgate.net", + "hostOnly": False, + "httpOnly": True, + "name": "hasPdpNext", + "path": "/", + "sameSite": None, + "secure": True, + "session": True, + "storeId": None, + "value": "False", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1750421183, + "hostOnly": False, + "httpOnly": False, + "name": "ph_phc_ma1XTQyee96N1GML6qUTgLQRiDifnRcE9STiHTZ0CfZ_posthog", + "path": "/", + "sameSite": "lax", + "secure": True, + "session": False, + "storeId": None, + "value": 
"%7B%22distinct_id%22%3A%220190358a-56a1-7313-83b0-d13dddeac787%22%2C%22%24sesid%22%3A%5B1718885183223%2C%220190358a-56a1-7313-83b0-d13b2b87778d%22%2C1718885176993%5D%2C%22%24session_is_sampled%22%3Atrue%7D", + }, + { + "domain": ".www.researchgate.net", + "hostOnly": False, + "httpOnly": True, + "name": "sid", + "path": "/", + "sameSite": None, + "secure": True, + "session": True, + "storeId": None, + "value": "qmH5Lc4f0CUJ3zeaxORcV0S8I8V1MuCFZtcIQqPYtv1XPejrbSLAQRbT50PL40TqeKQ1XsQDWt9gtYVzuL80bRmPjw6jn3cQ0ikNqW40maHcQ3JL2Vfa8ZZf0j7p35eJ", + }, +] + +COOKIES_LIST += [ + { + "domain": "github.com", + "hostOnly": True, + "httpOnly": True, + "name": "_gh_sess", + "path": "/", + "sameSite": "lax", + "secure": True, + "session": True, + "storeId": None, + "value": "P%2Fmof1avuqwHaUQUIJR%2FZYn7jqbT7lgGuTGjp1BGAFIG5UpNDusEE3b8dRjz0eATE5xPdPjLYFqMs%2FI9AOalKX4YuYfSEEnxCMawU01099b4o9Xzzcv%2BmecrmO0Q8q%2Bdq1h8SIv6nvPP7HzlFesl8ysafb9b%2F0q6dTArKdSOurasza8UgLSYD08ofA50Pcm0IG7CTzF8ZCizrGgGTMi%2F%2B7L3E17jav5PM1Sf2vQKg15Gbg1QIOppJJHzlufgQoZigqFv%2BWznaws0Tt7Y2lSFCw%3D%3D--CJRhqMXJnwOaJgk4--DhUErlL4GdROikEjKD4O9g%3D%3D", + }, + { + "domain": ".github.com", + "expirationDate": 1750408875.763785, + "hostOnly": False, + "httpOnly": False, + "name": "_octo", + "path": "/", + "sameSite": "lax", + "secure": True, + "session": False, + "storeId": None, + "value": "GH1.1.728652011.1718872875", + }, + { + "domain": ".github.com", + "expirationDate": 1750408875.763926, + "hostOnly": False, + "httpOnly": True, + "name": "logged_in", + "path": "/", + "sameSite": "lax", + "secure": True, + "session": False, + "storeId": None, + "value": "no", + }, + { + "domain": ".github.com", + "hostOnly": False, + "httpOnly": False, + "name": "preferred_color_mode", + "path": "/", + "sameSite": "lax", + "secure": True, + "session": True, + "storeId": None, + "value": "dark", + }, + { + "domain": ".github.com", + "hostOnly": False, + "httpOnly": False, + "name": "tz", + "path": "/", + "sameSite": "lax", + 
"secure": True, + "session": True, + "storeId": None, + "value": "Europe%2FParis", + }, +] + +COOKIES_LIST += [ + { + "domain": ".web.archive.org", + "expirationDate": 1718886430, + "hostOnly": False, + "httpOnly": False, + "name": "_gat", + "path": "/web/20201123221659/http://orcid.org/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "1", + }, + { + "domain": ".web.archive.org", + "expirationDate": 1718972770, + "hostOnly": False, + "httpOnly": False, + "name": "_gid", + "path": "/web/20201123221659/http://orcid.org/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "GA1.2.402246368.1606169825", + }, + { + "domain": ".web.archive.org", + "expirationDate": 1753446370.315621, + "hostOnly": False, + "httpOnly": False, + "name": "_ga", + "path": "/web/20201123221659/http://orcid.org/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "GA1.2.1301409987.1606169825", + }, + { + "domain": ".web.archive.org", + "expirationDate": 1750422367, + "hostOnly": False, + "httpOnly": False, + "name": "_hjid", + "path": "/web/20201123221659/http://orcid.org/", + "sameSite": "lax", + "secure": False, + "session": False, + "storeId": None, + "value": "07f80263-a631-4bf4-8ffd-8fc8912085e2", + }, + { + "domain": ".web.archive.org", + "expirationDate": 1718888167, + "hostOnly": False, + "httpOnly": False, + "name": "_hjFirstSeen", + "path": "/web/20201123221659/http://orcid.org/", + "sameSite": "lax", + "secure": False, + "session": False, + "storeId": None, + "value": "1", + }, +] +COOKIES_LIST += [ + { + "domain": "orcid.org", + "hostOnly": True, + "httpOnly": False, + "name": "AWSELBCORS", + "path": "/", + "sameSite": "no_restriction", + "secure": True, + "session": True, + "storeId": None, + "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F", + }, + { + "domain": 
".orcid.org", + "expirationDate": 1753452454.637671, + "hostOnly": False, + "httpOnly": False, + "name": "_ga_9R61FWK9H5", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "GS1.1.1718892454.1.0.1718892454.0.0.0", + }, + { + "domain": ".orcid.org", + "expirationDate": 1753452454.63421, + "hostOnly": False, + "httpOnly": False, + "name": "_ga", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "GA1.1.2021310691.1718892455", + }, + { + "domain": "orcid.org", + "hostOnly": True, + "httpOnly": False, + "name": "AWSELB", + "path": "/", + "sameSite": None, + "secure": False, + "session": True, + "storeId": None, + "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F", + }, + { + "domain": ".orcid.org", + "expirationDate": 1750428454, + "hostOnly": False, + "httpOnly": False, + "name": "OptanonAlertBoxClosed", + "path": "/", + "sameSite": "lax", + "secure": False, + "session": False, + "storeId": None, + "value": "2024-06-20T14:07:34.583Z", + }, + { + "domain": ".orcid.org", + "expirationDate": 1750428454, + "hostOnly": False, + "httpOnly": False, + "name": "OptanonConsent", + "path": "/", + "sameSite": "lax", + "secure": False, + "session": False, + "storeId": None, + "value": "isGpcEnabled=0&datestamp=Thu+Jun+20+2024+16%3A07%3A34+GMT%2B0200+(heure+d%E2%80%99%C3%A9t%C3%A9+d%E2%80%99Europe+centrale)&version=202310.2.0&browserGpcFlag=0&isIABGlobal=False&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0004%3A1", + }, + { + "domain": "orcid.org", + "hostOnly": True, + "httpOnly": False, + "name": "XSRF-TOKEN", + "path": "/", + "sameSite": None, + "secure": True, + "session": True, + "storeId": None, + "value": "6957be7a-bcb4-4d59-a522-ea9b6b210ed9", + }, +] + +# Create a RequestsCookieJar instance +COOKIES = RequestsCookieJar() + +# Add cookies 
to the jar +for cookie in COOKIES_LIST: + COOKIES.set(cookie["name"], cookie["value"], domain=cookie["domain"], path=cookie["path"]) diff --git a/examples/decentralized_smolagents_benchmark/scripts/decentralized_tools.py b/examples/decentralized_smolagents_benchmark/scripts/decentralized_tools.py new file mode 100644 index 000000000..062b75b84 --- /dev/null +++ b/examples/decentralized_smolagents_benchmark/scripts/decentralized_tools.py @@ -0,0 +1,873 @@ +""" +Decentralized communication and polling tools for multi-agent collaboration. + +This module provides tools for agents to: +1. Send private messages to specific agents +2. Send messages to channels/threads for group discussion +3. Read messages and notifications +4. Create and vote on polls for decision making +5. Propose final answers through consensus mechanism +""" + +import re +from typing import Any, Dict, List, Optional + +from smolagents.tools import Tool + +from .message_store import MessageStore + + +# ============================================================================= +# MESSAGING TOOLS +# ============================================================================= + + +class SendMessageToAgent(Tool): + """Tool for sending private messages directly to specific agents.""" + + name = "send_message_to_agent" + description = """Send a private message directly to a specific agent for one-on-one communication. + Use this for sharing sensitive information, detailed technical discussions, or coordination that + doesn't need to involve the whole team. 
class CreateChannel(Tool):
    """Tool for creating a new channel for group discussions.

    A channel is created by appending a ``channel_created`` message to the
    message store under a freshly generated thread ID.
    """

    name = "create_channel"
    description = """Create a new channel for team discussions on specific topics.
    Channels help organize conversations by topic or participant group.
    Use this to establish dedicated spaces for focused discussions.
    A unique channel ID will be automatically generated and returned."""
    inputs = {
        "channel_subject": {
            "type": "string",
            "description": "Short topic or theme for the new channel (e.g., 'research', 'analysis', 'implementation')",
        },
        "channel_description": {
            "type": "string",
            "description": "Brief description of the channel's purpose and topic",
        },
        "initial_members": {
            "type": "string",
            "description": "Optional comma-separated list of agent names to initially notify about this channel",
            "nullable": True,
        },
    }
    output_type = "object"

    def __init__(self, message_store: MessageStore, agent_name: str):
        """Bind the tool to a message store and to the name of the agent using it."""
        super().__init__()
        self.message_store = message_store
        self.agent_name = agent_name

    def _slugify(self, subject: str) -> str:
        """Return an ID-friendly slug for ``subject``.

        Every character outside ``[a-zA-Z0-9_-]`` becomes ``-``, runs of ``-``
        are collapsed and leading/trailing ``-`` removed. A subject with no
        usable characters (e.g. ``"???"``) yields ``"channel"`` so the final
        channel ID never starts with a dangling hyphen.
        """
        slug = re.sub(r"[^a-zA-Z0-9\-_]", "-", subject.lower().strip())
        slug = re.sub(r"-+", "-", slug).strip("-")
        return slug or "channel"

    def forward(
        self, channel_subject: str, channel_description: str, initial_members: Optional[str] = None
    ) -> Dict[str, Any]:
        """Create a channel with an auto-generated unique ID and announce it.

        Args:
            channel_subject: Short topic; used as the readable prefix of the ID.
            channel_description: Purpose of the channel, echoed in the result.
            initial_members: Optional comma-separated agent names to notify.
                Blank entries are ignored; with no members the announcement is
                addressed to ``"@all"``.

        Returns:
            On success a dict with ``channel_id``, ``subject``, ``description``,
            ``creator``, ``initial_members`` and a human-readable ``message``.
            On any failure a dict with ``error`` and ``channel_id`` set to None.
        """
        try:
            import uuid

            # Readable slug plus the first 8 characters of a random UUID for uniqueness.
            channel_id = f"{self._slugify(channel_subject)}-{str(uuid.uuid4())[:8]}"

            members: List[str] = []
            if initial_members:
                members = [m.strip() for m in initial_members.split(",") if m.strip()]

            # The channel comes into existence through this creation message.
            channel_message = {
                "type": "channel_created",
                "channel_id": channel_id,
                "subject": channel_subject,
                "description": channel_description,
                "creator": self.agent_name,
                "initial_members": members,
            }

            self.message_store.append_message(
                sender=self.agent_name,
                content=channel_message,
                recipients=members or ["@all"],
                thread_id=channel_id,
                msg_type="channel_created",
            )

            member_info = f" with members: {', '.join(members)}" if members else ""

            return {
                "channel_id": channel_id,
                "subject": channel_subject,
                "description": channel_description,
                "creator": self.agent_name,
                "initial_members": members,
                "message": f"📢 Channel created: '{channel_id}' - {channel_description}{member_info}",
            }

        except Exception as e:
            # Tools report failures as data so the calling agent can react to them.
            return {"error": f"❌ Failed to create channel: {str(e)}", "channel_id": None}
If not provided, uses channel members or mentioned agents.", + "nullable": True, + }, + } + output_type = "object" + + def __init__(self, message_store: MessageStore, agent_name: str): + super().__init__() + self.message_store = message_store + self.agent_name = agent_name + + def forward(self, thread_id: str, message: str, recipients: Optional[str] = None) -> Dict[str, Any]: + """Send a message to the specified channel/thread with auto-creation.""" + try: + # Ensure thread_id is a string to prevent type errors + if not isinstance(thread_id, str): + thread_id = str(thread_id) + + # Process the thread_id to determine channel type and auto-create if needed + processed_channel_id, auto_recipients = self._process_channel_id(thread_id) + + # Process mentions in the message + mentioned_agents = self._extract_mentions(message) + + # Determine final recipient list + recipient_list = None + if recipients: + recipient_list = [r.strip() for r in recipients.split(",")] + elif mentioned_agents: + recipient_list = mentioned_agents + elif auto_recipients: + recipient_list = auto_recipients + # If no specific recipients, use @all for public channels + + result = self.message_store.append_message( + sender=self.agent_name, + content=message, + recipients=recipient_list, + thread_id=processed_channel_id, + msg_type="channel_message", + ) + + # Create info message about recipients/mentions + recipient_info = "" + if mentioned_agents: + recipient_info += f" (mentioning: {', '.join(mentioned_agents)})" + if recipient_list and recipient_list != mentioned_agents: + recipient_info += f" (recipients: {', '.join(recipient_list)})" + + return { + "channel_id": processed_channel_id, + "message_sent": message, + "recipients": recipient_list, + "mentions": mentioned_agents, + "message": f"πŸ“’ Message sent to #{processed_channel_id}{recipient_info}: {message[:50]}...", + "message_id": result.get("id"), + } + + except Exception as e: + return {"error": f"❌ Failed to send message to #{thread_id}: 
{str(e)}", "channel_id": None} + + def _process_channel_id(self, thread_id: str) -> tuple[str, Optional[List[str]]]: + """Process thread_id to determine channel and auto-create if needed.""" + + # Ensure thread_id is a string to avoid 'int is not iterable' errors + if not isinstance(thread_id, str): + thread_id = str(thread_id) + + # If thread_id contains agent names (has comma or ends with 'Agent'), it's an agent-based channel + if "," in thread_id or thread_id.endswith("Agent"): + # Extract agent names + if "," in thread_id: + agent_names = [name.strip() for name in thread_id.split(",") if name.strip()] + else: + agent_names = [thread_id.strip()] + + # Create a channel ID from agent names + channel_id = f"group-{'-'.join(sorted(agent_names)).lower()}" + + # Check if this agent-based channel exists + if not self._channel_exists(channel_id): + self._auto_create_channel( + channel_id=channel_id, + description=f"Private discussion group for: {', '.join(agent_names)}", + members=agent_names, + ) + + return channel_id, agent_names + + # Topic-based or standard channels + else: + # Check if the channel exists + if not self._channel_exists(thread_id): + # Auto-create topic-based channel + description = self._generate_topic_description(thread_id) + self._auto_create_channel( + channel_id=thread_id, + description=description, + members=None, # Public channel + ) + + return thread_id, None + + def _channel_exists(self, channel_id: str) -> bool: + """Check if a channel already exists by looking for channel creation messages.""" + try: + # Look for existing messages in this thread + existing_messages = self.message_store.get_thread_messages(channel_id) + return len(existing_messages) > 0 + except Exception: + return False + + def _auto_create_channel(self, channel_id: str, description: str, members: Optional[List[str]]) -> None: + """Auto-create a channel with given parameters.""" + try: + channel_message = { + "type": "channel_created", + "channel_id": channel_id, + 
"description": description, + "creator": self.agent_name, + "auto_created": True, + "initial_members": members or [], + } + + self.message_store.append_message( + sender="system", + content=channel_message, + recipients=members or ["@all"], + thread_id=channel_id, + msg_type="channel_created", + ) + + print(f"πŸ†• Auto-created channel: #{channel_id} - {description}") + + except Exception as e: + print(f"⚠️ Failed to auto-create channel #{channel_id}: {e}") + + def _generate_topic_description(self, topic: str) -> str: + """Generate a description for topic-based channels.""" + # Ensure topic is a string to prevent type errors + if not isinstance(topic, str): + topic = str(topic) + + topic_descriptions = { + "research": "Web research and information gathering", + "analysis": "Deep analysis and data examination", + "implementation": "Code development and implementation", + "hypothesis": "Hypothesis development and testing", + "documentation": "Document analysis and review", + "planning": "Project planning and coordination", + "testing": "Testing and validation discussions", + "review": "Code and content review sessions", + "brainstorm": "Brainstorming and ideation", + "debug": "Debugging and troubleshooting", + } + + # Try to match common topics + for key, desc in topic_descriptions.items(): + if key in topic.lower(): + return desc + + # Default description + return f"Discussion channel for {topic.replace('-', ' ').replace('_', ' ')}" + + def _extract_mentions(self, text: str) -> List[str]: + """Extract @mentions from message text.""" + return re.findall(r"@([A-Za-z0-9_-]+)", text or "") + + +# ============================================================================= +# MESSAGE READING TOOLS +# ============================================================================= + + +class ReadMessagesTool(Tool): + """Tool for reading messages addressed to the current agent.""" + + name = "read_messages" + description = """Read all new messages addressed to this agent 
including: + - Private messages sent directly to you + - Channel/thread messages where you were mentioned or that match your interests + - Poll notifications where your vote is needed + Returns a list of message objects with sender, content, thread, and type information. + Messages are marked as read after retrieval.""" + inputs = { + "since_timestamp": { + "type": "string", + "description": "Optional timestamp to get only messages after this time (ISO format). If not provided, gets recent unread messages.", + "nullable": True, + }, + "thread_id": { + "type": "string", + "description": "Optional channel/thread ID to filter messages from specific discussions", + "nullable": True, + }, + "mark_as_read": { + "type": "boolean", + "description": "Whether to mark messages as read after retrieving them (default: True)", + "nullable": True, + }, + } + output_type = "array" + + def __init__(self, message_store: MessageStore, agent_name: str): + super().__init__() + self.message_store = message_store + self.agent_name = agent_name + + def forward( + self, since_timestamp: Optional[str] = None, thread_id: Optional[str] = None, mark_as_read: bool = True + ) -> List[Dict[str, Any]]: + """Read all messages for this agent with enhanced filtering.""" + try: + messages = self.message_store.get_messages( + agent_id=self.agent_name, + last_seen_ts=since_timestamp, + thread_id=thread_id, + include_mentions=True, + include_private=True, + ) + + if not messages: + return [] + + # Format messages for display + formatted_messages = [] + for msg in messages: + formatted_msg = { + "id": msg.get("id"), + "sender": msg.get("sender"), + "type": msg.get("type", "message"), + "content": msg.get("content"), + "thread_id": msg.get("thread_id"), + "timestamp": msg.get("timestamp"), + "recipients": msg.get("recipients"), + "is_mention": f"@{self.agent_name}" in str(msg.get("content", "")), + "is_private": self.agent_name in msg.get("recipients", []), + } + formatted_messages.append(formatted_msg) + 
+ # Sort by timestamp (oldest first) + formatted_messages.sort(key=lambda m: m.get("timestamp", "")) + + return formatted_messages + + except Exception as e: + return [{"error": f"Failed to read messages: {str(e)}"}] + + +class ReadNotificationsTool(Tool): + """Tool for checking notifications including mentions, direct messages, and polls.""" + + name = "read_notifications" + description = """Check all notifications for this agent. Returns categorized notifications: + - mentions: Messages where you were mentioned with @YourName + - direct_messages: Private messages sent directly to you + - polls_needing_votes: Active polls where you haven't voted yet + - thread_updates: New activity in threads you're following + Use this to stay updated on important communications and required actions.""" + inputs = { + "since_timestamp": { + "type": "string", + "description": "Optional timestamp to get notifications since a specific time (ISO format)", + "nullable": True, + } + } + output_type = "object" + + def __init__(self, message_store: MessageStore, agent_name: str): + super().__init__() + self.message_store = message_store + self.agent_name = agent_name + + def forward(self, since_timestamp: Optional[str] = None) -> Dict[str, Any]: + """Get categorized notifications for this agent.""" + try: + notifications = self.message_store.get_notifications(agent_id=self.agent_name, since_ts=since_timestamp) + + # Add polls needing votes + active_polls = self.message_store.get_active_polls() + polls_needing_votes = [] + + for poll in active_polls: + poll_id = poll.get("poll_id") + if poll_id: + vote_info = self.message_store.count_votes(poll_id) + voters = vote_info.get("votes_by_voter", {}) + if self.agent_name not in voters: + polls_needing_votes.append( + { + "poll_id": poll_id, + "question": poll.get("question"), + "proposal": poll.get("proposal"), + "proposer": poll.get("proposer"), + "thread_id": poll.get("thread_id", "main"), + } + ) + + notifications["polls_needing_votes"] = 
class ListChannelsTool(Tool):
    """Tool that reports the discussion channels known to the message store."""

    name = "list_channels"
    description = """List all available discussion channels with their details including:
    - Channel IDs and subjects
    - Descriptions and creators
    - Member lists and activity information
    - Message counts and last activity timestamps
    Use this to see what discussion channels are available for communication."""
    inputs = {
        "include_inactive": {
            "type": "boolean",
            "description": "Include channels with no recent activity (default: False)",
            "nullable": True,
        },
        "since_timestamp": {
            "type": "string",
            "description": "Optional timestamp to filter channels by recent activity (ISO format)",
            "nullable": True,
        },
    }
    output_type = "array"

    def __init__(self, message_store: MessageStore, agent_name: str):
        """Remember the message store to query and the agent the listing is for."""
        super().__init__()
        self.agent_name = agent_name
        self.message_store = message_store

    def forward(self, include_inactive: bool = False, since_timestamp: Optional[str] = None) -> List[Dict[str, Any]]:
        """Return one summary dict per visible channel, most recently active first.

        Unless ``include_inactive`` is truthy, a channel is left out when its
        ``last_activity`` is not later than ``since_timestamp`` (if given) or
        when it has fewer than two messages. A failure is reported as a
        single ``{"error": ...}`` entry.
        """

        def is_filtered_out(info: Dict[str, Any]) -> bool:
            # The recency comparison is evaluated first, as before, even when
            # the result ends up being ignored because inactive channels are wanted.
            stale = bool(since_timestamp) and info.get("last_activity", "") <= since_timestamp
            if include_inactive:
                return False
            return stale or info.get("message_count", 0) < 2

        def summarize(channel_id: str, info: Dict[str, Any]) -> Dict[str, Any]:
            members = info.get("members", [])
            return {
                "channel_id": channel_id,
                "subject": info.get("subject", channel_id),
                "description": info.get("description", ""),
                "creator": info.get("creator", ""),
                "created_at": info.get("created_at", ""),
                "members": members,
                "member_count": len(members),
                "message_count": info.get("message_count", 0),
                "last_activity": info.get("last_activity", ""),
                "is_created_channel": info.get("is_created_channel", False),
                "is_member": self.agent_name in members,
            }

        try:
            catalog = self.message_store.get_channels_info(agent_id=self.agent_name)
            summaries = [
                summarize(channel_id, info) for channel_id, info in catalog.items() if not is_filtered_out(info)
            ]
            # Newest activity first
            return sorted(summaries, key=lambda row: row.get("last_activity", ""), reverse=True)
        except Exception as e:
            return [{"error": f"Failed to list channels: {str(e)}"}]
class CreateGeneralPollTool(Tool):
    """Tool for creating general polls for team decision making."""

    name = "create_general_poll"
    description = """Create a general poll to gather team consensus on intermediate decisions, approaches, or strategies.
    Use this for collaborative decision-making when you need team input on research directions, implementation approaches,
    or other non-final decisions."""
    inputs = {
        "question": {"type": "string", "description": "The question or decision to vote on"},
        "proposal": {"type": "string", "description": "Your proposed answer or approach"},
        "thread_id": {"type": "string", "description": "Thread ID for the poll (default: 'main')", "nullable": True},
    }
    output_type = "string"

    def __init__(self, message_store: MessageStore, agent_name: str):
        """Bind the tool to a message store and to the name of the proposing agent."""
        super().__init__()
        self.message_store = message_store
        self.agent_name = agent_name

    def forward(self, question: str, proposal: str, thread_id: Optional[str] = "main") -> str:
        """Create a general poll for team decision-making.

        Args:
            question: The question or decision to vote on.
            proposal: The proposed answer or approach.
            thread_id: Thread to attach the poll to. The input is declared
                nullable, so an explicit None (or empty string) falls back to
                ``"main"`` instead of being forwarded to the store.

        Returns:
            A confirmation string containing the poll ID and a preview of the
            question, or a failure string if the store raises.
        """
        try:
            result = self.message_store.create_poll(
                question=question,
                proposal=proposal,
                proposer=self.agent_name,
                thread_id=thread_id or "main",
            )

            poll_id = result.get("content", {}).get("poll_id", "unknown")
            return f"🗳️ General poll created (ID: {poll_id}): {question[:50]}..."

        except Exception as e:
            return f"❌ Failed to create poll: {str(e)}"
+ + CRITICAL FORMAT INSTRUCTIONS: + Always carefully follow the format required by the question. This could be for instance: + - For math problems: final_answer should be ONLY the number, expression, or mathematical result (e.g., "7", "3.14", "$50") + - For factual questions: final_answer should be ONLY the specific fact requested (e.g., "1925", "John Smith", "Paris") + - For yes/no questions: final_answer should be ONLY "Yes" or "No" + - Do NOT include explanations, reasoning, or phrases like "The answer is..." in final_answer + - Put all explanations in supporting_evidence, not in final_answer + """ + inputs = { + "final_answer": { + "type": "string", + "description": "ONLY the core answer - no explanations or reasoning. For math: just the number/result. For facts: just the specific requested information. For yes/no: just 'Yes' or 'No'.", + }, + "supporting_evidence": { + "type": "string", + "description": "Supporting evidence, reasoning, calculations, or sources for your answer. Include all explanations HERE, not in final_answer.", + "nullable": True, + }, + } + output_type = "string" + + def __init__(self, message_store: MessageStore, agent_name: str): + super().__init__() + self.message_store = message_store + self.agent_name = agent_name + + def forward(self, final_answer: str, supporting_evidence: str = "") -> str: + """Create a poll for a final answer proposal.""" + try: + # Check if there are any active polls + # active_polls = self.message_store.get_active_polls() + # if active_polls: + # return f"🚫 Cannot create final answer poll: There is already an active poll in progress (ID: {active_polls[0].get('poll_id')})" + + # Create question and proposal for final answer + question = "Should this be our final answer to the user?" 
class VoteOnPollTool(Tool):
    """Tool for voting on active polls with detailed evaluation.

    The ballot (vote, confidence, rationale) is validated and normalized *before*
    anything is written to the message store, so a malformed argument can never
    leave a vote recorded while the agent is told that recording failed.
    """

    name = "vote_on_poll"
    description = """Vote on a specific active poll. Provide the poll ID, your vote (YES/NO), confidence level,
    and detailed rationale based on your expertise. Your vote helps reach team consensus on important decisions."""
    inputs = {
        "poll_id": {"type": "string", "description": "ID of the poll you want to vote on", "nullable": True},
        "vote": {"type": "string", "description": "Your vote: 'YES' to approve or 'NO' to reject the proposal"},
        "confidence": {
            "type": "number",
            "description": "Confidence level in your vote (0.0 to 1.0, where 1.0 = completely confident)",
        },
        "rationale": {
            "type": "string",
            "description": "Detailed explanation for your vote, including your reasoning and any suggestions for improvement",
        },
    }
    output_type = "string"

    def __init__(self, message_store: MessageStore, agent_name: str):
        super().__init__()
        self.message_store = message_store
        self.agent_name = agent_name

    def forward(self, vote: str, confidence: float, rationale: str, poll_id: Optional[str] = None) -> str:
        """Vote on an active poll.

        Args:
            vote: "YES" or "NO" (case-insensitive, surrounding whitespace ignored).
            confidence: Number in [0.0, 1.0]; numeric strings are accepted and coerced.
            rationale: Free-text explanation stored with the vote.
            poll_id: Poll to vote on. May be omitted only when exactly one poll is active.

        Returns:
            A human-readable status string: a confirmation on success, or a message
            starting with "❌" describing why the vote was not recorded.
        """
        try:
            # --- Validate the ballot before touching the store ---------------------
            vote = str(vote).strip().upper()
            if vote not in ("YES", "NO"):
                return "❌ Invalid vote. Must be 'YES' or 'NO'."

            # Coerce now: the confirmation message formats confidence with ':.1f',
            # which would otherwise fail *after* the vote had already been stored.
            try:
                confidence = float(confidence)
            except (TypeError, ValueError):
                return "❌ Invalid confidence. Must be a number between 0.0 and 1.0."
            if not 0.0 <= confidence <= 1.0:  # also rejects NaN
                return "❌ Invalid confidence. Must be a number between 0.0 and 1.0."

            rationale = "" if rationale is None else str(rationale)

            # --- Resolve the poll this ballot targets ------------------------------
            active_polls = self.message_store.get_active_polls()
            if not active_polls:
                return "❌ No active polls to vote on."

            if poll_id:
                target_poll = next((p for p in active_polls if p.get("poll_id") == poll_id), None)
                if target_poll is None:
                    return f"❌ Poll with ID {poll_id} not found or is not active."
            elif len(active_polls) == 1:
                # No poll_id given and the choice is unambiguous.
                target_poll = active_polls[0]
            else:
                poll_list = "\n".join(
                    f"- {p.get('poll_id', 'unknown')}: {p.get('question', 'Unknown question')[:60]}..."
                    for p in active_polls
                )
                return f"❌ Multiple active polls found. Please specify poll_id parameter:\n{poll_list}"

            poll_id = target_poll.get("poll_id")
            if not poll_id:
                return "❌ Invalid poll ID."

            # --- Reject duplicate votes from this agent ----------------------------
            voters = self.message_store.count_votes(poll_id).get("votes_by_voter", {})
            if self.agent_name in voters:
                return (
                    f"❌ You have already voted on poll {poll_id}. Current vote: {voters[self.agent_name].get('vote')}"
                )

            # --- Record the vote ---------------------------------------------------
            self.message_store.record_vote(
                poll_id=poll_id,
                voter=self.agent_name,
                vote=vote,
                confidence=confidence,
                rationale=rationale,
                thread_id=target_poll.get("thread_id", "main"),
            )

            return f"✅ Vote recorded on poll {poll_id}: {vote} (confidence: {confidence:.1f}) - {rationale[:50]}..."

        except Exception as e:
            # Tool boundary: report the failure to the calling agent as text
            # instead of propagating the exception.
            return f"❌ Failed to record vote: {str(e)}"
Use this to see what decisions are pending and check voting progress.""" + inputs = {} + output_type = "array" + + def __init__(self, message_store: MessageStore, agent_name: str): + super().__init__() + self.message_store = message_store + self.agent_name = agent_name + + def forward(self) -> List[Dict[str, Any]]: + """View active polls with voting status.""" + try: + active_polls = self.message_store.get_active_polls() + + if not active_polls: + return [{"message": "No active polls"}] + + poll_details = [] + for poll in active_polls: + poll_id = poll.get("poll_id") + if poll_id: + vote_info = self.message_store.count_votes(poll_id) + poll_details.append( + { + "poll_id": poll_id, + "question": poll.get("question"), + "proposal": poll.get("proposal"), + "proposer": poll.get("proposer"), + "thread_id": poll.get("thread_id"), + "vote_counts": vote_info.get("tally", {}), + "voters": list(vote_info.get("votes_by_voter", {}).keys()), + "has_voted": self.agent_name in vote_info.get("votes_by_voter", {}), + } + ) + + return poll_details + + except Exception as e: + return [{"error": f"Failed to view polls: {str(e)}"}] + + +# ============================================================================= +# HELPER FUNCTIONS +# ============================================================================= + + +def create_decentralized_tools(message_store: MessageStore, agent_name: str) -> List[Tool]: + """Create all decentralized communication and polling tools for an agent.""" + return [ + # Communication tools + SendMessageToAgent(message_store, agent_name), + SendMessageToChannel(message_store, agent_name), + CreateChannel(message_store, agent_name), + # Reading tools + ReadMessagesTool(message_store, agent_name), + ReadNotificationsTool(message_store, agent_name), + SearchMessagesTool(message_store, agent_name), + ListChannelsTool(message_store, agent_name), + # Polling tools + CreateGeneralPollTool(message_store, agent_name), + CreateFinalAnswerPollTool(message_store, 
import re
import string
import warnings


def normalize_number_str(number_str: str) -> float:
    """Convert a numeric string to ``float``, ignoring ``$``, ``%`` and ``,``.

    Returns ``float("inf")`` (and prints a diagnostic) when the cleaned string
    is not a valid float literal, so a failed parse never equals a real number.
    """
    # Strip common unit/thousands characters so e.g. "$1,000" parses as 1000.0.
    for char in ["$", "%", ","]:
        number_str = number_str.replace(char, "")
    try:
        return float(number_str)
    except ValueError:
        print(f"String {number_str} cannot be normalized to number str.")
        return float("inf")


def split_string(
    s: str,
    char_list: "list[str] | None" = None,
) -> list[str]:
    """Split *s* on any of the separator characters in *char_list*.

    Args:
        s: The string to split.
        char_list: Separator characters; defaults to ``[",", ";"]``.

    Returns:
        The pieces of *s*, in order. With no separators, ``[s]``.
    """
    if char_list is None:
        char_list = [",", ";"]
    if not char_list:
        return [s]
    # Escape each separator: inside a character class an unescaped "-", "^",
    # "]" or "\" would change the meaning of the pattern (or fail to compile).
    pattern = f"[{''.join(re.escape(char) for char in char_list)}]"
    return re.split(pattern, s)


def is_float(element: object) -> bool:
    """Return True when ``float(element)`` succeeds, False otherwise."""
    try:
        float(element)
        return True
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric, non-string objects.
        return False


def question_scorer(
    model_answer: str,
    ground_truth: str,
) -> bool:
    """Score *model_answer* against *ground_truth*.

    The comparison mode is chosen from the ground truth:
    - a float literal: numeric equality after ``normalize_number_str``;
    - contains ``,`` or ``;``: element-wise comparison of the split lists
      (numeric elements numerically, the rest as normalized strings that keep
      punctuation);
    - otherwise: equality of the fully normalized strings.

    A ``None`` answer is scored as the string ``"None"``; other non-string
    answers are converted with ``str``.
    """
    # Coerce once so every branch below can rely on a string.
    model_answer = "None" if model_answer is None else str(model_answer)

    # if gt is a number
    if is_float(ground_truth):
        normalized_answer = normalize_number_str(model_answer)
        return normalized_answer == float(ground_truth)

    # if gt is a list
    if any(char in ground_truth for char in [",", ";"]):
        gt_elems = split_string(ground_truth)
        ma_elems = split_string(model_answer)

        # check length is the same
        if len(gt_elems) != len(ma_elems):
            warnings.warn("Answer lists have different lengths, returning False.", UserWarning)
            return False

        # compare each element as float or str
        comparisons = []
        for ma_elem, gt_elem in zip(ma_elems, gt_elems):
            if is_float(gt_elem):
                comparisons.append(normalize_number_str(ma_elem) == float(gt_elem))
            else:
                # we do not remove punct since comparisons can include punct
                comparisons.append(
                    normalize_str(ma_elem, remove_punct=False) == normalize_str(gt_elem, remove_punct=False)
                )
        return all(comparisons)

    # if gt is a str
    return normalize_str(model_answer) == normalize_str(ground_truth)


def check_prediction_contains_answer_letters_in_order(prediction, true_answer):
    """Return True when each letter of *true_answer* is found, scanning forward, in *prediction*.

    Comparison is case-insensitive. Predictions more than three times longer
    than the answer are rejected outright.
    """
    prediction = prediction.lower()
    true_answer = true_answer.lower()
    if len(prediction) > len(true_answer) * 3:
        return False
    i = 0
    for letter in true_answer:
        found_at = prediction.find(letter, i)
        if found_at == -1:
            return False
        # NOTE(review): the cursor stays ON the matched character rather than
        # moving past it, so a doubled letter in the answer can match a single
        # letter in the prediction. Kept as-is; confirm whether this leniency
        # is intended for the close-call heuristic.
        i = found_at
    return True


def check_close_call(prediction, true_answer, is_correct):
    """Return True for correct answers and for near-miss string answers.

    A near miss is a non-numeric answer whose letters appear in order in the
    prediction and whose length is between half and twice the answer's length.
    Numeric answers are never treated as close calls.
    """
    if is_correct:
        return True
    if is_float(true_answer):
        return is_correct

    prediction_str = str(prediction)
    answer_str = str(true_answer)
    if (
        check_prediction_contains_answer_letters_in_order(prediction_str, answer_str)
        and len(answer_str) * 0.5 <= len(prediction_str) <= len(answer_str) * 2
    ):
        print(f"Close call: {prediction} vs {true_answer}")
        return True
    return False


def normalize_str(input_str, remove_punct=True) -> str:
    """
    Normalize a string by:
    - Removing all white spaces
    - Optionally removing punctuation (if remove_punct is True)
    - Converting to lowercase
    Parameters:
    - input_str: str, the string to normalize
    - remove_punct: bool, whether to remove punctuation (default: True)
    Returns:
    - str, the normalized string
    """
    # Remove all white spaces. Required e.g for seagull vs. sea gull
    no_spaces = re.sub(r"\s", "", input_str)

    # Remove punctuation, if specified.
    if remove_punct:
        translator = str.maketrans("", "", string.punctuation)
        return no_spaces.lower().translate(translator)
    return no_spaces.lower()
malformed lines + continue + + def _build_head(self) -> str: + # Minimal CSS, no external fonts for offline use + css = ( + "body{font-family:system-ui,-apple-system,Segoe UI,Roboto,Ubuntu,Cantarell,Noto Sans,sans-serif;" + "margin:0;padding:0;background:#0b0c10;color:#e6edf3;}" + ".container{max-width:1100px;margin:0 auto;padding:24px;}" + ".header{position:sticky;top:0;background:#0b0c10;border-bottom:1px solid #1f2328;padding:12px 24px;z-index:1;}" + ".title{font-size:20px;font-weight:600;}" + ".msg{display:grid;grid-template-columns:220px 1fr;gap:12px;padding:14px 10px;border-bottom:1px solid #1f2328;}" + ".meta{color:#9ea7b3;font-size:12px;line-height:1.35;}" + ".sender{font-weight:600;color:#c9d1d9;}" + ".type{display:inline-block;border:1px solid #30363d;border-radius:999px;padding:2px 8px;font-size:11px;color:#c9d1d9;margin-left:6px;}" + ".thread{color:#a5d6ff;margin-left:8px;}" + ".content{white-space:pre-wrap;word-break:break-word;font-size:14px;line-height:1.5;}" + ".pill{display:inline-block;margin-right:6px;margin-top:4px;border:1px solid #30363d;border-radius:999px;padding:2px 8px;font-size:11px;color:#9ea7b3;}" + ".section{margin-top:18px;}" + ".code{background:#161b22;border:1px solid #30363d;border-radius:6px;padding:10px;display:block;overflow:auto;}" + ) + return ( + "" + f'' + f"{html.escape(self.title)}" + f"" + "" + ) + + def _build_body(self, messages: List[Dict[str, Any]]) -> str: + header = f'
{html.escape(self.title)}
' + items = [self._render_message(m) for m in messages] + return f'{header}
{"".join(items)}
' + + def _render_message(self, m: Dict[str, Any]) -> str: + ts = html.escape(str(m.get("timestamp", ""))) + sender = html.escape(str(m.get("sender", ""))) + msg_type = html.escape(str(m.get("type", "message") or "message")) + thread = html.escape(str(m.get("thread_id", "main") or "main")) + recipients = m.get("recipients", []) + recipients = recipients if isinstance(recipients, list) else [recipients] + recipients_html = "".join(f'{html.escape(str(r))}' for r in recipients if r) + + content_html = self._render_content(m.get("content")) + + meta = ( + f'
{sender}' + f'{msg_type}' + f'# {thread}
{ts}
{recipients_html}
' + ) + + return f'
{meta}
{content_html}
' + + def _render_content(self, content: Any) -> str: + # String content + if isinstance(content, str): + return html.escape(content) + + # Dict content with known subtypes + if isinstance(content, dict): + t = content.get("type") + if t == "poll": + return self._render_poll(content) + if t == "vote": + return self._kv_block(content, keys=["poll_id", "voter", "vote", "confidence", "rationale"]) + if t == "final_answer": + return self._kv_block(content, keys=["answer", "poll_id", "tally", "source_proposer"]) + # Generic dict fallback + return self._kv_block(content) + + # Anything else + try: + return self._kv_block(json.loads(str(content))) + except Exception: + return f'{html.escape(str(content))}' + + def _kv_block(self, obj: Dict[str, Any], keys: Optional[List[str]] = None) -> str: + if not isinstance(obj, dict): + return f'{html.escape(str(obj))}' + items: List[str] = [] + if keys is None: + keys = list(obj.keys()) + for k in keys: + if k in obj: + v = obj[k] + pretty = self._pretty(v) + items.append(f"
{html.escape(str(k))}: {pretty}
") + # Include any remaining keys not listed explicitly + for k, v in obj.items(): + if k in keys: + continue + items.append(f"
{html.escape(str(k))}: {self._pretty(v)}
") + return '
' + "".join(items) + "
" + + def _pretty(self, value: Any) -> str: + if isinstance(value, (str, int, float)) or value is None: + return html.escape(str(value)) + try: + dump = json.dumps(value, ensure_ascii=False, indent=2) + return f'{html.escape(dump)}' + except Exception: + return f'{html.escape(str(value))}' + + def _render_poll(self, poll: Dict[str, Any]) -> str: + # Render poll with a friendly layout emphasizing question and proposal + question = html.escape(str(poll.get("question", "Poll"))) + proposal = poll.get("proposal") + proposal_html = self._pretty(proposal) + header = f"
Question: {question}
" + body = f"
Proposal: {proposal_html}
" + meta = self._kv_block( + poll, + keys=["poll_id", "options", "threshold", "status", "proposer", "final_answer"], + ) + return '
' + header + body + meta + "
" diff --git a/examples/decentralized_smolagents_benchmark/scripts/mdconvert.py b/examples/decentralized_smolagents_benchmark/scripts/mdconvert.py new file mode 100644 index 000000000..5d5273336 --- /dev/null +++ b/examples/decentralized_smolagents_benchmark/scripts/mdconvert.py @@ -0,0 +1,1027 @@ +# This is copied from Magentic-one's great repo: https://github.com/microsoft/autogen/blob/v0.4.4/python/packages/autogen-magentic-one/src/autogen_magentic_one/markdown_browser/mdconvert.py +# Thanks to Microsoft researchers for open-sourcing this! +# type: ignore +import base64 +import copy +import html +import json +import mimetypes +import os +import re +import shutil +import subprocess +import sys +import tempfile +import traceback +import zipfile +from typing import Any +from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse + +import mammoth +import markdownify +import pandas as pd +import pdfminer +import pdfminer.high_level +import pptx + +# File-format detection +import puremagic +import pydub +import requests +import speech_recognition as sr +from bs4 import BeautifulSoup +from youtube_transcript_api import YouTubeTranscriptApi +from youtube_transcript_api.formatters import SRTFormatter + + +class _CustomMarkdownify(markdownify.MarkdownConverter): + """ + A custom version of markdownify's MarkdownConverter. Changes include: + + - Altering the default heading style to use '#', '##', etc. + - Removing javascript hyperlinks. + - Truncating images with large data:uri sources. 
+ - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax + """ + + def __init__(self, **options: Any): + options["heading_style"] = options.get("heading_style", markdownify.ATX) + # Explicitly cast options to the expected type if necessary + super().__init__(**options) + + def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str: + """Same as usual, but be sure to start with a new line""" + if not convert_as_inline: + if not re.search(r"^\n", text): + return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore + + return super().convert_hn(n, el, text, convert_as_inline) # type: ignore + + def convert_a(self, el: Any, text: str, convert_as_inline: bool): + """Same as usual converter, but removes Javascript links and escapes URIs.""" + prefix, suffix, text = markdownify.chomp(text) # type: ignore + if not text: + return "" + href = el.get("href") + title = el.get("title") + + # Escape URIs and skip non-http or file schemes + if href: + try: + parsed_url = urlparse(href) # type: ignore + if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore + return "%s%s%s" % (prefix, text, suffix) + href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore + except ValueError: # It's not clear if this ever gets thrown + return "%s%s%s" % (prefix, text, suffix) + + # For the replacement see #29: text nodes underscores are escaped + if ( + self.options["autolinks"] + and text.replace(r"\_", "_") == href + and not title + and not self.options["default_title"] + ): + # Shortcut syntax + return "<%s>" % href + if self.options["default_title"] and not title: + title = href + title_part = ' "%s"' % title.replace('"', r"\"") if title else "" + return "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) if href else text + + def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str: + """Same as usual converter, but 
removes data URIs""" + + alt = el.attrs.get("alt", None) or "" + src = el.attrs.get("src", None) or "" + title = el.attrs.get("title", None) or "" + title_part = ' "%s"' % title.replace('"', r"\"") if title else "" + if convert_as_inline and el.parent.name not in self.options["keep_inline_images_in"]: + return alt + + # Remove dataURIs + if src.startswith("data:"): + src = src.split(",")[0] + "..." + + return "![%s](%s%s)" % (alt, src, title_part) + + def convert_soup(self, soup: Any) -> str: + return super().convert_soup(soup) # type: ignore + + +class DocumentConverterResult: + """The result of converting a document to text.""" + + def __init__(self, title: str | None = None, text_content: str = ""): + self.title: str | None = title + self.text_content: str = text_content + + +class DocumentConverter: + """Abstract superclass of all DocumentConverters.""" + + def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult: + raise NotImplementedError() + + +class PlainTextConverter(DocumentConverter): + """Anything with content type text/plain""" + + def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult: + # Guess the content type from any file extension that might be around + content_type, _ = mimetypes.guess_type("__placeholder" + kwargs.get("file_extension", "")) + + # Only accept text files + if content_type is None: + return None + # elif "text/" not in content_type.lower(): + # return None + + text_content = "" + with open(local_path, "rt", encoding="utf-8") as fh: + text_content = fh.read() + return DocumentConverterResult( + title=None, + text_content=text_content, + ) + + +class HtmlConverter(DocumentConverter): + """Anything with content type text/html""" + + def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult: + # Bail if not html + extension = kwargs.get("file_extension", "") + if extension.lower() not in [".html", ".htm"]: + return None + + result = None + with 
class WikipediaConverter(DocumentConverter):
    """Handle Wikipedia pages separately, focusing only on the main document content."""

    def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
        """Convert a saved Wikipedia HTML page to Markdown.

        Declines (returns None) unless ``file_extension`` is ``.html``/``.htm`` and
        ``url`` points at a ``<lang>.wikipedia.org`` page. When the main content
        container is present, only that container is converted and prefixed with
        the page title; otherwise the whole document is converted.
        """
        # Bail if not Wikipedia
        extension = kwargs.get("file_extension") or ""
        if extension.lower() not in [".html", ".htm"]:
            return None
        url = kwargs.get("url") or ""
        # Both dots are escaped so only the real wikipedia.org host matches.
        if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia\.org\/", url):
            return None

        # Parse the file
        with open(local_path, "rt", encoding="utf-8") as fh:
            soup = BeautifulSoup(fh.read(), "html.parser")

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
            script.extract()

        # Print only the main content
        body_elm = soup.find("div", {"id": "mw-content-text"})
        title_elm = soup.find("span", {"class": "mw-page-title-main"})

        main_title = None if soup.title is None else soup.title.string

        if body_elm:
            # Prefer the on-page title. get_text() is used rather than .string
            # because .string is None when the span wraps nested markup
            # (e.g. an italicized title), which previously tripped an assert.
            if title_elm and len(title_elm) > 0:
                main_title = title_elm.get_text()

            # Convert the page
            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(body_elm)
        else:
            webpage_text = _CustomMarkdownify().convert_soup(soup)

        return DocumentConverterResult(
            title=main_title,
            text_content=webpage_text,
        )
This is more prone to breaking, since it reaches into the page implementation + try: + for script in soup(["script"]): + content = script.text + if "ytInitialData" in content: + lines = re.split(r"\r?\n", content) + obj_start = lines[0].find("{") + obj_end = lines[0].rfind("}") + if obj_start >= 0 and obj_end >= 0: + data = json.loads(lines[0][obj_start : obj_end + 1]) + attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore + if attrdesc: + metadata["description"] = str(attrdesc["content"]) + break + except Exception: + pass + + # Start preparing the page + webpage_text = "# YouTube\n" + + title = self._get(metadata, ["title", "og:title", "name"]) # type: ignore + assert isinstance(title, str) + + if title: + webpage_text += f"\n## {title}\n" + + stats = "" + views = self._get(metadata, ["interactionCount"]) # type: ignore + if views: + stats += f"- **Views:** {views}\n" + + keywords = self._get(metadata, ["keywords"]) # type: ignore + if keywords: + stats += f"- **Keywords:** {keywords}\n" + + runtime = self._get(metadata, ["duration"]) # type: ignore + if runtime: + stats += f"- **Runtime:** {runtime}\n" + + if len(stats) > 0: + webpage_text += f"\n### Video Metadata\n{stats}\n" + + description = self._get(metadata, ["description", "og:description"]) # type: ignore + if description: + webpage_text += f"\n### Description\n{description}\n" + + transcript_text = "" + parsed_url = urlparse(url) # type: ignore + params = parse_qs(parsed_url.query) # type: ignore + if "v" in params: + assert isinstance(params["v"][0], str) + video_id = str(params["v"][0]) + try: + # Must be a single transcript. 
class PdfConverter(DocumentConverter):
    """
    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
    """

    def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
        """Extract the text of a ``.pdf`` file; return None for any other extension."""
        is_pdf = kwargs.get("file_extension", "").lower() == ".pdf"
        if not is_pdf:
            return None

        extracted_text = pdfminer.high_level.extract_text(local_path)
        return DocumentConverterResult(title=None, text_content=extracted_text)
class XlsxConverter(HtmlConverter):
    """
    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
    """

    def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
        """Render every sheet of an ``.xlsx``/``.xls`` workbook as a headed Markdown table."""
        # Decline anything that is not an Excel workbook
        if kwargs.get("file_extension", "").lower() not in [".xlsx", ".xls"]:
            return None

        # sheet_name=None loads all sheets, keyed by sheet name
        workbook = pd.read_excel(local_path, sheet_name=None)

        # One "## <sheet>" section per sheet; the table goes through the HTML
        # converter inherited from HtmlConverter.
        sections = [
            f"## {sheet_name}\n" + self._convert(frame.to_html(index=False)).text_content.strip() + "\n\n"
            for sheet_name, frame in workbook.items()
        ]

        return DocumentConverterResult(
            title=None,
            text_content="".join(sections).strip(),
        )
+ """ + + def convert(self, local_path, **kwargs) -> None | DocumentConverterResult: + # Bail if not a PPTX + extension = kwargs.get("file_extension", "") + if extension.lower() != ".pptx": + return None + + md_content = "" + + presentation = pptx.Presentation(local_path) + slide_num = 0 + for slide in presentation.slides: + slide_num += 1 + + md_content += f"\n\n\n" + + title = slide.shapes.title + for shape in slide.shapes: + # Pictures + if self._is_picture(shape): + # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069 + alt_text = "" + try: + alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") + except Exception: + pass + + # A placeholder name + filename = re.sub(r"\W", "", shape.name) + ".jpg" + md_content += "\n![" + (alt_text if alt_text else shape.name) + "](" + filename + ")\n" + + # Tables + if self._is_table(shape): + html_table = "" + first_row = True + for row in shape.table.rows: + html_table += "" + for cell in row.cells: + if first_row: + html_table += "" + else: + html_table += "" + html_table += "" + first_row = False + html_table += "
" + html.escape(cell.text) + "" + html.escape(cell.text) + "
class WavConverter(MediaConverter):
    """
    Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
    """

    def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
        """Return metadata lines plus an audio transcript for a ``.wav`` file, else None."""
        # Decline anything that is not a WAV file
        if kwargs.get("file_extension", "").lower() != ".wav":
            return None

        # Metadata fields reported, in output order, when present
        reported_fields = [
            "Title",
            "Artist",
            "Author",
            "Band",
            "Album",
            "Genre",
            "Track",
            "DateTimeOriginal",
            "CreateDate",
            "Duration",
        ]

        metadata = self._get_metadata(local_path)
        md_content = ""
        if metadata:
            md_content = "".join(f"{key}: {metadata[key]}\n" for key in reported_fields if key in metadata)

        # Transcribe; any failure is reported in the output rather than raised
        try:
            transcript = self._transcribe_audio(local_path)
            md_content += "\n\n### Audio Transcript:\n" + ("[No speech detected]" if transcript == "" else transcript)
        except Exception:
            md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."

        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )

    def _transcribe_audio(self, local_path) -> str:
        """Transcribe the audio file at *local_path* and return the stripped text."""
        recognizer = sr.Recognizer()
        with sr.AudioFile(local_path) as audio_source:
            recording = recognizer.record(audio_source)
            return recognizer.recognize_google(recording).strip()
+ """ + + def convert(self, local_path, **kwargs) -> None | DocumentConverterResult: + # Bail if not a MP3 + extension = kwargs.get("file_extension", "") + if extension.lower() not in [".mp3", ".m4a"]: + return None + + md_content = "" + + # Add metadata + metadata = self._get_metadata(local_path) + if metadata: + for f in [ + "Title", + "Artist", + "Author", + "Band", + "Album", + "Genre", + "Track", + "DateTimeOriginal", + "CreateDate", + "Duration", + ]: + if f in metadata: + md_content += f"{f}: {metadata[f]}\n" + + # Transcribe + handle, temp_path = tempfile.mkstemp(suffix=".wav") + os.close(handle) + try: + if extension.lower() == ".mp3": + sound = pydub.AudioSegment.from_mp3(local_path) + else: + sound = pydub.AudioSegment.from_file(local_path, format="m4a") + sound.export(temp_path, format="wav") + + _args = dict() + _args.update(kwargs) + _args["file_extension"] = ".wav" + + try: + transcript = super()._transcribe_audio(temp_path).strip() + md_content += "\n\n### Audio Transcript:\n" + ( + "[No speech detected]" if transcript == "" else transcript + ) + except Exception: + md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio." + + finally: + os.unlink(temp_path) + + # Return the result + return DocumentConverterResult( + title=None, + text_content=md_content.strip(), + ) + + +class ZipConverter(DocumentConverter): + """ + Extracts ZIP files to a permanent local directory and returns a listing of extracted files. + """ + + def __init__(self, extract_dir: str = "downloads"): + """ + Initialize with path to extraction directory. + + Args: + extract_dir: The directory where files will be extracted. 
Defaults to "downloads" + """ + self.extract_dir = extract_dir + # Create the extraction directory if it doesn't exist + os.makedirs(self.extract_dir, exist_ok=True) + + def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult: + # Bail if not a ZIP file + extension = kwargs.get("file_extension", "") + if extension.lower() != ".zip": + return None + + # Verify it's actually a ZIP file + if not zipfile.is_zipfile(local_path): + return None + + # Extract all files and build list + extracted_files = [] + with zipfile.ZipFile(local_path, "r") as zip_ref: + # Extract all files + zip_ref.extractall(self.extract_dir) + # Get list of all files + for file_path in zip_ref.namelist(): + # Skip directories + if not file_path.endswith("/"): + extracted_files.append(self.extract_dir + "/" + file_path) + + # Sort files for consistent output + extracted_files.sort() + + # Build the markdown content + md_content = "Downloaded the following files:\n" + for file in extracted_files: + md_content += f"* {file}\n" + + return DocumentConverterResult(title="Extracted Files", text_content=md_content.strip()) + + +class ImageConverter(MediaConverter): + """ + Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured). 
+ """ + + def convert(self, local_path, **kwargs) -> None | DocumentConverterResult: + # Bail if not a XLSX + extension = kwargs.get("file_extension", "") + if extension.lower() not in [".jpg", ".jpeg", ".png"]: + return None + + md_content = "" + + # Add metadata + metadata = self._get_metadata(local_path) + if metadata: + for f in [ + "ImageSize", + "Title", + "Caption", + "Description", + "Keywords", + "Artist", + "Author", + "DateTimeOriginal", + "CreateDate", + "GPSPosition", + ]: + if f in metadata: + md_content += f"{f}: {metadata[f]}\n" + + # Try describing the image with GPTV + mlm_client = kwargs.get("mlm_client") + mlm_model = kwargs.get("mlm_model") + if mlm_client is not None and mlm_model is not None: + md_content += ( + "\n# Description:\n" + + self._get_mlm_description( + local_path, extension, mlm_client, mlm_model, prompt=kwargs.get("mlm_prompt") + ).strip() + + "\n" + ) + + return DocumentConverterResult( + title=None, + text_content=md_content, + ) + + def _get_mlm_description(self, local_path, extension, client, model, prompt=None): + if prompt is None or prompt.strip() == "": + prompt = "Write a detailed caption for this image." 
+ + sys.stderr.write(f"MLM Prompt:\n{prompt}\n") + + data_uri = "" + with open(local_path, "rb") as image_file: + content_type, encoding = mimetypes.guess_type("_dummy" + extension) + if content_type is None: + content_type = "image/jpeg" + image_base64 = base64.b64encode(image_file.read()).decode("utf-8") + data_uri = f"data:{content_type};base64,{image_base64}" + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": { + "url": data_uri, + }, + }, + ], + } + ] + + response = client.chat.completions.create(model=model, messages=messages) + return response.choices[0].message.content + + +class FileConversionException(Exception): + pass + + +class UnsupportedFormatException(Exception): + pass + + +class MarkdownConverter: + """(In preview) An extremely simple text-based document reader, suitable for LLM use. + This reader will convert common file-types or webpages to Markdown.""" + + def __init__( + self, + requests_session: requests.Session | None = None, + mlm_client: Any | None = None, + mlm_model: Any | None = None, + ): + if requests_session is None: + self._requests_session = requests.Session() + else: + self._requests_session = requests_session + + self._mlm_client = mlm_client + self._mlm_model = mlm_model + + self._page_converters: list[DocumentConverter] = [] + + # Register converters for successful browsing operations + # Later registrations are tried first / take higher priority than earlier registrations + # To this end, the most specific converters should appear below the most generic converters + self.register_page_converter(PlainTextConverter()) + self.register_page_converter(HtmlConverter()) + self.register_page_converter(WikipediaConverter()) + self.register_page_converter(YouTubeConverter()) + self.register_page_converter(DocxConverter()) + self.register_page_converter(XlsxConverter()) + self.register_page_converter(PptxConverter()) + 
self.register_page_converter(WavConverter()) + self.register_page_converter(Mp3Converter()) + self.register_page_converter(ImageConverter()) + self.register_page_converter(ZipConverter()) + self.register_page_converter(PdfConverter()) + + def convert( + self, source: str | requests.Response, **kwargs: Any + ) -> DocumentConverterResult: # TODO: deal with kwargs + """ + Args: + - source: can be a string representing a path or url, or a requests.response object + - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.) + """ + + # Local path or url + if isinstance(source, str): + if source.startswith("http://") or source.startswith("https://") or source.startswith("file://"): + return self.convert_url(source, **kwargs) + else: + return self.convert_local(source, **kwargs) + # Request response + elif isinstance(source, requests.Response): + return self.convert_response(source, **kwargs) + + def convert_local(self, path: str, **kwargs: Any) -> DocumentConverterResult: # TODO: deal with kwargs + # Normalize and resolve the path to handle relative paths properly + normalized_path = os.path.abspath(os.path.expanduser(path)) + + # Check if the file exists before proceeding + if not os.path.exists(normalized_path) or not os.path.isfile(normalized_path): + # Return an error result instead of crashing + return DocumentConverterResult( + title="File Not Found", + text_content=f"Error: File not found at path '{path}' (resolved to '{normalized_path}')", + ) + + # Prepare a list of extensions to try (in order of priority) + ext = kwargs.get("file_extension") + extensions = [ext] if ext is not None else [] + + # Get extension alternatives from the path and puremagic + base, ext = os.path.splitext(normalized_path) + self._append_ext(extensions, ext) + self._append_ext(extensions, self._guess_ext_magic(normalized_path)) + + # Convert using the normalized path + return self._convert(normalized_path, 
extensions, **kwargs) + + # TODO what should stream's type be? + def convert_stream(self, stream: Any, **kwargs: Any) -> DocumentConverterResult: # TODO: deal with kwargs + # Prepare a list of extensions to try (in order of priority) + ext = kwargs.get("file_extension") + extensions = [ext] if ext is not None else [] + + # Save the file locally to a temporary file. It will be deleted before this method exits + handle, temp_path = tempfile.mkstemp() + fh = os.fdopen(handle, "wb") + result = None + try: + # Write to the temporary file + content = stream.read() + if isinstance(content, str): + fh.write(content.encode("utf-8")) + else: + fh.write(content) + fh.close() + + # Use puremagic to check for more extension options + self._append_ext(extensions, self._guess_ext_magic(temp_path)) + + # Convert + result = self._convert(temp_path, extensions, **kwargs) + # Clean up + finally: + try: + fh.close() + except Exception: + pass + os.unlink(temp_path) + + return result + + def convert_url(self, url: str, **kwargs: Any) -> DocumentConverterResult: # TODO: fix kwargs type + # Send a HTTP request to the URL + user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0" + response = self._requests_session.get(url, stream=True, headers={"User-Agent": user_agent}) + response.raise_for_status() + return self.convert_response(response, **kwargs) + + def convert_response( + self, response: requests.Response, **kwargs: Any + ) -> DocumentConverterResult: # TODO fix kwargs type + # Prepare a list of extensions to try (in order of priority) + ext = kwargs.get("file_extension") + extensions = [ext] if ext is not None else [] + + # Guess from the mimetype + content_type = response.headers.get("content-type", "").split(";")[0] + self._append_ext(extensions, mimetypes.guess_extension(content_type)) + + # Read the content disposition if there is one + content_disposition = 
response.headers.get("content-disposition", "") + m = re.search(r"filename=([^;]+)", content_disposition) + if m: + base, ext = os.path.splitext(m.group(1).strip("\"'")) + self._append_ext(extensions, ext) + + # Read from the extension from the path + base, ext = os.path.splitext(urlparse(response.url).path) + self._append_ext(extensions, ext) + + # Save the file locally to a temporary file. It will be deleted before this method exits + handle, temp_path = tempfile.mkstemp() + fh = os.fdopen(handle, "wb") + result = None + try: + # Download the file + for chunk in response.iter_content(chunk_size=512): + fh.write(chunk) + fh.close() + + # Use puremagic to check for more extension options + self._append_ext(extensions, self._guess_ext_magic(temp_path)) + + # Convert + result = self._convert(temp_path, extensions, url=response.url) + except Exception as e: + print(f"Error in converting: {e}") + + # Clean up + finally: + try: + fh.close() + except Exception: + pass + os.unlink(temp_path) + + return result + + def _convert(self, local_path: str, extensions: list[str | None], **kwargs) -> DocumentConverterResult: + error_trace = "" + for ext in extensions + [None]: # Try last with no extension + for converter in self._page_converters: + _kwargs = copy.deepcopy(kwargs) + + # Overwrite file_extension appropriately + if ext is None: + if "file_extension" in _kwargs: + del _kwargs["file_extension"] + else: + _kwargs.update({"file_extension": ext}) + + # Copy any additional global options + if "mlm_client" not in _kwargs and self._mlm_client is not None: + _kwargs["mlm_client"] = self._mlm_client + + if "mlm_model" not in _kwargs and self._mlm_model is not None: + _kwargs["mlm_model"] = self._mlm_model + + # If we hit an error log it and keep trying + try: + res = converter.convert(local_path, **_kwargs) + except Exception: + error_trace = ("\n\n" + traceback.format_exc()).strip() + + if res is not None: + # Normalize the content + res.text_content = "\n".join([line.rstrip() 
for line in re.split(r"\r?\n", res.text_content)]) + res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content) + + # Todo + return res + + # If we got this far without success, report any exceptions + if len(error_trace) > 0: + raise FileConversionException( + f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}" + ) + + # Nothing can handle it! + raise UnsupportedFormatException( + f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported." + ) + + def _append_ext(self, extensions, ext): + """Append a unique non-None, non-empty extension to a list of extensions.""" + if ext is None: + return + ext = ext.strip() + if ext == "": + return + # if ext not in extensions: + if True: + extensions.append(ext) + + def _guess_ext_magic(self, path): + """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes.""" + # Use puremagic to guess + if not path or not os.path.exists(path) or not os.path.isfile(path): + # Log the issue but don't raise an error - just return None to continue processing + print(f"Warning: Invalid or non-existent file path in _guess_ext_magic: {path}") + return None + + try: + guesses = puremagic.magic_file(path) + if len(guesses) > 0: + ext = guesses[0].extension.strip() + if len(ext) > 0: + return ext + except puremagic.PureMagicError as e: + # If puremagic fails, we just ignore it and return None + print(f"Error in puremagic: {e}") + except puremagic.PureMagicNotFoundError: + # If puremagic is not installed, we just ignore it and return None + print("puremagic is not installed. 
Skipping magic file type detection.") + except ValueError: + # If the path is not a file, we just ignore it and return None + pass + except FileNotFoundError: + pass + except IsADirectoryError: + pass + except PermissionError: + pass + return None + + def register_page_converter(self, converter: DocumentConverter) -> None: + """Register a page text converter.""" + self._page_converters.insert(0, converter) diff --git a/examples/decentralized_smolagents_benchmark/scripts/message_store.py b/examples/decentralized_smolagents_benchmark/scripts/message_store.py new file mode 100644 index 000000000..2e7f7ac13 --- /dev/null +++ b/examples/decentralized_smolagents_benchmark/scripts/message_store.py @@ -0,0 +1,953 @@ +"""Message store implementation for decentralized agents.""" + +import json +import logging +import threading +import uuid +from collections import defaultdict +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional + + +def now_ts() -> str: + return datetime.utcnow().replace(tzinfo=timezone.utc).isoformat(timespec="milliseconds").replace("+00:00", "Z") + + +def majority_plus_one(n_agents: int) -> int: + return (n_agents // 2) + 1 + + +class MessageStore: + def __init__(self, run_id: str, agent_names: Optional[List[str]] = None): + self.run_id = run_id + script_dir = Path(__file__).parent.parent + self.run_dir = script_dir / "runs" / run_id + self.messages_file = self.run_dir / "messages.jsonl" + self.run_dir.mkdir(parents=True, exist_ok=True) + self._lock = threading.Lock() + self._configured_agents = list(agent_names or []) + + logging.info( + json.dumps( + { + "event": "message_store_initialized", + "run_id": run_id, + "agent_count": len(self._configured_agents), + "messages_file": str(self.messages_file), + } + ) + ) + + def get_messages( + self, + agent_id: str, + last_seen: Optional[str] = None, + last_seen_ts: Optional[str] = None, + thread_id: Optional[str] = None, + include_mentions: 
bool = True, + include_private: bool = True, + ) -> List[Dict]: + """Get messages visible to the given agent with enhanced filtering.""" + if not self.messages_file.exists(): + return [] + + messages = [] + with self._lock: + for msg in self._iter_messages(): + # Additional safety check + if not isinstance(msg, dict): + continue + + # Skip messages before last_seen timestamp + if last_seen_ts and msg.get("timestamp", "") <= last_seen_ts: + continue + + # Skip if message is before last_seen ID (legacy support) + if last_seen and msg.get("id", "") <= last_seen: + continue + + # Filter by thread if specified + if thread_id and msg.get("thread_id") != thread_id: + continue + + # Check message visibility with enhanced error logging + recipients = msg.get("recipients", []) + + # Enhanced type safety check with detailed logging - BULLETPROOF VERSION + if not isinstance(recipients, (list, tuple)): + logging.warning( + json.dumps( + { + "event": "type_safety_fix_applied", + "location": "get_messages", + "message_id": msg.get("id", "unknown"), + "recipients_type": type(recipients).__name__, + "recipients_value": str(recipients), + "sender": msg.get("sender", "unknown"), + "timestamp": msg.get("timestamp", "unknown"), + } + ) + ) + if recipients is None: + recipients = [] + elif isinstance(recipients, (int, float, bool)): + # Convert problematic types to empty list for safety + recipients = [] + else: + recipients = [str(recipients)] # Convert single value to list + + # Ensure recipients is a list of strings + recipients = [str(r) for r in recipients if r is not None] + + sender = msg.get("sender", "") + content_str = str(msg.get("content", "")) + + # Message visibility logic: + # 1. Public messages (empty recipients or @all) + # 2. Direct messages to this agent + # 3. Messages mentioning this agent (@agent_id) + # 4. 
Messages from this agent (own messages) + visible = False + + if not recipients or "@all" in recipients: + visible = True # Public message + elif agent_id in recipients: + visible = include_private # Direct message + elif include_mentions and f"@{agent_id}" in content_str: + visible = True # Mentioned in message + elif sender == agent_id: + visible = True # Own message + + if visible: + messages.append(msg) + + return sorted(messages, key=lambda m: m.get("timestamp", "")) + + def get_thread_messages(self, thread_id: str, agent_id: Optional[str] = None) -> List[Dict[str, Any]]: + """Get all messages from a specific thread, filtered by agent visibility if specified.""" + return self.get_messages( + agent_id=agent_id or "system", # Default to system if no agent specified + thread_id=thread_id, + ) + + def search_messages( + self, + query: str, + agent_id: Optional[str] = None, + thread_id: Optional[str] = None, + limit: int = 50, + after_ts: Optional[str] = None, + before_ts: Optional[str] = None, + ) -> List[Dict[str, Any]]: + """Enhanced search with agent visibility and better filtering.""" + q = query.lower().strip() + if not q: + return [] + + out: List[Dict[str, Any]] = [] + with self._lock: + for msg in self._iter_messages(): + ts = msg.get("timestamp", "") + + # Time filtering + if after_ts and ts <= after_ts: + continue + if before_ts and ts >= before_ts: + continue + + # Thread filtering + if thread_id and msg.get("thread_id") != thread_id: + continue + + # Agent visibility check + if agent_id: + recipients = msg.get("recipients", []) + + # Enhanced type safety check with detailed logging - BULLETPROOF VERSION + if not isinstance(recipients, (list, tuple)): + logging.warning( + json.dumps( + { + "event": "type_safety_fix_applied", + "location": "search_messages", + "message_id": msg.get("id", "unknown"), + "recipients_type": type(recipients).__name__, + "recipients_value": str(recipients), + "sender": msg.get("sender", "unknown"), + "query": query, + 
"timestamp": msg.get("timestamp", "unknown"), + } + ) + ) + if recipients is None: + recipients = [] + elif isinstance(recipients, (int, float, bool)): + # Convert problematic types to empty list for safety + recipients = [] + else: + recipients = [str(recipients)] # Convert single value to list + + # Ensure recipients is a list of strings + recipients = [str(r) for r in recipients if r is not None] + + sender = msg.get("sender", "") + content_str = str(msg.get("content", "")) + + # Additional safety check for the actual operations + try: + visible = ( + not recipients + or "@all" in recipients # Public + or agent_id in recipients # Direct message + or f"@{agent_id}" in content_str # Mentioned + or sender == agent_id # Own message + ) + except TypeError as e: + # Log detailed error information if the type check above missed something + logging.error( + json.dumps( + { + "event": "type_error_caught", + "location": "search_messages_visibility_check", + "error": str(e), + "message_id": msg.get("id", "unknown"), + "recipients": recipients, + "recipients_type": type(recipients).__name__, + "agent_id": agent_id, + "sender": sender, + "query": query, + "run_id": self.run_id, + } + ) + ) + # Default to not visible if we can't determine visibility + continue + if not visible: + continue + + # Content search + blob = json.dumps(msg, ensure_ascii=False).lower() + if q in blob: + out.append(msg) + if len(out) >= limit: + break + + out.sort(key=lambda m: m.get("timestamp", "")) + return out + + # ------------------------------ notifications & agent tools ----------- + def get_notifications(self, agent_id: str, since_ts: Optional[str] = None) -> Dict[str, List[Dict[str, Any]]]: + """Get notifications for an agent including mentions, direct messages, and thread activity.""" + notifications = {"mentions": [], "direct_messages": [], "thread_updates": {}, "polls_needing_votes": []} + + with self._lock: + for msg in self._iter_messages(): + if since_ts and msg.get("timestamp", "") <= 
since_ts: + continue + + msg_type = msg.get("type", "message") + sender = msg.get("sender", "") + recipients = msg.get("recipients", []) + + # Enhanced type safety check with detailed logging - BULLETPROOF VERSION + if not isinstance(recipients, (list, tuple)): + logging.warning( + json.dumps( + { + "event": "type_safety_fix_applied", + "location": "get_notifications", + "message_id": msg.get("id", "unknown"), + "recipients_type": type(recipients).__name__, + "recipients_value": str(recipients), + "sender": sender, + "agent_id": agent_id, + "timestamp": msg.get("timestamp", "unknown"), + } + ) + ) + if recipients is None: + recipients = [] + elif isinstance(recipients, (int, float, bool)): + # Convert problematic types to empty list for safety + recipients = [] + else: + recipients = [str(recipients)] # Convert single value to list + + # Ensure recipients is a list of strings + recipients = [str(r) for r in recipients if r is not None] + + content_str = str(msg.get("content", "")) + thread_id = msg.get("thread_id", "main") + + # Skip own messages + if sender == agent_id: + continue + + # Additional safety check for the actual operations + try: + # Direct messages + if agent_id in recipients: + notifications["direct_messages"].append(msg) + + # Mentions + if f"@{agent_id}" in content_str: + notifications["mentions"].append(msg) + + # Thread updates (public messages in threads agent is following) + if not recipients or "@all" in recipients: + if thread_id not in notifications["thread_updates"]: + notifications["thread_updates"][thread_id] = [] + notifications["thread_updates"][thread_id].append(msg) + + except TypeError as e: + # Log detailed error information if the type check above missed something + logging.error( + json.dumps( + { + "event": "type_error_caught", + "location": "get_notifications_processing", + "error": str(e), + "message_id": msg.get("id", "unknown"), + "recipients": recipients, + "recipients_type": type(recipients).__name__, + "agent_id": 
agent_id, + "sender": sender, + "run_id": self.run_id, + } + ) + ) + continue # Skip this message if we can't process it safely + + # Polls needing votes + if msg_type == "poll": + poll_content = msg.get("content", {}) + if poll_content.get("status") == "open": + # Check if agent hasn't voted yet + poll_id = poll_content.get("poll_id") + if poll_id: + vote_info = self.count_votes(poll_id) + if agent_id not in vote_info.get("votes_by_voter", {}): + notifications["polls_needing_votes"].append(msg) + + # Sort all notification lists by timestamp + for key, value in notifications.items(): + if isinstance(value, list): + notifications[key] = sorted(value, key=lambda m: m.get("timestamp", "")) + elif isinstance(value, dict): + for thread, msgs in value.items(): + notifications[key][thread] = sorted(msgs, key=lambda m: m.get("timestamp", "")) + + return notifications + + def get_active_threads(self, agent_id: Optional[str] = None, since_ts: Optional[str] = None) -> List[str]: + """Get list of active thread IDs, optionally filtered by agent visibility.""" + threads = set() + + with self._lock: + for msg in self._iter_messages(): + if since_ts and msg.get("timestamp", "") <= since_ts: + continue + + if agent_id: + # Apply agent visibility filtering + recipients = msg.get("recipients", []) + + # Enhanced type safety check with detailed logging - BULLETPROOF VERSION + if not isinstance(recipients, (list, tuple)): + logging.warning( + json.dumps( + { + "event": "type_safety_fix_applied", + "location": "get_active_threads", + "message_id": msg.get("id", "unknown"), + "recipients_type": type(recipients).__name__, + "recipients_value": str(recipients), + "sender": msg.get("sender", "unknown"), + "agent_id": agent_id, + "timestamp": msg.get("timestamp", "unknown"), + } + ) + ) + if recipients is None: + recipients = [] + elif isinstance(recipients, (int, float, bool)): + # Convert problematic types to empty list for safety + recipients = [] + else: + recipients = [str(recipients)] 
# Convert single value to list + + # Ensure recipients is a list of strings + recipients = [str(r) for r in recipients if r is not None] + + sender = msg.get("sender", "") + content_str = str(msg.get("content", "")) + + # Additional safety check for the actual operations + try: + visible = ( + not recipients + or "@all" in recipients # Public + or agent_id in recipients # Direct message + or f"@{agent_id}" in content_str # Mentioned + or sender == agent_id # Own message + ) + except TypeError as e: + # Log detailed error information if the type check above missed something + logging.error( + json.dumps( + { + "event": "type_error_caught", + "location": "get_active_threads_visibility_check", + "error": str(e), + "message_id": msg.get("id", "unknown"), + "recipients": recipients, + "recipients_type": type(recipients).__name__, + "agent_id": agent_id, + "sender": sender, + "run_id": self.run_id, + } + ) + ) + # Skip this message if we can't process it safely + continue + if not visible: + continue + + thread_id = msg.get("thread_id", "main") + threads.add(thread_id) + + return sorted(list(threads)) + + def get_channels_info(self, agent_id: Optional[str] = None) -> Dict[str, Any]: + """Get detailed information about all channels visible to the agent.""" + channels = {} + + with self._lock: + for msg in self._iter_messages(): + msg_type = msg.get("type", "message") + thread_id = msg.get("thread_id", "main") + + # Skip if agent filtering is enabled and message not visible to agent + if agent_id: + recipients = msg.get("recipients", []) + + # Enhanced type safety check - BULLETPROOF VERSION + if not isinstance(recipients, (list, tuple)): + if recipients is None: + recipients = [] + elif isinstance(recipients, (int, float, bool)): + # Convert problematic types to empty list for safety + recipients = [] + else: + recipients = [str(recipients)] + + # Ensure recipients is a list of strings + recipients = [str(r) for r in recipients if r is not None] + + sender = 
msg.get("sender", "") + content_str = str(msg.get("content", "")) + + try: + visible = ( + not recipients + or "@all" in recipients # Public + or agent_id in recipients # Direct message + or f"@{agent_id}" in content_str # Mentioned + or sender == agent_id # Own message + ) + except TypeError: + continue # Skip problematic messages + + if not visible: + continue + + # Initialize channel info if not exists + if thread_id not in channels: + channels[thread_id] = { + "channel_id": thread_id, + "subject": None, + "description": None, + "creator": None, + "created_at": None, + "members": set(), + "last_activity": None, + "message_count": 0, + "is_created_channel": False, + } + + # Update channel info + channels[thread_id]["message_count"] += 1 + channels[thread_id]["last_activity"] = msg.get("timestamp", "") + + # Extract channel creation info + if msg_type == "channel_created": + content = msg.get("content", {}) + if isinstance(content, dict): + channels[thread_id].update( + { + "subject": content.get("subject") or content.get("channel_id", thread_id), + "description": content.get("description", ""), + "creator": content.get("creator") or msg.get("sender", ""), + "created_at": msg.get("timestamp", ""), + "is_created_channel": True, + } + ) + initial_members = content.get("initial_members", []) + if initial_members: + channels[thread_id]["members"].update(initial_members) + + # Add message sender to members + sender = msg.get("sender", "") + if sender and sender not in ["system", "Coordinator"]: + channels[thread_id]["members"].add(sender) + + # Convert sets to lists for JSON serialization + for channel in channels.values(): + channel["members"] = sorted(list(channel["members"])) + + return channels + + def _append_line(self, obj: Dict[str, Any]) -> None: + line = json.dumps(obj, ensure_ascii=False) + with self.messages_file.open("a", encoding="utf-8") as f: + f.write(line + "\n") + + def _iter_messages(self) -> Iterable[Dict[str, Any]]: + if not 
self.messages_file.exists(): + return + with self.messages_file.open("r", encoding="utf-8") as f: + for line_num, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + try: + parsed = json.loads(line) + # Ensure we only yield dictionary objects + if isinstance(parsed, dict): + yield parsed + except json.JSONDecodeError as e: + # Log JSON parsing errors but continue + print(f"Warning: Skipping malformed JSON on line {line_num}: {e}") + continue + except Exception as e: + # Handle other parsing errors + print(f"Warning: Unexpected error parsing line {line_num}: {e}") + continue + + # ------------------------------ core API --------------------------------- + def append_message( + self, + *, + sender: str, + content: Any, + recipients: Optional[List[str]] = None, + thread_id: Optional[str] = None, + msg_type: Optional[str] = None, + reply_to: Optional[str] = None, + ) -> Dict[str, Any]: + """Append a message to the log and return the full record.""" + with self._lock: + # Ensure recipients is always properly defined and type-safe + if recipients is None: + # Default behavior: most messages should be visible to all agents + if msg_type in ["task", "poll", "final_answer", "channel_message"]: + recipients = ["@all"] + elif msg_type == "private_message": + # This should have been specified, but default to empty for safety + recipients = [] + else: + # Default to public for unknown message types + recipients = ["@all"] + elif not isinstance(recipients, (list, tuple)): + # Convert non-list types to proper format + if isinstance(recipients, (int, float, bool)): + # These types are problematic - convert to empty list for safety + logging.warning( + json.dumps( + { + "event": "problematic_recipients_type_fixed", + "location": "append_message", + "recipients_type": type(recipients).__name__, + "recipients_value": str(recipients), + "sender": sender, + "msg_type": msg_type, + } + ) + ) + recipients = [] + else: + # Convert to list + recipients = 
def get_all_agents(self) -> List[str]:
    """Return the ids of all known agents.

    A non-empty configured agent list is returned as a copy. Otherwise the
    result is the sorted set of truthy ``sender`` values found in the log.
    """
    if self._configured_agents:
        return list(self._configured_agents)
    senders = {m.get("sender") for m in self._iter_messages() or []}
    return sorted(s for s in senders if s)


def get_active_polls(self) -> List[Dict[str, Any]]:
    """Get all currently active (open) polls.

    The log is append-only: a poll is closed or deleted by appending a newer
    ``poll`` record with the same ``poll_id``, or by a ``final_answer`` record
    that references it. The latest record for each ``poll_id`` therefore
    decides whether the poll is still open.

    IMPORTANT: Polls are NOT sorted by timestamp. The first poll to achieve
    voting consensus (N//2+1 votes) should provide the final answer, regardless
    of creation order.

    Returns:
        List[Dict]: Latest payload of each open poll, ordered by the poll's
        first appearance in the log.
    """
    # Each slot is [key, payload]; key is None for polls without a usable id,
    # which can never be superseded and are kept only if they were open.
    slots: List[List[Any]] = []
    slot_of: Dict[Any, int] = {}
    answered = set()

    for msg in self._iter_messages() or []:
        if not isinstance(msg, dict):
            continue

        payload = msg.get("content", {})
        if isinstance(payload, str):
            # Content may have been stored as a JSON string.
            try:
                payload = json.loads(payload)
            except json.JSONDecodeError:
                continue  # Skip invalid JSON
        if not isinstance(payload, dict):
            continue

        kind = payload.get("type")
        poll_id = payload.get("poll_id")
        key = poll_id if isinstance(poll_id, (str, int)) else None

        if kind == "final_answer" and key is not None:
            answered.add(key)
        elif kind == "poll":
            if key is not None and key in slot_of:
                # Newer record for a known poll: last one wins.
                slots[slot_of[key]][1] = payload
            else:
                if key is not None:
                    slot_of[key] = len(slots)
                slots.append([key, payload])

    return [
        payload
        for key, payload in slots
        if payload.get("status") == "open" and (key is None or key not in answered)
    ]
def record_vote(
    self,
    *,
    poll_id: str,
    voter: str,
    vote: str,
    confidence: float = 0.5,
    rationale: str = "",
    thread_id: Optional[str] = None,
) -> Dict[str, Any]:
    """Append a YES/NO vote for ``poll_id`` to the log and return the record.

    Args:
        poll_id: Id of the poll being voted on (also used as ``reply_to``).
        voter: Id of the voting agent.
        vote: ``"YES"`` or ``"NO"``; case and surrounding whitespace are ignored.
        confidence: Value convertible to ``float``.
        rationale: Free-text justification; the log entry keeps 200 characters.
        thread_id: Optional thread the vote belongs to.

    Raises:
        ValueError: If ``vote`` is not YES/NO or ``confidence`` is not numeric.
    """
    normalized = vote.strip().upper()
    if normalized not in {"YES", "NO"}:
        raise ValueError("vote must be 'YES' or 'NO'")
    # Convert before logging so a bad value cannot leave a "vote_recorded"
    # log entry for a vote that was never written.
    confidence = float(confidence)

    logging.info(
        json.dumps(
            {
                "event": "vote_recorded",
                "poll_id": poll_id,
                "voter": voter,
                "vote": normalized,
                "confidence": confidence,
                "rationale": rationale[:200] if rationale else "",
            }
        )
    )

    return self.append_message(
        sender=voter,
        content={
            "type": "vote",
            "poll_id": poll_id,
            "voter": voter,
            "vote": normalized,
            "confidence": confidence,
            "rationale": rationale,
        },
        thread_id=thread_id,
        msg_type="vote",
        reply_to=poll_id,
    )


def count_votes(self, poll_id: str) -> Dict[str, Any]:
    """Tally votes for a poll; latest vote per voter wins.

    Returns a dict with the latest ``poll`` payload (or ``None``), the
    ``closed`` and ``deleted`` flags, a ``tally`` (``YES``, ``NO``,
    ``eligible``, ``threshold``) and the raw ``votes_by_voter`` mapping.
    Votes that appear after the poll was marked deleted are ignored.
    """
    poll = None
    closed = False
    deleted = False
    votes_by_voter: Dict[str, Dict[str, Any]] = {}

    for msg in self._iter_messages() or []:
        if not isinstance(msg, dict):
            continue  # Skip non-dict messages

        payload = msg.get("content", {})
        if isinstance(payload, str):
            # Content may have been stored as a JSON string.
            try:
                payload = json.loads(payload)
            except json.JSONDecodeError:
                continue  # Skip messages with unparsable content
        if not isinstance(payload, dict) or payload.get("poll_id") != poll_id:
            continue

        kind = payload.get("type")
        if kind == "poll":
            poll = payload
            status = payload.get("status")
            closed = status == "closed"
            deleted = status == "deleted"
        elif kind == "vote":
            voter = payload.get("voter")
            if voter and not deleted:
                votes_by_voter[voter] = payload  # last wins
        elif kind == "final_answer":
            closed = True

    yes_votes = sum(1 for v in votes_by_voter.values() if v.get("vote") == "YES")
    no_votes = sum(1 for v in votes_by_voter.values() if v.get("vote") == "NO")

    n_agents = len(self.get_all_agents()) or 1
    threshold = poll.get("threshold") if isinstance(poll, dict) else None
    if threshold is None:
        # No usable stored threshold: fall back to a simple majority.
        threshold = majority_plus_one(n_agents)

    return {
        "poll": poll,
        "closed": closed,
        "deleted": deleted,
        "tally": {"YES": yes_votes, "NO": no_votes, "eligible": n_agents, "threshold": threshold},
        "votes_by_voter": votes_by_voter,
    }


def finalize_poll_if_ready(self, poll_id: str, *, max_no_votes: int = 2) -> Optional[Dict[str, Any]]:
    """Close, delete, or leave open the poll ``poll_id`` based on its tally.

    Args:
        poll_id: Id of the poll to evaluate.
        max_no_votes: Number of NO votes at which the poll is deleted.

    Returns:
        ``None`` when the poll is unknown, already closed or deleted, or not
        yet decided. A ``{"deleted": True, ...}`` dict when the poll was
        deleted. Otherwise the appended ``final_answer`` record.
    """

    def log_event(event: str, level: int = logging.INFO, **fields: Any) -> None:
        logging.log(level, json.dumps({"event": event, "poll_id": poll_id, **fields}))

    info = self.count_votes(poll_id)
    poll = info.get("poll")
    # A deleted poll is terminal too; without this check every call would
    # append another "deleted" record for it.
    if not poll or info.get("closed") or info.get("deleted"):
        log_event(
            "poll_finalization_skipped",
            reason="poll_not_found_or_closed",
            has_poll=bool(poll),
            is_closed=info.get("closed", False),
            is_deleted=info.get("deleted", False),
        )
        return None

    if not isinstance(poll, dict):
        log_event(
            "poll_finalization_error",
            level=logging.ERROR,
            error="poll_is_not_dict",
            poll_type=type(poll).__name__,
        )
        return None

    tally = info["tally"]
    yes_votes, no_votes, threshold = tally["YES"], tally["NO"], tally["threshold"]
    log_event(
        "poll_finalization_check",
        yes_votes=yes_votes,
        no_votes=no_votes,
        threshold=threshold,
        eligible_voters=tally["eligible"],
    )

    # Rejection is checked first: enough NO votes delete the poll.
    if no_votes >= max_no_votes:
        reason = f"Too many NO votes ({no_votes})"
        print(f"🗑️ Deleting poll {poll_id} due to {no_votes} NO votes (threshold: {max_no_votes})")
        log_event("poll_deleted", no_votes=no_votes, reason="too_many_no_votes")
        # Shadow append: the newest record for the poll carries its status.
        self.append_message(
            sender="Coordinator",
            content={**poll, "status": "deleted", "reason": reason},
            msg_type="poll",
            reply_to=poll_id,
        )
        return {"deleted": True, "poll_id": poll_id, "reason": reason}

    if yes_votes >= threshold:
        print(f"✅ Poll {poll_id} passed with {yes_votes} YES votes (threshold: {threshold})")
        log_event(
            "poll_passed",
            yes_votes=yes_votes,
            threshold=threshold,
            proposer=poll.get("proposer"),
            answer_preview=str(poll.get("proposal", ""))[:200],
        )
        # mark closed (shadow append)
        self.append_message(
            sender=poll.get("proposer", "system"),
            content={**poll, "status": "closed"},
            msg_type="poll",
            reply_to=poll_id,
        )
        # emit final answer, preferring the clean answer stored on the poll
        clean_answer = poll.get("final_answer") or poll.get("proposal", "No proposal found")
        return self.append_message(
            sender="Coordinator",
            content={
                "type": "final_answer",
                "poll_id": poll_id,
                "answer": clean_answer,
                "tally": tally,
                "source_proposer": poll.get("proposer"),
            },
            msg_type="final_answer",
            reply_to=poll_id,
        )

    log_event(
        "poll_not_ready",
        yes_votes=yes_votes,
        no_votes=no_votes,
        threshold=threshold,
        reason="insufficient_votes",
    )
    return None
The question is repeated here for convenience: + +{original_task} + +To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER] +Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. +ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.) +If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and DO NOT INCLUDE UNITS such as $ or USD or percent signs unless specified otherwise. +If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'. +If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings. +If you are unable to determine the final answer, output 'FINAL ANSWER: Unable to determine' +""", + } + ], + } + ) + + response = reformulation_model(messages).content + + final_answer = response.split("FINAL ANSWER: ")[-1].strip() + print("> Reformulated answer: ", final_answer) + + # if "unable to determine" in final_answer.lower(): + # messages.append({"role": MessageRole.ASSISTANT, "content": response }) + # messages.append({"role": MessageRole.USER, "content": [{"type": "text", "text": """ + # I understand that a definitive answer could not be determined. Please make a well-informed EDUCATED GUESS based on the conversation. + + # To output the educated guess, use the following template: EDUCATED GUESS: [YOUR EDUCATED GUESS] + # Your EDUCATED GUESS should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. DO NOT OUTPUT 'I don't know', 'Unable to determine', etc. 
def serialize_agent_error(obj):
    """Serialize AgentError objects for JSON output.

    ``AgentError`` instances become a dict with the error class name and
    message; anything else falls back to ``str(obj)``.
    """
    if isinstance(obj, AgentError):
        return {"error_type": obj.__class__.__name__, "message": obj.message}
    return str(obj)


def get_image_description(file_name: str, question: str, visual_inspection_tool) -> str:
    """Ask the visual inspection tool for a caption of the image ``file_name``.

    The tool is called with the keyword arguments ``image_path`` and
    ``question``; the prompt asks for context useful for ``question`` without
    answering it.
    """
    prompt = f"""Write a caption of 5 sentences for this image. Pay special attention to any details that might be useful for someone answering the following question:
{question}. But do not try to answer the question directly!
Do not add any information that is not present in the image."""
    return visual_inspection_tool(image_path=file_name, question=prompt)


def get_document_description(file_path: str, question: str, document_inspection_tool) -> str:
    """Ask the document inspection tool for a caption of ``file_path``.

    Uses the tool's ``forward_initial_exam_mode`` entry point.
    """
    prompt = f"""Write a caption of 5 sentences for this document. Pay special attention to any details that might be useful for someone answering the following question:
{question}. But do not try to answer the question directly!
Do not add any information that is not present in the document."""
    return document_inspection_tool.forward_initial_exam_mode(file_path=file_path, question=prompt)


def get_single_file_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
    """Build the prompt fragment that describes one attached file.

    Images are captioned with the visual tool. Documents are captioned from a
    same-named ``.png`` rendering when one exists next to them, otherwise with
    the document tool. Audio and unknown files are only listed by path. The
    extension is matched case-insensitively.
    """
    # splitext only looks at the final component, so dots in directory names
    # (``./data/v1.2/report.pdf``) do not truncate the stem.
    stem, ext = os.path.splitext(file_path)
    extension = ext[1:].lower()

    if extension in ("png", "jpg", "jpeg"):
        caption = get_image_description(file_path, question, visual_inspection_tool)
        return f" - Attached image: {file_path}" + f"\n -> Image description: {caption}"

    if extension in ("pdf", "xls", "xlsx", "docx", "doc", "xml"):
        image_path = stem + ".png"
        if os.path.exists(image_path):
            # Prefer the rendered image and report that path instead.
            description = get_image_description(image_path, question, visual_inspection_tool)
            file_path = image_path
        else:
            description = get_document_description(file_path, question, document_inspection_tool)
        return f" - Attached document: {file_path}" + f"\n -> File description: {description}"

    if extension in ("mp3", "m4a", "wav"):
        return f" - Attached audio: {file_path}"

    return f" - Attached file: {file_path}"


def get_zip_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
    """Unpack a zip archive next to itself and describe every file inside.

    The archive is extracted into a folder named like the archive without its
    ``.zip`` suffix. Files are visited in sorted order so the resulting
    prompt is deterministic.
    """
    # Strip only the trailing suffix; ``replace`` would also mangle ".zip"
    # occurring elsewhere in the path.
    folder_path = file_path.removesuffix(".zip")
    os.makedirs(folder_path, exist_ok=True)
    shutil.unpack_archive(file_path, folder_path)

    descriptions = []
    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place so os.walk descends in a stable order
        for file_name in sorted(files):
            member_path = os.path.join(root, file_name)
            descriptions.append(
                "\n"
                + textwrap.indent(
                    get_single_file_description(
                        member_path, question, visual_inspection_tool, document_inspection_tool
                    ),
                    prefix=" ",
                )
            )
    return "".join(descriptions)


def _normalize_task_id(value):
    """Return ``value`` as an ``int`` when it converts cleanly, else unchanged."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return value


def get_tasks_to_run(data, total: int, base_filename: Path, tasks_ids: list[int]):
    """Select the tasks among the first ``total`` entries that still need a run.

    Tasks whose id already appears in ``<stem>_answers.jsonl`` next to
    ``base_filename`` are skipped. When ``tasks_ids`` is not ``None`` only
    those ids are kept. Ids are compared after integer normalization, so
    ``"3"`` in the answers file matches task ``3``.
    """
    answers_file = base_filename.parent / f"{base_filename.stem}_answers.jsonl"
    done = set()
    if answers_file.exists():
        with open(answers_file, encoding="utf-8") as fh:
            done = {_normalize_task_id(json.loads(line)["task_id"]) for line in fh if line.strip()}

    wanted = None if tasks_ids is None else {_normalize_task_id(t) for t in tasks_ids}

    pending = []
    # Index access (not slicing) keeps this working for any container that
    # supports len() and integer indexing; never read past the end.
    for i in range(min(total, len(data))):
        task = data[i]
        task_id = _normalize_task_id(task["task_id"])
        if task_id in done:
            continue
        if wanted is None or task_id in wanted:
            pending.append(task)
    return pending
For .pdf, .txt, .htm files, use 'visit_page' tool instead.""" + + inputs = { + "file_path": { + "description": "The path to the file you want to read as text. Must be a '.something' file, like '.pdf'. If it is an image, use the visualizer tool instead! DO NOT use this tool for an HTML webpage: use the web_search tool instead!", + "type": "string", + }, + "question": { + "description": "[Optional]: Your question, as a natural language sentence. Provide as much context as possible. Do not pass this parameter if you just want to directly return the content of the file.", + "type": "string", + "nullable": True, + }, + } + output_type = "string" + + def __init__(self, model: Model = None, text_limit: int = 100000): + super().__init__() + self.model = model + self.text_limit = text_limit + from .mdconvert import MarkdownConverter + + self.md_converter = MarkdownConverter() + + def forward_initial_exam_mode(self, file_path, question): + from smolagents.models import MessageRole + + result = self.md_converter.convert(file_path) + + if file_path[-4:] in [".png", ".jpg"]: + raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!") + + if ".zip" in file_path: + return result.text_content + + if not question: + return result.text_content + + if len(result.text_content) < 4000: + return "Document content: " + result.text_content + + messages = [ + { + "role": MessageRole.SYSTEM, + "content": [ + { + "type": "text", + "text": "Here is a file:\n### " + + str(result.title) + + "\n\n" + + result.text_content[: self.text_limit], + } + ], + }, + { + "role": MessageRole.USER, + "content": [ + { + "type": "text", + "text": "Now please write a short, 5 sentence caption for this document, that could help someone asking this question: " + + question + + "\n\nDon't answer the question yourself! 
class FileReaderTool(Tool):
    """Tool that returns the raw text contents of a local file."""

    name = "file_reader"
    inputs = {"file_path": {"type": "string", "description": "Path to the file to read"}}
    output_type = "string"
    description = "Read raw text contents of a file. IMPORTANT: If you have a URL to a file, first use 'download_file' tool to download it, then use this tool with the downloaded file path."

    def forward(self, file_path: str) -> str:
        """Read and return the contents of ``file_path`` decoded as UTF-8.

        The encoding is explicit so the result does not depend on the
        platform's locale settings.
        """
        with open(file_path, encoding="utf-8") as f:
            return f.read()
Suitable for Agentic use.""" + + def __init__( + self, + start_page: str | None = None, + viewport_size: int | None = 1024 * 8, + downloads_folder: str | None | None = None, + serpapi_key: str | None | None = None, + request_kwargs: dict[str, Any] | None | None = None, + ): + self.start_page: str = start_page if start_page else "about:blank" + self.viewport_size = viewport_size # Applies only to the standard uri types + self.downloads_folder = downloads_folder + self.history: list[tuple[str, float]] = list() + self.page_title: str | None = None + self.viewport_current_page = 0 + self.viewport_pages: list[tuple[int, int]] = list() + self.set_address(self.start_page) + self.serpapi_key = serpapi_key + self.request_kwargs = request_kwargs + self.request_kwargs["cookies"] = COOKIES + self._mdconvert = MarkdownConverter() + self._page_content: str = "" + + self._find_on_page_query: str | None = None + self._find_on_page_last_result: int | None = None # Location of the last result + + @property + def address(self) -> str: + """Return the address of the current page.""" + return self.history[-1][0] + + def set_address(self, uri_or_path: str, filter_year: int | None = None) -> None: + # TODO: Handle anchors + self.history.append((uri_or_path, time.time())) + + # Handle special URIs + if uri_or_path == "about:blank": + self._set_page_content("") + elif uri_or_path.startswith("google:"): + self._serpapi_search(uri_or_path[len("google:") :].strip(), filter_year=filter_year) + else: + if ( + not uri_or_path.startswith("http:") + and not uri_or_path.startswith("https:") + and not uri_or_path.startswith("file:") + ): + if len(self.history) > 1: + prior_address = self.history[-2][0] + uri_or_path = urljoin(prior_address, uri_or_path) + # Update the address with the fully-qualified path + self.history[-1] = (uri_or_path, self.history[-1][1]) + self._fetch_page(uri_or_path) + + self.viewport_current_page = 0 + self.find_on_page_query = None + self.find_on_page_viewport = None + + 
@property + def viewport(self) -> str: + """Return the content of the current viewport.""" + bounds = self.viewport_pages[self.viewport_current_page] + return self.page_content[bounds[0] : bounds[1]] + + @property + def page_content(self) -> str: + """Return the full contents of the current page.""" + return self._page_content + + def _set_page_content(self, content: str) -> None: + """Sets the text content of the current page.""" + self._page_content = content + self._split_pages() + if self.viewport_current_page >= len(self.viewport_pages): + self.viewport_current_page = len(self.viewport_pages) - 1 + + def page_down(self) -> None: + self.viewport_current_page = min(self.viewport_current_page + 1, len(self.viewport_pages) - 1) + + def page_up(self) -> None: + self.viewport_current_page = max(self.viewport_current_page - 1, 0) + + def find_on_page(self, query: str) -> str | None: + """Searches for the query from the current viewport forward, looping back to the start if necessary.""" + + # Did we get here via a previous find_on_page search with the same query? 
+ # If so, map to find_next + if query == self._find_on_page_query and self.viewport_current_page == self._find_on_page_last_result: + return self.find_next() + + # Ok it's a new search start from the current viewport + self._find_on_page_query = query + viewport_match = self._find_next_viewport(query, self.viewport_current_page) + if viewport_match is None: + self._find_on_page_last_result = None + return None + else: + self.viewport_current_page = viewport_match + self._find_on_page_last_result = viewport_match + return self.viewport + + def find_next(self) -> str | None: + """Scroll to the next viewport that matches the query""" + + if self._find_on_page_query is None: + return None + + starting_viewport = self._find_on_page_last_result + if starting_viewport is None: + starting_viewport = 0 + else: + starting_viewport += 1 + if starting_viewport >= len(self.viewport_pages): + starting_viewport = 0 + + viewport_match = self._find_next_viewport(self._find_on_page_query, starting_viewport) + if viewport_match is None: + self._find_on_page_last_result = None + return None + else: + self.viewport_current_page = viewport_match + self._find_on_page_last_result = viewport_match + return self.viewport + + def _find_next_viewport(self, query: str, starting_viewport: int) -> int | None: + """Search for matches between the starting viewport looping when reaching the end.""" + + if query is None: + return None + + # Normalize the query, and convert to a regular expression + nquery = re.sub(r"\*", "__STAR__", query) + nquery = " " + (" ".join(re.split(r"\W+", nquery))).strip() + " " + nquery = nquery.replace(" __STAR__ ", "__STAR__ ") # Merge isolated stars with prior word + nquery = nquery.replace("__STAR__", ".*").lower() + + if nquery.strip() == "": + return None + + idxs = list() + idxs.extend(range(starting_viewport, len(self.viewport_pages))) + idxs.extend(range(0, starting_viewport)) + + for i in idxs: + bounds = self.viewport_pages[i] + content = 
self.page_content[bounds[0] : bounds[1]] + + # TODO: Remove markdown links and images + ncontent = " " + (" ".join(re.split(r"\W+", content))).strip().lower() + " " + if re.search(nquery, ncontent): + return i + + return None + + def visit_page(self, path_or_uri: str, filter_year: int | None = None) -> str: + """Update the address, visit the page, and return the content of the viewport.""" + self.set_address(path_or_uri, filter_year=filter_year) + return self.viewport + + def _split_pages(self) -> None: + # Do not split search results + if self.address.startswith("google:"): + self.viewport_pages = [(0, len(self._page_content))] + return + + # Handle empty pages + if len(self._page_content) == 0: + self.viewport_pages = [(0, 0)] + return + + # Break the viewport into pages + self.viewport_pages = [] + start_idx = 0 + while start_idx < len(self._page_content): + end_idx = min(start_idx + self.viewport_size, len(self._page_content)) # type: ignore[operator] + # Adjust to end on a space + while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]: + end_idx += 1 + self.viewport_pages.append((start_idx, end_idx)) + start_idx = end_idx + + def _serpapi_search(self, query: str, filter_year: int | None = None) -> None: + if self.serpapi_key is None: + raise ValueError("Missing SerpAPI key.") + + params = { + "engine": "google", + "q": query, + "api_key": self.serpapi_key, + } + if filter_year is not None: + params["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}" + + search = GoogleSearch(params) + results = search.get_dict() + self.page_title = f"{query} - Search" + if "organic_results" not in results.keys(): + raise Exception(f"No results found for query: '{query}'. 
Use a less specific query.") + if len(results["organic_results"]) == 0: + year_filter_message = f" with filter year={filter_year}" if filter_year is not None else "" + self._set_page_content( + f"No results found for '{query}'{year_filter_message}. Try with a more general query, or remove the year filter." + ) + return + + def _prev_visit(url): + for i in range(len(self.history) - 1, -1, -1): + if self.history[i][0] == url: + return f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n" + return "" + + web_snippets: list[str] = list() + idx = 0 + if "organic_results" in results: + for page in results["organic_results"]: + idx += 1 + date_published = "" + if "date" in page: + date_published = "\nDate published: " + page["date"] + + source = "" + if "source" in page: + source = "\nSource: " + page["source"] + + snippet = "" + if "snippet" in page: + snippet = "\n" + page["snippet"] + + redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{_prev_visit(page['link'])}{snippet}" + + redacted_version = redacted_version.replace("Your browser can't play this video.", "") + web_snippets.append(redacted_version) + + content = ( + f"A Google search for '{query}' found {len(web_snippets)} results:\n\n## Web Results\n" + + "\n\n".join(web_snippets) + ) + + self._set_page_content(content) + + def _fetch_page(self, url: str) -> None: + download_path = "" + try: + if url.startswith("file://"): + download_path = os.path.normcase(os.path.normpath(unquote(url[7:]))) + res = self._mdconvert.convert_local(download_path) + self.page_title = res.title + self._set_page_content(res.text_content) + else: + # Prepare the request parameters + request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {} + request_kwargs["stream"] = True + + # Send a HTTP request to the URL + response = requests.get(url, **request_kwargs) + response.raise_for_status() + + # If the HTTP request was successful + 
content_type = response.headers.get("content-type", "") + + # Text or HTML + if "text/" in content_type.lower(): + res = self._mdconvert.convert_response(response) + self.page_title = res.title + self._set_page_content(res.text_content) + # A download + else: + # Try producing a safe filename + fname = None + download_path = None + try: + fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip() + download_path = os.path.abspath(os.path.join(self.downloads_folder, fname)) + + suffix = 0 + while os.path.exists(download_path) and suffix < 1000: + suffix += 1 + base, ext = os.path.splitext(fname) + new_fname = f"{base}__{suffix}{ext}" + download_path = os.path.abspath(os.path.join(self.downloads_folder, new_fname)) + + except NameError: + pass + + # No suitable name, so make one + if fname is None: + extension = mimetypes.guess_extension(content_type) + if extension is None: + extension = ".download" + fname = str(uuid.uuid4()) + extension + download_path = os.path.abspath(os.path.join(self.downloads_folder, fname)) + + # Open a file for writing + with open(download_path, "wb") as fh: + for chunk in response.iter_content(chunk_size=512): + fh.write(chunk) + + # Render it + local_uri = pathlib.Path(download_path).as_uri() + self.set_address(local_uri) + + except UnsupportedFormatException as e: + print(e) + self.page_title = ("Download complete.",) + self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'") + except FileConversionException as e: + print(e) + self.page_title = ("Download complete.",) + self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'") + except FileNotFoundError: + self.page_title = "Error 404" + self._set_page_content(f"## Error 404\n\nFile not found: {download_path}") + except requests.exceptions.RequestException as request_exception: + try: + self.page_title = f"Error {response.status_code}" + + # If the error was rendered in HTML we might as well render it + 
content_type = response.headers.get("content-type", "") + if content_type is not None and "text/html" in content_type.lower(): + res = self._mdconvert.convert(response) + self.page_title = f"Error {response.status_code}" + self._set_page_content(f"## Error {response.status_code}\n\n{res.text_content}") + else: + text = "" + for chunk in response.iter_content(chunk_size=512, decode_unicode=True): + text += chunk + self.page_title = f"Error {response.status_code}" + self._set_page_content(f"## Error {response.status_code}\n\n{text}") + except NameError: + self.page_title = "Error" + self._set_page_content(f"## Error\n\n{str(request_exception)}") + + def _state(self) -> tuple[str, str]: + header = f"Address: {self.address}\n" + if self.page_title is not None: + header += f"Title: {self.page_title}\n" + + current_page = self.viewport_current_page + total_pages = len(self.viewport_pages) + + address = self.address + for i in range(len(self.history) - 2, -1, -1): # Start from the second last + if self.history[i][0] == address: + header += f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n" + break + + header += f"Viewport position: Showing page {current_page + 1} of {total_pages}.\n" + return (header, self.viewport) + + +class SearchInformationTool(Tool): + name = "web_search" + description = "Perform a web search query (think a google search) and returns the search results." + inputs = {"query": {"type": "string", "description": "The web search query to perform."}} + inputs["filter_year"] = { + "type": "string", + "description": "[Optional parameter]: filter the search results to only include pages from a specific year. For example, '2020' will only include pages from 2020. 
Make sure to use this parameter if you're trying to search for articles from a specific date!", + "nullable": True, + } + output_type = "string" + + def __init__(self, browser): + super().__init__() + self.browser = browser + + def forward(self, query: str, filter_year: int | None = None) -> str: + self.browser.visit_page(f"google: {query}", filter_year=filter_year) + header, content = self.browser._state() + return header.strip() + "\n=======================\n" + content + + +class VisitTool(Tool): + name = "visit_page" + description = "Visit a webpage at a given URL and return its text. Given a url to a YouTube video, this returns the transcript." + inputs = {"url": {"type": "string", "description": "The relative or absolute url of the webpage to visit."}} + output_type = "string" + + def __init__(self, browser=None): + super().__init__() + self.browser = browser + + def forward(self, url: str) -> str: + self.browser.visit_page(url) + header, content = self.browser._state() + return header.strip() + "\n=======================\n" + content + + +class DownloadTool(Tool): + name = "download_file" + description = """ +Download a file at a given URL to inspect its contents. Use this tool for files with extensions: [".xlsx", ".pptx", ".wav", ".mp3", ".m4a", ".png", ".docx"] + +WORKFLOW: +1. Use this tool to download the file locally +2. Then use 'inspect_file_as_text' tool with the returned file path to read and analyze the content +3. 
For images (.png), use the visualizer tool after downloading + +DO NOT use this tool for .pdf or .txt or .htm files: for these types use visit_page instead.""" + inputs = {"url": {"type": "string", "description": "The relative or absolute url of the file to be downloaded."}} + output_type = "string" + + def __init__(self, browser): + super().__init__() + self.browser = browser + + def forward(self, url: str) -> str: + import os + import uuid + from urllib.parse import urlparse + + import pathvalidate + import requests + + try: + if "arxiv" in url: + url = url.replace("abs", "pdf") + + response = requests.get(url, stream=True, timeout=30) + response.raise_for_status() + + content_type = response.headers.get("content-type", "") + + # Try to get filename from URL or Content-Disposition header + fname = None + if "Content-Disposition" in response.headers: + import re + + cd = response.headers["Content-Disposition"] + filename_match = re.search(r'filename="?([^"]+)"?', cd) + if filename_match: + fname = pathvalidate.sanitize_filename(filename_match.group(1)).strip() + + if not fname: + # Extract from URL path + parsed_url = urlparse(url) + path_fname = os.path.basename(parsed_url.path) + if path_fname: + fname = pathvalidate.sanitize_filename(path_fname).strip() + + if not fname: + # Generate filename from content type + extension = mimetypes.guess_extension(content_type) + if not extension: + extension = ".download" + fname = f"file_{str(uuid.uuid4())[:8]}{extension}" + + # Ensure downloads directory exists + downloads_dir = self.browser.downloads_folder or "./downloads" + os.makedirs(downloads_dir, exist_ok=True) + + # Create unique path to avoid overwriting + download_path = os.path.join(downloads_dir, fname) + suffix = 0 + while os.path.exists(download_path) and suffix < 1000: + suffix += 1 + base, ext = os.path.splitext(fname) + new_fname = f"{base}__{suffix}{ext}" + download_path = os.path.join(downloads_dir, new_fname) + + # Download the file + with 
open(download_path, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + + # Get file extension to check restrictions + _, extension = os.path.splitext(download_path) + extension = extension.lower() + + return f"File was downloaded and saved under path {download_path}." + + except requests.RequestException as e: + return f"Error downloading file: Network error - {str(e)}" + except Exception as e: + return f"Error downloading file: {str(e)}" + + +class ArchiveSearchTool(Tool): + name = "find_archived_url" + description = "Given a url, searches the Wayback Machine and returns the archived version of the url that's closest in time to the desired date." + inputs = { + "url": {"type": "string", "description": "The url you need the archive for."}, + "date": { + "type": "string", + "description": "The date that you want to find the archive for. Give this date in the format 'YYYYMMDD', for instance '27 June 2008' is written as '20080627'.", + }, + } + output_type = "string" + + def __init__(self, browser=None): + super().__init__() + self.browser = browser + + def forward(self, url, date) -> str: + import requests + + no_timestamp_url = f"https://archive.org/wayback/available?url={url}" + archive_url = no_timestamp_url + f"×tamp={date}" + response = requests.get(archive_url).json() + response_notimestamp = requests.get(no_timestamp_url).json() + if "archived_snapshots" in response and "closest" in response["archived_snapshots"]: + closest = response["archived_snapshots"]["closest"] + print("Archive found!", closest) + + elif "archived_snapshots" in response_notimestamp and "closest" in response_notimestamp["archived_snapshots"]: + closest = response_notimestamp["archived_snapshots"]["closest"] + print("Archive found!", closest) + else: + raise Exception(f"Your {url=} was not archived on Wayback Machine, try a different url.") + target_url = closest["url"] + self.browser.visit_page(target_url) + header, content = 
self.browser._state() + return ( + f"Web archive for url {url}, snapshot taken at date {closest['timestamp'][:8]}:\n" + + header.strip() + + "\n=======================\n" + + content + ) + + +class PageUpTool(Tool): + name = "page_up" + description = "Scroll the viewport UP one page-length in the current webpage and return the new viewport content." + inputs = {} + output_type = "string" + + def __init__(self, browser=None): + super().__init__() + self.browser = browser + + def forward(self) -> str: + self.browser.page_up() + header, content = self.browser._state() + return header.strip() + "\n=======================\n" + content + + +class PageDownTool(Tool): + name = "page_down" + description = ( + "Scroll the viewport DOWN one page-length in the current webpage and return the new viewport content." + ) + inputs = {} + output_type = "string" + + def __init__(self, browser=None): + super().__init__() + self.browser = browser + + def forward(self) -> str: + self.browser.page_down() + header, content = self.browser._state() + return header.strip() + "\n=======================\n" + content + + +class FinderTool(Tool): + name = "find_on_page_ctrl_f" + description = "Scroll the viewport to the first occurrence of the search string. This is equivalent to Ctrl+F." + inputs = { + "search_string": { + "type": "string", + "description": "The string to search for on the page. This search string supports wildcards like '*'", + } + } + output_type = "string" + + def __init__(self, browser=None): + super().__init__() + self.browser = browser + + def forward(self, search_string: str) -> str: + find_result = self.browser.find_on_page(search_string) + header, content = self.browser._state() + + if find_result is None: + return ( + header.strip() + + f"\n=======================\nThe search string '{search_string}' was not found on this page." 
+ ) + else: + return header.strip() + "\n=======================\n" + content + + +class FindNextTool(Tool): + name = "find_next" + description = "Scroll the viewport to next occurrence of the search string. This is equivalent to finding the next match in a Ctrl+F search." + inputs = {} + output_type = "string" + + def __init__(self, browser=None): + super().__init__() + self.browser = browser + + def forward(self) -> str: + find_result = self.browser.find_next() + header, content = self.browser._state() + + if find_result is None: + return header.strip() + "\n=======================\nThe search string was not found on this page." + else: + return header.strip() + "\n=======================\n" + content diff --git a/examples/decentralized_smolagents_benchmark/scripts/visual_qa.py b/examples/decentralized_smolagents_benchmark/scripts/visual_qa.py new file mode 100644 index 000000000..01d60b30a --- /dev/null +++ b/examples/decentralized_smolagents_benchmark/scripts/visual_qa.py @@ -0,0 +1,189 @@ +import base64 +import json +import mimetypes +import os +import uuid +from io import BytesIO + +import PIL.Image +import requests +from dotenv import load_dotenv +from huggingface_hub import InferenceClient + +from smolagents import Tool, tool + + +load_dotenv(override=True) + + +def process_images_and_text(image_path, query, client): + from transformers import AutoProcessor + + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": query}, + ], + }, + ] + idefics_processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-chatty") + prompt_with_template = idefics_processor.apply_chat_template(messages, add_generation_prompt=True) + + # load images from local directory + + # encode images to strings which can be sent to the endpoint + def encode_local_image(image_path): + # load image + image = PIL.Image.open(image_path).convert("RGB") + + # Convert the image to a base64 string + buffer = BytesIO() + image.save(buffer, 
format="JPEG") # Use the appropriate format (e.g., JPEG, PNG) + base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8") + + # add string formatting required by the endpoint + image_string = f"data:image/jpeg;base64,{base64_image}" + + return image_string + + image_string = encode_local_image(image_path) + prompt_with_images = prompt_with_template.replace("", "![]({}) ").format(image_string) + + payload = { + "inputs": prompt_with_images, + "parameters": { + "return_full_text": False, + "max_new_tokens": 200, + }, + } + + return json.loads(client.post(json=payload).decode())[0] + + +# Function to encode the image +def encode_image(image_path): + if image_path.startswith("http"): + user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0" + request_kwargs = { + "headers": {"User-Agent": user_agent}, + "stream": True, + } + + # Send a HTTP request to the URL + response = requests.get(image_path, **request_kwargs) + response.raise_for_status() + content_type = response.headers.get("content-type", "") + + extension = mimetypes.guess_extension(content_type) + if extension is None: + extension = ".download" + + fname = str(uuid.uuid4()) + extension + download_path = os.path.abspath(os.path.join("downloads", fname)) + + with open(download_path, "wb") as fh: + for chunk in response.iter_content(chunk_size=512): + fh.write(chunk) + + image_path = download_path + + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + + +def resize_image(image_path): + img = PIL.Image.open(image_path) + width, height = img.size + img = img.resize((int(width / 2), int(height / 2))) + new_image_path = f"resized_{image_path}" + img.save(new_image_path) + return new_image_path + + +class VisualQATool(Tool): + name = "visualizer" + description = "A tool that can answer questions about attached images." 
+ inputs = { + "image_path": { + "description": "The path to the image on which to answer the question", + "type": "string", + }, + "question": {"description": "the question to answer", "type": "string", "nullable": True}, + } + output_type = "string" + + client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty") + + def forward(self, image_path: str, question: str | None = None) -> str: + output = "" + add_note = False + if not question: + add_note = True + question = "Please write a detailed caption for this image." + try: + output = process_images_and_text(image_path, question, self.client) + except Exception as e: + print(e) + if "Payload Too Large" in str(e): + new_image_path = resize_image(image_path) + output = process_images_and_text(new_image_path, question, self.client) + + if add_note: + output = ( + f"You did not provide a particular question, so here is a detailed caption for the image: {output}" + ) + + return output + + +@tool +def visualizer(image_path: str, question: str | None = None) -> str: + """A tool that can answer questions about attached images. + + Args: + image_path: The path to the image on which to answer the question. This should be a local path to downloaded image. + question: The question to answer. + """ + import mimetypes + import os + + import requests + + from .visual_qa import encode_image + + add_note = False + if not question: + add_note = True + question = "Please write a detailed caption for this image." 
+ if not isinstance(image_path, str): + raise Exception("You should provide at least `image_path` string argument to this tool!") + + mime_type, _ = mimetypes.guess_type(image_path) + base64_image = encode_image(image_path) + + payload = { + "model": "gpt-4o", + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": question}, + {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}}, + ], + } + ], + "max_tokens": 1000, + } + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"} + response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) + try: + output = response.json()["choices"][0]["message"]["content"] + except Exception: + raise Exception(f"Response format unexpected: {response.json()}") + + if add_note: + output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}" + + return output diff --git a/examples/open_deep_research/analysis.ipynb b/examples/open_deep_research/analysis.ipynb index 53e7bd430..736d86105 100644 --- a/examples/open_deep_research/analysis.ipynb +++ b/examples/open_deep_research/analysis.ipynb @@ -6,14 +6,30 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install plotly kaleido datasets nbformat -U -q" + "!pip install plotly kaleido datasets nbformat -U -q\n", + "#!export HF_TOKEN=" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "97da58e888e1464b9bfb1a79abe11f02", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='
\u001b[39m\u001b[32m10\u001b[39m result_df = \u001b[43mpd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mconcat\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 11\u001b[39m result_df[\u001b[33m\"\u001b[39m\u001b[33mprediction\u001b[39m\u001b[33m\"\u001b[39m] = result_df[\u001b[33m\"\u001b[39m\u001b[33mprediction\u001b[39m\u001b[33m\"\u001b[39m].fillna(\u001b[33m\"\u001b[39m\u001b[33mNo prediction\u001b[39m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/GitFiles/smolagents/.venv/lib/python3.12/site-packages/pandas/core/reshape/concat.py:382\u001b[39m, in \u001b[36mconcat\u001b[39m\u001b[34m(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)\u001b[39m\n\u001b[32m 379\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m copy \u001b[38;5;129;01mand\u001b[39;00m using_copy_on_write():\n\u001b[32m 380\u001b[39m copy = \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m382\u001b[39m op = \u001b[43m_Concatenator\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 383\u001b[39m \u001b[43m \u001b[49m\u001b[43mobjs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 384\u001b[39m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m=\u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 385\u001b[39m \u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[43m=\u001b[49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 386\u001b[39m \u001b[43m \u001b[49m\u001b[43mjoin\u001b[49m\u001b[43m=\u001b[49m\u001b[43mjoin\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 387\u001b[39m \u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m=\u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 388\u001b[39m \u001b[43m \u001b[49m\u001b[43mlevels\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlevels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 389\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mnames\u001b[49m\u001b[43m=\u001b[49m\u001b[43mnames\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 390\u001b[39m \u001b[43m \u001b[49m\u001b[43mverify_integrity\u001b[49m\u001b[43m=\u001b[49m\u001b[43mverify_integrity\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 391\u001b[39m \u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 392\u001b[39m \u001b[43m \u001b[49m\u001b[43msort\u001b[49m\u001b[43m=\u001b[49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 393\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 395\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m op.get_result()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/GitFiles/smolagents/.venv/lib/python3.12/site-packages/pandas/core/reshape/concat.py:445\u001b[39m, in \u001b[36m_Concatenator.__init__\u001b[39m\u001b[34m(self, objs, axis, join, keys, levels, names, ignore_index, verify_integrity, copy, sort)\u001b[39m\n\u001b[32m 442\u001b[39m \u001b[38;5;28mself\u001b[39m.verify_integrity = verify_integrity\n\u001b[32m 443\u001b[39m \u001b[38;5;28mself\u001b[39m.copy = copy\n\u001b[32m--> \u001b[39m\u001b[32m445\u001b[39m objs, keys = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_clean_keys_and_objs\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobjs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 447\u001b[39m \u001b[38;5;66;03m# figure out what our result ndim is going to be\u001b[39;00m\n\u001b[32m 448\u001b[39m ndims = \u001b[38;5;28mself\u001b[39m._get_ndims(objs)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/GitFiles/smolagents/.venv/lib/python3.12/site-packages/pandas/core/reshape/concat.py:507\u001b[39m, in \u001b[36m_Concatenator._clean_keys_and_objs\u001b[39m\u001b[34m(self, objs, keys)\u001b[39m\n\u001b[32m 504\u001b[39m objs_list = \u001b[38;5;28mlist\u001b[39m(objs)\n\u001b[32m 506\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28mlen\u001b[39m(objs_list) == \u001b[32m0\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m507\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33mNo objects to concatenate\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 509\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m keys \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 510\u001b[39m objs_list = \u001b[38;5;28mlist\u001b[39m(com.not_none(*objs_list))\n", + "\u001b[31mValueError\u001b[39m: No objects to concatenate" + ] + } + ], "source": [ "import glob\n", "\n", @@ -435,7 +491,7 @@ ], "metadata": { "kernelspec": { - "display_name": "agents", + "display_name": "smolagents", "language": "python", "name": "python3" }, @@ -449,7 +505,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.0" + "version": "3.12.11" } }, "nbformat": 4, diff --git a/examples/open_deep_research/requirements.txt b/examples/open_deep_research/requirements.txt index fe6c98ef2..29c868a34 100644 --- a/examples/open_deep_research/requirements.txt +++ b/examples/open_deep_research/requirements.txt @@ -12,7 +12,6 @@ openai>=1.52.2 openpyxl pandas>=2.2.3 pathvalidate>=3.2.1 -pdfminer>=20191125 pdfminer.six>=20240706 Pillow>=11.0.0 puremagic>=1.28 diff --git a/examples/open_deep_research/run_gaia.py b/examples/open_deep_research/run_gaia.py index 9c7bacd4e..dadb8dc43 100644 --- a/examples/open_deep_research/run_gaia.py +++ b/examples/open_deep_research/run_gaia.py @@ -1,4 +1,10 @@ -# EXAMPLE COMMAND: from folder examples/open_deep_research, run: python run_gaia.py --concurrency 32 --run-name generate-traces-03-apr-noplanning --model-id gpt-4o +# GAIA Benchmark Runner for Open Deep Research +# ============================================= +# +# +# EXAMPLE COMMAND: from folder examples/open_deep_research, run: +# python run_gaia.py --concurrency 32 --run-name generate-traces-03-apr-noplanning 
--model-id gpt-4o + import argparse import json import os @@ -8,10 +14,16 @@ from pathlib import Path from typing import Any +from langfuse import get_client + + +BASE_DIR = Path(__file__).resolve().parent + import datasets import pandas as pd from dotenv import load_dotenv from huggingface_hub import login, snapshot_download +from scripts.gaia_scorer import check_close_call, question_scorer from scripts.reformulator import prepare_response from scripts.run_agents import ( get_single_file_description, @@ -40,6 +52,25 @@ load_dotenv(override=True) + +langfuse = get_client() + +# Verify connection +if langfuse.auth_check(): + print("Langfuse client is authenticated and ready!") +else: + print("Authentication failed. Please check your credentials and host.") + +from openinference.instrumentation.smolagents import SmolagentsInstrumentor + + +with langfuse.start_as_current_span(name="another-operation"): + # Add to the current trace + langfuse.update_current_trace(session_id="zero_shot_1", user_id="cvt8") + + +SmolagentsInstrumentor().instrument() + login(os.getenv("HF_TOKEN")) append_answer_lock = threading.Lock() @@ -93,6 +124,7 @@ def create_agent_team(model: Model): FindNextTool(browser), ArchiveSearchTool(browser), TextInspectorTool(model, text_limit), + visualizer, # Correcting error: "prediction": "Critical Error: Cannot use inspect_file_as_text tool with images: use visualizer instead!", ] text_webbrowser_agent = ToolCallingAgent( @@ -127,12 +159,13 @@ def create_agent_team(model: Model): def load_gaia_dataset(use_raw_dataset: bool, set_to_run: str) -> datasets.Dataset: - if not os.path.exists("data/gaia"): + data_dir = BASE_DIR / "data" / "gaia" + if not data_dir.exists(): if use_raw_dataset: snapshot_download( repo_id="gaia-benchmark/GAIA", repo_type="dataset", - local_dir="data/gaia", + local_dir=str(data_dir), ignore_patterns=[".gitattributes", "README.md"], ) else: @@ -140,20 +173,20 @@ def load_gaia_dataset(use_raw_dataset: bool, set_to_run: str) -> 
datasets.Datase snapshot_download( repo_id="smolagents/GAIA-annotated", repo_type="dataset", - local_dir="data/gaia", + local_dir=str(data_dir), ignore_patterns=[".gitattributes", "README.md"], ) def preprocess_file_paths(row): if len(row["file_name"]) > 0: - row["file_name"] = f"data/gaia/{set_to_run}/" + row["file_name"] + row["file_name"] = str(data_dir / "2023" / set_to_run / row["file_name"]) return row eval_ds = datasets.load_dataset( - "data/gaia/GAIA.py", - name="2023_all", + path=str(data_dir), + name="default", split=set_to_run, - # data_files={"validation": "validation/metadata.jsonl", "test": "test/metadata.jsonl"}, + data_files={"validation": "2023/validation/metadata.jsonl", "test": "2023/test/metadata.jsonl"}, ) eval_ds = eval_ds.rename_columns({"Question": "question", "Final answer": "true_answer", "Level": "task"}) @@ -161,91 +194,132 @@ def preprocess_file_paths(row): return eval_ds -def append_answer(entry: dict, jsonl_file: str) -> None: +def append_answer(entry: dict, jsonl_file: Path) -> None: jsonl_path = Path(jsonl_file) jsonl_path.parent.mkdir(parents=True, exist_ok=True) - with append_answer_lock, open(jsonl_file, "a", encoding="utf-8") as fp: - fp.write(json.dumps(entry) + "\n") - assert jsonl_path.exists(), "File not found!" 
- print("Answer exported to file:", jsonl_path.resolve()) + + def convert_to_serializable(obj): + """Convert objects to JSON serializable format""" + if hasattr(obj, "dict"): + return obj.dict() + elif hasattr(obj, "__dict__"): + return obj.__dict__ + else: + return str(obj) + + try: + with append_answer_lock, jsonl_path.open("a", encoding="utf-8") as fp: + fp.write(json.dumps(entry, default=convert_to_serializable, ensure_ascii=False) + "\n") + fp.flush() # Ensure the buffer is flushed immediately + os.fsync(fp.fileno()) # Force the file system to write the data to disk + except Exception as e: + print(f"Error writing to answers file {jsonl_path}: {e}") def answer_single_question( - example: dict, model_id: str, answers_file: str, visual_inspection_tool: TextInspectorTool + example: dict, model_id: str, answers_file: str, errors_file: str, visual_inspection_tool ) -> None: - model_params: dict[str, Any] = { - "model_id": model_id, - "custom_role_conversions": custom_role_conversions, - } - if model_id == "o1": - model_params["reasoning_effort"] = "high" - model_params["max_completion_tokens"] = 8192 - else: - model_params["max_tokens"] = 4096 - model = LiteLLMModel(**model_params) - # model = InferenceClientModel(model_id="Qwen/Qwen3-32B", provider="novita", max_tokens=4096) - document_inspection_tool = TextInspectorTool(model, 100000) + print(f"Processing question: {example['question']}") - agent = create_agent_team(model) + # Initialize variables to avoid unbound errors + start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + end_time = None + intermediate_steps = [] + parsing_error = False + iteration_limit_exceeded = False + raised_exception = False + exception = None + output = None + augmented_question = example["question"] # Default value + + try: + model_params: dict[str, Any] = { + "model_id": model_id, + "custom_role_conversions": custom_role_conversions, + } + if model_id == "o1": + model_params["reasoning_effort"] = "high" + 
model_params["max_completion_tokens"] = 8192 + else: + model_params["max_tokens"] = 4096 + model = LiteLLMModel(**model_params) + document_inspection_tool = TextInspectorTool(model, 100000) - augmented_question = """You have one question to answer. It is paramount that you provide a correct answer. + agent = create_agent_team(model) + + augmented_question = """You have one question to answer. It is paramount that you provide a correct answer. Give it all you can: I know for a fact that you have access to all the relevant tools to solve it and find the correct answer (the answer does exist). Failure or 'I cannot answer' or 'None found' will not be tolerated, success will be rewarded. Run verification steps if that's needed, you must make sure you find the correct answer! Here is the task: """ + example["question"] - if example["file_name"]: - if ".zip" in example["file_name"]: - prompt_use_files = "\n\nTo solve the task above, you will have to use these attached files:\n" - prompt_use_files += get_zip_description( - example["file_name"], example["question"], visual_inspection_tool, document_inspection_tool - ) - else: - prompt_use_files = "\n\nTo solve the task above, you will have to use this attached file:\n" - prompt_use_files += get_single_file_description( - example["file_name"], example["question"], visual_inspection_tool, document_inspection_tool - ) - augmented_question += prompt_use_files - - start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - try: - # Run agent πŸš€ - final_result = agent.run(augmented_question) - - agent_memory = agent.write_memory_to_messages() - - final_result = prepare_response(augmented_question, agent_memory, reformulation_model=model) - - output = str(final_result) - for memory_step in agent.memory.steps: - memory_step.model_input_messages = None - intermediate_steps = agent_memory - - # Check for parsing errors which indicate the LLM failed to follow the required format - parsing_error = True if any(["AgentParsingError" in 
step for step in intermediate_steps]) else False - - # check if iteration limit exceeded - iteration_limit_exceeded = True if "Agent stopped due to iteration limit or time limit." in output else False - raised_exception = False + if example.get("file_name"): + if ".zip" in example["file_name"]: + prompt_use_files = "\n\nTo solve the task above, you will have to use these attached files:\n" + prompt_use_files += get_zip_description( + example["file_name"], example["question"], visual_inspection_tool, document_inspection_tool + ) + else: + prompt_use_files = "\n\nTo solve the task above, you will have to use this attached file:\n" + prompt_use_files += get_single_file_description( + example["file_name"], example["question"], visual_inspection_tool, document_inspection_tool + ) + augmented_question += prompt_use_files + + try: + final_result = agent.run(augmented_question) + agent_memory = agent.write_memory_to_messages() + final_result = prepare_response(augmented_question, agent_memory, reformulation_model=model) + output = str(final_result) + # Fix: Use .dict() method for ChatMessage objects instead of dict() constructor + intermediate_steps = [msg.dict() if hasattr(msg, "dict") else str(msg) for msg in agent_memory] + parsing_error = any("AgentParsingError" in str(step) for step in intermediate_steps) + iteration_limit_exceeded = "Agent stopped due to iteration limit or time limit." 
in output + except Exception as e: + print(f"Error processing question '{example['question']}': {e}") + output = f"Error: {str(e)}" + exception = e + raised_exception = True + + end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + # Safely get token counts with fallback + try: + token_counts_manager = agent.monitor.get_total_token_counts() + token_counts_web = list(agent.managed_agents.values())[0].monitor.get_total_token_counts() + total_token_counts = { + "input": getattr(token_counts_manager, "input", 0) + getattr(token_counts_web, "input", 0), + "output": getattr(token_counts_manager, "output", 0) + getattr(token_counts_web, "output", 0), + } + except Exception as e: + print(f"Error getting token counts: {e}") + total_token_counts = {"input": 0, "output": 0} except Exception as e: - print("Error on ", augmented_question, e) - output = None - intermediate_steps = [] - parsing_error = False - iteration_limit_exceeded = False + print(f"Critical error in answer_single_question for '{example['question']}': {e}") + end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + output = f"Critical Error: {str(e)}" exception = e raised_exception = True - end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - token_counts_manager = agent.monitor.get_total_token_counts() - token_counts_web = list(agent.managed_agents.values())[0].monitor.get_total_token_counts() - total_token_counts = { - "input": token_counts_manager["input"] + token_counts_web["input"], - "output": token_counts_manager["output"] + token_counts_web["output"], - } + total_token_counts = {"input": 0, "output": 0} + intermediate_steps = [] + + # Always compute scores and append answer, regardless of errors + try: + is_correct = question_scorer(str(output), str(example["true_answer"])) + is_near_correct = check_close_call(str(output), str(example["true_answer"]), is_correct) + except Exception as e: + print(f"Error computing scores: {e}") + is_correct = False + is_near_correct = False + + 
print(f"Question: {example['question'][:50]}{'...' if len(example['question']) > 50 else ''}") + print(f"Prediction: {str(output)[:100]}{'...' if len(str(output)) > 100 else ''}") + print(f"Correct: {is_correct}") + annotated_example = { - "agent_name": model.model_id, + "agent_name": model_id, "question": example["question"], "augmented_question": augmented_question, "prediction": output, @@ -256,14 +330,22 @@ def answer_single_question( "task": example["task"], "task_id": example["task_id"], "true_answer": example["true_answer"], + "is_correct": is_correct, + "is_near_correct": is_near_correct, "start_time": start_time, "end_time": end_time, "token_counts": total_token_counts, } - append_answer(annotated_example, answers_file) + + # Always append to main answers file + append_answer(annotated_example, Path(answers_file)) + + # If there was an error, also append to errors file + if raised_exception or parsing_error or iteration_limit_exceeded: + append_answer(annotated_example, Path(errors_file)) -def get_examples_to_answer(answers_file: str, eval_ds: datasets.Dataset) -> list[dict]: +def get_examples_to_answer(answers_file: Path, eval_ds: datasets.Dataset) -> list[dict]: print(f"Loading answers from {answers_file}...") try: done_questions = pd.read_json(answers_file, lines=True)["question"].tolist() @@ -272,31 +354,193 @@ def get_examples_to_answer(answers_file: str, eval_ds: datasets.Dataset) -> list print("Error when loading records: ", e) print("No usable records! 
▢️ Starting new.") done_questions = [] - return [line for line in eval_ds.to_list() if line["question"] not in done_questions and line["file_name"]] + return [line for line in eval_ds.to_list() if line["question"] not in done_questions] + + +def compute_score(answers_file: Path) -> None: + if not answers_file.exists(): + print(f"Error: The file {answers_file} does not exist.") + return + + if answers_file.stat().st_size == 0: + print(f"Error: The file {answers_file} is empty.") + return + + try: + df = pd.read_json(answers_file, lines=True) + except ValueError as e: + print(f"Error reading JSON from {answers_file}: {e}") + return + + if "is_correct" not in df.columns: + df["is_correct"] = df.apply(lambda x: question_scorer(str(x["prediction"]), str(x["true_answer"])), axis=1) + + # Calculate comprehensive scores + total_questions = len(df) + correct_answers = df["is_correct"].sum() + accuracy = df["is_correct"].mean() + + # Calculate additional metrics + error_count = df["agent_error"].notna().sum() + parsing_error_count = df["parsing_error"].sum() + iteration_limit_count = df["iteration_limit_exceeded"].sum() + + # Group by task level for detailed analysis + task_scores = None + if "task" in df.columns: + task_scores = ( + df.groupby("task") + .agg( + { + "is_correct": ["count", "sum", "mean"], + "agent_error": lambda x: x.notna().sum(), + "parsing_error": "sum", + "iteration_limit_exceeded": "sum", + } + ) + .round(3) + ) + task_scores.columns = ["total", "correct", "accuracy", "errors", "parsing_errors", "iteration_limits"] + + # Save detailed score analysis + score_file = answers_file.parent / "detailed_scores.txt" + with score_file.open("w", encoding="utf-8") as f: + f.write("GAIA Benchmark Detailed Results\n") + f.write("=" * 50 + "\n\n") + f.write("Overall Performance:\n") + f.write(f" Total Questions: {total_questions}\n") + f.write(f" Correct Answers: {correct_answers}\n") + f.write(f" Accuracy: {accuracy * 100:.2f}%\n\n") + + f.write("Error Analysis:\n") 
+ f.write(f" Agent Errors: {error_count} ({error_count / total_questions * 100:.1f}%)\n") + f.write(f" Parsing Errors: {parsing_error_count} ({parsing_error_count / total_questions * 100:.1f}%)\n") + f.write( + f" Iteration Limits: {iteration_limit_count} ({iteration_limit_count / total_questions * 100:.1f}%)\n\n" + ) + + if "task" in df.columns: + f.write("Performance by Task Level:\n") + if task_scores is not None: + f.write(str(task_scores) + "\n\n") + + f.write("Individual Results:\n") + f.write("-" * 50 + "\n") + for idx, row in df.iterrows(): + status = "βœ… CORRECT" if row["is_correct"] else "❌ INCORRECT" + f.write(f"{status} | Task {row.get('task', 'N/A')} | {row['question'][:80]}...\n") + f.write(f" Predicted: {str(row['prediction'])[:100]}...\n") + f.write(f" Expected: {str(row['true_answer'])[:100]}...\n") + if row.get("agent_error"): + f.write(f" Error: {str(row['agent_error'])[:100]}...\n") + f.write("\n") + + # Save JSON summary for programmatic access + summary_file = answers_file.parent / "score_summary.json" + summary_data = { + "total_questions": int(total_questions), + "correct_answers": int(correct_answers), + "accuracy": float(accuracy), + "error_rate": float(error_count / total_questions), + "parsing_error_rate": float(parsing_error_count / total_questions), + "iteration_limit_rate": float(iteration_limit_count / total_questions), + "timestamp": datetime.now().isoformat(), + "answers_file": str(answers_file.name), + } + + if "task" in df.columns and task_scores is not None: + summary_data["task_performance"] = task_scores.to_dict() + + with summary_file.open("w", encoding="utf-8") as f: + json.dump(summary_data, f, indent=2, ensure_ascii=False) + + print("\n" + "=" * 60) + print("πŸ“Š GAIA BENCHMARK RESULTS") + print("=" * 60) + print("πŸ“ˆ Overall Performance:") + print(f" Total Questions: {total_questions}") + print(f" Correct Answers: {correct_answers}") + print(f" Accuracy: {accuracy * 100:.2f}%") + print() + print("⚠️ Error Analysis:") + 
print(f" Agent Errors: {error_count} ({error_count / total_questions * 100:.1f}%)") + print(f" Parsing Errors: {parsing_error_count} ({parsing_error_count / total_questions * 100:.1f}%)") + print(f" Iteration Limits: {iteration_limit_count} ({iteration_limit_count / total_questions * 100:.1f}%)") + + if "task" in df.columns: + print() + print("πŸ“‹ Performance by Task Level:") + for task_level in sorted(df["task"].unique()): + task_data = df[df["task"] == task_level] + task_acc = task_data["is_correct"].mean() + task_count = len(task_data) + task_correct = task_data["is_correct"].sum() + print(f" Level {task_level}: {task_acc * 100:.1f}% ({task_correct}/{task_count})") + + print() + print(f"πŸ’Ύ Detailed results saved to: {score_file}") + print(f"πŸ’Ύ Summary data saved to: {summary_file}") + print("=" * 60) def main(): args = parse_args() print(f"Starting run with arguments: {args}") + def create_output_folders(set_to_run): + """Create output folders if they don't exist.""" + output_folder = Path(f"output/{set_to_run}") + output_folder.mkdir(parents=True, exist_ok=True) + print(f"Ensured output folder exists at: {output_folder}") + + create_output_folders(args.set_to_run) + eval_ds = load_gaia_dataset(args.use_raw_dataset, args.set_to_run) print("Loaded evaluation dataset:") - print(pd.DataFrame(eval_ds)["task"].value_counts()) + eval_df = pd.DataFrame(list(eval_ds)) + print(eval_df["task"].value_counts()) - answers_file = f"output/{args.set_to_run}/{args.run_name}.jsonl" + answers_file = BASE_DIR / "output" / args.set_to_run / f"{args.run_name}.jsonl" + errors_file = BASE_DIR / "output" / args.set_to_run / f"{args.run_name}_errors.jsonl" tasks_to_run = get_examples_to_answer(answers_file, eval_ds) + print(f"Tasks to run: {len(tasks_to_run)}") + if len(tasks_to_run) == 0: + print("No new tasks to process. 
All questions may have been completed already.") + print("To rerun all tasks, delete the existing output file.") + return + with ThreadPoolExecutor(max_workers=args.concurrency) as exe: futures = [ - exe.submit(answer_single_question, example, args.model_id, answers_file, visualizer) + exe.submit( + answer_single_question, + example, + args.model_id, + answers_file, + errors_file, + visualizer, # Fix: Use visualizer for images instead of TextInspectorTool + ) for example in tasks_to_run ] for f in tqdm(as_completed(futures), total=len(tasks_to_run), desc="Processing tasks"): - f.result() + try: + f.result() + except Exception as e: + print(f"Error in task: {e}") + continue + + if not answers_file.exists(): + print(f"Error: The answers file {answers_file} was not created. Check the append_answer function.") + else: + print(f"βœ… Main answers file created successfully: {answers_file}") + + if errors_file.exists(): + print(f"⚠️ Errors file created: {errors_file}") + else: + print("βœ… No errors file created (no errors encountered)") - # for example in tasks_to_run: - # answer_single_question(example, args.model_id, answers_file, visualizer) print("All tasks processed.") + compute_score(answers_file) if __name__ == "__main__": diff --git a/examples/open_deep_research/scripts/mdconvert.py b/examples/open_deep_research/scripts/mdconvert.py index 939cd121a..6a4b40f62 100644 --- a/examples/open_deep_research/scripts/mdconvert.py +++ b/examples/open_deep_research/scripts/mdconvert.py @@ -983,12 +983,23 @@ def _append_ext(self, extensions, ext): def _guess_ext_magic(self, path): """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes.""" # Use puremagic to guess + if not os.path.isfile(path): + raise ValueError(f"Invalid file path passed to _guess_ext_magic: {path}") try: guesses = puremagic.magic_file(path) if len(guesses) > 0: ext = guesses[0].extension.strip() if len(ext) > 0: return ext + except 
puremagic.PureMagicError as e: + # If puremagic fails, we just ignore it and return None + print(f"Error in puremagic: {e}") + except puremagic.PureMagicNotFoundError: + # If puremagic is not installed, we just ignore it and return None + print("puremagic is not installed. Skipping magic file type detection.") + except ValueError: + # If the path is not a file, we just ignore it and return None + pass except FileNotFoundError: pass except IsADirectoryError: diff --git a/examples/open_deep_research/scripts/run_agents.py b/examples/open_deep_research/scripts/run_agents.py index e2e020cb3..fa2df2ad2 100644 --- a/examples/open_deep_research/scripts/run_agents.py +++ b/examples/open_deep_research/scripts/run_agents.py @@ -9,6 +9,7 @@ def serialize_agent_error(obj): + """Serialize AgentError objects for JSON output.""" if isinstance(obj, AgentError): return {"error_type": obj.__class__.__name__, "message": obj.message} else: @@ -16,6 +17,13 @@ def serialize_agent_error(obj): def get_image_description(file_name: str, question: str, visual_inspection_tool) -> str: + """ + Get description of an image file for context. + + DEBUGGING FIX: Changed parameter from file_path to image_path + - ISSUE: TypeError due to mismatched parameter name + - SOLUTION: Use image_path parameter as expected by visualizer tool + """ prompt = f"""Write a caption of 5 sentences for this image. Pay special attention to any details that might be useful for someone answering the following question: {question}. But do not try to answer the question directly! 
Do not add any information that is not present in the image.""" diff --git a/examples/open_deep_research/visual_vs_text_browser.ipynb b/examples/open_deep_research/visual_vs_text_browser.ipynb index 1acc5c704..5eae43434 100644 --- a/examples/open_deep_research/visual_vs_text_browser.ipynb +++ b/examples/open_deep_research/visual_vs_text_browser.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -20,9 +20,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using the latest cached version of the dataset since gaia-benchmark/GAIA couldn't be found on the Hugging Face Hub\n", + "Found the latest cached dataset configuration '2023_all' at /home/ecca/.cache/huggingface/datasets/gaia-benchmark___gaia/2023_all/0.0.1/ec492fe4320ee795b1aed6bb46229c5f693226b0f1316347501c24b4baeee005 (last modified on Mon Jul 21 11:03:18 2025).\n" + ] + } + ], "source": [ "import datasets\n", "\n", @@ -34,7 +43,22 @@ "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0132ebaa6aed4c348770d58aae5ef8a1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Filter: 0%| | 0/165 [00:00 \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mscripts\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mrun_agents\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m answer_questions\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mscripts\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mtext_inspector_tool\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m 
TextInspectorTool\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mscripts\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mtext_web_browser\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[32m 4\u001b[39m ArchiveSearchTool,\n\u001b[32m 5\u001b[39m FinderTool,\n\u001b[32m (...)\u001b[39m\u001b[32m 11\u001b[39m VisitTool,\n\u001b[32m 12\u001b[39m )\n", + "\u001b[31mImportError\u001b[39m: cannot import name 'answer_questions' from 'scripts.run_agents' (/home/ecca/GitFiles/smolagents/examples/open_deep_research/scripts/run_agents.py)" + ] + } + ], "source": [ "from scripts.run_agents import answer_questions\n", "from scripts.text_inspector_tool import TextInspectorTool\n", @@ -337,7 +388,7 @@ ], "metadata": { "kernelspec": { - "display_name": "gaia", + "display_name": "smolagents", "language": "python", "name": "python3" }, @@ -351,7 +402,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.0" + "version": "3.12.11" } }, "nbformat": 4, diff --git a/examples/smolagents_benchmark/model_performance_comparison.png b/examples/smolagents_benchmark/model_performance_comparison.png new file mode 100644 index 000000000..30cb23506 Binary files /dev/null and b/examples/smolagents_benchmark/model_performance_comparison.png differ diff --git a/examples/smolagents_benchmark/run.py b/examples/smolagents_benchmark/run.py index aa8e48570..f7716fc0a 100644 --- a/examples/smolagents_benchmark/run.py +++ b/examples/smolagents_benchmark/run.py @@ -1,15 +1,25 @@ +# Smolagents Benchmark Runner +# =========================== +# +# Example usage: +# python run.py --model-type LiteLLMModel --model-id gpt-4o --provider openai --agent-action-type tool-calling +# python run.py --model-type LiteLLMModel --model-id gpt-4o --provider openai --agent-action-type code + import argparse import datetime import json import os +import re import threading import time 
from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path +from typing import Generator import datasets import pandas as pd from dotenv import load_dotenv +from langfuse import get_client from tqdm import tqdm from smolagents import ( @@ -22,9 +32,30 @@ ToolCallingAgent, VisitWebpageTool, ) +from smolagents.memory import ActionStep, FinalAnswerStep, PlanningStep +from smolagents.models import ChatMessageStreamDelta load_dotenv() + +langfuse = get_client() + +# Verify connection +if langfuse.auth_check(): + print("Langfuse client is authenticated and ready!") +else: + print("Authentication failed. Please check your credentials and host.") + +from openinference.instrumentation.smolagents import SmolagentsInstrumentor + + +with langfuse.start_as_current_span(name="another-operation"): + # Add to the current trace + langfuse.update_current_trace(session_id="zero_shot_1", user_id="cvt8") + + +SmolagentsInstrumentor().instrument() + os.makedirs("output", exist_ok=True) APPEND_ANSWER_LOCK = threading.Lock() @@ -129,7 +160,6 @@ def answer_single_question(example, model, answers_file, action_type): agent = CodeAgent( tools=[GoogleSearchTool(provider="serper"), VisitWebpageTool()], model=model, - additional_authorized_imports=["numpy", "sympy"], max_steps=10, ) elif action_type == "tool-calling": @@ -154,17 +184,45 @@ def answer_single_question(example, model, answers_file, action_type): try: if action_type == "vanilla": answer = agent([{"role": "user", "content": augmented_question}]).content - token_counts = agent.monitor.get_total_token_counts() + # For vanilla agents, the agent is just the model and doesn't have a monitor + token_counts = {"input": 0, "output": 0} intermediate_steps = answer else: # Run agent πŸš€ - answer = str(agent.run(augmented_question)) + result = agent.run(augmented_question) + try: + if isinstance(result, Generator): + steps = list(result) + final_step = steps[-1] if steps else None + if isinstance(final_step, 
FinalAnswerStep): + answer = final_step.output + elif isinstance(final_step, ActionStep): + answer = final_step.action_output + elif isinstance(final_step, PlanningStep): + answer = final_step.plan + elif isinstance(final_step, ChatMessageStreamDelta): + answer = final_step.content + else: + answer = str(final_step) + else: + answer = result if isinstance(result, str) else str(result) + except Exception as e: + answer = f"Error extracting answer: {str(e)}" + token_counts = agent.monitor.get_total_token_counts() + intermediate_steps = [message.dict() for message in agent.write_memory_to_messages()] end_time = time.time() + duration = end_time - start_time + answer_preview = str(answer)[:100] + ("..." if len(str(answer)) > 100 else "") if answer else "No answer" + print(f"βœ… Question processed in {duration:.2f}s - Answer: {answer_preview}") except Exception as e: - print("Error on ", augmented_question, e) + end_time = time.time() + duration = end_time - start_time + question_preview = str(augmented_question)[:50] + ("..." if len(str(augmented_question)) > 50 else "") + print(f"❌ Error after {duration:.2f}s on question: {question_preview}") + print(f" Error details: {str(e)}") intermediate_steps = [] token_counts = {"input": 0, "output": 0} answer = str(e) @@ -185,13 +243,161 @@ def answer_single_question(example, model, answers_file, action_type): append_answer(annotated_example, answers_file) +# ============================== +# SCORING SYSTEM (ADDED FEATURE) +# ============================== +# This section was added to provide comprehensive benchmark evaluation +# with multiple scoring metrics and detailed performance analysis. + + +def normalize_answer(answer): + """ + Normalize answer for comparison. + + Removes extra whitespace, converts to lowercase, and strips punctuation + to enable more flexible answer matching. 
+ """ + if answer is None: + return "" + answer = str(answer).strip().lower() + # Remove extra whitespace + answer = re.sub(r"\s+", " ", answer) + # Remove common punctuation at the end + answer = re.sub(r"[.!?;,]+$", "", answer) + return answer + + +def calculate_exact_match_score(predicted_answer, true_answer): + """ + Calculate exact match score (1.0 for perfect match, 0.0 otherwise). + + This is the strictest scoring metric. + """ + return 1.0 if normalize_answer(predicted_answer) == normalize_answer(true_answer) else 0.0 + + +def calculate_contains_score(predicted_answer, true_answer): + """Calculate score based on whether the predicted answer contains the true answer.""" + normalized_pred = normalize_answer(predicted_answer) + normalized_true = normalize_answer(true_answer) + + if not normalized_true: + return 0.0 + + return 1.0 if normalized_true in normalized_pred else 0.0 + + +def calculate_benchmark_scores(jsonl_file_path): + """Calculate scores for a benchmark result file.""" + if not os.path.exists(jsonl_file_path): + return {"error": "File not found"} + + total_questions = 0 + exact_matches = 0 + contains_matches = 0 + + with open(jsonl_file_path, "r", encoding="utf-8") as f: + for line in f: + try: + data = json.loads(line.strip()) + if not data: + continue + + predicted = data.get("answer", "") + true_answer = data.get("true_answer", "") + + total_questions += 1 + exact_matches += calculate_exact_match_score(predicted, true_answer) + contains_matches += calculate_contains_score(predicted, true_answer) + + except json.JSONDecodeError: + continue + + if total_questions == 0: + return {"error": "No valid questions found"} + + return { + "total_questions": total_questions, + "exact_match_score": exact_matches / total_questions, + "contains_score": contains_matches / total_questions, + "exact_matches": exact_matches, + "contains_matches": contains_matches, + } + + +def save_benchmark_scores(output_dir, model_id, action_type, date, eval_ds): + """Calculate 
and save scores for all benchmarks.""" + scores_file = f"{output_dir}/benchmark_scores_{model_id.replace('/', '__')}__{action_type}__{date}.json" + + all_scores = { + "model_id": model_id, + "action_type": action_type, + "date": date, + "timestamp": datetime.datetime.now().isoformat(), + "benchmarks": {}, + } + + total_questions_all = 0 + total_exact_matches_all = 0 + total_contains_matches_all = 0 + + print("\nπŸ“Š Calculating benchmark scores...") + + for task in eval_ds: + jsonl_file = f"{output_dir}/{model_id.replace('/', '__')}__{action_type}__{task}__{date}.jsonl" + scores = calculate_benchmark_scores(jsonl_file) + + if "error" not in scores: + all_scores["benchmarks"][task] = scores + total_questions_all += scores["total_questions"] + total_exact_matches_all += scores["exact_matches"] + total_contains_matches_all += scores["contains_matches"] + + print(f" πŸ“ˆ {task.upper()}:") + print(f" Questions: {scores['total_questions']}") + print( + f" Exact Match: {scores['exact_match_score']:.1%} ({scores['exact_matches']}/{scores['total_questions']})" + ) + print( + f" Contains: {scores['contains_score']:.1%} ({scores['contains_matches']}/{scores['total_questions']})" + ) + else: + print(f" ❌ {task.upper()}: {scores['error']}") + + # Overall scores + if total_questions_all > 0: + all_scores["overall"] = { + "total_questions": total_questions_all, + "exact_match_score": total_exact_matches_all / total_questions_all, + "contains_score": total_contains_matches_all / total_questions_all, + "exact_matches": total_exact_matches_all, + "contains_matches": total_contains_matches_all, + } + + print("\n🎯 OVERALL SCORES:") + print(f" Questions: {total_questions_all}") + print( + f" Exact Match: {all_scores['overall']['exact_match_score']:.1%} ({total_exact_matches_all}/{total_questions_all})" + ) + print( + f" Contains: {all_scores['overall']['contains_score']:.1%} ({total_contains_matches_all}/{total_questions_all})" + ) + + # Save scores to file + with open(scores_file, 
"w", encoding="utf-8") as f: + json.dump(all_scores, f, indent=2, ensure_ascii=False) + + print(f"\nπŸ’Ύ Scores saved to: {scores_file}") + return all_scores + + def answer_questions( eval_ds, model, date, action_type: str = "code", output_dir: str = "output", - answers_dataset: str = None, + answers_dataset: str | None = None, push_answers_to_hub: bool = False, parallel_workers: int = 32, ): @@ -200,7 +406,8 @@ def answer_questions( for task in eval_ds: file_name = f"{output_dir}/{model_id.replace('/', '__')}__{action_type}__{task}__{date}.jsonl" - print(f"Starting processing and writing output to '{file_name}'") + print(f"\nπŸš€ Starting benchmark: {task}") + print(f"πŸ“„ Writing output to: {file_name}") answered_questions = [] if os.path.exists(file_name): with open(file_name, "r") as f: @@ -208,7 +415,19 @@ def answer_questions( answered_questions.append(json.loads(line)["original_question"]) examples_todo = [example for example in eval_ds[task] if example["question"] not in answered_questions] - print(f"Launching {parallel_workers} parallel workers.") + total_questions = len(eval_ds[task]) + remaining_questions = len(examples_todo) + completed_questions = total_questions - remaining_questions + + # IMPROVED LOGGING: Added detailed progress tracking with emojis for better readability + print( + f"πŸ“Š Progress: {completed_questions}/{total_questions} questions completed ({remaining_questions} remaining)" + ) + if remaining_questions == 0: + print(f"βœ… All questions for {task} already completed!") + continue + + print(f"πŸ‘₯ Launching {parallel_workers} parallel workers...") with ThreadPoolExecutor(max_workers=parallel_workers) as exe: futures = [ @@ -217,7 +436,7 @@ def answer_questions( for f in tqdm(as_completed(futures), total=len(examples_todo), desc="Processing tasks"): f.result() - print("All tasks processed.") + print(f"βœ… All tasks for {task} processed.") if push_answers_to_hub and answers_dataset: print("Pushing answers to hub...") @@ -232,6 +451,9 
@@ def answer_questions( commit_message=f"Upload {config}", ) + # Calculate and save benchmark scores + save_benchmark_scores(output_dir, model_id, action_type, date, eval_ds) + if __name__ == "__main__": args = parse_arguments() diff --git a/examples/smolagents_benchmark/score.ipynb b/examples/smolagents_benchmark/score.ipynb index 7d8709498..659bc0c49 100644 --- a/examples/smolagents_benchmark/score.ipynb +++ b/examples/smolagents_benchmark/score.ipynb @@ -2,24 +2,26 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "!pip install -e .. datasets sympy numpy matplotlib seaborn -q # Install dev version of smolagents + some packages" + "#!pip install -e .. datasets sympy numpy matplotlib seaborn -q # Install dev version of smolagents + some packages\n", + "#!export HF_TOKEN=" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Benchmark date\n", "# - set a concrete date:\n", - "DATE = \"2024-12-26\"\n", + "DATE = \"2025-07-17\"\n", "# - or use default: today\n", "# DATE = None\n", + "# DATE = DATE or datetime.date.today().isoformat()\n", "\n", "# Evaluation dataset\n", "# - the dataset is gated, so you must first visit its page to request access: https://huggingface.co/datasets/smolagents-benchmark/benchmark-v1\n", @@ -33,7 +35,7 @@ "# Results dataset\n", "RESULTS_DATASET = \"smolagents/results\"\n", "# Whether to push the results dataset to the Hub\n", - "PUSH_RESULTS_DATASET_TO_HUB = True" + "PUSH_RESULTS_DATASET_TO_HUB = False" ] }, { @@ -218,7 +220,10 @@ " if push_to_hub_dataset:\n", " ds = datasets.Dataset.from_pandas(df)\n", " config = date\n", - " ds.push_to_hub(push_to_hub_dataset, config_name=config, commit_message=f\"Upload {config} results\")\n", + " ds.push_to_hub(\n", + " push_to_hub_dataset, config_name=config, commit_message=f\"Upload {config} results\", create_pr=True\n", + " )\n", + " # 
ds.push_to_hub(push_to_hub_dataset, config_name=config, commit_message=f\"Upload {config} results\")\n", " return df" ] }, @@ -231,9 +236,1035 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ecca/GitFiles/dec_smolagents/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of answers_subsets 129\n", + "Example of answers_subset Qwen__QwQ-32B__code__gaia\n", + "smolagents/answers Qwen__QwQ-32B__code__gaia\n", + "smolagents/answers Qwen__QwQ-32B__code__math\n", + "smolagents/answers Qwen__QwQ-32B__code__simpleqa\n", + "smolagents/answers Qwen__Qwen2.5-72B-Instruct__code__gaia\n", + "smolagents/answers Qwen__Qwen2.5-72B-Instruct__code__math\n", + "smolagents/answers Qwen__Qwen2.5-72B-Instruct__code__simpleqa\n", + "smolagents/answers Qwen__Qwen2.5-72B-Instruct__vanilla__gaia\n", + "smolagents/answers Qwen__Qwen2.5-72B-Instruct__vanilla__math\n", + "smolagents/answers Qwen__Qwen2.5-72B-Instruct__vanilla__simpleqa\n", + "smolagents/answers Qwen__Qwen2.5-7B-Instruct__vanilla__gaia\n", + "smolagents/answers Qwen__Qwen2.5-7B-Instruct__vanilla__math\n", + "smolagents/answers Qwen__Qwen2.5-7B-Instruct__vanilla__simpleqa\n", + "smolagents/answers Qwen__Qwen2.5-Coder-32B-Instruct__code__gaia\n", + "smolagents/answers Qwen__Qwen2.5-Coder-32B-Instruct__code__math\n", + "smolagents/answers Qwen__Qwen2.5-Coder-32B-Instruct__code__simpleqa\n", + "smolagents/answers Qwen__Qwen2.5-Coder-32B-Instruct__tool-calling__gaia\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing tasks: 1%| | 1/129 [00:09<20:22, 9.55s/it]" + ] + }, + { 
+ "name": "stdout", + "output_type": "stream", + "text": [ + "smolagents/answers Qwen__Qwen2.5-Coder-32B-Instruct__tool-calling__math\n", + "smolagents/answers Qwen__Qwen2.5-Coder-32B-Instruct__tool-calling__simpleqa\n", + "smolagents/answers Qwen__Qwen2.5-Coder-32B-Instruct__vanilla__gaia\n", + "smolagents/answers Qwen__Qwen2.5-Coder-32B-Instruct__vanilla__math\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing tasks: 4%|▍ | 5/129 [00:09<03:03, 1.48s/it]/tmp/ipykernel_121064/3316273531.py:86: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "/tmp/ipykernel_121064/3316273531.py:86: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "Processing tasks: 6%|β–Œ | 8/129 [00:09<01:35, 1.26it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "smolagents/answers Qwen__Qwen2.5-Coder-32B-Instruct__vanilla__simpleqa\n", + "smolagents/answers Qwen__Qwen3-235B-A22B__code__gaia\n", + "smolagents/answers Qwen__Qwen3-235B-A22B__code__math\n", + "smolagents/answers Qwen__Qwen3-235B-A22B__code__simpleqa\n", + "smolagents/answers Qwen__Qwen3-32B__code__gaia\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_121064/3316273531.py:86: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "Processing tasks: 8%|β–Š | 10/129 [00:10<01:09, 1.70it/s]/tmp/ipykernel_121064/3316273531.py:86: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "smolagents/answers 
Qwen__Qwen3-32B__code__math\n", + "smolagents/answers Qwen__Qwen3-32B__code__simpleqa\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing tasks: 9%|β–‰ | 12/129 [00:11<01:02, 1.88it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "smolagents/answers anthropic__claude-3-5-sonnet-latest__code__gaia\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing tasks: 10%|β–ˆ | 13/129 [00:14<02:09, 1.11s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "smolagents/answers anthropic__claude-3-5-sonnet-latest__code__math\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing tasks: 11%|β–ˆ | 14/129 [00:15<02:05, 1.09s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "smolagents/answers anthropic__claude-3-5-sonnet-latest__code__simpleqa\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing tasks: 12%|β–ˆβ– | 15/129 [00:16<01:54, 1.01s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "smolagents/answers anthropic__claude-3-5-sonnet-latest__tool-calling__gaia\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing tasks: 12%|β–ˆβ– | 16/129 [00:16<01:35, 1.18it/s]/tmp/ipykernel_121064/3316273531.py:86: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "Processing tasks: 13%|β–ˆβ–Ž | 17/129 [00:17<01:15, 1.48it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "smolagents/answers anthropic__claude-3-5-sonnet-latest__tool-calling__math\n", + "smolagents/answers anthropic__claude-3-5-sonnet-latest__tool-calling__simpleqa\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_121064/3316273531.py:86: UserWarning: 
Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "Processing tasks: 14%|β–ˆβ– | 18/129 [00:17<01:06, 1.68it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "smolagents/answers anthropic__claude-3-5-sonnet-latest__vanilla__gaia\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing tasks: 15%|β–ˆβ– | 19/129 [00:17<00:56, 1.93it/s]/tmp/ipykernel_121064/3316273531.py:86: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "Processing tasks: 16%|β–ˆβ–‹ | 21/129 [00:17<00:33, 3.18it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "smolagents/answers anthropic__claude-3-5-sonnet-latest__vanilla__math\n", + "smolagents/answers anthropic__claude-3-5-sonnet-latest__vanilla__simpleqa\n", + "smolagents/answers anthropic__claude-3-7-sonnet-20250219__code__gaia\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_121064/3316273531.py:86: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "Processing tasks: 17%|β–ˆβ–‹ | 22/129 [00:18<00:33, 3.22it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "smolagents/answers anthropic__claude-3-7-sonnet-20250219__code__math\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_121064/3316273531.py:86: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "Processing tasks: 19%|β–ˆβ–Š | 24/129 [00:18<00:31, 3.37it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"smolagents/answers anthropic__claude-3-7-sonnet-20250219__code__simpleqa\n", + "smolagents/answers anthropic__claude-3-7-sonnet__code__gaia\n", + "smolagents/answers anthropic__claude-3-7-sonnet__code__math\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing tasks: 20%|β–ˆβ–ˆ | 26/129 [00:19<00:30, 3.35it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "smolagents/answers anthropic__claude-3-7-sonnet__code__simpleqa\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_121064/3316273531.py:86: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "Processing tasks: 21%|β–ˆβ–ˆ | 27/129 [00:22<01:31, 1.11it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "smolagents/answers anthropic__claude-opus-4-20250514__code__gaia\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing tasks: 22%|β–ˆβ–ˆβ– | 28/129 [00:23<01:30, 1.12it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "smolagents/answerssmolagents/answers anthropic__claude-opus-4-20250514__code__simpleqa\n", + " anthropic__claude-opus-4-20250514__code__math\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing tasks: 23%|β–ˆβ–ˆβ–Ž | 30/129 [00:23<01:03, 1.55it/s]/tmp/ipykernel_121064/3316273531.py:86: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "Processing tasks: 24%|β–ˆβ–ˆβ– | 31/129 [00:23<00:48, 2.00it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "smolagents/answers anthropic__claude-sonnet-4-20250514__code__gaia\n", + "smolagents/answers anthropic__claude-sonnet-4-20250514__code__math\n" + ] + }, + { + "name": 
"stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_121064/3316273531.py:86: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "smolagents/answers anthropic__claude-sonnet-4-20250514__code__simpleqa\n", + "smolagents/answers deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__code__gaia\n", + "smolagents/answers deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__code__math\n", + "smolagents/answers deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__code__simpleqa\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing tasks: 27%|β–ˆβ–ˆβ–‹ | 35/129 [00:27<01:10, 1.34it/s]/tmp/ipykernel_121064/3316273531.py:86: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "smolagents/answers deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__tool-calling__gaia\n", + "smolagents/answers deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__tool-calling__math\n", + "smolagents/answers deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__tool-calling__simpleqa\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Generating test split: 0%| | 0/50 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
model_idagent_action_typesourceacc
0Qwen/Qwen2.5-Coder-32B-InstructcodeMATH84.0
1Qwen/Qwen2.5-7B-InstructvanillaMATH38.0
2Qwen/Qwen2.5-Coder-32B-InstructcodeSimpleQA74.0
3Qwen/QwQ-32BcodeSimpleQA74.0
4Qwen/Qwen2.5-72B-InstructvanillaSimpleQA10.0
\n", + "" + ], + "text/plain": [ + " model_id agent_action_type source acc\n", + "0 Qwen/Qwen2.5-Coder-32B-Instruct code MATH 84.0\n", + "1 Qwen/Qwen2.5-7B-Instruct vanilla MATH 38.0\n", + "2 Qwen/Qwen2.5-Coder-32B-Instruct code SimpleQA 74.0\n", + "3 Qwen/QwQ-32B code SimpleQA 74.0\n", + "4 Qwen/Qwen2.5-72B-Instruct vanilla SimpleQA 10.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import datasets\n", "import pandas as pd\n", @@ -253,7 +1284,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -274,18 +1305,800 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agent_action_typemodel_idsourcecodetool-callingtool_callingvanilla
0Qwen/QwQ-32BGAIA37.50NaNNaNNaN
1Qwen/QwQ-32BMATH94.00NaNNaNNaN
2Qwen/QwQ-32BSimpleQA74.00NaNNaNNaN
3Qwen/Qwen2.5-72B-InstructGAIA28.12NaNNaN6.25
4Qwen/Qwen2.5-72B-InstructMATH76.00NaNNaN30.00
5Qwen/Qwen2.5-72B-InstructSimpleQA88.00NaNNaN10.00
6Qwen/Qwen2.5-7B-InstructGAIANaNNaNNaN3.12
7Qwen/Qwen2.5-7B-InstructMATHNaNNaNNaN38.00
8Qwen/Qwen2.5-7B-InstructSimpleQANaNNaNNaN4.00
9Qwen/Qwen2.5-Coder-32B-InstructGAIA21.8818.75NaN3.12
10Qwen/Qwen2.5-Coder-32B-InstructMATH84.0044.00NaN62.00
11Qwen/Qwen2.5-Coder-32B-InstructSimpleQA74.0058.00NaN8.00
12Qwen/Qwen3-235B-A22BGAIA15.62NaNNaNNaN
13Qwen/Qwen3-235B-A22BMATH58.00NaNNaNNaN
14Qwen/Qwen3-235B-A22BSimpleQA76.00NaNNaNNaN
15Qwen/Qwen3-32BGAIA25.00NaNNaNNaN
16Qwen/Qwen3-32BMATH90.00NaNNaNNaN
17Qwen/Qwen3-32BSimpleQA78.00NaNNaNNaN
18anthropic/claude-3-5-sonnet-latestGAIA56.25NaN0.003.12
19anthropic/claude-3-5-sonnet-latestMATH86.00NaN54.0050.00
20anthropic/claude-3-5-sonnet-latestSimpleQA82.00NaN0.0034.00
21anthropic/claude-3-7-sonnet-20250219GAIA50.00NaNNaNNaN
22anthropic/claude-3-7-sonnet-20250219MATH96.00NaNNaNNaN
23anthropic/claude-3-7-sonnet-20250219SimpleQA86.00NaNNaNNaN
24anthropic/claude-opus-4-20250514GAIA59.38NaNNaNNaN
25anthropic/claude-opus-4-20250514MATH98.00NaNNaNNaN
26anthropic/claude-opus-4-20250514SimpleQA94.00NaNNaNNaN
27anthropic/claude-sonnet-4-20250514GAIA56.25NaNNaNNaN
28anthropic/claude-sonnet-4-20250514MATH100.00NaNNaNNaN
29anthropic/claude-sonnet-4-20250514SimpleQA82.00NaNNaNNaN
30deepseek-ai/DeepSeek-R1GAIA50.00NaNNaNNaN
31deepseek-ai/DeepSeek-R1MATH94.00NaNNaNNaN
32deepseek-ai/DeepSeek-R1SimpleQA74.00NaNNaNNaN
33deepseek-ai/DeepSeek-R1-Distill-Qwen-32BGAIA31.2512.50NaNNaN
34deepseek-ai/DeepSeek-R1-Distill-Qwen-32BMATH92.0044.00NaNNaN
35deepseek-ai/DeepSeek-R1-Distill-Qwen-32BSimpleQA40.0048.00NaNNaN
36gpt-4.5-previewGAIA56.25NaNNaN15.62
37gpt-4.5-previewMATH92.00NaNNaN32.00
38gpt-4.5-previewSimpleQA88.00NaNNaN58.00
39gpt-4oGAIA34.38NaN15.623.12
40gpt-4oMATH78.00NaN58.0040.00
41gpt-4oSimpleQA80.00NaN86.006.00
42meta-llama/Llama-3.1-70B-InstructGAIA15.6218.75NaNNaN
43meta-llama/Llama-3.1-70B-InstructMATH70.0016.00NaNNaN
44meta-llama/Llama-3.1-70B-InstructSimpleQA64.0018.00NaNNaN
45meta-llama/Llama-3.1-8B-InstructGAIA0.006.25NaN0.00
46meta-llama/Llama-3.1-8B-InstructMATH14.0012.00NaN18.00
47meta-llama/Llama-3.1-8B-InstructSimpleQA2.0012.00NaN6.00
48meta-llama/Llama-3.2-3B-InstructGAIA3.12NaNNaN0.00
49meta-llama/Llama-3.2-3B-InstructMATH40.00NaNNaN12.00
50meta-llama/Llama-3.2-3B-InstructSimpleQA20.00NaNNaN0.00
51meta-llama/Llama-3.3-70B-InstructGAIA31.25NaNNaN3.12
52meta-llama/Llama-3.3-70B-InstructMATH72.00NaNNaN40.00
53meta-llama/Llama-3.3-70B-InstructSimpleQA78.00NaNNaN12.00
54meta-llama/Llama-4-Maverick-17B-128E-InstructGAIA46.88NaNNaNNaN
55meta-llama/Llama-4-Maverick-17B-128E-InstructMATH88.00NaNNaNNaN
56meta-llama/Llama-4-Maverick-17B-128E-InstructSimpleQA88.00NaNNaNNaN
57meta-llama/Llama-4-Scout-17B-16E-InstructGAIA25.00NaNNaNNaN
58meta-llama/Llama-4-Scout-17B-16E-InstructMATH84.00NaNNaNNaN
59meta-llama/Llama-4-Scout-17B-16E-InstructSimpleQA90.00NaNNaNNaN
60o1GAIA50.00NaNNaN18.75
61o1MATH96.00NaNNaN80.00
62o1SimpleQA84.00NaNNaN34.00
63o3-miniGAIA46.88NaNNaNNaN
64o3-miniMATH98.00NaNNaNNaN
65o3-miniSimpleQA80.00NaNNaNNaN
\n", + "
" + ], + "text/plain": [ + "agent_action_type model_id source \\\n", + "0 Qwen/QwQ-32B GAIA \n", + "1 Qwen/QwQ-32B MATH \n", + "2 Qwen/QwQ-32B SimpleQA \n", + "3 Qwen/Qwen2.5-72B-Instruct GAIA \n", + "4 Qwen/Qwen2.5-72B-Instruct MATH \n", + "5 Qwen/Qwen2.5-72B-Instruct SimpleQA \n", + "6 Qwen/Qwen2.5-7B-Instruct GAIA \n", + "7 Qwen/Qwen2.5-7B-Instruct MATH \n", + "8 Qwen/Qwen2.5-7B-Instruct SimpleQA \n", + "9 Qwen/Qwen2.5-Coder-32B-Instruct GAIA \n", + "10 Qwen/Qwen2.5-Coder-32B-Instruct MATH \n", + "11 Qwen/Qwen2.5-Coder-32B-Instruct SimpleQA \n", + "12 Qwen/Qwen3-235B-A22B GAIA \n", + "13 Qwen/Qwen3-235B-A22B MATH \n", + "14 Qwen/Qwen3-235B-A22B SimpleQA \n", + "15 Qwen/Qwen3-32B GAIA \n", + "16 Qwen/Qwen3-32B MATH \n", + "17 Qwen/Qwen3-32B SimpleQA \n", + "18 anthropic/claude-3-5-sonnet-latest GAIA \n", + "19 anthropic/claude-3-5-sonnet-latest MATH \n", + "20 anthropic/claude-3-5-sonnet-latest SimpleQA \n", + "21 anthropic/claude-3-7-sonnet-20250219 GAIA \n", + "22 anthropic/claude-3-7-sonnet-20250219 MATH \n", + "23 anthropic/claude-3-7-sonnet-20250219 SimpleQA \n", + "24 anthropic/claude-opus-4-20250514 GAIA \n", + "25 anthropic/claude-opus-4-20250514 MATH \n", + "26 anthropic/claude-opus-4-20250514 SimpleQA \n", + "27 anthropic/claude-sonnet-4-20250514 GAIA \n", + "28 anthropic/claude-sonnet-4-20250514 MATH \n", + "29 anthropic/claude-sonnet-4-20250514 SimpleQA \n", + "30 deepseek-ai/DeepSeek-R1 GAIA \n", + "31 deepseek-ai/DeepSeek-R1 MATH \n", + "32 deepseek-ai/DeepSeek-R1 SimpleQA \n", + "33 deepseek-ai/DeepSeek-R1-Distill-Qwen-32B GAIA \n", + "34 deepseek-ai/DeepSeek-R1-Distill-Qwen-32B MATH \n", + "35 deepseek-ai/DeepSeek-R1-Distill-Qwen-32B SimpleQA \n", + "36 gpt-4.5-preview GAIA \n", + "37 gpt-4.5-preview MATH \n", + "38 gpt-4.5-preview SimpleQA \n", + "39 gpt-4o GAIA \n", + "40 gpt-4o MATH \n", + "41 gpt-4o SimpleQA \n", + "42 meta-llama/Llama-3.1-70B-Instruct GAIA \n", + "43 meta-llama/Llama-3.1-70B-Instruct MATH \n", + "44 
meta-llama/Llama-3.1-70B-Instruct SimpleQA \n", + "45 meta-llama/Llama-3.1-8B-Instruct GAIA \n", + "46 meta-llama/Llama-3.1-8B-Instruct MATH \n", + "47 meta-llama/Llama-3.1-8B-Instruct SimpleQA \n", + "48 meta-llama/Llama-3.2-3B-Instruct GAIA \n", + "49 meta-llama/Llama-3.2-3B-Instruct MATH \n", + "50 meta-llama/Llama-3.2-3B-Instruct SimpleQA \n", + "51 meta-llama/Llama-3.3-70B-Instruct GAIA \n", + "52 meta-llama/Llama-3.3-70B-Instruct MATH \n", + "53 meta-llama/Llama-3.3-70B-Instruct SimpleQA \n", + "54 meta-llama/Llama-4-Maverick-17B-128E-Instruct GAIA \n", + "55 meta-llama/Llama-4-Maverick-17B-128E-Instruct MATH \n", + "56 meta-llama/Llama-4-Maverick-17B-128E-Instruct SimpleQA \n", + "57 meta-llama/Llama-4-Scout-17B-16E-Instruct GAIA \n", + "58 meta-llama/Llama-4-Scout-17B-16E-Instruct MATH \n", + "59 meta-llama/Llama-4-Scout-17B-16E-Instruct SimpleQA \n", + "60 o1 GAIA \n", + "61 o1 MATH \n", + "62 o1 SimpleQA \n", + "63 o3-mini GAIA \n", + "64 o3-mini MATH \n", + "65 o3-mini SimpleQA \n", + "\n", + "agent_action_type code tool-calling tool_calling vanilla \n", + "0 37.50 NaN NaN NaN \n", + "1 94.00 NaN NaN NaN \n", + "2 74.00 NaN NaN NaN \n", + "3 28.12 NaN NaN 6.25 \n", + "4 76.00 NaN NaN 30.00 \n", + "5 88.00 NaN NaN 10.00 \n", + "6 NaN NaN NaN 3.12 \n", + "7 NaN NaN NaN 38.00 \n", + "8 NaN NaN NaN 4.00 \n", + "9 21.88 18.75 NaN 3.12 \n", + "10 84.00 44.00 NaN 62.00 \n", + "11 74.00 58.00 NaN 8.00 \n", + "12 15.62 NaN NaN NaN \n", + "13 58.00 NaN NaN NaN \n", + "14 76.00 NaN NaN NaN \n", + "15 25.00 NaN NaN NaN \n", + "16 90.00 NaN NaN NaN \n", + "17 78.00 NaN NaN NaN \n", + "18 56.25 NaN 0.00 3.12 \n", + "19 86.00 NaN 54.00 50.00 \n", + "20 82.00 NaN 0.00 34.00 \n", + "21 50.00 NaN NaN NaN \n", + "22 96.00 NaN NaN NaN \n", + "23 86.00 NaN NaN NaN \n", + "24 59.38 NaN NaN NaN \n", + "25 98.00 NaN NaN NaN \n", + "26 94.00 NaN NaN NaN \n", + "27 56.25 NaN NaN NaN \n", + "28 100.00 NaN NaN NaN \n", + "29 82.00 NaN NaN NaN \n", + "30 50.00 NaN NaN NaN \n", + "31 
94.00 NaN NaN NaN \n", + "32 74.00 NaN NaN NaN \n", + "33 31.25 12.50 NaN NaN \n", + "34 92.00 44.00 NaN NaN \n", + "35 40.00 48.00 NaN NaN \n", + "36 56.25 NaN NaN 15.62 \n", + "37 92.00 NaN NaN 32.00 \n", + "38 88.00 NaN NaN 58.00 \n", + "39 34.38 NaN 15.62 3.12 \n", + "40 78.00 NaN 58.00 40.00 \n", + "41 80.00 NaN 86.00 6.00 \n", + "42 15.62 18.75 NaN NaN \n", + "43 70.00 16.00 NaN NaN \n", + "44 64.00 18.00 NaN NaN \n", + "45 0.00 6.25 NaN 0.00 \n", + "46 14.00 12.00 NaN 18.00 \n", + "47 2.00 12.00 NaN 6.00 \n", + "48 3.12 NaN NaN 0.00 \n", + "49 40.00 NaN NaN 12.00 \n", + "50 20.00 NaN NaN 0.00 \n", + "51 31.25 NaN NaN 3.12 \n", + "52 72.00 NaN NaN 40.00 \n", + "53 78.00 NaN NaN 12.00 \n", + "54 46.88 NaN NaN NaN \n", + "55 88.00 NaN NaN NaN \n", + "56 88.00 NaN NaN NaN \n", + "57 25.00 NaN NaN NaN \n", + "58 84.00 NaN NaN NaN \n", + "59 90.00 NaN NaN NaN \n", + "60 50.00 NaN NaN 18.75 \n", + "61 96.00 NaN NaN 80.00 \n", + "62 84.00 NaN NaN 34.00 \n", + "63 46.88 NaN NaN NaN \n", + "64 98.00 NaN NaN NaN \n", + "65 80.00 NaN NaN NaN " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ + "pd.set_option(\"display.max_rows\", None)\n", "display(pivot_df)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import matplotlib.pyplot as plt\n", "from matplotlib.legend_handler import HandlerTuple # Added import\n", @@ -297,12 +2110,12 @@ "\n", "# Create figure and axis\n", "plt.style.use(\"seaborn-v0_8-white\")\n", - "fig, ax = plt.subplots(figsize=(15, 6))\n", + "fig, ax = plt.subplots(figsize=(38, 15))\n", "\n", "# Set the width of each bar group and positions of the bars\n", "width = 0.15 # width of each bar\n", "spacing = 0.02 # space between bars within a group\n", - "group_spacing = 0.2 # space between model groups\n", + "group_spacing = 1 # space between model groups\n", "\n", "# Calculate positions for the bars\n", "num_sources = len(sources)\n", @@ -367,19 +2180,88 @@ " loc=\"upper left\",\n", ")\n", "\n", + "for text in custom_legend.get_texts():\n", + " current_size = text.get_fontsize()\n", + " text.set_fontsize(current_size * 2)\n", + "\n", "ax.yaxis.grid(True, linestyle=\"--\", alpha=0.3)\n", "ax.set_ylim(bottom=0)\n", "plt.tight_layout()\n", "ax.spines[\"top\"].set_visible(False)\n", "ax.spines[\"right\"].set_visible(False)\n", "\n", + "plt.savefig(\"model_performance_comparison.png\", bbox_inches=\"tight\", dpi=300)\n", "plt.show()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Todos\n", + "\n", + "- Run one model on the benchmark, as a sanity check.\n", + "- Run les modΓ¨les de base sur Gaia\n", + "- Check agents id dans le systΓ¨me prompt.\n", + "\n", + "- Run les diffΓ©rents benchmarks et regarder les traces. 
Voir si jamais erreurs d'implémentation.\n", + "\n", + "\n", + "### Done\n", + "\n", + "- Check timeout tools, the first 200 Fibonacci numbers (timeout in the interpreter) ; #OK : Timeout at the trajectory level (kill at more than 5 minutes) #OK - Implemented in the run.py function + in the local_python_executor.py file.\n", + "- Make the decentralization in the tools.py rather than in agents.py\n", + "- Check if they correctly return the final answer (is the final_answer check still well implemented, or is it broken? If broken, reimplement it. For instance, ask to make a tag or a majority vote or global consensus).\n", + "- Check if they have one tool to show the traces. If not, create one - We decided to use langfuse as a visualisation tool of the trace (json/html format)\n", + "\n", + "\n", + "\n", + "### Next steps\n", + "- debug the tests noted in installation.md\n", + "- Update make tests\n", + "- Create a new score.ipynb\n", + "- Integrate trl\n", + "- Réfléchir sur les méthodes de final answer : avoir en tête que l'on veut avoir plusieurs façons de le faire.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Meeting of Michal W (July 11th 2025)\n", + "\n", + "This dynamic batching for vllm should be integrated in TRL. So far, not much info.\n", + "\n", + "Dynamic batching = run several agents and try to batch their code. The issue is when they send it to the API. One of the issues could be that when we send the old batch, one of the old ...\n", + "\n", + "Check the score.ipynb, but with the new model (maybe only a part of this to save time, just for a sanity check)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Web search tools used:\n", + "\n", + "The repository uses several implementations of a web_search tool:\n", + "\n", + " - DuckDuckGoSearchTool – relies on the duckduckgo_search library.\n", + "\n", + " - GoogleSearchTool – queries Google via SerpAPI or Serper using requests and API keys.\n", + "\n", + " - ApiWebSearchTool – performs searches using Brave's API.\n", + "\n", + " - WebSearchTool – a general tool that can scrape DuckDuckGo or Bing results with plain HTTP requests.\n", + "\n", + "These tools provide markdown-formatted search results for agents in the project." + ] } ], "metadata": { "kernelspec": { - "display_name": "agents", + "display_name": "dec_smolagents", "language": "python", "name": "python3" }, @@ -393,7 +2275,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.0" + "version": "3.12.11" } }, "nbformat": 4, diff --git a/installation.md b/installation.md new file mode 100644 index 000000000..bb82e9f6c --- /dev/null +++ b/installation.md @@ -0,0 +1,42 @@ +# Installation of smolagents + +1. Install vllm + 1. Install uv (see https://docs.astral.sh/uv/getting-started/installation/) + 2. Run + ```bash + uv venv --python 3.12 --seed + source .venv/bin/activate + uv pip install vllm --torch-backend=auto + ``` + +2. Install other Python packages: + +```bash +uv pip install flit +uv pip install . +flit install + +uv pip install -e "smolagents[dev] @ ." +``` + +3. Pass your Hugging Face key as an environment variable: run `export HF_TOKEN=your_huggingface_api_key_here` +You can also set Mistral AI API keys. For convenience, put all the keys that you need in a .env file. For instance: + +```bash +LANGFUSE_PUBLIC_KEY= +LANGFUSE_SECRET_KEY= +LANGFUSE_HOST="https://cloud.langfuse.com" +HF_TOKEN= +``` + +4. 
Run make and the test suite: +```bash +make +make test +``` + +You can easily check that the code works by running: +```bash +smolagent "Compute 5 + 3" --num-agents 2 --tools python_interpreter --imports langfuse +python examples/decentralized_smolagents_benchmark/run.py --model-id gpt-4o --agent-action-type vanilla # agent-action-type code or tool-calling requires a SERPER API key +``` diff --git a/pyproject.toml b/pyproject.toml index 05577220b..a193109d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,8 +4,11 @@ build-backend = "setuptools.build_meta" [project] name = "smolagents" + version = "1.23.0.dev0" + description = "🤗 smolagents: a barebones library for agents. Agents write python code to call tools or orchestrate other agents." + authors = [ { name="Aymeric Roucher", email="aymeric@hf.co" }, ] @@ -16,8 +19,8 @@ dependencies = [ "requests>=2.32.3", "rich>=13.9.4", "jinja2>=3.1.4", - "pillow>=10.0.1", - # Security fix for CVE-2023-4863: https://pillow.readthedocs.io/en/stable/releasenotes/10.0.1.html + "pillow>=10.0.1", # Security fix for CVE-2023-4863: https://pillow.readthedocs.io/en/stable/releasenotes/10.0.1.html + "multiprocess", "python-dotenv", ] @@ -63,6 +66,7 @@ openai = [ "openai>=1.58.1" ] telemetry = [ + "langfuse>=2.0.0", "arize-phoenix", "opentelemetry-sdk", "opentelemetry-exporter-otlp", diff --git a/tests/test_local_python_executor.py b/tests/test_local_python_executor.py index d9da9e78c..7f59a3916 100644 --- a/tests/test_local_python_executor.py +++ b/tests/test_local_python_executor.py @@ -48,8 +48,9 @@ def add_two(x): class TestEvaluatePythonCode: def assertDictEqualNoPrint(self, dict1, dict2): - assert {k: v for k, v in dict1.items() if k != "_print_outputs"} == { - k: v for k, v in dict2.items() if k != "_print_outputs" + exclude_keys = ["_print_outputs", "_start_time", "_timeout"] + assert {k: v for k, v in dict1.items() if k not in exclude_keys} == { + k: v for k, v in dict2.items() if k not in exclude_keys } def 
test_evaluate_assign(self):