Skip to content

Commit

Permalink
feat: Internet Agent using Firecrawl SaaS (#735)
Browse files Browse the repository at this point in the history
* WIP

* CI対応

* CI対応

* discriminatedの対応

* model_validate

* CIのビルドエラー修正

* レビューコメント反映

* CIエラー修正

* UIのインデント調整

* reformat

* delete_secret_managerの実施場所変更.

* search_engineがfirecrawlなのに、firecrawl_configが未入力の場合はエラーにする

* apikeyのvalidation

* Refactor API key handling and update tool models in the bot framework

* Rename delete_secret_manager to delete_api_key_from_secret_manager

* fix: raise error when failed for internet search tool

* feat: enhance Firecrawl integration with improved validation and legacy tool handling

* chore: lint

---------

Co-authored-by: statefb <[email protected]>
  • Loading branch information
fsatsuki and statefb authored Feb 28, 2025
1 parent 57c55b4 commit 15bdb8e
Show file tree
Hide file tree
Showing 27 changed files with 1,983 additions and 573 deletions.
133 changes: 117 additions & 16 deletions backend/app/agents/tools/internet_search.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
import logging

from app.agents.tools.agent_tool import AgentTool
from app.repositories.models.custom_bot import BotModel
from app.repositories.models.custom_bot import BotModel, InternetToolModel
from app.routes.schemas.conversation import type_model_name
from duckduckgo_search import DDGS
from firecrawl.firecrawl import FirecrawlApp
from pydantic import BaseModel, Field, root_validator

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class InternetSearchInput(BaseModel):
query: str = Field(description="The query to search for on the internet.")
Expand Down Expand Up @@ -33,38 +39,133 @@ def validate_country(cls, values):
return values


def internet_search(
tool_input: InternetSearchInput, bot: BotModel | None, model: type_model_name | None
) -> list:
query = tool_input.query
time_limit = tool_input.time_limit
country = tool_input.country

def _search_with_duckduckgo(query: str, time_limit: str, country: str) -> list:
REGION = country
SAFE_SEARCH = "moderate"
MAX_RESULTS = 20
BACKEND = "api"
logger.info(
f"Executing DuckDuckGo search with query: {query}, region: {REGION}, time_limit: {time_limit}"
)
with DDGS() as ddgs:
return [
{
"content": result["body"],
"source_name": result["title"],
"source_link": result["href"],
}
for result in ddgs.text(
results = list(
ddgs.text(
keywords=query,
region=REGION,
safesearch=SAFE_SEARCH,
timelimit=time_limit,
max_results=MAX_RESULTS,
backend=BACKEND,
)
)
logger.info(f"DuckDuckGo search completed. Found {len(results)} results")
return [
{
"content": result["body"],
"source_name": result["title"],
"source_link": result["href"],
}
for result in results
]


def _search_with_firecrawl(
    query: str, api_key: str, country: str, max_results: int = 10
) -> list:
    """Search the web through Firecrawl and normalize the results.

    Args:
        query: Search query text.
        api_key: Firecrawl API key used to construct the client.
        country: Value forwarded to Firecrawl as the ``lang`` search parameter.
        max_results: Maximum number of results to request (``limit``).

    Returns:
        A list of dicts with ``content``, ``source_name`` and ``source_link``
        keys. Empty when Firecrawl returns a falsy response.

    Raises:
        Exception: Any error raised while calling Firecrawl or formatting its
            response is logged with its traceback and re-raised unchanged.
    """
    logger.info(f"Searching with Firecrawl. Query: {query}, Max Results: {max_results}")

    try:
        app = FirecrawlApp(api_key=api_key)

        # Search using Firecrawl
        # SearchParams: https://github.com/mendableai/firecrawl/blob/main/apps/python-sdk/firecrawl/firecrawl.py#L24
        # NOTE(review): `country` is passed as Firecrawl's `lang`; confirm the
        # caller's value is in a format Firecrawl accepts for that parameter.
        results = app.search(
            query,
            {
                "limit": max_results,
                "lang": country,
                "scrapeOptions": {"formats": ["markdown"], "onlyMainContent": True},
            },
        )

        if not results:
            logger.warning("No results found")
            return []
        logger.info(f"results of firecrawl: {results}")

        # Format search results. Missing text falls back to an empty string so
        # "content" is always a str, and `or` guards cover keys that are
        # present but set to None.
        search_results = [
            {
                "content": data.get("markdown", ""),
                "source_name": data.get("title", ""),
                "source_link": (data.get("metadata") or {}).get("sourceURL", ""),
            }
            for data in (results.get("data") or [])
            if isinstance(data, dict)
        ]

        logger.info(f"Found {len(search_results)} results from Firecrawl")
        return search_results

    except Exception as e:
        # logger.exception records the traceback; the bare `raise` re-raises
        # the active exception as-is for the caller to handle.
        logger.exception(f"Error searching with Firecrawl: {e}")
        raise


def _internet_search(
    tool_input: InternetSearchInput, bot: BotModel | None, model: type_model_name | None
) -> list:
    """Run an internet search with the engine configured on the bot.

    DuckDuckGo is used when there is no bot, when the bot has no internet
    tool, when the tool selects DuckDuckGo, or when the configured engine is
    not recognized. Firecrawl is used only when explicitly selected.

    Args:
        tool_input: Validated tool input carrying query, time limit and country.
        bot: Bot whose agent tools decide the search engine, or ``None``.
        model: Model name passed by the tool framework; not used here.

    Returns:
        A list of result dicts produced by the selected search backend.

    Raises:
        ValueError: If Firecrawl is selected but its configuration or API key
            is missing.
        Exception: Any error from the Firecrawl search is logged and re-raised.
    """
    query, time_limit, country = (
        tool_input.query,
        tool_input.time_limit,
        tool_input.country,
    )

    logger.info(
        f"Internet search request - Query: {query}, Time Limit: {time_limit}, Country: {country}"
    )

    if bot is None:
        logger.warning("Bot is None, defaulting to DuckDuckGo search")
        return _search_with_duckduckgo(query, time_limit, country)

    # Pick the first internet search tool registered on the bot, if any.
    internet_tool = None
    for candidate in bot.agent.tools:
        if isinstance(candidate, InternetToolModel):
            internet_tool = candidate
            break

    if not internet_tool or internet_tool.search_engine == "duckduckgo":
        logger.info("No internet tool found or search engine is DuckDuckGo")
        return _search_with_duckduckgo(query, time_limit, country)

    if internet_tool.search_engine != "firecrawl":
        # Unknown engine value: fall back rather than fail.
        logger.warning(
            "Unexpected search engine configuration, falling back to DuckDuckGo"
        )
        return _search_with_duckduckgo(query, time_limit, country)

    firecrawl_config = internet_tool.firecrawl_config
    if not firecrawl_config:
        raise ValueError("Firecrawl configuration is not set in the bot.")

    try:
        api_key = firecrawl_config.api_key
        if not api_key:
            raise ValueError("Firecrawl API key is empty")

        return _search_with_firecrawl(
            query=query,
            api_key=api_key,
            country=country,
            max_results=firecrawl_config.max_results,
        )
    except Exception as e:
        logger.error(f"Error with Firecrawl search: {e}")
        raise e


internet_search_tool = AgentTool(
name="internet_search",
description="Search the internet for information.",
args_schema=InternetSearchInput,
function=internet_search,
function=_internet_search,
)
2 changes: 2 additions & 0 deletions backend/app/bot_remove.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
find_usage_plan_by_id,
)
from app.repositories.common import RecordNotFoundError, decompose_bot_id
from app.utils import delete_api_key_from_secret_manager

DOCUMENT_BUCKET = os.environ.get("DOCUMENT_BUCKET", "documents")
BEDROCK_REGION = os.environ.get("BEDROCK_REGION", "us-east-1")
Expand Down Expand Up @@ -75,6 +76,7 @@ def handler(event: dict, context: Any) -> None:

delete_from_s3(user_id, bot_id)
delete_custom_bot_stack_by_bot_id(bot_id)
delete_api_key_from_secret_manager(user_id, bot_id, "firecrawl")

# Check if api published stack exists
try:
Expand Down
9 changes: 5 additions & 4 deletions backend/app/repositories/custom_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json
import logging
import os
from typing import Union
from datetime import datetime
from decimal import Decimal as decimal
from functools import partial
Expand Down Expand Up @@ -34,7 +35,7 @@
)
from app.repositories.models.custom_bot_guardrails import BedrockGuardrailsModel
from app.repositories.models.custom_bot_kb import BedrockKnowledgeBaseModel
from app.routes.schemas.bot import BotMetaOutput, type_sync_status
from app.routes.schemas.bot import type_sync_status
from app.utils import get_current_time
from boto3.dynamodb.conditions import Attr, Key
from botocore.exceptions import ClientError
Expand All @@ -49,7 +50,7 @@
)

logger = logging.getLogger(__name__)
sts_client = boto3.client("sts")
logger.setLevel("INFO")


class BotNotFoundException(Exception):
Expand Down Expand Up @@ -449,7 +450,7 @@ def find_private_bot_by_id(user_id: str, bot_id: str) -> BotModel:
else DEFAULT_GENERATION_CONFIG
),
agent=(
AgentModel(**item["AgentData"])
AgentModel.model_validate(item["AgentData"])
if "AgentData" in item
else AgentModel(tools=[])
),
Expand Down Expand Up @@ -531,7 +532,7 @@ def find_public_bot_by_id(bot_id: str) -> BotModel:
else DEFAULT_GENERATION_CONFIG
),
agent=(
AgentModel(**item["AgentData"])
AgentModel.model_validate(item["AgentData"])
if "AgentData" in item
else AgentModel(tools=[])
),
Expand Down
8 changes: 8 additions & 0 deletions backend/app/repositories/models/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ def decode_base64_string(value: Any) -> bytes:
),
]

# String type that always serializes to an empty string, so the real value
# is never written out when a model containing it is serialized.
# Because the value is dropped on serialization, code that deserializes such a
# model must fetch the actual value from a secret store such as Secrets Manager.
SecureString = Annotated[
str,
PlainSerializer(lambda v: "", return_type=str),
]


class DynamicBaseModel(BaseModel):
model_config = ConfigDict(extra="allow")
Loading

0 comments on commit 15bdb8e

Please sign in to comment.