From b3139dc3412efbd119428571f3e93cb4a8f14245 Mon Sep 17 00:00:00 2001
From: Vikrant-Khedkar
Date: Sat, 9 Aug 2025 13:33:03 +0530
Subject: [PATCH 1/2] Refactor web crawling methods in server.py to use
 SmartCrawler terminology. Update method signatures and documentation to
 clarify the AI extraction and markdown conversion modes.

---
 src/scrapegraph_mcp/server.py | 112 +++++++++++++++++++---------------
 1 file changed, 62 insertions(+), 50 deletions(-)

diff --git a/src/scrapegraph_mcp/server.py b/src/scrapegraph_mcp/server.py
index 42e638c..2917598 100644
--- a/src/scrapegraph_mcp/server.py
+++ b/src/scrapegraph_mcp/server.py
@@ -5,8 +5,8 @@
 - markdownify: Convert any webpage into clean, formatted markdown
 - smartscraper: Extract structured data from any webpage using AI
 - searchscraper: Perform AI-powered web searches with structured results
-- crawl_requester: Initiate intelligent web crawling requests (step 1)
-- crawl_fetcher: Fetch results from crawling requests (step 2)
+- smartcrawler_initiate: Initiate intelligent multi-page web crawling with AI extraction or markdown conversion
+- smartcrawler_fetch_results: Retrieve results from asynchronous crawling operations
 """
 
 import os
@@ -126,49 +126,54 @@ def searchscraper(self, user_prompt: str, num_results: int = None, number_of_scr
 
         return response.json()
 
-    def crawl_requester(
+    def smartcrawler_initiate(
         self,
         url: str,
         prompt: str = None,
-        cache_website: bool = None,
+        extraction_mode: str = "ai",
         depth: int = None,
         max_pages: int = None,
-        same_domain_only: bool = None,
-        markdown_only: bool = None
+        same_domain_only: bool = None
     ) -> Dict[str, Any]:
         """
-        Initiate a web crawling request and get a request ID.
+        Initiate a SmartCrawler request for multi-page web crawling.
+
+        SmartCrawler supports two modes:
+        - AI Extraction Mode (10 credits per page): Extracts structured data based on your prompt
+        - Markdown Conversion Mode (2 credits per page): Converts pages to clean markdown
+
+        SmartCrawler processes the request asynchronously and returns a request ID.
+        Pass that ID to smartcrawler_fetch_results and keep polling until the
+        status is "completed".
+
         Args:
             url: Starting URL to crawl
-            prompt: AI prompt for data extraction (optional, if not provided returns markdown only)
-            cache_website: Whether to cache the website content (optional)
-            depth: Maximum crawling depth (optional)
+            prompt: AI prompt for data extraction (required for AI mode)
+            extraction_mode: "ai" for AI extraction or "markdown" for markdown conversion (default: "ai")
+            depth: Maximum link traversal depth (optional)
             max_pages: Maximum number of pages to crawl (optional)
             same_domain_only: Whether to crawl only within the same domain (optional)
-            markdown_only: Whether to return only markdown content without AI processing (optional)
 
         Returns:
-            Dictionary containing the request ID and status
+            Dictionary containing the request ID for async processing
         """
-        endpoint = f"{self.BASE_URL}/crawl/requester"
+        endpoint = f"{self.BASE_URL}/crawl"
         data = {
             "url": url
         }
 
-        # Add optional parameters if provided
-        if prompt is not None:
+        # Handle extraction mode
+        if extraction_mode == "markdown":
+            data["markdown_only"] = True
+        elif prompt is not None:
             data["prompt"] = prompt
-        if cache_website is not None:
-            data["cache_website"] = cache_website
         if depth is not None:
             data["depth"] = depth
         if max_pages is not None:
             data["max_pages"] = max_pages
         if same_domain_only is not None:
             data["same_domain_only"] = same_domain_only
-        if markdown_only is not None:
-            data["markdown_only"] = markdown_only
 
         response = self.client.post(endpoint, headers=self.headers, json=data)
 
@@ -178,22 +183,27 @@ def crawl_requester(
 
         return response.json()
 
-    def crawl_fetcher(self, request_id: str) -> Dict[str, Any]:
+    def smartcrawler_fetch_results(self, request_id: str) -> Dict[str, Any]:
         """
-        Fetch the results of a crawling request using the request ID.
+        Fetch the results of a SmartCrawler operation.
 
         Args:
-            request_id: The request ID returned by crawl_requester
+            request_id: The request ID returned by smartcrawler_initiate
 
         Returns:
-            Dictionary containing the crawl results or status
+            Dictionary containing the crawled data (structured extraction or markdown)
+            and metadata about processed pages
+
+        Note:
+            Processing takes time. While the crawl is still running, this
+            method returns the current status instead of results; keep polling
+            until the status is "completed".
         """
-        endpoint = f"{self.BASE_URL}/crawl/fetcher"
-        data = {
-            "request_id": request_id
-        }
-
-        response = self.client.post(endpoint, headers=self.headers, json=data)
+        endpoint = f"{self.BASE_URL}/crawl/{request_id}"
+
+        response = self.client.get(endpoint, headers=self.headers)
 
         if response.status_code != 200:
             error_msg = f"Error {response.status_code}: {response.text}"
@@ -291,66 +301,68 @@ def searchscraper(
         return {"error": str(e)}
 
 
-# Add tool for crawl requester (smartcrawler step 1)
+# Add tool for SmartCrawler initiation
@mcp.tool()
-def crawl_requester(
+def smartcrawler_initiate(
     url: str,
     prompt: str = None,
-    cache_website: bool = None,
+    extraction_mode: str = "ai",
     depth: int = None,
     max_pages: int = None,
-    same_domain_only: bool = None,
-    markdown_only: bool = None
+    same_domain_only: bool = None
 ) -> Dict[str, Any]:
     """
-    Initiate a web crawling request and get a request ID.
+    Initiate a SmartCrawler request for intelligent multi-page web crawling.
+
+    SmartCrawler supports two modes:
+    - AI Extraction Mode (10 credits per page): Extracts structured data based on your prompt
+    - Markdown Conversion Mode (2 credits per page): Converts pages to clean markdown
 
     Args:
         url: Starting URL to crawl
-        prompt: AI prompt for data extraction (optional, if not provided returns markdown only)
-        cache_website: Whether to cache the website content (optional)
-        depth: Maximum crawling depth (optional)
+        prompt: AI prompt for data extraction (required for AI mode)
+        extraction_mode: "ai" for AI extraction or "markdown" for markdown conversion (default: "ai")
+        depth: Maximum link traversal depth (optional)
         max_pages: Maximum number of pages to crawl (optional)
         same_domain_only: Whether to crawl only within the same domain (optional)
-        markdown_only: Whether to return only markdown content without AI processing (optional)
 
     Returns:
-        Dictionary containing the request ID and status
+        Dictionary containing the request ID for async processing
     """
     if scrapegraph_client is None:
         return {"error": "ScapeGraph client not initialized. Please provide an API key."}
 
     try:
-        return scrapegraph_client.crawl_requester(
+        return scrapegraph_client.smartcrawler_initiate(
             url=url,
             prompt=prompt,
-            cache_website=cache_website,
+            extraction_mode=extraction_mode,
             depth=depth,
             max_pages=max_pages,
-            same_domain_only=same_domain_only,
-            markdown_only=markdown_only
+            same_domain_only=same_domain_only
         )
     except Exception as e:
         return {"error": str(e)}
 
 
-# Add tool for crawl fetcher (smartcrawler step 2)
+# Add tool for fetching SmartCrawler results
 @mcp.tool()
-def crawl_fetcher(request_id: str) -> Dict[str, Any]:
+def smartcrawler_fetch_results(request_id: str) -> Dict[str, Any]:
     """
-    Fetch the results of a crawling request using the request ID.
+    Fetch the results of a SmartCrawler operation.
 
     Args:
-        request_id: The request ID returned by crawl_requester
+        request_id: The request ID returned by smartcrawler_initiate
 
     Returns:
-        Dictionary containing the crawl results or status
+        Dictionary containing the crawled data (structured extraction or markdown)
+        and metadata about processed pages
     """
     if scrapegraph_client is None:
         return {"error": "ScapeGraph client not initialized. Please provide an API key."}
 
     try:
-        return scrapegraph_client.crawl_fetcher(request_id)
+        return scrapegraph_client.smartcrawler_fetch_results(request_id)
     except Exception as e:
         return {"error": str(e)}

From 54b330da981dc392a6f4d92fa02d745dfc43b700 Mon Sep 17 00:00:00 2001
From: Vikrant-Khedkar
Date: Mon, 11 Aug 2025 12:42:47 +0530
Subject: [PATCH 2/2] Enhance error handling in ScapeGraphClient for
 extraction modes. Ensure prompt is required for 'ai' mode and validate
 extraction_mode input.

---
 src/scrapegraph_mcp/server.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/scrapegraph_mcp/server.py b/src/scrapegraph_mcp/server.py
index 2917598..c250074 100644
--- a/src/scrapegraph_mcp/server.py
+++ b/src/scrapegraph_mcp/server.py
@@ -166,8 +166,12 @@ def smartcrawler_initiate(
         # Handle extraction mode
         if extraction_mode == "markdown":
             data["markdown_only"] = True
-        elif prompt is not None:
+        elif extraction_mode == "ai":
+            if prompt is None:
+                raise ValueError("prompt is required when extraction_mode is 'ai'")
             data["prompt"] = prompt
+        else:
+            raise ValueError(f"Invalid extraction_mode: {extraction_mode}. Must be 'ai' or 'markdown'")
         if depth is not None:
             data["depth"] = depth
         if max_pages is not None:
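For review context, here is a minimal sketch of how a caller might drive the two new methods end to end under this patch series: initiate a crawl, then poll until the status is "completed". The constructor signature, the example URL, and the "request_id" response key are illustrative assumptions, not something this diff confirms.

```python
import time

from scrapegraph_mcp.server import ScapeGraphClient  # module path per this patch

# Assumption: the client is constructed with an API key; adjust to the
# actual constructor if it differs.
client = ScapeGraphClient(api_key="sgai-...")

# Step 1: start an asynchronous crawl. Markdown mode costs 2 credits/page
# and needs no prompt; AI mode (the default) requires one.
job = client.smartcrawler_initiate(
    url="https://example.com",   # illustrative starting URL
    extraction_mode="markdown",
    depth=2,
    max_pages=5,
    same_domain_only=True,
)
request_id = job["request_id"]   # assumed response key for the request ID

# Step 2: poll until the crawl reports "completed".
while True:
    result = client.smartcrawler_fetch_results(request_id)
    if result.get("status") == "completed":
        break
    time.sleep(5)  # simple fixed-interval polling

print(result)
```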