@@ -1,5 +1,5 @@
 """
-Research_web module
+research_web module
 """
 
 import re
@@ -12,68 +12,161 @@
 
 
 def search_on_web(
-    query: str, search_engine: str = "Google", max_results: int = 10, port: int = 8080
+    query: str,
+    search_engine: str = "Google",
+    max_results: int = 10,
+    port: int = 8080,
+    timeout: int = 10,
+    proxy: str | dict | None = None,
+    serper_api_key: str | None = None,
 ) -> List[str]:
-    """
-    Searches the web for a given query using specified search engine options.
+    """Search the web for a query using the specified engine and return result URLs."""
 
-    Args:
-        query (str): The search query to find on the internet.
-        search_engine (str, optional): Specifies the search engine to use,
-        options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
-        max_results (int, optional): The maximum number of search results to return.
-        port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
+    # Input validation
+    if not query or not isinstance(query, str):
+        raise ValueError("Query must be a non-empty string")
 
-    Returns:
-        List[str]: A list of URLs as strings that are the search results.
+    search_engine = search_engine.lower()
+    valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"}
+    if search_engine not in valid_engines:
+        raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")
 
-    Raises:
-        ValueError: If the search engine specified is not supported.
+    # Format proxy once
+    formatted_proxy = None
+    if proxy:
+        formatted_proxy = format_proxy(proxy)
 
-    Example:
-        >>> search_on_web("example query", search_engine="Google", max_results=5)
-        ['http://example.com', 'http://example.org', ...]
-    """
+    try:
+        results = []
+        if search_engine == "google":
+            results = list(
+                google_search(query, num_results=max_results, proxy=formatted_proxy)
+            )
+
+        elif search_engine == "duckduckgo":
+            research = DuckDuckGoSearchResults(max_results=max_results)
+            res = research.run(query)
+            # Cap regex-extracted URLs at max_results, as the old code did
+            results = re.findall(r"https?://[^\s,\]]+", res)[:max_results]
+
+        elif search_engine == "bing":
+            results = _search_bing(query, max_results, timeout, formatted_proxy)
+
+        elif search_engine == "searxng":
+            results = _search_searxng(query, max_results, port, timeout)
+
+        elif search_engine == "serper":
+            results = _search_serper(query, max_results, serper_api_key, timeout)
+
+        return filter_pdf_links(results)
+
+    except requests.Timeout:
+        raise TimeoutError(f"Search request timed out after {timeout} seconds")
+    except requests.RequestException as e:
+        raise RuntimeError(f"Search request failed: {str(e)}")
+
+
+def _search_bing(
+    query: str, max_results: int, timeout: int, proxy: str | None = None
+) -> List[str]:
+    """Helper function for Bing search"""
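+    # A browser-like User-Agent is assumed to be needed here; Bing may serve
+    # different (or no) result markup to the default requests agent.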
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    }
+    search_url = "https://www.bing.com/search"
+
+    proxies = {"http": proxy, "https": proxy} if proxy else None
+    # Let requests URL-encode the query instead of interpolating it raw
+    response = requests.get(
+        search_url, headers=headers, params={"q": query}, timeout=timeout, proxies=proxies
+    )
+    response.raise_for_status()
+
+    soup = BeautifulSoup(response.text, "html.parser")
+    # Skip results that are missing an anchor tag or an href attribute
+    return [
+        link["href"]
+        for result in soup.find_all("li", class_="b_algo", limit=max_results)
+        if (link := result.find("a")) and link.get("href")
+    ]
+
+
+def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]:
+    """Helper function for SearXNG search"""
+    url = f"http://localhost:{port}/search"
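+    # Ask the local SearXNG instance for JSON results from a fixed set of upstream engines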
+    params = {
+        "q": query,
+        "format": "json",
+        "engines": "google,duckduckgo,brave,qwant,bing",
+    }
+    response = requests.get(url, params=params, timeout=timeout)
+    response.raise_for_status()
+    return [
+        result["url"] for result in response.json().get("results", [])[:max_results]
+    ]
+
+
+def _search_serper(
+    query: str, max_results: int, serper_api_key: str | None, timeout: int
+) -> List[str]:
+    """Helper function for Serper API to get Google search results"""
+    if not serper_api_key:
+        raise ValueError("API key is required for Serper API")
 
-    if search_engine.lower() == "google":
-        res = []
-        for url in google_search(query, num_results=max_results):
-            res.append(url)
-        return res
-
-    elif search_engine.lower() == "duckduckgo":
-        research = DuckDuckGoSearchResults(max_results=max_results)
-        res = research.run(query)
-        links = re.findall(r"https?://[^\s,\]]+", res)
-        return links[:max_results]
-
-    elif search_engine.lower() == "bing":
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-        }
-        search_url = f"https://www.bing.com/search?q={query}"
-        response = requests.get(search_url, headers=headers)
+    url = "https://google.serper.dev/search"
+    payload = {"q": query, "num": max_results}
+
+    headers = {"X-API-KEY": serper_api_key, "Content-Type": "application/json"}
+
+    try:
+        response = requests.post(
+            url,
+            headers=headers,
+            json=payload,  # requests will handle JSON serialization
+            timeout=timeout,
+        )
         response.raise_for_status()
-        soup = BeautifulSoup(response.text, "html.parser")
 
-        search_results = []
-        for result in soup.find_all("li", class_="b_algo", limit=max_results):
-            link = result.find("a")["href"]
-            search_results.append(link)
-        return search_results
+        # Extract only the organic search results
+        results = response.json()
+        organic_results = results.get("organic", [])
+        urls = [result.get("link") for result in organic_results if result.get("link")]
+
+        return urls[:max_results]
 
-    elif search_engine.lower() == "searxng":
-        url = f"http://localhost:{port}"
-        params = {"q": query, "format": "json"}
+    except requests.exceptions.RequestException as e:
+        raise RuntimeError(f"Serper API request failed: {str(e)}")
 
-        # Send the GET request to the server
-        response = requests.get(url, params=params)
 
-        data = response.json()
-        limited_results = [result["url"] for result in data["results"][:max_results]]
-        return limited_results
+def format_proxy(proxy):
+    """Normalize a proxy given as a dict or a URL string into a proxy URL string."""
+    if isinstance(proxy, dict):
+        server = proxy.get("server")
+        username = proxy.get("username")
+        password = proxy.get("password")
 
+        if all([username, password, server]):
+            proxy_url = f"http://{username}:{password}@{server}"
+            return proxy_url
+        else:
+            raise ValueError("Proxy dictionary is missing required fields.")
+    elif isinstance(proxy, str):
+        return proxy  # "https://username:password@ip:port"
     else:
-        raise ValueError(
-            "The only search engines available are DuckDuckGo, Google, Bing, or SearXNG"
-        )
+        raise TypeError("Proxy should be a dictionary or a string.")
+
+
+def filter_pdf_links(links: List[str]) -> List[str]:
+    """
+    Filters out any links that point to PDF files.
+
+    Args:
+        links (List[str]): A list of URLs as strings.
+
+    Returns:
+        List[str]: A list of URLs excluding any that end with '.pdf'.
+    """
+    return [link for link in links if not link.lower().endswith(".pdf")]
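
For reference, a minimal usage sketch of the new `search_on_web` signature (the names and values below are illustrative only; the proxy dict shape follows `format_proxy` above):

    # Google search through an authenticated proxy, capped at 5 non-PDF results
    urls = search_on_web(
        "web scraping frameworks",
        search_engine="Google",  # engine matching is case-insensitive
        max_results=5,
        proxy={"server": "1.2.3.4:8080", "username": "user", "password": "pass"},
    )

    # Serper requires an API key; a placeholder is shown here
    urls = search_on_web(
        "web scraping frameworks", search_engine="serper",
        serper_api_key="YOUR_SERPER_KEY", timeout=15,
    )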