
Commit 1c0141f

feat: serper api search
1 parent 4380afb · commit 1c0141f

File tree

2 files changed: +146 -56 lines


scrapegraphai/nodes/search_internet_node.py

Lines changed: 8 additions & 3 deletions
@@ -44,11 +44,17 @@ def __init__(
         self.verbose = (
             False if node_config is None else node_config.get("verbose", False)
         )
+        self.proxy = node_config.get("loader_kwargs", {}).get("proxy", None)
         self.search_engine = (
             node_config["search_engine"]
             if node_config.get("search_engine")
             else "google"
         )
+
+        self.serper_api_key = (
+            node_config["serper_api_key"] if node_config.get("serper_api_key") else None
+        )
+
         self.max_results = node_config.get("max_results", 3)

     def execute(self, state: dict) -> dict:
@@ -100,13 +106,12 @@ def execute(self, state: dict) -> dict:
             query=search_query,
             max_results=self.max_results,
             search_engine=self.search_engine,
+            proxy=self.proxy,
+            serper_api_key=self.serper_api_key,
         )

         if len(answer) == 0:
             raise ValueError("Zero results found for the search query.")

-        # Store both the URLs and considered_urls in the state
         state.update({self.output[0]: answer})
-        state["considered_urls"] = answer  # Add this as a backup
-
         return state
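For orientation, a minimal sketch of the node configuration these new attributes expect when constructing the node defined in search_internet_node.py; the values below are placeholders and the surrounding graph wiring is an assumption for illustration, not part of this commit.

# Hypothetical node_config illustrating the keys the updated __init__ reads;
# all values are placeholders.
node_config = {
    "search_engine": "serper",                # falls back to "google" when absent
    "serper_api_key": "YOUR_SERPER_API_KEY",  # defaults to None when absent
    "loader_kwargs": {
        "proxy": {                            # forwarded to search_on_web as `proxy`
            "server": "ip:port",
            "username": "username",
            "password": "password",
        }
    },
    "max_results": 5,                         # defaults to 3 when absent
    "verbose": False,
}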
Lines changed: 138 additions & 53 deletions
research_web module (full file path not shown in this view)
@@ -1,5 +1,5 @@
 """
-Research_web module
+research_web module
 """

 import re
@@ -12,68 +12,153 @@


 def search_on_web(
-    query: str, search_engine: str = "Google", max_results: int = 10, port: int = 8080
+    query: str,
+    search_engine: str = "Google",
+    max_results: int = 10,
+    port: int = 8080,
+    timeout: int = 10,
+    proxy: str | dict = None,
+    serper_api_key: str = None,
 ) -> List[str]:
-    """
-    Searches the web for a given query using specified search engine options.
+    """Search web function with improved error handling and validation"""

-    Args:
-        query (str): The search query to find on the internet.
-        search_engine (str, optional): Specifies the search engine to use,
-        options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
-        max_results (int, optional): The maximum number of search results to return.
-        port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
+    # Input validation
+    if not query or not isinstance(query, str):
+        raise ValueError("Query must be a non-empty string")

-    Returns:
-        List[str]: A list of URLs as strings that are the search results.
+    search_engine = search_engine.lower()
+    valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"}
+    if search_engine not in valid_engines:
+        raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")

-    Raises:
-        ValueError: If the search engine specified is not supported.
+    # Format proxy once
+    formatted_proxy = None
+    if proxy:
+        formatted_proxy = format_proxy(proxy)

-    Example:
-        >>> search_on_web("example query", search_engine="Google", max_results=5)
-        ['http://example.com', 'http://example.org', ...]
-    """
+    try:
+        results = []
+        if search_engine == "google":
+            results = list(
+                google_search(query, num_results=max_results, proxy=formatted_proxy)
+            )
+
+        elif search_engine == "duckduckgo":
+            research = DuckDuckGoSearchResults(max_results=max_results)
+            res = research.run(query)
+            results = re.findall(r"https?://[^\s,\]]+", res)
+
+        elif search_engine == "bing":
+            results = _search_bing(query, max_results, timeout, formatted_proxy)
+
+        elif search_engine == "searxng":
+            results = _search_searxng(query, max_results, port, timeout)
+
+        elif search_engine.lower() == "serper":
+            results = _search_serper(query, max_results, serper_api_key, timeout)
+
+        return filter_pdf_links(results)
+
+    except requests.Timeout:
+        raise TimeoutError(f"Search request timed out after {timeout} seconds")
+    except requests.RequestException as e:
+        raise RuntimeError(f"Search request failed: {str(e)}")
+
+
+def _search_bing(
+    query: str, max_results: int, timeout: int, proxy: str = None
+) -> List[str]:
+    """Helper function for Bing search"""
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    }
+    search_url = f"https://www.bing.com/search?q={query}"
+
+    proxies = {"http": proxy, "https": proxy} if proxy else None
+    response = requests.get(
+        search_url, headers=headers, timeout=timeout, proxies=proxies
+    )
+    response.raise_for_status()
+
+    soup = BeautifulSoup(response.text, "html.parser")
+    return [
+        result.find("a")["href"]
+        for result in soup.find_all("li", class_="b_algo", limit=max_results)
+    ]
+
+
+def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]:
+    """Helper function for SearXNG search"""
+    url = f"http://localhost:{port}/search"
+    params = {
+        "q": query,
+        "format": "json",
+        "engines": "google,duckduckgo,brave,qwant,bing",
+    }
+    response = requests.get(url, params=params, timeout=timeout)
+    response.raise_for_status()
+    return [
+        result["url"] for result in response.json().get("results", [])[:max_results]
+    ]
+
+
+def _search_serper(
+    query: str, max_results: int, serper_api_key: str, timeout: int
+) -> List[str]:
+    """Helper function for Serper API to get Google search results"""
+    if not serper_api_key:
+        raise ValueError("API key is required for Serper API")

-    if search_engine.lower() == "google":
-        res = []
-        for url in google_search(query, num_results=max_results):
-            res.append(url)
-        return res
-
-    elif search_engine.lower() == "duckduckgo":
-        research = DuckDuckGoSearchResults(max_results=max_results)
-        res = research.run(query)
-        links = re.findall(r"https?://[^\s,\]]+", res)
-        return links[:max_results]
-
-    elif search_engine.lower() == "bing":
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-        }
-        search_url = f"https://www.bing.com/search?q={query}"
-        response = requests.get(search_url, headers=headers)
+    url = "https://google.serper.dev/search"
+    payload = {"q": query, "num": max_results}
+
+    headers = {"X-API-KEY": serper_api_key, "Content-Type": "application/json"}
+
+    try:
+        response = requests.post(
+            url,
+            headers=headers,
+            json=payload,  # requests will handle JSON serialization
+            timeout=timeout,
+        )
         response.raise_for_status()
-        soup = BeautifulSoup(response.text, "html.parser")

-        search_results = []
-        for result in soup.find_all("li", class_="b_algo", limit=max_results):
-            link = result.find("a")["href"]
-            search_results.append(link)
-        return search_results
+        # Extract only the organic search results
+        results = response.json()
+        organic_results = results.get("organic", [])
+        urls = [result.get("link") for result in organic_results if result.get("link")]
+
+        return urls[:max_results]

-    elif search_engine.lower() == "searxng":
-        url = f"http://localhost:{port}"
-        params = {"q": query, "format": "json"}
+    except requests.exceptions.RequestException as e:
+        raise RuntimeError(f"Serper API request failed: {str(e)}")

-        # Send the GET request to the server
-        response = requests.get(url, params=params)

-        data = response.json()
-        limited_results = [result["url"] for result in data["results"][:max_results]]
-        return limited_results
+def format_proxy(proxy):
+    if isinstance(proxy, dict):
+        server = proxy.get("server")
+        username = proxy.get("username")
+        password = proxy.get("password")

+        if all([username, password, server]):
+            proxy_url = f"http://{username}:{password}@{server}"
+            return proxy_url
+        else:
+            raise ValueError("Proxy dictionary is missing required fields.")
+    elif isinstance(proxy, str):
+        return proxy  # "https://username:password@ip:port"
     else:
-        raise ValueError(
-            "The only search engines available are DuckDuckGo, Google, Bing, or SearXNG"
-        )
+        raise TypeError("Proxy should be a dictionary or a string.")
+
+
+def filter_pdf_links(links: List[str]) -> List[str]:
+    """
+    Filters out any links that point to PDF files.
+
+    Args:
+        links (List[str]): A list of URLs as strings.
+
+    Returns:
+        List[str]: A list of URLs excluding any that end with '.pdf'.
+    """
+    return [link for link in links if not link.lower().endswith(".pdf")]
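A hedged usage sketch of the reworked search_on_web entry point follows; the import path is an assumption (this view does not show the second file's path), and the query, API key, and proxy values are placeholders.

# Usage sketch, assuming the module above is importable as
# scrapegraphai.utils.research_web (path not shown in this diff view).
from scrapegraphai.utils.research_web import search_on_web

# Serper backend: needs an API key; returns up to max_results organic links,
# with PDF links filtered out by filter_pdf_links.
urls = search_on_web(
    query="web scraping frameworks",
    search_engine="serper",
    serper_api_key="YOUR_SERPER_API_KEY",  # placeholder
    max_results=5,
    timeout=10,
)

# Proxy may be a dict (normalized by format_proxy) or a ready-made URL string.
urls_behind_proxy = search_on_web(
    query="web scraping frameworks",
    search_engine="google",
    proxy={"server": "ip:port", "username": "user", "password": "pass"},
)

print(urls, urls_behind_proxy)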
