@@ -1,5 +1,5 @@
 """
-Research_web module
+research_web module
 """
 
 import re
@@ -12,68 +12,161 @@
 
 
 def search_on_web(
-    query: str, search_engine: str = "Google", max_results: int = 10, port: int = 8080
+    query: str,
+    search_engine: str = "Google",
+    max_results: int = 10,
+    port: int = 8080,
+    timeout: int = 10,
+    proxy: str | dict | None = None,
+    serper_api_key: str | None = None,
 ) -> List[str]:
-    """
-    Searches the web for a given query using specified search engine options.
+    """Search the web for a query using the specified engine and return result URLs."""
 
-    Args:
-        query (str): The search query to find on the internet.
-        search_engine (str, optional): Specifies the search engine to use,
-        options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
-        max_results (int, optional): The maximum number of search results to return.
-        port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
+    # Input validation
+    if not query or not isinstance(query, str):
+        raise ValueError("Query must be a non-empty string")
 
-    Returns:
-        List[str]: A list of URLs as strings that are the search results.
+    search_engine = search_engine.lower()
+    valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"}
+    if search_engine not in valid_engines:
+        raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")
 
-    Raises:
-        ValueError: If the search engine specified is not supported.
+    # Format proxy once
+    formatted_proxy = None
+    if proxy:
+        formatted_proxy = format_proxy(proxy)
 
-    Example:
-        >>> search_on_web("example query", search_engine="Google", max_results=5)
-        ['http://example.com', 'http://example.org', ...]
-    """
+    try:
+        results = []
+        if search_engine == "google":
+            results = list(
+                google_search(query, num_results=max_results, proxy=formatted_proxy)
+            )
+
+        elif search_engine == "duckduckgo":
+            research = DuckDuckGoSearchResults(max_results=max_results)
+            res = research.run(query)
+            # Cap regex-extracted URLs at max_results, as the old code did
+            results = re.findall(r"https?://[^\s,\]]+", res)[:max_results]
+
+        elif search_engine == "bing":
+            results = _search_bing(query, max_results, timeout, formatted_proxy)
+
+        elif search_engine == "searxng":
+            results = _search_searxng(query, max_results, port, timeout)
+
+        elif search_engine == "serper":
+            results = _search_serper(query, max_results, serper_api_key, timeout)
+
+        return filter_pdf_links(results)
+
+    except requests.Timeout:
+        raise TimeoutError(f"Search request timed out after {timeout} seconds")
+    except requests.RequestException as e:
+        raise RuntimeError(f"Search request failed: {str(e)}")
+
+
+def _search_bing(
+    query: str, max_results: int, timeout: int, proxy: str | None = None
+) -> List[str]:
+    """Helper function for Bing search"""
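+    # A browser-like User-Agent is assumed to be needed here; Bing may serve
+    # different (or no) result markup to the default requests agent.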
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    }
+    search_url = "https://www.bing.com/search"
+
+    proxies = {"http": proxy, "https": proxy} if proxy else None
+    # Let requests URL-encode the query instead of interpolating it raw
+    response = requests.get(
+        search_url, headers=headers, params={"q": query}, timeout=timeout, proxies=proxies
+    )
+    response.raise_for_status()
+
+    soup = BeautifulSoup(response.text, "html.parser")
+    # Skip results that are missing an anchor tag or an href attribute
+    return [
+        link["href"]
+        for result in soup.find_all("li", class_="b_algo", limit=max_results)
+        if (link := result.find("a")) and link.get("href")
+    ]
+
+
+def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]:
+    """Helper function for SearXNG search"""
+    url = f"http://localhost:{port}/search"
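+    # Ask the local SearXNG instance for JSON results from a fixed set of upstream engines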
+    params = {
+        "q": query,
+        "format": "json",
+        "engines": "google,duckduckgo,brave,qwant,bing",
+    }
+    response = requests.get(url, params=params, timeout=timeout)
+    response.raise_for_status()
+    return [
+        result["url"] for result in response.json().get("results", [])[:max_results]
+    ]
+
+
+def _search_serper(
+    query: str, max_results: int, serper_api_key: str | None, timeout: int
+) -> List[str]:
+    """Helper function for Serper API to get Google search results"""
+    if not serper_api_key:
+        raise ValueError("API key is required for Serper API")
 
-    if search_engine.lower() == "google":
-        res = []
-        for url in google_search(query, num_results=max_results):
-            res.append(url)
-        return res
-
-    elif search_engine.lower() == "duckduckgo":
-        research = DuckDuckGoSearchResults(max_results=max_results)
-        res = research.run(query)
-        links = re.findall(r"https?://[^\s,\]]+", res)
-        return links[:max_results]
-
-    elif search_engine.lower() == "bing":
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-        }
-        search_url = f"https://www.bing.com/search?q={query}"
-        response = requests.get(search_url, headers=headers)
+    url = "https://google.serper.dev/search"
+    payload = {"q": query, "num": max_results}
+
+    headers = {"X-API-KEY": serper_api_key, "Content-Type": "application/json"}
+
+    try:
+        response = requests.post(
+            url,
+            headers=headers,
+            json=payload,  # requests will handle JSON serialization
+            timeout=timeout,
+        )
         response.raise_for_status()
-        soup = BeautifulSoup(response.text, "html.parser")
 
-        search_results = []
-        for result in soup.find_all("li", class_="b_algo", limit=max_results):
-            link = result.find("a")["href"]
-            search_results.append(link)
-        return search_results
+        # Extract only the organic search results
+        results = response.json()
+        organic_results = results.get("organic", [])
+        urls = [result.get("link") for result in organic_results if result.get("link")]
+
+        return urls[:max_results]
 
-    elif search_engine.lower() == "searxng":
-        url = f"http://localhost:{port}"
-        params = {"q": query, "format": "json"}
+    except requests.exceptions.RequestException as e:
+        raise RuntimeError(f"Serper API request failed: {str(e)}")
 
-        # Send the GET request to the server
-        response = requests.get(url, params=params)
 
-        data = response.json()
-        limited_results = [result["url"] for result in data["results"][:max_results]]
-        return limited_results
+def format_proxy(proxy):
+    """Normalize a proxy given as a dict or a URL string into a proxy URL string."""
+    if isinstance(proxy, dict):
+        server = proxy.get("server")
+        username = proxy.get("username")
+        password = proxy.get("password")
 
+        if all([username, password, server]):
+            proxy_url = f"http://{username}:{password}@{server}"
+            return proxy_url
+        else:
+            raise ValueError("Proxy dictionary is missing required fields.")
+    elif isinstance(proxy, str):
+        return proxy  # "https://username:password@ip:port"
     else:
-        raise ValueError(
-            "The only search engines available are DuckDuckGo, Google, Bing, or SearXNG"
-        )
+        raise TypeError("Proxy should be a dictionary or a string.")
+
+
+def filter_pdf_links(links: List[str]) -> List[str]:
+    """
+    Filters out any links that point to PDF files.
+
+    Args:
+        links (List[str]): A list of URLs as strings.
+
+    Returns:
+        List[str]: A list of URLs excluding any that end with '.pdf'.
+    """
+    return [link for link in links if not link.lower().endswith(".pdf")]
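
For reference, a minimal usage sketch of the new `search_on_web` signature (the names and values below are illustrative only; the proxy dict shape follows `format_proxy` above):

    # Google search through an authenticated proxy, capped at 5 non-PDF results
    urls = search_on_web(
        "web scraping frameworks",
        search_engine="Google",  # engine matching is case-insensitive
        max_results=5,
        proxy={"server": "1.2.3.4:8080", "username": "user", "password": "pass"},
    )

    # Serper requires an API key; a placeholder is shown here
    urls = search_on_web(
        "web scraping frameworks", search_engine="serper",
        serper_api_key="YOUR_SERPER_KEY", timeout=15,
    )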