diff --git a/Sagar_Bawankule/README.md b/Sagar_Bawankule/README.md index efa87fb6..6460a430 100644 --- a/Sagar_Bawankule/README.md +++ b/Sagar_Bawankule/README.md @@ -1,37 +1,37 @@ # Google Search Scraper using Selenium -A Python script that uses Selenium WebDriver to scrape Google search results. +A simple and efficient Python script that uses Selenium WebDriver with XPath selectors to scrape Google search results. -## Features +## 🌟 Features -- ✅ Scrapes Google search results (title, URL, description) -- ✅ Configurable number of results to scrape -- ✅ Headless mode option for background execution -- ✅ Export results to CSV or JSON format -- ✅ Anti-detection measures to avoid being blocked -- ✅ User-friendly command-line interface +- ✅ **XPath-only selectors** - Uses only XPath for element location +- ✅ **Anti-detection measures** - Bypasses bot detection with custom configurations +- ✅ **Smart result filtering** - Automatically skips empty/invalid results +- ✅ **Browser stays open** - View results in Chrome as long as you need +- ✅ **Clean terminal output** - Displays results with rank, title, and URL +- ✅ **Error handling** - Robust exception handling for stable scraping -## Prerequisites +## 📋 Prerequisites - Python 3.7 or higher - Google Chrome browser installed - ChromeDriver (automatically managed by Selenium 4.6+) -## Installation +## 🚀 Installation -1. **Install required packages:** +1. **Install Selenium:** ```bash pip install selenium ``` -Or install all dependencies from requirements.txt (if available): +Or use the requirements file: ```bash pip install -r requirements.txt ``` -## Usage +## 💻 Usage ### Basic Usage @@ -41,159 +41,156 @@ Run the script: python google_scraper.py ``` -The script will prompt you for: -- Search query -- Number of results to scrape -- Export format (CSV/JSON/none) +Follow the prompts: +1. **Enter search query**: Type what you want to search for +2. 
**Enter number of results**: Specify how many results (default: 10) +3. **View results**: Results appear both in browser and terminal +4. **Exit**: Press `Ctrl+C` to close the browser and exit -### Using as a Module +### Example -You can also import and use the scraper in your own code: - -```python -from google_scraper import GoogleScraper - -# Create scraper instance -scraper = GoogleScraper(headless=True) +``` +Search query: Python programming +Number of results (10): 10 + +Opening Google... +Searching... +Extracting results... +1. Welcome to Python.org +2. Python Tutorial - W3Schools +3. Learn Python - Free Interactive Python Tutorial +... +10. Python Programming Language -# Search and get results -results = scraper.search_google("Python programming", num_results=10) +✓ Scraped 10 results +Browser will remain open. Press Ctrl+C in terminal to exit. -# Save results -scraper.save_to_csv("my_results.csv") -scraper.save_to_json("my_results.json") +====================================================================== +1. Welcome to Python.org + https://www.python.org/ -# Close the browser -scraper.close() +2. Python Tutorial - W3Schools + https://www.w3schools.com/python/ +... +====================================================================== ``` -### Advanced Usage +## 🔧 How It Works +### 1. **Anti-Detection Setup** ```python -from google_scraper import GoogleScraper - -# Initialize with headless mode -scraper = GoogleScraper(headless=True) - -try: - # Perform search - results = scraper.search_google("machine learning tutorials", num_results=20) - - # Process results - for result in results: - print(f"{result['rank']}. 
{result['title']}") - print(f" URL: {result['url']}") - print(f" Description: {result['description']}\n") - - # Save to both formats - scraper.save_to_csv("ml_tutorials.csv") - scraper.save_to_json("ml_tutorials.json") - -finally: - scraper.close() +- Custom user agent +- Disabled automation flags +- CDP command for user agent override +- Webdriver property masking ``` -## Output Format - -### CSV Format -The CSV file contains the following columns: -- `rank`: Position in search results (1, 2, 3, ...) -- `title`: Title of the search result -- `url`: URL of the webpage -- `description`: Snippet/description from Google - -### JSON Format -```json -[ - { - "rank": 1, - "title": "Example Title", - "url": "https://example.com", - "description": "Example description text..." - }, - ... -] -``` +### 2. **XPath-Based Scraping** +The script uses XPath expressions to locate elements: +- Search box: `//textarea[@name='q']` or `//input[@name='q']` +- Results container: `(//div[contains(@class, 'g') and .//h3])[i]` +- Title: `//h3` +- URL: `//a[@href]` +- Description: `//div[contains(@class, 'VwiC3b')]` + +### 3. **Smart Result Collection** +- Skips empty or invalid results +- Continues searching until requested number of valid results found +- Maximum 20 extra attempts to avoid infinite loops + +## 📊 Output Format + +Results are displayed in terminal with: +- **Rank**: Position number (1, 2, 3, ...) 
+- **Title**: Page title from search result +- **URL**: Full webpage URL +- **Description**: Snippet from Google (if available) -## Configuration Options +## ⚙️ Configuration ### Headless Mode -Run the browser in the background without GUI: +To run without opening a visible browser: ```python -scraper = GoogleScraper(headless=True) +results = scrape_google(query, num_results, headless=True) ``` -### Number of Results -Specify how many results to scrape: +### Custom Number of Results +Default is 10, but you can specify any number: ```python -results = scraper.search_google("query", num_results=20) +results = scrape_google("your query", 20) # Get 20 results ``` -## Features Explained +## 🛠️ Code Structure -### Anti-Detection Measures -The scraper includes several techniques to avoid being detected as a bot: -- Custom user agent -- Disabled automation flags -- Randomized delays -- WebDriver property masking +``` +google_scraper.py +├── setup_driver() # Configures Chrome with anti-detection +├── scrape_google() # Main scraping function +└── main() # User interface and result display +``` + +## ⚠️ Important Notes + +### Browser Behavior +- Browser **stays open** after scraping completes +- Press `Ctrl+C` in terminal to close browser and exit +- Script keeps running in an infinite loop to maintain browser session + +### Rate Limiting +- Built-in delays (3-5 seconds) between actions +- Respectful of Google's servers +- Avoid running too frequently -### Error Handling -The script includes robust error handling for: -- Network issues -- Element not found errors -- Browser crashes -- Invalid queries +### Legal & Ethical Use +- ⚠️ **Respect Google's Terms of Service** +- 🚫 Don't use for automated/commercial scraping at scale +- ✅ Use responsibly for educational/personal purposes only +- ⚠️ Consider Google's robots.txt policies -## Troubleshooting +## 🐛 Troubleshooting ### ChromeDriver Issues -If you get ChromeDriver errors: ```bash pip 
install --upgrade selenium ``` +Selenium 4.6+ manages ChromeDriver automatically. -Selenium 4.6+ automatically manages ChromeDriver. +### Empty Results +- Some results may be ads or special content +- Script automatically skips these and continues +- Searches up to 20 extra results to find valid ones + +### Browser Closes Immediately +- Make sure you don't have errors in terminal +- The infinite loop should keep browser open +- Check that Selenium is properly installed ### Import Errors -If you see "ModuleNotFoundError: No module named 'selenium'": ```bash pip install selenium ``` -### Google Blocking -If Google blocks your requests: -- Add delays between searches -- Use headless mode sparingly -- Don't scrape too frequently -- Consider using proxies for large-scale scraping - -## Notes - -- **Respect Google's Terms of Service**: Use this tool responsibly -- **Rate Limiting**: Don't send too many requests in a short time -- **Legal Considerations**: Ensure your use case complies with applicable laws -- **Robots.txt**: Be aware of Google's robots.txt file - -## Example Output +## 📝 Requirements ``` -Searching for: Python programming -Chrome WebDriver started successfully. -Result 1: Welcome to Python.org -Result 2: Python Tutorial - W3Schools -Result 3: Learn Python - Free Interactive Python Tutorial -... -Successfully scraped 10 results. - -Results saved to google_search_results.csv -WebDriver closed. +selenium>=4.6.0 ``` -## License +## 🎯 Best Practices + +1. **Don't scrape too frequently** - Add delays between runs +2. **Respect robots.txt** - Check Google's crawling policies +3. **Use for learning** - Great for understanding web scraping +4. **Be ethical** - Don't overwhelm servers with requests + +## 📄 License This project is for educational purposes only. -## Contributing +## 🤝 Contributing Feel free to submit issues or pull requests for improvements. 
+ +--- + +**Note**: Web scraping should always be done responsibly and in compliance with the website's terms of service and applicable laws. diff --git a/Sagar_Bawankule/google_scraper.py b/Sagar_Bawankule/google_scraper.py index ea646ee1..fcb7d9ac 100644 --- a/Sagar_Bawankule/google_scraper.py +++ b/Sagar_Bawankule/google_scraper.py @@ -1,248 +1,119 @@ -""" -Google Search Results Scraper using Selenium -This script scrapes Google search results for a given query. -""" +"""Google Search Scraper using Selenium with XPath""" from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.chrome.options import Options -from selenium.webdriver.chrome.service import Service import time -import csv -import json -class GoogleScraper: - """A class to scrape Google search results using Selenium.""" +def setup_driver(headless=False): + options = Options() + if headless: + options.add_argument('--headless') + options.add_argument('--disable-blink-features=AutomationControlled') + options.add_argument('--disable-gpu') + options.add_argument('--no-sandbox') + options.add_argument('--disable-dev-shm-usage') + options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36') + options.add_experimental_option("excludeSwitches", ["enable-automation"]) + options.add_experimental_option('useAutomationExtension', False) - def __init__(self, headless=False): - """ - Initialize the scraper with Chrome options. 
- - Args: - headless (bool): Run browser in headless mode if True - """ - self.options = Options() - - if headless: - self.options.add_argument('--headless') - - # Add arguments to avoid detection - self.options.add_argument('--disable-blink-features=AutomationControlled') - self.options.add_argument('--no-sandbox') - self.options.add_argument('--disable-dev-shm-usage') - self.options.add_argument('--start-maximized') - self.options.add_experimental_option("excludeSwitches", ["enable-automation"]) - self.options.add_experimental_option('useAutomationExtension', False) - - # Set user agent to look more like a real browser - self.options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36') - - self.driver = None - self.results = [] - - def start_driver(self): - """Start the Chrome WebDriver.""" - try: - self.driver = webdriver.Chrome(options=self.options) - # Execute script to remove webdriver property - self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") - print("Chrome WebDriver started successfully.") - except Exception as e: - print(f"Error starting WebDriver: {e}") - raise - - def search_google(self, query, num_results=10): - """ - Search Google for the given query and scrape results. 
- - Args: - query (str): The search query - num_results (int): Number of results to scrape (default: 10) - - Returns: - list: List of dictionaries containing search results - """ - if not self.driver: - self.start_driver() - - try: - # Navigate to Google - print(f"Searching for: {query}") - self.driver.get("https://www.google.com") - - # Wait for search box to be present - search_box = WebDriverWait(self.driver, 10).until( - EC.presence_of_element_located((By.NAME, "q")) - ) - - # Enter search query and submit - search_box.send_keys(query) - search_box.send_keys(Keys.RETURN) - - # Wait for results to load - time.sleep(2) - WebDriverWait(self.driver, 10).until( - EC.presence_of_element_located((By.ID, "search")) - ) - - # Scrape the results - self.results = self._extract_results(num_results) - - print(f"Successfully scraped {len(self.results)} results.") - return self.results - - except Exception as e: - print(f"Error during search: {e}") - return [] + driver = webdriver.Chrome(options=options) + driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}) + driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") + return driver + + +def scrape_google(query, num_results=10, headless=False): + driver = setup_driver(headless) + results = [] - def _extract_results(self, num_results): - """ - Extract search results from the current page. 
- - Args: - num_results (int): Maximum number of results to extract - - Returns: - list: List of dictionaries with result data - """ - results = [] + try: + print("Opening Google...") + driver.get("https://www.google.com") + time.sleep(3) + print("Searching...") try: - # Find all search result divs - search_results = self.driver.find_elements(By.CSS_SELECTOR, "div.g") - - for index, result in enumerate(search_results[:num_results], 1): - try: - # Extract title - title_element = result.find_element(By.CSS_SELECTOR, "h3") - title = title_element.text if title_element else "N/A" - - # Extract URL - link_element = result.find_element(By.CSS_SELECTOR, "a") - url = link_element.get_attribute("href") if link_element else "N/A" - - # Extract description/snippet - try: - description_element = result.find_element(By.CSS_SELECTOR, "div.VwiC3b") - description = description_element.text - except: - description = "N/A" - - # Store the result - result_data = { - 'rank': index, - 'title': title, - 'url': url, - 'description': description - } - - results.append(result_data) - print(f"Result {index}: {title}") - - except Exception as e: - print(f"Error extracting result {index}: {e}") + search_box = driver.find_element(By.XPATH, "//textarea[@name='q']") + except: + search_box = driver.find_element(By.XPATH, "//input[@name='q']") + + search_box.send_keys(query) + time.sleep(1) + search_box.send_keys(Keys.RETURN) + time.sleep(5) + + print("Extracting results...") + count = 0 + i = 1 + while count < num_results: + try: + base = f"(//div[contains(@class, 'g') and .//h3])[{i}]" + title = driver.find_element(By.XPATH, f"{base}//h3").text + + # Skip if title is empty + if not title or title.strip() == "": + i += 1 continue - - except Exception as e: - print(f"Error finding search results: {e}") - - return results - - def save_to_csv(self, filename="google_search_results.csv"): - """ - Save the scraped results to a CSV file. 
- - Args: - filename (str): Name of the CSV file - """ - if not self.results: - print("No results to save.") - return - - try: - with open(filename, 'w', newline='', encoding='utf-8') as file: - writer = csv.DictWriter(file, fieldnames=['rank', 'title', 'url', 'description']) - writer.writeheader() - writer.writerows(self.results) - - print(f"Results saved to {filename}") - except Exception as e: - print(f"Error saving to CSV: {e}") - - def save_to_json(self, filename="google_search_results.json"): - """ - Save the scraped results to a JSON file. - - Args: - filename (str): Name of the JSON file - """ - if not self.results: - print("No results to save.") - return + + url = driver.find_element(By.XPATH, f"{base}//a[@href]").get_attribute("href") + + try: + desc = driver.find_element(By.XPATH, f"{base}//div[contains(@class, 'VwiC3b')]").text + except: + desc = "N/A" + + count += 1 + results.append({'rank': count, 'title': title, 'url': url, 'description': desc}) + print(f"{count}. {title}") + i += 1 + + except Exception as e: + # Try next element + i += 1 + if i > num_results + 20: # Stop after trying 20 extra elements + print(f"Stopped searching after {i} attempts") + break + + print(f"\nāœ“ Scraped {len(results)} results") + print("Browser will remain open. Press Ctrl+C in terminal to exit.") + + # Keep the browser open indefinitely + while True: + time.sleep(1) + except KeyboardInterrupt: + print("\n\nClosing browser...") + driver.quit() + except Exception as e: + print(f"Error: {e}") + print("Browser will remain open. 
Press Ctrl+C in terminal to exit.") try: - with open(filename, 'w', encoding='utf-8') as file: - json.dump(self.results, file, indent=4, ensure_ascii=False) - - print(f"Results saved to {filename}") - except Exception as e: - print(f"Error saving to JSON: {e}") + while True: + time.sleep(1) + except KeyboardInterrupt: + print("\n\nClosing browser...") + driver.quit() - def close(self): - """Close the WebDriver.""" - if self.driver: - self.driver.quit() - print("WebDriver closed.") + return results def main(): - """Main function to demonstrate the scraper.""" + query = input("Search query: ") + num = int(input("Number of results (10): ") or "10") - # Create scraper instance - scraper = GoogleScraper(headless=False) # Set to True for headless mode + results = scrape_google(query, num) - try: - # Get search query from user - query = input("Enter your search query: ") - num_results = int(input("How many results do you want to scrape? (default 10): ") or "10") - - # Perform search - results = scraper.search_google(query, num_results) - - # Display results - print("\n" + "="*80) - print("SEARCH RESULTS") - print("="*80 + "\n") - - for result in results: - print(f"Rank: {result['rank']}") - print(f"Title: {result['title']}") - print(f"URL: {result['url']}") - print(f"Description: {result['description']}") - print("-" * 80) - - # Ask user if they want to save results - save_option = input("\nDo you want to save the results? (csv/json/no): ").lower() - - if save_option == 'csv': - scraper.save_to_csv() - elif save_option == 'json': - scraper.save_to_json() - else: - print("Results not saved.") - - except KeyboardInterrupt: - print("\n\nScraping interrupted by user.") - except Exception as e: - print(f"An error occurred: {e}") - finally: - # Close the browser - scraper.close() + if results: + print("\n" + "="*70) + for r in results: + print(f"{r['rank']}. 
{r['title']}\n {r['url']}\n") + print("="*70) + else: + print("\nNo results found!") if __name__ == "__main__": - main() + main() \ No newline at end of file