diff --git a/Ronak_Sarvaya/README.md b/Ronak_Sarvaya/README.md new file mode 100644 index 00000000..141559b3 --- /dev/null +++ b/Ronak_Sarvaya/README.md @@ -0,0 +1,165 @@ +# Google Search Scraper - Ronak Sarvaya + +A robust Selenium-based web scraper for extracting Google search results. + +## Features + +- ✅ Modern Selenium 4+ implementation +- ✅ Automatic ChromeDriver management (no manual driver download needed) +- ✅ Extracts title, URL, and snippet for each result +- ✅ Saves results to CSV file with timestamp +- ✅ Configurable headless/visible browser mode +- ✅ Proper error handling and explicit waits +- ✅ Anti-detection measures (user-agent, automation flags) + +## Installation + +### Prerequisites +- Python 3.7 or higher +- Google Chrome browser installed + +### Install Required Packages + +```bash +pip install selenium webdriver-manager +``` + +Or install from requirements file: + +```bash +pip install -r requirements.txt +``` + +## Usage + +### Basic Usage + +Run the scraper with default settings: + +```bash +python my_scraper.py +``` + +### Customize Search Query + +Edit the `main()` function in `my_scraper.py`: + +```python +# Configuration +SEARCH_QUERY = "Your search query here" # Change this +NUM_RESULTS = 10 # Number of results to scrape +HEADLESS_MODE = True # False to see browser window +``` + +### Use as a Module + +```python +from my_scraper import GoogleScraper + +# Create scraper instance +scraper = GoogleScraper(headless=True) + +# Scrape Google +results = scraper.scrape( + query="Python tutorials", + num_results=15, + save_csv=True +) + +# Access results +for result in results: + print(f"Title: {result['title']}") + print(f"URL: {result['url']}") + print(f"Snippet: {result['snippet']}") +``` + +## Output + +The scraper generates a CSV file with the following columns: +- **rank**: Position in search results (1-based) +- **title**: Page title +- **url**: Page URL +- **snippet**: Description/snippet from search results + +Output filename format: 
`google_results_YYYYMMDD_HHMMSS.csv` + +## Configuration Options + +### GoogleScraper Class + +```python +scraper = GoogleScraper(headless=True) +``` + +- `headless` (bool): Run browser in headless mode (default: True) + +### scrape() Method + +```python +results = scraper.scrape(query, num_results=10, save_csv=True) +``` + +- `query` (str): Search query string +- `num_results` (int): Maximum number of results to extract (default: 10) +- `save_csv` (bool): Save results to CSV file (default: True) + +## Troubleshooting + +### ChromeDriver Issues +The scraper uses `webdriver-manager` to automatically download and manage ChromeDriver. If you encounter issues: + +```bash +pip install --upgrade webdriver-manager +``` + +### Import Errors +Make sure all dependencies are installed: + +```bash +pip install selenium webdriver-manager +``` + +### No Results Found +- Check your internet connection +- Try running in non-headless mode to see what's happening +- Google may be blocking automated requests - try adding delays + +## Example Output + +``` +============================================================ +Google Search Scraper - Ronak Sarvaya +============================================================ +✅ Chrome WebDriver initialized successfully +🔍 Searching Google for: 'Python programming tutorials' +✅ Search completed successfully +📊 Found 10 search results using selector: div.g +✓ Extracted result #1: Python Tutorial - W3Schools... +✓ Extracted result #2: Learn Python Programming... +... +✅ Successfully extracted 10 results +✅ Results saved to: google_results_20240115_143022.csv +✅ Browser closed + +============================================================ +SEARCH RESULTS +============================================================ + +[1] Python Tutorial - W3Schools +URL: https://www.w3schools.com/python/ +Snippet: Well organized and easy to understand Web building tutorials... +------------------------------------------------------------ +... 
+``` + +## Notes + +- Respect Google's Terms of Service +- Use reasonable delays between requests +- Consider using Google's official APIs for production use +- This scraper is for educational purposes + +## Author + +**Ronak Sarvaya** +GC-Internship Project diff --git a/Ronak_Sarvaya/google_results_20251116_111743.csv b/Ronak_Sarvaya/google_results_20251116_111743.csv new file mode 100644 index 00000000..26a6eaf9 --- /dev/null +++ b/Ronak_Sarvaya/google_results_20251116_111743.csv @@ -0,0 +1,10 @@ +rank,title,url,snippet +1,Python Tutorial,https://www.w3schools.com/python/,"This Python tutorial covers file handling, database handling, exercises, and examples. It also includes a ""Try it Yourself"" editor." +2,Python Tutorial,https://www.geeksforgeeks.org/python/python-programming-language-tutorial/,"5 Nov 2025 — In this section, we'll cover the basics of Python programming, including installing Python, writing first program, understanding comments and working with ..." +3,The Python Tutorial,https://docs.python.org/3/tutorial/index.html,"This tutorial is for new Python programmers, introduces basic concepts, and helps you write Python modules and programs, but is not comprehensive." +4,"Python Full Course for Beginners [2025] +YouTube · Programming with Mosh +12 Feb 2025",https://www.youtube.com/watch?v=K5KVEU3aaeQ,No description available +5,Python Tutorial,https://www.tutorialspoint.com/python/index.htm,"This Python tutorial is for beginners to learn basic to advanced concepts of Python, a popular, general-purpose, interpreted, object-oriented language." +6,Learn Python - Free Interactive Python Tutorial,https://www.learnpython.org/,"This free tutorial offers interactive coding challenges, videos, and covers topics like variables, lists, loops, functions, and more. It is for everyone." 
# Carried over from the flattened change set: the last row of the sample file
# google_results_20251116_111743.csv shared a physical line with this module.
#   7,Python for Beginners (Full Course),https://www.youtube.com/playlist?list=PLu0W_9lII9agwh1XjRt242xIpHhPT2llg,Introduction to Programming & Python | Python Tutorial - Day #1 · Some Amazing Python Programs - The Power of Python | Python Tutorial - Day #2 · Modules and Pip ...
"""
Google Search Scraper using Selenium
Author: Ronak Sarvaya
Description: Scrapes Google search results including title, URL, and snippet
"""

import csv
import time
from datetime import datetime
from typing import Dict, List, Optional, Sequence, Tuple

from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    WebDriverException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager


class GoogleScraper:
    """Scrape Google search results (title, URL, snippet) with Selenium and Chrome.

    Typical use is a single call to :meth:`scrape`, which starts the browser,
    runs the search, extracts the results, optionally writes a CSV file and
    always closes the browser again.
    """

    # CSS selectors tried in order until one matches at least one result container.
    RESULT_SELECTORS = (
        "div.g",
        "div[data-sokoban-container]",
        "div.Gx5Zad.fP1Qef.xpd.EtOod.pkphOe",
        "div[jscontroller][data-hveid]",
    )
    # CSS selectors tried in order, inside one container, to find the title.
    TITLE_SELECTORS = ("h3", "div[role='heading']", ".LC20lb")
    # CSS selectors tried in order, inside one container, to find the snippet.
    SNIPPET_SELECTORS = (
        "div[data-sncf='1']",
        "div.VwiC3b",
        "div.IsZvec",
        "span.aCOpRe",
        "div[style*='-webkit-line-clamp']",
    )
    # Placeholder stored when no snippet selector yields any text.
    DEFAULT_SNIPPET = "No description available"
    # Column order of the CSV output; matches the keys of each result dict.
    CSV_FIELDS = ("rank", "title", "url", "snippet")

    def __init__(self, headless=True):
        """
        Initialize the Google Scraper

        Args:
            headless (bool): Run browser in headless mode (default: True)
        """
        self.headless = headless
        # Set by setup_driver(); reset to None by close().
        self.driver = None

    def setup_driver(self):
        """
        Create the Chrome WebDriver and store it on ``self.driver``

        The driver binary is resolved through webdriver_manager.

        Raises:
            Exception: Whatever driver installation/start-up raised, re-raised
                unchanged after troubleshooting hints have been printed.
        """
        chrome_options = Options()

        if self.headless:
            chrome_options.add_argument("--headless=new")

        # Additional options for stability
        for argument in (
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-blink-features=AutomationControlled",
            "--disable-gpu",
            "--window-size=1920,1080",
            "--disable-extensions",
            "--disable-software-rasterizer",
            "--log-level=3",
        ):
            chrome_options.add_argument(argument)
        chrome_options.add_experimental_option(
            "excludeSwitches", ["enable-automation", "enable-logging"]
        )
        chrome_options.add_experimental_option("useAutomationExtension", False)

        # Set user agent to avoid detection
        chrome_options.add_argument(
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )

        try:
            # Initialize driver with webdriver_manager
            service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=service, options=chrome_options)
            self.driver.set_page_load_timeout(30)
            print("✅ Chrome WebDriver initialized successfully")
        except Exception as e:
            # Top-level boundary: report, give hints, and let the caller decide.
            print(f"❌ Failed to initialize Chrome WebDriver: {e}")
            print("\n💡 Troubleshooting tips:")
            print("1. Make sure Google Chrome is installed")
            print("2. Try updating Chrome to the latest version")
            print("3. Run: pip install --upgrade selenium webdriver-manager")
            raise

    def _dismiss_cookie_consent(self):
        """Click the cookie-consent button if one shows up within 3 seconds."""
        try:
            accept_button = WebDriverWait(self.driver, 3).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//button[contains(., 'Accept all') or contains(., 'I agree')]")
                )
            )
            accept_button.click()
            time.sleep(1)
        except WebDriverException:
            # Best effort: a timeout (no banner) or a failed click must not
            # abort the search. TimeoutException derives from WebDriverException.
            pass

    def search_google(self, query):
        """
        Perform a Google search

        Args:
            query (str): Search query string

        Raises:
            RuntimeError: If setup_driver() has not been called yet.
            Exception: Any error raised while loading the page or waiting for
                the results container, re-raised after being printed.
        """
        try:
            if self.driver is None:
                raise RuntimeError("WebDriver is not initialized; call setup_driver() first")

            # Navigate to Google
            self.driver.get("https://www.google.com")
            print(f"🔍 Searching Google for: '{query}'")

            self._dismiss_cookie_consent()

            # Find search box
            search_box = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.NAME, "q"))
            )

            # Enter search query
            search_box.clear()
            search_box.send_keys(query)
            search_box.send_keys(Keys.RETURN)

            # Wait for results to load
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "search"))
            )

            print("✅ Search completed successfully")

        except Exception as e:
            print(f"❌ Error during search: {e}")
            raise

    def _find_result_containers(self):
        """Return the elements matched by the first result selector that matches anything."""
        for selector in self.RESULT_SELECTORS:
            containers = self.driver.find_elements(By.CSS_SELECTOR, selector)
            if containers:
                print(f"📊 Found {len(containers)} search results using selector: {selector}")
                return containers
        return []

    @staticmethod
    def _first_text(element, selectors: Sequence[str]) -> Optional[str]:
        """Return the first non-empty text found under ``element``, or None.

        Selectors are tried in order. Whitespace (including embedded newlines)
        is collapsed to single spaces so every value fits on one CSV line.
        """
        for selector in selectors:
            try:
                raw_text = element.find_element(By.CSS_SELECTOR, selector).text
            except (NoSuchElementException, StaleElementReferenceException):
                continue
            text = " ".join((raw_text or "").split())
            if text:
                return text
        return None

    def _parse_result(self, container) -> Optional[Tuple[str, str, str]]:
        """Return ``(title, url, snippet)`` for one container, or None if it is not a usable result."""
        title = self._first_text(container, self.TITLE_SELECTORS)
        if not title:
            return None

        try:
            url = container.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
        except (NoSuchElementException, StaleElementReferenceException):
            return None
        # Only absolute http(s) links count as results.
        if not url or not url.startswith("http"):
            return None

        # Fall back to the placeholder only when no selector produced text;
        # an empty match must not replace it with "".
        snippet = self._first_text(container, self.SNIPPET_SELECTORS) or self.DEFAULT_SNIPPET
        return title, url, snippet

    def extract_results(self, num_results=10):
        """
        Extract search results from the page

        Candidates without a title or an http(s) link, and candidates whose URL
        was already collected, are skipped. Skipped candidates do not count
        towards ``num_results``; scanning continues until enough valid results
        are collected or the candidates run out.

        Args:
            num_results (int): Maximum number of results to extract

        Returns:
            list: List of dictionaries with keys ``rank``, ``title``, ``url``
                and ``snippet``. Empty when nothing could be extracted; errors
                are printed, never raised.
        """
        results: List[Dict[str, object]] = []

        try:
            if self.driver is None:
                raise RuntimeError("WebDriver is not initialized; call setup_driver() first")

            # Wait a bit for all results to load
            time.sleep(3)

            candidates = self._find_result_containers()
            if not candidates:
                print("⚠️ No search results found with any selector")
                # Save screenshot for debugging (best effort)
                try:
                    if self.driver.save_screenshot("debug_screenshot.png"):
                        print("📸 Screenshot saved as debug_screenshot.png")
                except (WebDriverException, OSError):
                    pass
                return results

            seen_urls = set()
            for container in candidates:
                if len(results) >= num_results:
                    break

                try:
                    parsed = self._parse_result(container)
                except Exception as e:
                    # One broken container must not abort the whole extraction.
                    print(f"⚠️ Could not extract result: {e}")
                    continue

                if parsed is None:
                    continue
                title, url, snippet = parsed

                # Nested containers can match the same link more than once.
                if url in seen_urls:
                    continue
                seen_urls.add(url)

                results.append({
                    "rank": len(results) + 1,
                    "title": title,
                    "url": url,
                    "snippet": snippet,
                })
                print(f"✓ Extracted result #{len(results)}: {title[:50]}...")

            print(f"✅ Successfully extracted {len(results)} results")
            return results

        except Exception as e:
            print(f"❌ Error extracting results: {e}")
            return results

    def save_to_csv(self, results, query, filename=None):
        """
        Save results to CSV file

        Args:
            results (list): List of result dictionaries
            query (str): Search query used (currently unused; kept so existing
                callers keep working)
            filename (str): Output filename (optional). Defaults to
                ``google_results_YYYYMMDD_HHMMSS.csv`` in the working directory.
        """
        if not results:
            print("⚠️ No results to save")
            return

        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"google_results_{timestamp}.csv"

        try:
            with open(filename, "w", newline="", encoding="utf-8") as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=list(self.CSV_FIELDS))
                writer.writeheader()
                writer.writerows(results)
        except (OSError, csv.Error, ValueError) as e:
            # ValueError is what DictWriter raises for rows with unknown keys.
            print(f"❌ Error saving to CSV: {e}")
        else:
            print(f"✅ Results saved to: {filename}")

    def close(self):
        """Close the browser and forget the driver; safe to call repeatedly."""
        if self.driver is None:
            return
        try:
            self.driver.quit()
            print("✅ Browser closed")
        except WebDriverException as e:
            # close() runs in scrape()'s finally block; a failing quit must not
            # replace the scrape result with an exception.
            print(f"⚠️ Error while closing browser: {e}")
        finally:
            self.driver = None

    def scrape(self, query, num_results=10, save_csv=True):
        """
        Main method to scrape Google search results

        Args:
            query (str): Search query
            num_results (int): Number of results to extract
            save_csv (bool): Whether to save results to CSV

        Returns:
            list: List of search results; empty if the query is blank or any
                step failed (the failure is printed, not raised)
        """
        # A blank query can only end in a timeout; skip starting the browser.
        if query is None or not str(query).strip():
            print("⚠️ Empty search query; nothing to scrape")
            return []

        try:
            self.setup_driver()
            self.search_google(query)
            results = self.extract_results(num_results)

            if save_csv and results:
                self.save_to_csv(results, query)

            return results

        except Exception as e:
            print(f"❌ Scraping failed: {e}")
            return []

        finally:
            self.close()


def main():
    """Main function to demonstrate the scraper"""
    print("=" * 60)
    print("Google Search Scraper - Ronak Sarvaya")
    print("=" * 60)

    # Configuration
    SEARCH_QUERY = "Python programming tutorials"
    NUM_RESULTS = 10
    HEADLESS_MODE = False  # Set to False to see the browser

    # Create scraper instance
    scraper = GoogleScraper(headless=HEADLESS_MODE)

    # Perform scraping
    results = scraper.scrape(
        query=SEARCH_QUERY,
        num_results=NUM_RESULTS,
        save_csv=True,
    )

    # Display results
    if results:
        print("\n" + "=" * 60)
        print("SEARCH RESULTS")
        print("=" * 60)

        for result in results:
            print(f"\n[{result['rank']}] {result['title']}")
            print(f"URL: {result['url']}")
            print(f"Snippet: {result['snippet'][:100]}...")
            print("-" * 60)
    else:
        print("\n⚠️ No results found or scraping failed")

    print("\n✅ Scraping completed!")


if __name__ == "__main__":
    main()

# Carried over from the flattened change set (Ronak_Sarvaya/requirements.txt),
# which shared a physical line with the end of this module:
#   selenium>=4.15.0
#   webdriver-manager>=4.0.1