From eb17f882f1f5bd07ec7cba07cfe662693ed17174 Mon Sep 17 00:00:00 2001 From: hellothere012 Date: Wed, 4 Jun 2025 08:03:59 -0700 Subject: [PATCH 1/4] feat: add proxy support and cleanup --- .env.example | 7 + README.md | 25 +++ app.py | 123 ----------- app/__init__.py | 0 app/crud.py | 50 ----- app/database.py | 25 --- app/main.py | 265 ------------------------ app/models.py | 45 ---- app/schemas.py | 41 ---- app/scraper.py | 373 ---------------------------------- config.py | 34 ---- database.py | 26 --- render.yaml | 20 +- src/api/routes.py | 6 +- src/automation/browser_sim.py | 8 + src/config.py | 5 + 16 files changed, 64 insertions(+), 989 deletions(-) delete mode 100644 app.py delete mode 100644 app/__init__.py delete mode 100644 app/crud.py delete mode 100644 app/database.py delete mode 100644 app/main.py delete mode 100644 app/models.py delete mode 100644 app/schemas.py delete mode 100644 app/scraper.py delete mode 100644 config.py delete mode 100644 database.py diff --git a/.env.example b/.env.example index 767687c..4744016 100644 --- a/.env.example +++ b/.env.example @@ -13,3 +13,10 @@ API_PORT=8000 # Scraping Limits MAX_LISTINGS_PER_SESSION=25 + +# Optional Proxy Configuration +# If using rotating proxies (e.g., Webshare), uncomment and provide the proxy URL. +# Example: http://username:password@proxyhost:port +# PROXY_SERVER= +# PROXY_USERNAME= +# PROXY_PASSWORD= diff --git a/README.md b/README.md index f5d383b..7525e16 100644 --- a/README.md +++ b/README.md @@ -20,3 +20,28 @@ The scraper can also be run standalone: ```bash python main.py scrape_test ``` + +## Environment Variables + +Set the following variables in a `.env` file or your deployment environment: + +| Variable | Description | Default | +| --- | --- | --- | +| `DATABASE_URL` | Database connection URL | `sqlite+aiosqlite:///./vehicle_data.db` | +| `HEADLESS` | Run the browser in headless mode | `true` | +| `BROWSER_TIMEOUT` | Playwright launch timeout (ms) | `60000` | +| `PAGE_DELAY` | Base delay after page loads (ms) | `5000` | +| `MIN_DELAY_BETWEEN_ACTIONS` | Delay between scraping actions (s) | `2.5` | +| `API_HOST` | Host for the FastAPI server | `127.0.0.1` | +| `API_PORT` | Port for the FastAPI server | `8000` | +| `MAX_LISTINGS_PER_SESSION` | Maximum listings fetched per scrape | `25` | +| `PROXY_SERVER` | *(Optional)* Proxy URL for Playwright | - | +| `PROXY_USERNAME` | *(Optional)* Proxy username | - | +| `PROXY_PASSWORD` | *(Optional)* Proxy password | - | + +### Pagination + +The `/api/v1/vehicles/` endpoint accepts `skip` and `limit` query parameters to paginate results. +Example: `/api/v1/vehicles/?skip=25&limit=25`. + + diff --git a/app.py b/app.py deleted file mode 100644 index 8fa8500..0000000 --- a/app.py +++ /dev/null @@ -1,123 +0,0 @@ -import logging -# import os # No longer needed for getenv in background task -import asyncio -from fastapi import FastAPI, Depends, BackgroundTasks -from pydantic import BaseModel -from typing import Dict -from datetime import datetime -from database import CarListing, get_db, Session, SessionLocal -from scraper import scrape_autotrader_and_update_db -from fastapi.middleware.cors import CORSMiddleware -from config import AUTOTRADER_URL, HEADLESS_BROWSER, SCRAPE_TIMEOUT, LOG_LEVEL # Import from config - -# Configure basic logging using LOG_LEVEL from config -# Ensure this is called only once. If FastAPI/Uvicorn also configures logging, -# this might need adjustment or to be handled by the logger instance directly. -# For now, assume this is the primary logging config. 
-logging.basicConfig(level=LOG_LEVEL, format='%(asctime)s - %(levelname)s - %(message)s', force=True) -# Added force=True to ensure this config takes precedence if uvicorn also tries to set basicConfig. - -app = FastAPI() -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_methods=["*"], - allow_headers=["*"], -) - -class CarListingRaw(BaseModel): - platform: str - extracted_at: datetime - source_url: str - data_points: Dict - -@app.post("/api/v1/listings/ingest") -async def ingest_listing(payload: CarListingRaw, db: Session = Depends(get_db)): - listing = CarListing( - platform=payload.platform, - extracted_at=payload.extracted_at, - source_url=payload.source_url, - data_points=payload.data_points - ) - db.add(listing) - db.commit() - db.refresh(listing) - return {"status": "saved", "listing_id": listing.id} - -@app.get("/") -def read_root(): - return {"message": "πŸš— Car Tracker API is running!"} - -# Global variable to store scraping status -scrape_status = { - "last_run_time": None, - "status": "idle", # States: idle, running, success, error - "message": "", - "added": 0, - "updated": 0, - "scraped_count": 0 -} - -# Background task wrapper -async def _background_scraper_task_wrapper(): - global scrape_status - db_task_session: Session = SessionLocal() - logging.info("Background scraper task started.") - scrape_status["status"] = "running" - scrape_status["message"] = "Scraping in progress..." - scrape_status["last_run_time"] = datetime.utcnow().isoformat() - scrape_status["added"] = 0 # Reset counts for current run - scrape_status["updated"] = 0 - scrape_status["scraped_count"] = 0 - - try: - # Use imported config values - # autotrader_url = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/private-seller") - # headless_str = os.getenv("HEADLESS_BROWSER", "True") - # headless = headless_str.lower() == "true" - # scrape_timeout_str = os.getenv("SCRAPE_TIMEOUT", "120000") - # try: - # scrape_timeout = int(scrape_timeout_str) - # except ValueError: - # logging.warning(f"Invalid SCRAPE_TIMEOUT value: {scrape_timeout_str}. Defaulting to 120000ms.") - # scrape_timeout = 120000 - - logging.info(f"Background task using URL: {AUTOTRADER_URL}, Headless: {HEADLESS_BROWSER}, Timeout: {SCRAPE_TIMEOUT}ms") - - result = await scrape_autotrader_and_update_db( - db=db_task_session, - autotrader_url=AUTOTRADER_URL, - headless=HEADLESS_BROWSER, - scrape_timeout=SCRAPE_TIMEOUT - ) - - if result.get("status") == "success": - scrape_status["status"] = "success" - scrape_status["message"] = "Scraping completed successfully." 
- scrape_status["added"] = result.get("added", 0) - scrape_status["updated"] = result.get("updated", 0) - scrape_status["scraped_count"] = result.get("scraped_count", 0) - else: - scrape_status["status"] = "error" - scrape_status["message"] = result.get("message", "Scraping failed with an unknown error.") - - logging.info(f"Background scraper task completed: {result}") - - except Exception as e: - logging.error(f"Error in background scraper task: {e}", exc_info=True) - scrape_status["status"] = "error" - scrape_status["message"] = str(e) - finally: - db_task_session.close() - logging.info("Background scraper DB session closed.") - -@app.post("/api/v1/scrape/autotrader") -async def trigger_autotrader_scrape(background_tasks: BackgroundTasks): - if scrape_status["status"] == "running": - return {"message": "AutoTrader scraping job is already running."} - background_tasks.add_task(_background_scraper_task_wrapper) - return {"message": "AutoTrader scraping job started in the background."} - -@app.get("/api/v1/scrape/status") -async def get_scrape_status(): - return scrape_status diff --git a/app/__init__.py b/app/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/app/crud.py b/app/crud.py deleted file mode 100644 index fb8708b..0000000 --- a/app/crud.py +++ /dev/null @@ -1,50 +0,0 @@ -from sqlalchemy.orm import Session -from . import models, schemas -from datetime import datetime - -def get_car_listing_by_url(db: Session, url: str): - return db.query(models.ScrapedData).filter(models.ScrapedData.url == url).first() - -def create_car_listing(db: Session, listing: schemas.CarListingCreate): - db_listing = models.ScrapedData( - job_id=listing.job_id, - platform=listing.platform, - url=str(listing.url), # Ensure HttpUrl is converted to string - title=listing.title, - price=listing.price, - mileage=listing.mileage, - vin=listing.vin, - image_urls=listing.image_urls, # Assuming image_urls is already a list of strings or compatible JSON - raw_data=listing.raw_data, - scraped_at=datetime.utcnow() - ) - db.add(db_listing) - db.commit() - db.refresh(db_listing) - return db_listing - -def create_scrape_job(db: Session) -> models.ScrapeJob: - db_job = models.ScrapeJob(timestamp=datetime.utcnow(), status="pending") - db.add(db_job) - db.commit() - db.refresh(db_job) - return db_job - -def update_scrape_job_status(db: Session, job_id: int, status: str, results_count: int = 0, error_message: str = None): - db_job = db.query(models.ScrapeJob).filter(models.ScrapeJob.id == job_id).first() - if db_job: - db_job.status = status - db_job.results_count = results_count - db_job.error_message = error_message - db.commit() - db.refresh(db_job) - return db_job - -def get_scrape_job(db: Session, job_id: int): - return db.query(models.ScrapeJob).filter(models.ScrapeJob.id == job_id).first() - -def get_all_scrape_jobs(db: Session, skip: int = 0, limit: int = 100): - return db.query(models.ScrapeJob).order_by(models.ScrapeJob.timestamp.desc()).offset(skip).limit(limit).all() - -def get_listings_for_job(db: Session, job_id: int, skip: int = 0, limit: int = 100): - return db.query(models.ScrapedData).filter(models.ScrapedData.job_id == job_id).offset(skip).limit(limit).all() diff --git a/app/database.py b/app/database.py deleted file mode 100644 index bf32154..0000000 --- a/app/database.py +++ /dev/null @@ -1,25 +0,0 @@ -from sqlalchemy import create_engine -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import sessionmaker -import os - -DATABASE_URL = 
os.getenv("DATABASE_URL", "sqlite:///./data/vehicle_tracker.db") - -engine_args = {} -if DATABASE_URL.startswith("sqlite"): - engine_args["connect_args"] = {"check_same_thread": False} - -engine = create_engine(DATABASE_URL, **engine_args) -SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) - -Base = declarative_base() - -def get_db(): - db = SessionLocal() - try: - yield db - finally: - db.close() - -def create_tables(): - Base.metadata.create_all(bind=engine) diff --git a/app/main.py b/app/main.py deleted file mode 100644 index 4817f15..0000000 --- a/app/main.py +++ /dev/null @@ -1,265 +0,0 @@ -import logging -import os -from fastapi import FastAPI, Depends, HTTPException, BackgroundTasks -from sqlalchemy.orm import Session -from typing import List - -from . import crud, models, schemas, scraper -from .database import SessionLocal, engine - -# Create database tables if they don't exist -models.Base.metadata.create_all(bind=engine) - -# Configure logging -LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper() -logging.basicConfig(level=LOG_LEVEL, format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) - -app = FastAPI(title="AutoTrader Scraper API", version="1.0.0") - -# Dependency to get DB session -def get_db(): - db = SessionLocal() - try: - yield db - finally: - db.close() - -# Global variable to store scraping status (simple approach) -scrape_status = { - "job_id": None, - "status": "idle", # States: idle, pending, running, completed, failed - "message": "No scraping job initiated yet.", - "last_run_time": None, - "duration_seconds": None, - "results_count": 0, - "error_message": None -} - -async def run_scraping_task(job_id: int, autotrader_url: str, headless: bool, scrape_timeout: int): - """ - The actual scraping task that runs in the background. - It creates its own database session. - """ - global scrape_status - db: Session = SessionLocal() - try: - logger.info(f"Background task started for job_id: {job_id}") - crud.update_scrape_job_status(db, job_id, status="running") - scrape_status.update({ - "job_id": job_id, - "status": "running", - "message": f"Scraping from {autotrader_url}...", - "last_run_time": datetime.utcnow().isoformat(), - "duration_seconds": None, - "results_count": 0, - "error_message": None - }) - - start_time = datetime.utcnow() - - scraped_data_list = await scraper.scrape_autotrader_data( - autotrader_url=autotrader_url, - headless=headless, - timeout=scrape_timeout - ) - - end_time = datetime.utcnow() - duration = (end_time - start_time).total_seconds() - scrape_status["duration_seconds"] = round(duration, 2) - - added_count = 0 - updated_count = 0 # Placeholder for future update logic - - if not scraped_data_list: - logger.info(f"No listings found for job_id: {job_id}") - crud.update_scrape_job_status(db, job_id, status="completed", results_count=0) - scrape_status.update({ - "status": "completed", - "message": "Scraping completed. 
No new listings found or page was inaccessible.", - "results_count": 0 - }) - return - - for item_data in scraped_data_list: - # Ensure all required fields for CarListingCreate are present - listing_create = schemas.CarListingCreate( - job_id=job_id, - platform=item_data.get("source_name", "autotrader"), # Get platform from scraper or default - url=item_data.get("listing_url"), - title=item_data.get("title"), - price=item_data.get("price"), - mileage=item_data.get("mileage"), - vin=item_data.get("vin"), - image_urls=item_data.get("image_urls", []), - raw_data=item_data.get("data_points", {}) - ) - - existing_listing = crud.get_car_listing_by_url(db, str(listing_create.url)) - if existing_listing: - # For now, we just count updates. Actual update logic could be added here. - # e.g., existing_listing.price = listing_create.price - # existing_listing.extracted_at = datetime.utcnow() - updated_count += 1 - else: - crud.create_car_listing(db=db, listing=listing_create) - added_count += 1 - - crud.update_scrape_job_status(db, job_id, status="completed", results_count=added_count) - scrape_status.update({ - "status": "completed", - "message": f"Scraping finished. Added: {added_count}, Updated: {updated_count} (placeholder).", - "results_count": added_count + updated_count # Or just added_count if updates aren't really changing data - }) - logger.info(f"Background task for job_id: {job_id} completed. Added: {added_count}, Updated: {updated_count}") - - except Exception as e: - logger.error(f"Error in background scraper task for job_id {job_id}: {e}", exc_info=True) - crud.update_scrape_job_status(db, job_id, status="failed", error_message=str(e)) - scrape_status.update({ - "status": "failed", - "message": f"Error during scraping: {str(e)}", - "error_message": str(e) - }) - finally: - db.close() - logger.info(f"DB session closed for job_id: {job_id}") - - -@app.post("/scrape/", response_model=schemas.ScrapeJob, status_code=202) -async def trigger_scrape(background_tasks: BackgroundTasks, db: Session = Depends(get_db)): - """ - Triggers a new scraping job for Autotrader. - """ - global scrape_status - if scrape_status.get("status") == "running": - raise HTTPException(status_code=409, detail="A scraping job is already in progress.") - - autotrader_url = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/all-cars/cars-under-10000") # Default to a common search if not set - headless_str = os.getenv("HEADLESS_BROWSER", "True") - headless = headless_str.lower() == "true" - scrape_timeout_str = os.getenv("SCRAPE_TIMEOUT", "120000") - - try: - scrape_timeout = int(scrape_timeout_str) - except ValueError: - scrape_timeout = 120000 # Default timeout if parsing fails - logger.warning(f"Invalid SCRAPE_TIMEOUT value: {scrape_timeout_str}. 
Using default {scrape_timeout}ms.") - - job = crud.create_scrape_job(db) - scrape_status.update({ - "job_id": job.id, - "status": "pending", - "message": f"Scraping job {job.id} initiated for URL: {autotrader_url}", - "last_run_time": job.timestamp.isoformat(), - "duration_seconds": None, - "results_count": 0, - "error_message": None - }) - - # Pass job_id to the background task - background_tasks.add_task(run_scraping_task, job.id, autotrader_url, headless, scrape_timeout) - - logger.info(f"Scraping job {job.id} queued for URL: {autotrader_url}") - return job - -@app.post("/api/v1/listings/ingest", response_model=schemas.CarListing, status_code=201) -async def ingest_listing(payload: schemas.CarListingCreate, db: Session = Depends(get_db)): - """ - Ingests a new car listing into the database. - This endpoint is useful for manually adding or testing data. - """ - # Check if listing with this URL already exists to prevent duplicates, - # though the database constraint should also handle this. - db_listing = crud.get_car_listing_by_url(db, url=str(payload.url)) - if db_listing: - raise HTTPException(status_code=400, detail="Listing with this URL already exists.") - - # The job_id in CarListingCreate might be problematic if this is a direct ingest - # not tied to a specific scrape job. For now, we'll assume it's provided or - # we could adjust the schema/logic if direct ingestion shouldn't have a job_id. - # For testing, we might need to create a dummy job or adjust schema. - # Let's assume for now a valid job_id is provided or handle it if not. - if not payload.job_id: - # Create a dummy job or handle as per requirements for listings not tied to a job - # For simplicity, let's assume job_id is optional in the schema for this use case - # or a default/placeholder job_id is used. - # For this test, the payload includes job_id, so we'll proceed. - # If CarListingCreate schema requires job_id, this endpoint needs to handle it. - # For now, let's assume it's provided in the payload. - pass - - try: - created_listing = crud.create_car_listing(db=db, listing=payload) - return created_listing - except Exception as e: - logger.error(f"Error ingesting listing: {e}", exc_info=True) - raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") - - -@app.get("/scrape/status", response_model=schemas.ScrapeJob) # Using ScrapeJob schema for better structure -async def get_current_scrape_status(db: Session = Depends(get_db)): - """ - Returns the status of the current or last scraping job. - """ - global scrape_status - if scrape_status.get("job_id"): - job = crud.get_scrape_job(db, scrape_status["job_id"]) - if job: - # Update status from DB if available, otherwise use in-memory for simplicity - # A more robust system might always fetch from DB or use a proper job queue status - return job - return scrape_status # Fallback to in-memory status if job not found or not started - -@app.get("/scrape/jobs/", response_model=List[schemas.ScrapeJob]) -async def read_jobs(skip: int = 0, limit: int = 10, db: Session = Depends(get_db)): - """ - Retrieve all scrape jobs. - """ - jobs = crud.get_all_scrape_jobs(db, skip=skip, limit=limit) - return jobs - -@app.get("/scrape/jobs/{job_id}/results", response_model=List[schemas.CarListing]) -async def read_job_results(job_id: int, skip: int = 0, limit: int = 10, db: Session = Depends(get_db)): - """ - Retrieve results for a specific scrape job. 
- """ - job = crud.get_scrape_job(db, job_id=job_id) - if job is None: - raise HTTPException(status_code=404, detail="Job not found") - listings = crud.get_listings_for_job(db, job_id=job_id, skip=skip, limit=limit) - return listings - -@app.get("/") -async def read_root(): - return {"message": "AutoTrader Scraper API is running!"} - -# This is for local development if you run `python app/main.py` -# Uvicorn will be started by Procfile in production environments like Heroku -if __name__ == "__main__": - # Ensure tables are created before starting the app if they don't exist - # This is useful for local development but might be handled differently in production - from .database import create_tables - create_tables() - - # Get port from environment variable or default to 8000 - port = int(os.getenv("PORT", "8000")) - uvicorn.run(app, host="0.0.0.0", port=port) - -# Remove the old main.py content if it exists in the root directory -# This is now handled by app/main.py -# Ensure Procfile points to app.main:app or similar based on your directory structure -# e.g., web: uvicorn app.main:app --host=0.0.0.0 --port=${PORT:-8000} -# (Assuming app.py is moved to app/main.py) -# If app.py remains in root, then Procfile is fine. - -# The `models.Base.metadata.create_all(bind=engine)` should ideally be called once, -# perhaps in main.py or a startup script, not every time database.py is imported. -# For simplicity in this single-file app structure, it's often put there. -# If app.py is the main entry point for uvicorn, it's a good place. -# For Render, buildCommand in render.yaml can also handle migrations/table creation. - -# Let's ensure the imports are correct considering the file structure -# If main.py is in root and imports from app/, it should be `from app import crud, models, schemas, scraper` -# If this file is app/main.py, then `from . import crud, models, schemas, scraper` is correct. -# The prompt implies this file is app/main.py. 
diff --git a/app/models.py b/app/models.py deleted file mode 100644 index b0d4e5d..0000000 --- a/app/models.py +++ /dev/null @@ -1,45 +0,0 @@ -from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, JSON -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship -from datetime import datetime - -Base = declarative_base() - -class ScrapeResult(Base): - __tablename__ = "scrape_results" - - id = Column(Integer, primary_key=True, index=True) - url = Column(String, index=True) - title = Column(String) - price = Column(String, nullable=True) # Store as string to handle variations like 'Contact Seller' - mileage = Column(String, nullable=True) # Store as string to handle non-numeric values - vin = Column(String, nullable=True, unique=True) - images = Column(JSON, nullable=True) # Store list of image URLs - scraped_at = Column(DateTime) - details = Column(JSON, nullable=True) # Store other details as JSON - -class ScrapeJob(Base): - __tablename__ = "scrape_jobs" - - id = Column(Integer, primary_key=True, index=True) - timestamp = Column(DateTime, default=datetime.utcnow) - status = Column(String, default="pending") # e.g., pending, running, completed, failed - results_count = Column(Integer, default=0) - error_message = Column(String, nullable=True) - -class ScrapedData(Base): - __tablename__ = "scraped_data" - - id = Column(Integer, primary_key=True, index=True) - job_id = Column(Integer, ForeignKey("scrape_jobs.id")) - platform = Column(String) # e.g., 'autotrader', 'cars.com' - url = Column(String, unique=True, index=True) - title = Column(String, nullable=True) - price = Column(String, nullable=True) - mileage = Column(String, nullable=True) - vin = Column(String, nullable=True, index=True) - image_urls = Column(JSON, nullable=True) # List of image URLs - raw_data = Column(JSON, nullable=True) # Full raw data if needed - scraped_at = Column(DateTime, default=datetime.utcnow) - - job = relationship("ScrapeJob") diff --git a/app/schemas.py b/app/schemas.py deleted file mode 100644 index 2ee0d8d..0000000 --- a/app/schemas.py +++ /dev/null @@ -1,41 +0,0 @@ -from pydantic import BaseModel, HttpUrl -from typing import List, Optional, Dict, Any -from datetime import datetime - -class CarListingBase(BaseModel): - url: HttpUrl - title: Optional[str] = None - price: Optional[str] = None # Keep as string to handle variations - mileage: Optional[str] = None # Keep as string - vin: Optional[str] = None - image_urls: Optional[List[HttpUrl]] = [] - raw_data: Optional[Dict[str, Any]] = {} # For any other unstructured data - -class CarListingCreate(CarListingBase): - platform: str - job_id: int - -class CarListing(CarListingBase): - id: int - platform: str - job_id: int - scraped_at: datetime - - class Config: - orm_mode = True - -class ScrapeJobBase(BaseModel): - pass - -class ScrapeJobCreate(ScrapeJobBase): - pass - -class ScrapeJob(ScrapeJobBase): - id: int - timestamp: datetime - status: str - results_count: int = 0 - error_message: Optional[str] = None - - class Config: - orm_mode = True diff --git a/app/scraper.py b/app/scraper.py deleted file mode 100644 index b946ecb..0000000 --- a/app/scraper.py +++ /dev/null @@ -1,373 +0,0 @@ -import asyncio -import logging -# import os # No longer needed for getenv in main -import datetime # Keep for now, might be used in data processing -from playwright.async_api import async_playwright -# Required for main test function -from config import AUTOTRADER_URL, HEADLESS_BROWSER, SCRAPE_TIMEOUT -# DATABASE_URL is used 
by database.py, SessionLocal will pick it up via config - -# Assuming database.py is in the same directory or accessible in PYTHONPATH -from database import get_db, CarListing, SessionLocal # Added SessionLocal for main example -from sqlalchemy.orm import Session -from datetime import datetime # Ensure datetime is imported directly - -# Configure basic logging -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') - -from stealth_utils import apply_stealth_js # Import new stealth utility -# from playwright_stealth import stealth_async # Commenting out old stealth - -class AutoTraderScraper: - """Scraper for AutoTrader private party listings using Playwright.""" - - def __init__(self, source_name: str = "autotrader"): - """ - Initializes the AutoTraderScraper. - Args: - source_name (str): Name of the source platform. - """ - self.source_name = source_name - # Potentially load other configs from a config file or env vars here - # For example: self.base_url = "https://www.autotrader.com/cars-for-sale/private-seller" - - async def get_private_listings(self, autotrader_url: str, headless: bool, timeout: int = 120000) -> list[dict]: - """ - Scrapes private party listings from AutoTrader using Playwright. - - Args: - autotrader_url (str): The starting URL for scraping AutoTrader private listings. - headless (bool): Whether to run the browser in headless mode. - timeout (int): Maximum time in milliseconds for page operations. - - Returns: - list[dict]: A list of dictionaries, where each dictionary represents a scraped vehicle listing. - """ - listings_data = [] - browser = None - - launch_options = { - "headless": headless, - "args": [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-infobars', - '--window-position=0,0', - '--ignore-certificate-errors', - '--ignore-certificate-errors-spki-list', - # '--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"' # User agent is set in context - '--disable-gpu' # Already there but keep - ], - # "channel": "chrome" # This might require full Chrome install, trying without first to see if args help - } - - # Try with 'msedge' or 'chrome' if default chromium fails and they are available - # For now, stick to chromium and args. If 'channel' is needed, it's a bigger setup change. - - async with async_playwright() as p: - try: - # browser = await p.chromium.launch(**launch_options) # Default chromium - # Let's try specifying channel, assuming it might use a locally installed Chrome if available, or a Playwright-managed one. - # This is a common suggestion if the default Playwright Chromium build is too easily detected. - # If "chrome" channel is not found by Playwright, it will error. - try: - browser = await p.chromium.launch( - **launch_options, - channel="chrome" # Attempt to use a branded Chrome build - ) - logging.info("Attempting to launch with channel='chrome'") - except Exception as e_channel: - logging.warning(f"Failed to launch with channel='chrome' ({e_channel}). 
Falling back to default Playwright Chromium.") - # Remove channel from launch_options if it failed - launch_options_no_channel = launch_options.copy() - if "channel" in launch_options_no_channel: # Should not be needed based on above structure but good practice - del launch_options_no_channel["channel"] - browser = await p.chromium.launch(**launch_options_no_channel) - logging.info("Launched with default Playwright Chromium.") - - - context = await browser.new_context( - user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36', # A fairly common user agent - java_script_enabled=True, - ) - context.set_default_navigation_timeout(timeout) - context.set_default_timeout(timeout) - - page = await context.new_page() - await page.set_viewport_size({"width": 1920, "height": 1080}) - - # Apply custom JS stealth - await apply_stealth_js(page) - - logging.info(f"Navigating to {autotrader_url}") - await page.goto(autotrader_url, wait_until="domcontentloaded", timeout=timeout) # Reverted to domcontentloaded - - title = await page.title() - logging.info(f"Page title: {title}") - - if "unavailable" in title.lower() or "block" in title.lower() or "access denied" in title.lower(): - logging.critical(f"Failed to load AutoTrader listings page. Blocked by website. Title: {title}") - await browser.close() # Ensure browser is closed before returning - return [] - - # Using speculative selectors for AutoTrader - # Main container for listings: 'div[data-qaid="cntnr-lstng-main"]' (this might be too broad or incorrect) - # A more specific item selector might be needed, e.g., an article or a div with a specific class. - # For now, let's assume individual listing cards can be found with a selector like: - # "div.inventory-listing" or "div[data-cmp='inventoryListing']" - these are common patterns. - # The provided example 'div[data-qaid="cntnr-lstng-main"]' seems like it might be a single container FOR ALL listings. - # Let's try a more specific (but still guessed) selector for individual listing items. - # A common pattern is items within a list or grid. Let's try to find items: - # This selector is a **GUESS** based on common AutoTrader structures. - listing_item_selector = "div[data-cmp='inventoryListing']" # GUESS - - # Fallback if the primary guess doesn't work, try another common pattern - # listing_item_selector_fallback = "div.inventory-listing.new-listing.stub" # Another GUESS - - # await page.wait_for_selector(listing_item_selector, timeout=15000) # Wait for items to appear - - listing_containers = await page.query_selector_all(listing_item_selector) - - # if not listing_containers: - # logging.info(f"No listings found with primary selector '{listing_item_selector}'. 
Trying fallback...") - # listing_containers = await page.query_selector_all(listing_item_selector_fallback) - - logging.info(f"Found {len(listing_containers)} potential listing containers using selector '{listing_item_selector}'.") - - processed_count = 0 - # first_container_processed_for_html_dump = False # REMOVE HTML DUMP FLAG - for i, container in enumerate(listing_containers): - url_path = None - title_text = "N/A" # Default to N/A - price_text = "N/A" # Default to N/A - mileage_text = "N/A" # Default to N/A (as it's not reliably on card) - listing_url = None - - try: - logging.debug(f"Processing container {i+1}/{len(listing_containers)}") - - # Attempt to get Title - title_el = await container.query_selector("h2[data-cmp='subheading']") # Updated selector from HTML dump - if title_el: - raw_title_text = await title_el.inner_text() - title_text = raw_title_text.strip() if raw_title_text else "N/A" - - # Attempt to get URL from parent of title_el - # Playwright's query_selector does not directly support xpath like "ancestor::a". - # A common structure is
<a data-cmp="link"><h2 data-cmp="subheading">...</h2></a> or <a data-cmp="link"><div>...<h2>...</h2>...</div></a>
- # We can try to find 'a' that contains this h2, or assume the 'a[data-cmp="link"]' is the one. - - # Let's use the a[data-cmp="link"] which was identified as containing the title h2 - parent_link_el = await container.query_selector("a[data-cmp='link']") - if parent_link_el: - url_path = await parent_link_el.get_attribute("href") - else: # Fallback if the above structure isn't found - logging.warning(f"Could not find parent a[data-cmp='link'] for title in listing {i+1}") - else: - logging.warning(f"Title not found with h2[data-cmp='subheading'] for listing {i+1}.") - - # Fallback or alternative for URL if not found via title's parent link - if not url_path: - url_el_alt = await container.query_selector("a[data-cmp='relLnk']") # Keep this fallback - if url_el_alt: - url_path = await url_el_alt.get_attribute("href") - - if not url_path: # Last resort for URL - first_a = await container.query_selector("a[href]") # Broadest fallback - if first_a: - url_path = await first_a.get_attribute("href") - - if not url_path: - logging.warning(f"Could not extract URL for listing {i+1} (Title: {title_text}). Skipping.") - continue - - if not url_path.startswith(('http://', 'https://')): - listing_url = f"https://www.autotrader.com{url_path}" - else: - listing_url = url_path - - # Attempt to get Price - price_el = await container.query_selector("div[data-cmp='firstPrice']") # Updated selector - if price_el: - raw_price_text = await price_el.inner_text() - price_text = raw_price_text.replace('$', '').replace(',', '').strip() if raw_price_text else "N/A" - else: - # Fallback for price (e.g. .first-price class directly) - price_el_fallback = await container.query_selector(".first-price") - if price_el_fallback: - raw_price_text = await price_el_fallback.inner_text() - price_text = raw_price_text.replace('$', '').replace(',', '').strip() if raw_price_text else "N/A" - else: - logging.warning(f"Price not found for listing {listing_url}") - price_text = "N/A" - - - # Mileage - Set to N/A as it's not reliably on the card from previous findings - mileage_text = "N/A" - # logging.info(f"Mileage not scraped from listing card for {listing_url} (by design for now).") - - vin_text = None - - listing_data = { - "listing_url": listing_url, - "title": title_text, # Already defaults to N/A or has value - "price": price_text, # Already defaults to N/A or has value - "mileage": mileage_text, # Is N/A - "vin": vin_text, - "source_name": self.source_name, - "data_points": { - "page_title_at_scrape": title # page's title, not listing's - } - } - listings_data.append(listing_data) - processed_count += 1 - logging.info(f"Successfully processed listing: {title_text[:50]}... URL: {listing_url}") - - except Exception as e: - logging.error(f"Error processing listing container {i+1} for URL {listing_url if listing_url else 'Unknown'}: {e}", exc_info=True) - continue - - logging.info(f"Successfully processed {processed_count} out of {len(listing_containers)} listing containers.") - - except Exception as e: - logging.error(f"An error occurred during Playwright scraping phase: {e}", exc_info=True) - finally: - if browser: - logging.info("Closing browser.") - await browser.close() - - return listings_data - - -async def scrape_autotrader_data(autotrader_url: str, headless: bool = True, timeout: int = 120000) -> list[dict]: - """ - High-level function to scrape data from AutoTrader. - Initializes the scraper and calls its scraping method. - - Args: - autotrader_url (str): The URL to scrape. 
- headless (bool): Whether to run the browser in headless mode. - timeout (int): Timeout for scraping operations in milliseconds. - - Returns: - list[dict]: A list of scraped listing data. - """ - scraper = AutoTraderScraper() - listings = await scraper.get_private_listings(autotrader_url=autotrader_url, headless=headless, timeout=timeout) - return listings - - -async def scrape_autotrader_and_update_db(db: Session, autotrader_url: str, headless: bool, scrape_timeout: int): - """ - Scrapes listings from AutoTrader and updates the database. - - Args: - db (Session): The SQLAlchemy database session. - autotrader_url (str): The URL to scrape. - headless (bool): Whether to run the browser in headless mode. - scrape_timeout (int): Timeout for scraping operations in milliseconds. - - Returns: - dict: A status dictionary with counts of added, updated, and scraped listings. - """ - logging.info(f"Starting scrape and update for URL: {autotrader_url}") - - try: - listings_data = await scrape_autotrader_data( - autotrader_url=autotrader_url, - headless=headless, - timeout=scrape_timeout - ) - except Exception as e: - logging.error(f"Failed to scrape data from {autotrader_url}: {e}", exc_info=True) - return {"status": "error", "message": f"Scraping failed: {e}"} - - added_count = 0 - updated_count = 0 - scraped_count = len(listings_data) - - for listing_data in listings_data: - source_url = listing_data.get('listing_url') # Renamed from 'url' to 'listing_url' in dummy data - if not source_url: - logging.warning(f"Scraped item missing 'listing_url': {listing_data.get('title')}. Skipping.") - continue - - try: - existing_listing = db.query(CarListing).filter(CarListing.source_url == source_url).first() - - if existing_listing: - # Placeholder for update logic - # existing_listing.extracted_at = datetime.utcnow() - # existing_listing.data_points = {k: v for k, v in listing_data.items() if k != 'listing_url'} - # # Update other fields like price if necessary - # db.add(existing_listing) # Not strictly necessary if only mutable fields changed and session tracks - updated_count += 1 - logging.info(f"Listing at {source_url} already exists. 
Marked for update (placeholder).") - else: - new_listing = CarListing( - platform="autotrader", - extracted_at=datetime.utcnow(), - source_url=source_url, - # Ensure data_points stores everything else from listing_data - data_points={k: v for k, v in listing_data.items() if k != 'listing_url'} - ) - db.add(new_listing) - added_count += 1 - logging.info(f"New listing added from {source_url}") - except Exception as e: - logging.error(f"Error processing listing {source_url} for DB: {e}", exc_info=True) - # Decide if you want to rollback here or continue with other listings - - try: - db.commit() - logging.info("Database changes committed.") - except Exception as e: - logging.error(f"Database commit failed: {e}", exc_info=True) - db.rollback() - return {"status": "error", "message": f"DB commit failed: {e}", "added": 0, "updated": 0, "scraped_count": scraped_count} - - status_summary = { - "status": "success", - "added": added_count, - "updated": updated_count, - "scraped_count": scraped_count - } - logging.info(f"DB update summary: {status_summary}") - return status_summary - -async def main(): - # Use settings from config.py - # url = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/private-seller") - # headless_str = os.getenv("HEADLESS_BROWSER", "True") - # headless = headless_str.lower() == "true" - # scrape_timeout_str = os.getenv("SCRAPE_TIMEOUT", "120000") - # try: - # scrape_timeout = int(scrape_timeout_str) - # except ValueError: - # logging.warning(f"Invalid SCRAPE_TIMEOUT value: {scrape_timeout_str}. Defaulting to 120000ms.") - # scrape_timeout = 120000 - - # from database import SessionLocal # Already imported at the top - db: Session = SessionLocal() # SessionLocal now uses DATABASE_URL from config.py via database.py - try: - logging.info(f"Starting scraper and DB update for URL: {AUTOTRADER_URL}, Headless: {HEADLESS_BROWSER}, Timeout: {SCRAPE_TIMEOUT}ms") - stats = await scrape_autotrader_and_update_db( - db=db, - autotrader_url=AUTOTRADER_URL, - headless=HEADLESS_BROWSER, - scrape_timeout=SCRAPE_TIMEOUT - ) - logging.info(f"Scraping and DB update completed: {stats}") - except Exception as e: - logging.error(f"Error during scraping and DB update in main: {e}", exc_info=True) - finally: - logging.info("Closing DB session in main.") - db.close() - -if __name__ == "__main__": - # To run this: - # 1. Ensure Playwright browsers are installed: `playwright install chromium` - # 2. Set environment variables if needed (AUTOTRADER_URL, HEADLESS_BROWSER, SCRAPE_TIMEOUT) - # 3. Uncomment the line below - asyncio.run(main()) - # pass # Keep it passive for now, to be run manually when needed diff --git a/config.py b/config.py deleted file mode 100644 index 44148ca..0000000 --- a/config.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -from dotenv import load_dotenv - -# Load environment variables from .env file if it exists -# This is useful for local development. 
-load_dotenv() - -# Database Configuration -DATABASE_URL: str = os.getenv("DATABASE_URL", "sqlite:///./data/vehicle_tracker.db") - -# Scraper Configuration -AUTOTRADER_URL: str = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/private-seller") -SCRAPE_TIMEOUT: int = int(os.getenv("SCRAPE_TIMEOUT", "120000")) # Milliseconds -HEADLESS_BROWSER: bool = os.getenv("HEADLESS_BROWSER", "True").lower() == "true" - -# API Configuration (if any specific ones are needed later) -# Example: API_HOST: str = os.getenv("API_HOST", "0.0.0.0") -# Example: API_PORT: int = int(os.getenv("API_PORT", "8000")) - -# Logging Configuration (can also be added here if more complex) -LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO").upper() - -# Ensure critical URLs have a scheme for robustness -if not AUTOTRADER_URL.startswith(("http://", "https://")): - # This print statement is for immediate feedback during startup/import. - # In a pure library, side effects on import are sometimes discouraged, - # but for an application's main config, it's often acceptable. - print(f"Warning: AUTOTRADER_URL ('{AUTOTRADER_URL}') did not have a scheme, prepended https://.") - AUTOTRADER_URL = "https://" + AUTOTRADER_URL - print(f"Corrected AUTOTRADER_URL: {AUTOTRADER_URL}") - - -# Example of how to handle SQLite connect_args based on config -DB_CONNECT_ARGS: dict = {"check_same_thread": False} if DATABASE_URL.startswith("sqlite") else {} diff --git a/database.py b/database.py deleted file mode 100644 index 58a11dc..0000000 --- a/database.py +++ /dev/null @@ -1,26 +0,0 @@ -from sqlalchemy import Column, Integer, String, DateTime, JSON, create_engine -from sqlalchemy.orm import declarative_base, sessionmaker, Session -from config import DATABASE_URL, DB_CONNECT_ARGS # Import from config - -# Use imported configuration -engine = create_engine(DATABASE_URL, connect_args=DB_CONNECT_ARGS) - -SessionLocal = sessionmaker(bind=engine, autoflush=False) -Base = declarative_base() - -class CarListing(Base): - __tablename__ = "listings" - id = Column(Integer, primary_key=True, index=True) - platform = Column(String) - extracted_at = Column(DateTime) - source_url = Column(String, unique=True) - data_points = Column(JSON) - -Base.metadata.create_all(bind=engine) - -def get_db(): - db = SessionLocal() - try: - yield db - finally: - db.close() diff --git a/render.yaml b/render.yaml index b3c7392..f755e35 100644 --- a/render.yaml +++ b/render.yaml @@ -11,9 +11,19 @@ services: value: 3.11 - key: DATABASE_URL generateValue: true - - key: AUTOTRADER_URL - value: "https://www.autotrader.com/cars-for-sale/private-seller" - - key: SCRAPE_TIMEOUT - value: 120000 - - key: HEADLESS_BROWSER + - key: HEADLESS value: "True" + - key: BROWSER_TIMEOUT + value: "60000" + - key: PAGE_DELAY + value: "5000" + - key: MIN_DELAY_BETWEEN_ACTIONS + value: "2.5" + - key: MAX_LISTINGS_PER_SESSION + value: "25" + - key: PROXY_SERVER + value: "" + - key: PROXY_USERNAME + value: "" + - key: PROXY_PASSWORD + value: "" diff --git a/src/api/routes.py b/src/api/routes.py index c1f5764..612fb3f 100644 --- a/src/api/routes.py +++ b/src/api/routes.py @@ -121,8 +121,10 @@ async def api_v1_root_info(): @router.get("/vehicles/", response_model=List[VehicleListingResponse]) async def get_all_vehicles( + skip: int = Query(0, ge=0), + limit: int = Query(settings.MAX_LISTINGS_PER_SESSION, ge=1, le=200), db: AsyncSession = Depends(get_db), - filters: SearchFilters = Depends() + filters: SearchFilters = Depends(), ): query = select(VehicleListing) conditions = [] @@ -151,7 
+153,7 @@ async def get_all_vehicles( if conditions: query = query.where(and_(*conditions)) query = query.order_by(VehicleListing.last_scraped_at.desc(), VehicleListing.created_at.desc()) - result = await db.execute(query) + result = await db.execute(query.offset(skip).limit(limit)) vehicles = result.scalars().all() response_vehicles = [] for vehicle_db_item in vehicles: diff --git a/src/automation/browser_sim.py b/src/automation/browser_sim.py index 96ed36f..c93f204 100644 --- a/src/automation/browser_sim.py +++ b/src/automation/browser_sim.py @@ -25,8 +25,16 @@ async def __aenter__(self): logger.info("Initializing AutoTrader Scraper...") self.playwright_instance = await async_playwright().start() try: + proxy_cfg = None + if settings.PROXY_SERVER: + proxy_cfg = {"server": settings.PROXY_SERVER} + if settings.PROXY_USERNAME and settings.PROXY_PASSWORD: + proxy_cfg["username"] = settings.PROXY_USERNAME + proxy_cfg["password"] = settings.PROXY_PASSWORD + self.browser = await self.playwright_instance.chromium.launch( headless=settings.HEADLESS, + proxy=proxy_cfg, args=[ '--no-sandbox', '--disable-setuid-sandbox', diff --git a/src/config.py b/src/config.py index 598bc2e..94a9b77 100644 --- a/src/config.py +++ b/src/config.py @@ -13,4 +13,9 @@ class Settings: API_PORT: int = int(os.getenv("API_PORT", "8000")) MAX_LISTINGS_PER_SESSION: int = int(os.getenv("MAX_LISTINGS_PER_SESSION", "25")) + # Proxy configuration + PROXY_SERVER: str | None = os.getenv("PROXY_SERVER") + PROXY_USERNAME: str | None = os.getenv("PROXY_USERNAME") + PROXY_PASSWORD: str | None = os.getenv("PROXY_PASSWORD") + settings = Settings() From bdd4820e71f0effb98c57a75fd3cef963aac810e Mon Sep 17 00:00:00 2001 From: hellothere012 Date: Wed, 4 Jun 2025 08:04:42 -0700 Subject: [PATCH 2/4] feat: add proxy support and cleanup --- .env.example | 37 +-- Procfile | 2 +- README.md | 48 +++- app.py | 123 ---------- app/crud.py | 50 ---- app/database.py | 25 -- app/main.py | 265 --------------------- app/models.py | 45 ---- app/schemas.py | 41 ---- app/scraper.py | 373 ------------------------------ config.py | 34 --- database.py | 26 --- main.py | 93 ++++++++ render.yaml | 26 ++- requirements.txt | 15 +- {app => src}/__init__.py | 0 src/api/__init__.py | 0 src/api/routes.py | 237 +++++++++++++++++++ src/automation/__init__.py | 0 src/automation/browser_sim.py | 420 ++++++++++++++++++++++++++++++++++ src/config.py | 21 ++ src/database.py | 28 +++ src/models/__init__.py | 0 src/models/vehicle.py | 84 +++++++ 24 files changed, 975 insertions(+), 1018 deletions(-) delete mode 100644 app.py delete mode 100644 app/crud.py delete mode 100644 app/database.py delete mode 100644 app/main.py delete mode 100644 app/models.py delete mode 100644 app/schemas.py delete mode 100644 app/scraper.py delete mode 100644 config.py delete mode 100644 database.py create mode 100644 main.py rename {app => src}/__init__.py (100%) create mode 100644 src/api/__init__.py create mode 100644 src/api/routes.py create mode 100644 src/automation/__init__.py create mode 100644 src/automation/browser_sim.py create mode 100644 src/config.py create mode 100644 src/database.py create mode 100644 src/models/__init__.py create mode 100644 src/models/vehicle.py diff --git a/.env.example b/.env.example index f7b9b12..4744016 100644 --- a/.env.example +++ b/.env.example @@ -1,21 +1,22 @@ -# Autotrader Configuration -AUTOTRADER_URL="https://www.autotrader.com/cars-for-sale/by-owner/fullerton-ca?zip=92833&searchRadius=50&numRecords=100&sortBy=priceDESC" +# Database Configuration 
+DATABASE_URL=sqlite+aiosqlite:///./vehicle_data.db -# Webshare Proxy Configuration -PROXY_HOST="your_webshare_proxy_host" -PROXY_PORT="your_webshare_proxy_port" -WEBSHARE_USERNAME="your_webshare_username" -WEBSHARE_PASSWORD="your_webshare_password" +# Browser Configuration +HEADLESS=true +BROWSER_TIMEOUT=60000 +PAGE_DELAY=5000 +MIN_DELAY_BETWEEN_ACTIONS=2.5 -# Database Configuration -# For local SQLite (default): -DATABASE_URL="sqlite+aiosqlite:///./data/vehicle_tracker.db" -DATABASE_TYPE="sqlite" -# Example for PostgreSQL: -# DATABASE_URL="postgresql+asyncpg://user:password@host:port/dbname" -# DATABASE_TYPE="postgresql" +# API Configuration +API_HOST=127.0.0.1 +API_PORT=8000 + +# Scraping Limits +MAX_LISTINGS_PER_SESSION=25 -# Application Configuration -LOG_LEVEL="INFO" -HEADLESS_BROWSER="True" # For Playwright -SCRAPE_TIMEOUT="120000" # For Playwright page/navigation timeout (milliseconds) +# Optional Proxy Configuration +# If using rotating proxies (e.g., Webshare), uncomment and provide the proxy URL. +# Example: http://username:password@proxyhost:port +# PROXY_SERVER= +# PROXY_USERNAME= +# PROXY_PASSWORD= diff --git a/Procfile b/Procfile index 3972b54..84b6dde 100644 --- a/Procfile +++ b/Procfile @@ -1 +1 @@ -web: uvicorn app.main:app --host=0.0.0.0 --port=${PORT:-8000} +web: uvicorn main:app --host=0.0.0.0 --port=${PORT:-8000} diff --git a/README.md b/README.md index 894b33c..7525e16 100644 --- a/README.md +++ b/README.md @@ -1 +1,47 @@ -# vehicle-tracker \ No newline at end of file +# Vehicle Tracker + +This project provides a FastAPI-based API and web scraper for collecting and storing vehicle listings from sites like AutoTrader. It uses Playwright for scraping and SQLAlchemy with SQLite for storage. + +## Usage + +1. Install dependencies: + ```bash + pip install -r requirements.txt + playwright install chromium + ``` +2. Copy `.env.example` to `.env` and adjust settings as needed. +3. Run the API: + ```bash + uvicorn main:app --reload + ``` +4. Trigger scraping via the `/api/v1/vehicles/scrape` endpoint. + +The scraper can also be run standalone: +```bash +python main.py scrape_test +``` + +## Environment Variables + +Set the following variables in a `.env` file or your deployment environment: + +| Variable | Description | Default | +| --- | --- | --- | +| `DATABASE_URL` | Database connection URL | `sqlite+aiosqlite:///./vehicle_data.db` | +| `HEADLESS` | Run the browser in headless mode | `true` | +| `BROWSER_TIMEOUT` | Playwright launch timeout (ms) | `60000` | +| `PAGE_DELAY` | Base delay after page loads (ms) | `5000` | +| `MIN_DELAY_BETWEEN_ACTIONS` | Delay between scraping actions (s) | `2.5` | +| `API_HOST` | Host for the FastAPI server | `127.0.0.1` | +| `API_PORT` | Port for the FastAPI server | `8000` | +| `MAX_LISTINGS_PER_SESSION` | Maximum listings fetched per scrape | `25` | +| `PROXY_SERVER` | *(Optional)* Proxy URL for Playwright | - | +| `PROXY_USERNAME` | *(Optional)* Proxy username | - | +| `PROXY_PASSWORD` | *(Optional)* Proxy password | - | + +### Pagination + +The `/api/v1/vehicles/` endpoint accepts `skip` and `limit` query parameters to paginate results. +Example: `/api/v1/vehicles/?skip=25&limit=25`. 
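+
+For example, assuming the API is running locally on the default host and port, the second page of 25 results can be fetched with:
+
+```bash
+curl "http://127.0.0.1:8000/api/v1/vehicles/?skip=25&limit=25"
+```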
+ + diff --git a/app.py b/app.py deleted file mode 100644 index 8fa8500..0000000 --- a/app.py +++ /dev/null @@ -1,123 +0,0 @@ -import logging -# import os # No longer needed for getenv in background task -import asyncio -from fastapi import FastAPI, Depends, BackgroundTasks -from pydantic import BaseModel -from typing import Dict -from datetime import datetime -from database import CarListing, get_db, Session, SessionLocal -from scraper import scrape_autotrader_and_update_db -from fastapi.middleware.cors import CORSMiddleware -from config import AUTOTRADER_URL, HEADLESS_BROWSER, SCRAPE_TIMEOUT, LOG_LEVEL # Import from config - -# Configure basic logging using LOG_LEVEL from config -# Ensure this is called only once. If FastAPI/Uvicorn also configures logging, -# this might need adjustment or to be handled by the logger instance directly. -# For now, assume this is the primary logging config. -logging.basicConfig(level=LOG_LEVEL, format='%(asctime)s - %(levelname)s - %(message)s', force=True) -# Added force=True to ensure this config takes precedence if uvicorn also tries to set basicConfig. - -app = FastAPI() -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_methods=["*"], - allow_headers=["*"], -) - -class CarListingRaw(BaseModel): - platform: str - extracted_at: datetime - source_url: str - data_points: Dict - -@app.post("/api/v1/listings/ingest") -async def ingest_listing(payload: CarListingRaw, db: Session = Depends(get_db)): - listing = CarListing( - platform=payload.platform, - extracted_at=payload.extracted_at, - source_url=payload.source_url, - data_points=payload.data_points - ) - db.add(listing) - db.commit() - db.refresh(listing) - return {"status": "saved", "listing_id": listing.id} - -@app.get("/") -def read_root(): - return {"message": "πŸš— Car Tracker API is running!"} - -# Global variable to store scraping status -scrape_status = { - "last_run_time": None, - "status": "idle", # States: idle, running, success, error - "message": "", - "added": 0, - "updated": 0, - "scraped_count": 0 -} - -# Background task wrapper -async def _background_scraper_task_wrapper(): - global scrape_status - db_task_session: Session = SessionLocal() - logging.info("Background scraper task started.") - scrape_status["status"] = "running" - scrape_status["message"] = "Scraping in progress..." - scrape_status["last_run_time"] = datetime.utcnow().isoformat() - scrape_status["added"] = 0 # Reset counts for current run - scrape_status["updated"] = 0 - scrape_status["scraped_count"] = 0 - - try: - # Use imported config values - # autotrader_url = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/private-seller") - # headless_str = os.getenv("HEADLESS_BROWSER", "True") - # headless = headless_str.lower() == "true" - # scrape_timeout_str = os.getenv("SCRAPE_TIMEOUT", "120000") - # try: - # scrape_timeout = int(scrape_timeout_str) - # except ValueError: - # logging.warning(f"Invalid SCRAPE_TIMEOUT value: {scrape_timeout_str}. Defaulting to 120000ms.") - # scrape_timeout = 120000 - - logging.info(f"Background task using URL: {AUTOTRADER_URL}, Headless: {HEADLESS_BROWSER}, Timeout: {SCRAPE_TIMEOUT}ms") - - result = await scrape_autotrader_and_update_db( - db=db_task_session, - autotrader_url=AUTOTRADER_URL, - headless=HEADLESS_BROWSER, - scrape_timeout=SCRAPE_TIMEOUT - ) - - if result.get("status") == "success": - scrape_status["status"] = "success" - scrape_status["message"] = "Scraping completed successfully." 
- scrape_status["added"] = result.get("added", 0) - scrape_status["updated"] = result.get("updated", 0) - scrape_status["scraped_count"] = result.get("scraped_count", 0) - else: - scrape_status["status"] = "error" - scrape_status["message"] = result.get("message", "Scraping failed with an unknown error.") - - logging.info(f"Background scraper task completed: {result}") - - except Exception as e: - logging.error(f"Error in background scraper task: {e}", exc_info=True) - scrape_status["status"] = "error" - scrape_status["message"] = str(e) - finally: - db_task_session.close() - logging.info("Background scraper DB session closed.") - -@app.post("/api/v1/scrape/autotrader") -async def trigger_autotrader_scrape(background_tasks: BackgroundTasks): - if scrape_status["status"] == "running": - return {"message": "AutoTrader scraping job is already running."} - background_tasks.add_task(_background_scraper_task_wrapper) - return {"message": "AutoTrader scraping job started in the background."} - -@app.get("/api/v1/scrape/status") -async def get_scrape_status(): - return scrape_status diff --git a/app/crud.py b/app/crud.py deleted file mode 100644 index fb8708b..0000000 --- a/app/crud.py +++ /dev/null @@ -1,50 +0,0 @@ -from sqlalchemy.orm import Session -from . import models, schemas -from datetime import datetime - -def get_car_listing_by_url(db: Session, url: str): - return db.query(models.ScrapedData).filter(models.ScrapedData.url == url).first() - -def create_car_listing(db: Session, listing: schemas.CarListingCreate): - db_listing = models.ScrapedData( - job_id=listing.job_id, - platform=listing.platform, - url=str(listing.url), # Ensure HttpUrl is converted to string - title=listing.title, - price=listing.price, - mileage=listing.mileage, - vin=listing.vin, - image_urls=listing.image_urls, # Assuming image_urls is already a list of strings or compatible JSON - raw_data=listing.raw_data, - scraped_at=datetime.utcnow() - ) - db.add(db_listing) - db.commit() - db.refresh(db_listing) - return db_listing - -def create_scrape_job(db: Session) -> models.ScrapeJob: - db_job = models.ScrapeJob(timestamp=datetime.utcnow(), status="pending") - db.add(db_job) - db.commit() - db.refresh(db_job) - return db_job - -def update_scrape_job_status(db: Session, job_id: int, status: str, results_count: int = 0, error_message: str = None): - db_job = db.query(models.ScrapeJob).filter(models.ScrapeJob.id == job_id).first() - if db_job: - db_job.status = status - db_job.results_count = results_count - db_job.error_message = error_message - db.commit() - db.refresh(db_job) - return db_job - -def get_scrape_job(db: Session, job_id: int): - return db.query(models.ScrapeJob).filter(models.ScrapeJob.id == job_id).first() - -def get_all_scrape_jobs(db: Session, skip: int = 0, limit: int = 100): - return db.query(models.ScrapeJob).order_by(models.ScrapeJob.timestamp.desc()).offset(skip).limit(limit).all() - -def get_listings_for_job(db: Session, job_id: int, skip: int = 0, limit: int = 100): - return db.query(models.ScrapedData).filter(models.ScrapedData.job_id == job_id).offset(skip).limit(limit).all() diff --git a/app/database.py b/app/database.py deleted file mode 100644 index bf32154..0000000 --- a/app/database.py +++ /dev/null @@ -1,25 +0,0 @@ -from sqlalchemy import create_engine -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import sessionmaker -import os - -DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///./data/vehicle_tracker.db") - -engine_args = {} -if 
DATABASE_URL.startswith("sqlite"): - engine_args["connect_args"] = {"check_same_thread": False} - -engine = create_engine(DATABASE_URL, **engine_args) -SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) - -Base = declarative_base() - -def get_db(): - db = SessionLocal() - try: - yield db - finally: - db.close() - -def create_tables(): - Base.metadata.create_all(bind=engine) diff --git a/app/main.py b/app/main.py deleted file mode 100644 index 4817f15..0000000 --- a/app/main.py +++ /dev/null @@ -1,265 +0,0 @@ -import logging -import os -from fastapi import FastAPI, Depends, HTTPException, BackgroundTasks -from sqlalchemy.orm import Session -from typing import List - -from . import crud, models, schemas, scraper -from .database import SessionLocal, engine - -# Create database tables if they don't exist -models.Base.metadata.create_all(bind=engine) - -# Configure logging -LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper() -logging.basicConfig(level=LOG_LEVEL, format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) - -app = FastAPI(title="AutoTrader Scraper API", version="1.0.0") - -# Dependency to get DB session -def get_db(): - db = SessionLocal() - try: - yield db - finally: - db.close() - -# Global variable to store scraping status (simple approach) -scrape_status = { - "job_id": None, - "status": "idle", # States: idle, pending, running, completed, failed - "message": "No scraping job initiated yet.", - "last_run_time": None, - "duration_seconds": None, - "results_count": 0, - "error_message": None -} - -async def run_scraping_task(job_id: int, autotrader_url: str, headless: bool, scrape_timeout: int): - """ - The actual scraping task that runs in the background. - It creates its own database session. - """ - global scrape_status - db: Session = SessionLocal() - try: - logger.info(f"Background task started for job_id: {job_id}") - crud.update_scrape_job_status(db, job_id, status="running") - scrape_status.update({ - "job_id": job_id, - "status": "running", - "message": f"Scraping from {autotrader_url}...", - "last_run_time": datetime.utcnow().isoformat(), - "duration_seconds": None, - "results_count": 0, - "error_message": None - }) - - start_time = datetime.utcnow() - - scraped_data_list = await scraper.scrape_autotrader_data( - autotrader_url=autotrader_url, - headless=headless, - timeout=scrape_timeout - ) - - end_time = datetime.utcnow() - duration = (end_time - start_time).total_seconds() - scrape_status["duration_seconds"] = round(duration, 2) - - added_count = 0 - updated_count = 0 # Placeholder for future update logic - - if not scraped_data_list: - logger.info(f"No listings found for job_id: {job_id}") - crud.update_scrape_job_status(db, job_id, status="completed", results_count=0) - scrape_status.update({ - "status": "completed", - "message": "Scraping completed. 
No new listings found or page was inaccessible.", - "results_count": 0 - }) - return - - for item_data in scraped_data_list: - # Ensure all required fields for CarListingCreate are present - listing_create = schemas.CarListingCreate( - job_id=job_id, - platform=item_data.get("source_name", "autotrader"), # Get platform from scraper or default - url=item_data.get("listing_url"), - title=item_data.get("title"), - price=item_data.get("price"), - mileage=item_data.get("mileage"), - vin=item_data.get("vin"), - image_urls=item_data.get("image_urls", []), - raw_data=item_data.get("data_points", {}) - ) - - existing_listing = crud.get_car_listing_by_url(db, str(listing_create.url)) - if existing_listing: - # For now, we just count updates. Actual update logic could be added here. - # e.g., existing_listing.price = listing_create.price - # existing_listing.extracted_at = datetime.utcnow() - updated_count += 1 - else: - crud.create_car_listing(db=db, listing=listing_create) - added_count += 1 - - crud.update_scrape_job_status(db, job_id, status="completed", results_count=added_count) - scrape_status.update({ - "status": "completed", - "message": f"Scraping finished. Added: {added_count}, Updated: {updated_count} (placeholder).", - "results_count": added_count + updated_count # Or just added_count if updates aren't really changing data - }) - logger.info(f"Background task for job_id: {job_id} completed. Added: {added_count}, Updated: {updated_count}") - - except Exception as e: - logger.error(f"Error in background scraper task for job_id {job_id}: {e}", exc_info=True) - crud.update_scrape_job_status(db, job_id, status="failed", error_message=str(e)) - scrape_status.update({ - "status": "failed", - "message": f"Error during scraping: {str(e)}", - "error_message": str(e) - }) - finally: - db.close() - logger.info(f"DB session closed for job_id: {job_id}") - - -@app.post("/scrape/", response_model=schemas.ScrapeJob, status_code=202) -async def trigger_scrape(background_tasks: BackgroundTasks, db: Session = Depends(get_db)): - """ - Triggers a new scraping job for Autotrader. - """ - global scrape_status - if scrape_status.get("status") == "running": - raise HTTPException(status_code=409, detail="A scraping job is already in progress.") - - autotrader_url = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/all-cars/cars-under-10000") # Default to a common search if not set - headless_str = os.getenv("HEADLESS_BROWSER", "True") - headless = headless_str.lower() == "true" - scrape_timeout_str = os.getenv("SCRAPE_TIMEOUT", "120000") - - try: - scrape_timeout = int(scrape_timeout_str) - except ValueError: - scrape_timeout = 120000 # Default timeout if parsing fails - logger.warning(f"Invalid SCRAPE_TIMEOUT value: {scrape_timeout_str}. 
Using default {scrape_timeout}ms.") - - job = crud.create_scrape_job(db) - scrape_status.update({ - "job_id": job.id, - "status": "pending", - "message": f"Scraping job {job.id} initiated for URL: {autotrader_url}", - "last_run_time": job.timestamp.isoformat(), - "duration_seconds": None, - "results_count": 0, - "error_message": None - }) - - # Pass job_id to the background task - background_tasks.add_task(run_scraping_task, job.id, autotrader_url, headless, scrape_timeout) - - logger.info(f"Scraping job {job.id} queued for URL: {autotrader_url}") - return job - -@app.post("/api/v1/listings/ingest", response_model=schemas.CarListing, status_code=201) -async def ingest_listing(payload: schemas.CarListingCreate, db: Session = Depends(get_db)): - """ - Ingests a new car listing into the database. - This endpoint is useful for manually adding or testing data. - """ - # Check if listing with this URL already exists to prevent duplicates, - # though the database constraint should also handle this. - db_listing = crud.get_car_listing_by_url(db, url=str(payload.url)) - if db_listing: - raise HTTPException(status_code=400, detail="Listing with this URL already exists.") - - # The job_id in CarListingCreate might be problematic if this is a direct ingest - # not tied to a specific scrape job. For now, we'll assume it's provided or - # we could adjust the schema/logic if direct ingestion shouldn't have a job_id. - # For testing, we might need to create a dummy job or adjust schema. - # Let's assume for now a valid job_id is provided or handle it if not. - if not payload.job_id: - # Create a dummy job or handle as per requirements for listings not tied to a job - # For simplicity, let's assume job_id is optional in the schema for this use case - # or a default/placeholder job_id is used. - # For this test, the payload includes job_id, so we'll proceed. - # If CarListingCreate schema requires job_id, this endpoint needs to handle it. - # For now, let's assume it's provided in the payload. - pass - - try: - created_listing = crud.create_car_listing(db=db, listing=payload) - return created_listing - except Exception as e: - logger.error(f"Error ingesting listing: {e}", exc_info=True) - raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") - - -@app.get("/scrape/status", response_model=schemas.ScrapeJob) # Using ScrapeJob schema for better structure -async def get_current_scrape_status(db: Session = Depends(get_db)): - """ - Returns the status of the current or last scraping job. - """ - global scrape_status - if scrape_status.get("job_id"): - job = crud.get_scrape_job(db, scrape_status["job_id"]) - if job: - # Update status from DB if available, otherwise use in-memory for simplicity - # A more robust system might always fetch from DB or use a proper job queue status - return job - return scrape_status # Fallback to in-memory status if job not found or not started - -@app.get("/scrape/jobs/", response_model=List[schemas.ScrapeJob]) -async def read_jobs(skip: int = 0, limit: int = 10, db: Session = Depends(get_db)): - """ - Retrieve all scrape jobs. - """ - jobs = crud.get_all_scrape_jobs(db, skip=skip, limit=limit) - return jobs - -@app.get("/scrape/jobs/{job_id}/results", response_model=List[schemas.CarListing]) -async def read_job_results(job_id: int, skip: int = 0, limit: int = 10, db: Session = Depends(get_db)): - """ - Retrieve results for a specific scrape job. 
- """ - job = crud.get_scrape_job(db, job_id=job_id) - if job is None: - raise HTTPException(status_code=404, detail="Job not found") - listings = crud.get_listings_for_job(db, job_id=job_id, skip=skip, limit=limit) - return listings - -@app.get("/") -async def read_root(): - return {"message": "AutoTrader Scraper API is running!"} - -# This is for local development if you run `python app/main.py` -# Uvicorn will be started by Procfile in production environments like Heroku -if __name__ == "__main__": - # Ensure tables are created before starting the app if they don't exist - # This is useful for local development but might be handled differently in production - from .database import create_tables - create_tables() - - # Get port from environment variable or default to 8000 - port = int(os.getenv("PORT", "8000")) - uvicorn.run(app, host="0.0.0.0", port=port) - -# Remove the old main.py content if it exists in the root directory -# This is now handled by app/main.py -# Ensure Procfile points to app.main:app or similar based on your directory structure -# e.g., web: uvicorn app.main:app --host=0.0.0.0 --port=${PORT:-8000} -# (Assuming app.py is moved to app/main.py) -# If app.py remains in root, then Procfile is fine. - -# The `models.Base.metadata.create_all(bind=engine)` should ideally be called once, -# perhaps in main.py or a startup script, not every time database.py is imported. -# For simplicity in this single-file app structure, it's often put there. -# If app.py is the main entry point for uvicorn, it's a good place. -# For Render, buildCommand in render.yaml can also handle migrations/table creation. - -# Let's ensure the imports are correct considering the file structure -# If main.py is in root and imports from app/, it should be `from app import crud, models, schemas, scraper` -# If this file is app/main.py, then `from . import crud, models, schemas, scraper` is correct. -# The prompt implies this file is app/main.py. 
diff --git a/app/models.py b/app/models.py deleted file mode 100644 index b0d4e5d..0000000 --- a/app/models.py +++ /dev/null @@ -1,45 +0,0 @@ -from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, JSON -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship -from datetime import datetime - -Base = declarative_base() - -class ScrapeResult(Base): - __tablename__ = "scrape_results" - - id = Column(Integer, primary_key=True, index=True) - url = Column(String, index=True) - title = Column(String) - price = Column(String, nullable=True) # Store as string to handle variations like 'Contact Seller' - mileage = Column(String, nullable=True) # Store as string to handle non-numeric values - vin = Column(String, nullable=True, unique=True) - images = Column(JSON, nullable=True) # Store list of image URLs - scraped_at = Column(DateTime) - details = Column(JSON, nullable=True) # Store other details as JSON - -class ScrapeJob(Base): - __tablename__ = "scrape_jobs" - - id = Column(Integer, primary_key=True, index=True) - timestamp = Column(DateTime, default=datetime.utcnow) - status = Column(String, default="pending") # e.g., pending, running, completed, failed - results_count = Column(Integer, default=0) - error_message = Column(String, nullable=True) - -class ScrapedData(Base): - __tablename__ = "scraped_data" - - id = Column(Integer, primary_key=True, index=True) - job_id = Column(Integer, ForeignKey("scrape_jobs.id")) - platform = Column(String) # e.g., 'autotrader', 'cars.com' - url = Column(String, unique=True, index=True) - title = Column(String, nullable=True) - price = Column(String, nullable=True) - mileage = Column(String, nullable=True) - vin = Column(String, nullable=True, index=True) - image_urls = Column(JSON, nullable=True) # List of image URLs - raw_data = Column(JSON, nullable=True) # Full raw data if needed - scraped_at = Column(DateTime, default=datetime.utcnow) - - job = relationship("ScrapeJob") diff --git a/app/schemas.py b/app/schemas.py deleted file mode 100644 index 2ee0d8d..0000000 --- a/app/schemas.py +++ /dev/null @@ -1,41 +0,0 @@ -from pydantic import BaseModel, HttpUrl -from typing import List, Optional, Dict, Any -from datetime import datetime - -class CarListingBase(BaseModel): - url: HttpUrl - title: Optional[str] = None - price: Optional[str] = None # Keep as string to handle variations - mileage: Optional[str] = None # Keep as string - vin: Optional[str] = None - image_urls: Optional[List[HttpUrl]] = [] - raw_data: Optional[Dict[str, Any]] = {} # For any other unstructured data - -class CarListingCreate(CarListingBase): - platform: str - job_id: int - -class CarListing(CarListingBase): - id: int - platform: str - job_id: int - scraped_at: datetime - - class Config: - orm_mode = True - -class ScrapeJobBase(BaseModel): - pass - -class ScrapeJobCreate(ScrapeJobBase): - pass - -class ScrapeJob(ScrapeJobBase): - id: int - timestamp: datetime - status: str - results_count: int = 0 - error_message: Optional[str] = None - - class Config: - orm_mode = True diff --git a/app/scraper.py b/app/scraper.py deleted file mode 100644 index b946ecb..0000000 --- a/app/scraper.py +++ /dev/null @@ -1,373 +0,0 @@ -import asyncio -import logging -# import os # No longer needed for getenv in main -import datetime # Keep for now, might be used in data processing -from playwright.async_api import async_playwright -# Required for main test function -from config import AUTOTRADER_URL, HEADLESS_BROWSER, SCRAPE_TIMEOUT -# DATABASE_URL is used 
by database.py, SessionLocal will pick it up via config - -# Assuming database.py is in the same directory or accessible in PYTHONPATH -from database import get_db, CarListing, SessionLocal # Added SessionLocal for main example -from sqlalchemy.orm import Session -from datetime import datetime # Ensure datetime is imported directly - -# Configure basic logging -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') - -from stealth_utils import apply_stealth_js # Import new stealth utility -# from playwright_stealth import stealth_async # Commenting out old stealth - -class AutoTraderScraper: - """Scraper for AutoTrader private party listings using Playwright.""" - - def __init__(self, source_name: str = "autotrader"): - """ - Initializes the AutoTraderScraper. - Args: - source_name (str): Name of the source platform. - """ - self.source_name = source_name - # Potentially load other configs from a config file or env vars here - # For example: self.base_url = "https://www.autotrader.com/cars-for-sale/private-seller" - - async def get_private_listings(self, autotrader_url: str, headless: bool, timeout: int = 120000) -> list[dict]: - """ - Scrapes private party listings from AutoTrader using Playwright. - - Args: - autotrader_url (str): The starting URL for scraping AutoTrader private listings. - headless (bool): Whether to run the browser in headless mode. - timeout (int): Maximum time in milliseconds for page operations. - - Returns: - list[dict]: A list of dictionaries, where each dictionary represents a scraped vehicle listing. - """ - listings_data = [] - browser = None - - launch_options = { - "headless": headless, - "args": [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-infobars', - '--window-position=0,0', - '--ignore-certificate-errors', - '--ignore-certificate-errors-spki-list', - # '--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"' # User agent is set in context - '--disable-gpu' # Already there but keep - ], - # "channel": "chrome" # This might require full Chrome install, trying without first to see if args help - } - - # Try with 'msedge' or 'chrome' if default chromium fails and they are available - # For now, stick to chromium and args. If 'channel' is needed, it's a bigger setup change. - - async with async_playwright() as p: - try: - # browser = await p.chromium.launch(**launch_options) # Default chromium - # Let's try specifying channel, assuming it might use a locally installed Chrome if available, or a Playwright-managed one. - # This is a common suggestion if the default Playwright Chromium build is too easily detected. - # If "chrome" channel is not found by Playwright, it will error. - try: - browser = await p.chromium.launch( - **launch_options, - channel="chrome" # Attempt to use a branded Chrome build - ) - logging.info("Attempting to launch with channel='chrome'") - except Exception as e_channel: - logging.warning(f"Failed to launch with channel='chrome' ({e_channel}). 
Falling back to default Playwright Chromium.") - # Remove channel from launch_options if it failed - launch_options_no_channel = launch_options.copy() - if "channel" in launch_options_no_channel: # Should not be needed based on above structure but good practice - del launch_options_no_channel["channel"] - browser = await p.chromium.launch(**launch_options_no_channel) - logging.info("Launched with default Playwright Chromium.") - - - context = await browser.new_context( - user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36', # A fairly common user agent - java_script_enabled=True, - ) - context.set_default_navigation_timeout(timeout) - context.set_default_timeout(timeout) - - page = await context.new_page() - await page.set_viewport_size({"width": 1920, "height": 1080}) - - # Apply custom JS stealth - await apply_stealth_js(page) - - logging.info(f"Navigating to {autotrader_url}") - await page.goto(autotrader_url, wait_until="domcontentloaded", timeout=timeout) # Reverted to domcontentloaded - - title = await page.title() - logging.info(f"Page title: {title}") - - if "unavailable" in title.lower() or "block" in title.lower() or "access denied" in title.lower(): - logging.critical(f"Failed to load AutoTrader listings page. Blocked by website. Title: {title}") - await browser.close() # Ensure browser is closed before returning - return [] - - # Using speculative selectors for AutoTrader - # Main container for listings: 'div[data-qaid="cntnr-lstng-main"]' (this might be too broad or incorrect) - # A more specific item selector might be needed, e.g., an article or a div with a specific class. - # For now, let's assume individual listing cards can be found with a selector like: - # "div.inventory-listing" or "div[data-cmp='inventoryListing']" - these are common patterns. - # The provided example 'div[data-qaid="cntnr-lstng-main"]' seems like it might be a single container FOR ALL listings. - # Let's try a more specific (but still guessed) selector for individual listing items. - # A common pattern is items within a list or grid. Let's try to find items: - # This selector is a **GUESS** based on common AutoTrader structures. - listing_item_selector = "div[data-cmp='inventoryListing']" # GUESS - - # Fallback if the primary guess doesn't work, try another common pattern - # listing_item_selector_fallback = "div.inventory-listing.new-listing.stub" # Another GUESS - - # await page.wait_for_selector(listing_item_selector, timeout=15000) # Wait for items to appear - - listing_containers = await page.query_selector_all(listing_item_selector) - - # if not listing_containers: - # logging.info(f"No listings found with primary selector '{listing_item_selector}'. 
Trying fallback...") - # listing_containers = await page.query_selector_all(listing_item_selector_fallback) - - logging.info(f"Found {len(listing_containers)} potential listing containers using selector '{listing_item_selector}'.") - - processed_count = 0 - # first_container_processed_for_html_dump = False # REMOVE HTML DUMP FLAG - for i, container in enumerate(listing_containers): - url_path = None - title_text = "N/A" # Default to N/A - price_text = "N/A" # Default to N/A - mileage_text = "N/A" # Default to N/A (as it's not reliably on card) - listing_url = None - - try: - logging.debug(f"Processing container {i+1}/{len(listing_containers)}") - - # Attempt to get Title - title_el = await container.query_selector("h2[data-cmp='subheading']") # Updated selector from HTML dump - if title_el: - raw_title_text = await title_el.inner_text() - title_text = raw_title_text.strip() if raw_title_text else "N/A" - - # Attempt to get URL from parent of title_el - # Playwright's query_selector does not directly support xpath like "ancestor::a". - # A common structure is
<a href="..."><h2>...</h2></a> or <div ...><a href="...">...</a></div>
- # We can try to find 'a' that contains this h2, or assume the 'a[data-cmp="link"]' is the one. - - # Let's use the a[data-cmp="link"] which was identified as containing the title h2 - parent_link_el = await container.query_selector("a[data-cmp='link']") - if parent_link_el: - url_path = await parent_link_el.get_attribute("href") - else: # Fallback if the above structure isn't found - logging.warning(f"Could not find parent a[data-cmp='link'] for title in listing {i+1}") - else: - logging.warning(f"Title not found with h2[data-cmp='subheading'] for listing {i+1}.") - - # Fallback or alternative for URL if not found via title's parent link - if not url_path: - url_el_alt = await container.query_selector("a[data-cmp='relLnk']") # Keep this fallback - if url_el_alt: - url_path = await url_el_alt.get_attribute("href") - - if not url_path: # Last resort for URL - first_a = await container.query_selector("a[href]") # Broadest fallback - if first_a: - url_path = await first_a.get_attribute("href") - - if not url_path: - logging.warning(f"Could not extract URL for listing {i+1} (Title: {title_text}). Skipping.") - continue - - if not url_path.startswith(('http://', 'https://')): - listing_url = f"https://www.autotrader.com{url_path}" - else: - listing_url = url_path - - # Attempt to get Price - price_el = await container.query_selector("div[data-cmp='firstPrice']") # Updated selector - if price_el: - raw_price_text = await price_el.inner_text() - price_text = raw_price_text.replace('$', '').replace(',', '').strip() if raw_price_text else "N/A" - else: - # Fallback for price (e.g. .first-price class directly) - price_el_fallback = await container.query_selector(".first-price") - if price_el_fallback: - raw_price_text = await price_el_fallback.inner_text() - price_text = raw_price_text.replace('$', '').replace(',', '').strip() if raw_price_text else "N/A" - else: - logging.warning(f"Price not found for listing {listing_url}") - price_text = "N/A" - - - # Mileage - Set to N/A as it's not reliably on the card from previous findings - mileage_text = "N/A" - # logging.info(f"Mileage not scraped from listing card for {listing_url} (by design for now).") - - vin_text = None - - listing_data = { - "listing_url": listing_url, - "title": title_text, # Already defaults to N/A or has value - "price": price_text, # Already defaults to N/A or has value - "mileage": mileage_text, # Is N/A - "vin": vin_text, - "source_name": self.source_name, - "data_points": { - "page_title_at_scrape": title # page's title, not listing's - } - } - listings_data.append(listing_data) - processed_count += 1 - logging.info(f"Successfully processed listing: {title_text[:50]}... URL: {listing_url}") - - except Exception as e: - logging.error(f"Error processing listing container {i+1} for URL {listing_url if listing_url else 'Unknown'}: {e}", exc_info=True) - continue - - logging.info(f"Successfully processed {processed_count} out of {len(listing_containers)} listing containers.") - - except Exception as e: - logging.error(f"An error occurred during Playwright scraping phase: {e}", exc_info=True) - finally: - if browser: - logging.info("Closing browser.") - await browser.close() - - return listings_data - - -async def scrape_autotrader_data(autotrader_url: str, headless: bool = True, timeout: int = 120000) -> list[dict]: - """ - High-level function to scrape data from AutoTrader. - Initializes the scraper and calls its scraping method. - - Args: - autotrader_url (str): The URL to scrape. 
- headless (bool): Whether to run the browser in headless mode. - timeout (int): Timeout for scraping operations in milliseconds. - - Returns: - list[dict]: A list of scraped listing data. - """ - scraper = AutoTraderScraper() - listings = await scraper.get_private_listings(autotrader_url=autotrader_url, headless=headless, timeout=timeout) - return listings - - -async def scrape_autotrader_and_update_db(db: Session, autotrader_url: str, headless: bool, scrape_timeout: int): - """ - Scrapes listings from AutoTrader and updates the database. - - Args: - db (Session): The SQLAlchemy database session. - autotrader_url (str): The URL to scrape. - headless (bool): Whether to run the browser in headless mode. - scrape_timeout (int): Timeout for scraping operations in milliseconds. - - Returns: - dict: A status dictionary with counts of added, updated, and scraped listings. - """ - logging.info(f"Starting scrape and update for URL: {autotrader_url}") - - try: - listings_data = await scrape_autotrader_data( - autotrader_url=autotrader_url, - headless=headless, - timeout=scrape_timeout - ) - except Exception as e: - logging.error(f"Failed to scrape data from {autotrader_url}: {e}", exc_info=True) - return {"status": "error", "message": f"Scraping failed: {e}"} - - added_count = 0 - updated_count = 0 - scraped_count = len(listings_data) - - for listing_data in listings_data: - source_url = listing_data.get('listing_url') # Renamed from 'url' to 'listing_url' in dummy data - if not source_url: - logging.warning(f"Scraped item missing 'listing_url': {listing_data.get('title')}. Skipping.") - continue - - try: - existing_listing = db.query(CarListing).filter(CarListing.source_url == source_url).first() - - if existing_listing: - # Placeholder for update logic - # existing_listing.extracted_at = datetime.utcnow() - # existing_listing.data_points = {k: v for k, v in listing_data.items() if k != 'listing_url'} - # # Update other fields like price if necessary - # db.add(existing_listing) # Not strictly necessary if only mutable fields changed and session tracks - updated_count += 1 - logging.info(f"Listing at {source_url} already exists. 
Marked for update (placeholder).") - else: - new_listing = CarListing( - platform="autotrader", - extracted_at=datetime.utcnow(), - source_url=source_url, - # Ensure data_points stores everything else from listing_data - data_points={k: v for k, v in listing_data.items() if k != 'listing_url'} - ) - db.add(new_listing) - added_count += 1 - logging.info(f"New listing added from {source_url}") - except Exception as e: - logging.error(f"Error processing listing {source_url} for DB: {e}", exc_info=True) - # Decide if you want to rollback here or continue with other listings - - try: - db.commit() - logging.info("Database changes committed.") - except Exception as e: - logging.error(f"Database commit failed: {e}", exc_info=True) - db.rollback() - return {"status": "error", "message": f"DB commit failed: {e}", "added": 0, "updated": 0, "scraped_count": scraped_count} - - status_summary = { - "status": "success", - "added": added_count, - "updated": updated_count, - "scraped_count": scraped_count - } - logging.info(f"DB update summary: {status_summary}") - return status_summary - -async def main(): - # Use settings from config.py - # url = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/private-seller") - # headless_str = os.getenv("HEADLESS_BROWSER", "True") - # headless = headless_str.lower() == "true" - # scrape_timeout_str = os.getenv("SCRAPE_TIMEOUT", "120000") - # try: - # scrape_timeout = int(scrape_timeout_str) - # except ValueError: - # logging.warning(f"Invalid SCRAPE_TIMEOUT value: {scrape_timeout_str}. Defaulting to 120000ms.") - # scrape_timeout = 120000 - - # from database import SessionLocal # Already imported at the top - db: Session = SessionLocal() # SessionLocal now uses DATABASE_URL from config.py via database.py - try: - logging.info(f"Starting scraper and DB update for URL: {AUTOTRADER_URL}, Headless: {HEADLESS_BROWSER}, Timeout: {SCRAPE_TIMEOUT}ms") - stats = await scrape_autotrader_and_update_db( - db=db, - autotrader_url=AUTOTRADER_URL, - headless=HEADLESS_BROWSER, - scrape_timeout=SCRAPE_TIMEOUT - ) - logging.info(f"Scraping and DB update completed: {stats}") - except Exception as e: - logging.error(f"Error during scraping and DB update in main: {e}", exc_info=True) - finally: - logging.info("Closing DB session in main.") - db.close() - -if __name__ == "__main__": - # To run this: - # 1. Ensure Playwright browsers are installed: `playwright install chromium` - # 2. Set environment variables if needed (AUTOTRADER_URL, HEADLESS_BROWSER, SCRAPE_TIMEOUT) - # 3. Uncomment the line below - asyncio.run(main()) - # pass # Keep it passive for now, to be run manually when needed diff --git a/config.py b/config.py deleted file mode 100644 index 44148ca..0000000 --- a/config.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -from dotenv import load_dotenv - -# Load environment variables from .env file if it exists -# This is useful for local development. 
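[Editor's note] The module deleted below read each variable at import time as loose module globals. Its replacement, `src/config.py`, is only lightly touched by this patch and its body is not reproduced here; judging by the names used throughout (`settings.HEADLESS`, `settings.PROXY_SERVER`, and so on), a minimal stand-in would look something like the following — the exact types and defaults are assumptions based on the documented environment variables:

```python
import os
from dataclasses import dataclass
from dotenv import load_dotenv

load_dotenv()  # pick up a local .env file, as the old config.py did

@dataclass(frozen=True)
class Settings:
    DATABASE_URL: str = os.getenv("DATABASE_URL", "sqlite+aiosqlite:///./vehicle_data.db")
    API_HOST: str = os.getenv("API_HOST", "127.0.0.1")
    API_PORT: int = int(os.getenv("API_PORT", "8000"))
    HEADLESS: bool = os.getenv("HEADLESS", "true").lower() == "true"
    BROWSER_TIMEOUT: int = int(os.getenv("BROWSER_TIMEOUT", "60000"))  # ms
    PAGE_DELAY: int = int(os.getenv("PAGE_DELAY", "5000"))  # ms
    MIN_DELAY_BETWEEN_ACTIONS: float = float(os.getenv("MIN_DELAY_BETWEEN_ACTIONS", "2.5"))
    MAX_LISTINGS_PER_SESSION: int = int(os.getenv("MAX_LISTINGS_PER_SESSION", "25"))
    PROXY_SERVER: str | None = os.getenv("PROXY_SERVER") or None
    PROXY_USERNAME: str | None = os.getenv("PROXY_USERNAME") or None
    PROXY_PASSWORD: str | None = os.getenv("PROXY_PASSWORD") or None

settings = Settings()
```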
-load_dotenv() - -# Database Configuration -DATABASE_URL: str = os.getenv("DATABASE_URL", "sqlite:///./data/vehicle_tracker.db") - -# Scraper Configuration -AUTOTRADER_URL: str = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/private-seller") -SCRAPE_TIMEOUT: int = int(os.getenv("SCRAPE_TIMEOUT", "120000")) # Milliseconds -HEADLESS_BROWSER: bool = os.getenv("HEADLESS_BROWSER", "True").lower() == "true" - -# API Configuration (if any specific ones are needed later) -# Example: API_HOST: str = os.getenv("API_HOST", "0.0.0.0") -# Example: API_PORT: int = int(os.getenv("API_PORT", "8000")) - -# Logging Configuration (can also be added here if more complex) -LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO").upper() - -# Ensure critical URLs have a scheme for robustness -if not AUTOTRADER_URL.startswith(("http://", "https://")): - # This print statement is for immediate feedback during startup/import. - # In a pure library, side effects on import are sometimes discouraged, - # but for an application's main config, it's often acceptable. - print(f"Warning: AUTOTRADER_URL ('{AUTOTRADER_URL}') did not have a scheme, prepended https://.") - AUTOTRADER_URL = "https://" + AUTOTRADER_URL - print(f"Corrected AUTOTRADER_URL: {AUTOTRADER_URL}") - - -# Example of how to handle SQLite connect_args based on config -DB_CONNECT_ARGS: dict = {"check_same_thread": False} if DATABASE_URL.startswith("sqlite") else {} diff --git a/database.py b/database.py deleted file mode 100644 index 58a11dc..0000000 --- a/database.py +++ /dev/null @@ -1,26 +0,0 @@ -from sqlalchemy import Column, Integer, String, DateTime, JSON, create_engine -from sqlalchemy.orm import declarative_base, sessionmaker, Session -from config import DATABASE_URL, DB_CONNECT_ARGS # Import from config - -# Use imported configuration -engine = create_engine(DATABASE_URL, connect_args=DB_CONNECT_ARGS) - -SessionLocal = sessionmaker(bind=engine, autoflush=False) -Base = declarative_base() - -class CarListing(Base): - __tablename__ = "listings" - id = Column(Integer, primary_key=True, index=True) - platform = Column(String) - extracted_at = Column(DateTime) - source_url = Column(String, unique=True) - data_points = Column(JSON) - -Base.metadata.create_all(bind=engine) - -def get_db(): - db = SessionLocal() - try: - yield db - finally: - db.close() diff --git a/main.py b/main.py new file mode 100644 index 0000000..fb37fc5 --- /dev/null +++ b/main.py @@ -0,0 +1,93 @@ +import asyncio +import uvicorn +from contextlib import asynccontextmanager +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +import logging +from datetime import datetime + +from src.database import create_db_tables +from src.api.routes import router as api_v1_router +from src.config import settings +from src.automation.browser_sim import run_autotrader_scraper_example_standalone + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[logging.StreamHandler()] +) +logger = logging.getLogger(__name__) + +@asynccontextmanager +async def lifespan(app: FastAPI): + logger.info("\ud83d\ude80 Starting Educational Vehicle Tracker API...") + await create_db_tables() + logger.info("\ud83d\udcca Database tables checked/created.") + yield + logger.info("\ud83d\udd1b Shutting down Educational Vehicle Tracker API.") + +app = FastAPI( + title="Educational Vehicle Tracker", + description="An educational system for learning web automation and data pipeline architecture, now with real 
scraping.", + version="1.1.0", + lifespan=lifespan +) + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +app.include_router(api_v1_router, prefix="/api/v1") + +@app.get("/", include_in_schema=False) +async def root_redirect_to_docs(): + from fastapi.responses import RedirectResponse + return RedirectResponse(url="/docs") + +@app.get("/health", summary="Health Check") +async def health_check(): + return {"status": "healthy", "service": "vehicle-tracker-api", "timestamp": datetime.utcnow()} + +async def run_standalone_scrape_cli(): + logger.info("\ud83c\udf3d Running Standalone AutoTrader Scraper Example from CLI") + print("=" * 40) + try: + await run_autotrader_scraper_example_standalone() + logger.info("\u2705 Standalone scraper example completed successfully!") + except Exception as e: + logger.error(f"\u274c Standalone scraper example failed: {e}", exc_info=True) + +if __name__ == "__main__": + import sys + print_startup_message = True + if len(sys.argv) > 1: + if sys.argv[1] == "scrape_test": + print_startup_message = False + asyncio.run(run_standalone_scrape_cli()) + elif sys.argv[1] == "create_tables": + print_startup_message = False + asyncio.run(create_db_tables()) + logger.info("Database tables creation process finished.") + else: + logger.warning(f"Unknown command: {sys.argv[1]}") + print("\ud83d\udd0d Usage: python main.py [scrape_test | create_tables]") + if print_startup_message: + logger.info("\ud83c\udf93 Educational Vehicle Tracking System - API Server Mode") + print("=" * 50) + logger.info(f"API Host: {settings.API_HOST}") + logger.info(f"API Port: {settings.API_PORT}") + logger.info(f"Database: {settings.DATABASE_URL}") + logger.info(f"Max Listings per Session (Scrape): {settings.MAX_LISTINGS_PER_SESSION}") + logger.info(f"Playwright Headless: {settings.HEADLESS}") + print("=" * 50) + uvicorn.run( + "main:app", + host=settings.API_HOST, + port=settings.API_PORT, + reload=True, + log_level="info" + ) diff --git a/render.yaml b/render.yaml index c0e7ffa..f755e35 100644 --- a/render.yaml +++ b/render.yaml @@ -5,15 +5,25 @@ services: buildCommand: | pip install -r requirements.txt playwright install chromium - startCommand: uvicorn app:app --host 0.0.0.0 --port ${PORT:-8000} + startCommand: uvicorn main:app --host 0.0.0.0 --port ${PORT:-8000} envVars: - key: PYTHON_VERSION - value: 3.11 # Or your desired Python version + value: 3.11 - key: DATABASE_URL - generateValue: true # For Render PostgreSQL, or set manually for SQLite/external DB - - key: AUTOTRADER_URL - value: "https://www.autotrader.com/cars-for-sale/private-seller" # Example - - key: SCRAPE_TIMEOUT - value: 120000 # Example: 120 seconds - - key: HEADLESS_BROWSER + generateValue: true + - key: HEADLESS value: "True" + - key: BROWSER_TIMEOUT + value: "60000" + - key: PAGE_DELAY + value: "5000" + - key: MIN_DELAY_BETWEEN_ACTIONS + value: "2.5" + - key: MAX_LISTINGS_PER_SESSION + value: "25" + - key: PROXY_SERVER + value: "" + - key: PROXY_USERNAME + value: "" + - key: PROXY_PASSWORD + value: "" diff --git a/requirements.txt b/requirements.txt index 2e8a221..b450ec4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,7 @@ -aiofiles -asyncpg -fastapi -playwright -playwright-stealth -python-dotenv -sqlalchemy[asyncio] -uvicorn[standard] +fastapi==0.104.1 +uvicorn==0.24.0 +sqlalchemy==2.0.23 +aiosqlite==0.19.0 +playwright==1.40.0 +python-dotenv==1.0.0 +pydantic==2.5.0 diff --git a/app/__init__.py b/src/__init__.py 
similarity index 100% rename from app/__init__.py rename to src/__init__.py diff --git a/src/api/__init__.py b/src/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/api/routes.py b/src/api/routes.py new file mode 100644 index 0000000..612fb3f --- /dev/null +++ b/src/api/routes.py @@ -0,0 +1,237 @@ +from fastapi import APIRouter, Depends, HTTPException, Query, BackgroundTasks +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy import select, and_, exists, update +from datetime import datetime +import json +from typing import List +import logging + +from src.database import get_db, AsyncSessionLocal +from src.models.vehicle import ( + VehicleListing, + VehicleListingCreate, + VehicleListingResponse, + SearchFilters +) +from src.automation.browser_sim import AutoTraderScraper +from src.config import settings + +router = APIRouter() +logger = logging.getLogger(__name__) + +async def scrape_and_store_task(search_url: str, max_listings: int, source_site_name: str = "autotrader"): + logger.info(f"Background task started: Scraping {source_site_name} URL: {search_url} for max {max_listings} listings.") + created_count = 0 + updated_count = 0 + failed_count = 0 + processed_urls = set() + if source_site_name.lower() == "autotrader": + ScraperClass = AutoTraderScraper + else: + logger.error(f"Unsupported source site: {source_site_name}") + return + async with ScraperClass() as scraper: + scraped_listings_pydantic = await scraper.scrape_listings( + search_url=search_url, + max_listings_to_fetch=max_listings + ) + if not scraped_listings_pydantic: + logger.info(f"No listings returned from {source_site_name} scraper for URL: {search_url}") + return + logger.info(f"{source_site_name} scraper returned {len(scraped_listings_pydantic)} listings. 
Processing for DB storage...") + async with AsyncSessionLocal() as db_session: + for listing_data in scraped_listings_pydantic: + if not listing_data.listing_url: + logger.warning("Scraped data missing listing_url, skipping.") + failed_count += 1 + continue + if listing_data.listing_url in processed_urls: + logger.debug(f"URL {listing_data.listing_url} already processed in this run, skipping duplicate.") + continue + processed_urls.add(listing_data.listing_url) + try: + stmt = select(VehicleListing).where(VehicleListing.listing_url == listing_data.listing_url) + result = await db_session.execute(stmt) + existing_vehicle = result.scalar_one_or_none() + features_json = json.dumps(listing_data.features) if listing_data.features else None + if existing_vehicle: + logger.debug(f"Updating existing listing: {listing_data.listing_url} (ID: {existing_vehicle.id})") + update_values = {} + if listing_data.title and existing_vehicle.title != listing_data.title: + update_values['title'] = listing_data.title + if listing_data.price is not None and existing_vehicle.price != listing_data.price: + update_values['price'] = listing_data.price + if listing_data.mileage is not None and existing_vehicle.mileage != listing_data.mileage: + update_values['mileage'] = listing_data.mileage + if features_json and existing_vehicle.features != features_json: + update_values['features'] = features_json + if listing_data.photo_url and existing_vehicle.photo_url != listing_data.photo_url: + update_values['photo_url'] = listing_data.photo_url + if listing_data.location and existing_vehicle.location != listing_data.location: + update_values['location'] = listing_data.location + if listing_data.year and existing_vehicle.year != listing_data.year: + update_values['year'] = listing_data.year + if listing_data.make and existing_vehicle.make != listing_data.make: + update_values['make'] = listing_data.make + if listing_data.model and existing_vehicle.model != listing_data.model: + update_values['model'] = listing_data.model + if listing_data.trim and existing_vehicle.trim != listing_data.trim: + update_values['trim'] = listing_data.trim + update_values['is_active'] = True + update_values['last_scraped_at'] = datetime.utcnow() + if update_values: + stmt_update = update(VehicleListing).where(VehicleListing.id == existing_vehicle.id).values(**update_values) + await db_session.execute(stmt_update) + updated_count += 1 + else: + logger.debug(f"Adding new listing: {listing_data.listing_url}") + db_vehicle = VehicleListing( + listing_id_external=listing_data.listing_id_external, + title=listing_data.title, + year=listing_data.year, + make=listing_data.make, + model=listing_data.model, + trim=listing_data.trim, + price=listing_data.price, + mileage=listing_data.mileage, + listing_url=listing_data.listing_url, + photo_url=listing_data.photo_url, + features=features_json, + location=listing_data.location, + seller_type=listing_data.seller_type, + source_site=listing_data.source_site, + is_active=True, + last_scraped_at=datetime.utcnow() + ) + db_session.add(db_vehicle) + created_count += 1 + await db_session.commit() + except Exception as e: + failed_count += 1 + logger.error(f"Failed to process/store listing {listing_data.listing_url}: {e}", exc_info=True) + await db_session.rollback() + logger.info(f"Background task for {source_site_name} finished. 
Created={created_count}, Updated={updated_count}, Failed={failed_count}") + +@router.get("/", response_model=dict, include_in_schema=False) +async def api_v1_root_info(): + return { + "message": "Vehicle Tracking API - V1", + "active_endpoints": ["/vehicles", "/vehicles/search", "/vehicles/scrape", "/vehicles/{id}", "/vehicles/stats/summary"] + } + +@router.get("/vehicles/", response_model=List[VehicleListingResponse]) +async def get_all_vehicles( + skip: int = Query(0, ge=0), + limit: int = Query(settings.MAX_LISTINGS_PER_SESSION, ge=1, le=200), + db: AsyncSession = Depends(get_db), + filters: SearchFilters = Depends(), +): + query = select(VehicleListing) + conditions = [] + if filters.is_active is not None: + conditions.append(VehicleListing.is_active == filters.is_active) + if filters.make: + conditions.append(VehicleListing.make.ilike(f"%{filters.make}%")) + if filters.model: + conditions.append(VehicleListing.model.ilike(f"%{filters.model}%")) + if filters.min_year: + conditions.append(VehicleListing.year >= filters.min_year) + if filters.max_year: + conditions.append(VehicleListing.year <= filters.max_year) + if filters.min_price: + conditions.append(VehicleListing.price >= filters.min_price) + if filters.max_price: + conditions.append(VehicleListing.price <= filters.max_price) + if filters.max_mileage: + conditions.append(VehicleListing.mileage <= filters.max_mileage) + if filters.location: + conditions.append(VehicleListing.location.ilike(f"%{filters.location}%")) + if filters.seller_type: + conditions.append(VehicleListing.seller_type.ilike(f"%{filters.seller_type}%")) + if filters.source_site: + conditions.append(VehicleListing.source_site.ilike(f"%{filters.source_site}%")) + if conditions: + query = query.where(and_(*conditions)) + query = query.order_by(VehicleListing.last_scraped_at.desc(), VehicleListing.created_at.desc()) + result = await db.execute(query.offset(skip).limit(limit)) + vehicles = result.scalars().all() + response_vehicles = [] + for vehicle_db_item in vehicles: + response_vehicles.append(VehicleListingResponse.model_validate(vehicle_db_item)) + return response_vehicles + +@router.get("/vehicles/{vehicle_id}", response_model=VehicleListingResponse) +async def get_vehicle_by_id_route(vehicle_id: int, db: AsyncSession = Depends(get_db)): + query = select(VehicleListing).where(VehicleListing.id == vehicle_id) + result = await db.execute(query) + vehicle_db_item = result.scalar_one_or_none() + if not vehicle_db_item: + raise HTTPException(status_code=404, detail="Vehicle not found") + return VehicleListingResponse.model_validate(vehicle_db_item) + +@router.post("/vehicles/", response_model=VehicleListingResponse, status_code=201) +async def create_vehicle_listing_manual( + vehicle_create_data: VehicleListingCreate, + db: AsyncSession = Depends(get_db) +): + stmt_exists = select(exists().where(VehicleListing.listing_url == vehicle_create_data.listing_url)) + url_exists = await db.scalar(stmt_exists) + if url_exists: + raise HTTPException(status_code=409, detail=f"Vehicle with URL {vehicle_create_data.listing_url} already exists.") + features_json_str = json.dumps(vehicle_create_data.features) if vehicle_create_data.features else None + db_vehicle_item = VehicleListing( + **vehicle_create_data.model_dump(exclude={'features'}), + features=features_json_str, + is_active=True, + last_scraped_at=datetime.utcnow() + ) + db.add(db_vehicle_item) + await db.commit() + await db.refresh(db_vehicle_item) + return VehicleListingResponse.model_validate(db_vehicle_item) + 
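[Editor's note] For reference, a hypothetical client-side sketch of driving the routes defined in this file: `httpx` is not a project dependency (any HTTP client works the same way), the base URL assumes the default `API_HOST`/`API_PORT`, and the printed field names follow the response model used above.

```python
import asyncio
import httpx  # assumption: not in requirements.txt, used here for brevity

BASE = "http://127.0.0.1:8000/api/v1"  # assumes default host/port settings

async def main() -> None:
    async with httpx.AsyncClient(timeout=30.0) as client:
        # Queue a background scrape; the route answers 202 immediately.
        resp = await client.post(
            f"{BASE}/vehicles/scrape",
            params={
                "site_name": "autotrader",
                "search_url": "https://www.autotrader.com/cars-for-sale/private-seller",
                "max_listings": 25,
            },
        )
        print(resp.status_code, resp.json())

        # Later, page through stored listings 25 at a time.
        page = await client.get(f"{BASE}/vehicles/", params={"skip": 0, "limit": 25})
        for vehicle in page.json():
            print(vehicle["id"], vehicle["title"], vehicle.get("price"))

asyncio.run(main())
```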
+@router.post("/vehicles/scrape", status_code=202) +async def trigger_site_scrape( + background_tasks: BackgroundTasks, + site_name: str = Query("autotrader", description="Name of the site to scrape (e.g., 'autotrader')."), + search_url: str = Query(..., description="Full search URL for the specified site."), + max_listings: int = Query(settings.MAX_LISTINGS_PER_SESSION, description="Maximum listings to fetch from this scrape.", ge=1, le=100) +): + logger.info(f"Received request to scrape {site_name} URL: {search_url} for max {max_listings} listings.") + if site_name.lower() not in ["autotrader"]: + raise HTTPException(status_code=400, detail=f"Scraping for site '{site_name}' is not supported.") + background_tasks.add_task(scrape_and_store_task, search_url, max_listings, site_name) + return {"message": f"{site_name.capitalize()} scraping task accepted and started in the background for URL: {search_url}"} + +@router.delete("/vehicles/{vehicle_id}", status_code=200) +async def delete_vehicle_listing(vehicle_id: int, db: AsyncSession = Depends(get_db)): + query = select(VehicleListing).where(VehicleListing.id == vehicle_id) + result = await db.execute(query) + vehicle_db_item = result.scalar_one_or_none() + if not vehicle_db_item: + raise HTTPException(status_code=404, detail="Vehicle not found") + await db.delete(vehicle_db_item) + await db.commit() + return {"message": "Vehicle deleted successfully"} + +@router.get("/vehicles/stats/summary", response_model=dict) +async def get_vehicle_listing_stats(db: AsyncSession = Depends(get_db)): + from sqlalchemy import func as sql_func + make_query = select(VehicleListing.make, sql_func.count(VehicleListing.id).label('count'))\ + .where(VehicleListing.make.isnot(None)).group_by(VehicleListing.make).order_by(sql_func.count(VehicleListing.id).desc()) + make_result = await db.execute(make_query) + make_stats = [{"make": row[0], "count": row[1]} for row in make_result.all()] + year_query = select(VehicleListing.year, sql_func.avg(VehicleListing.price).label('avg_price'), sql_func.count(VehicleListing.id).label('count'))\ + .where(VehicleListing.year.isnot(None)).group_by(VehicleListing.year).order_by(VehicleListing.year.desc()) + year_result = await db.execute(year_query) + year_stats = [{"year": row[0], "avg_price": round(row[1], 2) if row[1] else 0.0, "count": row[2]} for row in year_result.all()] + total_query = select(sql_func.count(VehicleListing.id)) + total_count = await db.scalar(total_query) or 0 + active_query = select(sql_func.count(VehicleListing.id)).where(VehicleListing.is_active == True) + active_count = await db.scalar(active_query) or 0 + return { + "total_listings_in_db": total_count, + "active_listings": active_count, + "by_make": make_stats, + "by_year_with_avg_price": year_stats + } diff --git a/src/automation/__init__.py b/src/automation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/automation/browser_sim.py b/src/automation/browser_sim.py new file mode 100644 index 0000000..c93f204 --- /dev/null +++ b/src/automation/browser_sim.py @@ -0,0 +1,420 @@ +import asyncio +import json +import re +import random +from typing import List, Dict, Optional +from playwright.async_api import async_playwright, Page, Browser, PlaywrightException, Locator +from urllib.parse import urljoin, urlparse, parse_qs +from datetime import datetime +import logging +import hashlib + +from src.config import settings +from src.models.vehicle import VehicleListingCreate + +logger = logging.getLogger(__name__) + +class AutoTraderScraper: 
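+ """Playwright-based scraper for AutoTrader search-result pages.
+
+ Use as an async context manager: __aenter__ launches Chromium (routing
+ through the optional PROXY_SERVER settings when configured) and __aexit__
+ closes the browser and stops the Playwright instance.
+ """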
+ def __init__(self): + self.browser: Optional[Browser] = None + self.playwright_instance: Optional[async_playwright] = None + self.base_action_delay = settings.MIN_DELAY_BETWEEN_ACTIONS + self.page_load_delay = settings.PAGE_DELAY / 1000 + + async def __aenter__(self): + logger.info("Initializing AutoTrader Scraper...") + self.playwright_instance = await async_playwright().start() + try: + proxy_cfg = None + if settings.PROXY_SERVER: + proxy_cfg = {"server": settings.PROXY_SERVER} + if settings.PROXY_USERNAME and settings.PROXY_PASSWORD: + proxy_cfg["username"] = settings.PROXY_USERNAME + proxy_cfg["password"] = settings.PROXY_PASSWORD + + self.browser = await self.playwright_instance.chromium.launch( + headless=settings.HEADLESS, + proxy=proxy_cfg, + args=[ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-infobars', + '--window-position=0,0', + '--ignore-certificate-errors', + '--ignore-certificate-errors-spki-list', + '--disable-blink-features=AutomationControlled', + '--disable-dev-shm-usage' + ], + timeout=settings.BROWSER_TIMEOUT + ) + logger.info(f"Browser launched (Headless: {settings.HEADLESS})") + except PlaywrightException as e: + logger.error(f"Failed to launch browser: {e}") + if self.playwright_instance: + await self.playwright_instance.stop() + raise + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + logger.info("Closing AutoTrader Scraper resources...") + if self.browser and self.browser.is_connected(): + try: + await self.browser.close() + logger.info("Browser closed.") + except PlaywrightException as e: + logger.error(f"Error closing browser: {e}") + if self.playwright_instance: + try: + await self.playwright_instance.stop() + logger.info("Playwright instance stopped.") + except Exception as e: + logger.error(f"Error stopping Playwright: {e}") + if exc_type: + logger.error(f"Exception occurred during scraping: {exc_val}", exc_info=(exc_type, exc_val, exc_tb)) + + async def _apply_stealth_measures(self, page: Page): + logger.info("Applying stealth measures to page...") + user_agents = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36" + ] + await page.set_extra_http_headers({"User-Agent": random.choice(user_agents)}) + await page.add_init_script(""" + Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); + Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en', 'en-GB'] }); + const pluginCount = Math.floor(Math.random() * 3) + 1; + Object.defineProperty(navigator, 'plugins', { + get: () => Array(pluginCount).fill(null).map((_, i) => ({ name: `Plugin ${i}`, filename: `plugin${i}.dll`, description: `Mock plugin ${i}` })) + }); + const mimeTypeCount = Math.floor(Math.random() * 3) + 1; + Object.defineProperty(navigator, 'mimeTypes', { + get: () => Array(mimeTypeCount).fill(null).map((_, i) => ({ type: `application/x-mimetype${i}`, suffixes: `m${i}`, description: `Mock mimetype ${i}` })) + }); + const getParameter = WebGLRenderingContext.prototype.getParameter; + WebGLRenderingContext.prototype.getParameter = function(parameter) { + if (parameter === 37445) return 'Intel Open Source Technology Center'; + if (parameter === 37446) 
return 'Mesa DRI Intel(R) Iris Xe Graphics (TGL GT2)'; + return getParameter.apply(this, arguments); + }; + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = (parameters) => ( + parameters.name === 'notifications' ? Promise.resolve({ state: Notification.permission }) : originalQuery(parameters) + ); + try { Date.prototype.getTimezoneOffset = function() { return -Math.floor(Math.random() * 8 + 3) * 60; }; } catch (e) {} + """) + viewports = [{"width": 1920, "height": 1080}, {"width": 1366, "height": 768}, {"width": 1440, "height": 900}, {"width": 2560, "height": 1440}] + await page.set_viewport_size(random.choice(viewports)) + logger.info("Stealth measures applied.") + + async def _human_like_delay(self, min_delay: Optional[float] = None, max_delay: Optional[float] = None): + min_d = min_delay if min_delay is not None else self.base_action_delay + max_d = max_delay if max_delay is not None else self.base_action_delay + 2.0 + delay = random.uniform(min_d, max_d) + logger.debug(f"Waiting {delay:.2f} seconds...") + await asyncio.sleep(delay) + + async def _human_like_scroll(self, page: Page, scroll_attempts=7): + logger.info(f"Performing human-like scrolling: {scroll_attempts} attempts...") + previous_scroll_height = -1.0 + for i in range(scroll_attempts): + current_scroll_height = float(await page.evaluate("document.body.scrollHeight")) + if abs(current_scroll_height - previous_scroll_height) < 1.0 and i > 0: + logger.info(f"Scroll attempt {i+1}: Reached end of scrollable content or no new content loaded.") + break + scroll_amount = await page.evaluate(f"Math.random() * window.innerHeight * 0.7 + window.innerHeight * 0.3") + await page.evaluate(f"window.scrollBy(0, {scroll_amount})") + await self._human_like_delay(min_delay=0.8, max_delay=2.2) + previous_scroll_height = current_scroll_height + logger.info("Scrolling to bottom one last time...") + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await self._human_like_delay(min_delay=2.0, max_delay=3.5) + logger.info("Scrolling finished.") + + def _extract_listing_id_from_url(self, url: str) -> Optional[str]: + if not url: + return None + try: + parsed_url = urlparse(url) + query_params = parse_qs(parsed_url.query) + if 'listingId' in query_params: + return query_params['listingId'][0] + path_parts = [part for part in parsed_url.path.split('/') if part] + if 'vehicle' in path_parts: + vehicle_idx = path_parts.index('vehicle') + if vehicle_idx + 1 < len(path_parts): + return path_parts[vehicle_idx+1] + for part in reversed(path_parts): + if part.isdigit() and len(part) > 5: + return part + except Exception as e: + logger.warning(f"Could not parse structured listing ID from URL {url}: {e}") + logger.debug(f"No structured ID found, hashing URL for ID: {url}") + return hashlib.md5(url.encode()).hexdigest()[:16] + + def _parse_title_details(self, title_str: str) -> Dict: + details = {'year': None, 'make': None, 'model': None, 'trim': None} + if not title_str: + return details + original_title = title_str + year_match = re.search(r'\b(19[89]\d|20[0-2]\d|2030)\b', title_str) + if year_match: + details['year'] = int(year_match.group(1)) + title_str = title_str.replace(year_match.group(1), "", 1).strip() + title_str = re.sub(r'^(Used|New|Certified Pre-Owned|CPO)\s+', '', title_str, flags=re.IGNORECASE).strip() + parts = title_str.split(maxsplit=3) + if len(parts) > 0: + details['make'] = parts[0] + if len(parts) > 1: + details['model'] = parts[1] + if len(parts) > 2: + 
details['trim'] = " ".join(parts[2:]) + logger.debug(f"Parsed title details: {details} from original title: '{original_title}'") + return details + + async def _extract_listing_data(self, listing_element: Locator, page_url: str) -> Optional[VehicleListingCreate]: + data_dict: Dict[str, any] = {} + listing_html_for_debug = "N/A (HTML not captured)" + try: + link_el_selectors = [ + 'a[data-cmp="inventoryListingCardLink"]', + 'a[data-testid="srp-list-item-link"]', + 'a[href*="vehicledetails.xhtml?listingId="]', + 'h2 > a', + 'h3 > a' + ] + raw_href = None + for selector in link_el_selectors: + link_el = listing_element.locator(selector).first + if await link_el.count(): + raw_href = await link_el.get_attribute("href", timeout=1500) + if raw_href: + break + if not raw_href: + logger.warning("No primary link found for a listing card. Skipping.") + return None + data_dict['listing_url'] = urljoin(page_url, raw_href) + + title_el_selectors = ["h2[data-cmp*='title']", "h3[data-cmp*='title']", "div[data-cmp='displayName'] h2", "h2", "h3"] + raw_title = "Title Not Found" + for selector in title_el_selectors: + title_el = listing_element.locator(selector).first + if await title_el.count(): + try: + raw_title = await title_el.text_content(timeout=1500) + if raw_title and raw_title.strip(): + break + except PlaywrightException: + continue + data_dict['title'] = raw_title.strip() + title_details = self._parse_title_details(data_dict['title']) + data_dict.update(title_details) + + price_selectors = [ + "span[data-cmp='pricingSection'] .text-size-lg-3", + "span[data-cmp='pricingSection']", + ".pricing-section .first-price", + "span[class*='price']", "div[class*='price']" + ] + for selector in price_selectors: + price_el = listing_element.locator(selector).filter(has_text=re.compile(r"\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?")).first + if await price_el.count(): + try: + price_text = await price_el.text_content(timeout=1000) + cleaned_price = re.sub(r'[^\d.]', '', price_text) + if cleaned_price and cleaned_price != '.': + data_dict['price'] = float(cleaned_price) + break + except PlaywrightException: + continue + + mileage_selectors = [ + "div[data-cmp='listUnstyled'] li:has-text('miles')", + "div.item-vehicle-mileage", + "div[class*='mileage']", "span[class*='mileage']" + ] + for selector in mileage_selectors: + mileage_el = listing_element.locator(selector).filter(has_text=re.compile(r"[\d,]+\s*mi(?:les)?", re.IGNORECASE)).first + if await mileage_el.count(): + try: + mileage_text = await mileage_el.text_content(timeout=1000) + match = re.search(r'([\d,]+)\s*mi', mileage_text, re.IGNORECASE) + if match: + data_dict['mileage'] = int(re.sub(r',', '', match.group(1))) + break + except PlaywrightException: + continue + + photo_selectors = [ + 'img[data-cmp="responsiveImage"]', + 'img[data-testid="srp-list-item-image"]', + '.srp-img-container img', + 'img[alt*="vehicle image"]' + ] + for selector in photo_selectors: + photo_el = listing_element.locator(selector).first + if await photo_el.count(): + try: + src = await photo_el.get_attribute("src", timeout=1000) + if src and not src.startswith('data:image'): + data_dict['photo_url'] = urljoin(page_url, src) + break + except PlaywrightException: + continue + + features_list = [] + feature_selectors = ["ul[class*='features'] li", "div[data-cmp='pill']", ".item-特色 span"] + for selector in feature_selectors: + feature_elements = await listing_element.locator(selector).all() + for fe_el in feature_elements[:5]: + try: + f_text = await fe_el.text_content(timeout=500) + if 
f_text and len(f_text.strip()) > 2 and len(f_text.strip()) < 50: + features_list.append(f_text.strip()) + except PlaywrightException: + continue + if features_list: + break + data_dict['features'] = list(set(features_list)) + + location_selectors = ["div[data-cmp*='location']", "div.text-gray-dark.text-truncate", ".item-location"] + for selector in location_selectors: + location_el = listing_element.locator(selector).first + if await location_el.count(): + try: + loc_text = await location_el.text_content(timeout=1000) + data_dict['location'] = loc_text.replace('Located in', '').replace('Dealership Location', '').strip() + if data_dict['location']: + break + except PlaywrightException: + continue + + data_dict['listing_id_external'] = self._extract_listing_id_from_url(data_dict['listing_url']) + + return VehicleListingCreate(**data_dict) + + except PlaywrightException as e: + try: + listing_html_for_debug = await listing_element.evaluate("element => element.outerHTML", timeout=1000) + except Exception: + pass # element may be detached; keep the placeholder so the original error is not masked + logger.error(f"Playwright error extracting data from a listing card: {e}. HTML: {listing_html_for_debug[:500]}...") + except Exception as e: + try: + listing_html_for_debug = await listing_element.evaluate("element => element.outerHTML", timeout=1000) + except Exception: + pass + logger.error(f"General error extracting data from a listing card: {e}. HTML: {listing_html_for_debug[:500]}...") + return None + + async def scrape_listings(self, search_url: str, max_listings_to_fetch: int) -> List[VehicleListingCreate]: + if not self.browser or not self.browser.is_connected(): + logger.error("Browser not initialized or not connected. Call within async context manager.") + return [] + page: Optional[Page] = None + processed_listings: List[VehicleListingCreate] = [] + try: + context = await self.browser.new_context( + java_script_enabled=True, + accept_downloads=False, + locale='en-US' + ) + page = await context.new_page() + await self._apply_stealth_measures(page) + logger.info(f"Navigating to search URL: {search_url}") + await page.goto(search_url, wait_until="domcontentloaded", timeout=settings.BROWSER_TIMEOUT) + await self._human_like_delay(min_delay=self.page_load_delay, max_delay=self.page_load_delay + 3.0) + cookie_selectors = ['#onetrust-accept-btn-handler', 'button:has-text("Accept All Cookies")'] + for cs_selector in cookie_selectors: + try: + cookie_button = page.locator(cs_selector).first + if await cookie_button.is_visible(timeout=3000): + await cookie_button.click(timeout=5000, delay=random.uniform(0.3,0.8)*1000) + logger.info(f"Clicked cookie banner: {cs_selector}") + await self._human_like_delay(1.5, 2.5) + break + except PlaywrightException: + logger.debug(f"Cookie banner not found/visible or clickable with: {cs_selector}") + await self._human_like_scroll(page, scroll_attempts=settings.MAX_LISTINGS_PER_SESSION // 5 or 5) + listing_card_selectors = [ + "article[data-cmp='inventoryListing']", + "div[data-testid='srp-listing-item']", + "div[data-cmp='inventorySpotlightListingCard']", + ".inventory-listing", + "div[class*='srp-results'] div[class*='vehicle-card']" + ] + all_card_elements_locators = [] + for selector in listing_card_selectors: + elements_on_page = await page.locator(selector).count() + if elements_on_page > 0: + logger.info(f"Found {elements_on_page} cards with selector '{selector}'") + all_card_elements_locators.append(page.locator(selector)) + if selector in ["article[data-cmp='inventoryListing']", "div[data-testid='srp-listing-item']"]: + break + final_card_locator = None + if all_card_elements_locators: + final_card_locator =
all_card_elements_locators[0] + if not final_card_locator: + logger.warning(f"No listing cards found on page: {search_url}.") + try: + await page.screenshot(path=f"debug_no_listings_{datetime.now():%Y%m%d%H%M%S}.png") + except Exception as e: + logger.error(f"Failed to save screenshot: {e}") + return [] + num_cards_on_page = await final_card_locator.count() + logger.info(f"Total listing cards to process with chosen locator: {num_cards_on_page}") + for i in range(num_cards_on_page): + if len(processed_listings) >= max_listings_to_fetch: + logger.info(f"Reached max listings to fetch: {max_listings_to_fetch}") + break + card_element = final_card_locator.nth(i) + logger.info(f"Processing card {i+1}/{num_cards_on_page}...") + try: + if not await card_element.is_visible(timeout=3000): + await card_element.scroll_into_view_if_needed(timeout=5000) + await self._human_like_delay(0.5, 1.0) + except PlaywrightException as e: + logger.warning(f"Card {i+1} not visible or could not scroll into view, skipping: {e}") + continue + listing_data = await self._extract_listing_data(card_element, page.url) + if listing_data: + processed_listings.append(listing_data) + logger.info(f"Successfully extracted: {listing_data.title[:60]}... ({listing_data.listing_id_external})") + else: + logger.warning(f"Failed to extract complete data from card {i+1}.") + await self._human_like_delay() + except PlaywrightException as e: + logger.error(f"A Playwright error occurred during scraping session for {search_url}: {e}", exc_info=True) + if page: + try: + await page.screenshot(path=f"error_pw_session_{datetime.now():%Y%m%d%H%M%S}.png") + except Exception as se: + logger.error(f"Failed to save error screenshot: {se}") + except Exception as e: + logger.error(f"An unexpected error occurred during scraping session for {search_url}: {e}", exc_info=True) + if page: + try: + await page.screenshot(path=f"error_unexpected_session_{datetime.now():%Y%m%d%H%M%S}.png") + except Exception as se: + logger.error(f"Failed to save error screenshot: {se}") + finally: + if page: + try: + await page.close() + except PlaywrightException as e: + logger.error(f"Error closing page: {e}") + if 'context' in locals() and context: + try: + await context.close() + except PlaywrightException as e: + logger.error(f"Error closing context: {e}") + logger.info(f"Scraping session for {search_url} finished. Extracted {len(processed_listings)} listings.") + return processed_listings[:max_listings_to_fetch] + +async def run_autotrader_scraper_example_standalone(): + example_search_url = "https://www.autotrader.com/cars-for-sale/by-owner/all-states?searchRadius=0&sortBy=datelistedDESC&numRecords=25" + max_to_get = settings.MAX_LISTINGS_PER_SESSION + async with AutoTraderScraper() as scraper: + results = await scraper.scrape_listings(example_search_url, max_listings_to_fetch=max_to_get) + if results: + logger.info(f"\n--- Scraped {len(results)} AutoTrader Listings (Standalone Example Run) ---") + for i, listing in enumerate(results): + logger.info(f"{i+1}. 
ID_Ext: {listing.listing_id_external} - {listing.title} ({listing.year} {listing.make} {listing.model}) - Price: ${listing.price if listing.price else 'N/A'}") + else: + logger.info("No listings were extracted in the standalone example run.") + return results diff --git a/src/config.py b/src/config.py new file mode 100644 index 0000000..94a9b77 --- /dev/null +++ b/src/config.py @@ -0,0 +1,21 @@ +import os +from dotenv import load_dotenv + +load_dotenv() + +class Settings: + DATABASE_URL: str = os.getenv("DATABASE_URL", "sqlite+aiosqlite:///./default_vehicle_data.db") + HEADLESS: bool = os.getenv("HEADLESS", "true").lower() == "true" + BROWSER_TIMEOUT: int = int(os.getenv("BROWSER_TIMEOUT", "60000")) + PAGE_DELAY: int = int(os.getenv("PAGE_DELAY", "5000")) + MIN_DELAY_BETWEEN_ACTIONS: float = float(os.getenv("MIN_DELAY_BETWEEN_ACTIONS", "2.5")) + API_HOST: str = os.getenv("API_HOST", "127.0.0.1") + API_PORT: int = int(os.getenv("API_PORT", "8000")) + MAX_LISTINGS_PER_SESSION: int = int(os.getenv("MAX_LISTINGS_PER_SESSION", "25")) + + # Proxy configuration + PROXY_SERVER: str | None = os.getenv("PROXY_SERVER") + PROXY_USERNAME: str | None = os.getenv("PROXY_USERNAME") + PROXY_PASSWORD: str | None = os.getenv("PROXY_PASSWORD") + +settings = Settings() diff --git a/src/database.py b/src/database.py new file mode 100644 index 0000000..e42f8f1 --- /dev/null +++ b/src/database.py @@ -0,0 +1,28 @@ +from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker +from src.config import settings + +DATABASE_URL = settings.DATABASE_URL + +engine = create_async_engine( + DATABASE_URL, + echo=False, + future=True +) + +AsyncSessionLocal = async_sessionmaker( + bind=engine, + class_=AsyncSession, + expire_on_commit=False +) + +async def create_db_tables(): + from src.models.vehicle import Base + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.create_all) + +async def get_db(): + async with AsyncSessionLocal() as session: + try: + yield session + finally: + await session.close() diff --git a/src/models/__init__.py b/src/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/vehicle.py b/src/models/vehicle.py new file mode 100644 index 0000000..09bd717 --- /dev/null +++ b/src/models/vehicle.py @@ -0,0 +1,84 @@ +from sqlalchemy import Column, Integer, String, Float, DateTime, Text, Boolean +from sqlalchemy.orm import declarative_base +from sqlalchemy.sql import func +from pydantic import BaseModel, Field +from typing import Optional, List +from datetime import datetime + +Base = declarative_base() + +class VehicleListing(Base): + __tablename__ = "vehicle_listings" + + id = Column(Integer, primary_key=True, index=True) + listing_id_external = Column(String, index=True, unique=False, nullable=True) + title = Column(String, nullable=False) + year = Column(Integer, index=True, nullable=True) + make = Column(String, index=True, nullable=True) + model = Column(String, index=True, nullable=True) + trim = Column(String, nullable=True) + price = Column(Float, index=True, nullable=True) + mileage = Column(Integer, index=True, nullable=True) + listing_url = Column(Text, unique=True, nullable=False, index=True) + photo_url = Column(Text, nullable=True) + features = Column(Text, nullable=True) + location = Column(String, nullable=True) + seller_type = Column(String, default="private", nullable=True) + source_site = Column(String, default="autotrader", nullable=True) + created_at = Column(DateTime, default=func.now()) + updated_at = 
Column(DateTime, default=func.now(), onupdate=func.now()) + last_scraped_at = Column(DateTime, default=func.now(), onupdate=func.now()) + is_active = Column(Boolean, default=True, index=True) + +class VehicleListingCreate(BaseModel): + listing_id_external: Optional[str] = None + title: str + year: Optional[int] = None + make: Optional[str] = None + model: Optional[str] = None + trim: Optional[str] = None + price: Optional[float] = None + mileage: Optional[int] = None + listing_url: str + photo_url: Optional[str] = None + features: Optional[List[str]] = Field(default_factory=list) + location: Optional[str] = None + seller_type: Optional[str] = "private" + source_site: Optional[str] = "autotrader" + +class VehicleListingResponse(BaseModel): + id: int + listing_id_external: Optional[str] = None + title: str + year: Optional[int] = None + make: Optional[str] = None + model: Optional[str] = None + trim: Optional[str] = None + price: Optional[float] = None + mileage: Optional[int] = None + listing_url: str + photo_url: Optional[str] = None + features: Optional[List[str]] = Field(default_factory=list) + location: Optional[str] = None + seller_type: Optional[str] = None + source_site: Optional[str] = None + created_at: datetime + updated_at: datetime + last_scraped_at: datetime + is_active: bool + + class Config: + from_attributes = True + +class SearchFilters(BaseModel): + make: Optional[str] = None + model: Optional[str] = None + min_year: Optional[int] = None + max_year: Optional[int] = None + min_price: Optional[float] = None + max_price: Optional[float] = None + max_mileage: Optional[int] = None + location: Optional[str] = None + seller_type: Optional[str] = None + source_site: Optional[str] = None + is_active: Optional[bool] = True From 4664e8ff3956472ad33974a697d56e08f496d4d6 Mon Sep 17 00:00:00 2001 From: hellothere012 Date: Wed, 4 Jun 2025 08:13:53 -0700 Subject: [PATCH 3/4] chore: remove unused files --- package.json | 13 ------------- stealth_utils.py | 30 ------------------------------ 2 files changed, 43 deletions(-) delete mode 100644 package.json delete mode 100644 stealth_utils.py diff --git a/package.json b/package.json deleted file mode 100644 index 5f62ed5..0000000 --- a/package.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "name": "autotrader-scraper", - "version": "1.0.0", - "description": "A FastAPI application for scraping Autotrader data.", - "main": "index.js", - "scripts": { - "start": "python app.py", - "test": "echo \"Error: no test specified\" && exit 1" - }, - "keywords": ["fastapi", "autotrader", "scraper", "web-scraping"], - "author": "", - "license": "ISC" -} diff --git a/stealth_utils.py b/stealth_utils.py deleted file mode 100644 index e956687..0000000 --- a/stealth_utils.py +++ /dev/null @@ -1,30 +0,0 @@ -import logging - -async def apply_stealth_js(page): - """ - Applies various JavaScript injections to make Playwright less detectable. 
- """ - try: - # Pass the User-Agent test (though Playwright usually handles this well) - # user_agent = await page.evaluate("() => navigator.userAgent") - # await page.set_extra_http_headers({'User-Agent': user_agent.replace("HeadlessChrome", "Chrome")}) # Example - - # Pass the WebGL test - await page.add_init_script("(() => { const getParameter = WebGLRenderingContext.prototype.getParameter; WebGLRenderingContext.prototype.getParameter = function(parameter) { if (parameter === 37445) { return 'Intel Open Source Technology Center'; } if (parameter === 37446) { return 'Mesa DRI Intel(R) Ivybridge Mobile '; } return getParameter(parameter); }; })()") - - # Pass the Chrome test - await page.add_init_script("(() => { Object.defineProperty(navigator, 'webdriver', { get: () => false }); })()") - await page.add_init_script("(() => { window.chrome = { runtime: {}, loadTimes: function(){}, csi: function(){} }; })()") - - # Pass the Permissions test - await page.add_init_script("(() => { const originalQuery = window.navigator.permissions.query; window.navigator.permissions.query = (parameters) => ( parameters.name === 'notifications' ? Promise.resolve({ state: Notification.permission }) : originalQuery(parameters) ); })()") - - # Pass the Plugins Length test - await page.add_init_script("(() => { Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); })()") - - # Pass the Languages test - await page.add_init_script("(() => { Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); })()") - - logging.info("Applied JavaScript stealth techniques from stealth_utils.") - except Exception as e: - logging.error(f"Error applying stealth JS from stealth_utils: {e}", exc_info=True) From f21a2335b277111dd996460cbfba3bf134853e4f Mon Sep 17 00:00:00 2001 From: hellothere012 Date: Wed, 4 Jun 2025 08:17:16 -0700 Subject: [PATCH 4/4] chore: remove unused files --- .env.example | 37 +-- Procfile | 2 +- README.md | 48 +++- app.py | 123 ---------- app/crud.py | 50 ---- app/database.py | 25 -- app/main.py | 265 --------------------- app/models.py | 45 ---- app/schemas.py | 41 ---- app/scraper.py | 373 ------------------------------ config.py | 34 --- database.py | 26 --- main.py | 93 ++++++++ package.json | 13 -- render.yaml | 26 ++- requirements.txt | 15 +- {app => src}/__init__.py | 0 src/api/__init__.py | 0 src/api/routes.py | 237 +++++++++++++++++++ src/automation/__init__.py | 0 src/automation/browser_sim.py | 420 ++++++++++++++++++++++++++++++++++ src/config.py | 21 ++ src/database.py | 28 +++ src/models/__init__.py | 0 src/models/vehicle.py | 84 +++++++ stealth_utils.py | 30 --- 26 files changed, 975 insertions(+), 1061 deletions(-) delete mode 100644 app.py delete mode 100644 app/crud.py delete mode 100644 app/database.py delete mode 100644 app/main.py delete mode 100644 app/models.py delete mode 100644 app/schemas.py delete mode 100644 app/scraper.py delete mode 100644 config.py delete mode 100644 database.py create mode 100644 main.py delete mode 100644 package.json rename {app => src}/__init__.py (100%) create mode 100644 src/api/__init__.py create mode 100644 src/api/routes.py create mode 100644 src/automation/__init__.py create mode 100644 src/automation/browser_sim.py create mode 100644 src/config.py create mode 100644 src/database.py create mode 100644 src/models/__init__.py create mode 100644 src/models/vehicle.py delete mode 100644 stealth_utils.py diff --git a/.env.example b/.env.example index f7b9b12..4744016 100644 --- a/.env.example +++ b/.env.example 
@@ -1,21 +1,22 @@ -# Autotrader Configuration -AUTOTRADER_URL="https://www.autotrader.com/cars-for-sale/by-owner/fullerton-ca?zip=92833&searchRadius=50&numRecords=100&sortBy=priceDESC" +# Database Configuration +DATABASE_URL=sqlite+aiosqlite:///./vehicle_data.db -# Webshare Proxy Configuration -PROXY_HOST="your_webshare_proxy_host" -PROXY_PORT="your_webshare_proxy_port" -WEBSHARE_USERNAME="your_webshare_username" -WEBSHARE_PASSWORD="your_webshare_password" +# Browser Configuration +HEADLESS=true +BROWSER_TIMEOUT=60000 +PAGE_DELAY=5000 +MIN_DELAY_BETWEEN_ACTIONS=2.5 -# Database Configuration -# For local SQLite (default): -DATABASE_URL="sqlite+aiosqlite:///./data/vehicle_tracker.db" -DATABASE_TYPE="sqlite" -# Example for PostgreSQL: -# DATABASE_URL="postgresql+asyncpg://user:password@host:port/dbname" -# DATABASE_TYPE="postgresql" +# API Configuration +API_HOST=127.0.0.1 +API_PORT=8000 + +# Scraping Limits +MAX_LISTINGS_PER_SESSION=25 -# Application Configuration -LOG_LEVEL="INFO" -HEADLESS_BROWSER="True" # For Playwright -SCRAPE_TIMEOUT="120000" # For Playwright page/navigation timeout (milliseconds) +# Optional Proxy Configuration +# If using rotating proxies (e.g., Webshare), uncomment and provide the proxy URL. +# Example: http://username:password@proxyhost:port +# PROXY_SERVER= +# PROXY_USERNAME= +# PROXY_PASSWORD= diff --git a/Procfile b/Procfile index 3972b54..84b6dde 100644 --- a/Procfile +++ b/Procfile @@ -1 +1 @@ -web: uvicorn app.main:app --host=0.0.0.0 --port=${PORT:-8000} +web: uvicorn main:app --host=0.0.0.0 --port=${PORT:-8000} diff --git a/README.md b/README.md index 894b33c..7525e16 100644 --- a/README.md +++ b/README.md @@ -1 +1,47 @@ -# vehicle-tracker \ No newline at end of file +# Vehicle Tracker + +This project provides a FastAPI-based API and web scraper for collecting and storing vehicle listings from sites like AutoTrader. It uses Playwright for scraping and SQLAlchemy with SQLite for storage. + +## Usage + +1. Install dependencies: + ```bash + pip install -r requirements.txt + playwright install chromium + ``` +2. Copy `.env.example` to `.env` and adjust settings as needed. +3. Run the API: + ```bash + uvicorn main:app --reload + ``` +4. Trigger scraping via the `/api/v1/vehicles/scrape` endpoint. + +The scraper can also be run standalone: +```bash +python main.py scrape_test +``` + +## Environment Variables + +Set the following variables in a `.env` file or your deployment environment: + +| Variable | Description | Default | +| --- | --- | --- | +| `DATABASE_URL` | Database connection URL | `sqlite+aiosqlite:///./vehicle_data.db` | +| `HEADLESS` | Run the browser in headless mode | `true` | +| `BROWSER_TIMEOUT` | Playwright launch timeout (ms) | `60000` | +| `PAGE_DELAY` | Base delay after page loads (ms) | `5000` | +| `MIN_DELAY_BETWEEN_ACTIONS` | Delay between scraping actions (s) | `2.5` | +| `API_HOST` | Host for the FastAPI server | `127.0.0.1` | +| `API_PORT` | Port for the FastAPI server | `8000` | +| `MAX_LISTINGS_PER_SESSION` | Maximum listings fetched per scrape | `25` | +| `PROXY_SERVER` | *(Optional)* Proxy URL for Playwright | - | +| `PROXY_USERNAME` | *(Optional)* Proxy username | - | +| `PROXY_PASSWORD` | *(Optional)* Proxy password | - | + +### Pagination + +The `/api/v1/vehicles/` endpoint accepts `skip` and `limit` query parameters to paginate results. +Example: `/api/v1/vehicles/?skip=25&limit=25`. 
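+ +For a quick local check (assuming the API is running on the default `127.0.0.1:8000`), the same pagination can be exercised with `curl`: + +```bash +# first page of 25 results +curl "http://127.0.0.1:8000/api/v1/vehicles/?skip=0&limit=25" +# second page +curl "http://127.0.0.1:8000/api/v1/vehicles/?skip=25&limit=25" +```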
+ + diff --git a/app.py b/app.py deleted file mode 100644 index 8fa8500..0000000 --- a/app.py +++ /dev/null @@ -1,123 +0,0 @@ -import logging -# import os # No longer needed for getenv in background task -import asyncio -from fastapi import FastAPI, Depends, BackgroundTasks -from pydantic import BaseModel -from typing import Dict -from datetime import datetime -from database import CarListing, get_db, Session, SessionLocal -from scraper import scrape_autotrader_and_update_db -from fastapi.middleware.cors import CORSMiddleware -from config import AUTOTRADER_URL, HEADLESS_BROWSER, SCRAPE_TIMEOUT, LOG_LEVEL # Import from config - -# Configure basic logging using LOG_LEVEL from config -# Ensure this is called only once. If FastAPI/Uvicorn also configures logging, -# this might need adjustment or to be handled by the logger instance directly. -# For now, assume this is the primary logging config. -logging.basicConfig(level=LOG_LEVEL, format='%(asctime)s - %(levelname)s - %(message)s', force=True) -# Added force=True to ensure this config takes precedence if uvicorn also tries to set basicConfig. - -app = FastAPI() -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_methods=["*"], - allow_headers=["*"], -) - -class CarListingRaw(BaseModel): - platform: str - extracted_at: datetime - source_url: str - data_points: Dict - -@app.post("/api/v1/listings/ingest") -async def ingest_listing(payload: CarListingRaw, db: Session = Depends(get_db)): - listing = CarListing( - platform=payload.platform, - extracted_at=payload.extracted_at, - source_url=payload.source_url, - data_points=payload.data_points - ) - db.add(listing) - db.commit() - db.refresh(listing) - return {"status": "saved", "listing_id": listing.id} - -@app.get("/") -def read_root(): - return {"message": "πŸš— Car Tracker API is running!"} - -# Global variable to store scraping status -scrape_status = { - "last_run_time": None, - "status": "idle", # States: idle, running, success, error - "message": "", - "added": 0, - "updated": 0, - "scraped_count": 0 -} - -# Background task wrapper -async def _background_scraper_task_wrapper(): - global scrape_status - db_task_session: Session = SessionLocal() - logging.info("Background scraper task started.") - scrape_status["status"] = "running" - scrape_status["message"] = "Scraping in progress..." - scrape_status["last_run_time"] = datetime.utcnow().isoformat() - scrape_status["added"] = 0 # Reset counts for current run - scrape_status["updated"] = 0 - scrape_status["scraped_count"] = 0 - - try: - # Use imported config values - # autotrader_url = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/private-seller") - # headless_str = os.getenv("HEADLESS_BROWSER", "True") - # headless = headless_str.lower() == "true" - # scrape_timeout_str = os.getenv("SCRAPE_TIMEOUT", "120000") - # try: - # scrape_timeout = int(scrape_timeout_str) - # except ValueError: - # logging.warning(f"Invalid SCRAPE_TIMEOUT value: {scrape_timeout_str}. Defaulting to 120000ms.") - # scrape_timeout = 120000 - - logging.info(f"Background task using URL: {AUTOTRADER_URL}, Headless: {HEADLESS_BROWSER}, Timeout: {SCRAPE_TIMEOUT}ms") - - result = await scrape_autotrader_and_update_db( - db=db_task_session, - autotrader_url=AUTOTRADER_URL, - headless=HEADLESS_BROWSER, - scrape_timeout=SCRAPE_TIMEOUT - ) - - if result.get("status") == "success": - scrape_status["status"] = "success" - scrape_status["message"] = "Scraping completed successfully." 
- scrape_status["added"] = result.get("added", 0) - scrape_status["updated"] = result.get("updated", 0) - scrape_status["scraped_count"] = result.get("scraped_count", 0) - else: - scrape_status["status"] = "error" - scrape_status["message"] = result.get("message", "Scraping failed with an unknown error.") - - logging.info(f"Background scraper task completed: {result}") - - except Exception as e: - logging.error(f"Error in background scraper task: {e}", exc_info=True) - scrape_status["status"] = "error" - scrape_status["message"] = str(e) - finally: - db_task_session.close() - logging.info("Background scraper DB session closed.") - -@app.post("/api/v1/scrape/autotrader") -async def trigger_autotrader_scrape(background_tasks: BackgroundTasks): - if scrape_status["status"] == "running": - return {"message": "AutoTrader scraping job is already running."} - background_tasks.add_task(_background_scraper_task_wrapper) - return {"message": "AutoTrader scraping job started in the background."} - -@app.get("/api/v1/scrape/status") -async def get_scrape_status(): - return scrape_status diff --git a/app/crud.py b/app/crud.py deleted file mode 100644 index fb8708b..0000000 --- a/app/crud.py +++ /dev/null @@ -1,50 +0,0 @@ -from sqlalchemy.orm import Session -from . import models, schemas -from datetime import datetime - -def get_car_listing_by_url(db: Session, url: str): - return db.query(models.ScrapedData).filter(models.ScrapedData.url == url).first() - -def create_car_listing(db: Session, listing: schemas.CarListingCreate): - db_listing = models.ScrapedData( - job_id=listing.job_id, - platform=listing.platform, - url=str(listing.url), # Ensure HttpUrl is converted to string - title=listing.title, - price=listing.price, - mileage=listing.mileage, - vin=listing.vin, - image_urls=listing.image_urls, # Assuming image_urls is already a list of strings or compatible JSON - raw_data=listing.raw_data, - scraped_at=datetime.utcnow() - ) - db.add(db_listing) - db.commit() - db.refresh(db_listing) - return db_listing - -def create_scrape_job(db: Session) -> models.ScrapeJob: - db_job = models.ScrapeJob(timestamp=datetime.utcnow(), status="pending") - db.add(db_job) - db.commit() - db.refresh(db_job) - return db_job - -def update_scrape_job_status(db: Session, job_id: int, status: str, results_count: int = 0, error_message: str = None): - db_job = db.query(models.ScrapeJob).filter(models.ScrapeJob.id == job_id).first() - if db_job: - db_job.status = status - db_job.results_count = results_count - db_job.error_message = error_message - db.commit() - db.refresh(db_job) - return db_job - -def get_scrape_job(db: Session, job_id: int): - return db.query(models.ScrapeJob).filter(models.ScrapeJob.id == job_id).first() - -def get_all_scrape_jobs(db: Session, skip: int = 0, limit: int = 100): - return db.query(models.ScrapeJob).order_by(models.ScrapeJob.timestamp.desc()).offset(skip).limit(limit).all() - -def get_listings_for_job(db: Session, job_id: int, skip: int = 0, limit: int = 100): - return db.query(models.ScrapedData).filter(models.ScrapedData.job_id == job_id).offset(skip).limit(limit).all() diff --git a/app/database.py b/app/database.py deleted file mode 100644 index bf32154..0000000 --- a/app/database.py +++ /dev/null @@ -1,25 +0,0 @@ -from sqlalchemy import create_engine -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import sessionmaker -import os - -DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///./data/vehicle_tracker.db") - -engine_args = {} -if 
DATABASE_URL.startswith("sqlite"): - engine_args["connect_args"] = {"check_same_thread": False} - -engine = create_engine(DATABASE_URL, **engine_args) -SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) - -Base = declarative_base() - -def get_db(): - db = SessionLocal() - try: - yield db - finally: - db.close() - -def create_tables(): - Base.metadata.create_all(bind=engine) diff --git a/app/main.py b/app/main.py deleted file mode 100644 index 4817f15..0000000 --- a/app/main.py +++ /dev/null @@ -1,265 +0,0 @@ -import logging -import os -from fastapi import FastAPI, Depends, HTTPException, BackgroundTasks -from sqlalchemy.orm import Session -from typing import List - -from . import crud, models, schemas, scraper -from .database import SessionLocal, engine - -# Create database tables if they don't exist -models.Base.metadata.create_all(bind=engine) - -# Configure logging -LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper() -logging.basicConfig(level=LOG_LEVEL, format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) - -app = FastAPI(title="AutoTrader Scraper API", version="1.0.0") - -# Dependency to get DB session -def get_db(): - db = SessionLocal() - try: - yield db - finally: - db.close() - -# Global variable to store scraping status (simple approach) -scrape_status = { - "job_id": None, - "status": "idle", # States: idle, pending, running, completed, failed - "message": "No scraping job initiated yet.", - "last_run_time": None, - "duration_seconds": None, - "results_count": 0, - "error_message": None -} - -async def run_scraping_task(job_id: int, autotrader_url: str, headless: bool, scrape_timeout: int): - """ - The actual scraping task that runs in the background. - It creates its own database session. - """ - global scrape_status - db: Session = SessionLocal() - try: - logger.info(f"Background task started for job_id: {job_id}") - crud.update_scrape_job_status(db, job_id, status="running") - scrape_status.update({ - "job_id": job_id, - "status": "running", - "message": f"Scraping from {autotrader_url}...", - "last_run_time": datetime.utcnow().isoformat(), - "duration_seconds": None, - "results_count": 0, - "error_message": None - }) - - start_time = datetime.utcnow() - - scraped_data_list = await scraper.scrape_autotrader_data( - autotrader_url=autotrader_url, - headless=headless, - timeout=scrape_timeout - ) - - end_time = datetime.utcnow() - duration = (end_time - start_time).total_seconds() - scrape_status["duration_seconds"] = round(duration, 2) - - added_count = 0 - updated_count = 0 # Placeholder for future update logic - - if not scraped_data_list: - logger.info(f"No listings found for job_id: {job_id}") - crud.update_scrape_job_status(db, job_id, status="completed", results_count=0) - scrape_status.update({ - "status": "completed", - "message": "Scraping completed. 
No new listings found or page was inaccessible.", - "results_count": 0 - }) - return - - for item_data in scraped_data_list: - # Ensure all required fields for CarListingCreate are present - listing_create = schemas.CarListingCreate( - job_id=job_id, - platform=item_data.get("source_name", "autotrader"), # Get platform from scraper or default - url=item_data.get("listing_url"), - title=item_data.get("title"), - price=item_data.get("price"), - mileage=item_data.get("mileage"), - vin=item_data.get("vin"), - image_urls=item_data.get("image_urls", []), - raw_data=item_data.get("data_points", {}) - ) - - existing_listing = crud.get_car_listing_by_url(db, str(listing_create.url)) - if existing_listing: - # For now, we just count updates. Actual update logic could be added here. - # e.g., existing_listing.price = listing_create.price - # existing_listing.extracted_at = datetime.utcnow() - updated_count += 1 - else: - crud.create_car_listing(db=db, listing=listing_create) - added_count += 1 - - crud.update_scrape_job_status(db, job_id, status="completed", results_count=added_count) - scrape_status.update({ - "status": "completed", - "message": f"Scraping finished. Added: {added_count}, Updated: {updated_count} (placeholder).", - "results_count": added_count + updated_count # Or just added_count if updates aren't really changing data - }) - logger.info(f"Background task for job_id: {job_id} completed. Added: {added_count}, Updated: {updated_count}") - - except Exception as e: - logger.error(f"Error in background scraper task for job_id {job_id}: {e}", exc_info=True) - crud.update_scrape_job_status(db, job_id, status="failed", error_message=str(e)) - scrape_status.update({ - "status": "failed", - "message": f"Error during scraping: {str(e)}", - "error_message": str(e) - }) - finally: - db.close() - logger.info(f"DB session closed for job_id: {job_id}") - - -@app.post("/scrape/", response_model=schemas.ScrapeJob, status_code=202) -async def trigger_scrape(background_tasks: BackgroundTasks, db: Session = Depends(get_db)): - """ - Triggers a new scraping job for Autotrader. - """ - global scrape_status - if scrape_status.get("status") == "running": - raise HTTPException(status_code=409, detail="A scraping job is already in progress.") - - autotrader_url = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/all-cars/cars-under-10000") # Default to a common search if not set - headless_str = os.getenv("HEADLESS_BROWSER", "True") - headless = headless_str.lower() == "true" - scrape_timeout_str = os.getenv("SCRAPE_TIMEOUT", "120000") - - try: - scrape_timeout = int(scrape_timeout_str) - except ValueError: - scrape_timeout = 120000 # Default timeout if parsing fails - logger.warning(f"Invalid SCRAPE_TIMEOUT value: {scrape_timeout_str}. 
Using default {scrape_timeout}ms.") - - job = crud.create_scrape_job(db) - scrape_status.update({ - "job_id": job.id, - "status": "pending", - "message": f"Scraping job {job.id} initiated for URL: {autotrader_url}", - "last_run_time": job.timestamp.isoformat(), - "duration_seconds": None, - "results_count": 0, - "error_message": None - }) - - # Pass job_id to the background task - background_tasks.add_task(run_scraping_task, job.id, autotrader_url, headless, scrape_timeout) - - logger.info(f"Scraping job {job.id} queued for URL: {autotrader_url}") - return job - -@app.post("/api/v1/listings/ingest", response_model=schemas.CarListing, status_code=201) -async def ingest_listing(payload: schemas.CarListingCreate, db: Session = Depends(get_db)): - """ - Ingests a new car listing into the database. - This endpoint is useful for manually adding or testing data. - """ - # Check if listing with this URL already exists to prevent duplicates, - # though the database constraint should also handle this. - db_listing = crud.get_car_listing_by_url(db, url=str(payload.url)) - if db_listing: - raise HTTPException(status_code=400, detail="Listing with this URL already exists.") - - # The job_id in CarListingCreate might be problematic if this is a direct ingest - # not tied to a specific scrape job. For now, we'll assume it's provided or - # we could adjust the schema/logic if direct ingestion shouldn't have a job_id. - # For testing, we might need to create a dummy job or adjust schema. - # Let's assume for now a valid job_id is provided or handle it if not. - if not payload.job_id: - # Create a dummy job or handle as per requirements for listings not tied to a job - # For simplicity, let's assume job_id is optional in the schema for this use case - # or a default/placeholder job_id is used. - # For this test, the payload includes job_id, so we'll proceed. - # If CarListingCreate schema requires job_id, this endpoint needs to handle it. - # For now, let's assume it's provided in the payload. - pass - - try: - created_listing = crud.create_car_listing(db=db, listing=payload) - return created_listing - except Exception as e: - logger.error(f"Error ingesting listing: {e}", exc_info=True) - raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") - - -@app.get("/scrape/status", response_model=schemas.ScrapeJob) # Using ScrapeJob schema for better structure -async def get_current_scrape_status(db: Session = Depends(get_db)): - """ - Returns the status of the current or last scraping job. - """ - global scrape_status - if scrape_status.get("job_id"): - job = crud.get_scrape_job(db, scrape_status["job_id"]) - if job: - # Update status from DB if available, otherwise use in-memory for simplicity - # A more robust system might always fetch from DB or use a proper job queue status - return job - return scrape_status # Fallback to in-memory status if job not found or not started - -@app.get("/scrape/jobs/", response_model=List[schemas.ScrapeJob]) -async def read_jobs(skip: int = 0, limit: int = 10, db: Session = Depends(get_db)): - """ - Retrieve all scrape jobs. - """ - jobs = crud.get_all_scrape_jobs(db, skip=skip, limit=limit) - return jobs - -@app.get("/scrape/jobs/{job_id}/results", response_model=List[schemas.CarListing]) -async def read_job_results(job_id: int, skip: int = 0, limit: int = 10, db: Session = Depends(get_db)): - """ - Retrieve results for a specific scrape job. 
- """ - job = crud.get_scrape_job(db, job_id=job_id) - if job is None: - raise HTTPException(status_code=404, detail="Job not found") - listings = crud.get_listings_for_job(db, job_id=job_id, skip=skip, limit=limit) - return listings - -@app.get("/") -async def read_root(): - return {"message": "AutoTrader Scraper API is running!"} - -# This is for local development if you run `python app/main.py` -# Uvicorn will be started by Procfile in production environments like Heroku -if __name__ == "__main__": - # Ensure tables are created before starting the app if they don't exist - # This is useful for local development but might be handled differently in production - from .database import create_tables - create_tables() - - # Get port from environment variable or default to 8000 - port = int(os.getenv("PORT", "8000")) - uvicorn.run(app, host="0.0.0.0", port=port) - -# Remove the old main.py content if it exists in the root directory -# This is now handled by app/main.py -# Ensure Procfile points to app.main:app or similar based on your directory structure -# e.g., web: uvicorn app.main:app --host=0.0.0.0 --port=${PORT:-8000} -# (Assuming app.py is moved to app/main.py) -# If app.py remains in root, then Procfile is fine. - -# The `models.Base.metadata.create_all(bind=engine)` should ideally be called once, -# perhaps in main.py or a startup script, not every time database.py is imported. -# For simplicity in this single-file app structure, it's often put there. -# If app.py is the main entry point for uvicorn, it's a good place. -# For Render, buildCommand in render.yaml can also handle migrations/table creation. - -# Let's ensure the imports are correct considering the file structure -# If main.py is in root and imports from app/, it should be `from app import crud, models, schemas, scraper` -# If this file is app/main.py, then `from . import crud, models, schemas, scraper` is correct. -# The prompt implies this file is app/main.py. 
diff --git a/app/models.py b/app/models.py deleted file mode 100644 index b0d4e5d..0000000 --- a/app/models.py +++ /dev/null @@ -1,45 +0,0 @@ -from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, JSON -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship -from datetime import datetime - -Base = declarative_base() - -class ScrapeResult(Base): - __tablename__ = "scrape_results" - - id = Column(Integer, primary_key=True, index=True) - url = Column(String, index=True) - title = Column(String) - price = Column(String, nullable=True) # Store as string to handle variations like 'Contact Seller' - mileage = Column(String, nullable=True) # Store as string to handle non-numeric values - vin = Column(String, nullable=True, unique=True) - images = Column(JSON, nullable=True) # Store list of image URLs - scraped_at = Column(DateTime) - details = Column(JSON, nullable=True) # Store other details as JSON - -class ScrapeJob(Base): - __tablename__ = "scrape_jobs" - - id = Column(Integer, primary_key=True, index=True) - timestamp = Column(DateTime, default=datetime.utcnow) - status = Column(String, default="pending") # e.g., pending, running, completed, failed - results_count = Column(Integer, default=0) - error_message = Column(String, nullable=True) - -class ScrapedData(Base): - __tablename__ = "scraped_data" - - id = Column(Integer, primary_key=True, index=True) - job_id = Column(Integer, ForeignKey("scrape_jobs.id")) - platform = Column(String) # e.g., 'autotrader', 'cars.com' - url = Column(String, unique=True, index=True) - title = Column(String, nullable=True) - price = Column(String, nullable=True) - mileage = Column(String, nullable=True) - vin = Column(String, nullable=True, index=True) - image_urls = Column(JSON, nullable=True) # List of image URLs - raw_data = Column(JSON, nullable=True) # Full raw data if needed - scraped_at = Column(DateTime, default=datetime.utcnow) - - job = relationship("ScrapeJob") diff --git a/app/schemas.py b/app/schemas.py deleted file mode 100644 index 2ee0d8d..0000000 --- a/app/schemas.py +++ /dev/null @@ -1,41 +0,0 @@ -from pydantic import BaseModel, HttpUrl -from typing import List, Optional, Dict, Any -from datetime import datetime - -class CarListingBase(BaseModel): - url: HttpUrl - title: Optional[str] = None - price: Optional[str] = None # Keep as string to handle variations - mileage: Optional[str] = None # Keep as string - vin: Optional[str] = None - image_urls: Optional[List[HttpUrl]] = [] - raw_data: Optional[Dict[str, Any]] = {} # For any other unstructured data - -class CarListingCreate(CarListingBase): - platform: str - job_id: int - -class CarListing(CarListingBase): - id: int - platform: str - job_id: int - scraped_at: datetime - - class Config: - orm_mode = True - -class ScrapeJobBase(BaseModel): - pass - -class ScrapeJobCreate(ScrapeJobBase): - pass - -class ScrapeJob(ScrapeJobBase): - id: int - timestamp: datetime - status: str - results_count: int = 0 - error_message: Optional[str] = None - - class Config: - orm_mode = True diff --git a/app/scraper.py b/app/scraper.py deleted file mode 100644 index b946ecb..0000000 --- a/app/scraper.py +++ /dev/null @@ -1,373 +0,0 @@ -import asyncio -import logging -# import os # No longer needed for getenv in main -import datetime # Keep for now, might be used in data processing -from playwright.async_api import async_playwright -# Required for main test function -from config import AUTOTRADER_URL, HEADLESS_BROWSER, SCRAPE_TIMEOUT -# DATABASE_URL is used 
by database.py, SessionLocal will pick it up via config - -# Assuming database.py is in the same directory or accessible in PYTHONPATH -from database import get_db, CarListing, SessionLocal # Added SessionLocal for main example -from sqlalchemy.orm import Session -from datetime import datetime # Ensure datetime is imported directly - -# Configure basic logging -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') - -from stealth_utils import apply_stealth_js # Import new stealth utility -# from playwright_stealth import stealth_async # Commenting out old stealth - -class AutoTraderScraper: - """Scraper for AutoTrader private party listings using Playwright.""" - - def __init__(self, source_name: str = "autotrader"): - """ - Initializes the AutoTraderScraper. - Args: - source_name (str): Name of the source platform. - """ - self.source_name = source_name - # Potentially load other configs from a config file or env vars here - # For example: self.base_url = "https://www.autotrader.com/cars-for-sale/private-seller" - - async def get_private_listings(self, autotrader_url: str, headless: bool, timeout: int = 120000) -> list[dict]: - """ - Scrapes private party listings from AutoTrader using Playwright. - - Args: - autotrader_url (str): The starting URL for scraping AutoTrader private listings. - headless (bool): Whether to run the browser in headless mode. - timeout (int): Maximum time in milliseconds for page operations. - - Returns: - list[dict]: A list of dictionaries, where each dictionary represents a scraped vehicle listing. - """ - listings_data = [] - browser = None - - launch_options = { - "headless": headless, - "args": [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-infobars', - '--window-position=0,0', - '--ignore-certificate-errors', - '--ignore-certificate-errors-spki-list', - # '--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"' # User agent is set in context - '--disable-gpu' # Already there but keep - ], - # "channel": "chrome" # This might require full Chrome install, trying without first to see if args help - } - - # Try with 'msedge' or 'chrome' if default chromium fails and they are available - # For now, stick to chromium and args. If 'channel' is needed, it's a bigger setup change. - - async with async_playwright() as p: - try: - # browser = await p.chromium.launch(**launch_options) # Default chromium - # Let's try specifying channel, assuming it might use a locally installed Chrome if available, or a Playwright-managed one. - # This is a common suggestion if the default Playwright Chromium build is too easily detected. - # If "chrome" channel is not found by Playwright, it will error. - try: - browser = await p.chromium.launch( - **launch_options, - channel="chrome" # Attempt to use a branded Chrome build - ) - logging.info("Attempting to launch with channel='chrome'") - except Exception as e_channel: - logging.warning(f"Failed to launch with channel='chrome' ({e_channel}). 
Falling back to default Playwright Chromium.") - # Remove channel from launch_options if it failed - launch_options_no_channel = launch_options.copy() - if "channel" in launch_options_no_channel: # Should not be needed based on above structure but good practice - del launch_options_no_channel["channel"] - browser = await p.chromium.launch(**launch_options_no_channel) - logging.info("Launched with default Playwright Chromium.") - - - context = await browser.new_context( - user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36', # A fairly common user agent - java_script_enabled=True, - ) - context.set_default_navigation_timeout(timeout) - context.set_default_timeout(timeout) - - page = await context.new_page() - await page.set_viewport_size({"width": 1920, "height": 1080}) - - # Apply custom JS stealth - await apply_stealth_js(page) - - logging.info(f"Navigating to {autotrader_url}") - await page.goto(autotrader_url, wait_until="domcontentloaded", timeout=timeout) # Reverted to domcontentloaded - - title = await page.title() - logging.info(f"Page title: {title}") - - if "unavailable" in title.lower() or "block" in title.lower() or "access denied" in title.lower(): - logging.critical(f"Failed to load AutoTrader listings page. Blocked by website. Title: {title}") - await browser.close() # Ensure browser is closed before returning - return [] - - # Using speculative selectors for AutoTrader - # Main container for listings: 'div[data-qaid="cntnr-lstng-main"]' (this might be too broad or incorrect) - # A more specific item selector might be needed, e.g., an article or a div with a specific class. - # For now, let's assume individual listing cards can be found with a selector like: - # "div.inventory-listing" or "div[data-cmp='inventoryListing']" - these are common patterns. - # The provided example 'div[data-qaid="cntnr-lstng-main"]' seems like it might be a single container FOR ALL listings. - # Let's try a more specific (but still guessed) selector for individual listing items. - # A common pattern is items within a list or grid. Let's try to find items: - # This selector is a **GUESS** based on common AutoTrader structures. - listing_item_selector = "div[data-cmp='inventoryListing']" # GUESS - - # Fallback if the primary guess doesn't work, try another common pattern - # listing_item_selector_fallback = "div.inventory-listing.new-listing.stub" # Another GUESS - - # await page.wait_for_selector(listing_item_selector, timeout=15000) # Wait for items to appear - - listing_containers = await page.query_selector_all(listing_item_selector) - - # if not listing_containers: - # logging.info(f"No listings found with primary selector '{listing_item_selector}'. 
Trying fallback...") - # listing_containers = await page.query_selector_all(listing_item_selector_fallback) - - logging.info(f"Found {len(listing_containers)} potential listing containers using selector '{listing_item_selector}'.") - - processed_count = 0 - # first_container_processed_for_html_dump = False # REMOVE HTML DUMP FLAG - for i, container in enumerate(listing_containers): - url_path = None - title_text = "N/A" # Default to N/A - price_text = "N/A" # Default to N/A - mileage_text = "N/A" # Default to N/A (as it's not reliably on card) - listing_url = None - - try: - logging.debug(f"Processing container {i+1}/{len(listing_containers)}") - - # Attempt to get Title - title_el = await container.query_selector("h2[data-cmp='subheading']") # Updated selector from HTML dump - if title_el: - raw_title_text = await title_el.inner_text() - title_text = raw_title_text.strip() if raw_title_text else "N/A" - - # Attempt to get URL from parent of title_el - # Playwright's query_selector does not directly support xpath like "ancestor::a". - # A common structure is
<a href="..."><h2>...</h2></a> or <div><a data-cmp="link"><h2>...</h2></a></div>.
- # We can try to find 'a' that contains this h2, or assume the 'a[data-cmp="link"]' is the one. - - # Let's use the a[data-cmp="link"] which was identified as containing the title h2 - parent_link_el = await container.query_selector("a[data-cmp='link']") - if parent_link_el: - url_path = await parent_link_el.get_attribute("href") - else: # Fallback if the above structure isn't found - logging.warning(f"Could not find parent a[data-cmp='link'] for title in listing {i+1}") - else: - logging.warning(f"Title not found with h2[data-cmp='subheading'] for listing {i+1}.") - - # Fallback or alternative for URL if not found via title's parent link - if not url_path: - url_el_alt = await container.query_selector("a[data-cmp='relLnk']") # Keep this fallback - if url_el_alt: - url_path = await url_el_alt.get_attribute("href") - - if not url_path: # Last resort for URL - first_a = await container.query_selector("a[href]") # Broadest fallback - if first_a: - url_path = await first_a.get_attribute("href") - - if not url_path: - logging.warning(f"Could not extract URL for listing {i+1} (Title: {title_text}). Skipping.") - continue - - if not url_path.startswith(('http://', 'https://')): - listing_url = f"https://www.autotrader.com{url_path}" - else: - listing_url = url_path - - # Attempt to get Price - price_el = await container.query_selector("div[data-cmp='firstPrice']") # Updated selector - if price_el: - raw_price_text = await price_el.inner_text() - price_text = raw_price_text.replace('$', '').replace(',', '').strip() if raw_price_text else "N/A" - else: - # Fallback for price (e.g. .first-price class directly) - price_el_fallback = await container.query_selector(".first-price") - if price_el_fallback: - raw_price_text = await price_el_fallback.inner_text() - price_text = raw_price_text.replace('$', '').replace(',', '').strip() if raw_price_text else "N/A" - else: - logging.warning(f"Price not found for listing {listing_url}") - price_text = "N/A" - - - # Mileage - Set to N/A as it's not reliably on the card from previous findings - mileage_text = "N/A" - # logging.info(f"Mileage not scraped from listing card for {listing_url} (by design for now).") - - vin_text = None - - listing_data = { - "listing_url": listing_url, - "title": title_text, # Already defaults to N/A or has value - "price": price_text, # Already defaults to N/A or has value - "mileage": mileage_text, # Is N/A - "vin": vin_text, - "source_name": self.source_name, - "data_points": { - "page_title_at_scrape": title # page's title, not listing's - } - } - listings_data.append(listing_data) - processed_count += 1 - logging.info(f"Successfully processed listing: {title_text[:50]}... URL: {listing_url}") - - except Exception as e: - logging.error(f"Error processing listing container {i+1} for URL {listing_url if listing_url else 'Unknown'}: {e}", exc_info=True) - continue - - logging.info(f"Successfully processed {processed_count} out of {len(listing_containers)} listing containers.") - - except Exception as e: - logging.error(f"An error occurred during Playwright scraping phase: {e}", exc_info=True) - finally: - if browser: - logging.info("Closing browser.") - await browser.close() - - return listings_data - - -async def scrape_autotrader_data(autotrader_url: str, headless: bool = True, timeout: int = 120000) -> list[dict]: - """ - High-level function to scrape data from AutoTrader. - Initializes the scraper and calls its scraping method. - - Args: - autotrader_url (str): The URL to scrape. 
- headless (bool): Whether to run the browser in headless mode. - timeout (int): Timeout for scraping operations in milliseconds. - - Returns: - list[dict]: A list of scraped listing data. - """ - scraper = AutoTraderScraper() - listings = await scraper.get_private_listings(autotrader_url=autotrader_url, headless=headless, timeout=timeout) - return listings - - -async def scrape_autotrader_and_update_db(db: Session, autotrader_url: str, headless: bool, scrape_timeout: int): - """ - Scrapes listings from AutoTrader and updates the database. - - Args: - db (Session): The SQLAlchemy database session. - autotrader_url (str): The URL to scrape. - headless (bool): Whether to run the browser in headless mode. - scrape_timeout (int): Timeout for scraping operations in milliseconds. - - Returns: - dict: A status dictionary with counts of added, updated, and scraped listings. - """ - logging.info(f"Starting scrape and update for URL: {autotrader_url}") - - try: - listings_data = await scrape_autotrader_data( - autotrader_url=autotrader_url, - headless=headless, - timeout=scrape_timeout - ) - except Exception as e: - logging.error(f"Failed to scrape data from {autotrader_url}: {e}", exc_info=True) - return {"status": "error", "message": f"Scraping failed: {e}"} - - added_count = 0 - updated_count = 0 - scraped_count = len(listings_data) - - for listing_data in listings_data: - source_url = listing_data.get('listing_url') # Renamed from 'url' to 'listing_url' in dummy data - if not source_url: - logging.warning(f"Scraped item missing 'listing_url': {listing_data.get('title')}. Skipping.") - continue - - try: - existing_listing = db.query(CarListing).filter(CarListing.source_url == source_url).first() - - if existing_listing: - # Placeholder for update logic - # existing_listing.extracted_at = datetime.utcnow() - # existing_listing.data_points = {k: v for k, v in listing_data.items() if k != 'listing_url'} - # # Update other fields like price if necessary - # db.add(existing_listing) # Not strictly necessary if only mutable fields changed and session tracks - updated_count += 1 - logging.info(f"Listing at {source_url} already exists. 
Marked for update (placeholder).") - else: - new_listing = CarListing( - platform="autotrader", - extracted_at=datetime.utcnow(), - source_url=source_url, - # Ensure data_points stores everything else from listing_data - data_points={k: v for k, v in listing_data.items() if k != 'listing_url'} - ) - db.add(new_listing) - added_count += 1 - logging.info(f"New listing added from {source_url}") - except Exception as e: - logging.error(f"Error processing listing {source_url} for DB: {e}", exc_info=True) - # Decide if you want to rollback here or continue with other listings - - try: - db.commit() - logging.info("Database changes committed.") - except Exception as e: - logging.error(f"Database commit failed: {e}", exc_info=True) - db.rollback() - return {"status": "error", "message": f"DB commit failed: {e}", "added": 0, "updated": 0, "scraped_count": scraped_count} - - status_summary = { - "status": "success", - "added": added_count, - "updated": updated_count, - "scraped_count": scraped_count - } - logging.info(f"DB update summary: {status_summary}") - return status_summary - -async def main(): - # Use settings from config.py - # url = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/private-seller") - # headless_str = os.getenv("HEADLESS_BROWSER", "True") - # headless = headless_str.lower() == "true" - # scrape_timeout_str = os.getenv("SCRAPE_TIMEOUT", "120000") - # try: - # scrape_timeout = int(scrape_timeout_str) - # except ValueError: - # logging.warning(f"Invalid SCRAPE_TIMEOUT value: {scrape_timeout_str}. Defaulting to 120000ms.") - # scrape_timeout = 120000 - - # from database import SessionLocal # Already imported at the top - db: Session = SessionLocal() # SessionLocal now uses DATABASE_URL from config.py via database.py - try: - logging.info(f"Starting scraper and DB update for URL: {AUTOTRADER_URL}, Headless: {HEADLESS_BROWSER}, Timeout: {SCRAPE_TIMEOUT}ms") - stats = await scrape_autotrader_and_update_db( - db=db, - autotrader_url=AUTOTRADER_URL, - headless=HEADLESS_BROWSER, - scrape_timeout=SCRAPE_TIMEOUT - ) - logging.info(f"Scraping and DB update completed: {stats}") - except Exception as e: - logging.error(f"Error during scraping and DB update in main: {e}", exc_info=True) - finally: - logging.info("Closing DB session in main.") - db.close() - -if __name__ == "__main__": - # To run this: - # 1. Ensure Playwright browsers are installed: `playwright install chromium` - # 2. Set environment variables if needed (AUTOTRADER_URL, HEADLESS_BROWSER, SCRAPE_TIMEOUT) - # 3. Uncomment the line below - asyncio.run(main()) - # pass # Keep it passive for now, to be run manually when needed diff --git a/config.py b/config.py deleted file mode 100644 index 44148ca..0000000 --- a/config.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -from dotenv import load_dotenv - -# Load environment variables from .env file if it exists -# This is useful for local development. 
-load_dotenv()
-
-# Database Configuration
-DATABASE_URL: str = os.getenv("DATABASE_URL", "sqlite:///./data/vehicle_tracker.db")
-
-# Scraper Configuration
-AUTOTRADER_URL: str = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/private-seller")
-SCRAPE_TIMEOUT: int = int(os.getenv("SCRAPE_TIMEOUT", "120000")) # Milliseconds
-HEADLESS_BROWSER: bool = os.getenv("HEADLESS_BROWSER", "True").lower() == "true"
-
-# API Configuration (if any specific ones are needed later)
-# Example: API_HOST: str = os.getenv("API_HOST", "0.0.0.0")
-# Example: API_PORT: int = int(os.getenv("API_PORT", "8000"))
-
-# Logging Configuration (can also be added here if more complex)
-LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO").upper()
-
-# Ensure critical URLs have a scheme for robustness
-if not AUTOTRADER_URL.startswith(("http://", "https://")):
-    # This print statement is for immediate feedback during startup/import.
-    # In a pure library, side effects on import are sometimes discouraged,
-    # but for an application's main config, it's often acceptable.
-    print(f"Warning: AUTOTRADER_URL ('{AUTOTRADER_URL}') did not have a scheme, prepended https://.")
-    AUTOTRADER_URL = "https://" + AUTOTRADER_URL
-    print(f"Corrected AUTOTRADER_URL: {AUTOTRADER_URL}")
-
-
-# Example of how to handle SQLite connect_args based on config
-DB_CONNECT_ARGS: dict = {"check_same_thread": False} if DATABASE_URL.startswith("sqlite") else {}
diff --git a/database.py b/database.py
deleted file mode 100644
index 58a11dc..0000000
--- a/database.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from sqlalchemy import Column, Integer, String, DateTime, JSON, create_engine
-from sqlalchemy.orm import declarative_base, sessionmaker, Session
-from config import DATABASE_URL, DB_CONNECT_ARGS # Import from config
-
-# Use imported configuration
-engine = create_engine(DATABASE_URL, connect_args=DB_CONNECT_ARGS)
-
-SessionLocal = sessionmaker(bind=engine, autoflush=False)
-Base = declarative_base()
-
-class CarListing(Base):
-    __tablename__ = "listings"
-    id = Column(Integer, primary_key=True, index=True)
-    platform = Column(String)
-    extracted_at = Column(DateTime)
-    source_url = Column(String, unique=True)
-    data_points = Column(JSON)
-
-Base.metadata.create_all(bind=engine)
-
-def get_db():
-    db = SessionLocal()
-    try:
-        yield db
-    finally:
-        db.close()
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..fb37fc5
--- /dev/null
+++ b/main.py
@@ -0,0 +1,93 @@
+import asyncio
+import uvicorn
+from contextlib import asynccontextmanager
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+import logging
+from datetime import datetime
+
+from src.database import create_db_tables
+from src.api.routes import router as api_v1_router
+from src.config import settings
+from src.automation.browser_sim import run_autotrader_scraper_example_standalone
+
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[logging.StreamHandler()]
+)
+logger = logging.getLogger(__name__)
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    logger.info("πŸš€ Starting Educational Vehicle Tracker API...")
+    await create_db_tables()
+    logger.info("πŸ“Š Database tables checked/created.")
+    yield
+    logger.info("πŸ”› Shutting down Educational Vehicle Tracker API.")
+
+app = FastAPI(
+    title="Educational Vehicle Tracker",
+    description="An educational system for learning web automation and data pipeline architecture, now with real scraping.",
+    version="1.1.0",
+    lifespan=lifespan
+)
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+app.include_router(api_v1_router, prefix="/api/v1")
+
+@app.get("/", include_in_schema=False)
+async def root_redirect_to_docs():
+    from fastapi.responses import RedirectResponse
+    return RedirectResponse(url="/docs")
+
+@app.get("/health", summary="Health Check")
+async def health_check():
+    return {"status": "healthy", "service": "vehicle-tracker-api", "timestamp": datetime.utcnow()}
+
+async def run_standalone_scrape_cli():
+    logger.info("🌽 Running Standalone AutoTrader Scraper Example from CLI")
+    print("=" * 40)
+    try:
+        await run_autotrader_scraper_example_standalone()
+        logger.info("βœ… Standalone scraper example completed successfully!")
+    except Exception as e:
+        logger.error(f"❌ Standalone scraper example failed: {e}", exc_info=True)
+
+if __name__ == "__main__":
+    import sys
+    print_startup_message = True
+    if len(sys.argv) > 1:
+        if sys.argv[1] == "scrape_test":
+            print_startup_message = False
+            asyncio.run(run_standalone_scrape_cli())
+        elif sys.argv[1] == "create_tables":
+            print_startup_message = False
+            asyncio.run(create_db_tables())
+            logger.info("Database tables creation process finished.")
+        else:
+            logger.warning(f"Unknown command: {sys.argv[1]}")
+            print("πŸ” Usage: python main.py [scrape_test | create_tables]")
+    if print_startup_message:
+        logger.info("πŸŽ“ Educational Vehicle Tracking System - API Server Mode")
+        print("=" * 50)
+        logger.info(f"API Host: {settings.API_HOST}")
+        logger.info(f"API Port: {settings.API_PORT}")
+        logger.info(f"Database: {settings.DATABASE_URL}")
+        logger.info(f"Max Listings per Session (Scrape): {settings.MAX_LISTINGS_PER_SESSION}")
+        logger.info(f"Playwright Headless: {settings.HEADLESS}")
+        print("=" * 50)
+        uvicorn.run(
+            "main:app",
+            host=settings.API_HOST,
+            port=settings.API_PORT,
+            reload=True,
+            log_level="info"
+        )
diff --git a/package.json b/package.json
deleted file mode 100644
index 5f62ed5..0000000
--- a/package.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "name": "autotrader-scraper",
-  "version": "1.0.0",
-  "description": "A FastAPI application for scraping Autotrader data.",
-  "main": "index.js",
-  "scripts": {
-    "start": "python app.py",
-    "test": "echo \"Error: no test specified\" && exit 1"
-  },
-  "keywords": ["fastapi", "autotrader", "scraper", "web-scraping"],
-  "author": "",
-  "license": "ISC"
-}
diff --git a/render.yaml b/render.yaml
index c0e7ffa..f755e35 100644
--- a/render.yaml
+++ b/render.yaml
@@ -5,15 +5,25 @@ services:
     buildCommand: |
       pip install -r requirements.txt
       playwright install chromium
-    startCommand: uvicorn app:app --host 0.0.0.0 --port ${PORT:-8000}
+    startCommand: uvicorn main:app --host 0.0.0.0 --port ${PORT:-8000}
     envVars:
       - key: PYTHON_VERSION
-        value: 3.11 # Or your desired Python version
+        value: 3.11
       - key: DATABASE_URL
-        generateValue: true # For Render PostgreSQL, or set manually for SQLite/external DB
+        generateValue: true
-      - key: AUTOTRADER_URL
-        value: "https://www.autotrader.com/cars-for-sale/private-seller" # Example
-      - key: SCRAPE_TIMEOUT
-        value: 120000 # Example: 120 seconds
-      - key: HEADLESS_BROWSER
+      - key: HEADLESS
         value: "True"
+      - key: BROWSER_TIMEOUT
+        value: "60000"
+      - key: PAGE_DELAY
+        value: "5000"
+      - key: MIN_DELAY_BETWEEN_ACTIONS
+        value: "2.5"
+      - key: MAX_LISTINGS_PER_SESSION
+        value: "25"
+      - key: PROXY_SERVER
+        value: ""
+      - key: 
PROXY_USERNAME + value: "" + - key: PROXY_PASSWORD + value: "" diff --git a/requirements.txt b/requirements.txt index 2e8a221..b450ec4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,7 @@ -aiofiles -asyncpg -fastapi -playwright -playwright-stealth -python-dotenv -sqlalchemy[asyncio] -uvicorn[standard] +fastapi==0.104.1 +uvicorn==0.24.0 +sqlalchemy==2.0.23 +aiosqlite==0.19.0 +playwright==1.40.0 +python-dotenv==1.0.0 +pydantic==2.5.0 diff --git a/app/__init__.py b/src/__init__.py similarity index 100% rename from app/__init__.py rename to src/__init__.py diff --git a/src/api/__init__.py b/src/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/api/routes.py b/src/api/routes.py new file mode 100644 index 0000000..612fb3f --- /dev/null +++ b/src/api/routes.py @@ -0,0 +1,237 @@ +from fastapi import APIRouter, Depends, HTTPException, Query, BackgroundTasks +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy import select, and_, exists, update +from datetime import datetime +import json +from typing import List +import logging + +from src.database import get_db, AsyncSessionLocal +from src.models.vehicle import ( + VehicleListing, + VehicleListingCreate, + VehicleListingResponse, + SearchFilters +) +from src.automation.browser_sim import AutoTraderScraper +from src.config import settings + +router = APIRouter() +logger = logging.getLogger(__name__) + +async def scrape_and_store_task(search_url: str, max_listings: int, source_site_name: str = "autotrader"): + logger.info(f"Background task started: Scraping {source_site_name} URL: {search_url} for max {max_listings} listings.") + created_count = 0 + updated_count = 0 + failed_count = 0 + processed_urls = set() + if source_site_name.lower() == "autotrader": + ScraperClass = AutoTraderScraper + else: + logger.error(f"Unsupported source site: {source_site_name}") + return + async with ScraperClass() as scraper: + scraped_listings_pydantic = await scraper.scrape_listings( + search_url=search_url, + max_listings_to_fetch=max_listings + ) + if not scraped_listings_pydantic: + logger.info(f"No listings returned from {source_site_name} scraper for URL: {search_url}") + return + logger.info(f"{source_site_name} scraper returned {len(scraped_listings_pydantic)} listings. 
Processing for DB storage...") + async with AsyncSessionLocal() as db_session: + for listing_data in scraped_listings_pydantic: + if not listing_data.listing_url: + logger.warning("Scraped data missing listing_url, skipping.") + failed_count += 1 + continue + if listing_data.listing_url in processed_urls: + logger.debug(f"URL {listing_data.listing_url} already processed in this run, skipping duplicate.") + continue + processed_urls.add(listing_data.listing_url) + try: + stmt = select(VehicleListing).where(VehicleListing.listing_url == listing_data.listing_url) + result = await db_session.execute(stmt) + existing_vehicle = result.scalar_one_or_none() + features_json = json.dumps(listing_data.features) if listing_data.features else None + if existing_vehicle: + logger.debug(f"Updating existing listing: {listing_data.listing_url} (ID: {existing_vehicle.id})") + update_values = {} + if listing_data.title and existing_vehicle.title != listing_data.title: + update_values['title'] = listing_data.title + if listing_data.price is not None and existing_vehicle.price != listing_data.price: + update_values['price'] = listing_data.price + if listing_data.mileage is not None and existing_vehicle.mileage != listing_data.mileage: + update_values['mileage'] = listing_data.mileage + if features_json and existing_vehicle.features != features_json: + update_values['features'] = features_json + if listing_data.photo_url and existing_vehicle.photo_url != listing_data.photo_url: + update_values['photo_url'] = listing_data.photo_url + if listing_data.location and existing_vehicle.location != listing_data.location: + update_values['location'] = listing_data.location + if listing_data.year and existing_vehicle.year != listing_data.year: + update_values['year'] = listing_data.year + if listing_data.make and existing_vehicle.make != listing_data.make: + update_values['make'] = listing_data.make + if listing_data.model and existing_vehicle.model != listing_data.model: + update_values['model'] = listing_data.model + if listing_data.trim and existing_vehicle.trim != listing_data.trim: + update_values['trim'] = listing_data.trim + update_values['is_active'] = True + update_values['last_scraped_at'] = datetime.utcnow() + if update_values: + stmt_update = update(VehicleListing).where(VehicleListing.id == existing_vehicle.id).values(**update_values) + await db_session.execute(stmt_update) + updated_count += 1 + else: + logger.debug(f"Adding new listing: {listing_data.listing_url}") + db_vehicle = VehicleListing( + listing_id_external=listing_data.listing_id_external, + title=listing_data.title, + year=listing_data.year, + make=listing_data.make, + model=listing_data.model, + trim=listing_data.trim, + price=listing_data.price, + mileage=listing_data.mileage, + listing_url=listing_data.listing_url, + photo_url=listing_data.photo_url, + features=features_json, + location=listing_data.location, + seller_type=listing_data.seller_type, + source_site=listing_data.source_site, + is_active=True, + last_scraped_at=datetime.utcnow() + ) + db_session.add(db_vehicle) + created_count += 1 + await db_session.commit() + except Exception as e: + failed_count += 1 + logger.error(f"Failed to process/store listing {listing_data.listing_url}: {e}", exc_info=True) + await db_session.rollback() + logger.info(f"Background task for {source_site_name} finished. 
Created={created_count}, Updated={updated_count}, Failed={failed_count}") + +@router.get("/", response_model=dict, include_in_schema=False) +async def api_v1_root_info(): + return { + "message": "Vehicle Tracking API - V1", + "active_endpoints": ["/vehicles", "/vehicles/search", "/vehicles/scrape", "/vehicles/{id}", "/vehicles/stats/summary"] + } + +@router.get("/vehicles/", response_model=List[VehicleListingResponse]) +async def get_all_vehicles( + skip: int = Query(0, ge=0), + limit: int = Query(settings.MAX_LISTINGS_PER_SESSION, ge=1, le=200), + db: AsyncSession = Depends(get_db), + filters: SearchFilters = Depends(), +): + query = select(VehicleListing) + conditions = [] + if filters.is_active is not None: + conditions.append(VehicleListing.is_active == filters.is_active) + if filters.make: + conditions.append(VehicleListing.make.ilike(f"%{filters.make}%")) + if filters.model: + conditions.append(VehicleListing.model.ilike(f"%{filters.model}%")) + if filters.min_year: + conditions.append(VehicleListing.year >= filters.min_year) + if filters.max_year: + conditions.append(VehicleListing.year <= filters.max_year) + if filters.min_price: + conditions.append(VehicleListing.price >= filters.min_price) + if filters.max_price: + conditions.append(VehicleListing.price <= filters.max_price) + if filters.max_mileage: + conditions.append(VehicleListing.mileage <= filters.max_mileage) + if filters.location: + conditions.append(VehicleListing.location.ilike(f"%{filters.location}%")) + if filters.seller_type: + conditions.append(VehicleListing.seller_type.ilike(f"%{filters.seller_type}%")) + if filters.source_site: + conditions.append(VehicleListing.source_site.ilike(f"%{filters.source_site}%")) + if conditions: + query = query.where(and_(*conditions)) + query = query.order_by(VehicleListing.last_scraped_at.desc(), VehicleListing.created_at.desc()) + result = await db.execute(query.offset(skip).limit(limit)) + vehicles = result.scalars().all() + response_vehicles = [] + for vehicle_db_item in vehicles: + response_vehicles.append(VehicleListingResponse.model_validate(vehicle_db_item)) + return response_vehicles + +@router.get("/vehicles/{vehicle_id}", response_model=VehicleListingResponse) +async def get_vehicle_by_id_route(vehicle_id: int, db: AsyncSession = Depends(get_db)): + query = select(VehicleListing).where(VehicleListing.id == vehicle_id) + result = await db.execute(query) + vehicle_db_item = result.scalar_one_or_none() + if not vehicle_db_item: + raise HTTPException(status_code=404, detail="Vehicle not found") + return VehicleListingResponse.model_validate(vehicle_db_item) + +@router.post("/vehicles/", response_model=VehicleListingResponse, status_code=201) +async def create_vehicle_listing_manual( + vehicle_create_data: VehicleListingCreate, + db: AsyncSession = Depends(get_db) +): + stmt_exists = select(exists().where(VehicleListing.listing_url == vehicle_create_data.listing_url)) + url_exists = await db.scalar(stmt_exists) + if url_exists: + raise HTTPException(status_code=409, detail=f"Vehicle with URL {vehicle_create_data.listing_url} already exists.") + features_json_str = json.dumps(vehicle_create_data.features) if vehicle_create_data.features else None + db_vehicle_item = VehicleListing( + **vehicle_create_data.model_dump(exclude={'features'}), + features=features_json_str, + is_active=True, + last_scraped_at=datetime.utcnow() + ) + db.add(db_vehicle_item) + await db.commit() + await db.refresh(db_vehicle_item) + return VehicleListingResponse.model_validate(db_vehicle_item) + 
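+# Illustrative client call for the endpoint below (assumes the default host and
+# port from src/config.py; the search URL is an example, not a tested value):
+#
+#   curl -X POST "http://127.0.0.1:8000/api/v1/vehicles/scrape?site_name=autotrader&max_listings=10&search_url=https%3A%2F%2Fwww.autotrader.com%2Fcars-for-sale%2Fby-owner%2Fall-states"
+#
+# The endpoint returns 202 immediately and runs scrape_and_store_task via
+# FastAPI's BackgroundTasks.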
+@router.post("/vehicles/scrape", status_code=202) +async def trigger_site_scrape( + background_tasks: BackgroundTasks, + site_name: str = Query("autotrader", description="Name of the site to scrape (e.g., 'autotrader')."), + search_url: str = Query(..., description="Full search URL for the specified site."), + max_listings: int = Query(settings.MAX_LISTINGS_PER_SESSION, description="Maximum listings to fetch from this scrape.", ge=1, le=100) +): + logger.info(f"Received request to scrape {site_name} URL: {search_url} for max {max_listings} listings.") + if site_name.lower() not in ["autotrader"]: + raise HTTPException(status_code=400, detail=f"Scraping for site '{site_name}' is not supported.") + background_tasks.add_task(scrape_and_store_task, search_url, max_listings, site_name) + return {"message": f"{site_name.capitalize()} scraping task accepted and started in the background for URL: {search_url}"} + +@router.delete("/vehicles/{vehicle_id}", status_code=200) +async def delete_vehicle_listing(vehicle_id: int, db: AsyncSession = Depends(get_db)): + query = select(VehicleListing).where(VehicleListing.id == vehicle_id) + result = await db.execute(query) + vehicle_db_item = result.scalar_one_or_none() + if not vehicle_db_item: + raise HTTPException(status_code=404, detail="Vehicle not found") + await db.delete(vehicle_db_item) + await db.commit() + return {"message": "Vehicle deleted successfully"} + +@router.get("/vehicles/stats/summary", response_model=dict) +async def get_vehicle_listing_stats(db: AsyncSession = Depends(get_db)): + from sqlalchemy import func as sql_func + make_query = select(VehicleListing.make, sql_func.count(VehicleListing.id).label('count'))\ + .where(VehicleListing.make.isnot(None)).group_by(VehicleListing.make).order_by(sql_func.count(VehicleListing.id).desc()) + make_result = await db.execute(make_query) + make_stats = [{"make": row[0], "count": row[1]} for row in make_result.all()] + year_query = select(VehicleListing.year, sql_func.avg(VehicleListing.price).label('avg_price'), sql_func.count(VehicleListing.id).label('count'))\ + .where(VehicleListing.year.isnot(None)).group_by(VehicleListing.year).order_by(VehicleListing.year.desc()) + year_result = await db.execute(year_query) + year_stats = [{"year": row[0], "avg_price": round(row[1], 2) if row[1] else 0.0, "count": row[2]} for row in year_result.all()] + total_query = select(sql_func.count(VehicleListing.id)) + total_count = await db.scalar(total_query) or 0 + active_query = select(sql_func.count(VehicleListing.id)).where(VehicleListing.is_active == True) + active_count = await db.scalar(active_query) or 0 + return { + "total_listings_in_db": total_count, + "active_listings": active_count, + "by_make": make_stats, + "by_year_with_avg_price": year_stats + } diff --git a/src/automation/__init__.py b/src/automation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/automation/browser_sim.py b/src/automation/browser_sim.py new file mode 100644 index 0000000..c93f204 --- /dev/null +++ b/src/automation/browser_sim.py @@ -0,0 +1,420 @@ +import asyncio +import json +import re +import random +from typing import List, Dict, Optional +from playwright.async_api import async_playwright, Page, Browser, PlaywrightException, Locator +from urllib.parse import urljoin, urlparse, parse_qs +from datetime import datetime +import logging +import hashlib + +from src.config import settings +from src.models.vehicle import VehicleListingCreate + +logger = logging.getLogger(__name__) + +class AutoTraderScraper: 
+ def __init__(self): + self.browser: Optional[Browser] = None + self.playwright_instance: Optional[async_playwright] = None + self.base_action_delay = settings.MIN_DELAY_BETWEEN_ACTIONS + self.page_load_delay = settings.PAGE_DELAY / 1000 + + async def __aenter__(self): + logger.info("Initializing AutoTrader Scraper...") + self.playwright_instance = await async_playwright().start() + try: + proxy_cfg = None + if settings.PROXY_SERVER: + proxy_cfg = {"server": settings.PROXY_SERVER} + if settings.PROXY_USERNAME and settings.PROXY_PASSWORD: + proxy_cfg["username"] = settings.PROXY_USERNAME + proxy_cfg["password"] = settings.PROXY_PASSWORD + + self.browser = await self.playwright_instance.chromium.launch( + headless=settings.HEADLESS, + proxy=proxy_cfg, + args=[ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-infobars', + '--window-position=0,0', + '--ignore-certificate-errors', + '--ignore-certificate-errors-spki-list', + '--disable-blink-features=AutomationControlled', + '--disable-dev-shm-usage' + ], + timeout=settings.BROWSER_TIMEOUT + ) + logger.info(f"Browser launched (Headless: {settings.HEADLESS})") + except PlaywrightException as e: + logger.error(f"Failed to launch browser: {e}") + if self.playwright_instance: + await self.playwright_instance.stop() + raise + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + logger.info("Closing AutoTrader Scraper resources...") + if self.browser and self.browser.is_connected(): + try: + await self.browser.close() + logger.info("Browser closed.") + except PlaywrightException as e: + logger.error(f"Error closing browser: {e}") + if self.playwright_instance: + try: + await self.playwright_instance.stop() + logger.info("Playwright instance stopped.") + except Exception as e: + logger.error(f"Error stopping Playwright: {e}") + if exc_type: + logger.error(f"Exception occurred during scraping: {exc_val}", exc_info=(exc_type, exc_val, exc_tb)) + + async def _apply_stealth_measures(self, page: Page): + logger.info("Applying stealth measures to page...") + user_agents = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36" + ] + await page.set_extra_http_headers({"User-Agent": random.choice(user_agents)}) + await page.add_init_script(""" + Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); + Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en', 'en-GB'] }); + const pluginCount = Math.floor(Math.random() * 3) + 1; + Object.defineProperty(navigator, 'plugins', { + get: () => Array(pluginCount).fill(null).map((_, i) => ({ name: `Plugin ${i}`, filename: `plugin${i}.dll`, description: `Mock plugin ${i}` })) + }); + const mimeTypeCount = Math.floor(Math.random() * 3) + 1; + Object.defineProperty(navigator, 'mimeTypes', { + get: () => Array(mimeTypeCount).fill(null).map((_, i) => ({ type: `application/x-mimetype${i}`, suffixes: `m${i}`, description: `Mock mimetype ${i}` })) + }); + const getParameter = WebGLRenderingContext.prototype.getParameter; + WebGLRenderingContext.prototype.getParameter = function(parameter) { + if (parameter === 37445) return 'Intel Open Source Technology Center'; + if (parameter === 37446) 
return 'Mesa DRI Intel(R) Iris Xe Graphics (TGL GT2)'; + return getParameter.apply(this, arguments); + }; + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = (parameters) => ( + parameters.name === 'notifications' ? Promise.resolve({ state: Notification.permission }) : originalQuery(parameters) + ); + try { Date.prototype.getTimezoneOffset = function() { return -Math.floor(Math.random() * 8 + 3) * 60; }; } catch (e) {} + """) + viewports = [{"width": 1920, "height": 1080}, {"width": 1366, "height": 768}, {"width": 1440, "height": 900}, {"width": 2560, "height": 1440}] + await page.set_viewport_size(random.choice(viewports)) + logger.info("Stealth measures applied.") + + async def _human_like_delay(self, min_delay: Optional[float] = None, max_delay: Optional[float] = None): + min_d = min_delay if min_delay is not None else self.base_action_delay + max_d = max_delay if max_delay is not None else self.base_action_delay + 2.0 + delay = random.uniform(min_d, max_d) + logger.debug(f"Waiting {delay:.2f} seconds...") + await asyncio.sleep(delay) + + async def _human_like_scroll(self, page: Page, scroll_attempts=7): + logger.info(f"Performing human-like scrolling: {scroll_attempts} attempts...") + previous_scroll_height = -1.0 + for i in range(scroll_attempts): + current_scroll_height = float(await page.evaluate("document.body.scrollHeight")) + if abs(current_scroll_height - previous_scroll_height) < 1.0 and i > 0: + logger.info(f"Scroll attempt {i+1}: Reached end of scrollable content or no new content loaded.") + break + scroll_amount = await page.evaluate(f"Math.random() * window.innerHeight * 0.7 + window.innerHeight * 0.3") + await page.evaluate(f"window.scrollBy(0, {scroll_amount})") + await self._human_like_delay(min_delay=0.8, max_delay=2.2) + previous_scroll_height = current_scroll_height + logger.info("Scrolling to bottom one last time...") + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await self._human_like_delay(min_delay=2.0, max_delay=3.5) + logger.info("Scrolling finished.") + + def _extract_listing_id_from_url(self, url: str) -> Optional[str]: + if not url: + return None + try: + parsed_url = urlparse(url) + query_params = parse_qs(parsed_url.query) + if 'listingId' in query_params: + return query_params['listingId'][0] + path_parts = [part for part in parsed_url.path.split('/') if part] + if 'vehicle' in path_parts: + vehicle_idx = path_parts.index('vehicle') + if vehicle_idx + 1 < len(path_parts): + return path_parts[vehicle_idx+1] + for part in reversed(path_parts): + if part.isdigit() and len(part) > 5: + return part + except Exception as e: + logger.warning(f"Could not parse structured listing ID from URL {url}: {e}") + logger.debug(f"No structured ID found, hashing URL for ID: {url}") + return hashlib.md5(url.encode()).hexdigest()[:16] + + def _parse_title_details(self, title_str: str) -> Dict: + details = {'year': None, 'make': None, 'model': None, 'trim': None} + if not title_str: + return details + original_title = title_str + year_match = re.search(r'\b(19[89]\d|20[0-2]\d|2030)\b', title_str) + if year_match: + details['year'] = int(year_match.group(1)) + title_str = title_str.replace(year_match.group(1), "", 1).strip() + title_str = re.sub(r'^(Used|New|Certified Pre-Owned|CPO)\s+', '', title_str, flags=re.IGNORECASE).strip() + parts = title_str.split(maxsplit=3) + if len(parts) > 0: + details['make'] = parts[0] + if len(parts) > 1: + details['model'] = parts[1] + if len(parts) > 2: + 
details['trim'] = " ".join(parts[2:])
+        logger.debug(f"Parsed title details: {details} from original title: '{original_title}'")
+        return details
+
+    async def _extract_listing_data(self, listing_element: Locator, page_url: str) -> Optional[VehicleListingCreate]:
+        data_dict: Dict[str, Any] = {}
+        listing_html_for_debug = "N/A (HTML not captured)"
+        try:
+            link_el_selectors = [
+                'a[data-cmp="inventoryListingCardLink"]',
+                'a[data-testid="srp-list-item-link"]',
+                'a[href*="vehicledetails.xhtml?listingId="]',
+                'h2 > a',
+                'h3 > a'
+            ]
+            raw_href = None
+            for selector in link_el_selectors:
+                link_el = listing_element.locator(selector).first
+                if await link_el.count():
+                    raw_href = await link_el.get_attribute("href", timeout=1500)
+                    if raw_href:
+                        break
+            if not raw_href:
+                logger.warning("No primary link found for a listing card. Skipping.")
+                return None
+            data_dict['listing_url'] = urljoin(page_url, raw_href)
+
+            title_el_selectors = ["h2[data-cmp*='title']", "h3[data-cmp*='title']", "div[data-cmp='displayName'] h2", "h2", "h3"]
+            raw_title = "Title Not Found"
+            for selector in title_el_selectors:
+                title_el = listing_element.locator(selector).first
+                if await title_el.count():
+                    try:
+                        raw_title = await title_el.text_content(timeout=1500)
+                        if raw_title and raw_title.strip():
+                            break
+                    except PlaywrightException:
+                        continue
+            data_dict['title'] = (raw_title or "Title Not Found").strip()
+            title_details = self._parse_title_details(data_dict['title'])
+            data_dict.update(title_details)
+
+            price_selectors = [
+                "span[data-cmp='pricingSection'] .text-size-lg-3",
+                "span[data-cmp='pricingSection']",
+                ".pricing-section .first-price",
+                "span[class*='price']", "div[class*='price']"
+            ]
+            for selector in price_selectors:
+                price_el = listing_element.locator(selector).filter(has_text=re.compile(r"\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?")).first
+                if await price_el.count():
+                    try:
+                        price_text = await price_el.text_content(timeout=1000)
+                        cleaned_price = re.sub(r'[^\d.]', '', price_text)
+                        if cleaned_price and cleaned_price != '.':
+                            data_dict['price'] = float(cleaned_price)
+                            break
+                    except PlaywrightException:
+                        continue
+
+            mileage_selectors = [
+                "div[data-cmp='listUnstyled'] li:has-text('miles')",
+                "div.item-vehicle-mileage",
+                "div[class*='mileage']", "span[class*='mileage']"
+            ]
+            for selector in mileage_selectors:
+                mileage_el = listing_element.locator(selector).filter(has_text=re.compile(r"[\d,]+\s*mi(?:les)?", re.IGNORECASE)).first
+                if await mileage_el.count():
+                    try:
+                        mileage_text = await mileage_el.text_content(timeout=1000)
+                        match = re.search(r'([\d,]+)\s*mi', mileage_text, re.IGNORECASE)
+                        if match:
+                            data_dict['mileage'] = int(re.sub(r',', '', match.group(1)))
+                            break
+                    except PlaywrightException:
+                        continue
+
+            photo_selectors = [
+                'img[data-cmp="responsiveImage"]',
+                'img[data-testid="srp-list-item-image"]',
+                '.srp-img-container img',
+                'img[alt*="vehicle image"]'
+            ]
+            for selector in photo_selectors:
+                photo_el = listing_element.locator(selector).first
+                if await photo_el.count():
+                    try:
+                        src = await photo_el.get_attribute("src", timeout=1000)
+                        if src and not src.startswith('data:image'):
+                            data_dict['photo_url'] = urljoin(page_url, src)
+                            break
+                    except PlaywrightException:
+                        continue
+
+            features_list = []
+            feature_selectors = ["ul[class*='features'] li", "div[data-cmp='pill']", ".item-features span"]
+            for selector in feature_selectors:
+                feature_elements = await listing_element.locator(selector).all()
+                for fe_el in feature_elements[:5]:
+                    try:
+                        f_text = await fe_el.text_content(timeout=500)
+                        if 
f_text and len(f_text.strip()) > 2 and len(f_text.strip()) < 50: + features_list.append(f_text.strip()) + except PlaywrightException: + continue + if features_list: + break + data_dict['features'] = list(set(features_list)) + + location_selectors = ["div[data-cmp*='location']", "div.text-gray-dark.text-truncate", ".item-location"] + for selector in location_selectors: + location_el = listing_element.locator(selector).first + if await location_el.count(): + try: + loc_text = await location_el.text_content(timeout=1000) + data_dict['location'] = loc_text.replace('Located in', '').replace('Dealership Location', '').strip() + if data_dict['location']: + break + except PlaywrightException: + continue + + data_dict['listing_id_external'] = self._extract_listing_id_from_url(data_dict['listing_url']) + + return VehicleListingCreate(**data_dict) + + except PlaywrightException as e: + listing_html_for_debug = await listing_element.evaluate("element => element.outerHTML", timeout=1000) + logger.error(f"Playwright error extracting data from a listing card: {e}. HTML: {listing_html_for_debug[:500]}...") + except Exception as e: + listing_html_for_debug = await listing_element.evaluate("element => element.outerHTML", timeout=1000) + logger.error(f"General error extracting data from a listing card: {e}. HTML: {listing_html_for_debug[:500]}...") + return None + + async def scrape_listings(self, search_url: str, max_listings_to_fetch: int) -> List[VehicleListingCreate]: + if not self.browser or not self.browser.is_connected(): + logger.error("Browser not initialized or not connected. Call within async context manager.") + return [] + page: Optional[Page] = None + processed_listings: List[VehicleListingCreate] = [] + try: + context = await self.browser.new_context( + java_script_enabled=True, + accept_downloads=False, + locale='en-US' + ) + page = await context.new_page() + await self._apply_stealth_measures(page) + logger.info(f"Navigating to search URL: {search_url}") + await page.goto(search_url, wait_until="domcontentloaded", timeout=settings.BROWSER_TIMEOUT) + await self._human_like_delay(min_delay=self.page_load_delay, max_delay=self.page_load_delay + 3.0) + cookie_selectors = ['#onetrust-accept-btn-handler', 'button:has-text("Accept All Cookies")'] + for cs_selector in cookie_selectors: + try: + cookie_button = page.locator(cs_selector).first + if await cookie_button.is_visible(timeout=3000): + await cookie_button.click(timeout=5000, delay=random.uniform(0.3,0.8)*1000) + logger.info(f"Clicked cookie banner: {cs_selector}") + await self._human_like_delay(1.5, 2.5) + break + except PlaywrightException: + logger.debug(f"Cookie banner not found/visible or clickable with: {cs_selector}") + await self._human_like_scroll(page, scroll_attempts=settings.MAX_LISTINGS_PER_SESSION // 5 or 5) + listing_card_selectors = [ + "article[data-cmp='inventoryListing']", + "div[data-testid='srp-listing-item']", + "div[data-cmp='inventorySpotlightListingCard']", + ".inventory-listing", + "div[class*='srp-results'] div[class*='vehicle-card']" + ] + all_card_elements_locators = [] + for selector in listing_card_selectors: + elements_on_page = await page.locator(selector).count() + if elements_on_page > 0: + logger.info(f"Found {elements_on_page} cards with selector '{selector}'") + all_card_elements_locators.append(page.locator(selector)) + if selector in ["article[data-cmp='inventoryListing']", "div[data-testid='srp-listing-item']"]: + break + final_card_locator = None + if all_card_elements_locators: + final_card_locator = 
all_card_elements_locators[0] + if not final_card_locator: + logger.warning(f"No listing cards found on page: {search_url}.") + try: + await page.screenshot(path=f"debug_no_listings_{datetime.now():%Y%m%d%H%M%S}.png") + except Exception as e: + logger.error(f"Failed to save screenshot: {e}") + return [] + num_cards_on_page = await final_card_locator.count() + logger.info(f"Total listing cards to process with chosen locator: {num_cards_on_page}") + for i in range(num_cards_on_page): + if len(processed_listings) >= max_listings_to_fetch: + logger.info(f"Reached max listings to fetch: {max_listings_to_fetch}") + break + card_element = final_card_locator.nth(i) + logger.info(f"Processing card {i+1}/{num_cards_on_page}...") + try: + if not await card_element.is_visible(timeout=3000): + await card_element.scroll_into_view_if_needed(timeout=5000) + await self._human_like_delay(0.5, 1.0) + except PlaywrightException as e: + logger.warning(f"Card {i+1} not visible or could not scroll into view, skipping: {e}") + continue + listing_data = await self._extract_listing_data(card_element, page.url) + if listing_data: + processed_listings.append(listing_data) + logger.info(f"Successfully extracted: {listing_data.title[:60]}... ({listing_data.listing_id_external})") + else: + logger.warning(f"Failed to extract complete data from card {i+1}.") + await self._human_like_delay() + except PlaywrightException as e: + logger.error(f"A Playwright error occurred during scraping session for {search_url}: {e}", exc_info=True) + if page: + try: + await page.screenshot(path=f"error_pw_session_{datetime.now():%Y%m%d%H%M%S}.png") + except Exception as se: + logger.error(f"Failed to save error screenshot: {se}") + except Exception as e: + logger.error(f"An unexpected error occurred during scraping session for {search_url}: {e}", exc_info=True) + if page: + try: + await page.screenshot(path=f"error_unexpected_session_{datetime.now():%Y%m%d%H%M%S}.png") + except Exception as se: + logger.error(f"Failed to save error screenshot: {se}") + finally: + if page: + try: + await page.close() + except PlaywrightException as e: + logger.error(f"Error closing page: {e}") + if 'context' in locals() and context: + try: + await context.close() + except PlaywrightException as e: + logger.error(f"Error closing context: {e}") + logger.info(f"Scraping session for {search_url} finished. Extracted {len(processed_listings)} listings.") + return processed_listings[:max_listings_to_fetch] + +async def run_autotrader_scraper_example_standalone(): + example_search_url = "https://www.autotrader.com/cars-for-sale/by-owner/all-states?searchRadius=0&sortBy=datelistedDESC&numRecords=25" + max_to_get = settings.MAX_LISTINGS_PER_SESSION + async with AutoTraderScraper() as scraper: + results = await scraper.scrape_listings(example_search_url, max_listings_to_fetch=max_to_get) + if results: + logger.info(f"\n--- Scraped {len(results)} AutoTrader Listings (Standalone Example Run) ---") + for i, listing in enumerate(results): + logger.info(f"{i+1}. 
ID_Ext: {listing.listing_id_external} - {listing.title} ({listing.year} {listing.make} {listing.model}) - Price: ${listing.price if listing.price else 'N/A'}")
+    else:
+        logger.info("No listings were extracted in the standalone example run.")
+    return results
diff --git a/src/config.py b/src/config.py
new file mode 100644
index 0000000..94a9b77
--- /dev/null
+++ b/src/config.py
@@ -0,0 +1,21 @@
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+class Settings:
+    DATABASE_URL: str = os.getenv("DATABASE_URL", "sqlite+aiosqlite:///./default_vehicle_data.db")
+    HEADLESS: bool = os.getenv("HEADLESS", "true").lower() == "true"
+    BROWSER_TIMEOUT: int = int(os.getenv("BROWSER_TIMEOUT", "60000"))
+    PAGE_DELAY: int = int(os.getenv("PAGE_DELAY", "5000"))
+    MIN_DELAY_BETWEEN_ACTIONS: float = float(os.getenv("MIN_DELAY_BETWEEN_ACTIONS", "2.5"))
+    API_HOST: str = os.getenv("API_HOST", "127.0.0.1")
+    API_PORT: int = int(os.getenv("API_PORT", "8000"))
+    MAX_LISTINGS_PER_SESSION: int = int(os.getenv("MAX_LISTINGS_PER_SESSION", "25"))
+
+    # Proxy configuration
+    PROXY_SERVER: str | None = os.getenv("PROXY_SERVER")
+    PROXY_USERNAME: str | None = os.getenv("PROXY_USERNAME")
+    PROXY_PASSWORD: str | None = os.getenv("PROXY_PASSWORD")
+
+settings = Settings()
diff --git a/src/database.py b/src/database.py
new file mode 100644
index 0000000..e42f8f1
--- /dev/null
+++ b/src/database.py
@@ -0,0 +1,28 @@
+from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker
+from src.config import settings
+
+DATABASE_URL = settings.DATABASE_URL
+
+engine = create_async_engine(
+    DATABASE_URL,
+    echo=False,
+    future=True
+)
+
+AsyncSessionLocal = async_sessionmaker(
+    bind=engine,
+    class_=AsyncSession,
+    expire_on_commit=False
+)
+
+async def create_db_tables():
+    from src.models.vehicle import Base
+    async with engine.begin() as conn:
+        await conn.run_sync(Base.metadata.create_all)
+
+async def get_db():
+    async with AsyncSessionLocal() as session:
+        try:
+            yield session
+        finally:
+            await session.close()
diff --git a/src/models/__init__.py b/src/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/models/vehicle.py b/src/models/vehicle.py
new file mode 100644
index 0000000..09bd717
--- /dev/null
+++ b/src/models/vehicle.py
@@ -0,0 +1,94 @@
+from sqlalchemy import Column, Integer, String, Float, DateTime, Text, Boolean
+from sqlalchemy.orm import declarative_base
+from sqlalchemy.sql import func
+from pydantic import BaseModel, Field, field_validator
+from typing import Optional, List
+from datetime import datetime
+
+Base = declarative_base()
+
+class VehicleListing(Base):
+    __tablename__ = "vehicle_listings"
+
+    id = Column(Integer, primary_key=True, index=True)
+    listing_id_external = Column(String, index=True, unique=False, nullable=True)
+    title = Column(String, nullable=False)
+    year = Column(Integer, index=True, nullable=True)
+    make = Column(String, index=True, nullable=True)
+    model = Column(String, index=True, nullable=True)
+    trim = Column(String, nullable=True)
+    price = Column(Float, index=True, nullable=True)
+    mileage = Column(Integer, index=True, nullable=True)
+    listing_url = Column(Text, unique=True, nullable=False, index=True)
+    photo_url = Column(Text, nullable=True)
+    features = Column(Text, nullable=True)
+    location = Column(String, nullable=True)
+    seller_type = Column(String, default="private", nullable=True)
+    source_site = Column(String, default="autotrader", nullable=True)
+    created_at = Column(DateTime, default=func.now())
+    updated_at = Column(DateTime, default=func.now(), onupdate=func.now())
+    last_scraped_at = Column(DateTime, default=func.now(), onupdate=func.now())
+    is_active = Column(Boolean, default=True, index=True)
+
+class VehicleListingCreate(BaseModel):
+    listing_id_external: Optional[str] = None
+    title: str
+    year: Optional[int] = None
+    make: Optional[str] = None
+    model: Optional[str] = None
+    trim: Optional[str] = None
+    price: Optional[float] = None
+    mileage: Optional[int] = None
+    listing_url: str
+    photo_url: Optional[str] = None
+    features: Optional[List[str]] = Field(default_factory=list)
+    location: Optional[str] = None
+    seller_type: Optional[str] = "private"
+    source_site: Optional[str] = "autotrader"
+
+class VehicleListingResponse(BaseModel):
+    id: int
+    listing_id_external: Optional[str] = None
+    title: str
+    year: Optional[int] = None
+    make: Optional[str] = None
+    model: Optional[str] = None
+    trim: Optional[str] = None
+    price: Optional[float] = None
+    mileage: Optional[int] = None
+    listing_url: str
+    photo_url: Optional[str] = None
+    features: Optional[List[str]] = Field(default_factory=list)
+    location: Optional[str] = None
+    seller_type: Optional[str] = None
+    source_site: Optional[str] = None
+    created_at: datetime
+    updated_at: datetime
+    last_scraped_at: datetime
+    is_active: bool
+
+    @field_validator("features", mode="before")
+    @classmethod
+    def _decode_features_json(cls, value):
+        # features is persisted as a JSON-encoded string (see routes.py);
+        # decode it so ORM rows validate cleanly against List[str].
+        if isinstance(value, str):
+            import json
+            return json.loads(value) if value else []
+        return value
+
+    class Config:
+        from_attributes = True
+
+class SearchFilters(BaseModel):
+    make: Optional[str] = None
+    model: Optional[str] = None
+    min_year: Optional[int] = None
+    max_year: Optional[int] = None
+    min_price: Optional[float] = None
+    max_price: Optional[float] = None
+    max_mileage: Optional[int] = None
+    location: Optional[str] = None
+    seller_type: Optional[str] = None
+    source_site: Optional[str] = None
+    is_active: Optional[bool] = True
diff --git a/stealth_utils.py b/stealth_utils.py
deleted file mode 100644
index e956687..0000000
--- a/stealth_utils.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import logging
-
-async def apply_stealth_js(page):
-    """
-    Applies various JavaScript injections to make Playwright less detectable.
-    """
-    try:
-        # Pass the User-Agent test (though Playwright usually handles this well)
-        # user_agent = await page.evaluate("() => navigator.userAgent")
-        # await page.set_extra_http_headers({'User-Agent': user_agent.replace("HeadlessChrome", "Chrome")}) # Example
-
-        # Pass the WebGL test
-        await page.add_init_script("(() => { const getParameter = WebGLRenderingContext.prototype.getParameter; WebGLRenderingContext.prototype.getParameter = function(parameter) { if (parameter === 37445) { return 'Intel Open Source Technology Center'; } if (parameter === 37446) { return 'Mesa DRI Intel(R) Ivybridge Mobile '; } return getParameter(parameter); }; })()")
-
-        # Pass the Chrome test
-        await page.add_init_script("(() => { Object.defineProperty(navigator, 'webdriver', { get: () => false }); })()")
-        await page.add_init_script("(() => { window.chrome = { runtime: {}, loadTimes: function(){}, csi: function(){} }; })()")
-
-        # Pass the Permissions test
-        await page.add_init_script("(() => { const originalQuery = window.navigator.permissions.query; window.navigator.permissions.query = (parameters) => ( parameters.name === 'notifications' ? 
Promise.resolve({ state: Notification.permission }) : originalQuery(parameters) ); })()") - - # Pass the Plugins Length test - await page.add_init_script("(() => { Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); })()") - - # Pass the Languages test - await page.add_init_script("(() => { Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); })()") - - logging.info("Applied JavaScript stealth techniques from stealth_utils.") - except Exception as e: - logging.error(f"Error applying stealth JS from stealth_utils: {e}", exc_info=True)