diff --git a/.env.example b/.env.example index 767687c..4744016 100644 --- a/.env.example +++ b/.env.example @@ -13,3 +13,10 @@ API_PORT=8000 # Scraping Limits MAX_LISTINGS_PER_SESSION=25 + +# Optional Proxy Configuration +# If using rotating proxies (e.g., Webshare), uncomment and provide the proxy URL. +# Example: http://username:password@proxyhost:port +# PROXY_SERVER= +# PROXY_USERNAME= +# PROXY_PASSWORD= diff --git a/README.md b/README.md index f5d383b..7525e16 100644 --- a/README.md +++ b/README.md @@ -20,3 +20,28 @@ The scraper can also be run standalone: ```bash python main.py scrape_test ``` + +## Environment Variables + +Set the following variables in a `.env` file or your deployment environment: + +| Variable | Description | Default | +| --- | --- | --- | +| `DATABASE_URL` | Database connection URL | `sqlite+aiosqlite:///./vehicle_data.db` | +| `HEADLESS` | Run the browser in headless mode | `true` | +| `BROWSER_TIMEOUT` | Playwright launch timeout (ms) | `60000` | +| `PAGE_DELAY` | Base delay after page loads (ms) | `5000` | +| `MIN_DELAY_BETWEEN_ACTIONS` | Delay between scraping actions (s) | `2.5` | +| `API_HOST` | Host for the FastAPI server | `127.0.0.1` | +| `API_PORT` | Port for the FastAPI server | `8000` | +| `MAX_LISTINGS_PER_SESSION` | Maximum listings fetched per scrape | `25` | +| `PROXY_SERVER` | *(Optional)* Proxy URL for Playwright | - | +| `PROXY_USERNAME` | *(Optional)* Proxy username | - | +| `PROXY_PASSWORD` | *(Optional)* Proxy password | - | + +### Pagination + +The `/api/v1/vehicles/` endpoint accepts `skip` and `limit` query parameters to paginate results. +Example: `/api/v1/vehicles/?skip=25&limit=25`. + + diff --git a/app.py b/app.py deleted file mode 100644 index 8fa8500..0000000 --- a/app.py +++ /dev/null @@ -1,123 +0,0 @@ -import logging -# import os # No longer needed for getenv in background task -import asyncio -from fastapi import FastAPI, Depends, BackgroundTasks -from pydantic import BaseModel -from typing import Dict -from datetime import datetime -from database import CarListing, get_db, Session, SessionLocal -from scraper import scrape_autotrader_and_update_db -from fastapi.middleware.cors import CORSMiddleware -from config import AUTOTRADER_URL, HEADLESS_BROWSER, SCRAPE_TIMEOUT, LOG_LEVEL # Import from config - -# Configure basic logging using LOG_LEVEL from config -# Ensure this is called only once. If FastAPI/Uvicorn also configures logging, -# this might need adjustment or to be handled by the logger instance directly. -# For now, assume this is the primary logging config. -logging.basicConfig(level=LOG_LEVEL, format='%(asctime)s - %(levelname)s - %(message)s', force=True) -# Added force=True to ensure this config takes precedence if uvicorn also tries to set basicConfig. - -app = FastAPI() -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_methods=["*"], - allow_headers=["*"], -) - -class CarListingRaw(BaseModel): - platform: str - extracted_at: datetime - source_url: str - data_points: Dict - -@app.post("/api/v1/listings/ingest") -async def ingest_listing(payload: CarListingRaw, db: Session = Depends(get_db)): - listing = CarListing( - platform=payload.platform, - extracted_at=payload.extracted_at, - source_url=payload.source_url, - data_points=payload.data_points - ) - db.add(listing) - db.commit() - db.refresh(listing) - return {"status": "saved", "listing_id": listing.id} - -@app.get("/") -def read_root(): - return {"message": "🚗 Car Tracker API is running!"} - -# Global variable to store scraping status -scrape_status = { - "last_run_time": None, - "status": "idle", # States: idle, running, success, error - "message": "", - "added": 0, - "updated": 0, - "scraped_count": 0 -} - -# Background task wrapper -async def _background_scraper_task_wrapper(): - global scrape_status - db_task_session: Session = SessionLocal() - logging.info("Background scraper task started.") - scrape_status["status"] = "running" - scrape_status["message"] = "Scraping in progress..." - scrape_status["last_run_time"] = datetime.utcnow().isoformat() - scrape_status["added"] = 0 # Reset counts for current run - scrape_status["updated"] = 0 - scrape_status["scraped_count"] = 0 - - try: - # Use imported config values - # autotrader_url = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/private-seller") - # headless_str = os.getenv("HEADLESS_BROWSER", "True") - # headless = headless_str.lower() == "true" - # scrape_timeout_str = os.getenv("SCRAPE_TIMEOUT", "120000") - # try: - # scrape_timeout = int(scrape_timeout_str) - # except ValueError: - # logging.warning(f"Invalid SCRAPE_TIMEOUT value: {scrape_timeout_str}. Defaulting to 120000ms.") - # scrape_timeout = 120000 - - logging.info(f"Background task using URL: {AUTOTRADER_URL}, Headless: {HEADLESS_BROWSER}, Timeout: {SCRAPE_TIMEOUT}ms") - - result = await scrape_autotrader_and_update_db( - db=db_task_session, - autotrader_url=AUTOTRADER_URL, - headless=HEADLESS_BROWSER, - scrape_timeout=SCRAPE_TIMEOUT - ) - - if result.get("status") == "success": - scrape_status["status"] = "success" - scrape_status["message"] = "Scraping completed successfully." - scrape_status["added"] = result.get("added", 0) - scrape_status["updated"] = result.get("updated", 0) - scrape_status["scraped_count"] = result.get("scraped_count", 0) - else: - scrape_status["status"] = "error" - scrape_status["message"] = result.get("message", "Scraping failed with an unknown error.") - - logging.info(f"Background scraper task completed: {result}") - - except Exception as e: - logging.error(f"Error in background scraper task: {e}", exc_info=True) - scrape_status["status"] = "error" - scrape_status["message"] = str(e) - finally: - db_task_session.close() - logging.info("Background scraper DB session closed.") - -@app.post("/api/v1/scrape/autotrader") -async def trigger_autotrader_scrape(background_tasks: BackgroundTasks): - if scrape_status["status"] == "running": - return {"message": "AutoTrader scraping job is already running."} - background_tasks.add_task(_background_scraper_task_wrapper) - return {"message": "AutoTrader scraping job started in the background."} - -@app.get("/api/v1/scrape/status") -async def get_scrape_status(): - return scrape_status diff --git a/app/__init__.py b/app/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/app/crud.py b/app/crud.py deleted file mode 100644 index fb8708b..0000000 --- a/app/crud.py +++ /dev/null @@ -1,50 +0,0 @@ -from sqlalchemy.orm import Session -from . import models, schemas -from datetime import datetime - -def get_car_listing_by_url(db: Session, url: str): - return db.query(models.ScrapedData).filter(models.ScrapedData.url == url).first() - -def create_car_listing(db: Session, listing: schemas.CarListingCreate): - db_listing = models.ScrapedData( - job_id=listing.job_id, - platform=listing.platform, - url=str(listing.url), # Ensure HttpUrl is converted to string - title=listing.title, - price=listing.price, - mileage=listing.mileage, - vin=listing.vin, - image_urls=listing.image_urls, # Assuming image_urls is already a list of strings or compatible JSON - raw_data=listing.raw_data, - scraped_at=datetime.utcnow() - ) - db.add(db_listing) - db.commit() - db.refresh(db_listing) - return db_listing - -def create_scrape_job(db: Session) -> models.ScrapeJob: - db_job = models.ScrapeJob(timestamp=datetime.utcnow(), status="pending") - db.add(db_job) - db.commit() - db.refresh(db_job) - return db_job - -def update_scrape_job_status(db: Session, job_id: int, status: str, results_count: int = 0, error_message: str = None): - db_job = db.query(models.ScrapeJob).filter(models.ScrapeJob.id == job_id).first() - if db_job: - db_job.status = status - db_job.results_count = results_count - db_job.error_message = error_message - db.commit() - db.refresh(db_job) - return db_job - -def get_scrape_job(db: Session, job_id: int): - return db.query(models.ScrapeJob).filter(models.ScrapeJob.id == job_id).first() - -def get_all_scrape_jobs(db: Session, skip: int = 0, limit: int = 100): - return db.query(models.ScrapeJob).order_by(models.ScrapeJob.timestamp.desc()).offset(skip).limit(limit).all() - -def get_listings_for_job(db: Session, job_id: int, skip: int = 0, limit: int = 100): - return db.query(models.ScrapedData).filter(models.ScrapedData.job_id == job_id).offset(skip).limit(limit).all() diff --git a/app/database.py b/app/database.py deleted file mode 100644 index bf32154..0000000 --- a/app/database.py +++ /dev/null @@ -1,25 +0,0 @@ -from sqlalchemy import create_engine -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import sessionmaker -import os - -DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///./data/vehicle_tracker.db") - -engine_args = {} -if DATABASE_URL.startswith("sqlite"): - engine_args["connect_args"] = {"check_same_thread": False} - -engine = create_engine(DATABASE_URL, **engine_args) -SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) - -Base = declarative_base() - -def get_db(): - db = SessionLocal() - try: - yield db - finally: - db.close() - -def create_tables(): - Base.metadata.create_all(bind=engine) diff --git a/app/main.py b/app/main.py deleted file mode 100644 index 4817f15..0000000 --- a/app/main.py +++ /dev/null @@ -1,265 +0,0 @@ -import logging -import os -from fastapi import FastAPI, Depends, HTTPException, BackgroundTasks -from sqlalchemy.orm import Session -from typing import List - -from . import crud, models, schemas, scraper -from .database import SessionLocal, engine - -# Create database tables if they don't exist -models.Base.metadata.create_all(bind=engine) - -# Configure logging -LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper() -logging.basicConfig(level=LOG_LEVEL, format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) - -app = FastAPI(title="AutoTrader Scraper API", version="1.0.0") - -# Dependency to get DB session -def get_db(): - db = SessionLocal() - try: - yield db - finally: - db.close() - -# Global variable to store scraping status (simple approach) -scrape_status = { - "job_id": None, - "status": "idle", # States: idle, pending, running, completed, failed - "message": "No scraping job initiated yet.", - "last_run_time": None, - "duration_seconds": None, - "results_count": 0, - "error_message": None -} - -async def run_scraping_task(job_id: int, autotrader_url: str, headless: bool, scrape_timeout: int): - """ - The actual scraping task that runs in the background. - It creates its own database session. - """ - global scrape_status - db: Session = SessionLocal() - try: - logger.info(f"Background task started for job_id: {job_id}") - crud.update_scrape_job_status(db, job_id, status="running") - scrape_status.update({ - "job_id": job_id, - "status": "running", - "message": f"Scraping from {autotrader_url}...", - "last_run_time": datetime.utcnow().isoformat(), - "duration_seconds": None, - "results_count": 0, - "error_message": None - }) - - start_time = datetime.utcnow() - - scraped_data_list = await scraper.scrape_autotrader_data( - autotrader_url=autotrader_url, - headless=headless, - timeout=scrape_timeout - ) - - end_time = datetime.utcnow() - duration = (end_time - start_time).total_seconds() - scrape_status["duration_seconds"] = round(duration, 2) - - added_count = 0 - updated_count = 0 # Placeholder for future update logic - - if not scraped_data_list: - logger.info(f"No listings found for job_id: {job_id}") - crud.update_scrape_job_status(db, job_id, status="completed", results_count=0) - scrape_status.update({ - "status": "completed", - "message": "Scraping completed. No new listings found or page was inaccessible.", - "results_count": 0 - }) - return - - for item_data in scraped_data_list: - # Ensure all required fields for CarListingCreate are present - listing_create = schemas.CarListingCreate( - job_id=job_id, - platform=item_data.get("source_name", "autotrader"), # Get platform from scraper or default - url=item_data.get("listing_url"), - title=item_data.get("title"), - price=item_data.get("price"), - mileage=item_data.get("mileage"), - vin=item_data.get("vin"), - image_urls=item_data.get("image_urls", []), - raw_data=item_data.get("data_points", {}) - ) - - existing_listing = crud.get_car_listing_by_url(db, str(listing_create.url)) - if existing_listing: - # For now, we just count updates. Actual update logic could be added here. - # e.g., existing_listing.price = listing_create.price - # existing_listing.extracted_at = datetime.utcnow() - updated_count += 1 - else: - crud.create_car_listing(db=db, listing=listing_create) - added_count += 1 - - crud.update_scrape_job_status(db, job_id, status="completed", results_count=added_count) - scrape_status.update({ - "status": "completed", - "message": f"Scraping finished. Added: {added_count}, Updated: {updated_count} (placeholder).", - "results_count": added_count + updated_count # Or just added_count if updates aren't really changing data - }) - logger.info(f"Background task for job_id: {job_id} completed. Added: {added_count}, Updated: {updated_count}") - - except Exception as e: - logger.error(f"Error in background scraper task for job_id {job_id}: {e}", exc_info=True) - crud.update_scrape_job_status(db, job_id, status="failed", error_message=str(e)) - scrape_status.update({ - "status": "failed", - "message": f"Error during scraping: {str(e)}", - "error_message": str(e) - }) - finally: - db.close() - logger.info(f"DB session closed for job_id: {job_id}") - - -@app.post("/scrape/", response_model=schemas.ScrapeJob, status_code=202) -async def trigger_scrape(background_tasks: BackgroundTasks, db: Session = Depends(get_db)): - """ - Triggers a new scraping job for Autotrader. - """ - global scrape_status - if scrape_status.get("status") == "running": - raise HTTPException(status_code=409, detail="A scraping job is already in progress.") - - autotrader_url = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/all-cars/cars-under-10000") # Default to a common search if not set - headless_str = os.getenv("HEADLESS_BROWSER", "True") - headless = headless_str.lower() == "true" - scrape_timeout_str = os.getenv("SCRAPE_TIMEOUT", "120000") - - try: - scrape_timeout = int(scrape_timeout_str) - except ValueError: - scrape_timeout = 120000 # Default timeout if parsing fails - logger.warning(f"Invalid SCRAPE_TIMEOUT value: {scrape_timeout_str}. Using default {scrape_timeout}ms.") - - job = crud.create_scrape_job(db) - scrape_status.update({ - "job_id": job.id, - "status": "pending", - "message": f"Scraping job {job.id} initiated for URL: {autotrader_url}", - "last_run_time": job.timestamp.isoformat(), - "duration_seconds": None, - "results_count": 0, - "error_message": None - }) - - # Pass job_id to the background task - background_tasks.add_task(run_scraping_task, job.id, autotrader_url, headless, scrape_timeout) - - logger.info(f"Scraping job {job.id} queued for URL: {autotrader_url}") - return job - -@app.post("/api/v1/listings/ingest", response_model=schemas.CarListing, status_code=201) -async def ingest_listing(payload: schemas.CarListingCreate, db: Session = Depends(get_db)): - """ - Ingests a new car listing into the database. - This endpoint is useful for manually adding or testing data. - """ - # Check if listing with this URL already exists to prevent duplicates, - # though the database constraint should also handle this. - db_listing = crud.get_car_listing_by_url(db, url=str(payload.url)) - if db_listing: - raise HTTPException(status_code=400, detail="Listing with this URL already exists.") - - # The job_id in CarListingCreate might be problematic if this is a direct ingest - # not tied to a specific scrape job. For now, we'll assume it's provided or - # we could adjust the schema/logic if direct ingestion shouldn't have a job_id. - # For testing, we might need to create a dummy job or adjust schema. - # Let's assume for now a valid job_id is provided or handle it if not. - if not payload.job_id: - # Create a dummy job or handle as per requirements for listings not tied to a job - # For simplicity, let's assume job_id is optional in the schema for this use case - # or a default/placeholder job_id is used. - # For this test, the payload includes job_id, so we'll proceed. - # If CarListingCreate schema requires job_id, this endpoint needs to handle it. - # For now, let's assume it's provided in the payload. - pass - - try: - created_listing = crud.create_car_listing(db=db, listing=payload) - return created_listing - except Exception as e: - logger.error(f"Error ingesting listing: {e}", exc_info=True) - raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") - - -@app.get("/scrape/status", response_model=schemas.ScrapeJob) # Using ScrapeJob schema for better structure -async def get_current_scrape_status(db: Session = Depends(get_db)): - """ - Returns the status of the current or last scraping job. - """ - global scrape_status - if scrape_status.get("job_id"): - job = crud.get_scrape_job(db, scrape_status["job_id"]) - if job: - # Update status from DB if available, otherwise use in-memory for simplicity - # A more robust system might always fetch from DB or use a proper job queue status - return job - return scrape_status # Fallback to in-memory status if job not found or not started - -@app.get("/scrape/jobs/", response_model=List[schemas.ScrapeJob]) -async def read_jobs(skip: int = 0, limit: int = 10, db: Session = Depends(get_db)): - """ - Retrieve all scrape jobs. - """ - jobs = crud.get_all_scrape_jobs(db, skip=skip, limit=limit) - return jobs - -@app.get("/scrape/jobs/{job_id}/results", response_model=List[schemas.CarListing]) -async def read_job_results(job_id: int, skip: int = 0, limit: int = 10, db: Session = Depends(get_db)): - """ - Retrieve results for a specific scrape job. - """ - job = crud.get_scrape_job(db, job_id=job_id) - if job is None: - raise HTTPException(status_code=404, detail="Job not found") - listings = crud.get_listings_for_job(db, job_id=job_id, skip=skip, limit=limit) - return listings - -@app.get("/") -async def read_root(): - return {"message": "AutoTrader Scraper API is running!"} - -# This is for local development if you run `python app/main.py` -# Uvicorn will be started by Procfile in production environments like Heroku -if __name__ == "__main__": - # Ensure tables are created before starting the app if they don't exist - # This is useful for local development but might be handled differently in production - from .database import create_tables - create_tables() - - # Get port from environment variable or default to 8000 - port = int(os.getenv("PORT", "8000")) - uvicorn.run(app, host="0.0.0.0", port=port) - -# Remove the old main.py content if it exists in the root directory -# This is now handled by app/main.py -# Ensure Procfile points to app.main:app or similar based on your directory structure -# e.g., web: uvicorn app.main:app --host=0.0.0.0 --port=${PORT:-8000} -# (Assuming app.py is moved to app/main.py) -# If app.py remains in root, then Procfile is fine. - -# The `models.Base.metadata.create_all(bind=engine)` should ideally be called once, -# perhaps in main.py or a startup script, not every time database.py is imported. -# For simplicity in this single-file app structure, it's often put there. -# If app.py is the main entry point for uvicorn, it's a good place. -# For Render, buildCommand in render.yaml can also handle migrations/table creation. - -# Let's ensure the imports are correct considering the file structure -# If main.py is in root and imports from app/, it should be `from app import crud, models, schemas, scraper` -# If this file is app/main.py, then `from . import crud, models, schemas, scraper` is correct. -# The prompt implies this file is app/main.py. diff --git a/app/models.py b/app/models.py deleted file mode 100644 index b0d4e5d..0000000 --- a/app/models.py +++ /dev/null @@ -1,45 +0,0 @@ -from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, JSON -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import relationship -from datetime import datetime - -Base = declarative_base() - -class ScrapeResult(Base): - __tablename__ = "scrape_results" - - id = Column(Integer, primary_key=True, index=True) - url = Column(String, index=True) - title = Column(String) - price = Column(String, nullable=True) # Store as string to handle variations like 'Contact Seller' - mileage = Column(String, nullable=True) # Store as string to handle non-numeric values - vin = Column(String, nullable=True, unique=True) - images = Column(JSON, nullable=True) # Store list of image URLs - scraped_at = Column(DateTime) - details = Column(JSON, nullable=True) # Store other details as JSON - -class ScrapeJob(Base): - __tablename__ = "scrape_jobs" - - id = Column(Integer, primary_key=True, index=True) - timestamp = Column(DateTime, default=datetime.utcnow) - status = Column(String, default="pending") # e.g., pending, running, completed, failed - results_count = Column(Integer, default=0) - error_message = Column(String, nullable=True) - -class ScrapedData(Base): - __tablename__ = "scraped_data" - - id = Column(Integer, primary_key=True, index=True) - job_id = Column(Integer, ForeignKey("scrape_jobs.id")) - platform = Column(String) # e.g., 'autotrader', 'cars.com' - url = Column(String, unique=True, index=True) - title = Column(String, nullable=True) - price = Column(String, nullable=True) - mileage = Column(String, nullable=True) - vin = Column(String, nullable=True, index=True) - image_urls = Column(JSON, nullable=True) # List of image URLs - raw_data = Column(JSON, nullable=True) # Full raw data if needed - scraped_at = Column(DateTime, default=datetime.utcnow) - - job = relationship("ScrapeJob") diff --git a/app/schemas.py b/app/schemas.py deleted file mode 100644 index 2ee0d8d..0000000 --- a/app/schemas.py +++ /dev/null @@ -1,41 +0,0 @@ -from pydantic import BaseModel, HttpUrl -from typing import List, Optional, Dict, Any -from datetime import datetime - -class CarListingBase(BaseModel): - url: HttpUrl - title: Optional[str] = None - price: Optional[str] = None # Keep as string to handle variations - mileage: Optional[str] = None # Keep as string - vin: Optional[str] = None - image_urls: Optional[List[HttpUrl]] = [] - raw_data: Optional[Dict[str, Any]] = {} # For any other unstructured data - -class CarListingCreate(CarListingBase): - platform: str - job_id: int - -class CarListing(CarListingBase): - id: int - platform: str - job_id: int - scraped_at: datetime - - class Config: - orm_mode = True - -class ScrapeJobBase(BaseModel): - pass - -class ScrapeJobCreate(ScrapeJobBase): - pass - -class ScrapeJob(ScrapeJobBase): - id: int - timestamp: datetime - status: str - results_count: int = 0 - error_message: Optional[str] = None - - class Config: - orm_mode = True diff --git a/app/scraper.py b/app/scraper.py deleted file mode 100644 index b946ecb..0000000 --- a/app/scraper.py +++ /dev/null @@ -1,373 +0,0 @@ -import asyncio -import logging -# import os # No longer needed for getenv in main -import datetime # Keep for now, might be used in data processing -from playwright.async_api import async_playwright -# Required for main test function -from config import AUTOTRADER_URL, HEADLESS_BROWSER, SCRAPE_TIMEOUT -# DATABASE_URL is used by database.py, SessionLocal will pick it up via config - -# Assuming database.py is in the same directory or accessible in PYTHONPATH -from database import get_db, CarListing, SessionLocal # Added SessionLocal for main example -from sqlalchemy.orm import Session -from datetime import datetime # Ensure datetime is imported directly - -# Configure basic logging -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') - -from stealth_utils import apply_stealth_js # Import new stealth utility -# from playwright_stealth import stealth_async # Commenting out old stealth - -class AutoTraderScraper: - """Scraper for AutoTrader private party listings using Playwright.""" - - def __init__(self, source_name: str = "autotrader"): - """ - Initializes the AutoTraderScraper. - Args: - source_name (str): Name of the source platform. - """ - self.source_name = source_name - # Potentially load other configs from a config file or env vars here - # For example: self.base_url = "https://www.autotrader.com/cars-for-sale/private-seller" - - async def get_private_listings(self, autotrader_url: str, headless: bool, timeout: int = 120000) -> list[dict]: - """ - Scrapes private party listings from AutoTrader using Playwright. - - Args: - autotrader_url (str): The starting URL for scraping AutoTrader private listings. - headless (bool): Whether to run the browser in headless mode. - timeout (int): Maximum time in milliseconds for page operations. - - Returns: - list[dict]: A list of dictionaries, where each dictionary represents a scraped vehicle listing. - """ - listings_data = [] - browser = None - - launch_options = { - "headless": headless, - "args": [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-infobars', - '--window-position=0,0', - '--ignore-certificate-errors', - '--ignore-certificate-errors-spki-list', - # '--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"' # User agent is set in context - '--disable-gpu' # Already there but keep - ], - # "channel": "chrome" # This might require full Chrome install, trying without first to see if args help - } - - # Try with 'msedge' or 'chrome' if default chromium fails and they are available - # For now, stick to chromium and args. If 'channel' is needed, it's a bigger setup change. - - async with async_playwright() as p: - try: - # browser = await p.chromium.launch(**launch_options) # Default chromium - # Let's try specifying channel, assuming it might use a locally installed Chrome if available, or a Playwright-managed one. - # This is a common suggestion if the default Playwright Chromium build is too easily detected. - # If "chrome" channel is not found by Playwright, it will error. - try: - browser = await p.chromium.launch( - **launch_options, - channel="chrome" # Attempt to use a branded Chrome build - ) - logging.info("Attempting to launch with channel='chrome'") - except Exception as e_channel: - logging.warning(f"Failed to launch with channel='chrome' ({e_channel}). Falling back to default Playwright Chromium.") - # Remove channel from launch_options if it failed - launch_options_no_channel = launch_options.copy() - if "channel" in launch_options_no_channel: # Should not be needed based on above structure but good practice - del launch_options_no_channel["channel"] - browser = await p.chromium.launch(**launch_options_no_channel) - logging.info("Launched with default Playwright Chromium.") - - - context = await browser.new_context( - user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36', # A fairly common user agent - java_script_enabled=True, - ) - context.set_default_navigation_timeout(timeout) - context.set_default_timeout(timeout) - - page = await context.new_page() - await page.set_viewport_size({"width": 1920, "height": 1080}) - - # Apply custom JS stealth - await apply_stealth_js(page) - - logging.info(f"Navigating to {autotrader_url}") - await page.goto(autotrader_url, wait_until="domcontentloaded", timeout=timeout) # Reverted to domcontentloaded - - title = await page.title() - logging.info(f"Page title: {title}") - - if "unavailable" in title.lower() or "block" in title.lower() or "access denied" in title.lower(): - logging.critical(f"Failed to load AutoTrader listings page. Blocked by website. Title: {title}") - await browser.close() # Ensure browser is closed before returning - return [] - - # Using speculative selectors for AutoTrader - # Main container for listings: 'div[data-qaid="cntnr-lstng-main"]' (this might be too broad or incorrect) - # A more specific item selector might be needed, e.g., an article or a div with a specific class. - # For now, let's assume individual listing cards can be found with a selector like: - # "div.inventory-listing" or "div[data-cmp='inventoryListing']" - these are common patterns. - # The provided example 'div[data-qaid="cntnr-lstng-main"]' seems like it might be a single container FOR ALL listings. - # Let's try a more specific (but still guessed) selector for individual listing items. - # A common pattern is items within a list or grid. Let's try to find items: - # This selector is a **GUESS** based on common AutoTrader structures. - listing_item_selector = "div[data-cmp='inventoryListing']" # GUESS - - # Fallback if the primary guess doesn't work, try another common pattern - # listing_item_selector_fallback = "div.inventory-listing.new-listing.stub" # Another GUESS - - # await page.wait_for_selector(listing_item_selector, timeout=15000) # Wait for items to appear - - listing_containers = await page.query_selector_all(listing_item_selector) - - # if not listing_containers: - # logging.info(f"No listings found with primary selector '{listing_item_selector}'. Trying fallback...") - # listing_containers = await page.query_selector_all(listing_item_selector_fallback) - - logging.info(f"Found {len(listing_containers)} potential listing containers using selector '{listing_item_selector}'.") - - processed_count = 0 - # first_container_processed_for_html_dump = False # REMOVE HTML DUMP FLAG - for i, container in enumerate(listing_containers): - url_path = None - title_text = "N/A" # Default to N/A - price_text = "N/A" # Default to N/A - mileage_text = "N/A" # Default to N/A (as it's not reliably on card) - listing_url = None - - try: - logging.debug(f"Processing container {i+1}/{len(listing_containers)}") - - # Attempt to get Title - title_el = await container.query_selector("h2[data-cmp='subheading']") # Updated selector from HTML dump - if title_el: - raw_title_text = await title_el.inner_text() - title_text = raw_title_text.strip() if raw_title_text else "N/A" - - # Attempt to get URL from parent of title_el - # Playwright's query_selector does not directly support xpath like "ancestor::a". - # A common structure is

...

or

...

- # We can try to find 'a' that contains this h2, or assume the 'a[data-cmp="link"]' is the one. - - # Let's use the a[data-cmp="link"] which was identified as containing the title h2 - parent_link_el = await container.query_selector("a[data-cmp='link']") - if parent_link_el: - url_path = await parent_link_el.get_attribute("href") - else: # Fallback if the above structure isn't found - logging.warning(f"Could not find parent a[data-cmp='link'] for title in listing {i+1}") - else: - logging.warning(f"Title not found with h2[data-cmp='subheading'] for listing {i+1}.") - - # Fallback or alternative for URL if not found via title's parent link - if not url_path: - url_el_alt = await container.query_selector("a[data-cmp='relLnk']") # Keep this fallback - if url_el_alt: - url_path = await url_el_alt.get_attribute("href") - - if not url_path: # Last resort for URL - first_a = await container.query_selector("a[href]") # Broadest fallback - if first_a: - url_path = await first_a.get_attribute("href") - - if not url_path: - logging.warning(f"Could not extract URL for listing {i+1} (Title: {title_text}). Skipping.") - continue - - if not url_path.startswith(('http://', 'https://')): - listing_url = f"https://www.autotrader.com{url_path}" - else: - listing_url = url_path - - # Attempt to get Price - price_el = await container.query_selector("div[data-cmp='firstPrice']") # Updated selector - if price_el: - raw_price_text = await price_el.inner_text() - price_text = raw_price_text.replace('$', '').replace(',', '').strip() if raw_price_text else "N/A" - else: - # Fallback for price (e.g. .first-price class directly) - price_el_fallback = await container.query_selector(".first-price") - if price_el_fallback: - raw_price_text = await price_el_fallback.inner_text() - price_text = raw_price_text.replace('$', '').replace(',', '').strip() if raw_price_text else "N/A" - else: - logging.warning(f"Price not found for listing {listing_url}") - price_text = "N/A" - - - # Mileage - Set to N/A as it's not reliably on the card from previous findings - mileage_text = "N/A" - # logging.info(f"Mileage not scraped from listing card for {listing_url} (by design for now).") - - vin_text = None - - listing_data = { - "listing_url": listing_url, - "title": title_text, # Already defaults to N/A or has value - "price": price_text, # Already defaults to N/A or has value - "mileage": mileage_text, # Is N/A - "vin": vin_text, - "source_name": self.source_name, - "data_points": { - "page_title_at_scrape": title # page's title, not listing's - } - } - listings_data.append(listing_data) - processed_count += 1 - logging.info(f"Successfully processed listing: {title_text[:50]}... URL: {listing_url}") - - except Exception as e: - logging.error(f"Error processing listing container {i+1} for URL {listing_url if listing_url else 'Unknown'}: {e}", exc_info=True) - continue - - logging.info(f"Successfully processed {processed_count} out of {len(listing_containers)} listing containers.") - - except Exception as e: - logging.error(f"An error occurred during Playwright scraping phase: {e}", exc_info=True) - finally: - if browser: - logging.info("Closing browser.") - await browser.close() - - return listings_data - - -async def scrape_autotrader_data(autotrader_url: str, headless: bool = True, timeout: int = 120000) -> list[dict]: - """ - High-level function to scrape data from AutoTrader. - Initializes the scraper and calls its scraping method. - - Args: - autotrader_url (str): The URL to scrape. - headless (bool): Whether to run the browser in headless mode. - timeout (int): Timeout for scraping operations in milliseconds. - - Returns: - list[dict]: A list of scraped listing data. - """ - scraper = AutoTraderScraper() - listings = await scraper.get_private_listings(autotrader_url=autotrader_url, headless=headless, timeout=timeout) - return listings - - -async def scrape_autotrader_and_update_db(db: Session, autotrader_url: str, headless: bool, scrape_timeout: int): - """ - Scrapes listings from AutoTrader and updates the database. - - Args: - db (Session): The SQLAlchemy database session. - autotrader_url (str): The URL to scrape. - headless (bool): Whether to run the browser in headless mode. - scrape_timeout (int): Timeout for scraping operations in milliseconds. - - Returns: - dict: A status dictionary with counts of added, updated, and scraped listings. - """ - logging.info(f"Starting scrape and update for URL: {autotrader_url}") - - try: - listings_data = await scrape_autotrader_data( - autotrader_url=autotrader_url, - headless=headless, - timeout=scrape_timeout - ) - except Exception as e: - logging.error(f"Failed to scrape data from {autotrader_url}: {e}", exc_info=True) - return {"status": "error", "message": f"Scraping failed: {e}"} - - added_count = 0 - updated_count = 0 - scraped_count = len(listings_data) - - for listing_data in listings_data: - source_url = listing_data.get('listing_url') # Renamed from 'url' to 'listing_url' in dummy data - if not source_url: - logging.warning(f"Scraped item missing 'listing_url': {listing_data.get('title')}. Skipping.") - continue - - try: - existing_listing = db.query(CarListing).filter(CarListing.source_url == source_url).first() - - if existing_listing: - # Placeholder for update logic - # existing_listing.extracted_at = datetime.utcnow() - # existing_listing.data_points = {k: v for k, v in listing_data.items() if k != 'listing_url'} - # # Update other fields like price if necessary - # db.add(existing_listing) # Not strictly necessary if only mutable fields changed and session tracks - updated_count += 1 - logging.info(f"Listing at {source_url} already exists. Marked for update (placeholder).") - else: - new_listing = CarListing( - platform="autotrader", - extracted_at=datetime.utcnow(), - source_url=source_url, - # Ensure data_points stores everything else from listing_data - data_points={k: v for k, v in listing_data.items() if k != 'listing_url'} - ) - db.add(new_listing) - added_count += 1 - logging.info(f"New listing added from {source_url}") - except Exception as e: - logging.error(f"Error processing listing {source_url} for DB: {e}", exc_info=True) - # Decide if you want to rollback here or continue with other listings - - try: - db.commit() - logging.info("Database changes committed.") - except Exception as e: - logging.error(f"Database commit failed: {e}", exc_info=True) - db.rollback() - return {"status": "error", "message": f"DB commit failed: {e}", "added": 0, "updated": 0, "scraped_count": scraped_count} - - status_summary = { - "status": "success", - "added": added_count, - "updated": updated_count, - "scraped_count": scraped_count - } - logging.info(f"DB update summary: {status_summary}") - return status_summary - -async def main(): - # Use settings from config.py - # url = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/private-seller") - # headless_str = os.getenv("HEADLESS_BROWSER", "True") - # headless = headless_str.lower() == "true" - # scrape_timeout_str = os.getenv("SCRAPE_TIMEOUT", "120000") - # try: - # scrape_timeout = int(scrape_timeout_str) - # except ValueError: - # logging.warning(f"Invalid SCRAPE_TIMEOUT value: {scrape_timeout_str}. Defaulting to 120000ms.") - # scrape_timeout = 120000 - - # from database import SessionLocal # Already imported at the top - db: Session = SessionLocal() # SessionLocal now uses DATABASE_URL from config.py via database.py - try: - logging.info(f"Starting scraper and DB update for URL: {AUTOTRADER_URL}, Headless: {HEADLESS_BROWSER}, Timeout: {SCRAPE_TIMEOUT}ms") - stats = await scrape_autotrader_and_update_db( - db=db, - autotrader_url=AUTOTRADER_URL, - headless=HEADLESS_BROWSER, - scrape_timeout=SCRAPE_TIMEOUT - ) - logging.info(f"Scraping and DB update completed: {stats}") - except Exception as e: - logging.error(f"Error during scraping and DB update in main: {e}", exc_info=True) - finally: - logging.info("Closing DB session in main.") - db.close() - -if __name__ == "__main__": - # To run this: - # 1. Ensure Playwright browsers are installed: `playwright install chromium` - # 2. Set environment variables if needed (AUTOTRADER_URL, HEADLESS_BROWSER, SCRAPE_TIMEOUT) - # 3. Uncomment the line below - asyncio.run(main()) - # pass # Keep it passive for now, to be run manually when needed diff --git a/config.py b/config.py deleted file mode 100644 index 44148ca..0000000 --- a/config.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -from dotenv import load_dotenv - -# Load environment variables from .env file if it exists -# This is useful for local development. -load_dotenv() - -# Database Configuration -DATABASE_URL: str = os.getenv("DATABASE_URL", "sqlite:///./data/vehicle_tracker.db") - -# Scraper Configuration -AUTOTRADER_URL: str = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/private-seller") -SCRAPE_TIMEOUT: int = int(os.getenv("SCRAPE_TIMEOUT", "120000")) # Milliseconds -HEADLESS_BROWSER: bool = os.getenv("HEADLESS_BROWSER", "True").lower() == "true" - -# API Configuration (if any specific ones are needed later) -# Example: API_HOST: str = os.getenv("API_HOST", "0.0.0.0") -# Example: API_PORT: int = int(os.getenv("API_PORT", "8000")) - -# Logging Configuration (can also be added here if more complex) -LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO").upper() - -# Ensure critical URLs have a scheme for robustness -if not AUTOTRADER_URL.startswith(("http://", "https://")): - # This print statement is for immediate feedback during startup/import. - # In a pure library, side effects on import are sometimes discouraged, - # but for an application's main config, it's often acceptable. - print(f"Warning: AUTOTRADER_URL ('{AUTOTRADER_URL}') did not have a scheme, prepended https://.") - AUTOTRADER_URL = "https://" + AUTOTRADER_URL - print(f"Corrected AUTOTRADER_URL: {AUTOTRADER_URL}") - - -# Example of how to handle SQLite connect_args based on config -DB_CONNECT_ARGS: dict = {"check_same_thread": False} if DATABASE_URL.startswith("sqlite") else {} diff --git a/database.py b/database.py deleted file mode 100644 index 58a11dc..0000000 --- a/database.py +++ /dev/null @@ -1,26 +0,0 @@ -from sqlalchemy import Column, Integer, String, DateTime, JSON, create_engine -from sqlalchemy.orm import declarative_base, sessionmaker, Session -from config import DATABASE_URL, DB_CONNECT_ARGS # Import from config - -# Use imported configuration -engine = create_engine(DATABASE_URL, connect_args=DB_CONNECT_ARGS) - -SessionLocal = sessionmaker(bind=engine, autoflush=False) -Base = declarative_base() - -class CarListing(Base): - __tablename__ = "listings" - id = Column(Integer, primary_key=True, index=True) - platform = Column(String) - extracted_at = Column(DateTime) - source_url = Column(String, unique=True) - data_points = Column(JSON) - -Base.metadata.create_all(bind=engine) - -def get_db(): - db = SessionLocal() - try: - yield db - finally: - db.close() diff --git a/package.json b/package.json deleted file mode 100644 index 5f62ed5..0000000 --- a/package.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "name": "autotrader-scraper", - "version": "1.0.0", - "description": "A FastAPI application for scraping Autotrader data.", - "main": "index.js", - "scripts": { - "start": "python app.py", - "test": "echo \"Error: no test specified\" && exit 1" - }, - "keywords": ["fastapi", "autotrader", "scraper", "web-scraping"], - "author": "", - "license": "ISC" -} diff --git a/render.yaml b/render.yaml index b3c7392..f755e35 100644 --- a/render.yaml +++ b/render.yaml @@ -11,9 +11,19 @@ services: value: 3.11 - key: DATABASE_URL generateValue: true - - key: AUTOTRADER_URL - value: "https://www.autotrader.com/cars-for-sale/private-seller" - - key: SCRAPE_TIMEOUT - value: 120000 - - key: HEADLESS_BROWSER + - key: HEADLESS value: "True" + - key: BROWSER_TIMEOUT + value: "60000" + - key: PAGE_DELAY + value: "5000" + - key: MIN_DELAY_BETWEEN_ACTIONS + value: "2.5" + - key: MAX_LISTINGS_PER_SESSION + value: "25" + - key: PROXY_SERVER + value: "" + - key: PROXY_USERNAME + value: "" + - key: PROXY_PASSWORD + value: "" diff --git a/src/api/routes.py b/src/api/routes.py index c1f5764..612fb3f 100644 --- a/src/api/routes.py +++ b/src/api/routes.py @@ -121,8 +121,10 @@ async def api_v1_root_info(): @router.get("/vehicles/", response_model=List[VehicleListingResponse]) async def get_all_vehicles( + skip: int = Query(0, ge=0), + limit: int = Query(settings.MAX_LISTINGS_PER_SESSION, ge=1, le=200), db: AsyncSession = Depends(get_db), - filters: SearchFilters = Depends() + filters: SearchFilters = Depends(), ): query = select(VehicleListing) conditions = [] @@ -151,7 +153,7 @@ async def get_all_vehicles( if conditions: query = query.where(and_(*conditions)) query = query.order_by(VehicleListing.last_scraped_at.desc(), VehicleListing.created_at.desc()) - result = await db.execute(query) + result = await db.execute(query.offset(skip).limit(limit)) vehicles = result.scalars().all() response_vehicles = [] for vehicle_db_item in vehicles: diff --git a/src/automation/browser_sim.py b/src/automation/browser_sim.py index 96ed36f..c93f204 100644 --- a/src/automation/browser_sim.py +++ b/src/automation/browser_sim.py @@ -25,8 +25,16 @@ async def __aenter__(self): logger.info("Initializing AutoTrader Scraper...") self.playwright_instance = await async_playwright().start() try: + proxy_cfg = None + if settings.PROXY_SERVER: + proxy_cfg = {"server": settings.PROXY_SERVER} + if settings.PROXY_USERNAME and settings.PROXY_PASSWORD: + proxy_cfg["username"] = settings.PROXY_USERNAME + proxy_cfg["password"] = settings.PROXY_PASSWORD + self.browser = await self.playwright_instance.chromium.launch( headless=settings.HEADLESS, + proxy=proxy_cfg, args=[ '--no-sandbox', '--disable-setuid-sandbox', diff --git a/src/config.py b/src/config.py index 598bc2e..94a9b77 100644 --- a/src/config.py +++ b/src/config.py @@ -13,4 +13,9 @@ class Settings: API_PORT: int = int(os.getenv("API_PORT", "8000")) MAX_LISTINGS_PER_SESSION: int = int(os.getenv("MAX_LISTINGS_PER_SESSION", "25")) + # Proxy configuration + PROXY_SERVER: str | None = os.getenv("PROXY_SERVER") + PROXY_USERNAME: str | None = os.getenv("PROXY_USERNAME") + PROXY_PASSWORD: str | None = os.getenv("PROXY_PASSWORD") + settings = Settings() diff --git a/stealth_utils.py b/stealth_utils.py deleted file mode 100644 index e956687..0000000 --- a/stealth_utils.py +++ /dev/null @@ -1,30 +0,0 @@ -import logging - -async def apply_stealth_js(page): - """ - Applies various JavaScript injections to make Playwright less detectable. - """ - try: - # Pass the User-Agent test (though Playwright usually handles this well) - # user_agent = await page.evaluate("() => navigator.userAgent") - # await page.set_extra_http_headers({'User-Agent': user_agent.replace("HeadlessChrome", "Chrome")}) # Example - - # Pass the WebGL test - await page.add_init_script("(() => { const getParameter = WebGLRenderingContext.prototype.getParameter; WebGLRenderingContext.prototype.getParameter = function(parameter) { if (parameter === 37445) { return 'Intel Open Source Technology Center'; } if (parameter === 37446) { return 'Mesa DRI Intel(R) Ivybridge Mobile '; } return getParameter(parameter); }; })()") - - # Pass the Chrome test - await page.add_init_script("(() => { Object.defineProperty(navigator, 'webdriver', { get: () => false }); })()") - await page.add_init_script("(() => { window.chrome = { runtime: {}, loadTimes: function(){}, csi: function(){} }; })()") - - # Pass the Permissions test - await page.add_init_script("(() => { const originalQuery = window.navigator.permissions.query; window.navigator.permissions.query = (parameters) => ( parameters.name === 'notifications' ? Promise.resolve({ state: Notification.permission }) : originalQuery(parameters) ); })()") - - # Pass the Plugins Length test - await page.add_init_script("(() => { Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); })()") - - # Pass the Languages test - await page.add_init_script("(() => { Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); })()") - - logging.info("Applied JavaScript stealth techniques from stealth_utils.") - except Exception as e: - logging.error(f"Error applying stealth JS from stealth_utils: {e}", exc_info=True)