diff --git a/.env.example b/.env.example
index 767687c..4744016 100644
--- a/.env.example
+++ b/.env.example
@@ -13,3 +13,10 @@ API_PORT=8000
# Scraping Limits
MAX_LISTINGS_PER_SESSION=25
+
+# Optional Proxy Configuration
+# If using rotating proxies (e.g., Webshare), uncomment and fill in the values below.
+# Example: PROXY_SERVER=http://proxyhost:port, with credentials in PROXY_USERNAME / PROXY_PASSWORD.
+# PROXY_SERVER=
+# PROXY_USERNAME=
+# PROXY_PASSWORD=
diff --git a/README.md b/README.md
index f5d383b..7525e16 100644
--- a/README.md
+++ b/README.md
@@ -20,3 +20,50 @@ The scraper can also be run standalone:
```bash
python main.py scrape_test
```
+
+## Environment Variables
+
+Set the following variables in a `.env` file or your deployment environment:
+
+| Variable | Description | Default |
+| --- | --- | --- |
+| `DATABASE_URL` | Database connection URL | `sqlite+aiosqlite:///./vehicle_data.db` |
+| `HEADLESS` | Run the browser in headless mode | `true` |
+| `BROWSER_TIMEOUT` | Playwright launch timeout (ms) | `60000` |
+| `PAGE_DELAY` | Base delay after page loads (ms) | `5000` |
+| `MIN_DELAY_BETWEEN_ACTIONS` | Delay between scraping actions (s) | `2.5` |
+| `API_HOST` | Host for the FastAPI server | `127.0.0.1` |
+| `API_PORT` | Port for the FastAPI server | `8000` |
+| `MAX_LISTINGS_PER_SESSION` | Maximum listings fetched per scrape | `25` |
+| `PROXY_SERVER` | *(Optional)* Proxy URL for Playwright | - |
+| `PROXY_USERNAME` | *(Optional)* Proxy username | - |
+| `PROXY_PASSWORD` | *(Optional)* Proxy password | - |
+
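+A minimal `.env` for local development might look like this (the proxy values are illustrative placeholders, not real defaults):
+
+```bash
+DATABASE_URL=sqlite+aiosqlite:///./vehicle_data.db
+HEADLESS=true
+MAX_LISTINGS_PER_SESSION=25
+# Optional rotating proxy; leave unset to connect directly
+PROXY_SERVER=http://proxy.example.com:8080
+PROXY_USERNAME=your_username
+PROXY_PASSWORD=your_password
+```
+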
+### Pagination
+
+The `/api/v1/vehicles/` endpoint accepts `skip` and `limit` query parameters to paginate results.
+`skip` must be 0 or greater; `limit` defaults to `MAX_LISTINGS_PER_SESSION` and is capped at 200 per request.
+Example: `/api/v1/vehicles/?skip=25&limit=25`.
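+
+The same request with the `requests` library (not a project dependency), assuming the API is on the default host and port:
+
+```python
+import requests
+
+# Skip the first 25 results and return the next 25 (page 2).
+resp = requests.get(
+    "http://127.0.0.1:8000/api/v1/vehicles/",
+    params={"skip": 25, "limit": 25},
+)
+resp.raise_for_status()
+print(f"Fetched {len(resp.json())} vehicles")
+```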
diff --git a/app.py b/app.py
deleted file mode 100644
index 8fa8500..0000000
--- a/app.py
+++ /dev/null
@@ -1,123 +0,0 @@
-import logging
-# import os # No longer needed for getenv in background task
-import asyncio
-from fastapi import FastAPI, Depends, BackgroundTasks
-from pydantic import BaseModel
-from typing import Dict
-from datetime import datetime
-from database import CarListing, get_db, Session, SessionLocal
-from scraper import scrape_autotrader_and_update_db
-from fastapi.middleware.cors import CORSMiddleware
-from config import AUTOTRADER_URL, HEADLESS_BROWSER, SCRAPE_TIMEOUT, LOG_LEVEL # Import from config
-
-# Configure basic logging using LOG_LEVEL from config
-# Ensure this is called only once. If FastAPI/Uvicorn also configures logging,
-# this might need adjustment or to be handled by the logger instance directly.
-# For now, assume this is the primary logging config.
-logging.basicConfig(level=LOG_LEVEL, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
-# Added force=True to ensure this config takes precedence if uvicorn also tries to set basicConfig.
-
-app = FastAPI()
-app.add_middleware(
- CORSMiddleware,
- allow_origins=["*"],
- allow_methods=["*"],
- allow_headers=["*"],
-)
-
-class CarListingRaw(BaseModel):
- platform: str
- extracted_at: datetime
- source_url: str
- data_points: Dict
-
-@app.post("/api/v1/listings/ingest")
-async def ingest_listing(payload: CarListingRaw, db: Session = Depends(get_db)):
- listing = CarListing(
- platform=payload.platform,
- extracted_at=payload.extracted_at,
- source_url=payload.source_url,
- data_points=payload.data_points
- )
- db.add(listing)
- db.commit()
- db.refresh(listing)
- return {"status": "saved", "listing_id": listing.id}
-
-@app.get("/")
-def read_root():
- return {"message": "🚗 Car Tracker API is running!"}
-
-# Global variable to store scraping status
-scrape_status = {
- "last_run_time": None,
- "status": "idle", # States: idle, running, success, error
- "message": "",
- "added": 0,
- "updated": 0,
- "scraped_count": 0
-}
-
-# Background task wrapper
-async def _background_scraper_task_wrapper():
- global scrape_status
- db_task_session: Session = SessionLocal()
- logging.info("Background scraper task started.")
- scrape_status["status"] = "running"
- scrape_status["message"] = "Scraping in progress..."
- scrape_status["last_run_time"] = datetime.utcnow().isoformat()
- scrape_status["added"] = 0 # Reset counts for current run
- scrape_status["updated"] = 0
- scrape_status["scraped_count"] = 0
-
- try:
- # Use imported config values
- # autotrader_url = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/private-seller")
- # headless_str = os.getenv("HEADLESS_BROWSER", "True")
- # headless = headless_str.lower() == "true"
- # scrape_timeout_str = os.getenv("SCRAPE_TIMEOUT", "120000")
- # try:
- # scrape_timeout = int(scrape_timeout_str)
- # except ValueError:
- # logging.warning(f"Invalid SCRAPE_TIMEOUT value: {scrape_timeout_str}. Defaulting to 120000ms.")
- # scrape_timeout = 120000
-
- logging.info(f"Background task using URL: {AUTOTRADER_URL}, Headless: {HEADLESS_BROWSER}, Timeout: {SCRAPE_TIMEOUT}ms")
-
- result = await scrape_autotrader_and_update_db(
- db=db_task_session,
- autotrader_url=AUTOTRADER_URL,
- headless=HEADLESS_BROWSER,
- scrape_timeout=SCRAPE_TIMEOUT
- )
-
- if result.get("status") == "success":
- scrape_status["status"] = "success"
- scrape_status["message"] = "Scraping completed successfully."
- scrape_status["added"] = result.get("added", 0)
- scrape_status["updated"] = result.get("updated", 0)
- scrape_status["scraped_count"] = result.get("scraped_count", 0)
- else:
- scrape_status["status"] = "error"
- scrape_status["message"] = result.get("message", "Scraping failed with an unknown error.")
-
- logging.info(f"Background scraper task completed: {result}")
-
- except Exception as e:
- logging.error(f"Error in background scraper task: {e}", exc_info=True)
- scrape_status["status"] = "error"
- scrape_status["message"] = str(e)
- finally:
- db_task_session.close()
- logging.info("Background scraper DB session closed.")
-
-@app.post("/api/v1/scrape/autotrader")
-async def trigger_autotrader_scrape(background_tasks: BackgroundTasks):
- if scrape_status["status"] == "running":
- return {"message": "AutoTrader scraping job is already running."}
- background_tasks.add_task(_background_scraper_task_wrapper)
- return {"message": "AutoTrader scraping job started in the background."}
-
-@app.get("/api/v1/scrape/status")
-async def get_scrape_status():
- return scrape_status
diff --git a/app/__init__.py b/app/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/app/crud.py b/app/crud.py
deleted file mode 100644
index fb8708b..0000000
--- a/app/crud.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from sqlalchemy.orm import Session
-from . import models, schemas
-from datetime import datetime
-
-def get_car_listing_by_url(db: Session, url: str):
- return db.query(models.ScrapedData).filter(models.ScrapedData.url == url).first()
-
-def create_car_listing(db: Session, listing: schemas.CarListingCreate):
- db_listing = models.ScrapedData(
- job_id=listing.job_id,
- platform=listing.platform,
- url=str(listing.url), # Ensure HttpUrl is converted to string
- title=listing.title,
- price=listing.price,
- mileage=listing.mileage,
- vin=listing.vin,
- image_urls=listing.image_urls, # Assuming image_urls is already a list of strings or compatible JSON
- raw_data=listing.raw_data,
- scraped_at=datetime.utcnow()
- )
- db.add(db_listing)
- db.commit()
- db.refresh(db_listing)
- return db_listing
-
-def create_scrape_job(db: Session) -> models.ScrapeJob:
- db_job = models.ScrapeJob(timestamp=datetime.utcnow(), status="pending")
- db.add(db_job)
- db.commit()
- db.refresh(db_job)
- return db_job
-
-def update_scrape_job_status(db: Session, job_id: int, status: str, results_count: int = 0, error_message: str = None):
- db_job = db.query(models.ScrapeJob).filter(models.ScrapeJob.id == job_id).first()
- if db_job:
- db_job.status = status
- db_job.results_count = results_count
- db_job.error_message = error_message
- db.commit()
- db.refresh(db_job)
- return db_job
-
-def get_scrape_job(db: Session, job_id: int):
- return db.query(models.ScrapeJob).filter(models.ScrapeJob.id == job_id).first()
-
-def get_all_scrape_jobs(db: Session, skip: int = 0, limit: int = 100):
- return db.query(models.ScrapeJob).order_by(models.ScrapeJob.timestamp.desc()).offset(skip).limit(limit).all()
-
-def get_listings_for_job(db: Session, job_id: int, skip: int = 0, limit: int = 100):
- return db.query(models.ScrapedData).filter(models.ScrapedData.job_id == job_id).offset(skip).limit(limit).all()
diff --git a/app/database.py b/app/database.py
deleted file mode 100644
index bf32154..0000000
--- a/app/database.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from sqlalchemy import create_engine
-from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy.orm import sessionmaker
-import os
-
-DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///./data/vehicle_tracker.db")
-
-engine_args = {}
-if DATABASE_URL.startswith("sqlite"):
- engine_args["connect_args"] = {"check_same_thread": False}
-
-engine = create_engine(DATABASE_URL, **engine_args)
-SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
-
-Base = declarative_base()
-
-def get_db():
- db = SessionLocal()
- try:
- yield db
- finally:
- db.close()
-
-def create_tables():
- Base.metadata.create_all(bind=engine)
diff --git a/app/main.py b/app/main.py
deleted file mode 100644
index 4817f15..0000000
--- a/app/main.py
+++ /dev/null
@@ -1,265 +0,0 @@
-import logging
-import os
-from fastapi import FastAPI, Depends, HTTPException, BackgroundTasks
-from sqlalchemy.orm import Session
-from typing import List
-
-from . import crud, models, schemas, scraper
-from .database import SessionLocal, engine
-
-# Create database tables if they don't exist
-models.Base.metadata.create_all(bind=engine)
-
-# Configure logging
-LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
-logging.basicConfig(level=LOG_LEVEL, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-app = FastAPI(title="AutoTrader Scraper API", version="1.0.0")
-
-# Dependency to get DB session
-def get_db():
- db = SessionLocal()
- try:
- yield db
- finally:
- db.close()
-
-# Global variable to store scraping status (simple approach)
-scrape_status = {
- "job_id": None,
- "status": "idle", # States: idle, pending, running, completed, failed
- "message": "No scraping job initiated yet.",
- "last_run_time": None,
- "duration_seconds": None,
- "results_count": 0,
- "error_message": None
-}
-
-async def run_scraping_task(job_id: int, autotrader_url: str, headless: bool, scrape_timeout: int):
- """
- The actual scraping task that runs in the background.
- It creates its own database session.
- """
- global scrape_status
- db: Session = SessionLocal()
- try:
- logger.info(f"Background task started for job_id: {job_id}")
- crud.update_scrape_job_status(db, job_id, status="running")
- scrape_status.update({
- "job_id": job_id,
- "status": "running",
- "message": f"Scraping from {autotrader_url}...",
- "last_run_time": datetime.utcnow().isoformat(),
- "duration_seconds": None,
- "results_count": 0,
- "error_message": None
- })
-
- start_time = datetime.utcnow()
-
- scraped_data_list = await scraper.scrape_autotrader_data(
- autotrader_url=autotrader_url,
- headless=headless,
- timeout=scrape_timeout
- )
-
- end_time = datetime.utcnow()
- duration = (end_time - start_time).total_seconds()
- scrape_status["duration_seconds"] = round(duration, 2)
-
- added_count = 0
- updated_count = 0 # Placeholder for future update logic
-
- if not scraped_data_list:
- logger.info(f"No listings found for job_id: {job_id}")
- crud.update_scrape_job_status(db, job_id, status="completed", results_count=0)
- scrape_status.update({
- "status": "completed",
- "message": "Scraping completed. No new listings found or page was inaccessible.",
- "results_count": 0
- })
- return
-
- for item_data in scraped_data_list:
- # Ensure all required fields for CarListingCreate are present
- listing_create = schemas.CarListingCreate(
- job_id=job_id,
- platform=item_data.get("source_name", "autotrader"), # Get platform from scraper or default
- url=item_data.get("listing_url"),
- title=item_data.get("title"),
- price=item_data.get("price"),
- mileage=item_data.get("mileage"),
- vin=item_data.get("vin"),
- image_urls=item_data.get("image_urls", []),
- raw_data=item_data.get("data_points", {})
- )
-
- existing_listing = crud.get_car_listing_by_url(db, str(listing_create.url))
- if existing_listing:
- # For now, we just count updates. Actual update logic could be added here.
- # e.g., existing_listing.price = listing_create.price
- # existing_listing.extracted_at = datetime.utcnow()
- updated_count += 1
- else:
- crud.create_car_listing(db=db, listing=listing_create)
- added_count += 1
-
- crud.update_scrape_job_status(db, job_id, status="completed", results_count=added_count)
- scrape_status.update({
- "status": "completed",
- "message": f"Scraping finished. Added: {added_count}, Updated: {updated_count} (placeholder).",
- "results_count": added_count + updated_count # Or just added_count if updates aren't really changing data
- })
- logger.info(f"Background task for job_id: {job_id} completed. Added: {added_count}, Updated: {updated_count}")
-
- except Exception as e:
- logger.error(f"Error in background scraper task for job_id {job_id}: {e}", exc_info=True)
- crud.update_scrape_job_status(db, job_id, status="failed", error_message=str(e))
- scrape_status.update({
- "status": "failed",
- "message": f"Error during scraping: {str(e)}",
- "error_message": str(e)
- })
- finally:
- db.close()
- logger.info(f"DB session closed for job_id: {job_id}")
-
-
-@app.post("/scrape/", response_model=schemas.ScrapeJob, status_code=202)
-async def trigger_scrape(background_tasks: BackgroundTasks, db: Session = Depends(get_db)):
- """
- Triggers a new scraping job for Autotrader.
- """
- global scrape_status
- if scrape_status.get("status") == "running":
- raise HTTPException(status_code=409, detail="A scraping job is already in progress.")
-
- autotrader_url = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/all-cars/cars-under-10000") # Default to a common search if not set
- headless_str = os.getenv("HEADLESS_BROWSER", "True")
- headless = headless_str.lower() == "true"
- scrape_timeout_str = os.getenv("SCRAPE_TIMEOUT", "120000")
-
- try:
- scrape_timeout = int(scrape_timeout_str)
- except ValueError:
- scrape_timeout = 120000 # Default timeout if parsing fails
- logger.warning(f"Invalid SCRAPE_TIMEOUT value: {scrape_timeout_str}. Using default {scrape_timeout}ms.")
-
- job = crud.create_scrape_job(db)
- scrape_status.update({
- "job_id": job.id,
- "status": "pending",
- "message": f"Scraping job {job.id} initiated for URL: {autotrader_url}",
- "last_run_time": job.timestamp.isoformat(),
- "duration_seconds": None,
- "results_count": 0,
- "error_message": None
- })
-
- # Pass job_id to the background task
- background_tasks.add_task(run_scraping_task, job.id, autotrader_url, headless, scrape_timeout)
-
- logger.info(f"Scraping job {job.id} queued for URL: {autotrader_url}")
- return job
-
-@app.post("/api/v1/listings/ingest", response_model=schemas.CarListing, status_code=201)
-async def ingest_listing(payload: schemas.CarListingCreate, db: Session = Depends(get_db)):
- """
- Ingests a new car listing into the database.
- This endpoint is useful for manually adding or testing data.
- """
- # Check if listing with this URL already exists to prevent duplicates,
- # though the database constraint should also handle this.
- db_listing = crud.get_car_listing_by_url(db, url=str(payload.url))
- if db_listing:
- raise HTTPException(status_code=400, detail="Listing with this URL already exists.")
-
- # The job_id in CarListingCreate might be problematic if this is a direct ingest
- # not tied to a specific scrape job. For now, we'll assume it's provided or
- # we could adjust the schema/logic if direct ingestion shouldn't have a job_id.
- # For testing, we might need to create a dummy job or adjust schema.
- # Let's assume for now a valid job_id is provided or handle it if not.
- if not payload.job_id:
- # Create a dummy job or handle as per requirements for listings not tied to a job
- # For simplicity, let's assume job_id is optional in the schema for this use case
- # or a default/placeholder job_id is used.
- # For this test, the payload includes job_id, so we'll proceed.
- # If CarListingCreate schema requires job_id, this endpoint needs to handle it.
- # For now, let's assume it's provided in the payload.
- pass
-
- try:
- created_listing = crud.create_car_listing(db=db, listing=payload)
- return created_listing
- except Exception as e:
- logger.error(f"Error ingesting listing: {e}", exc_info=True)
- raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
-
-
-@app.get("/scrape/status", response_model=schemas.ScrapeJob) # Using ScrapeJob schema for better structure
-async def get_current_scrape_status(db: Session = Depends(get_db)):
- """
- Returns the status of the current or last scraping job.
- """
- global scrape_status
- if scrape_status.get("job_id"):
- job = crud.get_scrape_job(db, scrape_status["job_id"])
- if job:
- # Update status from DB if available, otherwise use in-memory for simplicity
- # A more robust system might always fetch from DB or use a proper job queue status
- return job
- return scrape_status # Fallback to in-memory status if job not found or not started
-
-@app.get("/scrape/jobs/", response_model=List[schemas.ScrapeJob])
-async def read_jobs(skip: int = 0, limit: int = 10, db: Session = Depends(get_db)):
- """
- Retrieve all scrape jobs.
- """
- jobs = crud.get_all_scrape_jobs(db, skip=skip, limit=limit)
- return jobs
-
-@app.get("/scrape/jobs/{job_id}/results", response_model=List[schemas.CarListing])
-async def read_job_results(job_id: int, skip: int = 0, limit: int = 10, db: Session = Depends(get_db)):
- """
- Retrieve results for a specific scrape job.
- """
- job = crud.get_scrape_job(db, job_id=job_id)
- if job is None:
- raise HTTPException(status_code=404, detail="Job not found")
- listings = crud.get_listings_for_job(db, job_id=job_id, skip=skip, limit=limit)
- return listings
-
-@app.get("/")
-async def read_root():
- return {"message": "AutoTrader Scraper API is running!"}
-
-# This is for local development if you run `python app/main.py`
-# Uvicorn will be started by Procfile in production environments like Heroku
-if __name__ == "__main__":
- # Ensure tables are created before starting the app if they don't exist
- # This is useful for local development but might be handled differently in production
- from .database import create_tables
- create_tables()
-
- # Get port from environment variable or default to 8000
- port = int(os.getenv("PORT", "8000"))
- uvicorn.run(app, host="0.0.0.0", port=port)
-
-# Remove the old main.py content if it exists in the root directory
-# This is now handled by app/main.py
-# Ensure Procfile points to app.main:app or similar based on your directory structure
-# e.g., web: uvicorn app.main:app --host=0.0.0.0 --port=${PORT:-8000}
-# (Assuming app.py is moved to app/main.py)
-# If app.py remains in root, then Procfile is fine.
-
-# The `models.Base.metadata.create_all(bind=engine)` should ideally be called once,
-# perhaps in main.py or a startup script, not every time database.py is imported.
-# For simplicity in this single-file app structure, it's often put there.
-# If app.py is the main entry point for uvicorn, it's a good place.
-# For Render, buildCommand in render.yaml can also handle migrations/table creation.
-
-# Let's ensure the imports are correct considering the file structure
-# If main.py is in root and imports from app/, it should be `from app import crud, models, schemas, scraper`
-# If this file is app/main.py, then `from . import crud, models, schemas, scraper` is correct.
-# The prompt implies this file is app/main.py.
diff --git a/app/models.py b/app/models.py
deleted file mode 100644
index b0d4e5d..0000000
--- a/app/models.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, JSON
-from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy.orm import relationship
-from datetime import datetime
-
-Base = declarative_base()
-
-class ScrapeResult(Base):
- __tablename__ = "scrape_results"
-
- id = Column(Integer, primary_key=True, index=True)
- url = Column(String, index=True)
- title = Column(String)
- price = Column(String, nullable=True) # Store as string to handle variations like 'Contact Seller'
- mileage = Column(String, nullable=True) # Store as string to handle non-numeric values
- vin = Column(String, nullable=True, unique=True)
- images = Column(JSON, nullable=True) # Store list of image URLs
- scraped_at = Column(DateTime)
- details = Column(JSON, nullable=True) # Store other details as JSON
-
-class ScrapeJob(Base):
- __tablename__ = "scrape_jobs"
-
- id = Column(Integer, primary_key=True, index=True)
- timestamp = Column(DateTime, default=datetime.utcnow)
- status = Column(String, default="pending") # e.g., pending, running, completed, failed
- results_count = Column(Integer, default=0)
- error_message = Column(String, nullable=True)
-
-class ScrapedData(Base):
- __tablename__ = "scraped_data"
-
- id = Column(Integer, primary_key=True, index=True)
- job_id = Column(Integer, ForeignKey("scrape_jobs.id"))
- platform = Column(String) # e.g., 'autotrader', 'cars.com'
- url = Column(String, unique=True, index=True)
- title = Column(String, nullable=True)
- price = Column(String, nullable=True)
- mileage = Column(String, nullable=True)
- vin = Column(String, nullable=True, index=True)
- image_urls = Column(JSON, nullable=True) # List of image URLs
- raw_data = Column(JSON, nullable=True) # Full raw data if needed
- scraped_at = Column(DateTime, default=datetime.utcnow)
-
- job = relationship("ScrapeJob")
diff --git a/app/schemas.py b/app/schemas.py
deleted file mode 100644
index 2ee0d8d..0000000
--- a/app/schemas.py
+++ /dev/null
@@ -1,41 +0,0 @@
-from pydantic import BaseModel, HttpUrl
-from typing import List, Optional, Dict, Any
-from datetime import datetime
-
-class CarListingBase(BaseModel):
- url: HttpUrl
- title: Optional[str] = None
- price: Optional[str] = None # Keep as string to handle variations
- mileage: Optional[str] = None # Keep as string
- vin: Optional[str] = None
- image_urls: Optional[List[HttpUrl]] = []
- raw_data: Optional[Dict[str, Any]] = {} # For any other unstructured data
-
-class CarListingCreate(CarListingBase):
- platform: str
- job_id: int
-
-class CarListing(CarListingBase):
- id: int
- platform: str
- job_id: int
- scraped_at: datetime
-
- class Config:
- orm_mode = True
-
-class ScrapeJobBase(BaseModel):
- pass
-
-class ScrapeJobCreate(ScrapeJobBase):
- pass
-
-class ScrapeJob(ScrapeJobBase):
- id: int
- timestamp: datetime
- status: str
- results_count: int = 0
- error_message: Optional[str] = None
-
- class Config:
- orm_mode = True
diff --git a/app/scraper.py b/app/scraper.py
deleted file mode 100644
index b946ecb..0000000
--- a/app/scraper.py
+++ /dev/null
@@ -1,373 +0,0 @@
-import asyncio
-import logging
-# import os # No longer needed for getenv in main
-import datetime # Keep for now, might be used in data processing
-from playwright.async_api import async_playwright
-# Required for main test function
-from config import AUTOTRADER_URL, HEADLESS_BROWSER, SCRAPE_TIMEOUT
-# DATABASE_URL is used by database.py, SessionLocal will pick it up via config
-
-# Assuming database.py is in the same directory or accessible in PYTHONPATH
-from database import get_db, CarListing, SessionLocal # Added SessionLocal for main example
-from sqlalchemy.orm import Session
-from datetime import datetime # Ensure datetime is imported directly
-
-# Configure basic logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
-from stealth_utils import apply_stealth_js # Import new stealth utility
-# from playwright_stealth import stealth_async # Commenting out old stealth
-
-class AutoTraderScraper:
- """Scraper for AutoTrader private party listings using Playwright."""
-
- def __init__(self, source_name: str = "autotrader"):
- """
- Initializes the AutoTraderScraper.
- Args:
- source_name (str): Name of the source platform.
- """
- self.source_name = source_name
- # Potentially load other configs from a config file or env vars here
- # For example: self.base_url = "https://www.autotrader.com/cars-for-sale/private-seller"
-
- async def get_private_listings(self, autotrader_url: str, headless: bool, timeout: int = 120000) -> list[dict]:
- """
- Scrapes private party listings from AutoTrader using Playwright.
-
- Args:
- autotrader_url (str): The starting URL for scraping AutoTrader private listings.
- headless (bool): Whether to run the browser in headless mode.
- timeout (int): Maximum time in milliseconds for page operations.
-
- Returns:
- list[dict]: A list of dictionaries, where each dictionary represents a scraped vehicle listing.
- """
- listings_data = []
- browser = None
-
- launch_options = {
- "headless": headless,
- "args": [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-infobars',
- '--window-position=0,0',
- '--ignore-certificate-errors',
- '--ignore-certificate-errors-spki-list',
- # '--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"' # User agent is set in context
- '--disable-gpu' # Already there but keep
- ],
- # "channel": "chrome" # This might require full Chrome install, trying without first to see if args help
- }
-
- # Try with 'msedge' or 'chrome' if default chromium fails and they are available
- # For now, stick to chromium and args. If 'channel' is needed, it's a bigger setup change.
-
- async with async_playwright() as p:
- try:
- # browser = await p.chromium.launch(**launch_options) # Default chromium
- # Let's try specifying channel, assuming it might use a locally installed Chrome if available, or a Playwright-managed one.
- # This is a common suggestion if the default Playwright Chromium build is too easily detected.
- # If "chrome" channel is not found by Playwright, it will error.
- try:
- browser = await p.chromium.launch(
- **launch_options,
- channel="chrome" # Attempt to use a branded Chrome build
- )
- logging.info("Attempting to launch with channel='chrome'")
- except Exception as e_channel:
- logging.warning(f"Failed to launch with channel='chrome' ({e_channel}). Falling back to default Playwright Chromium.")
- # Remove channel from launch_options if it failed
- launch_options_no_channel = launch_options.copy()
- if "channel" in launch_options_no_channel: # Should not be needed based on above structure but good practice
- del launch_options_no_channel["channel"]
- browser = await p.chromium.launch(**launch_options_no_channel)
- logging.info("Launched with default Playwright Chromium.")
-
-
- context = await browser.new_context(
- user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36', # A fairly common user agent
- java_script_enabled=True,
- )
- context.set_default_navigation_timeout(timeout)
- context.set_default_timeout(timeout)
-
- page = await context.new_page()
- await page.set_viewport_size({"width": 1920, "height": 1080})
-
- # Apply custom JS stealth
- await apply_stealth_js(page)
-
- logging.info(f"Navigating to {autotrader_url}")
- await page.goto(autotrader_url, wait_until="domcontentloaded", timeout=timeout) # Reverted to domcontentloaded
-
- title = await page.title()
- logging.info(f"Page title: {title}")
-
- if "unavailable" in title.lower() or "block" in title.lower() or "access denied" in title.lower():
- logging.critical(f"Failed to load AutoTrader listings page. Blocked by website. Title: {title}")
- await browser.close() # Ensure browser is closed before returning
- return []
-
- # Using speculative selectors for AutoTrader
- # Main container for listings: 'div[data-qaid="cntnr-lstng-main"]' (this might be too broad or incorrect)
- # A more specific item selector might be needed, e.g., an article or a div with a specific class.
- # For now, let's assume individual listing cards can be found with a selector like:
- # "div.inventory-listing" or "div[data-cmp='inventoryListing']" - these are common patterns.
- # The provided example 'div[data-qaid="cntnr-lstng-main"]' seems like it might be a single container FOR ALL listings.
- # Let's try a more specific (but still guessed) selector for individual listing items.
- # A common pattern is items within a list or grid. Let's try to find items:
- # This selector is a **GUESS** based on common AutoTrader structures.
- listing_item_selector = "div[data-cmp='inventoryListing']" # GUESS
-
- # Fallback if the primary guess doesn't work, try another common pattern
- # listing_item_selector_fallback = "div.inventory-listing.new-listing.stub" # Another GUESS
-
- # await page.wait_for_selector(listing_item_selector, timeout=15000) # Wait for items to appear
-
- listing_containers = await page.query_selector_all(listing_item_selector)
-
- # if not listing_containers:
- # logging.info(f"No listings found with primary selector '{listing_item_selector}'. Trying fallback...")
- # listing_containers = await page.query_selector_all(listing_item_selector_fallback)
-
- logging.info(f"Found {len(listing_containers)} potential listing containers using selector '{listing_item_selector}'.")
-
- processed_count = 0
- # first_container_processed_for_html_dump = False # REMOVE HTML DUMP FLAG
- for i, container in enumerate(listing_containers):
- url_path = None
- title_text = "N/A" # Default to N/A
- price_text = "N/A" # Default to N/A
- mileage_text = "N/A" # Default to N/A (as it's not reliably on card)
- listing_url = None
-
- try:
- logging.debug(f"Processing container {i+1}/{len(listing_containers)}")
-
- # Attempt to get Title
- title_el = await container.query_selector("h2[data-cmp='subheading']") # Updated selector from HTML dump
- if title_el:
- raw_title_text = await title_el.inner_text()
- title_text = raw_title_text.strip() if raw_title_text else "N/A"
-
- # Attempt to get URL from parent of title_el
- # Playwright's query_selector does not directly support xpath like "ancestor::a".
-                            # A common structure is an <a> element wrapping the title <h2>, or an <h2> nested inside an <a>.
- # We can try to find 'a' that contains this h2, or assume the 'a[data-cmp="link"]' is the one.
-
- # Let's use the a[data-cmp="link"] which was identified as containing the title h2
- parent_link_el = await container.query_selector("a[data-cmp='link']")
- if parent_link_el:
- url_path = await parent_link_el.get_attribute("href")
- else: # Fallback if the above structure isn't found
- logging.warning(f"Could not find parent a[data-cmp='link'] for title in listing {i+1}")
- else:
- logging.warning(f"Title not found with h2[data-cmp='subheading'] for listing {i+1}.")
-
- # Fallback or alternative for URL if not found via title's parent link
- if not url_path:
- url_el_alt = await container.query_selector("a[data-cmp='relLnk']") # Keep this fallback
- if url_el_alt:
- url_path = await url_el_alt.get_attribute("href")
-
- if not url_path: # Last resort for URL
- first_a = await container.query_selector("a[href]") # Broadest fallback
- if first_a:
- url_path = await first_a.get_attribute("href")
-
- if not url_path:
- logging.warning(f"Could not extract URL for listing {i+1} (Title: {title_text}). Skipping.")
- continue
-
- if not url_path.startswith(('http://', 'https://')):
- listing_url = f"https://www.autotrader.com{url_path}"
- else:
- listing_url = url_path
-
- # Attempt to get Price
- price_el = await container.query_selector("div[data-cmp='firstPrice']") # Updated selector
- if price_el:
- raw_price_text = await price_el.inner_text()
- price_text = raw_price_text.replace('$', '').replace(',', '').strip() if raw_price_text else "N/A"
- else:
- # Fallback for price (e.g. .first-price class directly)
- price_el_fallback = await container.query_selector(".first-price")
- if price_el_fallback:
- raw_price_text = await price_el_fallback.inner_text()
- price_text = raw_price_text.replace('$', '').replace(',', '').strip() if raw_price_text else "N/A"
- else:
- logging.warning(f"Price not found for listing {listing_url}")
- price_text = "N/A"
-
-
- # Mileage - Set to N/A as it's not reliably on the card from previous findings
- mileage_text = "N/A"
- # logging.info(f"Mileage not scraped from listing card for {listing_url} (by design for now).")
-
- vin_text = None
-
- listing_data = {
- "listing_url": listing_url,
- "title": title_text, # Already defaults to N/A or has value
- "price": price_text, # Already defaults to N/A or has value
- "mileage": mileage_text, # Is N/A
- "vin": vin_text,
- "source_name": self.source_name,
- "data_points": {
- "page_title_at_scrape": title # page's title, not listing's
- }
- }
- listings_data.append(listing_data)
- processed_count += 1
- logging.info(f"Successfully processed listing: {title_text[:50]}... URL: {listing_url}")
-
- except Exception as e:
- logging.error(f"Error processing listing container {i+1} for URL {listing_url if listing_url else 'Unknown'}: {e}", exc_info=True)
- continue
-
- logging.info(f"Successfully processed {processed_count} out of {len(listing_containers)} listing containers.")
-
- except Exception as e:
- logging.error(f"An error occurred during Playwright scraping phase: {e}", exc_info=True)
- finally:
- if browser:
- logging.info("Closing browser.")
- await browser.close()
-
- return listings_data
-
-
-async def scrape_autotrader_data(autotrader_url: str, headless: bool = True, timeout: int = 120000) -> list[dict]:
- """
- High-level function to scrape data from AutoTrader.
- Initializes the scraper and calls its scraping method.
-
- Args:
- autotrader_url (str): The URL to scrape.
- headless (bool): Whether to run the browser in headless mode.
- timeout (int): Timeout for scraping operations in milliseconds.
-
- Returns:
- list[dict]: A list of scraped listing data.
- """
- scraper = AutoTraderScraper()
- listings = await scraper.get_private_listings(autotrader_url=autotrader_url, headless=headless, timeout=timeout)
- return listings
-
-
-async def scrape_autotrader_and_update_db(db: Session, autotrader_url: str, headless: bool, scrape_timeout: int):
- """
- Scrapes listings from AutoTrader and updates the database.
-
- Args:
- db (Session): The SQLAlchemy database session.
- autotrader_url (str): The URL to scrape.
- headless (bool): Whether to run the browser in headless mode.
- scrape_timeout (int): Timeout for scraping operations in milliseconds.
-
- Returns:
- dict: A status dictionary with counts of added, updated, and scraped listings.
- """
- logging.info(f"Starting scrape and update for URL: {autotrader_url}")
-
- try:
- listings_data = await scrape_autotrader_data(
- autotrader_url=autotrader_url,
- headless=headless,
- timeout=scrape_timeout
- )
- except Exception as e:
- logging.error(f"Failed to scrape data from {autotrader_url}: {e}", exc_info=True)
- return {"status": "error", "message": f"Scraping failed: {e}"}
-
- added_count = 0
- updated_count = 0
- scraped_count = len(listings_data)
-
- for listing_data in listings_data:
- source_url = listing_data.get('listing_url') # Renamed from 'url' to 'listing_url' in dummy data
- if not source_url:
- logging.warning(f"Scraped item missing 'listing_url': {listing_data.get('title')}. Skipping.")
- continue
-
- try:
- existing_listing = db.query(CarListing).filter(CarListing.source_url == source_url).first()
-
- if existing_listing:
- # Placeholder for update logic
- # existing_listing.extracted_at = datetime.utcnow()
- # existing_listing.data_points = {k: v for k, v in listing_data.items() if k != 'listing_url'}
- # # Update other fields like price if necessary
- # db.add(existing_listing) # Not strictly necessary if only mutable fields changed and session tracks
- updated_count += 1
- logging.info(f"Listing at {source_url} already exists. Marked for update (placeholder).")
- else:
- new_listing = CarListing(
- platform="autotrader",
- extracted_at=datetime.utcnow(),
- source_url=source_url,
- # Ensure data_points stores everything else from listing_data
- data_points={k: v for k, v in listing_data.items() if k != 'listing_url'}
- )
- db.add(new_listing)
- added_count += 1
- logging.info(f"New listing added from {source_url}")
- except Exception as e:
- logging.error(f"Error processing listing {source_url} for DB: {e}", exc_info=True)
- # Decide if you want to rollback here or continue with other listings
-
- try:
- db.commit()
- logging.info("Database changes committed.")
- except Exception as e:
- logging.error(f"Database commit failed: {e}", exc_info=True)
- db.rollback()
- return {"status": "error", "message": f"DB commit failed: {e}", "added": 0, "updated": 0, "scraped_count": scraped_count}
-
- status_summary = {
- "status": "success",
- "added": added_count,
- "updated": updated_count,
- "scraped_count": scraped_count
- }
- logging.info(f"DB update summary: {status_summary}")
- return status_summary
-
-async def main():
- # Use settings from config.py
- # url = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/private-seller")
- # headless_str = os.getenv("HEADLESS_BROWSER", "True")
- # headless = headless_str.lower() == "true"
- # scrape_timeout_str = os.getenv("SCRAPE_TIMEOUT", "120000")
- # try:
- # scrape_timeout = int(scrape_timeout_str)
- # except ValueError:
- # logging.warning(f"Invalid SCRAPE_TIMEOUT value: {scrape_timeout_str}. Defaulting to 120000ms.")
- # scrape_timeout = 120000
-
- # from database import SessionLocal # Already imported at the top
- db: Session = SessionLocal() # SessionLocal now uses DATABASE_URL from config.py via database.py
- try:
- logging.info(f"Starting scraper and DB update for URL: {AUTOTRADER_URL}, Headless: {HEADLESS_BROWSER}, Timeout: {SCRAPE_TIMEOUT}ms")
- stats = await scrape_autotrader_and_update_db(
- db=db,
- autotrader_url=AUTOTRADER_URL,
- headless=HEADLESS_BROWSER,
- scrape_timeout=SCRAPE_TIMEOUT
- )
- logging.info(f"Scraping and DB update completed: {stats}")
- except Exception as e:
- logging.error(f"Error during scraping and DB update in main: {e}", exc_info=True)
- finally:
- logging.info("Closing DB session in main.")
- db.close()
-
-if __name__ == "__main__":
- # To run this:
- # 1. Ensure Playwright browsers are installed: `playwright install chromium`
- # 2. Set environment variables if needed (AUTOTRADER_URL, HEADLESS_BROWSER, SCRAPE_TIMEOUT)
- # 3. Uncomment the line below
- asyncio.run(main())
- # pass # Keep it passive for now, to be run manually when needed
diff --git a/config.py b/config.py
deleted file mode 100644
index 44148ca..0000000
--- a/config.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import os
-from dotenv import load_dotenv
-
-# Load environment variables from .env file if it exists
-# This is useful for local development.
-load_dotenv()
-
-# Database Configuration
-DATABASE_URL: str = os.getenv("DATABASE_URL", "sqlite:///./data/vehicle_tracker.db")
-
-# Scraper Configuration
-AUTOTRADER_URL: str = os.getenv("AUTOTRADER_URL", "https://www.autotrader.com/cars-for-sale/private-seller")
-SCRAPE_TIMEOUT: int = int(os.getenv("SCRAPE_TIMEOUT", "120000")) # Milliseconds
-HEADLESS_BROWSER: bool = os.getenv("HEADLESS_BROWSER", "True").lower() == "true"
-
-# API Configuration (if any specific ones are needed later)
-# Example: API_HOST: str = os.getenv("API_HOST", "0.0.0.0")
-# Example: API_PORT: int = int(os.getenv("API_PORT", "8000"))
-
-# Logging Configuration (can also be added here if more complex)
-LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO").upper()
-
-# Ensure critical URLs have a scheme for robustness
-if not AUTOTRADER_URL.startswith(("http://", "https://")):
- # This print statement is for immediate feedback during startup/import.
- # In a pure library, side effects on import are sometimes discouraged,
- # but for an application's main config, it's often acceptable.
- print(f"Warning: AUTOTRADER_URL ('{AUTOTRADER_URL}') did not have a scheme, prepended https://.")
- AUTOTRADER_URL = "https://" + AUTOTRADER_URL
- print(f"Corrected AUTOTRADER_URL: {AUTOTRADER_URL}")
-
-
-# Example of how to handle SQLite connect_args based on config
-DB_CONNECT_ARGS: dict = {"check_same_thread": False} if DATABASE_URL.startswith("sqlite") else {}
diff --git a/database.py b/database.py
deleted file mode 100644
index 58a11dc..0000000
--- a/database.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from sqlalchemy import Column, Integer, String, DateTime, JSON, create_engine
-from sqlalchemy.orm import declarative_base, sessionmaker, Session
-from config import DATABASE_URL, DB_CONNECT_ARGS # Import from config
-
-# Use imported configuration
-engine = create_engine(DATABASE_URL, connect_args=DB_CONNECT_ARGS)
-
-SessionLocal = sessionmaker(bind=engine, autoflush=False)
-Base = declarative_base()
-
-class CarListing(Base):
- __tablename__ = "listings"
- id = Column(Integer, primary_key=True, index=True)
- platform = Column(String)
- extracted_at = Column(DateTime)
- source_url = Column(String, unique=True)
- data_points = Column(JSON)
-
-Base.metadata.create_all(bind=engine)
-
-def get_db():
- db = SessionLocal()
- try:
- yield db
- finally:
- db.close()
diff --git a/package.json b/package.json
deleted file mode 100644
index 5f62ed5..0000000
--- a/package.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
- "name": "autotrader-scraper",
- "version": "1.0.0",
- "description": "A FastAPI application for scraping Autotrader data.",
- "main": "index.js",
- "scripts": {
- "start": "python app.py",
- "test": "echo \"Error: no test specified\" && exit 1"
- },
- "keywords": ["fastapi", "autotrader", "scraper", "web-scraping"],
- "author": "",
- "license": "ISC"
-}
diff --git a/render.yaml b/render.yaml
index b3c7392..f755e35 100644
--- a/render.yaml
+++ b/render.yaml
@@ -11,9 +11,19 @@ services:
value: 3.11
- key: DATABASE_URL
generateValue: true
- - key: AUTOTRADER_URL
- value: "https://www.autotrader.com/cars-for-sale/private-seller"
- - key: SCRAPE_TIMEOUT
- value: 120000
- - key: HEADLESS_BROWSER
+ - key: HEADLESS
value: "True"
+ - key: BROWSER_TIMEOUT
+ value: "60000"
+ - key: PAGE_DELAY
+ value: "5000"
+ - key: MIN_DELAY_BETWEEN_ACTIONS
+ value: "2.5"
+ - key: MAX_LISTINGS_PER_SESSION
+ value: "25"
+ - key: PROXY_SERVER
+ value: ""
+ - key: PROXY_USERNAME
+ value: ""
+ - key: PROXY_PASSWORD
+ value: ""
diff --git a/src/api/routes.py b/src/api/routes.py
index c1f5764..612fb3f 100644
--- a/src/api/routes.py
+++ b/src/api/routes.py
@@ -121,8 +121,10 @@ async def api_v1_root_info():
@router.get("/vehicles/", response_model=List[VehicleListingResponse])
async def get_all_vehicles(
+ skip: int = Query(0, ge=0),
+ limit: int = Query(settings.MAX_LISTINGS_PER_SESSION, ge=1, le=200),
db: AsyncSession = Depends(get_db),
- filters: SearchFilters = Depends()
+ filters: SearchFilters = Depends(),
):
query = select(VehicleListing)
conditions = []
@@ -151,7 +153,8 @@ async def get_all_vehicles(
if conditions:
query = query.where(and_(*conditions))
query = query.order_by(VehicleListing.last_scraped_at.desc(), VehicleListing.created_at.desc())
- result = await db.execute(query)
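+    # Apply pagination after filtering and ordering so page contents are stable.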
+ result = await db.execute(query.offset(skip).limit(limit))
vehicles = result.scalars().all()
response_vehicles = []
for vehicle_db_item in vehicles:
diff --git a/src/automation/browser_sim.py b/src/automation/browser_sim.py
index 96ed36f..c93f204 100644
--- a/src/automation/browser_sim.py
+++ b/src/automation/browser_sim.py
@@ -25,8 +25,18 @@ async def __aenter__(self):
logger.info("Initializing AutoTrader Scraper...")
self.playwright_instance = await async_playwright().start()
try:
+ proxy_cfg = None
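+            # Playwright's chromium.launch() accepts a proxy dict shaped like
+            # {"server": ..., "username": ..., "password": ...}; None means no proxy.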
+ if settings.PROXY_SERVER:
+ proxy_cfg = {"server": settings.PROXY_SERVER}
+ if settings.PROXY_USERNAME and settings.PROXY_PASSWORD:
+ proxy_cfg["username"] = settings.PROXY_USERNAME
+ proxy_cfg["password"] = settings.PROXY_PASSWORD
+
self.browser = await self.playwright_instance.chromium.launch(
headless=settings.HEADLESS,
+ proxy=proxy_cfg,
args=[
'--no-sandbox',
'--disable-setuid-sandbox',
diff --git a/src/config.py b/src/config.py
index 598bc2e..94a9b77 100644
--- a/src/config.py
+++ b/src/config.py
@@ -13,4 +13,10 @@ class Settings:
API_PORT: int = int(os.getenv("API_PORT", "8000"))
MAX_LISTINGS_PER_SESSION: int = int(os.getenv("MAX_LISTINGS_PER_SESSION", "25"))
+ # Proxy configuration
+ PROXY_SERVER: str | None = os.getenv("PROXY_SERVER")
+ PROXY_USERNAME: str | None = os.getenv("PROXY_USERNAME")
+ PROXY_PASSWORD: str | None = os.getenv("PROXY_PASSWORD")
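+    # When unset these stay None (falsy), so browser_sim launches without a proxy.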
+
settings = Settings()
diff --git a/stealth_utils.py b/stealth_utils.py
deleted file mode 100644
index e956687..0000000
--- a/stealth_utils.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import logging
-
-async def apply_stealth_js(page):
- """
- Applies various JavaScript injections to make Playwright less detectable.
- """
- try:
- # Pass the User-Agent test (though Playwright usually handles this well)
- # user_agent = await page.evaluate("() => navigator.userAgent")
- # await page.set_extra_http_headers({'User-Agent': user_agent.replace("HeadlessChrome", "Chrome")}) # Example
-
- # Pass the WebGL test
- await page.add_init_script("(() => { const getParameter = WebGLRenderingContext.prototype.getParameter; WebGLRenderingContext.prototype.getParameter = function(parameter) { if (parameter === 37445) { return 'Intel Open Source Technology Center'; } if (parameter === 37446) { return 'Mesa DRI Intel(R) Ivybridge Mobile '; } return getParameter(parameter); }; })()")
-
- # Pass the Chrome test
- await page.add_init_script("(() => { Object.defineProperty(navigator, 'webdriver', { get: () => false }); })()")
- await page.add_init_script("(() => { window.chrome = { runtime: {}, loadTimes: function(){}, csi: function(){} }; })()")
-
- # Pass the Permissions test
- await page.add_init_script("(() => { const originalQuery = window.navigator.permissions.query; window.navigator.permissions.query = (parameters) => ( parameters.name === 'notifications' ? Promise.resolve({ state: Notification.permission }) : originalQuery(parameters) ); })()")
-
- # Pass the Plugins Length test
- await page.add_init_script("(() => { Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] }); })()")
-
- # Pass the Languages test
- await page.add_init_script("(() => { Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); })()")
-
- logging.info("Applied JavaScript stealth techniques from stealth_utils.")
- except Exception as e:
- logging.error(f"Error applying stealth JS from stealth_utils: {e}", exc_info=True)