diff --git a/backend/README.md b/backend/README.md
index e69de29..92718b3 100644
--- a/backend/README.md
+++ b/backend/README.md
@@ -0,0 +1,177 @@
+# Backend Setup and API Testing Guide
+
+This guide explains how to set up the backend environment, run the server, and test the endpoints for the project.
+
+## Prerequisites
+
+- **Python**: Ensure you have Python 3.10 or later installed.
+- **pip**: Comes pre-installed with Python. If not, install it.
+- **Postman** (or cURL): For API testing.
+- **Git**: For cloning the repository.
+
+## Steps to Set Up and Run the Backend Server
+
+### 1. Clone the Repository
+
+```bash
+git clone https://github.com/your-repo/fintech-hackathon.git
+cd fintech-hackathon/backend
+```
+
+### 2. Create a Virtual Environment
+
+Create and activate a Python virtual environment to isolate dependencies.
+
+#### macOS/Linux:
+
+```bash
+python3 -m venv fintech-env
+source fintech-env/bin/activate
+```
+
+#### Windows:
+
+```bash
+python -m venv fintech-env
+fintech-env\Scripts\activate
+```
+
+### 3. Install Dependencies
+
+Install all required Python libraries:
+
+```bash
+pip install -r requirements.txt
+```
+
+### 4. Set Up Environment Variables
+
+Create a `.env` file in the `backend/` directory with the following content:
+
+```
+OPENAI_API_KEY=<your_openai_api_key>
+```
+
+Replace `<your_openai_api_key>` with your actual OpenAI API key.
+
+### 5. Run the Backend Server
+
+Start the FastAPI server:
+
+```bash
+uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
+```
+
+The server will be available at `http://127.0.0.1:8000`.
+
+---
+
+## Testing the API Endpoints
+
+You can test the API endpoints using **Postman** or **cURL**.
+
+### 1. Using Postman
+
+1. Open Postman and create a new **POST** request.
+2. Set the URL to:
+   ```
+   http://127.0.0.1:8000/scrape
+   ```
+3. In the **Body** tab, select `raw` and set the content type to `JSON`.
+4. 
Enter the following JSON payload:
+
+   ```json
+   {
+     "prompt": "List all projects with their description.",
+     "url": "https://perinim.github.io/projects/"
+   }
+   ```
+
+5. Click **Send**.
+
+6. You should receive a response similar to this:
+
+   ```json
+   {
+     "result": [
+       {
+         "name": "Project A",
+         "description": "Description of Project A",
+         "link": "https://example.com/project-a"
+       },
+       {
+         "name": "Project B",
+         "description": "Description of Project B",
+         "link": "https://example.com/project-b"
+       }
+     ]
+   }
+   ```
+
+### 2. Using cURL
+
+Alternatively, you can use `cURL` to test the endpoint:
+
+```bash
+curl -X POST http://127.0.0.1:8000/scrape \
+-H "Content-Type: application/json" \
+-d '{"prompt": "List all projects with their description.", "url": "https://perinim.github.io/projects/"}'
+```
+
+Expected Output:
+```json
+{
+  "result": [
+    {
+      "name": "Project A",
+      "description": "Description of Project A",
+      "link": "https://example.com/project-a"
+    },
+    {
+      "name": "Project B",
+      "description": "Description of Project B",
+      "link": "https://example.com/project-b"
+    }
+  ]
+}
+```
+
+---
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Virtual Environment Not Found**:
+   - Ensure the virtual environment was created and activated correctly.
+
+2. **ModuleNotFoundError**:
+   - Check that all dependencies are installed by running:
+     ```bash
+     pip install -r requirements.txt
+     ```
+
+3. **500 Internal Server Error**:
+   - Verify your `.env` file contains a valid `OPENAI_API_KEY`.
+
+4. **Subprocess Errors**:
+   - Ensure the scraper scripts (e.g. `default_scraper.py`, `company_scraper.py`) are in the correct directory (`backend/app/scrapers/`).
+
+---
+
+## Project Structure
+
+```
+backend/
+├── app/
+│   ├── __init__.py
+│   ├── config.py
+│   ├── main.py
+│   ├── scrapers/   # scraper scripts (default_scraper.py, company_scraper.py, ...)
+│   └── utils/      # ScraperManager and SubprocessManager
+├── fintech-env/ # Virtual environment (not in Git)
+├── requirements.txt
+└── README.md
+```
+
+---
\ No newline at end of file
diff --git a/backend/app/main.py b/backend/app/main.py
index 9f9209d..e1c14e9 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -1,12 +1,6 @@
-# backend/app/main.py
-
-from fastapi import FastAPI, HTTPException, Body
+from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-from typing import List
-from app.scrape_utils import run_scraper
-import tracemalloc
-## trace issue
-tracemalloc.start()
+from app.utils.scraper_manager import ScraperManager
 
 app = FastAPI(title="ScrapeGraphAI Backend")
 
@@ -14,15 +8,17 @@ class ScrapeRequest(BaseModel):
     prompt: str
     url: str
+    scraper_type: str = "default"  # Optional field to select different scrapers
 
 @app.post("/scrape")
 async def scrape_endpoint(request: ScrapeRequest):
     """
-    FastAPI endpoint to run the scraper with the given prompt and URL.
+    FastAPI endpoint to run the scraper with the given prompt, URL, and scraper type.
     """
     try:
-        result = await run_scraper(prompt=request.prompt, source_url=request.url)
+        scraper_manager = ScraperManager(request.scraper_type)
+        result = await scraper_manager.run_scraper(prompt=request.prompt, source_url=request.url)
         return {"result": result}
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
diff --git a/backend/app/scrape_utils.py b/backend/app/scrape_utils.py
deleted file mode 100644
index 0b76e89..0000000
--- a/backend/app/scrape_utils.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from app.config import OPENAI_API_KEY
-import asyncio
-import json
-import subprocess
-import os
-
-def run_scraper_subprocess(prompt, source_url):
-    """
-    Run the scraper in a separate subprocess to avoid asyncio conflicts.
-
-    Args:
-        prompt (str): The prompt to pass to the LLM.
- source_url (str): The URL of the page to scrape. - - Returns: - dict: The result from the scraper. - """ - script_path = os.path.join(os.path.dirname(__file__), "subprocess_scraper.py") - - # Ensure the script exists - if not os.path.exists(script_path): - raise FileNotFoundError(f"Subprocess script not found: {script_path}") - # Path to the Python executable within the virtual environment - python_executable = os.path.join( - os.getenv("fintech-env") or "", "bin", "python3" - ) - # Fallback to system Python if VIRTUAL_ENV is not set - if not os.path.exists(python_executable): - python_executable = "python3" - - # Command to execute the scraper script in a subprocess - process = subprocess.run( - [ - "python3", - script_path, - json.dumps({"prompt": prompt, "source_url": source_url}), - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - ) - - if process.returncode != 0: - raise RuntimeError( - f"Subprocess failed with error: {process.stderr}" - ) - - return json.loads(process.stdout) - - - -async def run_scraper(prompt, source_url): - """ - Wrapper for the subprocess call to integrate with FastAPI's async system. 
-    """
-    loop = asyncio.get_event_loop()
-    return await loop.run_in_executor(
-        None, run_scraper_subprocess, prompt, source_url
-    )
\ No newline at end of file
diff --git a/backend/app/scrapers/__init__.py b/backend/app/scrapers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/backend/app/scrapers/company_scraper.py b/backend/app/scrapers/company_scraper.py
new file mode 100644
index 0000000..88c49ef
--- /dev/null
+++ b/backend/app/scrapers/company_scraper.py
@@ -0,0 +1,44 @@
+import sys
+import json
+import time
+from scrapegraphai.graphs import SearchGraph
+sys.path.append(sys.path[0] + "/../..")  # backend/ is two levels above scrapers/; works on any checkout
+from app.config import OPENAI_API_KEY
+
+def main():
+    input_data = json.loads(sys.argv[1])
+    prompt = input_data["prompt"]
+    source_url = input_data["source_url"]
+
+    graph_config = {
+        "llm": {
+            "api_key": OPENAI_API_KEY,
+            "model": "openai/gpt-4o-mini",
+        },
+    }
+
+    search_scraper_graph = SearchGraph(
+        prompt=prompt,
+        schema="default",
+        config=graph_config,
+    )
+
+    max_retries = 3  # Number of retry attempts
+    retry_delay = 5  # Seconds to wait between retries
+
+    for attempt in range(1, max_retries + 1):
+        try:
+            print(f"Attempt {attempt} of {max_retries}", file=sys.stderr)  # keep stdout JSON-only: parent parses it with json.loads
+            result = search_scraper_graph.run()
+            print(json.dumps(result))
+            break  # Exit the loop if successful
+        except Exception as e:
+            print(f"Attempt {attempt} failed: {str(e)}", file=sys.stderr)
+            if attempt < max_retries:
+                time.sleep(retry_delay)  # Wait before retrying
+            else:
+                print("All attempts failed.", file=sys.stderr)
+                sys.exit(1)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/backend/app/subprocess_scraper.py b/backend/app/scrapers/default_scraper.py
similarity index 53%
rename from backend/app/subprocess_scraper.py
rename to backend/app/scrapers/default_scraper.py
index 154651a..c149bd3 100644
--- a/backend/app/subprocess_scraper.py
+++ b/backend/app/scrapers/default_scraper.py
@@ -1,26 +1,14 @@
 import sys
-import os import json from scrapegraphai.graphs import SmartScraperGraph - -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - from app.config import OPENAI_API_KEY +#this is the default scraper that will be called, this can be used as a template for other scrapers def main(): - # Parse arguments passed to the subprocess - try: - input_data = json.loads(sys.argv[1]) - except Exception as e: - print(f"Error parsing input data: {str(e)}", file=sys.stderr) - sys.exit(1) + input_data = json.loads(sys.argv[1]) prompt = input_data["prompt"] source_url = input_data["source_url"] - if not prompt or not source_url: - print("Error: Missing required arguments 'prompt' or 'source_url'.", file=sys.stderr) - sys.exit(1) - graph_config = { "llm": { "api_key": OPENAI_API_KEY, @@ -28,7 +16,6 @@ def main(): }, } - # Run the scraper smart_scraper_graph = SmartScraperGraph( prompt=prompt, source=source_url, @@ -36,7 +23,7 @@ def main(): ) try: - result = smart_scraper_graph.run() # Run synchronously in the subprocess + result = smart_scraper_graph.run() print(json.dumps(result)) except Exception as e: print(f"Error: {str(e)}", file=sys.stderr) diff --git a/backend/app/utils/__init__.py b/backend/app/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/app/utils/scraper_manager.py b/backend/app/utils/scraper_manager.py new file mode 100644 index 0000000..7838d29 --- /dev/null +++ b/backend/app/utils/scraper_manager.py @@ -0,0 +1,21 @@ +import asyncio +from app.utils.subprocess_manager import SubprocessManager +# this is the class that will manage the different scrapers +class ScraperManager: + def __init__(self, scraper_type="default"): + self.scraper_type = scraper_type + self.scraper_scripts = { + "default": "default_scraper.py", + "advanced": "advanced_scraper.py", + "company": "company_scraper.py", + } + + async def run_scraper(self, prompt, source_url): + """ + Run the selected scraper asynchronously. 
+        """
+        if self.scraper_type not in self.scraper_scripts:
+            raise ValueError(f"Unknown scraper type: {self.scraper_type}")
+
+        subprocess_manager = SubprocessManager(self.scraper_scripts[self.scraper_type])
+        return await subprocess_manager.run(prompt, source_url)
diff --git a/backend/app/utils/subprocess_manager.py b/backend/app/utils/subprocess_manager.py
new file mode 100644
index 0000000..8dedd2a
--- /dev/null
+++ b/backend/app/utils/subprocess_manager.py
@@ -0,0 +1,43 @@
+import os
+import subprocess
+import json
+import asyncio
+#this is the class that manages the subprocesses, it will be used to run the scraper scripts
+class SubprocessManager:
+    def __init__(self, script_name):
+        self.script_path = os.path.join(os.path.dirname(__file__), "..", "scrapers", script_name)
+        if not os.path.exists(self.script_path):
+            raise FileNotFoundError(f"Subprocess script not found: {self.script_path}")
+
+    def run_subprocess(self, prompt, source_url):
+        """
+        Run the subprocess for the specified scraper script.
+        """
+        python_executable = os.path.join(
+            os.getenv("VIRTUAL_ENV") or "", "bin", "python3"
+        )
+        if not os.path.exists(python_executable):
+            python_executable = "python3"
+
+        process = subprocess.run(
+            [
+                python_executable,
+                self.script_path,
+                json.dumps({"prompt": prompt, "source_url": source_url}),
+            ],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+
+        if process.returncode != 0:
+            raise RuntimeError(f"Subprocess failed with error: {process.stderr}")
+
+        return json.loads(process.stdout)
+
+    async def run(self, prompt, source_url):
+        """
+        Async wrapper for subprocess execution.
+        """
+        loop = asyncio.get_running_loop()  # get_event_loop() is deprecated inside a running coroutine
+        return await loop.run_in_executor(None, self.run_subprocess, prompt, source_url)