177 changes: 177 additions & 0 deletions backend/README.md
@@ -0,0 +1,177 @@
# Backend Setup and API Testing Guide

This guide explains how to set up the backend environment, run the server, and test the endpoints for the project.

## Prerequisites

- **Python**: Ensure you have Python 3.10 or later installed.
- **pip**: Included with Python 3.4+. If it is missing, install it with `python -m ensurepip --upgrade`.
- **Postman** (or cURL): For API testing.
- **Git**: For cloning the repository.

## Steps to Set Up and Run the Backend Server

### 1. Clone the Repository

```bash
git clone https://github.com/your-repo/fintech-hackathon.git
cd fintech-hackathon/backend
```

### 2. Create a Virtual Environment

Create and activate a Python virtual environment to isolate dependencies.

#### macOS/Linux:

```bash
python3 -m venv fintech-env
source fintech-env/bin/activate
```

#### Windows:

```bash
python -m venv fintech-env
fintech-env\Scripts\activate
```

### 3. Install Dependencies

Install all required Python libraries:

```bash
pip install -r requirements.txt
```

### 4. Set Up Environment Variables

Create a `.env` file in the `backend/` directory with the following content:

```
OPENAI_API_KEY=<your-openai-api-key>
```

Replace `<your-openai-api-key>` with your actual OpenAI API key.
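For reference, the backend reads this key at startup via `app/config.py`. A minimal, dependency-free sketch of how that file might load the `.env` (the real project may use the `python-dotenv` package instead; `load_env` is a hypothetical helper, not confirmed by this repo):

```python
import os
from pathlib import Path


def load_env(path: str = ".env") -> dict:
    """Parse simple KEY=VALUE lines, ignoring blanks and # comments."""
    env = {}
    p = Path(path)
    if p.exists():
        for line in p.read_text().splitlines():
            line = line.strip()
            if line and not line.startswith("#") and "=" in line:
                key, _, value = line.partition("=")
                env[key.strip()] = value.strip()
    return env


# Prefer the .env value, but fall back to an exported environment variable.
OPENAI_API_KEY = load_env().get("OPENAI_API_KEY") or os.getenv("OPENAI_API_KEY")
```

If the key is absent, requests to `/scrape` will fail with a 500 error (see Troubleshooting below).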

### 5. Run the Backend Server

Start the FastAPI server:

```bash
uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
```

The server listens on all interfaces; locally it is available at `http://127.0.0.1:8000`, with FastAPI's interactive API docs at `http://127.0.0.1:8000/docs`.

---

## Testing the API Endpoints

You can test the API endpoints using **Postman** or **cURL**.

### 1. Using Postman

1. Open Postman and create a new **POST** request.
2. Set the URL to:

   ```
   http://127.0.0.1:8000/scrape
   ```

3. In the **Body** tab, select `raw` and set the content type to `JSON`.
4. Enter the following JSON payload:

   ```json
   {
     "prompt": "List all projects with their description.",
     "url": "https://perinim.github.io/projects/"
   }
   ```

5. Click **Send**.

6. You should receive a response similar to this:

   ```json
   {
     "result": [
       {
         "name": "Project A",
         "description": "Description of Project A",
         "link": "https://example.com/project-a"
       },
       {
         "name": "Project B",
         "description": "Description of Project B",
         "link": "https://example.com/project-b"
       }
     ]
   }
   ```

### 2. Using cURL

Alternatively, you can use `cURL` to test the endpoint:

```bash
curl -X POST http://127.0.0.1:8000/scrape \
  -H "Content-Type: application/json" \
  -d '{"prompt": "List all projects with their description.", "url": "https://perinim.github.io/projects/"}'
```

Expected output:

```json
{
  "result": [
    {
      "name": "Project A",
      "description": "Description of Project A",
      "link": "https://example.com/project-a"
    },
    {
      "name": "Project B",
      "description": "Description of Project B",
      "link": "https://example.com/project-b"
    }
  ]
}
```
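The same request can also be scripted in Python using only the standard library, which is handy for smoke tests. `post_scrape` is an illustrative helper, not part of the backend, and assumes the server is running on port 8000:

```python
import json
import urllib.request


def post_scrape(prompt: str, url: str,
                endpoint: str = "http://127.0.0.1:8000/scrape") -> dict:
    """POST a scrape request to the backend and return the decoded JSON response."""
    body = json.dumps({"prompt": prompt, "url": url}).encode()
    req = urllib.request.Request(
        endpoint, data=body, headers={"Content-Type": "application/json"}
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read().decode())
```

Example use: `post_scrape("List all projects with their description.", "https://perinim.github.io/projects/")` returns the same `{"result": [...]}` payload shown above.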

---

## Troubleshooting

### Common Issues

1. **Virtual Environment Not Found**:
   - Ensure the virtual environment was created and activated correctly.

2. **ModuleNotFoundError**:
   - Check that all dependencies are installed by running:
     ```bash
     pip install -r requirements.txt
     ```

3. **500 Internal Server Error**:
   - Verify your `.env` file contains a valid `OPENAI_API_KEY`.

4. **Subprocess Errors**:
   - Ensure the `subprocess_scraper.py` file is in the correct directory (`backend/app/`).

---

## Project Structure

```
backend/
├── app/
│   ├── __init__.py
│   ├── config.py
│   ├── main.py
│   ├── scrape_utils.py
│   └── subprocess_scraper.py
├── fintech-env/        # Virtual environment (not in Git)
├── requirements.txt
└── README.md
```

---
16 changes: 6 additions & 10 deletions backend/app/main.py
@@ -1,28 +1,24 @@
 # backend/app/main.py

-from fastapi import FastAPI, HTTPException, Body
+from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-from typing import List
-from app.scrape_utils import run_scraper
-import tracemalloc
-## trace issue
-tracemalloc.start()
+from app.utils.scraper_manager import ScraperManager

 app = FastAPI(title="ScrapeGraphAI Backend")


 class ScrapeRequest(BaseModel):
     prompt: str
     url: str
+    scraper_type: str = "default"  # Optional field to select different scrapers


 @app.post("/scrape")
 async def scrape_endpoint(request: ScrapeRequest):
     """
-    FastAPI endpoint to run the scraper with the given prompt and URL.
+    FastAPI endpoint to run the scraper with the given prompt, URL, and scraper type.
     """
     try:
-        result = await run_scraper(prompt=request.prompt, source_url=request.url)
+        scraper_manager = ScraperManager(request.scraper_type)
+        result = await scraper_manager.run_scraper(prompt=request.prompt, source_url=request.url)
         return {"result": result}
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
59 changes: 0 additions & 59 deletions backend/app/scrape_utils.py

This file was deleted.

Empty file.
44 changes: 44 additions & 0 deletions backend/app/scrapers/company_scraper.py
@@ -0,0 +1,44 @@
import sys
import json
import time
from scrapegraphai.graphs import SearchGraph

sys.path.append('/Users/kaungzinye/Documents/SWE/fintech-hackathon/backend')
from app.config import OPENAI_API_KEY


def main():
    input_data = json.loads(sys.argv[1])
    prompt = input_data["prompt"]
    source_url = input_data["source_url"]

    graph_config = {
        "llm": {
            "api_key": OPENAI_API_KEY,
            "model": "openai/gpt-4o-mini",
        },
    }

    search_scraper_graph = SearchGraph(
        prompt=prompt,
        schema="default",
        config=graph_config,
    )

    max_retries = 3  # Number of retry attempts
    retry_delay = 5  # Seconds to wait between retries

    for attempt in range(1, max_retries + 1):
        try:
            print(f"Attempt {attempt} of {max_retries}")
            result = search_scraper_graph.run()
            print(json.dumps(result))
            break  # Exit the loop if successful
        except Exception as e:
            print(f"Attempt {attempt} failed: {str(e)}", file=sys.stderr)
            if attempt < max_retries:
                time.sleep(retry_delay)  # Wait before retrying
            else:
                print("All attempts failed.", file=sys.stderr)
                sys.exit(1)


if __name__ == "__main__":
    main()
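The retry loop in `company_scraper.py` follows a common pattern. Factored into a reusable helper with optional exponential backoff, it might look like the sketch below (not part of this PR; `retry` and its parameters are hypothetical names):

```python
import time


def retry(fn, attempts: int = 3, base_delay: float = 5.0, backoff: float = 1.0):
    """Call fn() up to `attempts` times, sleeping between failures.

    With backoff == 1.0 the delay is constant (matching the loop above);
    backoff > 1.0 makes the delay grow exponentially per attempt.
    """
    for attempt in range(1, attempts + 1):
        try:
            return fn()
        except Exception:
            if attempt == attempts:
                raise  # Out of attempts: surface the last error to the caller
            time.sleep(base_delay * (backoff ** (attempt - 1)))
```

The scraper could then call `retry(search_scraper_graph.run)` instead of hand-rolling the loop.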
@@ -1,42 +1,29 @@
 import sys
 import os
 import json
 from scrapegraphai.graphs import SmartScraperGraph

 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

 from app.config import OPENAI_API_KEY

 # This is the default scraper that will be called; it can be used as a template for other scrapers.
 def main():
-    # Parse arguments passed to the subprocess
-    try:
-        input_data = json.loads(sys.argv[1])
-    except Exception as e:
-        print(f"Error parsing input data: {str(e)}", file=sys.stderr)
-        sys.exit(1)
+    input_data = json.loads(sys.argv[1])
     prompt = input_data["prompt"]
     source_url = input_data["source_url"]

-    if not prompt or not source_url:
-        print("Error: Missing required arguments 'prompt' or 'source_url'.", file=sys.stderr)
-        sys.exit(1)
-
     graph_config = {
         "llm": {
             "api_key": OPENAI_API_KEY,
             "model": "openai/gpt-4o-mini",
         },
     }

-    # Run the scraper
     smart_scraper_graph = SmartScraperGraph(
         prompt=prompt,
         source=source_url,
         config=graph_config,
     )

     try:
-        result = smart_scraper_graph.run()  # Run synchronously in the subprocess
+        result = smart_scraper_graph.run()
         print(json.dumps(result))
     except Exception as e:
         print(f"Error: {str(e)}", file=sys.stderr)
Empty file added backend/app/utils/__init__.py
Empty file.
21 changes: 21 additions & 0 deletions backend/app/utils/scraper_manager.py
@@ -0,0 +1,21 @@
import asyncio
from app.utils.subprocess_manager import SubprocessManager


# This class manages the different scrapers.
class ScraperManager:
    def __init__(self, scraper_type="default"):
        self.scraper_type = scraper_type
        self.scraper_scripts = {
            "default": "default_scraper.py",
            "advanced": "advanced_scraper.py",
            "company": "company_scraper.py",
        }

    async def run_scraper(self, prompt, source_url):
        """
        Run the selected scraper asynchronously.
        """
        if self.scraper_type not in self.scraper_scripts:
            raise ValueError(f"Unknown scraper type: {self.scraper_type}")

        subprocess_manager = SubprocessManager(self.scraper_scripts[self.scraper_type])
        return await subprocess_manager.run(prompt, source_url)
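`SubprocessManager` is imported here but not shown in this diff. Assuming it launches the selected script with a JSON argument and parses the JSON printed on stdout (as the scraper scripts above suggest), a hypothetical sketch might look like this; the class name matches the import, but the constructor parameters and internals are assumptions:

```python
import asyncio
import json
import os
import sys


class SubprocessManager:
    """Hypothetical sketch: runs a scraper script as a subprocess and
    returns its JSON output. The real implementation is not in this diff."""

    def __init__(self, script_name: str, scripts_dir: str = "app/scrapers"):
        self.script_path = os.path.join(scripts_dir, script_name)

    async def run(self, prompt: str, source_url: str):
        payload = json.dumps({"prompt": prompt, "source_url": source_url})
        proc = await asyncio.create_subprocess_exec(
            sys.executable, self.script_path, payload,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await proc.communicate()
        if proc.returncode != 0:
            raise RuntimeError(stderr.decode() or "scraper failed")
        # Scrapers may log progress lines (e.g. "Attempt 1 of 3") before the
        # result, so parse only the last stdout line as JSON.
        return json.loads(stdout.decode().strip().splitlines()[-1])
```

Parsing only the last stdout line matters here because `company_scraper.py` prints attempt-counter lines before the JSON result.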