From 7d63bc83d0323c58b846683f5a879ff459e62db4 Mon Sep 17 00:00:00 2001 From: Kaung Z Y Naung Date: Mon, 6 Jan 2025 21:33:46 +0800 Subject: [PATCH 1/3] Added new structure for backend --- backend/README.md | 177 ++++++++++++++++++ backend/app/scrape_utils.py | 59 ------ backend/app/scrapers/__init__.py | 0 backend/app/scrapers/advanced_scraper.py | 0 backend/app/scrapers/default_scraper.py | 0 backend/app/utils/scraper_manager.py | 0 .../subprocess_manager.py} | 0 7 files changed, 177 insertions(+), 59 deletions(-) delete mode 100644 backend/app/scrape_utils.py create mode 100644 backend/app/scrapers/__init__.py create mode 100644 backend/app/scrapers/advanced_scraper.py create mode 100644 backend/app/scrapers/default_scraper.py create mode 100644 backend/app/utils/scraper_manager.py rename backend/app/{subprocess_scraper.py => utils/subprocess_manager.py} (100%) diff --git a/backend/README.md b/backend/README.md index e69de29..92718b3 100644 --- a/backend/README.md +++ b/backend/README.md @@ -0,0 +1,177 @@ +# Backend Setup and API Testing Guide + +This guide explains how to set up the backend environment, run the server, and test the endpoints for the project. + +## Prerequisites + +- **Python**: Ensure you have Python 3.10 or later installed. +- **pip**: Comes pre-installed with Python. If not, install it. +- **Postman** (or cURL): For API testing. +- **Git**: For cloning the repository. + +## Steps to Set Up and Run the Backend Server + +### 1. Clone the Repository + +```bash +git clone https://github.com/your-repo/fintech-hackathon.git +cd fintech-hackathon/backend +``` + +### 2. Create a Virtual Environment + +Create and activate a Python virtual environment to isolate dependencies. + +#### macOS/Linux: + +```bash +python3 -m venv fintech-env +source fintech-env/bin/activate +``` + +#### Windows: + +```bash +python -m venv fintech-env +fintech-env\Scripts\activate +``` + +### 3. 
Install Dependencies + +Install all required Python libraries: + +```bash +pip install -r requirements.txt +``` + +### 4. Set Up Environment Variables + +Create a `.env` file in the `backend/` directory with the following content: + +``` +OPENAI_API_KEY=your_openai_api_key_here +``` + +Replace `your_openai_api_key_here` with your actual OpenAI API key. + +### 5. Run the Backend Server + +Start the FastAPI server: + +```bash +uvicorn app.main:app --reload --host 0.0.0.0 --port 8000 +``` + +The server will be available at `http://127.0.0.1:8000`. + +--- + +## Testing the API Endpoints + +You can test the API endpoints using **Postman** or **cURL**. + +### 1. Using Postman + +1. Open Postman and create a new **POST** request. +2. Set the URL to: + ``` + http://127.0.0.1:8000/scrape + ``` +3. In the **Body** tab, select `raw` and set the content type to `JSON`. +4. Enter the following JSON payload: + + ```json + { + "prompt": "List all projects with their description.", + "url": "https://perinim.github.io/projects/" + } + ``` + +5. Click **Send**. + +6. You should receive a response similar to this: + + ```json + { + "result": [ + { + "name": "Project A", + "description": "Description of Project A", + "link": "https://example.com/project-a" + }, + { + "name": "Project B", + "description": "Description of Project B", + "link": "https://example.com/project-b" + } + ] + } + ``` + +### 2. Using cURL + +Alternatively, you can use `cURL` to test the endpoint: + +```bash +curl -X POST http://127.0.0.1:8000/scrape \ +-H "Content-Type: application/json" \ +-d '{"prompt": "List all projects with their description.", "url": "https://perinim.github.io/projects/"}' +``` + +Expected Output: +```json +{ + "result": [ + { + "name": "Project A", + "description": "Description of Project A", + "link": "https://example.com/project-a" + }, + { + "name": "Project B", + "description": "Description of Project B", + "link": "https://example.com/project-b" + } + ] +} +``` + +--- + +## Troubleshooting + +### Common Issues + +1. 
**Virtual Environment Not Found**: + - Ensure the virtual environment was created and activated correctly. + +2. **ModuleNotFoundError**: + - Check that all dependencies are installed by running: + ```bash + pip install -r requirements.txt + ``` + +3. **500 Internal Server Error**: + - Verify your `.env` file contains a valid `OPENAI_API_KEY`. + +4. **Subprocess Errors**: + - Ensure the `subprocess_scraper.py` file is in the correct directory (`backend/app/`). + +--- + +## Project Structure + +``` +backend/ +├── app/ +│ ├── __init__.py +│ ├── config.py +│ ├── main.py +│ ├── scrape_utils.py +│ └── subprocess_scraper.py +├── fintech-env/ # Virtual environment (not in Git) +├── requirements.txt +└── README.md +``` + +--- \ No newline at end of file diff --git a/backend/app/scrape_utils.py b/backend/app/scrape_utils.py deleted file mode 100644 index 0b76e89..0000000 --- a/backend/app/scrape_utils.py +++ /dev/null @@ -1,59 +0,0 @@ -from app.config import OPENAI_API_KEY -import asyncio -import json -import subprocess -import os - -def run_scraper_subprocess(prompt, source_url): - """ - Run the scraper in a separate subprocess to avoid asyncio conflicts. - - Args: - prompt (str): The prompt to pass to the LLM. - source_url (str): The URL of the page to scrape. - - Returns: - dict: The result from the scraper. 
- """ - script_path = os.path.join(os.path.dirname(__file__), "subprocess_scraper.py") - - # Ensure the script exists - if not os.path.exists(script_path): - raise FileNotFoundError(f"Subprocess script not found: {script_path}") - # Path to the Python executable within the virtual environment - python_executable = os.path.join( - os.getenv("fintech-env") or "", "bin", "python3" - ) - # Fallback to system Python if VIRTUAL_ENV is not set - if not os.path.exists(python_executable): - python_executable = "python3" - - # Command to execute the scraper script in a subprocess - process = subprocess.run( - [ - "python3", - script_path, - json.dumps({"prompt": prompt, "source_url": source_url}), - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - ) - - if process.returncode != 0: - raise RuntimeError( - f"Subprocess failed with error: {process.stderr}" - ) - - return json.loads(process.stdout) - - - -async def run_scraper(prompt, source_url): - """ - Wrapper for the subprocess call to integrate with FastAPI's async system. 
- """ - loop = asyncio.get_event_loop() - return await loop.run_in_executor( - None, run_scraper_subprocess, prompt, source_url - ) \ No newline at end of file diff --git a/backend/app/scrapers/__init__.py b/backend/app/scrapers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/app/scrapers/advanced_scraper.py b/backend/app/scrapers/advanced_scraper.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/app/scrapers/default_scraper.py b/backend/app/scrapers/default_scraper.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/app/utils/scraper_manager.py b/backend/app/utils/scraper_manager.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/app/subprocess_scraper.py b/backend/app/utils/subprocess_manager.py similarity index 100% rename from backend/app/subprocess_scraper.py rename to backend/app/utils/subprocess_manager.py From 795fb8a4199e9622bf6c75abe83df292d12d0b03 Mon Sep 17 00:00:00 2001 From: Kaung Z Y Naung Date: Mon, 6 Jan 2025 21:40:33 +0800 Subject: [PATCH 2/3] corrected subprocess manager.py --- backend/app/scrapers/default_scraper.py | 34 ++++++++++ backend/app/utils/subprocess_manager.py | 86 ++++++++++++------------- 2 files changed, 75 insertions(+), 45 deletions(-) diff --git a/backend/app/scrapers/default_scraper.py b/backend/app/scrapers/default_scraper.py index e69de29..08908ff 100644 --- a/backend/app/scrapers/default_scraper.py +++ b/backend/app/scrapers/default_scraper.py @@ -0,0 +1,34 @@ +import sys +import json +from scrapegraphai.graphs import SmartScraperGraph +from app.config import OPENAI_API_KEY + +#this is the default scraper that will be called, this can be used as a template for other scrapers +def main(): + input_data = json.loads(sys.argv[1]) + prompt = input_data["prompt"] + source_url = input_data["source_url"] + + graph_config = { + "llm": { + "api_key": OPENAI_API_KEY, + "model": "openai/gpt-4o", + }, + } + + smart_scraper_graph = SmartScraperGraph( + 
prompt=prompt, + source=source_url, + config=graph_config, + ) + + try: + result = smart_scraper_graph.run() + print(json.dumps(result)) + except Exception as e: + print(f"Error: {str(e)}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/backend/app/utils/subprocess_manager.py b/backend/app/utils/subprocess_manager.py index 154651a..1840c04 100644 --- a/backend/app/utils/subprocess_manager.py +++ b/backend/app/utils/subprocess_manager.py @@ -1,47 +1,43 @@ -import sys import os +import subprocess import json -from scrapegraphai.graphs import SmartScraperGraph - -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from app.config import OPENAI_API_KEY - -def main(): - # Parse arguments passed to the subprocess - try: - input_data = json.loads(sys.argv[1]) - except Exception as e: - print(f"Error parsing input data: {str(e)}", file=sys.stderr) - sys.exit(1) - prompt = input_data["prompt"] - source_url = input_data["source_url"] - - if not prompt or not source_url: - print("Error: Missing required arguments 'prompt' or 'source_url'.", file=sys.stderr) - sys.exit(1) - - graph_config = { - "llm": { - "api_key": OPENAI_API_KEY, - "model": "openai/gpt-4o-mini", - }, - } - - # Run the scraper - smart_scraper_graph = SmartScraperGraph( - prompt=prompt, - source=source_url, - config=graph_config, - ) - - try: - result = smart_scraper_graph.run() # Run synchronously in the subprocess - print(json.dumps(result)) - except Exception as e: - print(f"Error: {str(e)}", file=sys.stderr) - sys.exit(1) - - -if __name__ == "__main__": - main() +import asyncio + +class SubprocessManager: + def __init__(self, script_name): + self.script_path = os.path.join(os.path.dirname(__file__), "..", "scrapers", script_name) + if not os.path.exists(self.script_path): + raise FileNotFoundError(f"Subprocess script not found: {self.script_path}") + + def run_subprocess(self, prompt, source_url): + """ + Run the subprocess for the specified 
scraper script. + """ + python_executable = os.path.join( + os.getenv("VIRTUAL_ENV") or "", "bin", "python3" + ) + if not os.path.exists(python_executable): + python_executable = "python3" + + process = subprocess.run( + [ + python_executable, + self.script_path, + json.dumps({"prompt": prompt, "source_url": source_url}), + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + + if process.returncode != 0: + raise RuntimeError(f"Subprocess failed with error: {process.stderr}") + + return json.loads(process.stdout) + + async def run(self, prompt, source_url): + """ + Async wrapper for subprocess execution. + """ + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, self.run_subprocess, prompt, source_url) From c6b7a9f441013ba69fc6b07f6dbd75479873a82c Mon Sep 17 00:00:00 2001 From: Kaung Z Y Naung Date: Mon, 6 Jan 2025 22:15:01 +0800 Subject: [PATCH 3/3] Trying to fix search graph --- backend/app/main.py | 16 +++---- backend/app/scrapers/company_scraper.py | 44 +++++++++++++++++++ backend/app/scrapers/default_scraper.py | 2 +- .../advanced_scraper.py => utils/__init__.py} | 0 backend/app/utils/scraper_manager.py | 21 +++++++++ backend/app/utils/subprocess_manager.py | 2 +- 6 files changed, 73 insertions(+), 12 deletions(-) create mode 100644 backend/app/scrapers/company_scraper.py rename backend/app/{scrapers/advanced_scraper.py => utils/__init__.py} (100%) diff --git a/backend/app/main.py b/backend/app/main.py index 9f9209d..e1c14e9 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,12 +1,6 @@ -# backend/app/main.py - -from fastapi import FastAPI, HTTPException, Body +from fastapi import FastAPI, HTTPException from pydantic import BaseModel -from typing import List -from app.scrape_utils import run_scraper -import tracemalloc -## trace issue -tracemalloc.start() +from app.utils.scraper_manager import ScraperManager app = FastAPI(title="ScrapeGraphAI Backend") @@ -14,15 +8,17 @@ class ScrapeRequest(BaseModel): 
prompt: str url: str + scraper_type: str = "default" # Optional field to select different scrapers @app.post("/scrape") async def scrape_endpoint(request: ScrapeRequest): """ - FastAPI endpoint to run the scraper with the given prompt and URL. + FastAPI endpoint to run the scraper with the given prompt, URL, and scraper type. """ try: - result = await run_scraper(prompt=request.prompt, source_url=request.url) + scraper_manager = ScraperManager(request.scraper_type) + result = await scraper_manager.run_scraper(prompt=request.prompt, source_url=request.url) return {"result": result} except Exception as e: raise HTTPException(status_code=500, detail=str(e)) diff --git a/backend/app/scrapers/company_scraper.py b/backend/app/scrapers/company_scraper.py new file mode 100644 index 0000000..88c49ef --- /dev/null +++ b/backend/app/scrapers/company_scraper.py @@ -0,0 +1,44 @@ +import sys +import json +import time +from scrapegraphai.graphs import SearchGraph +sys.path.append('/Users/kaungzinye/Documents/SWE/fintech-hackathon/backend') +from app.config import OPENAI_API_KEY + +def main(): + input_data = json.loads(sys.argv[1]) + prompt = input_data["prompt"] + source_url = input_data["source_url"] + + graph_config = { + "llm": { + "api_key": OPENAI_API_KEY, + "model": "openai/gpt-4o-mini", + }, + } + + search_scraper_graph = SearchGraph( + prompt=prompt, + schema = "default", + config=graph_config, + ) + + max_retries = 3 # Number of retry attempts + retry_delay = 5 # Seconds to wait between retries + + for attempt in range(1, max_retries + 1): + try: + print(f"Attempt {attempt} of {max_retries}") + result = search_scraper_graph.run() + print(json.dumps(result)) + break # Exit the loop if successful + except Exception as e: + print(f"Attempt {attempt} failed: {str(e)}", file=sys.stderr) + if attempt < max_retries: + time.sleep(retry_delay) # Wait before retrying + else: + print("All attempts failed.", file=sys.stderr) + sys.exit(1) + +if __name__ == "__main__": + main() \ No 
newline at end of file diff --git a/backend/app/scrapers/default_scraper.py b/backend/app/scrapers/default_scraper.py index 08908ff..c149bd3 100644 --- a/backend/app/scrapers/default_scraper.py +++ b/backend/app/scrapers/default_scraper.py @@ -12,7 +12,7 @@ def main(): graph_config = { "llm": { "api_key": OPENAI_API_KEY, - "model": "openai/gpt-4o", + "model": "openai/gpt-4o-mini", }, } diff --git a/backend/app/scrapers/advanced_scraper.py b/backend/app/utils/__init__.py similarity index 100% rename from backend/app/scrapers/advanced_scraper.py rename to backend/app/utils/__init__.py diff --git a/backend/app/utils/scraper_manager.py b/backend/app/utils/scraper_manager.py index e69de29..7838d29 100644 --- a/backend/app/utils/scraper_manager.py +++ b/backend/app/utils/scraper_manager.py @@ -0,0 +1,21 @@ +import asyncio +from app.utils.subprocess_manager import SubprocessManager +# this is the class that will manage the different scrapers +class ScraperManager: + def __init__(self, scraper_type="default"): + self.scraper_type = scraper_type + self.scraper_scripts = { + "default": "default_scraper.py", + "advanced": "advanced_scraper.py", + "company": "company_scraper.py", + } + + async def run_scraper(self, prompt, source_url): + """ + Run the selected scraper asynchronously. 
+ """ + if self.scraper_type not in self.scraper_scripts: + raise ValueError(f"Unknown scraper type: {self.scraper_type}") + + subprocess_manager = SubprocessManager(self.scraper_scripts[self.scraper_type]) + return await subprocess_manager.run(prompt, source_url) diff --git a/backend/app/utils/subprocess_manager.py b/backend/app/utils/subprocess_manager.py index 1840c04..8dedd2a 100644 --- a/backend/app/utils/subprocess_manager.py +++ b/backend/app/utils/subprocess_manager.py @@ -2,7 +2,7 @@ import subprocess import json import asyncio - +#this is the class that manages the subprocesses, it will be used to run the scraper scripts class SubprocessManager: def __init__(self, script_name): self.script_path = os.path.join(os.path.dirname(__file__), "..", "scrapers", script_name)