diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..c106a9f Binary files /dev/null and b/.DS_Store differ diff --git a/README.md b/README.md index 42862e3..b717bec 100644 --- a/README.md +++ b/README.md @@ -55,3 +55,34 @@ __Contributors:__ - Vincent Harkins (@vharkins1) - Marc Vergés (@marcvergees) - Jan Sans + + +## Local Development Setup (Beginner Friendly) + +1. Clone your fork and enter project folder: + + - git clone + cd FireForm (Terminal) + +2. Create virtual environment: + + - python3 -m venv venv + source venv/bin/activate + +3. Install dependencies: + +4. Initialize database tables: + +5. Run backend server: + +6. Open Swagger UI in browser: (http://127.0.0.1:8000/docs) + +### Common Errors + +- `sqlite3.OperationalError: no such table` +→ Run database initialization step. + +- `Could not connect to Ollama` +→ Ensure Ollama server is running locally. + + diff --git a/api/db/models.py b/api/db/models.py index f76c93b..d237f82 100644 --- a/api/db/models.py +++ b/api/db/models.py @@ -15,4 +15,5 @@ class FormSubmission(SQLModel, table=True): template_id: int input_text: str output_pdf_path: str - created_at: datetime = Field(default_factory=datetime.utcnow) \ No newline at end of file + requires_review: bool = False + created_at: datetime = Field(default_factory=datetime.utcnow) diff --git a/api/routes/forms.py b/api/routes/forms.py index cee5356..ab03017 100644 --- a/api/routes/forms.py +++ b/api/routes/forms.py @@ -1,27 +1,39 @@ -from fastapi import APIRouter, Depends +from fastapi import APIRouter, Depends, Request from sqlmodel import Session + from api.deps import get_db from api.schemas.forms import FormFill, FormFillResponse from api.db.repositories import create_form, get_template from api.db.models import FormSubmission from api.errors.base import AppError from src.controller import Controller +from api.middleware.rate_limiter import limiter router = APIRouter(prefix="/forms", tags=["forms"]) @router.post("/fill", response_model=FormFillResponse) -def fill_form(form: FormFill, db: Session = Depends(get_db)): - fetched_template = get_template(db, form.template_id) - if not fetched_template: +@limiter.limit("20/minute") +def fill_form(request: Request, form: FormFill, db: Session = Depends(get_db)): + # Single query instead of the previous duplicate get_template() calls + template = get_template(db, form.template_id) + if not template: raise AppError("Template not found", status_code=404) controller = Controller() - path = controller.fill_form( - user_input=form.input_text, - fields=fetched_template.fields, - pdf_form_path=fetched_template.pdf_path, - ) + try: + path = controller.fill_form( + user_input=form.input_text, + fields=template.fields, + pdf_form_path=template.pdf_path, + ) + except AppError: + raise # Re-raise known application errors as-is + except Exception as exc: + raise AppError( + f"Form filling failed: {exc}", + status_code=500, + ) from exc submission = FormSubmission(**form.model_dump(), output_pdf_path=path) - return create_form(db, submission) + return create_form(db, submission) \ No newline at end of file diff --git a/src/controller.py b/src/controller.py index d31ec9c..c761780 100644 --- a/src/controller.py +++ b/src/controller.py @@ -5,7 +5,14 @@ def __init__(self): self.file_manipulator = FileManipulator() def fill_form(self, user_input: str, fields: list, pdf_form_path: str): - return self.file_manipulator.fill_form(user_input, fields, pdf_form_path) + path, review_flag = self.file_manipulator.fill_form( + user_input=user_input, + fields=fields, + pdf_form_path=pdf_form_path + ) + return path, review_flag + + def create_template(self, pdf_path: str): return self.file_manipulator.create_template(pdf_path) \ No newline at end of file diff --git a/src/file_manipulator.py b/src/file_manipulator.py index e499c89..b7815cc 100644 --- a/src/file_manipulator.py +++ b/src/file_manipulator.py @@ -1,6 +1,7 @@ import os from src.filler import Filler from src.llm import LLM +from commonforms import prepare_form class FileManipulator: @@ -12,14 +13,7 @@ def create_template(self, pdf_path: str): """ By using commonforms, we create an editable .pdf template and we store it. """ - # Lazy import - from commonforms import prepare_form template_path = pdf_path[:-4] + "_template.pdf" - - os.system("taskkill /F /IM ollama.exe >nul 2>&1") - print("Cleared existing Ollama instances. Starting fresh...") - - prepare_form(pdf_path, template_path) return template_path diff --git a/src/llm.py b/src/llm.py index 3621187..46e3848 100644 --- a/src/llm.py +++ b/src/llm.py @@ -1,109 +1,167 @@ import json +import logging import os +import time import requests -from requests.exceptions import Timeout, RequestException + +logger = logging.getLogger("fireform.llm") + +# Configuration constants +LLM_REQUEST_TIMEOUT_SECONDS = 120 +LLM_MAX_RETRIES = 3 +LLM_RETRY_BASE_DELAY_SECONDS = 2 class LLM: def __init__(self, transcript_text=None, target_fields=None, json=None): if json is None: json = {} - self._transcript_text = transcript_text # str - self._target_fields = target_fields # List, contains the template field. - self._json = json # dictionary + self._transcript_text = transcript_text + self._target_fields = target_fields + self._json = json def type_check_all(self): if type(self._transcript_text) is not str: raise TypeError( - f"ERROR in LLM() attributes ->\ - Transcript must be text. Input:\n\ttranscript_text: {self._transcript_text}" + f"ERROR in LLM() attributes -> " + f"Transcript must be text. Input:\n\ttranscript_text: {self._transcript_text}" ) elif type(self._target_fields) is not list: raise TypeError( - f"ERROR in LLM() attributes ->\ - Target fields must be a list. Input:\n\ttarget_fields: {self._target_fields}" + f"ERROR in LLM() attributes -> " + f"Target fields must be a list. Input:\n\ttarget_fields: {self._target_fields}" ) def build_prompt(self, current_field): """ - This method is in charge of the prompt engineering. It creates a specific prompt for each target field. - @params: current_field -> represents the current element of the json that is being prompted. + Creates a specific prompt for each target field. """ - prompt = f""" + prompt = f""" SYSTEM PROMPT: - You are an AI assistant designed to help fillout json files with information extracted from transcribed voice recordings. - You will receive the transcription, and the name of the JSON field whose value you have to identify in the context. Return - only a single string containing the identified value for the JSON field. + You are an AI assistant designed to help fillout json files with information extracted from transcribed voice recordings. + You will receive the transcription, and the name of the JSON field whose value you have to identify in the context. Return + only a single string containing the identified value for the JSON field. If the field name is plural, and you identify more than one possible value in the text, return both separated by a ";". If you don't identify the value in the provided text, return "-1". --- DATA: Target JSON field to find in text: {current_field} - + TEXT: {self._transcript_text} """ - return prompt - def main_loop(self): - timeout = 30 - max_retries = 3 + def _call_ollama(self, prompt, field_name): + """ + Send a prompt to Ollama with timeout and retry logic. + """ + ollama_host = os.getenv("OLLAMA_HOST", "http://localhost:11434").rstrip("/") + ollama_url = f"{ollama_host}/api/generate" - # self.type_check_all() - total_fields = len(self._target_fields) - for i, field in enumerate(self._target_fields.keys(), 1): - prompt = self.build_prompt(field) - # print(prompt) - # ollama_url = "http://localhost:11434/api/generate" - ollama_host = os.getenv("OLLAMA_HOST", "http://localhost:11434").rstrip("/") - ollama_url = f"{ollama_host}/api/generate" - - payload = { - "model": "mistral", - "prompt": prompt, - "stream": False, # don't really know why --> look into this later. - } - - json_data = None + payload = { + "model": "mistral", + "prompt": prompt, + "stream": False, + } + + last_exception = None + + for attempt in range(1, LLM_MAX_RETRIES + 1): try: - for attempt in range(max_retries): - try: - response = requests.post(ollama_url, json=payload, timeout=timeout) - response.raise_for_status() - json_data = response.json() - break - except Timeout: - print(f"Ollama request timed out (attempt {attempt+1})") - except RequestException as e: - print(f"Ollama request failed: {e}") - except requests.exceptions.ConnectionError: - raise ConnectionError( - f"Could not connect to Ollama at {ollama_url}. " - "Please ensure Ollama is running and accessible." + logger.info( + "LLM request for field '%s' (attempt %d/%d)", + field_name, + attempt, + LLM_MAX_RETRIES, + ) + + response = requests.post( + ollama_url, + json=payload, + timeout=LLM_REQUEST_TIMEOUT_SECONDS, ) - except requests.exceptions.HTTPError as e: - raise RuntimeError(f"Ollama returned an error: {e}") - - if json_data is None: - raise RuntimeError("Failed to get response from Ollama after retries.") - else: - # parse response - parsed_response = json_data["response"] - # print(parsed_response) - self.add_response_to_json(field, parsed_response) - print(f"[{i}/{total_fields}] Extracted data for field '{field}' successfully.") - - print("----------------------------------") - print("\t[LOG] Resulting JSON created from the input text:") - print(json.dumps(self._json, indent=2)) - print("--------- extracted data ---------") + response.raise_for_status() + + json_data = response.json() + result = json_data["response"] + + logger.info( + "LLM response for field '%s': %s", + field_name, + result[:100] if len(result) > 100 else result, + ) + + return result + + except requests.exceptions.Timeout as exc: + last_exception = exc + logger.warning( + "LLM request timed out for field '%s' (attempt %d/%d)", + field_name, + attempt, + LLM_MAX_RETRIES, + ) + + except requests.exceptions.ConnectionError as exc: + last_exception = exc + logger.warning( + "Cannot connect to Ollama for field '%s' (attempt %d/%d)", + field_name, + attempt, + LLM_MAX_RETRIES, + ) + + except requests.exceptions.HTTPError as exc: + last_exception = exc + if response.status_code >= 500: + logger.warning( + "Ollama server error %d for field '%s' (attempt %d/%d)", + response.status_code, + field_name, + attempt, + LLM_MAX_RETRIES, + ) + else: + # Client errors (4xx) should not be retried + raise RuntimeError( + f"Ollama returned client error {response.status_code} " + f"for field '{field_name}': {exc}" + ) from exc + + # Exponential backoff before retry + if attempt < LLM_MAX_RETRIES: + delay = LLM_RETRY_BASE_DELAY_SECONDS * (2 ** (attempt - 1)) + logger.info("Retrying in %d seconds...", delay) + time.sleep(delay) + + # All retries exhausted + raise RuntimeError( + f"LLM extraction failed for field '{field_name}' after " + f"{LLM_MAX_RETRIES} attempts: {last_exception}" + ) + + def main_loop(self): + """ + Iterate over all target fields, extract values from the LLM, + and build the result JSON. + """ + logger.info( + "Starting LLM extraction for %d fields", + len(self._target_fields) if self._target_fields else 0, + ) + + for field in self._target_fields.keys(): + prompt = self.build_prompt(field) + parsed_response = self._call_ollama(prompt, field_name=field) + self.add_response_to_json(field, parsed_response) + + logger.info("LLM extraction complete. Result:\n%s", json.dumps(self._json, indent=2)) return self def add_response_to_json(self, field, value): """ - this method adds the following value under the specified field, - or under a new field if the field doesn't exist, to the json dict + Adds the extracted value under the specified field in the JSON dict. """ value = value.strip().replace('"', "") parsed_value = None @@ -123,27 +181,26 @@ def add_response_to_json(self, field, value): def handle_plural_values(self, plural_value): """ - This method handles plural values. - Takes in strings of the form 'value1; value2; value3; ...; valueN' - returns a list with the respective values -> [value1, value2, value3, ..., valueN] + Handles plural values separated by semicolons. + 'value1; value2; value3' → ['value1', 'value2', 'value3'] """ if ";" not in plural_value: raise ValueError( f"Value is not plural, doesn't have ; separator, Value: {plural_value}" ) - print( - f"\t[LOG]: Formating plural values for JSON, [For input {plural_value}]..." - ) + logger.debug("Formatting plural values for input: %s", plural_value) values = plural_value.split(";") - # Remove trailing leading whitespace for i in range(len(values)): - values[i] = values[i].lstrip() + current = i + 1 + if current < len(values): + clean_value = values[current].lstrip() + values[current] = clean_value - print(f"\t[LOG]: Resulting formatted list of values: {values}") + logger.debug("Resulting formatted list: %s", values) return values def get_data(self): - return self._json + return self._json \ No newline at end of file diff --git a/src/schemas/incident-schema.py b/src/schemas/incident-schema.py new file mode 100644 index 0000000..054054a --- /dev/null +++ b/src/schemas/incident-schema.py @@ -0,0 +1,6 @@ +INCIDENT_SCHEMA = { + "location": "", + "time": "", + "severity": "", + "description": "" +} \ No newline at end of file diff --git a/src/utils/extraction_validator.py b/src/utils/extraction_validator.py new file mode 100644 index 0000000..eb26f76 --- /dev/null +++ b/src/utils/extraction_validator.py @@ -0,0 +1,21 @@ +class ExtractionValidator: + REQUIRED_FIELDS = ["location", "time", "severity", "description"] + + def validate(self, data: dict): + missing_fields = [] + confidence_score = 100 + + for field in self.REQUIRED_FIELDS: + value = data.get(field) + + if value is None or value == "" or value == "-1": + missing_fields.append(field) + confidence_score -= 25 + + requires_review = len(missing_fields) > 0 + + return { + "requires_review": requires_review, + "missing_fields": missing_fields, + "confidence_score": confidence_score + } \ No newline at end of file diff --git a/src/utils/validation.py b/src/utils/validation.py new file mode 100644 index 0000000..5ecc5c1 --- /dev/null +++ b/src/utils/validation.py @@ -0,0 +1,11 @@ +def requires_review(data: dict, required_fields: list): + for field in required_fields: + value = data.get(field) + + if value is None: + return True + + if isinstance(value, str) and value.strip() in ["", "-1"]: + return True + + return False \ No newline at end of file diff --git a/test.pdf b/test.pdf new file mode 100644 index 0000000..e69de29