Add method name to Performance object

gabriel-piles · gabriel-piles · commit ee19c8c15609 · 2025-10-07T10:48:14.000+02:00
diff --git a/src/trainable_entity_extractor/adapters/LocalJobExecutor.py b/src/trainable_entity_extractor/adapters/LocalJobExecutor.py
@@ -1,3 +1,6 @@
+from datetime import timedelta, datetime
+from pathlib import Path
+import shutil
 from typing import Tuple, List
 
 from trainable_entity_extractor.adapters.extractors.pdf_to_multi_option_extractor.PdfToMultiOptionExtractor import (
@@ -29,10 +32,26 @@ class LocalJobExecutor(JobExecutor):
         TextToTextExtractor,
     ]
 
+    @staticmethod
+    def ensure_fresh_model_folder(extraction_identifier: ExtractionIdentifier, max_age_hours: int = 1) -> None:
+        """Create the model folder, first wiping it when its mtime is older than max_age_hours."""
+        path = Path(extraction_identifier.get_path())
+        try:
+            # Epoch seconds on both sides: naive local datetimes are off by an hour across DST changes.
+            age = timedelta(seconds=datetime.now().timestamp() - path.stat().st_mtime)
+        except FileNotFoundError:
+            path.mkdir(parents=True, exist_ok=True)  # Missing folder (or removed meanwhile): just create it.
+            return
+
+        if age > timedelta(hours=max_age_hours):
+            shutil.rmtree(path)
+            path.mkdir(parents=True, exist_ok=True)
+
     def start_performance_evaluation(
         self, extraction_identifier: ExtractionIdentifier, distributed_sub_job: DistributedSubJob
     ):
         try:
+            self.ensure_fresh_model_folder(extraction_identifier)
             extraction_data = self.data_retriever.get_extraction_data(extraction_identifier)
             if not extraction_data:
                 distributed_sub_job.status = JobStatus.FAILURE
@@ -67,9 +86,6 @@ def start_performance_evaluation(
             distributed_sub_job.status = JobStatus.FAILURE
             return None
 
-        distributed_sub_job.status = JobStatus.FAILURE
-        return None
-
     def upload_model(self, extraction_identifier: ExtractionIdentifier, extractor_job: TrainableEntityExtractorJob) -> bool:
         try:
             extraction_identifier.clean_extractor_folder(extractor_job.method_name)
diff --git a/src/trainable_entity_extractor/domain/Performance.py b/src/trainable_entity_extractor/domain/Performance.py
@@ -2,6 +2,7 @@
 
 
 class Performance(BaseModel):
+    method_name: str = "Unknown Method"
     performance: float = 0.0
     execution_seconds: int = 0
     is_perfect: bool = False
diff --git a/src/trainable_entity_extractor/domain/PerformanceSummary.py b/src/trainable_entity_extractor/domain/PerformanceSummary.py
@@ -91,17 +91,19 @@ def from_distributed_job(distributed_job: DistributedJob) -> "PerformanceSummary
         testing_samples_count = 0
         training_samples_count = 0
         options_count = 0
+        extractor_name = "Unknown Extractor"
 
         for sub_job in distributed_job.sub_jobs:
             if not sub_job.result:
                 continue
             testing_samples_count = sub_job.result.testing_samples_count
             training_samples_count = sub_job.result.training_samples_count
             options_count = len(sub_job.extractor_job.options) if sub_job.extractor_job.options else 0
+            extractor_name = sub_job.extractor_job.extractor_name
 
         return PerformanceSummary(
             extraction_identifier=distributed_job.extraction_identifier,
-            extractor_name="Performance Evaluation",
+            extractor_name=extractor_name,
             samples_count=0,
             options_count=options_count,
             languages=[],
diff --git a/src/trainable_entity_extractor/ports/ExtractorBase.py b/src/trainable_entity_extractor/ports/ExtractorBase.py
@@ -125,14 +125,14 @@ def get_performance(self, extractor_job: TrainableEntityExtractorJob, extraction
         method_instance = self.get_method_instance_by_name(method_name)
         if not method_instance:
             self.logger.log(extraction_data.extraction_identifier, f"Method {method_name} not found")
-            return Performance(failed=True)
+            return Performance(method_name=method_name, failed=True)
 
         if hasattr(method_instance, "can_be_used"):
             if not method_instance.can_be_used(extraction_data):
                 self.logger.log(
                     extraction_data.extraction_identifier, f"Method {method_name} cannot be used with current data"
                 )
-                return Performance(failed=True)
+                return Performance(method_name=method_name, failed=True)
 
         self.logger.log(extraction_data.extraction_identifier, f"\nChecking {method_name}")
 
@@ -145,6 +145,7 @@ def get_performance(self, extractor_job: TrainableEntityExtractorJob, extraction
             is_perfect = performance_score >= 99.99
 
             return Performance(
+                method_name=method_name,
                 performance=performance_score,
                 execution_seconds=execution_time,
                 is_perfect=is_perfect,
@@ -157,7 +158,7 @@ def get_performance(self, extractor_job: TrainableEntityExtractorJob, extraction
             self.logger.log(extraction_data.extraction_identifier, "ERROR", LogSeverity.info, e)
             execution_time = int(time.time() - start_time)
 
-            return Performance(execution_seconds=execution_time)
+            return Performance(method_name=method_name, execution_seconds=execution_time)
 
     def train_one_method(
         self, extractor_job: TrainableEntityExtractorJob, extraction_data: ExtractionData
diff --git a/src/trainable_entity_extractor/use_cases/TrainUseCase.py b/src/trainable_entity_extractor/use_cases/TrainUseCase.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 from trainable_entity_extractor.domain.ExtractionData import ExtractionData
 from trainable_entity_extractor.domain.Performance import Performance
 from trainable_entity_extractor.domain.TrainableEntityExtractorJob import TrainableEntityExtractorJob
@@ -13,6 +15,11 @@ def __init__(self, extractors: list[type[ExtractorBase]], logger: Logger):
     def train_one_method(
         self, extractor_job: TrainableEntityExtractorJob, extraction_data: ExtractionData
     ) -> tuple[bool, str]:
+
+        method_path = Path(extraction_data.extraction_identifier.get_path()) / extractor_job.method_name
+        if method_path.exists() and any(method_path.iterdir()):
+            return True, ""
+
         extractor_name = extractor_job.extractor_name
         for extractor in self.extractors:
             extractor_instance = extractor(extraction_data.extraction_identifier, self.logger)