Merged · 23 commits
4ff613c  Add progress  (ashwinvaidya17, Oct 23, 2025)
12e5c52  Merge branch 'feature/geti-inspect' into ashwin/feat/progress_bar_sse  (ashwinvaidya17, Oct 23, 2025)
66b2f46  Merge branch 'feature/geti-inspect' into ashwin/feat/progress_bar_sse  (ashwinvaidya17, Oct 31, 2025)
f3855a2  Merge fixes  (ashwinvaidya17, Oct 31, 2025)
12130d2  Fix progress bar  (ashwinvaidya17, Oct 31, 2025)
4e05737  Update application/backend/src/services/job_service.py  (ashwinvaidya17, Oct 31, 2025)
004e880  Update application/backend/src/pydantic_models/job.py  (ashwinvaidya17, Nov 3, 2025)
9776eb4  Use finally block  (ashwinvaidya17, Nov 3, 2025)
e29cb09  Add Mark's changes  (ashwinvaidya17, Nov 4, 2025)
89ebcc8  Use job.message for informing training stage  (ashwinvaidya17, Nov 4, 2025)
965f0fc  Merge branch 'feature/geti-inspect' into ashwin/feat/progress_bar_sse  (ashwinvaidya17, Nov 5, 2025)
0cc0844  Restore callback  (ashwinvaidya17, Nov 5, 2025)
5e25b9a  Update application/backend/src/pydantic_models/job.py  (ashwinvaidya17, Nov 5, 2025)
873e5de  Add check for synchronization task  (ashwinvaidya17, Nov 5, 2025)
00a9b71  Merge branch 'feature/geti-inspect' into ashwin/feat/progress_bar_sse  (ashwinvaidya17, Nov 5, 2025)
6791180  Fix conflicts  (ashwinvaidya17, Nov 5, 2025)
eb057e2  Fix prek issues  (ashwinvaidya17, Nov 5, 2025)
b5a87d6  Fix mypy issues  (ashwinvaidya17, Nov 5, 2025)
48d762a  Fix tests  (ashwinvaidya17, Nov 6, 2025)
ddc1152  Cosmetics  (ashwinvaidya17, Nov 6, 2025)
422a38c  Update playwright in action  (ashwinvaidya17, Nov 6, 2025)
b19bfb0  Revert playwright changes  (ashwinvaidya17, Nov 6, 2025)
db05d27  Target package-lock to feature branch  (ashwinvaidya17, Nov 6, 2025)
22 changes: 20 additions & 2 deletions application/backend/src/api/endpoints/job_endpoints.py
@@ -4,13 +4,13 @@
from typing import Annotated
from uuid import UUID

from fastapi import APIRouter, Body, Depends
from fastapi import APIRouter, Body, Depends, status
from sse_starlette import EventSourceResponse

from api.dependencies import get_job_id, get_job_service
from api.endpoints import API_PREFIX
from pydantic_models import JobList
from pydantic_models.job import JobSubmitted, TrainJobPayload
from pydantic_models.job import JobCancelled, JobSubmitted, TrainJobPayload
from services import JobService

job_api_prefix_url = API_PREFIX + "/jobs"
@@ -42,3 +42,21 @@ async def get_job_logs(
) -> EventSourceResponse:
"""Endpoint to get the logs of a job by its ID"""
return EventSourceResponse(job_service.stream_logs(job_id=job_id))


@job_router.get("/{job_id}/progress")
async def get_job_progress(
job_id: Annotated[UUID, Depends(get_job_id)],
job_service: Annotated[JobService, Depends(get_job_service)],
) -> EventSourceResponse:
"""Endpoint to get the progress of a job by its ID"""
return EventSourceResponse(job_service.stream_progress(job_id=job_id))


@job_router.post("/{job_id}:cancel", status_code=status.HTTP_202_ACCEPTED)
async def cancel_job(
job_id: Annotated[UUID, Depends(get_job_id)],
job_service: Annotated[JobService, Depends(get_job_service)],
) -> JobCancelled:
"""Endpoint to cancel a job by its ID"""
return await job_service.cancel_job(job_id=job_id)
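For reference, the two new endpoints pair naturally: a client follows the SSE progress feed and can cancel the job at any time. A minimal consumption sketch (hypothetical host and prefix; the diff does not show what API_PREFIX resolves to):

import json

import httpx  # third-party HTTP client, used here only for illustration

BASE = "http://localhost:8000/api/jobs"  # assumed host and API_PREFIX

def follow_progress(job_id: str) -> None:
    """Print each SSE progress event until the server closes the stream."""
    with httpx.stream("GET", f"{BASE}/{job_id}/progress", timeout=None) as response:
        for line in response.iter_lines():
            if line.startswith("data:"):
                event = json.loads(line.removeprefix("data:").strip())
                print(f"{event['progress']}% - {event['message']}")

def cancel(job_id: str) -> None:
    """POST the :cancel action; the server replies 202 with a JobCancelled body."""
    response = httpx.post(f"{BASE}/{job_id}:cancel")
    print(response.status_code, response.json()["message"])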
10 changes: 9 additions & 1 deletion application/backend/src/pydantic_models/job.py
@@ -5,7 +5,7 @@
from typing import Any
from uuid import UUID

from pydantic import BaseModel, Field, field_serializer
from pydantic import BaseModel, Field, computed_field, field_serializer

from pydantic_models.base import BaseIDModel

@@ -46,6 +46,14 @@ class JobSubmitted(BaseModel):
job_id: UUID


class JobCancelled(BaseModel):
job_id: UUID

@computed_field
def message(self) -> str:
return f"Job with ID `{self.job_id}` marked as cancelled."


class TrainJobPayload(BaseModel):
project_id: UUID = Field(exclude=True)
model_name: str
...
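A note on the @computed_field: pydantic includes computed fields whenever the model is serialized, so message is derived from job_id on every dump rather than stored. A quick sketch of the resulting payload:

from uuid import uuid4

from pydantic_models.job import JobCancelled  # module path as in this diff

payload = JobCancelled(job_id=uuid4())
print(payload.model_dump_json())
# {"job_id":"<uuid>","message":"Job with ID `<uuid>` marked as cancelled."}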
52 changes: 42 additions & 10 deletions application/backend/src/services/job_service.py
@@ -2,6 +2,8 @@
# SPDX-License-Identifier: Apache-2.0
import asyncio
import datetime
import json
import logging
from collections.abc import AsyncGenerator
from uuid import UUID

@@ -12,9 +14,11 @@
from db import get_async_db_session_ctx
from exceptions import DuplicateJobException, ResourceNotFoundException
from pydantic_models import Job, JobList, JobType
from pydantic_models.job import JobStatus, JobSubmitted, TrainJobPayload
from pydantic_models.job import JobCancelled, JobStatus, JobSubmitted, TrainJobPayload
from repositories import JobRepository

logger = logging.getLogger(__name__)


class JobService:
@staticmethod
@@ -24,7 +28,7 @@ async def get_job_list(extra_filters: dict | None = None) -> JobList:
return JobList(jobs=await repo.get_all(extra_filters=extra_filters))

@staticmethod
async def get_job_by_id(job_id: UUID) -> Job | None:
async def get_job_by_id(job_id: UUID | str) -> Job | None:
Copilot AI (Nov 6, 2025): The signature now accepts UUID | str, but string inputs are never validated or converted to UUID, so a string that is not a valid UUID would surface as a runtime error.

Suggested change:
    async def get_job_by_id(job_id: UUID | str) -> Job | None:
        if isinstance(job_id, str):
            try:
                job_id = UUID(job_id)
            except (ValueError, AttributeError, TypeError):
                raise ResourceNotFoundException(resource_id=job_id, resource_name="job")

Copilot AI (Nov 6, 2025): Changing the parameter type from UUID to UUID | str is an API change. Ensure this is intentional and that all callers handle string UUIDs properly, as invalid strings could lead to validation issues.
async with get_async_db_session_ctx() as session:
repo = JobRepository(session)
return await repo.get_by_id(job_id)
@@ -56,7 +60,10 @@ async def get_pending_train_job() -> Job | None:

@staticmethod
async def update_job_status(
job_id: UUID, status: JobStatus, message: str | None = None, progress: int | None = None
job_id: UUID,
status: JobStatus,
message: str | None = None,
progress: int | None = None,
) -> None:
async with get_async_db_session_ctx() as session:
repo = JobRepository(session)
@@ -75,6 +82,13 @@ async def update_job_status(
updates["progress"] = progress_
await repo.update(job, updates)

@classmethod
async def is_job_still_running(cls, job_id: UUID | str) -> bool:
job = await cls.get_job_by_id(job_id=job_id)
if job is None:
raise ResourceNotFoundException(resource_id=job_id, resource_name="job")
return job.status == JobStatus.RUNNING

@classmethod
async def stream_logs(cls, job_id: UUID | str) -> AsyncGenerator[ServerSentEvent]:
from core.logging.utils import get_job_logs_path # noqa: PLC0415
@@ -83,12 +97,6 @@ async def stream_logs(cls, job_id: UUID | str) -> AsyncGenerator[ServerSentEvent
if not await anyio.Path(log_file).exists():
raise ResourceNotFoundException(resource_id=job_id, resource_name="job_logs")

async def is_job_still_running():
job = await cls.get_job_by_id(job_id=job_id)
if job is None:
raise ResourceNotFoundException(resource_id=job_id, resource_name="job")
return job.status == JobStatus.RUNNING

# Cache job status and only check every 2 seconds
status_check_interval = 2.0 # seconds
last_status_check = 0.0
@@ -101,7 +109,7 @@ async def is_job_still_running():
now = loop.time()
# Only check job status every status_check_interval seconds
if now - last_status_check > status_check_interval:
cached_still_running = await is_job_still_running()
cached_still_running = await cls.is_job_still_running(job_id=job_id)
Copilot AI (Nov 5, 2025): is_job_still_running now raises ResourceNotFoundException when the job is not found, but it is called from a streaming context where the job might be deleted mid-stream. Consider handling the exception or documenting the expected behavior when a job is deleted while its logs are being streamed.

Suggested change:
    try:
        cached_still_running = await cls.is_job_still_running(job_id=job_id)
    except ResourceNotFoundException:
        # Job was deleted mid-stream; terminate gracefully
        break
last_status_check = now
still_running = cached_still_running
if not line:
@@ -113,3 +121,27 @@
else:
break
yield ServerSentEvent(data=line.rstrip())

@classmethod
async def stream_progress(cls, job_id: UUID | str) -> AsyncGenerator[ServerSentEvent]:
"""Stream the progress of a job by its ID"""
still_running = True
while still_running:
job = await cls.get_job_by_id(job_id=job_id)
if job is None:
raise ResourceNotFoundException(resource_id=job_id, resource_name="job")
yield ServerSentEvent(data=json.dumps({"progress": job.progress, "message": job.message}))
still_running = job.status in {JobStatus.RUNNING, JobStatus.PENDING}
await asyncio.sleep(0.5)

@classmethod
async def cancel_job(cls, job_id: UUID | str) -> JobCancelled:
"""Cancel a job by its ID"""
async with get_async_db_session_ctx() as session:
repo = JobRepository(session)
job = await repo.get_by_id(job_id)
if job is None:
raise ResourceNotFoundException(resource_id=job_id, resource_name="job")

await repo.update(job, {"status": JobStatus.CANCELED})
return JobCancelled(job_id=job.id)
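Worth noting: stream_progress polls the database every 0.5 s, and because the status check happens after the yield, it emits one final event once the job leaves RUNNING/PENDING before the loop exits. A sketch of driving the generator directly, assuming a seeded job row and a configured DB session:

import asyncio

from services.job_service import JobService  # module path as in this diff

async def follow(job_id: str) -> None:
    # Each event carries JSON like {"progress": 42, "message": "Epoch 5/10"}
    async for event in JobService.stream_progress(job_id=job_id):
        print(event.data)

asyncio.run(follow("00000000-0000-0000-0000-000000000000"))  # placeholder ID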
54 changes: 51 additions & 3 deletions application/backend/src/services/training_service.py
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
import asyncio
from contextlib import redirect_stdout
from uuid import UUID

from anomalib.data import Folder
from anomalib.data.utils import TestSplitMode
@@ -15,6 +16,7 @@
from repositories.binary_repo import ImageBinaryRepository, ModelBinaryRepository
from services import ModelService
from services.job_service import JobService
from utils.callbacks import GetiInspectProgressCallback, ProgressSyncParams
from utils.devices import Devices
from utils.experiment_loggers import TrackioLogger

@@ -70,12 +72,24 @@ async def _run_training_job(cls, job: Job, job_service: JobService) -> Model:
name=str(model_name),
train_job_id=job.id,
)
synchronization_parameters = ProgressSyncParams()
logger.info(f"Training model `{model_name}` for job `{job.id}`")

synchronization_task: asyncio.Task[None] | None = None
try:
synchronization_task = asyncio.create_task(
cls._sync_progress_with_db(
job_service=job_service, job_id=job.id, synchronization_parameters=synchronization_parameters
)
)
# Use asyncio.to_thread to keep event loop responsive
# TODO: Consider ProcessPoolExecutor for true parallelism with multiple jobs
trained_model = await asyncio.to_thread(cls._train_model, model=model, device=device)
trained_model = await asyncio.to_thread(
cls._train_model,
model=model,
device=device,
synchronization_parameters=synchronization_parameters,
)
if trained_model is None:
raise ValueError("Training failed - model is None")

@@ -94,9 +108,15 @@ async def _run_training_job(cls, job: Job, job_service: JobService) -> Model:
await model_binary_repo.delete_model_folder()
await model_service.delete_model(project_id=project_id, model_id=model.id)
raise e
finally:
logger.debug("Syncing progress with db stopped")
if synchronization_task is not None and not synchronization_task.done():
synchronization_task.cancel()

@staticmethod
def _train_model(model: Model, device: str | None = None) -> Model | None:
def _train_model(
model: Model, synchronization_parameters: ProgressSyncParams, device: str | None = None
) -> Model | None:
"""
Execute CPU-intensive model training using anomalib.

@@ -106,6 +126,7 @@ def _train_model(model: Model, device: str | None = None) -> Model | None:

Args:
model: Model object with training configuration
synchronization_parameters: Parameters for synchronization between the main process and the training process
device: Device to train on

Returns:
@@ -145,7 +166,9 @@
engine = Engine(
default_root_dir=model.export_path,
logger=[trackio, tensorboard],
devices=[0], # Only single GPU training is supported for now
Copilot AI (Nov 4, 2025): The hardcoded device index [0] should be derived from the device parameter or made configurable; as written it is inconsistent with the device parameter handling.

Copilot AI (Nov 5, 2025), on lines 145 to 169: Hardcoding devices=[0] overrides the device-selection logic that was previously used. The inline comment notes the limitation, but the implementation ignores the device parameter passed to this function, which could confuse users who expect that parameter to work.

Suggested change:
    # Use training_device to set devices dynamically:
    # if training_device is 'cpu', set devices=['cpu'], else devices=[0] or as appropriate
    devices = ["cpu"] if training_device == "cpu" else [0]
    engine = Engine(
        default_root_dir=model.export_path,
        logger=[trackio, tensorboard],
        devices=devices,  # devices now set based on training_device
max_epochs=10,
callbacks=[GetiInspectProgressCallback(synchronization_parameters)],
accelerator=training_device,
)

Expand All @@ -154,7 +177,7 @@ def _train_model(model: Model, device: str | None = None) -> Model | None:

# Capture pytorch stdout logs into logger
with redirect_stdout(LoggerStdoutWriter()): # type: ignore[type-var]
Copilot AI (Nov 6, 2025): The change from engine.train() to engine.fit() suggests an API update in the anomalib library. Consider adding a comment explaining the change, or updating related documentation, to help future maintainers understand the transition.

Suggested change:
    with redirect_stdout(LoggerStdoutWriter()):  # type: ignore[type-var]
        # Note: anomalib's API changed from `engine.train()` to `engine.fit()`.
        # See the anomalib release notes for details; this keeps compatibility
        # with the latest version.
engine.train(model=anomalib_model, datamodule=datamodule)
engine.fit(model=anomalib_model, datamodule=datamodule)
Copilot AI (Nov 3, 2025): Changed from engine.train() to engine.fit(). This appears to be an API update, but verify that the method exists and provides the same functionality.

Copilot AI (Nov 5, 2025): Changed from engine.train() to engine.fit(). Verify that this API change is intentional and that both methods exist in the anomalib library, as this could be a breaking change if train() was the correct method.

# Find and set threshold metric
for callback in engine.trainer.callbacks: # type: ignore[attr-defined]
@@ -172,6 +195,31 @@ def _train_model(model: Model, device: str | None = None) -> Model | None:
model.is_ready = True
return model

@classmethod
async def _sync_progress_with_db(
cls,
job_service: JobService,
job_id: UUID,
synchronization_parameters: ProgressSyncParams,
) -> None:
try:
while True:
progress: int = synchronization_parameters.progress
message = synchronization_parameters.message
if not await job_service.is_job_still_running(job_id=job_id):
logger.debug("Job cancelled, stopping progress sync")
synchronization_parameters.set_cancel_training_event()
break
logger.debug(f"Syncing progress with db: {progress}% - {message}")
await job_service.update_job_status(
job_id=job_id, status=JobStatus.RUNNING, progress=progress, message=message
)
await asyncio.sleep(0.5)
except Exception as e:
logger.exception("Failed to sync progress with db: %s", e)
await job_service.update_job_status(job_id=job_id, status=JobStatus.FAILED, message="Training failed")
raise

@staticmethod
async def abort_orphan_jobs() -> None:
"""
...
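The diff imports GetiInspectProgressCallback and ProgressSyncParams from utils.callbacks but never shows them. Since training runs in a worker thread via asyncio.to_thread while _sync_progress_with_db runs on the event loop, the helpers plausibly reduce to a small shared-state object plus a Lightning callback. A hypothetical sketch (the attribute names progress and message and the method set_cancel_training_event come from the diff; everything else is assumed):

import threading

from lightning.pytorch.callbacks import Callback

class ProgressSyncParams:
    """Shared state between the training thread and the async sync task."""

    def __init__(self) -> None:
        self.progress: int = 0
        self.message: str = ""
        self._cancel_training = threading.Event()

    def set_cancel_training_event(self) -> None:
        self._cancel_training.set()

    def should_cancel(self) -> bool:
        return self._cancel_training.is_set()

class GetiInspectProgressCallback(Callback):
    """Lightning callback that publishes progress and honors cancellation."""

    def __init__(self, params: ProgressSyncParams) -> None:
        self._params = params

    def on_train_epoch_end(self, trainer, pl_module) -> None:
        total = trainer.max_epochs or 1
        self._params.progress = int(100 * (trainer.current_epoch + 1) / total)
        self._params.message = f"Epoch {trainer.current_epoch + 1}/{total}"
        if self._params.should_cancel():
            trainer.should_stop = True  # ask Lightning for a graceful stop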