logger = logging.getLogger(__name__)


+class ProcessDatasetError(Exception):
+    def __init__(self, message: str):
+        super().__init__(message)
+
+
+class ValidationError(ProcessDatasetError):
+    def __init__(self, errors_count: int, errors: list[str]):
+        message = f"Dataset has {errors_count} validation errors: {str(errors)}"
+        super().__init__(message)
+
+
def download_json(url: str) -> Any:
    logger.info(f"Downloading json from {url}")
    try:
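The new exceptions give every pipeline stage a common base class, so callers can catch `ProcessDatasetError` once instead of bare `Exception`. A minimal sketch of the intended behaviour, assuming the classes above are importable; the sample arguments are invented:

```python
# ValidationError is a ProcessDatasetError, so one except clause covers
# download, validation, and file-writing failures alike.
try:
    raise ValidationError(errors_count=2, errors=["missing id", "bad date"])
except ProcessDatasetError as e:
    print(e)  # Dataset has 2 validation errors: ['missing id', 'bad date']
```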
@@ -33,19 +44,23 @@ def download_json(url: str) -> Any:
        logger.info(f"Downloaded {url} ({response_size} bytes)")
        return r.json()
    except Exception as e:
-        raise Exception("Download failed", e)
+        raise ProcessDatasetError(f"Download failed: {str(e)}")


def validate_json(dataset_id: str, json_data: dict[str, Any]) -> None:
    logger.info(f"Validating dataset {dataset_id}")
    try:
        validation_result = oc4ids_json_output(json_data=json_data)
        validation_errors_count = validation_result["validation_errors_count"]
+        validation_errors = validation_result["validation_errors"]
        if validation_errors_count > 0:
-            raise Exception(f"Dataset has {validation_errors_count} validation errors")
+            raise ValidationError(
+                errors_count=validation_errors_count,
+                errors=validation_errors,
+            )
        logger.info(f"Dataset {dataset_id} is valid")
    except Exception as e:
-        raise Exception("Validation failed", e)
+        raise ProcessDatasetError(f"Validation failed: {str(e)}")


def write_json_to_file(file_name: str, json_data: dict[str, Any]) -> str:
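`validate_json` assumes the `oc4ids_json_output` result is a dict exposing `validation_errors_count` and `validation_errors` (the two keys read above). A hedged sketch of that shape and of the failure path; the sample error text is invented:

```python
# Hypothetical result, shaped like the keys validate_json reads.
fake_result = {
    "validation_errors_count": 1,
    "validation_errors": ["'id' is missing but it is required"],
}

# Mirrors the branch above: a non-zero count raises ValidationError, which the
# enclosing except then re-wraps as ProcessDatasetError("Validation failed: ...").
if fake_result["validation_errors_count"] > 0:
    raise ValidationError(
        errors_count=fake_result["validation_errors_count"],
        errors=fake_result["validation_errors"],
    )
```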
@@ -57,7 +72,7 @@ def write_json_to_file(file_name: str, json_data: dict[str, Any]) -> str:
        logger.info(f"Finished writing to {file_name}")
        return file_name
    except Exception as e:
-        raise Exception("Error while writing to JSON file", e)
+        raise ProcessDatasetError(f"Error writing dataset to file: {e}")


def transform_to_csv_and_xlsx(json_path: str) -> tuple[Optional[str], Optional[str]]:
@@ -76,7 +91,7 @@ def transform_to_csv_and_xlsx(json_path: str) -> tuple[Optional[str], Optional[str]]:
        logger.info(f"Transformed to XLSX at {xlsx_path}")
        return csv_path, xlsx_path
    except Exception as e:
-        logger.warning(f"Failed to transform JSON to CSV and XLSX with error {e}")
+        logger.warning(f"Failed to transform JSON to CSV and XLSX: {e}")
        return None, None

@@ -89,46 +104,47 @@ def save_dataset_metadata(
    xlsx_url: Optional[str],
) -> None:
    logger.info(f"Saving metadata for dataset {dataset_id}")
-    publisher_name = json_data.get("publisher", {}).get("name", "")
-    license_url = json_data.get("license", None)
-    license_name = get_license_name_from_url(license_url) if license_url else None
-    dataset = Dataset(
-        dataset_id=dataset_id,
-        source_url=source_url,
-        publisher_name=publisher_name,
-        license_url=license_url,
-        license_name=license_name,
-        json_url=json_url,
-        csv_url=csv_url,
-        xlsx_url=xlsx_url,
-        updated_at=datetime.datetime.now(datetime.UTC),
-    )
-    save_dataset(dataset)
-
-
-def process_dataset(dataset_id: str, source_url: str) -> None:
-    logger.info(f"Processing dataset {dataset_id}")
    try:
-        json_data = download_json(source_url)
-        validate_json(dataset_id, json_data)
-        json_path = write_json_to_file(
-            f"data/{dataset_id}/{dataset_id}.json", json_data
-        )
-        csv_path, xlsx_path = transform_to_csv_and_xlsx(json_path)
-        json_public_url, csv_public_url, xlsx_public_url = upload_files(
-            dataset_id, json_path=json_path, csv_path=csv_path, xlsx_path=xlsx_path
-        )
-        save_dataset_metadata(
+        publisher_name = json_data.get("publisher", {}).get("name", "")
+        license_url = json_data.get("license", None)
+        license_name = get_license_name_from_url(license_url) if license_url else None
+        dataset = Dataset(
            dataset_id=dataset_id,
            source_url=source_url,
-            json_data=json_data,
-            json_url=json_public_url,
-            csv_url=csv_public_url,
-            xlsx_url=xlsx_public_url,
+            publisher_name=publisher_name,
+            license_url=license_url,
+            license_name=license_name,
+            json_url=json_url,
+            csv_url=csv_url,
+            xlsx_url=xlsx_url,
+            updated_at=datetime.datetime.now(datetime.UTC),
        )
-        logger.info(f"Processed dataset {dataset_id}")
+        save_dataset(dataset)
    except Exception as e:
-        logger.warning(f"Failed to process dataset {dataset_id} with error {e}")
+        raise ProcessDatasetError(f"Failed to update metadata for dataset: {e}")
+
+
+def process_dataset(dataset_id: str, source_url: str) -> None:
+    logger.info(f"Processing dataset {dataset_id}")
+    json_data = download_json(source_url)
+    validate_json(dataset_id, json_data)
+    json_path = write_json_to_file(
+        file_name=f"data/{dataset_id}/{dataset_id}.json",
+        json_data=json_data,
+    )
+    csv_path, xlsx_path = transform_to_csv_and_xlsx(json_path)
+    json_public_url, csv_public_url, xlsx_public_url = upload_files(
+        dataset_id, json_path=json_path, csv_path=csv_path, xlsx_path=xlsx_path
+    )
+    save_dataset_metadata(
+        dataset_id=dataset_id,
+        source_url=source_url,
+        json_data=json_data,
+        json_url=json_public_url,
+        csv_url=csv_public_url,
+        xlsx_url=xlsx_public_url,
+    )
+    logger.info(f"Processed dataset {dataset_id}")


def process_deleted_datasets(registered_datasets: dict[str, str]) -> None:
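With the `try` blocks moved into the individual steps, `process_dataset` itself no longer swallows failures: any `ProcessDatasetError` raised by a step propagates to the caller. A small sketch of that caller-side contract; the id and URL are placeholders:

```python
# Each step raises ProcessDatasetError on failure, so the caller decides
# whether to skip, retry, or abort. Identifiers below are made up.
try:
    process_dataset("example_dataset", "https://example.com/oc4ids.json")
except ProcessDatasetError as e:
    # e.g. "Download failed: ..." or "Validation failed: ..."
    print(f"Skipping dataset: {e}")
```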
@@ -143,8 +159,17 @@ def process_deleted_datasets(registered_datasets: dict[str, str]) -> None:
def process_registry() -> None:
    registered_datasets = fetch_registered_datasets()
    process_deleted_datasets(registered_datasets)
+    errors: list[dict[str, Any]] = []
    for dataset_id, url in registered_datasets.items():
-        process_dataset(dataset_id, url)
+        try:
+            process_dataset(dataset_id, url)
+        except Exception as e:
+            logger.warning(f"Failed to process dataset {dataset_id} with error {e}")
+            errors.append({"dataset": dataset_id, "source_url": url, "errors": str(e)})
+    if errors:
+        logger.error(
+            f"Errors while processing registry: {json.dumps(errors, indent=4)}"
+        )
    logger.info("Finished processing all datasets")

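`process_registry` now keeps going after an individual dataset fails and logs a single aggregated summary at the end. A sketch of what that summary payload might look like; the values are invented:

```python
import json

# Invented example of the aggregated error payload built in the loop above.
errors = [
    {
        "dataset": "example_dataset",
        "source_url": "https://example.com/oc4ids.json",
        "errors": "Download failed: 404 Client Error",
    }
]
print(f"Errors while processing registry: {json.dumps(errors, indent=4)}")
```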