logger = logging.getLogger(__name__)


+class ProcessDatasetError(Exception):
+    def __init__(self, message: str):
+        super().__init__(message)
+
+
+class ValidationError(ProcessDatasetError):
+    def __init__(self, errors_count: int, errors: list[str]):
+        message = f"Dataset has {errors_count} validation errors: {str(errors)}"
+        super().__init__(message)
+
+
def download_json(url: str) -> Any:
    logger.info(f"Downloading json from {url}")
    try:
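The new exceptions give every pipeline stage a common base class, so callers can catch `ProcessDatasetError` once instead of bare `Exception`. A minimal sketch of the intended behaviour, assuming the classes above are importable; the sample arguments are invented:

```python
# ValidationError is a ProcessDatasetError, so one except clause covers
# download, validation, and file-writing failures alike.
try:
    raise ValidationError(errors_count=2, errors=["missing id", "bad date"])
except ProcessDatasetError as e:
    print(e)  # Dataset has 2 validation errors: ['missing id', 'bad date']
```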
@@ -33,19 +44,23 @@ def download_json(url: str) -> Any:
        logger.info(f"Downloaded {url} ({response_size} bytes)")
        return r.json()
    except Exception as e:
-        raise Exception("Download failed", e)
+        raise ProcessDatasetError(f"Download failed: {str(e)}")


def validate_json(dataset_id: str, json_data: dict[str, Any]) -> None:
    logger.info(f"Validating dataset {dataset_id}")
    try:
        validation_result = oc4ids_json_output(json_data=json_data)
        validation_errors_count = validation_result["validation_errors_count"]
+        validation_errors = validation_result["validation_errors"]
        if validation_errors_count > 0:
-            raise Exception(f"Dataset has {validation_errors_count} validation errors")
+            raise ValidationError(
+                errors_count=validation_errors_count,
+                errors=validation_errors,
+            )
        logger.info(f"Dataset {dataset_id} is valid")
    except Exception as e:
-        raise Exception("Validation failed", e)
+        raise ProcessDatasetError(f"Validation failed: {str(e)}")


def write_json_to_file(file_name: str, json_data: dict[str, Any]) -> str:
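`validate_json` assumes the `oc4ids_json_output` result is a dict exposing `validation_errors_count` and `validation_errors` (the two keys read above). A hedged sketch of that shape and of the failure path; the sample error text is invented:

```python
# Hypothetical result, shaped like the keys validate_json reads.
fake_result = {
    "validation_errors_count": 1,
    "validation_errors": ["'id' is missing but it is required"],
}

# Mirrors the branch above: a non-zero count raises ValidationError, which the
# enclosing except then re-wraps as ProcessDatasetError("Validation failed: ...").
if fake_result["validation_errors_count"] > 0:
    raise ValidationError(
        errors_count=fake_result["validation_errors_count"],
        errors=fake_result["validation_errors"],
    )
```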
@@ -57,7 +72,7 @@ def write_json_to_file(file_name: str, json_data: dict[str, Any]) -> str:
        logger.info(f"Finished writing to {file_name}")
        return file_name
    except Exception as e:
-        raise Exception("Error while writing to JSON file", e)
+        raise ProcessDatasetError(f"Error writing dataset to file: {e}")


def transform_to_csv_and_xlsx(json_path: str) -> tuple[Optional[str], Optional[str]]:
@@ -76,7 +91,7 @@ def transform_to_csv_and_xlsx(json_path: str) -> tuple[Optional[str], Optional[str]]:
        logger.info(f"Transformed to XLSX at {xlsx_path}")
        return csv_path, xlsx_path
    except Exception as e:
-        logger.warning(f"Failed to transform JSON to CSV and XLSX with error {e}")
+        logger.warning(f"Failed to transform JSON to CSV and XLSX: {e}")
        return None, None

@@ -89,46 +104,47 @@ def save_dataset_metadata(
    xlsx_url: Optional[str],
) -> None:
    logger.info(f"Saving metadata for dataset {dataset_id}")
-    publisher_name = json_data.get("publisher", {}).get("name", "")
-    license_url = json_data.get("license", None)
-    license_name = get_license_name_from_url(license_url) if license_url else None
-    dataset = Dataset(
-        dataset_id=dataset_id,
-        source_url=source_url,
-        publisher_name=publisher_name,
-        license_url=license_url,
-        license_name=license_name,
-        json_url=json_url,
-        csv_url=csv_url,
-        xlsx_url=xlsx_url,
-        updated_at=datetime.datetime.now(datetime.UTC),
-    )
-    save_dataset(dataset)
-
-
-def process_dataset(dataset_id: str, source_url: str) -> None:
-    logger.info(f"Processing dataset {dataset_id}")
    try:
-        json_data = download_json(source_url)
-        validate_json(dataset_id, json_data)
-        json_path = write_json_to_file(
-            f"data/{dataset_id}/{dataset_id}.json", json_data
-        )
-        csv_path, xlsx_path = transform_to_csv_and_xlsx(json_path)
-        json_public_url, csv_public_url, xlsx_public_url = upload_files(
-            dataset_id, json_path=json_path, csv_path=csv_path, xlsx_path=xlsx_path
-        )
-        save_dataset_metadata(
+        publisher_name = json_data.get("publisher", {}).get("name", "")
+        license_url = json_data.get("license", None)
+        license_name = get_license_name_from_url(license_url) if license_url else None
+        dataset = Dataset(
            dataset_id=dataset_id,
            source_url=source_url,
-            json_data=json_data,
-            json_url=json_public_url,
-            csv_url=csv_public_url,
-            xlsx_url=xlsx_public_url,
+            publisher_name=publisher_name,
+            license_url=license_url,
+            license_name=license_name,
+            json_url=json_url,
+            csv_url=csv_url,
+            xlsx_url=xlsx_url,
+            updated_at=datetime.datetime.now(datetime.UTC),
        )
-        logger.info(f"Processed dataset {dataset_id}")
+        save_dataset(dataset)
    except Exception as e:
-        logger.warning(f"Failed to process dataset {dataset_id} with error {e}")
+        raise ProcessDatasetError(f"Failed to update metadata for dataset: {e}")
+
+
+def process_dataset(dataset_id: str, source_url: str) -> None:
+    logger.info(f"Processing dataset {dataset_id}")
+    json_data = download_json(source_url)
+    validate_json(dataset_id, json_data)
+    json_path = write_json_to_file(
+        file_name=f"data/{dataset_id}/{dataset_id}.json",
+        json_data=json_data,
+    )
+    csv_path, xlsx_path = transform_to_csv_and_xlsx(json_path)
+    json_public_url, csv_public_url, xlsx_public_url = upload_files(
+        dataset_id, json_path=json_path, csv_path=csv_path, xlsx_path=xlsx_path
+    )
+    save_dataset_metadata(
+        dataset_id=dataset_id,
+        source_url=source_url,
+        json_data=json_data,
+        json_url=json_public_url,
+        csv_url=csv_public_url,
+        xlsx_url=xlsx_public_url,
+    )
+    logger.info(f"Processed dataset {dataset_id}")


def process_deleted_datasets(registered_datasets: dict[str, str]) -> None:
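With the `try` blocks moved into the individual steps, `process_dataset` itself no longer swallows failures: any `ProcessDatasetError` raised by a step propagates to the caller. A small sketch of that caller-side contract; the id and URL are placeholders:

```python
# Each step raises ProcessDatasetError on failure, so the caller decides
# whether to skip, retry, or abort. Identifiers below are made up.
try:
    process_dataset("example_dataset", "https://example.com/oc4ids.json")
except ProcessDatasetError as e:
    # e.g. "Download failed: ..." or "Validation failed: ..."
    print(f"Skipping dataset: {e}")
```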
@@ -143,8 +159,17 @@ def process_deleted_datasets(registered_datasets: dict[str, str]) -> None:
def process_registry() -> None:
    registered_datasets = fetch_registered_datasets()
    process_deleted_datasets(registered_datasets)
+    errors: list[dict[str, Any]] = []
    for dataset_id, url in registered_datasets.items():
-        process_dataset(dataset_id, url)
+        try:
+            process_dataset(dataset_id, url)
+        except Exception as e:
+            logger.warning(f"Failed to process dataset {dataset_id} with error {e}")
+            errors.append({"dataset": dataset_id, "source_url": url, "errors": str(e)})
+    if errors:
+        logger.error(
+            f"Errors while processing registry: {json.dumps(errors, indent=4)}"
+        )
    logger.info("Finished processing all datasets")

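`process_registry` now keeps going after an individual dataset fails and logs a single aggregated summary at the end. A sketch of what that summary payload might look like; the values are invented:

```python
import json

# Invented example of the aggregated error payload built in the loop above.
errors = [
    {
        "dataset": "example_dataset",
        "source_url": "https://example.com/oc4ids.json",
        "errors": "Download failed: 404 Client Error",
    }
]
print(f"Errors while processing registry: {json.dumps(errors, indent=4)}")
```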