@@ -106,6 +106,7 @@ def transform_to_csv_and_xlsx(json_path: str) -> tuple[Optional[str], Optional[s
 def save_dataset_metadata(
     dataset_id: str,
     source_url: str,
+    publisher_country: str,
     json_data: dict[str, Any],
     json_url: Optional[str],
     csv_url: Optional[str],
@@ -122,6 +123,7 @@ def save_dataset_metadata(
         dataset_id=dataset_id,
         source_url=source_url,
         publisher_name=publisher_name,
+        publisher_country=publisher_country,
         license_url=license_url,
         license_title=license_title,
         license_title_short=license_title_short,
@@ -135,9 +137,9 @@ def save_dataset_metadata(
         raise ProcessDatasetError(f"Failed to update metadata for dataset: {e}")
 
 
-def process_dataset(dataset_id: str, source_url: str) -> None:
+def process_dataset(dataset_id: str, registry_metadata: dict[str, str]) -> None:
     logger.info(f"Processing dataset {dataset_id}")
-    json_data = download_json(dataset_id, source_url)
+    json_data = download_json(dataset_id, registry_metadata["source_url"])
     validate_json(dataset_id, json_data)
     json_path = write_json_to_file(
         file_name=f"data/{dataset_id}/{dataset_id}.json",
@@ -149,7 +151,8 @@ def process_dataset(dataset_id: str, source_url: str) -> None:
     )
     save_dataset_metadata(
         dataset_id=dataset_id,
-        source_url=source_url,
+        source_url=registry_metadata["source_url"],
+        publisher_country=registry_metadata["country"],
         json_data=json_data,
         json_url=json_public_url,
         csv_url=csv_public_url,
@@ -158,7 +161,7 @@ def process_dataset(dataset_id: str, source_url: str) -> None:
     logger.info(f"Processed dataset {dataset_id}")
 
 
-def process_deleted_datasets(registered_datasets: dict[str, str]) -> None:
+def process_deleted_datasets(registered_datasets: dict[str, dict[str, str]]) -> None:
     stored_datasets = get_dataset_ids()
     deleted_datasets = stored_datasets - registered_datasets.keys()
     for dataset_id in deleted_datasets:
@@ -171,13 +174,17 @@ def process_registry() -> None:
     registered_datasets = fetch_registered_datasets()
     process_deleted_datasets(registered_datasets)
     errors: list[dict[str, Any]] = []
-    for dataset_id, url in registered_datasets.items():
+    for dataset_id, registry_metadata in registered_datasets.items():
         try:
-            process_dataset(dataset_id, url)
+            process_dataset(dataset_id, registry_metadata)
         except Exception as e:
             logger.warning(f"Failed to process dataset {dataset_id} with error {e}")
             errors.append(
-                {"dataset_id": dataset_id, "source_url": url, "message": str(e)}
+                {
+                    "dataset_id": dataset_id,
+                    "source_url": registry_metadata["source_url"],
+                    "message": str(e),
+                }
             )
     if errors:
         logger.error(
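For context, a minimal sketch of the registry shape these new signatures imply: fetch_registered_datasets() is taken to return a mapping of dataset IDs to per-dataset metadata dicts (carrying at least "source_url" and "country") rather than a plain dataset-ID-to-URL mapping. The "source_url" and "country" keys come from the diff; the dataset ID, the values, and the inline dict standing in for the registry fetch are hypothetical.

# Hedged sketch (not part of the PR): assumed shape of the registered datasets mapping.
# The dataset ID and field values below are made up for illustration.
registered_datasets: dict[str, dict[str, str]] = {
    "example-dataset": {
        "source_url": "https://example.org/example-dataset.json",
        "country": "Example Country",
    },
}

# Iterate the same way process_registry() does after this change,
# passing the whole metadata dict rather than just the URL.
for dataset_id, registry_metadata in registered_datasets.items():
    print(dataset_id, registry_metadata["source_url"], registry_metadata["country"])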