Skip to content

Commit e5a22df

Browse files
committed
added custom validator to extend validation exceptions
1 parent f10cba5 commit e5a22df

File tree

2 files changed

+69
-20
lines changed

2 files changed

+69
-20
lines changed
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,34 @@
1+
#!/usr/bin/env python
2+
3+
4+
def validate_with_exceptions(schema, data, errors):
    """Filter out validation errors caused by the 'Not Provided' placeholder.

    An error is dropped when the offending instance is exactly
    ``"Not Provided [GENEPIO:0001668]"`` and either:

    - it is a ``type`` error on a property declared as ``integer`` or
      ``number``, or
    - it is a ``format`` error on a property declared as a ``string`` with
      ``format: date``.

    Args:
        schema: JSON schema dict. Property definitions are looked up under
            its top-level ``"properties"`` key, using the dotted error path
            as the key.
        data: The instance being validated. Not used by the filter; kept so
            the call signature stays unchanged.
        errors: Iterable of error objects exposing ``path``, ``validator``
            and ``instance`` attributes.

    Returns:
        list: The errors that are not covered by an exception, in their
        original order.
    """
    not_provided = "Not Provided [GENEPIO:0001668]"
    # Looked up once; a schema without "properties" simply has no exceptions.
    properties = schema.get("properties", {})
    filtered_errors = []

    for error in errors:
        property_path = ".".join(str(p) for p in error.path)
        prop_schema = properties.get(property_path, {})

        if error.instance == not_provided:
            # Allow 'Not Provided' for numeric types
            if error.validator == "type" and prop_schema.get("type") in (
                "integer",
                "number",
            ):
                continue

            # Allow 'Not Provided' for date-formatted strings
            if (
                error.validator == "format"
                and prop_schema.get("type") == "string"
                and prop_schema.get("format") == "date"
            ):
                continue

        # Keep all other errors
        filtered_errors.append(error)

    # The caller relies on the returned list to decide whether a row is valid.
    return filtered_errors

relecov_tools/json_validation.py

+35-20
Original file line number | Diff line number | Diff line change
@@ -8,9 +8,11 @@
88

99
import relecov_tools.utils
1010
import relecov_tools.assets.schema_utils.jsonschema_draft
11+
import relecov_tools.assets.schema_utils.custom_validators
1112
from relecov_tools.config_json import ConfigJson
1213
from relecov_tools.log_summary import LogSum
1314

15+
1416
log = logging.getLogger(__name__)
1517
stderr = rich.console.Console(
1618
stderr=True,
@@ -112,9 +114,9 @@ def get_sample_id_field(self):
112114
return sample_id_field
113115

114116
def validate_instances(self):
115-
"""Validate data instances against a validated json schema"""
117+
"""Validate data instances against a validated JSON schema"""
116118

117-
# create validator
119+
# Create validator
118120
validator = Draft202012Validator(
119121
self.json_schema, format_checker=FormatChecker()
120122
)
@@ -124,52 +126,65 @@ def validate_instances(self):
124126
invalid_json = []
125127
errors = {}
126128
error_keys = {}
129+
127130
if self.sample_id_field is None:
128131
log_text = f"Logs keys set to None. Reason: {self.SAMPLE_FIELD_ERROR}"
129132
self.logsum.add_warning(sample=self.sample_id_field, entry=log_text)
130-
stderr.print("[blue] Start processing the json file")
131-
log.info("Start processing the json file")
133+
134+
stderr.print("[blue] Start processing the JSON file")
135+
log.info("Start processing the JSON file")
136+
132137
for item_row in self.json_data:
133-
# validate(instance=item_row, schema=json_schema)
134138
sample_id_value = item_row.get(self.sample_id_field)
135-
if validator.is_valid(item_row):
139+
140+
# Collect all errors (don't raise immediately)
141+
validation_errors = list(validator.iter_errors(item_row))
142+
143+
# Run the custom validator to check if errors should be ignored
144+
validation_errors = relecov_tools.assets.schema_utils.custom_validators.validate_with_exceptions(
145+
self.json_schema, item_row, validation_errors
146+
)
147+
if not validation_errors:
136148
validated_json_data.append(item_row)
137149
self.logsum.feed_key(sample=sample_id_value)
138150
else:
139-
# Count error types
140-
for error in validator.iter_errors(item_row):
151+
# Process remaining errors
152+
for error in validation_errors:
153+
# Extract the error field name
141154
if error.validator == "required":
142155
error_field = [
143156
f for f in error.validator_value if f in error.message
144157
][0]
145158
else:
146159
error_field = error.absolute_path[0]
160+
161+
# Try to get the human-readable label from the schema
147162
try:
148163
err_field_label = schema_props[error_field]["label"]
149164
except KeyError:
150-
log.error("Could not extract label for %s" % error_field)
165+
log.error(f"Could not extract label for {error_field}")
151166
err_field_label = error_field
152-
error.message.replace(error_field, err_field_label)
167+
168+
# Format the error message
169+
error.message = error.message.replace(error_field, err_field_label)
153170
error_text = f"Error in column {err_field_label}: {error.message}"
171+
172+
# Log errors for summary
154173
error_keys[error.message] = error_field
155-
if error.message in errors:
156-
errors[error.message] += 1
157-
else:
158-
errors[error.message] = 1
174+
errors[error.message] = errors.get(error.message, 0) + 1
159175
self.logsum.add_error(sample=sample_id_value, entry=error_text)
160-
# append row with errors
176+
177+
# Add the invalid row to the list
161178
invalid_json.append(item_row)
162179

163180
# Summarize errors
164181
stderr.print("[blue] --------------------")
165182
stderr.print("[blue] VALIDATION SUMMARY")
166183
stderr.print("[blue] --------------------")
167184
log.info("Validation summary:")
168-
for error_type in errors.keys():
169-
num_of_errors = str(errors[error_type])
170-
field_with_error = str(error_keys[error_type])
171-
error_text = "{} samples failed validation for {}:\n{}"
172-
error_text = error_text.format(num_of_errors, field_with_error, error_type)
185+
for error_type, count in errors.items():
186+
field_with_error = error_keys[error_type]
187+
error_text = f"{count} samples failed validation for {field_with_error}:\n{error_type}"
173188
self.logsum.add_warning(entry=error_text)
174189
stderr.print(f"[red]{error_text}")
175190
stderr.print("[red] --------------------")

0 commit comments

Comments
 (0)