diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 9b03b14a50..947decf6bb 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -1469,6 +1469,24 @@ defaultOrganisms: - name: "Country map" url: "https://mapoplexus.genomium.org/?url={{[metadata+accessionVersion,accession,version,geoLocAdmin1,geoLocAdmin2,geoLocCity,geoLocCountry,geoLocSite,hostNameCommon,hostNameScientific,authors,sampleCollectionDate]}}" metadataAdd: + - name: displayName + preprocessing: + function: build_display_name + inputs: + geoLocCountry: geoLocCountry + sampleCollectionDate: sampleCollectionDate + specimenCollectorSampleId: specimenCollectorSampleId + submissionId: submissionId + args: + order: [geoLocCountry, IDENTIFIER, sampleCollectionDate] + type: [string, IDENTIFIER, string] + # regex pattern constraints: + # - Cannot start with a slash + # - Four fields, separated by exactly three slashes + # - Last field is a date in format YYYY, YYYY-MM, or YYYY-MM-DD + # - Identifier is the second to last field (extracted through named capture group) + regex_pattern: | + ^[^\/][^/]*/[^/]+/(?P<identifier>[^/]+)/\d{4}(?:-\d{2}){0,2}$ - name: lineage isSequenceFilter: true header: "Lineage" diff --git a/preprocessing/nextclade/src/loculus_preprocessing/backend.py b/preprocessing/nextclade/src/loculus_preprocessing/backend.py index cdc6fe3095..afd6687120 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/backend.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/backend.py @@ -97,6 +97,7 @@ def parse_ndjson(ndjson_data: str) -> Sequence[UnprocessedEntry]: submitter=json_object["submitter"], group_id=json_object["groupId"], submittedAt=json_object["submittedAt"], + submissionId=json_object["submissionId"], metadata=json_object["data"]["metadata"], unalignedNucleotideSequences=trimmed_unaligned_nucleotide_sequences if unaligned_nucleotide_sequences diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py 
b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py index 9fc88bcb8d..a3f2129652 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py @@ -79,6 +79,7 @@ class UnprocessedData: submitter: str group_id: int submittedAt: str # timestamp # noqa: N815 + submissionId: str # noqa: N815 metadata: InputMetadata unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None] # noqa: N815 diff --git a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py index 2ddc171c61..1b9f7a241e 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/nextclade.py @@ -787,6 +787,7 @@ def enrich_with_nextclade( # noqa: C901, PLR0914 **entry.data.metadata, "submitter": entry.data.submitter, "submittedAt": entry.data.submittedAt, + "submissionId": entry.data.submissionId, "group_id": entry.data.group_id, } for entry in unprocessed diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 85b4fc3449..3e0201a21b 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -697,7 +697,7 @@ def add_errors(): n_inputs = len(input_data.keys()) # exclude ACCESSION_VERSION as it's provided by _call_preprocessing_function() and should not be an input_metadata field n_expected = len([i for i in order if i != "ACCESSION_VERSION"]) - if n_inputs != n_expected: + if n_inputs < n_expected: logger.error( f"Concatenate: Expected {n_expected} fields, got {n_inputs}. " f"This is probably a configuration error. 
@staticmethod
def build_display_name(
    input_data: InputMetadata,
    output_field: str,
    input_fields: list[str],
    args: FunctionArgs,
) -> ProcessingResult:
    """Build a display name by concatenating metadata fields around an identifier.

    The identifier is taken from ``specimenCollectorSampleId`` or, when that is
    unset/empty, from ``submissionId``. The actual joining is delegated to
    ``ProcessingFunctions.concatenate()``; this wrapper only decides what the
    ``IDENTIFIER`` placeholder resolves to.

    Identifier resolution:
    - if ``args["is_insdc_ingest_group"]`` is truthy, the identifier is used as is,
      unless it contains a space or a slash
    - otherwise an identifier containing ``/`` is parsed with ``regex_pattern``
      (named capture group ``identifier``); any other identifier is used as is
    - whenever no usable identifier remains, ``ACCESSION_VERSION`` takes the place
      of ``IDENTIFIER`` in ``order`` and ``type``

    Args:
        input_data: Metadata of the entry; must contain a non-None ``submissionId``.
            It is not modified.
        output_field: Name of the field being produced (used in annotations).
        input_fields: Names of the input fields (used in annotations).
        args: Must provide ``order`` and ``type`` (lists of equal length, ``order``
            containing ``IDENTIFIER``), ``ACCESSION_VERSION`` and
            ``is_insdc_ingest_group``. Optional: ``regex_pattern`` and
            ``fallback_value`` (defaults to ``"unknown"``).

    Returns:
        The ``ProcessingResult`` of ``concatenate()``, with this function's own
        warnings placed in front of the ones from ``concatenate()``. Configuration
        problems yield a result with ``datum=None`` and a single error annotation.
    """

    def annotation(message: str) -> ProcessingAnnotation:
        # Every annotation emitted here refers to the same input/output fields.
        return ProcessingAnnotation.from_fields(
            input_fields,
            [output_field],
            AnnotationSourceType.METADATA,
            message=message,
        )

    def config_error(message: str) -> ProcessingResult:
        return ProcessingResult(datum=None, warnings=[], errors=[annotation(message)])

    def replace_identifier(values, replacement):
        return [replacement if v == "IDENTIFIER" else v for v in values]

    collector_id = input_data.get("specimenCollectorSampleId")
    submission_id = input_data.get("submissionId")
    if submission_id is None:
        return config_error(
            "Internal Error: 'submissionId' must not be None for build_display_name(). Please contact the administrator."
        )

    order = args.get("order")
    field_types = args.get("type")
    if (
        not isinstance(order, list)
        or not isinstance(field_types, list)
        or len(order) != len(field_types)
        or "IDENTIFIER" not in order
    ):
        return config_error(
            "Internal Error: 'order' and 'type' must be lists of equal length, and 'order' must contain IDENTIFIER - this is required for build_display_name to function. Please contact the administrator."
        )

    regex_pattern = args.get("regex_pattern")
    if regex_pattern is not None:
        # A YAML block scalar (`|`) keeps a trailing newline; left in place it would
        # become part of the pattern and could never match a newline-free identifier.
        regex_pattern = str(regex_pattern).rstrip("\n")
        try:
            compiled_pattern = re.compile(regex_pattern)
        except re.error as err:
            # Report a malformed pattern like any other configuration problem
            # instead of letting the exception escape.
            return config_error(
                f"Internal Error: 'regex_pattern' is not a valid regular expression ({err}). Please contact the administrator."
            )
        if "identifier" not in compiled_pattern.groupindex:
            return config_error(
                "Internal Error: if provided, 'regex_pattern' must contain a named capture group called 'identifier'"
            )

    warnings: list[ProcessingAnnotation] = []
    identifier: ProcessedMetadataValue = collector_id or submission_id
    if not isinstance(identifier, str):
        identifier = None
    elif args["is_insdc_ingest_group"]:
        # For INSDC ingested sequences: use the ID as is unless it contains ' ' or '/'.
        # If it does: fall back to ACCESSION_VERSION (no warning is emitted).
        if " " in identifier or "/" in identifier:
            identifier = None
    elif "/" in identifier:
        # For direct submissions with "/": try to extract the ID field using regex
        if regex_pattern is None:
            identifier = None
            warnings.append(
                annotation("identifier string contained '/' but no regex_pattern was provided")
            )
        else:
            extract_result = ProcessingFunctions.extract_regex(
                input_data={"regex_field": identifier},
                output_field="IDENTIFIER",
                input_fields=[],
                args={"pattern": regex_pattern, "capture_group": "identifier"},
            )
            # Only the extracted value is used; annotations produced by
            # extract_regex() are replaced by the single warning below.
            identifier = extract_result.datum
            if identifier is None:
                # regex extraction of ID field failed, fall back to ACCESSION_VERSION
                warnings.append(
                    annotation("identifier string could not be parsed using provided regex_pattern")
                )

    if identifier is None:
        # Use ACCESSION_VERSION instead of IDENTIFIER
        concatenate_input = input_data
        concatenate_order = replace_identifier(order, "ACCESSION_VERSION")
        concatenate_field_types = replace_identifier(field_types, "ACCESSION_VERSION")
    else:
        # Keep IDENTIFIER but treat it as string. The value is injected into a copy
        # so the caller's metadata dict is left untouched.
        concatenate_input = {**input_data, "IDENTIFIER": str(identifier)}
        concatenate_order = list(order)
        concatenate_field_types = replace_identifier(field_types, "string")

    concat_result = ProcessingFunctions.concatenate(
        concatenate_input,
        output_field,
        input_fields,
        {
            "order": concatenate_order,
            "type": concatenate_field_types,
            "fallback_value": args.get("fallback_value", "unknown"),
            "ACCESSION_VERSION": args["ACCESSION_VERSION"],
        },
    )

    return ProcessingResult(
        datum=concat_result.datum,
        warnings=warnings + concat_result.warnings,
        errors=concat_result.errors,
    )
def test_display_name_construction() -> None:
    """Check identifier selection, regex extraction and fallbacks of build_display_name().

    Every scenario is run twice: once as a direct submission and once as an INSDC
    ingest (``is_insdc_ingest_group=True``), because the two paths treat
    identifiers containing slashes differently.
    """
    # The named group 'identifier' is mandatory: build_display_name() rejects
    # patterns without it and extracts the ID through it.
    identifier_pattern = r"^[^\/][^/]*/[^/]+/(?P<identifier>[^/]+)/\d{4}(?:-\d{2}){0,2}$"
    parse_warning = "identifier string could not be parsed using provided regex_pattern"

    submission_id = "mySample"
    submission_id_formatted = "hDENV1/Germany/myExtractedSample/2025"
    submission_id_formatted_unexpected = "hDENV1/myExtractedSample/2025"
    input_data: InputMetadata = {
        "nextclade.clade": "DENV-1",
        "geoLocCountry": "Switzerland",
        "sampleCollectionDate": "2025",
        "submissionId": submission_id,
    }
    output_field: str = "displayName"

    def input_fields():
        return [
            "nextclade.clade",
            "geoLocCountry",
            "specimenCollectorSampleId",
            "submissionId",
            "sampleCollectionDate",
        ]

    def make_args(is_insdc: bool, **extra):
        # Fresh dict per call so no state leaks between invocations.
        return {
            **extra,
            "ACCESSION_VERSION": "version.1",
            "is_insdc_ingest_group": is_insdc,
            "order": ["nextclade.clade", "geoLocCountry", "IDENTIFIER", "sampleCollectionDate"],
            "type": ["string", "string", "IDENTIFIER", "string"],
            "regex_pattern": identifier_pattern,
        }

    def build_both(**extra):
        # Returns (direct submission result, INSDC ingest result) for the current input_data.
        direct = ProcessingFunctions.build_display_name(
            input_data, output_field, input_fields(), make_args(False, **extra)
        )
        insdc = ProcessingFunctions.build_display_name(
            input_data, output_field, input_fields(), make_args(True, **extra)
        )
        return direct, insdc

    # No specimenCollectorSampleId: submissionId is the identifier.
    res, res_insdc = build_both()
    assert res.datum == "DENV-1/Switzerland/mySample/2025"
    assert res_insdc.datum == "DENV-1/Switzerland/mySample/2025"

    # specimenCollectorSampleId takes precedence over submissionId.
    input_data["specimenCollectorSampleId"] = "myCollectorSample"
    res, res_insdc = build_both()
    assert res.datum == "DENV-1/Switzerland/myCollectorSample/2025"
    assert res_insdc.datum == "DENV-1/Switzerland/myCollectorSample/2025"

    # Well-formed slash-separated ID: parsed for direct submissions,
    # replaced by the accession version for INSDC ingests.
    input_data["specimenCollectorSampleId"] = submission_id_formatted
    res, res_insdc = build_both()
    assert res.datum == "DENV-1/Switzerland/myExtractedSample/2025"
    assert res_insdc.datum == "DENV-1/Switzerland/version.1/2025"

    # Slash-separated ID that does not match the pattern: both fall back.
    input_data["specimenCollectorSampleId"] = submission_id_formatted_unexpected
    res, res_insdc = build_both()
    assert res.datum == "DENV-1/Switzerland/version.1/2025"
    assert res_insdc.datum == "DENV-1/Switzerland/version.1/2025"

    # Empty metadata field: default fallback value, warning only on the regex path.
    input_data["specimenCollectorSampleId"] = submission_id_formatted_unexpected
    input_data["geoLocCountry"] = ""
    res, res_insdc = build_both()
    assert res.datum == "DENV-1/unknown/version.1/2025"
    assert len(res.warnings) == 1
    assert res.warnings[0].message == parse_warning
    assert res_insdc.datum == "DENV-1/unknown/version.1/2025"
    assert len(res_insdc.warnings) == 0

    # Explicit fallback value is passed through to concatenate().
    input_data["specimenCollectorSampleId"] = submission_id_formatted_unexpected
    res, res_insdc = build_both(fallback_value="another_fallback")
    assert res.datum == "DENV-1/another_fallback/version.1/2025"
    assert len(res.warnings) == 1
    assert res.warnings[0].message == parse_warning
    assert res_insdc.datum == "DENV-1/another_fallback/version.1/2025"
    assert len(res_insdc.warnings) == 0
Schema): InputField[] { displayName: 'ID', definition: 'FASTA ID', guidance: - "Your sequence identifier; should match the sequence's id in the FASTA file - this is used to link the metadata to the FASTA sequence.", + "Your sequence identifier; should match the sequence's id in the FASTA file - this is used to link the metadata to the FASTA sequence. Depending on its format, this ID may be parsed and used in the display name of this sequence on the website (only if configured for this organism).", example: 'GJP123', noEdit: true, required: true, @@ -189,7 +189,7 @@ export function getSubmissionIdInputFields(schema: Schema): InputField[] { displayName: 'ID', definition: 'METADATA ID', guidance: - 'Your sample identifier. If FASTA IDS column is provided, this sample ID will be used to associate the metadata with the sequence.', + "Your sample identifier. If no FASTA IDs are provided in the FASTA IDS column, this sample ID will be used as a FASTA ID to associate the metadata with the sequence. Depending on its format, this ID may be parsed and used in the display name of this sequence on the website (only if configured for this organism).", example: 'GJP123', noEdit: true, required: true,