Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions kubernetes/loculus/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1469,6 +1469,24 @@ defaultOrganisms:
- name: "Country map"
url: "https://mapoplexus.genomium.org/?url={{[metadata+accessionVersion,accession,version,geoLocAdmin1,geoLocAdmin2,geoLocCity,geoLocCountry,geoLocSite,hostNameCommon,hostNameScientific,authors,sampleCollectionDate]}}"
metadataAdd:
- name: displayName
preprocessing:
function: build_display_name
inputs:
geoLocCountry: geoLocCountry
sampleCollectionDate: sampleCollectionDate
specimenCollectorSampleId: specimenCollectorSampleId
submissionId: submissionId
args:
order: [geoLocCountry, IDENTIFIER, sampleCollectionDate]
type: [string, IDENTIFIER, string]
# regex pattern constraints:
# - Cannot start with a slash
# - Four fields, separated by exactly three slashes
# - Last field is a date in format YYYY, YYYY-MM, or YYYY-MM-DD
# - Identifier is the second to last field (extracted through named capture group)
regex_pattern: |
^[^\/][^/]*/[^/]+/(?P<identifier>[^/]+)/\d{4}(?:-\d{2}){0,2}$
- name: lineage
isSequenceFilter: true
header: "Lineage"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def parse_ndjson(ndjson_data: str) -> Sequence[UnprocessedEntry]:
submitter=json_object["submitter"],
group_id=json_object["groupId"],
submittedAt=json_object["submittedAt"],
submissionId=json_object["submissionId"],
metadata=json_object["data"]["metadata"],
unalignedNucleotideSequences=trimmed_unaligned_nucleotide_sequences
if unaligned_nucleotide_sequences
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ class UnprocessedData:
submitter: str
group_id: int
submittedAt: str # timestamp # noqa: N815
submissionId: str # noqa: N815
metadata: InputMetadata
unalignedNucleotideSequences: dict[SequenceName, NucleotideSequence | None] # noqa: N815

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -787,6 +787,7 @@ def enrich_with_nextclade( # noqa: C901, PLR0914
**entry.data.metadata,
"submitter": entry.data.submitter,
"submittedAt": entry.data.submittedAt,
"submissionId": entry.data.submissionId,
"group_id": entry.data.group_id,
}
for entry in unprocessed
Expand Down
Comment thread
maverbiest marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -697,7 +697,7 @@ def add_errors():
n_inputs = len(input_data.keys())
# exclude ACCESSION_VERSION as it's provided by _call_preprocessing_function() and should not be an input_metadata field
n_expected = len([i for i in order if i != "ACCESSION_VERSION"])
if n_inputs != n_expected:
if n_inputs < n_expected:
logger.error(
f"Concatenate: Expected {n_expected} fields, got {n_inputs}. "
f"This is probably a configuration error. (ACCESSION_VERSION: {accession_version})"
Expand Down Expand Up @@ -1177,6 +1177,163 @@ def is_above_threshold(
)
return ProcessingResult(datum=(input > threshold), warnings=[], errors=[])

@staticmethod
def build_display_name(
    input_data: InputMetadata,
    output_field: str,
    input_fields: list[str],
    args: FunctionArgs,
) -> ProcessingResult:
    """Build a displayName by concatenating metadata fields around an identifier.

    The identifier is taken from specimenCollectorSampleId or - if that is not set -
    from submissionId. The assembled fields are passed on to
    ProcessingFunctions.concatenate(), with the IDENTIFIER placeholder in
    args['order'] / args['type'] resolved as follows:
    - INSDC ingest (args['is_insdc_ingest_group'] is truthy): the identifier is used as is,
      unless it contains a space or a slash, in which case ACCESSION_VERSION is used instead
      (no warning is emitted).
    - direct submissions without '/': the identifier is used as is.
    - direct submissions with '/': the identifier is extracted with args['regex_pattern']
      (named capture group 'identifier'). If no pattern is configured or the extraction
      yields nothing, ACCESSION_VERSION is used instead and a warning is added.
    - identifiers that are not strings are replaced with ACCESSION_VERSION.

    Requirements (violations are returned as an internal error, datum=None):
    - 'submissionId' must be present (not None) in input_data
    - args['order'] and args['type'] must be lists of equal length
    - 'IDENTIFIER' must be in args['order']
    - args['regex_pattern'], if given, must be a valid regular expression containing a
      named capture group called 'identifier'

    The caller's input_data is never modified; the resolved identifier is added to a copy.

    Args:
        input_data: metadata values keyed by field name.
        output_field: name of the field being produced; used in annotations.
        input_fields: names of the source fields; used in annotations.
        args: function configuration. Reads 'order', 'type', 'regex_pattern' (optional),
            'fallback_value' (optional, defaults to 'unknown'), 'is_insdc_ingest_group' and
            'ACCESSION_VERSION'. The last two are read with args[...] and are presumably
            injected by the caller - verify against _call_preprocessing_function().

    Returns:
        ProcessingResult with the datum and errors from concatenate(), and the warnings
        raised here followed by those from concatenate().
    """

    def annotate(message: str) -> ProcessingAnnotation:
        # Every annotation raised here refers to the same input/output fields.
        return ProcessingAnnotation.from_fields(
            input_fields,
            [output_field],
            AnnotationSourceType.METADATA,
            message=message,
        )

    def config_error(message: str) -> ProcessingResult:
        # Configuration problems abort processing of this field with a single error.
        return ProcessingResult(datum=None, warnings=[], errors=[annotate(message)])

    def replace_identifier(values, replacement):
        return [replacement if v == "IDENTIFIER" else v for v in values]

    collector_id = input_data.get("specimenCollectorSampleId", None)
    submission_id = input_data.get("submissionId", None)
    warnings: list[ProcessingAnnotation] = []
    if submission_id is None:
        return config_error(
            "Internal Error: 'submissionId' must not be None for build_display_name(). Please contact the administrator."
        )

    order = args.get("order")
    field_types = args.get("type")
    if (
        not isinstance(order, list)
        or not isinstance(field_types, list)
        or len(order) != len(field_types)
        or "IDENTIFIER" not in order
    ):
        return config_error(
            "Internal Error: 'order' and 'type' must be lists of equal length, and 'order' must contain IDENTIFIER - this is required for build_display_name to function. Please contact the administrator."
        )

    regex_pattern = args.get("regex_pattern")
    if regex_pattern is not None:
        # A YAML literal block scalar (`regex_pattern: |`) keeps a trailing line break.
        # Left in place, a pattern ending in `$` would additionally require a newline
        # after the end of the identifier and so never match a plain ID.
        regex_pattern = str(regex_pattern).rstrip("\r\n")
        try:
            compiled_pattern = re.compile(regex_pattern)
        except re.error as err:
            # Report a broken pattern like the other configuration errors instead of raising.
            return config_error(
                f"Internal Error: 'regex_pattern' is not a valid regular expression ({err}). Please contact the administrator."
            )
        if "identifier" not in compiled_pattern.groupindex:
            return config_error(
                "Internal Error: if provided, 'regex_pattern' must contain a named capture group called 'identifier'"
            )

    identifier: ProcessedMetadataValue = collector_id or submission_id
    if not isinstance(identifier, str):
        identifier = None
    elif args["is_insdc_ingest_group"]:
        # For INSDC ingested sequence: use ID as is unless it contains ' ' or '/'
        # If it does: fall back to ACCESSION_VERSION
        if " " in identifier or "/" in identifier:
            identifier = None
    elif "/" in identifier:
        # For direct submissions with "/": try to extract ID field using regex
        if regex_pattern is None:
            identifier = None
            warnings.append(
                annotate("identifier string contained '/' but no regex_pattern was provided")
            )
        else:
            extract_result = ProcessingFunctions.extract_regex(
                input_data={"regex_field": identifier},
                output_field="IDENTIFIER",
                input_fields=[],
                args={"pattern": regex_pattern, "capture_group": "identifier"},
            )
            identifier = extract_result.datum
            if identifier is None:
                # regex extraction of ID field failed, fall back to ACCESSION_VERSION
                warnings.append(
                    annotate("identifier string could not be parsed using provided regex_pattern")
                )

    if identifier is None:
        # Use ACCESSION_VERSION instead of IDENTIFIER
        concatenate_order = replace_identifier(order, "ACCESSION_VERSION")
        concatenate_field_types = replace_identifier(field_types, "ACCESSION_VERSION")
        concatenate_input = input_data
    else:
        # Keep IDENTIFIER but treat it as string
        concatenate_order = list(order)
        concatenate_field_types = replace_identifier(field_types, "string")
        # Work on a copy so the caller's metadata does not gain a stale IDENTIFIER key.
        concatenate_input = {**input_data, "IDENTIFIER": str(identifier)}

    concat_result = ProcessingFunctions.concatenate(
        concatenate_input,
        output_field,
        input_fields,
        {
            "order": concatenate_order,
            "type": concatenate_field_types,
            "fallback_value": args.get("fallback_value", "unknown"),
            "ACCESSION_VERSION": args["ACCESSION_VERSION"],
        },
    )

    return ProcessingResult(
        datum=concat_result.datum,
        warnings=warnings + concat_result.warnings,
        errors=concat_result.errors,
    )


def single_metadata_annotation(
source_name: str,
Expand Down
1 change: 1 addition & 0 deletions preprocessing/nextclade/tests/factory_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def create_unprocessed_entry(
submittedAt=str(
datetime.strptime("2021-12-15", "%Y-%m-%d").replace(tzinfo=pytz.utc).timestamp()
),
submissionId=metadata_dict.get("submissionId") or "test_submission_id",
group_id=group_id,
metadata=metadata_dict,
unalignedNucleotideSequences=sequences,
Expand Down
109 changes: 109 additions & 0 deletions preprocessing/nextclade/tests/test_metadata_processing_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -709,6 +709,7 @@ def test_preprocessing_without_consensus_sequences(config: Config) -> None:
accessionVersion="LOC_01.1",
data=UnprocessedData(
submitter="test_submitter",
submissionId="test_submission_id",
group_id=2,
submittedAt=ts_from_ymd(2021, 12, 15),
metadata={
Expand Down Expand Up @@ -966,5 +967,113 @@ def test_concatenate() -> None:
assert res_fallback_explicit_null.datum == "0/unknown/version.1/unknown"


def test_display_name_construction() -> None:
    """Exercise build_display_name for direct submissions and INSDC-ingested entries.

    Each scenario is evaluated twice against the same metadata: once with
    is_insdc_ingest_group=False (direct submission) and once with True (INSDC ingest).
    """
    identifier_pattern = r"^[^\/][^/]*/[^/]+/(?P<identifier>[^/]+)/\d{4}(?:-\d{2}){0,2}$"
    parse_failure_message = "identifier string could not be parsed using provided regex_pattern"
    parsable_id = "hDENV1/Germany/myExtractedSample/2025"
    unparsable_id = "hDENV1/myExtractedSample/2025"
    output_field: str = "displayName"
    input_data: InputMetadata = {
        "nextclade.clade": "DENV-1",
        "geoLocCountry": "Switzerland",
        "sampleCollectionDate": "2025",
        "submissionId": "mySample",
    }

    def build(is_insdc, extra_args=None):
        # Fresh argument and field lists for every call, so nothing is shared between runs.
        function_args = {
            **(extra_args or {}),
            "ACCESSION_VERSION": "version.1",
            "is_insdc_ingest_group": is_insdc,
            "order": ["nextclade.clade", "geoLocCountry", "IDENTIFIER", "sampleCollectionDate"],
            "type": ["string", "string", "IDENTIFIER", "string"],
            "regex_pattern": identifier_pattern,
        }
        field_names = [
            "nextclade.clade",
            "geoLocCountry",
            "specimenCollectorSampleId",
            "submissionId",
            "sampleCollectionDate",
        ]
        return ProcessingFunctions.build_display_name(
            input_data, output_field, field_names, function_args
        )

    def build_both(extra_args=None):
        # Direct submission first, INSDC second.
        direct_result = build(False, extra_args)
        insdc_result = build(True, extra_args)
        return direct_result, insdc_result

    def assert_accession_fallback(direct_result, insdc_result, expected_name):
        # Only the direct-submission path warns about an unparsable identifier.
        assert direct_result.datum == expected_name
        assert len(direct_result.warnings) == 1
        assert direct_result.warnings[0].message == parse_failure_message
        assert insdc_result.datum == expected_name
        assert len(insdc_result.warnings) == 0

    # (specimenCollectorSampleId to set or None, expected direct name, expected INSDC name)
    scenarios = [
        # No collector ID yet: submissionId is the identifier.
        (None, "DENV-1/Switzerland/mySample/2025", "DENV-1/Switzerland/mySample/2025"),
        # Collector ID takes precedence over submissionId.
        (
            "myCollectorSample",
            "DENV-1/Switzerland/myCollectorSample/2025",
            "DENV-1/Switzerland/myCollectorSample/2025",
        ),
        # Slash-separated ID matching the pattern: extracted for direct submissions only.
        (
            parsable_id,
            "DENV-1/Switzerland/myExtractedSample/2025",
            "DENV-1/Switzerland/version.1/2025",
        ),
        # Slash-separated ID not matching the pattern: accession version everywhere.
        (
            unparsable_id,
            "DENV-1/Switzerland/version.1/2025",
            "DENV-1/Switzerland/version.1/2025",
        ),
    ]
    for collector_id, expected_direct, expected_insdc in scenarios:
        if collector_id is not None:
            input_data["specimenCollectorSampleId"] = collector_id
        direct, insdc = build_both()
        assert direct.datum == expected_direct
        assert insdc.datum == expected_insdc

    # Empty country: the default fallback value is used for the missing field.
    input_data["specimenCollectorSampleId"] = unparsable_id
    input_data["geoLocCountry"] = ""
    direct, insdc = build_both()
    assert_accession_fallback(direct, insdc, "DENV-1/unknown/version.1/2025")

    # Same situation with an explicitly configured fallback value.
    input_data["specimenCollectorSampleId"] = unparsable_id
    direct, insdc = build_both({"fallback_value": "another_fallback"})
    assert_accession_fallback(direct, insdc, "DENV-1/another_fallback/version.1/2025")


if __name__ == "__main__":
pytest.main()
2 changes: 2 additions & 0 deletions preprocessing/nextclade/tests/test_nextclade_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1198,6 +1198,7 @@ def test_preprocessing_without_metadata() -> None:
data=UnprocessedData(
group_id=2,
submitter="test_submitter",
submissionId="test_submission_id",
submittedAt=ts_from_ymd(2021, 12, 15),
metadata={},
unalignedNucleotideSequences={
Expand Down Expand Up @@ -1315,6 +1316,7 @@ def test_create_flatfile():
submitter="test_submitter",
group_id=2,
submittedAt=ts_from_ymd(2021, 12, 15),
submissionId="test_submission_id",
metadata={
"sampleCollectionDate": "2024-01-01",
"geoLocCountry": "Netherlands",
Expand Down
4 changes: 2 additions & 2 deletions website/src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ export function getSubmissionIdInputFields(schema: Schema): InputField[] {
displayName: 'ID',
definition: 'FASTA ID',
guidance:
"Your sequence identifier; should match the sequence's id in the FASTA file - this is used to link the metadata to the FASTA sequence.",
"Your sequence identifier; should match the sequence's id in the FASTA file - this is used to link the metadata to the FASTA sequence. Depending on its format, this ID may be parsed and used in the display name of this sequence on the website (only if configured for this organism).",
example: 'GJP123',
noEdit: true,
required: true,
Expand All @@ -189,7 +189,7 @@ export function getSubmissionIdInputFields(schema: Schema): InputField[] {
displayName: 'ID',
definition: 'METADATA ID',
guidance:
'Your sample identifier. If FASTA IDS column is provided, this sample ID will be used to associate the metadata with the sequence.',
"Your sample identifier. If no FASTA IDs are provided in the FASTA IDS column, this sample ID will be used as a FASTA ID to associate the metadata with the sequence. Depending on its format, this ID may be parsed and used in the display name of this sequence on the website (only if configured for this organism).",
example: 'GJP123',
noEdit: true,
required: true,
Expand Down
Loading