import gbif_eml

nicokant · nicokant · commit 2682ca94f24f · 2025-10-29T11:16:59.000+01:00
diff --git a/pygeometa/schemas/__init__.py b/pygeometa/schemas/__init__.py
@@ -63,7 +63,8 @@
     'wmo-cmp': 'pygeometa.schemas.wmo_cmp.WMOCMPOutputSchema',
     'wmo-wcmp2': 'pygeometa.schemas.wmo_wcmp2.WMOWCMP2OutputSchema',
     'wmo-wigos': 'pygeometa.schemas.wmo_wigos.WMOWIGOSOutputSchema',
-    'cwl': 'pygeometa.schemas.cwl.CWLOutputSchema'
+    'cwl': 'pygeometa.schemas.cwl.CWLOutputSchema',
+    'gbif-eml': 'pygeometa.schemas.gbif_eml.GBIF_EMLOutputSchema'
 }
 
 
diff --git a/pygeometa/schemas/gbif_eml/__init__.py b/pygeometa/schemas/gbif_eml/__init__.py
@@ -0,0 +1,192 @@
+import re
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+from pygeometa.schemas.base import BaseOutputSchema
+
+THISDIR = Path(__file__).parent
+
+
+def text_or_null(node, strip=False):
+    """Return the text of ``node``, or ``None`` when ``node`` is falsy.
+
+    :param node: object exposing a ``text`` attribute, or a falsy value
+    :param strip: when true, strip surrounding whitespace from the text
+
+    :returns: the node text, or ``None``
+    """
+
+    if node:
+        content = node.text
+        return content.strip() if strip else content
+
+    return None
+
+
+def text_or_empty(node, strip=False):
+    """Return the text of ``node``, or ``""`` when ``node`` is falsy.
+
+    :param node: object exposing a ``text`` attribute, or a falsy value
+    :param strip: when true, strip surrounding whitespace from the text
+
+    :returns: the node text, or an empty string
+    """
+
+    if node:
+        content = node.text
+        return content.strip() if strip else content
+
+    return ""
+
+
+def scrub_dict(d):
+    """Recursively drop dictionary entries whose value is ``None``.
+
+    Only dictionaries are descended into; any other value (lists
+    included) is returned unchanged.
+
+    :param d: value to clean, typically a nested ``dict``
+
+    :returns: a new ``dict`` without ``None`` values, or ``d`` itself
+              when it is not a ``dict``
+    """
+
+    if not isinstance(d, dict):
+        return d
+
+    # recurse exactly once per value: scrubbing a non-None value can
+    # never produce None, so no second check of the result is needed
+    return {k: scrub_dict(v) for k, v in d.items() if v is not None}
+
+
+def to_contact_role(node, role, mapped_role=None):
+    """Yield MCF contact entries for every ``role`` element under ``node``.
+
+    :param node: parsed element that is searched with ``find_all``
+    :param role: name of the element to look for (e.g. ``creator``)
+    :param mapped_role: key to use for the entries; defaults to ``role``
+
+    :returns: generator of ``(key, contact_dict)`` tuples; the first
+              match is keyed ``mapped_role`` and every later one
+              ``mapped_role_<idx>``
+    """
+
+    if not mapped_role:
+        mapped_role = role
+
+    for idx, contact in enumerate(node.find_all(role)):
+        # join only the name parts that are present, so a contact without
+        # an individual name is not reported as ", "
+        name_parts = (
+            text_or_empty(contact.find("surName")),
+            text_or_empty(contact.find("givenName")),
+        )
+        name = ", ".join(part for part in name_parts if part)
+        org = text_or_empty(contact.find("organizationName"))
+        yield (
+            mapped_role + (f"_{idx}" if idx else ""),
+            {
+                "organization": org,
+                "individualname": name,
+                # fall back to the <role> child when no position is given
+                "positionname": text_or_empty(contact.find("positionName"))
+                or text_or_empty(contact.find("role")),
+                "phone": "",
+                "url": "",
+                "fax": "",
+                "address": "",
+                "city": "",
+                "administrativearea": "",
+                "postalcode": "",
+                "country": text_or_empty(contact.find("country")),
+                "email": text_or_empty(contact.find("electronicMailAddress")),
+            },
+        )
+
+
+class GBIF_EMLOutputSchema(BaseOutputSchema):
+    """Output schema for EML documents following the GBIF profile."""
+
+    def __init__(self):
+        """Initialize the base output schema for ``gbif-eml``."""
+
+        super().__init__("gbif-eml", "EML - GBIF profile", "xml", THISDIR)
+
+    def import_(self, metadata):
+        """Build an MCF dictionary from a GBIF EML document.
+
+        :param metadata: EML document content, parsed with BeautifulSoup
+
+        :returns: MCF ``dict`` with ``None`` values removed
+        """
+
+        soup = BeautifulSoup(metadata, features="lxml-xml")
+        dataset = soup.find("dataset")
+        mcf = {
+            "mcf": {
+                "version": 1,
+            },
+            "metadata": {
+                "charset": "utf8",
+                "hierarchylevel": "dataset",
+                "datestamp": "$datetime$",
+            },
+            "identification": {},
+            "contact": {},
+            "distribution": {},
+        }
+
+        # when several alternateIdentifier elements exist, the last one
+        # is kept as the metadata identifier
+        for identifier in dataset.find_all("alternateIdentifier"):
+            mcf["metadata"]["identifier"] = text_or_null(identifier)
+
+        if language := dataset.find("language"):
+            mcf["metadata"]["language"] = text_or_null(language)
+
+        idf = mcf["identification"]
+
+        idf["title"] = text_or_null(dataset.find("title"))
+        idf["abstract"] = text_or_null(dataset.find("abstract"))
+
+        if intellectual_rights := dataset.find("intellectualRights"):
+            ulink = intellectual_rights.find("ulink")
+            idf["rights"] = {
+                "name": text_or_null(intellectual_rights.find("citetitle")),
+                # .get() tolerates a <ulink> without a url attribute
+                "url": ulink.get("url") if ulink else None,
+            }
+
+        # the first alternateIdentifier doubles as the resource URL
+        idf["url"] = text_or_null(dataset.find("alternateIdentifier"))
+        idf["status"] = "completed"
+
+        # TODO: import the maintenance description
+
+        idf["maintenancefrequency"] = (
+            text_or_null(dataset.find("maintenanceUpdateFrequency")) or
+            "unknown"
+        )
+
+        idf["dates"] = {"publication": text_or_null(dataset.find("pubDate"))}
+        idf["extents"] = {}
+
+        if coords := dataset.find("boundingCoordinates"):
+            idf["extents"]["spatial"] = [{}]
+            spatial = idf["extents"]["spatial"][0]
+
+            # stored as [west, south, east, north]
+            spatial["bbox"] = [
+                float(coords.find("westBoundingCoordinate").text),
+                float(coords.find("southBoundingCoordinate").text),
+                float(coords.find("eastBoundingCoordinate").text),
+                float(coords.find("northBoundingCoordinate").text),
+            ]
+
+            spatial["crs"] = "4326"
+            spatial["description"] = \
+                text_or_null(dataset.find("geographicDescription"))
+
+        # TODO: import the temporal extent (begin, end, resolution)
+
+        idf["keywords"] = {}
+
+        ct = mcf["contact"]
+
+        for r, obj in to_contact_role(dataset, "contact", "pointOfContact"):
+            ct[r] = obj
+
+        for r, obj in to_contact_role(dataset,
+                                      "metadataProvider",
+                                      "distributor"):
+            ct[r] = obj
+
+        for r, obj in to_contact_role(dataset, "creator"):
+            ct[r] = obj
+
+        for r, obj in to_contact_role(dataset,
+                                      "personnel",
+                                      "projectPersonnel"):
+            ct[r] = obj
+
+        for idx, keyword_set in enumerate(dataset.find_all("keywordSet")):
+            thesaurus = text_or_null(keyword_set.find("keywordThesaurus"))
+            # a keywordSet may lack a thesaurus; search an empty string
+            # instead of handing None to re.search (TypeError)
+            match = re.search(r"(?P<url>https?://[^\s]+)", thesaurus or "")
+            definition = match.group("url") if match else None
+
+            idf["keywords"][f"default-{idx}"] = {
+                "keywords": [
+                    text_or_null(kw) for kw in keyword_set.find_all("keyword")
+                ],
+                "vocabulary": {"name": thesaurus, "url": definition},
+            }
+
+        mcf["spatial"] = {"datatype": "vector", "geomtype": "composite"}
+
+        mcf["distribution"] = {
+            "file": {
+                "url": idf["url"],
+                "type": "WWW:LINK",
+                "function": "information",
+                "description": "",
+                "name": "Darwin Core Archive",
+            }
+        }
+
+        # drop every entry that could not be filled from the document
+        return scrub_dict(mcf)
diff --git a/pygeometa/schemas/gbif_eml/main.j2 b/pygeometa/schemas/gbif_eml/main.j2
@@ -0,0 +1,150 @@
+<eml:eml xmlns:eml="eml://ecoinformatics.org/eml-2.1.1" xmlns:dc="http://purl.org/dc/terms/"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.1/eml.xsd"
+    packageId="{{ record['metadata']['dataseturi'] }}" system="http://gbif.org"
+    scope="system" xml:lang="{{ record['identification']['language'] }}">
+
+    <dataset>
+        <alternateIdentifier>{{ record['identification']['doi' ]}}</alternateIdentifier>
+        <title xml:lang="{{ record['identification']['language'] }}">{{ record['identification']['title'] }}</title>
+        {#
+        <!--creator>
+            {% include 'person.j2' %}
+        </creator>
+        <creator>
+            {% include 'person.j2' %}
+        </creator>
+        <metadataProvider>
+            {% include 'person.j2' %}
+        </metadataProvider-->
+        #}
+        <pubDate>
+            {{ record['identification']['dates']['publication'] }}
+        </pubDate>
+        <language>{{ record['identification']['language'] }}</language>
+        <abstract>
+            <para>{{ record['identification']['abstract'] }}</para>
+        </abstract>
+        {% for group, keywords in record['identification']['keywords'].items() %}
+        <keywordSet>
+            {% for kw in keywords['keywords'] %}
+            <keyword>{{ kw }}</keyword>
+            {% endfor %}
+            <keywordThesaurus>{{ keywords['vocabulary']['name'] }}: {{ keywords['vocabulary']['url'] }}</keywordThesaurus>
+        </keywordSet>
+        {% endfor%}
+        <intellectualRights>
+            <para>This work is licensed under a <ulink url="{{ record['identification']['rights']['url'] }}">
+                    <citetitle>{{ record['identification']['rights']['name'] }}</citetitle>
+                </ulink>.</para>
+        </intellectualRights>
+        <distribution scope="document">
+            <online>
+                {% for key, value in record['distribution'].items() %}
+                <url function="{{ value['function'] }}">{{ value['url' ]}}</url>
+                {% endfor %}
+            </online>
+        </distribution>
+        {% set extents = record['identification']['extents'] %}
+        {% set bbox = extents['spatial'][0]['bbox'] %}
+        <coverage>
+            <geographicCoverage>
+                <geographicDescription>{{ extents['spatial'][0]['description'] }}</geographicDescription>
+                {# bbox is stored as [west, south, east, north] (see import_); elements are emitted in W, E, N, S order #}
+                <boundingCoordinates>
+                    <westBoundingCoordinate>{{ bbox[0] }}</westBoundingCoordinate>
+                    <eastBoundingCoordinate>{{ bbox[2] }}</eastBoundingCoordinate>
+                    <northBoundingCoordinate>{{ bbox[3] }}</northBoundingCoordinate>
+                    <southBoundingCoordinate>{{ bbox[1] }}</southBoundingCoordinate>
+                </boundingCoordinates>
+            </geographicCoverage>
+            {% if 'temporal' in extents %}
+            <temporalCoverage>
+                <rangeOfDates>
+                    <beginDate>
+                        <calendarDate>{{ extents['temporal'][0]['begin'] }}</calendarDate>
+                    </beginDate>
+                    {% if extents['temporal'][0]['end'] %}
+                    <endDate>
+                        <calendarDate>{{ extents['temporal'][0]['end'] }}</calendarDate>
+                    </endDate>
+                    {% endif %}
+                </rangeOfDates>
+            </temporalCoverage>
+            {% endif %}
+            <taxonomicCoverage>
+                <generalTaxonomicCoverage>
+                </generalTaxonomicCoverage>
+                <taxonomicClassification>
+                    <taxonRankName></taxonRankName>
+                    <taxonRankValue></taxonRankValue>
+                    <commonName></commonName>
+                </taxonomicClassification>
+            </taxonomicCoverage>
+        </coverage>
+        <maintenance>
+            <description>
+                <para />
+            </description>
+            <maintenanceUpdateFrequency></maintenanceUpdateFrequency>
+        </maintenance>
+
+        {#
+        <contact>
+            {% include 'person.j2' %}
+        </contact>
+        <methods>
+            <methodStep>
+                <description>
+                    <para></para>
+                </description>
+            </methodStep>
+            <sampling>
+                <studyExtent>
+                    <description>
+                        <para>
+                        </para>
+                    </description>
+                </studyExtent>
+                <samplingDescription>
+                    <para></para>
+                </samplingDescription>
+            </sampling>
+            <qualityControl>
+                <description>
+                    <para></para>
+                </description>
+            </qualityControl>
+        </methods>
+        <project>
+            <title></title>
+            <personnel>
+                {% include 'person.j2' %}
+                <role />
+            </personnel>
+            <abstract>
+                <para></para>
+            </abstract>
+            <funding>
+                <para>Artsdatabanken</para>
+            </funding>
+            <studyAreaDescription>
+                <descriptor name="generic" citableClassificationSystem="false">
+                    <descriptorValue></descriptorValue>
+                </descriptor>
+            </studyAreaDescription>
+        </project>
+         #}
+    </dataset>
+    {#
+    <additionalMetadata>
+        <metadata>
+            <gbif>
+                <dateStamp></dateStamp>
+                <hierarchyLevel>dataset</hierarchyLevel>
+                <citation></citation>
+                <resourceLogoUrl></resourceLogoUrl>
+            </gbif>
+        </metadata>
+    </additionalMetadata>
+    #}
+</eml:eml>
diff --git a/pygeometa/schemas/gbif_eml/person.j2 b/pygeometa/schemas/gbif_eml/person.j2
@@ -0,0 +1,11 @@
+<individualName>
+    <givenName>{{ first_name }}</givenName>
+    <surName>{{ last_name }}</surName>
+</individualName>
+<organizationName>{{ org_name }}</organizationName>
+<positionName>{{ position }}</positionName>
+<address>
+    <country>{{ country }}</country>
+</address>
+<electronicMailAddress>{{ email }}</electronicMailAddress>
+{% if  orcid %}<userId directory="http://orcid.org/">{{ orcid }}</userId>{% endif %}
diff --git a/requirements.txt b/requirements.txt
@@ -4,3 +4,4 @@ jsonschema
 lxml
 OWSLib
 pyyaml
+beautifulsoup4
diff --git a/tests/eml.xml b/tests/eml.xml
diff --git a/tests/run_tests.py b/tests/run_tests.py

Original file line number	Diff line number	Diff line change
`@@ -63,7 +63,8 @@`
`63`	`63`	`'wmo-cmp': 'pygeometa.schemas.wmo_cmp.WMOCMPOutputSchema',`
`64`	`64`	`'wmo-wcmp2': 'pygeometa.schemas.wmo_wcmp2.WMOWCMP2OutputSchema',`
`65`	`65`	`'wmo-wigos': 'pygeometa.schemas.wmo_wigos.WMOWIGOSOutputSchema',`
`66`		`- 'cwl': 'pygeometa.schemas.cwl.CWLOutputSchema'`
	`66`	`+ 'cwl': 'pygeometa.schemas.cwl.CWLOutputSchema',`
	`67`	`+ 'gbif-eml': 'pygeometa.schemas.gbif_eml.GBIF_EMLOutputSchema'`
`67`	`68`	`}`
`68`	`69`
`69`	`70`
-Original file line number
+Diff line change
 lxml
 OWSLib
 pyyaml
 +beautifulsoup4