Skip to content

Commit 2682ca9

Browse files
committed
import gbif_eml
1 parent ecd14dc commit 2682ca9

File tree

7 files changed

+604
-4
lines changed

7 files changed

+604
-4
lines changed

pygeometa/schemas/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,8 @@
6363
'wmo-cmp': 'pygeometa.schemas.wmo_cmp.WMOCMPOutputSchema',
6464
'wmo-wcmp2': 'pygeometa.schemas.wmo_wcmp2.WMOWCMP2OutputSchema',
6565
'wmo-wigos': 'pygeometa.schemas.wmo_wigos.WMOWIGOSOutputSchema',
66-
'cwl': 'pygeometa.schemas.cwl.CWLOutputSchema'
66+
'cwl': 'pygeometa.schemas.cwl.CWLOutputSchema',
67+
'gbif-eml': 'pygeometa.schemas.gbif_eml.GBIF_EMLOutputSchema'
6768
}
6869

6970

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
import re
2+
from pathlib import Path
3+
4+
from bs4 import BeautifulSoup
5+
from pygeometa.schemas.base import BaseOutputSchema
6+
7+
THISDIR = Path(__file__).parent
8+
9+
10+
def text_or_null(node, strip=False):
    """
    Return the text of a node, or `None` when there is no node

    :param node: object exposing a `text` attribute (or a falsy value)
    :param strip: when true, remove leading/trailing whitespace

    :returns: the node text, or `None` if `node` is falsy
    """

    if node:
        content = node.text
        return content.strip() if strip else content

    return None
18+
19+
20+
def text_or_empty(node, strip=False):
    """
    Return the text of a node, or an empty string when there is no node

    :param node: object exposing a `text` attribute (or a falsy value)
    :param strip: when true, remove leading/trailing whitespace

    :returns: the node text, or `""` if `node` is falsy
    """

    if node:
        content = node.text
        return content.strip() if strip else content

    return ""
28+
29+
30+
def scrub_dict(d):
    """
    Recursively drop `None` values from a (possibly nested) dict

    Only dict values are descended into; lists and other containers are
    returned untouched, so `None` items inside a list are kept. Empty
    strings and empty dicts are kept as well.

    :param d: `dict` to clean, or any other value

    :returns: a new `dict` without `None` values, or `d` itself when it
              is not a dict
    """

    if not isinstance(d, dict):
        return d

    cleaned = {}
    for key, value in d.items():
        if value is None:
            continue
        # recurse exactly once per value: scrubbing a non-None value never
        # yields None, so a second call just to test the result would only
        # make the cost exponential in the nesting depth
        cleaned[key] = scrub_dict(value)

    return cleaned
39+
40+
41+
def to_contact_role(node, role, mapped_role=None):
    """
    Generate MCF contact entries for every `role` element found in `node`

    :param node: parsed element searched with `find_all(role)`
    :param role: name of the EML element to look for (e.g. `creator`)
    :param mapped_role: MCF contact key to emit instead of `role`;
                        defaults to `role`

    :returns: generator of `(key, contact)` tuples, where `key` is
              `mapped_role` for the first match and `mapped_role_<n>` for
              later matches, and `contact` is a `dict` of MCF contact
              fields
    """

    if not mapped_role:
        mapped_role = role

    for idx, contact in enumerate(node.find_all(role)):
        # "surName, givenName", but without a dangling ", " separator when
        # one part (or both) is missing from the source document
        name_parts = (
            text_or_empty(contact.find("surName")),
            text_or_empty(contact.find("givenName")),
        )
        name = ", ".join(part for part in name_parts if part)

        # fall back to the <role> child when no <positionName> text exists
        position = (
            text_or_empty(contact.find("positionName"))
            or text_or_empty(contact.find("role"))
        )

        # only the first match keeps the bare key; later ones get a suffix
        key = mapped_role + (f"_{idx}" if idx else "")

        yield (
            key,
            {
                "organization": text_or_empty(
                    contact.find("organizationName")),
                "individualname": name,
                "positionname": position,
                "phone": "",
                "url": "",
                "fax": "",
                "address": "",
                "city": "",
                "administrativearea": "",
                "postalcode": "",
                "country": text_or_empty(contact.find("country")),
                "email": text_or_empty(
                    contact.find("electronicMailAddress")),
            },
        )
67+
68+
69+
class GBIF_EMLOutputSchema(BaseOutputSchema):
    """EML (GBIF profile) schema"""

    def __init__(self):
        """
        Initialize object

        :returns: pygeometa.schemas.base.BaseOutputSchema
        """

        super().__init__("gbif-eml", "EML - GBIF profile", "xml", THISDIR)

    def import_(self, metadata):
        """
        Import EML (GBIF profile) metadata into MCF

        :param metadata: EML XML document, as accepted by `BeautifulSoup`

        :returns: `dict` of MCF content with `None` values removed

        :raises ValueError: if the document contains no `dataset` element
        """

        soup = BeautifulSoup(metadata, features="lxml-xml")
        dataset = soup.find("dataset")
        if dataset is None:
            # fail with a clear message instead of an AttributeError on None
            raise ValueError("EML document has no <dataset> element")

        mcf = {
            "mcf": {
                "version": 1,
            },
            "metadata": {
                "charset": "utf8",
                "hierarchylevel": "dataset",
                "datestamp": "$datetime$",
            },
            "identification": {},
            "contact": {},
            "distribution": {},
        }

        # when several alternateIdentifier elements exist, the last one
        # ends up as the metadata identifier
        for identifier in dataset.find_all("alternateIdentifier"):
            mcf["metadata"]["identifier"] = text_or_null(identifier)

        if language := dataset.find("language"):
            mcf["metadata"]["language"] = text_or_null(language)

        idf = mcf["identification"]

        idf["title"] = text_or_null(dataset.find("title"))
        idf["abstract"] = text_or_null(dataset.find("abstract"))

        if intellectual_rights := dataset.find("intellectualRights"):
            ulink = intellectual_rights.find("ulink")
            idf["rights"] = {
                "name": text_or_null(intellectual_rights.find("citetitle")),
                # .get() tolerates a <ulink> that has no url attribute
                "url": ulink.get("url") if ulink is not None else None,
            }

        # the first alternateIdentifier doubles as the resource URL
        idf["url"] = text_or_null(dataset.find("alternateIdentifier"))
        idf["status"] = "completed"

        # TODO: map maintenance/description once a target field is chosen

        idf["maintenancefrequency"] = (
            text_or_null(dataset.find("maintenanceUpdateFrequency")) or
            "unknown"
        )

        idf["dates"] = {"publication": text_or_null(dataset.find("pubDate"))}
        idf["extents"] = {}

        if coords := dataset.find("boundingCoordinates"):
            spatial = {}
            idf["extents"]["spatial"] = [spatial]

            # bbox order is west, south, east, north
            spatial["bbox"] = [
                float(coords.find("westBoundingCoordinate").text),
                float(coords.find("southBoundingCoordinate").text),
                float(coords.find("eastBoundingCoordinate").text),
                float(coords.find("northBoundingCoordinate").text),
            ]

            spatial["crs"] = "4326"
            spatial["description"] = \
                text_or_null(dataset.find("geographicDescription"))

        # TODO: temporal extent (begin / end / resolution) is not imported

        idf["keywords"] = {}

        ct = mcf["contact"]

        # (EML element, MCF contact key) pairs, processed in this order
        role_mapping = (
            ("contact", "pointOfContact"),
            ("metadataProvider", "distributor"),
            ("creator", None),
            ("personnel", "projectPersonnel"),
        )
        for eml_role, mcf_role in role_mapping:
            for r, obj in to_contact_role(dataset, eml_role, mcf_role):
                ct[r] = obj

        for idx, keyword_set in enumerate(dataset.find_all("keywordSet")):
            thesaurus = text_or_null(keyword_set.find("keywordThesaurus"))
            # keywordThesaurus may be absent; search an empty string then
            # rather than passing None to re.search (TypeError)
            match = re.search(r"(?P<url>https?://[^\s]+)", thesaurus or "")
            definition = match.group("url") if match else None

            idf["keywords"][f"default-{idx}"] = {
                "keywords": [
                    text_or_null(kw) for kw in keyword_set.find_all("keyword")
                ],
                "vocabulary": {"name": thesaurus, "url": definition},
            }

        mcf["spatial"] = {"datatype": "vector", "geomtype": "composite"}

        mcf["distribution"] = {
            "file": {
                "url": idf["url"],
                "type": "WWW:LINK",
                "function": "information",
                "description": "",
                "name": "Darwin Core Archive",
            }
        }

        return scrub_dict(mcf)

pygeometa/schemas/gbif_eml/main.j2

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
<eml:eml xmlns:eml="eml://ecoinformatics.org/eml-2.1.1" xmlns:dc="http://purl.org/dc/terms/"
  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.1/eml.xsd"
  packageId="{{ record['metadata']['dataseturi'] }}" system="http://gbif.org"
  scope="system" xml:lang="{{ record['identification']['language'] }}">

  <dataset>
    <alternateIdentifier>{{ record['identification']['doi'] }}</alternateIdentifier>
    <title xml:lang="{{ record['identification']['language'] }}">{{ record['identification']['title'] }}</title>
    {#
    <!--creator>
      {% include 'person.j2' %}
    </creator>
    <creator>
      {% include 'person.j2' %}
    </creator>
    <metadataProvider>
      {% include 'person.j2' %}
    </metadataProvider-->
    #}
    <pubDate>
      {{ record['identification']['dates']['publication'] }}
    </pubDate>
    <language>{{ record['identification']['language'] }}</language>
    <abstract>
      <para>{{ record['identification']['abstract'] }}</para>
    </abstract>
    {% for group, keywords in record['identification']['keywords'].items() %}
    <keywordSet>
      {% for kw in keywords['keywords'] %}
      <keyword>{{ kw }}</keyword>
      {% endfor %}
      <keywordThesaurus>{{ keywords['vocabulary']['name'] }}: {{ keywords['vocabulary']['url'] }}</keywordThesaurus>
    </keywordSet>
    {% endfor %}
    <intellectualRights>
      <para>This work is licensed under a <ulink url="{{ record['identification']['rights']['url'] }}">
        <citetitle>{{ record['identification']['rights']['name'] }}</citetitle>
      </ulink>.</para>
    </intellectualRights>
    <distribution scope="document">
      <online>
        {% for key, value in record['distribution'].items() %}
        <url function="{{ value['function'] }}">{{ value['url'] }}</url>
        {% endfor %}
      </online>
    </distribution>
    {% set extents = record['identification']['extents'] %}
    {# MCF bbox order is [west, south, east, north] (minx, miny, maxx, maxy) #}
    {% set bbox = extents['spatial'][0]['bbox'] %}
    <coverage>
      <geographicCoverage>
        <geographicDescription>{{ extents['spatial'][0]['description'] }}</geographicDescription>
        <boundingCoordinates>
          <westBoundingCoordinate>{{ bbox[0] }}</westBoundingCoordinate>
          <eastBoundingCoordinate>{{ bbox[2] }}</eastBoundingCoordinate>
          <northBoundingCoordinate>{{ bbox[3] }}</northBoundingCoordinate>
          <southBoundingCoordinate>{{ bbox[1] }}</southBoundingCoordinate>
        </boundingCoordinates>
      </geographicCoverage>
      {% if 'temporal' in extents %}
      <temporalCoverage>
        <rangeOfDates>
          <beginDate>
            <calendarDate>{{ extents['temporal'][0]['begin'] }}</calendarDate>
          </beginDate>
          {% if extents['temporal'][0]['end'] %}
          <endDate>
            <calendarDate>{{ extents['temporal'][0]['end'] }}</calendarDate>
          </endDate>
          {% endif %}
        </rangeOfDates>
      </temporalCoverage>
      {% endif %}
      <taxonomicCoverage>
        <generalTaxonomicCoverage>
        </generalTaxonomicCoverage>
        <taxonomicClassification>
          <taxonRankName></taxonRankName>
          <taxonRankValue></taxonRankValue>
          <commonName></commonName>
        </taxonomicClassification>
      </taxonomicCoverage>
    </coverage>
    <maintenance>
      <description>
        <para />
      </description>
      <maintenanceUpdateFrequency></maintenanceUpdateFrequency>
    </maintenance>

    {#
    <contact>
      {% include 'person.j2' %}
    </contact>
    <methods>
      <methodStep>
        <description>
          <para></para>
        </description>
      </methodStep>
      <sampling>
        <studyExtent>
          <description>
            <para>
            </para>
          </description>
        </studyExtent>
        <samplingDescription>
          <para></para>
        </samplingDescription>
      </sampling>
      <qualityControl>
        <description>
          <para></para>
        </description>
      </qualityControl>
    </methods>
    <project>
      <title></title>
      <personnel>
        {% include 'person.j2' %}
        <role />
      </personnel>
      <abstract>
        <para></para>
      </abstract>
      <funding>
        <para>Artsdatabanken</para>
      </funding>
      <studyAreaDescription>
        <descriptor name="generic" citableClassificationSystem="false">
          <descriptorValue></descriptorValue>
        </descriptor>
      </studyAreaDescription>
    </project>
    #}
  </dataset>
  {#
  <additionalMetadata>
    <metadata>
      <gbif>
        <dateStamp></dateStamp>
        <hierarchyLevel>dataset</hierarchyLevel>
        <citation></citation>
        <resourceLogoUrl></resourceLogoUrl>
      </gbif>
    </metadata>
  </additionalMetadata>
  #}
</eml:eml>
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{#- Person fragment: renders the name, organization, position, country,
    e-mail and (optionally) ORCID elements for one party. Reads the context
    variables first_name, last_name, org_name, position, country, email and
    orcid; the userId element is only written when orcid is truthy. -#}
<individualName>
<givenName>{{ first_name }}</givenName>
<surName>{{ last_name }}</surName>
</individualName>
<organizationName>{{ org_name }}</organizationName>
<positionName>{{ position }}</positionName>
<address>
<country>{{ country }}</country>
</address>
<electronicMailAddress>{{ email }}</electronicMailAddress>
{% if  orcid %}<userId directory="http://orcid.org/">{{ orcid }}</userId>{% endif %}

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ jsonschema
44
lxml
55
OWSLib
66
pyyaml
7+
beautifulsoup4

0 commit comments

Comments
 (0)