Skip to content

Commit

Permalink
DB driver independent metadata fields implementation
Browse files Browse the repository at this point in the history
Allows extraction of search fields from dataset metadata documents without
depending on a specific DB driver.
  • Loading branch information
Kirill888 authored and mergify[bot] committed Aug 15, 2018
1 parent 057413e commit bdcccc2
Show file tree
Hide file tree
Showing 3 changed files with 309 additions and 0 deletions.
10 changes: 10 additions & 0 deletions datacube/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,3 +739,13 @@ def __str__(self):

def __repr__(self):
return self.__str__()


def metadata_from_doc(doc):
    """Construct a ``MetadataType`` that is not tied to any particular db index.

    Useful when dataset metadata documents need to be interpreted according
    to a metadata spec without going through a database driver.

    :param doc: metadata-type definition document (dict)
    :return: MetadataType whose dataset search fields are db-independent
    """
    # Local import avoids a circular dependency between model and fields.
    from .fields import get_dataset_fields
    MetadataType.validate(doc)
    dataset_search_fields = get_dataset_fields(doc)
    return MetadataType(doc, dataset_search_fields)
116 changes: 116 additions & 0 deletions datacube/model/fields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
"""Non-db specific implementation of metadata search fields.
This allows extraction of fields of interest from dataset metadata document.
"""
import toolz
import decimal
from datacube.utils import parse_time
from datacube.model import Range


class SimpleField(object):
    """A single-valued search field backed by one offset into a document.

    ``offset`` is a list of keys (a path) into the dataset metadata document;
    the raw value found there is passed through ``converter``.
    """

    def __init__(self, offset, converter, type_name, name='', description=''):
        self.name = name
        self.description = description
        self.type_name = type_name
        self._offset = offset
        self._converter = converter

    def extract(self, doc):
        """Return the converted value at this field's offset, or None if absent."""
        raw = toolz.get_in(self._offset, doc, default=None)
        return None if raw is None else self._converter(raw)


class RangeField(object):
    """A min/max range search field aggregated over several document offsets.

    ``min_offset`` and ``max_offset`` are each a list of paths; the minimum
    (resp. maximum) of the converted values found at those paths forms the
    returned :class:`Range`.
    """

    def __init__(self, min_offset, max_offset, base_converter, type_name,
                 name='', description=''):
        self.name = name
        self.description = description
        self.type_name = type_name
        self._converter = base_converter
        self._min_offset = min_offset
        self._max_offset = max_offset

    def extract(self, doc):
        """Return Range(min, max) from doc, or None when no value is present.

        Either end of the range may be None when only the other end has data.
        """
        def converted_values(paths):
            raw = (toolz.get_in(p, doc, default=None) for p in paths)
            return [self._converter(v) for v in raw if v is not None]

        lows = converted_values(self._min_offset)
        highs = converted_values(self._max_offset)

        low = min(lows) if lows else None
        high = max(highs) if highs else None

        if low is None and high is None:
            return None

        return Range(low, high)


def parse_search_field(doc, name=''):
    """Build a search field object from its definition document.

    Scalar types produce a :class:`SimpleField`; ``<type>-range`` types
    produce a :class:`RangeField`. When no ``type`` is given, ``string``
    is assumed.

    :param doc: search field definition (dict with type/offset/... keys)
    :param name: name to record on the constructed field
    :raises ValueError: on unknown types or missing offsets
    """
    parsers = {
        'string': str,
        'double': float,
        'integer': int,
        'numeric': decimal.Decimal,
        'datetime': parse_time,
        'object': lambda x: x,
    }
    field_type = doc.get('type', 'string')
    description = doc.get('description', '')

    if field_type in parsers:
        offset = doc.get('offset', None)
        if offset is None:
            raise ValueError('Missing offset')

        return SimpleField(offset,
                           parsers[field_type],
                           field_type,
                           name=name,
                           description=description)

    if not field_type.endswith('-range'):
        raise ValueError('Unsupported search field type: ' + str(field_type))

    base_type = field_type.split('-')[0]

    # float-range is declared in metadata docs, but values are parsed as numeric
    if base_type == 'float':
        base_type = 'numeric'

    if base_type not in parsers:
        raise ValueError('Unsupported search field type: ' + str(field_type))

    min_offset = doc.get('min_offset', None)
    max_offset = doc.get('max_offset', None)

    if min_offset is None or max_offset is None:
        raise ValueError('Need to specify both min_offset and max_offset')

    return RangeField(min_offset,
                      max_offset,
                      parsers[base_type],
                      field_type,
                      name=name,
                      description=description)


def get_dataset_fields(metadata_definition):
    """Build the ``name -> search field`` mapping for a metadata definition.

    Fields are read from ``dataset.search_fields`` of the definition; the
    result is not tied to any specific db implementation.
    """
    field_docs = toolz.get_in(['dataset', 'search_fields'], metadata_definition, {})
    return {field_name: parse_search_field(field_doc, name=field_name)
            for field_name, field_doc in field_docs.items()}
183 changes: 183 additions & 0 deletions tests/test_metadata_fields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
import yaml
import datetime
import decimal
from textwrap import dedent
import pytest

from datacube.model.fields import get_dataset_fields, parse_search_field
from datacube.model import Range, metadata_from_doc

METADATA_DOC = yaml.safe_load('''---
name: test
description: test all simple search field types
dataset:
id: [id]
sources: [lineage, source_datasets]
label: [label]
creation_dt: [creation_dt]
search_fields:
x_default_type:
description: string type is assumed
offset: [some, path, x_default_type_path]
x_string:
type: string
description: field of type 'string'
offset: [x_string_path]
x_double:
type: double
description: field of type 'double'
offset: [x_double_path]
x_integer:
type: integer
description: field of type 'integer'
offset: [x_integer_path]
x_numeric:
type: numeric
description: field of type 'numeric'
offset: [x_numeric_path]
x_datetime:
type: datetime
description: field of type 'datetime'
offset: [x_datetime_path]
''')

SAMPLE_DOC = yaml.safe_load('''---
x_string_path: some_string
x_double_path: 6.283185307179586
x_integer_path: 4466778
x_numeric_path: '100.33'
x_datetime_path: 1999-04-15 12:33:55.001
some:
path:
x_default_type_path: just_a_string
''')

METADATA_DOC_RANGES = yaml.safe_load('''---
name: test
description: test all simple search field types
dataset:
id: [id]
sources: [lineage, source_datasets]
label: [label]
creation_dt: [creation_dt]
search_fields:
t_range:
type: datetime-range
min_offset: [[t,a], [t,b]]
max_offset: [[t,a], [t,b]]
x_range:
type: double-range
min_offset: [[x,a], [x,b], [x,c], [x,d]]
max_offset: [[x,a], [x,b], [x,c], [x,d]]
ab:
type: integer-range
min_offset: [[a]]
max_offset: [[b]]
''')

SAMPLE_DOC_RANGES = yaml.safe_load('''---
t:
a: 1999-04-15
b: 1999-04-16
x:
a: 1
b: 2
c: 3
d: 4
''')


def test_get_dataset_simple_fields():
    """Every simple field extracts a value of the expected Python type."""
    fields = get_dataset_fields(METADATA_DOC)

    # a field declared without an explicit type defaults to 'string'
    assert fields['x_default_type'].type_name == 'string'

    expected_types = dict(
        double=float,
        integer=int,
        string=str,
        datetime=datetime.datetime,
        numeric=decimal.Decimal,
    )

    for field_name, field in fields.items():
        assert field_name == field.name
        assert isinstance(field.description, str)

        value = field.extract(SAMPLE_DOC)
        assert isinstance(value, expected_types.get(field.type_name))

        # extracting from an empty document yields None
        assert field.extract({}) is None


def test_get_dataset_range_fields():
    """Range fields aggregate min/max over offsets and tolerate missing data."""
    fields = get_dataset_fields(METADATA_DOC_RANGES)

    assert fields['x_range'].extract(SAMPLE_DOC_RANGES) == Range(1, 4)

    t_range = fields['t_range'].extract(SAMPLE_DOC_RANGES)
    assert t_range.begin.strftime('%Y-%m-%d') == "1999-04-15"
    assert t_range.end.strftime('%Y-%m-%d') == "1999-04-16"

    # no data at all -> None
    assert fields['ab'].extract({}) is None

    # one side missing -> open-ended Range
    assert fields['ab'].extract(dict(a=3)) == Range(3, None)
    assert fields['ab'].extract(dict(b=4)) == Range(None, 4)


def test_metadata_from_doc():
    """metadata_from_doc yields a MetadataType whose reader resolves fields."""
    metadata = metadata_from_doc(METADATA_DOC)
    assert metadata.definition is METADATA_DOC

    reader = metadata.dataset_reader(SAMPLE_DOC)
    assert reader.x_double == SAMPLE_DOC['x_double_path']
    assert reader.x_integer == SAMPLE_DOC['x_integer_path']
    assert reader.x_string == SAMPLE_DOC['x_string_path']
    assert reader.x_numeric == decimal.Decimal(SAMPLE_DOC['x_numeric_path'])


def test_bad_field_definition():
    """Malformed search field definitions are rejected with ValueError."""
    bad_definitions = [
        # unknown scalar type
        '''
        type: bad_type
        offset: [a]
        ''',
        # unknown range base type
        '''
        type: badtype-range
        offset: [a]
        ''',
        # scalar type without an offset
        '''
        type: double
        description: missing offset
        ''',
        # range type missing min_offset
        '''
        type: double-range
        description: missing min_offset
        max_offset: [[a]]
        ''',
        # range type missing max_offset
        '''
        type: double-range
        description: missing max_offset
        min_offset: [[a]]
        ''',
    ]

    for snippet in bad_definitions:
        with pytest.raises(ValueError):
            parse_search_field(yaml.safe_load(dedent(snippet)))

0 comments on commit bdcccc2

Please sign in to comment.