Skip to content

Commit

Permalink
DB driver independent metadata fields implementation
Browse files Browse the repository at this point in the history
Allows extraction of search fields from dataset metadata documents without
depending on a specific DB driver.
  • Loading branch information
Kirill888 authored and mergify[bot] committed Aug 15, 2018
1 parent 057413e commit bdcccc2
Show file tree
Hide file tree
Showing 3 changed files with 309 additions and 0 deletions.
10 changes: 10 additions & 0 deletions datacube/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,3 +739,13 @@ def __str__(self):

def __repr__(self):
return self.__str__()


def metadata_from_doc(doc):
    """Construct a ``MetadataType`` that is not tied to any particular db index.

    Useful when dataset metadata documents need to be interpreted according
    to a metadata spec without going through a database driver.

    :param doc: metadata-type definition document (dict)
    :return: MetadataType whose dataset search fields are db-independent
    """
    # Local import avoids a circular dependency between model and fields.
    from .fields import get_dataset_fields
    MetadataType.validate(doc)
    dataset_search_fields = get_dataset_fields(doc)
    return MetadataType(doc, dataset_search_fields)
116 changes: 116 additions & 0 deletions datacube/model/fields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
"""Non-db specific implementation of metadata search fields.
This allows extraction of fields of interest from dataset metadata document.
"""
import toolz
import decimal
from datacube.utils import parse_time
from datacube.model import Range


class SimpleField(object):
    """A single-valued search field backed by one offset into a document.

    ``offset`` is a list of keys (a path) into the dataset metadata document;
    the raw value found there is passed through ``converter``.
    """

    def __init__(self, offset, converter, type_name, name='', description=''):
        self.name = name
        self.description = description
        self.type_name = type_name
        self._offset = offset
        self._converter = converter

    def extract(self, doc):
        """Return the converted value at this field's offset, or None if absent."""
        raw = toolz.get_in(self._offset, doc, default=None)
        return None if raw is None else self._converter(raw)


class RangeField(object):
    """A min/max range search field aggregated over several document offsets.

    ``min_offset`` and ``max_offset`` are each a list of paths; the minimum
    (resp. maximum) of the converted values found at those paths forms the
    returned :class:`Range`.
    """

    def __init__(self, min_offset, max_offset, base_converter, type_name,
                 name='', description=''):
        self.name = name
        self.description = description
        self.type_name = type_name
        self._converter = base_converter
        self._min_offset = min_offset
        self._max_offset = max_offset

    def extract(self, doc):
        """Return Range(min, max) from doc, or None when no value is present.

        Either end of the range may be None when only the other end has data.
        """
        def converted_values(paths):
            raw = (toolz.get_in(p, doc, default=None) for p in paths)
            return [self._converter(v) for v in raw if v is not None]

        lows = converted_values(self._min_offset)
        highs = converted_values(self._max_offset)

        low = min(lows) if lows else None
        high = max(highs) if highs else None

        if low is None and high is None:
            return None

        return Range(low, high)


def parse_search_field(doc, name=''):
    """Build a search field object from its definition document.

    Scalar types produce a :class:`SimpleField`; ``<type>-range`` types
    produce a :class:`RangeField`. When no ``type`` is given, ``string``
    is assumed.

    :param doc: search field definition (dict with type/offset/... keys)
    :param name: name to record on the constructed field
    :raises ValueError: on unknown types or missing offsets
    """
    parsers = {
        'string': str,
        'double': float,
        'integer': int,
        'numeric': decimal.Decimal,
        'datetime': parse_time,
        'object': lambda x: x,
    }
    field_type = doc.get('type', 'string')
    description = doc.get('description', '')

    if field_type in parsers:
        offset = doc.get('offset', None)
        if offset is None:
            raise ValueError('Missing offset')

        return SimpleField(offset,
                           parsers[field_type],
                           field_type,
                           name=name,
                           description=description)

    if not field_type.endswith('-range'):
        raise ValueError('Unsupported search field type: ' + str(field_type))

    base_type = field_type.split('-')[0]

    # float-range is declared in metadata docs, but values are parsed as numeric
    if base_type == 'float':
        base_type = 'numeric'

    if base_type not in parsers:
        raise ValueError('Unsupported search field type: ' + str(field_type))

    min_offset = doc.get('min_offset', None)
    max_offset = doc.get('max_offset', None)

    if min_offset is None or max_offset is None:
        raise ValueError('Need to specify both min_offset and max_offset')

    return RangeField(min_offset,
                      max_offset,
                      parsers[base_type],
                      field_type,
                      name=name,
                      description=description)


def get_dataset_fields(metadata_definition):
    """Build the ``name -> search field`` mapping for a metadata definition.

    Fields are read from ``dataset.search_fields`` of the definition; the
    result is not tied to any specific db implementation.
    """
    field_docs = toolz.get_in(['dataset', 'search_fields'], metadata_definition, {})
    return {field_name: parse_search_field(field_doc, name=field_name)
            for field_name, field_doc in field_docs.items()}
183 changes: 183 additions & 0 deletions tests/test_metadata_fields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
import yaml
import datetime
import decimal
from textwrap import dedent
import pytest

from datacube.model.fields import get_dataset_fields, parse_search_field
from datacube.model import Range, metadata_from_doc

METADATA_DOC = yaml.safe_load('''---
name: test
description: test all simple search field types
dataset:
id: [id]
sources: [lineage, source_datasets]
label: [label]
creation_dt: [creation_dt]
search_fields:
x_default_type:
description: string type is assumed
offset: [some, path, x_default_type_path]
x_string:
type: string
description: field of type 'string'
offset: [x_string_path]
x_double:
type: double
description: field of type 'double'
offset: [x_double_path]
x_integer:
type: integer
description: field of type 'integer'
offset: [x_integer_path]
x_numeric:
type: numeric
description: field of type 'numeric'
offset: [x_numeric_path]
x_datetime:
type: datetime
description: field of type 'datetime'
offset: [x_datetime_path]
''')

SAMPLE_DOC = yaml.safe_load('''---
x_string_path: some_string
x_double_path: 6.283185307179586
x_integer_path: 4466778
x_numeric_path: '100.33'
x_datetime_path: 1999-04-15 12:33:55.001
some:
path:
x_default_type_path: just_a_string
''')

METADATA_DOC_RANGES = yaml.safe_load('''---
name: test
description: test all simple search field types
dataset:
id: [id]
sources: [lineage, source_datasets]
label: [label]
creation_dt: [creation_dt]
search_fields:
t_range:
type: datetime-range
min_offset: [[t,a], [t,b]]
max_offset: [[t,a], [t,b]]
x_range:
type: double-range
min_offset: [[x,a], [x,b], [x,c], [x,d]]
max_offset: [[x,a], [x,b], [x,c], [x,d]]
ab:
type: integer-range
min_offset: [[a]]
max_offset: [[b]]
''')

SAMPLE_DOC_RANGES = yaml.safe_load('''---
t:
a: 1999-04-15
b: 1999-04-16
x:
a: 1
b: 2
c: 3
d: 4
''')


def test_get_dataset_simple_fields():
    """Every simple field extracts a value of the expected Python type."""
    fields = get_dataset_fields(METADATA_DOC)

    # a field declared without an explicit type defaults to 'string'
    assert fields['x_default_type'].type_name == 'string'

    expected_types = dict(
        double=float,
        integer=int,
        string=str,
        datetime=datetime.datetime,
        numeric=decimal.Decimal,
    )

    for field_name, field in fields.items():
        assert field_name == field.name
        assert isinstance(field.description, str)

        value = field.extract(SAMPLE_DOC)
        assert isinstance(value, expected_types.get(field.type_name))

        # extracting from an empty document yields None
        assert field.extract({}) is None


def test_get_dataset_range_fields():
    """Range fields aggregate min/max over offsets and tolerate missing data."""
    fields = get_dataset_fields(METADATA_DOC_RANGES)

    assert fields['x_range'].extract(SAMPLE_DOC_RANGES) == Range(1, 4)

    t_range = fields['t_range'].extract(SAMPLE_DOC_RANGES)
    assert t_range.begin.strftime('%Y-%m-%d') == "1999-04-15"
    assert t_range.end.strftime('%Y-%m-%d') == "1999-04-16"

    # no data at all -> None
    assert fields['ab'].extract({}) is None

    # one side missing -> open-ended Range
    assert fields['ab'].extract(dict(a=3)) == Range(3, None)
    assert fields['ab'].extract(dict(b=4)) == Range(None, 4)


def test_metadata_from_doc():
    """metadata_from_doc yields a MetadataType whose reader resolves fields."""
    metadata = metadata_from_doc(METADATA_DOC)
    assert metadata.definition is METADATA_DOC

    reader = metadata.dataset_reader(SAMPLE_DOC)
    assert reader.x_double == SAMPLE_DOC['x_double_path']
    assert reader.x_integer == SAMPLE_DOC['x_integer_path']
    assert reader.x_string == SAMPLE_DOC['x_string_path']
    assert reader.x_numeric == decimal.Decimal(SAMPLE_DOC['x_numeric_path'])


def test_bad_field_definition():
    """Malformed search field definitions are rejected with ValueError."""
    bad_definitions = [
        # unknown scalar type
        '''
        type: bad_type
        offset: [a]
        ''',
        # unknown range base type
        '''
        type: badtype-range
        offset: [a]
        ''',
        # scalar type without an offset
        '''
        type: double
        description: missing offset
        ''',
        # range type missing min_offset
        '''
        type: double-range
        description: missing min_offset
        max_offset: [[a]]
        ''',
        # range type missing max_offset
        '''
        type: double-range
        description: missing max_offset
        min_offset: [[a]]
        ''',
    ]

    for snippet in bad_definitions:
        with pytest.raises(ValueError):
            parse_search_field(yaml.safe_load(dedent(snippet)))

0 comments on commit bdcccc2

Please sign in to comment.