DB driver independent metadata fields implementation
Allows extraction of search fields from dataset metadata without dependencies on a db driver.
1 parent 057413e · commit bdcccc2
Showing 3 changed files with 309 additions and 0 deletions.
@@ -0,0 +1,116 @@
"""Non-db specific implementation of metadata search fields. | ||
This allows extraction of fields of interest from dataset metadata document. | ||
""" | ||
import toolz | ||
import decimal | ||
from datacube.utils import parse_time | ||
from datacube.model import Range | ||
|
||
|
||
class SimpleField(object): | ||
def __init__(self, | ||
offset, | ||
converter, | ||
type_name, | ||
name='', | ||
description=''): | ||
self._offset = offset | ||
self._converter = converter | ||
self.type_name = type_name | ||
self.description = description | ||
self.name = name | ||
|
||
def extract(self, doc): | ||
v = toolz.get_in(self._offset, doc, default=None) | ||
if v is None: | ||
return None | ||
return self._converter(v) | ||
|
||
|
||
class RangeField(object): | ||
def __init__(self, | ||
min_offset, | ||
max_offset, | ||
base_converter, | ||
type_name, | ||
name='', | ||
description=''): | ||
self.type_name = type_name | ||
self.description = description | ||
self.name = name | ||
self._converter = base_converter | ||
self._min_offset = min_offset | ||
self._max_offset = max_offset | ||
|
||
def extract(self, doc): | ||
def extract_raw(paths): | ||
vv = [toolz.get_in(p, doc, default=None) for p in paths] | ||
return [self._converter(v) for v in vv if v is not None] | ||
|
||
v_min = extract_raw(self._min_offset) | ||
v_max = extract_raw(self._max_offset) | ||
|
||
v_min = None if len(v_min) == 0 else min(v_min) | ||
v_max = None if len(v_max) == 0 else max(v_max) | ||
|
||
if v_min is None and v_max is None: | ||
return None | ||
|
||
return Range(v_min, v_max) | ||
|
||
|
||
def parse_search_field(doc, name=''): | ||
parsers = { | ||
'string': str, | ||
'double': float, | ||
'integer': int, | ||
'numeric': decimal.Decimal, | ||
'datetime': parse_time, | ||
'object': lambda x: x, | ||
} | ||
_type = doc.get('type', 'string') | ||
|
||
if _type in parsers: | ||
offset = doc.get('offset', None) | ||
if offset is None: | ||
raise ValueError('Missing offset') | ||
|
||
return SimpleField(offset, | ||
parsers[_type], | ||
_type, | ||
name=name, | ||
description=doc.get('description', '')) | ||
|
||
if not _type.endswith('-range'): | ||
raise ValueError('Unsupported search field type: ' + str(_type)) | ||
|
||
raw_type = _type.split('-')[0] | ||
|
||
if raw_type == 'float': # float-range is supposed to be supported, but not just float? | ||
raw_type = 'numeric' | ||
|
||
if raw_type not in parsers: | ||
raise ValueError('Unsupported search field type: ' + str(_type)) | ||
|
||
min_offset = doc.get('min_offset', None) | ||
max_offset = doc.get('max_offset', None) | ||
|
||
if min_offset is None or max_offset is None: | ||
raise ValueError('Need to specify both min_offset and max_offset') | ||
|
||
return RangeField(min_offset, | ||
max_offset, | ||
parsers[raw_type], | ||
_type, | ||
name=name, | ||
description=doc.get('description', '')) | ||
|
||
|
||
def get_dataset_fields(metadata_definition): | ||
"""Construct search fields dictionary not tied to any specific db | ||
implementation. | ||
""" | ||
fields = toolz.get_in(['dataset', 'search_fields'], metadata_definition, {}) | ||
return {n: parse_search_field(doc, name=n) for n, doc in fields.items()} |
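
For orientation, a minimal usage sketch of the new helpers. The metadata definition, field names, offsets and dataset document below are invented for illustration; only get_dataset_fields, extract and Range come from the module above.

# Illustration only: documents and offsets are made up; API follows the module above.
from datacube.model.fields import get_dataset_fields

metadata_definition = {
    'dataset': {
        'search_fields': {
            'platform': {'offset': ['platform', 'code']},  # no 'type' -> treated as 'string'
            'time': {
                'type': 'datetime-range',
                'min_offset': [['extent', 'from_dt']],
                'max_offset': [['extent', 'to_dt']],
            },
        }
    }
}

dataset_doc = {
    'platform': {'code': 'LANDSAT_8'},
    'extent': {'from_dt': '2019-01-01', 'to_dt': '2019-01-02'},
}

fields = get_dataset_fields(metadata_definition)
print(fields['platform'].extract(dataset_doc))  # 'LANDSAT_8'
print(fields['time'].extract(dataset_doc))      # Range(begin=datetime(...), end=datetime(...))
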
@@ -0,0 +1,183 @@
import yaml
import datetime
import decimal
from textwrap import dedent
import pytest

from datacube.model.fields import get_dataset_fields, parse_search_field
from datacube.model import Range, metadata_from_doc

METADATA_DOC = yaml.safe_load('''---
name: test
description: test all simple search field types
dataset:
  id: [id]
  sources: [lineage, source_datasets]
  label: [label]
  creation_dt: [creation_dt]
  search_fields:
    x_default_type:
      description: string type is assumed
      offset: [some, path, x_default_type_path]
    x_string:
      type: string
      description: field of type 'string'
      offset: [x_string_path]
    x_double:
      type: double
      description: field of type 'double'
      offset: [x_double_path]
    x_integer:
      type: integer
      description: field of type 'integer'
      offset: [x_integer_path]
    x_numeric:
      type: numeric
      description: field of type 'numeric'
      offset: [x_numeric_path]
    x_datetime:
      type: datetime
      description: field of type 'datetime'
      offset: [x_datetime_path]
''')

SAMPLE_DOC = yaml.safe_load('''---
x_string_path: some_string
x_double_path: 6.283185307179586
x_integer_path: 4466778
x_numeric_path: '100.33'
x_datetime_path: 1999-04-15 12:33:55.001
some:
  path:
    x_default_type_path: just_a_string
''')

METADATA_DOC_RANGES = yaml.safe_load('''---
name: test
description: test all simple search field types
dataset:
  id: [id]
  sources: [lineage, source_datasets]
  label: [label]
  creation_dt: [creation_dt]
  search_fields:
    t_range:
      type: datetime-range
      min_offset: [[t, a], [t, b]]
      max_offset: [[t, a], [t, b]]
    x_range:
      type: double-range
      min_offset: [[x, a], [x, b], [x, c], [x, d]]
      max_offset: [[x, a], [x, b], [x, c], [x, d]]
    ab:
      type: integer-range
      min_offset: [[a]]
      max_offset: [[b]]
''')

SAMPLE_DOC_RANGES = yaml.safe_load('''---
t:
  a: 1999-04-15
  b: 1999-04-16
x:
  a: 1
  b: 2
  c: 3
  d: 4
''')


def test_get_dataset_simple_fields():
    xx = get_dataset_fields(METADATA_DOC)
    assert xx['x_default_type'].type_name == 'string'

    type_map = dict(
        double=float,
        integer=int,
        string=str,
        datetime=datetime.datetime,
        numeric=decimal.Decimal,
    )

    for n, f in xx.items():
        assert n == f.name
        assert isinstance(f.description, str)

        expected_type = type_map.get(f.type_name)
        vv = f.extract(SAMPLE_DOC)
        assert isinstance(vv, expected_type)

        # missing data should return None
        assert f.extract({}) is None


def test_get_dataset_range_fields():
    xx = get_dataset_fields(METADATA_DOC_RANGES)
    v = xx['x_range'].extract(SAMPLE_DOC_RANGES)
    assert v == Range(1, 4)

    v = xx['t_range'].extract(SAMPLE_DOC_RANGES)
    assert v.begin.strftime('%Y-%m-%d') == "1999-04-15"
    assert v.end.strftime('%Y-%m-%d') == "1999-04-16"

    # missing range should return None
    assert xx['ab'].extract({}) is None

    # partially missing Range
    assert xx['ab'].extract(dict(a=3)) == Range(3, None)
    assert xx['ab'].extract(dict(b=4)) == Range(None, 4)


def test_metadata_from_doc():
    mm = metadata_from_doc(METADATA_DOC)
    assert mm.definition is METADATA_DOC

    rdr = mm.dataset_reader(SAMPLE_DOC)
    assert rdr.x_double == SAMPLE_DOC['x_double_path']
    assert rdr.x_integer == SAMPLE_DOC['x_integer_path']
    assert rdr.x_string == SAMPLE_DOC['x_string_path']
    assert rdr.x_numeric == decimal.Decimal(SAMPLE_DOC['x_numeric_path'])


def test_bad_field_definition():
    def doc(s):
        return yaml.safe_load(dedent(s))

    with pytest.raises(ValueError):
        parse_search_field(doc('''
        type: bad_type
        offset: [a]
        '''))

    with pytest.raises(ValueError):
        parse_search_field(doc('''
        type: badtype-range
        offset: [a]
        '''))

    with pytest.raises(ValueError):
        parse_search_field(doc('''
        type: double
        description: missing offset
        '''))

    with pytest.raises(ValueError):
        parse_search_field(doc('''
        type: double-range
        description: missing min_offset
        max_offset: [[a]]
        '''))

    with pytest.raises(ValueError):
        parse_search_field(doc('''
        type: double-range
        description: missing max_offset
        min_offset: [[a]]
        '''))
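
A hedged sketch of calling parse_search_field directly on a single range definition, the same path the range tests above exercise. The 'lat' field name, its offsets and the sample document are invented for illustration; parse_search_field, extract and Range come from the code in this commit.

# Illustration only: the 'lat' field and its offsets are invented.
from datacube.model.fields import parse_search_field

lat = parse_search_field({
    'type': 'double-range',
    'description': 'latitude range',
    'min_offset': [['extent', 'coord', 'll', 'lat'], ['extent', 'coord', 'lr', 'lat']],
    'max_offset': [['extent', 'coord', 'ul', 'lat'], ['extent', 'coord', 'ur', 'lat']],
}, name='lat')

doc = {'extent': {'coord': {'ll': {'lat': -35.0}, 'lr': {'lat': -35.1},
                            'ul': {'lat': -34.0}, 'ur': {'lat': -34.1}}}}

# RangeField takes the minimum over the min_offset values and the maximum over
# the max_offset values, so documents with only some corners present still work.
print(lat.extract(doc))  # Range(begin=-35.1, end=-34.0)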