Skip to content

Commit b21113a

Browse files
committed
Augment scaling factor parsing
1 parent 383dabc commit b21113a

File tree

7 files changed

+252
-82
lines changed

7 files changed

+252
-82
lines changed

gemd/entity/dict_serializable.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def from_dict(cls, d: Mapping[str, Any]) -> DictSerializableType:
8484
return cls(**kwargs)
8585

8686
@classmethod
87-
@functools.lru_cache(maxsize=None)
87+
@functools.lru_cache(maxsize=1024)
8888
def _init_sig(cls) -> List[str]:
8989
"""Internal method for generating the argument names for the class init method."""
9090
expected_arg_names = inspect.getfullargspec(cls.__init__).args

gemd/entity/object/base_object.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def __init__(self,
4949
self.file_links = file_links
5050

5151
@classmethod
52-
@functools.lru_cache(maxsize=None)
52+
@functools.lru_cache(maxsize=1024)
5353
def _attribute_has_setter(cls, name: str) -> bool:
5454
"""
5555
Internal method to identify if an attribute has a setter method.

gemd/units/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# flake8: noqa
2-
from .impl import parse_units, convert_units, change_definitions_file, \
2+
from .impl import parse_units, convert_units, get_base_units, change_definitions_file, \
33
UndefinedUnitError, IncompatibleUnitsError, DefinitionSyntaxError
44

5-
__all__ = [parse_units, convert_units, change_definitions_file,
5+
__all__ = [parse_units, convert_units, get_base_units, change_definitions_file,
66
UndefinedUnitError, IncompatibleUnitsError, DefinitionSyntaxError]

gemd/units/impl.py

Lines changed: 180 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -3,70 +3,126 @@
33

44
from pint import UnitRegistry, Unit, register_unit_format, Quantity
55
from pint.compat import tokenizer
6-
from tokenize import NAME, NUMBER, OP
6+
from tokenize import NAME, NUMBER, OP, Token, ERRORTOKEN
77
# alias the error that is thrown when units are incompatible
88
# this helps to isolate the dependence on pint
99
from pint.errors import DimensionalityError as IncompatibleUnitsError # noqa Import
1010
from pint.errors import UndefinedUnitError, DefinitionSyntaxError # noqa Import
1111

1212
import functools
1313
import pkg_resources
14-
from typing import Union
14+
from typing import Union, List, Tuple
1515

1616
# use the default unit registry for now
1717
DEFAULT_FILE = pkg_resources.resource_filename("gemd.units", "citrine_en.txt")
1818
_ALLOWED_OPERATORS = {".", "+", "-", "*", "/", "//", "^", "**", "(", ")"}
1919

2020

21+
def _scientific_notation_preprocessor(input_string: str) -> str:
22+
"""Preprocessor that converts x * 10 ** y format to xEy."""
23+
def _as_scientific(matchobj: re.Match) -> str:
24+
return f"{matchobj.group(1) or '1'}e{matchobj.group(2)}"
25+
26+
number = r'\b(?:(\d+\.?\d*|\.\d+)\s*\*\s*)?10\s*(?:\*{2}|\^)\s*\+?(-?\d+\b)'
27+
return re.sub(number, _as_scientific, input_string)
28+
29+
2130
def _scaling_preprocessor(input_string: str) -> str:
2231
"""Preprocessor that turns scaling factors into non-dimensional units."""
32+
blocks: List[List[Token]] = [[]]
33+
operator_stack = []
34+
for token in tokenizer(input_string):
35+
exponent_context = any(t.string in {"**", "^"} for t in operator_stack)
36+
if token.type == OP:
37+
if token.string not in _ALLOWED_OPERATORS:
38+
raise UndefinedUnitError(f"Unrecognized operator: {token.string}")
39+
40+
if exponent_context or token.string in {"**", "^", ".", "-", "+"}:
41+
# Exponents & unaries do not change context
42+
blocks[-1].append(token)
43+
elif token.string not in {}:
44+
blocks.append([])
45+
46+
if token.string == '(':
47+
operator_stack.append(token)
48+
elif token.string == ')':
49+
while operator_stack: # don't worry about enforcing balance
50+
if operator_stack.pop().string == '(':
51+
break # We found token's friend
52+
elif token.string in {"**", "^"}:
53+
operator_stack.append(token)
54+
continue # Break flow since next token is in exponent context
55+
elif token.type == NAME:
56+
if exponent_context or len(blocks[-1]) == 0 or blocks[-1][-1].type != NAME:
57+
blocks[-1].append(token)
58+
else: # Break blocks for two units in a row
59+
blocks.append([token])
60+
elif token.type == NUMBER:
61+
blocks[-1].append(token)
62+
elif token.type == ERRORTOKEN: # Keep non-legal Python symbols like °
63+
blocks[-1].append(token)
64+
# Drop other tokens, such as EOF
65+
66+
if len(operator_stack) > 0 and operator_stack[-1].string in {"**", "^"}:
67+
operator_stack.pop() # Exit context for this exponential
68+
69+
todo = []
70+
blocks.pop(0) # Leading term is not allowed to be a scaling factor
71+
for block in blocks:
72+
i_exp = next((i for i, t in enumerate(block) if t.string in {"**", "^"}), len(block))
73+
i_name = next((i for i, t in enumerate(block) if t.type == NAME), None)
74+
numbers = [(i, t.string) for i, t in enumerate(block) if t.type == NUMBER and i < i_exp]
75+
76+
if len(numbers) == 1:
77+
position, value = numbers[0]
78+
if i_exp != len(block):
79+
raise ValueError(
80+
f"Scaling factors ({value}) with exponents are not supported ({input_string})"
81+
)
82+
if i_name is not None and i_name < position:
83+
raise ValueError(f"Scaling factor ({value}) follows unit in {input_string}")
84+
if float(value) != 1.0 and float(value) != 0.0: # Don't create definitions for 0 or 1
85+
block_string = input_string[block[0].start[1]:block[-1].end[1]]
86+
if i_name is None:
87+
unit_string = None
88+
else:
89+
unit_string = input_string[block[position + 1].start[1]:block[i_name].end[1]]
90+
todo.append((block_string, value, unit_string))
91+
elif len(numbers) > 1:
92+
raise ValueError(
93+
f"Replicate scaling factor ({[n[1] for n in numbers]}) in {input_string}"
94+
)
95+
2396
global _REGISTRY
24-
tokens = list(tokenizer(input_string))
25-
scales = []
26-
27-
unrecognized = [t for t in tokens if t.type == OP and t.string not in _ALLOWED_OPERATORS]
28-
if len(unrecognized) > 0:
29-
raise UndefinedUnitError(f"Unrecognized operator(s): {unrecognized}")
30-
31-
# Ignore leading numbers & operators, since Pint handles those itself
32-
start = next((i for i, token in enumerate(tokens) if token.type == NAME), len(tokens))
33-
34-
for i, token in enumerate(tokens[start:], start=start):
35-
if token.type != NUMBER:
36-
continue
37-
38-
# Note we can't run off the front because we started at a NAME
39-
first = i
40-
while tokens[first - 1].string in {'+', '-'}:
41-
first -= 1 # Include unary operations
42-
43-
if tokens[first - 1].string in {"^", "**"}:
44-
continue # Don't mangle exponents
45-
46-
# Names couple tightly to their preceding numbers, so is it a denominator?
47-
division = tokens[first - 1].string in {"/", "//"}
48-
tight = i < len(tokens) - 2 and tokens[i + 1].type == NAME
49-
50-
# Get the number
51-
substr = input_string[tokens[first].start[1]:token.end[1]]
52-
value = eval(substr)
53-
if value <= 0:
54-
raise DefinitionSyntaxError(f"Scaling factors must be positive: {substr}")
55-
scales.append([substr, token.string, division and tight])
56-
57-
for substr, value, division in scales:
58-
# There's probably something to be said for stashing these, but this sin
59-
# should be ameliorated by the LRU cache
60-
regex = rf"(?<!=[-+0-9.]){re.escape(substr)}(?!=[0-9.])"
61-
valid = "_" + value.replace(".", "_").replace("+", "").replace("-", "_")
62-
trailing = "/" if division else ""
63-
_REGISTRY.define(f"{valid} = {value} = {value}")
64-
input_string = re.sub(regex, valid + trailing, input_string)
97+
for scaled_term, number_string, unit_string in todo:
98+
regex = rf"(?<![-+0-9.]){re.escape(scaled_term)}(?![0-9.])"
99+
stripped = re.sub(r"--", "", re.sub(r"[+\s]+", "", scaled_term))
100+
101+
if unit_string is not None:
102+
stripped_unit = re.sub(r"--", "", re.sub(r"[+\s]+", "", unit_string))
103+
long_unit = f"{_REGISTRY(stripped_unit).u}"
104+
short_unit = f"{_REGISTRY(stripped_unit).u:~}"
105+
long = stripped.replace(stripped_unit, "_" + long_unit)
106+
short = stripped.replace(stripped_unit, " " + short_unit)
107+
else:
108+
long = stripped
109+
short = stripped
110+
111+
underscored = re.sub(r"[-.]", "_", long)
112+
valid = f"_{underscored}"
113+
if valid not in _REGISTRY:
114+
# Parse subexpression to clean things up for define
115+
value = f"{_REGISTRY.parse_expression(scaled_term)}"
116+
_REGISTRY.define(f"{valid} = {value} = {short}")
117+
input_string = re.sub(regex, valid, input_string)
65118

66119
return input_string
67120

68121

69-
_REGISTRY = UnitRegistry(filename=DEFAULT_FILE, preprocessors=[_scaling_preprocessor])
122+
_REGISTRY = UnitRegistry(filename=DEFAULT_FILE,
123+
preprocessors=[_scientific_notation_preprocessor,
124+
_scaling_preprocessor],
125+
autoconvert_offset_to_baseunit=True)
70126

71127

72128
@register_unit_format("clean")
@@ -75,9 +131,9 @@ def _format_clean(unit, registry, **options):
75131
numerator = []
76132
denominator = []
77133
for u, p in unit.items():
78-
if re.match(r"_[\d_]+$", u):
79-
# Munged scaling factor; drop leading underscore, restore . and -
80-
u = re.sub(r"(?<=\d)_(?=\d)", ".", u[1:]).replace("_", "-")
134+
if re.match(r"_[\d_]+", u):
135+
# Munged scaling factor; grab symbol, which is the prettier
136+
u = registry.get_symbol(u)
81137

82138
if p == 1:
83139
numerator.append(u)
@@ -98,7 +154,7 @@ def _format_clean(unit, registry, **options):
98154

99155

100156
@functools.lru_cache(maxsize=1024)
101-
def parse_units(units: Union[str, Unit, None]) -> Union[str, Unit, None]:
157+
def _parse_units(units: str) -> Unit:
102158
"""
103159
Parse a string or Unit into a standard string representation of the unit.
104160
@@ -112,24 +168,60 @@ def parse_units(units: Union[str, Unit, None]) -> Union[str, Unit, None]:
112168
[Union[str, Unit, None]]
113169
The representation; note that the same type that was passed is returned
114170
171+
"""
172+
# TODO: parse_units has a bug resolved in 0.19, but 3.7 only supports up to 0.18
173+
parsed: Quantity = _REGISTRY(units)
174+
if isinstance(parsed, Quantity):
175+
magnitude = parsed.magnitude
176+
result = parsed.units
177+
else:
178+
magnitude = parsed # It was non-dimensional
179+
result = _REGISTRY("").u
180+
if magnitude == 0.0:
181+
raise ValueError(f"Unit expression had a zero scaling factor. {units}")
182+
if magnitude != 1:
183+
raise ValueError(f"Unit expression cannot have a leading scaling factor. {units}")
184+
return result
185+
186+
187+
def parse_units(units: Union[str, Unit, None],
188+
*,
189+
return_unit: bool = False
190+
) -> Union[str, Unit, None]:
191+
"""
192+
Parse a string or Unit into a standard string representation of the unit.
193+
194+
Parameters
195+
----------
196+
units: Union[str, Unit, None]
197+
The string or Unit representation of the object we wish to display
198+
return_unit: boolean
199+
Whether to return a Unit object, vs. whatever was initially passed
200+
201+
Returns
202+
-------
203+
[Union[str, Unit, None]]
204+
The representation; note that the same type that was passed is returned
205+
115206
"""
116207
if units is None:
117-
return None
118-
elif units == '':
119-
return 'dimensionless'
208+
if return_unit:
209+
return _REGISTRY("").u
210+
else:
211+
return None
120212
elif isinstance(units, str):
121-
# TODO: parse_units has a bug resolved in 0.19, but 3.7 only supports up to 0.18
122-
parsed = _REGISTRY(units)
123-
if not isinstance(parsed, Quantity) or parsed.magnitude != 1:
124-
raise ValueError(f"Unit expression cannot have a leading scaling factor. {units}")
125-
return f"{parsed.u:clean}"
213+
parsed = _parse_units(units)
214+
if return_unit:
215+
return parsed
216+
else:
217+
return f"{parsed:clean}"
126218
elif isinstance(units, Unit):
127219
return units
128220
else:
129221
raise UndefinedUnitError("Units must be given as a recognized unit string or Units object")
130222

131223

132-
@functools.lru_cache(maxsize=None)
224+
@functools.lru_cache(maxsize=1024 * 1024)
133225
def convert_units(value: float, starting_unit: str, final_unit: str) -> float:
134226
"""
135227
Convert the value from the starting_unit to the final_unit.
@@ -152,7 +244,31 @@ def convert_units(value: float, starting_unit: str, final_unit: str) -> float:
152244
if starting_unit == final_unit:
153245
return value # skip computation
154246
else:
155-
return _REGISTRY.Quantity(value, starting_unit).to(final_unit).magnitude
247+
resolved_final_unit = _REGISTRY(final_unit) # `to` bypasses preparser
248+
return _REGISTRY.Quantity(value, starting_unit).to(resolved_final_unit).magnitude
249+
250+
251+
@functools.lru_cache(maxsize=1024)
252+
def get_base_units(units: Union[str, Unit]) -> Tuple[Unit, float, float]:
253+
"""
254+
Get the base units and conversion factors for the given unit.
255+
256+
Parameters
257+
----------
258+
units: Union[str, Unit, None]
259+
The string or Unit representation of the object we wish to display
260+
261+
Returns
262+
-------
263+
Tuple[Unit, float, float]
264+
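The base unit, the multiplicative ratio reported by the registry for converting to it, and the offset (the magnitude of zero in the given units when expressed in the base unit)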
265+
266+
"""
267+
if isinstance(units, str):
268+
units = _REGISTRY(units)
269+
ratio, base_unit = _REGISTRY.get_base_units(units)
270+
offset = _REGISTRY.Quantity(0, units).to(_REGISTRY.Quantity(0, base_unit)).magnitude
271+
return base_unit, float(ratio), offset
156272

157273

158274
def change_definitions_file(filename: str = None):
@@ -169,4 +285,9 @@ def change_definitions_file(filename: str = None):
169285
convert_units.cache_clear() # Units will change
170286
if filename is None:
171287
filename = DEFAULT_FILE
172-
_REGISTRY = UnitRegistry(filename=filename, preprocessors=[_scaling_preprocessor])
288+
_REGISTRY = UnitRegistry(filename=filename,
289+
preprocessors=[
290+
_scientific_notation_preprocessor,
291+
_scaling_preprocessor
292+
],
293+
autoconvert_offset_to_baseunit=True)

gemd/util/impl.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def cached_isinstance(
6565
_cached_isinstance = cached_isinstance
6666

6767

68-
@functools.lru_cache(maxsize=None)
68+
@functools.lru_cache(maxsize=1024)
6969
def _cached_issubclass(
7070
cls: Type,
7171
class_or_tuple: Union[Type, Tuple[Union[Type, Tuple[Type]]]]) -> bool:
@@ -210,7 +210,7 @@ def _key(obj):
210210
return thing
211211

212212

213-
@functools.lru_cache(maxsize=None)
213+
@functools.lru_cache(maxsize=1024)
214214
def _setter_by_attribute(clazz: type, attribute: str) -> Callable:
215215
"""
216216
Internal method to get the setter method for an attribute.

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
packages.append("")
55

66
setup(name='gemd',
7-
version='1.13.6',
7+
version='1.14.0',
88
url='http://github.com/CitrineInformatics/gemd-python',
99
description="Python binding for Citrine's GEMD data model",
1010
author='Citrine Informatics',

0 commit comments

Comments
 (0)