Augment scaling factor parsing

kroenlein · kroenlein · commit b21113a69c13 · 2023-05-22T14:04:59.000-06:00
diff --git a/gemd/entity/dict_serializable.py b/gemd/entity/dict_serializable.py
@@ -84,7 +84,7 @@ def from_dict(cls, d: Mapping[str, Any]) -> DictSerializableType:
         return cls(**kwargs)
 
     @classmethod
-    @functools.lru_cache(maxsize=None)
+    @functools.lru_cache(maxsize=1024)
     def _init_sig(cls) -> List[str]:
         """Internal method for generating the argument names for the class init method."""
         expected_arg_names = inspect.getfullargspec(cls.__init__).args
diff --git a/gemd/entity/object/base_object.py b/gemd/entity/object/base_object.py
@@ -49,7 +49,7 @@ def __init__(self,
         self.file_links = file_links
 
     @classmethod
-    @functools.lru_cache(maxsize=None)
+    @functools.lru_cache(maxsize=1024)
     def _attribute_has_setter(cls, name: str) -> bool:
         """
         Internal method to identify if an attribute has a setter method.
diff --git a/gemd/units/__init__.py b/gemd/units/__init__.py
@@ -1,6 +1,6 @@
 # flake8: noqa
-from .impl import parse_units, convert_units, change_definitions_file, \
+from .impl import parse_units, convert_units, get_base_units, change_definitions_file, \
     UndefinedUnitError, IncompatibleUnitsError, DefinitionSyntaxError
 
-__all__ = [parse_units, convert_units, change_definitions_file,
+__all__ = [parse_units, convert_units, get_base_units, change_definitions_file,
            UndefinedUnitError, IncompatibleUnitsError, DefinitionSyntaxError]
diff --git a/gemd/units/impl.py b/gemd/units/impl.py
@@ -3,70 +3,126 @@
 
 from pint import UnitRegistry, Unit, register_unit_format, Quantity
 from pint.compat import tokenizer
-from tokenize import NAME, NUMBER, OP
+from tokenize import NAME, NUMBER, OP, Token, ERRORTOKEN
 # alias the error that is thrown when units are incompatible
 # this helps to isolate the dependence on pint
 from pint.errors import DimensionalityError as IncompatibleUnitsError  # noqa Import
 from pint.errors import UndefinedUnitError, DefinitionSyntaxError  # noqa Import
 
 import functools
 import pkg_resources
-from typing import Union
+from typing import Union, List, Tuple
 
 # use the default unit registry for now
 DEFAULT_FILE = pkg_resources.resource_filename("gemd.units", "citrine_en.txt")
 _ALLOWED_OPERATORS = {".", "+", "-", "*", "/", "//", "^", "**", "(", ")"}
 
 
+def _scientific_notation_preprocessor(input_string: str) -> str:
+    """Preprocessor that converts x * 10 ** y format to xEy."""
+    def _as_scientific(matchobj: re.Match) -> str:
+        return f"{matchobj.group(1) or '1'}e{matchobj.group(2)}"
+
+    number = r'\b(?:(\d+\.?\d*|\.\d+)\s*\*\s*)?10\s*(?:\*{2}|\^)\s*\+?(-?\d+\b)'
+    return re.sub(number, _as_scientific, input_string)
+
+
 def _scaling_preprocessor(input_string: str) -> str:
     """Preprocessor that turns scaling factors into non-dimensional units."""
+    blocks: List[List[Token]] = [[]]
+    operator_stack = []
+    for token in tokenizer(input_string):
+        exponent_context = any(t.string in {"**", "^"} for t in operator_stack)
+        if token.type == OP:
+            if token.string not in _ALLOWED_OPERATORS:
+                raise UndefinedUnitError(f"Unrecognized operator: {token.string}")
+
+            if exponent_context or token.string in {"**", "^", ".", "-", "+"}:
+                # Exponents & unaries do not change context
+                blocks[-1].append(token)
+            elif token.string not in {}:
+                blocks.append([])
+
+            if token.string == '(':
+                operator_stack.append(token)
+            elif token.string == ')':
+                while operator_stack:  # don't worry about enforcing balance
+                    if operator_stack.pop().string == '(':
+                        break  # We found token's friend
+            elif token.string in {"**", "^"}:
+                operator_stack.append(token)
+                continue  # Break flow since next token is in exponent context
+        elif token.type == NAME:
+            if exponent_context or len(blocks[-1]) == 0 or blocks[-1][-1].type != NAME:
+                blocks[-1].append(token)
+            else:  # Break blocks for two units in a row
+                blocks.append([token])
+        elif token.type == NUMBER:
+            blocks[-1].append(token)
+        elif token.type == ERRORTOKEN:  # Keep non-legal Python symbols like °
+            blocks[-1].append(token)
+        # Drop other tokens, such as EOF
+
+        if len(operator_stack) > 0 and operator_stack[-1].string in {"**", "^"}:
+            operator_stack.pop()  # Exit context for this exponential
+
+    todo = []
+    blocks.pop(0)  # Leading term is not allowed to be a scaling factor
+    for block in blocks:
+        i_exp = next((i for i, t in enumerate(block) if t.string in {"**", "^"}), len(block))
+        i_name = next((i for i, t in enumerate(block) if t.type == NAME), None)
+        numbers = [(i, t.string) for i, t in enumerate(block) if t.type == NUMBER and i < i_exp]
+
+        if len(numbers) == 1:
+            position, value = numbers[0]
+            if i_exp != len(block):
+                raise ValueError(
+                    f"Scaling factors ({value}) with exponents are not supported ({input_string})"
+                )
+            if i_name is not None and i_name < position:
+                raise ValueError(f"Scaling factor ({value}) follows unit in {input_string}")
+            if float(value) != 1.0 and float(value) != 0.0:  # Don't create definitions for 0 or 1
+                block_string = input_string[block[0].start[1]:block[-1].end[1]]
+                if i_name is None:
+                    unit_string = None
+                else:
+                    unit_string = input_string[block[position + 1].start[1]:block[i_name].end[1]]
+                todo.append((block_string, value, unit_string))
+        elif len(numbers) > 1:
+            raise ValueError(
+                f"Replicate scaling factor ({[n[1] for n in numbers]}) in {input_string}"
+            )
+
     global _REGISTRY
-    tokens = list(tokenizer(input_string))
-    scales = []
-
-    unrecognized = [t for t in tokens if t.type == OP and t.string not in _ALLOWED_OPERATORS]
-    if len(unrecognized) > 0:
-        raise UndefinedUnitError(f"Unrecognized operator(s): {unrecognized}")
-
-    # Ignore leading numbers & operators, since Pint handles those itself
-    start = next((i for i, token in enumerate(tokens) if token.type == NAME), len(tokens))
-
-    for i, token in enumerate(tokens[start:], start=start):
-        if token.type != NUMBER:
-            continue
-
-        # Note we can't run off the front because we started at a NAME
-        first = i
-        while tokens[first - 1].string in {'+', '-'}:
-            first -= 1  # Include unary operations
-
-        if tokens[first - 1].string in {"^", "**"}:
-            continue  # Don't mangle exponents
-
-        # Names couple tightly to their preceding numbers, so is it a denominator?
-        division = tokens[first - 1].string in {"/", "//"}
-        tight = i < len(tokens) - 2 and tokens[i + 1].type == NAME
-
-        # Get the number
-        substr = input_string[tokens[first].start[1]:token.end[1]]
-        value = eval(substr)
-        if value <= 0:
-            raise DefinitionSyntaxError(f"Scaling factors must be positive: {substr}")
-        scales.append([substr, token.string, division and tight])
-
-    for substr, value, division in scales:
-        # There's probably something to be said for stashing these, but this sin
-        # should be ameliorated by the LRU cache
-        regex = rf"(?<!=[-+0-9.]){re.escape(substr)}(?!=[0-9.])"
-        valid = "_" + value.replace(".", "_").replace("+", "").replace("-", "_")
-        trailing = "/" if division else ""
-        _REGISTRY.define(f"{valid} = {value} = {value}")
-        input_string = re.sub(regex, valid + trailing, input_string)
+    for scaled_term, number_string, unit_string in todo:
+        regex = rf"(?<![-+0-9.]){re.escape(scaled_term)}(?![0-9.])"
+        stripped = re.sub(r"--", "", re.sub(r"[+\s]+", "", scaled_term))
+
+        if unit_string is not None:
+            stripped_unit = re.sub(r"--", "", re.sub(r"[+\s]+", "", unit_string))
+            long_unit = f"{_REGISTRY(stripped_unit).u}"
+            short_unit = f"{_REGISTRY(stripped_unit).u:~}"
+            long = stripped.replace(stripped_unit, "_" + long_unit)
+            short = stripped.replace(stripped_unit, " " + short_unit)
+        else:
+            long = stripped
+            short = stripped
+
+        underscored = re.sub(r"[-.]", "_", long)
+        valid = f"_{underscored}"
+        if valid not in _REGISTRY:
+            # Parse subexpression to clean things up for define
+            value = f"{_REGISTRY.parse_expression(scaled_term)}"
+            _REGISTRY.define(f"{valid} = {value} = {short}")
+        input_string = re.sub(regex, valid, input_string)
 
     return input_string
 
 
-_REGISTRY = UnitRegistry(filename=DEFAULT_FILE, preprocessors=[_scaling_preprocessor])
+_REGISTRY = UnitRegistry(filename=DEFAULT_FILE,
+                         preprocessors=[_scientific_notation_preprocessor,
+                                        _scaling_preprocessor],
+                         autoconvert_offset_to_baseunit=True)
 
 
 @register_unit_format("clean")
@@ -75,9 +131,9 @@ def _format_clean(unit, registry, **options):
     numerator = []
     denominator = []
     for u, p in unit.items():
-        if re.match(r"_[\d_]+$", u):
-            # Munged scaling factor; drop leading underscore, restore . and -
-            u = re.sub(r"(?<=\d)_(?=\d)", ".", u[1:]).replace("_", "-")
+        if re.match(r"_[\d_]+", u):
+            # Munged scaling factor; grab symbol, which is the prettier
+            u = registry.get_symbol(u)
 
         if p == 1:
             numerator.append(u)
@@ -98,7 +154,7 @@ def _format_clean(unit, registry, **options):
 
 
 @functools.lru_cache(maxsize=1024)
-def parse_units(units: Union[str, Unit, None]) -> Union[str, Unit, None]:
+def _parse_units(units: str) -> Unit:
     """
     Parse a string or Unit into a standard string representation of the unit.
 
@@ -112,24 +168,60 @@ def parse_units(units: Union[str, Unit, None]) -> Union[str, Unit, None]:
     [Union[str, Unit, None]]
         The representation; note that the same type that was passed is returned
 
+    """
+    # TODO: parse_units has a bug resolved in 0.19, but 3.7 only supports up to 0.18
+    parsed: Quantity = _REGISTRY(units)
+    if isinstance(parsed, Quantity):
+        magnitude = parsed.magnitude
+        result = parsed.units
+    else:
+        magnitude = parsed  # It was non-dimensional
+        result = _REGISTRY("").u
+    if magnitude == 0.0:
+        raise ValueError(f"Unit expression had a zero scaling factor. {units}")
+    if magnitude != 1:
+        raise ValueError(f"Unit expression cannot have a leading scaling factor. {units}")
+    return result
+
+
+def parse_units(units: Union[str, Unit, None],
+                *,
+                return_unit: bool = False
+                ) -> Union[str, Unit, None]:
+    """
+    Parse a string or Unit into a standard string representation of the unit.
+
+    Parameters
+    ----------
+    units: Union[str, Unit, None]
+        The string or Unit representation of the object we wish to display
+    return_unit: boolean
+        Whether to return a Unit object, vs. whatever was initially passed
+
+    Returns
+    -------
+    [Union[str, Unit, None]]
+        The representation; note that the same type that was passed is returned
+
     """
     if units is None:
-        return None
-    elif units == '':
-        return 'dimensionless'
+        if return_unit:
+            return _REGISTRY("").u
+        else:
+            return None
     elif isinstance(units, str):
-        # TODO: parse_units has a bug resolved in 0.19, but 3.7 only supports up to 0.18
-        parsed = _REGISTRY(units)
-        if not isinstance(parsed, Quantity) or parsed.magnitude != 1:
-            raise ValueError(f"Unit expression cannot have a leading scaling factor. {units}")
-        return f"{parsed.u:clean}"
+        parsed = _parse_units(units)
+        if return_unit:
+            return parsed
+        else:
+            return f"{parsed:clean}"
     elif isinstance(units, Unit):
         return units
     else:
         raise UndefinedUnitError("Units must be given as a recognized unit string or Units object")
 
 
-@functools.lru_cache(maxsize=None)
+@functools.lru_cache(maxsize=1024 * 1024)
 def convert_units(value: float, starting_unit: str, final_unit: str) -> float:
     """
     Convert the value from the starting_unit to the final_unit.
@@ -152,7 +244,31 @@ def convert_units(value: float, starting_unit: str, final_unit: str) -> float:
     if starting_unit == final_unit:
         return value  # skip computation
     else:
-        return _REGISTRY.Quantity(value, starting_unit).to(final_unit).magnitude
+        resolved_final_unit = _REGISTRY(final_unit)  # `to` bypasses preparser
+        return _REGISTRY.Quantity(value, starting_unit).to(resolved_final_unit).magnitude
+
+
+@functools.lru_cache(maxsize=1024)
+def get_base_units(units: Union[str, Unit]) -> Tuple[Unit, float, float]:
+    """
+    Get the base units and conversion factors for the given unit.
+
+    Parameters
+    ----------
+    units: Union[str, Unit]
+        The string or Unit representation of the unit to reduce to base units
+
+    Returns
+    -------
+    Tuple[Unit, float, float]
+        The base unit, its multiplicative conversion factor, and its zero offset
+
+    """
+    if isinstance(units, str):
+        units = _REGISTRY(units)
+    ratio, base_unit = _REGISTRY.get_base_units(units)
+    offset = _REGISTRY.Quantity(0, units).to(_REGISTRY.Quantity(0, base_unit)).magnitude
+    return base_unit, float(ratio), offset
 
 
 def change_definitions_file(filename: str = None):
@@ -169,4 +285,9 @@ def change_definitions_file(filename: str = None):
     convert_units.cache_clear()  # Units will change
     if filename is None:
         filename = DEFAULT_FILE
-    _REGISTRY = UnitRegistry(filename=filename, preprocessors=[_scaling_preprocessor])
+    _REGISTRY = UnitRegistry(filename=filename,
+                             preprocessors=[
+                                 _scientific_notation_preprocessor,
+                                 _scaling_preprocessor
+                             ],
+                             autoconvert_offset_to_baseunit=True)
diff --git a/gemd/util/impl.py b/gemd/util/impl.py
@@ -65,7 +65,7 @@ def cached_isinstance(
 _cached_isinstance = cached_isinstance
 
 
-@functools.lru_cache(maxsize=None)
+@functools.lru_cache(maxsize=1024)
 def _cached_issubclass(
         cls: Type,
         class_or_tuple: Union[Type, Tuple[Union[Type, Tuple[Type]]]]) -> bool:
@@ -210,7 +210,7 @@ def _key(obj):
     return thing
 
 
-@functools.lru_cache(maxsize=None)
+@functools.lru_cache(maxsize=1024)
 def _setter_by_attribute(clazz: type, attribute: str) -> Callable:
     """
     Internal method to get the setter method for an attribute.
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
 packages.append("")
 
 setup(name='gemd',
-      version='1.13.6',
+      version='1.14.0',
       url='http://github.com/CitrineInformatics/gemd-python',
       description="Python binding for Citrine's GEMD data model",
       author='Citrine Informatics',
diff --git a/tests/units/test_parser.py b/tests/units/test_parser.py