Implement feedback

kroenlein · kroenlein · commit 0a806a554c1a · 2023-04-18T17:57:50.000-06:00
diff --git a/gemd/units/__init__.py b/gemd/units/__init__.py
@@ -1,6 +1,6 @@
 # flake8: noqa
 from .impl import parse_units, convert_units, change_definitions_file, \
-    UndefinedUnitError, IncompatibleUnitsError
+    UndefinedUnitError, IncompatibleUnitsError, DefinitionSyntaxError
 
 __all__ = [parse_units, convert_units, change_definitions_file,
-           UndefinedUnitError, IncompatibleUnitsError]
+           UndefinedUnitError, IncompatibleUnitsError, DefinitionSyntaxError]
diff --git a/gemd/units/impl.py b/gemd/units/impl.py
@@ -1,59 +1,66 @@
 """Implementation of units."""
 import re
 
-from pint import UnitRegistry, Unit, register_unit_format
+from pint import UnitRegistry, Unit, register_unit_format, Quantity
 from pint.compat import tokenizer
 from tokenize import NAME, NUMBER, OP
 # alias the error that is thrown when units are incompatible
 # this helps to isolate the dependence on pint
 from pint.errors import DimensionalityError as IncompatibleUnitsError  # noqa Import
-from pint.errors import UndefinedUnitError
+from pint.errors import UndefinedUnitError, DefinitionSyntaxError  # noqa Import
 
 import functools
 import pkg_resources
 from typing import Union
 
 # use the default unit registry for now
 DEFAULT_FILE = pkg_resources.resource_filename("gemd.units", "citrine_en.txt")
+_ALLOWED_OPERATORS = {"+", "-", "*", "/", "//", "^", "**", "(", ")"}
 
 
 def _scaling_preprocessor(input_string: str) -> str:
     """Preprocessor that turns scaling factors into non-dimensional units."""
     global _REGISTRY
-    tokens = tokenizer(input_string)
-    exponent = False
-    division = False
-    tight_division = False
+    tokens = list(tokenizer(input_string))
     scales = []
 
-    if next(token for token in tokens).type == NUMBER:
-        return input_string  # The unit can't have a leading number; scaling factors are internal
-
-    for token in tokens:
-        # Note that while this prevents adding a bunch of numbers to the registry,
-        # no test would break if the `exponent` logic were removed
-        if tight_division:
-            # A unit for a scaling factor is in the denominator if the factor is
-            scales[-1][-1] = token.type == NAME
-            tight_division = False
-        if not exponent and token.type == NUMBER:
-            scales.append([token.string, False])
-            tight_division = division
-        if token.type == OP:
-            if token.string not in {"+", "-", "*", "/", "//", "^", "**", "(", ")"}:
-                raise UndefinedUnitError(f"Unrecognized operator: {token.string}")
-            exponent = token.string in {"^", "**"}
-            division = token.string in {"/", "//"}
-        else:
-            exponent, division = False, False
-
-    for scale, division in scales:
+    unrecognized = [t for t in tokens if t.type == OP and t.string not in _ALLOWED_OPERATORS]
+    if len(unrecognized) > 0:
+        raise UndefinedUnitError(f"Unrecognized operator(s): {unrecognized}")
+
+    # Ignore leading numbers & operators, since Pint handles those itself
+    start = next((i for i, token in enumerate(tokens) if token.type == NAME), len(tokens))
+
+    for i, token in enumerate(tokens[start:], start=start):
+        if token.type != NUMBER:
+            continue
+
+        # Note we can't run off the front because we started at a NAME
+        first = i
+        while tokens[first - 1].string in {'+', '-'}:
+            first -= 1  # Include unary operations
+
+        if tokens[first - 1].string in {"^", "**"}:
+            continue  # Don't mangle exponents
+
+        # Names couple tightly to their preceding numbers, so is it a denominator?
+        division = tokens[first - 1].string in {"/", "//"}
+        tight = i < len(tokens) - 2 and tokens[i + 1].type == NAME
+
+        # NOTE(review): eval on caller text; the span is only +/- signs plus one NUMBER token
+        substr = input_string[tokens[first].start[1]:token.end[1]]
+        value = eval(substr)
+        if value <= 0:
+            raise DefinitionSyntaxError(f"Scaling factors must be positive: {substr}")
+        scales.append([substr, value, division and tight])
+
+    for substr, value, division in scales:
         # There's probably something to be said for stashing these, but this sin
         # should be ameliorated by the LRU cache
-        regex = rf"\b{re.escape(scale)}(?!=[0-9.])"
-        valid = "_" + scale.replace(".", "_").replace("+", "").replace("-", "_")
+        regex = rf"(?<![-+.\w]){re.escape(substr)}(?![0-9.])"  # never inside a name/number
+        valid = "_" + substr.replace(".", "_").replace("+", "").replace("-", "_")
         trailing = "/" if division else ""
-        _REGISTRY.define(f"{valid} = {scale} = {scale}")
+        _REGISTRY.define(f"{valid} = {value} = {substr}")
         input_string = re.sub(regex, valid + trailing, input_string)
 
     return input_string
@@ -112,8 +119,8 @@ def parse_units(units: Union[str, Unit, None]) -> Union[str, Unit, None]:
         return 'dimensionless'
     elif isinstance(units, str):
         parsed = _REGISTRY(units)
-        if isinstance(parsed, int) or parsed.magnitude != 1:
-            raise ValueError("Unit expression cannot have a scaling factor.")
+        if not isinstance(parsed, Quantity) or parsed.magnitude != 1:
+            raise ValueError(f"Units cannot start with (or just be) numbers: {units}")
         return f"{parsed.u:clean}"
     elif isinstance(units, Unit):
         return units
diff --git a/gemd/units/tests/test_parser.py b/gemd/units/tests/test_parser.py
@@ -4,14 +4,18 @@
 import pkg_resources
 from contextlib import contextmanager
 from pint import UnitRegistry
-from gemd.units import parse_units, convert_units, change_definitions_file, UndefinedUnitError
+from gemd.units import parse_units, convert_units, change_definitions_file, \
+    UndefinedUnitError, DefinitionSyntaxError
 
 
 def test_parse_expected():
     """Test that we can parse the units that we expect to be able to."""
     # use the default unit registry for now
     reg = UnitRegistry(filename=pkg_resources.resource_filename("gemd.units", "citrine_en.txt"))
 
+    # Pint's parse_units actually gets this wrong
+    assert parse_units("m^-1 * newton / meter") == parse_units("N / m^2")
+
     expected = [
         "degC", "degF", "K",
         "g", "kg", "mg", "ton",
@@ -24,6 +28,7 @@ def test_parse_expected():
         "Seconds",  # Added support for some title-case units
         "delta_Celsius / hour",  # Added to make sure pint version is right (>0.10)
         "g / 2.5 cm",  # Scaling factors are acceptable
+        "g / -+-25e-1 m"  # Weird but fine
     ]
     for unit in expected:
         parse_units(unit)
@@ -43,14 +48,28 @@ def test_parse_unexpected():
         "cp",  # Removed because of risk of collision with cP
         "chain",  # Survey units eliminated
         "SECONDS",  # Not just case insensitivity
-        "lb : in^3",  # Not just case insensitivity
+        "lb : in^3",  # : is not a valid operator
     ]
     for unit in unexpected:
         with pytest.raises(UndefinedUnitError):
             parse_units(unit)
 
-    for unit in ("3 rpm", "16"):
-        with pytest.raises(ValueError, match="scaling"):
+    scaling = [
+        "3 rpm",  # No leading digits
+        "16",  # No values that are just integers
+        "16.2"  # No values that are just floats
+    ]
+    for unit in scaling:
+        with pytest.raises(ValueError):
+            parse_units(unit)
+
+    definition = [
+        "/gram",  # A leading operator makes no sense
+        "g / 0 m",  # Zero scaling factor
+        "g / -2 m"  # Negative scaling factor
+    ]
+    for unit in definition:
+        with pytest.raises(DefinitionSyntaxError):
             parse_units(unit)
 
 
@@ -64,7 +83,7 @@ def test_format():
     # use the default unit registry for now
     reg = UnitRegistry(filename=pkg_resources.resource_filename("gemd.units", "citrine_en.txt"))
 
-    result = parse_units("K^-2 m^-1 C^0 g^1 s^2")
+    result = parse_units("K^-2.0 m^-1e0 C^0 g^1 s^2")
     assert "-" not in result
     assert "[time]" in reg(result).dimensionality
     assert "[current]" not in reg(result).dimensionality