Merge pull request #183 from CitrineInformatics/maintain/update-unit-handling

kroenlein · web-flow · commit 54df335ce405 · 2023-04-19T10:00:25.000-06:00
Add additional tests for unexpected fields in units
diff --git a/.travis.yml b/.travis.yml
@@ -5,6 +5,7 @@ python:
 - '3.8'
 - '3.9'
 - '3.10'
+- '3.11'
 env:
 - PINT_VERSION=0.18
 - PINT_VERSION=0.20
diff --git a/gemd/units/__init__.py b/gemd/units/__init__.py
@@ -1,6 +1,6 @@
 # flake8: noqa
 from .impl import parse_units, convert_units, change_definitions_file, \
-    UndefinedUnitError, IncompatibleUnitsError
+    UndefinedUnitError, IncompatibleUnitsError, DefinitionSyntaxError
 
 __all__ = [parse_units, convert_units, change_definitions_file,
-           UndefinedUnitError, IncompatibleUnitsError]
+           UndefinedUnitError, IncompatibleUnitsError, DefinitionSyntaxError]
diff --git a/gemd/units/impl.py b/gemd/units/impl.py
@@ -1,51 +1,66 @@
 """Implementation of units."""
 import re
 
-from pint import UnitRegistry, Unit, register_unit_format
+from pint import UnitRegistry, Unit, register_unit_format, Quantity
 from pint.compat import tokenizer
 from tokenize import NAME, NUMBER, OP
 # alias the error that is thrown when units are incompatible
 # this helps to isolate the dependence on pint
 from pint.errors import DimensionalityError as IncompatibleUnitsError  # noqa Import
-from pint.errors import UndefinedUnitError
+from pint.errors import UndefinedUnitError, DefinitionSyntaxError  # noqa Import
 
 import functools
 import pkg_resources
 from typing import Union
 
 # use the default unit registry for now
 DEFAULT_FILE = pkg_resources.resource_filename("gemd.units", "citrine_en.txt")
+_ALLOWED_OPERATORS = {"+", "-", "*", "/", "//", "^", "**", "(", ")"}
 
 
 def _scaling_preprocessor(input_string: str) -> str:
     """Preprocessor that turns scaling factors into non-dimensional units."""
     global _REGISTRY
-    tokens = tokenizer(input_string)
-    exponent = False
-    division = False
-    tight_division = False
+    tokens = list(tokenizer(input_string))
     scales = []
 
-    for token in tokens:
-        # Note that while this prevents adding a bunch of numbers to the registry,
-        # no test would break if the `exponent` logic were removed
-        if tight_division:
-            # A unit for a scaling factor is in the denominator if the factor is
-            scales[-1][-1] = token.type == NAME
-            tight_division = False
-        if not exponent and token.type == NUMBER:
-            scales.append([token.string, False])
-            tight_division = division
-        exponent = token.type == OP and token.string in {"^", "**"}
-        division = token.type == OP and token.string in {"/", "//"}
-
-    for scale, division in scales:
+    unrecognized = [t for t in tokens if t.type == OP and t.string not in _ALLOWED_OPERATORS]
+    if len(unrecognized) > 0:
+        raise UndefinedUnitError(f"Unrecognized operator(s): {unrecognized}")
+
+    # Ignore leading numbers & operators, since Pint handles those itself
+    start = next((i for i, token in enumerate(tokens) if token.type == NAME), len(tokens))
+
+    for i, token in enumerate(tokens[start:], start=start):
+        if token.type != NUMBER:
+            continue
+
+        # Note we can't run off the front because we started at a NAME
+        first = i
+        while tokens[first - 1].string in {'+', '-'}:
+            first -= 1  # Include unary operations
+
+        if tokens[first - 1].string in {"^", "**"}:
+            continue  # Don't mangle exponents
+
+        # A name binds tightly to its preceding number, so do both sit in a denominator?
+        division = tokens[first - 1].string in {"/", "//"}
+        tight = i < len(tokens) - 2 and tokens[i + 1].type == NAME
+
+        # Get the number
+        substr = input_string[tokens[first].start[1]:token.end[1]]
+        value = eval(substr)
+        if value <= 0:
+            raise DefinitionSyntaxError(f"Scaling factors must be positive: {substr}")
+        scales.append([substr, value, division and tight])
+
+    for substr, value, division in scales:
         # There's probably something to be said for stashing these, but this sin
         # should be ameliorated by the LRU cache
-        regex = rf"\b{re.escape(scale)}(?!=[0-9.])"
-        valid = "_" + scale.replace(".", "_").replace("+", "").replace("-", "_")
+        regex = rf"(?<!=[-+0-9.]){re.escape(substr)}(?!=[0-9.])"
+        valid = "_" + substr.replace(".", "_").replace("+", "").replace("-", "_")
         trailing = "/" if division else ""
-        _REGISTRY.define(f"{valid} = {scale} = {scale}")
+        _REGISTRY.define(f"{valid} = {value} = {substr}")
         input_string = re.sub(regex, valid + trailing, input_string)
 
     return input_string
@@ -103,7 +118,11 @@ def parse_units(units: Union[str, Unit, None]) -> Union[str, Unit, None]:
     elif units == '':
         return 'dimensionless'
     elif isinstance(units, str):
-        return f"{_REGISTRY(units).u:clean}"
+        # TODO: Pint's parse_units bug is fixed in 0.19, but Python 3.7 caps Pint at 0.18
+        parsed = _REGISTRY(units)
+        if not isinstance(parsed, Quantity) or parsed.magnitude != 1:
+            raise ValueError(f"Unit expression cannot have a leading scaling factor. {units}")
+        return f"{parsed.u:clean}"
     elif isinstance(units, Unit):
         return units
     else:
diff --git a/gemd/units/tests/test_parser.py b/gemd/units/tests/test_parser.py
@@ -4,14 +4,18 @@
 import pkg_resources
 from contextlib import contextmanager
 from pint import UnitRegistry
-from gemd.units import parse_units, convert_units, change_definitions_file, UndefinedUnitError
+from gemd.units import parse_units, convert_units, change_definitions_file, \
+    UndefinedUnitError, DefinitionSyntaxError
 
 
 def test_parse_expected():
     """Test that we can parse the units that we expect to be able to."""
     # use the default unit registry for now
     reg = UnitRegistry(filename=pkg_resources.resource_filename("gemd.units", "citrine_en.txt"))
 
+    # Pint's parse_units actually gets this wrong
+    assert parse_units("m^-1 * newton / meter") == parse_units("N / m^2")
+
     expected = [
         "degC", "degF", "K",
         "g", "kg", "mg", "ton",
@@ -24,6 +28,7 @@ def test_parse_expected():
         "Seconds",  # Added support for some title-case units
         "delta_Celsius / hour",  # Added to make sure pint version is right (>0.10)
         "g / 2.5 cm",  # Scaling factors are acceptable
+        "g / -+-25e-1 m"  # Weird but fine
     ]
     for unit in expected:
         parse_units(unit)
@@ -42,12 +47,31 @@ def test_parse_unexpected():
         5,
         "cp",  # Removed because of risk of collision with cP
         "chain",  # Survey units eliminated
-        "SECONDS"  # Not just case insensitivity
+        "SECONDS",  # Not just case insensitivity
+        "lb : in^3",  # : is not a valid operator
     ]
     for unit in unexpected:
         with pytest.raises(UndefinedUnitError):
             parse_units(unit)
 
+    scaling = [
+        "3 rpm",  # No leading digits
+        "16",  # No values that are just integers
+        "16.2"  # No values that are just floats
+    ]
+    for unit in scaling:
+        with pytest.raises(ValueError, match="scaling"):
+            parse_units(unit)
+
+    definition = [
+        "/gram",  # A leading operator makes no sense
+        "g / 0 m",  # Zero scaling factor
+        "g / -2 m"  # Negative scaling factor
+    ]
+    for unit in definition:
+        with pytest.raises(DefinitionSyntaxError):
+            parse_units(unit)
+
 
 def test_parse_none():
     """Test that None parses as None."""
@@ -59,7 +83,7 @@ def test_format():
     # use the default unit registry for now
     reg = UnitRegistry(filename=pkg_resources.resource_filename("gemd.units", "citrine_en.txt"))
 
-    result = parse_units("K^-2 m^-1 C^0 g^1 s^2")
+    result = parse_units("K^-2.0 m^-1e0 C^0 g^1 s^2")
     assert "-" not in result
     assert "[time]" in reg(result).dimensionality
     assert "[current]" not in reg(result).dimensionality
diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 
 setup(name='gemd',
-      version='1.13.0',
+      version='1.13.1',
       url='http://github.com/CitrineInformatics/gemd-python',
       description="Python binding for Citrine's GEMD data model",
       author='Citrine Informatics',
@@ -39,5 +39,6 @@
           'Programming Language :: Python :: 3.8',
           'Programming Language :: Python :: 3.9',
           'Programming Language :: Python :: 3.10',
+          'Programming Language :: Python :: 3.11',
       ],
       )