Make sure incoming file URIs are encoded

bsweger · bsweger · commit 3b930f14126d · 2024-05-10T14:37:16.000-04:00
This ensures we can accommodate spaces in file names (and other
special characters).
diff --git a/src/hubverse_transform/model_output.py b/src/hubverse_transform/model_output.py
@@ -1,6 +1,7 @@
 import logging
 import pathlib
 import re
+from urllib.parse import quote
 
 import pyarrow as pa
 import pyarrow.parquet as pq
@@ -17,11 +18,11 @@
 
 class ModelOutputHandler:
     def __init__(self, input_uri: str, output_uri: str):
-        input_filesystem = fs.FileSystem.from_uri(input_uri)
+        input_filesystem = fs.FileSystem.from_uri(self.sanitize_uri(input_uri))
         self.fs_input = input_filesystem[0]
         self.input_file = input_filesystem[1]
 
-        output_filesystem = fs.FileSystem.from_uri(output_uri)
+        output_filesystem = fs.FileSystem.from_uri(self.sanitize_uri(output_uri))
         self.fs_output = output_filesystem[0]
         self.output_path = output_filesystem[1]
 
@@ -66,6 +67,22 @@ def from_s3(cls, bucket_name: str, s3_key: str, origin_prefix: str = "raw") -> "
 
         return cls(s3_input_uri, s3_output_uri)
 
+    def sanitize_uri(self, uri: str, safe=":/") -> str:
+        """Sanitize URIs for use with pyarrow's filesystem."""
+
+        uri_path = pathlib.Path(uri)
+
+        # remove spaces at the end of a filename (e.g., my-model-output .csv) and
+        # also at the beginning and end of the path string
+        clean_path = pathlib.Path(str(uri_path).replace(uri_path.stem, uri_path.stem.strip()))
+        clean_string = str(clean_path).strip()
+
+        # encode the cleaned path (for example, any remaining spaces) so we can
+        # safely use it as a URI
+        clean_uri = quote(str(clean_string), safe=safe)
+
+        return clean_uri
+
     def parse_file(cls, file_name: str) -> dict:
         """Parse model-output file name into individual parts."""
 
@@ -84,13 +101,12 @@ def parse_file(cls, file_name: str) -> dict:
         model_id_split = re.split(rf"{round_id}[-_]*", file_name)
         if not model_id_split or len(model_id_split) <= 1 or not model_id_split[-1]:
             raise ValueError(f"Unable to get model_id from file name {file_name}.")
-        model_id = "".join(model_id_split[-1].split())
+        model_id = model_id_split[-1].strip()
 
         file_parts = {}
         file_parts["round_id"] = round_id
         file_parts["model_id"] = model_id
 
-        # TODO: why so many logs?
         logger.info(f"Parsed model-output filename: {file_parts}")
         return file_parts
 
diff --git a/test/integration/test_model_output_integration.py b/test/integration/test_model_output_integration.py
@@ -18,7 +18,7 @@ def test_file_path() -> pathlib.Path:
 def test_missing_model_output_id_numeric(tmpdir, test_file_path):
     """Test behavior of model_output_id columns when there are a mix of numeric and missing output_type_ids."""
     output_dir = str(tmpdir.mkdir("model-output"))
-    file_path = test_file_path.joinpath("2024-07-07-teamabc-output_type_ids_numeric.csv")
+    file_path = str(test_file_path.joinpath("2024-07-07-teamabc-output_type_ids_numeric.csv"))
     mo = ModelOutputHandler(file_path, output_dir)
     output_uri = mo.transform_model_output()
 
@@ -34,7 +34,7 @@ def test_missing_model_output_id_numeric(tmpdir, test_file_path):
 def test_missing_model_output_id_mixture(tmpdir, test_file_path):
     """Test behavior of model_output_id columns when there are a mix of numeric, string, and missing output_type_ids."""
     output_dir = str(tmpdir.mkdir("model-output"))
-    file_path = test_file_path.joinpath("2024-07-07-teamabc-output_type_ids_mixed.csv")
+    file_path = str(test_file_path.joinpath("2024-07-07-teamabc-output_type_ids_mixed.csv"))
     mo = ModelOutputHandler(file_path, output_dir)
     output_uri = mo.transform_model_output()
 
diff --git a/test/unit/test_model_output.py b/test/unit/test_model_output.py
@@ -9,6 +9,8 @@
 from pyarrow import csv as pyarrow_csv
 from pyarrow import fs
 
+# note: the mocker fixture used throughout is provided by pytest-mock
+
 
 @pytest.fixture()
 def model_output_table() -> pa.Table:
@@ -118,6 +120,55 @@ def test_parse_file(file_uri, expected_round_id, expected_model_id):
     assert mo.model_id == expected_model_id
 
 
+# Each case supplies an input URI and output URI, followed by the values the
+# resulting handler is expected to expose for input_file, output_path,
+# file_name, and model_id.
+@pytest.mark.parametrize(
+    "input_uri, output_uri, expected_input_file, expected_output_path, expected_file_name, expected_model_id",
+    [
+        # spaces in a prefix and in the model name
+        (
+            "mock:bucket123/raw/prefix1/prefix 2/2420-01-01-team-model name with spaces.csv",
+            "mock:bucket123/prefix1/prefix 2",
+            "bucket123/raw/prefix1/prefix 2/2420-01-01-team-model name with spaces.csv",
+            "bucket123/prefix1/prefix 2",
+            "2420-01-01-team-model name with spaces",
+            "team-model name with spaces",
+        ),
+        # dots in the bucket and model name, tilde in the output prefix
+        (
+            "mock:bucket1.2.3/raw/prefix1/prefix 2/2420-01-01-team-model.name.csv",
+            "mock:bucket123/prefix1/~prefix 2",
+            "bucket1.2.3/raw/prefix1/prefix 2/2420-01-01-team-model.name.csv",
+            "bucket123/prefix1/~prefix 2",
+            "2420-01-01-team-model.name",
+            "team-model.name",
+        ),
+        # non-ASCII character, plus a space before the extension that is
+        # expected to be removed from the input file path
+        (
+            "mock:raw/prefix 1/prefix2/2420-01-01-spáces at end .csv",
+            "mock:prefix 1/prefix2",
+            "raw/prefix 1/prefix2/2420-01-01-spáces at end.csv",
+            "prefix 1/prefix2",
+            "2420-01-01-spáces at end",
+            "spáces at end",
+        ),
+        # space in the first path segment, a space (not a hyphen) between the
+        # round id and the model name, and an emoji in the output prefix
+        (
+            "mock:a space/prefix 1/prefix2/2420-01-01 look ma no hyphens.csv",
+            "mock:prefix 1/prefix 🐍",
+            "a space/prefix 1/prefix2/2420-01-01 look ma no hyphens.csv",
+            "prefix 1/prefix 🐍",
+            "2420-01-01 look ma no hyphens",
+            "look ma no hyphens",
+        ),
+    ],
+)
+def test_new_instance_special_characters(
+    input_uri, output_uri, expected_input_file, expected_output_path, expected_file_name, expected_model_id
+):
+    # ensure spaces and other characters in directory, filename, s3 key, etc. are handled correctly
+
+    # the expected values are unencoded: the handler should expose readable
+    # paths and names, not their percent-encoded forms
+    mo = ModelOutputHandler(input_uri, output_uri)
+    assert mo.input_file == expected_input_file
+    assert mo.output_path == expected_output_path
+    assert mo.file_name == expected_file_name
+    assert mo.model_id == expected_model_id
+
+
 @pytest.mark.parametrize(
     "s3_key, expected_input_uri, expected_output_uri",
     [