Add integration tests

bsweger · bsweger · commit df03f70a04ef · 2024-04-22T19:10:50.000-04:00
These small integration tests, in their current form, check that we
get the expected behavior for model-output files whose output_type_id
column contains a variety of value types (numeric, string, and missing).
diff --git a/test/integration/data/2024-07-07-teamabc-output_type_ids_mixed.csv b/test/integration/data/2024-07-07-teamabc-output_type_ids_mixed.csv
@@ -0,0 +1,7 @@
+"origin_date","target","horizon","location","output_type","output_type_id","value"
+2022-10-08,"wk inc flu hosp",1,"02","quantile",0.99,203
+2022-10-08,"wk inc flu hosp",1,"02","mean",,173
+2023-10-21,wk flu hosp rate change,-1,US,pmf,large,0.0018554857403307722
+2023-10-21,wk flu hosp rate change,-1,US,pmf,"large",0.0018554857403307722
+2023-10-21,wk flu hosp rate change,-1,US,pmf,"large",what if this is a big string with no quotes
+
diff --git a/test/integration/data/2024-07-07-teamabc-output_type_ids_numeric.csv b/test/integration/data/2024-07-07-teamabc-output_type_ids_numeric.csv
@@ -0,0 +1,5 @@
+"origin_date","target","horizon","location","output_type","output_type_id","value"
+2022-10-08,"wk inc flu hosp",1,"02","quantile",0.99,203
+2022-10-08,"wk inc flu hosp",1,"02","mean",,173
+2023-10-21,wk flu hosp rate change,-1,US,pmf,111,0.0018554857403307722
+
diff --git a/test/integration/test_model_output_integration.py b/test/integration/test_model_output_integration.py
@@ -0,0 +1,51 @@
+import pathlib
+
+import pyarrow.compute as pc
+import pytest
+from hubverse_transform.model_output import ModelOutputHandler
+from pyarrow import parquet
+
+
+@pytest.fixture()
+def test_file_path() -> pathlib.Path:
+    """
+    Return the path to the 'data' directory (next to this test module) that holds the integration test files.
+    """
+    test_file_path = pathlib.Path(__file__).parent.joinpath('data')
+    return test_file_path
+
+
+def test_missing_model_output_id_numeric(tmpdir, test_file_path):
+    """Test the output_type_id column when the input file has a mix of numeric and missing output_type_ids."""
+    output_dir = str(tmpdir.mkdir('model-output'))
+    file_path = test_file_path.joinpath('2024-07-07-teamabc-output_type_ids_numeric.csv')
+    mo = ModelOutputHandler(file_path, output_dir)
+    output_uri = mo.transform_model_output()
+
+    # read the output parquet file
+    transformed_output = parquet.read_table(output_uri)
+
+    # when the rest of the output_type_ids are numeric, the empty one should be null
+    expr = pc.field('output_type_id').is_null()
+    null_output_type_rows = transformed_output.filter(expr)
+    assert len(null_output_type_rows) == 1
+
+
+def test_missing_model_output_id_mixture(tmpdir, test_file_path):
+    """Test the output_type_id column when the input file has a mix of numeric, string, and missing output_type_ids."""
+    output_dir = str(tmpdir.mkdir('model-output'))
+    file_path = test_file_path.joinpath('2024-07-07-teamabc-output_type_ids_mixed.csv')
+    mo = ModelOutputHandler(file_path, output_dir)
+    output_uri = mo.transform_model_output()
+
+    # read the output parquet file
+    transformed_output = parquet.read_table(output_uri)
+
+    # when there is a mix of string and numeric output_type_ids, the column is cast to string
+    # and, therefore, missing values should be empty strings rather than nulls
+    expr = pc.field('output_type_id').is_null()
+    null_output_type_rows = transformed_output.filter(expr)
+    assert len(null_output_type_rows) == 0
+    expr = pc.field('output_type_id') == ''
+    empty_output_type_rows = transformed_output.filter(expr)
+    assert len(empty_output_type_rows) == 1
diff --git a/test/unit/test_model_output.py b/test/unit/test_model_output.py
@@ -29,6 +29,7 @@ def model_output_data() -> list[dict[str, Any]]:
     Fixture that returns a list of model-output data representing multiple output types.
     This fixture is used as input for other fixtures that generate temporary .csv and .parquest files for testing.
     """
+
     model_output_fieldnames = [
         'reference_date',
         'location',
@@ -41,8 +42,6 @@ def model_output_data() -> list[dict[str, Any]]:
     model_output_list = [
         ['2420-01-01', 'US', '1 light year', 'hospitalizations', 'quantile', 0.5, 62],
         ['2420-01-01', 'US', '1 light year', 'hospitalizations', 'quantile', 0.75, 50.1],
-        ['2420-01-01', '02', 3, 'hospitalizations', 'mean', 'NA', 11],
-        ['2420-01-01', '03', 3, 'hospitalizations', 'mean', 'NA', 'a string value for some reason'],
         ['2420-01-01', '03', 3, 'hospitalizations', 'mean', None, 33],
         ['1999-12-31', 'US', 'last month', 'hospitalizations', 'pmf', 'large_increase', 2.597827508665773e-9],
     ]
@@ -203,25 +202,26 @@ def test_added_column_values(model_output_table):
 def test_read_file_csv(test_csv_file, model_output_table):
     mo = ModelOutputHandler(test_csv_file, 'mock:fake-output-uri')
     pyarrow_table = mo.read_file()
-    assert len(pyarrow_table) == 6
+    assert len(pyarrow_table) == 4
 
     # output_type_id should retain the value from the .csv file, even when the value is empty or "NA"
+    # (the fixture row whose output_type_id is None is expected to read back as an empty string)
     output_type_id_col = pyarrow_table.column('output_type_id')
     assert str(output_type_id_col[0]) == '0.5'
-    assert str(output_type_id_col[2]) == 'NA'
-    assert str(output_type_id_col[4]) == ''
+    assert str(output_type_id_col[2]) == ''
+    assert str(output_type_id_col[3]) == 'large_increase'
 
 
 def test_read_file_parquet(test_parquet_file, model_output_table):
     mo = ModelOutputHandler(test_parquet_file, 'mock:fake-output-uri')
     pyarrow_table = mo.read_file()
-    assert len(pyarrow_table) == 6
+    assert len(pyarrow_table) == 4
 
     # output_type_id should retain the value from the .csv file, even when the value is empty or "NA"
     output_type_id_col = pyarrow_table.column('output_type_id')
     assert str(output_type_id_col[0]) == '0.5'
-    assert str(output_type_id_col[2]) == 'NA'
-    assert str(output_type_id_col[4]) == ''
+    assert str(output_type_id_col[2]) == ''
+    assert str(output_type_id_col[3]) == 'large_increase'
 
 
 def test_write_parquet(tmpdir, model_output_table):
@@ -235,7 +235,7 @@ def test_write_parquet(tmpdir, model_output_table):
     assert actual_output_file_path == expected_output_file_path
 
 
-def test_transform_model_output(test_csv_file, tmpdir):
+def test_transform_model_output_path(test_csv_file, tmpdir):
     output_dir = str(tmpdir.mkdir('model-output'))
     mo = ModelOutputHandler(test_csv_file, output_dir)
     output_uri = mo.transform_model_output()