From b14b2bb0ddea76c7785010254ee4086e8f27913f Mon Sep 17 00:00:00 2001
From: Becky Sweger
Date: Thu, 14 Nov 2024 15:35:11 -0500
Subject: [PATCH] Fix a bug when opening a parquet file on S3

When reading parquet files from S3, hubverse-transform does an initial
read to get the schema (so we can override it if necessary). However,
the read fails because it's reading the wrong thing, and the transform
process tries to open the model-output data on the local filesystem
instead of on S3.

I opened an issue to address the lack of S3 test cases, which resulted
in this bug hitting production:
https://github.com/hubverse-org/hubverse-transform/issues/30
---
 src/hubverse_transform/model_output.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hubverse_transform/model_output.py b/src/hubverse_transform/model_output.py
index 52f32df..a3b10ba 100644
--- a/src/hubverse_transform/model_output.py
+++ b/src/hubverse_transform/model_output.py
@@ -208,12 +208,12 @@ def read_file(self) -> pa.table:
             model_output_table = csv.read_csv(model_output_file, convert_options=options)
         else:
             # temp fix: force location and output_type_id columns to string
-            schema_new = pq.read_schema(self.input_file)
+            model_output_file = self.fs_input.open_input_file(self.input_file)
+            schema_new = pq.read_schema(model_output_file)
             for field_name in ["location", "output_type_id"]:
                 field_idx = schema_new.get_field_index(field_name)
                 if field_idx >= 0:
                     schema_new = schema_new.set(field_idx, pa.field(field_name, pa.string()))
-            model_output_file = self.fs_input.open_input_file(self.input_file)
             model_output_table = pq.read_table(model_output_file, schema=schema_new)

         return model_output_table