From b14b2bb0ddea76c7785010254ee4086e8f27913f Mon Sep 17 00:00:00 2001
From: Becky Sweger
Date: Thu, 14 Nov 2024 15:35:11 -0500
Subject: [PATCH] Fix a bug when opening a parquet file on S3

When reading parquet files from S3, hubverse-transform does an initial
read to get the schema (so we can override it if necessary). However,
the read fails because it's reading the wrong thing, and the transform
process tries to open the model-output data on the local filesystem
instead of on S3.

I opened an issue to address the lack of S3 test cases, which resulted
in this bug hitting production:
https://github.com/hubverse-org/hubverse-transform/issues/30
---
 src/hubverse_transform/model_output.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hubverse_transform/model_output.py b/src/hubverse_transform/model_output.py
index 52f32df..a3b10ba 100644
--- a/src/hubverse_transform/model_output.py
+++ b/src/hubverse_transform/model_output.py
@@ -208,12 +208,12 @@ def read_file(self) -> pa.table:
             model_output_table = csv.read_csv(model_output_file, convert_options=options)
         else:
             # temp fix: force location and output_type_id columns to string
-            schema_new = pq.read_schema(self.input_file)
+            model_output_file = self.fs_input.open_input_file(self.input_file)
+            schema_new = pq.read_schema(model_output_file)
             for field_name in ["location", "output_type_id"]:
                 field_idx = schema_new.get_field_index(field_name)
                 if field_idx >= 0:
                     schema_new = schema_new.set(field_idx, pa.field(field_name, pa.string()))
-            model_output_file = self.fs_input.open_input_file(self.input_file)
             model_output_table = pq.read_table(model_output_file, schema=schema_new)

         return model_output_table