diff --git a/.github/workflows/config.yml b/.github/workflows/config.yml
index 52231c8bae..6b780449ab 100644
--- a/.github/workflows/config.yml
+++ b/.github/workflows/config.yml
@@ -714,3 +714,22 @@ jobs:
             echo "Pushing to branch: $branch_name"
             git push -u origin $branch_name
           fi
+  data-dictionary:
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Checkout feature branch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.head_ref || github.ref_name }}
+
+      - name: Setup python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Generate data and enumeration dictionary
+        run: |
+          pip install pandas
+
+          python postprocessing/resstockpostproc/data_dict.py
+          python postprocessing/resstockpostproc/enum_dict.py
diff --git a/postprocessing/resstockpostproc/data_dict.py b/postprocessing/resstockpostproc/data_dict.py
new file mode 100644
index 0000000000..cd7b269d6e
--- /dev/null
+++ b/postprocessing/resstockpostproc/data_dict.py
@@ -0,0 +1,57 @@
+import pathlib
+
+import pandas as pd
+
+
+def _published_fields(df_sdr, publish_col, name_col, unit_col, field_location):
+    """
+    Select the rows of df_sdr published for one output table and return them
+    in data dictionary layout: field_location, field_name, data_type, units,
+    field_description.
+    """
+    published = df_sdr[(df_sdr[publish_col] == 'yes') & (df_sdr[name_col].notnull())]
+    df_fields = published[[name_col, 'Data Type', unit_col, 'Notes']].rename(columns={
+        name_col: 'field_name',
+        'Data Type': 'data_type',
+        unit_col: 'units',
+        'Notes': 'field_description'
+    })
+    df_fields.insert(loc=0, column='field_location', value=field_location)
+    return df_fields
+
+
+def data_dictionary(df_sdr):
+    """
+    generate data dictionary based on sdr_column_definitions.csv.
+
+    df_sdr is the column definitions table. The result has one row per
+    published field, metadata_and_annual rows first, followed by the
+    timeseries_aggregates rows; missing units are written as 'n/a'.
+    """
+    # metadata_and_annual_results column names, units, and description
+    df_meta = _published_fields(
+        df_sdr, 'Publish In Full', 'Published Annual Name',
+        'Published Annual Unit', 'metadata_and_annual')
+
+    # timeseries_aggregates column names, units, and description
+    df_tsagg = _published_fields(
+        df_sdr, 'Timeseries Publish In Full', 'Published Timeseries Name',
+        'Published Timeseries Unit', 'timeseries_aggregates')
+
+    # combine metadata_and_annual_results and timeseries_aggregates
+    df_data_dict = pd.concat([df_meta, df_tsagg], ignore_index=True)
+    df_data_dict['units'] = df_data_dict['units'].fillna('n/a')
+
+    return df_data_dict
+
+
+def main():
+    publication_dir = pathlib.Path(__file__).resolve().parent / "resources" / "publication"
+    df_sdr = pd.read_csv(publication_dir / "sdr_column_definitions.csv")
+    df_data_dict = data_dictionary(df_sdr)
+    # index=False: the row index is not part of the data dictionary
+    df_data_dict.to_csv(publication_dir / "data_dictionary.tsv", sep='\t', index=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/postprocessing/resstockpostproc/enum_dict.py b/postprocessing/resstockpostproc/enum_dict.py
new file mode 100644
index 0000000000..81ac97a509
--- /dev/null
+++ b/postprocessing/resstockpostproc/enum_dict.py
@@ -0,0 +1,107 @@
+import pathlib
+
+import pandas as pd
+
+ENUM_COLUMNS = ['metadata_column', 'enumeration']
+
+
+def enum(df):
+    """
+    enumerations for a dataframe
+
+    Returns one row per (column, unique value) pair in the columns
+    metadata_column and enumeration. A dataframe without columns gives an
+    empty result, because pd.concat raises ValueError on an empty list.
+    """
+    if len(df.columns) == 0:
+        return pd.DataFrame(columns=ENUM_COLUMNS)
+
+    df_enum = pd.concat(
+        [pd.DataFrame({'metadata_column': col, 'enumeration': df[col].unique()})
+         for col in df.columns],
+        ignore_index=True
+    )
+
+    return df_enum
+
+
+def enum_dict(df_data_dict, df_bs_csv, df_meta_up, up_files):
+    """
+    Build the enumeration dictionary for the published metadata columns.
+
+    df_data_dict: data dictionary with field_location and field_name columns
+    df_bs_csv: buildstock.csv dataframe; needs a column formatting to 'in.building'
+    df_meta_up: dict of upgrade file name -> annual results dataframe
+    up_files: keys of df_meta_up to take enumerations from
+
+    Enumerations come from buildstock.csv first; data dictionary columns not
+    found there are looked up in the upgrade files. The inputs are not
+    modified. Returns a dataframe with the columns metadata_column and
+    enumeration, sorted by both.
+    """
+    # format buildstock.csv column names (rename returns a new dataframe)
+    df_bs_csv = df_bs_csv.rename(columns=lambda col: 'in.' + col.lower().replace(' ', '_'))
+    df_bs_csv = df_bs_csv.drop('in.building', axis=1)
+    df_bs_csv = df_bs_csv.rename(columns={
+        'in.ashrae_iecc_climate_zone_2004_-_sub-cz_split': 'in.ashrae_iecc_climate_zone_2004_sub_cz_split',
+        'in.income_recs2015': 'in.income_recs_2015',
+        'in.income_recs2020': 'in.income_recs_2020'
+    })
+
+    # enumerations from buildstock.csv
+    df_enum_bs_csv = enum(df_bs_csv)
+
+    df_data_dict_filter = df_data_dict[df_data_dict['field_location'] == 'metadata_and_annual']
+    data_dict_columns = [x for x in df_data_dict_filter['field_name']
+                         if not x.startswith(("out.", "calc.weighted", "bldg_id"))]
+
+    # sorted so the lookup order does not depend on set iteration order
+    leftover_columns = sorted(set(data_dict_columns) - set(df_bs_csv.columns))
+
+    # enumerations from released data
+    # Do not need the renaming for the released parquet file
+    up_renames = {
+        'in.sqft': 'in.sqft..ft2',
+        'in.air_leakage_to_outside_ach_50': 'in.air_leakage_to_outside_ach50',
+        'upgrade_name': 'in.upgrade_name',
+        'in.electric_panel_service_rating': 'in.electric_panel_service_rating..a',
+        'in.electric_panel_service_rating_bin': 'in.electric_panel_service_rating_bin..a'
+    }
+    enum_frames = [df_enum_bs_csv]
+    for up in up_files:
+        df_up = df_meta_up[up].rename(columns=up_renames)
+        existing_cols = [c for c in leftover_columns if c in df_up.columns]
+        if existing_cols:
+            enum_frames.append(enum(df_up[existing_cols]))
+
+    # concatenate once; fill missing values before dropping duplicates so a
+    # missing value and a literal "None" collapse into one row
+    df_enum_dict = pd.concat(enum_frames)
+    df_enum_dict['enumeration'] = df_enum_dict['enumeration'].fillna("None")
+    df_enum_dict = df_enum_dict.drop_duplicates(keep='first')
+    df_enum_dict = df_enum_dict.sort_values(by=['metadata_column', 'enumeration'])
+
+    df_enum_dict_columns = set(df_enum_dict['metadata_column'])
+    missing_cols = [c for c in data_dict_columns if c not in df_enum_dict_columns]
+    print("Missing columns:", missing_cols)
+
+    return df_enum_dict
+
+
+def main():
+    here = pathlib.Path(__file__).resolve().parent
+    publication_dir = here / "resources" / "publication"
+    base_results = here.parent.parent / "test" / "base_results"
+    df_data_dict = pd.read_csv(publication_dir / "data_dictionary.tsv", sep='\t')
+    df_bs_csv = pd.read_csv(base_results / "baseline" / "annual" / "buildstock.csv")
+    up_path = base_results / "upgrades" / "sdr_annual"
+    # sorted: glob order depends on the filesystem
+    up_files = sorted(f.name for f in up_path.glob('*.csv'))
+    df_meta_up = {up: pd.read_csv(up_path / up) for up in up_files}
+
+    df_enum_dict = enum_dict(df_data_dict, df_bs_csv, df_meta_up, up_files)
+    df_enum_dict.to_csv(publication_dir / "enumeration_dictionary.tsv", sep='\t', index=False)
+
+
+if __name__ == "__main__":
+    main()