Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .github/workflows/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -714,3 +714,22 @@ jobs:
echo "Pushing to branch: $branch_name"
git push -u origin $branch_name
fi
data-dictionary:
  # Runs the two publication scripts that write data_dictionary.tsv and
  # enumeration_dictionary.tsv.
  runs-on: ubuntu-22.04
  steps:
    - name: Checkout feature branch
      uses: actions/checkout@v4
      with:
        # github.head_ref is only populated for pull-request events;
        # fall back to the ref name for every other trigger.
        ref: ${{ github.head_ref || github.ref_name }}

    - name: Setup python
      uses: actions/setup-python@v5
      with:
        python-version: '3.12'

    # Order matters: enum_dict.py reads the data_dictionary.tsv that
    # data_dict.py writes.
    - name: Generate data and enumeration dictionary
      run: |
        pip install pandas

        python postprocessing/resstockpostproc/data_dict.py
        python postprocessing/resstockpostproc/enum_dict.py
48 changes: 48 additions & 0 deletions postprocessing/resstockpostproc/data_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import pandas as pd
import pathlib
def data_dictionary(df_sdr):
    """
    Build the data dictionary from the sdr_column_definitions table.

    Rows are collected for two field locations. A row is kept for a
    location when its publish flag equals 'yes' and its published name is
    not null:

    * 'metadata_and_annual'   -- 'Publish In Full' / 'Published Annual Name'
    * 'timeseries_aggregates' -- 'Timeseries Publish In Full' /
      'Published Timeseries Name'

    Returns a DataFrame with the columns field_location, field_name,
    data_type, units and field_description (annual rows first, index
    renumbered from 0). Missing units are written as 'n/a'.
    """
    # (field_location, publish flag column, name column, unit column)
    sections = (
        ('metadata_and_annual', 'Publish In Full',
         'Published Annual Name', 'Published Annual Unit'),
        ('timeseries_aggregates', 'Timeseries Publish In Full',
         'Published Timeseries Name', 'Published Timeseries Unit'),
    )

    parts = []
    for location, flag_col, name_col, unit_col in sections:
        published = (df_sdr[flag_col] == 'yes') & df_sdr[name_col].notnull()
        part = df_sdr.loc[published, [name_col, 'Data Type', unit_col, 'Notes']]
        part = part.rename(columns={
            name_col: 'field_name',
            'Data Type': 'data_type',
            unit_col: 'units',
            'Notes': 'field_description',
        })
        # field_location goes first so it leads every output row
        part.insert(loc=0, column='field_location', value=location)
        parts.append(part)

    combined = pd.concat(parts, ignore_index=True)
    combined['units'] = combined['units'].fillna('n/a')
    return combined

def main():
    """
    Read sdr_column_definitions.csv from resources/publication (next to
    this script) and write the resulting data dictionary to
    data_dictionary.tsv in the same directory, tab-separated, without the
    index.
    """
    publication_dir = pathlib.Path(__file__).resolve().parent / "resources" / "publication"
    definitions = pd.read_csv(publication_dir / "sdr_column_definitions.csv")
    dictionary = data_dictionary(definitions)
    dictionary.to_csv(publication_dir / "data_dictionary.tsv", sep='\t', index=None)


if __name__ == "__main__":
    main()
84 changes: 84 additions & 0 deletions postprocessing/resstockpostproc/enum_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import pandas as pd
import pathlib


def enum(df):
    """
    List the distinct values of every column of a dataframe in long form.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are enumerated.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'metadata_column' (the source column name) and
        'enumeration' (one distinct value of that column per row, in order
        of first appearance). Columns appear in the order they have in
        ``df`` and the index is renumbered from 0. A frame without columns
        yields an empty frame that still has both output columns.
    """
    per_column = [
        pd.DataFrame({'metadata_column': col, 'enumeration': df[col].unique()})
        for col in df.columns
    ]

    if not per_column:
        # pd.concat raises ValueError on an empty list; return an empty
        # frame of the same shape instead so callers can keep concatenating.
        return pd.DataFrame(columns=['metadata_column', 'enumeration'])

    return pd.concat(per_column, ignore_index=True)


def enum_dict(df_data_dict, df_bs_csv, df_meta_up, up_files):
    """
    Build the enumeration dictionary: every distinct value of each
    published metadata column.

    Values come from buildstock.csv first. Data-dictionary columns that
    buildstock.csv does not provide are then looked up in the upgrade
    metadata frames.

    Parameters
    ----------
    df_data_dict : pandas.DataFrame
        Data dictionary with at least 'field_location' and 'field_name'.
    df_bs_csv : pandas.DataFrame
        buildstock.csv contents with its original column names; must
        contain a 'Building' column (any letter case). Not modified.
    df_meta_up : dict
        Maps each entry of ``up_files`` to its metadata DataFrame.
        Not modified.
    up_files : iterable
        Keys of ``df_meta_up`` to scan, in order.

    Returns
    -------
    pandas.DataFrame
        Columns 'metadata_column' and 'enumeration', de-duplicated, with
        missing values written as the string "None", sorted by both
        columns.

    Side effects
    ------------
    Prints the data-dictionary columns for which no enumeration was found.
    """
    # format buildstock.csv column names; rename() returns a new frame, so
    # the caller's dataframe keeps its original labels
    df_bs = df_bs_csv.rename(columns=lambda col: 'in.' + col.lower().replace(' ', '_'))
    df_bs = df_bs.drop('in.building', axis=1)
    df_bs = df_bs.rename(columns={
        'in.ashrae_iecc_climate_zone_2004_-_sub-cz_split': 'in.ashrae_iecc_climate_zone_2004_sub_cz_split',
        'in.income_recs2015': 'in.income_recs_2015',
        'in.income_recs2020': 'in.income_recs_2020'
    })

    # enumerations from buildstock.csv
    # Every frame is cast to object before the final concat so numeric
    # enumerations keep their own values instead of being coerced to a
    # common numeric dtype.
    frames = [enum(df_bs).astype(object)]

    in_annual = df_data_dict['field_location'] == 'metadata_and_annual'
    data_dict_columns = [
        name for name in df_data_dict.loc[in_annual, 'field_name']
        if not name.startswith(("out.", "calc.weighted", "bldg_id"))
    ]

    # columns the data dictionary lists but buildstock.csv does not provide;
    # dict.fromkeys de-duplicates while keeping a deterministic order
    bs_columns = set(df_bs.columns)
    leftover_columns = [c for c in dict.fromkeys(data_dict_columns) if c not in bs_columns]

    # Do not need the renaming for the released parquet file
    upgrade_renames = {
        'in.sqft': 'in.sqft..ft2',
        'in.air_leakage_to_outside_ach_50': 'in.air_leakage_to_outside_ach50',
        'upgrade_name': 'in.upgrade_name',
        'in.electric_panel_service_rating': 'in.electric_panel_service_rating..a',
        'in.electric_panel_service_rating_bin': 'in.electric_panel_service_rating_bin..a',
    }

    # enumerations from released data
    for up in up_files:
        # local renamed copy; the caller's dict entry is left as it was
        df_up = df_meta_up[up].rename(columns=upgrade_renames)
        up_columns = set(df_up.columns)
        existing_cols = [c for c in leftover_columns if c in up_columns]
        if not existing_cols:
            # nothing to enumerate in this file (enumerating zero columns
            # would otherwise fail inside pd.concat)
            continue
        frames.append(enum(df_up[existing_cols]).astype(object))

    # a single concat keeps the first occurrence of every
    # (metadata_column, enumeration) pair, buildstock.csv rows first
    df_enum_dict = pd.concat(frames).drop_duplicates(keep='first')
    df_enum_dict['enumeration'] = df_enum_dict['enumeration'].fillna("None")
    df_enum_dict = df_enum_dict.sort_values(by=['metadata_column', 'enumeration'])

    df_enum_dict_columns = set(df_enum_dict['metadata_column'].unique().tolist())
    missing_cols = [c for c in data_dict_columns if c not in df_enum_dict_columns]
    print("Missing columns:", missing_cols)

    return df_enum_dict


def main():
    """
    Load data_dictionary.tsv, the baseline buildstock.csv and every
    upgrade CSV under test/base_results/upgrades/sdr_annual, then write
    the enumeration dictionary to
    resources/publication/enumeration_dictionary.tsv (tab-separated,
    without the index).
    """
    package_dir = pathlib.Path(__file__).resolve().parent
    publication_dir = package_dir / "resources" / "publication"
    results_dir = package_dir.parent.parent / "test" / "base_results"

    df_data_dict = pd.read_csv(publication_dir / "data_dictionary.tsv", sep='\t')
    df_bs_csv = pd.read_csv(results_dir / "baseline" / "annual" / "buildstock.csv")

    # one metadata frame per upgrade CSV, keyed by file name
    upgrade_dir = results_dir / "upgrades" / "sdr_annual"
    up_files = [csv_path.name for csv_path in upgrade_dir.glob('*.csv')]
    df_meta_up = {name: pd.read_csv(upgrade_dir / name) for name in up_files}

    enumerations = enum_dict(df_data_dict, df_bs_csv, df_meta_up, up_files)
    enumerations.to_csv(publication_dir / "enumeration_dictionary.tsv", sep='\t', index=None)


if __name__ == "__main__":
    main()
Loading