diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 9498df7..551c98a 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -52,7 +52,7 @@ jobs: with: context: . push: true - platforms: linux/amd64 + platforms: linux/amd64, linux/arm64 tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} cache-from: type=gha diff --git a/DEMOS_Technical_Memo.pdf b/DEMOS_Technical_Memo.pdf index ebd9c45..3f3cb0e 100644 Binary files a/DEMOS_Technical_Memo.pdf and b/DEMOS_Technical_Memo.pdf differ diff --git a/README.md b/README.md index 2a2d3a1..be5b79b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Demographic Microsimulator (DEMOS) -[![Docs](https://github.com/NREL/DEMOS/actions/workflows/docs.yml/badge.svg)](https://nrel.github.io/DEMOS/) +[![Docs](https://github.com/NatLabRockies/DEMOS/actions/workflows/docs.yml/badge.svg)](https://natlabrockies.github.io/DEMOS/) ## Overview The Demographic Microsimulator (DEMOS) is an agent-based simulation framework used to model the evolution of population demographic characteristics and lifecycle events, such as education attainment, marital status, and other key transitions. DEMOS modules are designed to capture the interdependencies between short-term and long-term lifecycle events, which are often influential in downstream transportation and land-use modeling. @@ -16,10 +16,12 @@ A technical memorandum describing DEMOS is available [here](./DEMOS_Technical_Me ## Usage ### Docker Compose (recommended) -The latest docker image for demos is stored in `ghcr.io/NatLabRockies/demos:latest`. The input data and configuration file are fed to the container through volumes ([more info about Docker volumes](https://docs.docker.com/engine/storage/volumes/)). We provide a `docker-compose` workflow that can be used to make the process of mounting volumes easier. +The latest docker image for demos is stored in `ghcr.io/NatLabRockies/demos:latest`. 
The input data and configuration file are fed to the container through volumes ([more info about Docker volumes](https://docs.docker.com/engine/storage/volumes/)). We provide a `docker-compose` workflow that can be used to make the process of mounting volumes easier. Make sure you have [Docker](https://docs.docker.com/desktop/) and [Docker Compose](https://docs.docker.com/compose/install/) installed before you proceed. + +The following instructions will guide you through running DEMOS with example data of two hypothetical counties. This example is provided to help users quickly get started with DEMOS. It includes the required inputs to run DEMOS for two example counties. The data and configuration files required to run this example are located in `./data/` and `./configuration` folders. You can change where DEMOS will look for your data following the instructions [in the Docs](https://NatLabRockies.github.io/DEMOS/). #### Clone this repository -By cloning this repository you download the configuration and data for an example run of DEMOS. +By cloning this repository you download the configuration and data for an example run of DEMOS. You can also use the `Download ZIP` option available through the green `Code` button above and decompress it to achieve the same results as the clone command. Run the following command in the Terminal App (MacOS) or Command Prompt/PowerShell (Windows): ```bash @@ -30,17 +32,13 @@ cd DEMOS # This folder contains (among other files) a data and configuration folder # as well as a docker-compose.yml file -``` -Make sure you have [Docker](https://docs.docker.com/desktop/) and [Docker Compose](https://docs.docker.com/compose/install/) installed. Now you can run docker as follows: - -```bash +# This command runs DEMOS on a docker container docker compose up ``` #### IMPORTANT for MacOS and Windows users > Docker imposes a global limit on how much RAM containers can allocate. 
DEMOS easily surpases those limits, so in order to run DEMOS in Docker, users need to access the Docker Desktop GUI and `Preferences → Resources → Memory → Increase it (at least 16-20gb)`. The amount of memory required to run DEMOS will primarily depend on the size of the input data. -Documentation for custom data requirements, configuration and overall functionality of demos can be found [in the Docs](https://nrel.github.io/DEMOS/). ## Other ways to run DEMOS @@ -66,3 +64,14 @@ If you prefer to create your own Python environment and run the Python code dire conda activate demos-env pip install . ``` + +## Comprehensive Documentation + +Documentation for custom data requirements, configuration and overall functionality of demos can be found [in the Docs](https://NatLabRockies.github.io/DEMOS/). + +## Contact +If you have questions, suggestions, or are interested in collaborating, please feel free to reach out or open an issue. +Bingrong Sun: Bingrong.Sun@nlr.gov +Shivam Sharda: Shivam.Sharda@nlr.gov +Venu Garikapati: Venu.Garikapati@nlr.gov +Yamil Essus: essusyamil@gmail.com diff --git a/configuration/demos_config.toml b/configuration/demos_config.toml index 475603d..dd2529c 100644 --- a/configuration/demos_config.toml +++ b/configuration/demos_config.toml @@ -6,14 +6,16 @@ calibrated_models_dir = "../data/small_example/calibrated_models_coefficients/" inconsistent_persons_table_behavior = "fix" modules = [ "aging", - "employment", + "fatality", "household_reorg", "kids_moving", - "fatality", "birth", "education", + "employment", + "income", + "income_adjustment", "household_rebalancing", - "income_adjustment" + "normalize_table_dtypes", ] output_tables = [ "persons", @@ -33,22 +35,16 @@ initialize_empty_tables = [ [[tables]] file_type = "h5" table_name = "persons" -filepath = "../data/small_example/small_example_tables.h5" # custom_mpo_06197001_model_data_small.h5, minihh.h5, custom_mpo_06197001_model_data_small_stratHH.h5, 
custom_mpo_06197001_model_data_hh0.05.h5, countyNM.h5 +filepath = "../data/small_example/small_example_tables.h5" h5_key = "persons" [[tables]] file_type = "h5" table_name = "households" -filepath = "../data/small_example/small_example_tables.h5" # custom_mpo_06197001_model_data_small.h5, minihh.h5, custom_mpo_06197001_model_data_hh0.05.h5, custom_mpo_06197001_model_data_county1, countyNM.h5 +filepath = "../data/small_example/small_example_tables.h5" h5_key = "households" ## Other static data tables -[[tables]] -file_type = "csv" -table_name = "relational_adjustment_mapping" -filepath = "../data/small_example/relmap_06197001.csv" -index_col = "index" - [[tables]] file_type = "csv" table_name = "income_rates" diff --git a/configuration/demos_config_ref.toml b/configuration/demos_config_ref.toml index 818f6a0..a8c4b78 100644 --- a/configuration/demos_config_ref.toml +++ b/configuration/demos_config_ref.toml @@ -6,14 +6,16 @@ calibrated_models_dir = "../data/sf_bay_example/calibrated_models_coefficients/" inconsistent_persons_table_behavior = "fix" modules = [ "aging", - "employment", + "fatality", "household_reorg", "kids_moving", - "fatality", "birth", "education", + "employment", + "income", + "income_adjustment", "household_rebalancing", - "income_adjustment" + "normalize_table_dtypes", ] output_tables = [ "persons", @@ -41,12 +43,6 @@ filepath = "../data/sf_bay_example/custom_mpo_06197001_model_data.h5" h5_key = "households" ## Other static data tables -[[tables]] -file_type = "csv" -table_name = "relational_adjustment_mapping" -filepath = "../data/sf_bay_example/relmap_06197001.csv" -index_col = "index" - [[tables]] file_type = "csv" table_name = "income_rates" diff --git a/data/small_example/calibrated_models_coefficients/birth_model.yaml b/data/small_example/calibrated_models_coefficients/birth_model.yaml index a2a6311..3110158 100644 --- a/data/small_example/calibrated_models_coefficients/birth_model.yaml +++ 
b/data/small_example/calibrated_models_coefficients/birth_model.yaml @@ -8,7 +8,7 @@ saved_object: - 0.847 - 1.754 - 1.417 - model_expression: birth ~ hh_birth_agebin1 + hh_birth_agebin2 + fsize_bin23 + fsize_bingt3 + 1 + model_expression: birth ~ hh_birth_age_lt27 + hh_birth_age_27_35 + hh_fsize_bin23 + hh_fsize_bingt3 + 1 name: birth out_column: birth out_filters: null @@ -20,4 +20,4 @@ saved_object: summary_table: null tags: [] template: BinaryLogitStep - template_version: 0.2.dev9 \ No newline at end of file + template_version: 0.2.dev9 diff --git a/data/small_example/calibrated_models_coefficients/cohabitation.yaml b/data/small_example/calibrated_models_coefficients/cohabitation.yaml index 41cdc02..f4e7641 100644 --- a/data/small_example/calibrated_models_coefficients/cohabitation.yaml +++ b/data/small_example/calibrated_models_coefficients/cohabitation.yaml @@ -26,18 +26,18 @@ saved_object: - 0.752726 spec_names: - intercept - - avg_agebin2 - - avg_agebin3 - - avg_agebin4 - - top_edu_bin3 - - hd_race_wht - - income_bin1 - - income_bin2 - - income_bin4 - - income_bin5 + - hh_age_avg_bin2 + - hh_age_avg_bin3 + - hh_age_avg_bin4 + - hh_edu_top_bin3 + - hh_head_race_white + - hh_income_bin1 + - hh_income_bin2 + - hh_income_bin4 + - hh_income_bin5 name: cohabitation tables: households out_tables: households filters: null template: MultinomialLogitStep - template_version: 0.2.dev9 \ No newline at end of file + template_version: 0.2.dev9 diff --git a/data/small_example/calibrated_models_coefficients/demos_in_labor_force.yaml b/data/small_example/calibrated_models_coefficients/demos_in_labor_force.yaml index c4821af..2181412 100644 --- a/data/small_example/calibrated_models_coefficients/demos_in_labor_force.yaml +++ b/data/small_example/calibrated_models_coefficients/demos_in_labor_force.yaml @@ -13,7 +13,7 @@ saved_object: - 0.013 - -0.205 - -0.175 - model_expression: stay_out ~ agebin6_labor + agebin3_labor + agebin4_labor + agebin5_labor + gender2 + edubin2 + 
edubin3 + race_blk + race_asn + race_other + 1 + model_expression: stay_out ~ age_emp_20_40 + age_emp_41_50 + age_emp_51_70 + age_emp_70plus + sex_female + edu_hs_ged + edu_college_plus + race_black + race_asian_pi + race_other + 1 name: enter_labor_force out_column: null out_filters: diff --git a/data/small_example/calibrated_models_coefficients/demos_out_labor_force.yaml b/data/small_example/calibrated_models_coefficients/demos_out_labor_force.yaml index caf9b12..52b15e3 100644 --- a/data/small_example/calibrated_models_coefficients/demos_out_labor_force.yaml +++ b/data/small_example/calibrated_models_coefficients/demos_out_labor_force.yaml @@ -11,7 +11,7 @@ saved_object: - 0.3530349 - -0.5569076 - -0.9928796 - model_expression: leaving_workforce ~ agebin6_labor + agebin3_labor + agebin4_labor + agebin5_labor + gender2 + edubin2 + edubin3 + 1 + model_expression: leaving_workforce ~ age_emp_20_40 + age_emp_41_50 + age_emp_51_70 + age_emp_70plus + sex_female + edu_hs_ged + edu_college_plus + 1 name: exit_labor_force out_column: null out_filters: diff --git a/data/small_example/calibrated_models_coefficients/divorce_model.yaml b/data/small_example/calibrated_models_coefficients/divorce_model.yaml index 1474feb..0d64dc4 100644 --- a/data/small_example/calibrated_models_coefficients/divorce_model.yaml +++ b/data/small_example/calibrated_models_coefficients/divorce_model.yaml @@ -13,8 +13,8 @@ saved_object: - 0.7375 - 0.97831 - 0.30529 - model_expression: divorced ~ min_agebin2 + min_agebin3 + min_agebin4 + income_bin1 - + income_bin2 + income_bin4 + income_bin5 + top_edu_bin2 + top_edu_bin3 + fam_work2 + model_expression: divorced ~ hh_age_min_bin2 + hh_age_min_bin3 + hh_age_min_bin4 + hh_income_bin1 + + hh_income_bin2 + hh_income_bin4 + hh_income_bin5 + hh_edu_top_bin2 + hh_edu_top_bin3 + hh_fam_work2 + 1 name: divorce out_column: null diff --git a/data/small_example/calibrated_models_coefficients/edu_model.yaml 
b/data/small_example/calibrated_models_coefficients/edu_model.yaml index dfa57cf..5b857d6 100644 --- a/data/small_example/calibrated_models_coefficients/edu_model.yaml +++ b/data/small_example/calibrated_models_coefficients/edu_model.yaml @@ -15,8 +15,8 @@ saved_object: - -1.769 - 0.637 - -0.297 - model_expression: stop ~ agebin2 + agebin3 + agebin4 + employbin2 + employbin3 + - marital1 + marital34 + marital2 + edubin2 + edubin3 + 1 + model_expression: stop ~ age_23_35 + age_36_60 + age_60plus + emp_idle_under60 + emp_idle_over60 + + mar_married + mar_div_or_sep + mar_widowed + edu_hs_ged + edu_college_plus + 1 name: education out_column: stop out_filters: @@ -30,4 +30,4 @@ saved_object: tables: persons tags: [] template: BinaryLogitStep - template_version: 0.2.dev9 \ No newline at end of file + template_version: 0.2.dev9 diff --git a/data/small_example/calibrated_models_coefficients/income_model.yaml b/data/small_example/calibrated_models_coefficients/income_model.yaml new file mode 100644 index 0000000..e1ba19a --- /dev/null +++ b/data/small_example/calibrated_models_coefficients/income_model.yaml @@ -0,0 +1,41 @@ +modelmanager_version: 0.2.dev9 + +saved_object: + filters: null + fitted_parameters: + - 9.2866172 + - 0.0038117 + - 0.1245698 + - -0.0847078 + - 0.4372168 + - 0.7482159 + - 0.8479667 + - 0.6201558 + - 0.5492689 + - 0.3194774 + - 0.6786951 + - 0.4040705 + - 0.3016074 + - 0.027432 + - 0.026867 + - 0.1687073 + - -0.2822124 + - -0.1492966 + - -0.1573647 + - -0.1559694 + - -0.258431 + model_expression: income ~ hh_age_head + true_hh_size + not_met_area + income_model_edu_bin1 + income_model_edu_bin2 + + income_model_edu_bin3 + job_industry_bin1 + job_industry_bin2 + job_industry_bin3 + + job_occupation_bin1 + job_occupation_bin2 + job_occupation_bin3 + + state_quart_2 + state_quart_3 + state_quart_4 + + head_race_blk + head_race3 + head_race_asian + head_race_hawaiian + head_race5 + 1 + name: income + out_column: null + out_filters: null + out_tables: 
null + out_transform: null + tables: households + summary_table: null + tags: [] + template: RegressionStep + template_version: 0.2.dev9 \ No newline at end of file diff --git a/data/small_example/calibrated_models_coefficients/income_model_w_nworkers.yaml b/data/small_example/calibrated_models_coefficients/income_model_w_nworkers.yaml new file mode 100644 index 0000000..0522209 --- /dev/null +++ b/data/small_example/calibrated_models_coefficients/income_model_w_nworkers.yaml @@ -0,0 +1,42 @@ +modelmanager_version: 0.2.dev9 + +saved_object: + filters: null + fitted_parameters: + - 9.24495115 + - 0.00417772 + - 0.05948244 + - 0.32053385 + - -0.08125172 + - 0.41666961 + - 0.7286613 + - 0.84074453 + - 0.47524105 + - 0.39585793 + - 0.19305833 + - 0.49488234 + - 0.23930184 + - 0.18253791 + - 0.02907993 + - 0.01390249 + - 0.1580996 + - -0.26774506 + - -0.07188592 + - -0.15350612 + - -0.17286581 + - -0.27394566 + model_expression: income ~ hh_head_age + true_hh_size + true_hh_workers + not_met_area + hh_head_edu_bin1 + hh_head_edu_bin2 + + hh_head_edu_bin3 + job_industry_bin1 + job_industry_bin2 + job_industry_bin3 + + job_occupation_bin1 + job_occupation_bin2 + job_occupation_bin3 + + state_quart_2 + state_quart_3 + state_quart_4 + + hh_head_race_black + hh_head_race_native_am + hh_head_race_asian + hh_head_race_hawaiian + hh_head_race_acs_other + 1 + name: income_nworkers + out_column: null + out_filters: null + out_tables: null + out_transform: null + tables: households + summary_table: null + tags: [] + template: RegressionStep + template_version: 0.2.dev9 diff --git a/data/small_example/calibrated_models_coefficients/kids_move_model.yaml b/data/small_example/calibrated_models_coefficients/kids_move_model.yaml index 39260ff..8f2fbda 100644 --- a/data/small_example/calibrated_models_coefficients/kids_move_model.yaml +++ b/data/small_example/calibrated_models_coefficients/kids_move_model.yaml @@ -30,12 +30,12 @@ saved_object: - -0.888188 - -1.517554 - -1.374318 - 
model_expression: kid_moves ~ agebin5_mo + agebin2_mo + agebin3_mo + agebin4_mo - + gender2 + employbin2 + employbin3 + edubin2 + edubin3 + race_wht + race_asn - + employ2_agebin5_mo + employ2_agebin2_mo + employ2_agebin3_mo + employ2_agebin4_mo - + employ3_agebin5_mo + employ3_agebin2_mo + employ3_agebin3_mo + employ3_agebin4_mo - + edu2_agebin5_mo + edu2_agebin2_mo + edu2_agebin3_mo + edu2_agebin4_mo + edu3_agebin5_mo - + edu3_agebin2_mo + edu3_agebin3_mo + edu3_agebin4_mo + 1 + model_expression: kid_moves ~ age_km_30plus + age_km_19_20 + age_km_21_25 + age_km_26_30 + + sex_female + emp_idle_under60 + emp_idle_over60 + edu_hs_ged + edu_college_plus + race_white + race_asian_pi + + emp_idle_under60_age_km_30plus + emp_idle_under60_age_km_19_20 + emp_idle_under60_age_km_21_25 + emp_idle_under60_age_km_26_30 + + emp_idle_over60_age_km_30plus + emp_idle_over60_age_km_19_20 + emp_idle_over60_age_km_21_25 + emp_idle_over60_age_km_26_30 + + edu_hs_ged_age_km_30plus + edu_hs_ged_age_km_19_20 + edu_hs_ged_age_km_21_25 + edu_hs_ged_age_km_26_30 + + edu_college_plus_age_km_30plus + edu_college_plus_age_km_19_20 + edu_college_plus_age_km_21_25 + edu_college_plus_age_km_26_30 + 1 name: kids_move out_column: kid_moves out_filters: relate in [2, 3, 4, 7, 9, 14] diff --git a/data/small_example/calibrated_models_coefficients/marriage.yaml b/data/small_example/calibrated_models_coefficients/marriage.yaml index 7b7819d..d712c2d 100644 --- a/data/small_example/calibrated_models_coefficients/marriage.yaml +++ b/data/small_example/calibrated_models_coefficients/marriage.yaml @@ -26,18 +26,18 @@ saved_object: - -0.603113 spec_names: - intercept - - agebin2 - - agebin3 - - agebin4 - - gender2 - - employbin2 - - employbin3 - - edubin2 - - edubin3 - - race_blk + - age_23_35 + - age_36_60 + - age_60plus + - sex_female + - emp_idle_under60 + - emp_idle_over60 + - edu_hs_ged + - edu_college_plus + - race_black name: marriage tables: persons out_tables: persons filters: null template: 
MultinomialLogitStep - template_version: 0.2.dev9 \ No newline at end of file + template_version: 0.2.dev9 diff --git a/data/small_example/calibrated_models_coefficients/mortality_model.yaml b/data/small_example/calibrated_models_coefficients/mortality_model.yaml index e2897a5..338288e 100644 --- a/data/small_example/calibrated_models_coefficients/mortality_model.yaml +++ b/data/small_example/calibrated_models_coefficients/mortality_model.yaml @@ -15,8 +15,8 @@ saved_object: - -0.642 - 0.779 - 0.546 - model_expression: dead ~ agebin1_new + agebin2_new + agebin3_new + agebin4_new + agebin5_new - + gender2 + employbin2 + employbin3 + edubin2 + edubin3 + marital25 + marital34 + 1 + model_expression: dead ~ age_mort_21_40 + age_mort_41_50 + age_mort_51_70 + age_mort_71_90 + age_mort_90plus + + sex_female + emp_idle_under60 + emp_idle_over60 + edu_hs_ged + edu_college_plus + mar_widowed_or_never + mar_div_or_sep + 1 name: mortality out_column: null out_filters: null @@ -28,4 +28,4 @@ saved_object: summary_table: null tags: [] template: BinaryLogitStep - template_version: 0.2.dev9 \ No newline at end of file + template_version: 0.2.dev9 diff --git a/data/small_example/small_example_tables.h5 b/data/small_example/small_example_tables.h5 index 9d7f5a5..f05985a 100644 Binary files a/data/small_example/small_example_tables.h5 and b/data/small_example/small_example_tables.h5 differ diff --git a/demos/config.py b/demos/config.py index 3a2b1bc..4272da6 100644 --- a/demos/config.py +++ b/demos/config.py @@ -145,6 +145,7 @@ def model_post_init(self, __context) -> None: "education_model", "household_rebalancing", "update_income", + "normalize_table_dtypes", ] @model_validator(mode="after") diff --git a/demos/models/__init__.py b/demos/models/__init__.py index eaf0eae..8487b40 100644 --- a/demos/models/__init__.py +++ b/demos/models/__init__.py @@ -9,5 +9,7 @@ from .education import * from .rebalancing import * from .income_adjustment import * +from .income import * from .export 
import * from .main import * +from .constants import * diff --git a/demos/models/aging.py b/demos/models/aging.py index ecf59a6..7b52091 100644 --- a/demos/models/aging.py +++ b/demos/models/aging.py @@ -100,3 +100,24 @@ def age_group(data="persons.age"): return pd.cut( data, bins=age_intervals, labels=age_labels, include_lowest=True ).astype(str) + + +@orca.column("households") +def hh_head_age(persons, households): + """ + Get the age of the head of each household. + + Identifies the head of household (where `relate` == 0) and returns their age for each household. + + Parameters + ---------- + persons : orca.Table + The persons table containing `age`, `relate`, and `household_id` columns. + + Returns + ------- + pandas.Series + Age of the head of household, indexed by household_id. + """ + heads = persons.local[persons["relate"] == 0] + return heads.set_index("household_id").loc[households.index, "age"] diff --git a/demos/models/birth.py b/demos/models/birth.py index e5493e0..cc8d991 100644 --- a/demos/models/birth.py +++ b/demos/models/birth.py @@ -102,7 +102,7 @@ def birth(persons, households, get_new_person_id): .reset_index() .merge( households.to_frame( - ["hh_race_of_head", "hh_race_id_of_head", "household_id"] + ["hh_head_race_str", "hh_head_race_id", "household_id"] ).reset_index(), on="household_id", ) @@ -110,7 +110,7 @@ def birth(persons, households, get_new_person_id): one_race_hh_filter = (hh_races.loc[babies.household_id]["num_races"] == 1).values babies["race_id"] = 9 babies.loc[one_race_hh_filter, "race_id"] = hh_races.loc[ - babies.loc[one_race_hh_filter, "household_id"], "hh_race_id_of_head" + babies.loc[one_race_hh_filter, "household_id"], "hh_head_race_id" ].values babies["race"] = babies["race_id"].map({1: "white", 2: "black"}) babies["race"].fillna("other", inplace=True) @@ -140,3 +140,68 @@ def run_and_calibrate_birth_model(persons, households): birth_model, birth_model_data ) return birth_model.predict(birth_model_data) + + +# 
----------------------------------------------------------------------------------------- +# BIRTH MODEL COLUMNS (moved from variables.py) +# ----------------------------------------------------------------------------------------- + + +@orca.column("households") +def hh_n_persons(households, persons): + counts = persons.local.groupby("household_id").size() + return households.local.join(counts.rename("hh_n_persons"))["hh_n_persons"] + + +@orca.column("households") +def hh_fsize_bin23(households): + df = households.to_frame(columns=["hh_n_persons"]) + return df["hh_n_persons"].isin([2, 3]) * 1 + + +@orca.column("households") +def hh_fsize_bingt3(households): + df = households.to_frame(columns=["hh_n_persons"]) + return df.gt(3) * 1 + + +@orca.column("households") +def hh_birth_age_lt27(persons, households): + df = persons.to_frame(columns=["household_id", "relate", "sex", "age"]) + df.loc[:, "is_head"] = np.where(df["relate"] == 0, 1, 0) + df.loc[:, "is_female"] = np.where(df["sex"] == 2, 1, 0) + df.loc[:, "is_head_or_spouse"] = np.where(df["relate"].isin([0, 1, 13]), 1, 0) + df.loc[:, "age_head"] = df["age"] * df["is_head"] + df.loc[:, "age_female"] = df["age"] * df["is_female"] * df["is_head_or_spouse"] + df.loc[:, "is_spouse"] = np.where(df["relate"].isin([1, 13]), 1, 0) + df.loc[:, "head_spouse"] = df["is_head"] + df["is_spouse"] + df = df.groupby("household_id").agg( + age_head=("age_head", "sum"), + age_female=("age_female", "sum"), + head_spouse=("head_spouse", "sum"), + ) + df.loc[:, "age_final"] = np.where( + df["head_spouse"] >= 2, df["age_female"], df["age_head"] + ) + return (df["age_final"] <= 27).astype(int) + + +@orca.column("households") +def hh_birth_age_27_35(persons, households): + df = persons.to_frame(columns=["household_id", "relate", "sex", "age"]) + df.loc[:, "is_head"] = np.where(df["relate"] == 0, 1, 0) + df.loc[:, "is_female"] = np.where(df["sex"] == 2, 1, 0) + df.loc[:, "is_head_or_spouse"] = np.where(df["relate"].isin([0, 1, 13]), 1, 0) 
+ df.loc[:, "age_head"] = df["age"] * df["is_head"] + df.loc[:, "age_female"] = df["age"] * df["is_female"] * df["is_head_or_spouse"] + df.loc[:, "is_spouse"] = np.where(df["relate"].isin([1, 13]), 1, 0) + df.loc[:, "head_spouse"] = df["is_head"] + df["is_spouse"] + df = df.groupby("household_id").agg( + age_head=("age_head", "sum"), + age_female=("age_female", "sum"), + head_spouse=("head_spouse", "sum"), + ) + df.loc[:, "age_final"] = np.where( + df["head_spouse"] >= 2, df["age_female"], df["age_head"] + ) + return (df["age_final"].between(27, 35, inclusive="right")).astype(int) diff --git a/demos/models/constants.py b/demos/models/constants.py new file mode 100644 index 0000000..fb99dca --- /dev/null +++ b/demos/models/constants.py @@ -0,0 +1,106 @@ +import orca +import pandas as pd + +# Relational adjustment mapping (equivalent to relmap_06197001.csv) +# Rows = old relate of person changing household head; Columns = old relate of new head +# Values = new relate code to assign +RELATIONAL_ADJUSTMENT_MAPPING = pd.DataFrame( + data=[ + [5, 5, 5, 10, 7, 6, 7, 1, 10, 11, 12, 5, 15, 16, 17], # index=2 + [5, 5, 5, 10, 7, 6, 7, 1, 10, 11, 12, 5, 15, 16, 17], # index=3 + [5, 5, 5, 10, 7, 6, 7, 1, 10, 11, 12, 5, 15, 16, 17], # index=4 + [10, 10, 10, 5, 2, 10, 9, 10, 10, 11, 12, 10, 15, 16, 17], # index=5 + [10, 10, 10, 6, 1, 10, 10, 10, 10, 11, 12, 10, 15, 16, 17], # index=6 + [2, 2, 2, 10, 10, 5, 10, 10, 10, 11, 12, 2, 15, 16, 17], # index=7 + [10, 10, 10, 8, 10, 10, 1, 6, 10, 11, 12, 10, 15, 16, 17], # index=8 + [10, 10, 10, 9, 10, 10, 2, 5, 10, 11, 12, 10, 15, 16, 17], # index=9 + [10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 12, 10, 15, 16, 17], # index=10 + [11, 11, 11, 11, 11, 11, 11, 11, 10, 11, 12, 11, 15, 16, 17], # index=11 + [12, 12, 12, 12, 12, 12, 12, 12, 10, 11, 12, 12, 15, 16, 17], # index=12 + [5, 5, 5, 10, 7, 10, 10, 10, 10, 11, 12, 5, 15, 16, 17], # index=14 + [15, 15, 15, 15, 15, 15, 15, 15, 10, 11, 12, 15, 15, 16, 17], # index=15 + [16, 16, 16, 16, 16, 16, 
16, 16, 10, 11, 12, 16, 15, 16, 17], # index=16 + [17, 17, 17, 17, 17, 17, 17, 17, 10, 11, 12, 17, 15, 16, 17], # index=17 + ], + index=pd.Index([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17], name="index"), + columns=[ + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "14", + "15", + "16", + "17", + ], +) + + +def _register_relational_adjustment_mapping(): + if "relational_adjustment_mapping" not in orca.list_tables(): + orca.add_table("relational_adjustment_mapping", RELATIONAL_ADJUSTMENT_MAPPING) + + +_register_relational_adjustment_mapping() +# FIPS Code to Income Quartile +STATE_QUARTILE_LABELS = { + "01": 1, # Alabama + "02": 4, # Alaska + "04": 3, # Arizona + "05": 1, # Arkansas + "06": 4, # California + "08": 3, # Colorado + "09": 4, # Connecticut + "10": 3, # Delaware + "11": 4, # District of Columbia + "12": 3, # Florida + "13": 1, # Georgia + "15": 4, # Hawaii + "16": 3, # Idaho + "17": 2, # Illinois + "18": 1, # Indiana + "19": 1, # Iowa + "20": 1, # Kansas + "21": 2, # Kentucky + "22": 2, # Louisiana + "23": 3, # Maine + "24": 4, # Maryland + "25": 4, # Massachusetts + "26": 1, # Michigan + "27": 2, # Minnesota + "28": 1, # Mississippi + "29": 1, # Missouri + "30": 2, # Montana + "31": 2, # Nebraska + "32": 3, # Nevada + "33": 4, # New Hampshire + "34": 4, # New Jersey + "35": 2, # New Mexico + "36": 4, # New York + "37": 3, # North Carolina + "38": 1, # North Dakota + "39": 2, # Ohio + "40": 1, # Oklahoma + "41": 3, # Oregon + "42": 2, # Pennsylvania + "44": 4, # Rhode Island + "45": 2, # South Carolina + "46": 2, # South Dakota + "47": 1, # Tennessee + "48": 2, # Texas + "49": 3, # Utah + "50": 4, # Vermont + "51": 3, # Virginia + "53": 4, # Washington + "54": 1, # West Virginia + "55": 3, # Wisconsin + "56": 2, # Wyoming + "72": 3, # Puerto Rico +} diff --git a/demos/models/data_fix.py b/demos/models/data_fix.py index ad899bb..9a3eb87 100644 --- a/demos/models/data_fix.py +++ b/demos/models/data_fix.py @@ -1,4 
+1,5 @@ import orca +import pandas as pd from config import DEMOSConfig, get_config from loguru import logger @@ -127,3 +128,45 @@ def validate_persons_table(persons, households): persons.local.loc[cohabitating_people_married_idx, "MAR"] = 0 households.local = households.local.reindex(sorted(persons.household_id.unique())) + + +@orca.step() +def normalize_table_dtypes(persons, households): + """ + Ensure columns in the persons and households tables have consistent dtypes + before orca's checkpoint is written. + + Without this step, PyTables raises a PerformanceWarning because object-dtype + columns can contain mixed Python types (e.g. strings + float NaN when new + household rows are created with only lcm_county_id set, or integer IDs stored + as object). PyTables then falls back to pickling those columns, which is + slower and produces larger output files. + + This step inspects every object-dtype column generically: + - All non-null values are strings → fill NaN with '' + - All non-null values are numeric → cast to float via pd.to_numeric + - All non-null values are boolean → fill NaN with False, cast to bool + """ + + def _normalize(df): + for col in df.select_dtypes(include="object").columns: + s = df[col] + if not s.isna().any(): + continue # no NaN, nothing to fix + non_null = s.dropna() + if len(non_null) == 0: + continue # fully-empty column, skip + inferred = pd.api.types.infer_dtype(non_null) + if inferred == "string": + df[col] = s.fillna("") + elif inferred in ("integer", "mixed-integer", "floating"): + df[col] = pd.to_numeric(s, errors="coerce") + elif inferred == "boolean": + df[col] = s.where(s.notna(), other=False).astype(bool) + + p = persons.local + h = households.local + _normalize(p) + _normalize(h) + persons.local = p + households.local = h diff --git a/demos/models/employment.py b/demos/models/employment.py index e4df992..a24cf8e 100644 --- a/demos/models/employment.py +++ b/demos/models/employment.py @@ -75,11 +75,11 @@ def 
employment(persons): # Updating working status and income persons.local.loc[reindexed_exit_workforce == 1, "worker"] = 0 - persons.local.loc[reindexed_exit_workforce == 1, "earning"] = 0 + # persons.local.loc[reindexed_exit_workforce == 1, "earning"] = 0 persons.local.loc[reindexed_remain_unemployed == 0, "worker"] = 1 - persons.local.loc[reindexed_remain_unemployed == 0, "earning"] = ( - persons["new_earning"].loc[reindexed_remain_unemployed == 0].values - ) + # persons.local.loc[reindexed_remain_unemployed == 0, "earning"] = ( + # persons["new_earning"].loc[reindexed_remain_unemployed == 0].values + # ) log_execution_time(start_time, orca.get_injectable("year"), STEP_NAME) @@ -300,30 +300,30 @@ def hh_workers(persons): ) -@orca.column(table_name="households") -def income(persons): - """ - Aggregate household income from person-level earnings. +# @orca.column(table_name="households") +# def income(persons): +# """ +# Aggregate household income from person-level earnings. - Parameters - ---------- - persons : orca.Table - The persons table. +# Parameters +# ---------- +# persons : orca.Table +# The persons table. - Returns - ------- - pandas.Series - Total income per household. +# Returns +# ------- +# pandas.Series +# Total income per household. 
- Notes - ----- - This is for `HOUSEHOLDS` table - """ - return ( - persons.to_frame(["household_id", "earning"]) - .groupby("household_id") - .sum()["earning"] - ) +# Notes +# ----- +# This is for `HOUSEHOLDS` table +# """ +# return ( +# persons.to_frame(["household_id", "earning"]) +# .groupby("household_id") +# .sum()["earning"] +# ) @orca.table(cache=True, cache_scope="forever") @@ -368,3 +368,32 @@ def income_dist(persons): ) income_dist = pd.concat([income_dist, mu, sigma], axis=1) return income_dist + + +# ----------------------------------------------------------------------------------------- +# EMPLOYMENT MODEL AGE BIN COLUMNS (moved from variables.py) +# ----------------------------------------------------------------------------------------- + + +@orca.column("persons") +def age_emp_20_40(persons): + p = persons.to_frame(columns=["age"])["age"] + return p.between(20, 40, inclusive="both") * 1 + + +@orca.column("persons") +def age_emp_41_50(persons): + p = persons.to_frame(columns=["age"])["age"] + return p.between(41, 50, inclusive="both") * 1 + + +@orca.column("persons") +def age_emp_51_70(persons): + p = persons.to_frame(columns=["age"])["age"] + return p.between(51, 70, inclusive="both") * 1 + + +@orca.column("persons") +def age_emp_70plus(persons): + p = persons.to_frame(columns=["age"]) + return p.gt(70) * 1 diff --git a/demos/models/fatality.py b/demos/models/fatality.py index e885d5f..f58a21c 100644 --- a/demos/models/fatality.py +++ b/demos/models/fatality.py @@ -193,3 +193,38 @@ def run_and_calibrate_mortality_model(persons): if calibration_procedure is not None: return calibration_procedure.calibrate_and_run_model(model, model_data) return model.predict(model_data) + + +# ----------------------------------------------------------------------------------------- +# MORTALITY MODEL AGE BIN COLUMNS (moved from variables.py) +# ----------------------------------------------------------------------------------------- + + +@orca.column("persons") 
+def age_mort_21_40(persons): + p = persons.to_frame(columns=["age"])["age"] + return p.between(21, 40, inclusive="both") * 1 + + +@orca.column("persons") +def age_mort_41_50(persons): + p = persons.to_frame(columns=["age"])["age"] + return p.between(41, 50, inclusive="both") * 1 + + +@orca.column("persons") +def age_mort_51_70(persons): + p = persons.to_frame(columns=["age"])["age"] + return p.between(51, 70, inclusive="both") * 1 + + +@orca.column("persons") +def age_mort_71_90(persons): + p = persons.to_frame(columns=["age"])["age"] + return p.between(71, 90, inclusive="both") * 1 + + +@orca.column("persons") +def age_mort_90plus(persons): + p = persons.to_frame(columns=["age"]) + return p.gt(90) * 1 diff --git a/demos/models/household_reorg.py b/demos/models/household_reorg.py index f3df248..b3eb8df 100644 --- a/demos/models/household_reorg.py +++ b/demos/models/household_reorg.py @@ -272,15 +272,15 @@ def gt2(persons_grouped_household): @orca.column(table_name="households") -def hh_race_of_head(data="households.hh_race_id_of_head"): +def hh_head_race_str(data="households.hh_head_race_id"): """ - Maps `households.hh_race_id_of_head`, which is a numeric value, into 'white', 'black', 'asian' or other + Maps `households.hh_head_race_id` (numeric race_id of head) to a string: 'white', 'black', 'asian', or 'other'. 
""" return data.map({1: "white", 2: "black", 6: "asian", 7: "asian"}).fillna("other") @orca.column(table_name="households") -def hh_race_id_of_head(persons_grouped_household): +def hh_head_race_id(persons_grouped_household): """""" agg_df = persons_grouped_household.agg(race_of_head=("race_head", "sum")) return agg_df["race_of_head"] @@ -292,6 +292,146 @@ def hh_size(persons_grouped_household): return agg_df.map({1: "one", 2: "two", 3: "three"}).fillna("four or more") +# ----------------------------------------------------------------------------------------- +# HOUSEHOLD AGGREGATE COLUMNS (moved from variables.py) +# ----------------------------------------------------------------------------------------- + + +@orca.column("households") +def hh_n_children(households, persons): + hh = households.local.copy() + p = persons.local.copy() + children = p[p["age"] <= 17].groupby("household_id").count() + hh = hh.join(children[["age"]]).fillna(0) + return hh["age"] + + +@orca.column("households") +def hh_income_bin1(households): + df = households.to_frame(columns=["income"]) + return df.lt(25_000) * 1 + + +@orca.column("households") +def hh_income_bin2(households): + df = households.to_frame(columns=["income"])["income"] + return df.between(25_000, 50_000, inclusive="left") * 1 + + +@orca.column("households") +def hh_income_bin3(households): + df = households.to_frame(columns=["income"])["income"] + return df.between(50_000, 75_000, inclusive="left") * 1 + + +@orca.column("households") +def hh_income_bin4(households): + df = households.to_frame(columns=["income"])["income"] + return df.between(75_000, 150_000, inclusive="left") * 1 + + +@orca.column("households") +def hh_income_bin5(households): + df = households.to_frame(columns=["income"]) + return df.ge(150_000) * 1 + + +# Max education year of head or spouse (relate < 2) +@orca.column("households") +def hh_edu_top(persons): + df = persons.to_frame(columns=["household_id", "edu", "relate"]) + df = df[df["relate"] < 
2][["household_id", "edu"]] + return df.groupby("household_id").agg({"edu": "max"}) + + +# High school or equivalent (head/spouse max edu) +@orca.column("households") +def hh_edu_top_bin2(households): + df = households.to_frame(columns=["hh_edu_top"])["hh_edu_top"] + return df.between(16, 17, inclusive="both") * 1 + + +# Some college or more (head/spouse max edu) +@orca.column("households") +def hh_edu_top_bin3(households): + df = households.to_frame(columns=["hh_edu_top"]) + return df.gt(17) * 1 + + +@orca.column("households") +def hh_fam_work(persons): + df = persons.to_frame(columns=["relate", "worker", "household_id"]) + df = df[df["relate"] < 2] + return df.groupby("household_id").agg({"worker": "sum"}) + + +@orca.column("households") +def hh_fam_work2(households): + df = households.to_frame(columns=["hh_fam_work"]) + return df.eq(2) * 1 + + +# Average age of head and spouse +@orca.column("households") +def hh_age_avg(persons): + df = persons.to_frame(columns=["relate", "age", "household_id"]) + df = df[df["relate"] < 2] + return df.groupby("household_id").agg({"age": "mean"}) + + +@orca.column("households") +def hh_age_avg_bin2(households): + df = households.to_frame(columns=["hh_age_avg"])["hh_age_avg"] + return df.between(22, 35, inclusive="right") * 1 + + +@orca.column("households") +def hh_age_avg_bin3(households): + df = households.to_frame(columns=["hh_age_avg"])["hh_age_avg"] + return df.between(35, 60, inclusive="right") * 1 + + +@orca.column("households") +def hh_age_avg_bin4(households): + df = households.to_frame(columns=["hh_age_avg"]) + return df.gt(60) * 1 + + +# Minimum age of head and spouse +@orca.column("households") +def hh_age_min(persons): + df = persons.to_frame(columns=["relate", "age", "household_id"]) + df = df[df["relate"] < 2] + return df.groupby("household_id").agg({"age": "min"}) + + +@orca.column("households") +def hh_age_min_bin2(households): + df = households.to_frame(columns=["hh_age_min"])["hh_age_min"] + return 
df.between(22, 35, inclusive="right") * 1 + + +@orca.column("households") +def hh_age_min_bin3(households): + df = households.to_frame(columns=["hh_age_min"])["hh_age_min"] + return df.between(35, 60, inclusive="right") * 1 + + +@orca.column("households") +def hh_age_min_bin4(households): + df = households.to_frame(columns=["hh_age_min"]) + return df.gt(60) * 1 + + +# Race of household head (White, race_id == 1) +@orca.column("households") +def hh_head_race_white(persons): + df = persons.to_frame(columns=["relate", "race_white", "household_id"]) + df = df[df["relate"] == 0] + df.sort_values("household_id", inplace=True) + return df.set_index("household_id")["race_white"] + + def simultaneous_calibration( sim_cal_config, persons, diff --git a/demos/models/income.py b/demos/models/income.py new file mode 100644 index 0000000..5f3abee --- /dev/null +++ b/demos/models/income.py @@ -0,0 +1,219 @@ +import orca +import numpy as np +import pandas as pd +from demos.config import DEMOSConfig, get_config +from templates.utils.models import columns_in_formula +from templates import estimated_models, modelmanager as mm +import time +from logging_logic import log_execution_time +from .constants import STATE_QUARTILE_LABELS + +STEP_NAME = "income" + + +@orca.step(STEP_NAME) +def income(households): + """ """ + start_time = time.time() + predicted_income = run_and_calibrate_income_model(households) + households.local["income"] = predicted_income + log_execution_time(start_time, orca.get_injectable("year"), "income") + + +def run_and_calibrate_income_model(households): + # Load calibration config + demos_config: DEMOSConfig = get_config() + # calibration_procedure = demos_config.income_module_config.calibration_procedure + calibration_procedure = None + + # Get model data + model = mm.get_step("income_nworkers") + model_variables = columns_in_formula(model.model_expression) + model_data = households.to_frame(model_variables) + + # Calibrate if needed + if calibration_procedure is 
not None: + return calibration_procedure.calibrate_and_run_model(model, model_data) + return np.exp(model.predict(model_data)) + + +################### +# MODEL VARIABLES # +################### + + +@orca.column("households") +def true_hh_size(persons, households): + return persons.local.groupby("household_id").size().loc[households.index] + + +@orca.column(table_name="households") +def true_hh_workers(persons): + """ + Compute the number of workers per household. + + Parameters + ---------- + persons : orca.Table + The persons table. + + Returns + ------- + pandas.Series + Sum of total workers in each household, indexed by household_id. + """ + return persons.worker.groupby(persons.household_id).sum() + + +@orca.column("households") +def not_met_area(households): + return pd.Series(np.ones(households.local.shape[0]), index=households.local.index) + + +# Education variables +# TODO: This numbers for education are not updated beyon 19 in the education model +@orca.column("households") +def hh_head_edu_bin1(households, persons): + # Get the persons row for the head of every household (head is when relate == 0) + heads = persons.local[persons["relate"] == 0] + return ( + heads.set_index("household_id") + .loc[households.index, "edu"] + .isin([15, 16, 17]) + .astype(int) + ) + + +@orca.column("households") +def hh_head_edu_bin2(households, persons): + heads = persons.local[persons["relate"] == 0] + return (heads.set_index("household_id").loc[households.index, "edu"] == 18).astype( + int + ) + + +@orca.column("households") +def hh_head_edu_bin3(households, persons): + heads = persons.local[persons["relate"] == 0] + return (heads.set_index("household_id").loc[households.index, "edu"] >= 19).astype( + int + ) + + +# Job industry variables +# TODO: This column should be implemented more rigorously based on actual job industry data rather than random assignment +# @orca.column("households", cache=True, cache_scope="step") +# def job_industry(households): +# return 
pd.Series(np.random.choice([1, 2, 3, 4]), index=households.index) + + +@orca.column("households") +def job_industry_bin1(households): # First quartile + return (households["job_industry"] == 1).astype(int) + + +@orca.column("households") +def job_industry_bin2(households): + return (households["job_industry"] == 2).astype(int) + + +@orca.column("households") +def job_industry_bin3(households): + return (households["job_industry"] == 3).astype(int) + + +@orca.column("households") +def job_industry_bin4(households): + return (households["job_industry"] == 4).astype(int) + + +# TODO: This column should be implemented more rigorously based on actual job industry data rather than random assignment +# @orca.column("households", cache=True, cache_scope="step") +# def job_occupation(households): +# return pd.Series(np.random.choice([1, 2, 3, 4]), index=households.index) + + +@orca.column("households") +def job_occupation_bin1(households): # First quartile + return (households["job_occupation"] == 1).astype(int) + + +@orca.column("households") +def job_occupation_bin2(households): + return (households["job_occupation"] == 2).astype(int) + + +@orca.column("households") +def job_occupation_bin3(households): + return (households["job_occupation"] == 3).astype(int) + + +@orca.column("households") +def job_occupation_bin4(households): + return (households["job_occupation"] == 4).astype(int) + + +@orca.column("households") +def state_quartile(households): + return pd.Series( + households.local["lcm_county_id"] + .apply(lambda s: f"{int(s) // 1000:02d}") + .map(STATE_QUARTILE_LABELS) + .values, + index=households.local.index.values, + ) + + +@orca.column("households") +def state_quart_2(households): + return (households["state_quartile"] == 2).astype(int) + + +@orca.column("households") +def state_quart_3(households): + return (households["state_quartile"] == 3).astype(int) + + +@orca.column("households") +def state_quart_4(households): + return (households["state_quartile"] == 
4).astype(int) + + +@orca.column("households") +def hh_head_race_black(households, persons): + heads = persons.to_frame(["household_id", "race_black"])[persons["relate"] == 0] + return (heads.set_index("household_id").loc[households.index, "race_black"]).astype( + int + ) + + +@orca.column("households") +def hh_head_race_native_am(households, persons): + heads = persons.to_frame(["household_id", "race_native_am"])[persons["relate"] == 0] + return ( + heads.set_index("household_id").loc[households.index, "race_native_am"] + ).astype(int) + + +@orca.column("households") +def hh_head_race_asian(households, persons): + heads = persons.to_frame(["household_id", "race_asian"])[persons["relate"] == 0] + return (heads.set_index("household_id").loc[households.index, "race_asian"]).astype( + int + ) + + +@orca.column("households") +def hh_head_race_hawaiian(households, persons): + heads = persons.to_frame(["household_id", "race_hawaiian"])[persons["relate"] == 0] + return ( + heads.set_index("household_id").loc[households.index, "race_hawaiian"] + ).astype(int) + + +@orca.column("households") +def hh_head_race_acs_other(households, persons): + heads = persons.to_frame(["household_id", "race_acs_other"])[persons["relate"] == 0] + return ( + heads.set_index("household_id").loc[households.index, "race_acs_other"] + ).astype(int) diff --git a/demos/models/kids_moving.py b/demos/models/kids_moving.py index 6fac404..544b201 100644 --- a/demos/models/kids_moving.py +++ b/demos/models/kids_moving.py @@ -186,3 +186,138 @@ def run_and_calibrate_model(persons): print(f"{calibrate_iteration} iteration error: {error}") return kids_moving + + +# ----------------------------------------------------------------------------------------- +# KIDS MOVE MODEL COLUMNS (moved from variables.py) +# ----------------------------------------------------------------------------------------- + + +@orca.column("persons") +def age_km_16_18(persons): + p = persons.to_frame(columns=["age"])["age"] + return 
p.between(16, 18, inclusive="both") * 1 + + +@orca.column("persons") +def age_km_19_20(persons): + p = persons.to_frame(columns=["age"])["age"] + return p.between(19, 20, inclusive="both") * 1 + + +@orca.column("persons") +def age_km_21_25(persons): + p = persons.to_frame(columns=["age"])["age"] + return p.between(21, 25, inclusive="both") * 1 + + +@orca.column("persons") +def age_km_26_30(persons): + p = persons.to_frame(columns=["age"])["age"] + return p.between(26, 30, inclusive="both") * 1 + + +@orca.column("persons") +def age_km_30plus(persons): + p = persons.to_frame(columns=["age"]) + return p.gt(30) * 1 + + +# emp_idle_under60 × age_km cross-products +@orca.column("persons") +def emp_idle_under60_age_km_19_20(persons): + p = persons.to_frame(columns=["emp_idle_under60", "age_km_19_20"]) + return p["emp_idle_under60"] * p["age_km_19_20"] + + +@orca.column("persons") +def emp_idle_under60_age_km_21_25(persons): + p = persons.to_frame(columns=["emp_idle_under60", "age_km_21_25"]) + return p["emp_idle_under60"] * p["age_km_21_25"] + + +@orca.column("persons") +def emp_idle_under60_age_km_26_30(persons): + p = persons.to_frame(columns=["emp_idle_under60", "age_km_26_30"]) + return p["emp_idle_under60"] * p["age_km_26_30"] + + +@orca.column("persons") +def emp_idle_under60_age_km_30plus(persons): + p = persons.to_frame(columns=["emp_idle_under60", "age_km_30plus"]) + return p["emp_idle_under60"] * p["age_km_30plus"] + + +# emp_idle_over60 × age_km cross-products +@orca.column("persons") +def emp_idle_over60_age_km_19_20(persons): + p = persons.to_frame(columns=["emp_idle_over60", "age_km_19_20"]) + return p["emp_idle_over60"] * p["age_km_19_20"] + + +@orca.column("persons") +def emp_idle_over60_age_km_21_25(persons): + p = persons.to_frame(columns=["emp_idle_over60", "age_km_21_25"]) + return p["emp_idle_over60"] * p["age_km_21_25"] + + +@orca.column("persons") +def emp_idle_over60_age_km_26_30(persons): + p = persons.to_frame(columns=["emp_idle_over60", 
"age_km_26_30"]) + return p["emp_idle_over60"] * p["age_km_26_30"] + + +@orca.column("persons") +def emp_idle_over60_age_km_30plus(persons): + p = persons.to_frame(columns=["emp_idle_over60", "age_km_30plus"]) + return p["emp_idle_over60"] * p["age_km_30plus"] + + +# edu_hs_ged × age_km cross-products +@orca.column("persons") +def edu_hs_ged_age_km_19_20(persons): + p = persons.to_frame(columns=["age_km_19_20", "edu_hs_ged"]) + return p["age_km_19_20"] * p["edu_hs_ged"] + + +@orca.column("persons") +def edu_hs_ged_age_km_21_25(persons): + p = persons.to_frame(columns=["age_km_21_25", "edu_hs_ged"]) + return p["age_km_21_25"] * p["edu_hs_ged"] + + +@orca.column("persons") +def edu_hs_ged_age_km_26_30(persons): + p = persons.to_frame(columns=["age_km_26_30", "edu_hs_ged"]) + return p["age_km_26_30"] * p["edu_hs_ged"] + + +@orca.column("persons") +def edu_hs_ged_age_km_30plus(persons): + p = persons.to_frame(columns=["age_km_30plus", "edu_hs_ged"]) + return p["age_km_30plus"] * p["edu_hs_ged"] + + +# edu_college_plus × age_km cross-products +@orca.column("persons") +def edu_college_plus_age_km_19_20(persons): + p = persons.to_frame(columns=["age_km_19_20", "edu_college_plus"]) + return p["age_km_19_20"] * p["edu_college_plus"] + + +@orca.column("persons") +def edu_college_plus_age_km_21_25(persons): + p = persons.to_frame(columns=["age_km_21_25", "edu_college_plus"]) + return p["age_km_21_25"] * p["edu_college_plus"] + + +@orca.column("persons") +def edu_college_plus_age_km_26_30(persons): + p = persons.to_frame(columns=["age_km_26_30", "edu_college_plus"]) + return p["age_km_26_30"] * p["edu_college_plus"] + + +@orca.column("persons") +def edu_college_plus_age_km_30plus(persons): + p = persons.to_frame(columns=["age_km_30plus", "edu_college_plus"]) + return p["age_km_30plus"] * p["edu_college_plus"] diff --git a/demos/models/marriage.py b/demos/models/marriage.py index ef02e6d..f5d0721 100644 --- a/demos/models/marriage.py +++ b/demos/models/marriage.py @@ -48,10 
+48,20 @@ def update_married_households_random( if n_weddings == 0 and n_newcohabs == 0: return + # Compute per-person income: household income divided by number of persons in household + hh_sizes = persons.local.groupby("household_id").size() + person_hh_id = persons.local["household_id"] + per_person_income = pd.Series( + households.local["income"].reindex(person_hh_id.values).values + / hh_sizes.reindex(person_hh_id.values).values, + index=persons.local.index, + ) + ## Selecting individuals for marriage and cohabitation female_newmarried = ( persons.local.loc[(married_reindexed == 2) & female_index][ - ["age", "household_id", "earning", "relate"] + # ["age", "household_id", "earning", "relate"] + ["age", "household_id", "relate"] ] .sort_index(axis=0) .sample(n_weddings) @@ -59,7 +69,8 @@ def update_married_households_random( ) male_newmarried = ( persons.local.loc[(married_reindexed == 2) & male_index][ - ["age", "household_id", "earning", "relate"] + # ["age", "household_id", "earning", "relate"] + ["age", "household_id", "relate"] ] .sort_index(axis=0) .sample(n_weddings) @@ -67,7 +78,8 @@ def update_married_households_random( ) female_newcohab = ( persons.local.loc[(married_reindexed == 1) & female_index][ - ["age", "household_id", "earning", "relate"] + # ["age", "household_id", "earning", "relate"] + ["age", "household_id", "relate"] ] .sort_index(axis=0) .sample(n_newcohabs) @@ -75,7 +87,8 @@ def update_married_households_random( ) male_newcohab = ( persons.local.loc[(married_reindexed == 1) & male_index][ - ["age", "household_id", "earning", "relate"] + # ["age", "household_id", "earning", "relate"] + ["age", "household_id", "relate"] ] .sort_index(axis=0) .sample(n_newcohabs) @@ -84,6 +97,10 @@ def update_married_households_random( ## Modifying auxiliary dataframe to compute new relation and household_id ### Pairs are selected by age + female_newmarried["per_person_income"] = per_person_income.loc[ + female_newmarried.index + ] + 
male_newmarried["per_person_income"] = per_person_income.loc[male_newmarried.index] female_newmarried.sort_values("age", inplace=True) male_newmarried.sort_values("age", inplace=True) newmarried = pd.concat([male_newmarried, female_newmarried], axis=0) @@ -95,11 +112,15 @@ def update_married_households_random( newmarried["rnd"] = np.random.random(len(newmarried)) newmarried.sort_values( - by=["hh_group", "earning", "rnd"], ascending=[True, False, True], inplace=True + by=["hh_group", "per_person_income", "rnd"], + ascending=[True, False, True], + inplace=True, ) newmarried["new_relate"] = np.arange(len(newmarried)) % 2 # [0, 1, 0, 1, ...] newmarried["did_marry"] = True + female_newcohab["per_person_income"] = per_person_income.loc[female_newcohab.index] + male_newcohab["per_person_income"] = per_person_income.loc[male_newcohab.index] female_newcohab.sort_values("age", inplace=True) male_newcohab.sort_values("age", inplace=True) newcohab = pd.concat([male_newcohab, female_newcohab], axis=0) @@ -112,7 +133,9 @@ def update_married_households_random( newcohab["rnd"] = np.random.random(len(newcohab)) newcohab.sort_values( - by=["hh_group", "earning", "rnd"], ascending=[True, False, True], inplace=True + by=["hh_group", "per_person_income", "rnd"], + ascending=[True, False, True], + inplace=True, ) newcohab["new_relate"] = (np.arange(len(newcohab)) % 2) * 13 # [0, 13, 0, 13, ...] 
newcohab["did_marry"] = False diff --git a/demos/templates/calibration/procedures.py b/demos/templates/calibration/procedures.py index 9005d68..ba50f2c 100644 --- a/demos/templates/calibration/procedures.py +++ b/demos/templates/calibration/procedures.py @@ -1,4 +1,4 @@ -from ..estimated_models.binary_logit import BinaryLogitStep +from ..estimated_models.template import TemplateStep from pydantic import BaseModel, TypeAdapter, field_validator from typing import Literal import pandas as pd @@ -33,7 +33,7 @@ def _coerce_oberserved_values_table(cls, v): # otherwise assume it's already a str return v - def calibration_step(self, model: BinaryLogitStep, update_delta: float): + def calibration_step(self, model: TemplateStep, update_delta: float): model.fitted_parameters[0] += update_delta def compute_error(self, prediction: pd.Series, target: float): @@ -47,7 +47,7 @@ def compute_error(self, prediction: pd.Series, target: float): f"Tolerance type {self.tolerance_type} not implemented" ) - def calibrate_and_run_model(self, model: BinaryLogitStep, data: pd.DataFrame): + def calibrate_and_run_model(self, model: TemplateStep, data: pd.DataFrame): table_column = "count" if self.tolerance_type == "absolute" else "share" # Sort for reproducibility @@ -63,7 +63,9 @@ def calibrate_and_run_model(self, model: BinaryLogitStep, data: pd.DataFrame): total_iterations = 0 while error > self.tolerance and total_iterations < self.max_iter: - logger.info(f"{total_iterations} iter: {error}") + logger.info( + f"({self.tolerance_type}) Error from reference table {self.observed_values_table} at iteration {total_iterations}: {error}" + ) target_for_update = ( target_value if self.tolerance_type == "absolute" @@ -74,7 +76,9 @@ def calibrate_and_run_model(self, model: BinaryLogitStep, data: pd.DataFrame): prediction = model.predict(data) error = self.compute_error(prediction, target_value) - logger.info(f"{total_iterations} iter: {error}") + logger.info( + f"({self.tolerance_type}) Error from 
reference table {self.observed_values_table} at final iteration {total_iterations}: {error}" + ) return prediction diff --git a/demos/templates/estimated_models/__init__.py b/demos/templates/estimated_models/__init__.py index 4d77985..c279ff9 100644 --- a/demos/templates/estimated_models/__init__.py +++ b/demos/templates/estimated_models/__init__.py @@ -1,3 +1,4 @@ from .multinomial_logit import MultinomialLogitStep from .binary_logit import BinaryLogitStep +from .regression_model import RegressionStep from .template import TemplateStep diff --git a/demos/templates/estimated_models/regression_model.py b/demos/templates/estimated_models/regression_model.py new file mode 100644 index 0000000..99c1373 --- /dev/null +++ b/demos/templates/estimated_models/regression_model.py @@ -0,0 +1,267 @@ +from __future__ import print_function + +import numpy as np +import patsy +import pandas as pd +from statsmodels.api import OLS + +import orca + +from .. import modelmanager +from ..utils.misc import get_data +from .template import TemplateStep + + +@modelmanager.template +class RegressionStep(TemplateStep): + """ + A class for building ordinary least squares (OLS) regression model steps. This + extends TemplateStep, where some common functionality is defined. Estimation is + handled by Statsmodels and simulation is handled within this class. + + Expected usage: + - create a model object + - specify some parameters + - run the `fit()` method + - iterate as needed + + Then, for simulation: + - specify some simulation parameters + - use the `run()` method for interactive testing + - use `modelmanager.register()` to save the model to Orca and disk + - registered steps can be accessed via ModelManager and Orca + + All parameters listed in the constructor can be set directly on the class object, + at any time. + + Parameters + ---------- + tables : str or list of str, optional + Name(s) of Orca tables to draw data from. The first table is the primary one. 
+ Any additional tables need to have merge relationships ("broadcasts") specified + so that they can be merged unambiguously onto the first table. Among them, the + tables must contain all variables used in the model expression and filters. The + left-hand-side variable should be in the primary table. The `tables` parameter is + required for fitting a model, but it does not have to be provided when the object + is created. + + model_expression : str, optional + Patsy formula containing both the left- and right-hand sides of the model + expression: http://patsy.readthedocs.io/en/latest/formulas.html + This parameter is required for fitting a model, but it does not have to be + provided when the object is created. + + filters : str or list of str, optional + Filters to apply to the data before fitting the model. These are passed to + `pd.DataFrame.query()`. Filters are applied after any additional tables are merged + onto the primary one. Replaces the `fit_filters` argument in UrbanSim. + + out_tables : str or list of str, optional + Name(s) of Orca tables to use for simulation. If not provided, the `tables` + parameter will be used. Same guidance applies: the tables must be able to be + merged unambiguously, and must include all columns used in the right-hand-side + of the model expression and in the `out_filters`. + + out_column : str, optional + Name of the column to write simulated values to. If not provided, the left-hand- + side variable from the model expression will be used. + + out_filters : str or list of str, optional + Filters to apply to the data before simulation. If not provided, no filters will + be applied. Replaces the `predict_filters` argument in UrbanSim. + + name : str, optional + Name of the model step, passed to ModelManager. If none is provided, a name is + generated each time the `fit()` method runs. + + tags : list of str, optional + Tags, passed to ModelManager. 
+ + """ + + def __init__( + self, + tables=None, + model_expression=None, + filters=None, + out_tables=None, + out_column=None, + out_filters=None, + name=None, + tags=[], + ): + + # Parent class can initialize the standard parameters + TemplateStep.__init__( + self, + tables=tables, + model_expression=model_expression, + filters=filters, + out_tables=out_tables, + out_column=out_column, + out_transform=None, + out_filters=out_filters, + name=name, + tags=tags, + ) + + # Placeholders for model fit data, filled in by fit() or from_dict() + self.summary_table = None + self.fitted_parameters = None + + @classmethod + def from_dict(cls, d): + """ + Create an object instance from a saved dictionary representation. + + Parameters + ---------- + d : dict + + Returns + ------- + RegressionStep + + """ + obj = cls( + tables=d["tables"], + model_expression=d["model_expression"], + filters=d["filters"], + out_tables=d["out_tables"], + out_column=d["out_column"], + out_filters=d["out_filters"], + name=d["name"], + tags=d["tags"], + ) + + obj.summary_table = d["summary_table"] + obj.fitted_parameters = d["fitted_parameters"] + + return obj + + def to_dict(self): + """ + Create a dictionary representation of the object. + + Returns + ------- + dict + + """ + d = TemplateStep.to_dict(self) + + # Add parameters not in parent class + d.update( + { + "summary_table": self.summary_table, + "fitted_parameters": self.fitted_parameters, + } + ) + return d + + def fit(self): + """ + Fit the model; save and report results. Uses the Statsmodels OLS class with + default estimation settings. + + The `fit()` method can be run as many times as desired. Results will not be saved + with Orca or ModelManager until the `register()` method is run. 
+ + Parameters + ---------- + None + + Returns + ------- + None + + """ + df = get_data( + tables=self.tables, + filters=self.filters, + model_expression=self.model_expression, + ) + + m = OLS.from_formula(data=df, formula=self.model_expression) + results = m.fit() + + self.name = self._generate_name() + self.summary_table = str(results.summary()) + print(self.summary_table) + + self.fitted_parameters = results.params.tolist() # params is a pd.Series + + def predict(self, data: pd.DataFrame) -> pd.Series: + """ + Generate continuous predictions from fitted OLS parameters. + + Parameters + ---------- + data : pd.DataFrame + Input data containing the right-hand-side variables of the model expression. + + Returns + ------- + pd.Series + Predicted values with the same index as `data`. + + """ + data = data.sort_index(axis=0) + rhs = self.model_expression.split("~", 1)[1] + dm = patsy.dmatrix(data=data, formula_like=rhs, return_type="dataframe") + + predictions = np.dot(dm, self.fitted_parameters) + return pd.Series(predictions, index=data.index) + + def run(self): + """ + Run the model step: calculate predicted values and use them to update a column. + + Predicted values are saved to the class object for interactive use + (`choices`, with type pd.Series) but are not persisted in the dictionary + representation of the model step. + + Parameters + ---------- + None + + Returns + ------- + None + + """ + df = get_data( + tables=self.out_tables, + fallback_tables=self.tables, + filters=self.out_filters, + model_expression=self.model_expression, + extra_columns=self.out_column, + ) + + predictions = self.predict(df) + self.choices = predictions + + colname = self._get_out_column() + tabname = self._get_out_table() + + orca.get_table(tabname).update_col_from_series(colname, predictions, cast=True) + + def run_with_data(self, df): + """ + Run prediction on a provided DataFrame and return the result without updating + any Orca tables. 
+ + Parameters + ---------- + df : pd.DataFrame + Input data containing the right-hand-side variables of the model expression. + + Returns + ------- + pd.Series + Predicted values with the same index as `df`. + + """ + predictions = self.predict(df) + self.choices = predictions + return predictions diff --git a/demos/variables.py b/demos/variables.py index daa4dda..446108e 100644 --- a/demos/variables.py +++ b/demos/variables.py @@ -173,499 +173,165 @@ def intercept(persons): return np.ones(size) -# @orca.column('persons') -# def dead(persons): -# size = persons.to_frame(columns=["age"]).shape[0] -# return np.zeros(size) - 99 - -# @orca.column('persons') -# def stop(persons): -# size = persons.to_frame(columns=["age"]).shape[0] -# return np.zeros(size) - 99 - -# @orca.column('persons') -# def kids_move(persons): -# size = persons.to_frame(columns=["age"]).shape[0] -# return np.zeros(size) - 99 - - -@orca.column("persons") -def agebin1_labor(persons): - p = persons.to_frame(columns=["age"])["age"] - return p.between(20, 30, inclusive="both") * 1 - - -@orca.column("persons") -def agebin2_labor(persons): - p = persons.to_frame(columns=["age"])["age"] - return p.between(31, 40, inclusive="both") * 1 - - -@orca.column("persons") -def agebin3_labor(persons): - p = persons.to_frame(columns=["age"])["age"] - return p.between(41, 50, inclusive="both") * 1 - - -@orca.column("persons") -def agebin4_labor(persons): - p = persons.to_frame(columns=["age"])["age"] - return p.between(51, 70, inclusive="both") * 1 - - -@orca.column("persons") -def agebin5_labor(persons): - p = persons.to_frame(columns=["age"])["age"] - return (p > 70) * 1 - - @orca.column("persons") -def agebin6_labor(persons): - p = persons.to_frame(columns=["age"])["age"] - return p.between(20, 40, inclusive="both") * 1 - - -@orca.column("persons") -def agebin1(persons): - p = persons.to_frame(columns=["age"])["age"] - return p.between(16, 22, inclusive="both") * 1 - - -@orca.column("persons") -def 
agebin2(persons): +def age_23_35(persons): p = persons.to_frame(columns=["age"])["age"] return p.between(23, 35, inclusive="both") * 1 @orca.column("persons") -def agebin3(persons): +def age_36_60(persons): p = persons.to_frame(columns=["age"])["age"] return p.between(36, 60, inclusive="both") * 1 @orca.column("persons") -def agebin4(persons): +def age_60plus(persons): p = persons.to_frame(columns=["age"]) return p.gt(60) * 1 -################### added for updated - - -@orca.column("persons") -def agebin1_new(persons): - p = persons.to_frame(columns=["age"])["age"] - return p.between(21, 40, inclusive="both") * 1 - - -@orca.column("persons") -def agebin2_new(persons): - p = persons.to_frame(columns=["age"])["age"] - return p.between(41, 50, inclusive="both") * 1 - - -@orca.column("persons") -def agebin3_new(persons): - p = persons.to_frame(columns=["age"])["age"] - return p.between(51, 70, inclusive="both") * 1 - - -@orca.column("persons") -def agebin4_new(persons): - p = persons.to_frame(columns=["age"])["age"] - return p.between(71, 90, inclusive="both") * 1 - - -@orca.column("persons") -def agebin5_new(persons): - p = persons.to_frame(columns=["age"]) - return p.gt(90) * 1 - - -#################### - - -@orca.column("persons") -def agebin1_mo(persons): - p = persons.to_frame(columns=["age"])["age"] - return p.between(16, 18, inclusive="both") * 1 - - -@orca.column("persons") -def agebin2_mo(persons): - p = persons.to_frame(columns=["age"])["age"] - return p.between(19, 20, inclusive="both") * 1 - - -@orca.column("persons") -def agebin3_mo(persons): - p = persons.to_frame(columns=["age"])["age"] - return p.between(21, 25, inclusive="both") * 1 - - -@orca.column("persons") -def agebin4_mo(persons): - p = persons.to_frame(columns=["age"])["age"] - return p.between(26, 30, inclusive="both") * 1 - - -@orca.column("persons") -def agebin5_mo(persons): - p = persons.to_frame(columns=["age"])["age"] - return p.gt(30) * 1 - - -# Male -@orca.column("persons") -def 
gender1(persons): - p = persons.to_frame(columns=["sex"]) - return p.eq(1).astype(int) - - # Female @orca.column("persons") -def gender2(persons): +def sex_female(persons): p = persons.to_frame(columns=["sex"]) return p.eq(2).astype(int) @orca.column("persons") -def employbin1(persons): - p = persons.to_frame(columns=["worker"]) - return p.eq(1) * 1 - - -@orca.column("persons") -def employbin2(persons): +def emp_idle_under60(persons): p = persons.to_frame(columns=["worker", "age"]) return (p["worker"].eq(0) & p["age"].lt(60)).astype(int) @orca.column("persons") -def employbin3(persons): +def emp_idle_over60(persons): p = persons.to_frame(columns=["worker", "age"]) return (p["worker"].eq(0) & p["age"].ge(60)).astype(int) -@orca.column("persons") -def employ2_agebin2_mo(persons): - p = persons.to_frame(columns=["employbin2", "agebin2_mo"]) - return p["employbin2"] * p["agebin2_mo"] - - -@orca.column("persons") -def employ2_agebin3_mo(persons): - p = persons.to_frame(columns=["employbin2", "agebin3_mo"]) - return p["employbin2"] * p["agebin3_mo"] - - -@orca.column("persons") -def employ2_agebin4_mo(persons): - p = persons.to_frame(columns=["employbin2", "agebin4_mo"]) - return p["employbin2"] * p["agebin4_mo"] - - -@orca.column("persons") -def employ2_agebin5_mo(persons): - p = persons.to_frame(columns=["employbin2", "agebin5_mo"]) - return p["employbin2"] * p["agebin5_mo"] - - -### - - -@orca.column("persons") -def employ3_agebin2_mo(persons): - p = persons.to_frame(columns=["employbin3", "agebin2_mo"]) - return p["employbin3"] * p["agebin2_mo"] - - -@orca.column("persons") -def employ3_agebin3_mo(persons): - p = persons.to_frame(columns=["employbin3", "agebin3_mo"]) - return p["employbin3"] * p["agebin3_mo"] - - -@orca.column("persons") -def employ3_agebin4_mo(persons): - p = persons.to_frame(columns=["employbin3", "agebin4_mo"]) - return p["employbin3"] * p["agebin4_mo"] - - -@orca.column("persons") -def employ3_agebin5_mo(persons): - p = 
persons.to_frame(columns=["employbin3", "agebin5_mo"]) - return p["employbin3"] * p["agebin5_mo"] - - -# Less than high school -@orca.column("persons") -def edubin1(persons): - p = persons.to_frame(columns=["edu"]) - return p.lt(16) * 1 +# ----------------------------------------------------------------------------------------- +# EDUCATION VARIABLES +# ----------------------------------------------------------------------------------------- # High School or GED @orca.column("persons") -def edubin2(persons): +def edu_hs_ged(persons): p = persons.to_frame(columns=["edu"])["edu"] return p.between(16, 17, inclusive="both") * 1 # Some college or more @orca.column("persons") -def edubin3(persons): +def edu_college_plus(persons): p = persons.to_frame(columns=["edu"]) return p.gt(17) * 1 +# ----------------------------------------------------------------------------------------- +# RACE VARIABLES +# ----------------------------------------------------------------------------------------- + + @orca.column("persons") -def race_wht(persons): +def race_white(persons): p = persons.to_frame(columns=["race_id"]) return p.eq(1) * 1 @orca.column("persons") -def race_blk(persons): +def race_black(persons): p = persons.to_frame(columns=["race_id"]) return p.eq(2) * 1 @orca.column("persons") -def race_asn(persons): +def race_asian_pi(persons): p = persons.to_frame(columns=["race_id"])["race_id"] return p.between(6, 7) * 1 -list_of_other_races = [3, 4, 5, 8, 9] - - -# Other Races -@orca.column("persons") -def race_other(persons): - p = persons.to_frame(columns=["race_id"]) - return p.isin(list_of_other_races) * 1 - - -# White -@orca.column("persons") -def race1(persons): - p = persons.to_frame(columns=["race_wht"]) - return p["race_wht"] - - -# Black -@orca.column("persons") -def race2(persons): - p = persons.to_frame(columns=["race_blk"]) - return p["race_blk"] - - native_american_list = [3, 4, 5] -# Native American +# Native American (ACS race codes 3, 4, 5) @orca.column("persons") 
-def race3(persons): +def race_native_am(persons): p = persons.to_frame(columns=["race_id"]) return p.isin(native_american_list) * 1 -# Asian or Pacific Islander -@orca.column("persons") -def race4(persons): - p = persons.to_frame(columns=["race_asn"]) - return p["race_asn"] - - -# Other +# Hawaiian (ACS race code 7) @orca.column("persons") -def race5(persons): +def race_hawaiian(persons): p = persons.to_frame(columns=["race_id"]) - return p.eq(8) * 1 + return p.eq(7) * 1 -# Multiracial +# Asian (ACS race code 6) @orca.column("persons") -def race6(persons): +def race_asian(persons): p = persons.to_frame(columns=["race_id"]) - return p.eq(9) * 1 - - -@orca.column("persons") -def age2_edu2(persons): - p = persons.to_frame(columns=["agebin2", "edubin2"]) - p["age2_edu2"] = p["agebin2"] * p["edubin2"] - return p["age2_edu2"] - - -@orca.column("persons") -def age3_edu2(persons): - p = persons.to_frame(columns=["agebin3", "edubin2"]) - p["age3_edu2"] = p["agebin3"] * p["edubin2"] - return p["age3_edu2"] - - -@orca.column("persons") -def age4_edu2(persons): - p = persons.to_frame(columns=["agebin4", "edubin2"]) - p["age4_edu2"] = p["agebin4"] * p["edubin2"] - return p["age4_edu2"] - - -@orca.column("persons") -def age2_edu3(persons): - p = persons.to_frame(columns=["agebin2", "edubin3"]) - p["age2_edu3"] = p["agebin2"] * p["edubin3"] - return p["age2_edu3"] + return p.eq(6) * 1 -@orca.column("persons") -def age3_edu3(persons): - p = persons.to_frame(columns=["agebin3", "edubin3"]) - p["age3_edu3"] = p["agebin3"] * p["edubin3"] - return p["age3_edu3"] - - -@orca.column("persons") -def age4_edu3(persons): - p = persons.to_frame(columns=["agebin4", "edubin3"]) - p["age4_edu3"] = p["agebin4"] * p["edubin3"] - return p["age4_edu3"] - - -@orca.column("persons") -def edu2_agebin2_mo(persons): - p = persons.to_frame(columns=["agebin2_mo", "edubin2"]) - p["edu2_agebin2_mo"] = p["agebin2_mo"] * p["edubin2"] - return p["edu2_agebin2_mo"] - - -@orca.column("persons") -def 
edu2_agebin3_mo(persons): - p = persons.to_frame(columns=["agebin3_mo", "edubin2"]) - p["edu2_agebin3_mo"] = p["agebin3_mo"] * p["edubin2"] - return p["edu2_agebin3_mo"] - - -@orca.column("persons") -def edu2_agebin4_mo(persons): - p = persons.to_frame(columns=["agebin4_mo", "edubin2"]) - p["edu2_agebin4_mo"] = p["agebin4_mo"] * p["edubin2"] - return p["edu2_agebin4_mo"] - - -@orca.column("persons") -def edu2_agebin5_mo(persons): - p = persons.to_frame(columns=["agebin5_mo", "edubin2"]) - p["edu2_agebin5_mo"] = p["agebin5_mo"] * p["edubin2"] - return p["edu2_agebin5_mo"] - - -@orca.column("persons") -def edu3_agebin2_mo(persons): - p = persons.to_frame(columns=["agebin2_mo", "edubin3"]) - p["edu3_agebin2_mo"] = p["agebin2_mo"] * p["edubin3"] - return p["edu3_agebin2_mo"] +list_of_other_races = [3, 4, 5, 8, 9] +# Broad "other" category used in employment models @orca.column("persons") -def edu3_agebin3_mo(persons): - p = persons.to_frame(columns=["agebin3_mo", "edubin3"]) - p["edu3_agebin3_mo"] = p["agebin3_mo"] * p["edubin3"] - return p["edu3_agebin3_mo"] +def race_other(persons): + p = persons.to_frame(columns=["race_id"]) + return p.isin(list_of_other_races) * 1 +# ACS "some other race" (code 8) — used as building block for household head race @orca.column("persons") -def edu3_agebin4_mo(persons): - p = persons.to_frame(columns=["agebin4_mo", "edubin3"]) - p["edu3_agebin4_mo"] = p["agebin4_mo"] * p["edubin3"] - return p["edu3_agebin4_mo"] +def race_acs_other(persons): + p = persons.to_frame(columns=["race_id"]) + return p.eq(8) * 1 -@orca.column("persons") -def edu3_agebin5_mo(persons): - p = persons.to_frame(columns=["agebin5_mo", "edubin3"]) - p["edu3_agebin5_mo"] = p["agebin5_mo"] * p["edubin3"] - return p["edu3_agebin5_mo"] +# ----------------------------------------------------------------------------------------- +# MARITAL STATUS VARIABLES +# ----------------------------------------------------------------------------------------- @orca.column("persons") 
-def marital1(persons): +def mar_married(persons): p = persons.to_frame(columns=["MAR"]) return p.eq(1).astype(int) @orca.column("persons") -def marital2(persons): +def mar_widowed(persons): p = persons.to_frame(columns=["MAR"]) return p.eq(2).astype(int) @orca.column("persons") -def marital3(persons): - p = persons.to_frame(columns=["MAR"]) - # print(persons.local.columns) - return p.eq(3).astype(int) - - -@orca.column("persons") -def marital4(persons): - p = persons.to_frame(columns=["MAR"]) - return p.gt(3).astype(int) - - -@orca.column("persons") -def married_before(persons): - p = persons.to_frame(columns=["MAR"]) - return p["MAR"].between(2, 4) * 1 - - -@orca.column("persons") -def marital25(persons): +def mar_widowed_or_never(persons): p = persons.to_frame(columns=["MAR"]) return p.isin([2, 5]).astype(int) @orca.column("persons") -def marital34(persons): +def mar_div_or_sep(persons): p = persons.to_frame(columns=["MAR"]) return p.isin([3, 4]).astype(int) -######## NOTE: Not needed for now -# # PERSON VARIABLES -# # ----------------------------------------------------------------------------------------- - - -# @orca.column("persons", cache=True) -# def mandatory_work_zone_id(persons): -# return persons.work_zone_id.where(persons.age > 17, "-1").astype(str) - - -# @orca.column("persons", cache=True) -# def mandatory_school_zone_id(persons): -# return persons.school_zone_id.where(persons.age > 17, "-1").astype(str) - - -# @orca.column("persons", cache=True) -# def mandatory_work_dummy(persons): -# has_work = persons.mandatory_work_zone_id != "-1" -# return has_work.astype(int) - - -# @orca.column("persons", cache=True) -# def mandatory_school_dummy(persons): -# has_school = persons.mandatory_school_zone_id != "-1" -# return has_school.astype(int) +# ----------------------------------------------------------------------------------------- +# EDUCATION VARIABLES +# ----------------------------------------------------------------------------------------- -# 
@orca.column("persons", cache=True) -# def mandatory_activity_dummy(persons): -# school_or_work = (persons.mandatory_school_dummy.astype(bool)) | ( -# persons.mandatory_work_dummy.astype(bool) -# ) -# return school_or_work.astype(int) +# High School or GED +@orca.column("persons") +def edu_hs_ged(persons): + p = persons.to_frame(columns=["edu"])["edu"] + return p.between(16, 17, inclusive="both") * 1 # ----------------------------------------------------------------------------------------- @@ -678,19 +344,6 @@ def intercept(households): return np.ones(households.local.shape[0]) -# @orca.column('households') -# def birth(households): -# return np.zeros(households.local.shape[0]) - 99 - -# @orca.column('households') -# def divorced(households): -# return np.zeros(households.local.shape[0]) - 99 - -# @orca.column('households') -# def birth(households): -# return np.zeros(households.local.shape[0]) - 99 - - @orca.column("households", cache=True, cache_scope="step") def county_id(households): hh = households.to_frame("block_id") @@ -705,9 +358,11 @@ def tract_id(households): @orca.column("households", cache=True) def hh_type(): - hh = orca.get_table("households").to_frame(["persons", "tenure", "age_of_head"]) + hh = orca.get_table("households").to_frame( + ["hh_n_persons", "tenure", "age_of_head"] + ) hh["gt55"] = (hh.age_of_head >= 55).astype("int") - hh["gt2"] = (hh.persons >= 2).astype("int") + hh["gt2"] = (hh.hh_n_persons >= 2).astype("int") hh["hh_type"] = 0 hh.loc[(hh.tenure == 1) & (hh.gt2 == 0) & (hh.gt55 == 0), "hh_type"] = 1 hh.loc[(hh.tenure == 1) & (hh.gt2 == 0) & (hh.gt55 == 1), "hh_type"] = 2 @@ -722,84 +377,6 @@ def hh_type(): return hh.hh_type -@orca.column("households") -def persons(households, persons): - hh = households.local.copy() - persons = pd.DataFrame(persons.local.groupby("household_id").size()) - persons.columns = ["total_persons"] - hh = hh.join(persons) - return hh["total_persons"] - - -@orca.column("households") -def children(households, 
persons): - hh = households.local.copy() - persons = persons.local.copy() - children = persons[persons["age"] <= 17].groupby("household_id").count() - hh = hh.join(children[["age"]]).fillna(0) - return hh["age"] - - -@orca.column("households") -def adult_count(households, persons): - hh = households.local.copy() - p = persons.local.copy() - p = p[p["age"] >= 18].groupby("household_id").count() - return hh.join(p)["age"].fillna(0) - - -@orca.column("households") -def adult_count2(households, persons): - hh = households.local.copy() - p = persons.local.copy() - p = p[p["age"] >= 18].groupby("household_id").count() == 2 - return hh.join(p)["age"].fillna(False) * 1 - - -@orca.column("households") -def adult_count_gt2(households, persons): - hh = households.local.copy() - p = persons.local.copy() - p = p[p["age"] >= 18].groupby("household_id").count() > 2 - return hh.join(p)["age"].fillna(False) * 1 - - -@orca.column("households") -def persons_65plus(households, persons): - hh = households.local.copy() - persons = persons.local.copy() - persons = persons[persons["age"] >= 65].groupby("household_id").count() - hh = hh.join(persons[["age"]]).fillna(0) - return hh["age"] - - -@orca.column("households") -def persons_black(households, persons): - hh = households.local.copy() - persons = persons.local.copy() - persons = persons[persons["race"] == "black"].groupby("household_id").count() - hh = hh.join(persons[["race"]]).fillna(0) - return hh["race"] - - -@orca.column("households") -def persons_hispanic(households, persons): - hh = households.local.copy() - persons = persons.local.copy() - persons = persons[persons["hispanic"] == 1].groupby("household_id").count() - hh = hh.join(persons[["hispanic"]]).fillna(0) - return hh["hispanic"] - - -@orca.column("households") -def persons_asian(households, persons): - hh = households.local.copy() - persons = persons.local.copy() - persons = persons[persons["race"] == "asian"].groupby("household_id").count() - hh = 
hh.join(persons[["race"]]).fillna(0) - return hh["race"] - - @orca.column("households") def income_segment(households): s = pd.Series( @@ -816,387 +393,6 @@ def income_segment(households): return s -@orca.column("households") -def income_bin1(households): - df = households.to_frame(columns=["income"]) - return df.lt(25_000) * 1 - - -@orca.column("households") -def income_bin2(households): - df = households.to_frame(columns=["income"])["income"] - return df.between(25_000, 50_000, inclusive="left") * 1 - - -@orca.column("households") -def income_bin3(households): - df = households.to_frame(columns=["income"])["income"] - return df.between(50000, 75000, inclusive="left") * 1 - - -@orca.column("households") -def income_bin4(households): - df = households.to_frame(columns=["income"])["income"] - return df.between(75000, 150000, inclusive="left") * 1 - - -@orca.column("households") -def income_bin5(households): - df = households.to_frame(columns=["income"]) - return df.ge(150000) * 1 - - -# max edu year -@orca.column("households") -def top_edu(persons): - df = persons.to_frame(columns=["household_id", "edu", "relate"]) - df = df[df["relate"] < 2][["household_id", "edu"]] - return df.groupby("household_id").agg({"edu": "max"}) - - -# less than high school -@orca.column("households") -def top_edu_bin1(households): - df = households.to_frame(columns=["top_edu"]) - return df.lt(16) * 1 - - -# high school or equivalent -@orca.column("households") -def top_edu_bin2(households): - df = households.to_frame(columns=["top_edu"])["top_edu"] - return df.between(16, 17, inclusive="both") * 1 - - -# Some college, college, or more than college -@orca.column("households") -def top_edu_bin3(households): - df = households.to_frame(columns=["top_edu"]) - return df.gt(17) * 1 - - -@orca.column("households") -def have_spouse(persons): - df = persons.to_frame(columns=["household_id", "relate"]) - filtered_houses = df[df["relate"] == 1]["household_id"] - return (df["relate"] == 
1).groupby(df["household_id"]).sum() - - -@orca.column("households") -def hh_birth_agebin1(persons, households): - df = persons.to_frame(columns=["household_id", "relate", "sex", "age"]) - # households_df = households.to_frame(columns=["have_spouse"]).reset_index() - # df = df.merge(households_df, on="household_id") - df.loc[:, "is_head"] = np.where(df["relate"] == 0, 1, 0) - df.loc[:, "is_female"] = np.where(df["sex"] == 2, 1, 0) - df.loc[:, "female_head"] = df["is_head"] * df["is_female"] - df.loc[:, "is_head_or_spouse"] = np.where(df["relate"].isin([0, 1, 13]), 1, 0) - df.loc[:, "age_head"] = df["age"] * df["is_head"] - df.loc[:, "age_female"] = df["age"] * df["is_female"] * df["is_head_or_spouse"] - df.loc[:, "is_spouse"] = np.where(df["relate"].isin([1, 13]), 1, 0) - df.loc[:, "head_spouse"] = df["is_head"] + df["is_spouse"] - df = df.groupby("household_id").agg( - # size = ("person", "sum"), - age_head=("age_head", "sum"), - age_female=("age_female", "sum"), - head_spouse=("head_spouse", "sum"), - ) - df.loc[:, "age_final"] = np.where( - df["head_spouse"] >= 2, df["age_female"], df["age_head"] - ) - # print("NEW DF", df.shape[0]) - return (df["age_final"] <= 27).astype(int) - - -@orca.column("households") -def hh_birth_agebin2(persons, households): - df = persons.to_frame(columns=["household_id", "relate", "sex", "age"]) - # households_df = households.to_frame(columns=["have_spouse"]).reset_index() - # df = df.merge(households_df, on="household_id") - df.loc[:, "is_head"] = np.where(df["relate"] == 0, 1, 0) - df.loc[:, "is_female"] = np.where(df["sex"] == 2, 1, 0) - df.loc[:, "female_head"] = df["is_head"] * df["is_female"] - df.loc[:, "is_head_or_spouse"] = np.where(df["relate"].isin([0, 1, 13]), 1, 0) - df.loc[:, "age_head"] = df["age"] * df["is_head"] - df.loc[:, "age_female"] = df["age"] * df["is_female"] * df["is_head_or_spouse"] - df.loc[:, "is_spouse"] = np.where(df["relate"].isin([1, 13]), 1, 0) - df.loc[:, "head_spouse"] = df["is_head"] + 
df["is_spouse"] - df = df.groupby("household_id").agg( - # size = ("person", "sum"), - age_head=("age_head", "sum"), - age_female=("age_female", "sum"), - head_spouse=("head_spouse", "sum"), - ) - df.loc[:, "age_final"] = np.where( - df["head_spouse"] >= 2, df["age_female"], df["age_head"] - ) - # print(df.shape[0]) - return (df["age_final"].between(27, 35, inclusive="right")).astype(int) - - -# @orca.column('households') -# def use_agebin3(persons, households): -# df = persons.to_frame(columns=['household_id', 'relate']) -# households_df = households.to_frame(columns=["have_spouse"]).reset_index() -# df.merge(households_df, on="household_id") -# subset = df[df["relate"].isin([0, 1])] - -# subset["is_head"] = np.where(df["relate"]==0, 1, 0) -# subset["is_female"] = np.where(df["sex"]==2, 1, 0) -# subset["female_head"] = df["is_head"] * df["is_female"] -# subset["person"] = 1 -# subset["age_head"] = subset["age"] * subset["is_head"] -# subset["age_female"] = subset["age"] * subset["is_female"] -# subset = subset.groupby("household_id").agg( -# size = ("person", "sum"), -# age_head = ("age_head", "sum"), -# age_female = ("age_female", "sum") -# ).reset_index() -# subset["age_final"] = np.where(subset["size"]==2, subset["age_female"], subset["age_head"]) - -# return np.where(subset["age_final"]>35, 1, 0) - - -@orca.column("households") -def fam_work(persons): - df = persons.to_frame(columns=["relate", "worker", "household_id"]) - df = df[df["relate"] < 2] - return df.groupby("household_id").agg({"worker": "sum"}) - - -@orca.column("households") -def fam_work0(households): - df = households.to_frame(columns=["fam_work"]) - return df.eq(0) * 1 - - -@orca.column("households") -def fam_work1(households): - df = households.to_frame(columns=["fam_work"]) - return df.eq(1) * 1 - - -@orca.column("households") -def fam_work2(households): - df = households.to_frame(columns=["fam_work"]) - return df.eq(2) * 1 - - -@orca.column("households") -def fsize_bin1(households): - df = 
households.to_frame(columns=["persons"]) - return df.eq(1) * 1 - - -@orca.column("households") -def fsize_bin2(households): - df = households.to_frame(columns=["persons"]) - return df.eq(2) * 1 - - -@orca.column("households") -def fsize_bin3(households): - df = households.to_frame(columns=["persons"]) - return df.eq(3) * 1 - - -@orca.column("households") -def fsize_bin35(households): - df = households.to_frame(columns=["persons"]) - return df["persons"].between(4, 5) * 1 - - -@orca.column("households") -def fsize_bin5(households): - df = households.to_frame(columns=["persons"]) - return df.gt(5) * 1 - - -@orca.column("households") -def fsize_bin23(households): - df = households.to_frame(columns=["persons"]) - return df["persons"].isin([2, 3]) * 1 - - -@orca.column("households") -def fsize_bingt3(households): - df = households.to_frame(columns=["persons"]) - return df.gt(3) * 1 - - -@orca.column("households") -def avg_age(persons): - df = persons.to_frame(columns=["relate", "age", "household_id"]) - df = df[df["relate"] < 2] - return df.groupby("household_id").agg({"age": "mean"}) - - -@orca.column("households") -def avg_agebin1(households): - df = households.to_frame(columns=["avg_age"])["avg_age"] - return df.between(16, 22, inclusive="both") * 1 - - -@orca.column("households") -def avg_agebin2(households): - df = households.to_frame(columns=["avg_age"])["avg_age"] - return df.between(22, 35, inclusive="right") * 1 - - -@orca.column("households") -def avg_agebin3(households): - df = households.to_frame(columns=["avg_age"])["avg_age"] - return df.between(35, 60, inclusive="right") * 1 - - -@orca.column("households") -def avg_agebin4(households): - df = households.to_frame(columns=["avg_age"]) - return df.gt(60) * 1 - - -@orca.column("households") -def min_age(persons): - df = persons.to_frame(columns=["relate", "age", "household_id"]) - df = df[df["relate"] < 2] - return df.groupby("household_id").agg({"age": "min"}) - - -@orca.column("households") -def 
min_agebin1(households): - df = households.to_frame(columns=["min_age"])["min_age"] - return df.between(16, 22, inclusive="both") * 1 - - -@orca.column("households") -def min_agebin2(households): - df = households.to_frame(columns=["min_age"])["min_age"] - return df.between(22, 35, inclusive="right") * 1 - - -@orca.column("households") -def min_agebin3(households): - df = households.to_frame(columns=["min_age"])["min_age"] - return df.between(35, 60, inclusive="right") * 1 - - -@orca.column("households") -def min_agebin4(households): - df = households.to_frame(columns=["min_age"]) - return df.gt(60) * 1 - - -@orca.column("households") -def kidsn0(households): - df = households.to_frame(columns=["children"]) - return df.eq(0) * 1 - - -@orca.column("households") -def kidsn1(households): - df = households.to_frame(columns=["children"]) - return df.eq(1) * 1 - - -@orca.column("households") -def kidsn2(households): - df = households.to_frame(columns=["children"]) - return df.eq(2) * 1 - - -@orca.column("households") -def kidsn3(households): - df = households.to_frame(columns=["children"]) - return df.gt(2) * 1 - - -@orca.column("households") -def hd_race_wht(persons): - df = persons.to_frame(columns=["relate", "race_wht", "household_id"]) - df = df[df["relate"] == 0] - df.sort_values("household_id", inplace=True) - return df.set_index("household_id")["race_wht"] - # # TODO: This is hiding an error that needs to be fixed! 
- # return df.groupby("household_id").first()["race_wht"] - - -@orca.column("households") -def hd_agebin1(households): - df = households.to_frame(columns=["age_of_head"])["age_of_head"] - return df.between(16, 22, inclusive="right") * 1 - - -@orca.column("households") -def hd_agebin2(households): - df = households.to_frame(columns=["age_of_head"])["age_of_head"] - return df.between(22, 35, inclusive="right") * 1 - - -@orca.column("households") -def hd_agebin3(households): - df = households.to_frame(columns=["age_of_head"])["age_of_head"] - return df.between(35, 60, inclusive="right") * 1 - - -@orca.column("households") -def hd_agebin4(households): - df = households.to_frame(columns=["age_of_head"])["age_of_head"] - return df.gt(60) * 1 - - -@orca.column("households") -def husband_works(households, persons): - df = persons.to_frame(columns=["relate", "household_id", "worker", "sex"]) - households = households.to_frame(columns=["cars"]) - df = df[(df["relate"] < 2) & (df["sex"] == 1)] - households["husband_works"] = df.groupby("household_id").agg({"worker": "first"}) - households["husband_works"] = households["husband_works"].fillna(0).astype(int) - return households["husband_works"] - - -@orca.column("households") -def sex_of_head(persons): - df = persons.to_frame(columns=["relate", "household_id", "sex"]) - df.sort_values("relate", inplace=True) - return df.groupby("household_id").agg({"sex": "first"}) - - -@orca.column("households") -def fes(households): - df = households.to_frame( - columns=[ - "fam_work1", - "fam_work2", - "have_spouse", - "husband_works", - "sex_of_head", - "fam_work0", - ] - ) - df["fes"] = 0 - df.loc[(df.fam_work2 == 1), "fes"] = 1 - df.loc[ - (df.have_spouse == 1) & (df.fam_work1 == 1) & (df.husband_works == 1), "fes" - ] = 2 - df.loc[ - (df.have_spouse == 1) & (df.fam_work1 == 1) & (df.husband_works == 0), "fes" - ] = 3 - df.loc[(df.have_spouse == 1) & (df.fam_work0 == 1), "fes"] = 4 - df.loc[ - (df.have_spouse == 0) & (df.fam_work0 == 1) & 
(df.sex_of_head == 1), "fes" - ] = 6 - df.loc[ - (df.have_spouse == 0) & (df.fam_work0 == 1) & (df.sex_of_head == 2), "fes" - ] = 8 - df.loc[ - (df.have_spouse == 0) & (df.fam_work1 == 1) & (df.sex_of_head == 1), "fes" - ] = 5 - df.loc[ - (df.have_spouse == 0) & (df.fam_work1 == 1) & (df.sex_of_head == 2), "fes" - ] = 7 - return df["fes"] - - # ----------------------------------------------------------------------------------------- # JOB VARIABLES # ----------------------------------------------------------------------------------------- @@ -1321,15 +517,15 @@ def density_hh(blocks): @orca.column("blocks", cache=True, cache_scope="step") def hh_size_1(blocks, households): - hh = households.to_frame(["persons", "block_id"]) - hh_size_1 = hh[hh["persons"] == 1].groupby("block_id").size() + hh = households.to_frame(["hh_n_persons", "block_id"]) + hh_size_1 = hh[hh["hh_n_persons"] == 1].groupby("block_id").size() return pd.Series(index=blocks.index, data=hh_size_1).fillna(0) @orca.column("blocks", cache=True, cache_scope="step") def hh_size_5plus(blocks, households): - hh = households.to_frame(["persons", "block_id"]) - hh_size_5plus = hh[hh["persons"] >= 5].groupby("block_id").size() + hh = households.to_frame(["hh_n_persons", "block_id"]) + hh_size_5plus = hh[hh["hh_n_persons"] >= 5].groupby("block_id").size() return pd.Series(index=blocks.index, data=hh_size_5plus).fillna(0) @@ -1357,9 +553,9 @@ def mean_income(blocks, households): @orca.column("blocks", cache=True) def mean_hh_size(blocks, households): - size = households.to_frame(["persons", "block_id"]).groupby("block_id").mean() + size = households.to_frame(["hh_n_persons", "block_id"]).groupby("block_id").mean() blocks = blocks.local.join(size) - return blocks["persons"].fillna(blocks["persons"].median()) + return blocks["hh_n_persons"].fillna(blocks["hh_n_persons"].median()) @orca.column("blocks", cache=True, cache_scope="step") @@ -1386,52 +582,20 @@ def hh_rent(blocks, households): @orca.column("blocks", 
cache=True, cache_scope="step") def total_persons(blocks, households): - persons = households.to_frame(["persons", "block_id"]).groupby("block_id").sum() - blocks = blocks.local.join(persons).fillna(0) - return blocks["persons"] - - -@orca.column("blocks", cache=True, cache_scope="step") -def children(blocks, households): - children = households.to_frame(["children", "block_id"]).groupby("block_id").sum() - blocks = blocks.local.join(children).fillna(0) - return blocks["children"] - - -@orca.column("blocks", cache=True, cache_scope="step") -def persons_65plus(blocks, households): - persons_65plus = ( - households.to_frame(["persons_65plus", "block_id"]).groupby("block_id").sum() - ) - blocks = blocks.local.join(persons_65plus).fillna(0) - return blocks["persons_65plus"] - - -@orca.column("blocks", cache=True, cache_scope="step") -def persons_black(blocks, households): - persons = ( - households.to_frame(["persons_black", "block_id"]).groupby("block_id").sum() - ) - blocks = blocks.local.join(persons).fillna(0) - return blocks["persons_black"] - - -@orca.column("blocks", cache=True, cache_scope="step") -def persons_hispanic(blocks, households): persons = ( - households.to_frame(["persons_hispanic", "block_id"]).groupby("block_id").sum() + households.to_frame(["hh_n_persons", "block_id"]).groupby("block_id").sum() ) blocks = blocks.local.join(persons).fillna(0) - return blocks["persons_hispanic"] + return blocks["hh_n_persons"] @orca.column("blocks", cache=True, cache_scope="step") -def persons_asian(blocks, households): - persons = ( - households.to_frame(["persons_asian", "block_id"]).groupby("block_id").sum() +def children(blocks, households): + children = ( + households.to_frame(["hh_n_children", "block_id"]).groupby("block_id").sum() ) - blocks = blocks.local.join(persons).fillna(0) - return blocks["persons_asian"] + blocks = blocks.local.join(children).fillna(0) + return blocks["hh_n_children"] @orca.column("blocks", cache=True, cache_scope="step") diff --git 
a/docker-compose.yml b/docker-compose.yml index c4c51a5..99188b0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,7 +3,7 @@ services: build: context: . dockerfile: Dockerfile - image: ghcr.io/${GITHUB_REPOSITORY_OWNER:-nrel}/demos + image: ghcr.io/${GITHUB_REPOSITORY_OWNER:-natlabrockies}/demos tty: true platform: linux/amd64 environment: diff --git a/docs/source/conf.py b/docs/source/conf.py index fd3ca8d..fc6b451 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -15,8 +15,8 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = "demos" -copyright = "2025, National Renewable Energy Laboratory" -author = "National Renewable Energy Laboratory" +copyright = "2025, National Laboratory of the Rockies" +author = "National Laboratory of the Rockies" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration @@ -63,7 +63,13 @@ html_static_path = ["_static"] # -- MyST settings ----------------------------------------------------------- -myst_enable_extensions = ["deflist", "html_admonition", "html_image", "colon_fence"] +myst_enable_extensions = [ + "deflist", + "html_admonition", + "html_image", + "colon_fence", + "dollarmath", +] # Sphinx multiversion config smv_branch_whitelist = r"^(main|dev|yep/.*)$" diff --git a/docs/source/pages/advanced_configuration.md b/docs/source/pages/advanced_configuration.md index 45cffe0..8ab78a9 100644 --- a/docs/source/pages/advanced_configuration.md +++ b/docs/source/pages/advanced_configuration.md @@ -88,4 +88,173 @@ modules = [ "aging", "education", ] -``` \ No newline at end of file +``` + +--- + +## Lazily computed columns + +DEMOS uses [Orca](https://udst.github.io/orca/) as its internal data pipeline framework. 
One of +Orca's most important features for working with DEMOS data is the concept of **lazily computed +columns**, and understanding them will save you a lot of confusion when inspecting or extending +the simulation. + +### What is a lazily computed column? + +In a normal table (a pandas DataFrame), every column holds actual stored values. A lazily computed +column is different: instead of a pre-stored array of values, it is a **Python function** +registered with Orca that *returns* a pandas Series on demand. Orca only calls that function when +some part of the simulation actually asks for the column — hence "lazy" (computed only when needed, +not upfront). + +This is useful because: + +- Columns that depend on other columns (for example, a column that categorises people by age into + bins) are always guaranteed to be up to date, since they are recalculated from the current data + every time they are requested. +- It avoids storing redundant derived data, which keeps memory usage lower. +- Dependencies between columns are explicit and traceable in the code. + +In the DEMOS codebase these columns are registered using the `@orca.column` decorator. For example, +the `child` indicator column (persons table) is defined in `aging.py` as: + +```python +@orca.column(table_name="persons") +def child(data="persons.relate"): + return data.isin([2, 3, 4, 14]).astype(int) +``` + +Every time any module requests `persons["child"]`, Orca calls this function and passes in +the current `relate` column automatically (Orca matches function argument names to registered +data). + +Other examples of lazily computed columns in DEMOS: + +| Column | Table | Defined in | What it computes | +|---|---|---|---| +| `child` | `persons` | `aging.py` | 1 if `relate` ∈ {2, 3, 4, 14}, else 0 | +| `senior` | `persons` | `aging.py` | 1 if `age` ≥ configured senior age (default 65) | +| `age_group` | `persons` | `aging.py` | Categorical age bin (e.g. 
`"20-29"`, `"30-39"`) | +| `is_head` | `persons` | `household_reorg.py` | 1 if `relate == 0` | +| `is_not_married` | `persons` | `household_reorg.py` | True if `age ≥ 15` and `MAR != 1` | +| `cohabitate` | `persons` | `household_reorg.py` | True if person is a cohabiting partner or head of a cohabiting household | +| `hh_size` | `households` | `household_reorg.py` | Categorical household size (e.g. `"one"`, `"two"`, `"four or more"`) | +| `hh_workers` | `households` | `employment.py` | Categorical worker count (`"none"`, `"one"`, `"two or more"`) | +| `income` | `households` | `employment.py` | Total household income aggregated from person-level earnings | + +### Caching + +Some lazily computed columns are marked with `cache=True`. When caching is enabled, the function +is called once and the result is stored in memory. Subsequent requests for that column return +the stored value without recomputing it, which improves performance for columns that are expensive +to compute. + +You will also see a `cache_scope` parameter alongside `cache=True`. This controls *when* the +cached value is discarded and recomputed: + +- `cache_scope="step"` — The cached value is discarded after each simulation step finishes. + The column will be recomputed fresh the next time a step requests it. This is appropriate for + columns that should not change *within* a step but may change between steps. +- `cache_scope="forever"` — The value is computed once and never cleared for the duration + of the simulation run. Use this only for columns that genuinely cannot change (for example, + a fixed income distribution table). + +For example, `is_not_married` and `cohabitate` in `household_reorg.py` both use +`cache_scope="step"` because they are read multiple times within a single household +reorganisation step but must reflect an updated persons table at the start of the next step. 
+ +### Input columns are overwritten by computed columns + +> **Important:** If a column with the same name exists both in the input data (i.e., it is a +> column in your `persons.h5` or `households.h5` file) **and** as a registered Orca computed +> column, **the computed column takes precedence and the input value is silently ignored**. + +This is by design in Orca: registered column functions are always considered authoritative. In +practice this means: + +- If your input synthetic population includes a column called `hh_size` in the `households` + table, it will be **replaced** by the value computed dynamically from the persons table. +- Similarly, `child`, `senior`, `age_group`, `is_head`, and every other column in the table + above will override any same-named column in your input files. + +This is usually the correct behaviour (computed values are up to date), but it is worth being +aware of when preparing input data or when debugging unexpected values. + +--- + +## GEOID columns and geographic assignment + +Several modules in DEMOS create or reorganise households during the simulation (for example, +when two people get married they may form a new household, or when a young adult moves out they +start a new household). Whenever a new household is created, DEMOS needs to assign it a +geographic unit — otherwise the new household would have no location and downstream +processing steps that depend on geography would fail or produce incorrect results. + +DEMOS handles this through a configurable **GEOID column** setting in each relevant module. +The GEOID column is the name of the column in the `households` table that stores the geographic +identifier for each household (for example, a TAZ code, a Census tract GEOID, or a county ID). +When a new household is created, DEMOS copies the GEOID value from the original household +(the one the person is splitting from or merging with) into the new household row. 
+ +### Modules that use `geoid_col` + +**Household Reorganization** (`household_reorg_module_config.geoid_col`) + +The household reorganisation step creates new households for three events: + +- two people who were not heads of household form a new household together (new marriage or + cohabitation where neither person was a household head); +- a cohabitating partner leaves to form their own household after a break-up; +- divorce (one of the partners moves out and starts a new household). + +In all three cases the new household inherits the GEOID from the departing person's original +household. If `geoid_col` is not set (left as `null`), no geographic assignment is performed +for newly created households. + +```toml +[hh_reorg_module_config] +geoid_col = "taz_id" # name of the column in the households table that holds the geographic ID +``` + +**Kids Moving** (`kids_moving_module_config.geoid_col`) + +When a young adult moves out of the parental household, a new single-person household is created. +The `geoid_col` setting tells DEMOS which column to copy from the original household to the +new one. The `lcm_county_id` column is always copied in addition to `geoid_col`. + +```toml +[kids_moving_module_config] +geoid_col = "taz_id" +calibration_target_share = 0.12 +calibration_tolerance = 0.01 +max_iter = 100 +``` + +**Household Rebalancing** (`hh_rebalancing_module_config.geoid_col`) + +The rebalancing step duplicates or removes households to match external control totals. The +`geoid_col` parameter here indicates which column in the **households** table denotes the +geographic unit over which the control totals are stratified. The control totals table (set +via `control_table`) must also contain a column with this exact name. 
+ +```toml +[hh_rebalancing_module_config] +control_table = "hsize_ct" +control_col = "hh_size" +geoid_col = "lcm_county_id" +``` + +### Making sure your GEOID column is consistent + +Because GEOID values are inherited (copied from original to new household), consistency of the +GEOID column in your input data is critical: + +1. The column name you set in `geoid_col` must exist in the `households` table of your input + file. If it does not, DEMOS will raise a `KeyError` when the first household-creation event + occurs. +2. The same column name must be used consistently across all module configurations. Mixing + `"taz_id"` in one module and `"TAZ"` in another will cause some new households to be missing + their geographic assignment. +3. For the rebalancing module specifically, the control totals CSV must contain a column with + the exact same name and values as the GEOID column in the households table. A mismatch will + cause the rebalancing step to fail with an assertion error. \ No newline at end of file diff --git a/docs/source/pages/calibration.md b/docs/source/pages/calibration.md new file mode 100644 index 0000000..a0374e0 --- /dev/null +++ b/docs/source/pages/calibration.md @@ -0,0 +1,348 @@ +# Model Calibration + +DEMOS includes a calibration system that adjusts the internal parameters of its estimated models (logit, multinomial logit, and regression) so that the simulation output matches observed real-world data. Without calibration, models will produce output strictly derived from their original estimation (fitting) data, which may not match conditions in the target geography or time period. Calibration shifts model predictions closer to observed aggregate statistics — such as the total number of births, deaths, or employed persons in a given year — while preserving the underlying behavioral relationships captured during model estimation. + +This page explains: +1. [How Calibration Works](#how-calibration-works) +2. 
[Modules That Support Calibration](#modules-that-support-calibration) +3. [Preparing Observed Data Files](#preparing-observed-data-files) +4. [Configuring Calibration in the TOML File](#configuring-calibration-in-the-toml-file) +5. [Modules with Simultaneous Calibration: Employment and Household Reorganization](#modules-with-simultaneous-calibration-employment-and-household-reorganization) + +--- +(how-calibration-works)= +## How Calibration Works + +DEMOS uses an iterative adjustment procedure based on the **Root Mean Square Error (RMSE)** between the model's predictions and a target value drawn from an observed data table. At each iteration, a single parameter (the model's *intercept*) is nudged so that the aggregate predicted count (or share) moves toward the observed value. This continues until either the error falls below a user-specified tolerance, or the maximum number of iterations is reached. + +### The update rule + +At each iteration, the intercept $\beta_0$ is updated as: + +$$\beta_0 \leftarrow \beta_0 + \ln\!\left(\frac{\text{target}}{\sum \hat{y}}\right)$$ + +where $\sum \hat{y}$ is the sum of the current model predictions and $\text{target}$ is the observed value for the simulation year. This log-ratio update is a standard technique for intercept adjustment in discrete-choice and regression models: shifting the intercept by this amount is equivalent to re-scaling the aggregate predicted probability so it matches the aggregate observed rate. + +(absolute-vs-relative-tolerance)= +### Absolute vs relative tolerance + +The calibration procedure supports two ways of measuring how close "close enough" is: + +- **Absolute** (`tolerance_type = "absolute"`): The error is computed as the RMSE between the *total predicted count* and the *observed count*. The tolerance value is therefore expressed in the same units as the counts (e.g., number of people). Use this when your observed data contains raw counts. 
+- **Relative** (`tolerance_type = "relative"`): The error is computed as the RMSE between the *predicted share* (predicted count divided by the eligible population size) and the *observed share*. The tolerance value is a fraction (e.g., `0.01` means 1%). Use this when your observed data contains rates or proportions. + +--- +(modules-that-support-calibration)= +## Modules That Support Calibration + +The following modules support calibration via the standard `calibration_procedure` configuration block: + +| Module | Config key | Calibrated model | What is calibrated | +|---|---|---|---| +| **Mortality** (`fatality`) | `mortality_module_config` | `mortality` | Total deaths per year | +| **Birth** (`birth`) | `birth_module_config` | `birth` | Total births per year | + +The following modules have a **different calibration interface** (see [below](#modules-with-simultaneous-calibration-employment-and-household-reorganization)): + +| Module | Config key | Note | +|---|---|---| +| **Employment** (`employment`) | `employment_module_config` | Simultaneous calibration across two sub-models | +| **Household Reorganization** (`household_reorg`) | `hh_reorg_module_config` | Simultaneous calibration across three sub-models | + +The **Kids Moving** module (`kids_moving`) also performs hardcoded calibration internally; its parameters are controlled directly under `kids_moving_module_config` (see the [reference configuration](../api/configuration_module.rst)). + +--- +(preparing-observed-data-files)= +## Preparing Observed Data Files + +Each module that uses the standard calibration procedure requires a CSV file containing observed aggregate statistics, one row per year. The file must have a column named **`year`** (used as the row index) and either a **`count`** or **`share`** column depending on your chosen `tolerance_type`. 
+ +### File format for absolute calibration (`tolerance_type = "absolute"`) + +The file must contain a column named `count` with the observed total number of events (deaths, births, etc.) for each year. + +``` +year,count +2010,1452 +2011,1489 +2012,1503 +... +``` + +### File format for relative calibration (`tolerance_type = "relative"`) + +The file must contain a column named `share` with the observed *rate* (a number between 0 and 1) for each year. + +``` +year,share +2010,0.0092 +2011,0.0094 +2012,0.0096 +... +``` + +> **Important:** The index column must be named `year`. Make sure this column contains integer years matching the years in your simulation run (between `base_year` and `forecast_year`). If a year is missing from the file, the simulation will raise an error when it tries to look up the target value. + +The file can be stored anywhere accessible from the configuration file. A common convention is to place these files in a subdirectory of your data folder, e.g.: + +``` +data/ + my_region/ + observed_calibration_values/ + mortalities_over_time.csv + births_over_time.csv +``` + +--- +(configuring-calibration-in-the-toml-file)= +## Configuring Calibration in the TOML File + +Calibration is configured inside the relevant module's configuration block in your `demos_config.toml` file. Each calibratable module has a `calibration_procedure` sub-block. 
+ +### Full example: mortality module with absolute calibration + +```toml +[mortality_module_config.calibration_procedure] +procedure_type = "rmse_error" +tolerance_type = "absolute" +tolerance = 30 +max_iter = 500 + +[mortality_module_config.calibration_procedure.observed_values_table] +file_type = "csv" +table_name = "observed_fatalities_data" +filepath = "../data/my_region/observed_calibration_values/mortalities_over_time.csv" +index_col = "year" +``` + +### Full example: birth module with absolute calibration + +```toml +[birth_module_config.calibration_procedure] +procedure_type = "rmse_error" +tolerance_type = "absolute" +tolerance = 60 +max_iter = 1000 + +[birth_module_config.calibration_procedure.observed_values_table] +file_type = "csv" +table_name = "observed_births_data" +filepath = "../data/my_region/observed_calibration_values/births_over_time.csv" +index_col = "year" +``` + +### Parameter reference + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `procedure_type` | string | Yes | Must be `"rmse_error"`. This is the only available calibration procedure. | +| `tolerance_type` | string | No | `"absolute"` (default) or `"relative"`. Controls how the error is measured. See [above](#absolute-vs-relative-tolerance). | +| `tolerance` | float | Yes | The error threshold below which calibration stops. Units depend on `tolerance_type`. | +| `max_iter` | int | No | Maximum number of calibration iterations (default: 20). Calibration stops here even if the tolerance has not been reached. | +| `observed_values_table` | table block | Yes | Defines the CSV file with observed values. Follows the same format as any `[[tables]]` entry. | + +### The `observed_values_table` sub-block + +The `observed_values_table` block defines the file containing observed data. It uses the same fields as a regular data source entry: + +| Field | Description | +|---|---| +| `file_type` | Must be `"csv"` for observed calibration data. 
| +| `table_name` | An internal name for this table in DEMOS memory. Must be unique across all tables. Pick a descriptive name like `"observed_fatalities_data"`. | +| `filepath` | Path to the CSV file, relative to your configuration file. | +| `index_col` | Must be `"year"`. | + +### Disabling calibration + +To disable calibration for a module, simply remove or comment out the entire `calibration_procedure` block for that module: + +```toml +# [mortality_module_config.calibration_procedure] +# procedure_type = "rmse_error" +# ... +``` + +If the block is absent, DEMOS will run the model using the original estimated parameters with no adjustment. + +--- +(modules-with-simultaneous-calibration-employment-and-household-reorganization)= +## Modules with Simultaneous Calibration: Employment and Household Reorganization + +Two modules — **Employment** and **Household Reorganization** — use a more complex calibration strategy called *simultaneous calibration*. Instead of calibrating a single model against a single target, these modules adjust multiple models at the same time so that their *joint output* matches aggregate observed counts. + +This approach is necessary because the outcome of interest (e.g., total number of employed persons, total number of married persons) depends on the combined output of several sub-models that each influence the same population. Calibrating them independently would cause the adjustments to conflict with each other. + +### The simultaneous calibration algorithm + +Simultaneous calibration uses a **momentum-based gradient descent** approach. At each iteration: + +1. All sub-models are run on their respective data to produce predictions. +2. The combined predicted aggregate is compared to the observed aggregate. +3. 
An update step $g$ is computed using a momentum term to smooth out oscillations: + +$$g_t = \alpha_t \left( \mu \cdot g_{t-1} + (1-\mu) \cdot \delta_t \right)$$ + +where: +- $\alpha_t$ is a *decaying learning rate* that starts at `learning_rate` and decreases toward zero as the iteration count approaches `max_iter` +- $\mu$ is the `momentum_weight` (a value between 0 and 1 that controls how much the previous gradient update is retained) +- $\delta_t = \ln(\text{target} / \hat{y}_t)$ is the log-ratio between the target and current prediction + +4. The intercepts of each sub-model are adjusted by $g_t$. +5. Steps 1–4 repeat until the absolute error falls below `tolerance` or `max_iter` is reached. + +The decaying learning rate helps the algorithm converge: large adjustments happen early, and corrections become finer as the prediction approaches the target. + +--- + +### Employment module + +The Employment module calibrates two sub-models simultaneously: + +- **`enter_labor_force`**: A logit model predicting which unemployed persons (age ≥ 18) transition into employment. +- **`exit_labor_force`**: A logit model predicting which employed persons (age ≥ 18) exit the workforce. + +The calibration target is the **total number of workers** (employed persons) predicted after running both models, compared against an observed count loaded from the `observed_employment` table. + +> **Important:** The table name `observed_employment` is hard-coded. You must load this table via the `[[tables]]` section of your configuration with exactly that name. + +#### Required observed data table + +The `observed_employment` table must be a CSV with a `year` index and a `count` column containing the total observed employed persons per year: + +``` +year,count +2010,18423 +2011,18751 +... 
+``` + +Add it to your configuration under `[[tables]]`: + +```toml +[[tables]] +file_type = "csv" +table_name = "observed_employment" +filepath = "../data/my_region/observed_calibration_values/employment_obs.csv" +index_col = "year" +``` + +#### Calibration configuration block + +```toml +[employment_module_config.simultaneous_calibration_config] +tolerance = 100 +max_iter = 20 +learning_rate = 2.0 +momentum_weight = 0.3 +``` + +#### Parameter reference + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `tolerance` | float | Yes | — | Absolute error threshold (in number of workers) below which calibration stops. | +| `max_iter` | int | No | 20 | Maximum number of calibration iterations. | +| `learning_rate` | float | No | 2.5 | Controls the step size at the first iteration. Decays linearly toward zero. | +| `momentum_weight` | float | No | 0.3 | Weight applied to the previous gradient update. Values closer to 1 introduce more smoothing. | + +#### Disabling employment calibration + +To disable simultaneous calibration for employment, comment out or remove the block: + +```toml +# [employment_module_config.simultaneous_calibration_config] +# ... +``` + +When the block is absent, DEMOS checks whether individual model calibration procedures are defined under `employment_module_config.enter_model_calibration_procedure` and `employment_module_config.exit_model_calibration_procedure`. If neither is present, no calibration is performed. Note that simultaneous calibration and per-model calibration are mutually exclusive: defining both will raise a validation error. + +--- + +### Household Reorganization module + +The Household Reorganization module calibrates three sub-models simultaneously: + +- **`marriage`**: A multinomial logit model predicting transitions for unmarried, non-cohabitating persons — options are: stay single, start cohabitating, or get married. 
+- **`cohabitation`**: A multinomial logit model predicting transitions for cohabitating persons — options are: stay cohabitating, break up, or get married. +- **`divorce`**: A binary logit model predicting which married households undergo a divorce. + +The combined output of these three models determines the predicted number of married and divorced persons after the step. Calibration targets both of these counts simultaneously, weighting each proportionally to its magnitude in the observed data. + +> **Important:** The table name `observed_marrital_data` is hard-coded. You must load this table via the `[[tables]]` section with exactly that name. + +#### Required observed data table + +The `observed_marrital_data` table must be a CSV with a `year` index and columns `MAR` and `count`, containing observed counts for each marital status category per year. The values used during calibration are `MAR == 1` (married) and `MAR == 3` (divorced/separated): + +``` +year,MAR,count +2010,1,14200 +2010,3,2300 +2011,1,14350 +2011,3,2280 +... +``` + +Add it to your configuration under `[[tables]]`: + +```toml +[[tables]] +file_type = "csv" +table_name = "observed_marrital_data" +filepath = "../data/my_region/observed_calibration_values/marrital_status_over_time_obs.csv" +index_col = "year" +``` + +#### Calibration configuration block + +```toml +[hh_reorg_module_config.simultaneous_calibration_config] +tolerance = 5000 +max_iter = 100 +learning_rate = 1.5 +momentum_weight = 0.3 +``` + +#### Parameter reference + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `tolerance` | float | Yes | — | Combined RMSE threshold (in number of persons) below which calibration stops. The error combines the residuals for the married and divorced counts. | +| `max_iter` | int | No | 20 | Maximum number of calibration iterations. | +| `learning_rate` | float | No | 2.5 | Controls the initial step size. Decays linearly toward zero. 
| +| `momentum_weight` | float | No | 0.3 | Weight applied to the previous gradient update. | + +#### How intercept updates are applied + +Each of the three sub-models is adjusted through a different parameter: + +- **`divorce`**: The model's primary intercept (`fitted_parameters[0]`) is shifted by the divorce gradient. +- **`marriage`**: The `married` outcome coefficient in the model's coefficient table is shifted by the marriage gradient. +- **`cohabitation`**: The `marriage` outcome coefficient in the model's coefficient table is shifted by the cohabitation gradient (cohabitation → marriage transitions are steered to match the marriage target). + +The marriage and cohabitation gradients are weighted by the *proportion of the married count* relative to the total observed marital transitions. The divorce gradient is weighted by the *proportion of the divorced count*. This ensures that the calibration effort is distributed in proportion to the size of each phenomenon. + +#### Disabling household reorganization calibration + +To disable calibration for household reorganization, comment out or remove the block: + +```toml +# [hh_reorg_module_config.simultaneous_calibration_config] +# ... +``` + +When absent, models run with their original estimated parameters. + +--- + +## Choosing Good Tolerance Values + +Choosing a tolerance that is too tight (very small) can cause calibration to run for the full `max_iter` without converging, adding unnecessary compute time. Choosing one that is too loose means the model output may deviate substantially from observed data. + +A practical starting point: + +- For **absolute** tolerances, set the tolerance to approximately 1–5% of the observed count for the most common year. For example, if you observe ~1,500 deaths per year, a tolerance of `30` (≈2%) is reasonable. +- For **relative** tolerances, values between `0.005` and `0.02` (0.5%–2%) work well for most demographic rates. 
+- For **simultaneous calibration**, the tolerance represents the absolute difference in person counts. Values of 100–5,000 are typical depending on the size of the synthetic population. + +If calibration is not converging even after increasing `max_iter`, consider increasing the `learning_rate` slightly. If the algorithm oscillates (error fluctuates without declining), try increasing `momentum_weight` toward 0.5–0.7 or decreasing `learning_rate`. diff --git a/docs/source/pages/configuration.rst b/docs/source/pages/configuration.rst index 03569cd..9774748 100644 --- a/docs/source/pages/configuration.rst +++ b/docs/source/pages/configuration.rst @@ -5,5 +5,7 @@ DEMOS Configuration :maxdepth: 2 ../api/configuration_module + ../pages/calibration.md + ../pages/variables.md datasources default_configuration \ No newline at end of file diff --git a/docs/source/pages/datasources.rst b/docs/source/pages/datasources.rst index ab17ffc..fb598d0 100644 --- a/docs/source/pages/datasources.rst +++ b/docs/source/pages/datasources.rst @@ -1,7 +1,148 @@ Data Sources ============ -Documentation for configuration of data sources currently implemented +DEMOS reads all input data from flat files declared in the ``[[tables]]`` section of your configuration +file. Every entry in ``[[tables]]`` describes one table that will be loaded into memory and made +available to the simulation by name. DEMOS supports two file formats: **CSV** and **HDF5 (H5)**. + +This page explains when to use each format, what each parameter controls, and shows complete +configuration examples for both. + +---- + +Choosing Between CSV and HDF5 +----------------------------- + +The two formats serve different roles and you will often use both in the same project: + +* **CSV files** are plain-text tables, one row per line, columns separated by commas (or another + delimiter you specify). They are easy to open in a spreadsheet application, easy to edit, and + easy to produce from any tool. 
Use CSV for smaller or auxiliary tables such as calibration + reference data, mapping tables, and control totals. + +* **HDF5 files** (``.h5``) are a binary format designed to store large numerical datasets + efficiently. They can contain multiple tables (called *keys*) in a single file. Use H5 for + the main synthetic population tables (``persons`` and ``households``) because they are + typically large and the binary format reads much faster than CSV. + +---- + +CSV Tables +---------- + +A CSV entry in the ``[[tables]]`` section looks like this: + +.. code-block:: toml + + [[tables]] + file_type = "csv" + table_name = "income_rates" + filepath = "../data/my_region/income_rates.csv" + index_col = "year" + +The ``file_type = "csv"`` value identifies this entry as a CSV source. + +Required parameters +~~~~~~~~~~~~~~~~~~~ + +* ``file_type`` — Must be ``"csv"``. +* ``table_name`` — The name by which this table will be referred to everywhere in DEMOS + (for example, in module configuration entries like ``observed_values_table``). + Choose a short, descriptive name with no spaces, using underscores: e.g. + ``"observed_births_data"`` or ``"income_rates"``. +* ``filepath`` — Path to the CSV file. This can be a relative path (evaluated relative to the + configuration file location) or an absolute path. +* ``index_col`` — The column in the CSV file to use as the row identifier (the *index*). + This column will not appear as a regular data column; it becomes the label for each row. + For calibration tables the index must be ``"year"``. + +Optional parameters +~~~~~~~~~~~~~~~~~~~ + +* ``delimiter`` — The character used to separate columns. Defaults to ``","`` for standard CSV. + Set to ``"\t"`` for tab-separated files or ``";"`` for semicolon-separated files. +* ``custom_dtype_casting`` — An inline mapping from column names to data types. 
This is useful + when pandas would otherwise infer the wrong type, such as reading a GEOID code as an integer + when it should be kept as a string. Example: + + .. code-block:: toml + + custom_dtype_casting = {"lcm_county_id" = "object", "year" = "int", "rate" = "float"} + + Supported type strings are any value accepted by ``pandas.read_csv``'s ``dtype`` argument: + ``"object"`` (string), ``"int"``, ``"float"``, ``"bool"``. + +Full example with all parameters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: toml + + [[tables]] + file_type = "csv" + table_name = "income_rates" + filepath = "../data/my_region/observed_calibration_values/income_rates.csv" + index_col = "year" + custom_dtype_casting = {"lcm_county_id" = "object", "year" = "int", "rate" = "float"} + +---- + +HDF5 Tables +----------- + +An HDF5 entry in the ``[[tables]]`` section looks like this: + +.. code-block:: toml + + [[tables]] + file_type = "h5" + table_name = "persons" + filepath = "../data/my_region/population.h5" + h5_key = "persons" + +The ``file_type = "h5"`` value identifies this entry as an HDF5 source. + +Required parameters +~~~~~~~~~~~~~~~~~~~ + +* ``file_type`` — Must be ``"h5"``. +* ``table_name`` — The name by which this table will be referred to everywhere in DEMOS. + For the main synthetic population this is almost always ``"persons"`` and ``"households"``. +* ``filepath`` — Path to the HDF5 file. +* ``h5_key`` — The key inside the HDF5 file that identifies the table to load. An HDF5 file + can contain many tables under different keys, similar to sheets in a spreadsheet workbook. + To inspect the keys in an HDF5 file you can open it in Python: + + .. code-block:: python + + import pandas as pd + store = pd.HDFStore("../data/my_region/population.h5") + print(store.keys()) # e.g. ['/persons', '/households'] + store.close() + +Full example with all parameters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: toml + + [[tables]] + file_type = "h5" + table_name = "persons" + filepath = "../data/my_region/population.h5" + h5_key = "persons" + + [[tables]] + file_type = "h5" + table_name = "households" + filepath = "../data/my_region/population.h5" + h5_key = "households" + +Note that two tables can be loaded from the same HDF5 file by using two separate +``[[tables]]`` entries that share the same ``filepath`` but point to different ``h5_key`` values. + +---- + +API Reference +------------- .. autoclass:: demos.datasources.CSVTableSource :members: diff --git a/docs/source/pages/intro.md b/docs/source/pages/intro.md index 3740ca5..db98387 100644 --- a/docs/source/pages/intro.md +++ b/docs/source/pages/intro.md @@ -26,10 +26,12 @@ This document summarizes instructions to install, configure and run DEMOS. Secti ## 1. Installation ### Docker Compose (recommended) -The latest docker image for demos is stored in `ghcr.io/NatLabRockies/demos:latest`. The input data and configuration file are fed to the container through volumes ([more info about Docker volumes](https://docs.docker.com/engine/storage/volumes/)). We provide a `docker-compose` workflow that can be used to make the process of mounting volumes easier. +The latest docker image for demos is stored in `ghcr.io/NatLabRockies/demos:latest`. The input data and configuration file are fed to the container through volumes ([more info about Docker volumes](https://docs.docker.com/engine/storage/volumes/)). We provide a `docker-compose` workflow that can be used to make the process of mounting volumes easier. Make sure you have [Docker](https://docs.docker.com/desktop/) and [Docker Compose](https://docs.docker.com/compose/install/) installed before you proceed. + +The following instructions will guide you through running DEMOS with example data of two hypothetical counties. This example is provided to help users quickly get started with DEMOS. It includes the required inputs to run DEMOS for two example counties. 
The data and configuration files required to run this example are located in `./data/` and `./configuration` folders. You can change where DEMOS will look for your data following the instructions [in the Docs](https://NatLabRockies.github.io/DEMOS/). #### Clone this repository -By cloning this repository you download the configuration and data for an example run of DEMOS. +By cloning this repository you download the configuration and data for an example run of DEMOS. You can also use the `Download ZIP` option available through the green `Code` button in the [GitHub repo](https://github.com/NatLabRockies/DEMOS) and decompress it to achieve the same results as the clone command. Run the following command in the Terminal App (MacOS) or Command Prompt/PowerShell (Windows): ```bash @@ -40,14 +42,12 @@ cd DEMOS # This folder contains (among other files) a data and configuration folder # as well as a docker-compose.yml file -``` - -Make sure you have [Docker](https://docs.docker.com/desktop/) and [Docker Compose](https://docs.docker.com/compose/install/) installed. Now you can run docker as follows: -```bash +# This command runs DEMOS on a docker container docker compose up ``` + > **Note:** > Make sure the Docker Daemon is running. This changes from system to system but Docker Desktop should have a status flag indicating if the daemon is live, if Desktop is available diff --git a/docs/source/pages/variables.md b/docs/source/pages/variables.md new file mode 100644 index 0000000..0695180 --- /dev/null +++ b/docs/source/pages/variables.md @@ -0,0 +1,255 @@ +# Variables Reference + +DEMOS computes lazily evaluated columns (via the [orca](https://udst.github.io/orca/) framework) that +are available as inputs to model expressions and calibration steps. This page describes the naming +conventions used for these columns and provides an inventory of every computed variable by table. 
+ +```{contents} +:local: +:depth: 2 +:backlinks: none +``` + +--- + +## Naming Conventions + +### Persons table + +| Prefix / Pattern | Meaning | Example | +|---|---|---| +| `age_` | Age bin for persons | `age_23_35`, `age_60plus` | +| `age_emp_` | Age bin used in employment (enter/exit labor-force) models | `age_emp_20_40`, `age_emp_70plus` | +| `age_mort_` | Age bin used in the mortality model | `age_mort_21_40`, `age_mort_90plus` | +| `age_km_` | Age bin used in the kids-move model | `age_km_19_20`, `age_km_30plus` | +| `sex_` | Sex indicator | `sex_female` | +| `emp_idle_` | Idle (non-working) employment status bin | `emp_idle_under60`, `emp_idle_over60` | +| `edu_` | Education attainment bin | `edu_hs_ged`, `edu_college_plus` | +| `race_` | Race/ethnicity indicator | `race_white`, `race_black`, `race_asian_pi`, `race_native_am`, `race_acs_other`, `race_hawaiian`, `race_asian`, `race_other` | +| `mar_` | Marital status indicator | `mar_married`, `mar_widowed`, `mar_div_or_sep`, `mar_widowed_or_never` | +| `emp_idle_*_age_km_*` | Cross-product: employment status × kids-move age bin | `emp_idle_under60_age_km_19_20` | +| `edu_*_age_km_*` | Cross-product: education bin × kids-move age bin | `edu_hs_ged_age_km_21_25` | +| `intercept` | Constant term (ones vector) | `intercept` | + +### Households table + +| Prefix / Pattern | Meaning | Example | +|---|---|---| +| `hh_` | Computed household-level aggregate or indicator | `hh_n_persons`, `hh_n_children` | +| `hh_income_` | Household income bin | `hh_income_bin1` … `hh_income_bin5` | +| `hh_edu_top_` | Education bin of the most-educated head/spouse | `hh_edu_top`, `hh_edu_top_bin2`, `hh_edu_top_bin3` | +| `hh_fam_work` | Count of working head/spouse members | `hh_fam_work`, `hh_fam_work2` | +| `hh_age_avg_` | Average age of head/spouse, and bins | `hh_age_avg`, `hh_age_avg_bin2` … `hh_age_avg_bin4` | +| `hh_age_min_` | Minimum age of head/spouse, and bins | `hh_age_min`, `hh_age_min_bin2` … `hh_age_min_bin4` | +| 
`hh_fsize_` | Household-size bin for birth model | `hh_fsize_bin23`, `hh_fsize_bingt3` | +| `hh_birth_age_` | Age of relevant female member, for birth model | `hh_birth_age_lt27`, `hh_birth_age_27_35` | +| `hh_head_` | Attribute of the household head (relate == 0) | `hh_head_age`, `hh_head_race_id`, `hh_head_race_str`, `hh_head_race_white` | +| `hh_head_edu_` | Education bin of head only (used in income model) | `hh_head_edu_bin1` … `hh_head_edu_bin3` | +| `hh_head_race_` | Race indicator derived from the household head | `hh_head_race_black`, `hh_head_race_native_am`, `hh_head_race_asian`, `hh_head_race_hawaiian`, `hh_head_race_acs_other` | +| `hh_size_str` | Categorical household size string ("one", "two", …) | `hh_size_str` | +| `intercept` | Constant term (ones vector) | `intercept` | + +> **`hh_edu_top_bin` vs. `hh_head_edu_bin`** +> These are distinct columns. `hh_edu_top_bin` reflects the maximum education of the head *or* spouse +> (relate < 2) and is used in the divorce and cohabitation models. `hh_head_edu_bin` reflects the +> education of the household head only (relate == 0) and is used in the income model. 
+ +--- + +## Column Inventory + +### Persons columns + +#### Age bins (general) + +| Column | Definition | +|---|---| +| `age_23_35` | Age ∈ [23, 35] | +| `age_36_60` | Age ∈ [36, 60] | +| `age_60plus` | Age > 60 | +| `intercept` | 1 for every person | + +#### Age bins — employment models + +| Column | Definition | +|---|---| +| `age_emp_20_40` | Age ∈ [20, 40] | +| `age_emp_41_50` | Age ∈ [41, 50] | +| `age_emp_51_70` | Age ∈ [51, 70] | +| `age_emp_70plus` | Age > 70 | + +#### Age bins — mortality model + +| Column | Definition | +|---|---| +| `age_mort_21_40` | Age ∈ [21, 40] | +| `age_mort_41_50` | Age ∈ [41, 50] | +| `age_mort_51_70` | Age ∈ [51, 70] | +| `age_mort_71_90` | Age ∈ [71, 90] | +| `age_mort_90plus` | Age > 90 | + +#### Age bins — kids-move model + +| Column | Definition | +|---|---| +| `age_km_16_18` | Age ∈ [16, 18] | +| `age_km_19_20` | Age ∈ [19, 20] | +| `age_km_21_25` | Age ∈ [21, 25] | +| `age_km_26_30` | Age ∈ [26, 30] | +| `age_km_30plus` | Age > 30 | + +#### Sex + +| Column | Definition | +|---|---| +| `sex_female` | sex == 2 | + +#### Employment status + +| Column | Definition | +|---|---| +| `emp_idle_under60` | worker == 0 and age < 60 | +| `emp_idle_over60` | worker == 0 and age ≥ 60 | + +#### Education + +| Column | Definition | +|---|---| +| `edu_hs_ged` | edu ∈ [16, 17] (high school diploma or GED) | +| `edu_college_plus` | edu > 17 (some college or higher) | + +#### Race / ethnicity + +| Column | Definition | Notes | +|---|---|---| +| `race_white` | race_id == 1 | | +| `race_black` | race_id == 2 | | +| `race_native_am` | race_id ∈ {3, 4, 5} | | +| `race_asian` | race_id == 6 | | +| `race_hawaiian` | race_id == 7 | | +| `race_asian_pi` | race_id ∈ {6, 7} | Asian or Pacific Islander combined | +| `race_acs_other` | race_id == 8 | ACS "some other race" category; building block for head-of-household columns | +| `race_other` | race_id ∈ {3, 4, 5, 8, 9} | Broad "other" category used in employment models | + +#### Marital status + +| 
Column | Definition |
+|---|---|
+| `mar_married` | MAR == 1 |
+| `mar_widowed` | MAR == 2 |
+| `mar_div_or_sep` | MAR ∈ {3, 4} |
+| `mar_widowed_or_never` | MAR ∈ {2, 5} |
+
+#### Cross-products (kids-move model)
+
+Five `emp_idle_under60 × age_km_*` and five `emp_idle_over60 × age_km_*` columns (one per
+kids-move age bin), plus five `edu_hs_ged × age_km_*` and five `edu_college_plus × age_km_*`
+columns. The pattern is
+`<emp_or_edu_column>_<age_km_column>`, e.g. `emp_idle_under60_age_km_19_20`.
+
+---
+
+### Households columns
+
+#### Head-of-household attributes
+
+| Column | Definition |
+|---|---|
+| `hh_head_age` | Age of person with relate == 0 |
+| `hh_head_race_id` | race_id of person with relate == 0 |
+| `hh_head_race_str` | String label of head's race: `'white'`, `'black'`, `'asian'`, or `'other'` |
+| `hh_head_race_white` | 1 if head's race_id == 1 |
+| `hh_head_race_black` | 1 if head's race_id == 2 |
+| `hh_head_race_native_am` | 1 if head's race_id ∈ {3, 4, 5} |
+| `hh_head_race_asian` | 1 if head's race_id == 6 |
+| `hh_head_race_hawaiian` | 1 if head's race_id == 7 |
+| `hh_head_race_acs_other` | 1 if head's race_id == 8 |
+| `hh_head_edu_bin1` | head's edu < 16 (less than high school) |
+| `hh_head_edu_bin2` | head's edu ∈ [16, 17] (HS/GED) |
+| `hh_head_edu_bin3` | head's edu > 17 (some college or more) |
+| `hh_size_str` | Categorical size: `'one'`, `'two'`, etc. 
| + +#### Household composition + +| Column | Definition | +|---|---| +| `hh_n_persons` | Total number of persons in the household | +| `hh_n_children` | Number of persons with age ≤ 17 | + +#### Income bins + +| Column | Definition | +|---|---| +| `hh_income_bin1` | income < $25,000 | +| `hh_income_bin2` | income ∈ [$25k, $50k) | +| `hh_income_bin3` | income ∈ [$50k, $75k) | +| `hh_income_bin4` | income ∈ [$75k, $150k) | +| `hh_income_bin5` | income ≥ $150,000 | + +#### Top education of head/spouse (relate < 2) + +| Column | Definition | +|---|---| +| `hh_edu_top` | Maximum edu value among head and spouse | +| `hh_edu_top_bin2` | hh_edu_top ∈ [16, 17] | +| `hh_edu_top_bin3` | hh_edu_top > 17 | + +#### Working members (head/spouse) + +| Column | Definition | +|---|---| +| `hh_fam_work` | Number of working (worker == 1) head/spouse members | +| `hh_fam_work2` | 1 if hh_fam_work == 2 | + +#### Average age of head/spouse + +| Column | Definition | +|---|---| +| `hh_age_avg` | Mean age of head and spouse (relate < 2) | +| `hh_age_avg_bin2` | hh_age_avg ∈ (22, 35] | +| `hh_age_avg_bin3` | hh_age_avg ∈ (35, 60] | +| `hh_age_avg_bin4` | hh_age_avg > 60 | + +#### Minimum age of head/spouse + +| Column | Definition | +|---|---| +| `hh_age_min` | Minimum age of head and spouse (relate < 2) | +| `hh_age_min_bin2` | hh_age_min ∈ (22, 35] | +| `hh_age_min_bin3` | hh_age_min ∈ (35, 60] | +| `hh_age_min_bin4` | hh_age_min > 60 | + +#### Birth-model household columns + +| Column | Definition | +|---|---| +| `hh_birth_age_lt27` | Age of relevant female member ≤ 27 | +| `hh_birth_age_27_35` | Age of relevant female member ∈ (27, 35] | +| `hh_fsize_bin23` | hh_n_persons ∈ {2, 3} | +| `hh_fsize_bingt3` | hh_n_persons > 3 | + +#### Other + +| Column | Definition | +|---|---| +| `income_segment` | Quantile-based income segment (1–6) | +| `hh_type` | Categorical type code (1–8) based on tenure, size, and age of head | +| `county_id` | First 5 characters of block_id | +| `tract_id` | 
First 11 characters of block_id | +| `intercept` | 1 for every household | + +--- + +## Where variables are defined + +Each variable is defined in the module that owns it, or in `variables.py` if it is shared across models. + +| File | Variables defined | +|---|---| +| `demos/variables.py` | Shared persons variables (age bins, sex, employment, education, race, marital status), shared household variables (intercept, county/tract IDs, income segment, hh_type), and blocks aggregations | +| `demos/models/aging.py` | `hh_head_age` | +| `demos/models/birth.py` | `hh_n_persons`, `hh_fsize_bin23`, `hh_fsize_bingt3`, `hh_birth_age_lt27`, `hh_birth_age_27_35` | +| `demos/models/employment.py` | `age_emp_20_40`, `age_emp_41_50`, `age_emp_51_70`, `age_emp_70plus` | +| `demos/models/fatality.py` | `age_mort_21_40`, `age_mort_41_50`, `age_mort_51_70`, `age_mort_71_90`, `age_mort_90plus` | +| `demos/models/household_reorg.py` | `hh_head_race_id`, `hh_head_race_str`, `hh_size_str`, `hh_n_children`, `hh_income_bin1–5`, `hh_edu_top`, `hh_edu_top_bin2–3`, `hh_fam_work`, `hh_fam_work2`, `hh_age_avg` + bins, `hh_age_min` + bins, `hh_head_race_white` | +| `demos/models/income.py` | `hh_head_edu_bin1–3`, `hh_head_race_black`, `hh_head_race_native_am`, `hh_head_race_asian`, `hh_head_race_hawaiian`, `hh_head_race_acs_other` | +| `demos/models/kids_moving.py` | `age_km_*` bins, all `emp_idle_*_age_km_*` and `edu_*_age_km_*` cross-products | diff --git a/pyproject.toml b/pyproject.toml index eb1e005..a4bbdab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,10 +4,10 @@ build-backend = "setuptools.build_meta" [project] name = "nrel.demos" -version = "0.0.1" +version = "1.0.1" description = "DEMOS is a demographic simulation tool." 
readme = "README.md" -authors = [{ name = "National Renewable Energy Laboratory" }, { name = "Lawrence Berkley National Laboratory" }] +authors = [{ name = "National Laboratory of the Rockies" }, { name = "Lawrence Berkeley National Laboratory" }] # license = {} classifiers = [ "Programming Language :: Python :: 3.10" @@ -32,7 +32,7 @@ requires-python = ">= 3.10" packages = ["demos"] [project.urls] -Homepage = "https://github.com/NREL/DEMOS_NREL" +Homepage = "https://github.com/NatLabRockies/DEMOS" [project.optional-dependencies] reporting = [ @@ -52,8 +52,3 @@ docs = [ dev = [ "black" ] - -[project.scripts] -# TODO: Check these -demos = "demos.simulate:simulate" -# plotting = \ No newline at end of file diff --git a/scripts/clean_columns.py b/scripts/clean_columns.py new file mode 100644 index 0000000..768c7c0 --- /dev/null +++ b/scripts/clean_columns.py @@ -0,0 +1,49 @@ +if __name__ == "__main__": + import argparse + import pandas as pd + import numpy as np + + # argument parser to receive the input file path and the output file path + parser = argparse.ArgumentParser(description="Clean columns in H5 file") + parser.add_argument("--input", required=True, help="Input H5 file path") + parser.add_argument("--output", required=True, help="Output H5 file path") + args = parser.parse_args() + + # Read H5 file and filter a list of hardcoded columns + persons_df = pd.read_hdf(args.input, key="persons") + persons_required_columns = [ + "age", + "sex", + "person_sex", + "edu", + "student", + "worker", + "earning", + "MAR", + "relate", + "race_id", + "hispanic", + "household_id", + ] + persons_df = persons_df[persons_required_columns] + # Save the filtered DataFrame to a new H5 file + persons_df.to_hdf(args.output, key="persons", mode="w") + + # Do the same for the "households" table + households_df = pd.read_hdf(args.input, key="households") + households_required_columns = [ + "income", + "lcm_county_id", + ] + households_df = households_df[households_required_columns] + 
+ # Add random values between 1 and 4 to the "job_industry" column and "job_occupation" column + households_df["job_industry"] = pd.Series( + np.random.choice([1, 2, 3, 4], size=len(households_df)), + index=households_df.index, + ) + households_df["job_occupation"] = pd.Series( + np.random.choice([1, 2, 3, 4], size=len(households_df)), + index=households_df.index, + ) + households_df.to_hdf(args.output, key="households", mode="a")