annual-aggregates.Rmd

---
title: "Cohabitation Trends: 2012-2022"
author: "Lorae Stojanovic"
date: "2024-10-21"
output: html_document
---

```{r setup, include=FALSE}
# Document-wide default: show chunk source in the rendered output.
# Chunks that pass echo = FALSE in their header override this default.
knitr::opts_chunk$set(echo = TRUE)
```

```{r stacked-bar-function, echo = FALSE}
# Draw a 100%-stacked bar chart of cohabitation categories by age bucket.
#
# Args:
#   data:  Data frame with columns AGE_bucket, percent and cohabit_bin.
#          Colours and legend text are keyed by "0", "1", "2", "3" and "9",
#          so cohabit_bin is expected to take those values.
#   title: Plot title.
#
# Returns:
#   A ggplot object.
stacked_bar <- function(
    data,
    title = "Default title"
    ) {
  # Fill colour for each cohabit_bin code
  bin_colours <- c(
    "0" = "white",
    "1" = "#075792",
    "2" = "#1e81b0",
    "3" = "#46b1d5",
    "9" = "lightcoral"
  )

  # Legend text for each cohabit_bin code
  bin_labels <- c(
    "0" = "Not living with parents",
    "1" = "Child provides for parent",
    "2" = "Both child and parent are dependent",
    "3" = "Child depends on parent",
    "9" = "Living in institution"
  )

  # Text printed in the middle of each segment. `percent` is divided by 100
  # before formatting, i.e. it is treated as a value on a 0-100 scale.
  segment_text <- geom_text(
    aes(label = scales::percent(percent / 100, accuracy = 0.1)),
    position = position_fill(vjust = 0.5),
    size = 3
  )

  ggplot(data, aes(x = AGE_bucket, y = percent, fill = cohabit_bin)) +
    geom_col(position = "fill") +
    segment_text +
    scale_y_continuous(labels = scales::percent_format()) +
    scale_fill_manual(values = bin_colours, labels = bin_labels) +
    labs(
      title = title,
      x = "Age Group",
      y = "Percentage",
      fill = "Cohabit Bin"
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      legend.position = "bottom"
    )
}
```

```{r summarize-stacked-bar, echo = FALSE}
# Tabulate cohabitation categories by age bucket and plot the result.
#
# Args:
#   data:  Input handed to crosstab_percent(). The column names "PERWT",
#          "REPWTP1" ... "REPWTP80", "AGE_bucket" and "cohabit_bin" are
#          passed to that helper, so they are expected to exist in `data`.
#   title: Plot title, forwarded to stacked_bar().
#
# Returns:
#   Whatever stacked_bar() returns for the summarised table.
summarize_stacked_bar <- function(
    data,
    title = "Default title"
) {
  age_levels <- c(
    "Under 16", "16-17", "17-18", "18-19", "20-21",
    "22-23", "24-25", "26-27", "28-29", "30+"
  )
  bin_levels <- c(0, 1, 2, 3, 9)

  # Percentages are computed within each AGE_bucket.
  # NOTE(review): the shape of crosstab_percent()'s output is defined in the
  # dataduck package - confirm it includes a `percent` column.
  shares <- crosstab_percent(
    data = data,
    weight = "PERWT",
    group_by = c("AGE_bucket", "cohabit_bin"),
    percent_group_by = c("AGE_bucket"),
    every_combo = TRUE,
    repwts = paste0("REPWTP", 1:80)
  )

  # Sort first, then convert both keys to factors with a fixed level order
  shares <- shares |>
    arrange(AGE_bucket, cohabit_bin) |>
    mutate(
      cohabit_bin = factor(cohabit_bin, levels = bin_levels),
      AGE_bucket = factor(AGE_bucket, levels = age_levels)
    )

  stacked_bar(data = shares, title = title)
}
```

## Introduction
This report compares the rates at which young adults lived with their parents in 2012 and in 2022. It presents the percentages for different age groups and cohabitation types in each of the two years, and evaluates whether the changes between them are statistically significant.

## Cohabitation Trends in 2012 and 2022

We begin by loading the data and calculating the percentages of individuals living with their parents across different age groups.
```{r, echo = FALSE}
# ----- Step 0: Load packages ----- #
library("dplyr")
library("duckdb")
library("ipumsr")
library("readr")
library("tidyr")
library("ggplot2")
library("knitr")
library("purrr")

# ----- Step 1: Source helper functions ----- #

devtools::load_all("../dataduck")

# ----- Step 2: Import and wrangle data ----- #

# Open the project's DuckDB file and take a handle to the relationships
# table; the chunks below filter this handle by YEAR, SEX and race/ethnicity.
# NOTE(review): no dbDisconnect() call is visible in this document.
con <- dbConnect(drv = duckdb::duckdb(), dbdir = "data/db/ipums.duckdb")
ipums_relate <- tbl(con, "ipums_relationships")

# Calculate percentages for 2022 and 2012
# 2022: percentage in each cohabit_bin within each AGE_bucket, with a
# standard error for `percent` derived from the 80 replicate weights.
cohabit_2022_with_percents <- estimate_with_bootstrap_se(
  data = filter(ipums_relate, YEAR == 2022),
  f = crosstab_percent,
  wt_col = "PERWT",
  repwt_cols = paste0("REPWTP", 1:80),
  constant = 4 / 80,
  se_cols = c("percent"),
  id_cols = c("AGE_bucket", "cohabit_bin"),
  group_by = c("AGE_bucket", "cohabit_bin"),
  percent_group_by = c("AGE_bucket"),
  every_combo = TRUE
) |>
  # Fix the level order of both keys so tables and plots sort consistently
  mutate(
    cohabit_bin = factor(cohabit_bin, levels = c(0, 1, 2, 3, 9)),
    AGE_bucket = factor(
      AGE_bucket,
      levels = c(
        "Under 16", "16-17", "17-18", "18-19", "20-21",
        "22-23", "24-25", "26-27", "28-29", "30+"
      )
    )
  ) |>
  arrange(AGE_bucket, cohabit_bin)


# 2012: percentage in each cohabit_bin within each AGE_bucket, with a
# standard error for `percent` derived from the 80 replicate weights.
# The filter must select YEAR == 2012; filtering on 2022 here would make
# this table a copy of the 2022 one and every 2012-vs-2022 difference zero.
cohabit_2012_with_percents <- estimate_with_bootstrap_se(
  data = ipums_relate |> filter(YEAR == 2012),
  f = crosstab_percent,
  wt_col = "PERWT",
  repwt_cols = paste0("REPWTP", sprintf("%d", 1:80)),
  constant = 4/80,
  se_cols = c("percent"),
  id_cols = c("AGE_bucket", "cohabit_bin"),
  group_by = c("AGE_bucket", "cohabit_bin"),
  percent_group_by = c("AGE_bucket"),
  every_combo = TRUE
) |>
  # Fix the level order of both keys so tables and plots sort consistently.
  # NOTE(review): values of AGE_bucket not listed in `levels` become NA -
  # confirm the labels below ("16-17", "17-18", "18-19", ...) match the data.
  mutate(
    cohabit_bin = factor(
      cohabit_bin, 
      levels = c(0, 1, 2, 3, 9)
      ),
    AGE_bucket = factor(
      AGE_bucket, 
      levels = 
        c("Under 16", 
          "16-17", 
          "17-18", 
          "18-19", 
          "20-21", 
          "22-23", 
          "24-25", 
          "26-27", 
          "28-29", 
          "30+")
      )) |>
  arrange(AGE_bucket, cohabit_bin)
```

## Statistical Comparison
In this section, we compare the percentages between 2012 and 2022 using a z-test to determine if the differences are statistically significant.

```{r, echo = FALSE}
# Compare cohabitation percentages between 2012 and 2022 with a two-sided
# z-test on each AGE_bucket x cohabit_bin cell.
#
# Args:
#   df1: 2012 estimates. Needs columns AGE_bucket, cohabit_bin, percent and
#        se_percent.
#   df2: 2022 estimates with the same columns.
#
# Returns:
#   A tibble of the cells found in both inputs, with columns AGE_bucket,
#   cohabit_bin, percent_2012, percent_2022, percent_difference,
#   combined_standard_error, z_score, p_value and significant_change.
#
# All of the arithmetic is vectorised, so the table is processed in one pass
# rather than row by row.
compare_percent_changes <- function(df1, df2) {
  df1 |>
    inner_join(
      df2,
      by = c("AGE_bucket", "cohabit_bin"),
      suffix = c("_2012", "_2022")
    ) |>
    mutate(
      percent_difference = percent_2022 - percent_2012,
      # Standard errors are combined as for two independent estimates
      combined_standard_error = sqrt(se_percent_2022^2 + se_percent_2012^2),
      z_score = percent_difference / combined_standard_error,
      p_value = 2 * pnorm(-abs(z_score)),  # Two-tailed p-value calculation
      # A 0 / 0 z-score is NaN, which leaves this flag NA for that cell
      significant_change = ifelse(
        abs(z_score) > 1.96,
        "Significant",
        "Not Significant"
      )
    ) |>
    select(
      AGE_bucket, cohabit_bin, percent_2012, percent_2022,
      percent_difference, combined_standard_error, z_score, p_value,
      significant_change
    ) |>
    as_tibble()
}

# Build the 2012-vs-2022 comparison table (2012 is df1, 2022 is df2)
result <- compare_percent_changes(
  df1 = cohabit_2012_with_percents,
  df2 = cohabit_2022_with_percents
)

# Render the table in the report
kable(
  result,
  caption = "Comparison of Cohabitation Trends (2012 vs 2022)"
)
```


## Visualizing Cohabitation Trends
In this section, we display bar charts showing the percentage of young adults cohabiting with their parents for the years 2012 and 2022. These charts are broken down by age group.

```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# Stacked bar chart of the 2022 estimates
cohabit_2022_with_percents |>
  stacked_bar(title = "Cohabitation by Age Group, 2022")
```

```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# Stacked bar chart of the 2012 estimates
cohabit_2012_with_percents |>
  stacked_bar(title = "Cohabitation by Age Group, 2012")
```

# By Gender
```{r, echo = FALSE}
# 2022 rows with SEX == 2 (plotted below as "Females"): percentage in each
# cohabit_bin within each AGE_bucket.
women_2022 <- crosstab_percent(
  data = filter(ipums_relate, YEAR == 2022, SEX == 2),
  weight = "PERWT",
  group_by = c("AGE_bucket", "cohabit_bin"),
  percent_group_by = c("AGE_bucket"),
  every_combo = TRUE,
  repwts = paste0("REPWTP", 1:80)
) |>
  arrange(AGE_bucket, cohabit_bin) |>
  # Fixed level order for both keys
  mutate(
    cohabit_bin = factor(cohabit_bin, levels = c(0, 1, 2, 3, 9)),
    AGE_bucket = factor(
      AGE_bucket,
      levels = c(
        "Under 16", "16-17", "17-18", "18-19", "20-21",
        "22-23", "24-25", "26-27", "28-29", "30+"
      )
    )
  )

# 2022 rows with SEX == 1 (plotted below as "Males"): percentage in each
# cohabit_bin within each AGE_bucket.
men_2022 <- crosstab_percent(
  data = filter(ipums_relate, YEAR == 2022, SEX == 1),
  weight = "PERWT",
  group_by = c("AGE_bucket", "cohabit_bin"),
  percent_group_by = c("AGE_bucket"),
  every_combo = TRUE,
  repwts = paste0("REPWTP", 1:80)
) |>
  arrange(AGE_bucket, cohabit_bin) |>
  # Fixed level order for both keys
  mutate(
    cohabit_bin = factor(cohabit_bin, levels = c(0, 1, 2, 3, 9)),
    AGE_bucket = factor(
      AGE_bucket,
      levels = c(
        "Under 16", "16-17", "17-18", "18-19", "20-21",
        "22-23", "24-25", "26-27", "28-29", "30+"
      )
    )
  )
```

```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# Stacked bar chart for the SEX == 2 subset of 2022
women_2022 |>
  stacked_bar(title = "Cohabitation by Age Group, 2022: Females")
```
```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# Stacked bar chart for the SEX == 1 subset of 2022
men_2022 |>
  stacked_bar(title = "Cohabitation by Age Group, 2022: Males")
```


# By Race/Ethnicity
Here is a breakdown of the percentage of the population in 2012 and 2022 falling into each of our mutually exclusive and collectively exhaustive race/ethnicity groups. The percentages differ slightly from those published by the Census Bureau because our group definitions differ. For more information on our race/ethnicity aggregation methodology, please see XXX [link to another Rmd here].

```{r, echo = FALSE}

# Calculate percentages for 2022 and 2012
# 2022: tabulation by RACE_ETH_bucket with no within-group denominator
# (percent_group_by is empty), sorted by bucket.
race_ethnicity_2022 <- crosstab_percent(
  data = filter(ipums_relate, YEAR == 2022),
  weight = "PERWT",
  group_by = c("RACE_ETH_bucket"),
  percent_group_by = c(),
  every_combo = TRUE,
  repwts = paste0("REPWTP", 1:80)
)
race_ethnicity_2022 <- arrange(race_ethnicity_2022, RACE_ETH_bucket)

# 2012: tabulation by RACE_ETH_bucket with no within-group denominator
# (percent_group_by is empty), sorted by bucket.
race_ethnicity_2012 <- crosstab_percent(
  data = filter(ipums_relate, YEAR == 2012),
  weight = "PERWT",
  group_by = c("RACE_ETH_bucket"),
  percent_group_by = c(),
  every_combo = TRUE,
  repwts = paste0("REPWTP", 1:80)
)
race_ethnicity_2012 <- arrange(race_ethnicity_2012, RACE_ETH_bucket)

# Compare race/ethnicity percentages between 2012 and 2022 with a two-sided
# z-test on each RACE_ETH_bucket.
#
# Args:
#   df1: 2012 estimates. Needs columns RACE_ETH_bucket, percent and
#        percent_standard_error.
#   df2: 2022 estimates with the same columns.
#
# Returns:
#   A tibble of the buckets found in both inputs, with columns
#   RACE_ETH_bucket, percent_2012, percent_2022, percent_difference,
#   combined_standard_error, z_score, p_value and significant_change.
#
# All of the arithmetic is vectorised, so the table is processed in one pass
# rather than row by row.
compare_race_eth_changes <- function(df1, df2) {
  df1 |>
    inner_join(
      df2,
      by = c("RACE_ETH_bucket"),
      suffix = c("_2012", "_2022")
    ) |>
    mutate(
      percent_difference = percent_2022 - percent_2012,
      # Standard errors are combined as for two independent estimates
      combined_standard_error = sqrt(
        percent_standard_error_2022^2 + percent_standard_error_2012^2
      ),
      z_score = percent_difference / combined_standard_error,
      p_value = 2 * pnorm(-abs(z_score)),  # Two-tailed p-value calculation
      # A 0 / 0 z-score is NaN, which leaves this flag NA for that bucket
      significant_change = ifelse(
        abs(z_score) > 1.96,
        "Significant",
        "Not Significant"
      )
    ) |>
    select(
      RACE_ETH_bucket, percent_2012, percent_2022, percent_difference,
      combined_standard_error, z_score, p_value, significant_change
    ) |>
    as_tibble()
}

# Perform comparison for race/ethnicity between 2012 and 2022
race_eth_comparison <- compare_race_eth_changes(race_ethnicity_2012, race_ethnicity_2022)

# Display the results. The inputs are tabulated by RACE_ETH_bucket only
# (no cohabit_bin), so the table shows each group's share of the population
# and the caption is worded accordingly.
kable(race_eth_comparison, caption = "Comparison of Population Shares by Race/Ethnicity (2012 vs 2022)")

```

# Asian American/Pacific Islanders

```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# 2022 rows with RACE_ETH_bucket "AAPI" and SEX == 2
ipums_relate |>
  filter(YEAR == 2022, RACE_ETH_bucket == "AAPI", SEX == 2) |>
  summarize_stacked_bar(
    title = "Cohabitation by Age Group, 2022: AAPI Females"
  )
```


```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# 2022 rows with RACE_ETH_bucket "AAPI" and SEX == 1
ipums_relate |>
  filter(YEAR == 2022, RACE_ETH_bucket == "AAPI", SEX == 1) |>
  summarize_stacked_bar(
    title = "Cohabitation by Age Group, 2022: AAPI Males"
  )
```

# American Indians / Alaska Natives

```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# 2022 rows with RACE_ETH_bucket "AIAN" and SEX == 2
ipums_relate |>
  filter(YEAR == 2022, RACE_ETH_bucket == "AIAN", SEX == 2) |>
  summarize_stacked_bar(
    title = "Cohabitation by Age Group, 2022: AIAN Females"
  )
```


```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# 2022 rows with RACE_ETH_bucket "AIAN" and SEX == 1
ipums_relate |>
  filter(YEAR == 2022, RACE_ETH_bucket == "AIAN", SEX == 1) |>
  summarize_stacked_bar(
    title = "Cohabitation by Age Group, 2022: AIAN Males"
  )
```

# Black Americans

```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# 2022 rows with RACE_ETH_bucket "Black" and SEX == 2
ipums_relate |>
  filter(YEAR == 2022, RACE_ETH_bucket == "Black", SEX == 2) |>
  summarize_stacked_bar(
    title = "Cohabitation by Age Group, 2022: Black American Females"
  )
```


```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# 2022 rows with RACE_ETH_bucket "Black" and SEX == 1
ipums_relate |>
  filter(YEAR == 2022, RACE_ETH_bucket == "Black", SEX == 1) |>
  summarize_stacked_bar(
    title = "Cohabitation by Age Group, 2022: Black American Males"
  )
```

# Hispanic Americans

```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# 2022 rows with RACE_ETH_bucket "Hispanic" and SEX == 2
ipums_relate |>
  filter(YEAR == 2022, RACE_ETH_bucket == "Hispanic", SEX == 2) |>
  summarize_stacked_bar(
    title = "Cohabitation by Age Group, 2022: Hispanic American Females"
  )
```


```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# 2022 rows with RACE_ETH_bucket "Hispanic" and SEX == 1
ipums_relate |>
  filter(YEAR == 2022, RACE_ETH_bucket == "Hispanic", SEX == 1) |>
  summarize_stacked_bar(
    title = "Cohabitation by Age Group, 2022: Hispanic American Males"
  )
```

# Multiracial Americans

```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# 2022 rows with RACE_ETH_bucket "Multiracial" and SEX == 2
ipums_relate |>
  filter(YEAR == 2022, RACE_ETH_bucket == "Multiracial", SEX == 2) |>
  summarize_stacked_bar(
    title = "Cohabitation by Age Group, 2022: Multiracial American Females"
  )
```


```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# 2022 rows with RACE_ETH_bucket "Multiracial" and SEX == 1
ipums_relate |>
  filter(YEAR == 2022, RACE_ETH_bucket == "Multiracial", SEX == 1) |>
  summarize_stacked_bar(
    title = "Cohabitation by Age Group, 2022: Multiracial American Males"
  )
```

# White Americans

```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# 2022 rows with RACE_ETH_bucket "White" and SEX == 2
ipums_relate |>
  filter(YEAR == 2022, RACE_ETH_bucket == "White", SEX == 2) |>
  summarize_stacked_bar(
    title = "Cohabitation by Age Group, 2022: White American Females"
  )
```


```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# 2022 rows with RACE_ETH_bucket "White" and SEX == 1
ipums_relate |>
  filter(YEAR == 2022, RACE_ETH_bucket == "White", SEX == 1) |>
  summarize_stacked_bar(
    title = "Cohabitation by Age Group, 2022: White American Males"
  )
```

# Other Americans

```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# 2022 rows with RACE_ETH_bucket "Other" and SEX == 2
ipums_relate |>
  filter(YEAR == 2022, RACE_ETH_bucket == "Other", SEX == 2) |>
  summarize_stacked_bar(
    title = "Cohabitation by Age Group, 2022: Not Otherwise Classified American Females"
  )
```


```{r, echo = FALSE, fig.width = 10, fig.height = 10}
# 2022 rows with RACE_ETH_bucket "Other" and SEX == 1
ipums_relate |>
  filter(YEAR == 2022, RACE_ETH_bucket == "Other", SEX == 1) |>
  summarize_stacked_bar(
    title = "Cohabitation by Age Group, 2022: Not Otherwise Classified American Males"
  )
```