-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmultidim-matrix.R
59 lines (46 loc) · 1.94 KB
/
multidim-matrix.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# multidim-matrix.R
#
# This script produces a multidimensional aggregate matrix on synthetic data
# as a proof-of-concept for aggregation methods in the IPUMS data.
# ----- Step 0: Source helper functions and packages
library("data.table")
library("magrittr")
source("src/utils/create-synthetic-data.R")
source("src/utils/create-categorical-buckets.R")
# ----- Step 1: Generate synthetic data
# Fix the RNG seed up front so repeated runs of this script are replicable
set.seed(seed = 42)
# Build the synthetic household data using the helper defined outside this
# script
household_data <- generate_household_data()
# ----- Step 2: Bucket the data
# Buckets are defined in lookup tables stored as .csv files under the
# lookup_tables/ directory (paths are relative to the working directory).
# Several bucketing schemes are saved there; here we explicitly choose one
# .csv file per variable.
age_lookup_table <- read.csv(
  "lookup_tables/age/age_buckets00.csv",
  stringsAsFactors = FALSE
)
hhincome_lookup_table <- read.csv(
  "lookup_tables/hhincome/hhincome_buckets00.csv",
  stringsAsFactors = FALSE
)

# Use the lookup tables to add bucket columns to the data frame. Each call
# receives the output of the previous step through the `.` placeholder.
household_data_bucketed <- household_data %>%
  # Age buckets
  generate_bucket_continuous(
    data = .,
    lookup_table = age_lookup_table,
    column_name = "AGE"
  ) %>%
  # Household income buckets
  generate_bucket_continuous(
    data = .,
    lookup_table = hhincome_lookup_table,
    column_name = "HHINCOME"
  )

# Open the data viewer only in interactive sessions: under Rscript / batch
# runs View() has no viewer to open and can abort the script before Step 3.
if (interactive()) {
  View(household_data_bucketed)
}
# ----- Step 3: Produce aggregate household sizes in data table
# Turn the bucketed data frame into a data.table so the grouped summary
# below can use data.table's `[i, j, by]` syntax.
household_dt <- as.data.table(household_data_bucketed)

# One output row per (AGE_bucket, HHINCOME_bucket, SEX) combination:
#   weighted_mean - mean of PERNUM weighted by HHWT (NAs are not removed)
#   count         - number of rows that fall in the group
# TODO: replace HHWT with the correct person-level weight.
aggregate_dt <- household_dt[
  ,
  list(
    weighted_mean = weighted.mean(PERNUM, w = HHWT, na.rm = FALSE),
    count = .N
  ),
  by = c("AGE_bucket", "HHINCOME_bucket", "SEX")
]

# Show the aggregated table
print(aggregate_dt)