MAP-Census-LEHD-Commute-Patterns/1-ProcessLEHDforCommutePatterns.Rmd at main · WFRCAnalytics/MAP-Census-LEHD-Commute-Patterns · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
---
title: "Process LEHD for Commute Patterns"
author: "Chris Day"
date: "2023-01-06"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
```

```{r}
library(tidyverse)
library(foreign)
library(sf)
library(jsonlite)
library(mapview)
library(imputeTS)
```

## Read Config Variables
```{r}
config <- fromJSON("config.json")
config_year <- config$year
config_bg_year <- config$bg_year
config_bk_year <- config$bk_year
config_bk_geoid <- config$bk_geoid
```


## Data
```{r}
#read in the LEHD data
lehd_raw <- read_csv(paste0("data/data",config_year,"/ut_od_main_JT00_",config_year,".csv")) %>%
  select(w_geocode, h_geocode, S000) # S000 is Number of Jobs
```

```{r}
#read in township codes and adjust the shortdesc to make merging easier later on
township_codes <- read_csv(paste0("data/data",config_year,"/citytownship.csv")) %>%
  select(1,2) %>%
  mutate(SHORTDESC = case_when(
    CODE3 == "EMT" ~ "EMIGRATION CANYON",
    CODE3 == "CMT" ~ "COPPERTON",
    CODE3 == "KMT" ~ "KEARNS",
    CODE3 == "MMT" ~ "MAGNA CITY",
    CODE3 == "WHT" ~ "WHITE CITY",
    TRUE ~ SHORTDESC
  ))
```

```{r}
#read in the uofu adjustment factors (if needed)
#uofu_adj <- read_csv("data/data2018/490351108004_Manual_Allocation.csv") %>%
#  mutate(GEOID = as.factor(GEOID))
```

```{r}
# a list of all the city/townships in the wfrc areas
wfrc_towns <- c("AFK","ALA","ALP","BDL","BGM","BNT","BRT","CDF","CEN","CHA","CHL","CLF","CLI","CMT","COA","CWH","DAN","DRA","EAG","ELK","EMT","FAR","FCS","FFD","FRR","FTH","GLA","GOS","GRL","HAR","HDT","HDT","HEB","HER","HGH","HNF","HOL","HOO","HVL","IND","INT","KAY","KMS","KMT","LAY","LEH","LIN","MAP","MID","MLC","MMT","MRG","MSL","MUR","MWY","NOG","NSL","OGD","OKL","ORM","PAY","PGR","PLN","PRK","PRY","PVO","PVW","ROY","RVD","RVT","SAN","SAQ","SAR","SFK","SJC","SLC","SLM","SOG","SPV","SSL","SUN","SWE","SYR","TAY","TOO","UIN","VIN","WAT","WBG","WDL","WEB","WHT","WHV","WIL","WJC","WPT","WVC","WXC")

```

## Geographic Data
```{r}
#read in block group tiger line file for specific year
blockgroups <- st_read(paste0("data/data",config_year,"/tl_",config_bg_year,"_49_bg/tl_",config_bg_year,"_49_bg.shp")) %>%
  st_transform(crs = 4326) %>%
  st_as_sf() %>%
  st_transform(26912) %>%
  select(GEOID,geometry)
```


```{r}
#read in blocks and convert to point geometry, merge blockgroups to get GEOID & GEOID20 in same file
blockpoints <- st_read(paste0("data/data",config_year,"/CensusBlocks",config_bk_year,"/CensusBlocks",config_bk_year,".shp")) %>%
  st_transform(crs = 4326) %>%
  st_as_sf() %>%
  st_transform(26912) %>%
  st_centroid() %>%
  select(config_bk_geoid, geometry)%>%
  st_join(blockgroups, join=st_within, left=TRUE)
  #mutate(
  #  xcol = st_coordinates(geometry)[, 1],
  #  ycol = st_coordinates(geometry)[, 2]
  #) %>%
  #st_transform(crs = 4326) %>%
  #st_as_sf() %>%
  #st_transform(26912)

#mapview(blockpoints)
```
```{r}
## block level coordinates (for 2018 & 2019 data)
#blockpoints <- read.dbf(paste0("data/data",year,"/blockcentersWBlockGroup.dbf")) %>%
#  select(GEOID, GEOID10,x,y) %>% st_as_sf(coords = c("x","y"), crs = 4326) %>%
#  st_transform(crs = 4326) %>%
#  st_as_sf() %>%
#  st_transform(26912)
```


```{r}
# township shapefile
sf_townships <- st_read(paste0("data/data",config_year,"/Municipalities_Township/Municipalities_Township.shp")) %>%
  st_transform(crs = 4326) %>%
  st_as_sf() %>%
  st_transform(26912)

# backup township shapefile for invalid geometries
sf_townships_backup <- st_read(paste0("data/data",2019,"/Municipalities_Township/Municipalities_Township.shp")) %>%
  st_transform(crs = 4326) %>%
  st_as_sf() %>%
  st_transform(26912)
```

```{r}
# determine if some geometries are invalid
if (length(invalid_idx) == 0) {
  cat("✅ All geometries are valid.\n")
} else {
  sf_invalid <- sf_townships[invalid_idx, ]
  cat("⚠️ Found", length(invalid_idx), "invalid geometries.\n")
  print(st_is_valid(sf_invalid, reason = TRUE))
  mapview::mapview(sf_invalid)
}
```

```{r}
#FIX INVALID GEOMETRIES - by replacing with previous set of valid ones

# 1. Identify invalid geometries
invalid_idx <- which(!st_is_valid(sf_townships))
cat("⚠️ Found", length(invalid_idx), "invalid geometries to replace.\n")

# 2. SHORTDESC of invalid rows
invalid_shortdesc <- sf_townships$SHORTDESC[invalid_idx]

# 3. Extract replacement rows from backup by SHORTDESC
replacement_rows_all <- sf_townships_backup[sf_townships_backup$SHORTDESC %in% invalid_shortdesc, ]

# 4. Drop invalid rows from sf_townships
sf_townships_clean <- sf_townships[-invalid_idx, ]

# 5. Align columns: get common columns
common_cols <- intersect(names(sf_townships_clean), names(replacement_rows_all))

# 6. Subset both to common columns (in sf_townships_clean order)
sf_townships_clean_sub <- sf_townships_clean[, common_cols]
replacement_rows_sub <- replacement_rows_all[, common_cols]

# 7. For any columns in sf_townships_clean but missing in replacement_rows, add NA
missing_cols <- setdiff(names(sf_townships_clean), names(replacement_rows_all))
if (length(missing_cols) > 0) {
  for (col in missing_cols) {
    replacement_rows_sub[[col]] <- NA
  }
  # Reorder to sf_townships_clean column order
  replacement_rows_sub <- replacement_rows_sub[, names(sf_townships_clean_sub)]
}

# 8. Combine
sf_townships_fixed <- rbind(sf_townships_clean_sub, replacement_rows_sub)

# 9. Optional: reorder by SHORTDESC if you want
sf_townships_fixed <- sf_townships_fixed[order(sf_townships_fixed$SHORTDESC), ]
```

## Merge Townships and Blockgroups
```{r}
blockpoint_city <- st_join(blockpoints, sf_townships_fixed, join = st_within) %>%
  mutate_at(vars(1:2), as.double)
blockpoint_city_full <- blockpoint_city %>%
  select(GEOID, config_bk_geoid, COUNTYNBR, NAME, SHORTDESC) %>%
  left_join(township_codes, by = c("SHORTDESC" = "SHORTDESC")) %>%
  select(config_bk_geoid, CODE3, SHORTDESC) %>%
  mutate(config_bk_geoid = as.numeric(as.character(config_bk_geoid))) %>%
  mutate(CODE3 = ifelse(is.na(CODE3), "XXX", CODE3),
         SHORTDESC = ifelse(is.na(SHORTDESC), "Outside Area", SHORTDESC))

# get the township block point data that corresponds to the wfrc modeling area
blockpoints_wfrc <- blockpoint_city_full %>%
  mutate(CODE3 = ifelse(CODE3 %in% wfrc_towns, CODE3, "XXX"),
         SHORTDESC = ifelse(CODE3 %in% wfrc_towns, SHORTDESC, "Outside WFRC Area"))

#mapview(blockpoints_wfrc %>% filter(CODE3 != 'XXX'))
```


## Process LEHD data at the Block level for home/work locations
```{r}
# get home/work number of jobs at blockpoint city level
homeside <- left_join(lehd_raw,blockpoint_city_full, by = c("h_geocode" = config_bk_geoid)) %>%
  mutate(h_geocode = as.character(h_geocode), w_geocode = as.character(w_geocode))
workside <- left_join(lehd_raw,blockpoint_city_full, by = c("w_geocode" = config_bk_geoid)) %>%
  mutate(h_geocode = as.character(h_geocode), w_geocode = as.character(w_geocode))

# get home/work number of jobs at blockpoint city level for the wfrc modeling area
homeside_wfrc <- homeside %>%
  mutate(CODE3 = ifelse(CODE3 %in% wfrc_towns, CODE3, "XXX"),
         SHORTDESC = ifelse(CODE3 %in% wfrc_towns, SHORTDESC, "Outside WFRC Area"))
workside_wfrc <- workside %>%
  mutate(CODE3 = ifelse(CODE3 %in% wfrc_towns, CODE3, "XXX"),
         SHORTDESC = ifelse(CODE3 %in% wfrc_towns, SHORTDESC, "Outside WFRC Area"))
```

## Summarize LEHD-Blockpoint data for Python Script
```{r}
# get every possible combination of block geoids and city/township
blockseq <-  homeside_wfrc %>%
  left_join(blockpoints, by = c("w_geocode" = config_bk_geoid)) %>%
  expand(GEOID, CODE3 = (c(wfrc_towns,"XXX")))
```

```{r}
# summarize home-based job data at the blockpoint-city/township level -- including totals
# {CODE3}_h represents number of workers who have a home location in city/town
homeside_wide <- homeside_wfrc %>%
  left_join(blockpoints, by = c("w_geocode" = config_bk_geoid)) %>%
  select(-SHORTDESC, -h_geocode) %>%
  group_by(GEOID, CODE3) %>%
  summarize(SumS000 = sum(S000)) %>% ungroup()
homeside_wide_full <- blockseq %>%
  left_join(homeside_wide)%>%
  complete(GEOID,CODE3) %>%
  arrange(CODE3) %>%
  pivot_wider(names_from = CODE3, values_from = SumS000, values_fn = sum, names_glue = "{CODE3}_h") %>%
  na_replace(0)
homeside_wide_ttl <- homeside_wide_full %>% as.tibble() %>%
  mutate(TTL_h = rowSums(.[2:99])) %>%
  select(-XXX_h)
```

```{r}
# summarize work-based job data at the blockpoint-city/township level -- including totals
# {CODE3}_w represents number of workers who have a work location in city/town
workside_wide <- workside_wfrc %>%
  left_join(blockpoints, by = c("h_geocode" = config_bk_geoid)) %>%
  select(-SHORTDESC, -w_geocode) %>%
  group_by(GEOID, CODE3) %>%
  summarize(SumS000 = sum(S000)) %>% ungroup()
workside_wide_full <- blockseq %>%
  left_join(workside_wide)%>%
  complete(GEOID,CODE3) %>%
  arrange(CODE3) %>%
  pivot_wider(names_from = CODE3, values_from = SumS000, values_fn = sum, names_glue = "{CODE3}_w") %>%
  na_replace(0)
workside_wide_ttl <- workside_wide_full %>% as.tibble() %>%
  mutate(TTL_w = rowSums(.[2:99])) %>%
  select(-XXX_w)
```

```{r}
# merge home and work based data into single dataframe
home_work_ttl <- left_join(homeside_wide_ttl, workside_wide_ttl)
```


## Fix Misplaced College Jobs
In the 2018 and 2019 data, there were about 20,000 additional educational jobs in holiday (block group 490351108004001) and about 18,000+ too few jobs in the UofU campus block group. And so, we reallocated them.

In the 2022 data, holiday did not have any block groups with an over-abundance of educational jobs, and UofU block group looked to have a reasonable number. However, the majority of the educational jobs for BYU are located a few block groups north of the campus location. As a result, we reallocated these values below.


Steps to follow:
  - Read in the "C" LEHD LODES dataset to determine the top five 5 block groups with educational jobs
  - Double check if these locations make sense and which colleges they correspond to
  - Match these up with the "S" LEHD LODES dataset
  - Determine if somewhere is mixed up

First we read in the blockgroup shapefile spatial dataframe.
```{r}
blockpoints2 <- blockpoints %>% as.tibble() %>% select(-geometry)
blockgroups2 <- blockgroups %>%
  as_tibble() %>%
  left_join(blockpoints2) %>%
  select(GEOID,config_bk_geoid,geometry)
geoid <- blockgroups2 %>% select(-config_bk_geoid) %>%
  unique()
```

We also read in the college shapefile to compare with the LDOES data.
```{r}
slTazPolygon <- st_read("data/data2019/SL_College/StreetLight_TAZ_2021_09_22.shp") %>%
  filter(SUBAREAID == 1) %>%
  select(SA_TAZID,SL_COTAZID) %>%
  st_set_crs(26912)

collegePolygon <- read_csv("data/data2019/SL_College/College_to_SL_COTAZID.csv") %>%
  left_join(slTazPolygon) %>%
  st_as_sf()
```

Next we read in the Workplace Area Characteristics (WAC) LODES data. This will tell us how many jobs exist within each block. We merge the block level data to get it at the block level.
```{r}
utrac <- read_csv(paste0("data/data",config_year,"/ut_wac_S000_JT00_",config_year,".csv")) %>%
  mutate(w_geocode = as.factor(w_geocode)) %>%
  select(w_geocode,C000,CNS15) %>%
  left_join(blockpoints, by = c("w_geocode" = config_bk_geoid)) %>%
  group_by(GEOID) %>%
  summarize(C000 = sum(C000),        # C000 = total jobs
            CNS15 = sum(CNS15)) %>%  # CNS15 = educational jobs
  left_join(geoid) %>%
  arrange(desc(CNS15)) %>%
  st_as_sf()

mapview(utrac)
```


Below is a map comparing the location of blockgroups with more than 2000 educational jobs as well as the college locations. Notice how there is no block groups next to UofU! This means the data was not fixed for the 2019 LODES dataset, and we must reassign the educational employment in Holladay to the UofU area.

```{r}
utracC <- utrac %>% filter(CNS15 > 2000)
#utracC_Exact <- utrac %>% filter(GEOID %in% c(490490016022, 490490011033, 490351135103, 490351014022, 490351014011,490572015003))


mapview(utracC, col.regions = "red") +
  mapview(collegePolygon) +
  mapview(utrac, col.regions='green')
```


```{r}
byu_2022_hhjobsViewer <- data.frame(GEOID = c(490490015042,490490016022),
                                    DESCRIP = c('RockCanyonElem','BYU_Campus'),
                                    SOURCE = c("TOT",'TOT','EDU/OFF','EDU/OFF'),
                                    HHJOB = c(389, 15037, 285, 15037),
                                    LODES = c(7879, 114, 7532, 0))%>%
  mutate(Difference = abs(HHJOB - LODES))
tibble(byu_2022_hhjobsViewer)
```
After analyzing the map, we created a table which compares the employment data of the LODES dataset with the employment data from the Household and Job Forecasts Viewer for 2022 (https://wfrc.org/household-job-forecast-map/). It looks like for the main block group corresponding to UofU campus, a total of 14923 jobs are missing -- most of them we assume to be educational jobs because currently the LODES data only has 0 of those jobs in that area!

Furthermore, we see that with block group 490490015042, the LODES data has 7490 too many jobs in that block group compared to the household and Jobs Forecast Viewer! And since the BYU block group is missing at 14923 overall jobs, we will make an assumption by moving 7490 jobs from block group 490490015042 to block group 490490016022 in the LODES data.

This means we want to keep only 5.21% of jobs that currently exist in block group 490490015042 We also want to multiple the jobs within block group 490490016022 by 7676% to get a total of 7604 jobs in the BYU block group area (in other words to add the 7490 jobs that we took away from the RockCanyonElem block group).

Below we make the necessary adjustments by multiplying the two block groups by the ratio needed to correct for the BYU adjustment. Then, we "patch" these two block groups back into the processed dataset.

```{r}
byu_adjust <- home_work_ttl %>%
  filter(GEOID %in% c('490490015042','490490016022'))
byu_adjust[1,c(2:98)] <- byu_adjust[1,c(2:98)]*.0521
byu_adjust[2,c(2:98)] <- byu_adjust[2,c(2:98)]*76.76
byu_adjust[1,1] <- '490490015042'
byu_adjust[2,1] <- '490490016022'
byu_adjustr <- byu_adjust %>%
  mutate_if(is.numeric, round) %>%
  mutate(TTL_h = rowSums(.[2:98]))

print(byu_adjustr['TTL_h'])

home_work_ttl_fix <- home_work_ttl %>%
  filter(!GEOID %in% c('490490015042','490490016022')) %>%
  bind_rows(byu_adjustr) %>%
  arrange(GEOID)
```

## Combine Datasets

```{r}
# calculate a fakeid and calculate a column sum
results_at_blockgroups <- home_work_ttl_fix %>%
  mutate(fakeid = row_number() - 1) %>%
  select(198,1:197) %>%
  arrange(fakeid) %>%
bind_rows(summarise(.,
                    across(where(is.numeric), sum),
                    across(where(is.factor), ~"colSUM"))) %>%
  slice(n(),1:(n()-1)) %>%
  mutate(fakeid = ifelse(GEOID == "colSUM", 0,fakeid)) %>%
  mutate(fakeid = ifelse(is.na(fakeid),0,fakeid)) %>%
  mutate(GEOID = ifelse(is.na(GEOID),"colSUM",GEOID))

triplecheck <-results_at_blockgroups%>%filter(GEOID=='490490016022')
```

## Result
```{r}
#write output csv to be used in the python script which create the webapp inputs
write_csv(results_at_blockgroups, paste0("data/data",config_year,"/lehd_at_blockgroup_level",config_year,".csv"))
```