Skip to content

Commit

Permalink
initial issue 9 commit
Browse files Browse the repository at this point in the history
  • Loading branch information
geneorama committed Nov 13, 2014
1 parent 5dd6a9d commit 64d86d6
Show file tree
Hide file tree
Showing 24 changed files with 1,125 additions and 731 deletions.
6 changes: 3 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/FoodInspectionEvaluation.zip
/.project
*.Rproj.user
food-inspections-evaluation.Rproj
.Rproj.user
.Rhistory
.Rproj
data/
DATA_ORIGINAL/
104 changes: 104 additions & 0 deletions 10_download_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
##==============================================================================
## INITIALIZE
##==============================================================================
## Start from an empty workspace, then run the garbage collector
rm(list = ls())
gc(reset = TRUE)
## Install geneorama from GitHub when it is not installed; devtools is
## installed first when it is missing too
if (!("geneorama" %in% rownames(installed.packages()))) {
  if (!("devtools" %in% rownames(installed.packages()))) {
    install.packages("devtools")
  }
  devtools::install_github("geneorama/geneorama")
}
## Load libraries, then source the project helpers kept under functions/
geneorama::detach_nonstandard_packages()
geneorama::loadinstall_libraries("geneorama")
geneorama::sourceDir("functions/")


## Tokens:
## `mytoken` is passed as `apptoken` to every chi_dp_downloader() call below.
## Set the CHI_DP_APPTOKEN environment variable to override it; when that
## variable is unset or empty the script falls back to the city-account token
## it has always used, so existing runs behave as before.
## NOTE(review): these tokens are committed to version control -- consider
## rotating them and supplying the token through the environment only.
# mytoken <- "NCxdKMXKT2fPVvmZQnCdziPel" ## gmail account
mytoken <- Sys.getenv("CHI_DP_APPTOKEN")
if (!nzchar(mytoken)) {
  mytoken <- "YPSdn0B006OmWzSQkhIBpDc0R" ## city account
}

## `multi` is passed as `multicore` to every chi_dp_downloader() call below
multi <- FALSE
# multi <- TRUE

##==============================================================================
## DOWNLOAD FILES FROM DATA PORTAL AS CSV
##==============================================================================
## Business licenses
chi_dp_downloader(
  db = "r5kz-chrr",
  outdir = "data/bus_license",
  multicore = multi,
  apptoken = mytoken,
  useaskey = "id"
)
## Crime
chi_dp_downloader(
  db = "ijzp-q8t2",
  outdir = "data/crime",
  multicore = multi,
  apptoken = mytoken,
  useaskey = "id"
)
## Food inspections
chi_dp_downloader(
  db = "4ijn-s7e5",
  outdir = "data/food_inspections",
  multicore = multi,
  apptoken = mytoken,
  useaskey = "inspection_id"
)
## Garbage carts
chi_dp_downloader(
  db = "9ksk-na4q",
  outdir = "data/garbage_carts",
  multicore = multi,
  apptoken = mytoken,
  useaskey = "service_request_number"
)
## Sanitation code complaints
chi_dp_downloader(
  db = "me59-5fac",
  outdir = "data/sanitation_code",
  multicore = multi,
  apptoken = mytoken,
  useaskey = "service_request_number"
)


##==============================================================================
## CONVERT FILES FROM CSV TO RDS FILES (ALSO CHECK FOR DATES AND CONVERT THOSE)
##==============================================================================
chi_dp_csv2rds(indir = "data/bus_license")
chi_dp_csv2rds(indir = "data/crime")
chi_dp_csv2rds(indir = "data/food_inspections")
chi_dp_csv2rds(indir = "data/garbage_carts")
chi_dp_csv2rds(indir = "data/sanitation_code")

## Delete the directories holding the downloaded parts: for each data set,
## first the files matched by the wildcard, then the directory itself
unlink("data/bus_license/*")
unlink("data/bus_license", recursive = TRUE, force = TRUE)

unlink("data/crime/*")
unlink("data/crime", recursive = TRUE, force = TRUE)

unlink("data/food_inspections/*")
unlink("data/food_inspections", recursive = TRUE, force = TRUE)

unlink("data/garbage_carts/*")
unlink("data/garbage_carts", recursive = TRUE, force = TRUE)

unlink("data/sanitation_code/*")
unlink("data/sanitation_code", recursive = TRUE, force = TRUE)

##==============================================================================
## SMALL FIXES
##==============================================================================

## Read the freshly written Rds files back in
business             <- readRDS("data/bus_license.Rds")
crime                <- readRDS("data/crime.Rds")
foodInspect          <- readRDS("data/food_inspections.Rds")
garbageCarts         <- readRDS("data/garbage_carts.Rds")
sanitationComplaints <- readRDS("data/sanitation_code.Rds")

## Run geneorama's IntNum conversion on each table; the results are not
## assigned, so the tables are presumably modified by reference -- confirm
geneorama::convert_datatable_IntNum(business)
geneorama::convert_datatable_IntNum(crime)
geneorama::convert_datatable_IntNum(foodInspect)
geneorama::convert_datatable_IntNum(garbageCarts)
geneorama::convert_datatable_IntNum(sanitationComplaints)

## Arrest and Domestic become logical columns
crime[, `:=`(Arrest = as.logical(Arrest),
             Domestic = as.logical(Domestic))]

## Print the structure of every table for a visual check
str(business)
str(crime)
str(foodInspect)
str(garbageCarts)
str(sanitationComplaints)

## Sanitation complaints: drop the row that (somewhat) repeats the header,
## then turn the non-numeric Latitude column into a numeric one
sanitationComplaints <- sanitationComplaints[
  Service_Request_Number != "SERVICE REQUEST NUMBER"]
sanitationComplaints[, Latitude := as.numeric(Latitude)]

## Overwrite the Rds files with the corrected tables
saveRDS(business,             "data/bus_license.Rds")
saveRDS(crime,                "data/crime.Rds")
saveRDS(foodInspect,          "data/food_inspections.Rds")
saveRDS(garbageCarts,         "data/garbage_carts.Rds")
saveRDS(sanitationComplaints, "data/sanitation_code.Rds")














68 changes: 68 additions & 0 deletions 11_Filter_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@

##==============================================================================
## INITIALIZE
##==============================================================================
## Start from an empty workspace, then run the garbage collector
rm(list = ls())
gc(reset = TRUE)
## Install geneorama from GitHub when it is not installed; devtools is
## installed first when it is missing too
if (!("geneorama" %in% rownames(installed.packages()))) {
  if (!("devtools" %in% rownames(installed.packages()))) {
    install.packages("devtools")
  }
  devtools::install_github("geneorama/geneorama")
}
## Load libraries (only data.table is requested in this script)
geneorama::detach_nonstandard_packages()
# geneorama::loadinstall_libraries(c("geneorama", "data.table"))
geneorama::loadinstall_libraries("data.table")

##==============================================================================
## LOAD CACHED RDS FILES
##==============================================================================
business             <- readRDS("data/bus_license.Rds")
crime                <- readRDS("data/crime.Rds")
foodInspect          <- readRDS("data/food_inspections.Rds")
garbageCarts         <- readRDS("data/garbage_carts.Rds")
sanitationComplaints <- readRDS("data/sanitation_code.Rds")

##==============================================================================
## BUSINESS
##==============================================================================
## Keep licenses that have both a term start and a term expiration date and
## whose application type is neither "C_CAPA" nor "C_SBA"
business <- business[
  !is.na(LICENSE_TERM_START_DATE) &
    !is.na(LICENSE_TERM_EXPIRATION_DATE) &
    !(APPLICATION_TYPE %in% c("C_CAPA", "C_SBA"))
]
saveRDS(business, "data/bus_license_filtered.Rds")

##==============================================================================
## CRIME
##==============================================================================
## Keep burglaries dated after 2011-07-01 that have a latitude, a longitude
## and a date
crime <- crime[
  Date > as.IDate("2011-07-01") &
    !is.na(Latitude) &
    !is.na(Longitude) &
    !is.na(Date) &
    Primary_Type == "BURGLARY"
]
saveRDS(crime, "data/crime_filtered.Rds")

##==============================================================================
## FOOD INSPECTIONS
##==============================================================================
## Three chained subsets, applied left to right:
##   1) rows with both an inspection date and a license
##   2) the first row for each Inspection_ID
##   3) canvass inspections after 2011-09-01 with a non-zero license
foodInspect <- foodInspect[
  !is.na(Inspection_Date) & !is.na(License)
][
  !duplicated(Inspection_ID)
][
  License != 0 &
    Inspection_Date > as.IDate("2011-09-01") &
    Inspection_Type == "Canvass"
]
saveRDS(foodInspect, "data/food_inspections_filtered.Rds")

##==============================================================================
## garbage subsets
##==============================================================================
## Keep "Completed" / "Open" requests that have a latitude, a longitude and a
## creation date
garbageCarts <- garbageCarts[
  Status %in% c("Completed", "Open") &
    !is.na(Latitude) &
    !is.na(Longitude) &
    !is.na(Creation_Date)
]
# garbageCarts$status <- NULL
saveRDS(garbageCarts, "data/garbage_carts_filtered.Rds")

##==============================================================================
## sanitation subsets
##==============================================================================
## Keep "Completed" / "Open" complaints that have a latitude, a longitude and
## a creation date
sanitationComplaints <- sanitationComplaints[
  Status %in% c("Completed", "Open") &
    !is.na(Latitude) &
    !is.na(Longitude) &
    !is.na(Creation_Date)
]
# sanitationComplaints$status <- NULL
saveRDS(sanitationComplaints, "data/sanitation_code_filtered.Rds")


173 changes: 173 additions & 0 deletions 12_Merge.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@

## Halt here: nothing below runs when the script is executed top to bottom.
## NOTE(review): presumably a guard because this script is unfinished --
## confirm before removing.
stop()

##==============================================================================
## INITIALIZE
##==============================================================================
## Start from an empty workspace, then run the garbage collector
rm(list = ls())
gc(reset = TRUE)
## Install geneorama from GitHub when it is not installed; devtools is
## installed first when it is missing too
if (!("geneorama" %in% rownames(installed.packages()))) {
  if (!("devtools" %in% rownames(installed.packages()))) {
    install.packages("devtools")
  }
  devtools::install_github("geneorama/geneorama")
}
## Load libraries (only data.table is requested), then source the project
## helpers kept under functions/
geneorama::detach_nonstandard_packages()
# geneorama::loadinstall_libraries(c("geneorama", "data.table"))
geneorama::loadinstall_libraries("data.table")
geneorama::sourceDir("functions/")

##==============================================================================
## LOAD CACHED RDS FILES
##==============================================================================
business             <- readRDS("data/bus_license_filtered.Rds")
crime                <- readRDS("data/crime_filtered.Rds")
foodInspect          <- readRDS("data/food_inspections_filtered.Rds")
garbageCarts         <- readRDS("data/garbage_carts_filtered.Rds")
sanitationComplaints <- readRDS("data/sanitation_code_filtered.Rds")

##==============================================================================
## FOOD INSPECTIONS
##==============================================================================
## (Records with a missing inspection date are already removed by
## 11_Filter_data.R, which writes the filtered file loaded above.)

## Tabulate violation types
## 1) Split violation description by "| "
## 2) use regex to extract leading digits of code number
## 3) create indicator matrix of code violations
## 4) use apply to total up each group of code violations
vio <- strsplit(foodInspect$Violations, "| ", fixed = TRUE)
vio_nums <- lapply(vio,
                   function(item) regmatches(x = item,
                                             m = gregexpr(pattern = "^[0-9]+",
                                                          text = item)))
vio_mat <- geneorama::list2matrix(vio_nums, count = TRUE)
## Order the columns by numeric code. drop = FALSE keeps vio_mat a matrix even
## when it has a single column.
vio_mat <- vio_mat[ , order(as.numeric(colnames(vio_mat))), drop = FALSE]
colnames(vio_mat)
range(vio_mat)

## Sum each row over the columns whose code falls in the given range.
## drop = FALSE matters here: without it, a range matching exactly one column
## collapses the subset to a plain vector and apply() stops with an error.
foodInspect$criticalCount <- apply(
  vio_mat[ , colnames(vio_mat) %in% 1:14, drop = FALSE], 1, sum)
foodInspect$seriousCount <- apply(
  vio_mat[ , colnames(vio_mat) %in% 15:29, drop = FALSE], 1, sum)
foodInspect$minorCount <- apply(
  vio_mat[ , colnames(vio_mat) %in% 30:44, drop = FALSE], 1, sum)

## The raw text and the intermediate objects are no longer needed
foodInspect$Violations <- NULL
rm(vio, vio_nums, vio_mat)

## Key by license, then inspection date, so that the rows of each license are
## processed CHRONOLOGICALLY
setkey(foodInspect, License, Inspection_Date)

## 1 / 0 indicators for the "Pass" and "Fail" results
foodInspect[, `:=`(pass_flag = ifelse(Results == "Pass", 1, 0),
                   fail_flag = ifelse(Results == "Fail", 1, 0))]

## Per-license shifted copies of the fail flag and the violation counts
## (filled with 0); presumably geneorama::shift(x, -1, fill) yields the value
## from the license's previous inspection -- confirm against geneorama
foodInspect[, `:=`(pastFail     = geneorama::shift(fail_flag, -1, 0),
                   pastCritical = geneorama::shift(criticalCount, -1, 0),
                   pastSerious  = geneorama::shift(seriousCount, -1, 0),
                   pastMinor    = geneorama::shift(minorCount, -1, 0)),
            by = License]


## Calculate time since last inspection (date difference divided by 365).
## An NA marks a license's first inspection: record that in the indicator
## variable firstRecord, then replace the NA with 2.
foodInspect[,
            timeSinceLast := as.numeric(
              Inspection_Date -
                geneorama::shift(Inspection_Date, -1, NA)) / 365,
            by = License]
foodInspect[, firstRecord := ifelse(is.na(timeSinceLast), 1, 0)]
foodInspect[is.na(timeSinceLast), timeSinceLast := 2]
# hist(foodInspect$timeSinceLast)
# foodInspect[, timeSinceLast := pmin(timeSinceLast, 2)]
# hist(foodInspect$timeSinceLast)

# foodInspect[License==40]
# foodInspect[License==62]
# foodInspect[License==104]

##==============================================================================
## ATTACH BUSINESS LICENSE DATA
##==============================================================================

## Combined ward / precinct label of the form "w_<WARD>_p_<PRECINCT>"
business[, WP := paste("w", WARD, "p", PRECINCT, sep = "_")]


## Matching food licenses in business:
# inin(foodInspect$License, business$LICENSE_NUMBER)
# table(unique(foodInspect$License) %in% business$LICENSE_NUMBER)
# found <- unique(foodInspect$License)[unique(foodInspect$License) %in% business$LICENSE_NUMBER]
# notfound <- unique(foodInspect$License)[!unique(foodInspect$License) %in% business$LICENSE_NUMBER]
# set.seed(1);clipper(sample(found)[1:10])
# set.seed(1);clipper(sample(notfound)[1:10])
# rm(found, notfound)


# load("DATA_ORIGINAL/original_training_data_20140129v01.Rdata")
# inin(train$license_, found)
# inin(train$license_, notfound)
# train[train$license_==notfound[1]]

# train[grep("104", train$license_),]
# train[train$license_=="104",]

# range(foodInspect$Inspection_Date)
# range(business$DATE_ISSUED)
# range(business$APPLICATION_CREATED_DATE, na.rm=T)
# range(business$DATE_ISSUED, na.rm=T)
# range(business$LICENSE_STATUS_CHANGE_DATE, na.rm=T)

## Number of business rows per license number
business[, .N, by = LICENSE_NUMBER]

## Spot checks: the same license looked up in both tables
foodInspect[License == "349"]
business[LICENSE_NUMBER == "349"]

foodInspect[License == "1593938"]
business[LICENSE_NUMBER == "1593938"]

foodInspect[License == "1892716"]
business[LICENSE_NUMBER == "1892716"]

foodInspect[License == "18236"]
business[LICENSE_NUMBER == "18236"]

## License numbers of business rows whose term starts before it expires,
## compared with the license numbers in the inspections
bus <- business[LICENSE_TERM_START_DATE < LICENSE_TERM_EXPIRATION_DATE,
                LICENSE_NUMBER]
fd <- foodInspect$License
length(unique(bus))
geneorama::inin(bus, fd)

## Merge over time periods
## Each inspection becomes a zero-length interval (start and end are both
## Inspection_Date) and is overlap-joined, per license, against the license
## term intervals. Only business rows whose term starts before it expires are
## used; mult = "first" keeps the first matching term and nomatch = NA keeps
## inspections that match no term.
dat <- foverlaps(foodInspect[i = TRUE,
                             j = .SD,
                             keyby = list(License,
                                          Inspection_Date = Inspection_Date,
                                          Inspection_Date_end = Inspection_Date)],
                 business[i = LICENSE_TERM_START_DATE < LICENSE_TERM_EXPIRATION_DATE,
                          j = .SD,
                          keyby = list(LICENSE_NUMBER,
                                       LICENSE_TERM_START_DATE,
                                       LICENSE_TERM_EXPIRATION_DATE)],
                 mult = "first",
                 type = "any", nomatch = NA)
str(dat)

## How many merged rows have a missing ID
dat[, table(is.na(ID))]

## Earliest / latest dates found in the business data
business[, list(minDate = min(DATE_ISSUED),
                maxDate = max(LICENSE_TERM_EXPIRATION_DATE),
                payment_date = min(PAYMENT_DATE),
                license_start_date = min(LICENSE_TERM_START_DATE))]

## SSA values per license.
## NOTE(review): `mult` applies to joins supplied through `i`, so it
## presumably has no effect in this call -- confirm the intent.
business[, list(SSA), list(License = LICENSE_NUMBER), mult = "first"]
business[, .N, list(License = LICENSE_NUMBER, SSA)]

## Only data.table is attached in this script, so the geneorama helper is
## called through its namespace like every other geneorama call in this file
geneorama::NAsummary(business)
nrow(foodInspect)
temp <- merge(foodInspect,
              business[, list(.N), list(License = LICENSE_NUMBER, SSA)],
              by = "License",
              all.y = FALSE)
nrow(temp)
nrow(foodInspect)
nrow(business)

## NOTE(review): these two merges match License against LICENSE_ID, whereas
## the code above matches it against LICENSE_NUMBER -- confirm which is meant.
merge(foodInspect,
      business[, list(SSA, License = LICENSE_ID)],
      by = "License")
merge(foodInspect,
      business[, list(SSA), keyby = list(License = LICENSE_ID)],
      by = "License")






Loading

0 comments on commit 64d86d6

Please sign in to comment.