Skip to content

Commit

Permalink
Changes to allow finalization of pull request.
Browse files Browse the repository at this point in the history
  • Loading branch information
geneorama committed Dec 8, 2014
1 parent 4f6d775 commit 4413991
Show file tree
Hide file tree
Showing 5 changed files with 114 additions and 88 deletions.
23 changes: 8 additions & 15 deletions CODE/10_download_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -97,22 +97,29 @@ unlink(file.path(DataDir, "sanitation_code"), recursive = T, force=T)
## SMALL FIXES
##==============================================================================

## read in data
## read in data that has been downloaded
business <- readRDS(file.path(DataDir, "bus_license.Rds"))
crime <- readRDS(file.path(DataDir, "crime.Rds"))
foodInspect <- readRDS(file.path(DataDir, "food_inspections.Rds"))
garbageCarts <- readRDS(file.path(DataDir, "garbage_carts.Rds"))
sanitationComplaints <- readRDS(file.path(DataDir, "sanitation_code.Rds"))

## Convert any integer columns to numeric
## Although numeric takes up more space and is slightly slower, keeping these
## fields as numeric avoids problems with integer overflow and models that
## can't handle integers.
geneorama::convert_datatable_IntNum(business)
geneorama::convert_datatable_IntNum(crime)
geneorama::convert_datatable_IntNum(foodInspect)
geneorama::convert_datatable_IntNum(garbageCarts)
geneorama::convert_datatable_IntNum(sanitationComplaints)

## Ensure that Arrest and Domestic are Logical values
crime[ , Arrest := as.logical(Arrest)]
crime[ , Domestic := as.logical(Domestic)]

## If you are inclined, uncomment and view the structures of your downloaded
## data before saving to see if it makes sense.
# str(business)
# str(crime)
# str(foodInspect)
Expand All @@ -129,17 +136,3 @@ saveRDS(crime , file.path(DataDir, "crime.Rds"))
saveRDS(foodInspect , file.path(DataDir, "food_inspections.Rds"))
saveRDS(garbageCarts , file.path(DataDir, "garbage_carts.Rds"))
saveRDS(sanitationComplaints , file.path(DataDir, "sanitation_code.Rds"))














2 changes: 0 additions & 2 deletions CODE/11_Filter_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,5 +69,3 @@ sanitationComplaints <- sanitationComplaints[!is.na(Latitude) & !is.na(Longitude
sanitationComplaints <- sanitationComplaints[Status %in% c("Completed", "Open")]
# sanitationComplaints$status <- NULL
saveRDS(sanitationComplaints, file.path(DataDir, "sanitation_code_filtered.Rds"))


8 changes: 5 additions & 3 deletions CODE/12_Merge.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

stop()
stop() ## This stop is to prevent *accidental* execution of the entire script

##==============================================================================
## INITIALIZE
Expand Down Expand Up @@ -164,8 +164,10 @@ for (j in match(colnames(OtherCategories)[-1], colnames(dat))) {
weather$date <- as.IDate(weather$date)
weather_new$date <- as.IDate(weather_new$date, format="%m/%d/%y")
weather <- weather[order(weather$date), ]
weather[nrow(weather), ]
# weather_new[1, ]

## Check that the new and old dates line up, then combine
# tail(weather, 1)
# head(weather_new, 1)
weather <- rbind(weather, weather_new[-1, ])
rm(weather_new)

Expand Down
76 changes: 52 additions & 24 deletions CODE/20_analyze_data.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@


## This file is basically a sandbox of examples and has nothing to do with the
## actual analysis.
##
## A few restaurants / schools / businesses were picked randomly just to test
## the matching logic between databases, and a few variables were plotted.
## However, nothing was saved or used anywhere else.
##


##==============================================================================
## INITIALIZE
##==============================================================================
Expand All @@ -10,29 +21,40 @@ if(!"geneorama" %in% rownames(installed.packages())){
devtools::install_github('geneorama/geneorama')}
## Load libraries
geneorama::detach_nonstandard_packages()
# geneorama::loadinstall_libraries(c("geneorama", "data.table"))
geneorama::loadinstall_libraries(c("data.table"))
geneorama::sourceDir("functions/")
geneorama::sourceDir("CODE/functions/")

##==============================================================================
## LOAD CACHED RDS FILES
## DEFINE GLOBAL VARIABLES / MANUAL CODE
##==============================================================================
business <- readRDS("DATA/bus_license.Rds")
crime <- readRDS("DATA/crime.Rds")
foodInspect <- readRDS("DATA/food_inspections.Rds")
garbageCarts <- readRDS("DATA/garbage_carts.Rds")
sanitationComplaints <- readRDS("DATA/sanitation_code.Rds")

DataDir <- "DATA/20141110"

##==============================================================================
## LOAD PREVIOUS DATA
## LOAD DATA
##==============================================================================

## LOAD CACHED RDS FILES
business <- readRDS(file.path(DataDir, "bus_license.Rds"))
crime <- readRDS(file.path(DataDir, "crime.Rds"))
foodInspect <- readRDS(file.path(DataDir, "food_inspections.Rds"))
garbageCarts <- readRDS(file.path(DataDir, "garbage_carts.Rds"))
sanitationComplaints <- readRDS(file.path(DataDir, "sanitation_code.Rds"))

## Filter crime to make more managable
crime <- crime[Date>as.IDate('2011-07-01')]
gc()

## LOAD DATA FROM PREVIOUS ANALYSIS
load("DATA/recreated_training_data_20141103v02.Rdata")
prev <- as.data.table(rbind(train,tune,evaluate))
rm(train,tune,evaluate)
prev[ , inspection_id := as.integer(inspection_id)]
prev[ , license_ := as.integer(license_)]
prev <- prev[!duplicated(inspection_id)]

## LOAD DATA FOR CURRENT ANALYSIS
dat <- readRDS("DATA/20141110/dat_with_inspector.Rds")

##==============================================================================
## BUSINESS
##==============================================================================
Expand All @@ -46,22 +68,28 @@ business[ , .N, is.na(LICENSE_TERM_EXPIRATION_DATE)]
## CRIME
##==============================================================================
crime[ , .N, Primary_Type][order(-N)]

## Based on previous factor level assignment
lvls <- crime[ , .N, Primary_Type][order(-N), Primary_Type]
crime[ , Primary_Type := factor(x = Primary_Type, levels = lvls)]
rm(lvls)


##==============================================================================
## FOOD INSPECTIONS
##==============================================================================
str(foodInspect)
foodInspect[,.N,Inspection_Type]
foodInspect[ , .N, Inspection_Type]

hist(foodInspect$timeSinceLast)
hist(dat$timeSinceLast, main="Histogram of dat$timeSinceLast (current data)")
hist(prev$timeSinceLast, main="Histogram of dat$timeSinceLast (previous analysis)")

## Some example licenses
foodInspect[License==40]
foodInspect[License==62]
foodInspect[License==104]
dat[License==40]
dat[License==62]
dat[License==104]


##==============================================================================
Expand All @@ -78,7 +106,7 @@ prev[grep("CARNICERIA LA GLORIA NO. 2", doing_business_as_name)]
prev[license_==1514813][order(inspection_date)]
prev[license_==1514813,
list(license_, doing_business_as_name,pastFail),
keyby=inspection_date]
keyby = inspection_date]

## Checking merge
setkey(foodInspect, Inspection_ID)
Expand Down Expand Up @@ -164,23 +192,23 @@ rm(temp)

business[,.N,LICENSE_NUMBER]

foodInspect[License=="349"]
business[LICENSE_NUMBER=="349"]
foodInspect[License=="1593938"]
business[LICENSE_NUMBER=="1593938"]
foodInspect[License=="1892716"]
business[LICENSE_NUMBER=="1892716"]
foodInspect[License=="18236"]
business[LICENSE_NUMBER=="18236"]
foodInspect[License==349]
business[LICENSE_NUMBER==349]
foodInspect[License==1593938]
business[LICENSE_NUMBER==1593938]
foodInspect[License==1892716]
business[LICENSE_NUMBER==1892716]
foodInspect[License==18236]
business[LICENSE_NUMBER==18236]


## Matching food licenses in business:
geneorama::inin(foodInspect$License, business$LICENSE_NUMBER)
table(unique(foodInspect$License) %in% business$LICENSE_NUMBER)
found <- unique(foodInspect$License)[unique(foodInspect$License) %in% business$LICENSE_NUMBER]
notfound <- unique(foodInspect$License)[!unique(foodInspect$License) %in% business$LICENSE_NUMBER]
set.seed(1);clipper(sample(found)[1:10])
set.seed(1);clipper(sample(notfound)[1:10])
# set.seed(1);clipper(sample(found)[1:10])
# set.seed(1);clipper(sample(notfound)[1:10])
rm(found, notfound)


Expand Down
93 changes: 49 additions & 44 deletions CODE/30_glmnet_model.R
Original file line number Diff line number Diff line change
Expand Up @@ -62,18 +62,11 @@ xmat <- dat[ , list(criticalFound,
# risk = as.factor(Risk),
# facility_type = as.factor(Facility_Type),
timeSinceLast),
keyby = "Inspection_ID"]

MyFormula <- ~ -1 + Inspection_ID + criticalFound + Inspector_Assigned +
pastSerious + ageAtInspection + pastCritical +
consumption_on_premises_incidental_activity + tobacco_retail_over_counter +
temperatureMax + heat_burglary + heat_sanitation + heat_garbage +
# risk + facility_type +
timeSinceLast

mm <- model.matrix(MyFormula, data=xmat[,all.vars(MyFormula),with=F])
str(xmat)
keyby = Inspection_ID]
mm <- model.matrix(criticalFound ~ . -1, data=xmat[ , -1, with=F])
mm <- as.data.table(mm)
str(mm)
colnames(mm)

##==============================================================================
## CREATE TEST / TRAIN PARTITIONS
Expand All @@ -89,68 +82,81 @@ iiTest <- dat[ , which(Inspection_Date > "2014-07-01")]
nrow(dat)
nrow(xmat)
nrow(mm)
dat[!Inspection_ID %in% mm[, "Inspection_ID"]]


##==============================================================================
## GLMNET MODEL
##==============================================================================
# fit ridge regression, alpha = 0, only inspector coefficients penalized
net <- glmnet(x = mm[iiTrain, -(1:2)],
y = mm[iiTrain, 2],
family = "binomial",
alpha = 0,
penalty.factor = ifelse(grepl("^Inspector.Assigned", colnames(mm)), 1, 0))

# see what regularization parameter 'lambda' is optimal on tune set
errors <- sapply(net$lambda,
function(lam)
logLik(p = as.numeric(predict(net,
newx = mm[iiTrain,-(1:2)],
s=lam,
type="response")),
y = mm[iiTrain ,2]))
plot(x=log(net$lambda), y=errors, type="l")
model <- glmnet(x = as.matrix(mm[iiTrain]),
y = xmat[iiTrain, criticalFound],
family = "binomial",
alpha = 0,
penalty.factor = ifelse(grepl("^Inspector.Assigned", colnames(mm)), 1, 0))


## See what regularization parameter 'lambda' is optimal on tune set
## (We are essentially usin the previous hardcoded value)
errors <- sapply(model$lambda,
function(lam)
logLik(p = predict(model,
newx = as.matrix(mm[iiTrain]),
s=lam,
type="response")[,1],
y = xmat[iiTrain, criticalFound]))
plot(x=log(model$lambda), y=errors, type="l")
which.min(errors)
model$lambda[which.min(errors)]
## manual lambda selection
w.lam <- 100
lam <- net$lambda[w.lam]
coef <- net$beta[,w.lam]
lam <- model$lambda[w.lam]
coef <- model$beta[,w.lam]
inspCoef <- coef[grepl("^Inspector.Assigned",names(coef))]
inspCoef <- inspCoef[order(-inspCoef)]
head(inspCoef,10); tail(inspCoef,10)
## coefficients for the inspectors, and for other variables
inspCoef
coef[!grepl("^Inspector.Assigned",names(coef))]

## ATTACH PREDICTIONS TO XMAT AND DAT
xmat$glm_pred <- as.numeric(predict(net, newx=mm[, -(1:2)],
s=lam,
type="response"))
dat$glm_pred <- as.numeric(predict(net, newx=mm[, -(1:2)],
s=lam,
type="response"))

## By the way, if we had knowledge of the future, we would have chosen a
## different lambda
errorsTest <- sapply(model$lambda,
function(lam)
logLik(p = predict(model,
newx = as.matrix(mm[iiTest]),
s=lam,
type="response")[,1],
y = xmat[iiTest, criticalFound]))
plot(x=log(model$lambda), y=errorsTest, type="l")
which.min(errorsTest)
model$lambda[which.min(errorsTest)]

## ATTACH PREDICTIONS TO DAT
dat$glm_pred <- predict(model, newx=as.matrix(mm),
s=lam,
type="response")[,1]

# show gini performance of inspector model on tune data set
xmat[iiTest, gini(glm_pred, criticalFound, plot=TRUE)]
dat[iiTest, gini(glm_pred, criticalFound, plot=TRUE)]

## Calculate confusion matrix values for evaluation
calculate_confusion_values(actual = xmat[iiTest, criticalFound],
expected = xmat[iiTest, glm_pred],
expected = dat[iiTest, glm_pred],
r = .25)

## Calculate matrix of confusion matrix values for evaluation
confusion_values_test <- t(sapply(seq(0, 1 ,.01),
calculate_confusion_values,
actual = xmat[iiTest, criticalFound],
expected = xmat[iiTest, glm_pred]))
expected = dat[iiTest, glm_pred]))
confusion_values_test
ggplot(reshape2::melt(as.data.table(confusion_values_test),
id.vars="r")) +
aes(x=r, y=value, colour=variable) + geom_line() +
geom_hline(yintercept = c(0,1))



##==============================================================================
## CALCULATION OF LIFT
##==============================================================================
## TEST PERIOD: Date range
dat[iiTest, range(Inspection_Date)]
## TEST PERIOD: Total inspections
Expand All @@ -160,7 +166,6 @@ dat[iiTest, sum(criticalCount)]
## TEST PERIOD: Inspections with any critical violations
dat[iiTest, sum(criticalFound)]


## Subset test period
datTest <- dat[iiTest]
## Identify first period
Expand Down

0 comments on commit 4413991

Please sign in to comment.