Changes to allow finalization of pull request.

Chicago · Dec 8, 2014 · 4413991 · 4413991
1 parent 4f6d775
commit 4413991
Show file tree

Hide file tree

Showing 5 changed files with 114 additions and 88 deletions.
diff --git a/CODE/10_download_data.R b/CODE/10_download_data.R
@@ -97,22 +97,29 @@ unlink(file.path(DataDir, "sanitation_code"), recursive = T, force=T)
 ## SMALL FIXES
 ##==============================================================================
 
-## read in data
+## read in data that has been downloaded
 business <- readRDS(file.path(DataDir, "bus_license.Rds"))
 crime <-  readRDS(file.path(DataDir, "crime.Rds"))
 foodInspect <- readRDS(file.path(DataDir, "food_inspections.Rds"))
 garbageCarts <- readRDS(file.path(DataDir, "garbage_carts.Rds"))
 sanitationComplaints <- readRDS(file.path(DataDir, "sanitation_code.Rds"))
 
+## Convert any integer columns to numeric
+## Although numeric takes up more space and is slightly slower, keeping these
+## fields as numeric avoids problems with integer overflow and models that
+## can't handle integers.  
 geneorama::convert_datatable_IntNum(business)
 geneorama::convert_datatable_IntNum(crime)
 geneorama::convert_datatable_IntNum(foodInspect)
 geneorama::convert_datatable_IntNum(garbageCarts)
 geneorama::convert_datatable_IntNum(sanitationComplaints)
 
+## Ensure that Arrest and Domestic are Logical values
 crime[ , Arrest := as.logical(Arrest)]
 crime[ , Domestic := as.logical(Domestic)]
 
+## If you are inclined, uncomment and view the structures of your downloaded 
+## data before saving to see if it makes sense.
 # str(business)
 # str(crime)
 # str(foodInspect)
@@ -129,17 +136,3 @@ saveRDS(crime , file.path(DataDir, "crime.Rds"))
 saveRDS(foodInspect , file.path(DataDir, "food_inspections.Rds"))
 saveRDS(garbageCarts , file.path(DataDir, "garbage_carts.Rds"))
 saveRDS(sanitationComplaints , file.path(DataDir, "sanitation_code.Rds"))
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/CODE/11_Filter_data.R b/CODE/11_Filter_data.R
@@ -69,5 +69,3 @@ sanitationComplaints <- sanitationComplaints[!is.na(Latitude) & !is.na(Longitude
 sanitationComplaints <- sanitationComplaints[Status %in% c("Completed", "Open")]
 # sanitationComplaints$status <- NULL
 saveRDS(sanitationComplaints, file.path(DataDir, "sanitation_code_filtered.Rds"))
-
-
diff --git a/CODE/12_Merge.R b/CODE/12_Merge.R
@@ -1,5 +1,5 @@
 
-stop()
+stop()  ## This stop is to prevent *accidental* execution of the entire script
 
 ##==============================================================================
 ## INITIALIZE
@@ -164,8 +164,10 @@ for (j in match(colnames(OtherCategories)[-1], colnames(dat))) {
 weather$date <- as.IDate(weather$date)
 weather_new$date <- as.IDate(weather_new$date, format="%m/%d/%y")
 weather <- weather[order(weather$date), ]
-weather[nrow(weather), ]
-# weather_new[1, ]
+
+## Check that the new and old dates line up, then combine
+# tail(weather, 1)
+# head(weather_new, 1)
 weather <- rbind(weather, weather_new[-1, ])
 rm(weather_new)
 

diff --git a/CODE/20_analyze_data.R b/CODE/20_analyze_data.R
@@ -1,3 +1,14 @@
+
+
+## This file is basically a sandbox of examples and has nothing to do with the
+## actual analysis.  
+##
+## A few restaurants / schools / businesses were picked randomly just to test 
+## the matching logic between databases, and a few variables were plotted.
+## However, nothing was saved or used anywhere else.
+## 
+
+
 ##==============================================================================
 ## INITIALIZE
 ##==============================================================================
@@ -10,29 +21,40 @@ if(!"geneorama" %in% rownames(installed.packages())){
     devtools::install_github('geneorama/geneorama')}
 ## Load libraries
 geneorama::detach_nonstandard_packages()
-# geneorama::loadinstall_libraries(c("geneorama", "data.table"))
 geneorama::loadinstall_libraries(c("data.table"))
-geneorama::sourceDir("functions/")
+geneorama::sourceDir("CODE/functions/")
 
 ##==============================================================================
-## LOAD CACHED RDS FILES
+## DEFINE GLOBAL VARIABLES / MANUAL CODE
 ##==============================================================================
-business <- readRDS("DATA/bus_license.Rds")
-crime <-  readRDS("DATA/crime.Rds")
-foodInspect <- readRDS("DATA/food_inspections.Rds")
-garbageCarts <- readRDS("DATA/garbage_carts.Rds")
-sanitationComplaints <- readRDS("DATA/sanitation_code.Rds")
-
+DataDir <- "DATA/20141110"
 
 ##==============================================================================
-## LOAD PREVIOUS DATA
+## LOAD DATA
 ##==============================================================================
+
+## LOAD CACHED RDS FILES
+business <- readRDS(file.path(DataDir, "bus_license.Rds"))
+crime <-  readRDS(file.path(DataDir, "crime.Rds"))
+foodInspect <- readRDS(file.path(DataDir, "food_inspections.Rds"))
+garbageCarts <- readRDS(file.path(DataDir, "garbage_carts.Rds"))
+sanitationComplaints <- readRDS(file.path(DataDir, "sanitation_code.Rds"))
+
+## Filter crime to make more managable
+crime <- crime[Date>as.IDate('2011-07-01')]
+gc()
+
+## LOAD DATA FROM PREVIOUS ANALYSIS
 load("DATA/recreated_training_data_20141103v02.Rdata")
 prev <- as.data.table(rbind(train,tune,evaluate))
 rm(train,tune,evaluate)
 prev[ , inspection_id := as.integer(inspection_id)]
+prev[ , license_ := as.integer(license_)]
 prev <- prev[!duplicated(inspection_id)]
 
+## LOAD DATA FOR CURRENT ANALYSIS
+dat <- readRDS("DATA/20141110/dat_with_inspector.Rds")
+
 ##==============================================================================
 ## BUSINESS
 ##==============================================================================
@@ -46,22 +68,28 @@ business[ , .N, is.na(LICENSE_TERM_EXPIRATION_DATE)]
 ## CRIME
 ##==============================================================================
 crime[ , .N, Primary_Type][order(-N)]
+
+## Based on previous factor level assignment
 lvls <- crime[ , .N, Primary_Type][order(-N), Primary_Type]
 crime[ , Primary_Type := factor(x = Primary_Type, levels = lvls)]
 rm(lvls)
 
-
 ##==============================================================================
 ## FOOD INSPECTIONS
 ##==============================================================================
 str(foodInspect)
-foodInspect[,.N,Inspection_Type]
+foodInspect[ , .N, Inspection_Type]
 
-hist(foodInspect$timeSinceLast)
+hist(dat$timeSinceLast, main="Histogram of dat$timeSinceLast (current data)")
+hist(prev$timeSinceLast, main="Histogram of dat$timeSinceLast (previous analysis)")
 
+## Some example licenses
 foodInspect[License==40]
 foodInspect[License==62]
 foodInspect[License==104]
+dat[License==40]
+dat[License==62]
+dat[License==104]
 
 
 ##==============================================================================
@@ -78,7 +106,7 @@ prev[grep("CARNICERIA LA GLORIA NO. 2", doing_business_as_name)]
 prev[license_==1514813][order(inspection_date)]
 prev[license_==1514813,
      list(license_, doing_business_as_name,pastFail),
-     keyby=inspection_date]
+     keyby = inspection_date]
 
 ## Checking merge
 setkey(foodInspect, Inspection_ID)
@@ -164,23 +192,23 @@ rm(temp)
 
 business[,.N,LICENSE_NUMBER]
 
-foodInspect[License=="349"]
-business[LICENSE_NUMBER=="349"]
-foodInspect[License=="1593938"]
-business[LICENSE_NUMBER=="1593938"]
-foodInspect[License=="1892716"]
-business[LICENSE_NUMBER=="1892716"]
-foodInspect[License=="18236"]
-business[LICENSE_NUMBER=="18236"]
+foodInspect[License==349]
+business[LICENSE_NUMBER==349]
+foodInspect[License==1593938]
+business[LICENSE_NUMBER==1593938]
+foodInspect[License==1892716]
+business[LICENSE_NUMBER==1892716]
+foodInspect[License==18236]
+business[LICENSE_NUMBER==18236]
 
 
 ## Matching food licenses in business:
 geneorama::inin(foodInspect$License, business$LICENSE_NUMBER)
 table(unique(foodInspect$License) %in% business$LICENSE_NUMBER)
 found <- unique(foodInspect$License)[unique(foodInspect$License) %in% business$LICENSE_NUMBER]
 notfound <- unique(foodInspect$License)[!unique(foodInspect$License) %in% business$LICENSE_NUMBER]
-set.seed(1);clipper(sample(found)[1:10])
-set.seed(1);clipper(sample(notfound)[1:10])
+# set.seed(1);clipper(sample(found)[1:10])
+# set.seed(1);clipper(sample(notfound)[1:10])
 rm(found, notfound)
 
 

diff --git a/CODE/30_glmnet_model.R b/CODE/30_glmnet_model.R
@@ -62,18 +62,11 @@ xmat <- dat[ , list(criticalFound,
                     # risk = as.factor(Risk),
                     # facility_type = as.factor(Facility_Type),
                     timeSinceLast),
-            keyby = "Inspection_ID"]
-
-MyFormula <- ~ -1 + Inspection_ID + criticalFound + Inspector_Assigned +
-    pastSerious + ageAtInspection + pastCritical + 
-    consumption_on_premises_incidental_activity + tobacco_retail_over_counter +
-    temperatureMax + heat_burglary + heat_sanitation + heat_garbage + 
-    # risk + facility_type + 
-    timeSinceLast
-
-mm <- model.matrix(MyFormula, data=xmat[,all.vars(MyFormula),with=F])
-str(xmat)
+            keyby = Inspection_ID]
+mm <- model.matrix(criticalFound ~ . -1, data=xmat[ , -1, with=F])
+mm <- as.data.table(mm)
 str(mm)
+colnames(mm)
 
 ##==============================================================================
 ## CREATE TEST / TRAIN PARTITIONS
@@ -89,68 +82,81 @@ iiTest <- dat[ , which(Inspection_Date > "2014-07-01")]
 nrow(dat)
 nrow(xmat)
 nrow(mm)
-dat[!Inspection_ID %in% mm[, "Inspection_ID"]]
-
 
 ##==============================================================================
 ## GLMNET MODEL
 ##==============================================================================
 # fit ridge regression, alpha = 0, only inspector coefficients penalized
-net <- glmnet(x = mm[iiTrain, -(1:2)],
-              y = mm[iiTrain,  2],
-              family = "binomial", 
-              alpha = 0,
-              penalty.factor = ifelse(grepl("^Inspector.Assigned", colnames(mm)), 1, 0))
-
-# see what regularization parameter 'lambda' is optimal on tune set
-errors <- sapply(net$lambda, 
-                 function(lam) 
-                     logLik(p = as.numeric(predict(net, 
-                                                   newx = mm[iiTrain,-(1:2)], 
-                                                   s=lam, 
-                                                   type="response")), 
-                            y = mm[iiTrain ,2]))
-plot(x=log(net$lambda), y=errors, type="l")
+model <- glmnet(x = as.matrix(mm[iiTrain]),
+                y = xmat[iiTrain,  criticalFound],
+                family = "binomial", 
+                alpha = 0,
+                penalty.factor = ifelse(grepl("^Inspector.Assigned", colnames(mm)), 1, 0))
+
 
+## See what regularization parameter 'lambda' is optimal on tune set
+## (We are essentially usin the previous hardcoded value)
+errors <- sapply(model$lambda, 
+                 function(lam) 
+                     logLik(p = predict(model, 
+                                        newx = as.matrix(mm[iiTrain]), 
+                                        s=lam, 
+                                        type="response")[,1], 
+                            y = xmat[iiTrain, criticalFound]))
+plot(x=log(model$lambda), y=errors, type="l")
 which.min(errors)
+model$lambda[which.min(errors)]
+## manual lambda selection
 w.lam <- 100
-lam <- net$lambda[w.lam]
-coef <- net$beta[,w.lam]
+lam <- model$lambda[w.lam]
+coef <- model$beta[,w.lam]
 inspCoef <- coef[grepl("^Inspector.Assigned",names(coef))]
 inspCoef <- inspCoef[order(-inspCoef)]
-head(inspCoef,10); tail(inspCoef,10)
+## coefficients for the inspectors, and for other variables
+inspCoef
 coef[!grepl("^Inspector.Assigned",names(coef))]
 
-## ATTACH PREDICTIONS TO XMAT AND DAT
-xmat$glm_pred <- as.numeric(predict(net, newx=mm[, -(1:2)], 
-                                    s=lam, 
-                                    type="response"))
-dat$glm_pred <- as.numeric(predict(net, newx=mm[, -(1:2)], 
-                                   s=lam, 
-                                   type="response"))
+
+## By the way, if we had knowledge of the future, we would have chosen a 
+## different lambda
+errorsTest <- sapply(model$lambda, 
+                     function(lam) 
+                         logLik(p = predict(model, 
+                                            newx = as.matrix(mm[iiTest]), 
+                                            s=lam, 
+                                            type="response")[,1], 
+                                y = xmat[iiTest, criticalFound]))
+plot(x=log(model$lambda), y=errorsTest, type="l")
+which.min(errorsTest)
+model$lambda[which.min(errorsTest)]
+
+## ATTACH PREDICTIONS TO DAT
+dat$glm_pred <- predict(model, newx=as.matrix(mm), 
+                        s=lam, 
+                        type="response")[,1]
 
 # show gini performance of inspector model on tune data set
-xmat[iiTest, gini(glm_pred, criticalFound, plot=TRUE)]
 dat[iiTest, gini(glm_pred, criticalFound, plot=TRUE)]
 
 ## Calculate confusion matrix values for evaluation
 calculate_confusion_values(actual = xmat[iiTest, criticalFound],
-                           expected = xmat[iiTest, glm_pred], 
+                           expected = dat[iiTest, glm_pred], 
                            r = .25)
 
 ## Calculate matrix of confusion matrix values for evaluation
 confusion_values_test <- t(sapply(seq(0, 1 ,.01), 
                                   calculate_confusion_values,
                                   actual = xmat[iiTest, criticalFound],
-                                  expected = xmat[iiTest, glm_pred]))
+                                  expected = dat[iiTest, glm_pred]))
 confusion_values_test
 ggplot(reshape2::melt(as.data.table(confusion_values_test), 
                       id.vars="r")) + 
     aes(x=r, y=value, colour=variable) + geom_line() + 
     geom_hline(yintercept = c(0,1))
 
-
-
+##==============================================================================
+## CALCULATION OF LIFT
+##==============================================================================
 ## TEST PERIOD: Date range
 dat[iiTest, range(Inspection_Date)]
 ## TEST PERIOD: Total inspections
@@ -160,7 +166,6 @@ dat[iiTest, sum(criticalCount)]
 ## TEST PERIOD: Inspections with any critical violations
 dat[iiTest, sum(criticalFound)]
 
-
 ## Subset test period
 datTest <- dat[iiTest]
 ## Identify first period
Original file line number	Diff line number	Diff line change
Expand Up		@@ -69,5 +69,3 @@ sanitationComplaints <- sanitationComplaints[!is.na(Latitude) & !is.na(Longitude
		sanitationComplaints <- sanitationComplaints[Status %in% c("Completed", "Open")]
		# sanitationComplaints$status <- NULL
		saveRDS(sanitationComplaints, file.path(DataDir, "sanitation_code_filtered.Rds"))