# borrowed from Troy Walters
# https://www.kaggle.com/captcalculator/r-xgboost-with-caret-tuning-and-gini-score/code
# This is a minimal framework for training xgboost in R, using caret for the cross-validation/grid
# tuning and the normalized Gini metric for scoring. The number of CV folds and the size of the
# tuning grid are kept small to stay under the Kaggle kernel limits. To improve the score, increase
# nrounds and expand the tuning grid.
library(data.table)
library(caret)
library(xgboost)
library(verification)
# Read train and test data (the test set is needed for the submission predictions at the end)
dtrain <- fread('./train.csv')
dtest <- fread('./test.csv')
# Check data size in memory
print("Training data size in RAM:");
print(object.size(dtrain), units = 'Mb')
# print training data dimensions
print(dim(dtrain))
# collect names of all categorical variables
cat_vars <- names(dtrain)[grepl('_cat$', names(dtrain))]
# turn categorical features into factors
dtrain[, (cat_vars) := lapply(.SD, factor), .SDcols = cat_vars]
dtest[, (cat_vars) := lapply(.SD, factor), .SDcols = cat_vars]
# one hot encode the factor levels
dtrain <- model.matrix(~ . - 1, data = dtrain)
dtest <- model.matrix(~ . - 1, data = dtest)
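# Illustrative only (toy factor, not from this dataset): model.matrix with the `- 1` term expands
# each factor into one 0/1 indicator column per level, e.g.
# model.matrix(~ f - 1, data.frame(f = factor(c("a", "b", "a"))))  # columns fa, fb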
# create index for train/test split
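# (The original kernel does not set a seed; uncomment the line below if you want a reproducible
#  split. The value 2017 is an arbitrary choice.)
# set.seed(2017)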
train_index <- sample(c(TRUE, FALSE), size = nrow(dtrain), replace = TRUE, prob = c(0.8, 0.2))
# perform the x/y, train/test split
x_train <- dtrain[train_index, 3:ncol(dtrain)]
y_train <- as.factor(dtrain[train_index, 'target'])
x_test <- dtrain[!train_index, 3:ncol(dtrain)]
y_test <- as.factor(dtrain[!train_index, 'target'])
# Convert target factor levels to 0 = "No" and 1 = "Yes" to avoid this error when predicting class probs:
# https://stackoverflow.com/questions/18402016/error-when-i-try-to-predict-class-probabilities-in-r-caret
levels(y_train) <- c("No", "Yes")
levels(y_test) <- c("No", "Yes")
# normalized gini function taken from:
# https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703
normalizedGini <- function(aa, pp) {
  Gini <- function(a, p) {
    if (length(a) != length(p)) stop("Actual and Predicted need to be equal lengths!")
    temp.df <- data.frame(actual = a, pred = p, range = c(1:length(a)))
    temp.df <- temp.df[order(-temp.df$pred, temp.df$range), ]
    population.delta <- 1 / length(a)
    total.losses <- sum(a)
    null.losses <- rep(population.delta, length(a)) # Hopefully is similar to accumulatedPopulationPercentageSum
    accum.losses <- temp.df$actual / total.losses # Hopefully is similar to accumulatedLossPercentageSum
    gini.sum <- cumsum(accum.losses - null.losses) # Not sure if this is having the same effect or not
    sum(gini.sum) / length(a)
  }
  Gini(aa, pp) / Gini(aa, aa)
}
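# Quick sanity check of the metric (toy vectors added here, not from the competition data):
# a perfect ranking should score 1 and a fully reversed ranking should score -1.
gini_check <- c(1, 1, 0, 0, 0)
print(normalizedGini(gini_check, c(0.9, 0.8, 0.3, 0.2, 0.1)))  # expected:  1
print(normalizedGini(gini_check, c(0.1, 0.2, 0.3, 0.8, 0.9)))  # expected: -1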
# create the normalized gini summary function to pass into caret
# (caret supplies `lev` as the vector of class levels, so lev[2] is "Yes")
giniSummary <- function(data, lev = "Yes", model = NULL) {
  levels(data$obs) <- c('0', '1')
  out <- normalizedGini(as.numeric(levels(data$obs))[data$obs], data[, lev[2]])
  names(out) <- "NormalizedGini"
  out
}
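# Illustrative only (hand-built values, not part of the original kernel): caret hands the summary
# function a data frame with columns obs, pred, and one probability column per class level.
fold_example <- data.frame(obs  = factor(c("Yes", "No", "No"), levels = c("No", "Yes")),
                           pred = factor(c("Yes", "No", "Yes"), levels = c("No", "Yes")),
                           No   = c(0.2, 0.9, 0.6),
                           Yes  = c(0.8, 0.1, 0.4))
print(giniSummary(fold_example, lev = c("No", "Yes")))  # a single named NormalizedGini value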
# create the training control object. Two-fold CV to keep the execution time under the kaggle
# limit. You can increase this as your compute resources allow.
trControl <- trainControl(
  method = 'cv',
  number = 2,
  summaryFunction = giniSummary,
  classProbs = TRUE,
  verboseIter = TRUE,
  allowParallel = FALSE)
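# classProbs = TRUE is what makes the per-class probability columns available to giniSummary
# during resampling (a descriptive note added here; it is not in the original kernel comments).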
# create the tuning grid. Again keeping this small to avoid exceeding kernel memory limits.
# You can expand it as your compute resources allow.
tuneGridXGB <- expand.grid(
  nrounds = c(150),
  max_depth = c(4, 6),
  eta = c(0.01, 0.05, 0.1, 0.2),
  gamma = c(0.01),
  colsample_bytree = c(0.75),
  subsample = c(0.50),
  min_child_weight = c(0))
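# With this grid (2 max_depth values x 4 eta values = 8 combinations) and 2-fold CV, caret fits
# 16 models before the final refit on the full training split.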
start <- Sys.time()
# train the xgboost learner
xgbmod <- train(
  x = x_train,
  y = y_train,
  method = 'xgbTree',
  metric = 'NormalizedGini',
  trControl = trControl,
  tuneGrid = tuneGridXGB)
print(Sys.time() - start)
# make predictions
preds <- predict(xgbmod, newdata = x_test, type = "prob")
preds_final <- predict(xgbmod, newdata = dtest, type = "prob")
# convert test target values back to numeric for gini and roc.plot functions
levels(y_test) <- c("0", "1")
y_test_raw <- as.numeric(levels(y_test))[y_test]
# Diagnostics
print(xgbmod$results)
print(xgbmod$resample)
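# The winning hyperparameter combination from the grid search is stored by caret in bestTune
# (printing it is an addition here, not part of the original kernel):
print(xgbmod$bestTune)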
# plot results (useful for larger tuning grids)
plot(xgbmod)
# score the predictions against test data
normalizedGini(y_test_raw, preds$Yes)
# plot the ROC curve
roc.plot(y_test_raw, preds$Yes, plot.thres = c(0.02, 0.03, 0.04, 0.05))
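# The verification package also exposes roc.area() for a numeric AUC to accompany the ROC plot;
# this call is an addition to the original kernel:
print(roc.area(y_test_raw, preds$Yes)$A)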
# prep the predictions for submission
sub <- data.frame(id = as.integer(dtest[, 'id']), target = preds_final$Yes)
# write to csv
write.csv(sub, 'xgb_submission.csv', row.names = FALSE)