#Tutorial and Tips for Numerai
#***
#The stock market is the hardest data science problem in the world.
#If it seems easy, you're doing it wrong.
#We don't know how to solve this problem. But here is what we know.
#We need your help. You take it from here.
#***
library(Metrics)
library(gbm)
library(randomForest)
# for automatic download of data and upload of submissions, have a look at the Rnumerai package
# for the latest development release use devtools::install_github("Omni-Analytics-Group/Rnumerai") or pak::pkg_install("Omni-Analytics-Group/Rnumerai")
# for the CRAN version use install.packages("Rnumerai")
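#a minimal, hypothetical sketch of fetching the data with Rnumerai; the function
#names and signatures below are assumptions, so check the package documentation before use
#library(Rnumerai)
#data_dir<-tempdir()
#download_data(data_dir)  #assumed to download the tournament data files into data_dir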
print("THIS SCRIPT DOWNSAMPLES DATA AND WILL LIKELY UNDERPERFORM THE example_predictions.csv")
#the training data is used to train your model to predict the targets
training_data<-read.csv("numerai_training_data.csv", header=T)
#the tournament data is the data that Numerai uses to evaluate your model
tournament<-read.csv("numerai_tournament_data.csv", header=T)
#the tournament data contains validation data, test data and live data
#validation is a hold-out set, out of sample and further in time (higher eras) than the training data
validation<-tournament[tournament$data_type=="validation",]
benchmark<-0.002
consistency<-function(data_with_probs, target_name){
  unique_eras<-unique(data_with_probs$era)
  era_correlation<-vector()
  for(i in seq_along(unique_eras)){
    this_data<-data_with_probs[data_with_probs$era==unique_eras[i],]
    era_correlation[i]<-cor(this_data[,names(this_data)==target_name], this_data$prediction, method="spearman")
  }
  consistency<-sum(era_correlation>benchmark)/length(era_correlation)
  consistency
}
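#a quick sanity check of the consistency metric on a tiny synthetic data frame:
#two eras with perfectly monotone predictions in both, so consistency should be 1
toy<-data.frame(era=c("era1","era1","era1","era2","era2","era2"),
                target=c(0,0.25,0.5,0,0.5,1),
                prediction=c(0.1,0.2,0.4,0.2,0.5,0.8))
consistency(toy, "target")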
#the training data is large, so we reduce it by 10x in order to speed up this script
#NB: reducing the training data like this is not recommended for a production-ready model
nrow(training_data)
set.seed(10)
#to skip the downsampling, set sample_rows to 1:nrow(training_data); note the script will take much longer to run
sample_rows<-sample(1:nrow(training_data), ceiling(nrow(training_data)/10))
train<-training_data[sample_rows,]
#we remove the id, era, and data_type columns for now
train<-train[,-c(1:3)]
nrow(train)
#there are a large number of features
ncol(train)
#the features come in a number of groups of different sizes (intelligence, charisma, strength, dexterity, constitution, wisdom)
#Numerai does not disclose what the features or feature groups mean; they are abstract and obfuscated
#however, features within a feature group tend to be related in some way
#models which concentrate their feature importance in one group tend to do badly for long stretches of time (eras)
head(names(train),20)
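#count the features per group by their name prefix, assuming the
#feature_<group><number> naming convention visible in head() above
table(sub("[0-9]+$", "", grep("^feature", names(train), value=TRUE)))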
#the target variable has 5 ordinal outcomes (0,0.25,0.5,0.75,1)
#we train a model on the features to predict this target
summary(train$target)
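#the counts of each of the five ordinal target values
table(train$target)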
#on this dataset, feature importance analysis is very important
#we build a random forest to understand which features tend to improve the model out of bag
#because stocks within eras are not independent, we use small bag sizes (sampsize=10%) so the out of bag estimate is meaningful
#we use a small subset of features for each tree to build a more feature balanced forest (mtry=10%)
set.seed(10)
forest<-randomForest(target~., data=train, ntree=50, mtry=ceiling(0.1*(ncol(train)-1)), sampsize=ceiling(0.1*nrow(train)), importance=T, maxnodes=5)
#a good model might drop the bad features according to the forest before training a final model
#if a feature group or feature is too good, it might also be a good idea to drop it to improve the feature balance and the consistency of the model
imp<-importance(forest)
imp<-imp[order(imp[,1], decreasing = FALSE),]
head(imp)
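#a sketch of acting on the importance ranking: drop the weakest features before
#training a final model (the cutoff of 10 features is an arbitrary illustration, not a tuned value)
worst_features<-rownames(imp)[1:10]
train_pruned<-train[,!(names(train)%in%worst_features)]
ncol(train_pruned)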
#based on the forest we can generate predictions on the validation set
#the validation set contains eras further in time than the training set
#the validation set is not representative of the live data; no one knows how the live data will behave
#usually train performance > validation performance > live performance because live data is many eras into the future from the training data
predictions<-predict(forest, validation)
head(predictions)
#Numerai measures performance based on Rank Correlation between your predictions and the true targets
cor(validation$target, predictions, method="spearman")
val<-validation
val$prediction<-predictions
#consistency is the fraction of eras (months) where the model achieves better correlation with the targets than the benchmark
consistency(val, "target")
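#the per-era correlations behind the consistency number are worth inspecting
#to spot long stretches of bad eras
era_cors<-sapply(as.character(unique(val$era)), function(e){
  d<-val[val$era==e,]
  cor(d$target, d$prediction, method="spearman")
})
summary(era_cors)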
#we try a gbm model; we also choose a low bag fraction of 10% as a strategy to deal with within-era non-independence (which is a property of this data)
#(if you take two different samples from the same era, the second is not really out of sample because its observations occurred in the same era)
#having small bags also improves the out of bag estimate for the optimal number of trees
set.seed(10)
model<-gbm(target~., data=train, n.trees=50, shrinkage=0.01, interaction.depth=5, train.fraction=1, bag.fraction=0.1, verbose=T)
#looking at the relative importance of the features we can see the model relies more on some features than others
head(summary(model))
best.iter <- gbm.perf(model, method="OOB")
best.iter
#we check performance of the gbm model on the out of sample validation data
predictions<-predict.gbm(model, validation, n.trees=best.iter, type="response")
head(predictions)
cor(validation$target, predictions, method="spearman")
val<-validation
val$prediction<-predictions
consistency(val, "target")
#the gbm model and the random forest have the same consistency (number of eras where correlation > benchmark) even though their correlations are different
#improving consistency can be more important than improving standard machine learning metrics like RMSE
#good models might train in such a way as to minimize the error across eras (improve consistency) not just reduce the error on each training example
#so far we have ignored eras for training
#eras are in order of time; for example, era4 is before era5
#the reason to use eras is that cross validation on rows will tend to overfit (rows within eras are not independent)
#so it's much better to cross validate across eras, for example: train a model on a subset of eras and test it on the held-out eras
#this will give a better estimate of how well your model will generalize
#the validation set is not special; it is just an out of sample set of eras greater than the training set
#some users might choose to train on the validation set as well
#to give you a sense of how to use eras we train a model on the first half of the eras and test it on the second half
#we reuse the 10x downsampled rows (sample_rows) from above to keep the script fast
ordered_eras<-unique(training_data$era)
train<-training_data[sample_rows,]
first_half<-train[train$era%in%ordered_eras[1:ceiling(length(ordered_eras)/2)],]
second_half<-train[!(train$id%in%first_half$id),]
#we remove the id, era and data_type columns and train a gbm model on the first half of the data
set.seed(10)
model<-gbm(target~., data=first_half[,-c(1:3)], n.trees=100, shrinkage=0.01, interaction.depth=5, train.fraction=1, bag.fraction=0.1, verbose=T)
best.iter <- gbm.perf(model, method="OOB")
best.iter
predictions<-predict.gbm(model, second_half, n.trees=best.iter, type="response")
#our correlation score is good; what we learned appears to generalize well on the second half of the eras
cor(second_half$target, predictions, method="spearman")
sec<-second_half
sec$prediction<-predictions
#consistency is the fraction of eras (months) where the model achieves better correlation with the targets than the benchmark
consistency(sec, "target")
#but now we try to build a model on the second half of the eras and predict on the first half
set.seed(10)
model<-gbm(target~., data=second_half[,-c(1:3)], n.trees=100, shrinkage=0.01, interaction.depth=5, train.fraction=1, bag.fraction=0.1, verbose=T)
best.iter <- gbm.perf(model, method="OOB")
best.iter
predictions<-predict.gbm(model, first_half, n.trees=best.iter, type="response")
#our correlation score is surprisingly low and surprisingly different from when we trained on the first half
#this means our model is not very consistent, and it's possible that we will see unexpectedly low performance on the test set or live set
#it also shows that our validation performance is likely to greatly overestimate our live performance; this era-wise cross validation is more valuable
#a model that performs the same whether trained on the first half or the second half would likely be more consistent
#and would likely perform better on the tournament data and the live data
cor(first_half$target, predictions, method="spearman")
fir<-first_half
fir$prediction<-predictions
#consistency is the fraction of eras (months) where the model achieves better correlation with the targets than the benchmark
consistency(fir, "target")
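#the half-and-half split above is just 2-fold era-wise cross validation; below is
#a sketch of generalizing it to k folds of contiguous eras (k=4 is an arbitrary choice)
k<-4
era_folds<-split(ordered_eras, ceiling(seq_along(ordered_eras)/(length(ordered_eras)/k)))
cv_cors<-sapply(era_folds, function(held_out_eras){
  train_fold<-train[!(train$era%in%held_out_eras),]
  test_fold<-train[train$era%in%held_out_eras,]
  set.seed(10)
  m<-gbm(target~., data=train_fold[,-c(1:3)], n.trees=100, shrinkage=0.01, interaction.depth=5, train.fraction=1, bag.fraction=0.1)
  p<-predict(m, test_fold, n.trees=gbm.perf(m, method="OOB", plot.it=FALSE), type="response")
  cor(test_fold$target, p, method="spearman")
})
cv_cors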
#Numerai only pays models with correlations that beat the benchmark on the live portion of the tournament data
#to submit predictions from your model to Numerai, predict on the entire tournament data
#we choose to use our original forest model for our final submission
tournament$prediction<-predict(forest, tournament)
#create your submission
submission<-subset(tournament, select=c(id, prediction))
head(submission)
#save your submission and now upload it to https://numer.ai
write.csv(submission, file="submission.csv", row.names=F)
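#a hypothetical sketch of uploading with Rnumerai instead of the website; the
#function name and signature below are assumptions, so check the package documentation
#library(Rnumerai)
#submit_predictions(submission)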