Skip to content

Commit 96cf527

Browse files
authored
Merge pull request #170 from numerai/ndharasz/fix-ram-issues
decrease feature set sizes to work with google colab
2 parents 43e4017 + 0d5d8fd commit 96cf527

File tree

5 files changed

+33
-21
lines changed

5 files changed

+33
-21
lines changed

example_model.ipynb

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,10 @@
110110
"\n",
111111
"# Load data\n",
112112
"feature_metadata = json.load(open(f\"{DATA_VERSION}/features.json\"))\n",
113-
"features = feature_metadata[\"feature_sets\"][\"medium\"] # use \"all\" for better performance. Requires more RAM.\n",
113+
"features = feature_metadata[\"feature_sets\"][\"small\"]\n",
114+
"# use \"medium\" or \"all\" for better performance. Requires more RAM.\n",
115+
"# features = feature_metadata[\"feature_sets\"][\"medium\"]\n",
116+
"# features = feature_metadata[\"feature_sets\"][\"all\"]\n",
114117
"train = pd.read_parquet(f\"{DATA_VERSION}/train.parquet\", columns=[\"era\"]+features+[\"target\"])\n",
115118
"\n",
116119
"# For better models, join train and validation data and train on all of it.\n",

feature_neutralization.ipynb

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@
465465
},
466466
{
467467
"cell_type": "code",
468-
"execution_count": 3,
468+
"execution_count": null,
469469
"metadata": {
470470
"colab": {
471471
"base_uri": "https://localhost:8080/"
@@ -484,8 +484,11 @@
484484
],
485485
"source": [
486486
"# define the medium features and medium serenity features\n",
487-
"medium_features = feature_sets[\"medium\"]\n",
488-
"med_serenity_feats = list(subgroups[\"medium\"][\"serenity\"])\n",
487+
"# use \"all\" for better performance. Requires more RAM.\n",
488+
"feature_size = \"medium\"\n",
489+
"# feature_size = \"all\"\n",
490+
"medium_features = feature_sets[feature_size]\n",
491+
"med_serenity_feats = list(subgroups[feature_size][\"serenity\"])\n",
489492
"\n",
490493
"# Download the training data and feature metadata\n",
491494
"napi.download_dataset(f\"{DATA_VERSION}/train.parquet\")\n",

hello_numerai.ipynb

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,7 @@
244244
},
245245
{
246246
"cell_type": "code",
247-
"execution_count": 5,
247+
"execution_count": null,
248248
"metadata": {
249249
"colab": {
250250
"base_uri": "https://localhost:8080/"
@@ -265,7 +265,10 @@
265265
"import pandas as pd\n",
266266
"\n",
267267
"# Define our feature set\n",
268-
"feature_set = feature_sets[\"medium\"]\n",
268+
"feature_set = feature_sets[\"small\"]\n",
269+
"# use \"medium\" or \"all\" for better performance. Requires more RAM.\n",
270+
"# feature_set = feature_sets[\"medium\"]\n",
271+
"# feature_set = feature_sets[\"all\"]\n",
269272
"\n",
270273
"# Download the training data - this will take a few minutes\n",
271274
"napi.download_dataset(f\"{DATA_VERSION}/train.parquet\")\n",

target_ensemble.ipynb

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161
},
6262
{
6363
"cell_type": "code",
64-
"execution_count": 2,
64+
"execution_count": null,
6565
"metadata": {
6666
"colab": {
6767
"base_uri": "https://localhost:8080/",
@@ -747,7 +747,10 @@
747747
"\n",
748748
"# Load data\n",
749749
"feature_metadata = json.load(open(f\"{DATA_VERSION}/features.json\"))\n",
750-
"feature_cols = feature_metadata[\"feature_sets\"][\"medium\"]\n",
750+
"feature_cols = feature_metadata[\"feature_sets\"][\"small\"]\n",
751+
"# use \"medium\" or \"all\" for better performance. Requires more RAM.\n",
752+
"# feature_cols = feature_metadata[\"feature_sets\"][\"medium\"]\n",
753+
"# feature_cols = feature_metadata[\"feature_sets\"][\"all\"]\n",
751754
"target_cols = feature_metadata[\"targets\"]\n",
752755
"train = pd.read_parquet(\n",
753756
" f\"{DATA_VERSION}/train.parquet\",\n",

utils.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# This code is soft-deprecated. It is recommended that you use
2+
# This code is deprecated. It is recommended that you use
33
# the numerai-tools package instead:
44
# https://github.com/numerai/numerai-tools
55
#
@@ -316,27 +316,27 @@ def validation_metrics(
316316
lambda d: d[feature_cols].corrwith(d[pred_col]).abs().max()
317317
)
318318
max_feature_exposure = max_per_era.mean()
319-
validation_stats.loc[
320-
"max_feature_exposure", pred_col
321-
] = max_feature_exposure
319+
validation_stats.loc["max_feature_exposure", pred_col] = (
320+
max_feature_exposure
321+
)
322322

323323
# Check feature neutral mean
324324
feature_neutral_mean = get_feature_neutral_mean(
325325
validation_data, pred_col, target_col, features_for_neutralization
326326
)
327-
validation_stats.loc[
328-
"feature_neutral_mean", pred_col
329-
] = feature_neutral_mean
327+
validation_stats.loc["feature_neutral_mean", pred_col] = (
328+
feature_neutral_mean
329+
)
330330

331331
# Check TB200 feature neutral mean
332332
tb200_feature_neutral_mean_era = validation_data.groupby(ERA_COL).apply(
333333
lambda df: get_feature_neutral_mean_tb_era(
334334
df, pred_col, target_col, 200, features_for_neutralization
335335
)
336336
)
337-
validation_stats.loc[
338-
"tb200_feature_neutral_mean", pred_col
339-
] = tb200_feature_neutral_mean_era.mean()
337+
validation_stats.loc["tb200_feature_neutral_mean", pred_col] = (
338+
tb200_feature_neutral_mean_era.mean()
339+
)
340340

341341
# Check top and bottom 200 metrics (TB200)
342342
tb200_validation_correlations = fast_score_by_date(
@@ -372,9 +372,9 @@ def validation_metrics(
372372
lambda d: unif(d[pred_col]).corr(unif(d[example_col]))
373373
)
374374
corr_with_example_preds = per_era_corrs.mean()
375-
validation_stats.loc[
376-
"corr_with_example_preds", pred_col
377-
] = corr_with_example_preds
375+
validation_stats.loc["corr_with_example_preds", pred_col] = (
376+
corr_with_example_preds
377+
)
378378

379379
# Check exposure dissimilarity per era
380380
tdf = validation_data.groupby(ERA_COL).apply(

0 commit comments

Comments
 (0)