Commit edac139

use v5.0 data (#163)
* update tutorial notebooks to use v5.0 data
* use v4.3 meta_model for now
* fix target ensemble tutorial notebook
1 parent f623631 commit edac139

File tree

4 files changed (+1448 / -4519 lines)


example_model.ipynb

Lines changed: 22 additions & 118 deletions
@@ -11,25 +11,25 @@
 },
 {
 "cell_type": "code",
-"source": [
-"!python --version"
-],
+"execution_count": null,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/"
 },
 "id": "Ekw8Z93ljC3v",
 "outputId": "675ac893-5a46-4c6b-dc03-09438941d1fc"
 },
-"execution_count": null,
 "outputs": [
 {
-"output_type": "stream",
 "name": "stdout",
+"output_type": "stream",
 "text": [
 "Python 3.10.12\n"
 ]
 }
+],
+"source": [
+"!python --version"
 ]
 },
 {
@@ -44,8 +44,8 @@
 },
 "outputs": [
 {
-"output_type": "stream",
 "name": "stdout",
+"output_type": "stream",
 "text": [
 "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m34.4/34.4 MB\u001b[0m \u001b[31m16.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
 "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
@@ -62,58 +62,6 @@
 "!pip install -q numerapi pandas lightgbm cloudpickle pyarrow scikit-learn scipy==1.10.1"
 ]
 },
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {
-"colab": {
-"base_uri": "https://localhost:8080/"
-},
-"id": "4SrY-eRrhMqH",
-"outputId": "50373903-067a-4298-bab6-c74945fe8a3a"
-},
-"outputs": [
-{
-"output_type": "stream",
-"name": "stderr",
-"text": [
-"v4.3/train_int8.parquet: 2.09GB [01:10, 29.5MB/s] \n",
-"v4.3/features.json: 1.12MB [00:00, 4.25MB/s] \n"
-]
-}
-],
-"source": []
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {
-"colab": {
-"base_uri": "https://localhost:8080/"
-},
-"id": "mcv85XqKhMqH",
-"outputId": "a44c7266-be28-4621-afb1-c0abe69abb18"
-},
-"outputs": [
-{
-"output_type": "stream",
-"name": "stdout",
-"text": [
-"[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n",
-"[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n",
-"[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.151962 seconds.\n",
-"You can set `force_row_wise=true` to remove the overhead.\n",
-"And if memory is not enough, you can set `force_col_wise=true`.\n",
-"[LightGBM] [Info] Total Bins 3525\n",
-"[LightGBM] [Info] Number of data points in the train set: 606176, number of used features: 705\n",
-"[LightGBM] [Info] Start training from score 0.499979\n",
-"[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n",
-"[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n"
-]
-}
-],
-"source": []
-},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -127,68 +75,24 @@
 },
 "outputs": [
 {
-"output_type": "display_data",
 "data": {
+"application/javascript": "\n async function download(id, filename, size) {\n if (!google.colab.kernel.accessAllowed) {\n return;\n }\n const div = document.createElement('div');\n const label = document.createElement('label');\n label.textContent = `Downloading \"${filename}\": `;\n div.appendChild(label);\n const progress = document.createElement('progress');\n progress.max = size;\n div.appendChild(progress);\n document.body.appendChild(div);\n\n const buffers = [];\n let downloaded = 0;\n\n const channel = await google.colab.kernel.comms.open(id);\n // Send a message to notify the kernel that we're ready.\n channel.send({})\n\n for await (const message of channel.messages) {\n // Send a message to notify the kernel that we're ready.\n channel.send({})\n if (message.buffers) {\n for (const buffer of message.buffers) {\n buffers.push(buffer);\n downloaded += buffer.byteLength;\n progress.value = downloaded;\n }\n }\n }\n const blob = new Blob(buffers, {type: 'application/binary'});\n const a = document.createElement('a');\n a.href = window.URL.createObjectURL(blob);\n a.download = filename;\n div.appendChild(a);\n a.click();\n div.remove();\n }\n ",
 "text/plain": [
 "<IPython.core.display.Javascript object>"
-],
-"application/javascript": [
-"\n",
-" async function download(id, filename, size) {\n",
-" if (!google.colab.kernel.accessAllowed) {\n",
-" return;\n",
-" }\n",
-" const div = document.createElement('div');\n",
-" const label = document.createElement('label');\n",
-" label.textContent = `Downloading \"${filename}\": `;\n",
-" div.appendChild(label);\n",
-" const progress = document.createElement('progress');\n",
-" progress.max = size;\n",
-" div.appendChild(progress);\n",
-" document.body.appendChild(div);\n",
-"\n",
-" const buffers = [];\n",
-" let downloaded = 0;\n",
-"\n",
-" const channel = await google.colab.kernel.comms.open(id);\n",
-" // Send a message to notify the kernel that we're ready.\n",
-" channel.send({})\n",
-"\n",
-" for await (const message of channel.messages) {\n",
-" // Send a message to notify the kernel that we're ready.\n",
-" channel.send({})\n",
-" if (message.buffers) {\n",
-" for (const buffer of message.buffers) {\n",
-" buffers.push(buffer);\n",
-" downloaded += buffer.byteLength;\n",
-" progress.value = downloaded;\n",
-" }\n",
-" }\n",
-" }\n",
-" const blob = new Blob(buffers, {type: 'application/binary'});\n",
-" const a = document.createElement('a');\n",
-" a.href = window.URL.createObjectURL(blob);\n",
-" a.download = filename;\n",
-" div.appendChild(a);\n",
-" a.click();\n",
-" div.remove();\n",
-" }\n",
-" "
 ]
 },
-"metadata": {}
+"metadata": {},
+"output_type": "display_data"
 },
 {
-"output_type": "display_data",
 "data": {
+"application/javascript": "download(\"download_9cb9b662-7992-47b0-b787-453b845e7050\", \"predict_barebones.pkl\", 6572312)",
 "text/plain": [
 "<IPython.core.display.Javascript object>"
-],
-"application/javascript": [
-"download(\"download_9cb9b662-7992-47b0-b787-453b845e7050\", \"predict_barebones.pkl\", 6572312)"
 ]
 },
-"metadata": {}
+"metadata": {},
+"output_type": "display_data"
 }
 ],
 "source": [
@@ -198,21 +102,21 @@
 "napi = NumerAPI()\n",
 "\n",
 "# use one of the latest data versions\n",
-"DATA_VERSION = \"v4.3\"\n",
+"DATA_VERSION = \"v5.0\"\n",
 "\n",
 "# Download data\n",
-"napi.download_dataset(f\"{DATA_VERSION}/train_int8.parquet\")\n",
+"napi.download_dataset(f\"{DATA_VERSION}/train.parquet\")\n",
 "napi.download_dataset(f\"{DATA_VERSION}/features.json\")\n",
 "\n",
 "# Load data\n",
 "feature_metadata = json.load(open(f\"{DATA_VERSION}/features.json\"))\n",
 "features = feature_metadata[\"feature_sets\"][\"medium\"] # use \"all\" for better performance. Requires more RAM.\n",
-"train = pd.read_parquet(f\"{DATA_VERSION}/train_int8.parquet\", columns=[\"era\"]+features+[\"target\"])\n",
+"train = pd.read_parquet(f\"{DATA_VERSION}/train.parquet\", columns=[\"era\"]+features+[\"target\"])\n",
 "\n",
 "# For better models, join train and validation data and train on all of it.\n",
 "# This would cause diagnostics to be misleading though.\n",
-"# napi.download_dataset(f\"{DATA_VERSION}/validation_int8.parquet\");\n",
-"# validation = pd.read_parquet(f\"{DATA_VERSION}/validation_int8.parquet\", columns=[\"era\"]+features+[\"target\"])\n",
+"# napi.download_dataset(f\"{DATA_VERSION}/validation.parquet\")\n",
+"# validation = pd.read_parquet(f\"{DATA_VERSION}/validation.parquet\", columns=[\"era\"]+features+[\"target\"])\n",
 "# validation = validation[validation[\"data_type\"] == \"validation\"] # drop rows which don't have targets yet\n",
 "# train = pd.concat([train, validation])\n",
 "\n",
@@ -258,6 +162,9 @@
 }
 ],
 "metadata": {
+"colab": {
+"provenance": []
+},
 "kernelspec": {
 "display_name": "venv",
 "language": "python",
@@ -275,11 +182,8 @@
 "pygments_lexer": "ipython3",
 "version": "3.10.12"
 },
-"orig_nbformat": 4,
-"colab": {
-"provenance": []
-}
+"orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 0
-}
+}
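
For reference, after this change the data-loading cell in example_model.ipynb boils down to the following minimal sketch, assembled from the hunks above; the LightGBM hyperparameters shown are illustrative placeholders, not values taken from this diff.

import json
import pandas as pd
import lightgbm as lgb
from numerapi import NumerAPI

napi = NumerAPI()

# v5.0 files drop the "_int8" suffix used by the v4.x datasets
DATA_VERSION = "v5.0"

# Download data
napi.download_dataset(f"{DATA_VERSION}/train.parquet")
napi.download_dataset(f"{DATA_VERSION}/features.json")

# Load the "medium" feature set; use "all" for better performance (requires more RAM)
feature_metadata = json.load(open(f"{DATA_VERSION}/features.json"))
features = feature_metadata["feature_sets"]["medium"]
train = pd.read_parquet(f"{DATA_VERSION}/train.parquet", columns=["era"] + features + ["target"])

# Fit a regressor on the training data (hyperparameters below are placeholders)
model = lgb.LGBMRegressor(n_estimators=2000, learning_rate=0.01, max_depth=5, colsample_bytree=0.1)
model.fit(train[features], train["target"])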

feature_neutralization.ipynb

Lines changed: 274 additions & 318 deletions
Large diffs are not rendered by default.
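
The collapsed diff applies the same v4.3 to v5.0 file renames as the other notebooks. For context, the feature neutralization this notebook is named for amounts to removing the component of the predictions that is linearly explained by the features, usually era by era. The helper below is an illustrative sketch under that assumption, not the notebook's exact implementation; the neutralize name, the proportion argument, and the prediction column are hypothetical here.

import numpy as np
import pandas as pd

def neutralize(predictions: pd.Series, features: pd.DataFrame, proportion: float = 1.0) -> pd.Series:
    # Remove the part of the predictions that is linearly explained by the features.
    scores = predictions.to_numpy(dtype=np.float64).reshape(-1, 1)
    exposures = features.to_numpy(dtype=np.float64)
    # Least-squares projection of the scores onto the feature exposures
    projection = exposures @ (np.linalg.pinv(exposures) @ scores)
    neutral = scores - proportion * projection
    # Re-standardize so downstream ranking behaves as before
    neutral = neutral / neutral.std()
    return pd.Series(neutral.ravel(), index=predictions.index)

# Typical usage: apply era by era to a hypothetical "prediction" column
# neutralized = validation.groupby("era", group_keys=False).apply(
#     lambda d: neutralize(d["prediction"], d[feature_set])
# )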

hello_numerai.ipynb

Lines changed: 11 additions & 11 deletions
@@ -114,7 +114,7 @@
 "print(\"Available versions:\\n\", dataset_versions)\n",
 "\n",
 "# Set data version to one of the latest datasets\n",
-"DATA_VERSION = \"v4.3\"\n",
+"DATA_VERSION = \"v5.0\"\n",
 "\n",
 "# Print all files available for download for our version\n",
 "current_version_files = [f for f in all_datasets if f.startswith(DATA_VERSION)]\n",
@@ -169,7 +169,7 @@
 "import json\n",
 "\n",
 "# download the feature metadata file\n",
-"napi.download_dataset(f\"{DATA_VERSION}/features.json\");\n",
+"napi.download_dataset(f\"{DATA_VERSION}/features.json\")\n",
 "\n",
 "# read the metadata and display\n",
 "feature_metadata = json.load(open(f\"{DATA_VERSION}/features.json\"))\n",
@@ -266,12 +266,12 @@
 "feature_set = feature_sets[\"medium\"]\n",
 "\n",
 "# Download the training data - this will take a few minutes\n",
-"napi.download_dataset(f\"{DATA_VERSION}/train_int8.parquet\");\n",
+"napi.download_dataset(f\"{DATA_VERSION}/train.parquet\")\n",
 "\n",
 "# Load only the \"medium\" feature set to\n",
 "# Use the \"all\" feature set to use all features\n",
 "train = pd.read_parquet(\n",
-" f\"{DATA_VERSION}/train_int8.parquet\",\n",
+" f\"{DATA_VERSION}/train.parquet\",\n",
 " columns=[\"era\", \"target\"] + feature_set\n",
 ")\n",
 "\n",
@@ -1361,7 +1361,7 @@
 "model.fit(\n",
 " train[feature_set],\n",
 " train[\"target\"]\n",
-");"
+")"
 ]
 },
 {
@@ -1740,11 +1740,11 @@
 ],
 "source": [
 "# Download validation data - this will take a few minutes\n",
-"napi.download_dataset(f\"{DATA_VERSION}/validation_int8.parquet\");\n",
+"napi.download_dataset(f\"{DATA_VERSION}/validation.parquet\")\n",
 "\n",
 "# Load the validation data and filter for data_type == \"validation\"\n",
 "validation = pd.read_parquet(\n",
-" f\"{DATA_VERSION}/validation_int8.parquet\",\n",
+" f\"{DATA_VERSION}/validation.parquet\",\n",
 " columns=[\"era\", \"data_type\", \"target\"] + feature_set\n",
 ")\n",
 "validation = validation[validation[\"data_type\"] == \"validation\"]\n",
@@ -1810,9 +1810,9 @@
 "from numerai_tools.scoring import numerai_corr, correlation_contribution\n",
 "\n",
 "# Download and join in the meta_model for the validation eras\n",
-"napi.download_dataset(f\"{DATA_VERSION}/meta_model.parquet\")\n",
+"napi.download_dataset(f\"v4.3/meta_model.parquet\")\n",
 "validation[\"meta_model\"] = pd.read_parquet(\n",
-" f\"{DATA_VERSION}/meta_model.parquet\"\n",
+" f\"v4.3/meta_model.parquet\"\n",
 ")[\"numerai_meta_model\"]"
 ]
 },
@@ -2690,10 +2690,10 @@
 ],
 "source": [
 "# Download latest live features\n",
-"napi.download_dataset(f\"{DATA_VERSION}/live_int8.parquet\")\n",
+"napi.download_dataset(f\"{DATA_VERSION}/live.parquet\")\n",
 "\n",
 "# Load live features\n",
-"live_features = pd.read_parquet(f\"{DATA_VERSION}/live_int8.parquet\", columns=feature_set)\n",
+"live_features = pd.read_parquet(f\"{DATA_VERSION}/live.parquet\", columns=feature_set)\n",
 "\n",
 "# Generate live predictions\n",
 "live_predictions = model.predict(live_features[feature_set])\n",

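Per the commit message, validation data in hello_numerai.ipynb moves to v5.0 while the meta model download stays pinned to v4.3 for now. Putting the hunks above together gives roughly the following sketch; the feature columns, the model's prediction step, and the per-era scoring with numerai_corr / correlation_contribution are elided here.

import pandas as pd
from numerapi import NumerAPI
from numerai_tools.scoring import numerai_corr, correlation_contribution

napi = NumerAPI()
DATA_VERSION = "v5.0"

# Validation data comes from v5.0 (the notebook also loads feature_set columns)
napi.download_dataset(f"{DATA_VERSION}/validation.parquet")
validation = pd.read_parquet(
    f"{DATA_VERSION}/validation.parquet",
    columns=["era", "data_type", "target"]
)
validation = validation[validation["data_type"] == "validation"]

# ...but the meta model stays on v4.3 for now, per the commit message
napi.download_dataset("v4.3/meta_model.parquet")
validation["meta_model"] = pd.read_parquet("v4.3/meta_model.parquet")["numerai_meta_model"]

# The notebook then scores a model's predictions per era with numerai_corr
# and correlation_contribution against the joined meta_model column.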