2 | 2 |   "cells": [
3 | 3 |   {
4 | 4 |   "cell_type": "code",
5 |   | - "execution_count": 1,
  | 5 | + "execution_count": null,
6 | 6 |   "id": "bb28e271",
7 | 7 |   "metadata": {},
8 | 8 |   "outputs": [],

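Every hunk in this diff makes the same pair of edits: "execution_count" is reset to null and the cell "outputs" are emptied, so the notebook is committed without transient execution state. As a non-authoritative sketch, the same cleanup could be done programmatically with nbformat (the filename here is a placeholder, not necessarily the notebook in this diff):

    import nbformat

    # Placeholder path; substitute the notebook being cleaned.
    path = "notebook.ipynb"

    nb = nbformat.read(path, as_version=4)
    for cell in nb.cells:
        if cell.cell_type == "code":
            cell.execution_count = None   # mirrors the "execution_count": null edits
            cell.outputs = []             # mirrors the "outputs": [] edits
    nbformat.write(nb, path)

Clearing outputs keeps progress bars and hardware-specific logs (like the ones removed further down) out of version control and keeps notebook diffs reviewable.
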
57 | 57 |   },
58 | 58 |   {
59 | 59 |   "cell_type": "code",
60 |    | - "execution_count": 2,
   | 60 | + "execution_count": null,
61 | 61 |   "id": "edd46306",
62 | 62 |   "metadata": {},
63 | 63 |   "outputs": [],

70 | 70 |   },
71 | 71 |   {
72 | 72 |   "cell_type": "code",
73 |    | - "execution_count": 3,
   | 73 | + "execution_count": null,
74 | 74 |   "id": "591f8c61",
75 | 75 |   "metadata": {},
76 |    | - "outputs": [
77 |    | - {
78 |    | - "name": "stderr",
79 |    | - "output_type": "stream",
80 |    | - "text": [
81 |    | - "downloading ml-25m.zip: 262MB [00:10, 24.4MB/s] \n",
82 |    | - "unzipping files: 100%|█████████████████████████████| 8/8 [00:04<00:00, 1.61files/s]\n"
83 |    | - ]
84 |    | - }
85 |    | - ],
   | 76 | + "outputs": [],
86 | 77 |   "source": [
87 | 78 |   "DATA_PATH = os.environ.get(\"DATA_PATH\", os.path.expanduser(\"~/workspace\"))\n",
88 | 79 |   "download_file(\"http://files.grouplens.org/datasets/movielens/ml-25m.zip\", DATA_PATH + \"/ml-25m.zip\")"

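The removed stderr output above was a tqdm-style progress bar from download_file, which is defined in an earlier cell that this diff does not touch. Purely as an illustration of what such a streaming-download helper might look like, assuming requests and tqdm are available (this is not the notebook's actual implementation):

    import os
    import requests
    from tqdm import tqdm

    def download_file(url, dest, chunk_size=1024 * 1024):
        """Hypothetical helper: stream url to dest with a progress bar."""
        if os.path.exists(dest):
            return dest  # assume an existing file is a finished download
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            total = int(r.headers.get("Content-Length", 0))
            with open(dest, "wb") as f, tqdm(
                desc="downloading " + os.path.basename(dest),
                total=total, unit="B", unit_scale=True,
            ) as bar:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    bar.update(len(chunk))
        return dest
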
110 | 101 |   },
111 | 102 |   {
112 | 103 |   "cell_type": "code",
113 |     | - "execution_count": 4,
    | 104 | + "execution_count": null,
114 | 105 |   "id": "c65e5ef6",
115 | 106 |   "metadata": {},
116 | 107 |   "outputs": [],

140 | 131 |   },
141 | 132 |   {
142 | 133 |   "cell_type": "code",
143 |     | - "execution_count": 5,
    | 134 | + "execution_count": null,
144 | 135 |   "id": "9fbe17a7",
145 | 136 |   "metadata": {},
146 |     | - "outputs": [
147 |     | - {
148 |     | - "name": "stdout",
149 |     | - "output_type": "stream",
150 |     | - "text": [
151 |     | - "Overwriting ./tf_trainer.py\n"
152 |     | - ]
153 |     | - }
154 |     | - ],
    | 137 | + "outputs": [],
155 | 138 |   "source": [
156 | 139 |   "%%writefile \"./tf_trainer.py\"\n",
157 | 140 |   "\n",

183 | 166 |   "\n",
184 | 167 |   "parser = argparse.ArgumentParser()\n",
185 | 168 |   "parser.add_argument(\"--data_path\", default=None, help=\"Input directory.\")\n",
186 |     | - "parser.add_argument(\"--batch_size\", default=None, help=\"Batch size.\")\n",
    | 169 | + "parser.add_argument(\"--batch_size\", type=int, default=None, help=\"Batch size.\")\n",
187 | 170 |   "args = parser.parse_args()\n",
188 | 171 |   "\n",
189 | 172 |   "DATA_PATH = args.data_path or os.path.expanduser(\"~/workspace\")\n",

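The only source change in tf_trainer.py is the added type=int on --batch_size. argparse returns option values as strings unless a type is given, so without the fix --batch_size=65536 would arrive in the training code as the string "65536" rather than the integer 65536. A small self-contained check (the option names below are illustrative, not from the trainer script):

    import argparse

    p = argparse.ArgumentParser()
    p.add_argument("--old_batch_size", default=None, help="Batch size.")            # parsed as str
    p.add_argument("--new_batch_size", type=int, default=None, help="Batch size.")  # parsed as int

    args = p.parse_args(["--old_batch_size=65536", "--new_batch_size=65536"])
    print(type(args.old_batch_size).__name__, type(args.new_batch_size).__name__)   # prints: str int
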
291 | 274 |   "metadata": {
292 | 275 |   "scrolled": true
293 | 276 |   },
294 |     | - "outputs": [
295 |     | - {
296 |     | - "name": "stdout",
297 |     | - "output_type": "stream",
298 |     | - "text": [
299 |     | - "2023-06-08 04:04:31.525132: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
300 |     | - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
301 |     | - "2023-06-08 04:04:31.640485: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
302 |     | - "[1,0]<stderr>:2023-06-08 04:04:34.931845: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
303 |     | - "[1,0]<stderr>:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
304 |     | - "[1,1]<stderr>:2023-06-08 04:04:34.939388: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
305 |     | - "[1,1]<stderr>:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
306 |     | - "[1,1]<stderr>:2023-06-08 04:04:35.046788: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
307 |     | - "[1,0]<stderr>:2023-06-08 04:04:35.046788: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
308 |     | - "[1,1]<stderr>:2023-06-08 04:04:41.286722: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n",
309 |     | - "[1,1]<stderr>:2023-06-08 04:04:41.286821: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24570 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:b3:00.0, compute capability: 8.6\n",
310 |     | - "[1,0]<stderr>:2023-06-08 04:04:41.292086: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n",
311 |     | - "[1,0]<stderr>:2023-06-08 04:04:41.292173: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24570 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:17:00.0, compute capability: 8.6\n",
312 |     | - "[1,1]<stderr>:WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/pyct/static_analysis/liveness.py:83: Analyzer.lamba_check (from tensorflow.python.autograph.pyct.static_analysis.liveness) is deprecated and will be removed after 2023-09-23.\n",
313 |     | - "[1,1]<stderr>:Instructions for updating:\n",
314 |     | - "[1,1]<stderr>:Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089\n",
315 |     | - "[1,0]<stderr>:WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/pyct/static_analysis/liveness.py:83: Analyzer.lamba_check (from tensorflow.python.autograph.pyct.static_analysis.liveness) is deprecated and will be removed after 2023-09-23.\n",
316 |     | - "[1,0]<stderr>:Instructions for updating:\n",
317 |     | - "[1,0]<stderr>:Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089\n",
318 |     | - "[1,0]<stdout>:Step #0\tLoss: 13.976286\n",
319 |     | - "[1,0]<stdout>:Step #10\tLoss: 13.746111\n",
320 |     | - "[1,0]<stdout>:Step #20\tLoss: 13.905323\n",
321 |     | - "[1,0]<stdout>:Step #30\tLoss: 14.093473\n",
322 |     | - "[1,0]<stdout>:Step #40\tLoss: 13.336206\n",
323 |     | - "[1,0]<stdout>:Step #50\tLoss: 13.932583\n",
324 |     | - "[1,0]<stdout>:Step #60\tLoss: 13.702780\n",
325 |     | - "[1,0]<stdout>:Step #70\tLoss: 13.522057\n",
326 |     | - "[1,0]<stdout>:Step #80\tLoss: 13.382860\n",
327 |     | - "[1,0]<stdout>:Step #90\tLoss: 13.701270\n",
328 |     | - "[1,0]<stdout>:Step #100\tLoss: 13.240610\n",
329 |     | - "[1,0]<stdout>:Step #110\tLoss: 13.264977\n",
330 |     | - "[1,0]<stdout>:Step #120\tLoss: 13.984927\n",
331 |     | - "[1,0]<stdout>:Step #130\tLoss: 14.039978\n",
332 |     | - "[1,0]<stdout>:Step #140\tLoss: 13.639907\n",
333 |     | - "[1,0]<stdout>:Step #150\tLoss: 13.430090\n",
334 |     | - "[1,0]<stdout>:Step #160\tLoss: 13.219415\n",
335 |     | - "[1,0]<stdout>:Step #170\tLoss: 12.758451\n",
336 |     | - "[1,0]<stdout>:Step #180\tLoss: 13.592442\n"
337 |     | - ]
338 |     | - }
339 |     | - ],
    | 277 | + "outputs": [],
340 | 278 |   "source": [
341 |     | - "!horovodrun -np {GPU_COUNT} python tf_trainer.py --data_path={DATA_PATH} --batch_size=65536"
    | 279 | + "! horovodrun -np {GPU_COUNT} python tf_trainer.py --data_path={DATA_PATH} --batch_size=65536"
342 | 280 |   ]
343 | 281 |   },
344 | 282 |   {
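
In this last hunk the cleared output was the horovodrun log from a previous run: two ranks ([1,0] and [1,1]), each binding its own RTX A6000, plus the rank-0 training loss every 10 steps. GPU_COUNT and DATA_PATH are Python variables defined earlier in the notebook and spliced into the shell command by IPython's {} interpolation. As a hedged sketch only (the notebook's own definition is not shown in this diff), a GPU_COUNT value like this could be derived with TensorFlow, assuming it is importable:

    import tensorflow as tf

    # Illustrative only; the notebook defines GPU_COUNT in an earlier cell not in this diff.
    GPU_COUNT = len(tf.config.list_physical_devices("GPU"))
    print(GPU_COUNT)  # the removed log shows two ranks, consistent with two GPUs
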