Skip to content

Commit 35260c2

Browse files
committed
feat: update ray resources to match the newer/cleaner torchx resources form
This allows fixing an issue where codeflare logs, when late-attaching, may not stream out GPU utilization. BREAKING CHANGE: this changes the structure of the ray form; tests may need updates. Also, any automated -y runs will require an update.
1 parent 2c12e78 commit 35260c2

File tree

14 files changed

+75
-35
lines changed

14 files changed

+75
-35
lines changed

package-lock.json

Lines changed: 17 additions & 17 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

plugins/plugin-codeflare/package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,13 @@
3030
"@types/split2": "^3.2.1"
3131
},
3232
"dependencies": {
33-
"@guidebooks/store": "^6.1.9",
33+
"@guidebooks/store": "^6.2.1",
3434
"@logdna/tail-file": "^3.0.1",
3535
"@patternfly/react-charts": "^6.94.18",
3636
"@patternfly/react-core": "^4.276.6",
3737
"asciinema-player": "^3.1.0",
3838
"chokidar": "^3.5.3",
39-
"madwizard": "^6.4.1",
39+
"madwizard": "^6.5.3",
4040
"needle": "^3.2.0",
4141
"open": "^8.4.2",
4242
"pretty-bytes": "^6.1.0",

tests/kind/profiles/non-gpu1/keep-it-simple

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
17+
"ml/ray/start/resources": "{\"Number of Workers\":1,\"CPUs per worker\":\"500m\",\"GPUs per worker\":0,\"Memory per worker\":\"1.5Gi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
1818
"kubernetes/context": "kind-codeflare-test",
1919
"kubernetes/choose/ns": "default",
2020
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",

tests/kind/profiles/non-gpu1/mcad-coscheduler

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/ray/start/resources": "{\"Number of CPUs\":\"200m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
17+
"ml/ray/start/resources": {
18+
"Number of Workers": 1,
19+
"CPUs per worker": "200m",
20+
"GPUs per worker": 0,
21+
"Memory per worker": "1.25Gi",
22+
"Ephemeral Storage per worker": "5Gi"
23+
},
1824
"kubernetes/context": "kind-codeflare-test",
1925
"kubernetes/choose/ns": "default",
2026
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",

tests/kind/profiles/non-gpu1/mcad-default

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/ray/start/resources": "{\"Number of CPUs\":\"200m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
17+
"ml/ray/start/resources": "{\"Number of Workers\":1,\"CPUs per worker\":\"200m\",\"GPUs per worker\":0,\"Memory per worker\":\"1.25Gi\",\"Ephemeral Storage per worker\":\"5Gi\"}",
1818
"kubernetes/context": "kind-codeflare-test",
1919
"kubernetes/choose/ns": "default",
2020
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",

tests/kind/profiles/non-gpu1/mcad-preinstalled

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/ray/start/resources": "{\"Number of CPUs\":\"200m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
17+
"ml/ray/start/resources": {
18+
"Number of Workers": 1,
19+
"CPUs per worker": "200m",
20+
"GPUs per worker": 0,
21+
"Memory per worker": "1.25Gi",
22+
"Ephemeral Storage per worker": "5Gi"
23+
},
1824
"kubernetes/context": "kind-codeflare-test",
1925
"kubernetes/choose/ns": "default",
2026
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",

tests/kind/profiles/non-gpu1/ray-autoscaler

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/ray/start/resources": "{\"Number of CPUs\":\"200m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"0\",\"Maximum Workers\":\"0\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"2.5Gi\",\"Ephemeral Storage\":\"5Gi\"}",
17+
"ml/ray/start/resources": {
18+
"Number of Workers": 1,
19+
"CPUs per worker": "200m",
20+
"GPUs per worker": 0,
21+
"Memory per worker": "1.25Gi",
22+
"Ephemeral Storage per worker": "5Gi"
23+
},
1824
"kubernetes/context": "kind-codeflare-test",
1925
"kubernetes/choose/ns": "default",
2026
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",

tests/kind/profiles/non-gpu2/keep-it-simple

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,18 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/ray-basic\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
17+
"ml/ray/start/resources": {
18+
"Number of Workers": 1,
19+
"CPUs per worker": "500m",
20+
"GPUs per worker": 0,
21+
"Memory per worker": "1.5Gi",
22+
"Ephemeral Storage per worker": "5Gi"
23+
},
1824
"kubernetes/context": "kind-codeflare-test",
1925
"kubernetes/choose/ns": "default",
2026
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",
2127
"ml/ray/cluster/choose": "codeflare-test-ray-cluster",
2228
"ml/ray/cluster/choose/kubernetes": "codeflare-test-ray-cluster",
2329
"ml/ray/cluster/kubernetes/choose-pod-scheduler": "Keep It Simple"
2430
}
25-
}
31+
}

tests/kind/profiles/non-gpu3/keep-it-simple

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
17+
"ml/ray/start/resources": {
18+
"Number of Workers": 1,
19+
"CPUs per worker": "500m",
20+
"GPUs per worker": 0,
21+
"Memory per worker": "1.5Gi",
22+
"Ephemeral Storage per worker": "5Gi"
23+
},
1824
"kubernetes/context": "kind-codeflare-test",
1925
"kubernetes/choose/ns": "default",
2026
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",

tests/kind/profiles/non-gpu4/keep-it-simple

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@
1414
"ml/codeflare/training/byoc/form": "{\"Path to source\":\"$PWD/tests/kind/inputs/qiskit\",\"Base image\":\"rayproject/ray:2.1.0\",\"Command line prefix\":\"python3 main.py\"}",
1515
"kubernetes/choose/secret/image-pull": "No secret needed, since my image is public",
1616
"s3/choose/bucket/maybe": "My data is not stored in S3",
17-
"ml/ray/start/resources": "{\"Number of CPUs\":\"500m\",\"Number of GPUs\":\"0\",\"Minimum Workers\":\"1\",\"Maximum Workers\":\"1\",\"Worker Memory\":\"500Mi\",\"Head Memory\":\"3Gi\",\"Ephemeral Storage\":\"5Gi\"}",
17+
"ml/ray/start/resources": {
18+
"Number of Workers": 1,
19+
"CPUs per worker": "500m",
20+
"GPUs per worker": 0,
21+
"Memory per worker": "1.5Gi",
22+
"Ephemeral Storage per worker": "5Gi"
23+
},
1824
"kubernetes/context": "kind-codeflare-test",
1925
"kubernetes/choose/ns": "default",
2026
"ml/ray/storage/s3/maybe": "My code does not use Ray Workflows",

0 commit comments

Comments (0)