Skip to content

Commit f9c0603

Browse files
committed
feat: improved handling of data-related events in the job logs
Fetching from s3 to local storage: - Data: try remote get - Data: done remote get Pushing from local storage to plasma: - Data: try cache put - Data: done cache put Fetching from plasma: - Data: try cache get - Data: done cache get Uncompressing, etc.: - Data: try unpack - Data: done unpack This PR also adds a pulse animation to the InProgress events. This has worked with Kui's Grid view.
1 parent ddbfe3b commit f9c0603

File tree

13 files changed

+2697
-1839
lines changed

13 files changed

+2697
-1839
lines changed

plugins/plugin-codeflare/src/controller/events/torch.ts

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,16 @@
1616

1717
import Event from "./Event"
1818

19-
type EventType = "Data Fetch" | "Data Uncompress" | "Evaluation" | "EvaluationStep" | "Epoch" | "Iteration" | "Marker"
19+
type EventType =
20+
| "Data Fetch from Upstream"
21+
| "Data Store in Cache"
22+
| "Data Fetch from Cache"
23+
| "Data Uncompress"
24+
| "Evaluation"
25+
| "EvaluationStep"
26+
| "Epoch"
27+
| "Iteration"
28+
| "Marker"
2029
type Detail = { epoch: number; step: number; nSteps: number; ip: string }
2130
export type TorchEvent = Event<EventType, Detail>
2231

@@ -78,13 +87,36 @@ export function collateEvent(M: TorchEvent[], line: string) {
7887
}
7988

8089
// Data fetch/uncompress events
81-
const hackMatch = line.match(/ip=([\d.]+)\)\s+(\d+-\d+-\d+\s+\d+:\d+:\d+)\s+(getting data|unpacking)/)
90+
const hackMatch = line.match(
91+
/ip=([\d.]+)\)\s+(\d+-\d+-\d+\s+\d+:\d+:\d+)\s+Data:\s+(try|done)\s+(remote get|cache put|cache get|unpack)/
92+
)
8293
if (hackMatch) {
8394
const ip = hackMatch[1]
8495
const timestamp = new Date(hackMatch[2]).getTime()
8596
const name = `Torch Training on ${ip}`
86-
const type: EventType = hackMatch[3] === "unpacking" ? "Data Uncompress" : "Data Fetch"
87-
M.push(new TorchEventImpl(name, ip, type, 1, 1, 1, timestamp, "Done", line.slice(line.indexOf(hackMatch[3]))))
97+
98+
const protoState = hackMatch[3]
99+
const state = protoState === "try" ? "InProgress" : "Done"
100+
const protoType = hackMatch[4]
101+
const type: EventType =
102+
protoType === "remote get"
103+
? "Data Fetch from Upstream"
104+
: protoType === "cache put"
105+
? "Data Store in Cache"
106+
: protoType === "cache get"
107+
? "Data Fetch from Cache"
108+
: "Data Uncompress"
109+
110+
if (state === "InProgress") {
111+
M.push(new TorchEventImpl(name, ip, type, 1, 1, 1, timestamp, state, line.slice(line.indexOf(protoState))))
112+
} else {
113+
const prev = findPrevious(M, ip, type, "InProgress")
114+
if (prev) {
115+
prev.state = state
116+
} else {
117+
console.error("Missing Data: begin event for this Data: end event", line)
118+
}
119+
}
88120
}
89121

90122
// Torch Events

plugins/plugin-codeflare/web/scss/components/Dashboard/Grid.scss

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,8 @@ $fullWidth: 1em; /* $large * ($unit + $rgap) - $rgap */
9999
}
100100
@include State("InProgress") {
101101
@include Color(var(--color-base0A), !important);
102+
filter: brightness(1.1);
103+
animation: var(--animation-infinite-repeating-pulse);
102104
}
103105
@include State("Pending") {
104106
@include Color(var(--color-base04), !important);
@@ -146,7 +148,18 @@ $fullWidth: 1em; /* $large * ($unit + $rgap) - $rgap */
146148
@include StepUI;
147149
}
148150

149-
@include CFCell(DataFetch) {
151+
@include CFCell(DataFetchfromUpstream) {
152+
grid-column: span $small;
153+
@include Color(var(--color-base0E));
154+
}
155+
156+
@include CFCell(DataStoreinCache) {
157+
grid-column: span $small;
158+
@include StepUI;
159+
@include Color(var(--color-base0E));
160+
}
161+
162+
@include CFCell(DataFetchfromCache) {
150163
grid-column: span $small;
151164
@include Color(var(--color-base0E));
152165
}

tests/plugin-codeflare/dashboard/inputs/2/README.md

Lines changed: 0 additions & 1 deletion
This file was deleted.

tests/plugin-codeflare/dashboard/inputs/2/choices.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"Choose the bucket that contains your model and glue data.madwizard/apriori/platform": "Darwin",
1313
"expand(kubectl config get-contexts -o name, Kubernetes contexts)": "default/api-codeflare-train-v11-codeflare-openshift-com:6443/kube:admin",
1414
"expand([ -z ${KUBE_CONTEXT} ] && exit 1 || kubectl --context ${KUBE_CONTEXT} get ns -o name | grep -Ev 'openshift|kube-' | sed 's#namespace/##', Kubernetes namespaces)####Create a namespace": "nvidia-gpu-operator",
15-
"Number of CPUs####Number of GPUs####Minimum Workers####Maximum Workers####Worker Memory####Head Memory": "{\"Number of CPUs\":\"1\",\"Number of GPUs\":\"1\",\"Minimum Workers\":\"5\",\"Maximum Workers\":\"5\",\"Worker Memory\":\"32Gi\",\"Head Memory\":\"32Gi\"}",
15+
"Number of CPUs####Number of GPUs####Minimum Workers####Maximum Workers####Worker Memory####Head Memory": "{\"Number of CPUs\":\"1\",\"Number of GPUs\":\"1\",\"Minimum Workers\":\"4\",\"Maximum Workers\":\"4\",\"Worker Memory\":\"32Gi\",\"Head Memory\":\"32Gi\"}",
1616
"Choose the bucket that contains your model and glue data.expand([ -n \"$MC_CONFIG_DIR\" ] && mc -q --config-dir ${MC_CONFIG_DIR} ls s3 | awk '{print substr($NF, 1, length($NF) - 1)}', S3 Buckets)####separator####📁 Create a new bucket": "browsey",
1717
"Choose your Model File.expand([ -n \"$MC_CONFIG_DIR\" ] && [ -n \"$S3_FILEPATH\" ] && [ -n \"$S3_FILEPATH${S3_BUCKET_SUFFIX}\" ] && mc -q --config-dir ${MC_CONFIG_DIR} ls \"s3/$S3_FILEPATH${S3_BUCKET_SUFFIX}\" | awk '{print $NF}', S3 Objects)": "roberta-base",
1818
"Choose your Glue Data File.expand([ -n \"$MC_CONFIG_DIR\" ] && [ -n \"$S3_FILEPATH\" ] && [ -n \"$S3_FILEPATH${S3_BUCKET_SUFFIX}\" ] && mc -q --config-dir ${MC_CONFIG_DIR} ls \"s3/$S3_FILEPATH${S3_BUCKET_SUFFIX}\" | awk '{print $NF}', S3 Objects)": "glue_data",
Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +0,0 @@
1-
LAST SEEN TYPE REASON OBJECT MESSAGE
2-
0s Normal Scheduled pod/mycluster-ray-head-type-krlr4 Successfully assigned nvidia-gpu-operator/mycluster-ray-head-type-krlr4 to ip-10-0-128-169.ec2.internal
3-
0s Normal AddedInterface pod/mycluster-ray-head-type-krlr4 Add eth0 [10.128.44.144/23] from openshift-sdn
4-
0s Normal Pulling pod/mycluster-ray-head-type-krlr4 Pulling image "rayproject/ray-ml:1.13.0-py37-gpu"
5-
0s Normal Pulled pod/mycluster-ray-head-type-krlr4 Successfully pulled image "rayproject/ray-ml:1.13.0-py37-gpu" in 6m48.700535275s
6-
0s Normal Created pod/mycluster-ray-head-type-krlr4 Created container ray-node
7-
0s Normal Started pod/mycluster-ray-head-type-krlr4 Started container ray-node
8-
0s Normal Scheduled pod/mycluster-ray-worker-type-6r7hp Successfully assigned nvidia-gpu-operator/mycluster-ray-worker-type-6r7hp to ip-10-0-133-106.ec2.internal
9-
0s Normal AddedInterface pod/mycluster-ray-worker-type-6r7hp Add eth0 [10.131.42.42/23] from openshift-sdn
10-
0s Normal Pulling pod/mycluster-ray-worker-type-6r7hp Pulling image "rayproject/ray-ml:1.13.0-py37-gpu"
11-
0s Normal Pulled pod/mycluster-ray-worker-type-6r7hp Successfully pulled image "rayproject/ray-ml:1.13.0-py37-gpu" in 6m14.380152399s
12-
0s Normal Created pod/mycluster-ray-worker-type-6r7hp Created container ray-node
13-
0s Normal Started pod/mycluster-ray-worker-type-6r7hp Started container ray-node

tests/plugin-codeflare/dashboard/inputs/2/job.json

Lines changed: 17 additions & 17 deletions
Large diffs are not rendered by default.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
11f4cebe-c012-4467-b5d6-b3cf1fd69269
1+
f6c365c4-5fec-41b1-a811-3594c1c0f91d

0 commit comments

Comments
 (0)