|
16 | 16 |
|
17 | 17 | import Event from "./Event"
|
18 | 18 |
|
19 |
| -type EventType = "Epoch" | "Iteration" | "Marker" |
| 19 | +type EventType = "Data Fetch" | "Data Uncompress" | "Evaluation" | "EvaluationStep" | "Epoch" | "Iteration" | "Marker" |
20 | 20 | type Detail = { epoch: number; step: number; nSteps: number; ip: string }
|
21 | 21 | export type TorchEvent = Event<EventType, Detail>
|
22 | 22 |
|
@@ -77,26 +77,65 @@ export function collateEvent(M: TorchEvent[], line: string) {
|
77 | 77 | return M
|
78 | 78 | }
|
79 | 79 |
|
80 |
| - const match = line.match(/ip=([\d.]+)\)\s+(Epoch|Iteration):\s+(\d+)%\|[^|]+\|\s(\d+)\/(\d+)/) |
| 80 | + // Data fetch/uncompress events |
| 81 | + const hackMatch = line.match(/ip=([\d.]+)\)\s+(\d+-\d+-\d+\s+\d+:\d+:\d+)\s+(getting data|unpacking)/) |
| 82 | + if (hackMatch) { |
| 83 | + const ip = hackMatch[1] |
| 84 | + const timestamp = new Date(hackMatch[2]).getTime() |
| 85 | + const name = `Torch Training on ${ip}` |
| 86 | + const type: EventType = hackMatch[3] === "unpacking" ? "Data Uncompress" : "Data Fetch" |
| 87 | + M.push(new TorchEventImpl(name, ip, type, 1, 1, 1, timestamp, "Done", line.slice(line.indexOf(hackMatch[3])))) |
| 88 | + } |
| 89 | + |
| 90 | + // Torch Events |
| 91 | + const match = line.match(/ip=([\d.]+)\)\s+(Evaluation|Epoch|Iteration):\s+(\d+)%\|[^|]+\|\s(\d+)\/(\d+)/) |
81 | 92 | if (match) {
|
82 | 93 | const ip = match[1]
|
83 | 94 | const type = match[2] as EventType
|
84 |
| - // const percentage = parseInt(match[3], 10) |
85 |
| - const step = parseInt(match[4], 10) - (type === "Epoch" ? 0 : 1) |
86 | 95 | const nSteps = parseInt(match[5], 10)
|
| 96 | + const name = `Torch Training on ${ip}` |
| 97 | + |
| 98 | + // Why the conditional subtraction below: Iteration markers are "post", |
| 99 | + // i.e. emitted upon completion, whereas Evaluation and Epoch markers are |
| 100 | + // "pre", i.e. emitted upon commencement |
| 101 | + const step = parseInt(match[4], 10) - (type === "Iteration" ? 1 : 0) |
87 | 102 |
|
88 | 103 | const epoch =
|
89 |
| - type === "Epoch" |
| 104 | + type === "Evaluation" |
| 105 | + ? { step: -1, nSteps: -1, state: "InProgress" } |
| 106 | + : type === "Epoch" |
90 | 107 | ? { step, nSteps, state: "InProgress" }
|
91 | 108 | : findEpoch(M, ip) || { step: -1, nSteps: 0, state: "InProgress" }
|
92 |
| - const name = `Torch Training on ${ip}` |
93 | 109 | const timestampMarker = findPrevious(M, ip, "Marker", "Done")
|
94 | 110 | const timestamp = timestampMarker ? timestampMarker.timestamp : Date.now()
|
95 | 111 |
|
96 |
| - if (type === "Iteration") { |
| 112 | + if (type === "Evaluation") { |
| 113 | + if (step === 0) { |
| 114 | + M.push(new TorchEventImpl(name, ip, "Evaluation", step, nSteps, epoch.step, timestamp)) |
| 115 | + for (let idx = 1; idx < nSteps; idx++) { |
| 116 | + // prefill: add a "Pending" EvaluationStep placeholder for each remaining step |
| 117 | + M.push(new TorchEventImpl(name, ip, "EvaluationStep", idx, nSteps, epoch.step, timestamp, "Pending")) |
| 118 | + } |
| 119 | + } else { |
| 120 | + for (let idx = 1; idx <= step; idx++) { |
| 121 | + const priorEvaluationStep = findPrevious(M, ip, "EvaluationStep", "Pending", idx, epoch.step) |
| 122 | + if (priorEvaluationStep) { |
| 123 | + priorEvaluationStep.state = "Done" |
| 124 | + } |
| 125 | + } |
| 126 | + |
| 127 | + if (step === nSteps) { |
| 128 | + const priorEvaluation = findPrevious(M, ip, "Evaluation", "InProgress", 0, epoch.step) |
| 129 | + if (priorEvaluation) { |
| 130 | + priorEvaluation.state = "Done" |
| 131 | + } |
| 132 | + } |
| 133 | + } |
| 134 | + return M |
| 135 | + } else if (type === "Iteration") { |
97 | 136 | epoch.state = "InProgress"
|
98 | 137 | } else if (step > 0) {
|
99 |
| - const thisEpoch = findEpoch(M, ip, "Pending", step) |
| 138 | + const thisEpoch = findPrevious(M, ip, type, "Pending", step) |
100 | 139 | if (thisEpoch) {
|
101 | 140 | thisEpoch.state = "InProgress"
|
102 | 141 | }
|
@@ -145,7 +184,15 @@ export function collateEvent(M: TorchEvent[], line: string) {
|
145 | 184 | }
|
146 | 185 |
|
147 | 186 | function sortFn(a: TorchEvent, b: TorchEvent) {
|
148 |
| - return a.ip.localeCompare(b.ip) || a.epoch - b.epoch || a.step - b.step || a.type.localeCompare(b.type) |
| 187 | + const aIsEval = /^Evaluation/.test(a.type) ? 1 : 0 |
| 188 | + const bIsEval = /^Evaluation/.test(b.type) ? 1 : 0 |
| 189 | + return ( |
| 190 | + a.ip.localeCompare(b.ip) || |
| 191 | + aIsEval - bIsEval || |
| 192 | + a.epoch - b.epoch || |
| 193 | + a.step - b.step || |
| 194 | + a.type.localeCompare(b.type) |
| 195 | + ) |
149 | 196 | }
|
150 | 197 |
|
151 | 198 | /** @return lifecycle events (Epoch, Iteration) for Torch training */
|
|
0 commit comments