From a70fbb24b0da4b85f99aafd9d82818ed86a3df28 Mon Sep 17 00:00:00 2001 From: Joseph White Date: Wed, 15 Jan 2025 14:44:49 -0500 Subject: [PATCH 1/4] Add slurm json --- classes/OpenXdmod/Shredder/Slurm.php | 6 +- classes/OpenXdmod/Shredder/Slurmjson.php | 292 +++++++++++++++++++++++ 2 files changed, 295 insertions(+), 3 deletions(-) create mode 100644 classes/OpenXdmod/Shredder/Slurmjson.php diff --git a/classes/OpenXdmod/Shredder/Slurm.php b/classes/OpenXdmod/Shredder/Slurm.php index 51db30e47d..cf89b2a08f 100644 --- a/classes/OpenXdmod/Shredder/Slurm.php +++ b/classes/OpenXdmod/Shredder/Slurm.php @@ -156,7 +156,7 @@ class Slurm extends Shredder * * @var string[] */ - private static $endedJobStates = [ + protected static $endedJobStates = [ 'BOOT_FAIL', 'CANCELLED', 'COMPLETED', @@ -175,7 +175,7 @@ class Slurm extends Shredder * * @var string[] */ - private static $nonEndedJobStates = [ + protected static $nonEndedJobStates = [ 'PENDING', 'RUNNING', 'REQUEUED', @@ -188,7 +188,7 @@ class Slurm extends Shredder * * @var string[] */ - private static $unknownJobStates = []; + protected static $unknownJobStates = []; /** * Time zone used when parsing datetimes. diff --git a/classes/OpenXdmod/Shredder/Slurmjson.php b/classes/OpenXdmod/Shredder/Slurmjson.php new file mode 100644 index 0000000000..2f495533a7 --- /dev/null +++ b/classes/OpenXdmod/Shredder/Slurmjson.php @@ -0,0 +1,292 @@ +logger->notice("Shredding file '$file'"); + + if (!is_file($file)) { + $this->logger->err("'$file' is not a file"); + return false; + } + + $contents = file_get_contents($file); + + if ($contents === false) { + throw new Exception("Failed to open file '$file'"); + } + + $data = json_decode($contents); + + if ($data === null || !isset($data->jobs)) { + $this->logger->err("'$file' does not contain valid json"); + return false; + } + + $recordCount = 0; + $duplicateCount = 0; + + $this->logger->info('Starting database transaction'); + $this->db->beginTransaction(); + + try { + foreach($data->jobs as $jobrecord) { + $job = $this->parseJobRecord($jobrecord); + + if ($job === null) { + continue; + } + + $recordCount++; + + try { + $this->insertRow($job); + } catch (\PDOException $e) { + + // Ignore duplicate key errors. + if ($e->getCode() == 23000) { + $msg = 'Skipping duplicate data: ' . $e->getMessage(); + $this->logger->debug(array( 'message' => $msg, 'file' => $file)); + $duplicateCount++; + continue; + } else { + throw $e; + } + } + } + $this->logger->info('Committing database transaction'); + $this->db->commit(); + + if ($duplicateCount > 0) { + $msg = "Skipped $duplicateCount duplicate records"; + $this->logger->info($msg); + } + + } catch (Exception $e) { + $this->logger->info('Rolling back database transaction'); + $this->db->rollBack(); + + $msg = sprintf( + 'Failed to shred file "%s": %s', + $file, + $e->getMessage() + ); + + throw new Exception($msg, 0, $e); + } + + return $recordCount; + } + + /** + * helper function to get a a TRES value from a slurm json accounting + * record. This follows a similar algorithm used by sacct. If the job + * has been allocated resource then the allocated value is used, otherwise + * the requested value is used. 0 is returned if no data found. + */ + function getTresValue($jobrecord, $rtype, $rname = null) + { + if (!isset($jobrecord->tres)) { + return 0; + } + + if (isset($jobrecord->tres->allocated)) { + foreach($jobrecord->tres->allocated as $record) { + if ($record->type == $rtype) { + if ($rname === null || $record->name == $rname) { + return $record->count; + } + } + } + } + + if (isset($jobrecord->tres->requested)) { + foreach($jobrecord->tres->requested as $record) { + if ($record->type == $rtype) { + if ($rname === null || $record->name == $rname) { + return $record->count; + } + } + } + } + + return 0; + } + + function getTimeLimit($jobrecord) { + if (isset($jobrecord->time->limit)) { + if (isset($jobrecord->time->limit->number)) { + return $jobrecord->time->limit->number; + } + return $jobrecord->time->limit; + } + return 0; + } + + function getJobId($jobrecord) { + if (isset($jobrecord->array) && isset($jobrecord->array->job_id) && $jobrecord->array->job_id != 0) + { + $array_index = $jobrecord->array->task_id->number ?? $jobrecord->array->task_id; + + return array($jobrecord->array->job_id, $array_index); + } + + return array($jobrecord->job_id, -1); + } + + function getStartTime($jobrecord) { + $start_ts = null; + + foreach($jobrecord->steps as $step) { + $ts1 = $step->time->start->number ?? $step->time->start; + + if ($start_ts == null) { + $start_ts = $ts1; + } else { + $start_ts = min($start_ts, $ts1); + } + } + + if ($start_ts === null) { + if ($jobrecord->time->elapsed == 0) { + $start_ts = $jobrecord->time->end; + } else { + $start_ts = $jobrecord->time->start; + } + } + + return $start_ts; + } + + function getExitCode($jobrecord) { + + $state = $jobrecord->state->current; + + if ($state == 'FAILED') { + return "1:0"; + } + + $derived = $jobrecord->derived_exit_code; + $return_code = $derived->return_code->number ?? $derived->return_code; + + if (isset($derived->signal)) { + $signal = $derived->signal->id->number ?? $derived->signal->signal_id; + } else { + $signal = 0; + } + + if ($return_code === null) { + $return_code = $signal; + $signal = 0; + } + + return "$return_code:$signal"; + } + + function parseJobRecord($jobrecord) { + + // Skip jobs that haven't ended. + if ($jobrecord->time->end == 0) { + $this->logger->debug('Skipping job with unknown end time'); + return null; + } + + // Skip jobs that have no nodes assigned. + if ($jobrecord->nodes == 'None assigned') { + $this->logger->debug('Skipping job with no nodes assigned'); + return null; + } + + $jobState = $jobrecord->state->current; + + if (!in_array($jobState, self::$endedJobStates)) { + if (in_array($jobState, self::$nonEndedJobStates)) { + $this->logger->debug( + sprintf( + 'Skipping job with non-ended state "%s"', + $jobState + ) + ); + return null; + } + + // Warn about an unknown job state the first time it is + // encountered. + if (!in_array($jobState, self::$unknownJobStates)) { + $this->logger->warning( + sprintf( + 'Found job with unknown state "%s", ' + . 'all jobs with this state will be ignored', + $jobState + ) + ); + self::$unknownJobStates[] = $jobState; + } + $this->logger->debug( + sprintf('Skipping job with unknown state "%s"', $jobState) + ); + return null; + } + + list($local_job_id, $local_job_array_index) = $this->getJobId($jobrecord); + + $job = array( + 'job_id' => $local_job_id, + 'job_array_index' => $local_job_array_index, + 'job_id_raw' => $jobrecord->job_id, + 'cluster_name' => $jobrecord->cluster, + 'partition_name' => $jobrecord->partition, + 'qos_name' => $jobrecord->qos, + 'account_name' => $jobrecord->account, + 'group_name' => $jobrecord->group, + 'gid_number' => -1, + 'user_name' => $jobrecord->user, + 'uid_number' => -1, + 'submit_time' => $jobrecord->time->submission, + 'eligible_time' => $jobrecord->time->eligible, + 'start_time' => $this->getStartTime($jobrecord), + 'end_time' => $jobrecord->time->end, + 'elapsed' => $jobrecord->time->elapsed, + 'exit_code' => $this->getExitCode($jobrecord), + 'state' => $jobState, + 'nnodes' => $this->getTresValue($jobrecord, 'node'), + 'ncpus' => $this->getTresValue($jobrecord, 'cpu'), + 'ngpus' => $this->getTresValue($jobrecord, 'gres', 'gpu'), + 'req_cpus' => $jobrecord->required->CPUs, + 'req_mem' => $this->getTresValue($jobrecord, 'mem') * 1024 * 1024, + 'timelimit' => $this->getTimeLimit($jobrecord), + 'node_list' => $jobrecord->nodes, + 'job_name' => $jobrecord->name + ); + + $this->logger->debug(json_encode($job)); + + return $job; + } +} From 9bfcf717f34836ce57507e313ea6fa5c5d912db9 Mon Sep 17 00:00:00 2001 From: Joseph White Date: Wed, 15 Jan 2025 16:59:29 -0500 Subject: [PATCH 2/4] Fix resource population --- classes/OpenXdmod/Shredder/Slurmjson.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/classes/OpenXdmod/Shredder/Slurmjson.php b/classes/OpenXdmod/Shredder/Slurmjson.php index 2f495533a7..a81df046d5 100644 --- a/classes/OpenXdmod/Shredder/Slurmjson.php +++ b/classes/OpenXdmod/Shredder/Slurmjson.php @@ -260,7 +260,7 @@ function parseJobRecord($jobrecord) { 'job_id' => $local_job_id, 'job_array_index' => $local_job_array_index, 'job_id_raw' => $jobrecord->job_id, - 'cluster_name' => $jobrecord->cluster, + 'cluster_name' => $this->getResource(), 'partition_name' => $jobrecord->partition, 'qos_name' => $jobrecord->qos, 'account_name' => $jobrecord->account, From 3de89ab005266a431b88d36f9881e8c78002c54a Mon Sep 17 00:00:00 2001 From: Joseph White Date: Wed, 15 Jan 2025 18:38:43 -0500 Subject: [PATCH 3/4] Job State is an array in the Delta data. --- classes/OpenXdmod/Shredder/Slurmjson.php | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/classes/OpenXdmod/Shredder/Slurmjson.php b/classes/OpenXdmod/Shredder/Slurmjson.php index a81df046d5..693f4c2ab5 100644 --- a/classes/OpenXdmod/Shredder/Slurmjson.php +++ b/classes/OpenXdmod/Shredder/Slurmjson.php @@ -186,7 +186,7 @@ function getStartTime($jobrecord) { function getExitCode($jobrecord) { - $state = $jobrecord->state->current; + $state = $this->getJobState($jobrecord); if ($state == 'FAILED') { return "1:0"; @@ -209,6 +209,14 @@ function getExitCode($jobrecord) { return "$return_code:$signal"; } + function getJobState($jobrecord) { + if (is_array($jobrecord->state->current)) { + return $jobrecord->state->current[0]; + } + + return $jobrecord->state->current; + } + function parseJobRecord($jobrecord) { // Skip jobs that haven't ended. @@ -223,7 +231,7 @@ function parseJobRecord($jobrecord) { return null; } - $jobState = $jobrecord->state->current; + $jobState = $this->getJobState($jobrecord); if (!in_array($jobState, self::$endedJobStates)) { if (in_array($jobState, self::$nonEndedJobStates)) { From df392aa1ca1c54d45e78473d54e17e30cf866d8c Mon Sep 17 00:00:00 2001 From: Joseph White Date: Tue, 25 Mar 2025 17:37:01 -0400 Subject: [PATCH 4/4] Add function visibility declarations --- classes/OpenXdmod/Shredder/Slurmjson.php | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/classes/OpenXdmod/Shredder/Slurmjson.php b/classes/OpenXdmod/Shredder/Slurmjson.php index 693f4c2ab5..60c7a5824c 100644 --- a/classes/OpenXdmod/Shredder/Slurmjson.php +++ b/classes/OpenXdmod/Shredder/Slurmjson.php @@ -110,7 +110,7 @@ public function shredFile($file) * has been allocated resource then the allocated value is used, otherwise * the requested value is used. 0 is returned if no data found. */ - function getTresValue($jobrecord, $rtype, $rname = null) + private function getTresValue($jobrecord, $rtype, $rname = null) { if (!isset($jobrecord->tres)) { return 0; @@ -139,7 +139,7 @@ function getTresValue($jobrecord, $rtype, $rname = null) return 0; } - function getTimeLimit($jobrecord) { + private function getTimeLimit($jobrecord) { if (isset($jobrecord->time->limit)) { if (isset($jobrecord->time->limit->number)) { return $jobrecord->time->limit->number; @@ -149,7 +149,7 @@ function getTimeLimit($jobrecord) { return 0; } - function getJobId($jobrecord) { + private function getJobId($jobrecord) { if (isset($jobrecord->array) && isset($jobrecord->array->job_id) && $jobrecord->array->job_id != 0) { $array_index = $jobrecord->array->task_id->number ?? $jobrecord->array->task_id; @@ -160,7 +160,7 @@ function getJobId($jobrecord) { return array($jobrecord->job_id, -1); } - function getStartTime($jobrecord) { + private function getStartTime($jobrecord) { $start_ts = null; foreach($jobrecord->steps as $step) { @@ -184,7 +184,7 @@ function getStartTime($jobrecord) { return $start_ts; } - function getExitCode($jobrecord) { + private function getExitCode($jobrecord) { $state = $this->getJobState($jobrecord); @@ -209,7 +209,7 @@ function getExitCode($jobrecord) { return "$return_code:$signal"; } - function getJobState($jobrecord) { + private function getJobState($jobrecord) { if (is_array($jobrecord->state->current)) { return $jobrecord->state->current[0]; } @@ -217,7 +217,7 @@ function getJobState($jobrecord) { return $jobrecord->state->current; } - function parseJobRecord($jobrecord) { + private function parseJobRecord($jobrecord) { // Skip jobs that haven't ended. if ($jobrecord->time->end == 0) {