@@ -570,6 +570,8 @@ def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
570
570
return self ._describe_sacct (app_id )
571
571
572
572
def _describe_sacct (self , app_id : str ) -> Optional [DescribeAppResponse ]:
573
+ # NOTE: Handles multiple job ID formats due to SLURM version differences.
574
+ # Heterogeneous jobs report JobID as "<job>+<component>", while job steps of a regular job appear as "<job>.<step>" (e.g. ".batch", ".0").
573
575
try :
574
576
output = subprocess .check_output (
575
577
["sacct" , "--parsable2" , "-j" , app_id ],
@@ -594,15 +596,27 @@ def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
594
596
msg = ""
595
597
app_state = AppState .UNKNOWN
596
598
for row in reader :
597
- job_id , * parts = row ["JobID" ].split ("+" )
599
+ # Handle both "+" (heterogeneous) and "." (regular) job ID formats
600
+ job_id_full = row ["JobID" ]
601
+
602
+ # Split on "+" when present (heterogeneous job component); otherwise split on "." so job steps (.batch, .0, ...) can be detected
603
+ if "+" in job_id_full :
604
+ job_id , * parts = job_id_full .split ("+" )
605
+ is_subjob = len (parts ) > 0 and "." in parts [0 ]
606
+ else :
607
+ job_id , * parts = job_id_full .split ("." )
608
+ is_subjob = len (parts ) > 0
609
+
598
610
if job_id != app_id :
599
611
continue
600
- if len (parts ) > 0 and "." in parts [0 ]:
601
- # we only care about the worker not the child jobs
612
+
613
+ if is_subjob :
614
+ # we only care about the main job not the child jobs (.batch, .0, etc.)
602
615
continue
603
616
604
- state = row ["State" ]
605
- msg = state
617
+ msg = row ["State" ]
618
+ # Remove truncation indicator (CANCELLED+) and extract base state from verbose formats
619
+ state = msg .split ()[0 ].rstrip ("+" )
606
620
app_state = appstate_from_slurm_state (state )
607
621
608
622
role , _ , replica_id = row ["JobName" ].rpartition ("-" )
@@ -629,6 +643,9 @@ def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
629
643
)
630
644
631
645
def _describe_squeue (self , app_id : str ) -> Optional [DescribeAppResponse ]:
646
+ # NOTE: This method contains multiple compatibility checks for different SLURM versions
647
+ # due to API format changes across versions (20.02, 23.02, 24.05, 24.11+).
648
+
632
649
# squeue errors out with 'slurm_load_jobs error: Invalid job id specified'
633
650
# if the job does not exist or is finished (e.g. not in PENDING or RUNNING state)
634
651
output = subprocess .check_output (
@@ -670,7 +687,18 @@ def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
670
687
if state == AppState .PENDING :
671
688
# NOTE: torchx launched jobs points to exactly one host
672
689
# otherwise, scheduled_nodes could be a node list expression (eg. 'slurm-compute-node[0-20,21,45-47]')
673
- hostname = job_resources .get ("scheduled_nodes" , "" )
690
+
691
+ # SLURM 24.11.5+ returns job_resources=None for pending jobs (issue #1101)
692
+ if job_resources is not None :
693
+ hostname = job_resources .get ("scheduled_nodes" , "" )
694
+ # If scheduled_nodes not found in job_resources, try nodes.list
695
+ if not hostname and "nodes" in job_resources :
696
+ nodes_info = job_resources .get ("nodes" , {})
697
+ if isinstance (nodes_info , dict ):
698
+ hostname = nodes_info .get ("list" , "" )
699
+ else :
700
+ # For pending jobs where job_resources is None, check top-level fields
701
+ hostname = job .get ("nodes" , "" ) or job .get ("scheduled_nodes" , "" )
674
702
675
703
role .num_replicas += 1
676
704
role_status .replicas .append (
@@ -686,24 +714,35 @@ def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
686
714
# where each replica is a "sub-job" so `allocated_nodes` will always be 1
687
715
# but we deal with jobs that have not been launched with torchx
688
716
# which can have multiple hosts per sub-job (count them as replicas)
689
- node_infos = job_resources .get ("allocated_nodes" , [])
717
+ nodes_data = job_resources .get ("nodes" , {})
718
+
719
+ # SLURM 24.11+ changed from allocated_nodes to nodes.allocation structure
720
+ if "allocation" in nodes_data and isinstance (
721
+ nodes_data ["allocation" ], list
722
+ ):
723
+ # SLURM 24.11+ format: nodes.allocation is a list
724
+ for node_info in nodes_data ["allocation" ]:
725
+ hostname = node_info ["name" ]
726
+ cpu = int (node_info ["cpus" ]["used" ])
727
+ memMB = (
728
+ int (node_info ["memory" ]["allocated" ]) // 1024
729
+ ) # Convert to MB
690
730
691
- if not isinstance (node_infos , list ):
692
- # NOTE: in some versions of slurm jobs[].job_resources.allocated_nodes
693
- # is not a list of individual nodes, but a map of the nodelist specs
694
- # in this case just use jobs[].job_resources.nodes
695
- hostname = job_resources .get ("nodes" )
696
- role .num_replicas += 1
697
- role_status .replicas .append (
698
- ReplicaStatus (
699
- id = int (replica_id ),
700
- role = role_name ,
701
- state = state ,
702
- hostname = hostname ,
731
+ role .resource = Resource (cpu = cpu , memMB = memMB , gpu = - 1 )
732
+ role .num_replicas += 1
733
+ role_status .replicas .append (
734
+ ReplicaStatus (
735
+ id = int (replica_id ),
736
+ role = role_name ,
737
+ state = state ,
738
+ hostname = hostname ,
739
+ )
703
740
)
704
- )
705
- else :
706
- for node_info in node_infos :
741
+ elif "allocated_nodes" in job_resources and isinstance (
742
+ job_resources ["allocated_nodes" ], list
743
+ ):
744
+ # Legacy format: allocated_nodes is a list
745
+ for node_info in job_resources ["allocated_nodes" ]:
707
746
# NOTE: we expect resource specs for all the nodes to be the same
708
747
# NOTE: use allocated (not used/requested) memory since
709
748
# users may only specify --cpu, in which case slurm
@@ -726,6 +765,26 @@ def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
726
765
hostname = hostname ,
727
766
)
728
767
)
768
+ else :
769
+ # Fallback: "nodes" may be a plain nodelist string or a dict with a "list" entry; otherwise use an empty hostname
770
+ if isinstance (nodes_data , str ):
771
+ hostname = nodes_data
772
+ else :
773
+ hostname = (
774
+ nodes_data .get ("list" , "" )
775
+ if isinstance (nodes_data , dict )
776
+ else ""
777
+ )
778
+
779
+ role .num_replicas += 1
780
+ role_status .replicas .append (
781
+ ReplicaStatus (
782
+ id = int (replica_id ),
783
+ role = role_name ,
784
+ state = state ,
785
+ hostname = hostname ,
786
+ )
787
+ )
729
788
730
789
return DescribeAppResponse (
731
790
app_id = app_id ,
0 commit comments