diff --git a/gcm/monitoring/slurm/parsing.py b/gcm/monitoring/slurm/parsing.py index e37a40b..1bf62d4 100644 --- a/gcm/monitoring/slurm/parsing.py +++ b/gcm/monitoring/slurm/parsing.py @@ -278,3 +278,50 @@ def parse_scontrol_maxnodes(v: str) -> int: def parse_job_ids(s: str) -> list[str]: """Given a comma separated string of job ids, return a list of job ids.""" return s.split(",") if s else [] + + +def parse_gres_gpu_indices(v: str) -> str | None: + """Parse gres_detail to extract GPU indices for single-node jobs. + + The input is a comma-joined string of gres_detail entries from the SLURM REST + API (joined by _map_job_fields). Each entry looks like "gpu:ampere:1(IDX:7)" + or "gpu:ampere:4(IDX:0-3)". + + Returns a comma-separated string of GPU indices (e.g., "7" or "0,1,2,3") for + single-node jobs. Returns None for multi-node jobs (multiple IDX entries) or + parse failures. + + Examples: + + >>> parse_gres_gpu_indices("gpu:ampere:1(IDX:7)") + '7' + >>> parse_gres_gpu_indices("gpu:ampere:3(IDX:0,3,5)") + '0,3,5' + >>> parse_gres_gpu_indices("gpu:ampere:4(IDX:0-3)") + '0,1,2,3' + >>> parse_gres_gpu_indices("gpu:ampere:8(IDX:0-7)") + '0,1,2,3,4,5,6,7' + >>> parse_gres_gpu_indices("gpu:ampere:8(IDX:0-7),gpu:ampere:8(IDX:0-7)") + >>> parse_gres_gpu_indices("") + >>> parse_gres_gpu_indices("(null)") + """ + if not v or v in {"N/A", "(null)", "[]"}: + return None + + idx_matches = re.findall(r"IDX:([0-9,\-]+)", v) + if len(idx_matches) != 1: + # Multi-node (multiple IDX entries) or no IDX found + return None + + indices: list[int] = [] + for part in idx_matches[0].split(","): + if "-" in part: + start_s, end_s = part.split("-", 1) + indices.extend(range(int(start_s), int(end_s) + 1)) + else: + indices.append(int(part)) + + if not indices: + return None + + return ",".join(str(i) for i in sorted(indices)) diff --git a/gcm/schemas/slurm/squeue.py b/gcm/schemas/slurm/squeue.py index 212552b..26e7980 100644 --- a/gcm/schemas/slurm/squeue.py +++ 
@pytest.mark.parametrize(
    "s, expected",
    [
        # --- Single-node allocations: exactly one IDX entry ---
        # Single GPU (1-GPU job)
        ("gpu:ampere:1(IDX:7)", "7"),
        # Multiple specific GPUs
        ("gpu:ampere:3(IDX:0,3,5)", "0,3,5"),
        # Indices given out of order are returned sorted
        ("gpu:ampere:3(IDX:5,0,3)", "0,3,5"),
        # Range notation
        ("gpu:ampere:4(IDX:0-3)", "0,1,2,3"),
        # Mixed range and specific
        ("gpu:ampere:5(IDX:0-2,5,7)", "0,1,2,5,7"),
        # GRES entry without a type component
        ("gpu:2(IDX:0-1)", "0,1"),
        # Full node (8 GPUs) — still returns indices (caller decides whether to filter)
        ("gpu:ampere:8(IDX:0-7)", "0,1,2,3,4,5,6,7"),
        # 16-GPU node — partial allocation (works for nodes with >8 GPUs)
        ("gpu:ampere:10(IDX:0-9)", "0,1,2,3,4,5,6,7,8,9"),
        # 16-GPU node — full allocation
        ("gpu:ampere:16(IDX:0-15)", "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15"),
        # --- Multi-node allocations: multiple IDX entries, unsupported ---
        (
            "gpu:ampere:8(IDX:0-7),gpu:ampere:8(IDX:0-7)",
            None,
        ),
        # Multi-node partial GPUs. Also covers the comma-join ambiguity: IDX
        # commas inside () are safely delimited by ), so two entries are seen.
        (
            "gpu:ampere:3(IDX:0,3,5),gpu:ampere:3(IDX:1,4,7)",
            None,
        ),
        # --- Inputs carrying no usable index information ---
        # Empty string
        ("", None),
        # SLURM null values
        ("(null)", None),
        ("N/A", None),
        # Empty array representation
        ("[]", None),
        # No IDX in the string
        ("gpu:ampere:8", None),
        # IDX present but without numeric indices
        ("gpu:ampere:1(IDX:N/A)", None),
    ],
)
@typechecked
def test_parse_gres_gpu_indices(s: str, expected: str | None) -> None:
    """Check parse_gres_gpu_indices against single-node, multi-node and empty inputs."""
    assert parse_gres_gpu_indices(s) == expected