From 70ebd3b0ea4335534062c5a9389927a6033e367d Mon Sep 17 00:00:00 2001
From: Luigi Pertoldi
Date: Sun, 25 Jan 2026 23:51:47 +0100
Subject: [PATCH 1/8] add small program to parallelize snakemake across nodes
 on NERSC

---
 pyproject.toml                      |  3 +
 workflow/Snakefile                  |  2 +
 workflow/src/legendsimflow/nersc.py | 95 ++++++++++++++++++++++++++++-
 workflow/src/legendsimflow/utils.py | 22 ++++++-
 4 files changed, 119 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e6b903d9..b8d1816f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -88,6 +88,9 @@ all = [
   "pre-commit",
 ]
 
+[project.scripts]
+snakemake-nersc = "legendsimflow.nersc:snakemake_nersc_cli"
+
 [tool.uv.workspace]
 exclude = ["generated", "inputs", "software", "workflow"]
 
diff --git a/workflow/Snakefile b/workflow/Snakefile
index cc418ba7..445a5f42 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -72,6 +72,8 @@ def gen_target_all():
     if config.get("simlist", "*") in ("all", "*"):
         if "pdf" in make_tiers:
             return rules.gen_pdf_release.input
+        elif "cvt" in make_tiers:
+            return rules.gen_all_tier_cvt.input
         elif "evt" in make_tiers:
             return rules.gen_all_tier_evt.input
         elif "hit" in make_tiers:
diff --git a/workflow/src/legendsimflow/nersc.py b/workflow/src/legendsimflow/nersc.py
index c6ebfe4a..3a19bc5a 100644
--- a/workflow/src/legendsimflow/nersc.py
+++ b/workflow/src/legendsimflow/nersc.py
@@ -1,16 +1,109 @@
 from __future__ import annotations
 
+import argparse
 import re
+import subprocess
 from collections.abc import Iterable
 from pathlib import Path
 
+import yaml
+from dbetto import AttrsDict
+from legenddataflowscripts.workflow.utils import subst_vars
+from legendmeta import LegendMetadata
 from snakemake.io import InputFiles
 from snakemake.script import Snakemake
 
-from . import SimflowConfig
+from . import SimflowConfig, aggregate
 from .exceptions import SimflowConfigError
 
 
+def _partition(xs, n):
+    k, r = divmod(len(xs), n)
+    out, i = [], 0
+    for j in range(n):
+        s = k + (j < r)
+        out.append(xs[i : i + s])
+        i += s
+    return out
+
+
+def snakemake_nersc_cli():
+    parser = argparse.ArgumentParser(
+        description="Execute the Simflow on multiple nodes in parallel."
+    )
+    parser.add_argument(
+        "-N", "--nodes", type=int, required=True, help="number of nodes"
+    )
+    args, extra = parser.parse_known_args()
+
+    if args.nodes < 2:
+        msg = "must parallelize over at least 2 nodes"
+        raise ValueError(msg)
+
+    cfg_path = Path("./simflow-config.yaml")
+    if not cfg_path.is_file():
+        msg = "this program must be executed in the directory where simflow-config.yaml resides"
+        raise RuntimeError(msg)
+
+    with cfg_path.open("r") as f:
+        config = yaml.safe_load(f)
+
+    subst_vars(
+        config,
+        var_values={"_": Path().resolve()},
+        use_env=True,
+        ignore_missing=False,
+    )
+    config = AttrsDict(config)
+
+    # NOTE: this will attempt a clone of legend-metadata, if the directory does not exist
+    metadata = LegendMetadata(config.paths.metadata, lazy=True)
+
+    if "legend_metadata_version" in config:
+        metadata.checkout(config.legend_metadata_version)
+
+    config["metadata"] = metadata
+
+    simlist = config.get("simlist", None)
+    make_tiers = config.make_tiers
+    if simlist is None:
+        # auto determine tier from config
+        tiers = ("pdf", "cvt", "evt", "hit", "opt", "stp")
+        tier = next(t for t in tiers if t in make_tiers)
+
+        simlist = [
+            f"{tier}.{simid}" for simid in aggregate.gen_list_of_all_simids(config)
+        ]
+
+    procs = []
+    for simlist_chunk in _partition(simlist, args.nodes):
+        smk_cmd = [
+            "srun",
+            "--nodes",
+            "1",
+            "--ntasks",
+            "1",
+            "--cpus-per-task",
+            "256",
+            "snakemake",
+            "--config",
+            "simlist=" + ",".join(simlist_chunk),
+            "--nolock",
+            *extra,
+        ]
+
+        print("INFO: spawning process:", " ".join(smk_cmd))  # noqa: T201
+        procs.append(subprocess.Popen(smk_cmd))
+
+    for p in procs:
+        rc = p.wait()
+        if rc != 0:
+            msg = f"process failed: {p.args}"
+            raise RuntimeError(msg)
+
+    print("INFO: all snakemake processes successfully returned")  # noqa: T201
+
+
 def dvs_ro(
     config: SimflowConfig, path: str | Path | Iterable[str | Path]
 ) -> str | Path | list[str | Path]:
diff --git a/workflow/src/legendsimflow/utils.py b/workflow/src/legendsimflow/utils.py
index 9f73197c..586328de 100644
--- a/workflow/src/legendsimflow/utils.py
+++ b/workflow/src/legendsimflow/utils.py
@@ -49,7 +49,7 @@ def _merge_defaults(user: dict, default: dict) -> dict:
     return result
 
 
-def init_simflow_context(raw_config: dict, workflow) -> AttrsDict:
+def init_simflow_context(raw_config: dict, workflow=None) -> AttrsDict:
     """Pre-process and sanitize the Simflow configuration.
 
     - set default configuration fields;
@@ -61,6 +61,15 @@ def init_simflow_context(raw_config: dict, workflow) -> AttrsDict:
         configuration;
     - export important environment variables.
 
+    Parameters
+    ----------
+    raw_config
+        the raw Simflow configuration dictionary.
+    workflow
+        Snakemake workflow instance. If None, occurrences of ``$_`` in the
+        configuration will be replaced with the path to the current working
+        directory.
+
     Returns a dictionary with useful objects to be used in the Simflow
     Snakefiles (i.e. the "context").
     """
@@ -73,7 +82,16 @@ def init_simflow_context(raw_config: dict, workflow) -> AttrsDict:
         {"benchmark": {"enabled": False}, "nersc": {"dvs_ro": False, "scratch": False}},
     )
 
-    ldfs.workflow.utils.subst_vars_in_snakemake_config(workflow, raw_config)
+    if workflow is None:
+        ldfs.subst_vars(
+            raw_config,
+            var_values={"_": Path().resolve()},
+            use_env=True,
+            ignore_missing=False,
+        )
+    else:
+        ldfs.workflow.utils.subst_vars_in_snakemake_config(workflow, raw_config)
+
     config = AttrsDict(raw_config)
 
     # convert all strings in the "paths" block to pathlib.Path

From efe17647ef6a4b12dfd98d1f04283f5dcc3bc020 Mon Sep 17 00:00:00 2001
From: Luigi Pertoldi
Date: Mon, 26 Jan 2026 11:48:15 +0100
Subject: [PATCH 2/8] nersc: remove nolock and add workflow profile

---
 workflow/src/legendsimflow/nersc.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/workflow/src/legendsimflow/nersc.py b/workflow/src/legendsimflow/nersc.py
index 3a19bc5a..13497bce 100644
--- a/workflow/src/legendsimflow/nersc.py
+++ b/workflow/src/legendsimflow/nersc.py
@@ -86,9 +86,10 @@ def snakemake_nersc_cli():
             "--cpus-per-task",
             "256",
             "snakemake",
+            "--worflow-profile",
+            "workflow/profiles/nersc",
             "--config",
             "simlist=" + ",".join(simlist_chunk),
-            "--nolock",
             *extra,
         ]
 

From cec38cf6dcdad23e55e98b4f70225af3c48ec9dd Mon Sep 17 00:00:00 2001
From: Luigi Pertoldi
Date: Mon, 26 Jan 2026 12:02:56 +0100
Subject: [PATCH 3/8] snakemake-nersc: handle signals

---
 workflow/src/legendsimflow/nersc.py | 47 ++++++++++++++++++-----
 1 file changed, 39 insertions(+), 8 deletions(-)

diff --git a/workflow/src/legendsimflow/nersc.py b/workflow/src/legendsimflow/nersc.py
index 13497bce..cfbf8b84 100644
--- a/workflow/src/legendsimflow/nersc.py
+++ b/workflow/src/legendsimflow/nersc.py
@@ -2,6 +2,7 @@
 
 import argparse
 import re
+import signal
 import subprocess
 from collections.abc import Iterable
 from pathlib import Path
@@ -34,6 +35,11 @@ def snakemake_nersc_cli():
     parser.add_argument(
         "-N", "--nodes", type=int, required=True, help="number of nodes"
     )
+    parser.add_argument(
+        "--without-srun",
+        action="store_true",
+        help="do not prefix the snakemake call with 'srun ...'",
+    )
     args, extra = parser.parse_known_args()
 
     if args.nodes < 2:
@@ -78,24 +84,49 @@ def snakemake_nersc_cli():
     procs = []
     for simlist_chunk in _partition(simlist, args.nodes):
         smk_cmd = [
-            "srun",
-            "--nodes",
-            "1",
-            "--ntasks",
-            "1",
-            "--cpus-per-task",
-            "256",
             "snakemake",
-            "--worflow-profile",
+            "--workflow-profile",
             "workflow/profiles/nersc",
             "--config",
             "simlist=" + ",".join(simlist_chunk),
             *extra,
         ]
+        if not args.without_srun:
+            smk_cmd = [
+                "srun",
+                "--disable-status",  # otherwise SIGINT has no effect
+                "--nodes",
+                "1",
+                "--ntasks",
+                "1",
+                "--cpus-per-task",
+                "256",
+                *smk_cmd,
+            ]
 
         print("INFO: spawning process:", " ".join(smk_cmd))  # noqa: T201
         procs.append(subprocess.Popen(smk_cmd))
 
+    # propagate signals to the snakemake instances.
+    def new_signal_handler(sig: int, _):
+        for p in procs:
+            p.send_signal(sig)
+
+    signals = [
+        signal.SIGHUP,
+        signal.SIGINT,
+        signal.SIGQUIT,
+        signal.SIGTERM,
+        signal.SIGTSTP,  # SIGSTOP cannot be caught, and will do nothing...
+        signal.SIGCONT,
+        signal.SIGUSR1,
+        signal.SIGUSR2,
+        signal.SIGWINCH,
+    ]
+
+    for sig in signals:
+        signal.signal(sig, new_signal_handler)
+
     for p in procs:
         rc = p.wait()
         if rc != 0:

From 51d14a0f1d9cf6e2ac0a461a7c86849bd8510165 Mon Sep 17 00:00:00 2001
From: Luigi Pertoldi
Date: Mon, 26 Jan 2026 12:22:31 +0100
Subject: [PATCH 4/8] snakemake-nersc: randomly shuffle the simlist

---
 workflow/src/legendsimflow/nersc.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/workflow/src/legendsimflow/nersc.py b/workflow/src/legendsimflow/nersc.py
index cfbf8b84..6e1125c8 100644
--- a/workflow/src/legendsimflow/nersc.py
+++ b/workflow/src/legendsimflow/nersc.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import argparse
+import random
 import re
 import signal
 import subprocess
 from collections.abc import Iterable
@@ -81,6 +82,11 @@ def snakemake_nersc_cli():
             f"{tier}.{simid}" for simid in aggregate.gen_list_of_all_simids(config)
         ]
 
+    # trick: there won't be anything to do for some simids (targets already
+    # done), this could result in a very inefficient partitioning. as a
+    # mitigation, we randomly shuffle the simlist first
+    random.shuffle(simlist)
+
     procs = []
     for simlist_chunk in _partition(simlist, args.nodes):
         smk_cmd = [

From a381204cfd87733235c63abba15071b847c057a3 Mon Sep 17 00:00:00 2001
From: Luigi Pertoldi
Date: Tue, 27 Jan 2026 11:25:27 +0100
Subject: [PATCH 5/8] remage: overwrite existing output files

---
 workflow/src/legendsimflow/commands.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/workflow/src/legendsimflow/commands.py b/workflow/src/legendsimflow/commands.py
index 68634394..fa0b9a3b 100644
--- a/workflow/src/legendsimflow/commands.py
+++ b/workflow/src/legendsimflow/commands.py
@@ -148,6 +148,7 @@ def remage_run(
         *remage_exe,
         "--ignore-warnings",
         "--merge-output-files",
+        "--overwrite",
        "--log-level=detail",
         "--procs",
         str(procs),

From b95b24366873a34f91fa202db2e27c3ef3ba666a Mon Sep 17 00:00:00 2001
From: Luigi Pertoldi
Date: Tue, 27 Jan 2026 11:41:46 +0100
Subject: [PATCH 6/8] move snakemake_nersc_cli to own module

---
 pyproject.toml                      |   2 +-
 workflow/src/legendsimflow/cli.py   | 137 ++++++++++++++++++++++++++++
 workflow/src/legendsimflow/nersc.py | 133 +--------------------------
 3 files changed, 139 insertions(+), 133 deletions(-)
 create mode 100644 workflow/src/legendsimflow/cli.py

diff --git a/pyproject.toml b/pyproject.toml
index b8d1816f..e8ca7805 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -89,7 +89,7 @@ all = [
 ]
 
 [project.scripts]
-snakemake-nersc = "legendsimflow.nersc:snakemake_nersc_cli"
+snakemake-nersc = "legendsimflow.cli:snakemake_nersc_cli"
 
 [tool.uv.workspace]
 exclude = ["generated", "inputs", "software", "workflow"]
diff --git a/workflow/src/legendsimflow/cli.py b/workflow/src/legendsimflow/cli.py
new file mode 100644
index 00000000..55679579
--- /dev/null
+++ b/workflow/src/legendsimflow/cli.py
@@ -0,0 +1,137 @@
+from __future__ import annotations
+
+import argparse
+import random
+import signal
+import subprocess
+from pathlib import Path
+
+import yaml
+from dbetto import AttrsDict
+from legenddataflowscripts.workflow.utils import subst_vars
+from legendmeta import LegendMetadata
+
+from . import aggregate
+
+
+def _partition(xs, n):
+    k, r = divmod(len(xs), n)
+    out, i = [], 0
+    for j in range(n):
+        s = k + (j < r)
+        out.append(xs[i : i + s])
+        i += s
+    return out
+
+
+def snakemake_nersc_cli():
+    parser = argparse.ArgumentParser(
+        description="Execute the Simflow on multiple nodes in parallel."
+    )
+    parser.add_argument(
+        "-N", "--nodes", type=int, required=True, help="number of nodes"
+    )
+    parser.add_argument(
+        "--without-srun",
+        action="store_true",
+        help="do not prefix the snakemake call with 'srun ...'",
+    )
+    args, extra = parser.parse_known_args()
+
+    if args.nodes < 2:
+        msg = "must parallelize over at least 2 nodes"
+        raise ValueError(msg)
+
+    cfg_path = Path("./simflow-config.yaml")
+    if not cfg_path.is_file():
+        msg = "this program must be executed in the directory where simflow-config.yaml resides"
+        raise RuntimeError(msg)
+
+    with cfg_path.open("r") as f:
+        config = yaml.safe_load(f)
+
+    subst_vars(
+        config,
+        var_values={"_": Path().resolve()},
+        use_env=True,
+        ignore_missing=False,
+    )
+    config = AttrsDict(config)
+
+    # NOTE: this will attempt a clone of legend-metadata, if the directory does not exist
+    metadata = LegendMetadata(config.paths.metadata, lazy=True)
+
+    if "legend_metadata_version" in config:
+        metadata.checkout(config.legend_metadata_version)
+
+    config["metadata"] = metadata
+
+    simlist = config.get("simlist", None)
+    make_tiers = config.make_tiers
+    if simlist is None:
+        # auto determine tier from config
+        tiers = ("pdf", "cvt", "evt", "hit", "opt", "stp")
+        tier = next(t for t in tiers if t in make_tiers)
+
+        simlist = [
+            f"{tier}.{simid}" for simid in aggregate.gen_list_of_all_simids(config)
+        ]
+
+    # trick: there won't be anything to do for some simids (targets already
+    # done), this could result in a very inefficient partitioning. as a
+    # mitigation, we randomly shuffle the simlist first
+    random.shuffle(simlist)
+
+    procs = []
+    for simlist_chunk in _partition(simlist, args.nodes):
+        smk_cmd = [
+            "snakemake",
+            "--workflow-profile",
+            "workflow/profiles/nersc",
+            "--config",
+            "simlist=" + ",".join(simlist_chunk),
+            *extra,
+        ]
+        if not args.without_srun:
+            smk_cmd = [
+                "srun",
+                "--disable-status",  # otherwise SIGINT has no effect
+                "--nodes",
+                "1",
+                "--ntasks",
+                "1",
+                "--cpus-per-task",
+                "256",
+                *smk_cmd,
+            ]
+
+        print("INFO: spawning process:", " ".join(smk_cmd))  # noqa: T201
+        procs.append(subprocess.Popen(smk_cmd))
+
+    # propagate signals to the snakemake instances.
+    def new_signal_handler(sig: int, _):
+        for p in procs:
+            p.send_signal(sig)
+
+    signals = [
+        signal.SIGHUP,
+        signal.SIGINT,
+        signal.SIGQUIT,
+        signal.SIGTERM,
+        signal.SIGTSTP,  # SIGSTOP cannot be caught, and will do nothing...
+        signal.SIGCONT,
+        signal.SIGUSR1,
+        signal.SIGUSR2,
+        signal.SIGWINCH,
+    ]
+
+    for sig in signals:
+        signal.signal(sig, new_signal_handler)
+
+    for p in procs:
+        rc = p.wait()
+        if rc != 0:
+            msg = f"process failed: {p.args}"
+            raise RuntimeError(msg)
+
+    print("INFO: all snakemake processes successfully returned")  # noqa: T201
diff --git a/workflow/src/legendsimflow/nersc.py b/workflow/src/legendsimflow/nersc.py
index 6e1125c8..c6ebfe4a 100644
--- a/workflow/src/legendsimflow/nersc.py
+++ b/workflow/src/legendsimflow/nersc.py
@@ -1,147 +1,16 @@
 from __future__ import annotations
 
-import argparse
-import random
 import re
-import signal
-import subprocess
 from collections.abc import Iterable
 from pathlib import Path
 
-import yaml
-from dbetto import AttrsDict
-from legenddataflowscripts.workflow.utils import subst_vars
-from legendmeta import LegendMetadata
 from snakemake.io import InputFiles
 from snakemake.script import Snakemake
 
-from . import SimflowConfig, aggregate
+from . import SimflowConfig
 from .exceptions import SimflowConfigError
 
 
-def _partition(xs, n):
-    k, r = divmod(len(xs), n)
-    out, i = [], 0
-    for j in range(n):
-        s = k + (j < r)
-        out.append(xs[i : i + s])
-        i += s
-    return out
-
-
-def snakemake_nersc_cli():
-    parser = argparse.ArgumentParser(
-        description="Execute the Simflow on multiple nodes in parallel."
-    )
-    parser.add_argument(
-        "-N", "--nodes", type=int, required=True, help="number of nodes"
-    )
-    parser.add_argument(
-        "--without-srun",
-        action="store_true",
-        help="do not prefix the snakemake call with 'srun ...'",
-    )
-    args, extra = parser.parse_known_args()
-
-    if args.nodes < 2:
-        msg = "must parallelize over at least 2 nodes"
-        raise ValueError(msg)
-
-    cfg_path = Path("./simflow-config.yaml")
-    if not cfg_path.is_file():
-        msg = "this program must be executed in the directory where simflow-config.yaml resides"
-        raise RuntimeError(msg)
-
-    with cfg_path.open("r") as f:
-        config = yaml.safe_load(f)
-
-    subst_vars(
-        config,
-        var_values={"_": Path().resolve()},
-        use_env=True,
-        ignore_missing=False,
-    )
-    config = AttrsDict(config)
-
-    # NOTE: this will attempt a clone of legend-metadata, if the directory does not exist
-    metadata = LegendMetadata(config.paths.metadata, lazy=True)
-
-    if "legend_metadata_version" in config:
-        metadata.checkout(config.legend_metadata_version)
-
-    config["metadata"] = metadata
-
-    simlist = config.get("simlist", None)
-    make_tiers = config.make_tiers
-    if simlist is None:
-        # auto determine tier from config
-        tiers = ("pdf", "cvt", "evt", "hit", "opt", "stp")
-        tier = next(t for t in tiers if t in make_tiers)
-
-        simlist = [
-            f"{tier}.{simid}" for simid in aggregate.gen_list_of_all_simids(config)
-        ]
-
-    # trick: there won't be anything to do for some simids (targets already
-    # done), this could result in a very inefficient partitioning. as a
-    # mitigation, we randomly shuffle the simlist first
-    random.shuffle(simlist)
-
-    procs = []
-    for simlist_chunk in _partition(simlist, args.nodes):
-        smk_cmd = [
-            "snakemake",
-            "--workflow-profile",
-            "workflow/profiles/nersc",
-            "--config",
-            "simlist=" + ",".join(simlist_chunk),
-            *extra,
-        ]
-        if not args.without_srun:
-            smk_cmd = [
-                "srun",
-                "--disable-status",  # otherwise SIGINT has no effect
-                "--nodes",
-                "1",
-                "--ntasks",
-                "1",
-                "--cpus-per-task",
-                "256",
-                *smk_cmd,
-            ]
-
-        print("INFO: spawning process:", " ".join(smk_cmd))  # noqa: T201
-        procs.append(subprocess.Popen(smk_cmd))
-
-    # propagate signals to the snakemake instances.
-    def new_signal_handler(sig: int, _):
-        for p in procs:
-            p.send_signal(sig)
-
-    signals = [
-        signal.SIGHUP,
-        signal.SIGINT,
-        signal.SIGQUIT,
-        signal.SIGTERM,
-        signal.SIGTSTP,  # SIGSTOP cannot be caught, and will do nothing...
-        signal.SIGCONT,
-        signal.SIGUSR1,
-        signal.SIGUSR2,
-        signal.SIGWINCH,
-    ]
-
-    for sig in signals:
-        signal.signal(sig, new_signal_handler)
-
-    for p in procs:
-        rc = p.wait()
-        if rc != 0:
-            msg = f"process failed: {p.args}"
-            raise RuntimeError(msg)
-
-    print("INFO: all snakemake processes successfully returned")  # noqa: T201
-
-
 def dvs_ro(
     config: SimflowConfig, path: str | Path | Iterable[str | Path]
 ) -> str | Path | list[str | Path]:

From 691a24fc7ed7c19ce31673c572e3d19bafcc0c06 Mon Sep 17 00:00:00 2001
From: Luigi Pertoldi
Date: Tue, 27 Jan 2026 12:36:49 +0100
Subject: [PATCH 7/8] document snakemake-nersc

---
 docs/source/conf.py               |  4 ++
 docs/source/manual/prod.md        |  4 +-
 docs/source/manual/setup.md       |  6 +--
 docs/source/manual/sites.md       | 72 ++++++++++++++++++++++++++++-
 workflow/src/legendsimflow/cli.py |  4 +-
 5 files changed, 83 insertions(+), 7 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index ba00e700..7493c06b 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -10,6 +10,7 @@
 copyright = "2025, the LEGEND Collaboration"
 
 extensions = [
+    "sphinx.ext.todo",
     "sphinx.ext.autodoc",
     "sphinx.ext.mathjax",
     "sphinx.ext.napoleon",
@@ -89,3 +90,6 @@
 autodoc_typehints = "description"
 autodoc_typehints_description_target = "documented_params"
 autodoc_typehints_format = "short"
+
+# actually show ToDo admonitions
+todo_include_todos = True
diff --git a/docs/source/manual/prod.md b/docs/source/manual/prod.md
index 33b13e0a..326faef5 100644
--- a/docs/source/manual/prod.md
+++ b/docs/source/manual/prod.md
@@ -1,3 +1,5 @@
+(production)=
+
 # Production
 
 Run a production by using one of the provided site-specific profiles
@@ -16,7 +18,7 @@ be omitted. Snakemake will use the `default` profile.
 
 The `--config` command line option is very useful to override configuration
 values. It can be used, for example, to restrict the production to a subset of
-simulations:
+simulations (a "simlist"):
 
 ```console
 > snakemake --config simlist="mylist.txt" [...]
diff --git a/docs/source/manual/setup.md b/docs/source/manual/setup.md
index 568358b1..b4ca94dd 100644
--- a/docs/source/manual/setup.md
+++ b/docs/source/manual/setup.md
@@ -21,9 +21,9 @@ Here's a basic description of its fields:
 
 - `experiment`: labels the experiment to be simulated. The same name is used in
   the metadata to label the corresponding configuration files.
-- `simlist`: list of simulation identifiers (see below) to be processed by
-  Snakemake. Can be a list of strings or a path to a text file. If `*` or `all`,
-  will process all simulations defined in the metadata.
+- `simlist`: list of simulation identifiers to be processed by Snakemake. Can be
+  a list of strings or a path to a text file. If `*` or `all`, will process all
+  simulations defined in the metadata.
 - `runlist`: list of LEGEND data taking runs to build pdfs for, in the standard
   format `<experiment>-<period>-<run>-<type>` (e.g. `l200-p03-r000-phy`)
 - `make_tiers`: list the tiers you would like to populate here. This option is
diff --git a/docs/source/manual/sites.md b/docs/source/manual/sites.md
index 020af90e..b1963eb8 100644
--- a/docs/source/manual/sites.md
+++ b/docs/source/manual/sites.md
@@ -53,7 +53,14 @@ hand.
 
 :::
 
-Now you can proceed with setting up and running the production workflow.
+Now you can proceed with setting up and running the production workflow,
+e.g.:
+
+```console
+> pixi run snakemake --workflow-profile workflow/profiles/nersc
+```
+
+Using the provided `nersc` profile is recommended (have a look at it).
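+
+A Snakemake workflow profile is just a directory containing a `config.yaml`
+whose keys mirror Snakemake's command-line options. As a rough, minimal sketch
+(the option values below are illustrative assumptions, not necessarily what
+`workflow/profiles/nersc` actually ships), such a file could look like:
+
+```yaml
+# assumed example values -- check the profile shipped with the repository
+cores: 256
+keep-going: true
+rerun-incomplete: true
+printshellcmds: true
+```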
 
 ### I/O optimization
 
@@ -82,9 +89,70 @@ nersc:
 
 Both features can be disabled by setting the corresponding fields to false.
 
-:::{warning
+:::{warning}
 
 These features are implemented manually for each Snakemake rule, so it could be
 that some rules are unaffected by them.
 
 :::
+
+### Multi-node execution
+
+As of Snakemake v8.30, parallel execution across multiple compute nodes and
+interaction with job schedulers (such as Slurm) are not well supported.
+
+:::{note}
+
+An experimental profile to interact with the NERSC batch job system is available
+in `nersc-batch`. Unfortunately, specifying rule resources (which is required
+for job submission) seems to slow down the DAG generation step considerably.
+
+:::
+
+:::{note}
+
+In principle, one could use the `snakemake-executor-plugin-slurm-jobstep` to
+prefix each rule command with a `srun` call, which would make it possible to
+parallelize the workflow over several nodes. In practice, NERSC discourages
+starting many `srun` instances for performance reasons. As a result, the only
+reliable way to run Snakemake is with one instance on a single compute node.
+
+:::
+
+The `snakemake-nersc` executable, exposed by `legend-simflow`, offers a way to
+parallelize the workflow over several nodes in some situations. The usage is:
+
+```console
+> [pixi run] snakemake-nersc -N NUMBER_OF_NODES [SNAKEMAKE ARGS]
+```
+
+The program determines the list of simulations (see the `simlist` in
+{ref}`production`) that the user wants to process, partitions it into
+`NUMBER_OF_NODES` chunks, and spawns a dedicated Snakemake instance for each,
+prefixed by the appropriate `srun` call. This is equivalent to something like:
+
+```sh
+srun -N1 -n1 snakemake --workflow-profile workflow/profiles/nersc --config simlist=LIST1 [SNAKEMAKE ARGS] &
+srun -N1 -n1 snakemake --workflow-profile workflow/profiles/nersc --config simlist=LIST2 [SNAKEMAKE ARGS] &
+...
+
+wait
+```
+
+Unfortunately, this approach makes it harder to manually interrupt the Simflow:
+hitting `Ctrl+C`, for example, will just make Slurm print some job set status
+information. You should instead send signals (`TERM` to stop scheduling more
+jobs and wait for the running ones to finish, `INT` to kill all running jobs)
+directly to the Snakemake instances.
+
+:::{todo}
+
+Add commands to send signals.
+
+:::
+
+:::{note}
+
+Since the actual jobs that need to be run are not known in advance, the
+_a-priori_ partitioning might be inefficient. To mitigate this, the `simlist` is
+randomly shuffled before partitioning.
+
+:::
diff --git a/workflow/src/legendsimflow/cli.py b/workflow/src/legendsimflow/cli.py
index 55679579..639017eb 100644
--- a/workflow/src/legendsimflow/cli.py
+++ b/workflow/src/legendsimflow/cli.py
@@ -25,8 +25,10 @@ def _partition(xs, n):
 
 
 def snakemake_nersc_cli():
+    """Implementation of the ``snakemake-nersc`` CLI."""
     parser = argparse.ArgumentParser(
-        description="Execute the Simflow on multiple nodes in parallel."
+        description="""Execute the Simflow on multiple nodes in parallel.
+        Extra arguments will be forwarded to Snakemake."""
     )
     parser.add_argument(
         "-N", "--nodes", type=int, required=True, help="number of nodes"

From 98aa997a2873951b17203c60c8caa0572b3f318d Mon Sep 17 00:00:00 2001
From: Luigi Pertoldi
Date: Tue, 27 Jan 2026 12:37:28 +0100
Subject: [PATCH 8/8] srun: remove --disable-status

it does not seem to do the right job
---
 workflow/src/legendsimflow/cli.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/workflow/src/legendsimflow/cli.py b/workflow/src/legendsimflow/cli.py
index 639017eb..de7dcd88 100644
--- a/workflow/src/legendsimflow/cli.py
+++ b/workflow/src/legendsimflow/cli.py
@@ -97,7 +97,6 @@ def snakemake_nersc_cli():
         if not args.without_srun:
             smk_cmd = [
                 "srun",
-                "--disable-status",  # otherwise SIGINT has no effect
                 "--nodes",
                 "1",
                 "--ntasks",