Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Malay/container version #357

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 29 additions & 11 deletions launcher_scripts/nemo_launcher/core/stages.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,19 +210,36 @@ def _make_nemo_path_command(self) -> List[str]:
]

def _make_git_log_command(self, stage_cfg_path: Path) -> List[str]:
    """Build the shell command that records repo and container versions.

    The command prints the HEAD commit of a fixed set of repositories
    looked up under ``/opt``, followed by the NeMo and PyTorch container
    version lines, and redirects the combined output to ``git-info.log``
    in the directory that holds the stage config.

    Args:
        stage_cfg_path: Path of the stage config file; only its parent
            directory is used, as the location of ``git-info.log``.

    Returns:
        A single-element list containing the composed shell command.
    """
    filepath = os.path.join(f"{stage_cfg_path.parent}", "git-info.log")

    git_repos = [
        "NeMo",
        "megatron-lm",
        "TransformerEngine",
        "NeMo-Framework-Launcher",
        "apex",
        "NeMo-Aligner",
        "NeMo-Curator",
    ]

    # One `git log` per repo, emitting `<repo>;<short hash>;<date>;<subject>`
    # for the HEAD commit only.
    git_log_cmd = [
        f"git --git-dir=/opt/{repo}/.git log -n 1 --format='{repo};%h;%aD;%s'"
        for repo in git_repos
    ]

    # The backslash keeps the shell from treating `;` as a command separator.
    # It is written as `\\;` because a bare `\;` is an invalid Python escape
    # sequence (a warning today, slated to become an error); the resulting
    # shell text is unchanged.
    # NOTE(review): the quoting around NVIDIA_PYTORCH_VERSION is kept exactly
    # as it was; how it expands depends on how the launcher wraps this
    # command -- confirm against the generated submission script.
    container_info_cmd = [
        f"echo NeMo-Container-Version\\;{self.cfg.get('container', '')}",
        'echo PyTorch-Container-Version\\;PYT$"NVIDIA_PYTORCH_VERSION"',
    ]

    # A semicolon delimiter ensures every command runs even after an earlier
    # one fails; the parentheses group the commands so the file is written
    # from their combined output.
    return [f"({';'.join(git_log_cmd + container_info_cmd)}) > {filepath}"]

def _make_k8s_spec_file(
self, template_root: str, cluster_parameters: Dict, job_path: JobPaths
):
Expand Down Expand Up @@ -622,7 +639,8 @@ def make_stage_command_groups(self, stage_cfg_path: Path) -> List[List[str]]:
command_groups = [[]]
command_groups[0] += self._make_wandb_login_command()
command_groups[0] += self._make_nemo_path_command()
command_groups[0] += self._make_git_log_command(stage_cfg_path)
if self.cluster == "bcm":
command_groups[0] += self._make_git_log_command(stage_cfg_path)
# command_groups[0] += self._make_numa_mapping_command()

# _cuda_device_max_connections and _cuda_visible_devices cannot be used as command prefix on BCP
Expand Down