Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Malay/container version #357

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 29 additions & 9 deletions launcher_scripts/nemo_launcher/core/stages.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,19 +210,39 @@ def _make_nemo_path_command(self) -> List[str]:
]

def _make_git_log_command(self, stage_cfg_path: Path):
"""log last 5 commits for repos- NeMo, megatron-lm, NeMo-Framework-Launcher or NeMo-Megatron-Launcher
'NeMo-Megatron-Launcher' was renamed to 'NeMo-Framework-Launcher'. We run git log for both for
"""
Log the HEAD commit for a subset of the repos in the NeMo container, plus the version names of the PyTorch and NeMo containers.
'NeMo-Megatron-Launcher' was renamed to 'NeMo-Framework-Launcher'. Try logging both, for
backwards compatibility.
"""
append_to_file = f"{stage_cfg_path.parent}/git_log.txt"
return [
f"(echo PYT$\"NVIDIA_PYTORCH_VERSION\" && \
git --git-dir=/opt/NeMo/.git log -n 5 --format='NeMo;%h;%aD;%s' && \
git --git-dir=/opt/megatron-lm/.git log -n 5 --format='megatron-lm;%h;%aD;%s' && \
git --git-dir=/opt/NeMo-Framework-Launcher/.git log -n 5 --format='NeMo-Framework-Launcher;%h;%aD;%s' && \
git --git-dir=/opt/NeMo-Megatron-Launcher/.git log -n 5 --format='NeMo-Megatron-Launcher;%h;%aD;%s') > {append_to_file}"
filepath = os.path.join(f"{stage_cfg_path.parent}", "git-info.log")

git_repos = [
malay-nagda marked this conversation as resolved.
Show resolved Hide resolved
"NeMo",
"megatron-lm",
"TransformerEngine",
"NeMo-Framework-Launcher",
"NeMo-Megatron-Launcher",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can remove "NeMo-Megatron-Launcher"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed

"apex",
"NeMo-Aligner",
"NeMo-Curator",
]

git_log_cmd = [
f"git --git-dir=/opt/{repo}/.git log -n 1 --format='{repo};%h;%aD;%s'"
for repo in git_repos
]

container_info_cmd = [
f"echo NeMo-Container-Version\;{self.cfg.get('container', '')}",
'echo PyTorch-Container-Version\;PYT$"NVIDIA_PYTORCH_VERSION"',
]

# The semicolon delimiter ensures that every command above runs even if an earlier one fails.
# The parentheses group the commands so that their combined output is written to the
# file through a single redirection.
return [f"({';'.join(git_log_cmd + container_info_cmd)}) > {filepath}"]

def _make_k8s_spec_file(
self, template_root: str, cluster_parameters: Dict, job_path: JobPaths
):
Expand Down