From 2b11b6ad5825e0119c65761adba368b1cdce3654 Mon Sep 17 00:00:00 2001 From: Shaurya Date: Mon, 9 Dec 2024 23:56:45 +0000 Subject: [PATCH 01/11] Pathways v6e large scale runs --- README.md | 2 +- src/xpk/commands/workload.py | 18 ++++++++++++++++++ src/xpk/core/core.py | 3 ++- src/xpk/core/kjob.py | 2 +- src/xpk/core/pathways.py | 8 ++++++++ 5 files changed, 30 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c4e682a0..ffdec101 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ xpk uses many tool to provide all neccessary functionalities. User must install - gcloud (install from [here](https://cloud.google.com/sdk/gcloud#download_and_install_the)) - kubectl (install from [here](https://kubernetes.io/docs/tasks/tools/)) - kueuectl (install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/)) -- kjob (installation instructions [here](https://github.com/kubernetes-sigs/kueue/blob/main/cmd/experimental/kjobctl/docs/installation.md)) +- kjob (installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md)) # Installation To install xpk, run the following command and install additional tools, mentioned in [prerequisites](#prerequisites). [Makefile](https://github.com/AI-Hypercomputer/xpk/blob/main/Makefile) provides a way to install all neccessary tools: diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 70ef1a74..8b85e74b 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -198,6 +198,9 @@ containers: - args: {pathways_worker_args} + env: + - name: GRPC_TRACE + value: "client_channel" image: {args.server_image} imagePullPolicy: Always name: pathways-worker @@ -253,6 +256,8 @@ value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME) - name: TPU_SKIP_MDS_QUERY value: "true" + - name: GRPC_TRACE + value: "client_channel" image: {args.server_image} imagePullPolicy: Always name: pathways-rm @@ -263,6 +268,10 @@ volumeMounts: - mountPath: /tmp name: shared-tmp + resources: + limits: + cpu: "30" + memory: 120G nodeSelector: cloud.google.com/gke-nodepool: cpu-rm-np hostNetwork: true @@ -287,11 +296,20 @@ containers: - args: {pathways_proxy_args} + env: + - name: XLA_FLAGS + value: "--xla_dump_to={args.pathways_gcs_location}/xla_dump" + - name: GRPC_TRACE + value: "client_channel" image: {args.proxy_server_image} imagePullPolicy: Always name: pathways-proxy ports: - containerPort: 29000 + resources: + limits: + cpu: "30" + memory: 120G hostNetwork: true dnsPolicy: ClusterFirstWithHostNet nodeSelector: diff --git a/src/xpk/core/core.py b/src/xpk/core/core.py index 49513c30..b134c8ad 100644 --- a/src/xpk/core/core.py +++ b/src/xpk/core/core.py @@ -2281,7 +2281,8 @@ def get_main_container_resources( resources_yaml = """cpu: "24" memory: 100G""" if args.use_pathways: - return resources_yaml + return "" +# return resources_yaml gpu_resources_yaml = """nvidia.com/gpu: {system.chips_per_vm}""" if system.accelerator_type == AcceleratorType['GPU']: diff --git a/src/xpk/core/kjob.py b/src/xpk/core/kjob.py index e8d4e7a1..d491949e 100644 --- a/src/xpk/core/kjob.py +++ b/src/xpk/core/kjob.py @@ -105,7 +105,7 @@ def verify_kjob_installed(args: Namespace) -> int: if verify_kjob_installed_code != 0: xpk_print( " kjob not found. Please follow" - " https://github.com/kubernetes-sigs/kueue/blob/main/cmd/experimental/kjobctl/docs/installation.md" + " https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md" " to install kjob." ) return verify_kjob_installed_code diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index 71ed27ea..42ac43a6 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -59,6 +59,13 @@ def get_pathways_proxy_args(args) -> str: """ yaml = """- --server_port=29000 - --resource_manager_address={rm_address} + - --xla_tpu_scoped_vmem_limit_kib=98304 + - --xla_tpu_enable_async_collective_fusion=true + - --xla_tpu_enable_async_collective_fusion_fuse_all_gather=true + - --xla_tpu_enable_async_collective_fusion_multiple_steps=true + - --xla_tpu_overlap_compute_collective_tc=true + - --xla_enable_async_all_gather=true + - --xla_tpu_spmd_rng_bit_generator_unsafe=true - --gcs_scratch_location={args.pathways_gcs_location}""" if args.use_pathways: @@ -201,6 +208,7 @@ def get_pathways_rm_args(args, system: SystemCharacteristics) -> str: - --gcs_scratch_location={args.pathways_gcs_location} - --node_type=resource_manager - --instance_count={instance_count} + - --temporary_flags_for_debugging=temporary_flag_for_debugging_worker_expected_tpu_chip_config=megachip - --instance_type={instance_type}""" if args.use_pathways: return yaml.format( From bc54d06f8158e022a8f9bba1fa995a8d50800cfb Mon Sep 17 00:00:00 2001 From: Shaurya Date: Tue, 10 Dec 2024 07:37:34 +0000 Subject: [PATCH 02/11] Pathways v6e large scale runs --- src/xpk/core/pathways.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index 42ac43a6..ded65053 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -42,6 +42,8 @@ def get_pathways_worker_args(args) -> str: """ yaml = """- --server_port=29001 - --resource_manager_address={rm_address} + - --temporary_flags_for_debugging=temporary_flag_for_debugging_megascale_address_derive_from_megascale_grpc=true + - --megascale_grpc_premap_memory_bytes=17179869184 - --gcs_scratch_location={args.pathways_gcs_location}""" if args.use_pathways: return yaml.format(args=args, rm_address=get_rm_address(args)) @@ -66,6 +68,7 @@ def get_pathways_proxy_args(args) -> str: - --xla_tpu_overlap_compute_collective_tc=true - --xla_enable_async_all_gather=true - --xla_tpu_spmd_rng_bit_generator_unsafe=true + - --xla_tpu_enable_sunk_dcn_allreduce_done_with_host_reduction=true - --gcs_scratch_location={args.pathways_gcs_location}""" if args.use_pathways: @@ -208,7 +211,7 @@ def get_pathways_rm_args(args, system: SystemCharacteristics) -> str: - --gcs_scratch_location={args.pathways_gcs_location} - --node_type=resource_manager - --instance_count={instance_count} - - --temporary_flags_for_debugging=temporary_flag_for_debugging_worker_expected_tpu_chip_config=megachip + - --temporary_flags_for_debugging=temporary_flag_for_debugging_worker_expected_tpu_chip_config=megachip;;;temporary_flag_for_debugging_megascale_address_derive_from_megascale_grpc=true - --instance_type={instance_type}""" if args.use_pathways: return yaml.format( From a3f7839561aceb8a44f92be9c8ab72815cd00f3e Mon Sep 17 00:00:00 2001 From: Shaurya Date: Tue, 10 Dec 2024 16:24:15 +0000 Subject: [PATCH 03/11] Pathways v6e large scale runs --- src/xpk/commands/workload.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py index 8b85e74b..a163cc13 100644 --- a/src/xpk/commands/workload.py +++ b/src/xpk/commands/workload.py @@ -195,12 +195,16 @@ template: spec: terminationGracePeriodSeconds: {args.termination_grace_period_seconds} + initContainers: # Add this section + - name: network-init + image: {args.docker_image} + command: ["bash", "-c", "echo '4096 41943040 314572800' > /proc/sys/net/ipv4/tcp_rmem"] + securityContext: + privileged: true containers: - args: {pathways_worker_args} env: - - name: GRPC_TRACE - value: "client_channel" image: {args.server_image} imagePullPolicy: Always name: pathways-worker @@ -256,8 +260,6 @@ value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME) - name: TPU_SKIP_MDS_QUERY value: "true" - - name: GRPC_TRACE - value: "client_channel" image: {args.server_image} imagePullPolicy: Always name: pathways-rm @@ -299,8 +301,6 @@ env: - name: XLA_FLAGS value: "--xla_dump_to={args.pathways_gcs_location}/xla_dump" - - name: GRPC_TRACE - value: "client_channel" image: {args.proxy_server_image} imagePullPolicy: Always name: pathways-proxy From 6289f43d2b4a73b6116d1a593238829d516554f2 Mon Sep 17 00:00:00 2001 From: Sujeeth Jinesh Date: Tue, 10 Dec 2024 23:05:50 +0000 Subject: [PATCH 04/11] Minor update --- src/xpk/core/cluster_private.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/xpk/core/cluster_private.py b/src/xpk/core/cluster_private.py index 70f3c9dd..d18dc649 100644 --- a/src/xpk/core/cluster_private.py +++ b/src/xpk/core/cluster_private.py @@ -54,8 +54,8 @@ def authorize_private_cluster_access_if_necessary(args) -> int: if add_current_machine_to_networks_return_code != 0: return add_current_machine_to_networks_return_code - if new_authorized_networks_needed or not is_current_machine_in_network: - return update_cluster_new_authorized_networks(args, authorized_networks) + # if new_authorized_networks_needed or not is_current_machine_in_network: + # return update_cluster_new_authorized_networks(args, authorized_networks) xpk_print("Current machine's IP adrress is already authorized.") return 0 From f88ae030e4b1eefaf566da885b43892e001641f3 Mon Sep 17 00:00:00 2001 From: Shaurya Date: Thu, 12 Dec 2024 00:14:58 +0000 Subject: [PATCH 05/11] add host network true to user workload --- src/xpk/core/pathways.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index ded65053..d77738fd 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -253,6 +253,8 @@ def get_user_workload_for_pathways(args, system: SystemCharacteristics) -> str: {container} nodeSelector: cloud.google.com/gke-nodepool: cpu-user-np + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet restartPolicy: OnFailure volumes: - hostPath: From 6217d88da2dcf6e0f42728dbbd34a3345cd9fc70 Mon Sep 17 00:00:00 2001 From: Sujeeth Jinesh Date: Thu, 12 Dec 2024 19:53:01 +0000 Subject: [PATCH 06/11] Add megascale graph within launch hang threshold --- src/xpk/core/pathways.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index d77738fd..91008f64 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -44,7 +44,9 @@ def get_pathways_worker_args(args) -> str: - --resource_manager_address={rm_address} - --temporary_flags_for_debugging=temporary_flag_for_debugging_megascale_address_derive_from_megascale_grpc=true - --megascale_grpc_premap_memory_bytes=17179869184 - - --gcs_scratch_location={args.pathways_gcs_location}""" + - --gcs_scratch_location={args.pathways_gcs_location} + - --megascale_graph_within_launch_hang_threshold=5""" # More flags we can adjust here: https://source.corp.google.com/piper///depot/google3/platforms/xla/megascale/runtime/executor/executor.cc;l=53-81;rcl=705575633 + if args.use_pathways: return yaml.format(args=args, rm_address=get_rm_address(args)) else: From 0bcf863ec8b5b31cf4d7889ed29c480804b97f5f Mon Sep 17 00:00:00 2001 From: Sujeeth Jinesh Date: Thu, 12 Dec 2024 21:00:04 +0000 Subject: [PATCH 07/11] Syntax update --- src/xpk/core/pathways.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index 91008f64..bcb40c1f 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -256,7 +256,7 @@ def get_user_workload_for_pathways(args, system: SystemCharacteristics) -> str: nodeSelector: cloud.google.com/gke-nodepool: cpu-user-np hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet + dnsPolicy: ClusterFirstWithHostNet restartPolicy: OnFailure volumes: - hostPath: From 76547c4aad37034537bf0751319bf1f755b1929e Mon Sep 17 00:00:00 2001 From: Shaurya Date: Fri, 13 Dec 2024 02:24:07 +0000 Subject: [PATCH 08/11] add gcsfuse on user job --- src/xpk/core/core.py | 6 +++++- src/xpk/core/pathways.py | 27 +++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/xpk/core/core.py b/src/xpk/core/core.py index b134c8ad..d63d3a99 100644 --- a/src/xpk/core/core.py +++ b/src/xpk/core/core.py @@ -2125,7 +2125,11 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str: if args.use_pathways: volume_mount_yaml = """- mountPath: /tmp - name: shared-tmp""" + name: shared-tmp + - name: gcs-fuse-csi-ephemeral + mountPath: /training-data + - name: dshm + mountPath: /dev/shm""" elif ( system.accelerator_type == AcceleratorType['TPU'] and args.deploy_stacktrace_sidecar diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index bcb40c1f..8d1a4ced 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -117,9 +117,9 @@ def add_pw_resources_to_kueue(args): - name: cpu-rm resources: - name: "cpu" - nominalQuota: 80 + nominalQuota: 480 - name: "memory" - nominalQuota: 160G + nominalQuota: 2000G - name: cpu-proxy resources: - name: "cpu" @@ -250,11 +250,17 @@ def get_user_workload_for_pathways(args, system: SystemCharacteristics) -> str: completions: 1 parallelism: 1 template: + metadata: + annotations: + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" spec: containers: {container} nodeSelector: - cloud.google.com/gke-nodepool: cpu-user-np + cloud.google.com/gke-nodepool: high-mem-pool hostNetwork: true dnsPolicy: ClusterFirstWithHostNet restartPolicy: OnFailure @@ -262,7 +268,20 @@ def get_user_workload_for_pathways(args, system: SystemCharacteristics) -> str: - hostPath: path: /tmp type: DirectoryOrCreate - name: shared-tmp""" + name: shared-tmp + - name: gke-gcsfuse-cache + emptyDir: + medium: Memory + - name: dshm + emptyDir: + medium: Memory + - name: gcs-fuse-csi-ephemeral + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: trillium-storage-datasets-sr + mountOptions: "debug_fuse,implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1" + """ if args.headless: return '' else: From 5521001a4e628e46da0fea476d43a308dc319ef3 Mon Sep 17 00:00:00 2001 From: RoshaniN Date: Fri, 13 Dec 2024 15:12:06 +0000 Subject: [PATCH 09/11] Correct typo on megascale launch hang flag. --- src/xpk/core/pathways.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index 8d1a4ced..d6f81137 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -45,7 +45,7 @@ def get_pathways_worker_args(args) -> str: - --temporary_flags_for_debugging=temporary_flag_for_debugging_megascale_address_derive_from_megascale_grpc=true - --megascale_grpc_premap_memory_bytes=17179869184 - --gcs_scratch_location={args.pathways_gcs_location} - - --megascale_graph_within_launch_hang_threshold=5""" # More flags we can adjust here: https://source.corp.google.com/piper///depot/google3/platforms/xla/megascale/runtime/executor/executor.cc;l=53-81;rcl=705575633 + - --megascale_graph_within_launch_hang_threshold=5m""" # More flags we can adjust here: https://source.corp.google.com/piper///depot/google3/platforms/xla/megascale/runtime/executor/executor.cc;l=53-81;rcl=705575633 if args.use_pathways: return yaml.format(args=args, rm_address=get_rm_address(args)) From 17433ad2355b8fcd7dc87ae889c74fc0c3bde05a Mon Sep 17 00:00:00 2001 From: RoshaniN Date: Fri, 13 Dec 2024 18:14:26 +0000 Subject: [PATCH 10/11] XLA flags added to McJAX LR workload added for parity. --- src/xpk/core/pathways.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index d6f81137..dfdc15ab 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -45,7 +45,8 @@ def get_pathways_worker_args(args) -> str: - --temporary_flags_for_debugging=temporary_flag_for_debugging_megascale_address_derive_from_megascale_grpc=true - --megascale_grpc_premap_memory_bytes=17179869184 - --gcs_scratch_location={args.pathways_gcs_location} - - --megascale_graph_within_launch_hang_threshold=5m""" # More flags we can adjust here: https://source.corp.google.com/piper///depot/google3/platforms/xla/megascale/runtime/executor/executor.cc;l=53-81;rcl=705575633 + - --megascale_graph_within_launch_hang_threshold=5m + - --deepsea_chip_config_name=megachip_tccontrol""" # More flags we can adjust here: https://source.corp.google.com/piper///depot/google3/platforms/xla/megascale/runtime/executor/executor.cc;l=53-81;rcl=705575633 if args.use_pathways: return yaml.format(args=args, rm_address=get_rm_address(args)) @@ -71,7 +72,14 @@ def get_pathways_proxy_args(args) -> str: - --xla_enable_async_all_gather=true - --xla_tpu_spmd_rng_bit_generator_unsafe=true - --xla_tpu_enable_sunk_dcn_allreduce_done_with_host_reduction=true - - --gcs_scratch_location={args.pathways_gcs_location}""" + - --gcs_scratch_location={args.pathways_gcs_location} + - --xla_sc_disable_megacore_partitioning=true + - --xla_tpu_enable_all_reduce_offload_tracing=true + - --xla_tpu_use_tc_device_shape_on_sc=true + - --xla_tpu_enable_sparse_core_collective_offload_all_reduce=true + - --xla_sc_enable_instruction_fusion=false + - --xla_sc_disjoint_spmem=false + - --deepsea_chip_config_name=megachip_tccontrol""" if args.use_pathways: return yaml.format(args=args, rm_address=get_rm_address(args)) @@ -214,7 +222,8 @@ def get_pathways_rm_args(args, system: SystemCharacteristics) -> str: - --node_type=resource_manager - --instance_count={instance_count} - --temporary_flags_for_debugging=temporary_flag_for_debugging_worker_expected_tpu_chip_config=megachip;;;temporary_flag_for_debugging_megascale_address_derive_from_megascale_grpc=true - - --instance_type={instance_type}""" + - --instance_type={instance_type} + - --deepsea_chip_config_name=megachip_tccontrol""" if args.use_pathways: return yaml.format( args=args, From 360a060ee001b9bf2b7da3a5c7c12ac216ef54ca Mon Sep 17 00:00:00 2001 From: Sujeeth Jinesh Date: Sat, 14 Dec 2024 00:27:52 +0000 Subject: [PATCH 11/11] Add megachip_tccontrol --- src/xpk/core/pathways.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py index bcb40c1f..a9cdf143 100644 --- a/src/xpk/core/pathways.py +++ b/src/xpk/core/pathways.py @@ -213,7 +213,7 @@ def get_pathways_rm_args(args, system: SystemCharacteristics) -> str: - --gcs_scratch_location={args.pathways_gcs_location} - --node_type=resource_manager - --instance_count={instance_count} - - --temporary_flags_for_debugging=temporary_flag_for_debugging_worker_expected_tpu_chip_config=megachip;;;temporary_flag_for_debugging_megascale_address_derive_from_megascale_grpc=true + - --temporary_flags_for_debugging=temporary_flag_for_debugging_worker_expected_tpu_chip_config=megachip_tccontrol;;;temporary_flag_for_debugging_megascale_address_derive_from_megascale_grpc=true - --instance_type={instance_type}""" if args.use_pathways: return yaml.format(