From 2b11b6ad5825e0119c65761adba368b1cdce3654 Mon Sep 17 00:00:00 2001
From: Shaurya <shaurya.gup@gmail.com>
Date: Mon, 9 Dec 2024 23:56:45 +0000
Subject: [PATCH 01/11] Pathways v6e large scale runs

---
 README.md                    |  2 +-
 src/xpk/commands/workload.py | 18 ++++++++++++++++++
 src/xpk/core/core.py         |  3 ++-
 src/xpk/core/kjob.py         |  2 +-
 src/xpk/core/pathways.py     |  8 ++++++++
 5 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index c4e682a0..ffdec101 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ xpk uses many tool to provide all neccessary functionalities. User must install
 - gcloud (install from [here](https://cloud.google.com/sdk/gcloud#download_and_install_the))
 - kubectl (install from [here](https://kubernetes.io/docs/tasks/tools/))
 - kueuectl (install from [here](https://kueue.sigs.k8s.io/docs/reference/kubectl-kueue/installation/))
-- kjob (installation instructions [here](https://github.com/kubernetes-sigs/kueue/blob/main/cmd/experimental/kjobctl/docs/installation.md))
+- kjob (installation instructions [here](https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md))
 
 # Installation
 To install xpk, run the following command and install additional tools, mentioned in [prerequisites](#prerequisites). [Makefile](https://github.com/AI-Hypercomputer/xpk/blob/main/Makefile) provides a way to install all neccessary tools:
diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py
index 70ef1a74..8b85e74b 100644
--- a/src/xpk/commands/workload.py
+++ b/src/xpk/commands/workload.py
@@ -198,6 +198,9 @@
             containers:
             - args:
               {pathways_worker_args}
+              env:
+              - name: GRPC_TRACE
+                value: "client_channel"
               image: {args.server_image}
               imagePullPolicy: Always
               name: pathways-worker
@@ -253,6 +256,8 @@
                 value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)
               - name: TPU_SKIP_MDS_QUERY
                 value: "true"
+              - name: GRPC_TRACE
+                value: "client_channel"
               image: {args.server_image}
               imagePullPolicy: Always
               name: pathways-rm
@@ -263,6 +268,10 @@
               volumeMounts:
               - mountPath: /tmp
                 name: shared-tmp
+              resources:
+                limits:
+                  cpu: "30"
+                  memory: 120G
             nodeSelector:
               cloud.google.com/gke-nodepool: cpu-rm-np
             hostNetwork: true
@@ -287,11 +296,20 @@
             containers:
             - args:
               {pathways_proxy_args}
+              env:
+              - name: XLA_FLAGS
+                value: "--xla_dump_to={args.pathways_gcs_location}/xla_dump"
+              - name: GRPC_TRACE
+                value: "client_channel"
               image: {args.proxy_server_image}
               imagePullPolicy: Always
               name: pathways-proxy
               ports:
               - containerPort: 29000
+              resources:
+                limits:
+                  cpu: "30"
+                  memory: 120G
             hostNetwork: true
             dnsPolicy: ClusterFirstWithHostNet
             nodeSelector:
diff --git a/src/xpk/core/core.py b/src/xpk/core/core.py
index 49513c30..b134c8ad 100644
--- a/src/xpk/core/core.py
+++ b/src/xpk/core/core.py
@@ -2281,7 +2281,8 @@ def get_main_container_resources(
   resources_yaml = """cpu: "24"
                     memory: 100G"""
   if args.use_pathways:
-    return resources_yaml
+    return ""
+    # NOTE(review): Pathways used to return resources_yaml here; restore it or drop this note before merging.
 
   gpu_resources_yaml = """nvidia.com/gpu: {system.chips_per_vm}"""
   if system.accelerator_type == AcceleratorType['GPU']:
diff --git a/src/xpk/core/kjob.py b/src/xpk/core/kjob.py
index e8d4e7a1..d491949e 100644
--- a/src/xpk/core/kjob.py
+++ b/src/xpk/core/kjob.py
@@ -105,7 +105,7 @@ def verify_kjob_installed(args: Namespace) -> int:
   if verify_kjob_installed_code != 0:
     xpk_print(
         " kjob not found. Please follow"
-        " https://github.com/kubernetes-sigs/kueue/blob/main/cmd/experimental/kjobctl/docs/installation.md"
+        " https://github.com/kubernetes-sigs/kjob/blob/main/docs/installation.md"
         " to install kjob."
     )
     return verify_kjob_installed_code
diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py
index 71ed27ea..42ac43a6 100644
--- a/src/xpk/core/pathways.py
+++ b/src/xpk/core/pathways.py
@@ -59,6 +59,13 @@ def get_pathways_proxy_args(args) -> str:
   """
   yaml = """- --server_port=29000
               - --resource_manager_address={rm_address}
+              - --xla_tpu_scoped_vmem_limit_kib=98304
+              - --xla_tpu_enable_async_collective_fusion=true
+              - --xla_tpu_enable_async_collective_fusion_fuse_all_gather=true
+              - --xla_tpu_enable_async_collective_fusion_multiple_steps=true
+              - --xla_tpu_overlap_compute_collective_tc=true
+              - --xla_enable_async_all_gather=true
+              - --xla_tpu_spmd_rng_bit_generator_unsafe=true
               - --gcs_scratch_location={args.pathways_gcs_location}"""
 
   if args.use_pathways:
@@ -201,6 +208,7 @@ def get_pathways_rm_args(args, system: SystemCharacteristics) -> str:
               - --gcs_scratch_location={args.pathways_gcs_location}
               - --node_type=resource_manager
               - --instance_count={instance_count}
+              - --temporary_flags_for_debugging=temporary_flag_for_debugging_worker_expected_tpu_chip_config=megachip
               - --instance_type={instance_type}"""
   if args.use_pathways:
     return yaml.format(

From bc54d06f8158e022a8f9bba1fa995a8d50800cfb Mon Sep 17 00:00:00 2001
From: Shaurya <shaurya.gup@gmail.com>
Date: Tue, 10 Dec 2024 07:37:34 +0000
Subject: [PATCH 02/11] Pathways v6e large scale runs

---
 src/xpk/core/pathways.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py
index 42ac43a6..ded65053 100644
--- a/src/xpk/core/pathways.py
+++ b/src/xpk/core/pathways.py
@@ -42,6 +42,8 @@ def get_pathways_worker_args(args) -> str:
   """
   yaml = """- --server_port=29001
               - --resource_manager_address={rm_address}
+              - --temporary_flags_for_debugging=temporary_flag_for_debugging_megascale_address_derive_from_megascale_grpc=true
+              - --megascale_grpc_premap_memory_bytes=17179869184
               - --gcs_scratch_location={args.pathways_gcs_location}"""
   if args.use_pathways:
     return yaml.format(args=args, rm_address=get_rm_address(args))
@@ -66,6 +68,7 @@ def get_pathways_proxy_args(args) -> str:
               - --xla_tpu_overlap_compute_collective_tc=true
               - --xla_enable_async_all_gather=true
               - --xla_tpu_spmd_rng_bit_generator_unsafe=true
+              - --xla_tpu_enable_sunk_dcn_allreduce_done_with_host_reduction=true
               - --gcs_scratch_location={args.pathways_gcs_location}"""
 
   if args.use_pathways:
@@ -208,7 +211,7 @@ def get_pathways_rm_args(args, system: SystemCharacteristics) -> str:
               - --gcs_scratch_location={args.pathways_gcs_location}
               - --node_type=resource_manager
               - --instance_count={instance_count}
-              - --temporary_flags_for_debugging=temporary_flag_for_debugging_worker_expected_tpu_chip_config=megachip
+              - --temporary_flags_for_debugging=temporary_flag_for_debugging_worker_expected_tpu_chip_config=megachip;;;temporary_flag_for_debugging_megascale_address_derive_from_megascale_grpc=true
               - --instance_type={instance_type}"""
   if args.use_pathways:
     return yaml.format(

From a3f7839561aceb8a44f92be9c8ab72815cd00f3e Mon Sep 17 00:00:00 2001
From: Shaurya <shaurya.gup@gmail.com>
Date: Tue, 10 Dec 2024 16:24:15 +0000
Subject: [PATCH 03/11] Pathways v6e large scale runs

---
 src/xpk/commands/workload.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py
index 8b85e74b..a163cc13 100644
--- a/src/xpk/commands/workload.py
+++ b/src/xpk/commands/workload.py
@@ -195,12 +195,16 @@
         template:
           spec:
             terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
+            initContainers:  # Privileged init container: writes net.ipv4.tcp_rmem (min/default/max) before the worker starts.
+            - name: network-init
+              image: {args.docker_image}
+              command: ["bash", "-c", "echo '4096 41943040 314572800' > /proc/sys/net/ipv4/tcp_rmem"]
+              securityContext:
+                privileged: true
             containers:
             - args:
               {pathways_worker_args}
               env:
-              - name: GRPC_TRACE
-                value: "client_channel"
               image: {args.server_image}
               imagePullPolicy: Always
               name: pathways-worker
@@ -256,8 +260,6 @@
                 value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)
               - name: TPU_SKIP_MDS_QUERY
                 value: "true"
-              - name: GRPC_TRACE
-                value: "client_channel"
               image: {args.server_image}
               imagePullPolicy: Always
               name: pathways-rm
@@ -299,8 +301,6 @@
               env:
               - name: XLA_FLAGS
                 value: "--xla_dump_to={args.pathways_gcs_location}/xla_dump"
-              - name: GRPC_TRACE
-                value: "client_channel"
               image: {args.proxy_server_image}
               imagePullPolicy: Always
               name: pathways-proxy

From 6289f43d2b4a73b6116d1a593238829d516554f2 Mon Sep 17 00:00:00 2001
From: Sujeeth Jinesh <sujinesh@google.com>
Date: Tue, 10 Dec 2024 23:05:50 +0000
Subject: [PATCH 04/11] Disable authorized-networks update for private clusters

---
 src/xpk/core/cluster_private.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/xpk/core/cluster_private.py b/src/xpk/core/cluster_private.py
index 70f3c9dd..d18dc649 100644
--- a/src/xpk/core/cluster_private.py
+++ b/src/xpk/core/cluster_private.py
@@ -54,8 +54,8 @@ def authorize_private_cluster_access_if_necessary(args) -> int:
   if add_current_machine_to_networks_return_code != 0:
     return add_current_machine_to_networks_return_code
 
-  if new_authorized_networks_needed or not is_current_machine_in_network:
-    return update_cluster_new_authorized_networks(args, authorized_networks)
+  # TODO(review): update_cluster_new_authorized_networks() is no longer called here, so the
+  # message below is printed unconditionally, even if the cluster's networks were never updated.
 
   xpk_print("Current machine's IP adrress is already authorized.")
   return 0

From f88ae030e4b1eefaf566da885b43892e001641f3 Mon Sep 17 00:00:00 2001
From: Shaurya <shaurya.gup@gmail.com>
Date: Thu, 12 Dec 2024 00:14:58 +0000
Subject: [PATCH 05/11] add host network true to user workload

---
 src/xpk/core/pathways.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py
index ded65053..d77738fd 100644
--- a/src/xpk/core/pathways.py
+++ b/src/xpk/core/pathways.py
@@ -253,6 +253,8 @@ def get_user_workload_for_pathways(args, system: SystemCharacteristics) -> str:
               {container}
             nodeSelector:
               cloud.google.com/gke-nodepool: cpu-user-np
+            hostNetwork: true
+              dnsPolicy: ClusterFirstWithHostNet
             restartPolicy: OnFailure
             volumes:
             - hostPath:

From 6217d88da2dcf6e0f42728dbbd34a3345cd9fc70 Mon Sep 17 00:00:00 2001
From: Sujeeth Jinesh <sujinesh@google.com>
Date: Thu, 12 Dec 2024 19:53:01 +0000
Subject: [PATCH 06/11] Add megascale graph within launch hang threshold

---
 src/xpk/core/pathways.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py
index d77738fd..91008f64 100644
--- a/src/xpk/core/pathways.py
+++ b/src/xpk/core/pathways.py
@@ -44,7 +44,9 @@ def get_pathways_worker_args(args) -> str:
               - --resource_manager_address={rm_address}
               - --temporary_flags_for_debugging=temporary_flag_for_debugging_megascale_address_derive_from_megascale_grpc=true
               - --megascale_grpc_premap_memory_bytes=17179869184
-              - --gcs_scratch_location={args.pathways_gcs_location}"""
+              - --gcs_scratch_location={args.pathways_gcs_location}
+              - --megascale_graph_within_launch_hang_threshold=5"""  # More flags we can adjust here: https://source.corp.google.com/piper///depot/google3/platforms/xla/megascale/runtime/executor/executor.cc;l=53-81;rcl=705575633
+
   if args.use_pathways:
     return yaml.format(args=args, rm_address=get_rm_address(args))
   else:

From 0bcf863ec8b5b31cf4d7889ed29c480804b97f5f Mon Sep 17 00:00:00 2001
From: Sujeeth Jinesh <sujinesh@google.com>
Date: Thu, 12 Dec 2024 21:00:04 +0000
Subject: [PATCH 07/11] Fix dnsPolicy indentation in Pathways user workload

---
 src/xpk/core/pathways.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py
index 91008f64..bcb40c1f 100644
--- a/src/xpk/core/pathways.py
+++ b/src/xpk/core/pathways.py
@@ -256,7 +256,7 @@ def get_user_workload_for_pathways(args, system: SystemCharacteristics) -> str:
             nodeSelector:
               cloud.google.com/gke-nodepool: cpu-user-np
             hostNetwork: true
-              dnsPolicy: ClusterFirstWithHostNet
+            dnsPolicy: ClusterFirstWithHostNet
             restartPolicy: OnFailure
             volumes:
             - hostPath:

From 76547c4aad37034537bf0751319bf1f755b1929e Mon Sep 17 00:00:00 2001
From: Shaurya <shaurya.gup@gmail.com>
Date: Fri, 13 Dec 2024 02:24:07 +0000
Subject: [PATCH 08/11] add gcsfuse on user job

---
 src/xpk/core/core.py     |  6 +++++-
 src/xpk/core/pathways.py | 27 +++++++++++++++++++++++----
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/src/xpk/core/core.py b/src/xpk/core/core.py
index b134c8ad..d63d3a99 100644
--- a/src/xpk/core/core.py
+++ b/src/xpk/core/core.py
@@ -2125,7 +2125,11 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str:
 
   if args.use_pathways:
     volume_mount_yaml = """- mountPath: /tmp
-                  name: shared-tmp"""
+                  name: shared-tmp
+                - name: gcs-fuse-csi-ephemeral
+                  mountPath: /training-data
+                - name: dshm
+                  mountPath: /dev/shm"""
   elif (
       system.accelerator_type == AcceleratorType['TPU']
       and args.deploy_stacktrace_sidecar
diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py
index bcb40c1f..8d1a4ced 100644
--- a/src/xpk/core/pathways.py
+++ b/src/xpk/core/pathways.py
@@ -117,9 +117,9 @@ def add_pw_resources_to_kueue(args):
     - name: cpu-rm
       resources:
       - name: "cpu"
-        nominalQuota: 80
+        nominalQuota: 480
       - name: "memory"
-        nominalQuota: 160G
+        nominalQuota: 2000G
     - name: cpu-proxy
       resources:
       - name: "cpu"
@@ -250,11 +250,17 @@ def get_user_workload_for_pathways(args, system: SystemCharacteristics) -> str:
         completions: 1
         parallelism: 1
         template:
+          metadata:
+            annotations:
+              gke-gcsfuse/volumes: "true"
+              gke-gcsfuse/cpu-limit: "0"
+              gke-gcsfuse/memory-limit: "0"
+              gke-gcsfuse/ephemeral-storage-limit: "0"
           spec:
             containers:
               {container}
             nodeSelector:
-              cloud.google.com/gke-nodepool: cpu-user-np
+              cloud.google.com/gke-nodepool: high-mem-pool
             hostNetwork: true
             dnsPolicy: ClusterFirstWithHostNet
             restartPolicy: OnFailure
@@ -262,7 +268,20 @@ def get_user_workload_for_pathways(args, system: SystemCharacteristics) -> str:
             - hostPath:
                 path: /tmp
                 type: DirectoryOrCreate
-              name: shared-tmp"""
+              name: shared-tmp
+            - name: gke-gcsfuse-cache
+              emptyDir:
+                medium: Memory
+            - name: dshm
+              emptyDir:
+                medium: Memory
+            - name: gcs-fuse-csi-ephemeral
+              csi:
+                driver: gcsfuse.csi.storage.gke.io
+                volumeAttributes:
+                  bucketName: trillium-storage-datasets-sr
+                  mountOptions: "debug_fuse,implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
+  """
   if args.headless:
     return ''
   else:

From 5521001a4e628e46da0fea476d43a308dc319ef3 Mon Sep 17 00:00:00 2001
From: RoshaniN <roshanin@google.com>
Date: Fri, 13 Dec 2024 15:12:06 +0000
Subject: [PATCH 09/11] Correct typo on megascale launch hang flag.

---
 src/xpk/core/pathways.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py
index 8d1a4ced..d6f81137 100644
--- a/src/xpk/core/pathways.py
+++ b/src/xpk/core/pathways.py
@@ -45,7 +45,7 @@ def get_pathways_worker_args(args) -> str:
               - --temporary_flags_for_debugging=temporary_flag_for_debugging_megascale_address_derive_from_megascale_grpc=true
               - --megascale_grpc_premap_memory_bytes=17179869184
               - --gcs_scratch_location={args.pathways_gcs_location}
-              - --megascale_graph_within_launch_hang_threshold=5"""  # More flags we can adjust here: https://source.corp.google.com/piper///depot/google3/platforms/xla/megascale/runtime/executor/executor.cc;l=53-81;rcl=705575633
+              - --megascale_graph_within_launch_hang_threshold=5m"""  # More flags we can adjust here: https://source.corp.google.com/piper///depot/google3/platforms/xla/megascale/runtime/executor/executor.cc;l=53-81;rcl=705575633
 
   if args.use_pathways:
     return yaml.format(args=args, rm_address=get_rm_address(args))

From 17433ad2355b8fcd7dc87ae889c74fc0c3bde05a Mon Sep 17 00:00:00 2001
From: RoshaniN <roshanin@google.com>
Date: Fri, 13 Dec 2024 18:14:26 +0000
Subject: [PATCH 10/11] Add XLA flags from the McJAX LR workload for parity.

---
 src/xpk/core/pathways.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py
index d6f81137..dfdc15ab 100644
--- a/src/xpk/core/pathways.py
+++ b/src/xpk/core/pathways.py
@@ -45,7 +45,8 @@ def get_pathways_worker_args(args) -> str:
               - --temporary_flags_for_debugging=temporary_flag_for_debugging_megascale_address_derive_from_megascale_grpc=true
               - --megascale_grpc_premap_memory_bytes=17179869184
               - --gcs_scratch_location={args.pathways_gcs_location}
-              - --megascale_graph_within_launch_hang_threshold=5m"""  # More flags we can adjust here: https://source.corp.google.com/piper///depot/google3/platforms/xla/megascale/runtime/executor/executor.cc;l=53-81;rcl=705575633
+              - --megascale_graph_within_launch_hang_threshold=5m
+              - --deepsea_chip_config_name=megachip_tccontrol"""  # More flags we can adjust here: https://source.corp.google.com/piper///depot/google3/platforms/xla/megascale/runtime/executor/executor.cc;l=53-81;rcl=705575633
 
   if args.use_pathways:
     return yaml.format(args=args, rm_address=get_rm_address(args))
@@ -71,7 +72,14 @@ def get_pathways_proxy_args(args) -> str:
               - --xla_enable_async_all_gather=true
               - --xla_tpu_spmd_rng_bit_generator_unsafe=true
               - --xla_tpu_enable_sunk_dcn_allreduce_done_with_host_reduction=true
-              - --gcs_scratch_location={args.pathways_gcs_location}"""
+              - --gcs_scratch_location={args.pathways_gcs_location}
+              - --xla_sc_disable_megacore_partitioning=true
+              - --xla_tpu_enable_all_reduce_offload_tracing=true
+              - --xla_tpu_use_tc_device_shape_on_sc=true
+              - --xla_tpu_enable_sparse_core_collective_offload_all_reduce=true
+              - --xla_sc_enable_instruction_fusion=false
+              - --xla_sc_disjoint_spmem=false
+              - --deepsea_chip_config_name=megachip_tccontrol"""
 
   if args.use_pathways:
     return yaml.format(args=args, rm_address=get_rm_address(args))
@@ -214,7 +222,8 @@ def get_pathways_rm_args(args, system: SystemCharacteristics) -> str:
               - --node_type=resource_manager
               - --instance_count={instance_count}
               - --temporary_flags_for_debugging=temporary_flag_for_debugging_worker_expected_tpu_chip_config=megachip;;;temporary_flag_for_debugging_megascale_address_derive_from_megascale_grpc=true
-              - --instance_type={instance_type}"""
+              - --instance_type={instance_type}
+              - --deepsea_chip_config_name=megachip_tccontrol"""
   if args.use_pathways:
     return yaml.format(
         args=args,

From 360a060ee001b9bf2b7da3a5c7c12ac216ef54ca Mon Sep 17 00:00:00 2001
From: Sujeeth Jinesh <sujinesh@google.com>
Date: Sat, 14 Dec 2024 00:27:52 +0000
Subject: [PATCH 11/11] Add megachip_tccontrol

---
 src/xpk/core/pathways.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py
index bcb40c1f..a9cdf143 100644
--- a/src/xpk/core/pathways.py
+++ b/src/xpk/core/pathways.py
@@ -213,7 +213,7 @@ def get_pathways_rm_args(args, system: SystemCharacteristics) -> str:
               - --gcs_scratch_location={args.pathways_gcs_location}
               - --node_type=resource_manager
               - --instance_count={instance_count}
-              - --temporary_flags_for_debugging=temporary_flag_for_debugging_worker_expected_tpu_chip_config=megachip;;;temporary_flag_for_debugging_megascale_address_derive_from_megascale_grpc=true
+              - --temporary_flags_for_debugging=temporary_flag_for_debugging_worker_expected_tpu_chip_config=megachip_tccontrol;;;temporary_flag_for_debugging_megascale_address_derive_from_megascale_grpc=true
               - --instance_type={instance_type}"""
   if args.use_pathways:
     return yaml.format(