From b3200a59730b7158ee27eebbbbd39b267be85761 Mon Sep 17 00:00:00 2001
From: Swarna Bharathi Mantena
Date: Thu, 25 Jun 2026 09:25:28 +0000
Subject: [PATCH 1/7] feat(pathways): align GCluster JobSet with XPK production defaults

Aligns the Kubernetes JobSet manifests generated by GCluster with XPK and
GKE Pathways standards to ensure reliable execution of distributed JAX
workloads.

- Injected JAX proxy environment variables (JAX_PLATFORMS,
  JAX_BACKEND_TARGET, XCLOUD_ENVIRONMENT) into the JAX workload container.
- Added host-path volume mount for /tmp to enable shared-memory and local
  socket IPC between the JAX client, Proxy, and Resource Manager.
- Enabled privileged security context (privileged: true) on the JAX
  container to allow host network binding and physical memory locking.
- Added default resource limits (cpu: "24", memory: "100Gi") to the JAX
  workload container to prevent CPU node starvation.
- Wrapped the user command in a SIGTERM-propagating bash trap to ensure
  reliable checkpoint-on-preemption during Spot VM evictions.
- Stamped exclusive-topology annotations on the worker replicated job to
  force contiguous scheduling on GKE TPU node pools.
- Natively injected ALTS bypass environment variables to prevent secure
  gRPC handshake failures on standard GKE VPC networks.
- Propagated priorityClassName to the head coordinator pod spec.
- Updated GKE orchestrator unit tests to assert all new configurations.
--- .../gke/gke_job_orchestrator_test.go | 6 +++ .../gke/templates/pathways_jobset.tmpl | 52 ++++++++++++++++++- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/pkg/orchestrator/gke/gke_job_orchestrator_test.go b/pkg/orchestrator/gke/gke_job_orchestrator_test.go index 2aff1f94cf..9ecbef1d3d 100644 --- a/pkg/orchestrator/gke/gke_job_orchestrator_test.go +++ b/pkg/orchestrator/gke/gke_job_orchestrator_test.go @@ -484,6 +484,12 @@ func TestGeneratePathwaysManifest(t *testing.T) { `cpu: "8"`, `memory: "32Gi"`, "restartStrategy: Recreate", + "privileged: true", + "PATHWAYS_UNSAFE_UNSAFE_OVERRIDE_GRPC_CREDENTIALS", + "IFRT_PROXY_USE_INSECURE_GRPC_CREDENTIALS", + "alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool", + `cpu: "24"`, + "_sigterm() (kill -SIGTERM $! 2>/dev/null;)", } for _, substr := range expectedSubstrs { diff --git a/pkg/orchestrator/gke/templates/pathways_jobset.tmpl b/pkg/orchestrator/gke/templates/pathways_jobset.tmpl index db3545d2a7..bb57ecf890 100644 --- a/pkg/orchestrator/gke/templates/pathways_jobset.tmpl +++ b/pkg/orchestrator/gke/templates/pathways_jobset.tmpl @@ -55,6 +55,9 @@ spec: hostNetwork: true dnsPolicy: ClusterFirstWithHostNet restartPolicy: Never +{{- if .PriorityClassName }} + priorityClassName: {{.PriorityClassName}} +{{- end }} {{- if .ServiceAccountName }} serviceAccountName: {{.ServiceAccountName}} {{- end }} @@ -69,6 +72,7 @@ spec: {{- end }} - name: pathways-proxy image: {{.Pathways.ProxyServerImage}} + imagePullPolicy: Always ports: - containerPort: 29000 args: @@ -81,6 +85,11 @@ spec: {{- range .ProxyArgsList }} - {{.}} {{- end }} + env: + - name: PATHWAYS_HEAD + valueFrom: + fieldRef: + fieldPath: metadata.labels['jobset.sigs.k8s.io/coordinator'] {{- if not .Pathways.Headless}} restartPolicy: Always {{- end }} @@ -90,6 +99,7 @@ spec: memory: "100Gi" - name: pathways-rm image: {{.Pathways.ServerImage}} + imagePullPolicy: Always ports: - containerPort: 29001 - containerPort: 29002 @@ 
-131,13 +141,47 @@ spec: containers: - name: workload-container image: {{.FullImageName}} + imagePullPolicy: Always + securityContext: + privileged: true + resources: + limits: + cpu: "24" + memory: "100Gi" + env: + - name: PATHWAYS_HEAD + valueFrom: + fieldRef: + fieldPath: metadata.labels['jobset.sigs.k8s.io/coordinator'] + - name: JAX_PLATFORMS + value: proxy + - name: XCLOUD_ENVIRONMENT + value: GCP + - name: JAX_BACKEND_TARGET + value: grpc://$(PATHWAYS_HEAD):29000 + - name: PATHWAYS_UNSAFE_UNSAFE_OVERRIDE_GRPC_CREDENTIALS + value: "grpc_insecure_override" + - name: TEST_UNDECLARED_OUTPUTS_DIR + value: "/tmp" + - name: IFRT_PROXY_USE_INSECURE_GRPC_CREDENTIALS + value: "true" command: - "/bin/bash" - "-c" - | - {{.CommandToRun}} -{{- if .VolumeMountsYAML }} + echo "GCluster Start: $(date)" + _sigterm() (kill -SIGTERM $! 2>/dev/null;) + trap _sigterm SIGTERM + ({{.CommandToRun}}) & PID=$! + while kill -0 $PID 2>/dev/null; do sleep 5; done + wait $PID + EXIT_CODE=$? + echo "GCluster End: $(date)" + exit $EXIT_CODE volumeMounts: + - mountPath: /tmp + name: shared-tmp +{{- if .VolumeMountsYAML }} {{.VolumeMountsYAML}} {{- end }} {{- end}} @@ -154,6 +198,9 @@ spec: - name: worker replicas: {{.NumSlices}} template: + metadata: + annotations: + alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool spec: completionMode: Indexed parallelism: {{.NodesPerSlice}} @@ -194,6 +241,7 @@ spec: containers: - name: pathways-worker image: {{.Pathways.WorkerImage}} + imagePullPolicy: Always ports: - containerPort: 29005 - containerPort: 29006 From d43d8f396a97155ac22a5befbb7036cdaa56cee0 Mon Sep 17 00:00:00 2001 From: Swarna Bharathi Mantena Date: Thu, 25 Jun 2026 11:03:43 +0000 Subject: [PATCH 2/7] refactor(gke): address PR review comments and optimize pathways platform translation --- pkg/orchestrator/gke/gke_job_orchestrator_test.go | 2 +- pkg/orchestrator/gke/manifest_generator.go | 10 +++++++++- pkg/orchestrator/gke/templates/pathways_jobset.tmpl | 10 
++++++---- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/pkg/orchestrator/gke/gke_job_orchestrator_test.go b/pkg/orchestrator/gke/gke_job_orchestrator_test.go index 9ecbef1d3d..6e6ec8005f 100644 --- a/pkg/orchestrator/gke/gke_job_orchestrator_test.go +++ b/pkg/orchestrator/gke/gke_job_orchestrator_test.go @@ -489,7 +489,7 @@ func TestGeneratePathwaysManifest(t *testing.T) { "IFRT_PROXY_USE_INSECURE_GRPC_CREDENTIALS", "alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool", `cpu: "24"`, - "_sigterm() (kill -SIGTERM $! 2>/dev/null;)", + "kill -SIGTERM $PID", } for _, substr := range expectedSubstrs { diff --git a/pkg/orchestrator/gke/manifest_generator.go b/pkg/orchestrator/gke/manifest_generator.go index fc606d3ab7..aff4dd5db7 100644 --- a/pkg/orchestrator/gke/manifest_generator.go +++ b/pkg/orchestrator/gke/manifest_generator.go @@ -123,7 +123,15 @@ func (g *GKEOrchestrator) PrepareManifestOptions(job orchestrator.JobDefinition, parts := strings.Split(originalAccelType, "-") instanceType := parts[0] - pathwaysInstanceType := fmt.Sprintf("%s:%s", instanceType, schedOpts.Topology) + + // Reuse GCluster's existing GKE accelerator label mapping and algorithmically + // derive the Pathways short platform key to avoid duplicating mapping tables. 
+ gkeLabel := g.GenerateGKENodeSelectorLabel(instanceType) + pathwaysPlatform := strings.ReplaceAll(gkeLabel, "-podslice", "") + pathwaysPlatform = strings.ReplaceAll(pathwaysPlatform, "-slice", "") + pathwaysPlatform = strings.ReplaceAll(pathwaysPlatform, "-", "") + + pathwaysInstanceType := fmt.Sprintf("%s:%s", pathwaysPlatform, schedOpts.Topology) opts := ManifestOptions{ IsDynamicSlicing: isDynamicSlicing, diff --git a/pkg/orchestrator/gke/templates/pathways_jobset.tmpl b/pkg/orchestrator/gke/templates/pathways_jobset.tmpl index bb57ecf890..d489fce1b7 100644 --- a/pkg/orchestrator/gke/templates/pathways_jobset.tmpl +++ b/pkg/orchestrator/gke/templates/pathways_jobset.tmpl @@ -141,7 +141,6 @@ spec: containers: - name: workload-container image: {{.FullImageName}} - imagePullPolicy: Always securityContext: privileged: true resources: @@ -170,10 +169,13 @@ spec: - "-c" - | echo "GCluster Start: $(date)" - _sigterm() (kill -SIGTERM $! 2>/dev/null;) + _sigterm() { + kill -SIGTERM $PID 2>/dev/null + wait $PID + exit 143 + } trap _sigterm SIGTERM - ({{.CommandToRun}}) & PID=$! - while kill -0 $PID 2>/dev/null; do sleep 5; done + {{.CommandToRun}} & PID=$! wait $PID EXIT_CODE=$? 
echo "GCluster End: $(date)" From a0b70574503d3716c6556d4c1f5f6d81328362dc Mon Sep 17 00:00:00 2001 From: Swarna Bharathi Mantena Date: Thu, 25 Jun 2026 11:18:55 +0000 Subject: [PATCH 3/7] perf(gke): remove imagePullPolicy Always from Pathways support containers --- pkg/orchestrator/gke/templates/pathways_jobset.tmpl | 3 --- 1 file changed, 3 deletions(-) diff --git a/pkg/orchestrator/gke/templates/pathways_jobset.tmpl b/pkg/orchestrator/gke/templates/pathways_jobset.tmpl index d489fce1b7..92a9fa51b3 100644 --- a/pkg/orchestrator/gke/templates/pathways_jobset.tmpl +++ b/pkg/orchestrator/gke/templates/pathways_jobset.tmpl @@ -72,7 +72,6 @@ spec: {{- end }} - name: pathways-proxy image: {{.Pathways.ProxyServerImage}} - imagePullPolicy: Always ports: - containerPort: 29000 args: @@ -99,7 +98,6 @@ spec: memory: "100Gi" - name: pathways-rm image: {{.Pathways.ServerImage}} - imagePullPolicy: Always ports: - containerPort: 29001 - containerPort: 29002 @@ -243,7 +241,6 @@ spec: containers: - name: pathways-worker image: {{.Pathways.WorkerImage}} - imagePullPolicy: Always ports: - containerPort: 29005 - containerPort: 29006 From 3228e70050199fd0b39bc5ccd31775acbd320c52 Mon Sep 17 00:00:00 2001 From: Swarna Bharathi Mantena Date: Thu, 25 Jun 2026 11:37:10 +0000 Subject: [PATCH 4/7] fix(gke): align pathways platform key and enable native ALTS security --- pkg/orchestrator/gke/gke_job_orchestrator_test.go | 2 -- pkg/orchestrator/gke/manifest_generator.go | 8 +++++++- pkg/orchestrator/gke/templates/pathways_jobset.tmpl | 6 ------ 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pkg/orchestrator/gke/gke_job_orchestrator_test.go b/pkg/orchestrator/gke/gke_job_orchestrator_test.go index 6e6ec8005f..8d180c721b 100644 --- a/pkg/orchestrator/gke/gke_job_orchestrator_test.go +++ b/pkg/orchestrator/gke/gke_job_orchestrator_test.go @@ -485,8 +485,6 @@ func TestGeneratePathwaysManifest(t *testing.T) { `memory: "32Gi"`, "restartStrategy: Recreate", "privileged: true", - 
"PATHWAYS_UNSAFE_UNSAFE_OVERRIDE_GRPC_CREDENTIALS", - "IFRT_PROXY_USE_INSECURE_GRPC_CREDENTIALS", "alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool", `cpu: "24"`, "kill -SIGTERM $PID", diff --git a/pkg/orchestrator/gke/manifest_generator.go b/pkg/orchestrator/gke/manifest_generator.go index aff4dd5db7..8934a787c1 100644 --- a/pkg/orchestrator/gke/manifest_generator.go +++ b/pkg/orchestrator/gke/manifest_generator.go @@ -127,7 +127,13 @@ func (g *GKEOrchestrator) PrepareManifestOptions(job orchestrator.JobDefinition, // Reuse GCluster's existing GKE accelerator label mapping and algorithmically // derive the Pathways short platform key to avoid duplicating mapping tables. gkeLabel := g.GenerateGKENodeSelectorLabel(instanceType) - pathwaysPlatform := strings.ReplaceAll(gkeLabel, "-podslice", "") + // Normalize GKE's "v5-lite" naming to JAX/Pathways's standard "v5e" naming + // to ensure correct topology lookup in the Pathways server binary. + normalizedLabel := gkeLabel + if strings.Contains(gkeLabel, "v5-lite") { + normalizedLabel = strings.ReplaceAll(gkeLabel, "v5-lite", "v5e") + } + pathwaysPlatform := strings.ReplaceAll(normalizedLabel, "-podslice", "") pathwaysPlatform = strings.ReplaceAll(pathwaysPlatform, "-slice", "") pathwaysPlatform = strings.ReplaceAll(pathwaysPlatform, "-", "") diff --git a/pkg/orchestrator/gke/templates/pathways_jobset.tmpl b/pkg/orchestrator/gke/templates/pathways_jobset.tmpl index 92a9fa51b3..88feedd650 100644 --- a/pkg/orchestrator/gke/templates/pathways_jobset.tmpl +++ b/pkg/orchestrator/gke/templates/pathways_jobset.tmpl @@ -156,12 +156,6 @@ spec: value: GCP - name: JAX_BACKEND_TARGET value: grpc://$(PATHWAYS_HEAD):29000 - - name: PATHWAYS_UNSAFE_UNSAFE_OVERRIDE_GRPC_CREDENTIALS - value: "grpc_insecure_override" - - name: TEST_UNDECLARED_OUTPUTS_DIR - value: "/tmp" - - name: IFRT_PROXY_USE_INSECURE_GRPC_CREDENTIALS - value: "true" command: - "/bin/bash" - "-c" From 5b98363cfdfe61d6a504a7b31f18dde20f2a5130 
Mon Sep 17 00:00:00 2001 From: Swarna Bharathi Mantena Date: Thu, 25 Jun 2026 11:40:39 +0000 Subject: [PATCH 5/7] fix(gke): explicitly define resource requests for pathways workload container --- pkg/orchestrator/gke/gke_job_orchestrator_test.go | 2 ++ pkg/orchestrator/gke/templates/pathways_jobset.tmpl | 3 +++ 2 files changed, 5 insertions(+) diff --git a/pkg/orchestrator/gke/gke_job_orchestrator_test.go b/pkg/orchestrator/gke/gke_job_orchestrator_test.go index 8d180c721b..494dda3f56 100644 --- a/pkg/orchestrator/gke/gke_job_orchestrator_test.go +++ b/pkg/orchestrator/gke/gke_job_orchestrator_test.go @@ -487,6 +487,8 @@ func TestGeneratePathwaysManifest(t *testing.T) { "privileged: true", "alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool", `cpu: "24"`, + `cpu: "2"`, + `memory: "8Gi"`, "kill -SIGTERM $PID", } diff --git a/pkg/orchestrator/gke/templates/pathways_jobset.tmpl b/pkg/orchestrator/gke/templates/pathways_jobset.tmpl index 88feedd650..b8ff7063ba 100644 --- a/pkg/orchestrator/gke/templates/pathways_jobset.tmpl +++ b/pkg/orchestrator/gke/templates/pathways_jobset.tmpl @@ -145,6 +145,9 @@ spec: limits: cpu: "24" memory: "100Gi" + requests: + cpu: "2" + memory: "8Gi" env: - name: PATHWAYS_HEAD valueFrom: From 516689190950d0bcf3ca5f597e14f4bb3ff66cda Mon Sep 17 00:00:00 2001 From: Swarna Bharathi Mantena Date: Thu, 25 Jun 2026 15:24:01 +0000 Subject: [PATCH 6/7] fix(gke): robustify preemption trap, normalize tpu platform keys, and clean redundant rm volume mounts --- pkg/orchestrator/gke/manifest_generator.go | 7 +++++-- pkg/orchestrator/gke/templates/pathways_jobset.tmpl | 9 ++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/pkg/orchestrator/gke/manifest_generator.go b/pkg/orchestrator/gke/manifest_generator.go index 8934a787c1..d599d6faa5 100644 --- a/pkg/orchestrator/gke/manifest_generator.go +++ b/pkg/orchestrator/gke/manifest_generator.go @@ -127,11 +127,14 @@ func (g *GKEOrchestrator) 
PrepareManifestOptions(job orchestrator.JobDefinition, // Reuse GCluster's existing GKE accelerator label mapping and algorithmically // derive the Pathways short platform key to avoid duplicating mapping tables. gkeLabel := g.GenerateGKENodeSelectorLabel(instanceType) - // Normalize GKE's "v5-lite" naming to JAX/Pathways's standard "v5e" naming - // to ensure correct topology lookup in the Pathways server binary. + // Normalize GKE node selector labels to match JAX/Pathways platform keys: + // 1. Map GKE "v5-lite" (TPU v5e) to JAX standard "v5e" (deriving tpuv5e) + // 2. Map GKE "v5p" (TPU v5p) to JAX standard "v5" (deriving tpuv5) normalizedLabel := gkeLabel if strings.Contains(gkeLabel, "v5-lite") { normalizedLabel = strings.ReplaceAll(gkeLabel, "v5-lite", "v5e") + } else if strings.Contains(gkeLabel, "v5p") { + normalizedLabel = strings.ReplaceAll(gkeLabel, "v5p", "v5") } pathwaysPlatform := strings.ReplaceAll(normalizedLabel, "-podslice", "") pathwaysPlatform = strings.ReplaceAll(pathwaysPlatform, "-slice", "") diff --git a/pkg/orchestrator/gke/templates/pathways_jobset.tmpl b/pkg/orchestrator/gke/templates/pathways_jobset.tmpl index b8ff7063ba..5096416a07 100644 --- a/pkg/orchestrator/gke/templates/pathways_jobset.tmpl +++ b/pkg/orchestrator/gke/templates/pathways_jobset.tmpl @@ -132,9 +132,6 @@ spec: limits: cpu: "8" memory: "32Gi" - volumeMounts: - - mountPath: /tmp - name: shared-tmp {{- if not .Pathways.Headless}} containers: - name: workload-container @@ -165,8 +162,10 @@ spec: - | echo "GCluster Start: $(date)" _sigterm() { - kill -SIGTERM $PID 2>/dev/null - wait $PID + if [ -n "$PID" ]; then + kill -SIGTERM $PID 2>/dev/null + wait $PID + fi exit 143 } trap _sigterm SIGTERM From 60cbce0a2c9cf8eab8d432b93bbca8be5fea451d Mon Sep 17 00:00:00 2001 From: Swarna Bharathi Mantena Date: Thu, 25 Jun 2026 20:06:24 +0000 Subject: [PATCH 7/7] feat(gke): align pathways jobset with stable release and add 60s peer timeout --- .../gke/templates/pathways_jobset.tmpl | 
26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/pkg/orchestrator/gke/templates/pathways_jobset.tmpl b/pkg/orchestrator/gke/templates/pathways_jobset.tmpl index 5096416a07..54349147b8 100644 --- a/pkg/orchestrator/gke/templates/pathways_jobset.tmpl +++ b/pkg/orchestrator/gke/templates/pathways_jobset.tmpl @@ -89,6 +89,8 @@ spec: valueFrom: fieldRef: fieldPath: metadata.labels['jobset.sigs.k8s.io/coordinator'] + - name: ABSL_FLAGS + value: "--pathways_pipe_unreachable_timeout=60s" {{- if not .Pathways.Headless}} restartPolicy: Always {{- end }} @@ -125,6 +127,8 @@ spec: fieldPath: metadata.labels['jobset.sigs.k8s.io/coordinator'] - name: TPU_SKIP_MDS_QUERY value: "true" + - name: ABSL_FLAGS + value: "--pathways_pipe_unreachable_timeout=60s" {{- if not .Pathways.Headless}} restartPolicy: Always {{- end }} @@ -156,6 +160,8 @@ spec: value: GCP - name: JAX_BACKEND_TARGET value: grpc://$(PATHWAYS_HEAD):29000 + - name: ABSL_FLAGS + value: "--pathways_pipe_unreachable_timeout=60s" command: - "/bin/bash" - "-c" @@ -169,14 +175,13 @@ spec: exit 143 } trap _sigterm SIGTERM - {{.CommandToRun}} & PID=$! + ( + {{.CommandToRun}} + ) & PID=$! wait $PID EXIT_CODE=$? echo "GCluster End: $(date)" exit $EXIT_CODE - volumeMounts: - - mountPath: /tmp - name: shared-tmp {{- if .VolumeMountsYAML }} {{.VolumeMountsYAML}} {{- end }} @@ -186,10 +191,6 @@ spec: image: {{.Pathways.ColocatedPythonSidecarImage}} {{- end}} volumes: - - name: shared-tmp - hostPath: - path: /tmp - type: DirectoryOrCreate {{.VolumesYAML}} - name: worker replicas: {{.NumSlices}} @@ -284,15 +285,10 @@ spec: valueFrom: fieldRef: fieldPath: metadata.labels['jobset.sigs.k8s.io/coordinator'] + - name: ABSL_FLAGS + value: "--pathways_pipe_unreachable_timeout=60s" {{.ResourcesString}} - volumeMounts: - - name: shared-tmp - mountPath: /tmp volumes: - - name: shared-tmp - hostPath: - path: /tmp - type: DirectoryOrCreate {{.VolumesYAML}} {{- if .NodeSelector }} nodeSelector: