From 519d67b66d79cc6fe24010e93edcc50b6f084b3d Mon Sep 17 00:00:00 2001 From: Sravani Sanigepalli Date: Wed, 4 Mar 2026 01:52:07 +0530 Subject: [PATCH 1/5] CASM-5771 Fix for CEPH issue faced after storage node rollout when RR is enabled --- .../iuf/workflows/management_rollout.md | 77 +++++++- upgrade/scripts/ceph/RR_ceph_upgrade.sh | 186 ++++++++++++++++++ 2 files changed, 262 insertions(+), 1 deletion(-) create mode 100644 upgrade/scripts/ceph/RR_ceph_upgrade.sh diff --git a/operations/iuf/workflows/management_rollout.md b/operations/iuf/workflows/management_rollout.md index 796cb17b5a51f..0b88193f825b8 100644 --- a/operations/iuf/workflows/management_rollout.md +++ b/operations/iuf/workflows/management_rollout.md @@ -186,6 +186,16 @@ The specific scripts executed as part of this hook are `/usr/share/doc/csm/upgra **`NOTE`** The `management-nodes-rollout` stage creates additional separate Argo workflows when rebuilding NCN storage nodes. The Argo workflow names will include the string `ncn-lifecycle-rebuild`. If monitoring progress with the Argo UI, remember to include these workflows. + **`NOTE`** If `Rack Resiliency` is enabled, add the `_admin` label to all Ceph nodes before proceeding. + + (`ncn-m001#`) Add the `_admin` label to all Ceph nodes. + + ```bash + for host in $(ceph orch host ls --format json | jq -r '.[].hostname'); do + ceph orch host label add $host _admin + done + ``` + 1. (`ncn-m001#`) Execute the `management-nodes-rollout` stage with a single NCN storage node. ```bash @@ -204,7 +214,60 @@ The specific scripts executed as part of this hook are `/usr/share/doc/csm/upgra cray cfs components describe "${XNAME}" ``` - The desired value for `configuration_status` is `configured`. If it is `pending`, then wait for the status to change to `configured`. + The desired value for `configuration_status` is `configured`. If it is `pending`, then wait for the status to change to be `configured`. 
+ + **`NOTE`** If `Rack Resiliency` is enabled, there is a known corner case where the node would not transition to `configured` state for a long time. If this occurs, perform the following steps: + + 1. From the output of `cray cfs components describe "${XNAME}"` that was run above, fetch the `CFS_SESSION_NAME` for the `rack_resiliency_for_mgmt_nodes.yml` playbook: + + ```toml + [[state]] + cloneUrl = "https://api-gw-service-nmn.local/vcs/cray/csm-config-management.git" + commit = "a3e8d330adb99215e2d4cd084fc38ff590718705" + lastUpdated = "2026-03-03T11:41:04Z" + playbook = "rack_resiliency_for_mgmt_nodes.yml" + sessionName = "batcher-75d4e8ee-b688-4ae8-9ecd-065d8a3c9705" + ``` + + The value of `sessionName` is the `CFS_SESSION_NAME`. + + 1. (`ncn-m#`) Identify the CFS pod associated with the session: + + ```bash + CFS_POD_NAME=$(kubectl get pods --no-headers -o custom-columns=":metadata.name" -n services -l cfsession=<CFS_SESSION_NAME>) + echo "${CFS_POD_NAME}" + ``` + + 1. (`ncn-m#`) Inspect the Ansible logs for the CFS pod: + + ```bash + kubectl logs -n services "${CFS_POD_NAME}" ansible + ``` + + 1. (`ncn-m#`) If the session is stuck at the following task for a long time: + + ```text + TASK [csm.rr.ceph_zoning : Apply CEPH zoning] ********************************** + changed: [x3000c0s29b0n0 -> x3000c0s29b0n0] + + TASK [csm.rr.ceph_haproxy : Copy ceph_haproxy.sh script to the target machine] *** + changed: [x3000c0s29b0n0] + ``` + Then delete the CFS pod: + + ```bash + kubectl delete pod -n services "${CFS_POD_NAME}" + ``` + A replacement CFS pod will be created automatically, and the session should proceed to completion successfully. + + + **`NOTE`** If `Rack Resiliency` is enabled, run the following script from `ncn-s001`, which has access to the Kubernetes cluster. The script waits for the Ceph cluster to be stabilized, and then updates the latest monitor configuration in the Ceph CSI, customizations, and `loftsman-cray-sysmgmt-health` ConfigMaps. 
+ + (`ncn-s001#`) Execute `RR_ceph_upgrade.sh` script + + ```bash + /usr/share/doc/csm/upgrade/scripts/ceph/RR_ceph_upgrade.sh + ``` 1. (`ncn-m001#`) Upgrade the remaining NCN storage nodes once the first has upgraded successfully. This upgrades NCN storage nodes serially. Get the number of storage nodes based on the cluster and verify that it is correct. The storage canary node should not be in the list since it has already been upgraded. @@ -227,6 +290,18 @@ The specific scripts executed as part of this hook are `/usr/share/doc/csm/upgra $ncn --format json | jq -r ' .id+" "+.desiredConfig+" status="+.configurationStatus'; done ``` + **`NOTE`** If `Rack Resiliency` is enabled, remove the `_admin` label from all Ceph nodes except `ncn-s001`. The storage canary node (`ncn-s001`) retains the `_admin` label as it functions as the primary Ceph administration node. + + (`ncn-m001#`) Remove the `_admin` label from all Ceph nodes except `ncn-s001`. + + ```bash + for host in $(ceph orch host ls --format json | jq -r '.[].hostname'); do + if [ "$host" != "ncn-s001" ]; then + ceph orch host label rm $host _admin + fi + done + ``` + 1. Perform the NCN master node upgrade of `ncn-m002` and `ncn-m003`. 
> **`NOTE`** If Kubernetes encryption has been enabled via the [Kubernetes Encryption Documentation](../../kubernetes/encryption/README.md), diff --git a/upgrade/scripts/ceph/RR_ceph_upgrade.sh b/upgrade/scripts/ceph/RR_ceph_upgrade.sh new file mode 100644 index 0000000000000..3d93acd7b40ec --- /dev/null +++ b/upgrade/scripts/ceph/RR_ceph_upgrade.sh @@ -0,0 +1,186 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2021-2025 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +# This script is introduced to fix CAST-39537 and is to be used only if Rack Resiliency(RR) is enabled. It performs two main functions: +# 1. Waits for any ongoing Ceph orchestrator operations to complete before syncing the updated monitor configuration to other nodes and kubernetes cluster. +# 2. Updates the monitors list in all Ceph CSI ConfigMaps across relevant namespaces using the current Ceph monitor map. +# 3. 
Copies the updated `/etc/ceph/ceph.conf` file from the storage node to all Kubernetes master nodes. +# 4. Update the customizations.yaml file with the new monitor information. + +# Note - This script should be executed only after first storage node rollout on the first Ceph node(ncn-s001) where kubernetes access is available. + +set -euo pipefail + +# ────────────────────────────── +# Function: wait_for_ceph_orch +# Description: Waits for Ceph orchestrator operations to complete. +# Checks that all daemons are running, MONs are in quorum, +# and cluster health does not report monitor-related issues. +# ────────────────────────────── +wait_for_ceph_orch() { + local timeout=600 # 10 minutes max + local interval=15 + local elapsed=0 + + echo "Waiting for ceph orchestrator operations to complete..." + + while true; do + # Check for any non-running ceph daemons + if ceph orch ps | grep -E 'starting|stopped|error|unknown' >/dev/null; then + echo "Daemons still transitioning..." + + # Ensure all MONs are in quorum + elif ! ceph quorum_status --format json | grep -q '"quorum_names"'; then + echo "Waiting for MON quorum..." + + # Ensure cluster is not reporting monitor-related health issues + elif ceph health | grep -qE 'MON_DOWN|MON_LEFT_QUORUM|MON_JOINED_QUORUM'; then + echo "Waiting for monitor health to stabilize..." + + else + echo "Ceph orchestrator operations completed successfully." + break + fi + + sleep $interval + elapsed=$((elapsed + interval)) + + if [ $elapsed -ge $timeout ]; then + echo "Timed out waiting for ceph orchestrator to finish." + return 1 + fi + done + + return 0 +} + +# ────────────────────────────── +# Function: update_ceph_csi_configmaps +# Description: Updates the monitors list in all Ceph CSI ConfigMaps +# across relevant namespaces using the current Ceph monitor map. 
+# ────────────────────────────── +update_ceph_csi_configmaps() { + local NEW_MONITORS + NEW_MONITORS=$(ceph mon dump -f json | jq -c '[.mons[].public_addr | split("/")[0]]') + # Namespaces ceph-rbd, ceph-cephfs, default, services contains ceph-csi-config configmap + for ns in ceph-rbd ceph-cephfs default services; do + if kubectl -n "$ns" get cm ceph-csi-config >/dev/null 2>&1; then + echo "Updating ceph-csi-config in namespace $ns" + kubectl -n "$ns" get cm ceph-csi-config -o json \ + | jq --argjson mons "$NEW_MONITORS" ' + .data["config.json"] |= ( + fromjson + | map(.monitors = $mons) + | tojson + ) + ' \ + | kubectl apply -f - + else + echo "Skipping $ns (ceph-csi-config not found)" + fi + done + + # Handle ceph-etc configmap in backups namespace separately + if kubectl -n backups get cm ceph-etc >/dev/null 2>&1; then + echo "Updating ceph-etc in namespace backups" + kubectl -n backups get cm ceph-etc -o json \ + | jq --argjson mons "$NEW_MONITORS" ' + .data["config.json"] |= ( + fromjson + | map(.monitors = $mons) + | tojson + ) + ' \ + | kubectl apply -f - + else + echo "Skipping backups (ceph-etc not found)" + fi +} + +# ────────────────────────────── +# Function: update_customizations +# Description: Updates the Ceph monitor addresses in the loftsman site-init +# secret (customizations.yaml) and the loftsman-cray-sysmgmt-health +# ConfigMap (cephExporter.endpoints). +# ────────────────────────────── +update_customizations() { + local new_mons + new_mons=$(ceph mon dump -f json | jq -r '.mons[].public_addr | split("/")[0]') + local tmpdir + tmpdir=$(mktemp -d -p ~) + trap 'rm -rf "${tmpdir}"' EXIT + + echo "Extracting customizations.yaml from secret..." + + kubectl get secret -n loftsman site-init \ + -o jsonpath='{.data.customizations\.yaml}' \ + | base64 -d > "${tmpdir}/customizations.yaml" + + echo "Updating monitor list..." 
+ + NEW_MONS="$new_mons" yq -i \ + '.spec.network.netstaticips.nmn_ncn_storage_mons = (strenv(NEW_MONS) | split("\n") | map(select(length > 0)))' \ + "${tmpdir}/customizations.yaml" + + echo "Recreating secret..." + + kubectl delete secret -n loftsman site-init --ignore-not-found + + kubectl create secret -n loftsman generic site-init \ + --from-file="${tmpdir}/customizations.yaml" + + rm -rf "$tmpdir" + echo "customizations secret updated successfully" + + echo "Updating cephExporter endpoints in loftsman-cray-sysmgmt-health..." + + local manifest_yaml + manifest_yaml=$(kubectl get cm -n loftsman loftsman-cray-sysmgmt-health \ + -o jsonpath='{.data.manifest\.yaml}') + + local updated_manifest + updated_manifest=$(echo "$manifest_yaml" \ + | NEW_MONS="$new_mons" yq e \ + '.spec.charts[].values.cephExporter.endpoints = (strenv(NEW_MONS) | split("\n") | map(select(length > 0)))' -) + + kubectl patch cm -n loftsman loftsman-cray-sysmgmt-health \ + --type merge \ + -p "{\"data\":{\"manifest.yaml\":$(echo "$updated_manifest" | jq -Rs .)}}" + + echo "loftsman-cray-sysmgmt-health ConfigMap updated successfully" +} + +# Execute functions +wait_for_ceph_orch || { + echo "Ceph did not stabilize. Exiting." 
+ exit 1 +} +update_ceph_csi_configmaps +update_customizations + +# Update the ceph.conf file on all k8s master nodes to ensure they have the latest CEPH configuration which would have changed during upgrade to enable CEPH access from the k8s masters +for master in $(kubectl get nodes --selector='node-role.kubernetes.io/control-plane' -o jsonpath='{.items[*].metadata.name}'); do + scp -o StrictHostKeyChecking=no /etc/ceph/ceph.conf "${master}:/etc/ceph/ceph.conf" +done From 5e5f8a0be9babc8cb4c1ea0fb0964090ea5d6ad7 Mon Sep 17 00:00:00 2001 From: Sravani Sanigepalli Date: Wed, 4 Mar 2026 02:28:02 +0530 Subject: [PATCH 2/5] CASM-5771 Fixing few issues --- .../iuf/workflows/management_rollout.md | 6 +- upgrade/scripts/ceph/RR_ceph_upgrade.sh | 186 +++++++++--------- 2 files changed, 97 insertions(+), 95 deletions(-) diff --git a/operations/iuf/workflows/management_rollout.md b/operations/iuf/workflows/management_rollout.md index 0b88193f825b8..529d1ce15d20c 100644 --- a/operations/iuf/workflows/management_rollout.md +++ b/operations/iuf/workflows/management_rollout.md @@ -253,15 +253,17 @@ The specific scripts executed as part of this hook are `/usr/share/doc/csm/upgra TASK [csm.rr.ceph_haproxy : Copy ceph_haproxy.sh script to the target machine] *** changed: [x3000c0s29b0n0] ``` + Then delete the CFS pod: ```bash kubectl delete pod -n services "${CFS_POD_NAME}" ``` - A replacement CFS pod will be created automatically, and the session should proceed to completion successfully. + A replacement CFS pod will be created automatically, and the session should proceed to completion successfully. - **`NOTE`** If `Rack Resiliency` is enabled, run the following script from `ncn-s001`, which has access to the Kubernetes cluster. The script waits for the Ceph cluster to be stabilized, and then updates the latest monitor configuration in the Ceph CSI, customizations, and `loftsman-cray-sysmgmt-health` ConfigMaps. 
+ **`NOTE`** If `Rack Resiliency` is enabled, run the following script from `ncn-s001`, which has access to the Kubernetes cluster. + The script waits for completion of Ceph Orchestrator operations, and updates the latest monitor configuration in several ConfigMaps. (`ncn-s001#`) Execute `RR_ceph_upgrade.sh` script diff --git a/upgrade/scripts/ceph/RR_ceph_upgrade.sh b/upgrade/scripts/ceph/RR_ceph_upgrade.sh index 3d93acd7b40ec..14cb8e080e14f 100644 --- a/upgrade/scripts/ceph/RR_ceph_upgrade.sh +++ b/upgrade/scripts/ceph/RR_ceph_upgrade.sh @@ -2,7 +2,7 @@ # # MIT License # -# (C) Copyright 2021-2025 Hewlett Packard Enterprise Development LP +# (C) Copyright 2021-2026 Hewlett Packard Enterprise Development LP # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), @@ -23,11 +23,11 @@ # OTHER DEALINGS IN THE SOFTWARE. # -# This script is introduced to fix CAST-39537 and is to be used only if Rack Resiliency(RR) is enabled. It performs two main functions: +# This script is introduced to fix CAST-39537 and is to be used only if Rack Resiliency(RR) is enabled. It performs four main functions: # 1. Waits for any ongoing Ceph orchestrator operations to complete before syncing the updated monitor configuration to other nodes and kubernetes cluster. -# 2. Updates the monitors list in all Ceph CSI ConfigMaps across relevant namespaces using the current Ceph monitor map. +# 2. Updates the monitors list in all Ceph CSI ConfigMaps across relevant namespaces using the current Ceph monitor map. # 3. Copies the updated `/etc/ceph/ceph.conf` file from the storage node to all Kubernetes master nodes. -# 4. Update the customizations.yaml file with the new monitor information. +# 4. Updates the customizations.yaml file and loftsman-cray-sysmgmt-health ConfigMap with the new monitor information. 
# Note - This script should be executed only after first storage node rollout on the first Ceph node(ncn-s001) where kubernetes access is available. @@ -35,88 +35,88 @@ set -euo pipefail # ────────────────────────────── # Function: wait_for_ceph_orch -# Description: Waits for Ceph orchestrator operations to complete. +# Description: Waits for Ceph orchestrator operations to complete. # Checks that all daemons are running, MONs are in quorum, # and cluster health does not report monitor-related issues. # ────────────────────────────── wait_for_ceph_orch() { - local timeout=600 # 10 minutes max - local interval=15 - local elapsed=0 + local timeout=600 # 10 minutes max + local interval=15 + local elapsed=0 - echo "Waiting for ceph orchestrator operations to complete..." + echo "Waiting for ceph orchestrator operations to complete..." - while true; do - # Check for any non-running ceph daemons - if ceph orch ps | grep -E 'starting|stopped|error|unknown' >/dev/null; then - echo "Daemons still transitioning..." + while true; do + # Check for any non-running ceph daemons + if ceph orch ps | grep -E 'starting|stopped|error|unknown' >/dev/null; then + echo "Daemons still transitioning..." - # Ensure all MONs are in quorum - elif ! ceph quorum_status --format json | grep -q '"quorum_names"'; then - echo "Waiting for MON quorum..." + # Ensure all MONs are in quorum + elif ! ceph quorum_status --format json | grep -q '"quorum_names"'; then + echo "Waiting for MON quorum..." - # Ensure cluster is not reporting monitor-related health issues - elif ceph health | grep -qE 'MON_DOWN|MON_LEFT_QUORUM|MON_JOINED_QUORUM'; then - echo "Waiting for monitor health to stabilize..." + # Ensure cluster is not reporting monitor-related health issues + elif ceph health | grep -qE 'MON_DOWN|MON_LEFT_QUORUM|MON_JOINED_QUORUM'; then + echo "Waiting for monitor health to stabilize..." - else - echo "Ceph orchestrator operations completed successfully." 
- break - fi + else + echo "Ceph orchestrator operations completed successfully." + break + fi - sleep $interval - elapsed=$((elapsed + interval)) + sleep "$interval" + elapsed=$((elapsed + interval)) - if [ $elapsed -ge $timeout ]; then - echo "Timed out waiting for ceph orchestrator to finish." - return 1 - fi - done + if [[ $elapsed -ge $timeout ]]; then + echo "Timed out waiting for ceph orchestrator to finish." + return 1 + fi + done - return 0 + return 0 } # ────────────────────────────── # Function: update_ceph_csi_configmaps -# Description: Updates the monitors list in all Ceph CSI ConfigMaps +# Description: Updates the monitors list in all Ceph CSI ConfigMaps # across relevant namespaces using the current Ceph monitor map. # ────────────────────────────── update_ceph_csi_configmaps() { - local NEW_MONITORS - NEW_MONITORS=$(ceph mon dump -f json | jq -c '[.mons[].public_addr | split("/")[0]]') - # Namespaces ceph-rbd, ceph-cephfs, default, services contains ceph-csi-config configmap - for ns in ceph-rbd ceph-cephfs default services; do - if kubectl -n "$ns" get cm ceph-csi-config >/dev/null 2>&1; then - echo "Updating ceph-csi-config in namespace $ns" - kubectl -n "$ns" get cm ceph-csi-config -o json \ - | jq --argjson mons "$NEW_MONITORS" ' + local NEW_MONITORS + NEW_MONITORS=$(ceph mon dump -f json | jq -c '[.mons[].public_addr | split("/")[0]]') + # Namespaces ceph-rbd, ceph-cephfs, default, services contains ceph-csi-config configmap + for ns in ceph-rbd ceph-cephfs default services; do + if kubectl -n "$ns" get cm ceph-csi-config >/dev/null 2>&1; then + echo "Updating ceph-csi-config in namespace $ns" + kubectl -n "$ns" get cm ceph-csi-config -o json | + jq --argjson mons "$NEW_MONITORS" ' .data["config.json"] |= ( fromjson | map(.monitors = $mons) | tojson ) - ' \ - | kubectl apply -f - - else - echo "Skipping $ns (ceph-csi-config not found)" - fi - done - - # Handle ceph-etc configmap in backups namespace separately - if kubectl -n backups get cm 
ceph-etc >/dev/null 2>&1; then - echo "Updating ceph-etc in namespace backups" - kubectl -n backups get cm ceph-etc -o json \ - | jq --argjson mons "$NEW_MONITORS" ' + ' | + kubectl apply -f - + else + echo "Skipping $ns (ceph-csi-config not found)" + fi + done + + # Handle ceph-etc configmap in backups namespace separately + if kubectl -n backups get cm ceph-etc >/dev/null 2>&1; then + echo "Updating ceph-etc in namespace backups" + kubectl -n backups get cm ceph-etc -o json | + jq --argjson mons "$NEW_MONITORS" ' .data["config.json"] |= ( fromjson | map(.monitors = $mons) | tojson ) - ' \ - | kubectl apply -f - - else - echo "Skipping backups (ceph-etc not found)" - fi + ' | + kubectl apply -f - + else + echo "Skipping backups (ceph-etc not found)" + fi } # ────────────────────────────── @@ -126,61 +126,61 @@ update_ceph_csi_configmaps() { # ConfigMap (cephExporter.endpoints). # ────────────────────────────── update_customizations() { - local new_mons - new_mons=$(ceph mon dump -f json | jq -r '.mons[].public_addr | split("/")[0]') - local tmpdir - tmpdir=$(mktemp -d -p ~) - trap 'rm -rf "${tmpdir}"' EXIT + local new_mons + new_mons=$(ceph mon dump -f json | jq -r '.mons[].public_addr | split("/")[0]') + local tmpdir + tmpdir=$(mktemp -d -p ~) + + echo "Extracting customizations.yaml from secret..." - echo "Extracting customizations.yaml from secret..." + kubectl get secret -n loftsman site-init \ + -o jsonpath='{.data.customizations\.yaml}' | + base64 -d >"${tmpdir}/customizations.yaml" - kubectl get secret -n loftsman site-init \ - -o jsonpath='{.data.customizations\.yaml}' \ - | base64 -d > "${tmpdir}/customizations.yaml" + echo "Updating monitor list..." - echo "Updating monitor list..." 
+ NEW_MONS="$new_mons" yq -i \ + '.spec.network.netstaticips.nmn_ncn_storage_mons = (strenv(NEW_MONS) | split("\n") | map(select(length > 0)))' \ + "${tmpdir}/customizations.yaml" - NEW_MONS="$new_mons" yq -i \ - '.spec.network.netstaticips.nmn_ncn_storage_mons = (strenv(NEW_MONS) | split("\n") | map(select(length > 0)))' \ - "${tmpdir}/customizations.yaml" + echo "Recreating secret..." - echo "Recreating secret..." + kubectl delete secret -n loftsman site-init --ignore-not-found - kubectl delete secret -n loftsman site-init --ignore-not-found - - kubectl create secret -n loftsman generic site-init \ - --from-file="${tmpdir}/customizations.yaml" + kubectl create secret -n loftsman generic site-init \ + --from-file="${tmpdir}/customizations.yaml" - rm -rf "$tmpdir" - echo "customizations secret updated successfully" + rm -rf "$tmpdir" + trap - EXIT + echo "customizations secret updated successfully" - echo "Updating cephExporter endpoints in loftsman-cray-sysmgmt-health..." + echo "Updating cephExporter endpoints in loftsman-cray-sysmgmt-health..." 
- local manifest_yaml - manifest_yaml=$(kubectl get cm -n loftsman loftsman-cray-sysmgmt-health \ - -o jsonpath='{.data.manifest\.yaml}') + local manifest_yaml + manifest_yaml=$(kubectl get cm -n loftsman loftsman-cray-sysmgmt-health \ + -o jsonpath='{.data.manifest\.yaml}') - local updated_manifest - updated_manifest=$(echo "$manifest_yaml" \ - | NEW_MONS="$new_mons" yq e \ - '.spec.charts[].values.cephExporter.endpoints = (strenv(NEW_MONS) | split("\n") | map(select(length > 0)))' -) + local updated_manifest + updated_manifest=$(echo "$manifest_yaml" | + NEW_MONS="$new_mons" yq e \ + '.spec.charts[].values.cephExporter.endpoints = (strenv(NEW_MONS) | split("\n") | map(select(length > 0)))' -) - kubectl patch cm -n loftsman loftsman-cray-sysmgmt-health \ - --type merge \ - -p "{\"data\":{\"manifest.yaml\":$(echo "$updated_manifest" | jq -Rs .)}}" + kubectl patch cm -n loftsman loftsman-cray-sysmgmt-health \ + --type merge \ + -p "{\"data\":{\"manifest.yaml\":$(echo "$updated_manifest" | jq -Rs .)}}" - echo "loftsman-cray-sysmgmt-health ConfigMap updated successfully" + echo "loftsman-cray-sysmgmt-health ConfigMap updated successfully" } # Execute functions wait_for_ceph_orch || { - echo "Ceph did not stabilize. Exiting." - exit 1 + echo "Ceph did not stabilize. Exiting." 
+ exit 1 } update_ceph_csi_configmaps update_customizations -# Update the ceph.conf file on all k8s master nodes to ensure they have the latest CEPH configuration which would have changed during upgrade to enable CEPH access from the k8s masters +# Update the ceph.conf file on all k8s master nodes to ensure they have the latest CEPH configuration which would have changed during upgrade to enable CEPH access from the k8s masters for master in $(kubectl get nodes --selector='node-role.kubernetes.io/control-plane' -o jsonpath='{.items[*].metadata.name}'); do - scp -o StrictHostKeyChecking=no /etc/ceph/ceph.conf "${master}:/etc/ceph/ceph.conf" + scp -o StrictHostKeyChecking=no /etc/ceph/ceph.conf "${master}:/etc/ceph/ceph.conf" done From 3c58c494f7ae6b06495e7ab1ba21c9d23a1f1960 Mon Sep 17 00:00:00 2001 From: Sravani Sanigepalli Date: Wed, 4 Mar 2026 02:37:52 +0530 Subject: [PATCH 3/5] CASM-5771 Fixing shfmt issues --- upgrade/scripts/ceph/RR_ceph_upgrade.sh | 31 ++++++++++++------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/upgrade/scripts/ceph/RR_ceph_upgrade.sh b/upgrade/scripts/ceph/RR_ceph_upgrade.sh index 14cb8e080e14f..9439d235b0f56 100644 --- a/upgrade/scripts/ceph/RR_ceph_upgrade.sh +++ b/upgrade/scripts/ceph/RR_ceph_upgrade.sh @@ -48,7 +48,7 @@ wait_for_ceph_orch() { while true; do # Check for any non-running ceph daemons - if ceph orch ps | grep -E 'starting|stopped|error|unknown' >/dev/null; then + if ceph orch ps | grep -E 'starting|stopped|error|unknown' > /dev/null; then echo "Daemons still transitioning..." 
# Ensure all MONs are in quorum @@ -86,34 +86,34 @@ update_ceph_csi_configmaps() { NEW_MONITORS=$(ceph mon dump -f json | jq -c '[.mons[].public_addr | split("/")[0]]') # Namespaces ceph-rbd, ceph-cephfs, default, services contains ceph-csi-config configmap for ns in ceph-rbd ceph-cephfs default services; do - if kubectl -n "$ns" get cm ceph-csi-config >/dev/null 2>&1; then + if kubectl -n "$ns" get cm ceph-csi-config > /dev/null 2>&1; then echo "Updating ceph-csi-config in namespace $ns" - kubectl -n "$ns" get cm ceph-csi-config -o json | - jq --argjson mons "$NEW_MONITORS" ' + kubectl -n "$ns" get cm ceph-csi-config -o json \ + | jq --argjson mons "$NEW_MONITORS" ' .data["config.json"] |= ( fromjson | map(.monitors = $mons) | tojson ) - ' | - kubectl apply -f - + ' \ + | kubectl apply -f - else echo "Skipping $ns (ceph-csi-config not found)" fi done # Handle ceph-etc configmap in backups namespace separately - if kubectl -n backups get cm ceph-etc >/dev/null 2>&1; then + if kubectl -n backups get cm ceph-etc > /dev/null 2>&1; then echo "Updating ceph-etc in namespace backups" - kubectl -n backups get cm ceph-etc -o json | - jq --argjson mons "$NEW_MONITORS" ' + kubectl -n backups get cm ceph-etc -o json \ + | jq --argjson mons "$NEW_MONITORS" ' .data["config.json"] |= ( fromjson | map(.monitors = $mons) | tojson ) - ' | - kubectl apply -f - + ' \ + | kubectl apply -f - else echo "Skipping backups (ceph-etc not found)" fi @@ -134,8 +134,8 @@ update_customizations() { echo "Extracting customizations.yaml from secret..." kubectl get secret -n loftsman site-init \ - -o jsonpath='{.data.customizations\.yaml}' | - base64 -d >"${tmpdir}/customizations.yaml" + -o jsonpath='{.data.customizations\.yaml}' \ + | base64 -d >"${tmpdir}/customizations.yaml" echo "Updating monitor list..." 
@@ -151,7 +151,6 @@ update_customizations() { --from-file="${tmpdir}/customizations.yaml" rm -rf "$tmpdir" - trap - EXIT echo "customizations secret updated successfully" echo "Updating cephExporter endpoints in loftsman-cray-sysmgmt-health..." @@ -161,8 +160,8 @@ update_customizations() { -o jsonpath='{.data.manifest\.yaml}') local updated_manifest - updated_manifest=$(echo "$manifest_yaml" | - NEW_MONS="$new_mons" yq e \ + updated_manifest=$(echo "$manifest_yaml" \ + | NEW_MONS="$new_mons" yq e \ '.spec.charts[].values.cephExporter.endpoints = (strenv(NEW_MONS) | split("\n") | map(select(length > 0)))' -) kubectl patch cm -n loftsman loftsman-cray-sysmgmt-health \ From 69ad2f7a7f2e8f65fcab1250366d136f2b1c8d70 Mon Sep 17 00:00:00 2001 From: Sravani Sanigepalli Date: Wed, 4 Mar 2026 02:42:56 +0530 Subject: [PATCH 4/5] CASM-5771 Fixing shfmt issues --- upgrade/scripts/ceph/RR_ceph_upgrade.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upgrade/scripts/ceph/RR_ceph_upgrade.sh b/upgrade/scripts/ceph/RR_ceph_upgrade.sh index 9439d235b0f56..9a71bf656245d 100644 --- a/upgrade/scripts/ceph/RR_ceph_upgrade.sh +++ b/upgrade/scripts/ceph/RR_ceph_upgrade.sh @@ -135,7 +135,7 @@ update_customizations() { kubectl get secret -n loftsman site-init \ -o jsonpath='{.data.customizations\.yaml}' \ - | base64 -d >"${tmpdir}/customizations.yaml" + | base64 -d > "${tmpdir}/customizations.yaml" echo "Updating monitor list..." 
From f592a19f331e646d0abcb09eb2edf23125be1f12 Mon Sep 17 00:00:00 2001
From: Sravani Sanigepalli
Date: Wed, 20 May 2026 11:07:08 +0530
Subject: [PATCH 5/5] CASM-5771 Adding RR CEPH upgrade script

---
 upgrade/scripts/ceph/RR_ceph_upgrade.py | 424 ++++++++++++++++++++++++
 1 file changed, 424 insertions(+)
 create mode 100755 upgrade/scripts/ceph/RR_ceph_upgrade.py

diff --git a/upgrade/scripts/ceph/RR_ceph_upgrade.py b/upgrade/scripts/ceph/RR_ceph_upgrade.py
new file mode 100755
index 0000000000000..f95ec6e970c55
--- /dev/null
+++ b/upgrade/scripts/ceph/RR_ceph_upgrade.py
@@ -0,0 +1,424 @@
+#!/usr/bin/env python3
+#
+# MIT License
+#
+# (C) Copyright 2021-2026 Hewlett Packard Enterprise Development LP
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+
+"""
+This script is introduced to fix CAST-39537 and is to be used only if Rack Resiliency (RR) is enabled.
+It performs four main functions:
+  1. Waits for any ongoing Ceph orchestrator operations to complete before syncing the updated
+     monitor configuration to other nodes and kubernetes cluster.
+  2. Updates the monitors list in all Ceph CSI ConfigMaps across relevant namespaces using the
+     current Ceph monitor map.
+  3. Updates the customizations.yaml file and loftsman-cray-sysmgmt-health ConfigMap with the new
+     monitor information.
+  4. Copies the updated /etc/ceph/ceph.conf file from the storage node to all Kubernetes master nodes.
+
+Note: This script should be executed only after the first storage node rollout on the first Ceph
+      node (ncn-s001) where kubernetes access is available.
+"""
+
+import base64
+import json
+import logging
+import os
+import re
+import shlex
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+
+import yaml
+
+logger = logging.getLogger(__name__)
+if not logger.hasHandlers():
+    # Nothing has configured logging yet (the normal case when run as a script).
+    logging.basicConfig(level=logging.INFO)
+
+# Daemon states in "ceph orch ps" output that mean a daemon has not settled.
+_DAEMON_TRANSITION_RE = re.compile(r"starting|stopped|error|unknown")
+# Strings in "ceph health" output that mean the monitors have not settled.
+_MON_HEALTH_RE = re.compile(r"MON_DOWN|MON_LEFT_QUORUM|MON_JOINED_QUORUM")
+# Namespaces that may contain a ceph-csi-config ConfigMap.
+_CSI_CONFIG_NAMESPACES = ("ceph-rbd", "ceph-cephfs", "default", "services")
+
+
+def run_command(command: str) -> str:
+    """
+    Run a shell command and return its standard output.
+
+    Args:
+        command (str): The shell command to run.
+    Returns:
+        str: The standard output of the command.
+    Raises:
+        ValueError: If the command exits with a non-zero status.
+    """
+    logger.info(f"Running command: {command}")
+    try:
+        result = subprocess.run(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            universal_newlines=True,
+            shell=True,
+            check=True,
+        )
+    except subprocess.CalledProcessError as e:
+        raise ValueError(f"Command {command} errored out with : {e.stderr}") from e
+    return result.stdout
+
+
+def run_cmd(cmd, check=True, capture_output=True, text=True):
+    """
+    Run a shell command and return the CompletedProcess result.
+
+    Unlike run_command(), the caller gets the whole result object, and a
+    failure raises subprocess.CalledProcessError (only when check is True).
+    """
+    # PIPE/universal_newlines are used instead of capture_output=/text= so
+    # that this also runs on Python 3.6, which lacks those two keywords.
+    pipe = subprocess.PIPE if capture_output else None
+    return subprocess.run(
+        cmd, shell=True, check=check, stdout=pipe, stderr=pipe, universal_newlines=text
+    )
+
+
+def _run_argv(argv, input_data=None, label=None):
+    """
+    Run a command given as an argument list (no shell involved).
+
+    Args:
+        argv (list): Program name followed by its arguments.
+        input_data (str): Text to feed to the command's standard input, if any.
+        label (str): Text to log in place of the full argument list.
+    Raises:
+        ValueError: If the command exits with a non-zero status.
+    """
+    shown = label or " ".join(argv)
+    logger.info(f"Running command: {shown}")
+    try:
+        subprocess.run(
+            argv,
+            input=input_data,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            universal_newlines=True,
+            check=True,
+        )
+    except subprocess.CalledProcessError as e:
+        raise ValueError(f"Command {shown} errored out with : {e.stderr}") from e
+
+
+def _ensure_mapping(parent, key):
+    """Return parent[key], first replacing a missing or null value with {}."""
+    child = parent.get(key)
+    if child is None:
+        child = parent[key] = {}
+    return child
+
+
+def _daemons_settled():
+    """Return True when "ceph orch ps" succeeds and lists no transitioning daemon."""
+    try:
+        orch_ps = run_command("ceph orch ps")
+    except ValueError as err:
+        # A failed query says nothing about the daemons, so keep waiting.
+        logger.warning(f"Unable to list ceph daemons: {err}")
+        return False
+    return _DAEMON_TRANSITION_RE.search(orch_ps) is None
+
+
+def _all_mons_in_quorum():
+    """Return True when every monitor listed in the monmap is in quorum."""
+    try:
+        status = json.loads(run_command("ceph quorum_status --format json"))
+    except ValueError as err:
+        # Raised both for a failed command and for output that is not JSON.
+        logger.warning(f"Unable to read MON quorum status: {err}")
+        return False
+    if not isinstance(status, dict):
+        return False
+    in_quorum = set(status.get("quorum_names") or [])
+    monmap = status.get("monmap") or {}
+    known = {mon.get("name") for mon in monmap.get("mons") or []}
+    return bool(in_quorum) and known <= in_quorum
+
+
+def _mon_health_ok():
+    """Return True when "ceph health" succeeds and reports no monitor issue."""
+    try:
+        ceph_health = run_command("ceph health")
+    except ValueError as err:
+        logger.warning(f"Unable to read ceph health: {err}")
+        return False
+    return _MON_HEALTH_RE.search(ceph_health) is None
+
+
+def wait_for_ceph_orch(timeout=600, interval=15):
+    """
+    Wait for Ceph orchestrator operations to complete.
+
+    The cluster counts as stable once, within one pass, no daemon is in a
+    transitional state, every monitor is in quorum, and cluster health
+    reports no monitor-related issue. A check whose command fails counts
+    as "not stable yet" and is retried on the next pass.
+
+    Args:
+        timeout (int): Maximum number of seconds to wait.
+        interval (int): Number of seconds to sleep between two passes.
+    Returns:
+        bool: True once the cluster is stable, False on timeout.
+    """
+    deadline = time.monotonic() + timeout
+    logger.info("Waiting for ceph orchestrator operations to complete...")
+
+    while True:
+        daemons_ok = _daemons_settled()
+        if not daemons_ok:
+            logger.info("Daemons still transitioning...")
+
+        quorum_ok = _all_mons_in_quorum()
+        if not quorum_ok:
+            logger.info("Waiting for MON quorum...")
+
+        health_ok = _mon_health_ok()
+        if not health_ok:
+            logger.info("Waiting for monitor health to stabilize...")
+
+        if daemons_ok and quorum_ok and health_ok:
+            logger.info("Ceph orchestrator operations completed successfully.")
+            return True
+
+        remaining = deadline - time.monotonic()
+        if remaining <= 0:
+            logger.error("Timed out waiting for ceph orchestrator to finish.")
+            return False
+        time.sleep(min(interval, remaining))
+
+
+def get_new_monitors():
+    """
+    Get the current list of Ceph monitor addresses from the monitor map.
+
+    Returns:
+        list: The public_addr of each monitor, cut at the first "/".
+    Raises:
+        ValueError: If the command fails, its output is not JSON, or the
+            monitor map lists no monitor.
+    """
+    mon_dump = json.loads(run_command("ceph mon dump -f json"))
+    monitors = [mon["public_addr"].split("/")[0] for mon in mon_dump["mons"]]
+    if not monitors:
+        # An empty list would erase every monitor reference downstream.
+        raise ValueError("ceph mon dump returned no monitors")
+    return monitors
+
+
+def _update_configmap_monitors(namespace, name, monitors):
+    """
+    Set "monitors" on every entry of the config.json key of a ConfigMap.
+
+    A ConfigMap that does not exist or has no config.json key is skipped.
+
+    Raises:
+        ValueError: If a kubectl command fails or config.json is not JSON.
+    """
+    # With --ignore-not-found a missing ConfigMap gives empty output and exit
+    # code 0, so a genuine API error is not mistaken for a missing ConfigMap.
+    raw = run_command(f"kubectl -n {namespace} get cm {name} --ignore-not-found -o json")
+    if not raw.strip():
+        logger.info(f"Skipping {namespace} ({name} not found)")
+        return
+
+    cm = json.loads(raw)
+    data = cm.get("data") or {}
+    if "config.json" not in data:
+        # Do not add a config.json key to a ConfigMap that never had one.
+        logger.warning(f"Skipping {namespace} ({name} has no config.json)")
+        return
+
+    logger.info(f"Updating {name} in namespace {namespace}")
+    config = json.loads(data["config.json"])
+    for entry in config:
+        entry["monitors"] = monitors
+    data["config.json"] = json.dumps(config)
+    cm["data"] = data
+
+    _run_argv(["kubectl", "apply", "-f", "-"], input_data=json.dumps(cm))
+
+
+def update_ceph_csi_configmaps():
+    """
+    Update the monitors list in all Ceph CSI ConfigMaps across relevant
+    namespaces, and in the ceph-etc ConfigMap of the backups namespace,
+    using the current Ceph monitor map.
+    """
+    new_monitors = get_new_monitors()
+
+    for ns in _CSI_CONFIG_NAMESPACES:
+        _update_configmap_monitors(ns, "ceph-csi-config", new_monitors)
+
+    # The ceph-etc ConfigMap in the backups namespace is handled the same way.
+    _update_configmap_monitors("backups", "ceph-etc", new_monitors)
+
+
+def _update_site_init_secret(monitors):
+    """Set nmn_ncn_storage_mons in customizations.yaml of the site-init secret."""
+    logger.info("Extracting customizations.yaml from secret...")
+    encoded = run_command(
+        "kubectl get secret -n loftsman site-init -o jsonpath='{.data.customizations\\.yaml}'"
+    ).strip()
+    customizations = yaml.safe_load(base64.b64decode(encoded).decode("utf-8"))
+    if not isinstance(customizations, dict):
+        raise ValueError("customizations.yaml in secret site-init is empty or not a mapping")
+
+    logger.info("Updating monitor list...")
+    network = _ensure_mapping(_ensure_mapping(customizations, "spec"), "network")
+    _ensure_mapping(network, "netstaticips")["nmn_ncn_storage_mons"] = list(monitors)
+
+    tmpdir = tempfile.mkdtemp(dir=os.path.expanduser("~"))
+    keep_tmpdir = False
+    try:
+        # kubectl uses the file name as the key inside the recreated secret.
+        customizations_path = os.path.join(tmpdir, "customizations.yaml")
+        with open(customizations_path, "w", encoding="utf-8") as f:
+            # sort_keys=False keeps the key order of the original document.
+            yaml.dump(customizations, f, default_flow_style=False, sort_keys=False)
+
+        logger.info("Recreating secret...")
+        run_command("kubectl delete secret -n loftsman site-init --ignore-not-found")
+        try:
+            run_command(
+                "kubectl create secret -n loftsman generic site-init "
+                f"--from-file={shlex.quote(customizations_path)}"
+            )
+        except ValueError:
+            # The secret no longer exists here, so keep the only copy of it.
+            keep_tmpdir = True
+            logger.error(
+                "Secret site-init was deleted but could not be recreated; "
+                f"its new content is kept in {customizations_path}"
+            )
+            raise
+        logger.info("customizations secret updated successfully")
+    finally:
+        if not keep_tmpdir:
+            shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+def _update_sysmgmt_health_manifest(monitors):
+    """Set cephExporter.endpoints on every chart of loftsman-cray-sysmgmt-health."""
+    logger.info("Updating cephExporter endpoints in loftsman-cray-sysmgmt-health...")
+    manifest = yaml.safe_load(
+        run_command(
+            "kubectl get cm -n loftsman loftsman-cray-sysmgmt-health "
+            "-o jsonpath='{.data.manifest\\.yaml}'"
+        )
+    )
+    if not isinstance(manifest, dict):
+        raise ValueError("manifest.yaml in loftsman-cray-sysmgmt-health is empty or not a mapping")
+
+    for chart in (manifest.get("spec") or {}).get("charts") or []:
+        exporter = _ensure_mapping(_ensure_mapping(chart, "values"), "cephExporter")
+        # Each chart gets its own list: PyYAML writes an object that is
+        # referenced more than once as a YAML anchor plus aliases.
+        exporter["endpoints"] = list(monitors)
+
+    updated_manifest = yaml.dump(manifest, default_flow_style=False, sort_keys=False)
+    patch_payload = json.dumps({"data": {"manifest.yaml": updated_manifest}})
+    _run_argv(
+        [
+            "kubectl", "patch", "cm", "-n", "loftsman",
+            "loftsman-cray-sysmgmt-health",
+            "--type", "merge",
+            "-p", patch_payload,
+        ],
+        # Keep the whole manifest out of the log.
+        label="kubectl patch cm -n loftsman loftsman-cray-sysmgmt-health --type merge",
+    )
+    logger.info("loftsman-cray-sysmgmt-health ConfigMap updated successfully")
+
+
+def update_customizations():
+    """
+    Update the Ceph monitor addresses in the loftsman site-init secret
+    (customizations.yaml) and the loftsman-cray-sysmgmt-health ConfigMap
+    (cephExporter.endpoints).
+
+    Raises:
+        ValueError: If a command fails or a document is not a YAML mapping.
+    """
+    new_mons = get_new_monitors()
+    _update_site_init_secret(new_mons)
+    _update_sysmgmt_health_manifest(new_mons)
+
+
+def copy_ceph_conf_to_masters():
+    """
+    Copy the updated /etc/ceph/ceph.conf file to all Kubernetes master/control-plane
+    nodes to ensure they have the latest Ceph configuration.
+
+    Every node is attempted even when an earlier copy fails.
+
+    Raises:
+        ValueError: If the node list cannot be read or any copy fails.
+    """
+    masters = run_command(
+        "kubectl get nodes --selector='node-role.kubernetes.io/control-plane' "
+        "-o jsonpath='{.items[*].metadata.name}'"
+    ).split()
+
+    failed = []
+    for master in masters:
+        logger.info(f"Copying ceph.conf to {master}...")
+        target = shlex.quote(f"{master}:/etc/ceph/ceph.conf")
+        try:
+            run_command(f"scp -o StrictHostKeyChecking=no /etc/ceph/ceph.conf {target}")
+        except ValueError as err:
+            logger.error(f"{err}")
+            failed.append(master)
+
+    if failed:
+        raise ValueError("Failed to copy ceph.conf to: " + ", ".join(failed))
+
+
+def main():
+    """Wait for Ceph to stabilize, then push the monitor list everywhere."""
+    if not wait_for_ceph_orch():
+        logger.error("Ceph did not stabilize. Exiting.")
+        sys.exit(1)
+
+    try:
+        update_ceph_csi_configmaps()
+        update_customizations()
+        copy_ceph_conf_to_masters()
+    except (ValueError, yaml.YAMLError) as err:
+        logger.error(f"{err}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()