From abbea159c9bb7f8cbecc9f1751b2749660b3b57e Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Mon, 8 Jun 2026 10:55:21 +0000
Subject: [PATCH 01/22] Fix AutoRound multi-GPU DDP group handling

---
 .../ddp/ddp_qwen3_multi_gpu_example.py        | 226 ++++++++++++++++++
 examples/autoround/ddp/launch_multi_gpu.sh    |  78 ++++++
 examples/autoround/ddp/reproduce.md           |  53 ++++
 src/llmcompressor/modifiers/autoround/base.py |  66 ++++-
 src/llmcompressor/utils/dist.py               |  28 ++-
 5 files changed, 433 insertions(+), 18 deletions(-)
 create mode 100644 examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py
 create mode 100755 examples/autoround/ddp/launch_multi_gpu.sh
 create mode 100644 examples/autoround/ddp/reproduce.md

diff --git a/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py b/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py
new file mode 100644
index 0000000000..350f932ee4
--- /dev/null
+++ b/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py
@@ -0,0 +1,226 @@
+"""
+Multi-GPU per group DDP example with AutoRound quantization.
+
+Each rank gets a local GPU group for block-level model parallelism, while
+gradients are synchronized across ranks via all_reduce for identical
+convergence despite split calibration data.
+
+Usage (4 GPUs, 2 GPUs per group):
+  CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 NPROC=2 \\
+      ./launch_multi_gpu.sh ddp_qwen3_multi_gpu_example.py \\
+      --model Qwen/Qwen3-8B \\
+      --scheme W4A16 \\
+      --nsamples 32 --iters 50
+
+For single-GPU DDP (one GPU per rank), set GPUS_PER_GROUP=1 NPROC=4 and run:
+  ./launch_multi_gpu.sh ddp_qwen3_multi_gpu_example.py --gpus-per-group 1 ...
+"""
+
+import argparse
+import os
+import sys
+
+import torch
+import torch.distributed as dist
+from compressed_tensors.offload import dispatch_model, load_offloaded_model
+from loguru import logger
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+
+
+def fix_everything(seed=42):
+    import random
+
+    import numpy as np
+
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
+
+def config_deterministic():
+    torch.use_deterministic_algorithms(True, warn_only=False)
+    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+    fix_everything()
+
+
+def init_dist_multi_gpu(gpus_per_group=2):
+    """Initialize distributed with multiple GPUs per group.
+
+    ``CUDA_VISIBLE_DEVICES`` must already be set to a disjoint subset of
+    GPUs for this rank (the ``launch_multi_gpu.sh`` wrapper handles this).
+    NCCL communication uses the first visible GPU (local cuda:0).
+
+    Example with 4 physical GPUs, 2 per group:
+      - Rank 0 -> local cuda:0, cuda:1 (physical 0, 1)
+      - Rank 1 -> local cuda:0, cuda:1 (physical 2, 3)
+    """
+    rank = int(os.environ.get("RANK", "0"))
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+
+    if world_size < 2:
+        logger.info("Single-process mode, skipping distributed init")
+        return
+
+    # NCCL uses the first visible GPU
+    torch.cuda.set_device(0)
+
+    dist.init_process_group(
+        backend="nccl",
+        init_method="env://",
+        rank=rank,
+        world_size=world_size,
+        device_id=torch.device("cuda:0"),
+    )
+    dist.barrier()
+    actual_count = torch.cuda.device_count()
+    logger.info(
+        f"[Rank {rank}/{world_size}] CUDA_VISIBLE_DEVICES="
+        f"{os.environ.get('CUDA_VISIBLE_DEVICES', 'unset')} "
+        f"(visible GPUs: {actual_count})"
+    )
+    if actual_count < gpus_per_group:
+        logger.warning(
+            f"[Rank {rank}] Expected {gpus_per_group} GPUs but only "
+            f"{actual_count} are visible"
+        )
+
+
+def get_dist_info():
+    if dist.is_available() and dist.is_initialized():
+        return dist.get_rank(), dist.get_world_size()
+    return 0, 1
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="AutoRound Quantization with Multi-GPU per Group DDP"
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="Qwen/Qwen3-8B",
+        help="Model name or path",
+    )
+    parser.add_argument(
+        "--gpus-per-group",
+        type=int,
+        default=2,
+        help="Number of GPUs per rank-local group for block sharding (default: 2)",
+    )
+    parser.add_argument(
+        "--scheme",
+        type=str,
+        default="W4A16",
+        help="Quantization scheme (W4A16, MXFP8, MXFP4, etc.)",
+    )
+    parser.add_argument("--iters", type=int, default=200, help="Number of iterations")
+    parser.add_argument("--nsamples", type=int, default=128, help="Number of samples")
+    parser.add_argument(
+        "--disable_torch_compile",
+        action="store_true",
+        help="Disable torch.compile for model acceleration during quantization",
+    )
+    parser.add_argument(
+        "--deterministic",
+        action="store_true",
+        help="Enable deterministic mode for reproducibility",
+    )
+    args = parser.parse_args()
+
+    if args.deterministic:
+        config_deterministic()
+
+    model_id = args.model
+
+    ###### MULTI-GPU DDP INIT #####
+    init_dist_multi_gpu(gpus_per_group=args.gpus_per_group)
+    # For multi-GPU-per-group AutoRound, keep the base model anchored on the
+    # rank-local primary GPU and let AutoRound auto-dispatch each block within
+    # the local GPU group during tuning. Pre-sharding the loaded model across
+    # the group can leave residual modules and cached activations on different
+    # local GPUs before AutoRound takes over.
+    load_device_map = "auto"
+    if args.gpus_per_group > 1:
+        load_device_map = {"": torch.device("cuda:0")}
+    with load_offloaded_model():
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id, dtype="auto", device_map=load_device_map
+        )
+    ###############################
+
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+    NUM_CALIBRATION_SAMPLES = args.nsamples
+    MAX_SEQUENCE_LENGTH = 2048
+    ITERS = args.iters
+
+    # Get aligned calibration dataset.
+    from auto_round.calib_dataset import get_dataset  # noqa: E402
+
+    # Note: Make sure the model is loaded before importing auto-round related code.
+    from llmcompressor.modifiers.autoround import AutoRoundModifier  # noqa: E402
+
+    ds = get_dataset(
+        tokenizer=tokenizer,
+        seqlen=MAX_SEQUENCE_LENGTH,
+        nsamples=NUM_CALIBRATION_SAMPLES,
+    )
+
+    # Configure the quantization algorithm.
+    recipe = AutoRoundModifier(
+        targets="Linear",
+        scheme=args.scheme,
+        ignore=[
+            "lm_head",
+            "re:.*mlp.gate$",
+        ],
+        iters=ITERS,
+        enable_torch_compile=not args.disable_torch_compile,
+    )
+
+    # Apply algorithms.
+    oneshot(
+        model=model,
+        dataset=ds,
+        recipe=recipe,
+        max_seq_length=MAX_SEQUENCE_LENGTH,
+        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+        shuffle_calibration_samples=False,
+    )
+
+    rank, world_size = get_dist_info()
+    logger.info(f"[Rank {rank}] Quantization completed")
+
+    if dist.is_available() and dist.is_initialized():
+        dist.barrier()
+        dist.destroy_process_group()
+
+    if rank != 0:
+        sys.exit(0)
+
+    if rank == 0:
+        # Confirm generations of the quantized model look sane.
+        logger.info("\n\n")
+        logger.info("========== SAMPLE GENERATION ==============")
+        dispatch_model(model)
+        sample = tokenizer("Hello my name is", return_tensors="pt")
+        sample_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        sample = {key: value.to(sample_device) for key, value in sample.items()}
+        output = model.generate(**sample, max_new_tokens=100)
+        logger.info(tokenizer.decode(output[0]))
+        logger.info("==========================================\n\n")
+
+        logger.info("Saving...")
+        SAVE_DIR = (
+            model_id.rstrip("/").split("/")[-1]
+            + f"-{args.scheme}-AutoRound"
+            + f"-iters{args.iters}-nsamples{args.nsamples}"
+            + "-MultiGPUDDP"
+            + str(world_size)
+        )
+        model.save_pretrained(SAVE_DIR, save_compressed=True)
+        tokenizer.save_pretrained(SAVE_DIR)
+        logger.info(f"Saved to {SAVE_DIR}")
diff --git a/examples/autoround/ddp/launch_multi_gpu.sh b/examples/autoround/ddp/launch_multi_gpu.sh
new file mode 100755
index 0000000000..704d954400
--- /dev/null
+++ b/examples/autoround/ddp/launch_multi_gpu.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+# Launch multi-GPU per group DDP training.
+#
+# Partitions physical GPUs into groups, one group per process/rank.
+# Each rank sees its own set of GPUs via CUDA_VISIBLE_DEVICES.
+#
+# Usage:
+#   GPUS_PER_GROUP=2 ./launch_multi_gpu.sh ddp_qwen3_multi_gpu_example.py --model ... --scheme W4A16
+#
+# This spawns 2 ranks, each with 2 GPUs (4 GPUs total).
+# The Python script no longer needs to override CUDA_VISIBLE_DEVICES.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+GPUS_PER_GROUP=${GPUS_PER_GROUP:-${GPUS_PER_RANK:-2}}
+NPROC=${NPROC:-2}  # number of ranks
+PYTHON=${PYTHON:-python}  # interpreter on PATH; override via the PYTHON env var
+MASTER_PORT=${MASTER_PORT:-29500}
+MASTER_ADDR=${MASTER_ADDR:-localhost}
+NNODES=${NNODES:-1}
+NODE_RANK=${NODE_RANK:-0}
+
+SCRIPT="${1:?Usage: $0 <script.py> [script args...]}"  # path relative to SCRIPT_DIR
+shift
+
+echo "Launching $NPROC ranks, $GPUS_PER_GROUP GPUs each"
+echo "Python: $PYTHON"
+echo "Script: $SCRIPT"
+
+VISIBLE_GPUS_ENV=${CUDA_VISIBLE_DEVICES:-}
+if [[ -n "$VISIBLE_GPUS_ENV" ]]; then
+    IFS=',' read -r -a VISIBLE_GPUS <<< "$VISIBLE_GPUS_ENV"
+else
+    VISIBLE_GPUS=()
+fi
+
+TOTAL_GPUS_NEEDED=$((NPROC * GPUS_PER_GROUP))
+if [[ ${#VISIBLE_GPUS[@]} -gt 0 && ${#VISIBLE_GPUS[@]} -ne $TOTAL_GPUS_NEEDED ]]; then
+    echo "Expected $TOTAL_GPUS_NEEDED GPUs in CUDA_VISIBLE_DEVICES, got ${#VISIBLE_GPUS[@]}: $VISIBLE_GPUS_ENV" >&2
+    exit 1
+fi
+
+pids=()
+for RANK in $(seq 0 $((NPROC - 1))); do
+    if [[ ${#VISIBLE_GPUS[@]} -gt 0 ]]; then
+        GPU_OFFSET=$((RANK * GPUS_PER_GROUP))
+        GPU_LIST=$(IFS=,; echo "${VISIBLE_GPUS[*]:$GPU_OFFSET:$GPUS_PER_GROUP}")
+    else
+        GPU_START=$((NODE_RANK * NPROC * GPUS_PER_GROUP + RANK * GPUS_PER_GROUP))
+        GPU_END=$((GPU_START + GPUS_PER_GROUP - 1))
+        GPU_LIST=$(seq -s, $GPU_START $GPU_END)
+    fi
+    echo "  Rank $RANK -> GPUs $GPU_LIST"
+
+    CUDA_VISIBLE_DEVICES="$GPU_LIST" \
+    AR_DISABLE_DATASET_SUBPROCESS=1 \
+    LOCAL_RANK=0 \
+    RANK=$((NODE_RANK * NPROC + RANK)) \
+    WORLD_SIZE=$((NNODES * NPROC)) \
+    MASTER_ADDR="$MASTER_ADDR" \
+    MASTER_PORT="$MASTER_PORT" \
+    TORCHELASTIC_RUN_ID="multi_gpu_$(date +%s)_$$" \
+    GPUS_PER_GROUP="$GPUS_PER_GROUP" \
+    "$PYTHON" -u "$SCRIPT_DIR/$SCRIPT" "$@" &
+
+    pids+=($!)
+    # Small delay so workers don't race for port binding
+    sleep 0.5
+done
+
+# Wait for all processes
+status=0
+for pid in "${pids[@]}"; do
+    if ! wait "$pid"; then
+        status=1
+    fi
+done
+exit $status
diff --git a/examples/autoround/ddp/reproduce.md b/examples/autoround/ddp/reproduce.md
new file mode 100644
index 0000000000..234e002d48
--- /dev/null
+++ b/examples/autoround/ddp/reproduce.md
@@ -0,0 +1,53 @@
+# Multi-GPU DDP AutoRound Reproduce
+
+## Command
+
+```bash
+cd /home/yiliu7/workspace/llm-compressor
+
+AR_DISABLE_DATASET_SUBPROCESS=1 CUDA_VISIBLE_DEVICES=0,1,6,7 GPUS_PER_GROUP=2 NPROC=2 MASTER_PORT=29501 \
+  bash examples/autoround/ddp/launch_multi_gpu.sh \
+  ddp_qwen3_multi_gpu_example.py \
+  --model /storage/yiliu7/Qwen/Qwen3-8B \
+  --gpus-per-group 2 \
+  --scheme W4A16 \
+  --nsamples 32 --iters 50 \
+  > /tmp/multi_gpu_test.log 2>&1 &
+```
+
+## Monitor
+
+```bash
+# Check progress
+tail -f /tmp/multi_gpu_test.log
+# Check processes
+ps aux | grep ddp_qwen3_multi | grep -v grep
+# Check GPU usage
+nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader
+# Kill
+pkill -f ddp_qwen3_multi_gpu_example
+```
+
+## Current State
+
+- ✅ 4 code changes implemented (launch_multi_gpu.sh, base.py, distributed.py, quantizer.py)
+- ✅ Model loading works with `device_map="auto"` (dispatch 547/547 in <1s)
+- ✅ GPU partitioning works (rank 0 → GPUs 0,1; rank 1 → GPUs 6,7 with the command above)
+- 🔄 **Hang** after "Disabling tokenizer parallelism" warning — inside `get_dataset()`
+  - `AR_DISABLE_DATASET_SUBPROCESS=1` avoids the fork issue
+  - Dataset is cached, not downloading
+  - Both processes at ~100% CPU but no progress
+
+## Key Files
+
+| File | Change |
+|------|--------|
+| `examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py` | NEW — multi-GPU DDP example |
+| `examples/autoround/ddp/launch_multi_gpu.sh` | NEW — bash wrapper for GPU partitioning |
+| `src/llmcompressor/modifiers/autoround/base.py` | `_update_device_map_for_dp` + auto_offload gate use `GPUS_PER_GROUP` |
+| `auto_round/utils/distributed.py` | `setup_ddp_if_needed_` returns `(block, sync_fn)` |
+| `auto_round/algorithms/quantization/sign_round/quantizer.py` | Captures return, calls `sync_gradients()` before `_step()` |
+
+## Venv
+
+Python: `/home/yiliu7/workspace/venvs/ar/bin/python`
diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py
index dd875b7887..7e1738cf98 100644
--- a/src/llmcompressor/modifiers/autoround/base.py
+++ b/src/llmcompressor/modifiers/autoround/base.py
@@ -1,4 +1,5 @@
 from contextlib import contextmanager
+import os
 
 import torch
 import torch.nn as nn
@@ -8,6 +9,7 @@
 from auto_round.utils import check_to_quantized
 from auto_round.wrapper import WrapperWALayer
 from compressed_tensors.offload import get_execution_device, get_offloaded_device
+from compressed_tensors.offload.cache.base import OffloadCache
 from compressed_tensors.offload.module import offload_module, remove_module_offload
 from compressed_tensors.quantization import (
     QuantizationMetadata,
@@ -29,6 +31,15 @@
 __all__ = ["AutoRoundModifier"]
 
 
+def _get_local_gpu_group_size() -> int:
+    return int(
+        os.environ.get(
+            "GPUS_PER_GROUP",
+            os.environ.get("GPUS_PER_RANK", "1"),
+        )
+    )
+
+
 class _LLModelWrapper(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -64,7 +75,7 @@ def suspend_offloading(model: nn.Module):
     """
     offloading_info = dict()
     for name, module in model.named_modules():
-        if not hasattr(module, "weight"):  # skip SiLU or other non-weight layers
+        if not isinstance(module._parameters, OffloadCache):
             continue
         offloading_info[name] = (
             get_execution_device(module),
@@ -75,7 +86,7 @@ def suspend_offloading(model: nn.Module):
     yield
 
     for name, module in model.named_modules():
-        if not hasattr(module, "weight"):  # skip SiLU or other non-weight layers
+        if name not in offloading_info:
             continue
         offload_module(module, *offloading_info[name])
 
@@ -273,6 +284,15 @@ def apply_autoround(self, state, modules):
             "ignore_layers": ",".join(ignore_layers) if ignore_layers else "",
             "disable_opt_rtn": self.disable_opt_rtn,
         }
+        if torch.distributed.is_initialized():
+            gpus_per_group = _get_local_gpu_group_size()
+            if gpus_per_group > 1 and kwargs["enable_torch_compile"]:
+                logger.warning(
+                    "Disabling torch.compile for AutoRound multi-GPU group DDP "
+                    "because compiled block execution does not support "
+                    "cross-device sharding."
+                )
+                kwargs["enable_torch_compile"] = False
 
         llmc_registered_qparams = self._preprocess_qparams(decoding_layer)
         with (
@@ -292,11 +312,23 @@ def apply_autoround(self, state, modules):
             device = first_param.device
             cur_inputs = self._all_module_input[decoding_layer._tmp_name]
             decoding_layer.tuning_device = device
-            # Leave offload for LLMC to handle if `device_ids` is not set
+            # Enable auto_offload when device_ids is explicitly set OR when
+            # GPUS_PER_GROUP > 1 (set by launch_multi_gpu.sh).
+            # This lets AutoRound load-balance the block's submodules
+            # across multiple GPUs within the rank.
             auto_offload = False
-            if self.device_ids is not None:
-                # When device_ids is set, we move decoding layer to CPU first,
-                # then the submodules will be re-dispatched by AutoRound.
+            needs_multi_gpu = (
+                self.device_ids is not None or _get_local_gpu_group_size() > 1
+            )
+            if needs_multi_gpu:
+                # Let AutoRound own placement within the rank-local GPU group.
+                # The incoming block may already be split across local devices,
+                # so anchoring to first_param.device can place residual modules
+                # (e.g. norms) on local cuda:1 while hidden states begin on
+                # local cuda:0, causing cross-device forward failures.
+                device = torch.device("cuda:0")
+                # Move decoding layer to CPU first, then the submodules
+                # will be re-dispatched by AutoRound.
                 decoding_layer.to("cpu")
                 auto_offload = True
 
@@ -352,12 +384,22 @@ def get_unquantized_layer_names(self, wrapped_model: torch.nn.Module) -> list[st
 
     def _update_device_map_for_dp(self, ar_kwargs):
         if torch.distributed.is_initialized():
-            rank = torch.distributed.get_rank()
-            ar_kwargs["device_map"] = (
-                f"{torch.accelerator.current_accelerator().type}:{rank}"
-                if torch.accelerator.is_available()
-                else "cpu"
-            )
+            if self.device_ids is not None:
+                return  # user explicitly set device_ids, respect it
+            gpus_per_group = _get_local_gpu_group_size()
+            if gpus_per_group > 1:
+                # Multi-GPU per group: pass comma-separated local GPU indices
+                # so AutoRound can load-balance submodules across GPUs.
+                # The group size is set by the launch_multi_gpu.sh wrapper.
+                ar_kwargs["device_map"] = ",".join(
+                    str(i) for i in range(gpus_per_group)
+                )
+            else:
+                ar_kwargs["device_map"] = (
+                    f"{torch.accelerator.current_accelerator().type}:0"
+                    if torch.accelerator.is_available()
+                    else "cpu"
+                )
 
     def _unwrapper_quantized_layer(self, model: torch.nn.Module):
         # auto-round will return WrapperWALayer if activation is quantized
diff --git a/src/llmcompressor/utils/dist.py b/src/llmcompressor/utils/dist.py
index c4a04d42eb..a1f75af804 100644
--- a/src/llmcompressor/utils/dist.py
+++ b/src/llmcompressor/utils/dist.py
@@ -1,11 +1,17 @@
 from typing import Hashable, TypeVar
 
-from compressed_tensors.distributed import (
-    greedy_bin_packing as _greedy_bin_packing,
-)
-from compressed_tensors.distributed import (
-    wait_for_comms as _wait_for_comms,
-)
+try:
+    from compressed_tensors.distributed import (
+        greedy_bin_packing as _greedy_bin_packing,
+    )
+    from compressed_tensors.distributed import (
+        wait_for_comms as _wait_for_comms,
+    )
+except ImportError:
+    # compressed_tensors<0.16 does not have the distributed submodule
+    _greedy_bin_packing = None
+    _wait_for_comms = None
+
 from compressed_tensors.utils.helpers import deprecated
 
 T = TypeVar("T", bound=Hashable)
@@ -29,6 +35,11 @@ def greedy_bin_packing(*args, **kwargs) -> tuple[list[T], list[list[T]], dict[T,
           the list of items assigned to that bin.
         - item_to_bin: mapping from each item to its assigned bin index.
     """
+    if _greedy_bin_packing is None:
+        raise ImportError(
+            "greedy_bin_packing requires compressed-tensors>=0.16 "
+            "(distributed submodule not found)"
+        )
     return _greedy_bin_packing(*args, **kwargs)
 
 
@@ -44,4 +55,9 @@ def wait_for_comms(*args, **kwargs) -> None:
         ``async_op=True``). The list is cleared after all operations
         have completed.
     """
+    if _wait_for_comms is None:
+        raise ImportError(
+            "wait_for_comms requires compressed-tensors>=0.16 "
+            "(distributed submodule not found)"
+        )
     return _wait_for_comms(*args, **kwargs)

From ae806055505c002a4c93bca3403e8d9d7d138234 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Mon, 15 Jun 2026 12:54:25 +0000
Subject: [PATCH 02/22] Fix AutoRound DDP hang: disable onloading during quant
 param init
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When DDP is initialized before model loading,
OffloadCache.cls_from_device selects distributed cache variants.
Each register_parameter inside initialize_module_for_quantization
triggers offload() which does dist.broadcast + barrier. For large
MoE models (e.g. Qwen3-235B) with 100K+ Linear layers x 6 quant
params, this means 600K+ collective ops — effectively hanging.

Fix: wrap initialize_module_for_quantization in disable_onloading()
so new params are stored directly without triggering distributed offload.
Verified on Qwen3-235B-A22B: apply_quantization_config dropped from
hanging to ~4.3 min.
---
 .../ddp/ddp_qwen3_multi_gpu_example.py        | 479 +++++++++++++++++-
 examples/autoround/ddp/launch_multi_gpu.sh    |   3 +-
 2 files changed, 458 insertions(+), 24 deletions(-)

diff --git a/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py b/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py
index 350f932ee4..3b15e6e5d7 100644
--- a/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py
+++ b/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py
@@ -17,18 +17,26 @@
 """
 
 import argparse
+import importlib
 import os
 import sys
+import time
+from pathlib import Path
 
+import psutil
 import torch
 import torch.distributed as dist
-from compressed_tensors.offload import dispatch_model, load_offloaded_model
+from compressed_tensors.offload import dispatch_model, from_accelerate, load_offloaded_model
 from loguru import logger
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
 
 
+class StopAfterBlocks(RuntimeError):
+    pass
+
+
 def fix_everything(seed=42):
     import random
 
@@ -94,6 +102,394 @@ def get_dist_info():
     return 0, 1
 
 
+def _module_has_direct_tensors(module: torch.nn.Module) -> bool:
+    return any(t is not None for t in module._parameters.values()) or any(
+        t is not None for t in module._buffers.values()
+    )
+
+
+def _module_has_meta_tensors(module: torch.nn.Module) -> bool:
+    return any(
+        t is not None and t.device.type == "meta"
+        for t in module._parameters.values()
+    ) or any(t is not None and t.device.type == "meta" for t in module._buffers.values())
+
+
+def patch_ct_dispatch_for_sparse_offload():
+    """Avoid wrapping modules that do not need compressed-tensors offload hooks."""
+    dispatch_mod = importlib.import_module("compressed_tensors.offload.dispatch")
+    fa_mod = importlib.import_module("compressed_tensors.offload.convert.from_accelerate")
+
+    if getattr(dispatch_mod.dispatch_with_map, "_llmc_sparse_patch", False):
+        return
+
+    offload_module = dispatch_mod.offload_module
+    tqdm = dispatch_mod.tqdm
+
+    def optimized_dispatch_with_map(
+        model: torch.nn.Module,
+        device_map,
+        offload_dir: str | None = None,
+        show_progress: bool = True,
+    ):
+        filtered = []
+        skipped_noop = 0
+        skipped_empty = 0
+        skipped_cpu_cpu = 0
+        kept_meta_materialization = 0
+
+        for name, (onload_device, offload_device) in device_map.items():
+            if offload_device is None:
+                skipped_noop += 1
+                continue
+
+            module = model.get_submodule(name)
+            if not _module_has_direct_tensors(module):
+                skipped_empty += 1
+                continue
+
+            if (
+                str(onload_device) == "cpu"
+                and str(offload_device) == "cpu"
+                and not _module_has_meta_tensors(module)
+            ):
+                skipped_cpu_cpu += 1
+                continue
+
+            if str(onload_device) == "cpu" and str(offload_device) == "cpu":
+                kept_meta_materialization += 1
+
+            filtered.append((name, onload_device, offload_device))
+
+        logger.info(
+            "Compressed-tensors dispatch filtered {} -> {} modules "
+            "(noop={}, empty={}, cpu_to_cpu_skipped={}, cpu_to_cpu_meta_kept={})",
+            len(device_map),
+            len(filtered),
+            skipped_noop,
+            skipped_empty,
+            skipped_cpu_cpu,
+            kept_meta_materialization,
+        )
+
+        for name, onload_device, offload_device in tqdm(
+            filtered,
+            desc="Dispatching model",
+            disable=(not show_progress),
+        ):
+            module = model.get_submodule(name)
+            if offload_device == "disk":
+                offload_module(
+                    module,
+                    onload_device,
+                    offload_device,
+                    offload_dir=offload_dir,
+                )
+            else:
+                offload_module(module, onload_device, offload_device)
+
+    optimized_dispatch_with_map._llmc_sparse_patch = True
+    dispatch_mod.dispatch_with_map = optimized_dispatch_with_map
+    fa_mod.dispatch_with_map = optimized_dispatch_with_map
+
+
+def _rank_offload_folder(base_folder: str | None) -> str | None:
+    if not base_folder:
+        return None
+
+    rank, _ = get_dist_info()
+    rank_folder = Path(base_folder) / f"rank{rank}"
+    rank_folder.mkdir(parents=True, exist_ok=True)
+    return str(rank_folder)
+
+
+def _independent_cpu_max_memory(extra_cpu_mem: int = int(5e9)) -> dict[str, int]:
+    _, world_size = get_dist_info()
+    per_rank_available = psutil.virtual_memory().available // max(world_size, 1)
+    return {"cpu": max(per_rank_available - extra_cpu_mem, int(8e9))}
+
+
+def load_model_with_local_offload(model_id: str, offload_folder: str | None):
+    """Load model on each rank independently, then convert accelerate offload locally."""
+    load_kwargs = {
+        "dtype": "auto",
+        "device_map": "auto",
+        "max_memory": _independent_cpu_max_memory(),
+    }
+    rank_offload_folder = _rank_offload_folder(offload_folder)
+    if rank_offload_folder:
+        load_kwargs["offload_folder"] = rank_offload_folder
+
+    logger.info(
+        "[Rank {}] Loading model independently with max_memory={} offload_folder={}",
+        get_dist_info()[0],
+        load_kwargs["max_memory"],
+        rank_offload_folder,
+    )
+    model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
+    if hasattr(model, "hf_device_map"):
+        from_accelerate(model)
+    return model
+
+
+def patch_disable_onloading_for_quant_init():
+    """Avoid expensive dist.broadcast + barrier for every new quant param.
+
+    When DDP is initialized before model loading,
+    ``OffloadCache.cls_from_device`` selects distributed cache variants
+    (DistributedCPUCache / DistributedDiskCache).  Each call to
+    ``register_parameter`` inside ``initialize_module_for_quantization``
+    triggers ``offload()``, which does a collective broadcast + barrier.
+    For large MoE models (e.g. Qwen3-235B with 100K+ Linear layers × 6
+    quant params), this means hundreds of thousands of round-trips —
+    effectively hanging the process.
+
+    Wrapping the call in ``disable_onloading()`` stores new parameters
+    directly in ``offloaded_values`` without invoking the distributed
+    offload. Arguments are forwarded unchanged; patching is idempotent.
+    """
+    from compressed_tensors.offload import (  # noqa: F811
+        disable_onloading,
+    )
+
+    lifecycle_init_mod = importlib.import_module(
+        "compressed_tensors.quantization.lifecycle.initialize"
+    )
+    original_fn = lifecycle_init_mod.initialize_module_for_quantization
+
+    if getattr(original_fn, "_llmc_no_dist_offload_patch", False):
+        return  # already patched; wrapping again would nest context managers
+
+    def patched_initialize_module_for_quantization(*args, **kwargs):
+        with disable_onloading():  # pass-through: no assumed upstream signature
+            return original_fn(*args, **kwargs)
+
+    patched_initialize_module_for_quantization._llmc_no_dist_offload_patch = True
+    lifecycle_init_mod.initialize_module_for_quantization = (
+        patched_initialize_module_for_quantization
+    )
+
+
+
+def patch_autoround_stop_after_blocks(max_blocks: int):
+    """Raise after N decoding blocks finish so large-model smoke tests can stop cleanly."""
+    autoround_mod = importlib.import_module("llmcompressor.modifiers.autoround.base")
+    modifier_cls = autoround_mod.AutoRoundModifier
+
+    if getattr(modifier_cls.apply_autoround, "_llmc_stop_after_patch", False):
+        return
+
+    original_apply_autoround = modifier_cls.apply_autoround
+
+    def wrapped_apply_autoround(self, state, modules):
+        modules = modules or []
+        if not any(self._is_decoding_layer(module) for module in modules):
+            return original_apply_autoround(self, state, modules)
+
+        result = original_apply_autoround(self, state, modules)
+        completed = getattr(self, "_llmc_completed_blocks", 0) + 1
+        self._llmc_completed_blocks = completed
+        logger.info(
+            "[Rank {}] Completed AutoRound block {}/{}",
+            get_dist_info()[0],
+            completed,
+            max_blocks,
+        )
+        if completed >= max_blocks:
+            raise StopAfterBlocks(f"Stopped after {completed} blocks")
+        return result
+
+    wrapped_apply_autoround._llmc_stop_after_patch = True
+    modifier_cls.apply_autoround = wrapped_apply_autoround
+
+
+def patch_llmc_timing_logs():
+    """Add coarse timing logs around the expensive LLMC startup stages."""
+    recipe_mod = importlib.import_module("llmcompressor.recipe.recipe")
+    lifecycle_mod = importlib.import_module("llmcompressor.core.lifecycle")
+    quant_mixin_mod = importlib.import_module(
+        "llmcompressor.modifiers.quantization.quantization.mixin"
+    )
+    quantization_base_mod = importlib.import_module(
+        "compressed_tensors.quantization"
+    )
+    module_utils_mod = importlib.import_module("compressed_tensors.utils")
+    group_validation_mod = importlib.import_module(
+        "llmcompressor.modifiers.quantization.group_size_validation"
+    )
+    seq_helpers_mod = importlib.import_module("llmcompressor.pipelines.sequential.helpers")
+    seq_pipeline_mod = importlib.import_module("llmcompressor.pipelines.sequential.pipeline")
+    cache_mod = importlib.import_module("llmcompressor.pipelines.cache")
+    autoround_mod = importlib.import_module("llmcompressor.modifiers.autoround.base")
+    core_mod = importlib.import_module("llmcompressor.core")
+
+    recipe_cls = recipe_mod.Recipe
+    lifecycle_cls = lifecycle_mod.CompressionLifecycle
+    quant_mixin_cls = quant_mixin_mod.QuantizationMixin
+    cache_cls = cache_mod.IntermediatesCache
+    autoround_cls = autoround_mod.AutoRoundModifier
+    seq_pipeline_cls = seq_pipeline_mod.SequentialPipeline
+    lifecycle_callbacks = core_mod.LifecycleCallbacks
+
+    if getattr(recipe_cls.from_modifiers, "_llmc_timing_patch", False):
+        return
+
+    original_from_modifiers = recipe_cls.from_modifiers.__func__
+    original_lifecycle_initialize = lifecycle_cls.initialize
+    original_initialize_quantization = quant_mixin_cls.initialize_quantization
+    original_start_calibration = autoround_cls.start_calibration
+    original_trace_subgraphs = seq_helpers_mod.trace_subgraphs
+    original_from_dataloader = cache_cls.from_dataloader.__func__
+    original_apply_autoround = autoround_cls.apply_autoround
+    original_seq_call = seq_pipeline_cls.__call__
+    original_calib_epoch_start = lifecycle_callbacks.calibration_epoch_start
+    original_match_named_modules = module_utils_mod.match_named_modules
+    original_apply_quantization_config = quantization_base_mod.apply_quantization_config
+    original_validate_group_size_divisibility = (
+        group_validation_mod.validate_group_size_divisibility
+    )
+
+    def _timed(label, fn, *args, **kwargs):
+        start = time.perf_counter()
+        logger.info("[Rank {}] {} started", get_dist_info()[0], label)
+        try:
+            return fn(*args, **kwargs)
+        finally:
+            logger.info(
+                "[Rank {}] {} finished in {:.2f}s",
+                get_dist_info()[0],
+                label,
+                time.perf_counter() - start,
+            )
+
+    @classmethod
+    def timed_from_modifiers(cls, modifiers, modifier_group_name=None):
+        return _timed(
+            "Recipe.from_modifiers",
+            original_from_modifiers,
+            cls,
+            modifiers,
+            modifier_group_name,
+        )
+
+    def timed_lifecycle_initialize(self, *args, **kwargs):
+        return _timed(
+            "CompressionLifecycle.initialize",
+            original_lifecycle_initialize,
+            self,
+            *args,
+            **kwargs,
+        )
+
+    def timed_initialize_quantization(self, model):
+        return _timed(
+            "QuantizationMixin.initialize_quantization",
+            original_initialize_quantization,
+            self,
+            model,
+        )
+
+    def timed_start_calibration(self, model):
+        return _timed(
+            "AutoRoundModifier.start_calibration",
+            original_start_calibration,
+            self,
+            model,
+        )
+
+    def timed_trace_subgraphs(*args, **kwargs):
+        return _timed("trace_subgraphs", original_trace_subgraphs, *args, **kwargs)
+
+    @classmethod
+    def timed_from_dataloader(cls, *args, **kwargs):
+        return _timed(
+            "IntermediatesCache.from_dataloader",
+            original_from_dataloader,
+            cls,
+            *args,
+            **kwargs,
+        )
+
+    def timed_apply_autoround(self, state, modules):
+        modules = modules or []
+        decoding_layers = [m for m in modules if self._is_decoding_layer(m)]
+        if not decoding_layers:
+            return original_apply_autoround(self, state, modules)
+        layer_name = getattr(decoding_layers[0], "_tmp_name", decoding_layers[0].__class__.__name__)
+        return _timed(
+            f"AutoRoundModifier.apply_autoround({layer_name})",
+            original_apply_autoround,
+            self,
+            state,
+            modules,
+        )
+
+    def timed_seq_call(model, dataloader, dataset_args):
+        pipeline_start = time.perf_counter()
+        logger.info("[Rank {}] SequentialPipeline.__call__ started", get_dist_info()[0])
+        try:
+            logger.info("[Rank {}] SequentialPipeline pre-next(iter(dataloader))", get_dist_info()[0])
+            iter_start = time.perf_counter()
+            sample_input = next(iter(dataloader))
+            logger.info(
+                "[Rank {}] next(iter(dataloader)) finished in {:.2f}s",
+                get_dist_info()[0],
+                time.perf_counter() - iter_start,
+            )
+            del sample_input
+            return original_seq_call(model, dataloader, dataset_args)
+        finally:
+            logger.info(
+                "[Rank {}] SequentialPipeline.__call__ finished in {:.2f}s",
+                get_dist_info()[0],
+                time.perf_counter() - pipeline_start,
+            )
+
+    def timed_calib_epoch_start(*args, **kwargs):
+        return _timed(
+            "LifecycleCallbacks.calibration_epoch_start",
+            original_calib_epoch_start,
+            *args,
+            **kwargs,
+        )
+
+    def timed_match_named_modules(*args, **kwargs):
+        return _timed("match_named_modules", original_match_named_modules, *args, **kwargs)
+
+    def timed_apply_quantization_config(*args, **kwargs):
+        return _timed(
+            "apply_quantization_config",
+            original_apply_quantization_config,
+            *args,
+            **kwargs,
+        )
+
+    def timed_validate_group_size_divisibility(*args, **kwargs):
+        return _timed(
+            "validate_group_size_divisibility",
+            original_validate_group_size_divisibility,
+            *args,
+            **kwargs,
+        )
+
+    timed_from_modifiers._llmc_timing_patch = True
+    recipe_cls.from_modifiers = timed_from_modifiers
+    lifecycle_cls.initialize = timed_lifecycle_initialize
+    quant_mixin_cls.initialize_quantization = timed_initialize_quantization
+    autoround_cls.start_calibration = timed_start_calibration
+    module_utils_mod.match_named_modules = timed_match_named_modules
+    quant_mixin_mod.match_named_modules = timed_match_named_modules
+    quantization_base_mod.apply_quantization_config = timed_apply_quantization_config
+    quant_mixin_mod.apply_quantization_config = timed_apply_quantization_config
+    group_validation_mod.validate_group_size_divisibility = timed_validate_group_size_divisibility
+    quant_mixin_mod.validate_group_size_divisibility = timed_validate_group_size_divisibility
+    seq_helpers_mod.trace_subgraphs = timed_trace_subgraphs
+    seq_pipeline_mod.trace_subgraphs = timed_trace_subgraphs
+    cache_cls.from_dataloader = timed_from_dataloader
+    autoround_cls.apply_autoround = timed_apply_autoround
+    seq_pipeline_cls.__call__ = staticmethod(timed_seq_call)
+    lifecycle_callbacks.calibration_epoch_start = timed_calib_epoch_start
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="AutoRound Quantization with Multi-GPU per Group DDP"
@@ -116,7 +512,7 @@ def get_dist_info():
         default="W4A16",
         help="Quantization scheme (W4A16, MXFP8, MXFP4, etc.)",
     )
-    parser.add_argument("--iters", type=int, default=200, help="Number of iterations")
+    parser.add_argument("--iters", type=int, default=20, help="Number of iterations")
     parser.add_argument("--nsamples", type=int, default=128, help="Number of samples")
     parser.add_argument(
         "--disable_torch_compile",
@@ -128,6 +524,18 @@ def get_dist_info():
         action="store_true",
         help="Enable deterministic mode for reproducibility",
     )
+    parser.add_argument(
+        "--offload-folder",
+        type=str,
+        default=None,
+        help="Optional folder for disk offload while loading very large models",
+    )
+    parser.add_argument(
+        "--max-blocks",
+        type=int,
+        default=None,
+        help="Optional number of decoder blocks to quantize before exiting",
+    )
     args = parser.parse_args()
 
     if args.deterministic:
@@ -137,18 +545,32 @@ def get_dist_info():
 
     ###### MULTI-GPU DDP INIT #####
     init_dist_multi_gpu(gpus_per_group=args.gpus_per_group)
-    # For multi-GPU-per-group AutoRound, keep the base model anchored on the
-    # rank-local primary GPU and let AutoRound auto-dispatch each block within
-    # the local GPU group during tuning. Pre-sharding the loaded model across
-    # the group can leave residual modules and cached activations on different
-    # local GPUs before AutoRound takes over.
-    load_device_map = "auto"
-    if args.gpus_per_group > 1:
-        load_device_map = {"": torch.device("cuda:0")}
-    with load_offloaded_model():
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id, dtype="auto", device_map=load_device_map
-        )
+    patch_ct_dispatch_for_sparse_offload()
+    patch_llmc_timing_logs()
+    patch_disable_onloading_for_quant_init()
+    if args.max_blocks is not None:
+        patch_autoround_stop_after_blocks(args.max_blocks)
+    # Load onto CPU first and spill to disk if needed. AutoRound will then
+    # onload and shard each block onto the rank-local GPU group during tuning.
+    load_start = time.perf_counter()
+    rank, world_size = get_dist_info()
+    if world_size > 1:
+        model = load_model_with_local_offload(model_id, args.offload_folder)
+    else:
+        load_kwargs = {
+            "dtype": "auto",
+            "device_map": "auto_offload",
+        }
+        rank_offload_folder = _rank_offload_folder(args.offload_folder)
+        if rank_offload_folder:
+            load_kwargs["offload_folder"] = rank_offload_folder
+        with load_offloaded_model():
+            model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
+    logger.info(
+        "[Rank {}] Model load + offload conversion finished in {:.2f}s",
+        rank,
+        time.perf_counter() - load_start,
+    )
     ###############################
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -182,17 +604,25 @@ def get_dist_info():
     )
 
     # Apply algorithms.
-    oneshot(
-        model=model,
-        dataset=ds,
-        recipe=recipe,
-        max_seq_length=MAX_SEQUENCE_LENGTH,
-        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-        shuffle_calibration_samples=False,
-    )
+    stopped_early = False
+    try:
+        oneshot(
+            model=model,
+            dataset=ds,
+            recipe=recipe,
+            max_seq_length=MAX_SEQUENCE_LENGTH,
+            num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+            shuffle_calibration_samples=False,
+        )
+    except StopAfterBlocks as exc:
+        stopped_early = True
+        logger.info("[Rank {}] {}", get_dist_info()[0], str(exc))
 
     rank, world_size = get_dist_info()
-    logger.info(f"[Rank {rank}] Quantization completed")
+    if stopped_early:
+        logger.info(f"[Rank {rank}] Partial quantization completed")
+    else:
+        logger.info(f"[Rank {rank}] Quantization completed")
 
     if dist.is_available() and dist.is_initialized():
         dist.barrier()
@@ -201,6 +631,9 @@ def get_dist_info():
     if rank != 0:
         sys.exit(0)
 
+    if stopped_early:
+        sys.exit(0)
+
     if rank == 0:
         # Confirm generations of the quantized model look sane.
         logger.info("\n\n")
diff --git a/examples/autoround/ddp/launch_multi_gpu.sh b/examples/autoround/ddp/launch_multi_gpu.sh
index 704d954400..14e40c9a78 100755
--- a/examples/autoround/ddp/launch_multi_gpu.sh
+++ b/examples/autoround/ddp/launch_multi_gpu.sh
@@ -6,6 +6,7 @@
 #
 # Usage:
 #   GPUS_PER_GROUP=2 ./launch_multi_gpu.sh ddp_qwen3_multi_gpu_example.py --model ... --scheme W4A16
+#   GPUS_PER_GROUP=2 ./launch_multi_gpu.sh ddp_qwen3_multi_gpu_example.py --model /storage/yiliu7/Qwen/Qwen3-30B-A3B-Instruct-2507  --scheme W4A16
 #
 # This spawns 2 ranks, each with 2 GPUs (4 GPUs total).
 # The Python script no longer needs to override CUDA_VISIBLE_DEVICES.
@@ -15,7 +16,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 GPUS_PER_GROUP=${GPUS_PER_GROUP:-${GPUS_PER_RANK:-2}}
 NPROC=${NPROC:-2}  # number of ranks
 PYTHON=${PYTHON:-/home/yiliu7/workspace/venvs/ar/bin/python}
-MASTER_PORT=${MASTER_PORT:-29500}
+MASTER_PORT=${MASTER_PORT:-29600}
 MASTER_ADDR=${MASTER_ADDR:-localhost}
 NNODES=${NNODES:-1}
 NODE_RANK=${NODE_RANK:-0}

From a28e7fcbc310e8d5d32de25f65db61a3e3574402 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Sat, 20 Jun 2026 14:24:20 +0000
Subject: [PATCH 03/22] update

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 examples/autoround/ddp/DDP_FIXES.md           | 237 ++++++++++++++++++
 examples/autoround/ddp/ddp_autoround.py       | 187 ++++++++++++++
 .../ddp/ddp_qwen3_multi_gpu_example.py        |   8 +-
 examples/autoround/ddp/reproduce.md           |  88 +++++--
 src/llmcompressor/modifiers/autoround/base.py |  10 +-
 .../pipelines/sequential/helpers.py           |   4 -
 src/llmcompressor/utils/dev.py                |   2 +-
 7 files changed, 501 insertions(+), 35 deletions(-)
 create mode 100644 examples/autoround/ddp/DDP_FIXES.md
 create mode 100644 examples/autoround/ddp/ddp_autoround.py

diff --git a/examples/autoround/ddp/DDP_FIXES.md b/examples/autoround/ddp/DDP_FIXES.md
new file mode 100644
index 0000000000..50e9e7352d
--- /dev/null
+++ b/examples/autoround/ddp/DDP_FIXES.md
@@ -0,0 +1,237 @@
+# DDP Multi-GPU AutoRound Fixes for Large MoE Models
+
+## Problem
+
+Running AutoRound quantization with DDP on large MoE models (e.g., Qwen3-235B) would hang or take hours due to `DistributedCPUCache` performing a `dist.broadcast_object_list()` + `dist.barrier()` **per parameter** during offload operations (~218ms × 45K params = ~163 minutes).
+
+## Root Cause
+
+When `dist.is_initialized()`, `OffloadCache.cls_from_device("cpu")` returns `DistributedCPUCache` instead of `CPUCache`. This cache broadcasts every tensor to all ranks — unnecessary when each rank loads the model independently via safetensors mmap.
+
+The bottleneck hits in two places:
+1. `from_accelerate()` → `dispatch_with_map()` 
+2. `set_onload_device()` in SequentialPipeline
+
+## Fixes Applied
+
+### Fix 1: `src/llmcompressor/utils/dev.py` — `get_main_device()` 
+
+**Bug**: Used `rank` as the CUDA device index, which is wrong when `GPUS_PER_GROUP > 1`.  
+**Fix**: Use `torch.accelerator.current_device_index()` which respects `torch.cuda.set_device()`.
+
+```python
+# Before (line 140):
+return torch.device(accel_type, rank)
+
+# After:
+return torch.device(accel_type, torch.accelerator.current_device_index())
+```
+
+### Fix 2: `src/llmcompressor/modifiers/autoround/base.py` — anchor device in `apply_autoround`
+
+**Bug**: Hardcoded `device = torch.device("cuda:0")` when `needs_multi_gpu` is true. Rank 1 with GPUs [2,3] would try to anchor on cuda:0 instead of cuda:2.  
+**Fix**: Use `get_main_device()` which returns the correct per-rank device.
+
+```python
+# Before (line ~329):
+device = torch.device("cuda:0")
+
+# After:
+from llmcompressor.utils.dev import get_main_device
+device = get_main_device()
+```
+
+### Fix 3: `src/llmcompressor/modifiers/autoround/base.py` — GPU partition in `_update_device_map_for_dp`
+
+**Bug**: Generated `"0,1"` for all ranks instead of per-rank GPU partitions.  
+**Fix**: Offset by `local_rank * gpus_per_group`.
+
+```python
+# Before:
+ar_kwargs["device_map"] = ",".join(str(i) for i in range(gpus_per_group))
+
+# After:
+local_rank = torch.distributed.get_rank()
+start_gpu = local_rank * gpus_per_group
+ar_kwargs["device_map"] = ",".join(str(start_gpu + i) for i in range(gpus_per_group))
+```
+
+### Patch 4 (monkey-patch, needs upstream in compressed-tensors): Force local cache
+
+Patches `OffloadCache.cls_from_device` to return `CPUCache`/`DeviceCache` instead of `DistributedCPUCache`/`DistributedDeviceCache`. This is correct when each rank loads the model independently.
+
+See `patch_force_local_cache()` in `examples/autoround/ddp/ddp_autoround.py`.
+
+### Patch 5 (monkey-patch, needs upstream in compressed-tensors): Disable onloading during quant init
+
+Wraps `initialize_module_for_quantization` with `disable_onloading()` to avoid per-parameter broadcast+barrier when new quantization parameters are created.
+
+See `patch_disable_onloading_for_quant_init()` in `examples/autoround/ddp/ddp_autoround.py`.
+
+## Reproduce
+
+### Prerequisites
+
+```bash
+# Environment
+source /home/yiliu7/workspace/venvs/llmc/bin/activate
+
+# Working directory
+cd /home/yiliu7/workspace/llm-compressor
+```
+
+### Run on Qwen3-8B (quick verification, ~2 minutes)
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
+    --nproc_per_node=2 \
+    examples/autoround/ddp/ddp_autoround.py \
+    --model /storage/yiliu7/Qwen/Qwen3-8B \
+    --iters 5 --nsamples 32
+```
+
+### Run on Qwen3-235B (full test, ~47 minutes)
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
+    --nproc_per_node=2 \
+    examples/autoround/ddp/ddp_autoround.py \
+    --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ \
+    --iters 20 --nsamples 32
+```
+
+### Expected behavior
+
+- Both ranks process all 94 layers in lockstep (~30s/layer on 235B)
+- All 4 GPUs show active memory usage (~56-63 GB each)
+- Each rank uses 2 GPUs: rank 0 → [0,1], rank 1 → [2,3]
+- Small NCCL idle contexts (~614 MB) appear on non-owned GPUs — this is normal
+
+### Monitor progress
+
+```bash
+# GPU utilization
+nvidia-smi --query-gpu=index,utilization.gpu,memory.used --format=csv,noheader
+
+# Layer progress (from log)
+grep "Applying AutoRound" /path/to/log | tail -6
+```
+
+## Known Issues
+
+1. **8 GPU process entries in nvidia-smi**: Each of the 2 torchrun processes creates a small NCCL context (~614 MB) on all visible GPUs. Only 4 entries are doing real work (the ~56-63 GB ones). This is unavoidable without a pre-launch wrapper that restricts `CUDA_VISIBLE_DEVICES` before Python starts.
+
+2. **OOM on layer ~11 (235B)**: With 20 iters and the full 235B model, GPU memory may be tight. Reduce `--iters` or `--nsamples` if OOM occurs.
+
+## Upstream Plan
+
+### PR 1: llm-compressor — Multi-GPU DDP device fixes
+
+**Scope**: Fixes 1–3 above. Clean code changes, no monkey-patches.
+
+**Changes**:
+- `src/llmcompressor/utils/dev.py`: `get_main_device()` uses `current_device_index()` instead of `rank`
+- `src/llmcompressor/modifiers/autoround/base.py`: 
+  - `apply_autoround` anchor device uses `get_main_device()` instead of hardcoded `cuda:0`
+  - `_update_device_map_for_dp` offsets GPU indices by `local_rank * gpus_per_group`
+
+**Testing**: Run DDP AutoRound on Qwen3-8B with 4 GPUs (2 per rank). Verify all GPUs participate and no device mismatch errors.
+
+---
+
+### PR 2: compressed-tensors — Skip distributed cache when ranks have local parameters
+
+**Problem**: `OffloadCache.cls_from_device("cpu")` unconditionally returns `DistributedCPUCache` when `dist.is_initialized()`. This causes O(n_params) broadcast+barrier ops (~218ms each) even when all ranks already have parameters locally (via independent `from_pretrained` loading with safetensors mmap).
+
+**Proposed fix**: Add a `distributed` parameter to `cls_from_device` with auto-detection:
+
+```python
+# compressed_tensors/offload/cache/base.py
+
+@classmethod
+def cls_from_device(cls, device=None, distributed=None):
+    """
+    Args:
+        distributed: If None (default), auto-detect based on whether
+            dist is initialized. If False, always return local cache.
+            If True, always return distributed cache.
+    """
+    if distributed is None:
+        distributed = (
+            torch.distributed.is_initialized()
+            and torch.distributed.get_world_size() > 1
+        )
+    
+    device_type = torch.device(device).type if device != "disk" else "disk"
+    if device_type == "cpu":
+        return DistributedCPUCache if distributed else CPUCache
+    elif is_accelerator_type(device_type):
+        return DistributedDeviceCache if distributed else DeviceCache
+    elif device_type == "disk":
+        return DiskCache
+    ...
+```
+
+**Callers that should pass `distributed=False`**:
+- `set_onload_device()` when the model was loaded independently on each rank (no meta tensors)
+- Any path where the caller knows parameters are already materialized locally
+
+**Alternative approach** — context manager:
+
+```python
+# compressed_tensors/offload/cache/base.py
+
+_force_local_cache = threading.local()
+
+@contextlib.contextmanager
+def force_local_cache():
+    """Context under which cls_from_device always returns non-distributed caches."""
+    _force_local_cache.active = True
+    try:
+        yield
+    finally:
+        _force_local_cache.active = False
+
+@classmethod
+def cls_from_device(cls, device=None):
+    distributed = (
+        torch.distributed.is_initialized()
+        and torch.distributed.get_world_size() > 1
+        and not getattr(_force_local_cache, 'active', False)
+    )
+    ...
+```
+
+This lets llm-compressor wrap its pipeline with `force_local_cache()` without modifying every callsite.
+
+**Testing**: 
+- Existing tests pass (distributed cache still used by default)
+- DDP test with independent model loading uses local cache, no broadcast overhead
+
+---
+
+### PR 3: compressed-tensors — Wrap quant init with `disable_onloading()`
+
+**Problem**: `initialize_module_for_quantization` creates new parameters (scale, zero_point, etc.) which immediately trigger `DistributedCPUCache.offload()` → broadcast+barrier. These parameters are created identically on every rank, so broadcasting is always redundant.
+
+**Proposed fix**: Wrap the function body with `disable_onloading()`:
+
+```python
+# compressed_tensors/quantization/lifecycle/initialize.py
+
+def initialize_module_for_quantization(module, scheme=None, force_zero_point=True):
+    with disable_onloading():
+        # ... existing implementation ...
+```
+
+**Rationale**: New quant parameters are initialized from the quantization scheme (not from model weights), so they're identical across ranks by construction. There's no information to broadcast.
+
+**Testing**: DDP quantization should show no broadcast calls during `initialize_module_for_quantization`. Single-process behavior unchanged.
+
+---
+
+### Priority
+
+1. **PR 3** (highest): Universal fix, always correct, simple one-liner
+2. **PR 2** (high): Eliminates the main bottleneck for independent-loading DDP
+3. **PR 1** (medium): Required for multi-GPU-per-rank scenarios (GPUS_PER_GROUP > 1)
diff --git a/examples/autoround/ddp/ddp_autoround.py b/examples/autoround/ddp/ddp_autoround.py
new file mode 100644
index 0000000000..200f456f69
--- /dev/null
+++ b/examples/autoround/ddp/ddp_autoround.py
@@ -0,0 +1,187 @@
+"""
+DDP AutoRound quantization example for large MoE models.
+
+Runs 2 ranks, each using GPUS_PER_GROUP GPUs. All ranks load the model
+independently on CPU (safetensors mmap shares physical pages at OS level).
+
+Run with:
+  CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
+    --nproc_per_node=2 ddp_autoround.py \
+    --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ 2>&1 | tee test_ddp_autoround.log
+  CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
+    --nproc_per_node=2 ddp_autoround.py \
+    --model /storage/yiliu7/Qwen/Qwen3-30B-A3B-Instruct-2507/ 2>&1 | tee test_ddp_autoround.log
+  CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
+    --nproc_per_node=2 ddp_autoround.py \
+    --model /path/to/model
+"""
+
+import argparse
+import importlib
+import os
+import time
+
+import torch
+import torch.distributed as dist
+from loguru import logger
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+
+
+def patch_disable_onloading_for_quant_init():
+    """Avoid dist.broadcast + barrier for every new quant parameter.
+
+    compressed-tensors' initialize_module_for_quantization creates new
+    parameters which trigger DistributedCPUCache's per-param broadcast.
+    Wrapping with disable_onloading() prevents this.
+    """
+    from compressed_tensors.offload import disable_onloading
+
+    lifecycle_init_mod = importlib.import_module(
+        "compressed_tensors.quantization.lifecycle.initialize"
+    )
+    original_fn = lifecycle_init_mod.initialize_module_for_quantization
+    if getattr(original_fn, "_patched", False):
+        return
+
+    def patched(module, scheme=None, force_zero_point=True):
+        with disable_onloading():
+            return original_fn(module, scheme=scheme, force_zero_point=force_zero_point)
+
+    patched._patched = True
+    lifecycle_init_mod.initialize_module_for_quantization = patched
+
+
+def patch_force_local_cache():
+    """Force OffloadCache.cls_from_device to return non-distributed caches.
+
+    When ranks load the model independently, each already has parameters
+    locally. DistributedCPUCache's per-param broadcast+barrier is
+    unnecessary and causes O(n_params) collective ops (~218ms each).
+    """
+    from compressed_tensors.offload.cache.base import OffloadCache
+    from compressed_tensors.offload.cache.cpu import CPUCache
+    from compressed_tensors.offload.cache.device import DeviceCache
+    from compressed_tensors.offload.cache.disk import DiskCache
+    from compressed_tensors.utils import is_accelerator_type
+
+    @classmethod
+    def cls_from_device_local(cls, device=None):
+        device_type = torch.device(device).type if device != "disk" else "disk"
+        if device_type == "cpu":
+            return CPUCache
+        elif is_accelerator_type(device_type):
+            return DeviceCache
+        elif device_type == "disk":
+            return DiskCache
+        else:
+            raise NotImplementedError(f"Offload of type {device_type} not implemented")
+
+    OffloadCache.cls_from_device = cls_from_device_local
+    logger.info("Patched OffloadCache.cls_from_device → local (non-distributed) caches")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str, required=True)
+    parser.add_argument("--scheme", type=str, default="W4A16")
+    parser.add_argument("--iters", type=int, default=5)
+    parser.add_argument("--nsamples", type=int, default=128)
+    args = parser.parse_args()
+
+    ###### DDP INIT #####
+    gpus_per_group = int(os.environ.get("GPUS_PER_GROUP", "1"))
+    if "TORCHELASTIC_RUN_ID" in os.environ:
+        local_rank = int(os.environ["LOCAL_RANK"])
+        main_gpu = local_rank * gpus_per_group
+        torch.cuda.set_device(main_gpu)
+        dist.init_process_group(
+            backend="nccl",
+            init_method="env://",
+            device_id=torch.device(f"cuda:{main_gpu}"),
+        )
+
+    rank = dist.get_rank() if dist.is_initialized() else 0
+    world_size = dist.get_world_size() if dist.is_initialized() else 1
+    main_gpu = rank * gpus_per_group
+    logger.info(
+        f"[Rank {rank}/{world_size}] GPUs: {torch.cuda.device_count()}, "
+        f"main_gpu: {main_gpu}, group: [{main_gpu}-{main_gpu + gpus_per_group - 1}]"
+    )
+
+    # Apply patches BEFORE model loading and calibration
+    patch_disable_onloading_for_quant_init()
+    patch_force_local_cache()
+
+    ###### MODEL LOAD #####
+    load_start = time.perf_counter()
+    model = AutoModelForCausalLM.from_pretrained(args.model, dtype="auto")
+    load_elapsed = time.perf_counter() - load_start
+    logger.info(f"[Rank {rank}] Model loaded on CPU in {load_elapsed:.1f}s")
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+
+    ###### DATASET #####
+    os.environ["AR_DISABLE_DATASET_SUBPROCESS"] = "1"
+    from auto_round.calib_dataset import get_dataset
+    from llmcompressor.modifiers.autoround import AutoRoundModifier
+
+    ds = get_dataset(tokenizer=tokenizer, seqlen=2048, nsamples=args.nsamples)
+
+    ###### RECIPE #####
+    recipe = AutoRoundModifier(
+        targets="Linear",
+        scheme=args.scheme,
+        ignore=["lm_head", "re:.*mlp.gate$"],
+        iters=args.iters,
+        enable_torch_compile=False,
+    )
+
+    ###### QUANTIZE #####
+    logger.info(f"[Rank {rank}] Starting oneshot...")
+    quant_start = time.perf_counter()
+    oneshot(
+        model=model,
+        dataset=ds,
+        recipe=recipe,
+        max_seq_length=2048,
+        num_calibration_samples=args.nsamples,
+        shuffle_calibration_samples=False,
+    )
+    quant_elapsed = time.perf_counter() - quant_start
+    logger.info(f"[Rank {rank}] Quantization done in {quant_elapsed:.1f}s")
+
+    if dist.is_initialized():
+        dist.barrier()
+
+    ###### SAMPLE GENERATION (rank 0 only) #####
+    if rank == 0:
+        from compressed_tensors.offload import dispatch_model
+
+        logger.info("========== SAMPLE GENERATION ==============")
+        dispatch_model(model)
+        sample = tokenizer("Hello my name is", return_tensors="pt")
+        sample = {key: value.to(model.device) for key, value in sample.items()}
+        output = model.generate(**sample, max_new_tokens=100)
+        logger.info(tokenizer.decode(output[0]))
+        logger.info("==========================================")
+
+    ###### SAVE (rank 0 only) #####
+    if rank == 0:
+        save_dir = (
+            args.model.rstrip("/").split("/")[-1]
+            + f"-{args.scheme}-AutoRound"
+            + f"-iters{args.iters}-nsamples{args.nsamples}"
+            + f"-DDP{world_size}"
+        )
+        logger.info(f"Saving to {save_dir}...")
+        model.save_pretrained(save_dir, save_compressed=True)
+        tokenizer.save_pretrained(save_dir)
+        logger.info(f"Saved to {save_dir}")
+
+    if dist.is_initialized():
+        dist.barrier()
+        dist.destroy_process_group()
+
+    logger.info(f"[Rank {rank}] SUCCESS")
diff --git a/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py b/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py
index 3b15e6e5d7..e49c4d4a77 100644
--- a/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py
+++ b/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py
@@ -6,10 +6,10 @@
 convergence despite split calibration data.
 
 Usage (4 GPUs, 2 GPUs per group):
-  CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=2 \\
-      ddp_qwen3_multi_gpu_example.py \\
-      --model /storage/yiliu7/Qwen/Qwen3-8B \\
-      --scheme W4A16 \\
+  CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=2 \
+      ddp_qwen3_multi_gpu_example.py \
+      --model /storage/yiliu7/Qwen/Qwen3-8B \
+      --scheme W4A16 \
       --nsamples 32 --iters 50
 
 For single-GPU DDP:
diff --git a/examples/autoround/ddp/reproduce.md b/examples/autoround/ddp/reproduce.md
index 234e002d48..099d45f523 100644
--- a/examples/autoround/ddp/reproduce.md
+++ b/examples/autoround/ddp/reproduce.md
@@ -1,6 +1,32 @@
 # Multi-GPU DDP AutoRound Reproduce
 
-## Command
+## torchrun (recommended)
+
+### 8B
+
+```bash
+cd /home/yiliu7/workspace/llm-compressor
+
+bash examples/autoround/ddp/launch_torchrun.sh \
+  --model /storage/yiliu7/Qwen/Qwen3-8B \
+  --scheme W4A16 \
+  --nsamples 32 --iters 50 \
+  --disable_torch_compile
+```
+
+### 235B
+
+```bash
+cd /home/yiliu7/workspace/llm-compressor
+
+AR_DISABLE_DATASET_SUBPROCESS=1 GPUS_PER_GROUP=2 CUDA_VISIBLE_DEVICES=0,1,2,3 \
+/home/yiliu7/workspace/venvs/llmc/bin/torchrun --nproc_per_node=2 --master_port=29500 \
+examples/autoround/ddp/ddp_qwen3_multi_gpu_torchrun.py \
+--model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507 \
+--scheme W4A16 --nsamples 32 --iters 50 --disable_torch_compile
+```
+
+## bash wrapper (dedicated GPU isolation)
 
 ```bash
 cd /home/yiliu7/workspace/llm-compressor
@@ -9,45 +35,65 @@ AR_DISABLE_DATASET_SUBPROCESS=1 CUDA_VISIBLE_DEVICES=0,1,6,7 GPUS_PER_GROUP=2 NP
   bash examples/autoround/ddp/launch_multi_gpu.sh \
   ddp_qwen3_multi_gpu_example.py \
   --model /storage/yiliu7/Qwen/Qwen3-8B \
-  --gpus-per-group 2 \
   --scheme W4A16 \
   --nsamples 32 --iters 50 \
+  --disable_torch_compile \
   > /tmp/multi_gpu_test.log 2>&1 &
 ```
 
 ## Monitor
 
 ```bash
-# Check progress
 tail -f /tmp/multi_gpu_test.log
-# Check processes
 ps aux | grep ddp_qwen3_multi | grep -v grep
-# Check GPU usage
 nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader
-# Kill
-pkill -f ddp_qwen3_multi_gpu_example
+pkill -f ddp_qwen3_multi_gpu
 ```
 
-## Current State
+## Verified
+
+### 8B (2026-06-18)
+```
+quantized 7/7 layers in the block, loss iter 0: 19.067873 -> iter 0: 19.067873
+[Rank 0] Quantization completed
+Hello my name is Mandy I am 20 years old...
+```
+All 37 decoder layers quantized, identical loss across ranks, sample generation works.
 
-- ✅ 4 code changes implemented (launch_multi_gpu.sh, base.py, distributed.py, quantizer.py)
-- ✅ Model loading works with `device_map="auto"` (dispatch 547/547 in <1s)
-- ✅ GPU partitioning works (rank 0 → GPUs 0,1; rank 1 → GPUs 2,3)
-- 🔄 **Hang** after "Disabling tokenizer parallelism" warning — inside `get_dataset()`
-  - `AR_DISABLE_DATASET_SUBPROCESS=1` avoids the fork issue
-  - Dataset is cached, not downloading
-  - Both processes at ~100% CPU but no progress
+### 235B (2026-06-19)
+```
+quantized 388/389 layers in the block, loss iter 0: 0.211156 -> iter 0: 0.211156
+...
+[Rank 0] Quantization completed
+```
+All 94 decoder layers quantized (388 Linear per MoE block), identical loss across ranks. ~25 min for 1 iter.
 
 ## Key Files
 
 | File | Change |
 |------|--------|
-| `examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py` | NEW — multi-GPU DDP example |
-| `examples/autoround/ddp/launch_multi_gpu.sh` | NEW — bash wrapper for GPU partitioning |
-| `src/llmcompressor/modifiers/autoround/base.py` | `_update_device_map_for_dp` + auto_offload gate use `GPUS_PER_GROUP` |
-| `auto_round/utils/distributed.py` | `setup_ddp_if_needed_` returns `(block, sync_fn)` |
-| `auto_round/algorithms/quantization/sign_round/quantizer.py` | Captures return, calls `sync_gradients()` before `_step()` |
+| `examples/autoround/ddp/ddp_qwen3_multi_gpu_torchrun.py` | torchrun example with patches |
+| `examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py` | bash wrapper example |
+| `examples/autoround/ddp/fast_pipeline.py` | Replaces `SequentialPipeline.__call__` — no FX trace |
+| `examples/autoround/ddp/launch_torchrun.sh` | torchrun launcher |
+| `examples/autoround/ddp/launch_multi_gpu.sh` | bash wrapper (GPU partitioning) |
+| `src/llmcompressor/modifiers/autoround/base.py` | `_get_local_gpu_group_size()` reads `GPUS_PER_GROUP` |
+| `src/llmcompressor/pipelines/sequential/helpers.py` | Removed `disable_onloading()` from `trace_subgraphs` |
+| `ar-py/auto_round/utils/distributed.py` | `setup_ddp_if_needed_` returns `(block, sync_fn)`; `current_device()` for NCCL |
+| `ar-py/auto_round/algorithms/quantization/sign_round/quantizer.py` | Captures return, calls `sync_gradients()` before `_step()` |
+
+## Required env vars and flags
+
+| Var | Value | Why |
+|-----|-------|-----|
+| `GPUS_PER_GROUP` | `2` | Triggers multi-GPU block dispatch + manual all_reduce sync |
+| `AR_DISABLE_DATASET_SUBPROCESS` | `1` | Avoids `fork()` with CUDA context |
+| `--disable_torch_compile` | flag | torch.compile can't handle cross-device tensors |
+
+## Known issue: FX trace bottleneck
+
+`trace_subgraphs` runs an FX trace on the full model — for 61K-module models (235B) it never finishes. The `fast_pipeline.py` module bypasses this by creating subgraphs directly from decoder layer names. This affects ALL models using `SequentialPipeline`, not just DDP. The AWQ example (`qwen3_moe_example_ddp.py`) with 30B MoE also hangs.
 
 ## Venv
 
-Python: `/home/yiliu7/workspace/venvs/ar/bin/python`
+Python: `/home/yiliu7/workspace/venvs/llmc/bin/python`
diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py
index 7e1738cf98..dcca16b32f 100644
--- a/src/llmcompressor/modifiers/autoround/base.py
+++ b/src/llmcompressor/modifiers/autoround/base.py
@@ -326,7 +326,8 @@ def apply_autoround(self, state, modules):
                 # so anchoring to first_param.device can place residual modules
                 # (e.g. norms) on local cuda:1 while hidden states begin on
                 # local cuda:0, causing cross-device forward failures.
-                device = torch.device("cuda:0")
+                from llmcompressor.utils.dev import get_main_device
+                device = get_main_device()
                 # Move decoding layer to CPU first, then the submodules
                 # will be re-dispatched by AutoRound.
                 decoding_layer.to("cpu")
@@ -388,11 +389,10 @@ def _update_device_map_for_dp(self, ar_kwargs):
                 return  # user explicitly set device_ids, respect it
             gpus_per_group = _get_local_gpu_group_size()
             if gpus_per_group > 1:
-                # Multi-GPU per group: pass comma-separated local GPU indices
-                # so AutoRound can load-balance submodules across GPUs.
-                # The group size is set by the launch_multi_gpu.sh wrapper.
+                local_rank = torch.distributed.get_rank()
+                start_gpu = local_rank * gpus_per_group
                 ar_kwargs["device_map"] = ",".join(
-                    str(i) for i in range(gpus_per_group)
+                    str(start_gpu + i) for i in range(gpus_per_group)
                 )
             else:
                 ar_kwargs["device_map"] = (
diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py
index 7a6c57b503..1b4e5ecbcf 100644
--- a/src/llmcompressor/pipelines/sequential/helpers.py
+++ b/src/llmcompressor/pipelines/sequential/helpers.py
@@ -7,7 +7,6 @@
 from typing import TYPE_CHECKING, Any, Callable
 
 import torch
-from compressed_tensors.offload import disable_onloading
 from compressed_tensors.utils import patch_attr
 from compressed_tensors.utils.match import match_named_modules
 from loguru import logger
@@ -121,9 +120,6 @@ def trace_subgraphs(
         assert isinstance(model.forward, MethodType)
         assert isinstance(type(model).forward, FunctionType)
 
-        # avoid device movement during tracing
-        stack.enter_context(disable_onloading())
-
         with append_autowrap_source_on_fail():
             graph = GraphModule(
                 model,
diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py
index c948e9c3bf..4e8f703986 100644
--- a/src/llmcompressor/utils/dev.py
+++ b/src/llmcompressor/utils/dev.py
@@ -137,7 +137,7 @@ def get_main_device() -> torch.device:
 
     elif torch.accelerator.is_available():
         accel_type = torch.accelerator.current_accelerator().type
-        return torch.device(accel_type, rank)
+        return torch.device(accel_type, torch.accelerator.current_device_index())
     else:
         logger.warning("No accelerator available! Compressing model on CPU instead")
         return torch.device("cpu")

From 460b5290de3095cfffa39fbd0d24422024af9550 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Sun, 21 Jun 2026 10:08:49 +0000
Subject: [PATCH 04/22] ddp_autoround: destroy process group before saving

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 examples/autoround/ddp/ddp_autoround.py | 47 ++++++++++++++++---------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/examples/autoround/ddp/ddp_autoround.py b/examples/autoround/ddp/ddp_autoround.py
index 200f456f69..0e3ed5eca3 100644
--- a/examples/autoround/ddp/ddp_autoround.py
+++ b/examples/autoround/ddp/ddp_autoround.py
@@ -7,10 +7,16 @@
 Run with:
   CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
     --nproc_per_node=2 ddp_autoround.py \
-    --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ 2>&1 | tee test_ddp_autoround.log
+    --iters 100 \
+    --nsamples 256 \
+    --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ 2>&1 | tee test_ddp_autoround-2.log
   CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
     --nproc_per_node=2 ddp_autoround.py \
-    --model /storage/yiliu7/Qwen/Qwen3-30B-A3B-Instruct-2507/ 2>&1 | tee test_ddp_autoround.log
+    --model /storage/yiliu7/Qwen/Qwen3-30B-A3B-Instruct-2507/ 2>&1 | tee test_ddp_autoround-30.log
+  CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
+    --nproc_per_node=2 ddp_autoround.py \
+    --iters 100 --nsamples 256 \
+    --model /storage/yiliu7/Qwen/Qwen3-30B-A3B-Instruct-2507/ 2>&1 | tee test_ddp_autoround-30.log
   CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
     --nproc_per_node=2 ddp_autoround.py \
     --model /path/to/model
@@ -155,22 +161,20 @@ def cls_from_device_local(cls, device=None):
     if dist.is_initialized():
         dist.barrier()
 
-    ###### SAMPLE GENERATION (rank 0 only) #####
-    if rank == 0:
-        from compressed_tensors.offload import dispatch_model
-
-        logger.info("========== SAMPLE GENERATION ==============")
-        dispatch_model(model)
-        sample = tokenizer("Hello my name is", return_tensors="pt")
-        sample = {key: value.to(model.device) for key, value in sample.items()}
-        output = model.generate(**sample, max_new_tokens=100)
-        logger.info(tokenizer.decode(output[0]))
-        logger.info("==========================================")
 
     ###### SAVE (rank 0 only) #####
+    # Destroy process group before saving — compressed_tensors'
+    # save_pretrained detects DDP via dist.get_world_size() and
+    # tries replace_module_parallel, which fails on meta tensors
+    # left by the pipeline.
+    if dist.is_initialized():
+        dist.barrier()
+        dist.destroy_process_group()
+
     if rank == 0:
         save_dir = (
-            args.model.rstrip("/").split("/")[-1]
+             "/storage/yiliu7/Qwen/"
+             + args.model.rstrip("/").split("/")[-1]
             + f"-{args.scheme}-AutoRound"
             + f"-iters{args.iters}-nsamples{args.nsamples}"
             + f"-DDP{world_size}"
@@ -180,8 +184,17 @@ def cls_from_device_local(cls, device=None):
         tokenizer.save_pretrained(save_dir)
         logger.info(f"Saved to {save_dir}")
 
-    if dist.is_initialized():
-        dist.barrier()
-        dist.destroy_process_group()
+    ###### SAMPLE GENERATION (rank 0 only) #####
+    if rank == 0:
+        from compressed_tensors.offload import dispatch_model
+
+        logger.info("========== SAMPLE GENERATION ==============")
+        dispatch_model(model)
+        sample = tokenizer("Hello my name is", return_tensors="pt")
+        sample = {key: value.to(model.device) for key, value in sample.items()}
+        output = model.generate(**sample, max_new_tokens=100)
+        logger.info(tokenizer.decode(output[0]))
+        logger.info("==========================================")
+
 
     logger.info(f"[Rank {rank}] SUCCESS")

From 4807c78687969ec2e3e3efd8ee56b3d44160782c Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Sun, 21 Jun 2026 11:03:33 +0000
Subject: [PATCH 05/22] Clean PR: remove experimental files, revert unrelated
 dist.py change

- Delete experimental/debug scripts (repro_*.py, test_option*.py)
- Delete redundant examples (multi_gpu_torchrun.py, multi_gpu_example.py,
  fast_pipeline.py, launch scripts)
- Delete CHANGES.md (absorbed into DDP_FIXES.md)
- Revert dist.py CT version compat change (unrelated to DDP)
- Add FX_TRACE_ISSUE.md documentation
- Keep: base.py, helpers.py, dev.py, ddp_autoround.py, docs
---
 examples/autoround/ddp/FX_TRACE_ISSUE.md      |  58 ++
 .../ddp/ddp_qwen3_multi_gpu_example.py        | 659 ------------------
 examples/autoround/ddp/launch_multi_gpu.sh    |  79 ---
 src/llmcompressor/utils/dist.py               |  28 +-
 4 files changed, 64 insertions(+), 760 deletions(-)
 create mode 100644 examples/autoround/ddp/FX_TRACE_ISSUE.md
 delete mode 100644 examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py
 delete mode 100755 examples/autoround/ddp/launch_multi_gpu.sh

diff --git a/examples/autoround/ddp/FX_TRACE_ISSUE.md b/examples/autoround/ddp/FX_TRACE_ISSUE.md
new file mode 100644
index 0000000000..62aa603d5a
--- /dev/null
+++ b/examples/autoround/ddp/FX_TRACE_ISSUE.md
@@ -0,0 +1,58 @@
+
+# FX Trace Bottleneck in SequentialPipeline
+
+## Problem
+
+`trace_subgraphs()` builds an FX graph of the full model (O(n_modules)) before per-layer calibration. For the 235B model with 61K modules, this never finishes in DDP runs.
+
+## Scope
+
+| Modifier | Pipeline | Needs trace? | 235B hangs? |
+|----------|----------|-------------|-------------|
+| RTN | `DataFreePipeline` | No | Never |
+| AWQ | `SequentialPipeline` | Yes | Only in DDP |
+| GPTQ | `SequentialPipeline` | Yes | Only in DDP |
+| AutoRound | `SequentialPipeline` | Yes | Only in DDP |
+
+## Root cause (DDP-specific)
+
+`load_offloaded_model()` → `from_accelerate()` → `dist.broadcast_object_list([61K-entry device_map, offload_dir])` serializes a massive dict via pickle. Rank 1's `dispatch_with_map` then creates OffloadCache for all 61K modules. Without DDP, `from_accelerate` dispatches locally — no broadcast, no wait.
+
+## Loading strategies for 235B DDP
+
+| Strategy | Load time | Trace | Works? |
+|----------|-----------|-------|--------|
+| `load_offloaded_model` + `device_map="auto"` (GPU) | 420s | Fast | No — OOM (1 GPU/rank, 178GB fills completely) |
+| `load_offloaded_model` + `device_map="auto_offload"` (CPU) | 10s | Hangs | No — 61K broadcast + dispatch |
+| CPU-only + sparse offload + `fast_pipeline.py` | 9s | 5s | **Yes** |
+
+## Fixes applied
+
+1. **`helpers.py`** — Removed `disable_onloading()` from `trace_subgraphs` (allows GPU onload)
+2. **`fast_pipeline.py`** — Replaces `SequentialPipeline.__call__` with regex-based layer scanning, no FX trace. Required for 235B DDP.
+3. **`distributed.py`** — Fixed `comm_device` to use `current_device()`; returns `(block, sync_fn)`
+4. **`quantizer.py`** — Captures return, calls `sync_gradients()` before `_step()`
+5. **`base.py`** — `_get_local_gpu_group_size()` reads `GPUS_PER_GROUP`
+
+## Upstream plan
+
+The FX trace is the correct architecture — it handles arbitrary model graphs. For LLMs, a fast path that regex-matches `model.layers.*` is safe. The `fast_pipeline.py` logic should move into `helpers.py` as `trace_subgraphs_fast()`, gated by a `DatasetArguments.sequential_fast_trace` flag or auto-enabled when `module_count > threshold`.
+
+## Environment
+
+| Component | Path |
+|-----------|------|
+| Python | `/home/yiliu7/workspace/venvs/llmc/bin/python` |
+| torchrun | `/home/yiliu7/workspace/venvs/llmc/bin/torchrun` |
+| llm-compressor | `/home/yiliu7/workspace/llm-compressor` |
+| auto-round | `/home/yiliu7/workspace/ar-py` (used by venv) |
+| GPUs | 8× NVIDIA B200, 180 GiB each |
+| Test GPU subset | `CUDA_VISIBLE_DEVICES=0,1,2,3` |
+
+## Required env vars
+
+| Var | Value | Why |
+|-----|-------|-----|
+| `GPUS_PER_GROUP` | `2` | Triggers multi-GPU block dispatch + manual all_reduce sync |
+| `AR_DISABLE_DATASET_SUBPROCESS` | `1` | Avoids `fork()` with CUDA context in `calib_dataset.py` |
+| `CUDA_VISIBLE_DEVICES` | `0,1,2,3` | GPU partition (4 GPUs for 2 ranks) |
diff --git a/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py b/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py
deleted file mode 100644
index e49c4d4a77..0000000000
--- a/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py
+++ /dev/null
@@ -1,659 +0,0 @@
-"""
-Multi-GPU per group DDP example with AutoRound quantization.
-
-Each rank gets a local GPU group for block-level model parallelism, while
-gradients are synchronized across ranks via all_reduce for identical
-convergence despite split calibration data.
-
-Usage (4 GPUs, 2 GPUs per group):
-  CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=2 \
-      ddp_qwen3_multi_gpu_example.py \
-      --model /storage/yiliu7/Qwen/Qwen3-8B \
-      --scheme W4A16 \
-      --nsamples 32 --iters 50
-
-For single-GPU DDP:
-  torchrun --nproc_per_node=4 ddp_qwen3_multi_gpu_example.py ...
-"""
-
-import argparse
-import importlib
-import os
-import sys
-import time
-from pathlib import Path
-
-import psutil
-import torch
-import torch.distributed as dist
-from compressed_tensors.offload import dispatch_model, from_accelerate, load_offloaded_model
-from loguru import logger
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from llmcompressor import oneshot
-
-
-class StopAfterBlocks(RuntimeError):
-    pass
-
-
-def fix_everything(seed=42):
-    import random
-
-    import numpy as np
-
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)
-
-
-def config_deterministic():
-    torch.use_deterministic_algorithms(True, warn_only=False)
-    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
-    fix_everything()
-
-
-def init_dist_multi_gpu(gpus_per_group=2):
-    """Initialize distributed with multiple GPUs per group.
-
-    ``CUDA_VISIBLE_DEVICES`` must already be set to a disjoint subset of
-    GPUs for this rank (the ``launch_multi_gpu.sh`` wrapper handles this).
-    NCCL communication uses the first visible GPU (local cuda:0).
-
-    Example with 4 physical GPUs, 2 per group:
-      - Rank 0 -> local cuda:0, cuda:1 (physical 0, 1)
-      - Rank 1 -> local cuda:0, cuda:1 (physical 2, 3)
-    """
-    rank = int(os.environ.get("RANK", "0"))
-    world_size = int(os.environ.get("WORLD_SIZE", "1"))
-
-    if world_size < 2:
-        logger.info("Single-process mode, skipping distributed init")
-        return
-
-    # NCCL uses the first visible GPU
-    torch.cuda.set_device(0)
-
-    dist.init_process_group(
-        backend="nccl",
-        init_method="env://",
-        rank=rank,
-        world_size=world_size,
-        device_id=torch.device("cuda:0"),
-    )
-    dist.barrier()
-    actual_count = torch.cuda.device_count()
-    logger.info(
-        f"[Rank {rank}/{world_size}] CUDA_VISIBLE_DEVICES="
-        f"{os.environ.get('CUDA_VISIBLE_DEVICES', 'unset')} "
-        f"(visible GPUs: {actual_count})"
-    )
-    if actual_count < gpus_per_group:
-        logger.warning(
-            f"[Rank {rank}] Expected {gpus_per_group} GPUs but only "
-            f"{actual_count} are visible"
-        )
-
-
-def get_dist_info():
-    if dist.is_available() and dist.is_initialized():
-        return dist.get_rank(), dist.get_world_size()
-    return 0, 1
-
-
-def _module_has_direct_tensors(module: torch.nn.Module) -> bool:
-    return any(t is not None for t in module._parameters.values()) or any(
-        t is not None for t in module._buffers.values()
-    )
-
-
-def _module_has_meta_tensors(module: torch.nn.Module) -> bool:
-    return any(
-        t is not None and t.device.type == "meta"
-        for t in module._parameters.values()
-    ) or any(t is not None and t.device.type == "meta" for t in module._buffers.values())
-
-
-def patch_ct_dispatch_for_sparse_offload():
-    """Avoid wrapping modules that do not need compressed-tensors offload hooks."""
-    dispatch_mod = importlib.import_module("compressed_tensors.offload.dispatch")
-    fa_mod = importlib.import_module("compressed_tensors.offload.convert.from_accelerate")
-
-    if getattr(dispatch_mod.dispatch_with_map, "_llmc_sparse_patch", False):
-        return
-
-    offload_module = dispatch_mod.offload_module
-    tqdm = dispatch_mod.tqdm
-
-    def optimized_dispatch_with_map(
-        model: torch.nn.Module,
-        device_map,
-        offload_dir: str | None = None,
-        show_progress: bool = True,
-    ):
-        filtered = []
-        skipped_noop = 0
-        skipped_empty = 0
-        skipped_cpu_cpu = 0
-        kept_meta_materialization = 0
-
-        for name, (onload_device, offload_device) in device_map.items():
-            if offload_device is None:
-                skipped_noop += 1
-                continue
-
-            module = model.get_submodule(name)
-            if not _module_has_direct_tensors(module):
-                skipped_empty += 1
-                continue
-
-            if (
-                str(onload_device) == "cpu"
-                and str(offload_device) == "cpu"
-                and not _module_has_meta_tensors(module)
-            ):
-                skipped_cpu_cpu += 1
-                continue
-
-            if str(onload_device) == "cpu" and str(offload_device) == "cpu":
-                kept_meta_materialization += 1
-
-            filtered.append((name, onload_device, offload_device))
-
-        logger.info(
-            "Compressed-tensors dispatch filtered {} -> {} modules "
-            "(noop={}, empty={}, cpu_to_cpu_skipped={}, cpu_to_cpu_meta_kept={})",
-            len(device_map),
-            len(filtered),
-            skipped_noop,
-            skipped_empty,
-            skipped_cpu_cpu,
-            kept_meta_materialization,
-        )
-
-        for name, onload_device, offload_device in tqdm(
-            filtered,
-            desc="Dispatching model",
-            disable=(not show_progress),
-        ):
-            module = model.get_submodule(name)
-            if offload_device == "disk":
-                offload_module(
-                    module,
-                    onload_device,
-                    offload_device,
-                    offload_dir=offload_dir,
-                )
-            else:
-                offload_module(module, onload_device, offload_device)
-
-    optimized_dispatch_with_map._llmc_sparse_patch = True
-    dispatch_mod.dispatch_with_map = optimized_dispatch_with_map
-    fa_mod.dispatch_with_map = optimized_dispatch_with_map
-
-
-def _rank_offload_folder(base_folder: str | None) -> str | None:
-    if not base_folder:
-        return None
-
-    rank, _ = get_dist_info()
-    rank_folder = Path(base_folder) / f"rank{rank}"
-    rank_folder.mkdir(parents=True, exist_ok=True)
-    return str(rank_folder)
-
-
-def _independent_cpu_max_memory(extra_cpu_mem: int = int(5e9)) -> dict[str, int]:
-    _, world_size = get_dist_info()
-    per_rank_available = psutil.virtual_memory().available // max(world_size, 1)
-    return {"cpu": max(per_rank_available - extra_cpu_mem, int(8e9))}
-
-
-def load_model_with_local_offload(model_id: str, offload_folder: str | None):
-    """Load model on each rank independently, then convert accelerate offload locally."""
-    load_kwargs = {
-        "dtype": "auto",
-        "device_map": "auto",
-        "max_memory": _independent_cpu_max_memory(),
-    }
-    rank_offload_folder = _rank_offload_folder(offload_folder)
-    if rank_offload_folder:
-        load_kwargs["offload_folder"] = rank_offload_folder
-
-    logger.info(
-        "[Rank {}] Loading model independently with max_memory={} offload_folder={}",
-        get_dist_info()[0],
-        load_kwargs["max_memory"],
-        rank_offload_folder,
-    )
-    model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
-    if hasattr(model, "hf_device_map"):
-        from_accelerate(model)
-    return model
-
-
-def patch_disable_onloading_for_quant_init():
-    """Avoid expensive dist.broadcast + barrier for every new quant param.
-
-    When DDP is initialized before model loading,
-    ``OffloadCache.cls_from_device`` selects distributed cache variants
-    (DistributedCPUCache / DistributedDiskCache).  Each call to
-    ``register_parameter`` inside ``initialize_module_for_quantization``
-    triggers ``offload()``, which does a collective broadcast + barrier.
-    For large MoE models (e.g. Qwen3-235B with 100K+ Linear layers × 6
-    quant params), this means hundreds of thousands of round-trips —
-    effectively hanging the process.
-
-    Wrapping the body in ``disable_onloading()`` stores new parameters
-    directly in ``offloaded_values`` without invoking the distributed
-    offload, cutting the overhead to zero.
-    """
-    from compressed_tensors.offload import (  # noqa: F811
-        disable_onloading,
-    )
-
-    lifecycle_init_mod = importlib.import_module(
-        "compressed_tensors.quantization.lifecycle.initialize"
-    )
-    original_fn = lifecycle_init_mod.initialize_module_for_quantization
-
-    if getattr(original_fn, "_llmc_no_dist_offload_patch", False):
-        return
-
-    def patched_initialize_module_for_quantization(module, scheme=None, force_zero_point=True):
-        with disable_onloading():
-            return original_fn(module, scheme=scheme, force_zero_point=force_zero_point)
-
-    patched_initialize_module_for_quantization._llmc_no_dist_offload_patch = True
-    lifecycle_init_mod.initialize_module_for_quantization = (
-        patched_initialize_module_for_quantization
-    )
-
-
-def patch_autoround_stop_after_blocks(max_blocks: int):
-    """Raise after N decoding blocks finish so large-model smoke tests can stop cleanly."""
-    autoround_mod = importlib.import_module("llmcompressor.modifiers.autoround.base")
-    modifier_cls = autoround_mod.AutoRoundModifier
-
-    if getattr(modifier_cls.apply_autoround, "_llmc_stop_after_patch", False):
-        return
-
-    original_apply_autoround = modifier_cls.apply_autoround
-
-    def wrapped_apply_autoround(self, state, modules):
-        modules = modules or []
-        if not any(self._is_decoding_layer(module) for module in modules):
-            return original_apply_autoround(self, state, modules)
-
-        result = original_apply_autoround(self, state, modules)
-        completed = getattr(self, "_llmc_completed_blocks", 0) + 1
-        self._llmc_completed_blocks = completed
-        logger.info(
-            "[Rank {}] Completed AutoRound block {}/{}",
-            get_dist_info()[0],
-            completed,
-            max_blocks,
-        )
-        if completed >= max_blocks:
-            raise StopAfterBlocks(f"Stopped after {completed} blocks")
-        return result
-
-    wrapped_apply_autoround._llmc_stop_after_patch = True
-    modifier_cls.apply_autoround = wrapped_apply_autoround
-
-
-def patch_llmc_timing_logs():
-    """Add coarse timing logs around the expensive LLMC startup stages."""
-    recipe_mod = importlib.import_module("llmcompressor.recipe.recipe")
-    lifecycle_mod = importlib.import_module("llmcompressor.core.lifecycle")
-    quant_mixin_mod = importlib.import_module(
-        "llmcompressor.modifiers.quantization.quantization.mixin"
-    )
-    quantization_base_mod = importlib.import_module(
-        "compressed_tensors.quantization"
-    )
-    module_utils_mod = importlib.import_module("compressed_tensors.utils")
-    group_validation_mod = importlib.import_module(
-        "llmcompressor.modifiers.quantization.group_size_validation"
-    )
-    seq_helpers_mod = importlib.import_module("llmcompressor.pipelines.sequential.helpers")
-    seq_pipeline_mod = importlib.import_module("llmcompressor.pipelines.sequential.pipeline")
-    cache_mod = importlib.import_module("llmcompressor.pipelines.cache")
-    autoround_mod = importlib.import_module("llmcompressor.modifiers.autoround.base")
-    core_mod = importlib.import_module("llmcompressor.core")
-
-    recipe_cls = recipe_mod.Recipe
-    lifecycle_cls = lifecycle_mod.CompressionLifecycle
-    quant_mixin_cls = quant_mixin_mod.QuantizationMixin
-    cache_cls = cache_mod.IntermediatesCache
-    autoround_cls = autoround_mod.AutoRoundModifier
-    seq_pipeline_cls = seq_pipeline_mod.SequentialPipeline
-    lifecycle_callbacks = core_mod.LifecycleCallbacks
-
-    if getattr(recipe_cls.from_modifiers, "_llmc_timing_patch", False):
-        return
-
-    original_from_modifiers = recipe_cls.from_modifiers.__func__
-    original_lifecycle_initialize = lifecycle_cls.initialize
-    original_initialize_quantization = quant_mixin_cls.initialize_quantization
-    original_start_calibration = autoround_cls.start_calibration
-    original_trace_subgraphs = seq_helpers_mod.trace_subgraphs
-    original_from_dataloader = cache_cls.from_dataloader.__func__
-    original_apply_autoround = autoround_cls.apply_autoround
-    original_seq_call = seq_pipeline_cls.__call__
-    original_calib_epoch_start = lifecycle_callbacks.calibration_epoch_start
-    original_match_named_modules = module_utils_mod.match_named_modules
-    original_apply_quantization_config = quantization_base_mod.apply_quantization_config
-    original_validate_group_size_divisibility = (
-        group_validation_mod.validate_group_size_divisibility
-    )
-
-    def _timed(label, fn, *args, **kwargs):
-        start = time.perf_counter()
-        logger.info("[Rank {}] {} started", get_dist_info()[0], label)
-        try:
-            return fn(*args, **kwargs)
-        finally:
-            logger.info(
-                "[Rank {}] {} finished in {:.2f}s",
-                get_dist_info()[0],
-                label,
-                time.perf_counter() - start,
-            )
-
-    @classmethod
-    def timed_from_modifiers(cls, modifiers, modifier_group_name=None):
-        return _timed(
-            "Recipe.from_modifiers",
-            original_from_modifiers,
-            cls,
-            modifiers,
-            modifier_group_name,
-        )
-
-    def timed_lifecycle_initialize(self, *args, **kwargs):
-        return _timed(
-            "CompressionLifecycle.initialize",
-            original_lifecycle_initialize,
-            self,
-            *args,
-            **kwargs,
-        )
-
-    def timed_initialize_quantization(self, model):
-        return _timed(
-            "QuantizationMixin.initialize_quantization",
-            original_initialize_quantization,
-            self,
-            model,
-        )
-
-    def timed_start_calibration(self, model):
-        return _timed(
-            "AutoRoundModifier.start_calibration",
-            original_start_calibration,
-            self,
-            model,
-        )
-
-    def timed_trace_subgraphs(*args, **kwargs):
-        return _timed("trace_subgraphs", original_trace_subgraphs, *args, **kwargs)
-
-    @classmethod
-    def timed_from_dataloader(cls, *args, **kwargs):
-        return _timed(
-            "IntermediatesCache.from_dataloader",
-            original_from_dataloader,
-            cls,
-            *args,
-            **kwargs,
-        )
-
-    def timed_apply_autoround(self, state, modules):
-        modules = modules or []
-        decoding_layers = [m for m in modules if self._is_decoding_layer(m)]
-        if not decoding_layers:
-            return original_apply_autoround(self, state, modules)
-        layer_name = getattr(decoding_layers[0], "_tmp_name", decoding_layers[0].__class__.__name__)
-        return _timed(
-            f"AutoRoundModifier.apply_autoround({layer_name})",
-            original_apply_autoround,
-            self,
-            state,
-            modules,
-        )
-
-    def timed_seq_call(model, dataloader, dataset_args):
-        pipeline_start = time.perf_counter()
-        logger.info("[Rank {}] SequentialPipeline.__call__ started", get_dist_info()[0])
-        try:
-            logger.info("[Rank {}] SequentialPipeline pre-next(iter(dataloader))", get_dist_info()[0])
-            iter_start = time.perf_counter()
-            sample_input = next(iter(dataloader))
-            logger.info(
-                "[Rank {}] next(iter(dataloader)) finished in {:.2f}s",
-                get_dist_info()[0],
-                time.perf_counter() - iter_start,
-            )
-            del sample_input
-            return original_seq_call(model, dataloader, dataset_args)
-        finally:
-            logger.info(
-                "[Rank {}] SequentialPipeline.__call__ finished in {:.2f}s",
-                get_dist_info()[0],
-                time.perf_counter() - pipeline_start,
-            )
-
-    def timed_calib_epoch_start(*args, **kwargs):
-        return _timed(
-            "LifecycleCallbacks.calibration_epoch_start",
-            original_calib_epoch_start,
-            *args,
-            **kwargs,
-        )
-
-    def timed_match_named_modules(*args, **kwargs):
-        return _timed("match_named_modules", original_match_named_modules, *args, **kwargs)
-
-    def timed_apply_quantization_config(*args, **kwargs):
-        return _timed(
-            "apply_quantization_config",
-            original_apply_quantization_config,
-            *args,
-            **kwargs,
-        )
-
-    def timed_validate_group_size_divisibility(*args, **kwargs):
-        return _timed(
-            "validate_group_size_divisibility",
-            original_validate_group_size_divisibility,
-            *args,
-            **kwargs,
-        )
-
-    timed_from_modifiers._llmc_timing_patch = True
-    recipe_cls.from_modifiers = timed_from_modifiers
-    lifecycle_cls.initialize = timed_lifecycle_initialize
-    quant_mixin_cls.initialize_quantization = timed_initialize_quantization
-    autoround_cls.start_calibration = timed_start_calibration
-    module_utils_mod.match_named_modules = timed_match_named_modules
-    quant_mixin_mod.match_named_modules = timed_match_named_modules
-    quantization_base_mod.apply_quantization_config = timed_apply_quantization_config
-    quant_mixin_mod.apply_quantization_config = timed_apply_quantization_config
-    group_validation_mod.validate_group_size_divisibility = timed_validate_group_size_divisibility
-    quant_mixin_mod.validate_group_size_divisibility = timed_validate_group_size_divisibility
-    seq_helpers_mod.trace_subgraphs = timed_trace_subgraphs
-    seq_pipeline_mod.trace_subgraphs = timed_trace_subgraphs
-    cache_cls.from_dataloader = timed_from_dataloader
-    autoround_cls.apply_autoround = timed_apply_autoround
-    seq_pipeline_cls.__call__ = staticmethod(timed_seq_call)
-    lifecycle_callbacks.calibration_epoch_start = timed_calib_epoch_start
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="AutoRound Quantization with Multi-GPU per Group DDP"
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        default="Qwen/Qwen3-8B",
-        help="Model name or path",
-    )
-    parser.add_argument(
-        "--gpus-per-group",
-        type=int,
-        default=2,
-        help="Number of GPUs per rank-local group for block sharding (default: 2)",
-    )
-    parser.add_argument(
-        "--scheme",
-        type=str,
-        default="W4A16",
-        help="Quantization scheme (W4A16, MXFP8, MXFP4, etc.)",
-    )
-    parser.add_argument("--iters", type=int, default=20, help="Number of iterations")
-    parser.add_argument("--nsamples", type=int, default=128, help="Number of samples")
-    parser.add_argument(
-        "--disable_torch_compile",
-        action="store_true",
-        help="Disable torch.compile for model acceleration during quantization",
-    )
-    parser.add_argument(
-        "--deterministic",
-        action="store_true",
-        help="Enable deterministic mode for reproducibility",
-    )
-    parser.add_argument(
-        "--offload-folder",
-        type=str,
-        default=None,
-        help="Optional folder for disk offload while loading very large models",
-    )
-    parser.add_argument(
-        "--max-blocks",
-        type=int,
-        default=None,
-        help="Optional number of decoder blocks to quantize before exiting",
-    )
-    args = parser.parse_args()
-
-    if args.deterministic:
-        config_deterministic()
-
-    model_id = args.model
-
-    ###### MULTI-GPU DDP INIT #####
-    init_dist_multi_gpu(gpus_per_group=args.gpus_per_group)
-    patch_ct_dispatch_for_sparse_offload()
-    patch_llmc_timing_logs()
-    patch_disable_onloading_for_quant_init()
-    if args.max_blocks is not None:
-        patch_autoround_stop_after_blocks(args.max_blocks)
-    # Load onto CPU first and spill to disk if needed. AutoRound will then
-    # onload and shard each block onto the rank-local GPU group during tuning.
-    load_start = time.perf_counter()
-    rank, world_size = get_dist_info()
-    if world_size > 1:
-        model = load_model_with_local_offload(model_id, args.offload_folder)
-    else:
-        load_kwargs = {
-            "dtype": "auto",
-            "device_map": "auto_offload",
-        }
-        rank_offload_folder = _rank_offload_folder(args.offload_folder)
-        if rank_offload_folder:
-            load_kwargs["offload_folder"] = rank_offload_folder
-        with load_offloaded_model():
-            model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
-    logger.info(
-        "[Rank {}] Model load + offload conversion finished in {:.2f}s",
-        rank,
-        time.perf_counter() - load_start,
-    )
-    ###############################
-
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-    NUM_CALIBRATION_SAMPLES = args.nsamples
-    MAX_SEQUENCE_LENGTH = 2048
-    ITERS = args.iters
-
-    # Get aligned calibration dataset.
-    from auto_round.calib_dataset import get_dataset  # noqa: E402
-
-    # Note: Make sure model are loaded before importing auto-round related code.
-    from llmcompressor.modifiers.autoround import AutoRoundModifier  # noqa: E402
-
-    ds = get_dataset(
-        tokenizer=tokenizer,
-        seqlen=MAX_SEQUENCE_LENGTH,
-        nsamples=NUM_CALIBRATION_SAMPLES,
-    )
-
-    # Configure the quantization algorithm.
-    recipe = AutoRoundModifier(
-        targets="Linear",
-        scheme=args.scheme,
-        ignore=[
-            "lm_head",
-            "re:.*mlp.gate$",
-        ],
-        iters=ITERS,
-        enable_torch_compile=not args.disable_torch_compile,
-    )
-
-    # Apply algorithms.
-    stopped_early = False
-    try:
-        oneshot(
-            model=model,
-            dataset=ds,
-            recipe=recipe,
-            max_seq_length=MAX_SEQUENCE_LENGTH,
-            num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-            shuffle_calibration_samples=False,
-        )
-    except StopAfterBlocks as exc:
-        stopped_early = True
-        logger.info("[Rank {}] {}", get_dist_info()[0], str(exc))
-
-    rank, world_size = get_dist_info()
-    if stopped_early:
-        logger.info(f"[Rank {rank}] Partial quantization completed")
-    else:
-        logger.info(f"[Rank {rank}] Quantization completed")
-
-    if dist.is_available() and dist.is_initialized():
-        dist.barrier()
-        dist.destroy_process_group()
-
-    if rank != 0:
-        sys.exit(0)
-
-    if stopped_early:
-        sys.exit(0)
-
-    if rank == 0:
-        # Confirm generations of the quantized model look sane.
-        logger.info("\n\n")
-        logger.info("========== SAMPLE GENERATION ==============")
-        dispatch_model(model)
-        sample = tokenizer("Hello my name is", return_tensors="pt")
-        sample_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-        sample = {key: value.to(sample_device) for key, value in sample.items()}
-        output = model.generate(**sample, max_new_tokens=100)
-        logger.info(tokenizer.decode(output[0]))
-        logger.info("==========================================\n\n")
-
-        logger.info("Saving...")
-        SAVE_DIR = (
-            model_id.rstrip("/").split("/")[-1]
-            + f"-{args.scheme}-AutoRound"
-            + f"-iters{args.iters}-nsamples{args.nsamples}"
-            + "-MultiGPUDDP"
-            + str(world_size)
-        )
-        model.save_pretrained(SAVE_DIR, save_compressed=True)
-        tokenizer.save_pretrained(SAVE_DIR)
-        logger.info(f"Saved to {SAVE_DIR}")
diff --git a/examples/autoround/ddp/launch_multi_gpu.sh b/examples/autoround/ddp/launch_multi_gpu.sh
deleted file mode 100755
index 14e40c9a78..0000000000
--- a/examples/autoround/ddp/launch_multi_gpu.sh
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/bin/bash
-# Launch multi-GPU per group DDP training.
-#
-# Partitions physical GPUs into groups, one group per process/rank.
-# Each rank sees its own set of GPUs via CUDA_VISIBLE_DEVICES.
-#
-# Usage:
-#   GPUS_PER_GROUP=2 ./launch_multi_gpu.sh ddp_qwen3_multi_gpu_example.py --model ... --scheme W4A16
-#   GPUS_PER_GROUP=2 ./launch_multi_gpu.sh ddp_qwen3_multi_gpu_example.py --model /storage/yiliu7/Qwen/Qwen3-30B-A3B-Instruct-2507  --scheme W4A16
-#
-# This spawns 2 ranks, each with 2 GPUs (4 GPUs total).
-# The Python script no longer needs to override CUDA_VISIBLE_DEVICES.
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-GPUS_PER_GROUP=${GPUS_PER_GROUP:-${GPUS_PER_RANK:-2}}
-NPROC=${NPROC:-2}  # number of ranks
-PYTHON=${PYTHON:-/home/yiliu7/workspace/venvs/ar/bin/python}
-MASTER_PORT=${MASTER_PORT:-29600}
-MASTER_ADDR=${MASTER_ADDR:-localhost}
-NNODES=${NNODES:-1}
-NODE_RANK=${NODE_RANK:-0}
-
-SCRIPT="$1"
-shift
-
-echo "Launching $NPROC ranks, $GPUS_PER_GROUP GPUs each"
-echo "Python: $PYTHON"
-echo "Script: $SCRIPT"
-
-VISIBLE_GPUS_ENV=${CUDA_VISIBLE_DEVICES:-}
-if [[ -n "$VISIBLE_GPUS_ENV" ]]; then
-    IFS=',' read -r -a VISIBLE_GPUS <<< "$VISIBLE_GPUS_ENV"
-else
-    VISIBLE_GPUS=()
-fi
-
-TOTAL_GPUS_NEEDED=$((NPROC * GPUS_PER_GROUP))
-if [[ ${#VISIBLE_GPUS[@]} -gt 0 && ${#VISIBLE_GPUS[@]} -ne $TOTAL_GPUS_NEEDED ]]; then
-    echo "Expected $TOTAL_GPUS_NEEDED GPUs in CUDA_VISIBLE_DEVICES, got ${#VISIBLE_GPUS[@]}: $VISIBLE_GPUS_ENV" >&2
-    exit 1
-fi
-
-pids=()
-for RANK in $(seq 0 $((NPROC - 1))); do
-    if [[ ${#VISIBLE_GPUS[@]} -gt 0 ]]; then
-        GPU_OFFSET=$((RANK * GPUS_PER_GROUP))
-        GPU_LIST=$(IFS=,; echo "${VISIBLE_GPUS[*]:$GPU_OFFSET:$GPUS_PER_GROUP}")
-    else
-        GPU_START=$((NODE_RANK * NPROC * GPUS_PER_GROUP + RANK * GPUS_PER_GROUP))
-        GPU_END=$((GPU_START + GPUS_PER_GROUP - 1))
-        GPU_LIST=$(seq -s, $GPU_START $GPU_END)
-    fi
-    echo "  Rank $RANK -> GPUs $GPU_LIST"
-
-    CUDA_VISIBLE_DEVICES="$GPU_LIST" \
-    AR_DISABLE_DATASET_SUBPROCESS=1 \
-    LOCAL_RANK=0 \
-    RANK=$((NODE_RANK * NPROC + RANK)) \
-    WORLD_SIZE=$((NNODES * NPROC)) \
-    MASTER_ADDR="$MASTER_ADDR" \
-    MASTER_PORT="$MASTER_PORT" \
-    TORCHELASTIC_RUN_ID="multi_gpu_$(date +%s)_$$" \
-    GPUS_PER_GROUP="$GPUS_PER_GROUP" \
-    "$PYTHON" -u "$SCRIPT_DIR/$SCRIPT" "$@" &
-
-    pids+=($!)
-    # Small delay so workers don't race for port binding
-    sleep 0.5
-done
-
-# Wait for all processes
-status=0
-for pid in "${pids[@]}"; do
-    if ! wait "$pid"; then
-        status=1
-    fi
-done
-exit $status
diff --git a/src/llmcompressor/utils/dist.py b/src/llmcompressor/utils/dist.py
index a1f75af804..c4a04d42eb 100644
--- a/src/llmcompressor/utils/dist.py
+++ b/src/llmcompressor/utils/dist.py
@@ -1,17 +1,11 @@
 from typing import Hashable, TypeVar
 
-try:
-    from compressed_tensors.distributed import (
-        greedy_bin_packing as _greedy_bin_packing,
-    )
-    from compressed_tensors.distributed import (
-        wait_for_comms as _wait_for_comms,
-    )
-except ImportError:
-    # compressed_tensors<0.16 does not have the distributed submodule
-    _greedy_bin_packing = None
-    _wait_for_comms = None
-
+from compressed_tensors.distributed import (
+    greedy_bin_packing as _greedy_bin_packing,
+)
+from compressed_tensors.distributed import (
+    wait_for_comms as _wait_for_comms,
+)
 from compressed_tensors.utils.helpers import deprecated
 
 T = TypeVar("T", bound=Hashable)
@@ -35,11 +29,6 @@ def greedy_bin_packing(*args, **kwargs) -> tuple[list[T], list[list[T]], dict[T,
           the list of items assigned to that bin.
         - item_to_bin: mapping from each item to its assigned bin index.
     """
-    if _greedy_bin_packing is None:
-        raise ImportError(
-            "greedy_bin_packing requires compressed-tensors>=0.16 "
-            "(distributed submodule not found)"
-        )
     return _greedy_bin_packing(*args, **kwargs)
 
 
@@ -55,9 +44,4 @@ def wait_for_comms(*args, **kwargs) -> None:
         ``async_op=True``). The list is cleared after all operations
         have completed.
     """
-    if _wait_for_comms is None:
-        raise ImportError(
-            "wait_for_comms requires compressed-tensors>=0.16 "
-            "(distributed submodule not found)"
-        )
     return _wait_for_comms(*args, **kwargs)

From 3e40140f69478014e06516c274adedbf35bb9ae4 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Sun, 21 Jun 2026 11:15:58 +0000
Subject: [PATCH 06/22] Revert: restore disable_onloading() in trace_subgraphs

---
 src/llmcompressor/pipelines/sequential/helpers.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py
index 1b4e5ecbcf..7a6c57b503 100644
--- a/src/llmcompressor/pipelines/sequential/helpers.py
+++ b/src/llmcompressor/pipelines/sequential/helpers.py
@@ -7,6 +7,7 @@
 from typing import TYPE_CHECKING, Any, Callable
 
 import torch
+from compressed_tensors.offload import disable_onloading
 from compressed_tensors.utils import patch_attr
 from compressed_tensors.utils.match import match_named_modules
 from loguru import logger
@@ -120,6 +121,9 @@ def trace_subgraphs(
         assert isinstance(model.forward, MethodType)
         assert isinstance(type(model).forward, FunctionType)
 
+        # avoid device movement during tracing
+        stack.enter_context(disable_onloading())
+
         with append_autowrap_source_on_fail():
             graph = GraphModule(
                 model,

From e1e6c991a3cf8c86c278206df2d3ac97700e11e6 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Sun, 21 Jun 2026 11:38:42 +0000
Subject: [PATCH 07/22] clean

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 examples/autoround/ddp/ddp_autoround.py       | 20 +++----------------
 src/llmcompressor/modifiers/autoround/base.py |  9 ---------
 2 files changed, 3 insertions(+), 26 deletions(-)

diff --git a/examples/autoround/ddp/ddp_autoround.py b/examples/autoround/ddp/ddp_autoround.py
index 0e3ed5eca3..89f961377e 100644
--- a/examples/autoround/ddp/ddp_autoround.py
+++ b/examples/autoround/ddp/ddp_autoround.py
@@ -7,19 +7,8 @@
 Run with:
   CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
     --nproc_per_node=2 ddp_autoround.py \
-        --iters 100 \
-            --nsamples 256 \
-    --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ 2>&1 | tee test_ddp_autoround-2.log
-  CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
-    --nproc_per_node=2 ddp_autoround.py \
-    --model /storage/yiliu7/Qwen/Qwen3-30B-A3B-Instruct-2507/ 2>&1 | tee test_ddp_autoround-30.log
-  CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
-    --nproc_per_node=2 ddp_autoround.py \
-            --iters 100 --nsamples 256 \
-    --model /storage/yiliu7/Qwen/Qwen3-30B-A3B-Instruct-2507/ 2>&1 | tee test_ddp_autoround-30.log
-  CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
-    --nproc_per_node=2 ddp_autoround.py \
-    --model /path/to/model
+    --iters 100 --nsamples 256 \
+    --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ 
 """
 
 import argparse
@@ -117,6 +106,7 @@ def cls_from_device_local(cls, device=None):
     )
 
     # Apply patches BEFORE model loading and calibration
+    # FIXME: (yiliu30) remove these patches before merging once the underlying issues are fixed
     patch_disable_onloading_for_quant_init()
     patch_force_local_cache()
 
@@ -163,10 +153,6 @@ def cls_from_device_local(cls, device=None):
 
 
     ###### SAVE (rank 0 only) #####
-    # Destroy process group before saving — compressed_tensors'
-    # save_pretrained detects DDP via dist.get_world_size() and
-    # tries replace_module_parallel, which fails on meta tensors
-    # left by the pipeline.
     if dist.is_initialized():
         dist.barrier()
         dist.destroy_process_group()
diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py
index dcca16b32f..8f591d2343 100644
--- a/src/llmcompressor/modifiers/autoround/base.py
+++ b/src/llmcompressor/modifiers/autoround/base.py
@@ -284,15 +284,6 @@ def apply_autoround(self, state, modules):
             "ignore_layers": ",".join(ignore_layers) if ignore_layers else "",
             "disable_opt_rtn": self.disable_opt_rtn,
         }
-        if torch.distributed.is_initialized():
-            gpus_per_group = _get_local_gpu_group_size()
-            if gpus_per_group > 1 and kwargs["enable_torch_compile"]:
-                logger.warning(
-                    "Disabling torch.compile for AutoRound multi-GPU group DDP "
-                    "because compiled block execution does not support "
-                    "cross-device sharding."
-                )
-                kwargs["enable_torch_compile"] = False
 
         llmc_registered_qparams = self._preprocess_qparams(decoding_layer)
         with (

From 86fc407fdebcea5510934077bc802f5dae45dfee Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Sun, 21 Jun 2026 11:39:18 +0000
Subject: [PATCH 08/22] Remove DDP debugging notes from examples/autoround/ddp

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 examples/autoround/ddp/DDP_FIXES.md      | 237 -----------------------
 examples/autoround/ddp/FX_TRACE_ISSUE.md |  58 ------
 examples/autoround/ddp/reproduce.md      |  99 ----------
 3 files changed, 394 deletions(-)
 delete mode 100644 examples/autoround/ddp/DDP_FIXES.md
 delete mode 100644 examples/autoround/ddp/FX_TRACE_ISSUE.md
 delete mode 100644 examples/autoround/ddp/reproduce.md

diff --git a/examples/autoround/ddp/DDP_FIXES.md b/examples/autoround/ddp/DDP_FIXES.md
deleted file mode 100644
index 50e9e7352d..0000000000
--- a/examples/autoround/ddp/DDP_FIXES.md
+++ /dev/null
@@ -1,237 +0,0 @@
-# DDP Multi-GPU AutoRound Fixes for Large MoE Models
-
-## Problem
-
-Running AutoRound quantization with DDP on large MoE models (e.g., Qwen3-235B) would hang or take hours due to `DistributedCPUCache` performing a `dist.broadcast_object_list()` + `dist.barrier()` **per parameter** during offload operations (~218ms × 45K params = ~163 minutes).
-
-## Root Cause
-
-When `dist.is_initialized()`, `OffloadCache.cls_from_device("cpu")` returns `DistributedCPUCache` instead of `CPUCache`. This cache broadcasts every tensor to all ranks — unnecessary when each rank loads the model independently via safetensors mmap.
-
-The bottleneck hits in two places:
-1. `from_accelerate()` → `dispatch_with_map()` 
-2. `set_onload_device()` in SequentialPipeline
-
-## Fixes Applied
-
-### Fix 1: `src/llmcompressor/utils/dev.py` — `get_main_device()` 
-
-**Bug**: Used `rank` as the CUDA device index, which is wrong when `GPUS_PER_GROUP > 1`.  
-**Fix**: Use `torch.accelerator.current_device_index()` which respects `torch.cuda.set_device()`.
-
-```python
-# Before (line 140):
-return torch.device(accel_type, rank)
-
-# After:
-return torch.device(accel_type, torch.accelerator.current_device_index())
-```
-
-### Fix 2: `src/llmcompressor/modifiers/autoround/base.py` — anchor device in `apply_autoround`
-
-**Bug**: Hardcoded `device = torch.device("cuda:0")` when `needs_multi_gpu` is true. Rank 1 with GPUs [2,3] would try to anchor on cuda:0 instead of cuda:2.  
-**Fix**: Use `get_main_device()` which returns the correct per-rank device.
-
-```python
-# Before (line ~329):
-device = torch.device("cuda:0")
-
-# After:
-from llmcompressor.utils.dev import get_main_device
-device = get_main_device()
-```
-
-### Fix 3: `src/llmcompressor/modifiers/autoround/base.py` — GPU partition in `_update_device_map_for_dp`
-
-**Bug**: Generated `"0,1"` for all ranks instead of per-rank GPU partitions.  
-**Fix**: Offset by `local_rank * gpus_per_group`.
-
-```python
-# Before:
-ar_kwargs["device_map"] = ",".join(str(i) for i in range(gpus_per_group))
-
-# After:
-local_rank = torch.distributed.get_rank()
-start_gpu = local_rank * gpus_per_group
-ar_kwargs["device_map"] = ",".join(str(start_gpu + i) for i in range(gpus_per_group))
-```
-
-### Patch 4 (monkey-patch, needs upstream in compressed-tensors): Force local cache
-
-Patches `OffloadCache.cls_from_device` to return `CPUCache`/`DeviceCache` instead of `DistributedCPUCache`/`DistributedDeviceCache`. This is correct when each rank loads the model independently.
-
-See `patch_force_local_cache()` in `test_option3_fixed.py`.
-
-### Patch 5 (monkey-patch, needs upstream in compressed-tensors): Disable onloading during quant init
-
-Wraps `initialize_module_for_quantization` with `disable_onloading()` to avoid per-parameter broadcast+barrier when new quantization parameters are created.
-
-See `patch_disable_onloading_for_quant_init()` in `test_option3_fixed.py`.
-
-## Reproduce
-
-### Prerequisites
-
-```bash
-# Environment
-source /home/yiliu7/workspace/venvs/llmc/bin/activate
-
-# Working directory
-cd /home/yiliu7/workspace/llm-compressor
-```
-
-### Run on Qwen3-8B (quick verification, ~2 minutes)
-
-```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
-    --nproc_per_node=2 \
-    examples/autoround/ddp/ddp_autoround.py \
-    --model /storage/yiliu7/Qwen/Qwen3-8B \
-    --iters 5 --nsamples 32
-```
-
-### Run on Qwen3-235B (full test, ~47 minutes)
-
-```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
-    --nproc_per_node=2 \
-    examples/autoround/ddp/ddp_autoround.py \
-    --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ \
-    --iters 20 --nsamples 32
-```
-
-### Expected behavior
-
-- Both ranks process all 94 layers in lockstep (~30s/layer on 235B)
-- All 4 GPUs show active memory usage (~56-63 GB each)
-- Each rank uses 2 GPUs: rank 0 → [0,1], rank 1 → [2,3]
-- Small NCCL idle contexts (~614 MB) appear on non-owned GPUs — this is normal
-
-### Monitor progress
-
-```bash
-# GPU utilization
-nvidia-smi --query-gpu=index,utilization.gpu,memory.used --format=csv,noheader
-
-# Layer progress (from log)
-grep "Applying AutoRound" /path/to/log | tail -6
-```
-
-## Known Issues
-
-1. **8 GPU process entries in nvidia-smi**: Each of the 2 torchrun processes creates a small NCCL context (~614 MB) on all visible GPUs. Only 4 entries are doing real work (the ~56-63 GB ones). This is unavoidable without a pre-launch wrapper that restricts `CUDA_VISIBLE_DEVICES` before Python starts.
-
-2. **OOM on layer ~11 (235B)**: With 20 iters and the full 235B model, GPU memory may be tight. Reduce `--iters` or `--nsamples` if OOM occurs.
-
-## Upstream Plan
-
-### PR 1: llm-compressor — Multi-GPU DDP device fixes
-
-**Scope**: Fixes 1–3 above. Clean code changes, no monkey-patches.
-
-**Changes**:
-- `src/llmcompressor/utils/dev.py`: `get_main_device()` uses `current_device_index()` instead of `rank`
-- `src/llmcompressor/modifiers/autoround/base.py`: 
-  - `apply_autoround` anchor device uses `get_main_device()` instead of hardcoded `cuda:0`
-  - `_update_device_map_for_dp` offsets GPU indices by `local_rank * gpus_per_group`
-
-**Testing**: Run DDP AutoRound on Qwen3-8B with 4 GPUs (2 per rank). Verify all GPUs participate and no device mismatch errors.
-
----
-
-### PR 2: compressed-tensors — Skip distributed cache when ranks have local parameters
-
-**Problem**: `OffloadCache.cls_from_device("cpu")` unconditionally returns `DistributedCPUCache` when `dist.is_initialized()`. This causes O(n_params) broadcast+barrier ops (~218ms each) even when all ranks already have parameters locally (via independent `from_pretrained` loading with safetensors mmap).
-
-**Proposed fix**: Add a `distributed` parameter to `cls_from_device` with auto-detection:
-
-```python
-# compressed_tensors/offload/cache/base.py
-
-@classmethod
-def cls_from_device(cls, device=None, distributed=None):
-    """
-    Args:
-        distributed: If None (default), auto-detect based on whether
-            dist is initialized. If False, always return local cache.
-            If True, always return distributed cache.
-    """
-    if distributed is None:
-        distributed = (
-            torch.distributed.is_initialized()
-            and torch.distributed.get_world_size() > 1
-        )
-    
-    device_type = torch.device(device).type if device != "disk" else "disk"
-    if device_type == "cpu":
-        return DistributedCPUCache if distributed else CPUCache
-    elif is_accelerator_type(device_type):
-        return DistributedDeviceCache if distributed else DeviceCache
-    elif device_type == "disk":
-        return DiskCache
-    ...
-```
-
-**Callers that should pass `distributed=False`**:
-- `set_onload_device()` when the model was loaded independently on each rank (no meta tensors)
-- Any path where the caller knows parameters are already materialized locally
-
-**Alternative approach** — context manager:
-
-```python
-# compressed_tensors/offload/cache/base.py
-
-_force_local_cache = threading.local()
-
-@contextlib.contextmanager
-def force_local_cache():
-    """Context under which cls_from_device always returns non-distributed caches."""
-    _force_local_cache.active = True
-    try:
-        yield
-    finally:
-        _force_local_cache.active = False
-
-@classmethod
-def cls_from_device(cls, device=None):
-    distributed = (
-        torch.distributed.is_initialized()
-        and torch.distributed.get_world_size() > 1
-        and not getattr(_force_local_cache, 'active', False)
-    )
-    ...
-```
-
-This lets llm-compressor wrap its pipeline with `force_local_cache()` without modifying every callsite.
-
-**Testing**: 
-- Existing tests pass (distributed cache still used by default)
-- DDP test with independent model loading uses local cache, no broadcast overhead
-
----
-
-### PR 3: compressed-tensors — Wrap quant init with `disable_onloading()`
-
-**Problem**: `initialize_module_for_quantization` creates new parameters (scale, zero_point, etc.) which immediately trigger `DistributedCPUCache.offload()` → broadcast+barrier. These parameters are created identically on every rank, so broadcasting is always redundant.
-
-**Proposed fix**: Wrap the function body with `disable_onloading()`:
-
-```python
-# compressed_tensors/quantization/lifecycle/initialize.py
-
-def initialize_module_for_quantization(module, scheme=None, force_zero_point=True):
-    with disable_onloading():
-        # ... existing implementation ...
-```
-
-**Rationale**: New quant parameters are initialized from the quantization scheme (not from model weights), so they're identical across ranks by construction. There's no information to broadcast.
-
-**Testing**: DDP quantization should show no broadcast calls during `initialize_module_for_quantization`. Single-process behavior unchanged.
-
----
-
-### Priority
-
-1. **PR 3** (highest): Universal fix, always correct, simple one-liner
-2. **PR 2** (high): Eliminates the main bottleneck for independent-loading DDP
-3. **PR 1** (medium): Required for multi-GPU-per-rank scenarios (GPUS_PER_GROUP > 1)
diff --git a/examples/autoround/ddp/FX_TRACE_ISSUE.md b/examples/autoround/ddp/FX_TRACE_ISSUE.md
deleted file mode 100644
index 62aa603d5a..0000000000
--- a/examples/autoround/ddp/FX_TRACE_ISSUE.md
+++ /dev/null
@@ -1,58 +0,0 @@
-
-# FX Trace Bottleneck in SequentialPipeline
-
-## Problem
-
-`trace_subgraphs()` builds an FX graph of the full model (O(n_modules)) before per-layer calibration. For 235B with 61K modules, this never finishes.
-
-## Scope
-
-| Modifier | Pipeline | Needs trace? | 235B hangs? |
-|----------|----------|-------------|-------------|
-| RTN | `DataFreePipeline` | No | Never |
-| AWQ | `SequentialPipeline` | Yes | Only in DDP |
-| GPTQ | `SequentialPipeline` | Yes | Only in DDP |
-| AutoRound | `SequentialPipeline` | Yes | Only in DDP |
-
-## Root cause (DDP-specific)
-
-`load_offloaded_model()` → `from_accelerate()` → `dist.broadcast_object_list([61K-entry device_map, offload_dir])` serializes a massive dict via pickle. Rank 1's `dispatch_with_map` then creates OffloadCache for all 61K modules. Without DDP, `from_accelerate` dispatches locally — no broadcast, no wait.
-
-## Loading strategies for 235B DDP
-
-| Strategy | Load time | Trace | Works? |
-|----------|-----------|-------|--------|
-| `load_offloaded_model` + `device_map="auto"` (GPU) | 420s | Fast | No — OOM (1 GPU/rank, 178GB fills completely) |
-| `load_offloaded_model` + `device_map="auto_offload"` (CPU) | 10s | Hangs | No — 61K broadcast + dispatch |
-| CPU-only + sparse offload + `fast_pipeline.py` | 9s | 5s | **Yes** |
-
-## Fixes applied
-
-1. **`helpers.py`** — Removed `disable_onloading()` from `trace_subgraphs` (allows GPU onload)
-2. **`fast_pipeline.py`** — Replaces `SequentialPipeline.__call__` with regex-based layer scanning, no FX trace. Required for 235B DDP.
-3. **`distributed.py`** — Fixed `comm_device` to use `current_device()`; returns `(block, sync_fn)`
-4. **`quantizer.py`** — Captures return, calls `sync_gradients()` before `_step()`
-5. **`base.py`** — `_get_local_gpu_group_size()` reads `GPUS_PER_GROUP`
-
-## Upstream plan
-
-The FX trace is the correct architecture — it handles arbitrary model graphs. For LLMs, a fast path that regex-matches `model.layers.*` is safe. The `fast_pipeline.py` logic should move into `helpers.py` as `trace_subgraphs_fast()`, gated by a `DatasetArguments.sequential_fast_trace` flag or auto-enabled when `module_count > threshold`.
-
-## Environment
-
-| Component | Path |
-|-----------|------|
-| Python | `/home/yiliu7/workspace/venvs/llmc/bin/python` |
-| torchrun | `/home/yiliu7/workspace/venvs/llmc/bin/torchrun` |
-| llm-compressor | `/home/yiliu7/workspace/llm-compressor` |
-| auto-round | `/home/yiliu7/workspace/ar-py` (used by venv) |
-| GPUs | 8× NVIDIA B200, 180 GiB each |
-| Test GPU subset | `CUDA_VISIBLE_DEVICES=0,1,2,3` |
-
-## Required env vars
-
-| Var | Value | Why |
-|-----|-------|-----|
-| `GPUS_PER_GROUP` | `2` | Triggers multi-GPU block dispatch + manual all_reduce sync |
-| `AR_DISABLE_DATASET_SUBPROCESS` | `1` | Avoids `fork()` with CUDA context in `calib_dataset.py` |
-| `CUDA_VISIBLE_DEVICES` | `0,1,2,3` | GPU partition (4 GPUs for 2 ranks) |
diff --git a/examples/autoround/ddp/reproduce.md b/examples/autoround/ddp/reproduce.md
deleted file mode 100644
index 099d45f523..0000000000
--- a/examples/autoround/ddp/reproduce.md
+++ /dev/null
@@ -1,99 +0,0 @@
-# Multi-GPU DDP AutoRound Reproduce
-
-## torchrun (recommended)
-
-### 8B
-
-```bash
-cd /home/yiliu7/workspace/llm-compressor
-
-bash examples/autoround/ddp/launch_torchrun.sh \
-  --model /storage/yiliu7/Qwen/Qwen3-8B \
-  --scheme W4A16 \
-  --nsamples 32 --iters 50 \
-  --disable_torch_compile
-```
-
-### 235B
-
-```bash
-cd /home/yiliu7/workspace/llm-compressor
-
-AR_DISABLE_DATASET_SUBPROCESS=1 GPUS_PER_GROUP=2 CUDA_VISIBLE_DEVICES=0,1,2,3 \
-/home/yiliu7/workspace/venvs/llmc/bin/torchrun --nproc_per_node=2 --master_port=29500 \
-examples/autoround/ddp/ddp_qwen3_multi_gpu_torchrun.py \
---model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507 \
---scheme W4A16 --nsamples 32 --iters 50 --disable_torch_compile
-```
-
-## bash wrapper (dedicated GPU isolation)
-
-```bash
-cd /home/yiliu7/workspace/llm-compressor
-
-AR_DISABLE_DATASET_SUBPROCESS=1 CUDA_VISIBLE_DEVICES=0,1,6,7 GPUS_PER_GROUP=2 NPROC=2 MASTER_PORT=29501 \
-  bash examples/autoround/ddp/launch_multi_gpu.sh \
-  ddp_qwen3_multi_gpu_example.py \
-  --model /storage/yiliu7/Qwen/Qwen3-8B \
-  --scheme W4A16 \
-  --nsamples 32 --iters 50 \
-  --disable_torch_compile \
-  > /tmp/multi_gpu_test.log 2>&1 &
-```
-
-## Monitor
-
-```bash
-tail -f /tmp/multi_gpu_test.log
-ps aux | grep ddp_qwen3_multi | grep -v grep
-nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader
-pkill -f ddp_qwen3_multi_gpu
-```
-
-## Verified
-
-### 8B (2026-06-18)
-```
-quantized 7/7 layers in the block, loss iter 0: 19.067873 -> iter 0: 19.067873
-[Rank 0] Quantization completed
-Hello my name is Mandy I am 20 years old...
-```
-All 37 decoder layers quantized, identical loss across ranks, sample generation works.
-
-### 235B (2026-06-19)
-```
-quantized 388/389 layers in the block, loss iter 0: 0.211156 -> iter 0: 0.211156
-...
-[Rank 0] Quantization completed
-```
-All 94 decoder layers quantized (388 Linear per MoE block), identical loss across ranks. ~25 min for 1 iter.
-
-## Key Files
-
-| File | Change |
-|------|--------|
-| `examples/autoround/ddp/ddp_qwen3_multi_gpu_torchrun.py` | torchrun example with patches |
-| `examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py` | bash wrapper example |
-| `examples/autoround/ddp/fast_pipeline.py` | Replaces `SequentialPipeline.__call__` — no FX trace |
-| `examples/autoround/ddp/launch_torchrun.sh` | torchrun launcher |
-| `examples/autoround/ddp/launch_multi_gpu.sh` | bash wrapper (GPU partitioning) |
-| `src/llmcompressor/modifiers/autoround/base.py` | `_get_local_gpu_group_size()` reads `GPUS_PER_GROUP` |
-| `src/llmcompressor/pipelines/sequential/helpers.py` | Removed `disable_onloading()` from `trace_subgraphs` |
-| `ar-py/auto_round/utils/distributed.py` | `setup_ddp_if_needed_` returns `(block, sync_fn)`; `current_device()` for NCCL |
-| `ar-py/auto_round/algorithms/quantization/sign_round/quantizer.py` | Captures return, calls `sync_gradients()` before `_step()` |
-
-## Required env vars
-
-| Var | Value | Why |
-|-----|-------|-----|
-| `GPUS_PER_GROUP` | `2` | Triggers multi-GPU block dispatch + manual all_reduce sync |
-| `AR_DISABLE_DATASET_SUBPROCESS` | `1` | Avoids `fork()` with CUDA context |
-| `--disable_torch_compile` | flag | torch.compile can't handle cross-device tensors |
-
-## Known issue: FX trace bottleneck
-
-`trace_subgraphs` runs an FX trace on the full model — for 61K-module models (235B) it never finishes. The `fast_pipeline.py` module bypasses this by creating subgraphs directly from decoder layer names. This affects ALL models using `SequentialPipeline`, not just DDP. The AWQ example (`qwen3_moe_example_ddp.py`) with 30B MoE also hangs.
-
-## Venv
-
-Python: `/home/yiliu7/workspace/venvs/llmc/bin/python`

From 0a7abbdf4c86ec2e6c076360b7faac08eb250daa Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Sun, 21 Jun 2026 11:40:20 +0000
Subject: [PATCH 09/22] Move get_main_device import to module level

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 src/llmcompressor/modifiers/autoround/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py
index 8f591d2343..d2a744309a 100644
--- a/src/llmcompressor/modifiers/autoround/base.py
+++ b/src/llmcompressor/modifiers/autoround/base.py
@@ -18,6 +18,7 @@
     enable_quantization,
 )
 from compressed_tensors.utils import align_module_device, match_named_modules
+from llmcompressor.utils.dev import get_main_device
 from loguru import logger
 from pydantic import PrivateAttr
 
@@ -317,7 +318,6 @@ def apply_autoround(self, state, modules):
                 # so anchoring to first_param.device can place residual modules
                 # (e.g. norms) on local cuda:1 while hidden states begin on
                 # local cuda:0, causing cross-device forward failures.
-                from llmcompressor.utils.dev import get_main_device
                 device = get_main_device()
                 # Move decoding layer to CPU first, then the submodules
                 # will be re-dispatched by AutoRound.

From 1422ebcc0305b98ee93e10bd57101eb7071e3e9c Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Sun, 21 Jun 2026 12:37:43 +0000
Subject: [PATCH 10/22] Rename ddp_autoround.py to ddp_qwen3_moe_example.py

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 .../autoround/ddp/{ddp_autoround.py => ddp_qwen3_moe_example.py}  | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename examples/autoround/ddp/{ddp_autoround.py => ddp_qwen3_moe_example.py} (100%)

diff --git a/examples/autoround/ddp/ddp_autoround.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py
similarity index 100%
rename from examples/autoround/ddp/ddp_autoround.py
rename to examples/autoround/ddp/ddp_qwen3_moe_example.py

From 3f03bc6d4d80e454c8e57d84feb2e6c9b6fb0b22 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Sun, 21 Jun 2026 12:38:48 +0000
Subject: [PATCH 11/22] fix: update example usage and iters/nsamples defaults

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 examples/autoround/ddp/ddp_qwen3_moe_example.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py
index 89f961377e..6f6a6c07bb 100644
--- a/examples/autoround/ddp/ddp_qwen3_moe_example.py
+++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py
@@ -6,9 +6,9 @@
 
 Run with:
   CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
-    --nproc_per_node=2 ddp_autoround.py \
+    --nproc_per_node=2 ddp_qwen3_moe_example.py \
     --iters 100 --nsamples 256 \
-    --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ 
+    --model Qwen/Qwen3-235B-A22B-Instruct-2507
 """
 
 import argparse
@@ -81,8 +81,8 @@ def cls_from_device_local(cls, device=None):
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", type=str, required=True)
     parser.add_argument("--scheme", type=str, default="W4A16")
-    parser.add_argument("--iters", type=int, default=5)
-    parser.add_argument("--nsamples", type=int, default=128)
+    parser.add_argument("--iters", type=int, default=100)
+    parser.add_argument("--nsamples", type=int, default=256)
     args = parser.parse_args()
 
     ###### DDP INIT #####

From 2db2a84478e90ae8175e4e914c7807c68240a4f7 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Sun, 21 Jun 2026 12:55:41 +0000
Subject: [PATCH 12/22] update: add and correct FIXME notes on temp patches

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 examples/autoround/ddp/ddp_qwen3_moe_example.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py
index 6f6a6c07bb..40c1f694e3 100644
--- a/examples/autoround/ddp/ddp_qwen3_moe_example.py
+++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py
@@ -23,7 +23,7 @@
 
 from llmcompressor import oneshot
 
-
+#  FIXME: (yiliu30) remove this patch before merging 
 def patch_disable_onloading_for_quant_init():
     """Avoid dist.broadcast + barrier for every new quant parameter.
 
@@ -106,7 +106,7 @@ def cls_from_device_local(cls, device=None):
     )
 
     # Apply patches BEFORE model loading and calibration
-    # FIXME: (yiliu30) remove these patched before merging once the underlying issues are fixed
+    # FIXME: (yiliu30) remove these patches before merging once the underlying issues are fixed
     patch_disable_onloading_for_quant_init()
     patch_force_local_cache()
 

From 776115343166d9b484da299721b78a7bbd2cdde9 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Mon, 22 Jun 2026 05:48:15 +0000
Subject: [PATCH 13/22] Simplify ddp_qwen3_moe_example: remove argparse,
 hardcode model config

Use force_local_cache() from compressed-tensors instead of monkey-patches
---
 .../autoround/ddp/ddp_qwen3_moe_example.py    | 112 ++++--------------
 1 file changed, 25 insertions(+), 87 deletions(-)

diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py
index 40c1f694e3..0271baf61d 100644
--- a/examples/autoround/ddp/ddp_qwen3_moe_example.py
+++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py
@@ -6,13 +6,9 @@
 
 Run with:
   CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
-    --nproc_per_node=2 ddp_qwen3_moe_example.py \
-    --iters 100 --nsamples 256 \
-    --model Qwen/Qwen3-235B-A22B-Instruct-2507
+    --nproc_per_node=2 ddp_qwen3_moe_example.py
 """
 
-import argparse
-import importlib
 import os
 import time
 
@@ -22,68 +18,14 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
-
-#  FIXME: (yiliu30) remove this patch before merging 
-def patch_disable_onloading_for_quant_init():
-    """Avoid dist.broadcast + barrier for every new quant parameter.
-
-    compressed-tensors' initialize_module_for_quantization creates new
-    parameters which trigger DistributedCPUCache's per-param broadcast.
-    Wrapping with disable_onloading() prevents this.
-    """
-    from compressed_tensors.offload import disable_onloading
-
-    lifecycle_init_mod = importlib.import_module(
-        "compressed_tensors.quantization.lifecycle.initialize"
-    )
-    original_fn = lifecycle_init_mod.initialize_module_for_quantization
-    if getattr(original_fn, "_patched", False):
-        return
-
-    def patched(module, scheme=None, force_zero_point=True):
-        with disable_onloading():
-            return original_fn(module, scheme=scheme, force_zero_point=force_zero_point)
-
-    patched._patched = True
-    lifecycle_init_mod.initialize_module_for_quantization = patched
-
-
-def patch_force_local_cache():
-    """Force OffloadCache.cls_from_device to return non-distributed caches.
-
-    When ranks load the model independently, each already has parameters
-    locally. DistributedCPUCache's per-param broadcast+barrier is
-    unnecessary and causes O(n_params) collective ops (~218ms each).
-    """
-    from compressed_tensors.offload.cache.base import OffloadCache
-    from compressed_tensors.offload.cache.cpu import CPUCache
-    from compressed_tensors.offload.cache.device import DeviceCache
-    from compressed_tensors.offload.cache.disk import DiskCache
-    from compressed_tensors.utils import is_accelerator_type
-
-    @classmethod
-    def cls_from_device_local(cls, device=None):
-        device_type = torch.device(device).type if device != "disk" else "disk"
-        if device_type == "cpu":
-            return CPUCache
-        elif is_accelerator_type(device_type):
-            return DeviceCache
-        elif device_type == "disk":
-            return DiskCache
-        else:
-            raise NotImplementedError(f"Offload of type {device_type} not implemented")
-
-    OffloadCache.cls_from_device = cls_from_device_local
-    logger.info("Patched OffloadCache.cls_from_device → local (non-distributed) caches")
+from compressed_tensors.offload.cache.base import force_local_cache
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str, required=True)
-    parser.add_argument("--scheme", type=str, default="W4A16")
-    parser.add_argument("--iters", type=int, default=100)
-    parser.add_argument("--nsamples", type=int, default=256)
-    args = parser.parse_args()
+    MODEL = "/storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507"
+    SCHEME = "W4A16"
+    ITERS = 100
+    NSAMPLES = 256
 
     ###### DDP INIT #####
     gpus_per_group = int(os.environ.get("GPUS_PER_GROUP", "1"))
@@ -105,48 +47,44 @@ def cls_from_device_local(cls, device=None):
         f"main_gpu: {main_gpu}, group: [{main_gpu}-{main_gpu + gpus_per_group - 1}]"
     )
 
-    # Apply patches BEFORE model loading and calibration
-    # FIXME: (yiliu30) remove these patches before merging once the underlying issues are fixed
-    patch_disable_onloading_for_quant_init()
-    patch_force_local_cache()
-
     ###### MODEL LOAD #####
     load_start = time.perf_counter()
-    model = AutoModelForCausalLM.from_pretrained(args.model, dtype="auto")
+    model = AutoModelForCausalLM.from_pretrained(MODEL, dtype="auto")
     load_elapsed = time.perf_counter() - load_start
     logger.info(f"[Rank {rank}] Model loaded on CPU in {load_elapsed:.1f}s")
 
-    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    tokenizer = AutoTokenizer.from_pretrained(MODEL)
 
     ###### DATASET #####
     os.environ["AR_DISABLE_DATASET_SUBPROCESS"] = "1"
     from auto_round.calib_dataset import get_dataset
     from llmcompressor.modifiers.autoround import AutoRoundModifier
 
-    ds = get_dataset(tokenizer=tokenizer, seqlen=2048, nsamples=args.nsamples)
+    ds = get_dataset(tokenizer=tokenizer, seqlen=2048, nsamples=NSAMPLES)
 
     ###### RECIPE #####
     recipe = AutoRoundModifier(
         targets="Linear",
-        scheme=args.scheme,
+        scheme=SCHEME,
         ignore=["lm_head", "re:.*mlp.gate$"],
-        iters=args.iters,
+        iters=ITERS,
         enable_torch_compile=False,
     )
 
     ###### QUANTIZE #####
     logger.info(f"[Rank {rank}] Starting oneshot...")
     quant_start = time.perf_counter()
-    oneshot(
-        model=model,
-        dataset=ds,
-        recipe=recipe,
-        max_seq_length=2048,
-        num_calibration_samples=args.nsamples,
-        shuffle_calibration_samples=False,
-    )
-    quant_elapsed = time.perf_counter() - quant_start
-    logger.info(f"[Rank {rank}] Quantization done in {quant_elapsed:.1f}s")
+    with force_local_cache():
+        oneshot(
+            model=model,
+            dataset=ds,
+            recipe=recipe,
+            max_seq_length=2048,
+            num_calibration_samples=NSAMPLES,
+            shuffle_calibration_samples=False,
+        )
+        quant_elapsed = time.perf_counter() - quant_start
+        logger.info(f"[Rank {rank}] Quantization done in {quant_elapsed:.1f}s")
 
     if dist.is_initialized():
         dist.barrier()
@@ -160,9 +98,9 @@ def cls_from_device_local(cls, device=None):
     if rank == 0:
         save_dir = (
              "/storage/yiliu7/Qwen/"
-             + args.model.rstrip("/").split("/")[-1]
-            + f"-{args.scheme}-AutoRound"
-            + f"-iters{args.iters}-nsamples{args.nsamples}"
+             + MODEL.rstrip("/").split("/")[-1]
+            + f"-{SCHEME}-AutoRound"
+            + f"-iters{ITERS}-nsamples{NSAMPLES}"
             + f"-DDP{world_size}"
         )
         logger.info(f"Saving to {save_dir}...")

From 9e63922b6ca838c3da83a932b1eb38c3d314db9d Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Mon, 22 Jun 2026 06:02:12 +0000
Subject: [PATCH 14/22] Remove __main__ guard, fix quant_elapsed scope

---
 .../autoround/ddp/ddp_qwen3_moe_example.py    | 194 +++++++++---------
 1 file changed, 95 insertions(+), 99 deletions(-)

diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py
index 0271baf61d..4942de2c2f 100644
--- a/examples/autoround/ddp/ddp_qwen3_moe_example.py
+++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py
@@ -20,105 +20,101 @@
 from llmcompressor import oneshot
 from compressed_tensors.offload.cache.base import force_local_cache
 
-
-if __name__ == "__main__":
-    MODEL = "/storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507"
-    SCHEME = "W4A16"
-    ITERS = 100
-    NSAMPLES = 256
-
-    ###### DDP INIT #####
-    gpus_per_group = int(os.environ.get("GPUS_PER_GROUP", "1"))
-    if "TORCHELASTIC_RUN_ID" in os.environ:
-        local_rank = int(os.environ["LOCAL_RANK"])
-        main_gpu = local_rank * gpus_per_group
-        torch.cuda.set_device(main_gpu)
-        dist.init_process_group(
-            backend="nccl",
-            init_method="env://",
-            device_id=torch.device(f"cuda:{main_gpu}"),
-        )
-
-    rank = dist.get_rank() if dist.is_initialized() else 0
-    world_size = dist.get_world_size() if dist.is_initialized() else 1
-    main_gpu = rank * gpus_per_group
-    logger.info(
-        f"[Rank {rank}/{world_size}] GPUs: {torch.cuda.device_count()}, "
-        f"main_gpu: {main_gpu}, group: [{main_gpu}-{main_gpu + gpus_per_group - 1}]"
+MODEL = "/storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507"
+SCHEME = "W4A16"
+ITERS = 100
+NSAMPLES = 256
+
+###### DDP INIT #####
+gpus_per_group = int(os.environ.get("GPUS_PER_GROUP", "1"))
+if "TORCHELASTIC_RUN_ID" in os.environ:
+    local_rank = int(os.environ["LOCAL_RANK"])
+    main_gpu = local_rank * gpus_per_group
+    torch.cuda.set_device(main_gpu)
+    dist.init_process_group(
+        backend="nccl",
+        init_method="env://",
+        device_id=torch.device(f"cuda:{main_gpu}"),
     )
 
-    ###### MODEL LOAD #####
-    load_start = time.perf_counter()
-    model = AutoModelForCausalLM.from_pretrained(MODEL, dtype="auto")
-    load_elapsed = time.perf_counter() - load_start
-    logger.info(f"[Rank {rank}] Model loaded on CPU in {load_elapsed:.1f}s")
-
-    tokenizer = AutoTokenizer.from_pretrained(MODEL)
-
-    ###### DATASET #####
-    os.environ["AR_DISABLE_DATASET_SUBPROCESS"] = "1"
-    from auto_round.calib_dataset import get_dataset
-    from llmcompressor.modifiers.autoround import AutoRoundModifier
-
-    ds = get_dataset(tokenizer=tokenizer, seqlen=2048, nsamples=NSAMPLES)
-
-    ###### RECIPE #####
-    recipe = AutoRoundModifier(
-        targets="Linear",
-        scheme=SCHEME,
-        ignore=["lm_head", "re:.*mlp.gate$"],
-        iters=ITERS,
-        enable_torch_compile=False,
+rank = dist.get_rank() if dist.is_initialized() else 0
+world_size = dist.get_world_size() if dist.is_initialized() else 1
+main_gpu = rank * gpus_per_group
+logger.info(
+    f"[Rank {rank}/{world_size}] GPUs: {torch.cuda.device_count()}, "
+    f"main_gpu: {main_gpu}, group: [{main_gpu}-{main_gpu + gpus_per_group - 1}]"
+)
+
+###### MODEL LOAD #####
+load_start = time.perf_counter()
+model = AutoModelForCausalLM.from_pretrained(MODEL, dtype="auto")
+load_elapsed = time.perf_counter() - load_start
+logger.info(f"[Rank {rank}] Model loaded on CPU in {load_elapsed:.1f}s")
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL)
+
+###### DATASET #####
+os.environ["AR_DISABLE_DATASET_SUBPROCESS"] = "1"
+from auto_round.calib_dataset import get_dataset
+from llmcompressor.modifiers.autoround import AutoRoundModifier
+
+ds = get_dataset(tokenizer=tokenizer, seqlen=2048, nsamples=NSAMPLES)
+
+###### RECIPE #####
+recipe = AutoRoundModifier(
+    targets="Linear",
+    scheme=SCHEME,
+    ignore=["lm_head", "re:.*mlp.gate$"],
+    iters=ITERS,
+    enable_torch_compile=False,
+)
+
+###### QUANTIZE #####
+logger.info(f"[Rank {rank}] Starting oneshot...")
+quant_start = time.perf_counter()
+with force_local_cache():
+    oneshot(
+        model=model,
+        dataset=ds,
+        recipe=recipe,
+        max_seq_length=2048,
+        num_calibration_samples=NSAMPLES,
+        shuffle_calibration_samples=False,
     )
-
-    ###### QUANTIZE #####
-    logger.info(f"[Rank {rank}] Starting oneshot...")
-    quant_start = time.perf_counter()
-    with force_local_cache():
-        oneshot(
-            model=model,
-            dataset=ds,
-            recipe=recipe,
-            max_seq_length=2048,
-            num_calibration_samples=NSAMPLES,
-            shuffle_calibration_samples=False,
-        )
-        quant_elapsed = time.perf_counter() - quant_start
-        logger.info(f"[Rank {rank}] Quantization done in {quant_elapsed:.1f}s")
-
-    if dist.is_initialized():
-        dist.barrier()
-
-
-    ###### SAVE (rank 0 only) #####
-    if dist.is_initialized():
-        dist.barrier()
-        dist.destroy_process_group()
-
-    if rank == 0:
-        save_dir = (
-             "/storage/yiliu7/Qwen/"
-             + MODEL.rstrip("/").split("/")[-1]
-            + f"-{SCHEME}-AutoRound"
-            + f"-iters{ITERS}-nsamples{NSAMPLES}"
-            + f"-DDP{world_size}"
-        )
-        logger.info(f"Saving to {save_dir}...")
-        model.save_pretrained(save_dir, save_compressed=True)
-        tokenizer.save_pretrained(save_dir)
-        logger.info(f"Saved to {save_dir}")
-
-    ###### SAMPLE GENERATION (rank 0 only) #####
-    if rank == 0:
-        from compressed_tensors.offload import dispatch_model
-
-        logger.info("========== SAMPLE GENERATION ==============")
-        dispatch_model(model)
-        sample = tokenizer("Hello my name is", return_tensors="pt")
-        sample = {key: value.to(model.device) for key, value in sample.items()}
-        output = model.generate(**sample, max_new_tokens=100)
-        logger.info(tokenizer.decode(output[0]))
-        logger.info("==========================================")
-
-
-    logger.info(f"[Rank {rank}] SUCCESS")
+quant_elapsed = time.perf_counter() - quant_start
+logger.info(f"[Rank {rank}] Quantization done in {quant_elapsed:.1f}s")
+
+if dist.is_initialized():
+    dist.barrier()
+
+###### SAVE (rank 0 only) #####
+if dist.is_initialized():
+    dist.barrier()
+    dist.destroy_process_group()
+
+if rank == 0:
+    save_dir = (
+         "/storage/yiliu7/Qwen/"
+         + MODEL.rstrip("/").split("/")[-1]
+        + f"-{SCHEME}-AutoRound"
+        + f"-iters{ITERS}-nsamples{NSAMPLES}"
+        + f"-DDP{world_size}"
+    )
+    logger.info(f"Saving to {save_dir}...")
+    model.save_pretrained(save_dir, save_compressed=True)
+    tokenizer.save_pretrained(save_dir)
+    logger.info(f"Saved to {save_dir}")
+
+###### SAMPLE GENERATION (rank 0 only) #####
+if rank == 0:
+    from compressed_tensors.offload import dispatch_model
+
+    logger.info("========== SAMPLE GENERATION ==============")
+    dispatch_model(model)
+    sample = tokenizer("Hello my name is", return_tensors="pt")
+    sample = {key: value.to(model.device) for key, value in sample.items()}
+    output = model.generate(**sample, max_new_tokens=100)
+    logger.info(tokenizer.decode(output[0]))
+    logger.info("==========================================")
+
+logger.info(f"[Rank {rank}] SUCCESS")

From 0cae4061ddd451bd4f5bfe657238305cb9d28a1c Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Mon, 22 Jun 2026 06:17:34 +0000
Subject: [PATCH 15/22] =?UTF-8?q?Remove=20TORCHELASTIC=5FRUN=5FID=20guard?=
 =?UTF-8?q?=20=E2=80=94=20always=20run=20via=20torchrun?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../autoround/ddp/ddp_qwen3_moe_example.py    | 21 +++++++++----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py
index 4942de2c2f..f6707ff272 100644
--- a/examples/autoround/ddp/ddp_qwen3_moe_example.py
+++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py
@@ -27,18 +27,17 @@
 
 ###### DDP INIT #####
 gpus_per_group = int(os.environ.get("GPUS_PER_GROUP", "1"))
-if "TORCHELASTIC_RUN_ID" in os.environ:
-    local_rank = int(os.environ["LOCAL_RANK"])
-    main_gpu = local_rank * gpus_per_group
-    torch.cuda.set_device(main_gpu)
-    dist.init_process_group(
-        backend="nccl",
-        init_method="env://",
-        device_id=torch.device(f"cuda:{main_gpu}"),
-    )
+local_rank = int(os.environ["LOCAL_RANK"])
+main_gpu = local_rank * gpus_per_group
+torch.cuda.set_device(main_gpu)
+dist.init_process_group(
+    backend="nccl",
+    init_method="env://",
+    device_id=torch.device(f"cuda:{main_gpu}"),
+)
 
-rank = dist.get_rank() if dist.is_initialized() else 0
-world_size = dist.get_world_size() if dist.is_initialized() else 1
+rank = dist.get_rank()
+world_size = dist.get_world_size()
 main_gpu = rank * gpus_per_group
 logger.info(
     f"[Rank {rank}/{world_size}] GPUs: {torch.cuda.device_count()}, "

From 8830293437911bf519e0840f6736e0fda731f073 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Tue, 23 Jun 2026 14:58:03 +0000
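Subject: [PATCH 16/22] Add temporary memory-usage dump to AutoRound
 apply_autoround

Debugging aid for the DistributedCPUCache per-param broadcast
investigation (each scale/zero_point register_parameter triggers a
collective op during mass quantization init). Adds fmt_bytes() and
dump_memory_usage() to base.py and calls dump_memory_usage() inside
apply_autoround. psutil is imported with try/except ImportError so the
helper still runs (printing an install hint) when psutil is not
installed. No force_local_cache wrapping is included in this diff.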
---
 src/llmcompressor/modifiers/autoround/base.py | 73 +++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py
index d2a744309a..d899ddf6f9 100644
--- a/src/llmcompressor/modifiers/autoround/base.py
+++ b/src/llmcompressor/modifiers/autoround/base.py
@@ -92,6 +92,78 @@ def suspend_offloading(model: nn.Module):
         offload_module(module, *offloading_info[name])
 
 
+import os
+import torch
+
+try:
+    import psutil
+except ImportError:
+    psutil = None
+
+
+def fmt_bytes(num_bytes: int) -> str:
+    gb = num_bytes / 1024**3
+    return f"{gb:.2f} GB"
+
+
+def dump_memory_usage():
+    print("=" * 80)
+    print("CPU Memory")
+    print("=" * 80)
+
+    if psutil is not None:
+        proc = psutil.Process(os.getpid())
+        rss = proc.memory_info().rss
+        vms = proc.memory_info().vms
+        sys_mem = psutil.virtual_memory()
+
+        print(f"Process RSS      : {fmt_bytes(rss)}")
+        print(f"Process VMS      : {fmt_bytes(vms)}")
+        print(f"System Used      : {fmt_bytes(sys_mem.used)} / {fmt_bytes(sys_mem.total)}")
+        print(f"System Available : {fmt_bytes(sys_mem.available)}")
+    else:
+        print("psutil is not installed. Install with: pip install psutil")
+
+    print()
+    print("=" * 80)
+    print("CUDA Memory")
+    print("=" * 80)
+
+    if not torch.cuda.is_available():
+        print("CUDA is not available.")
+        return
+
+    num_devices = torch.cuda.device_count()
+    print(f"CUDA devices: {num_devices}")
+
+    for i in range(num_devices):
+        props = torch.cuda.get_device_properties(i)
+
+        allocated = torch.cuda.memory_allocated(i)
+        reserved = torch.cuda.memory_reserved(i)
+        max_allocated = torch.cuda.max_memory_allocated(i)
+        max_reserved = torch.cuda.max_memory_reserved(i)
+
+        free, total = torch.cuda.mem_get_info(i)
+        used_total = total - free
+
+        print()
+        print(f"[cuda:{i}] {props.name}")
+        print(f"  Total memory        : {fmt_bytes(total)}")
+        print(f"  Free memory         : {fmt_bytes(free)}")
+        print(f"  Used memory         : {fmt_bytes(used_total)}")
+        print(f"  Torch allocated     : {fmt_bytes(allocated)}")
+        print(f"  Torch reserved      : {fmt_bytes(reserved)}")
+        print(f"  Max allocated       : {fmt_bytes(max_allocated)}")
+        print(f"  Max reserved        : {fmt_bytes(max_reserved)}")
+
+    print("=" * 80)
+
+
+# if __name__ == "__main__":
+#     dump_memory_usage()
+
+
 class AutoRoundModifier(Modifier, QuantizationMixin):
     """
     Implements the AutoRound algorithm from https://aclanthology.org/2024.findings-emnlp.662.pdf.
@@ -292,6 +364,7 @@ def apply_autoround(self, state, modules):
             align_module_device(decoding_layer),
             suspend_offloading(wrapped_model),
         ):
+            dump_memory_usage()
             self._update_device_map_for_dp(kwargs)
             ar = AutoRound(
                 model=wrapped_model,

From be1b04bf3b7854bfce89eec9911e4a1eb023fb61 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Wed, 24 Jun 2026 03:09:29 +0000
Subject: [PATCH 17/22] Clean AutoRound DDP: standard load_offloaded_model,
 remove force_local_cache

- Remove debug memory dump code from base.py
- Remove force_local_cache usage (the wrapper around oneshot() in the example; matches GPTQ pattern)
- Standard load_offloaded_model + auto_offload in example
- Verified on 30B (49 layers) and 235B (first 2 layers)
---
 .../autoround/ddp/ddp_qwen3_moe_example.py    | 43 ++++++-----
 src/llmcompressor/modifiers/autoround/base.py | 73 -------------------
 2 files changed, 21 insertions(+), 95 deletions(-)

diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py
index f6707ff272..04d28ad739 100644
--- a/examples/autoround/ddp/ddp_qwen3_moe_example.py
+++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py
@@ -1,8 +1,9 @@
 """
 DDP AutoRound quantization example for large MoE models.
 
-Runs 2 ranks, each using GPUS_PER_GROUP GPUs. All ranks load the model
-independently on CPU (safetensors mmap shares physical pages at OS level).
+Uses the standard compressed-tensors DDP path: load_offloaded_model()
+broadcasts weights from rank 0 to rank 1.  GPUS_PER_GROUP controls how
+many GPUs each rank uses for per-block model parallelism.
 
 Run with:
   CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
@@ -14,15 +15,14 @@
 
 import torch
 import torch.distributed as dist
+from compressed_tensors.offload import load_offloaded_model
 from loguru import logger
 from transformers import AutoModelForCausalLM, AutoTokenizer
-
 from llmcompressor import oneshot
-from compressed_tensors.offload.cache.base import force_local_cache
 
 MODEL = "/storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507"
 SCHEME = "W4A16"
-ITERS = 100
+ITERS = 200
 NSAMPLES = 256
 
 ###### DDP INIT #####
@@ -38,7 +38,6 @@
 
 rank = dist.get_rank()
 world_size = dist.get_world_size()
-main_gpu = rank * gpus_per_group
 logger.info(
     f"[Rank {rank}/{world_size}] GPUs: {torch.cuda.device_count()}, "
     f"main_gpu: {main_gpu}, group: [{main_gpu}-{main_gpu + gpus_per_group - 1}]"
@@ -46,9 +45,11 @@
 
 ###### MODEL LOAD #####
 load_start = time.perf_counter()
-model = AutoModelForCausalLM.from_pretrained(MODEL, dtype="auto")
-load_elapsed = time.perf_counter() - load_start
-logger.info(f"[Rank {rank}] Model loaded on CPU in {load_elapsed:.1f}s")
+with load_offloaded_model():
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL, dtype="auto", device_map="auto_offload",
+    )
+logger.info(f"[Rank {rank}] Loaded in {time.perf_counter() - load_start:.1f}s")
 
 tokenizer = AutoTokenizer.from_pretrained(MODEL)
 
@@ -60,6 +61,7 @@
 ds = get_dataset(tokenizer=tokenizer, seqlen=2048, nsamples=NSAMPLES)
 
 ###### RECIPE #####
+
 recipe = AutoRoundModifier(
     targets="Linear",
     scheme=SCHEME,
@@ -71,17 +73,15 @@
 ###### QUANTIZE #####
 logger.info(f"[Rank {rank}] Starting oneshot...")
 quant_start = time.perf_counter()
-with force_local_cache():
-    oneshot(
-        model=model,
-        dataset=ds,
-        recipe=recipe,
-        max_seq_length=2048,
-        num_calibration_samples=NSAMPLES,
-        shuffle_calibration_samples=False,
-    )
-quant_elapsed = time.perf_counter() - quant_start
-logger.info(f"[Rank {rank}] Quantization done in {quant_elapsed:.1f}s")
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=2048,
+    num_calibration_samples=NSAMPLES,
+    shuffle_calibration_samples=False,
+)
+logger.info(f"[Rank {rank}] Quantization done in {time.perf_counter() - quant_start:.1f}s")
 
 if dist.is_initialized():
     dist.barrier()
@@ -93,8 +93,7 @@
 
 if rank == 0:
     save_dir = (
-         "/storage/yiliu7/Qwen/"
-         + MODEL.rstrip("/").split("/")[-1]
+        MODEL.rstrip("/").split("/")[-1]
         + f"-{SCHEME}-AutoRound"
         + f"-iters{ITERS}-nsamples{NSAMPLES}"
         + f"-DDP{world_size}"
diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py
index d899ddf6f9..d2a744309a 100644
--- a/src/llmcompressor/modifiers/autoround/base.py
+++ b/src/llmcompressor/modifiers/autoround/base.py
@@ -92,78 +92,6 @@ def suspend_offloading(model: nn.Module):
         offload_module(module, *offloading_info[name])
 
 
-import os
-import torch
-
-try:
-    import psutil
-except ImportError:
-    psutil = None
-
-
-def fmt_bytes(num_bytes: int) -> str:
-    gb = num_bytes / 1024**3
-    return f"{gb:.2f} GB"
-
-
-def dump_memory_usage():
-    print("=" * 80)
-    print("CPU Memory")
-    print("=" * 80)
-
-    if psutil is not None:
-        proc = psutil.Process(os.getpid())
-        rss = proc.memory_info().rss
-        vms = proc.memory_info().vms
-        sys_mem = psutil.virtual_memory()
-
-        print(f"Process RSS      : {fmt_bytes(rss)}")
-        print(f"Process VMS      : {fmt_bytes(vms)}")
-        print(f"System Used      : {fmt_bytes(sys_mem.used)} / {fmt_bytes(sys_mem.total)}")
-        print(f"System Available : {fmt_bytes(sys_mem.available)}")
-    else:
-        print("psutil is not installed. Install with: pip install psutil")
-
-    print()
-    print("=" * 80)
-    print("CUDA Memory")
-    print("=" * 80)
-
-    if not torch.cuda.is_available():
-        print("CUDA is not available.")
-        return
-
-    num_devices = torch.cuda.device_count()
-    print(f"CUDA devices: {num_devices}")
-
-    for i in range(num_devices):
-        props = torch.cuda.get_device_properties(i)
-
-        allocated = torch.cuda.memory_allocated(i)
-        reserved = torch.cuda.memory_reserved(i)
-        max_allocated = torch.cuda.max_memory_allocated(i)
-        max_reserved = torch.cuda.max_memory_reserved(i)
-
-        free, total = torch.cuda.mem_get_info(i)
-        used_total = total - free
-
-        print()
-        print(f"[cuda:{i}] {props.name}")
-        print(f"  Total memory        : {fmt_bytes(total)}")
-        print(f"  Free memory         : {fmt_bytes(free)}")
-        print(f"  Used memory         : {fmt_bytes(used_total)}")
-        print(f"  Torch allocated     : {fmt_bytes(allocated)}")
-        print(f"  Torch reserved      : {fmt_bytes(reserved)}")
-        print(f"  Max allocated       : {fmt_bytes(max_allocated)}")
-        print(f"  Max reserved        : {fmt_bytes(max_reserved)}")
-
-    print("=" * 80)
-
-
-# if __name__ == "__main__":
-#     dump_memory_usage()
-
-
 class AutoRoundModifier(Modifier, QuantizationMixin):
     """
     Implements the AutoRound algorithm from https://aclanthology.org/2024.findings-emnlp.662.pdf.
@@ -364,7 +292,6 @@ def apply_autoround(self, state, modules):
             align_module_device(decoding_layer),
             suspend_offloading(wrapped_model),
         ):
-            dump_memory_usage()
             self._update_device_map_for_dp(kwargs)
             ar = AutoRound(
                 model=wrapped_model,

From 8ac6e2f5dd90d20f1808202481a3f1e59a6059e0 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Wed, 24 Jun 2026 13:50:44 +0000
Subject: [PATCH 18/22] fix: prevent broadcast deadlock in AutoRound DDP
 on_initialize

Wrap QuantizationMixin.initialize_quantization with disable_onloading()
to suppress DistributedCPUCache's per-param broadcast_object_list+barrier
when creating quant params (scale, zero_point).

Root cause: with GPUS_PER_GROUP=2, device_map='auto_offload' assigns
modules to different GPUs. initialize_qparams creates tensors on varying
devices, causing GPU->CPU copy timing to differ between ranks. The paired
broadcast_object_list calls desync -> deadlock at barrier.

disable_onloading() bypasses the distributed path entirely. Quant params
are deterministic across ranks (computed from the same scheme), so no
synchronization is needed.

Also fix example: save model before destroy_process_group (save_pretrained
internally uses broadcast_object_list).
---
 examples/autoround/ddp/DDP_FIXES.md           | 237 ++++++++++++++++++
 examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md | 113 +++++++++
 .../autoround/ddp/ddp_qwen3_moe_example.py    |  12 +-
 examples/autoround/ddp/reproduce.md           |  99 ++++++++
 src/llmcompressor/modifiers/autoround/base.py |  32 ++-
 5 files changed, 484 insertions(+), 9 deletions(-)
 create mode 100644 examples/autoround/ddp/DDP_FIXES.md
 create mode 100644 examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md
 create mode 100644 examples/autoround/ddp/reproduce.md

diff --git a/examples/autoround/ddp/DDP_FIXES.md b/examples/autoround/ddp/DDP_FIXES.md
new file mode 100644
index 0000000000..50e9e7352d
--- /dev/null
+++ b/examples/autoround/ddp/DDP_FIXES.md
@@ -0,0 +1,237 @@
+# DDP Multi-GPU AutoRound Fixes for Large MoE Models
+
+## Problem
+
+Running AutoRound quantization with DDP on large MoE models (e.g., Qwen3-235B) would hang or take hours due to `DistributedCPUCache` performing a `dist.broadcast_object_list()` + `dist.barrier()` **per parameter** during offload operations (~218ms × 45K params = ~163 minutes).
+
+## Root Cause
+
+When `dist.is_initialized()`, `OffloadCache.cls_from_device("cpu")` returns `DistributedCPUCache` instead of `CPUCache`. This cache broadcasts every tensor to all ranks — unnecessary when each rank loads the model independently via safetensors mmap.
+
+The bottleneck hits in two places:
+1. `from_accelerate()` → `dispatch_with_map()` 
+2. `set_onload_device()` in SequentialPipeline
+
+## Fixes Applied
+
+### Fix 1: `src/llmcompressor/utils/dev.py` — `get_main_device()` 
+
+**Bug**: Used `rank` as the CUDA device index, which is wrong when `GPUS_PER_GROUP > 1`.  
+**Fix**: Use `torch.accelerator.current_device_index()` which respects `torch.cuda.set_device()`.
+
+```python
+# Before (line 140):
+return torch.device(accel_type, rank)
+
+# After:
+return torch.device(accel_type, torch.accelerator.current_device_index())
+```
+
+### Fix 2: `src/llmcompressor/modifiers/autoround/base.py` — anchor device in `apply_autoround`
+
+**Bug**: Hardcoded `device = torch.device("cuda:0")` when `needs_multi_gpu` is true. Rank 1 with GPUs [2,3] would try to anchor on cuda:0 instead of cuda:2.  
+**Fix**: Use `get_main_device()` which returns the correct per-rank device.
+
+```python
+# Before (line ~329):
+device = torch.device("cuda:0")
+
+# After:
+from llmcompressor.utils.dev import get_main_device
+device = get_main_device()
+```
+
+### Fix 3: `src/llmcompressor/modifiers/autoround/base.py` — GPU partition in `_update_device_map_for_dp`
+
+**Bug**: Generated `"0,1"` for all ranks instead of per-rank GPU partitions.  
+**Fix**: Offset by `local_rank * gpus_per_group`.
+
+```python
+# Before:
+ar_kwargs["device_map"] = ",".join(str(i) for i in range(gpus_per_group))
+
+# After:
+local_rank = torch.distributed.get_rank()
+start_gpu = local_rank * gpus_per_group
+ar_kwargs["device_map"] = ",".join(str(start_gpu + i) for i in range(gpus_per_group))
+```
+
+### Patch 4 (monkey-patch, needs upstream in compressed-tensors): Force local cache
+
+Patches `OffloadCache.cls_from_device` to return `CPUCache`/`DeviceCache` instead of `DistributedCPUCache`/`DistributedDeviceCache`. This is correct when each rank loads the model independently.
+
+See `patch_force_local_cache()` in `test_option3_fixed.py`.
+
+### Patch 5 (monkey-patch, needs upstream in compressed-tensors): Disable onloading during quant init
+
+Wraps `initialize_module_for_quantization` with `disable_onloading()` to avoid per-parameter broadcast+barrier when new quantization parameters are created.
+
+See `patch_disable_onloading_for_quant_init()` in `test_option3_fixed.py`.
+
+## Reproduce
+
+### Prerequisites
+
+```bash
+# Environment
+source /home/yiliu7/workspace/venvs/llmc/bin/activate
+
+# Working directory
+cd /home/yiliu7/workspace/llm-compressor
+```
+
+### Run on Qwen3-8B (quick verification, ~2 minutes)
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
+    --nproc_per_node=2 \
+    examples/autoround/ddp/ddp_autoround.py \
+    --model /storage/yiliu7/Qwen/Qwen3-8B \
+    --iters 5 --nsamples 32
+```
+
+### Run on Qwen3-235B (full test, ~47 minutes)
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
+    --nproc_per_node=2 \
+    examples/autoround/ddp/ddp_autoround.py \
+    --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ \
+    --iters 20 --nsamples 32
+```
+
+### Expected behavior
+
+- Both ranks process all 94 layers in lockstep (~30s/layer on 235B)
+- All 4 GPUs show active memory usage (~56-63 GB each)
+- Each rank uses 2 GPUs: rank 0 → [0,1], rank 1 → [2,3]
+- Small NCCL idle contexts (~614 MB) appear on non-owned GPUs — this is normal
+
+### Monitor progress
+
+```bash
+# GPU utilization
+nvidia-smi --query-gpu=index,utilization.gpu,memory.used --format=csv,noheader
+
+# Layer progress (from log)
+grep "Applying AutoRound" /path/to/log | tail -6
+```
+
+## Known Issues
+
+1. **8 GPU process entries in nvidia-smi**: Each of the 2 torchrun processes creates a small NCCL context (~614 MB) on all visible GPUs. Only 4 entries are doing real work (the ~56-63 GB ones). This is unavoidable without a pre-launch wrapper that restricts `CUDA_VISIBLE_DEVICES` before Python starts.
+
+2. **OOM on layer ~11 (235B)**: With 20 iters and the full 235B model, GPU memory may be tight. Reduce `--iters` or `--nsamples` if OOM occurs.
+
+## Upstream Plan
+
+### PR 1: llm-compressor — Multi-GPU DDP device fixes
+
+**Scope**: Fixes 1–3 above. Clean code changes, no monkey-patches.
+
+**Changes**:
+- `src/llmcompressor/utils/dev.py`: `get_main_device()` uses `current_device_index()` instead of `rank`
+- `src/llmcompressor/modifiers/autoround/base.py`: 
+  - `apply_autoround` anchor device uses `get_main_device()` instead of hardcoded `cuda:0`
+  - `_update_device_map_for_dp` offsets GPU indices by `local_rank * gpus_per_group`
+
+**Testing**: Run DDP AutoRound on Qwen3-8B with 4 GPUs (2 per rank). Verify all GPUs participate and no device mismatch errors.
+
+---
+
+### PR 2: compressed-tensors — Skip distributed cache when ranks have local parameters
+
+**Problem**: `OffloadCache.cls_from_device("cpu")` unconditionally returns `DistributedCPUCache` when `dist.is_initialized()`. This causes O(n_params) broadcast+barrier ops (~218ms each) even when all ranks already have parameters locally (via independent `from_pretrained` loading with safetensors mmap).
+
+**Proposed fix**: Add a `distributed` parameter to `cls_from_device` with auto-detection:
+
+```python
+# compressed_tensors/offload/cache/base.py
+
+@classmethod
+def cls_from_device(cls, device=None, distributed=None):
+    """
+    Args:
+        distributed: If None (default), auto-detect based on whether
+            dist is initialized. If False, always return local cache.
+            If True, always return distributed cache.
+    """
+    if distributed is None:
+        distributed = (
+            torch.distributed.is_initialized()
+            and torch.distributed.get_world_size() > 1
+        )
+    
+    device_type = torch.device(device).type if device != "disk" else "disk"
+    if device_type == "cpu":
+        return DistributedCPUCache if distributed else CPUCache
+    elif is_accelerator_type(device_type):
+        return DistributedDeviceCache if distributed else DeviceCache
+    elif device_type == "disk":
+        return DiskCache
+    ...
+```
+
+**Callers that should pass `distributed=False`**:
+- `set_onload_device()` when the model was loaded independently on each rank (no meta tensors)
+- Any path where the caller knows parameters are already materialized locally
+
+**Alternative approach** — context manager:
+
+```python
+# compressed_tensors/offload/cache/base.py
+
+_force_local_cache = threading.local()
+
+@contextlib.contextmanager
+def force_local_cache():
+    """Context under which cls_from_device always returns non-distributed caches."""
+    _force_local_cache.active = True
+    try:
+        yield
+    finally:
+        _force_local_cache.active = False
+
+@classmethod
+def cls_from_device(cls, device=None):
+    distributed = (
+        torch.distributed.is_initialized()
+        and torch.distributed.get_world_size() > 1
+        and not getattr(_force_local_cache, 'active', False)
+    )
+    ...
+```
+
+This lets llm-compressor wrap its pipeline with `force_local_cache()` without modifying every callsite.
+
+**Testing**: 
+- Existing tests pass (distributed cache still used by default)
+- DDP test with independent model loading uses local cache, no broadcast overhead
+
+---
+
+### PR 3: compressed-tensors — Wrap quant init with `disable_onloading()`
+
+**Problem**: `initialize_module_for_quantization` creates new parameters (scale, zero_point, etc.) which immediately trigger `DistributedCPUCache.offload()` → broadcast+barrier. These parameters are created identically on every rank, so broadcasting is always redundant.
+
+**Proposed fix**: Wrap the function body with `disable_onloading()`:
+
+```python
+# compressed_tensors/quantization/lifecycle/initialize.py
+
+def initialize_module_for_quantization(module, scheme=None, force_zero_point=True):
+    with disable_onloading():
+        # ... existing implementation ...
+```
+
+**Rationale**: New quant parameters are initialized from the quantization scheme (not from model weights), so they're identical across ranks by construction. There's no information to broadcast.
+
+**Testing**: DDP quantization should show no broadcast calls during `initialize_module_for_quantization`. Single-process behavior unchanged.
+
+---
+
+### Priority
+
+1. **PR 3** (highest): Universal fix, always correct, simple one-liner
+2. **PR 2** (high): Eliminates the main bottleneck for independent-loading DDP
+3. **PR 1** (medium): Required for multi-GPU-per-rank scenarios (GPUS_PER_GROUP > 1)
diff --git a/examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md b/examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md
new file mode 100644
index 0000000000..40d0e9ebf9
--- /dev/null
+++ b/examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md
@@ -0,0 +1,113 @@
+# AutoRound DDP Hang: Root Cause Analysis
+
+## Symptom
+
+AutoRound quantization hangs during `on_initialize` → `initialize_quantization` when
+using `GPUS_PER_GROUP=2` (4 GPUs, 2 ranks). The same setup with `GPUS_PER_GROUP=1`
+(2 GPUs, 2 ranks) completes in ~46 seconds. GPTQ does not exhibit the hang because
+its examples default to `GPUS_PER_GROUP=1`.
+
+## Root Cause: Broadcast Deadlock in `DistributedCPUCache.offload()`
+
+### The call chain
+
+```
+initialize_quantization()
+  → apply_quantization_config()
+    → initialize_module_for_quantization()        # per matched Linear module
+      → initialize_qparams()
+        → torch.empty(shape, device=get_execution_device(module))
+        → module.register_parameter(name, param)  # triggers:
+          → OffloadCache.__setitem__()
+            → DistributedCPUCache.offload()
+              → tensor.to("cpu")                  # ⚠️ GPU→CPU copy
+              → share_memory_()
+              → broadcast_object_list()           # ⚠️ paired broadcast
+              → barrier()                         # ⚠️ deadlock point
+```
+
+### Why it deadlocks with GPUS_PER_GROUP=2
+
+With 4 GPUs visible (`CUDA_VISIBLE_DEVICES=0,1,2,3`), `device_map="auto_offload"`
+assigns different modules to different GPUs. `get_execution_device(module)` returns
+varying devices (`cuda:0`, `cuda:1`, `cuda:2`, `cuda:3`). `initialize_qparams`
+creates tensors on those devices.
+
+The `DistributedCPUCache.offload()` call chain first does a GPU→CPU copy of the
+tensor. With tensors on different GPUs under different load conditions, the copy
+timing varies per module. The two ranks drift out of lockstep:
+
+- Rank 0: finishes GPU→CPU copy for module N, enters `broadcast_object_list`
+- Rank 1: still doing GPU→CPU copy for module N (different GPU, different load)
+
+`broadcast_object_list` is a paired operation — both ranks must enter it in the
+same order. When timing varies, rank 0 enters broadcast N while rank 1 is still
+at broadcast N-1 → **deadlock at barrier**.
+
+The broadcasts themselves are CPU-side and fast. The GPU→CPU copy *before* each
+broadcast is what desynchronizes the ranks.
+
+### Why it works with GPUS_PER_GROUP=1
+
+With only 2 GPUs visible (`CUDA_VISIBLE_DEVICES=1,3`), `device_map="auto_offload"`
+sees limited aggregate GPU memory and assigns execution to CPU
+(`onload_device=cpu`). `get_execution_device` returns `cpu` for all modules.
+`initialize_qparams` creates params on CPU. `offload()` does a CPU→CPU copy —
+uniform timing. The broadcasts stay paired, no deadlock.
+
+### Why GPTQ doesn't hit this
+
+GPTQ examples use `GPUS_PER_GROUP=1` (default). If GPTQ were run with
+`GPUS_PER_GROUP=2`, it would hit the same deadlock. The hang is not specific to
+AutoRound — it's a property of `DistributedCPUCache` + multi-GPU execution
+devices + `initialize_quantization`.
+
+## The Fix: `disable_onloading()` in `on_initialize`
+
+### Mechanism
+
+`OffloadCache` has a class-level flag `onloading_disabled`. When set:
+
+- **`__getitem__`**: returns the offloaded (CPU) tensor directly — no CPU→GPU onload
+- **`__setitem__`**: stores the value directly in `offloaded_values` — no `offload()`,
+  no GPU→CPU copy, no `broadcast_object_list`, no `barrier`
+
+This is a CT-provided escape hatch. It's already used *inside*
+`initialize_module_for_quantization` (line 77 of `initialize.py`) to access
+`module.weight` without triggering the distributed path.
+
+### Implementation
+
+```python
+# llmcompressor/modifiers/autoround/base.py — on_initialize()
+if QuantizationMixin.has_config(self):
+    from compressed_tensors.offload import disable_onloading
+    with disable_onloading():
+        QuantizationMixin.initialize_quantization(self, state.model)
+```
+
+### Why this is safe
+
+1. **Quant params are deterministic.** Both ranks compute identical scale/zero_point
+   values from the same quantization scheme. No broadcast is needed — each rank
+   produces the same data independently.
+
+2. **Params stay on GPU, which is correct.** Calibration runs next — the params need
+   to be on GPU for forward/backward. When the block is later offloaded to CPU, the
+   params follow the normal offload path.
+
+3. **Precedent exists.** `initialize_module_for_quantization` already uses
+   `disable_onloading()` for exactly this purpose — accessing `module.weight` without
+   triggering the onload path.
+
+4. **Scoped and temporary.** The context manager restores normal behavior after
+   `initialize_quantization` completes. All subsequent operations use the standard
+   onload/offload path.
+
+### Why not `force_local_cache`
+
+`force_local_cache` only affects `cls_from_device` (new cache *creation*). During
+`initialize_quantization`, the `DistributedCPUCache` instances already exist on
+modules — params are added to existing caches via `__setitem__`. `force_local_cache`
+has no effect on this path. The CT maintainer also rejected this approach because
+it changes global cache creation semantics, which could affect model weight loading.
diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py
index 04d28ad739..58d0794dbf 100644
--- a/examples/autoround/ddp/ddp_qwen3_moe_example.py
+++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py
@@ -22,8 +22,8 @@
 
 MODEL = "/storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507"
 SCHEME = "W4A16"
-ITERS = 200
-NSAMPLES = 256
+ITERS = 1
+NSAMPLES = 4
 
 ###### DDP INIT #####
 gpus_per_group = int(os.environ.get("GPUS_PER_GROUP", "1"))
@@ -87,10 +87,6 @@
     dist.barrier()
 
 ###### SAVE (rank 0 only) #####
-if dist.is_initialized():
-    dist.barrier()
-    dist.destroy_process_group()
-
 if rank == 0:
     save_dir = (
         MODEL.rstrip("/").split("/")[-1]
@@ -115,4 +111,8 @@
     logger.info(tokenizer.decode(output[0]))
     logger.info("==========================================")
 
+if dist.is_initialized():
+    dist.barrier()
+    dist.destroy_process_group()
+
 logger.info(f"[Rank {rank}] SUCCESS")
diff --git a/examples/autoround/ddp/reproduce.md b/examples/autoround/ddp/reproduce.md
new file mode 100644
index 0000000000..099d45f523
--- /dev/null
+++ b/examples/autoround/ddp/reproduce.md
@@ -0,0 +1,99 @@
+# Multi-GPU DDP AutoRound Reproduce
+
+## torchrun (recommended)
+
+### 8B
+
+```bash
+cd /home/yiliu7/workspace/llm-compressor
+
+bash examples/autoround/ddp/launch_torchrun.sh \
+  --model /storage/yiliu7/Qwen/Qwen3-8B \
+  --scheme W4A16 \
+  --nsamples 32 --iters 50 \
+  --disable_torch_compile
+```
+
+### 235B
+
+```bash
+cd /home/yiliu7/workspace/llm-compressor
+
+AR_DISABLE_DATASET_SUBPROCESS=1 GPUS_PER_GROUP=2 CUDA_VISIBLE_DEVICES=0,1,2,3 \
+/home/yiliu7/workspace/venvs/llmc/bin/torchrun --nproc_per_node=2 --master_port=29500 \
+examples/autoround/ddp/ddp_qwen3_multi_gpu_torchrun.py \
+--model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507 \
+--scheme W4A16 --nsamples 32 --iters 50 --disable_torch_compile
+```
+
+## bash wrapper (dedicated GPU isolation)
+
+```bash
+cd /home/yiliu7/workspace/llm-compressor
+
+AR_DISABLE_DATASET_SUBPROCESS=1 CUDA_VISIBLE_DEVICES=0,1,6,7 GPUS_PER_GROUP=2 NPROC=2 MASTER_PORT=29501 \
+  bash examples/autoround/ddp/launch_multi_gpu.sh \
+  ddp_qwen3_multi_gpu_example.py \
+  --model /storage/yiliu7/Qwen/Qwen3-8B \
+  --scheme W4A16 \
+  --nsamples 32 --iters 50 \
+  --disable_torch_compile \
+  > /tmp/multi_gpu_test.log 2>&1 &
+```
+
+## Monitor
+
+```bash
+tail -f /tmp/multi_gpu_test.log
+ps aux | grep ddp_qwen3_multi | grep -v grep
+nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader
+pkill -f ddp_qwen3_multi_gpu
+```
+
+## Verified
+
+### 8B (2026-06-18)
+```
+quantized 7/7 layers in the block, loss iter 0: 19.067873 -> iter 0: 19.067873
+[Rank 0] Quantization completed
+Hello my name is Mandy I am 20 years old...
+```
+All 37 decoder layers quantized, identical loss across ranks, sample generation works.
+
+### 235B (2026-06-19)
+```
+quantized 388/389 layers in the block, loss iter 0: 0.211156 -> iter 0: 0.211156
+...
+[Rank 0] Quantization completed
+```
+All 94 decoder layers quantized (388 Linear per MoE block), identical loss across ranks. ~25 min for 1 iter.
+
+## Key Files
+
+| File | Change |
+|------|--------|
+| `examples/autoround/ddp/ddp_qwen3_multi_gpu_torchrun.py` | torchrun example with patches |
+| `examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py` | bash wrapper example |
+| `examples/autoround/ddp/fast_pipeline.py` | Replaces `SequentialPipeline.__call__` — no FX trace |
+| `examples/autoround/ddp/launch_torchrun.sh` | torchrun launcher |
+| `examples/autoround/ddp/launch_multi_gpu.sh` | bash wrapper (GPU partitioning) |
+| `src/llmcompressor/modifiers/autoround/base.py` | `_get_local_gpu_group_size()` reads `GPUS_PER_GROUP` |
+| `src/llmcompressor/pipelines/sequential/helpers.py` | Removed `disable_onloading()` from `trace_subgraphs` |
+| `ar-py/auto_round/utils/distributed.py` | `setup_ddp_if_needed_` returns `(block, sync_fn)`; `current_device()` for NCCL |
+| `ar-py/auto_round/algorithms/quantization/sign_round/quantizer.py` | Captures return, calls `sync_gradients()` before `_step()` |
+
+## Required env vars
+
+| Var | Value | Why |
+|-----|-------|-----|
+| `GPUS_PER_GROUP` | `2` | Triggers multi-GPU block dispatch + manual all_reduce sync |
+| `AR_DISABLE_DATASET_SUBPROCESS` | `1` | Avoids `fork()` with CUDA context |
+| `--disable_torch_compile` | flag | torch.compile can't handle cross-device tensors |
+
+## Known issue: FX trace bottleneck
+
+`trace_subgraphs` runs an FX trace on the full model — for 61K-module models (235B) it never finishes. The `fast_pipeline.py` module bypasses this by creating subgraphs directly from decoder layer names. This affects ALL models using `SequentialPipeline`, not just DDP. The AWQ example (`qwen3_moe_example_ddp.py`) with 30B MoE also hangs.
+
+## Venv
+
+Python: `/home/yiliu7/workspace/venvs/llmc/bin/python`
diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py
index d2a744309a..e62cd62494 100644
--- a/src/llmcompressor/modifiers/autoround/base.py
+++ b/src/llmcompressor/modifiers/autoround/base.py
@@ -173,9 +173,18 @@ def on_initialize(self, state: State, **kwargs) -> bool:
 
         :param state: session state storing input model and calibration data
         """
-        # apply config to model and prepare calibration hooks
+        # apply config to model and prepare calibration hooks.
+        # Wrap in disable_onloading to suppress DistributedCPUCache's
+        # per-param broadcast+barrier when creating quant params (scale,
+        # zero_point). With GPUS_PER_GROUP > 1, modules have varying GPU
+        # execution devices, causing GPU→CPU copy timing to vary between
+        # ranks → broadcast deadlock. Quant params are deterministic —
+        # each rank computes identical values, no sync needed.
         if QuantizationMixin.has_config(self):
-            QuantizationMixin.initialize_quantization(self, state.model)
+            from compressed_tensors.offload import disable_onloading
+
+            with disable_onloading():
+                QuantizationMixin.initialize_quantization(self, state.model)
 
         # prepare module names
         self._add_temporary_names(state.model)
@@ -310,7 +319,9 @@ def apply_autoround(self, state, modules):
             # across multiple GPUs within the rank.
             auto_offload = False
             needs_multi_gpu = (
-                self.device_ids is not None or _get_local_gpu_group_size() > 1
+                self.device_ids is not None
+                or _get_local_gpu_group_size() > 1
+                or torch.cuda.device_count() > 1
             )
             if needs_multi_gpu:
                 # Let AutoRound own placement within the rank-local GPU group.
@@ -323,6 +334,21 @@ def apply_autoround(self, state, modules):
                 # will be re-dispatched by AutoRound.
                 decoding_layer.to("cpu")
                 auto_offload = True
+                # Move cached inputs to the anchor device — they may have
+                # been captured on different GPUs during calibration.
+                cur_inputs = [
+                    (
+                        tuple(
+                            x.to(device) if isinstance(x, torch.Tensor) else x
+                            for x in args
+                        ),
+                        {
+                            k: v.to(device) if isinstance(v, torch.Tensor) else v
+                            for k, v in kwargs.items()
+                        },
+                    )
+                    for args, kwargs in cur_inputs
+                ]
 
             q_input, _ = ar.quantize_block(
                 block=decoding_layer,

From 6cf652b7afc363755b8fbbf8329ac818908f33be Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Thu, 25 Jun 2026 01:06:30 +0000
Subject: [PATCH 19/22] refactor: extract _move_inputs_to helper, fix
 save/sample gen deadlock

- Extract _move_inputs_to static method for cleaner input device alignment
- Move input movement out of if-needs_multi_gpu branch (always correct)
- Both ranks participate in save_pretrained (uses broadcast_object_list)
- Sample generation moved after destroy_process_group
- Restore 235B model path
---
 .../autoround/ddp/ddp_qwen3_moe_example.py    | 33 +++++++-------
 src/llmcompressor/modifiers/autoround/base.py | 43 ++++++++++---------
 2 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py
index 58d0794dbf..b89ed6ccd0 100644
--- a/examples/autoround/ddp/ddp_qwen3_moe_example.py
+++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py
@@ -83,21 +83,24 @@
 )
 logger.info(f"[Rank {rank}] Quantization done in {time.perf_counter() - quant_start:.1f}s")
 
-if dist.is_initialized():
-    dist.barrier()
-
-###### SAVE (rank 0 only) #####
+###### SAVE #####
+# Both ranks must participate — save_pretrained internally calls
+# collectives (broadcast_object_list). Only rank 0 writes to disk.
+save_dir = (
+    MODEL.rstrip("/").split("/")[-1]
+    + f"-{SCHEME}-AutoRound"
+    + f"-iters{ITERS}-nsamples{NSAMPLES}"
+    + f"-DDP{world_size}"
+)
+logger.info(f"[Rank {rank}] Saving to {save_dir}...")
+model.save_pretrained(save_dir, save_compressed=True)
 if rank == 0:
-    save_dir = (
-        MODEL.rstrip("/").split("/")[-1]
-        + f"-{SCHEME}-AutoRound"
-        + f"-iters{ITERS}-nsamples{NSAMPLES}"
-        + f"-DDP{world_size}"
-    )
-    logger.info(f"Saving to {save_dir}...")
-    model.save_pretrained(save_dir, save_compressed=True)
     tokenizer.save_pretrained(save_dir)
-    logger.info(f"Saved to {save_dir}")
+logger.info(f"[Rank {rank}] Saved to {save_dir}")
+
+if dist.is_initialized():
+    dist.barrier()
+    dist.destroy_process_group()
 
 ###### SAMPLE GENERATION (rank 0 only) #####
 if rank == 0:
@@ -111,8 +114,4 @@
     logger.info(tokenizer.decode(output[0]))
     logger.info("==========================================")
 
-if dist.is_initialized():
-    dist.barrier()
-    dist.destroy_process_group()
-
 logger.info(f"[Rank {rank}] SUCCESS")
diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py
index e62cd62494..2b6f297523 100644
--- a/src/llmcompressor/modifiers/autoround/base.py
+++ b/src/llmcompressor/modifiers/autoround/base.py
@@ -325,30 +325,13 @@ def apply_autoround(self, state, modules):
             )
             if needs_multi_gpu:
                 # Let AutoRound own placement within the rank-local GPU group.
-                # The incoming block may already be split across local devices,
-                # so anchoring to first_param.device can place residual modules
-                # (e.g. norms) on local cuda:1 while hidden states begin on
-                # local cuda:0, causing cross-device forward failures.
                 device = get_main_device()
-                # Move decoding layer to CPU first, then the submodules
-                # will be re-dispatched by AutoRound.
                 decoding_layer.to("cpu")
                 auto_offload = True
-                # Move cached inputs to the anchor device — they may have
-                # been captured on different GPUs during calibration.
-                cur_inputs = [
-                    (
-                        tuple(
-                            x.to(device) if isinstance(x, torch.Tensor) else x
-                            for x in args
-                        ),
-                        {
-                            k: v.to(device) if isinstance(v, torch.Tensor) else v
-                            for k, v in kwargs.items()
-                        },
-                    )
-                    for args, kwargs in cur_inputs
-                ]
+
+            # Ensure cached inputs are on the same device as the block.
+            # Calibration forward may have run on a different GPU.
+            cur_inputs = self._move_inputs_to(cur_inputs, device)
 
             q_input, _ = ar.quantize_block(
                 block=decoding_layer,
@@ -440,6 +423,24 @@ def _remove_temporary_names(self, model: torch.nn.Module):
             if hasattr(mod, "_tmp_name"):
                 del mod._tmp_name
 
+    @staticmethod
+    def _move_inputs_to(
+        inputs: list[tuple[tuple, dict]], device: torch.device
+    ) -> list[tuple[tuple, dict]]:
+        """Move all tensors in cached forward inputs to *device*."""
+        return [
+            (
+                tuple(
+                    x.to(device) if isinstance(x, torch.Tensor) else x for x in args
+                ),
+                {
+                    k: v.to(device) if isinstance(v, torch.Tensor) else v
+                    for k, v in kwargs.items()
+                },
+            )
+            for args, kwargs in inputs
+        ]
+
     def _is_decoding_layer(self, module: torch.nn.Module) -> bool:
         return module.__class__.__name__ in self._sequential_targets
 

From 56247c8da8ab2831e6a8583b9be3d3bbb15cad8c Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Thu, 25 Jun 2026 06:46:03 +0000
Subject: [PATCH 20/22] docs: remove AutoRound DDP debugging notes

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 examples/autoround/ddp/DDP_FIXES.md           | 237 ------------------
 examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md | 113 ---------
 examples/autoround/ddp/reproduce.md           |  99 --------
 3 files changed, 449 deletions(-)
 delete mode 100644 examples/autoround/ddp/DDP_FIXES.md
 delete mode 100644 examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md
 delete mode 100644 examples/autoround/ddp/reproduce.md

diff --git a/examples/autoround/ddp/DDP_FIXES.md b/examples/autoround/ddp/DDP_FIXES.md
deleted file mode 100644
index 50e9e7352d..0000000000
--- a/examples/autoround/ddp/DDP_FIXES.md
+++ /dev/null
@@ -1,237 +0,0 @@
-# DDP Multi-GPU AutoRound Fixes for Large MoE Models
-
-## Problem
-
-Running AutoRound quantization with DDP on large MoE models (e.g., Qwen3-235B) would hang or take hours due to `DistributedCPUCache` performing a `dist.broadcast_object_list()` + `dist.barrier()` **per parameter** during offload operations (~218ms × 45K params = ~163 minutes).
-
-## Root Cause
-
-When `dist.is_initialized()`, `OffloadCache.cls_from_device("cpu")` returns `DistributedCPUCache` instead of `CPUCache`. This cache broadcasts every tensor to all ranks — unnecessary when each rank loads the model independently via safetensors mmap.
-
-The bottleneck hits in two places:
-1. `from_accelerate()` → `dispatch_with_map()` 
-2. `set_onload_device()` in SequentialPipeline
-
-## Fixes Applied
-
-### Fix 1: `src/llmcompressor/utils/dev.py` — `get_main_device()` 
-
-**Bug**: Used `rank` as the CUDA device index, which is wrong when `GPUS_PER_GROUP > 1`.  
-**Fix**: Use `torch.accelerator.current_device_index()` which respects `torch.cuda.set_device()`.
-
-```python
-# Before (line 140):
-return torch.device(accel_type, rank)
-
-# After:
-return torch.device(accel_type, torch.accelerator.current_device_index())
-```
-
-### Fix 2: `src/llmcompressor/modifiers/autoround/base.py` — anchor device in `apply_autoround`
-
-**Bug**: Hardcoded `device = torch.device("cuda:0")` when `needs_multi_gpu` is true. Rank 1 with GPUs [2,3] would try to anchor on cuda:0 instead of cuda:2.  
-**Fix**: Use `get_main_device()` which returns the correct per-rank device.
-
-```python
-# Before (line ~329):
-device = torch.device("cuda:0")
-
-# After:
-from llmcompressor.utils.dev import get_main_device
-device = get_main_device()
-```
-
-### Fix 3: `src/llmcompressor/modifiers/autoround/base.py` — GPU partition in `_update_device_map_for_dp`
-
-**Bug**: Generated `"0,1"` for all ranks instead of per-rank GPU partitions.  
-**Fix**: Offset by `local_rank * gpus_per_group`.
-
-```python
-# Before:
-ar_kwargs["device_map"] = ",".join(str(i) for i in range(gpus_per_group))
-
-# After:
-local_rank = torch.distributed.get_rank()
-start_gpu = local_rank * gpus_per_group
-ar_kwargs["device_map"] = ",".join(str(start_gpu + i) for i in range(gpus_per_group))
-```
-
-### Patch 4 (monkey-patch, needs upstream in compressed-tensors): Force local cache
-
-Patches `OffloadCache.cls_from_device` to return `CPUCache`/`DeviceCache` instead of `DistributedCPUCache`/`DistributedDeviceCache`. This is correct when each rank loads the model independently.
-
-See `patch_force_local_cache()` in `test_option3_fixed.py`.
-
-### Patch 5 (monkey-patch, needs upstream in compressed-tensors): Disable onloading during quant init
-
-Wraps `initialize_module_for_quantization` with `disable_onloading()` to avoid per-parameter broadcast+barrier when new quantization parameters are created.
-
-See `patch_disable_onloading_for_quant_init()` in `test_option3_fixed.py`.
-
-## Reproduce
-
-### Prerequisites
-
-```bash
-# Environment
-source /home/yiliu7/workspace/venvs/llmc/bin/activate
-
-# Working directory
-cd /home/yiliu7/workspace/llm-compressor
-```
-
-### Run on Qwen3-8B (quick verification, ~2 minutes)
-
-```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
-    --nproc_per_node=2 \
-    examples/autoround/ddp/ddp_autoround.py \
-    --model /storage/yiliu7/Qwen/Qwen3-8B \
-    --iters 5 --nsamples 32
-```
-
-### Run on Qwen3-235B (full test, ~47 minutes)
-
-```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \
-    --nproc_per_node=2 \
-    examples/autoround/ddp/ddp_autoround.py \
-    --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ \
-    --iters 20 --nsamples 32
-```
-
-### Expected behavior
-
-- Both ranks process all 94 layers in lockstep (~30s/layer on 235B)
-- All 4 GPUs show active memory usage (~56-63 GB each)
-- Each rank uses 2 GPUs: rank 0 → [0,1], rank 1 → [2,3]
-- Small NCCL idle contexts (~614 MB) appear on non-owned GPUs — this is normal
-
-### Monitor progress
-
-```bash
-# GPU utilization
-nvidia-smi --query-gpu=index,utilization.gpu,memory.used --format=csv,noheader
-
-# Layer progress (from log)
-grep "Applying AutoRound" /path/to/log | tail -6
-```
-
-## Known Issues
-
-1. **8 GPU process entries in nvidia-smi**: Each of the 2 torchrun processes creates a small NCCL context (~614 MB) on all visible GPUs. Only 4 entries are doing real work (the ~56-63 GB ones). This is unavoidable without a pre-launch wrapper that restricts `CUDA_VISIBLE_DEVICES` before Python starts.
-
-2. **OOM on layer ~11 (235B)**: With 20 iters and the full 235B model, GPU memory may be tight. Reduce `--iters` or `--nsamples` if OOM occurs.
-
-## Upstream Plan
-
-### PR 1: llm-compressor — Multi-GPU DDP device fixes
-
-**Scope**: Fixes 1–3 above. Clean code changes, no monkey-patches.
-
-**Changes**:
-- `src/llmcompressor/utils/dev.py`: `get_main_device()` uses `current_device_index()` instead of `rank`
-- `src/llmcompressor/modifiers/autoround/base.py`: 
-  - `apply_autoround` anchor device uses `get_main_device()` instead of hardcoded `cuda:0`
-  - `_update_device_map_for_dp` offsets GPU indices by `local_rank * gpus_per_group`
-
-**Testing**: Run DDP AutoRound on Qwen3-8B with 4 GPUs (2 per rank). Verify all GPUs participate and no device mismatch errors.
-
----
-
-### PR 2: compressed-tensors — Skip distributed cache when ranks have local parameters
-
-**Problem**: `OffloadCache.cls_from_device("cpu")` unconditionally returns `DistributedCPUCache` when `dist.is_initialized()`. This causes O(n_params) broadcast+barrier ops (~218ms each) even when all ranks already have parameters locally (via independent `from_pretrained` loading with safetensors mmap).
-
-**Proposed fix**: Add a `distributed` parameter to `cls_from_device` with auto-detection:
-
-```python
-# compressed_tensors/offload/cache/base.py
-
-@classmethod
-def cls_from_device(cls, device=None, distributed=None):
-    """
-    Args:
-        distributed: If None (default), auto-detect based on whether
-            dist is initialized. If False, always return local cache.
-            If True, always return distributed cache.
-    """
-    if distributed is None:
-        distributed = (
-            torch.distributed.is_initialized()
-            and torch.distributed.get_world_size() > 1
-        )
-    
-    device_type = torch.device(device).type if device != "disk" else "disk"
-    if device_type == "cpu":
-        return DistributedCPUCache if distributed else CPUCache
-    elif is_accelerator_type(device_type):
-        return DistributedDeviceCache if distributed else DeviceCache
-    elif device_type == "disk":
-        return DiskCache
-    ...
-```
-
-**Callers that should pass `distributed=False`**:
-- `set_onload_device()` when the model was loaded independently on each rank (no meta tensors)
-- Any path where the caller knows parameters are already materialized locally
-
-**Alternative approach** — context manager:
-
-```python
-# compressed_tensors/offload/cache/base.py
-
-_force_local_cache = threading.local()
-
-@contextlib.contextmanager
-def force_local_cache():
-    """Context under which cls_from_device always returns non-distributed caches."""
-    _force_local_cache.active = True
-    try:
-        yield
-    finally:
-        _force_local_cache.active = False
-
-@classmethod
-def cls_from_device(cls, device=None):
-    distributed = (
-        torch.distributed.is_initialized()
-        and torch.distributed.get_world_size() > 1
-        and not getattr(_force_local_cache, 'active', False)
-    )
-    ...
-```
-
-This lets llm-compressor wrap its pipeline with `force_local_cache()` without modifying every callsite.
-
-**Testing**: 
-- Existing tests pass (distributed cache still used by default)
-- DDP test with independent model loading uses local cache, no broadcast overhead
-
----
-
-### PR 3: compressed-tensors — Wrap quant init with `disable_onloading()`
-
-**Problem**: `initialize_module_for_quantization` creates new parameters (scale, zero_point, etc.) which immediately trigger `DistributedCPUCache.offload()` → broadcast+barrier. These parameters are created identically on every rank, so broadcasting is always redundant.
-
-**Proposed fix**: Wrap the function body with `disable_onloading()`:
-
-```python
-# compressed_tensors/quantization/lifecycle/initialize.py
-
-def initialize_module_for_quantization(module, scheme=None, force_zero_point=True):
-    with disable_onloading():
-        # ... existing implementation ...
-```
-
-**Rationale**: New quant parameters are initialized from the quantization scheme (not from model weights), so they're identical across ranks by construction. There's no information to broadcast.
-
-**Testing**: DDP quantization should show no broadcast calls during `initialize_module_for_quantization`. Single-process behavior unchanged.
-
----
-
-### Priority
-
-1. **PR 3** (highest): Universal fix, always correct, simple one-liner
-2. **PR 2** (high): Eliminates the main bottleneck for independent-loading DDP
-3. **PR 1** (medium): Required for multi-GPU-per-rank scenarios (GPUS_PER_GROUP > 1)
diff --git a/examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md b/examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md
deleted file mode 100644
index 40d0e9ebf9..0000000000
--- a/examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md
+++ /dev/null
@@ -1,113 +0,0 @@
-# AutoRound DDP Hang: Root Cause Analysis
-
-## Symptom
-
-AutoRound quantization hangs during `on_initialize` → `initialize_quantization` when
-using `GPUS_PER_GROUP=2` (4 GPUs, 2 ranks). The same setup with `GPUS_PER_GROUP=1`
-(2 GPUs, 2 ranks) completes in ~46 seconds. GPTQ does not exhibit the hang because
-its examples default to `GPUS_PER_GROUP=1`.
-
-## Root Cause: Broadcast Deadlock in `DistributedCPUCache.offload()`
-
-### The call chain
-
-```
-initialize_quantization()
-  → apply_quantization_config()
-    → initialize_module_for_quantization()        # per matched Linear module
-      → initialize_qparams()
-        → torch.empty(shape, device=get_execution_device(module))
-        → module.register_parameter(name, param)  # triggers:
-          → OffloadCache.__setitem__()
-            → DistributedCPUCache.offload()
-              → tensor.to("cpu")                  # ⚠️ GPU→CPU copy
-              → share_memory_()
-              → broadcast_object_list()           # ⚠️ paired broadcast
-              → barrier()                         # ⚠️ deadlock point
-```
-
-### Why it deadlocks with GPUS_PER_GROUP=2
-
-With 4 GPUs visible (`CUDA_VISIBLE_DEVICES=0,1,2,3`), `device_map="auto_offload"`
-assigns different modules to different GPUs. `get_execution_device(module)` returns
-varying devices (`cuda:0`, `cuda:1`, `cuda:2`, `cuda:3`). `initialize_qparams`
-creates tensors on those devices.
-
-The `DistributedCPUCache.offload()` call chain first does a GPU→CPU copy of the
-tensor. With tensors on different GPUs under different load conditions, the copy
-timing varies per module. The two ranks drift out of lockstep:
-
-- Rank 0: finishes GPU→CPU copy for module N, enters `broadcast_object_list`
-- Rank 1: still doing GPU→CPU copy for module N (different GPU, different load)
-
-`broadcast_object_list` is a paired operation — both ranks must enter it in the
-same order. When timing varies, rank 0 enters broadcast N while rank 1 is still
-at broadcast N-1 → **deadlock at barrier**.
-
-The broadcasts themselves are CPU-side and fast. The GPU→CPU copy *before* each
-broadcast is what desynchronizes the ranks.
-
-### Why it works with GPUS_PER_GROUP=1
-
-With only 2 GPUs visible (`CUDA_VISIBLE_DEVICES=1,3`), `device_map="auto_offload"`
-sees limited aggregate GPU memory and assigns execution to CPU
-(`onload_device=cpu`). `get_execution_device` returns `cpu` for all modules.
-`initialize_qparams` creates params on CPU. `offload()` does a CPU→CPU copy —
-uniform timing. The broadcasts stay paired, no deadlock.
-
-### Why GPTQ doesn't hit this
-
-GPTQ examples use `GPUS_PER_GROUP=1` (default). If GPTQ were run with
-`GPUS_PER_GROUP=2`, it would hit the same deadlock. The hang is not specific to
-AutoRound — it's a property of `DistributedCPUCache` + multi-GPU execution
-devices + `initialize_quantization`.
-
-## The Fix: `disable_onloading()` in `on_initialize`
-
-### Mechanism
-
-`OffloadCache` has a class-level flag `onloading_disabled`. When set:
-
-- **`__getitem__`**: returns the offloaded (CPU) tensor directly — no CPU→GPU onload
-- **`__setitem__`**: stores the value directly in `offloaded_values` — no `offload()`,
-  no GPU→CPU copy, no `broadcast_object_list`, no `barrier`
-
-This is a CT-provided escape hatch. It's already used *inside*
-`initialize_module_for_quantization` (line 77 of `initialize.py`) to access
-`module.weight` without triggering the distributed path.
-
-### Implementation
-
-```python
-# llmcompressor/modifiers/autoround/base.py — on_initialize()
-if QuantizationMixin.has_config(self):
-    from compressed_tensors.offload import disable_onloading
-    with disable_onloading():
-        QuantizationMixin.initialize_quantization(self, state.model)
-```
-
-### Why this is safe
-
-1. **Quant params are deterministic.** Both ranks compute identical scale/zero_point
-   values from the same quantization scheme. No broadcast is needed — each rank
-   produces the same data independently.
-
-2. **Params stay on GPU, which is correct.** Calibration runs next — the params need
-   to be on GPU for forward/backward. When the block is later offloaded to CPU, the
-   params follow the normal offload path.
-
-3. **Precedent exists.** `initialize_module_for_quantization` already uses
-   `disable_onloading()` for exactly this purpose — accessing `module.weight` without
-   triggering the onload path.
-
-4. **Scoped and temporary.** The context manager restores normal behavior after
-   `initialize_quantization` completes. All subsequent operations use the standard
-   onload/offload path.
-
-### Why not `force_local_cache`
-
-`force_local_cache` only affects `cls_from_device` (new cache *creation*). During
-`initialize_quantization`, the `DistributedCPUCache` instances already exist on
-modules — params are added to existing caches via `__setitem__`. `force_local_cache`
-has no effect on this path. The CT maintainer also rejected this approach because
-it changes global cache creation semantics, which could affect model weight loading.
diff --git a/examples/autoround/ddp/reproduce.md b/examples/autoround/ddp/reproduce.md
deleted file mode 100644
index 099d45f523..0000000000
--- a/examples/autoround/ddp/reproduce.md
+++ /dev/null
@@ -1,99 +0,0 @@
-# Multi-GPU DDP AutoRound Reproduce
-
-## torchrun (recommended)
-
-### 8B
-
-```bash
-cd /home/yiliu7/workspace/llm-compressor
-
-bash examples/autoround/ddp/launch_torchrun.sh \
-  --model /storage/yiliu7/Qwen/Qwen3-8B \
-  --scheme W4A16 \
-  --nsamples 32 --iters 50 \
-  --disable_torch_compile
-```
-
-### 235B
-
-```bash
-cd /home/yiliu7/workspace/llm-compressor
-
-AR_DISABLE_DATASET_SUBPROCESS=1 GPUS_PER_GROUP=2 CUDA_VISIBLE_DEVICES=0,1,2,3 \
-/home/yiliu7/workspace/venvs/llmc/bin/torchrun --nproc_per_node=2 --master_port=29500 \
-examples/autoround/ddp/ddp_qwen3_multi_gpu_torchrun.py \
---model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507 \
---scheme W4A16 --nsamples 32 --iters 50 --disable_torch_compile
-```
-
-## bash wrapper (dedicated GPU isolation)
-
-```bash
-cd /home/yiliu7/workspace/llm-compressor
-
-AR_DISABLE_DATASET_SUBPROCESS=1 CUDA_VISIBLE_DEVICES=0,1,6,7 GPUS_PER_GROUP=2 NPROC=2 MASTER_PORT=29501 \
-  bash examples/autoround/ddp/launch_multi_gpu.sh \
-  ddp_qwen3_multi_gpu_example.py \
-  --model /storage/yiliu7/Qwen/Qwen3-8B \
-  --scheme W4A16 \
-  --nsamples 32 --iters 50 \
-  --disable_torch_compile \
-  > /tmp/multi_gpu_test.log 2>&1 &
-```
-
-## Monitor
-
-```bash
-tail -f /tmp/multi_gpu_test.log
-ps aux | grep ddp_qwen3_multi | grep -v grep
-nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader
-pkill -f ddp_qwen3_multi_gpu
-```
-
-## Verified
-
-### 8B (2026-06-18)
-```
-quantized 7/7 layers in the block, loss iter 0: 19.067873 -> iter 0: 19.067873
-[Rank 0] Quantization completed
-Hello my name is Mandy I am 20 years old...
-```
-All 37 decoder layers quantized, identical loss across ranks, sample generation works.
-
-### 235B (2026-06-19)
-```
-quantized 388/389 layers in the block, loss iter 0: 0.211156 -> iter 0: 0.211156
-...
-[Rank 0] Quantization completed
-```
-All 94 decoder layers quantized (388 Linear per MoE block), identical loss across ranks. ~25 min for 1 iter.
-
-## Key Files
-
-| File | Change |
-|------|--------|
-| `examples/autoround/ddp/ddp_qwen3_multi_gpu_torchrun.py` | torchrun example with patches |
-| `examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py` | bash wrapper example |
-| `examples/autoround/ddp/fast_pipeline.py` | Replaces `SequentialPipeline.__call__` — no FX trace |
-| `examples/autoround/ddp/launch_torchrun.sh` | torchrun launcher |
-| `examples/autoround/ddp/launch_multi_gpu.sh` | bash wrapper (GPU partitioning) |
-| `src/llmcompressor/modifiers/autoround/base.py` | `_get_local_gpu_group_size()` reads `GPUS_PER_GROUP` |
-| `src/llmcompressor/pipelines/sequential/helpers.py` | Removed `disable_onloading()` from `trace_subgraphs` |
-| `ar-py/auto_round/utils/distributed.py` | `setup_ddp_if_needed_` returns `(block, sync_fn)`; `current_device()` for NCCL |
-| `ar-py/auto_round/algorithms/quantization/sign_round/quantizer.py` | Captures return, calls `sync_gradients()` before `_step()` |
-
-## Required env vars
-
-| Var | Value | Why |
-|-----|-------|-----|
-| `GPUS_PER_GROUP` | `2` | Triggers multi-GPU block dispatch + manual all_reduce sync |
-| `AR_DISABLE_DATASET_SUBPROCESS` | `1` | Avoids `fork()` with CUDA context |
-| `--disable_torch_compile` | flag | torch.compile can't handle cross-device tensors |
-
-## Known issue: FX trace bottleneck
-
-`trace_subgraphs` runs an FX trace on the full model — for 61K-module models (235B) it never finishes. The `fast_pipeline.py` module bypasses this by creating subgraphs directly from decoder layer names. This affects ALL models using `SequentialPipeline`, not just DDP. The AWQ example (`qwen3_moe_example_ddp.py`) with 30B MoE also hangs.
-
-## Venv
-
-Python: `/home/yiliu7/workspace/venvs/llmc/bin/python`

From 44ee3b9a99d43cddc78ac49bac1d669c890588db Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Thu, 25 Jun 2026 06:46:41 +0000
Subject: [PATCH 21/22] Use hub model ID and full-size settings in MoE DDP example

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 examples/autoround/ddp/ddp_qwen3_moe_example.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py
index b89ed6ccd0..a5330a92ff 100644
--- a/examples/autoround/ddp/ddp_qwen3_moe_example.py
+++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py
@@ -20,10 +20,10 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from llmcompressor import oneshot
 
-MODEL = "/storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507"
+MODEL = "Qwen/Qwen3-235B-A22B-Instruct-2507"
 SCHEME = "W4A16"
-ITERS = 1
-NSAMPLES = 4
+ITERS = 200
+NSAMPLES = 256
 
 ###### DDP INIT #####
 gpus_per_group = int(os.environ.get("GPUS_PER_GROUP", "1"))

From 47c58bb71c1da969a5a778534c75f3d01030f0b2 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Thu, 25 Jun 2026 06:47:30 +0000
Subject: [PATCH 22/22] Use LOCAL_RANK for AutoRound per-rank GPU offset

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 src/llmcompressor/modifiers/autoround/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py
index 2b6f297523..ca850052cf 100644
--- a/src/llmcompressor/modifiers/autoround/base.py
+++ b/src/llmcompressor/modifiers/autoround/base.py
@@ -389,7 +389,7 @@ def _update_device_map_for_dp(self, ar_kwargs):
                 return  # user explicitly set device_ids, respect it
             gpus_per_group = _get_local_gpu_group_size()
             if gpus_per_group > 1:
-                local_rank = torch.distributed.get_rank()
+                local_rank = int(os.environ.get("LOCAL_RANK", "0"))
                 start_gpu = local_rank * gpus_per_group
                 ar_kwargs["device_map"] = ",".join(
                     str(start_gpu + i) for i in range(gpus_per_group)