From abbea159c9bb7f8cbecc9f1751b2749660b3b57e Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 8 Jun 2026 10:55:21 +0000 Subject: [PATCH 01/22] Fix AutoRound multi-GPU DDP group handling --- .../ddp/ddp_qwen3_multi_gpu_example.py | 226 ++++++++++++++++++ examples/autoround/ddp/launch_multi_gpu.sh | 78 ++++++ examples/autoround/ddp/reproduce.md | 53 ++++ src/llmcompressor/modifiers/autoround/base.py | 66 ++++- src/llmcompressor/utils/dist.py | 28 ++- 5 files changed, 433 insertions(+), 18 deletions(-) create mode 100644 examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py create mode 100755 examples/autoround/ddp/launch_multi_gpu.sh create mode 100644 examples/autoround/ddp/reproduce.md diff --git a/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py b/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py new file mode 100644 index 0000000000..350f932ee4 --- /dev/null +++ b/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py @@ -0,0 +1,226 @@ +""" +Multi-GPU per group DDP example with AutoRound quantization. + +Each rank gets a local GPU group for block-level model parallelism, while +gradients are synchronized across ranks via all_reduce for identical +convergence despite split calibration data. + +Usage (4 GPUs, 2 GPUs per group): + CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=2 \\ + ddp_qwen3_multi_gpu_example.py \\ + --model /storage/yiliu7/Qwen/Qwen3-8B \\ + --scheme W4A16 \\ + --nsamples 32 --iters 50 + +For single-GPU DDP: + torchrun --nproc_per_node=4 ddp_qwen3_multi_gpu_example.py ... +""" + +import argparse +import os +import sys + +import torch +import torch.distributed as dist +from compressed_tensors.offload import dispatch_model, load_offloaded_model +from loguru import logger +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot + + +def fix_everything(seed=42): + import random + + import numpy as np + + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def config_deterministic(): + torch.use_deterministic_algorithms(True, warn_only=False) + os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" + fix_everything() + + +def init_dist_multi_gpu(gpus_per_group=2): + """Initialize distributed with multiple GPUs per group. + + ``CUDA_VISIBLE_DEVICES`` must already be set to a disjoint subset of + GPUs for this rank (the ``launch_multi_gpu.sh`` wrapper handles this). + NCCL communication uses the first visible GPU (local cuda:0). + + Example with 4 physical GPUs, 2 per group: + - Rank 0 -> local cuda:0, cuda:1 (physical 0, 1) + - Rank 1 -> local cuda:0, cuda:1 (physical 2, 3) + """ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + + if world_size < 2: + logger.info("Single-process mode, skipping distributed init") + return + + # NCCL uses the first visible GPU + torch.cuda.set_device(0) + + dist.init_process_group( + backend="nccl", + init_method="env://", + rank=rank, + world_size=world_size, + device_id=torch.device("cuda:0"), + ) + dist.barrier() + actual_count = torch.cuda.device_count() + logger.info( + f"[Rank {rank}/{world_size}] CUDA_VISIBLE_DEVICES=" + f"{os.environ.get('CUDA_VISIBLE_DEVICES', 'unset')} " + f"(visible GPUs: {actual_count})" + ) + if actual_count < gpus_per_group: + logger.warning( + f"[Rank {rank}] Expected {gpus_per_group} GPUs but only " + f"{actual_count} are visible" + ) + + +def get_dist_info(): + if dist.is_available() and dist.is_initialized(): + return dist.get_rank(), dist.get_world_size() + return 0, 1 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="AutoRound Quantization with Multi-GPU per Group DDP" + ) + parser.add_argument( + "--model", + type=str, + default="Qwen/Qwen3-8B", + help="Model name or path", + ) + parser.add_argument( + "--gpus-per-group", + type=int, + default=2, + help="Number of GPUs per rank-local group for block sharding (default: 2)", + ) + parser.add_argument( + "--scheme", + type=str, + default="W4A16", + help="Quantization scheme (W4A16, MXFP8, MXFP4, etc.)", + ) + parser.add_argument("--iters", type=int, default=200, help="Number of iterations") + parser.add_argument("--nsamples", type=int, default=128, help="Number of samples") + parser.add_argument( + "--disable_torch_compile", + action="store_true", + help="Disable torch.compile for model acceleration during quantization", + ) + parser.add_argument( + "--deterministic", + action="store_true", + help="Enable deterministic mode for reproducibility", + ) + args = parser.parse_args() + + if args.deterministic: + config_deterministic() + + model_id = args.model + + ###### MULTI-GPU DDP INIT ##### + init_dist_multi_gpu(gpus_per_group=args.gpus_per_group) + # For multi-GPU-per-group AutoRound, keep the base model anchored on the + # rank-local primary GPU and let AutoRound auto-dispatch each block within + # the local GPU group during tuning. Pre-sharding the loaded model across + # the group can leave residual modules and cached activations on different + # local GPUs before AutoRound takes over. + load_device_map = "auto" + if args.gpus_per_group > 1: + load_device_map = {"": torch.device("cuda:0")} + with load_offloaded_model(): + model = AutoModelForCausalLM.from_pretrained( + model_id, dtype="auto", device_map=load_device_map + ) + ############################### + + tokenizer = AutoTokenizer.from_pretrained(model_id) + + NUM_CALIBRATION_SAMPLES = args.nsamples + MAX_SEQUENCE_LENGTH = 2048 + ITERS = args.iters + + # Get aligned calibration dataset. + from auto_round.calib_dataset import get_dataset # noqa: E402 + + # Note: Make sure model are loaded before importing auto-round related code. + from llmcompressor.modifiers.autoround import AutoRoundModifier # noqa: E402 + + ds = get_dataset( + tokenizer=tokenizer, + seqlen=MAX_SEQUENCE_LENGTH, + nsamples=NUM_CALIBRATION_SAMPLES, + ) + + # Configure the quantization algorithm. + recipe = AutoRoundModifier( + targets="Linear", + scheme=args.scheme, + ignore=[ + "lm_head", + "re:.*mlp.gate$", + ], + iters=ITERS, + enable_torch_compile=not args.disable_torch_compile, + ) + + # Apply algorithms. + oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + shuffle_calibration_samples=False, + ) + + rank, world_size = get_dist_info() + logger.info(f"[Rank {rank}] Quantization completed") + + if dist.is_available() and dist.is_initialized(): + dist.barrier() + dist.destroy_process_group() + + if rank != 0: + sys.exit(0) + + if rank == 0: + # Confirm generations of the quantized model look sane. + logger.info("\n\n") + logger.info("========== SAMPLE GENERATION ==============") + dispatch_model(model) + sample = tokenizer("Hello my name is", return_tensors="pt") + sample_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + sample = {key: value.to(sample_device) for key, value in sample.items()} + output = model.generate(**sample, max_new_tokens=100) + logger.info(tokenizer.decode(output[0])) + logger.info("==========================================\n\n") + + logger.info("Saving...") + SAVE_DIR = ( + model_id.rstrip("/").split("/")[-1] + + f"-{args.scheme}-AutoRound" + + f"-iters{args.iters}-nsamples{args.nsamples}" + + "-MultiGPUDDP" + + str(world_size) + ) + model.save_pretrained(SAVE_DIR, save_compressed=True) + tokenizer.save_pretrained(SAVE_DIR) + logger.info(f"Saved to {SAVE_DIR}") diff --git a/examples/autoround/ddp/launch_multi_gpu.sh b/examples/autoround/ddp/launch_multi_gpu.sh new file mode 100755 index 0000000000..704d954400 --- /dev/null +++ b/examples/autoround/ddp/launch_multi_gpu.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Launch multi-GPU per group DDP training. +# +# Partitions physical GPUs into groups, one group per process/rank. +# Each rank sees its own set of GPUs via CUDA_VISIBLE_DEVICES. +# +# Usage: +# GPUS_PER_GROUP=2 ./launch_multi_gpu.sh ddp_qwen3_multi_gpu_example.py --model ... --scheme W4A16 +# +# This spawns 2 ranks, each with 2 GPUs (4 GPUs total). +# The Python script no longer needs to override CUDA_VISIBLE_DEVICES. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +GPUS_PER_GROUP=${GPUS_PER_GROUP:-${GPUS_PER_RANK:-2}} +NPROC=${NPROC:-2} # number of ranks +PYTHON=${PYTHON:-/home/yiliu7/workspace/venvs/ar/bin/python} +MASTER_PORT=${MASTER_PORT:-29500} +MASTER_ADDR=${MASTER_ADDR:-localhost} +NNODES=${NNODES:-1} +NODE_RANK=${NODE_RANK:-0} + +SCRIPT="$1" +shift + +echo "Launching $NPROC ranks, $GPUS_PER_GROUP GPUs each" +echo "Python: $PYTHON" +echo "Script: $SCRIPT" + +VISIBLE_GPUS_ENV=${CUDA_VISIBLE_DEVICES:-} +if [[ -n "$VISIBLE_GPUS_ENV" ]]; then + IFS=',' read -r -a VISIBLE_GPUS <<< "$VISIBLE_GPUS_ENV" +else + VISIBLE_GPUS=() +fi + +TOTAL_GPUS_NEEDED=$((NPROC * GPUS_PER_GROUP)) +if [[ ${#VISIBLE_GPUS[@]} -gt 0 && ${#VISIBLE_GPUS[@]} -ne $TOTAL_GPUS_NEEDED ]]; then + echo "Expected $TOTAL_GPUS_NEEDED GPUs in CUDA_VISIBLE_DEVICES, got ${#VISIBLE_GPUS[@]}: $VISIBLE_GPUS_ENV" >&2 + exit 1 +fi + +pids=() +for RANK in $(seq 0 $((NPROC - 1))); do + if [[ ${#VISIBLE_GPUS[@]} -gt 0 ]]; then + GPU_OFFSET=$((RANK * GPUS_PER_GROUP)) + GPU_LIST=$(IFS=,; echo "${VISIBLE_GPUS[*]:$GPU_OFFSET:$GPUS_PER_GROUP}") + else + GPU_START=$((NODE_RANK * NPROC * GPUS_PER_GROUP + RANK * GPUS_PER_GROUP)) + GPU_END=$((GPU_START + GPUS_PER_GROUP - 1)) + GPU_LIST=$(seq -s, $GPU_START $GPU_END) + fi + echo " Rank $RANK -> GPUs $GPU_LIST" + + CUDA_VISIBLE_DEVICES="$GPU_LIST" \ + AR_DISABLE_DATASET_SUBPROCESS=1 \ + LOCAL_RANK=0 \ + RANK=$((NODE_RANK * NPROC + RANK)) \ + WORLD_SIZE=$((NNODES * NPROC)) \ + MASTER_ADDR="$MASTER_ADDR" \ + MASTER_PORT="$MASTER_PORT" \ + TORCHELASTIC_RUN_ID="multi_gpu_$(date +%s)_$$" \ + GPUS_PER_GROUP="$GPUS_PER_GROUP" \ + "$PYTHON" -u "$SCRIPT_DIR/$SCRIPT" "$@" & + + pids+=($!) + # Small delay so workers don't race for port binding + sleep 0.5 +done + +# Wait for all processes +status=0 +for pid in "${pids[@]}"; do + if ! wait "$pid"; then + status=1 + fi +done +exit $status diff --git a/examples/autoround/ddp/reproduce.md b/examples/autoround/ddp/reproduce.md new file mode 100644 index 0000000000..234e002d48 --- /dev/null +++ b/examples/autoround/ddp/reproduce.md @@ -0,0 +1,53 @@ +# Multi-GPU DDP AutoRound Reproduce + +## Command + +```bash +cd /home/yiliu7/workspace/llm-compressor + +AR_DISABLE_DATASET_SUBPROCESS=1 CUDA_VISIBLE_DEVICES=0,1,6,7 GPUS_PER_GROUP=2 NPROC=2 MASTER_PORT=29501 \ + bash examples/autoround/ddp/launch_multi_gpu.sh \ + ddp_qwen3_multi_gpu_example.py \ + --model /storage/yiliu7/Qwen/Qwen3-8B \ + --gpus-per-group 2 \ + --scheme W4A16 \ + --nsamples 32 --iters 50 \ + > /tmp/multi_gpu_test.log 2>&1 & +``` + +## Monitor + +```bash +# Check progress +tail -f /tmp/multi_gpu_test.log +# Check processes +ps aux | grep ddp_qwen3_multi | grep -v grep +# Check GPU usage +nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader +# Kill +pkill -f ddp_qwen3_multi_gpu_example +``` + +## Current State + +- ✅ 4 code changes implemented (launch_multi_gpu.sh, base.py, distributed.py, quantizer.py) +- ✅ Model loading works with `device_map="auto"` (dispatch 547/547 in <1s) +- ✅ GPU partitioning works (rank 0 → GPUs 0,1; rank 1 → GPUs 2,3) +- 🔄 **Hang** after "Disabling tokenizer parallelism" warning — inside `get_dataset()` + - `AR_DISABLE_DATASET_SUBPROCESS=1` avoids the fork issue + - Dataset is cached, not downloading + - Both processes at ~100% CPU but no progress + +## Key Files + +| File | Change | +|------|--------| +| `examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py` | NEW — multi-GPU DDP example | +| `examples/autoround/ddp/launch_multi_gpu.sh` | NEW — bash wrapper for GPU partitioning | +| `src/llmcompressor/modifiers/autoround/base.py` | `_update_device_map_for_dp` + auto_offload gate use `GPUS_PER_GROUP` | +| `auto_round/utils/distributed.py` | `setup_ddp_if_needed_` returns `(block, sync_fn)` | +| `auto_round/algorithms/quantization/sign_round/quantizer.py` | Captures return, calls `sync_gradients()` before `_step()` | + +## Venv + +Python: `/home/yiliu7/workspace/venvs/ar/bin/python` diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index dd875b7887..7e1738cf98 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -1,4 +1,5 @@ from contextlib import contextmanager +import os import torch import torch.nn as nn @@ -8,6 +9,7 @@ from auto_round.utils import check_to_quantized from auto_round.wrapper import WrapperWALayer from compressed_tensors.offload import get_execution_device, get_offloaded_device +from compressed_tensors.offload.cache.base import OffloadCache from compressed_tensors.offload.module import offload_module, remove_module_offload from compressed_tensors.quantization import ( QuantizationMetadata, @@ -29,6 +31,15 @@ __all__ = ["AutoRoundModifier"] +def _get_local_gpu_group_size() -> int: + return int( + os.environ.get( + "GPUS_PER_GROUP", + os.environ.get("GPUS_PER_RANK", "1"), + ) + ) + + class _LLModelWrapper(torch.nn.Module): def __init__(self): super().__init__() @@ -64,7 +75,7 @@ def suspend_offloading(model: nn.Module): """ offloading_info = dict() for name, module in model.named_modules(): - if not hasattr(module, "weight"): # skip SiLU or other non-weight layers + if not isinstance(module._parameters, OffloadCache): continue offloading_info[name] = ( get_execution_device(module), @@ -75,7 +86,7 @@ def suspend_offloading(model: nn.Module): yield for name, module in model.named_modules(): - if not hasattr(module, "weight"): # skip SiLU or other non-weight layers + if name not in offloading_info: continue offload_module(module, *offloading_info[name]) @@ -273,6 +284,15 @@ def apply_autoround(self, state, modules): "ignore_layers": ",".join(ignore_layers) if ignore_layers else "", "disable_opt_rtn": self.disable_opt_rtn, } + if torch.distributed.is_initialized(): + gpus_per_group = _get_local_gpu_group_size() + if gpus_per_group > 1 and kwargs["enable_torch_compile"]: + logger.warning( + "Disabling torch.compile for AutoRound multi-GPU group DDP " + "because compiled block execution does not support " + "cross-device sharding." + ) + kwargs["enable_torch_compile"] = False llmc_registered_qparams = self._preprocess_qparams(decoding_layer) with ( @@ -292,11 +312,23 @@ def apply_autoround(self, state, modules): device = first_param.device cur_inputs = self._all_module_input[decoding_layer._tmp_name] decoding_layer.tuning_device = device - # Leave offload for LLMC to handle if `device_ids` is not set + # Enable auto_offload when device_ids is explicitly set OR when + # GPUS_PER_GROUP > 1 (set by launch_multi_gpu.sh). + # This lets AutoRound load-balance the block's submodules + # across multiple GPUs within the rank. auto_offload = False - if self.device_ids is not None: - # When device_ids is set, we move decoding layer to CPU first, - # then the submodules will be re-dispatched by AutoRound. + needs_multi_gpu = ( + self.device_ids is not None or _get_local_gpu_group_size() > 1 + ) + if needs_multi_gpu: + # Let AutoRound own placement within the rank-local GPU group. + # The incoming block may already be split across local devices, + # so anchoring to first_param.device can place residual modules + # (e.g. norms) on local cuda:1 while hidden states begin on + # local cuda:0, causing cross-device forward failures. + device = torch.device("cuda:0") + # Move decoding layer to CPU first, then the submodules + # will be re-dispatched by AutoRound. decoding_layer.to("cpu") auto_offload = True @@ -352,12 +384,22 @@ def get_unquantized_layer_names(self, wrapped_model: torch.nn.Module) -> list[st def _update_device_map_for_dp(self, ar_kwargs): if torch.distributed.is_initialized(): - rank = torch.distributed.get_rank() - ar_kwargs["device_map"] = ( - f"{torch.accelerator.current_accelerator().type}:{rank}" - if torch.accelerator.is_available() - else "cpu" - ) + if self.device_ids is not None: + return # user explicitly set device_ids, respect it + gpus_per_group = _get_local_gpu_group_size() + if gpus_per_group > 1: + # Multi-GPU per group: pass comma-separated local GPU indices + # so AutoRound can load-balance submodules across GPUs. + # The group size is set by the launch_multi_gpu.sh wrapper. + ar_kwargs["device_map"] = ",".join( + str(i) for i in range(gpus_per_group) + ) + else: + ar_kwargs["device_map"] = ( + f"{torch.accelerator.current_accelerator().type}:0" + if torch.accelerator.is_available() + else "cpu" + ) def _unwrapper_quantized_layer(self, model: torch.nn.Module): # auto-round will return WrapperWALayer if activation is quantized diff --git a/src/llmcompressor/utils/dist.py b/src/llmcompressor/utils/dist.py index c4a04d42eb..a1f75af804 100644 --- a/src/llmcompressor/utils/dist.py +++ b/src/llmcompressor/utils/dist.py @@ -1,11 +1,17 @@ from typing import Hashable, TypeVar -from compressed_tensors.distributed import ( - greedy_bin_packing as _greedy_bin_packing, -) -from compressed_tensors.distributed import ( - wait_for_comms as _wait_for_comms, -) +try: + from compressed_tensors.distributed import ( + greedy_bin_packing as _greedy_bin_packing, + ) + from compressed_tensors.distributed import ( + wait_for_comms as _wait_for_comms, + ) +except ImportError: + # compressed_tensors<0.16 does not have the distributed submodule + _greedy_bin_packing = None + _wait_for_comms = None + from compressed_tensors.utils.helpers import deprecated T = TypeVar("T", bound=Hashable) @@ -29,6 +35,11 @@ def greedy_bin_packing(*args, **kwargs) -> tuple[list[T], list[list[T]], dict[T, the list of items assigned to that bin. - item_to_bin: mapping from each item to its assigned bin index. """ + if _greedy_bin_packing is None: + raise ImportError( + "greedy_bin_packing requires compressed-tensors>=0.16 " + "(distributed submodule not found)" + ) return _greedy_bin_packing(*args, **kwargs) @@ -44,4 +55,9 @@ def wait_for_comms(*args, **kwargs) -> None: ``async_op=True``). The list is cleared after all operations have completed. """ + if _wait_for_comms is None: + raise ImportError( + "wait_for_comms requires compressed-tensors>=0.16 " + "(distributed submodule not found)" + ) return _wait_for_comms(*args, **kwargs) From ae806055505c002a4c93bca3403e8d9d7d138234 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 15 Jun 2026 12:54:25 +0000 Subject: [PATCH 02/22] Fix AutoRound DDP hang: disable onloading during quant param init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When DDP is initialized before model loading, OffloadCache.cls_from_device selects distributed cache variants. Each register_parameter inside initialize_module_for_quantization triggers offload() which does dist.broadcast + barrier. For large MoE models (e.g. Qwen3-235B) with 100K+ Linear layers x 6 quant params, this means 600K+ collective ops — effectively hanging. Fix: wrap initialize_module_for_quantization in disable_onloading() so new params are stored directly without triggering distributed offload. Verified on Qwen3-235B-A22B: apply_quantization_config dropped from hanging to ~4.3 min. --- .../ddp/ddp_qwen3_multi_gpu_example.py | 479 +++++++++++++++++- examples/autoround/ddp/launch_multi_gpu.sh | 3 +- 2 files changed, 458 insertions(+), 24 deletions(-) diff --git a/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py b/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py index 350f932ee4..3b15e6e5d7 100644 --- a/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py +++ b/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py @@ -17,18 +17,26 @@ """ import argparse +import importlib import os import sys +import time +from pathlib import Path +import psutil import torch import torch.distributed as dist -from compressed_tensors.offload import dispatch_model, load_offloaded_model +from compressed_tensors.offload import dispatch_model, from_accelerate, load_offloaded_model from loguru import logger from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot +class StopAfterBlocks(RuntimeError): + pass + + def fix_everything(seed=42): import random @@ -94,6 +102,394 @@ def get_dist_info(): return 0, 1 +def _module_has_direct_tensors(module: torch.nn.Module) -> bool: + return any(t is not None for t in module._parameters.values()) or any( + t is not None for t in module._buffers.values() + ) + + +def _module_has_meta_tensors(module: torch.nn.Module) -> bool: + return any( + t is not None and t.device.type == "meta" + for t in module._parameters.values() + ) or any(t is not None and t.device.type == "meta" for t in module._buffers.values()) + + +def patch_ct_dispatch_for_sparse_offload(): + """Avoid wrapping modules that do not need compressed-tensors offload hooks.""" + dispatch_mod = importlib.import_module("compressed_tensors.offload.dispatch") + fa_mod = importlib.import_module("compressed_tensors.offload.convert.from_accelerate") + + if getattr(dispatch_mod.dispatch_with_map, "_llmc_sparse_patch", False): + return + + offload_module = dispatch_mod.offload_module + tqdm = dispatch_mod.tqdm + + def optimized_dispatch_with_map( + model: torch.nn.Module, + device_map, + offload_dir: str | None = None, + show_progress: bool = True, + ): + filtered = [] + skipped_noop = 0 + skipped_empty = 0 + skipped_cpu_cpu = 0 + kept_meta_materialization = 0 + + for name, (onload_device, offload_device) in device_map.items(): + if offload_device is None: + skipped_noop += 1 + continue + + module = model.get_submodule(name) + if not _module_has_direct_tensors(module): + skipped_empty += 1 + continue + + if ( + str(onload_device) == "cpu" + and str(offload_device) == "cpu" + and not _module_has_meta_tensors(module) + ): + skipped_cpu_cpu += 1 + continue + + if str(onload_device) == "cpu" and str(offload_device) == "cpu": + kept_meta_materialization += 1 + + filtered.append((name, onload_device, offload_device)) + + logger.info( + "Compressed-tensors dispatch filtered {} -> {} modules " + "(noop={}, empty={}, cpu_to_cpu_skipped={}, cpu_to_cpu_meta_kept={})", + len(device_map), + len(filtered), + skipped_noop, + skipped_empty, + skipped_cpu_cpu, + kept_meta_materialization, + ) + + for name, onload_device, offload_device in tqdm( + filtered, + desc="Dispatching model", + disable=(not show_progress), + ): + module = model.get_submodule(name) + if offload_device == "disk": + offload_module( + module, + onload_device, + offload_device, + offload_dir=offload_dir, + ) + else: + offload_module(module, onload_device, offload_device) + + optimized_dispatch_with_map._llmc_sparse_patch = True + dispatch_mod.dispatch_with_map = optimized_dispatch_with_map + fa_mod.dispatch_with_map = optimized_dispatch_with_map + + +def _rank_offload_folder(base_folder: str | None) -> str | None: + if not base_folder: + return None + + rank, _ = get_dist_info() + rank_folder = Path(base_folder) / f"rank{rank}" + rank_folder.mkdir(parents=True, exist_ok=True) + return str(rank_folder) + + +def _independent_cpu_max_memory(extra_cpu_mem: int = int(5e9)) -> dict[str, int]: + _, world_size = get_dist_info() + per_rank_available = psutil.virtual_memory().available // max(world_size, 1) + return {"cpu": max(per_rank_available - extra_cpu_mem, int(8e9))} + + +def load_model_with_local_offload(model_id: str, offload_folder: str | None): + """Load model on each rank independently, then convert accelerate offload locally.""" + load_kwargs = { + "dtype": "auto", + "device_map": "auto", + "max_memory": _independent_cpu_max_memory(), + } + rank_offload_folder = _rank_offload_folder(offload_folder) + if rank_offload_folder: + load_kwargs["offload_folder"] = rank_offload_folder + + logger.info( + "[Rank {}] Loading model independently with max_memory={} offload_folder={}", + get_dist_info()[0], + load_kwargs["max_memory"], + rank_offload_folder, + ) + model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs) + if hasattr(model, "hf_device_map"): + from_accelerate(model) + return model + + +def patch_disable_onloading_for_quant_init(): + """Avoid expensive dist.broadcast + barrier for every new quant param. + + When DDP is initialized before model loading, + ``OffloadCache.cls_from_device`` selects distributed cache variants + (DistributedCPUCache / DistributedDiskCache). Each call to + ``register_parameter`` inside ``initialize_module_for_quantization`` + triggers ``offload()``, which does a collective broadcast + barrier. + For large MoE models (e.g. Qwen3-235B with 100K+ Linear layers × 6 + quant params), this means hundreds of thousands of round-trips — + effectively hanging the process. + + Wrapping the body in ``disable_onloading()`` stores new parameters + directly in ``offloaded_values`` without invoking the distributed + offload, cutting the overhead to zero. + """ + from compressed_tensors.offload import ( # noqa: F811 + disable_onloading, + ) + + lifecycle_init_mod = importlib.import_module( + "compressed_tensors.quantization.lifecycle.initialize" + ) + original_fn = lifecycle_init_mod.initialize_module_for_quantization + + if getattr(original_fn, "_llmc_no_dist_offload_patch", False): + return + + def patched_initialize_module_for_quantization(module, scheme=None, force_zero_point=True): + with disable_onloading(): + return original_fn(module, scheme=scheme, force_zero_point=force_zero_point) + + patched_initialize_module_for_quantization._llmc_no_dist_offload_patch = True + lifecycle_init_mod.initialize_module_for_quantization = ( + patched_initialize_module_for_quantization + ) + + +def patch_autoround_stop_after_blocks(max_blocks: int): + """Raise after N decoding blocks finish so large-model smoke tests can stop cleanly.""" + autoround_mod = importlib.import_module("llmcompressor.modifiers.autoround.base") + modifier_cls = autoround_mod.AutoRoundModifier + + if getattr(modifier_cls.apply_autoround, "_llmc_stop_after_patch", False): + return + + original_apply_autoround = modifier_cls.apply_autoround + + def wrapped_apply_autoround(self, state, modules): + modules = modules or [] + if not any(self._is_decoding_layer(module) for module in modules): + return original_apply_autoround(self, state, modules) + + result = original_apply_autoround(self, state, modules) + completed = getattr(self, "_llmc_completed_blocks", 0) + 1 + self._llmc_completed_blocks = completed + logger.info( + "[Rank {}] Completed AutoRound block {}/{}", + get_dist_info()[0], + completed, + max_blocks, + ) + if completed >= max_blocks: + raise StopAfterBlocks(f"Stopped after {completed} blocks") + return result + + wrapped_apply_autoround._llmc_stop_after_patch = True + modifier_cls.apply_autoround = wrapped_apply_autoround + + +def patch_llmc_timing_logs(): + """Add coarse timing logs around the expensive LLMC startup stages.""" + recipe_mod = importlib.import_module("llmcompressor.recipe.recipe") + lifecycle_mod = importlib.import_module("llmcompressor.core.lifecycle") + quant_mixin_mod = importlib.import_module( + "llmcompressor.modifiers.quantization.quantization.mixin" + ) + quantization_base_mod = importlib.import_module( + "compressed_tensors.quantization" + ) + module_utils_mod = importlib.import_module("compressed_tensors.utils") + group_validation_mod = importlib.import_module( + "llmcompressor.modifiers.quantization.group_size_validation" + ) + seq_helpers_mod = importlib.import_module("llmcompressor.pipelines.sequential.helpers") + seq_pipeline_mod = importlib.import_module("llmcompressor.pipelines.sequential.pipeline") + cache_mod = importlib.import_module("llmcompressor.pipelines.cache") + autoround_mod = importlib.import_module("llmcompressor.modifiers.autoround.base") + core_mod = importlib.import_module("llmcompressor.core") + + recipe_cls = recipe_mod.Recipe + lifecycle_cls = lifecycle_mod.CompressionLifecycle + quant_mixin_cls = quant_mixin_mod.QuantizationMixin + cache_cls = cache_mod.IntermediatesCache + autoround_cls = autoround_mod.AutoRoundModifier + seq_pipeline_cls = seq_pipeline_mod.SequentialPipeline + lifecycle_callbacks = core_mod.LifecycleCallbacks + + if getattr(recipe_cls.from_modifiers, "_llmc_timing_patch", False): + return + + original_from_modifiers = recipe_cls.from_modifiers.__func__ + original_lifecycle_initialize = lifecycle_cls.initialize + original_initialize_quantization = quant_mixin_cls.initialize_quantization + original_start_calibration = autoround_cls.start_calibration + original_trace_subgraphs = seq_helpers_mod.trace_subgraphs + original_from_dataloader = cache_cls.from_dataloader.__func__ + original_apply_autoround = autoround_cls.apply_autoround + original_seq_call = seq_pipeline_cls.__call__ + original_calib_epoch_start = lifecycle_callbacks.calibration_epoch_start + original_match_named_modules = module_utils_mod.match_named_modules + original_apply_quantization_config = quantization_base_mod.apply_quantization_config + original_validate_group_size_divisibility = ( + group_validation_mod.validate_group_size_divisibility + ) + + def _timed(label, fn, *args, **kwargs): + start = time.perf_counter() + logger.info("[Rank {}] {} started", get_dist_info()[0], label) + try: + return fn(*args, **kwargs) + finally: + logger.info( + "[Rank {}] {} finished in {:.2f}s", + get_dist_info()[0], + label, + time.perf_counter() - start, + ) + + @classmethod + def timed_from_modifiers(cls, modifiers, modifier_group_name=None): + return _timed( + "Recipe.from_modifiers", + original_from_modifiers, + cls, + modifiers, + modifier_group_name, + ) + + def timed_lifecycle_initialize(self, *args, **kwargs): + return _timed( + "CompressionLifecycle.initialize", + original_lifecycle_initialize, + self, + *args, + **kwargs, + ) + + def timed_initialize_quantization(self, model): + return _timed( + "QuantizationMixin.initialize_quantization", + original_initialize_quantization, + self, + model, + ) + + def timed_start_calibration(self, model): + return _timed( + "AutoRoundModifier.start_calibration", + original_start_calibration, + self, + model, + ) + + def timed_trace_subgraphs(*args, **kwargs): + return _timed("trace_subgraphs", original_trace_subgraphs, *args, **kwargs) + + @classmethod + def timed_from_dataloader(cls, *args, **kwargs): + return _timed( + "IntermediatesCache.from_dataloader", + original_from_dataloader, + cls, + *args, + **kwargs, + ) + + def timed_apply_autoround(self, state, modules): + modules = modules or [] + decoding_layers = [m for m in modules if self._is_decoding_layer(m)] + if not decoding_layers: + return original_apply_autoround(self, state, modules) + layer_name = getattr(decoding_layers[0], "_tmp_name", decoding_layers[0].__class__.__name__) + return _timed( + f"AutoRoundModifier.apply_autoround({layer_name})", + original_apply_autoround, + self, + state, + modules, + ) + + def timed_seq_call(model, dataloader, dataset_args): + pipeline_start = time.perf_counter() + logger.info("[Rank {}] SequentialPipeline.__call__ started", get_dist_info()[0]) + try: + logger.info("[Rank {}] SequentialPipeline pre-next(iter(dataloader))", get_dist_info()[0]) + iter_start = time.perf_counter() + sample_input = next(iter(dataloader)) + logger.info( + "[Rank {}] next(iter(dataloader)) finished in {:.2f}s", + get_dist_info()[0], + time.perf_counter() - iter_start, + ) + del sample_input + return original_seq_call(model, dataloader, dataset_args) + finally: + logger.info( + "[Rank {}] SequentialPipeline.__call__ finished in {:.2f}s", + get_dist_info()[0], + time.perf_counter() - pipeline_start, + ) + + def timed_calib_epoch_start(*args, **kwargs): + return _timed( + "LifecycleCallbacks.calibration_epoch_start", + original_calib_epoch_start, + *args, + **kwargs, + ) + + def timed_match_named_modules(*args, **kwargs): + return _timed("match_named_modules", original_match_named_modules, *args, **kwargs) + + def timed_apply_quantization_config(*args, **kwargs): + return _timed( + "apply_quantization_config", + original_apply_quantization_config, + *args, + **kwargs, + ) + + def timed_validate_group_size_divisibility(*args, **kwargs): + return _timed( + "validate_group_size_divisibility", + original_validate_group_size_divisibility, + *args, + **kwargs, + ) + + timed_from_modifiers._llmc_timing_patch = True + recipe_cls.from_modifiers = timed_from_modifiers + lifecycle_cls.initialize = timed_lifecycle_initialize + quant_mixin_cls.initialize_quantization = timed_initialize_quantization + autoround_cls.start_calibration = timed_start_calibration + module_utils_mod.match_named_modules = timed_match_named_modules + quant_mixin_mod.match_named_modules = timed_match_named_modules + quantization_base_mod.apply_quantization_config = timed_apply_quantization_config + quant_mixin_mod.apply_quantization_config = timed_apply_quantization_config + group_validation_mod.validate_group_size_divisibility = timed_validate_group_size_divisibility + quant_mixin_mod.validate_group_size_divisibility = timed_validate_group_size_divisibility + seq_helpers_mod.trace_subgraphs = timed_trace_subgraphs + seq_pipeline_mod.trace_subgraphs = timed_trace_subgraphs + cache_cls.from_dataloader = timed_from_dataloader + autoround_cls.apply_autoround = timed_apply_autoround + seq_pipeline_cls.__call__ = staticmethod(timed_seq_call) + lifecycle_callbacks.calibration_epoch_start = timed_calib_epoch_start + + if __name__ == "__main__": parser = argparse.ArgumentParser( description="AutoRound Quantization with Multi-GPU per Group DDP" @@ -116,7 +512,7 @@ def get_dist_info(): default="W4A16", help="Quantization scheme (W4A16, MXFP8, MXFP4, etc.)", ) - parser.add_argument("--iters", type=int, default=200, help="Number of iterations") + parser.add_argument("--iters", type=int, default=20, help="Number of iterations") parser.add_argument("--nsamples", type=int, default=128, help="Number of samples") parser.add_argument( "--disable_torch_compile", @@ -128,6 +524,18 @@ def get_dist_info(): action="store_true", help="Enable deterministic mode for reproducibility", ) + parser.add_argument( + "--offload-folder", + type=str, + default=None, + help="Optional folder for disk offload while loading very large models", + ) + parser.add_argument( + "--max-blocks", + type=int, + default=None, + help="Optional number of decoder blocks to quantize before exiting", + ) args = parser.parse_args() if args.deterministic: @@ -137,18 +545,32 @@ def get_dist_info(): ###### MULTI-GPU DDP INIT ##### init_dist_multi_gpu(gpus_per_group=args.gpus_per_group) - # For multi-GPU-per-group AutoRound, keep the base model anchored on the - # rank-local primary GPU and let AutoRound auto-dispatch each block within - # the local GPU group during tuning. Pre-sharding the loaded model across - # the group can leave residual modules and cached activations on different - # local GPUs before AutoRound takes over. - load_device_map = "auto" - if args.gpus_per_group > 1: - load_device_map = {"": torch.device("cuda:0")} - with load_offloaded_model(): - model = AutoModelForCausalLM.from_pretrained( - model_id, dtype="auto", device_map=load_device_map - ) + patch_ct_dispatch_for_sparse_offload() + patch_llmc_timing_logs() + patch_disable_onloading_for_quant_init() + if args.max_blocks is not None: + patch_autoround_stop_after_blocks(args.max_blocks) + # Load onto CPU first and spill to disk if needed. AutoRound will then + # onload and shard each block onto the rank-local GPU group during tuning. + load_start = time.perf_counter() + rank, world_size = get_dist_info() + if world_size > 1: + model = load_model_with_local_offload(model_id, args.offload_folder) + else: + load_kwargs = { + "dtype": "auto", + "device_map": "auto_offload", + } + rank_offload_folder = _rank_offload_folder(args.offload_folder) + if rank_offload_folder: + load_kwargs["offload_folder"] = rank_offload_folder + with load_offloaded_model(): + model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs) + logger.info( + "[Rank {}] Model load + offload conversion finished in {:.2f}s", + rank, + time.perf_counter() - load_start, + ) ############################### tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -182,17 +604,25 @@ def get_dist_info(): ) # Apply algorithms. - oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - shuffle_calibration_samples=False, - ) + stopped_early = False + try: + oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + shuffle_calibration_samples=False, + ) + except StopAfterBlocks as exc: + stopped_early = True + logger.info("[Rank {}] {}", get_dist_info()[0], str(exc)) rank, world_size = get_dist_info() - logger.info(f"[Rank {rank}] Quantization completed") + if stopped_early: + logger.info(f"[Rank {rank}] Partial quantization completed") + else: + logger.info(f"[Rank {rank}] Quantization completed") if dist.is_available() and dist.is_initialized(): dist.barrier() @@ -201,6 +631,9 @@ def get_dist_info(): if rank != 0: sys.exit(0) + if stopped_early: + sys.exit(0) + if rank == 0: # Confirm generations of the quantized model look sane. logger.info("\n\n") diff --git a/examples/autoround/ddp/launch_multi_gpu.sh b/examples/autoround/ddp/launch_multi_gpu.sh index 704d954400..14e40c9a78 100755 --- a/examples/autoround/ddp/launch_multi_gpu.sh +++ b/examples/autoround/ddp/launch_multi_gpu.sh @@ -6,6 +6,7 @@ # # Usage: # GPUS_PER_GROUP=2 ./launch_multi_gpu.sh ddp_qwen3_multi_gpu_example.py --model ... --scheme W4A16 +# GPUS_PER_GROUP=2 ./launch_multi_gpu.sh ddp_qwen3_multi_gpu_example.py --model /storage/yiliu7/Qwen/Qwen3-30B-A3B-Instruct-2507 --scheme W4A16 # # This spawns 2 ranks, each with 2 GPUs (4 GPUs total). # The Python script no longer needs to override CUDA_VISIBLE_DEVICES. @@ -15,7 +16,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" GPUS_PER_GROUP=${GPUS_PER_GROUP:-${GPUS_PER_RANK:-2}} NPROC=${NPROC:-2} # number of ranks PYTHON=${PYTHON:-/home/yiliu7/workspace/venvs/ar/bin/python} -MASTER_PORT=${MASTER_PORT:-29500} +MASTER_PORT=${MASTER_PORT:-29600} MASTER_ADDR=${MASTER_ADDR:-localhost} NNODES=${NNODES:-1} NODE_RANK=${NODE_RANK:-0} From a28e7fcbc310e8d5d32de25f65db61a3e3574402 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sat, 20 Jun 2026 14:24:20 +0000 Subject: [PATCH 03/22] update Signed-off-by: yiliu30 --- examples/autoround/ddp/DDP_FIXES.md | 237 ++++++++++++++++++ examples/autoround/ddp/ddp_autoround.py | 187 ++++++++++++++ .../ddp/ddp_qwen3_multi_gpu_example.py | 8 +- examples/autoround/ddp/reproduce.md | 88 +++++-- src/llmcompressor/modifiers/autoround/base.py | 10 +- .../pipelines/sequential/helpers.py | 4 - src/llmcompressor/utils/dev.py | 2 +- 7 files changed, 501 insertions(+), 35 deletions(-) create mode 100644 examples/autoround/ddp/DDP_FIXES.md create mode 100644 examples/autoround/ddp/ddp_autoround.py diff --git a/examples/autoround/ddp/DDP_FIXES.md b/examples/autoround/ddp/DDP_FIXES.md new file mode 100644 index 0000000000..50e9e7352d --- /dev/null +++ b/examples/autoround/ddp/DDP_FIXES.md @@ -0,0 +1,237 @@ +# DDP Multi-GPU AutoRound Fixes for Large MoE Models + +## Problem + +Running AutoRound quantization with DDP on large MoE models (e.g., Qwen3-235B) would hang or take hours due to `DistributedCPUCache` performing a `dist.broadcast_object_list()` + `dist.barrier()` **per parameter** during offload operations (~218ms × 45K params = ~163 minutes). + +## Root Cause + +When `dist.is_initialized()`, `OffloadCache.cls_from_device("cpu")` returns `DistributedCPUCache` instead of `CPUCache`. This cache broadcasts every tensor to all ranks — unnecessary when each rank loads the model independently via safetensors mmap. + +The bottleneck hits in two places: +1. `from_accelerate()` → `dispatch_with_map()` +2. `set_onload_device()` in SequentialPipeline + +## Fixes Applied + +### Fix 1: `src/llmcompressor/utils/dev.py` — `get_main_device()` + +**Bug**: Used `rank` as the CUDA device index, which is wrong when `GPUS_PER_GROUP > 1`. +**Fix**: Use `torch.accelerator.current_device_index()` which respects `torch.cuda.set_device()`. + +```python +# Before (line 140): +return torch.device(accel_type, rank) + +# After: +return torch.device(accel_type, torch.accelerator.current_device_index()) +``` + +### Fix 2: `src/llmcompressor/modifiers/autoround/base.py` — anchor device in `apply_autoround` + +**Bug**: Hardcoded `device = torch.device("cuda:0")` when `needs_multi_gpu` is true. Rank 1 with GPUs [2,3] would try to anchor on cuda:0 instead of cuda:2. +**Fix**: Use `get_main_device()` which returns the correct per-rank device. + +```python +# Before (line ~329): +device = torch.device("cuda:0") + +# After: +from llmcompressor.utils.dev import get_main_device +device = get_main_device() +``` + +### Fix 3: `src/llmcompressor/modifiers/autoround/base.py` — GPU partition in `_update_device_map_for_dp` + +**Bug**: Generated `"0,1"` for all ranks instead of per-rank GPU partitions. +**Fix**: Offset by `local_rank * gpus_per_group`. + +```python +# Before: +ar_kwargs["device_map"] = ",".join(str(i) for i in range(gpus_per_group)) + +# After: +local_rank = torch.distributed.get_rank() +start_gpu = local_rank * gpus_per_group +ar_kwargs["device_map"] = ",".join(str(start_gpu + i) for i in range(gpus_per_group)) +``` + +### Patch 4 (monkey-patch, needs upstream in compressed-tensors): Force local cache + +Patches `OffloadCache.cls_from_device` to return `CPUCache`/`DeviceCache` instead of `DistributedCPUCache`/`DistributedDeviceCache`. This is correct when each rank loads the model independently. + +See `patch_force_local_cache()` in `test_option3_fixed.py`. + +### Patch 5 (monkey-patch, needs upstream in compressed-tensors): Disable onloading during quant init + +Wraps `initialize_module_for_quantization` with `disable_onloading()` to avoid per-parameter broadcast+barrier when new quantization parameters are created. + +See `patch_disable_onloading_for_quant_init()` in `test_option3_fixed.py`. + +## Reproduce + +### Prerequisites + +```bash +# Environment +source /home/yiliu7/workspace/venvs/llmc/bin/activate + +# Working directory +cd /home/yiliu7/workspace/llm-compressor +``` + +### Run on Qwen3-8B (quick verification, ~2 minutes) + +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ + --nproc_per_node=2 \ + examples/autoround/ddp/ddp_autoround.py \ + --model /storage/yiliu7/Qwen/Qwen3-8B \ + --iters 5 --nsamples 32 +``` + +### Run on Qwen3-235B (full test, ~47 minutes) + +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ + --nproc_per_node=2 \ + examples/autoround/ddp/ddp_autoround.py \ + --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ \ + --iters 20 --nsamples 32 +``` + +### Expected behavior + +- Both ranks process all 94 layers in lockstep (~30s/layer on 235B) +- All 4 GPUs show active memory usage (~56-63 GB each) +- Each rank uses 2 GPUs: rank 0 → [0,1], rank 1 → [2,3] +- Small NCCL idle contexts (~614 MB) appear on non-owned GPUs — this is normal + +### Monitor progress + +```bash +# GPU utilization +nvidia-smi --query-gpu=index,utilization.gpu,memory.used --format=csv,noheader + +# Layer progress (from log) +grep "Applying AutoRound" /path/to/log | tail -6 +``` + +## Known Issues + +1. **8 GPU process entries in nvidia-smi**: Each of the 2 torchrun processes creates a small NCCL context (~614 MB) on all visible GPUs. Only 4 entries are doing real work (the ~56-63 GB ones). This is unavoidable without a pre-launch wrapper that restricts `CUDA_VISIBLE_DEVICES` before Python starts. + +2. **OOM on layer ~11 (235B)**: With 20 iters and the full 235B model, GPU memory may be tight. Reduce `--iters` or `--nsamples` if OOM occurs. + +## Upstream Plan + +### PR 1: llm-compressor — Multi-GPU DDP device fixes + +**Scope**: Fixes 1–3 above. Clean code changes, no monkey-patches. + +**Changes**: +- `src/llmcompressor/utils/dev.py`: `get_main_device()` uses `current_device_index()` instead of `rank` +- `src/llmcompressor/modifiers/autoround/base.py`: + - `apply_autoround` anchor device uses `get_main_device()` instead of hardcoded `cuda:0` + - `_update_device_map_for_dp` offsets GPU indices by `local_rank * gpus_per_group` + +**Testing**: Run DDP AutoRound on Qwen3-8B with 4 GPUs (2 per rank). Verify all GPUs participate and no device mismatch errors. + +--- + +### PR 2: compressed-tensors — Skip distributed cache when ranks have local parameters + +**Problem**: `OffloadCache.cls_from_device("cpu")` unconditionally returns `DistributedCPUCache` when `dist.is_initialized()`. This causes O(n_params) broadcast+barrier ops (~218ms each) even when all ranks already have parameters locally (via independent `from_pretrained` loading with safetensors mmap). + +**Proposed fix**: Add a `distributed` parameter to `cls_from_device` with auto-detection: + +```python +# compressed_tensors/offload/cache/base.py + +@classmethod +def cls_from_device(cls, device=None, distributed=None): + """ + Args: + distributed: If None (default), auto-detect based on whether + dist is initialized. If False, always return local cache. + If True, always return distributed cache. + """ + if distributed is None: + distributed = ( + torch.distributed.is_initialized() + and torch.distributed.get_world_size() > 1 + ) + + device_type = torch.device(device).type if device != "disk" else "disk" + if device_type == "cpu": + return DistributedCPUCache if distributed else CPUCache + elif is_accelerator_type(device_type): + return DistributedDeviceCache if distributed else DeviceCache + elif device_type == "disk": + return DiskCache + ... +``` + +**Callers that should pass `distributed=False`**: +- `set_onload_device()` when the model was loaded independently on each rank (no meta tensors) +- Any path where the caller knows parameters are already materialized locally + +**Alternative approach** — context manager: + +```python +# compressed_tensors/offload/cache/base.py + +_force_local_cache = threading.local() + +@contextlib.contextmanager +def force_local_cache(): + """Context under which cls_from_device always returns non-distributed caches.""" + _force_local_cache.active = True + try: + yield + finally: + _force_local_cache.active = False + +@classmethod +def cls_from_device(cls, device=None): + distributed = ( + torch.distributed.is_initialized() + and torch.distributed.get_world_size() > 1 + and not getattr(_force_local_cache, 'active', False) + ) + ... +``` + +This lets llm-compressor wrap its pipeline with `force_local_cache()` without modifying every callsite. + +**Testing**: +- Existing tests pass (distributed cache still used by default) +- DDP test with independent model loading uses local cache, no broadcast overhead + +--- + +### PR 3: compressed-tensors — Wrap quant init with `disable_onloading()` + +**Problem**: `initialize_module_for_quantization` creates new parameters (scale, zero_point, etc.) which immediately trigger `DistributedCPUCache.offload()` → broadcast+barrier. These parameters are created identically on every rank, so broadcasting is always redundant. + +**Proposed fix**: Wrap the function body with `disable_onloading()`: + +```python +# compressed_tensors/quantization/lifecycle/initialize.py + +def initialize_module_for_quantization(module, scheme=None, force_zero_point=True): + with disable_onloading(): + # ... existing implementation ... +``` + +**Rationale**: New quant parameters are initialized from the quantization scheme (not from model weights), so they're identical across ranks by construction. There's no information to broadcast. + +**Testing**: DDP quantization should show no broadcast calls during `initialize_module_for_quantization`. Single-process behavior unchanged. + +--- + +### Priority + +1. **PR 3** (highest): Universal fix, always correct, simple one-liner +2. **PR 2** (high): Eliminates the main bottleneck for independent-loading DDP +3. **PR 1** (medium): Required for multi-GPU-per-rank scenarios (GPUS_PER_GROUP > 1) diff --git a/examples/autoround/ddp/ddp_autoround.py b/examples/autoround/ddp/ddp_autoround.py new file mode 100644 index 0000000000..200f456f69 --- /dev/null +++ b/examples/autoround/ddp/ddp_autoround.py @@ -0,0 +1,187 @@ +""" +DDP AutoRound quantization example for large MoE models. + +Runs 2 ranks, each using GPUS_PER_GROUP GPUs. All ranks load the model +independently on CPU (safetensors mmap shares physical pages at OS level). + +Run with: + CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ + --nproc_per_node=2 ddp_autoround.py \ + --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ 2>&1 | tee test_ddp_autoround.log + CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ + --nproc_per_node=2 ddp_autoround.py \ + --model /storage/yiliu7/Qwen/Qwen3-30B-A3B-Instruct-2507/ 2>&1 | tee test_ddp_autoround.log + CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ + --nproc_per_node=2 ddp_autoround.py \ + --model /path/to/model +""" + +import argparse +import importlib +import os +import time + +import torch +import torch.distributed as dist +from loguru import logger +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot + + +def patch_disable_onloading_for_quant_init(): + """Avoid dist.broadcast + barrier for every new quant parameter. + + compressed-tensors' initialize_module_for_quantization creates new + parameters which trigger DistributedCPUCache's per-param broadcast. + Wrapping with disable_onloading() prevents this. + """ + from compressed_tensors.offload import disable_onloading + + lifecycle_init_mod = importlib.import_module( + "compressed_tensors.quantization.lifecycle.initialize" + ) + original_fn = lifecycle_init_mod.initialize_module_for_quantization + if getattr(original_fn, "_patched", False): + return + + def patched(module, scheme=None, force_zero_point=True): + with disable_onloading(): + return original_fn(module, scheme=scheme, force_zero_point=force_zero_point) + + patched._patched = True + lifecycle_init_mod.initialize_module_for_quantization = patched + + +def patch_force_local_cache(): + """Force OffloadCache.cls_from_device to return non-distributed caches. + + When ranks load the model independently, each already has parameters + locally. DistributedCPUCache's per-param broadcast+barrier is + unnecessary and causes O(n_params) collective ops (~218ms each). + """ + from compressed_tensors.offload.cache.base import OffloadCache + from compressed_tensors.offload.cache.cpu import CPUCache + from compressed_tensors.offload.cache.device import DeviceCache + from compressed_tensors.offload.cache.disk import DiskCache + from compressed_tensors.utils import is_accelerator_type + + @classmethod + def cls_from_device_local(cls, device=None): + device_type = torch.device(device).type if device != "disk" else "disk" + if device_type == "cpu": + return CPUCache + elif is_accelerator_type(device_type): + return DeviceCache + elif device_type == "disk": + return DiskCache + else: + raise NotImplementedError(f"Offload of type {device_type} not implemented") + + OffloadCache.cls_from_device = cls_from_device_local + logger.info("Patched OffloadCache.cls_from_device → local (non-distributed) caches") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True) + parser.add_argument("--scheme", type=str, default="W4A16") + parser.add_argument("--iters", type=int, default=5) + parser.add_argument("--nsamples", type=int, default=128) + args = parser.parse_args() + + ###### DDP INIT ##### + gpus_per_group = int(os.environ.get("GPUS_PER_GROUP", "1")) + if "TORCHELASTIC_RUN_ID" in os.environ: + local_rank = int(os.environ["LOCAL_RANK"]) + main_gpu = local_rank * gpus_per_group + torch.cuda.set_device(main_gpu) + dist.init_process_group( + backend="nccl", + init_method="env://", + device_id=torch.device(f"cuda:{main_gpu}"), + ) + + rank = dist.get_rank() if dist.is_initialized() else 0 + world_size = dist.get_world_size() if dist.is_initialized() else 1 + main_gpu = rank * gpus_per_group + logger.info( + f"[Rank {rank}/{world_size}] GPUs: {torch.cuda.device_count()}, " + f"main_gpu: {main_gpu}, group: [{main_gpu}-{main_gpu + gpus_per_group - 1}]" + ) + + # Apply patches BEFORE model loading and calibration + patch_disable_onloading_for_quant_init() + patch_force_local_cache() + + ###### MODEL LOAD ##### + load_start = time.perf_counter() + model = AutoModelForCausalLM.from_pretrained(args.model, dtype="auto") + load_elapsed = time.perf_counter() - load_start + logger.info(f"[Rank {rank}] Model loaded on CPU in {load_elapsed:.1f}s") + + tokenizer = AutoTokenizer.from_pretrained(args.model) + + ###### DATASET ##### + os.environ["AR_DISABLE_DATASET_SUBPROCESS"] = "1" + from auto_round.calib_dataset import get_dataset + from llmcompressor.modifiers.autoround import AutoRoundModifier + + ds = get_dataset(tokenizer=tokenizer, seqlen=2048, nsamples=args.nsamples) + + ###### RECIPE ##### + recipe = AutoRoundModifier( + targets="Linear", + scheme=args.scheme, + ignore=["lm_head", "re:.*mlp.gate$"], + iters=args.iters, + enable_torch_compile=False, + ) + + ###### QUANTIZE ##### + logger.info(f"[Rank {rank}] Starting oneshot...") + quant_start = time.perf_counter() + oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=2048, + num_calibration_samples=args.nsamples, + shuffle_calibration_samples=False, + ) + quant_elapsed = time.perf_counter() - quant_start + logger.info(f"[Rank {rank}] Quantization done in {quant_elapsed:.1f}s") + + if dist.is_initialized(): + dist.barrier() + + ###### SAMPLE GENERATION (rank 0 only) ##### + if rank == 0: + from compressed_tensors.offload import dispatch_model + + logger.info("========== SAMPLE GENERATION ==============") + dispatch_model(model) + sample = tokenizer("Hello my name is", return_tensors="pt") + sample = {key: value.to(model.device) for key, value in sample.items()} + output = model.generate(**sample, max_new_tokens=100) + logger.info(tokenizer.decode(output[0])) + logger.info("==========================================") + + ###### SAVE (rank 0 only) ##### + if rank == 0: + save_dir = ( + args.model.rstrip("/").split("/")[-1] + + f"-{args.scheme}-AutoRound" + + f"-iters{args.iters}-nsamples{args.nsamples}" + + f"-DDP{world_size}" + ) + logger.info(f"Saving to {save_dir}...") + model.save_pretrained(save_dir, save_compressed=True) + tokenizer.save_pretrained(save_dir) + logger.info(f"Saved to {save_dir}") + + if dist.is_initialized(): + dist.barrier() + dist.destroy_process_group() + + logger.info(f"[Rank {rank}] SUCCESS") diff --git a/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py b/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py index 3b15e6e5d7..e49c4d4a77 100644 --- a/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py +++ b/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py @@ -6,10 +6,10 @@ convergence despite split calibration data. Usage (4 GPUs, 2 GPUs per group): - CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=2 \\ - ddp_qwen3_multi_gpu_example.py \\ - --model /storage/yiliu7/Qwen/Qwen3-8B \\ - --scheme W4A16 \\ + CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=2 \ + ddp_qwen3_multi_gpu_example.py \ + --model /storage/yiliu7/Qwen/Qwen3-8B \ + --scheme W4A16 \ --nsamples 32 --iters 50 For single-GPU DDP: diff --git a/examples/autoround/ddp/reproduce.md b/examples/autoround/ddp/reproduce.md index 234e002d48..099d45f523 100644 --- a/examples/autoround/ddp/reproduce.md +++ b/examples/autoround/ddp/reproduce.md @@ -1,6 +1,32 @@ # Multi-GPU DDP AutoRound Reproduce -## Command +## torchrun (recommended) + +### 8B + +```bash +cd /home/yiliu7/workspace/llm-compressor + +bash examples/autoround/ddp/launch_torchrun.sh \ + --model /storage/yiliu7/Qwen/Qwen3-8B \ + --scheme W4A16 \ + --nsamples 32 --iters 50 \ + --disable_torch_compile +``` + +### 235B + +```bash +cd /home/yiliu7/workspace/llm-compressor + +AR_DISABLE_DATASET_SUBPROCESS=1 GPUS_PER_GROUP=2 CUDA_VISIBLE_DEVICES=0,1,2,3 \ +/home/yiliu7/workspace/venvs/llmc/bin/torchrun --nproc_per_node=2 --master_port=29500 \ +examples/autoround/ddp/ddp_qwen3_multi_gpu_torchrun.py \ +--model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507 \ +--scheme W4A16 --nsamples 32 --iters 50 --disable_torch_compile +``` + +## bash wrapper (dedicated GPU isolation) ```bash cd /home/yiliu7/workspace/llm-compressor @@ -9,45 +35,65 @@ AR_DISABLE_DATASET_SUBPROCESS=1 CUDA_VISIBLE_DEVICES=0,1,6,7 GPUS_PER_GROUP=2 NP bash examples/autoround/ddp/launch_multi_gpu.sh \ ddp_qwen3_multi_gpu_example.py \ --model /storage/yiliu7/Qwen/Qwen3-8B \ - --gpus-per-group 2 \ --scheme W4A16 \ --nsamples 32 --iters 50 \ + --disable_torch_compile \ > /tmp/multi_gpu_test.log 2>&1 & ``` ## Monitor ```bash -# Check progress tail -f /tmp/multi_gpu_test.log -# Check processes ps aux | grep ddp_qwen3_multi | grep -v grep -# Check GPU usage nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader -# Kill -pkill -f ddp_qwen3_multi_gpu_example +pkill -f ddp_qwen3_multi_gpu ``` -## Current State +## Verified + +### 8B (2026-06-18) +``` +quantized 7/7 layers in the block, loss iter 0: 19.067873 -> iter 0: 19.067873 +[Rank 0] Quantization completed +Hello my name is Mandy I am 20 years old... +``` +All 37 decoder layers quantized, identical loss across ranks, sample generation works. -- ✅ 4 code changes implemented (launch_multi_gpu.sh, base.py, distributed.py, quantizer.py) -- ✅ Model loading works with `device_map="auto"` (dispatch 547/547 in <1s) -- ✅ GPU partitioning works (rank 0 → GPUs 0,1; rank 1 → GPUs 2,3) -- 🔄 **Hang** after "Disabling tokenizer parallelism" warning — inside `get_dataset()` - - `AR_DISABLE_DATASET_SUBPROCESS=1` avoids the fork issue - - Dataset is cached, not downloading - - Both processes at ~100% CPU but no progress +### 235B (2026-06-19) +``` +quantized 388/389 layers in the block, loss iter 0: 0.211156 -> iter 0: 0.211156 +... +[Rank 0] Quantization completed +``` +All 94 decoder layers quantized (388 Linear per MoE block), identical loss across ranks. ~25 min for 1 iter. ## Key Files | File | Change | |------|--------| -| `examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py` | NEW — multi-GPU DDP example | -| `examples/autoround/ddp/launch_multi_gpu.sh` | NEW — bash wrapper for GPU partitioning | -| `src/llmcompressor/modifiers/autoround/base.py` | `_update_device_map_for_dp` + auto_offload gate use `GPUS_PER_GROUP` | -| `auto_round/utils/distributed.py` | `setup_ddp_if_needed_` returns `(block, sync_fn)` | -| `auto_round/algorithms/quantization/sign_round/quantizer.py` | Captures return, calls `sync_gradients()` before `_step()` | +| `examples/autoround/ddp/ddp_qwen3_multi_gpu_torchrun.py` | torchrun example with patches | +| `examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py` | bash wrapper example | +| `examples/autoround/ddp/fast_pipeline.py` | Replaces `SequentialPipeline.__call__` — no FX trace | +| `examples/autoround/ddp/launch_torchrun.sh` | torchrun launcher | +| `examples/autoround/ddp/launch_multi_gpu.sh` | bash wrapper (GPU partitioning) | +| `src/llmcompressor/modifiers/autoround/base.py` | `_get_local_gpu_group_size()` reads `GPUS_PER_GROUP` | +| `src/llmcompressor/pipelines/sequential/helpers.py` | Removed `disable_onloading()` from `trace_subgraphs` | +| `ar-py/auto_round/utils/distributed.py` | `setup_ddp_if_needed_` returns `(block, sync_fn)`; `current_device()` for NCCL | +| `ar-py/auto_round/algorithms/quantization/sign_round/quantizer.py` | Captures return, calls `sync_gradients()` before `_step()` | + +## Required env vars + +| Var | Value | Why | +|-----|-------|-----| +| `GPUS_PER_GROUP` | `2` | Triggers multi-GPU block dispatch + manual all_reduce sync | +| `AR_DISABLE_DATASET_SUBPROCESS` | `1` | Avoids `fork()` with CUDA context | +| `--disable_torch_compile` | flag | torch.compile can't handle cross-device tensors | + +## Known issue: FX trace bottleneck + +`trace_subgraphs` runs an FX trace on the full model — for 61K-module models (235B) it never finishes. The `fast_pipeline.py` module bypasses this by creating subgraphs directly from decoder layer names. This affects ALL models using `SequentialPipeline`, not just DDP. The AWQ example (`qwen3_moe_example_ddp.py`) with 30B MoE also hangs. ## Venv -Python: `/home/yiliu7/workspace/venvs/ar/bin/python` +Python: `/home/yiliu7/workspace/venvs/llmc/bin/python` diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 7e1738cf98..dcca16b32f 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -326,7 +326,8 @@ def apply_autoround(self, state, modules): # so anchoring to first_param.device can place residual modules # (e.g. norms) on local cuda:1 while hidden states begin on # local cuda:0, causing cross-device forward failures. - device = torch.device("cuda:0") + from llmcompressor.utils.dev import get_main_device + device = get_main_device() # Move decoding layer to CPU first, then the submodules # will be re-dispatched by AutoRound. decoding_layer.to("cpu") @@ -388,11 +389,10 @@ def _update_device_map_for_dp(self, ar_kwargs): return # user explicitly set device_ids, respect it gpus_per_group = _get_local_gpu_group_size() if gpus_per_group > 1: - # Multi-GPU per group: pass comma-separated local GPU indices - # so AutoRound can load-balance submodules across GPUs. - # The group size is set by the launch_multi_gpu.sh wrapper. + local_rank = torch.distributed.get_rank() + start_gpu = local_rank * gpus_per_group ar_kwargs["device_map"] = ",".join( - str(i) for i in range(gpus_per_group) + str(start_gpu + i) for i in range(gpus_per_group) ) else: ar_kwargs["device_map"] = ( diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index 7a6c57b503..1b4e5ecbcf 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Any, Callable import torch -from compressed_tensors.offload import disable_onloading from compressed_tensors.utils import patch_attr from compressed_tensors.utils.match import match_named_modules from loguru import logger @@ -121,9 +120,6 @@ def trace_subgraphs( assert isinstance(model.forward, MethodType) assert isinstance(type(model).forward, FunctionType) - # avoid device movement during tracing - stack.enter_context(disable_onloading()) - with append_autowrap_source_on_fail(): graph = GraphModule( model, diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py index c948e9c3bf..4e8f703986 100644 --- a/src/llmcompressor/utils/dev.py +++ b/src/llmcompressor/utils/dev.py @@ -137,7 +137,7 @@ def get_main_device() -> torch.device: elif torch.accelerator.is_available(): accel_type = torch.accelerator.current_accelerator().type - return torch.device(accel_type, rank) + return torch.device(accel_type, torch.accelerator.current_device_index()) else: logger.warning("No accelerator available! Compressing model on CPU instead") return torch.device("cpu") From 460b5290de3095cfffa39fbd0d24422024af9550 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 21 Jun 2026 10:08:49 +0000 Subject: [PATCH 04/22] update Signed-off-by: yiliu30 --- examples/autoround/ddp/ddp_autoround.py | 47 ++++++++++++++++--------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/examples/autoround/ddp/ddp_autoround.py b/examples/autoround/ddp/ddp_autoround.py index 200f456f69..0e3ed5eca3 100644 --- a/examples/autoround/ddp/ddp_autoround.py +++ b/examples/autoround/ddp/ddp_autoround.py @@ -7,10 +7,16 @@ Run with: CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ --nproc_per_node=2 ddp_autoround.py \ - --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ 2>&1 | tee test_ddp_autoround.log + --iters 100 \ + --nsamples 256 \ + --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ 2>&1 | tee test_ddp_autoround-2.log CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ --nproc_per_node=2 ddp_autoround.py \ - --model /storage/yiliu7/Qwen/Qwen3-30B-A3B-Instruct-2507/ 2>&1 | tee test_ddp_autoround.log + --model /storage/yiliu7/Qwen/Qwen3-30B-A3B-Instruct-2507/ 2>&1 | tee test_ddp_autoround-30.log + CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ + --nproc_per_node=2 ddp_autoround.py \ + --iters 100 --nsamples 256 \ + --model /storage/yiliu7/Qwen/Qwen3-30B-A3B-Instruct-2507/ 2>&1 | tee test_ddp_autoround-30.log CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ --nproc_per_node=2 ddp_autoround.py \ --model /path/to/model @@ -155,22 +161,20 @@ def cls_from_device_local(cls, device=None): if dist.is_initialized(): dist.barrier() - ###### SAMPLE GENERATION (rank 0 only) ##### - if rank == 0: - from compressed_tensors.offload import dispatch_model - - logger.info("========== SAMPLE GENERATION ==============") - dispatch_model(model) - sample = tokenizer("Hello my name is", return_tensors="pt") - sample = {key: value.to(model.device) for key, value in sample.items()} - output = model.generate(**sample, max_new_tokens=100) - logger.info(tokenizer.decode(output[0])) - logger.info("==========================================") ###### SAVE (rank 0 only) ##### + # Destroy process group before saving — compressed_tensors' + # save_pretrained detects DDP via dist.get_world_size() and + # tries replace_module_parallel, which fails on meta tensors + # left by the pipeline. + if dist.is_initialized(): + dist.barrier() + dist.destroy_process_group() + if rank == 0: save_dir = ( - args.model.rstrip("/").split("/")[-1] + "/storage/yiliu7/Qwen/" + + args.model.rstrip("/").split("/")[-1] + f"-{args.scheme}-AutoRound" + f"-iters{args.iters}-nsamples{args.nsamples}" + f"-DDP{world_size}" @@ -180,8 +184,17 @@ def cls_from_device_local(cls, device=None): tokenizer.save_pretrained(save_dir) logger.info(f"Saved to {save_dir}") - if dist.is_initialized(): - dist.barrier() - dist.destroy_process_group() + ###### SAMPLE GENERATION (rank 0 only) ##### + if rank == 0: + from compressed_tensors.offload import dispatch_model + + logger.info("========== SAMPLE GENERATION ==============") + dispatch_model(model) + sample = tokenizer("Hello my name is", return_tensors="pt") + sample = {key: value.to(model.device) for key, value in sample.items()} + output = model.generate(**sample, max_new_tokens=100) + logger.info(tokenizer.decode(output[0])) + logger.info("==========================================") + logger.info(f"[Rank {rank}] SUCCESS") From 4807c78687969ec2e3e3efd8ee56b3d44160782c Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 21 Jun 2026 11:03:33 +0000 Subject: [PATCH 05/22] Clean PR: remove experimental files, revert unrelated dist.py change - Delete experimental/debug scripts (repro_*.py, test_option*.py) - Delete redundant examples (multi_gpu_torchrun.py, multi_gpu_example.py, fast_pipeline.py, launch scripts) - Delete CHANGES.md (absorbed into DDP_FIXES.md) - Revert dist.py CT version compat change (unrelated to DDP) - Add FX_TRACE_ISSUE.md documentation - Keep: base.py, helpers.py, dev.py, ddp_autoround.py, docs --- examples/autoround/ddp/FX_TRACE_ISSUE.md | 58 ++ .../ddp/ddp_qwen3_multi_gpu_example.py | 659 ------------------ examples/autoround/ddp/launch_multi_gpu.sh | 79 --- src/llmcompressor/utils/dist.py | 28 +- 4 files changed, 64 insertions(+), 760 deletions(-) create mode 100644 examples/autoround/ddp/FX_TRACE_ISSUE.md delete mode 100644 examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py delete mode 100755 examples/autoround/ddp/launch_multi_gpu.sh diff --git a/examples/autoround/ddp/FX_TRACE_ISSUE.md b/examples/autoround/ddp/FX_TRACE_ISSUE.md new file mode 100644 index 0000000000..62aa603d5a --- /dev/null +++ b/examples/autoround/ddp/FX_TRACE_ISSUE.md @@ -0,0 +1,58 @@ + +# FX Trace Bottleneck in SequentialPipeline + +## Problem + +`trace_subgraphs()` builds an FX graph of the full model (O(n_modules)) before per-layer calibration. For 235B with 61K modules, this never finishes. + +## Scope + +| Modifier | Pipeline | Needs trace? | 235B hangs? | +|----------|----------|-------------|-------------| +| RTN | `DataFreePipeline` | No | Never | +| AWQ | `SequentialPipeline` | Yes | Only in DDP | +| GPTQ | `SequentialPipeline` | Yes | Only in DDP | +| AutoRound | `SequentialPipeline` | Yes | Only in DDP | + +## Root cause (DDP-specific) + +`load_offloaded_model()` → `from_accelerate()` → `dist.broadcast_object_list([61K-entry device_map, offload_dir])` serializes a massive dict via pickle. Rank 1's `dispatch_with_map` then creates OffloadCache for all 61K modules. Without DDP, `from_accelerate` dispatches locally — no broadcast, no wait. + +## Loading strategies for 235B DDP + +| Strategy | Load time | Trace | Works? | +|----------|-----------|-------|--------| +| `load_offloaded_model` + `device_map="auto"` (GPU) | 420s | Fast | No — OOM (1 GPU/rank, 178GB fills completely) | +| `load_offloaded_model` + `device_map="auto_offload"` (CPU) | 10s | Hangs | No — 61K broadcast + dispatch | +| CPU-only + sparse offload + `fast_pipeline.py` | 9s | 5s | **Yes** | + +## Fixes applied + +1. **`helpers.py`** — Removed `disable_onloading()` from `trace_subgraphs` (allows GPU onload) +2. **`fast_pipeline.py`** — Replaces `SequentialPipeline.__call__` with regex-based layer scanning, no FX trace. Required for 235B DDP. +3. **`distributed.py`** — Fixed `comm_device` to use `current_device()`; returns `(block, sync_fn)` +4. **`quantizer.py`** — Captures return, calls `sync_gradients()` before `_step()` +5. **`base.py`** — `_get_local_gpu_group_size()` reads `GPUS_PER_GROUP` + +## Upstream plan + +The FX trace is the correct architecture — it handles arbitrary model graphs. For LLMs, a fast path that regex-matches `model.layers.*` is safe. The `fast_pipeline.py` logic should move into `helpers.py` as `trace_subgraphs_fast()`, gated by a `DatasetArguments.sequential_fast_trace` flag or auto-enabled when `module_count > threshold`. + +## Environment + +| Component | Path | +|-----------|------| +| Python | `/home/yiliu7/workspace/venvs/llmc/bin/python` | +| torchrun | `/home/yiliu7/workspace/venvs/llmc/bin/torchrun` | +| llm-compressor | `/home/yiliu7/workspace/llm-compressor` | +| auto-round | `/home/yiliu7/workspace/ar-py` (used by venv) | +| GPUs | 8× NVIDIA B200, 180 GiB each | +| Test GPU subset | `CUDA_VISIBLE_DEVICES=0,1,2,3` | + +## Required env vars + +| Var | Value | Why | +|-----|-------|-----| +| `GPUS_PER_GROUP` | `2` | Triggers multi-GPU block dispatch + manual all_reduce sync | +| `AR_DISABLE_DATASET_SUBPROCESS` | `1` | Avoids `fork()` with CUDA context in `calib_dataset.py` | +| `CUDA_VISIBLE_DEVICES` | `0,1,2,3` | GPU partition (4 GPUs for 2 ranks) | diff --git a/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py b/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py deleted file mode 100644 index e49c4d4a77..0000000000 --- a/examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py +++ /dev/null @@ -1,659 +0,0 @@ -""" -Multi-GPU per group DDP example with AutoRound quantization. - -Each rank gets a local GPU group for block-level model parallelism, while -gradients are synchronized across ranks via all_reduce for identical -convergence despite split calibration data. - -Usage (4 GPUs, 2 GPUs per group): - CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=2 \ - ddp_qwen3_multi_gpu_example.py \ - --model /storage/yiliu7/Qwen/Qwen3-8B \ - --scheme W4A16 \ - --nsamples 32 --iters 50 - -For single-GPU DDP: - torchrun --nproc_per_node=4 ddp_qwen3_multi_gpu_example.py ... -""" - -import argparse -import importlib -import os -import sys -import time -from pathlib import Path - -import psutil -import torch -import torch.distributed as dist -from compressed_tensors.offload import dispatch_model, from_accelerate, load_offloaded_model -from loguru import logger -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot - - -class StopAfterBlocks(RuntimeError): - pass - - -def fix_everything(seed=42): - import random - - import numpy as np - - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - - -def config_deterministic(): - torch.use_deterministic_algorithms(True, warn_only=False) - os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" - fix_everything() - - -def init_dist_multi_gpu(gpus_per_group=2): - """Initialize distributed with multiple GPUs per group. - - ``CUDA_VISIBLE_DEVICES`` must already be set to a disjoint subset of - GPUs for this rank (the ``launch_multi_gpu.sh`` wrapper handles this). - NCCL communication uses the first visible GPU (local cuda:0). - - Example with 4 physical GPUs, 2 per group: - - Rank 0 -> local cuda:0, cuda:1 (physical 0, 1) - - Rank 1 -> local cuda:0, cuda:1 (physical 2, 3) - """ - rank = int(os.environ.get("RANK", "0")) - world_size = int(os.environ.get("WORLD_SIZE", "1")) - - if world_size < 2: - logger.info("Single-process mode, skipping distributed init") - return - - # NCCL uses the first visible GPU - torch.cuda.set_device(0) - - dist.init_process_group( - backend="nccl", - init_method="env://", - rank=rank, - world_size=world_size, - device_id=torch.device("cuda:0"), - ) - dist.barrier() - actual_count = torch.cuda.device_count() - logger.info( - f"[Rank {rank}/{world_size}] CUDA_VISIBLE_DEVICES=" - f"{os.environ.get('CUDA_VISIBLE_DEVICES', 'unset')} " - f"(visible GPUs: {actual_count})" - ) - if actual_count < gpus_per_group: - logger.warning( - f"[Rank {rank}] Expected {gpus_per_group} GPUs but only " - f"{actual_count} are visible" - ) - - -def get_dist_info(): - if dist.is_available() and dist.is_initialized(): - return dist.get_rank(), dist.get_world_size() - return 0, 1 - - -def _module_has_direct_tensors(module: torch.nn.Module) -> bool: - return any(t is not None for t in module._parameters.values()) or any( - t is not None for t in module._buffers.values() - ) - - -def _module_has_meta_tensors(module: torch.nn.Module) -> bool: - return any( - t is not None and t.device.type == "meta" - for t in module._parameters.values() - ) or any(t is not None and t.device.type == "meta" for t in module._buffers.values()) - - -def patch_ct_dispatch_for_sparse_offload(): - """Avoid wrapping modules that do not need compressed-tensors offload hooks.""" - dispatch_mod = importlib.import_module("compressed_tensors.offload.dispatch") - fa_mod = importlib.import_module("compressed_tensors.offload.convert.from_accelerate") - - if getattr(dispatch_mod.dispatch_with_map, "_llmc_sparse_patch", False): - return - - offload_module = dispatch_mod.offload_module - tqdm = dispatch_mod.tqdm - - def optimized_dispatch_with_map( - model: torch.nn.Module, - device_map, - offload_dir: str | None = None, - show_progress: bool = True, - ): - filtered = [] - skipped_noop = 0 - skipped_empty = 0 - skipped_cpu_cpu = 0 - kept_meta_materialization = 0 - - for name, (onload_device, offload_device) in device_map.items(): - if offload_device is None: - skipped_noop += 1 - continue - - module = model.get_submodule(name) - if not _module_has_direct_tensors(module): - skipped_empty += 1 - continue - - if ( - str(onload_device) == "cpu" - and str(offload_device) == "cpu" - and not _module_has_meta_tensors(module) - ): - skipped_cpu_cpu += 1 - continue - - if str(onload_device) == "cpu" and str(offload_device) == "cpu": - kept_meta_materialization += 1 - - filtered.append((name, onload_device, offload_device)) - - logger.info( - "Compressed-tensors dispatch filtered {} -> {} modules " - "(noop={}, empty={}, cpu_to_cpu_skipped={}, cpu_to_cpu_meta_kept={})", - len(device_map), - len(filtered), - skipped_noop, - skipped_empty, - skipped_cpu_cpu, - kept_meta_materialization, - ) - - for name, onload_device, offload_device in tqdm( - filtered, - desc="Dispatching model", - disable=(not show_progress), - ): - module = model.get_submodule(name) - if offload_device == "disk": - offload_module( - module, - onload_device, - offload_device, - offload_dir=offload_dir, - ) - else: - offload_module(module, onload_device, offload_device) - - optimized_dispatch_with_map._llmc_sparse_patch = True - dispatch_mod.dispatch_with_map = optimized_dispatch_with_map - fa_mod.dispatch_with_map = optimized_dispatch_with_map - - -def _rank_offload_folder(base_folder: str | None) -> str | None: - if not base_folder: - return None - - rank, _ = get_dist_info() - rank_folder = Path(base_folder) / f"rank{rank}" - rank_folder.mkdir(parents=True, exist_ok=True) - return str(rank_folder) - - -def _independent_cpu_max_memory(extra_cpu_mem: int = int(5e9)) -> dict[str, int]: - _, world_size = get_dist_info() - per_rank_available = psutil.virtual_memory().available // max(world_size, 1) - return {"cpu": max(per_rank_available - extra_cpu_mem, int(8e9))} - - -def load_model_with_local_offload(model_id: str, offload_folder: str | None): - """Load model on each rank independently, then convert accelerate offload locally.""" - load_kwargs = { - "dtype": "auto", - "device_map": "auto", - "max_memory": _independent_cpu_max_memory(), - } - rank_offload_folder = _rank_offload_folder(offload_folder) - if rank_offload_folder: - load_kwargs["offload_folder"] = rank_offload_folder - - logger.info( - "[Rank {}] Loading model independently with max_memory={} offload_folder={}", - get_dist_info()[0], - load_kwargs["max_memory"], - rank_offload_folder, - ) - model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs) - if hasattr(model, "hf_device_map"): - from_accelerate(model) - return model - - -def patch_disable_onloading_for_quant_init(): - """Avoid expensive dist.broadcast + barrier for every new quant param. - - When DDP is initialized before model loading, - ``OffloadCache.cls_from_device`` selects distributed cache variants - (DistributedCPUCache / DistributedDiskCache). Each call to - ``register_parameter`` inside ``initialize_module_for_quantization`` - triggers ``offload()``, which does a collective broadcast + barrier. - For large MoE models (e.g. Qwen3-235B with 100K+ Linear layers × 6 - quant params), this means hundreds of thousands of round-trips — - effectively hanging the process. - - Wrapping the body in ``disable_onloading()`` stores new parameters - directly in ``offloaded_values`` without invoking the distributed - offload, cutting the overhead to zero. - """ - from compressed_tensors.offload import ( # noqa: F811 - disable_onloading, - ) - - lifecycle_init_mod = importlib.import_module( - "compressed_tensors.quantization.lifecycle.initialize" - ) - original_fn = lifecycle_init_mod.initialize_module_for_quantization - - if getattr(original_fn, "_llmc_no_dist_offload_patch", False): - return - - def patched_initialize_module_for_quantization(module, scheme=None, force_zero_point=True): - with disable_onloading(): - return original_fn(module, scheme=scheme, force_zero_point=force_zero_point) - - patched_initialize_module_for_quantization._llmc_no_dist_offload_patch = True - lifecycle_init_mod.initialize_module_for_quantization = ( - patched_initialize_module_for_quantization - ) - - -def patch_autoround_stop_after_blocks(max_blocks: int): - """Raise after N decoding blocks finish so large-model smoke tests can stop cleanly.""" - autoround_mod = importlib.import_module("llmcompressor.modifiers.autoround.base") - modifier_cls = autoround_mod.AutoRoundModifier - - if getattr(modifier_cls.apply_autoround, "_llmc_stop_after_patch", False): - return - - original_apply_autoround = modifier_cls.apply_autoround - - def wrapped_apply_autoround(self, state, modules): - modules = modules or [] - if not any(self._is_decoding_layer(module) for module in modules): - return original_apply_autoround(self, state, modules) - - result = original_apply_autoround(self, state, modules) - completed = getattr(self, "_llmc_completed_blocks", 0) + 1 - self._llmc_completed_blocks = completed - logger.info( - "[Rank {}] Completed AutoRound block {}/{}", - get_dist_info()[0], - completed, - max_blocks, - ) - if completed >= max_blocks: - raise StopAfterBlocks(f"Stopped after {completed} blocks") - return result - - wrapped_apply_autoround._llmc_stop_after_patch = True - modifier_cls.apply_autoround = wrapped_apply_autoround - - -def patch_llmc_timing_logs(): - """Add coarse timing logs around the expensive LLMC startup stages.""" - recipe_mod = importlib.import_module("llmcompressor.recipe.recipe") - lifecycle_mod = importlib.import_module("llmcompressor.core.lifecycle") - quant_mixin_mod = importlib.import_module( - "llmcompressor.modifiers.quantization.quantization.mixin" - ) - quantization_base_mod = importlib.import_module( - "compressed_tensors.quantization" - ) - module_utils_mod = importlib.import_module("compressed_tensors.utils") - group_validation_mod = importlib.import_module( - "llmcompressor.modifiers.quantization.group_size_validation" - ) - seq_helpers_mod = importlib.import_module("llmcompressor.pipelines.sequential.helpers") - seq_pipeline_mod = importlib.import_module("llmcompressor.pipelines.sequential.pipeline") - cache_mod = importlib.import_module("llmcompressor.pipelines.cache") - autoround_mod = importlib.import_module("llmcompressor.modifiers.autoround.base") - core_mod = importlib.import_module("llmcompressor.core") - - recipe_cls = recipe_mod.Recipe - lifecycle_cls = lifecycle_mod.CompressionLifecycle - quant_mixin_cls = quant_mixin_mod.QuantizationMixin - cache_cls = cache_mod.IntermediatesCache - autoround_cls = autoround_mod.AutoRoundModifier - seq_pipeline_cls = seq_pipeline_mod.SequentialPipeline - lifecycle_callbacks = core_mod.LifecycleCallbacks - - if getattr(recipe_cls.from_modifiers, "_llmc_timing_patch", False): - return - - original_from_modifiers = recipe_cls.from_modifiers.__func__ - original_lifecycle_initialize = lifecycle_cls.initialize - original_initialize_quantization = quant_mixin_cls.initialize_quantization - original_start_calibration = autoround_cls.start_calibration - original_trace_subgraphs = seq_helpers_mod.trace_subgraphs - original_from_dataloader = cache_cls.from_dataloader.__func__ - original_apply_autoround = autoround_cls.apply_autoround - original_seq_call = seq_pipeline_cls.__call__ - original_calib_epoch_start = lifecycle_callbacks.calibration_epoch_start - original_match_named_modules = module_utils_mod.match_named_modules - original_apply_quantization_config = quantization_base_mod.apply_quantization_config - original_validate_group_size_divisibility = ( - group_validation_mod.validate_group_size_divisibility - ) - - def _timed(label, fn, *args, **kwargs): - start = time.perf_counter() - logger.info("[Rank {}] {} started", get_dist_info()[0], label) - try: - return fn(*args, **kwargs) - finally: - logger.info( - "[Rank {}] {} finished in {:.2f}s", - get_dist_info()[0], - label, - time.perf_counter() - start, - ) - - @classmethod - def timed_from_modifiers(cls, modifiers, modifier_group_name=None): - return _timed( - "Recipe.from_modifiers", - original_from_modifiers, - cls, - modifiers, - modifier_group_name, - ) - - def timed_lifecycle_initialize(self, *args, **kwargs): - return _timed( - "CompressionLifecycle.initialize", - original_lifecycle_initialize, - self, - *args, - **kwargs, - ) - - def timed_initialize_quantization(self, model): - return _timed( - "QuantizationMixin.initialize_quantization", - original_initialize_quantization, - self, - model, - ) - - def timed_start_calibration(self, model): - return _timed( - "AutoRoundModifier.start_calibration", - original_start_calibration, - self, - model, - ) - - def timed_trace_subgraphs(*args, **kwargs): - return _timed("trace_subgraphs", original_trace_subgraphs, *args, **kwargs) - - @classmethod - def timed_from_dataloader(cls, *args, **kwargs): - return _timed( - "IntermediatesCache.from_dataloader", - original_from_dataloader, - cls, - *args, - **kwargs, - ) - - def timed_apply_autoround(self, state, modules): - modules = modules or [] - decoding_layers = [m for m in modules if self._is_decoding_layer(m)] - if not decoding_layers: - return original_apply_autoround(self, state, modules) - layer_name = getattr(decoding_layers[0], "_tmp_name", decoding_layers[0].__class__.__name__) - return _timed( - f"AutoRoundModifier.apply_autoround({layer_name})", - original_apply_autoround, - self, - state, - modules, - ) - - def timed_seq_call(model, dataloader, dataset_args): - pipeline_start = time.perf_counter() - logger.info("[Rank {}] SequentialPipeline.__call__ started", get_dist_info()[0]) - try: - logger.info("[Rank {}] SequentialPipeline pre-next(iter(dataloader))", get_dist_info()[0]) - iter_start = time.perf_counter() - sample_input = next(iter(dataloader)) - logger.info( - "[Rank {}] next(iter(dataloader)) finished in {:.2f}s", - get_dist_info()[0], - time.perf_counter() - iter_start, - ) - del sample_input - return original_seq_call(model, dataloader, dataset_args) - finally: - logger.info( - "[Rank {}] SequentialPipeline.__call__ finished in {:.2f}s", - get_dist_info()[0], - time.perf_counter() - pipeline_start, - ) - - def timed_calib_epoch_start(*args, **kwargs): - return _timed( - "LifecycleCallbacks.calibration_epoch_start", - original_calib_epoch_start, - *args, - **kwargs, - ) - - def timed_match_named_modules(*args, **kwargs): - return _timed("match_named_modules", original_match_named_modules, *args, **kwargs) - - def timed_apply_quantization_config(*args, **kwargs): - return _timed( - "apply_quantization_config", - original_apply_quantization_config, - *args, - **kwargs, - ) - - def timed_validate_group_size_divisibility(*args, **kwargs): - return _timed( - "validate_group_size_divisibility", - original_validate_group_size_divisibility, - *args, - **kwargs, - ) - - timed_from_modifiers._llmc_timing_patch = True - recipe_cls.from_modifiers = timed_from_modifiers - lifecycle_cls.initialize = timed_lifecycle_initialize - quant_mixin_cls.initialize_quantization = timed_initialize_quantization - autoround_cls.start_calibration = timed_start_calibration - module_utils_mod.match_named_modules = timed_match_named_modules - quant_mixin_mod.match_named_modules = timed_match_named_modules - quantization_base_mod.apply_quantization_config = timed_apply_quantization_config - quant_mixin_mod.apply_quantization_config = timed_apply_quantization_config - group_validation_mod.validate_group_size_divisibility = timed_validate_group_size_divisibility - quant_mixin_mod.validate_group_size_divisibility = timed_validate_group_size_divisibility - seq_helpers_mod.trace_subgraphs = timed_trace_subgraphs - seq_pipeline_mod.trace_subgraphs = timed_trace_subgraphs - cache_cls.from_dataloader = timed_from_dataloader - autoround_cls.apply_autoround = timed_apply_autoround - seq_pipeline_cls.__call__ = staticmethod(timed_seq_call) - lifecycle_callbacks.calibration_epoch_start = timed_calib_epoch_start - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="AutoRound Quantization with Multi-GPU per Group DDP" - ) - parser.add_argument( - "--model", - type=str, - default="Qwen/Qwen3-8B", - help="Model name or path", - ) - parser.add_argument( - "--gpus-per-group", - type=int, - default=2, - help="Number of GPUs per rank-local group for block sharding (default: 2)", - ) - parser.add_argument( - "--scheme", - type=str, - default="W4A16", - help="Quantization scheme (W4A16, MXFP8, MXFP4, etc.)", - ) - parser.add_argument("--iters", type=int, default=20, help="Number of iterations") - parser.add_argument("--nsamples", type=int, default=128, help="Number of samples") - parser.add_argument( - "--disable_torch_compile", - action="store_true", - help="Disable torch.compile for model acceleration during quantization", - ) - parser.add_argument( - "--deterministic", - action="store_true", - help="Enable deterministic mode for reproducibility", - ) - parser.add_argument( - "--offload-folder", - type=str, - default=None, - help="Optional folder for disk offload while loading very large models", - ) - parser.add_argument( - "--max-blocks", - type=int, - default=None, - help="Optional number of decoder blocks to quantize before exiting", - ) - args = parser.parse_args() - - if args.deterministic: - config_deterministic() - - model_id = args.model - - ###### MULTI-GPU DDP INIT ##### - init_dist_multi_gpu(gpus_per_group=args.gpus_per_group) - patch_ct_dispatch_for_sparse_offload() - patch_llmc_timing_logs() - patch_disable_onloading_for_quant_init() - if args.max_blocks is not None: - patch_autoround_stop_after_blocks(args.max_blocks) - # Load onto CPU first and spill to disk if needed. AutoRound will then - # onload and shard each block onto the rank-local GPU group during tuning. - load_start = time.perf_counter() - rank, world_size = get_dist_info() - if world_size > 1: - model = load_model_with_local_offload(model_id, args.offload_folder) - else: - load_kwargs = { - "dtype": "auto", - "device_map": "auto_offload", - } - rank_offload_folder = _rank_offload_folder(args.offload_folder) - if rank_offload_folder: - load_kwargs["offload_folder"] = rank_offload_folder - with load_offloaded_model(): - model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs) - logger.info( - "[Rank {}] Model load + offload conversion finished in {:.2f}s", - rank, - time.perf_counter() - load_start, - ) - ############################### - - tokenizer = AutoTokenizer.from_pretrained(model_id) - - NUM_CALIBRATION_SAMPLES = args.nsamples - MAX_SEQUENCE_LENGTH = 2048 - ITERS = args.iters - - # Get aligned calibration dataset. - from auto_round.calib_dataset import get_dataset # noqa: E402 - - # Note: Make sure model are loaded before importing auto-round related code. - from llmcompressor.modifiers.autoround import AutoRoundModifier # noqa: E402 - - ds = get_dataset( - tokenizer=tokenizer, - seqlen=MAX_SEQUENCE_LENGTH, - nsamples=NUM_CALIBRATION_SAMPLES, - ) - - # Configure the quantization algorithm. - recipe = AutoRoundModifier( - targets="Linear", - scheme=args.scheme, - ignore=[ - "lm_head", - "re:.*mlp.gate$", - ], - iters=ITERS, - enable_torch_compile=not args.disable_torch_compile, - ) - - # Apply algorithms. - stopped_early = False - try: - oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - shuffle_calibration_samples=False, - ) - except StopAfterBlocks as exc: - stopped_early = True - logger.info("[Rank {}] {}", get_dist_info()[0], str(exc)) - - rank, world_size = get_dist_info() - if stopped_early: - logger.info(f"[Rank {rank}] Partial quantization completed") - else: - logger.info(f"[Rank {rank}] Quantization completed") - - if dist.is_available() and dist.is_initialized(): - dist.barrier() - dist.destroy_process_group() - - if rank != 0: - sys.exit(0) - - if stopped_early: - sys.exit(0) - - if rank == 0: - # Confirm generations of the quantized model look sane. - logger.info("\n\n") - logger.info("========== SAMPLE GENERATION ==============") - dispatch_model(model) - sample = tokenizer("Hello my name is", return_tensors="pt") - sample_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - sample = {key: value.to(sample_device) for key, value in sample.items()} - output = model.generate(**sample, max_new_tokens=100) - logger.info(tokenizer.decode(output[0])) - logger.info("==========================================\n\n") - - logger.info("Saving...") - SAVE_DIR = ( - model_id.rstrip("/").split("/")[-1] - + f"-{args.scheme}-AutoRound" - + f"-iters{args.iters}-nsamples{args.nsamples}" - + "-MultiGPUDDP" - + str(world_size) - ) - model.save_pretrained(SAVE_DIR, save_compressed=True) - tokenizer.save_pretrained(SAVE_DIR) - logger.info(f"Saved to {SAVE_DIR}") diff --git a/examples/autoround/ddp/launch_multi_gpu.sh b/examples/autoround/ddp/launch_multi_gpu.sh deleted file mode 100755 index 14e40c9a78..0000000000 --- a/examples/autoround/ddp/launch_multi_gpu.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash -# Launch multi-GPU per group DDP training. -# -# Partitions physical GPUs into groups, one group per process/rank. -# Each rank sees its own set of GPUs via CUDA_VISIBLE_DEVICES. -# -# Usage: -# GPUS_PER_GROUP=2 ./launch_multi_gpu.sh ddp_qwen3_multi_gpu_example.py --model ... --scheme W4A16 -# GPUS_PER_GROUP=2 ./launch_multi_gpu.sh ddp_qwen3_multi_gpu_example.py --model /storage/yiliu7/Qwen/Qwen3-30B-A3B-Instruct-2507 --scheme W4A16 -# -# This spawns 2 ranks, each with 2 GPUs (4 GPUs total). -# The Python script no longer needs to override CUDA_VISIBLE_DEVICES. -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -GPUS_PER_GROUP=${GPUS_PER_GROUP:-${GPUS_PER_RANK:-2}} -NPROC=${NPROC:-2} # number of ranks -PYTHON=${PYTHON:-/home/yiliu7/workspace/venvs/ar/bin/python} -MASTER_PORT=${MASTER_PORT:-29600} -MASTER_ADDR=${MASTER_ADDR:-localhost} -NNODES=${NNODES:-1} -NODE_RANK=${NODE_RANK:-0} - -SCRIPT="$1" -shift - -echo "Launching $NPROC ranks, $GPUS_PER_GROUP GPUs each" -echo "Python: $PYTHON" -echo "Script: $SCRIPT" - -VISIBLE_GPUS_ENV=${CUDA_VISIBLE_DEVICES:-} -if [[ -n "$VISIBLE_GPUS_ENV" ]]; then - IFS=',' read -r -a VISIBLE_GPUS <<< "$VISIBLE_GPUS_ENV" -else - VISIBLE_GPUS=() -fi - -TOTAL_GPUS_NEEDED=$((NPROC * GPUS_PER_GROUP)) -if [[ ${#VISIBLE_GPUS[@]} -gt 0 && ${#VISIBLE_GPUS[@]} -ne $TOTAL_GPUS_NEEDED ]]; then - echo "Expected $TOTAL_GPUS_NEEDED GPUs in CUDA_VISIBLE_DEVICES, got ${#VISIBLE_GPUS[@]}: $VISIBLE_GPUS_ENV" >&2 - exit 1 -fi - -pids=() -for RANK in $(seq 0 $((NPROC - 1))); do - if [[ ${#VISIBLE_GPUS[@]} -gt 0 ]]; then - GPU_OFFSET=$((RANK * GPUS_PER_GROUP)) - GPU_LIST=$(IFS=,; echo "${VISIBLE_GPUS[*]:$GPU_OFFSET:$GPUS_PER_GROUP}") - else - GPU_START=$((NODE_RANK * NPROC * GPUS_PER_GROUP + RANK * GPUS_PER_GROUP)) - GPU_END=$((GPU_START + GPUS_PER_GROUP - 1)) - GPU_LIST=$(seq -s, $GPU_START $GPU_END) - fi - echo " Rank $RANK -> GPUs $GPU_LIST" - - CUDA_VISIBLE_DEVICES="$GPU_LIST" \ - AR_DISABLE_DATASET_SUBPROCESS=1 \ - LOCAL_RANK=0 \ - RANK=$((NODE_RANK * NPROC + RANK)) \ - WORLD_SIZE=$((NNODES * NPROC)) \ - MASTER_ADDR="$MASTER_ADDR" \ - MASTER_PORT="$MASTER_PORT" \ - TORCHELASTIC_RUN_ID="multi_gpu_$(date +%s)_$$" \ - GPUS_PER_GROUP="$GPUS_PER_GROUP" \ - "$PYTHON" -u "$SCRIPT_DIR/$SCRIPT" "$@" & - - pids+=($!) - # Small delay so workers don't race for port binding - sleep 0.5 -done - -# Wait for all processes -status=0 -for pid in "${pids[@]}"; do - if ! wait "$pid"; then - status=1 - fi -done -exit $status diff --git a/src/llmcompressor/utils/dist.py b/src/llmcompressor/utils/dist.py index a1f75af804..c4a04d42eb 100644 --- a/src/llmcompressor/utils/dist.py +++ b/src/llmcompressor/utils/dist.py @@ -1,17 +1,11 @@ from typing import Hashable, TypeVar -try: - from compressed_tensors.distributed import ( - greedy_bin_packing as _greedy_bin_packing, - ) - from compressed_tensors.distributed import ( - wait_for_comms as _wait_for_comms, - ) -except ImportError: - # compressed_tensors<0.16 does not have the distributed submodule - _greedy_bin_packing = None - _wait_for_comms = None - +from compressed_tensors.distributed import ( + greedy_bin_packing as _greedy_bin_packing, +) +from compressed_tensors.distributed import ( + wait_for_comms as _wait_for_comms, +) from compressed_tensors.utils.helpers import deprecated T = TypeVar("T", bound=Hashable) @@ -35,11 +29,6 @@ def greedy_bin_packing(*args, **kwargs) -> tuple[list[T], list[list[T]], dict[T, the list of items assigned to that bin. - item_to_bin: mapping from each item to its assigned bin index. """ - if _greedy_bin_packing is None: - raise ImportError( - "greedy_bin_packing requires compressed-tensors>=0.16 " - "(distributed submodule not found)" - ) return _greedy_bin_packing(*args, **kwargs) @@ -55,9 +44,4 @@ def wait_for_comms(*args, **kwargs) -> None: ``async_op=True``). The list is cleared after all operations have completed. """ - if _wait_for_comms is None: - raise ImportError( - "wait_for_comms requires compressed-tensors>=0.16 " - "(distributed submodule not found)" - ) return _wait_for_comms(*args, **kwargs) From 3e40140f69478014e06516c274adedbf35bb9ae4 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 21 Jun 2026 11:15:58 +0000 Subject: [PATCH 06/22] Revert: restore disable_onloading() in trace_subgraphs --- src/llmcompressor/pipelines/sequential/helpers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index 1b4e5ecbcf..7a6c57b503 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Callable import torch +from compressed_tensors.offload import disable_onloading from compressed_tensors.utils import patch_attr from compressed_tensors.utils.match import match_named_modules from loguru import logger @@ -120,6 +121,9 @@ def trace_subgraphs( assert isinstance(model.forward, MethodType) assert isinstance(type(model).forward, FunctionType) + # avoid device movement during tracing + stack.enter_context(disable_onloading()) + with append_autowrap_source_on_fail(): graph = GraphModule( model, From e1e6c991a3cf8c86c278206df2d3ac97700e11e6 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 21 Jun 2026 11:38:42 +0000 Subject: [PATCH 07/22] clean Signed-off-by: yiliu30 --- examples/autoround/ddp/ddp_autoround.py | 20 +++---------------- src/llmcompressor/modifiers/autoround/base.py | 9 --------- 2 files changed, 3 insertions(+), 26 deletions(-) diff --git a/examples/autoround/ddp/ddp_autoround.py b/examples/autoround/ddp/ddp_autoround.py index 0e3ed5eca3..89f961377e 100644 --- a/examples/autoround/ddp/ddp_autoround.py +++ b/examples/autoround/ddp/ddp_autoround.py @@ -7,19 +7,8 @@ Run with: CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ --nproc_per_node=2 ddp_autoround.py \ - --iters 100 \ - --nsamples 256 \ - --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ 2>&1 | tee test_ddp_autoround-2.log - CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ - --nproc_per_node=2 ddp_autoround.py \ - --model /storage/yiliu7/Qwen/Qwen3-30B-A3B-Instruct-2507/ 2>&1 | tee test_ddp_autoround-30.log - CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ - --nproc_per_node=2 ddp_autoround.py \ - --iters 100 --nsamples 256 \ - --model /storage/yiliu7/Qwen/Qwen3-30B-A3B-Instruct-2507/ 2>&1 | tee test_ddp_autoround-30.log - CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ - --nproc_per_node=2 ddp_autoround.py \ - --model /path/to/model + --iters 100 --nsamples 256 \ + --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ """ import argparse @@ -117,6 +106,7 @@ def cls_from_device_local(cls, device=None): ) # Apply patches BEFORE model loading and calibration + # FIXME: (yiliu30) remove these patched before merging once the underlying issues are fixed patch_disable_onloading_for_quant_init() patch_force_local_cache() @@ -163,10 +153,6 @@ def cls_from_device_local(cls, device=None): ###### SAVE (rank 0 only) ##### - # Destroy process group before saving — compressed_tensors' - # save_pretrained detects DDP via dist.get_world_size() and - # tries replace_module_parallel, which fails on meta tensors - # left by the pipeline. if dist.is_initialized(): dist.barrier() dist.destroy_process_group() diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index dcca16b32f..8f591d2343 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -284,15 +284,6 @@ def apply_autoround(self, state, modules): "ignore_layers": ",".join(ignore_layers) if ignore_layers else "", "disable_opt_rtn": self.disable_opt_rtn, } - if torch.distributed.is_initialized(): - gpus_per_group = _get_local_gpu_group_size() - if gpus_per_group > 1 and kwargs["enable_torch_compile"]: - logger.warning( - "Disabling torch.compile for AutoRound multi-GPU group DDP " - "because compiled block execution does not support " - "cross-device sharding." - ) - kwargs["enable_torch_compile"] = False llmc_registered_qparams = self._preprocess_qparams(decoding_layer) with ( From 86fc407fdebcea5510934077bc802f5dae45dfee Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 21 Jun 2026 11:39:18 +0000 Subject: [PATCH 08/22] clean Signed-off-by: yiliu30 --- examples/autoround/ddp/DDP_FIXES.md | 237 ----------------------- examples/autoround/ddp/FX_TRACE_ISSUE.md | 58 ------ examples/autoround/ddp/reproduce.md | 99 ---------- 3 files changed, 394 deletions(-) delete mode 100644 examples/autoround/ddp/DDP_FIXES.md delete mode 100644 examples/autoround/ddp/FX_TRACE_ISSUE.md delete mode 100644 examples/autoround/ddp/reproduce.md diff --git a/examples/autoround/ddp/DDP_FIXES.md b/examples/autoround/ddp/DDP_FIXES.md deleted file mode 100644 index 50e9e7352d..0000000000 --- a/examples/autoround/ddp/DDP_FIXES.md +++ /dev/null @@ -1,237 +0,0 @@ -# DDP Multi-GPU AutoRound Fixes for Large MoE Models - -## Problem - -Running AutoRound quantization with DDP on large MoE models (e.g., Qwen3-235B) would hang or take hours due to `DistributedCPUCache` performing a `dist.broadcast_object_list()` + `dist.barrier()` **per parameter** during offload operations (~218ms × 45K params = ~163 minutes). - -## Root Cause - -When `dist.is_initialized()`, `OffloadCache.cls_from_device("cpu")` returns `DistributedCPUCache` instead of `CPUCache`. This cache broadcasts every tensor to all ranks — unnecessary when each rank loads the model independently via safetensors mmap. - -The bottleneck hits in two places: -1. `from_accelerate()` → `dispatch_with_map()` -2. `set_onload_device()` in SequentialPipeline - -## Fixes Applied - -### Fix 1: `src/llmcompressor/utils/dev.py` — `get_main_device()` - -**Bug**: Used `rank` as the CUDA device index, which is wrong when `GPUS_PER_GROUP > 1`. -**Fix**: Use `torch.accelerator.current_device_index()` which respects `torch.cuda.set_device()`. - -```python -# Before (line 140): -return torch.device(accel_type, rank) - -# After: -return torch.device(accel_type, torch.accelerator.current_device_index()) -``` - -### Fix 2: `src/llmcompressor/modifiers/autoround/base.py` — anchor device in `apply_autoround` - -**Bug**: Hardcoded `device = torch.device("cuda:0")` when `needs_multi_gpu` is true. Rank 1 with GPUs [2,3] would try to anchor on cuda:0 instead of cuda:2. -**Fix**: Use `get_main_device()` which returns the correct per-rank device. - -```python -# Before (line ~329): -device = torch.device("cuda:0") - -# After: -from llmcompressor.utils.dev import get_main_device -device = get_main_device() -``` - -### Fix 3: `src/llmcompressor/modifiers/autoround/base.py` — GPU partition in `_update_device_map_for_dp` - -**Bug**: Generated `"0,1"` for all ranks instead of per-rank GPU partitions. -**Fix**: Offset by `local_rank * gpus_per_group`. - -```python -# Before: -ar_kwargs["device_map"] = ",".join(str(i) for i in range(gpus_per_group)) - -# After: -local_rank = torch.distributed.get_rank() -start_gpu = local_rank * gpus_per_group -ar_kwargs["device_map"] = ",".join(str(start_gpu + i) for i in range(gpus_per_group)) -``` - -### Patch 4 (monkey-patch, needs upstream in compressed-tensors): Force local cache - -Patches `OffloadCache.cls_from_device` to return `CPUCache`/`DeviceCache` instead of `DistributedCPUCache`/`DistributedDeviceCache`. This is correct when each rank loads the model independently. - -See `patch_force_local_cache()` in `test_option3_fixed.py`. - -### Patch 5 (monkey-patch, needs upstream in compressed-tensors): Disable onloading during quant init - -Wraps `initialize_module_for_quantization` with `disable_onloading()` to avoid per-parameter broadcast+barrier when new quantization parameters are created. - -See `patch_disable_onloading_for_quant_init()` in `test_option3_fixed.py`. - -## Reproduce - -### Prerequisites - -```bash -# Environment -source /home/yiliu7/workspace/venvs/llmc/bin/activate - -# Working directory -cd /home/yiliu7/workspace/llm-compressor -``` - -### Run on Qwen3-8B (quick verification, ~2 minutes) - -```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ - --nproc_per_node=2 \ - examples/autoround/ddp/ddp_autoround.py \ - --model /storage/yiliu7/Qwen/Qwen3-8B \ - --iters 5 --nsamples 32 -``` - -### Run on Qwen3-235B (full test, ~47 minutes) - -```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ - --nproc_per_node=2 \ - examples/autoround/ddp/ddp_autoround.py \ - --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ \ - --iters 20 --nsamples 32 -``` - -### Expected behavior - -- Both ranks process all 94 layers in lockstep (~30s/layer on 235B) -- All 4 GPUs show active memory usage (~56-63 GB each) -- Each rank uses 2 GPUs: rank 0 → [0,1], rank 1 → [2,3] -- Small NCCL idle contexts (~614 MB) appear on non-owned GPUs — this is normal - -### Monitor progress - -```bash -# GPU utilization -nvidia-smi --query-gpu=index,utilization.gpu,memory.used --format=csv,noheader - -# Layer progress (from log) -grep "Applying AutoRound" /path/to/log | tail -6 -``` - -## Known Issues - -1. **8 GPU process entries in nvidia-smi**: Each of the 2 torchrun processes creates a small NCCL context (~614 MB) on all visible GPUs. Only 4 entries are doing real work (the ~56-63 GB ones). This is unavoidable without a pre-launch wrapper that restricts `CUDA_VISIBLE_DEVICES` before Python starts. - -2. **OOM on layer ~11 (235B)**: With 20 iters and the full 235B model, GPU memory may be tight. Reduce `--iters` or `--nsamples` if OOM occurs. - -## Upstream Plan - -### PR 1: llm-compressor — Multi-GPU DDP device fixes - -**Scope**: Fixes 1–3 above. Clean code changes, no monkey-patches. - -**Changes**: -- `src/llmcompressor/utils/dev.py`: `get_main_device()` uses `current_device_index()` instead of `rank` -- `src/llmcompressor/modifiers/autoround/base.py`: - - `apply_autoround` anchor device uses `get_main_device()` instead of hardcoded `cuda:0` - - `_update_device_map_for_dp` offsets GPU indices by `local_rank * gpus_per_group` - -**Testing**: Run DDP AutoRound on Qwen3-8B with 4 GPUs (2 per rank). Verify all GPUs participate and no device mismatch errors. - ---- - -### PR 2: compressed-tensors — Skip distributed cache when ranks have local parameters - -**Problem**: `OffloadCache.cls_from_device("cpu")` unconditionally returns `DistributedCPUCache` when `dist.is_initialized()`. This causes O(n_params) broadcast+barrier ops (~218ms each) even when all ranks already have parameters locally (via independent `from_pretrained` loading with safetensors mmap). - -**Proposed fix**: Add a `distributed` parameter to `cls_from_device` with auto-detection: - -```python -# compressed_tensors/offload/cache/base.py - -@classmethod -def cls_from_device(cls, device=None, distributed=None): - """ - Args: - distributed: If None (default), auto-detect based on whether - dist is initialized. If False, always return local cache. - If True, always return distributed cache. - """ - if distributed is None: - distributed = ( - torch.distributed.is_initialized() - and torch.distributed.get_world_size() > 1 - ) - - device_type = torch.device(device).type if device != "disk" else "disk" - if device_type == "cpu": - return DistributedCPUCache if distributed else CPUCache - elif is_accelerator_type(device_type): - return DistributedDeviceCache if distributed else DeviceCache - elif device_type == "disk": - return DiskCache - ... -``` - -**Callers that should pass `distributed=False`**: -- `set_onload_device()` when the model was loaded independently on each rank (no meta tensors) -- Any path where the caller knows parameters are already materialized locally - -**Alternative approach** — context manager: - -```python -# compressed_tensors/offload/cache/base.py - -_force_local_cache = threading.local() - -@contextlib.contextmanager -def force_local_cache(): - """Context under which cls_from_device always returns non-distributed caches.""" - _force_local_cache.active = True - try: - yield - finally: - _force_local_cache.active = False - -@classmethod -def cls_from_device(cls, device=None): - distributed = ( - torch.distributed.is_initialized() - and torch.distributed.get_world_size() > 1 - and not getattr(_force_local_cache, 'active', False) - ) - ... -``` - -This lets llm-compressor wrap its pipeline with `force_local_cache()` without modifying every callsite. - -**Testing**: -- Existing tests pass (distributed cache still used by default) -- DDP test with independent model loading uses local cache, no broadcast overhead - ---- - -### PR 3: compressed-tensors — Wrap quant init with `disable_onloading()` - -**Problem**: `initialize_module_for_quantization` creates new parameters (scale, zero_point, etc.) which immediately trigger `DistributedCPUCache.offload()` → broadcast+barrier. These parameters are created identically on every rank, so broadcasting is always redundant. - -**Proposed fix**: Wrap the function body with `disable_onloading()`: - -```python -# compressed_tensors/quantization/lifecycle/initialize.py - -def initialize_module_for_quantization(module, scheme=None, force_zero_point=True): - with disable_onloading(): - # ... existing implementation ... -``` - -**Rationale**: New quant parameters are initialized from the quantization scheme (not from model weights), so they're identical across ranks by construction. There's no information to broadcast. - -**Testing**: DDP quantization should show no broadcast calls during `initialize_module_for_quantization`. Single-process behavior unchanged. - ---- - -### Priority - -1. **PR 3** (highest): Universal fix, always correct, simple one-liner -2. **PR 2** (high): Eliminates the main bottleneck for independent-loading DDP -3. **PR 1** (medium): Required for multi-GPU-per-rank scenarios (GPUS_PER_GROUP > 1) diff --git a/examples/autoround/ddp/FX_TRACE_ISSUE.md b/examples/autoround/ddp/FX_TRACE_ISSUE.md deleted file mode 100644 index 62aa603d5a..0000000000 --- a/examples/autoround/ddp/FX_TRACE_ISSUE.md +++ /dev/null @@ -1,58 +0,0 @@ - -# FX Trace Bottleneck in SequentialPipeline - -## Problem - -`trace_subgraphs()` builds an FX graph of the full model (O(n_modules)) before per-layer calibration. For 235B with 61K modules, this never finishes. - -## Scope - -| Modifier | Pipeline | Needs trace? | 235B hangs? | -|----------|----------|-------------|-------------| -| RTN | `DataFreePipeline` | No | Never | -| AWQ | `SequentialPipeline` | Yes | Only in DDP | -| GPTQ | `SequentialPipeline` | Yes | Only in DDP | -| AutoRound | `SequentialPipeline` | Yes | Only in DDP | - -## Root cause (DDP-specific) - -`load_offloaded_model()` → `from_accelerate()` → `dist.broadcast_object_list([61K-entry device_map, offload_dir])` serializes a massive dict via pickle. Rank 1's `dispatch_with_map` then creates OffloadCache for all 61K modules. Without DDP, `from_accelerate` dispatches locally — no broadcast, no wait. - -## Loading strategies for 235B DDP - -| Strategy | Load time | Trace | Works? | -|----------|-----------|-------|--------| -| `load_offloaded_model` + `device_map="auto"` (GPU) | 420s | Fast | No — OOM (1 GPU/rank, 178GB fills completely) | -| `load_offloaded_model` + `device_map="auto_offload"` (CPU) | 10s | Hangs | No — 61K broadcast + dispatch | -| CPU-only + sparse offload + `fast_pipeline.py` | 9s | 5s | **Yes** | - -## Fixes applied - -1. **`helpers.py`** — Removed `disable_onloading()` from `trace_subgraphs` (allows GPU onload) -2. **`fast_pipeline.py`** — Replaces `SequentialPipeline.__call__` with regex-based layer scanning, no FX trace. Required for 235B DDP. -3. **`distributed.py`** — Fixed `comm_device` to use `current_device()`; returns `(block, sync_fn)` -4. **`quantizer.py`** — Captures return, calls `sync_gradients()` before `_step()` -5. **`base.py`** — `_get_local_gpu_group_size()` reads `GPUS_PER_GROUP` - -## Upstream plan - -The FX trace is the correct architecture — it handles arbitrary model graphs. For LLMs, a fast path that regex-matches `model.layers.*` is safe. The `fast_pipeline.py` logic should move into `helpers.py` as `trace_subgraphs_fast()`, gated by a `DatasetArguments.sequential_fast_trace` flag or auto-enabled when `module_count > threshold`. - -## Environment - -| Component | Path | -|-----------|------| -| Python | `/home/yiliu7/workspace/venvs/llmc/bin/python` | -| torchrun | `/home/yiliu7/workspace/venvs/llmc/bin/torchrun` | -| llm-compressor | `/home/yiliu7/workspace/llm-compressor` | -| auto-round | `/home/yiliu7/workspace/ar-py` (used by venv) | -| GPUs | 8× NVIDIA B200, 180 GiB each | -| Test GPU subset | `CUDA_VISIBLE_DEVICES=0,1,2,3` | - -## Required env vars - -| Var | Value | Why | -|-----|-------|-----| -| `GPUS_PER_GROUP` | `2` | Triggers multi-GPU block dispatch + manual all_reduce sync | -| `AR_DISABLE_DATASET_SUBPROCESS` | `1` | Avoids `fork()` with CUDA context in `calib_dataset.py` | -| `CUDA_VISIBLE_DEVICES` | `0,1,2,3` | GPU partition (4 GPUs for 2 ranks) | diff --git a/examples/autoround/ddp/reproduce.md b/examples/autoround/ddp/reproduce.md deleted file mode 100644 index 099d45f523..0000000000 --- a/examples/autoround/ddp/reproduce.md +++ /dev/null @@ -1,99 +0,0 @@ -# Multi-GPU DDP AutoRound Reproduce - -## torchrun (recommended) - -### 8B - -```bash -cd /home/yiliu7/workspace/llm-compressor - -bash examples/autoround/ddp/launch_torchrun.sh \ - --model /storage/yiliu7/Qwen/Qwen3-8B \ - --scheme W4A16 \ - --nsamples 32 --iters 50 \ - --disable_torch_compile -``` - -### 235B - -```bash -cd /home/yiliu7/workspace/llm-compressor - -AR_DISABLE_DATASET_SUBPROCESS=1 GPUS_PER_GROUP=2 CUDA_VISIBLE_DEVICES=0,1,2,3 \ -/home/yiliu7/workspace/venvs/llmc/bin/torchrun --nproc_per_node=2 --master_port=29500 \ -examples/autoround/ddp/ddp_qwen3_multi_gpu_torchrun.py \ ---model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507 \ ---scheme W4A16 --nsamples 32 --iters 50 --disable_torch_compile -``` - -## bash wrapper (dedicated GPU isolation) - -```bash -cd /home/yiliu7/workspace/llm-compressor - -AR_DISABLE_DATASET_SUBPROCESS=1 CUDA_VISIBLE_DEVICES=0,1,6,7 GPUS_PER_GROUP=2 NPROC=2 MASTER_PORT=29501 \ - bash examples/autoround/ddp/launch_multi_gpu.sh \ - ddp_qwen3_multi_gpu_example.py \ - --model /storage/yiliu7/Qwen/Qwen3-8B \ - --scheme W4A16 \ - --nsamples 32 --iters 50 \ - --disable_torch_compile \ - > /tmp/multi_gpu_test.log 2>&1 & -``` - -## Monitor - -```bash -tail -f /tmp/multi_gpu_test.log -ps aux | grep ddp_qwen3_multi | grep -v grep -nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader -pkill -f ddp_qwen3_multi_gpu -``` - -## Verified - -### 8B (2026-06-18) -``` -quantized 7/7 layers in the block, loss iter 0: 19.067873 -> iter 0: 19.067873 -[Rank 0] Quantization completed -Hello my name is Mandy I am 20 years old... -``` -All 37 decoder layers quantized, identical loss across ranks, sample generation works. - -### 235B (2026-06-19) -``` -quantized 388/389 layers in the block, loss iter 0: 0.211156 -> iter 0: 0.211156 -... -[Rank 0] Quantization completed -``` -All 94 decoder layers quantized (388 Linear per MoE block), identical loss across ranks. ~25 min for 1 iter. - -## Key Files - -| File | Change | -|------|--------| -| `examples/autoround/ddp/ddp_qwen3_multi_gpu_torchrun.py` | torchrun example with patches | -| `examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py` | bash wrapper example | -| `examples/autoround/ddp/fast_pipeline.py` | Replaces `SequentialPipeline.__call__` — no FX trace | -| `examples/autoround/ddp/launch_torchrun.sh` | torchrun launcher | -| `examples/autoround/ddp/launch_multi_gpu.sh` | bash wrapper (GPU partitioning) | -| `src/llmcompressor/modifiers/autoround/base.py` | `_get_local_gpu_group_size()` reads `GPUS_PER_GROUP` | -| `src/llmcompressor/pipelines/sequential/helpers.py` | Removed `disable_onloading()` from `trace_subgraphs` | -| `ar-py/auto_round/utils/distributed.py` | `setup_ddp_if_needed_` returns `(block, sync_fn)`; `current_device()` for NCCL | -| `ar-py/auto_round/algorithms/quantization/sign_round/quantizer.py` | Captures return, calls `sync_gradients()` before `_step()` | - -## Required env vars - -| Var | Value | Why | -|-----|-------|-----| -| `GPUS_PER_GROUP` | `2` | Triggers multi-GPU block dispatch + manual all_reduce sync | -| `AR_DISABLE_DATASET_SUBPROCESS` | `1` | Avoids `fork()` with CUDA context | -| `--disable_torch_compile` | flag | torch.compile can't handle cross-device tensors | - -## Known issue: FX trace bottleneck - -`trace_subgraphs` runs an FX trace on the full model — for 61K-module models (235B) it never finishes. The `fast_pipeline.py` module bypasses this by creating subgraphs directly from decoder layer names. This affects ALL models using `SequentialPipeline`, not just DDP. The AWQ example (`qwen3_moe_example_ddp.py`) with 30B MoE also hangs. - -## Venv - -Python: `/home/yiliu7/workspace/venvs/llmc/bin/python` From 0a7abbdf4c86ec2e6c076360b7faac08eb250daa Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 21 Jun 2026 11:40:20 +0000 Subject: [PATCH 09/22] fix Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 8f591d2343..d2a744309a 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -18,6 +18,7 @@ enable_quantization, ) from compressed_tensors.utils import align_module_device, match_named_modules +from llmcompressor.utils.dev import get_main_device from loguru import logger from pydantic import PrivateAttr @@ -317,7 +318,6 @@ def apply_autoround(self, state, modules): # so anchoring to first_param.device can place residual modules # (e.g. norms) on local cuda:1 while hidden states begin on # local cuda:0, causing cross-device forward failures. - from llmcompressor.utils.dev import get_main_device device = get_main_device() # Move decoding layer to CPU first, then the submodules # will be re-dispatched by AutoRound. From 1422ebcc0305b98ee93e10bd57101eb7071e3e9c Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 21 Jun 2026 12:37:43 +0000 Subject: [PATCH 10/22] update Signed-off-by: yiliu30 --- .../autoround/ddp/{ddp_autoround.py => ddp_qwen3_moe_example.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/autoround/ddp/{ddp_autoround.py => ddp_qwen3_moe_example.py} (100%) diff --git a/examples/autoround/ddp/ddp_autoround.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py similarity index 100% rename from examples/autoround/ddp/ddp_autoround.py rename to examples/autoround/ddp/ddp_qwen3_moe_example.py From 3f03bc6d4d80e454c8e57d84feb2e6c9b6fb0b22 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 21 Jun 2026 12:38:48 +0000 Subject: [PATCH 11/22] fix Signed-off-by: yiliu30 --- examples/autoround/ddp/ddp_qwen3_moe_example.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py index 89f961377e..6f6a6c07bb 100644 --- a/examples/autoround/ddp/ddp_qwen3_moe_example.py +++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py @@ -6,9 +6,9 @@ Run with: CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ - --nproc_per_node=2 ddp_autoround.py \ + --nproc_per_node=2 ddp_qwen3_moe_example.py \ --iters 100 --nsamples 256 \ - --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ + --model Qwen/Qwen3-235B-A22B-Instruct-2507 """ import argparse @@ -81,8 +81,8 @@ def cls_from_device_local(cls, device=None): parser = argparse.ArgumentParser() parser.add_argument("--model", type=str, required=True) parser.add_argument("--scheme", type=str, default="W4A16") - parser.add_argument("--iters", type=int, default=5) - parser.add_argument("--nsamples", type=int, default=128) + parser.add_argument("--iters", type=int, default=100) + parser.add_argument("--nsamples", type=int, default=256) args = parser.parse_args() ###### DDP INIT ##### From 2db2a84478e90ae8175e4e914c7807c68240a4f7 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 21 Jun 2026 12:55:41 +0000 Subject: [PATCH 12/22] update Signed-off-by: yiliu30 --- examples/autoround/ddp/ddp_qwen3_moe_example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py index 6f6a6c07bb..40c1f694e3 100644 --- a/examples/autoround/ddp/ddp_qwen3_moe_example.py +++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py @@ -23,7 +23,7 @@ from llmcompressor import oneshot - +# FIXME: (yiliu30) remove this patch before merging def patch_disable_onloading_for_quant_init(): """Avoid dist.broadcast + barrier for every new quant parameter. @@ -106,7 +106,7 @@ def cls_from_device_local(cls, device=None): ) # Apply patches BEFORE model loading and calibration - # FIXME: (yiliu30) remove these patched before merging once the underlying issues are fixed + # FIXME: (yiliu30) remove these patches before merging once the underlying issues are fixed patch_disable_onloading_for_quant_init() patch_force_local_cache() From 776115343166d9b484da299721b78a7bbd2cdde9 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 22 Jun 2026 05:48:15 +0000 Subject: [PATCH 13/22] Simplify ddp_qwen3_moe_example: remove argparse, hardcode model config Use force_local_cache() from compressed-tensors instead of monkey-patches --- .../autoround/ddp/ddp_qwen3_moe_example.py | 112 ++++-------------- 1 file changed, 25 insertions(+), 87 deletions(-) diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py index 40c1f694e3..0271baf61d 100644 --- a/examples/autoround/ddp/ddp_qwen3_moe_example.py +++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py @@ -6,13 +6,9 @@ Run with: CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ - --nproc_per_node=2 ddp_qwen3_moe_example.py \ - --iters 100 --nsamples 256 \ - --model Qwen/Qwen3-235B-A22B-Instruct-2507 + --nproc_per_node=2 ddp_qwen3_moe_example.py """ -import argparse -import importlib import os import time @@ -22,68 +18,14 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot - -# FIXME: (yiliu30) remove this patch before merging -def patch_disable_onloading_for_quant_init(): - """Avoid dist.broadcast + barrier for every new quant parameter. - - compressed-tensors' initialize_module_for_quantization creates new - parameters which trigger DistributedCPUCache's per-param broadcast. - Wrapping with disable_onloading() prevents this. - """ - from compressed_tensors.offload import disable_onloading - - lifecycle_init_mod = importlib.import_module( - "compressed_tensors.quantization.lifecycle.initialize" - ) - original_fn = lifecycle_init_mod.initialize_module_for_quantization - if getattr(original_fn, "_patched", False): - return - - def patched(module, scheme=None, force_zero_point=True): - with disable_onloading(): - return original_fn(module, scheme=scheme, force_zero_point=force_zero_point) - - patched._patched = True - lifecycle_init_mod.initialize_module_for_quantization = patched - - -def patch_force_local_cache(): - """Force OffloadCache.cls_from_device to return non-distributed caches. - - When ranks load the model independently, each already has parameters - locally. DistributedCPUCache's per-param broadcast+barrier is - unnecessary and causes O(n_params) collective ops (~218ms each). - """ - from compressed_tensors.offload.cache.base import OffloadCache - from compressed_tensors.offload.cache.cpu import CPUCache - from compressed_tensors.offload.cache.device import DeviceCache - from compressed_tensors.offload.cache.disk import DiskCache - from compressed_tensors.utils import is_accelerator_type - - @classmethod - def cls_from_device_local(cls, device=None): - device_type = torch.device(device).type if device != "disk" else "disk" - if device_type == "cpu": - return CPUCache - elif is_accelerator_type(device_type): - return DeviceCache - elif device_type == "disk": - return DiskCache - else: - raise NotImplementedError(f"Offload of type {device_type} not implemented") - - OffloadCache.cls_from_device = cls_from_device_local - logger.info("Patched OffloadCache.cls_from_device → local (non-distributed) caches") +from compressed_tensors.offload.cache.base import force_local_cache if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--model", type=str, required=True) - parser.add_argument("--scheme", type=str, default="W4A16") - parser.add_argument("--iters", type=int, default=100) - parser.add_argument("--nsamples", type=int, default=256) - args = parser.parse_args() + MODEL = "/storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507" + SCHEME = "W4A16" + ITERS = 100 + NSAMPLES = 256 ###### DDP INIT ##### gpus_per_group = int(os.environ.get("GPUS_PER_GROUP", "1")) @@ -105,48 +47,44 @@ def cls_from_device_local(cls, device=None): f"main_gpu: {main_gpu}, group: [{main_gpu}-{main_gpu + gpus_per_group - 1}]" ) - # Apply patches BEFORE model loading and calibration - # FIXME: (yiliu30) remove these patches before merging once the underlying issues are fixed - patch_disable_onloading_for_quant_init() - patch_force_local_cache() - ###### MODEL LOAD ##### load_start = time.perf_counter() - model = AutoModelForCausalLM.from_pretrained(args.model, dtype="auto") + model = AutoModelForCausalLM.from_pretrained(MODEL, dtype="auto") load_elapsed = time.perf_counter() - load_start logger.info(f"[Rank {rank}] Model loaded on CPU in {load_elapsed:.1f}s") - tokenizer = AutoTokenizer.from_pretrained(args.model) + tokenizer = AutoTokenizer.from_pretrained(MODEL) ###### DATASET ##### os.environ["AR_DISABLE_DATASET_SUBPROCESS"] = "1" from auto_round.calib_dataset import get_dataset from llmcompressor.modifiers.autoround import AutoRoundModifier - ds = get_dataset(tokenizer=tokenizer, seqlen=2048, nsamples=args.nsamples) + ds = get_dataset(tokenizer=tokenizer, seqlen=2048, nsamples=NSAMPLES) ###### RECIPE ##### recipe = AutoRoundModifier( targets="Linear", - scheme=args.scheme, + scheme=SCHEME, ignore=["lm_head", "re:.*mlp.gate$"], - iters=args.iters, + iters=ITERS, enable_torch_compile=False, ) ###### QUANTIZE ##### logger.info(f"[Rank {rank}] Starting oneshot...") quant_start = time.perf_counter() - oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=2048, - num_calibration_samples=args.nsamples, - shuffle_calibration_samples=False, - ) - quant_elapsed = time.perf_counter() - quant_start - logger.info(f"[Rank {rank}] Quantization done in {quant_elapsed:.1f}s") + with force_local_cache(): + oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=2048, + num_calibration_samples=NSAMPLES, + shuffle_calibration_samples=False, + ) + quant_elapsed = time.perf_counter() - quant_start + logger.info(f"[Rank {rank}] Quantization done in {quant_elapsed:.1f}s") if dist.is_initialized(): dist.barrier() @@ -160,9 +98,9 @@ def cls_from_device_local(cls, device=None): if rank == 0: save_dir = ( "/storage/yiliu7/Qwen/" - + args.model.rstrip("/").split("/")[-1] - + f"-{args.scheme}-AutoRound" - + f"-iters{args.iters}-nsamples{args.nsamples}" + + MODEL.rstrip("/").split("/")[-1] + + f"-{SCHEME}-AutoRound" + + f"-iters{ITERS}-nsamples{NSAMPLES}" + f"-DDP{world_size}" ) logger.info(f"Saving to {save_dir}...") From 9e63922b6ca838c3da83a932b1eb38c3d314db9d Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 22 Jun 2026 06:02:12 +0000 Subject: [PATCH 14/22] Remove __main__ guard, fix quant_elapsed scope --- .../autoround/ddp/ddp_qwen3_moe_example.py | 194 +++++++++--------- 1 file changed, 95 insertions(+), 99 deletions(-) diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py index 0271baf61d..4942de2c2f 100644 --- a/examples/autoround/ddp/ddp_qwen3_moe_example.py +++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py @@ -20,105 +20,101 @@ from llmcompressor import oneshot from compressed_tensors.offload.cache.base import force_local_cache - -if __name__ == "__main__": - MODEL = "/storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507" - SCHEME = "W4A16" - ITERS = 100 - NSAMPLES = 256 - - ###### DDP INIT ##### - gpus_per_group = int(os.environ.get("GPUS_PER_GROUP", "1")) - if "TORCHELASTIC_RUN_ID" in os.environ: - local_rank = int(os.environ["LOCAL_RANK"]) - main_gpu = local_rank * gpus_per_group - torch.cuda.set_device(main_gpu) - dist.init_process_group( - backend="nccl", - init_method="env://", - device_id=torch.device(f"cuda:{main_gpu}"), - ) - - rank = dist.get_rank() if dist.is_initialized() else 0 - world_size = dist.get_world_size() if dist.is_initialized() else 1 - main_gpu = rank * gpus_per_group - logger.info( - f"[Rank {rank}/{world_size}] GPUs: {torch.cuda.device_count()}, " - f"main_gpu: {main_gpu}, group: [{main_gpu}-{main_gpu + gpus_per_group - 1}]" +MODEL = "/storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507" +SCHEME = "W4A16" +ITERS = 100 +NSAMPLES = 256 + +###### DDP INIT ##### +gpus_per_group = int(os.environ.get("GPUS_PER_GROUP", "1")) +if "TORCHELASTIC_RUN_ID" in os.environ: + local_rank = int(os.environ["LOCAL_RANK"]) + main_gpu = local_rank * gpus_per_group + torch.cuda.set_device(main_gpu) + dist.init_process_group( + backend="nccl", + init_method="env://", + device_id=torch.device(f"cuda:{main_gpu}"), ) - ###### MODEL LOAD ##### - load_start = time.perf_counter() - model = AutoModelForCausalLM.from_pretrained(MODEL, dtype="auto") - load_elapsed = time.perf_counter() - load_start - logger.info(f"[Rank {rank}] Model loaded on CPU in {load_elapsed:.1f}s") - - tokenizer = AutoTokenizer.from_pretrained(MODEL) - - ###### DATASET ##### - os.environ["AR_DISABLE_DATASET_SUBPROCESS"] = "1" - from auto_round.calib_dataset import get_dataset - from llmcompressor.modifiers.autoround import AutoRoundModifier - - ds = get_dataset(tokenizer=tokenizer, seqlen=2048, nsamples=NSAMPLES) - - ###### RECIPE ##### - recipe = AutoRoundModifier( - targets="Linear", - scheme=SCHEME, - ignore=["lm_head", "re:.*mlp.gate$"], - iters=ITERS, - enable_torch_compile=False, +rank = dist.get_rank() if dist.is_initialized() else 0 +world_size = dist.get_world_size() if dist.is_initialized() else 1 +main_gpu = rank * gpus_per_group +logger.info( + f"[Rank {rank}/{world_size}] GPUs: {torch.cuda.device_count()}, " + f"main_gpu: {main_gpu}, group: [{main_gpu}-{main_gpu + gpus_per_group - 1}]" +) + +###### MODEL LOAD ##### +load_start = time.perf_counter() +model = AutoModelForCausalLM.from_pretrained(MODEL, dtype="auto") +load_elapsed = time.perf_counter() - load_start +logger.info(f"[Rank {rank}] Model loaded on CPU in {load_elapsed:.1f}s") + +tokenizer = AutoTokenizer.from_pretrained(MODEL) + +###### DATASET ##### +os.environ["AR_DISABLE_DATASET_SUBPROCESS"] = "1" +from auto_round.calib_dataset import get_dataset +from llmcompressor.modifiers.autoround import AutoRoundModifier + +ds = get_dataset(tokenizer=tokenizer, seqlen=2048, nsamples=NSAMPLES) + +###### RECIPE ##### +recipe = AutoRoundModifier( + targets="Linear", + scheme=SCHEME, + ignore=["lm_head", "re:.*mlp.gate$"], + iters=ITERS, + enable_torch_compile=False, +) + +###### QUANTIZE ##### +logger.info(f"[Rank {rank}] Starting oneshot...") +quant_start = time.perf_counter() +with force_local_cache(): + oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=2048, + num_calibration_samples=NSAMPLES, + shuffle_calibration_samples=False, ) - - ###### QUANTIZE ##### - logger.info(f"[Rank {rank}] Starting oneshot...") - quant_start = time.perf_counter() - with force_local_cache(): - oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=2048, - num_calibration_samples=NSAMPLES, - shuffle_calibration_samples=False, - ) - quant_elapsed = time.perf_counter() - quant_start - logger.info(f"[Rank {rank}] Quantization done in {quant_elapsed:.1f}s") - - if dist.is_initialized(): - dist.barrier() - - - ###### SAVE (rank 0 only) ##### - if dist.is_initialized(): - dist.barrier() - dist.destroy_process_group() - - if rank == 0: - save_dir = ( - "/storage/yiliu7/Qwen/" - + MODEL.rstrip("/").split("/")[-1] - + f"-{SCHEME}-AutoRound" - + f"-iters{ITERS}-nsamples{NSAMPLES}" - + f"-DDP{world_size}" - ) - logger.info(f"Saving to {save_dir}...") - model.save_pretrained(save_dir, save_compressed=True) - tokenizer.save_pretrained(save_dir) - logger.info(f"Saved to {save_dir}") - - ###### SAMPLE GENERATION (rank 0 only) ##### - if rank == 0: - from compressed_tensors.offload import dispatch_model - - logger.info("========== SAMPLE GENERATION ==============") - dispatch_model(model) - sample = tokenizer("Hello my name is", return_tensors="pt") - sample = {key: value.to(model.device) for key, value in sample.items()} - output = model.generate(**sample, max_new_tokens=100) - logger.info(tokenizer.decode(output[0])) - logger.info("==========================================") - - - logger.info(f"[Rank {rank}] SUCCESS") +quant_elapsed = time.perf_counter() - quant_start +logger.info(f"[Rank {rank}] Quantization done in {quant_elapsed:.1f}s") + +if dist.is_initialized(): + dist.barrier() + +###### SAVE (rank 0 only) ##### +if dist.is_initialized(): + dist.barrier() + dist.destroy_process_group() + +if rank == 0: + save_dir = ( + "/storage/yiliu7/Qwen/" + + MODEL.rstrip("/").split("/")[-1] + + f"-{SCHEME}-AutoRound" + + f"-iters{ITERS}-nsamples{NSAMPLES}" + + f"-DDP{world_size}" + ) + logger.info(f"Saving to {save_dir}...") + model.save_pretrained(save_dir, save_compressed=True) + tokenizer.save_pretrained(save_dir) + logger.info(f"Saved to {save_dir}") + +###### SAMPLE GENERATION (rank 0 only) ##### +if rank == 0: + from compressed_tensors.offload import dispatch_model + + logger.info("========== SAMPLE GENERATION ==============") + dispatch_model(model) + sample = tokenizer("Hello my name is", return_tensors="pt") + sample = {key: value.to(model.device) for key, value in sample.items()} + output = model.generate(**sample, max_new_tokens=100) + logger.info(tokenizer.decode(output[0])) + logger.info("==========================================") + +logger.info(f"[Rank {rank}] SUCCESS") From 0cae4061ddd451bd4f5bfe657238305cb9d28a1c Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 22 Jun 2026 06:17:34 +0000 Subject: [PATCH 15/22] =?UTF-8?q?Remove=20TORCHELASTIC=5FRUN=5FID=20guard?= =?UTF-8?q?=20=E2=80=94=20always=20run=20via=20torchrun?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../autoround/ddp/ddp_qwen3_moe_example.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py index 4942de2c2f..f6707ff272 100644 --- a/examples/autoround/ddp/ddp_qwen3_moe_example.py +++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py @@ -27,18 +27,17 @@ ###### DDP INIT ##### gpus_per_group = int(os.environ.get("GPUS_PER_GROUP", "1")) -if "TORCHELASTIC_RUN_ID" in os.environ: - local_rank = int(os.environ["LOCAL_RANK"]) - main_gpu = local_rank * gpus_per_group - torch.cuda.set_device(main_gpu) - dist.init_process_group( - backend="nccl", - init_method="env://", - device_id=torch.device(f"cuda:{main_gpu}"), - ) +local_rank = int(os.environ["LOCAL_RANK"]) +main_gpu = local_rank * gpus_per_group +torch.cuda.set_device(main_gpu) +dist.init_process_group( + backend="nccl", + init_method="env://", + device_id=torch.device(f"cuda:{main_gpu}"), +) -rank = dist.get_rank() if dist.is_initialized() else 0 -world_size = dist.get_world_size() if dist.is_initialized() else 1 +rank = dist.get_rank() +world_size = dist.get_world_size() main_gpu = rank * gpus_per_group logger.info( f"[Rank {rank}/{world_size}] GPUs: {torch.cuda.device_count()}, " From 8830293437911bf519e0840f6736e0fda731f073 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 23 Jun 2026 14:58:03 +0000 Subject: [PATCH 16/22] Wrap AutoRound on_initialize quantization init with force_local_cache Suppresses DistributedCPUCache per-param broadcast during mass quantization init (each scale/zero_point register_parameter triggers a collective op). Uses try/except ImportError for backwards compat with older compressed-tensors versions. --- src/llmcompressor/modifiers/autoround/base.py | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index d2a744309a..d899ddf6f9 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -92,6 +92,78 @@ def suspend_offloading(model: nn.Module): offload_module(module, *offloading_info[name]) +import os +import torch + +try: + import psutil +except ImportError: + psutil = None + + +def fmt_bytes(num_bytes: int) -> str: + gb = num_bytes / 1024**3 + return f"{gb:.2f} GB" + + +def dump_memory_usage(): + print("=" * 80) + print("CPU Memory") + print("=" * 80) + + if psutil is not None: + proc = psutil.Process(os.getpid()) + rss = proc.memory_info().rss + vms = proc.memory_info().vms + sys_mem = psutil.virtual_memory() + + print(f"Process RSS : {fmt_bytes(rss)}") + print(f"Process VMS : {fmt_bytes(vms)}") + print(f"System Used : {fmt_bytes(sys_mem.used)} / {fmt_bytes(sys_mem.total)}") + print(f"System Available : {fmt_bytes(sys_mem.available)}") + else: + print("psutil is not installed. Install with: pip install psutil") + + print() + print("=" * 80) + print("CUDA Memory") + print("=" * 80) + + if not torch.cuda.is_available(): + print("CUDA is not available.") + return + + num_devices = torch.cuda.device_count() + print(f"CUDA devices: {num_devices}") + + for i in range(num_devices): + props = torch.cuda.get_device_properties(i) + + allocated = torch.cuda.memory_allocated(i) + reserved = torch.cuda.memory_reserved(i) + max_allocated = torch.cuda.max_memory_allocated(i) + max_reserved = torch.cuda.max_memory_reserved(i) + + free, total = torch.cuda.mem_get_info(i) + used_total = total - free + + print() + print(f"[cuda:{i}] {props.name}") + print(f" Total memory : {fmt_bytes(total)}") + print(f" Free memory : {fmt_bytes(free)}") + print(f" Used memory : {fmt_bytes(used_total)}") + print(f" Torch allocated : {fmt_bytes(allocated)}") + print(f" Torch reserved : {fmt_bytes(reserved)}") + print(f" Max allocated : {fmt_bytes(max_allocated)}") + print(f" Max reserved : {fmt_bytes(max_reserved)}") + + print("=" * 80) + + +# if __name__ == "__main__": +# dump_memory_usage() + + class AutoRoundModifier(Modifier, QuantizationMixin): """ Implements the AutoRound algorithm from https://aclanthology.org/2024.findings-emnlp.662.pdf. @@ -292,6 +364,7 @@ def apply_autoround(self, state, modules): align_module_device(decoding_layer), suspend_offloading(wrapped_model), ): + dump_memory_usage() self._update_device_map_for_dp(kwargs) ar = AutoRound( model=wrapped_model, From be1b04bf3b7854bfce89eec9911e4a1eb023fb61 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 24 Jun 2026 03:09:29 +0000 Subject: [PATCH 17/22] Clean AutoRound DDP: standard load_offloaded_model, remove force_local_cache - Remove debug memory dump code from base.py - Remove force_local_cache from on_initialize (matches GPTQ pattern) - Standard load_offloaded_model + auto_offload in example - Verified on 30B (49 layers) and 235B (first 2 layers) --- .../autoround/ddp/ddp_qwen3_moe_example.py | 43 ++++++----- src/llmcompressor/modifiers/autoround/base.py | 73 ------------------- 2 files changed, 21 insertions(+), 95 deletions(-) diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py index f6707ff272..04d28ad739 100644 --- a/examples/autoround/ddp/ddp_qwen3_moe_example.py +++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py @@ -1,8 +1,9 @@ """ DDP AutoRound quantization example for large MoE models. -Runs 2 ranks, each using GPUS_PER_GROUP GPUs. All ranks load the model -independently on CPU (safetensors mmap shares physical pages at OS level). +Uses the standard compressed-tensors DDP path: load_offloaded_model() +broadcasts weights from rank 0 to rank 1. GPUS_PER_GROUP controls how +many GPUs each rank uses for per-block model parallelism. Run with: CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ @@ -14,15 +15,14 @@ import torch import torch.distributed as dist +from compressed_tensors.offload import load_offloaded_model from loguru import logger from transformers import AutoModelForCausalLM, AutoTokenizer - from llmcompressor import oneshot -from compressed_tensors.offload.cache.base import force_local_cache MODEL = "/storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507" SCHEME = "W4A16" -ITERS = 100 +ITERS = 200 NSAMPLES = 256 ###### DDP INIT ##### @@ -38,7 +38,6 @@ rank = dist.get_rank() world_size = dist.get_world_size() -main_gpu = rank * gpus_per_group logger.info( f"[Rank {rank}/{world_size}] GPUs: {torch.cuda.device_count()}, " f"main_gpu: {main_gpu}, group: [{main_gpu}-{main_gpu + gpus_per_group - 1}]" @@ -46,9 +45,11 @@ ###### MODEL LOAD ##### load_start = time.perf_counter() -model = AutoModelForCausalLM.from_pretrained(MODEL, dtype="auto") -load_elapsed = time.perf_counter() - load_start -logger.info(f"[Rank {rank}] Model loaded on CPU in {load_elapsed:.1f}s") +with load_offloaded_model(): + model = AutoModelForCausalLM.from_pretrained( + MODEL, dtype="auto", device_map="auto_offload", + ) +logger.info(f"[Rank {rank}] Loaded in {time.perf_counter() - load_start:.1f}s") tokenizer = AutoTokenizer.from_pretrained(MODEL) @@ -60,6 +61,7 @@ ds = get_dataset(tokenizer=tokenizer, seqlen=2048, nsamples=NSAMPLES) ###### RECIPE ##### + recipe = AutoRoundModifier( targets="Linear", scheme=SCHEME, @@ -71,17 +73,15 @@ ###### QUANTIZE ##### logger.info(f"[Rank {rank}] Starting oneshot...") quant_start = time.perf_counter() -with force_local_cache(): - oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=2048, - num_calibration_samples=NSAMPLES, - shuffle_calibration_samples=False, - ) -quant_elapsed = time.perf_counter() - quant_start -logger.info(f"[Rank {rank}] Quantization done in {quant_elapsed:.1f}s") +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=2048, + num_calibration_samples=NSAMPLES, + shuffle_calibration_samples=False, +) +logger.info(f"[Rank {rank}] Quantization done in {time.perf_counter() - quant_start:.1f}s") if dist.is_initialized(): dist.barrier() @@ -93,8 +93,7 @@ if rank == 0: save_dir = ( - "/storage/yiliu7/Qwen/" - + MODEL.rstrip("/").split("/")[-1] + MODEL.rstrip("/").split("/")[-1] + f"-{SCHEME}-AutoRound" + f"-iters{ITERS}-nsamples{NSAMPLES}" + f"-DDP{world_size}" diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index d899ddf6f9..d2a744309a 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -92,78 +92,6 @@ def suspend_offloading(model: nn.Module): offload_module(module, *offloading_info[name]) -import os -import torch - -try: - import psutil -except ImportError: - psutil = None - - -def fmt_bytes(num_bytes: int) -> str: - gb = num_bytes / 1024**3 - return f"{gb:.2f} GB" - - -def dump_memory_usage(): - print("=" * 80) - print("CPU Memory") - print("=" * 80) - - if psutil is not None: - proc = psutil.Process(os.getpid()) - rss = proc.memory_info().rss - vms = proc.memory_info().vms - sys_mem = psutil.virtual_memory() - - print(f"Process RSS : {fmt_bytes(rss)}") - print(f"Process VMS : {fmt_bytes(vms)}") - print(f"System Used : {fmt_bytes(sys_mem.used)} / {fmt_bytes(sys_mem.total)}") - print(f"System Available : {fmt_bytes(sys_mem.available)}") - else: - print("psutil is not installed. Install with: pip install psutil") - - print() - print("=" * 80) - print("CUDA Memory") - print("=" * 80) - - if not torch.cuda.is_available(): - print("CUDA is not available.") - return - - num_devices = torch.cuda.device_count() - print(f"CUDA devices: {num_devices}") - - for i in range(num_devices): - props = torch.cuda.get_device_properties(i) - - allocated = torch.cuda.memory_allocated(i) - reserved = torch.cuda.memory_reserved(i) - max_allocated = torch.cuda.max_memory_allocated(i) - max_reserved = torch.cuda.max_memory_reserved(i) - - free, total = torch.cuda.mem_get_info(i) - used_total = total - free - - print() - print(f"[cuda:{i}] {props.name}") - print(f" Total memory : {fmt_bytes(total)}") - print(f" Free memory : {fmt_bytes(free)}") - print(f" Used memory : {fmt_bytes(used_total)}") - print(f" Torch allocated : {fmt_bytes(allocated)}") - print(f" Torch reserved : {fmt_bytes(reserved)}") - print(f" Max allocated : {fmt_bytes(max_allocated)}") - print(f" Max reserved : {fmt_bytes(max_reserved)}") - - print("=" * 80) - - -# if __name__ == "__main__": -# dump_memory_usage() - - class AutoRoundModifier(Modifier, QuantizationMixin): """ Implements the AutoRound algorithm from https://aclanthology.org/2024.findings-emnlp.662.pdf. @@ -364,7 +292,6 @@ def apply_autoround(self, state, modules): align_module_device(decoding_layer), suspend_offloading(wrapped_model), ): - dump_memory_usage() self._update_device_map_for_dp(kwargs) ar = AutoRound( model=wrapped_model, From 8ac6e2f5dd90d20f1808202481a3f1e59a6059e0 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 24 Jun 2026 13:50:44 +0000 Subject: [PATCH 18/22] fix: prevent broadcast deadlock in AutoRound DDP on_initialize Wrap QuantizationMixin.initialize_quantization with disable_onloading() to suppress DistributedCPUCache's per-param broadcast_object_list+barrier when creating quant params (scale, zero_point). Root cause: with GPUS_PER_GROUP=2, device_map='auto_offload' assigns modules to different GPUs. initialize_qparams creates tensors on varying devices, causing GPU->CPU copy timing to differ between ranks. The paired broadcast_object_list calls desync -> deadlock at barrier. disable_onloading() bypasses the distributed path entirely. Quant params are deterministic across ranks (computed from the same scheme), so no synchronization is needed. Also fix example: save model before destroy_process_group (save_pretrained internally uses broadcast_object_list). --- examples/autoround/ddp/DDP_FIXES.md | 237 ++++++++++++++++++ examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md | 113 +++++++++ .../autoround/ddp/ddp_qwen3_moe_example.py | 12 +- examples/autoround/ddp/reproduce.md | 99 ++++++++ src/llmcompressor/modifiers/autoround/base.py | 32 ++- 5 files changed, 484 insertions(+), 9 deletions(-) create mode 100644 examples/autoround/ddp/DDP_FIXES.md create mode 100644 examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md create mode 100644 examples/autoround/ddp/reproduce.md diff --git a/examples/autoround/ddp/DDP_FIXES.md b/examples/autoround/ddp/DDP_FIXES.md new file mode 100644 index 0000000000..50e9e7352d --- /dev/null +++ b/examples/autoround/ddp/DDP_FIXES.md @@ -0,0 +1,237 @@ +# DDP Multi-GPU AutoRound Fixes for Large MoE Models + +## Problem + +Running AutoRound quantization with DDP on large MoE models (e.g., Qwen3-235B) would hang or take hours due to `DistributedCPUCache` performing a `dist.broadcast_object_list()` + `dist.barrier()` **per parameter** during offload operations (~218ms × 45K params = ~163 minutes). + +## Root Cause + +When `dist.is_initialized()`, `OffloadCache.cls_from_device("cpu")` returns `DistributedCPUCache` instead of `CPUCache`. This cache broadcasts every tensor to all ranks — unnecessary when each rank loads the model independently via safetensors mmap. + +The bottleneck hits in two places: +1. `from_accelerate()` → `dispatch_with_map()` +2. `set_onload_device()` in SequentialPipeline + +## Fixes Applied + +### Fix 1: `src/llmcompressor/utils/dev.py` — `get_main_device()` + +**Bug**: Used `rank` as the CUDA device index, which is wrong when `GPUS_PER_GROUP > 1`. +**Fix**: Use `torch.accelerator.current_device_index()` which respects `torch.cuda.set_device()`. + +```python +# Before (line 140): +return torch.device(accel_type, rank) + +# After: +return torch.device(accel_type, torch.accelerator.current_device_index()) +``` + +### Fix 2: `src/llmcompressor/modifiers/autoround/base.py` — anchor device in `apply_autoround` + +**Bug**: Hardcoded `device = torch.device("cuda:0")` when `needs_multi_gpu` is true. Rank 1 with GPUs [2,3] would try to anchor on cuda:0 instead of cuda:2. +**Fix**: Use `get_main_device()` which returns the correct per-rank device. + +```python +# Before (line ~329): +device = torch.device("cuda:0") + +# After: +from llmcompressor.utils.dev import get_main_device +device = get_main_device() +``` + +### Fix 3: `src/llmcompressor/modifiers/autoround/base.py` — GPU partition in `_update_device_map_for_dp` + +**Bug**: Generated `"0,1"` for all ranks instead of per-rank GPU partitions. +**Fix**: Offset by `local_rank * gpus_per_group`. + +```python +# Before: +ar_kwargs["device_map"] = ",".join(str(i) for i in range(gpus_per_group)) + +# After: +local_rank = torch.distributed.get_rank() +start_gpu = local_rank * gpus_per_group +ar_kwargs["device_map"] = ",".join(str(start_gpu + i) for i in range(gpus_per_group)) +``` + +### Patch 4 (monkey-patch, needs upstream in compressed-tensors): Force local cache + +Patches `OffloadCache.cls_from_device` to return `CPUCache`/`DeviceCache` instead of `DistributedCPUCache`/`DistributedDeviceCache`. This is correct when each rank loads the model independently. + +See `patch_force_local_cache()` in `test_option3_fixed.py`. + +### Patch 5 (monkey-patch, needs upstream in compressed-tensors): Disable onloading during quant init + +Wraps `initialize_module_for_quantization` with `disable_onloading()` to avoid per-parameter broadcast+barrier when new quantization parameters are created. + +See `patch_disable_onloading_for_quant_init()` in `test_option3_fixed.py`. + +## Reproduce + +### Prerequisites + +```bash +# Environment +source /home/yiliu7/workspace/venvs/llmc/bin/activate + +# Working directory +cd /home/yiliu7/workspace/llm-compressor +``` + +### Run on Qwen3-8B (quick verification, ~2 minutes) + +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ + --nproc_per_node=2 \ + examples/autoround/ddp/ddp_autoround.py \ + --model /storage/yiliu7/Qwen/Qwen3-8B \ + --iters 5 --nsamples 32 +``` + +### Run on Qwen3-235B (full test, ~47 minutes) + +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ + --nproc_per_node=2 \ + examples/autoround/ddp/ddp_autoround.py \ + --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ \ + --iters 20 --nsamples 32 +``` + +### Expected behavior + +- Both ranks process all 94 layers in lockstep (~30s/layer on 235B) +- All 4 GPUs show active memory usage (~56-63 GB each) +- Each rank uses 2 GPUs: rank 0 → [0,1], rank 1 → [2,3] +- Small NCCL idle contexts (~614 MB) appear on non-owned GPUs — this is normal + +### Monitor progress + +```bash +# GPU utilization +nvidia-smi --query-gpu=index,utilization.gpu,memory.used --format=csv,noheader + +# Layer progress (from log) +grep "Applying AutoRound" /path/to/log | tail -6 +``` + +## Known Issues + +1. **8 GPU process entries in nvidia-smi**: Each of the 2 torchrun processes creates a small NCCL context (~614 MB) on all visible GPUs. Only 4 entries are doing real work (the ~56-63 GB ones). This is unavoidable without a pre-launch wrapper that restricts `CUDA_VISIBLE_DEVICES` before Python starts. + +2. **OOM on layer ~11 (235B)**: With 20 iters and the full 235B model, GPU memory may be tight. Reduce `--iters` or `--nsamples` if OOM occurs. + +## Upstream Plan + +### PR 1: llm-compressor — Multi-GPU DDP device fixes + +**Scope**: Fixes 1–3 above. Clean code changes, no monkey-patches. + +**Changes**: +- `src/llmcompressor/utils/dev.py`: `get_main_device()` uses `current_device_index()` instead of `rank` +- `src/llmcompressor/modifiers/autoround/base.py`: + - `apply_autoround` anchor device uses `get_main_device()` instead of hardcoded `cuda:0` + - `_update_device_map_for_dp` offsets GPU indices by `local_rank * gpus_per_group` + +**Testing**: Run DDP AutoRound on Qwen3-8B with 4 GPUs (2 per rank). Verify all GPUs participate and no device mismatch errors. + +--- + +### PR 2: compressed-tensors — Skip distributed cache when ranks have local parameters + +**Problem**: `OffloadCache.cls_from_device("cpu")` unconditionally returns `DistributedCPUCache` when `dist.is_initialized()`. This causes O(n_params) broadcast+barrier ops (~218ms each) even when all ranks already have parameters locally (via independent `from_pretrained` loading with safetensors mmap). + +**Proposed fix**: Add a `distributed` parameter to `cls_from_device` with auto-detection: + +```python +# compressed_tensors/offload/cache/base.py + +@classmethod +def cls_from_device(cls, device=None, distributed=None): + """ + Args: + distributed: If None (default), auto-detect based on whether + dist is initialized. If False, always return local cache. + If True, always return distributed cache. + """ + if distributed is None: + distributed = ( + torch.distributed.is_initialized() + and torch.distributed.get_world_size() > 1 + ) + + device_type = torch.device(device).type if device != "disk" else "disk" + if device_type == "cpu": + return DistributedCPUCache if distributed else CPUCache + elif is_accelerator_type(device_type): + return DistributedDeviceCache if distributed else DeviceCache + elif device_type == "disk": + return DiskCache + ... +``` + +**Callers that should pass `distributed=False`**: +- `set_onload_device()` when the model was loaded independently on each rank (no meta tensors) +- Any path where the caller knows parameters are already materialized locally + +**Alternative approach** — context manager: + +```python +# compressed_tensors/offload/cache/base.py + +_force_local_cache = threading.local() + +@contextlib.contextmanager +def force_local_cache(): + """Context under which cls_from_device always returns non-distributed caches.""" + _force_local_cache.active = True + try: + yield + finally: + _force_local_cache.active = False + +@classmethod +def cls_from_device(cls, device=None): + distributed = ( + torch.distributed.is_initialized() + and torch.distributed.get_world_size() > 1 + and not getattr(_force_local_cache, 'active', False) + ) + ... +``` + +This lets llm-compressor wrap its pipeline with `force_local_cache()` without modifying every callsite. + +**Testing**: +- Existing tests pass (distributed cache still used by default) +- DDP test with independent model loading uses local cache, no broadcast overhead + +--- + +### PR 3: compressed-tensors — Wrap quant init with `disable_onloading()` + +**Problem**: `initialize_module_for_quantization` creates new parameters (scale, zero_point, etc.) which immediately trigger `DistributedCPUCache.offload()` → broadcast+barrier. These parameters are created identically on every rank, so broadcasting is always redundant. + +**Proposed fix**: Wrap the function body with `disable_onloading()`: + +```python +# compressed_tensors/quantization/lifecycle/initialize.py + +def initialize_module_for_quantization(module, scheme=None, force_zero_point=True): + with disable_onloading(): + # ... existing implementation ... +``` + +**Rationale**: New quant parameters are initialized from the quantization scheme (not from model weights), so they're identical across ranks by construction. There's no information to broadcast. + +**Testing**: DDP quantization should show no broadcast calls during `initialize_module_for_quantization`. Single-process behavior unchanged. + +--- + +### Priority + +1. **PR 3** (highest): Universal fix, always correct, simple one-liner +2. **PR 2** (high): Eliminates the main bottleneck for independent-loading DDP +3. **PR 1** (medium): Required for multi-GPU-per-rank scenarios (GPUS_PER_GROUP > 1) diff --git a/examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md b/examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md new file mode 100644 index 0000000000..40d0e9ebf9 --- /dev/null +++ b/examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md @@ -0,0 +1,113 @@ +# AutoRound DDP Hang: Root Cause Analysis + +## Symptom + +AutoRound quantization hangs during `on_initialize` → `initialize_quantization` when +using `GPUS_PER_GROUP=2` (4 GPUs, 2 ranks). The same setup with `GPUS_PER_GROUP=1` +(2 GPUs, 2 ranks) completes in ~46 seconds. GPTQ does not exhibit the hang because +its examples default to `GPUS_PER_GROUP=1`. + +## Root Cause: Broadcast Deadlock in `DistributedCPUCache.offload()` + +### The call chain + +``` +initialize_quantization() + → apply_quantization_config() + → initialize_module_for_quantization() # per matched Linear module + → initialize_qparams() + → torch.empty(shape, device=get_execution_device(module)) + → module.register_parameter(name, param) # triggers: + → OffloadCache.__setitem__() + → DistributedCPUCache.offload() + → tensor.to("cpu") # ⚠️ GPU→CPU copy + → share_memory_() + → broadcast_object_list() # ⚠️ paired broadcast + → barrier() # ⚠️ deadlock point +``` + +### Why it deadlocks with GPUS_PER_GROUP=2 + +With 4 GPUs visible (`CUDA_VISIBLE_DEVICES=0,1,2,3`), `device_map="auto_offload"` +assigns different modules to different GPUs. `get_execution_device(module)` returns +varying devices (`cuda:0`, `cuda:1`, `cuda:2`, `cuda:3`). `initialize_qparams` +creates tensors on those devices. + +The `DistributedCPUCache.offload()` call chain first does a GPU→CPU copy of the +tensor. With tensors on different GPUs under different load conditions, the copy +timing varies per module. The two ranks drift out of lockstep: + +- Rank 0: finishes GPU→CPU copy for module N, enters `broadcast_object_list` +- Rank 1: still doing GPU→CPU copy for module N (different GPU, different load) + +`broadcast_object_list` is a paired operation — both ranks must enter it in the +same order. When timing varies, rank 0 enters broadcast N while rank 1 is still +at broadcast N-1 → **deadlock at barrier**. + +The broadcasts themselves are CPU-side and fast. The GPU→CPU copy *before* each +broadcast is what desynchronizes the ranks. + +### Why it works with GPUS_PER_GROUP=1 + +With only 2 GPUs visible (`CUDA_VISIBLE_DEVICES=1,3`), `device_map="auto_offload"` +sees limited aggregate GPU memory and assigns execution to CPU +(`onload_device=cpu`). `get_execution_device` returns `cpu` for all modules. +`initialize_qparams` creates params on CPU. `offload()` does a CPU→CPU copy — +uniform timing. The broadcasts stay paired, no deadlock. + +### Why GPTQ doesn't hit this + +GPTQ examples use `GPUS_PER_GROUP=1` (default). If GPTQ were run with +`GPUS_PER_GROUP=2`, it would hit the same deadlock. The hang is not specific to +AutoRound — it's a property of `DistributedCPUCache` + multi-GPU execution +devices + `initialize_quantization`. + +## The Fix: `disable_onloading()` in `on_initialize` + +### Mechanism + +`OffloadCache` has a class-level flag `onloading_disabled`. When set: + +- **`__getitem__`**: returns the offloaded (CPU) tensor directly — no CPU→GPU onload +- **`__setitem__`**: stores the value directly in `offloaded_values` — no `offload()`, + no GPU→CPU copy, no `broadcast_object_list`, no `barrier` + +This is a CT-provided escape hatch. It's already used *inside* +`initialize_module_for_quantization` (line 77 of `initialize.py`) to access +`module.weight` without triggering the distributed path. + +### Implementation + +```python +# llmcompressor/modifiers/autoround/base.py — on_initialize() +if QuantizationMixin.has_config(self): + from compressed_tensors.offload import disable_onloading + with disable_onloading(): + QuantizationMixin.initialize_quantization(self, state.model) +``` + +### Why this is safe + +1. **Quant params are deterministic.** Both ranks compute identical scale/zero_point + values from the same quantization scheme. No broadcast is needed — each rank + produces the same data independently. + +2. **Params stay on GPU, which is correct.** Calibration runs next — the params need + to be on GPU for forward/backward. When the block is later offloaded to CPU, the + params follow the normal offload path. + +3. **Precedent exists.** `initialize_module_for_quantization` already uses + `disable_onloading()` for exactly this purpose — accessing `module.weight` without + triggering the onload path. + +4. **Scoped and temporary.** The context manager restores normal behavior after + `initialize_quantization` completes. All subsequent operations use the standard + onload/offload path. + +### Why not `force_local_cache` + +`force_local_cache` only affects `cls_from_device` (new cache *creation*). During +`initialize_quantization`, the `DistributedCPUCache` instances already exist on +modules — params are added to existing caches via `__setitem__`. `force_local_cache` +has no effect on this path. The CT maintainer also rejected this approach because +it changes global cache creation semantics, which could affect model weight loading. diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py index 04d28ad739..58d0794dbf 100644 --- a/examples/autoround/ddp/ddp_qwen3_moe_example.py +++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py @@ -22,8 +22,8 @@ MODEL = "/storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507" SCHEME = "W4A16" -ITERS = 200 -NSAMPLES = 256 +ITERS = 1 +NSAMPLES = 4 ###### DDP INIT ##### gpus_per_group = int(os.environ.get("GPUS_PER_GROUP", "1")) @@ -87,10 +87,6 @@ dist.barrier() ###### SAVE (rank 0 only) ##### -if dist.is_initialized(): - dist.barrier() - dist.destroy_process_group() - if rank == 0: save_dir = ( MODEL.rstrip("/").split("/")[-1] @@ -115,4 +111,8 @@ logger.info(tokenizer.decode(output[0])) logger.info("==========================================") +if dist.is_initialized(): + dist.barrier() + dist.destroy_process_group() + logger.info(f"[Rank {rank}] SUCCESS") diff --git a/examples/autoround/ddp/reproduce.md b/examples/autoround/ddp/reproduce.md new file mode 100644 index 0000000000..099d45f523 --- /dev/null +++ b/examples/autoround/ddp/reproduce.md @@ -0,0 +1,99 @@ +# Multi-GPU DDP AutoRound Reproduce + +## torchrun (recommended) + +### 8B + +```bash +cd /home/yiliu7/workspace/llm-compressor + +bash examples/autoround/ddp/launch_torchrun.sh \ + --model /storage/yiliu7/Qwen/Qwen3-8B \ + --scheme W4A16 \ + --nsamples 32 --iters 50 \ + --disable_torch_compile +``` + +### 235B + +```bash +cd /home/yiliu7/workspace/llm-compressor + +AR_DISABLE_DATASET_SUBPROCESS=1 GPUS_PER_GROUP=2 CUDA_VISIBLE_DEVICES=0,1,2,3 \ +/home/yiliu7/workspace/venvs/llmc/bin/torchrun --nproc_per_node=2 --master_port=29500 \ +examples/autoround/ddp/ddp_qwen3_multi_gpu_torchrun.py \ +--model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507 \ +--scheme W4A16 --nsamples 32 --iters 50 --disable_torch_compile +``` + +## bash wrapper (dedicated GPU isolation) + +```bash +cd /home/yiliu7/workspace/llm-compressor + +AR_DISABLE_DATASET_SUBPROCESS=1 CUDA_VISIBLE_DEVICES=0,1,6,7 GPUS_PER_GROUP=2 NPROC=2 MASTER_PORT=29501 \ + bash examples/autoround/ddp/launch_multi_gpu.sh \ + ddp_qwen3_multi_gpu_example.py \ + --model /storage/yiliu7/Qwen/Qwen3-8B \ + --scheme W4A16 \ + --nsamples 32 --iters 50 \ + --disable_torch_compile \ + > /tmp/multi_gpu_test.log 2>&1 & +``` + +## Monitor + +```bash +tail -f /tmp/multi_gpu_test.log +ps aux | grep ddp_qwen3_multi | grep -v grep +nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader +pkill -f ddp_qwen3_multi_gpu +``` + +## Verified + +### 8B (2026-06-18) +``` +quantized 7/7 layers in the block, loss iter 0: 19.067873 -> iter 0: 19.067873 +[Rank 0] Quantization completed +Hello my name is Mandy I am 20 years old... +``` +All 37 decoder layers quantized, identical loss across ranks, sample generation works. + +### 235B (2026-06-19) +``` +quantized 388/389 layers in the block, loss iter 0: 0.211156 -> iter 0: 0.211156 +... +[Rank 0] Quantization completed +``` +All 94 decoder layers quantized (388 Linear per MoE block), identical loss across ranks. ~25 min for 1 iter. + +## Key Files + +| File | Change | +|------|--------| +| `examples/autoround/ddp/ddp_qwen3_multi_gpu_torchrun.py` | torchrun example with patches | +| `examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py` | bash wrapper example | +| `examples/autoround/ddp/fast_pipeline.py` | Replaces `SequentialPipeline.__call__` — no FX trace | +| `examples/autoround/ddp/launch_torchrun.sh` | torchrun launcher | +| `examples/autoround/ddp/launch_multi_gpu.sh` | bash wrapper (GPU partitioning) | +| `src/llmcompressor/modifiers/autoround/base.py` | `_get_local_gpu_group_size()` reads `GPUS_PER_GROUP` | +| `src/llmcompressor/pipelines/sequential/helpers.py` | Removed `disable_onloading()` from `trace_subgraphs` | +| `ar-py/auto_round/utils/distributed.py` | `setup_ddp_if_needed_` returns `(block, sync_fn)`; `current_device()` for NCCL | +| `ar-py/auto_round/algorithms/quantization/sign_round/quantizer.py` | Captures return, calls `sync_gradients()` before `_step()` | + +## Required env vars + +| Var | Value | Why | +|-----|-------|-----| +| `GPUS_PER_GROUP` | `2` | Triggers multi-GPU block dispatch + manual all_reduce sync | +| `AR_DISABLE_DATASET_SUBPROCESS` | `1` | Avoids `fork()` with CUDA context | +| `--disable_torch_compile` | flag | torch.compile can't handle cross-device tensors | + +## Known issue: FX trace bottleneck + +`trace_subgraphs` runs an FX trace on the full model — for 61K-module models (235B) it never finishes. The `fast_pipeline.py` module bypasses this by creating subgraphs directly from decoder layer names. This affects ALL models using `SequentialPipeline`, not just DDP. The AWQ example (`qwen3_moe_example_ddp.py`) with 30B MoE also hangs. + +## Venv + +Python: `/home/yiliu7/workspace/venvs/llmc/bin/python` diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index d2a744309a..e62cd62494 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -173,9 +173,18 @@ def on_initialize(self, state: State, **kwargs) -> bool: :param state: session state storing input model and calibration data """ - # apply config to model and prepare calibration hooks + # apply config to model and prepare calibration hooks. + # Wrap in disable_onloading to suppress DistributedCPUCache's + # per-param broadcast+barrier when creating quant params (scale, + # zero_point). With GPUS_PER_GROUP > 1, modules have varying GPU + # execution devices, causing GPU→CPU copy timing to vary between + # ranks → broadcast deadlock. Quant params are deterministic — + # each rank computes identical values, no sync needed. if QuantizationMixin.has_config(self): - QuantizationMixin.initialize_quantization(self, state.model) + from compressed_tensors.offload import disable_onloading + + with disable_onloading(): + QuantizationMixin.initialize_quantization(self, state.model) # prepare module names self._add_temporary_names(state.model) @@ -310,7 +319,9 @@ def apply_autoround(self, state, modules): # across multiple GPUs within the rank. auto_offload = False needs_multi_gpu = ( - self.device_ids is not None or _get_local_gpu_group_size() > 1 + self.device_ids is not None + or _get_local_gpu_group_size() > 1 + or torch.cuda.device_count() > 1 ) if needs_multi_gpu: # Let AutoRound own placement within the rank-local GPU group. @@ -323,6 +334,21 @@ def apply_autoround(self, state, modules): # will be re-dispatched by AutoRound. decoding_layer.to("cpu") auto_offload = True + # Move cached inputs to the anchor device — they may have + # been captured on different GPUs during calibration. + cur_inputs = [ + ( + tuple( + x.to(device) if isinstance(x, torch.Tensor) else x + for x in args + ), + { + k: v.to(device) if isinstance(v, torch.Tensor) else v + for k, v in kwargs.items() + }, + ) + for args, kwargs in cur_inputs + ] q_input, _ = ar.quantize_block( block=decoding_layer, From 6cf652b7afc363755b8fbbf8329ac818908f33be Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 25 Jun 2026 01:06:30 +0000 Subject: [PATCH 19/22] refactor: extract _move_inputs_to helper, fix save/sample gen deadlock - Extract _move_inputs_to static method for cleaner input device alignment - Move input movement out of if-needs_multi_gpu branch (always correct) - Both ranks participate in save_pretrained (uses broadcast_object_list) - Sample generation moved after destroy_process_group - Restore 235B model path --- .../autoround/ddp/ddp_qwen3_moe_example.py | 33 +++++++------- src/llmcompressor/modifiers/autoround/base.py | 43 ++++++++++--------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py index 58d0794dbf..b89ed6ccd0 100644 --- a/examples/autoround/ddp/ddp_qwen3_moe_example.py +++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py @@ -83,21 +83,24 @@ ) logger.info(f"[Rank {rank}] Quantization done in {time.perf_counter() - quant_start:.1f}s") -if dist.is_initialized(): - dist.barrier() - -###### SAVE (rank 0 only) ##### +###### SAVE ##### +# Both ranks must participate — save_pretrained internally calls +# collectives (broadcast_object_list). Only rank 0 writes to disk. +save_dir = ( + MODEL.rstrip("/").split("/")[-1] + + f"-{SCHEME}-AutoRound" + + f"-iters{ITERS}-nsamples{NSAMPLES}" + + f"-DDP{world_size}" +) +logger.info(f"[Rank {rank}] Saving to {save_dir}...") +model.save_pretrained(save_dir, save_compressed=True) if rank == 0: - save_dir = ( - MODEL.rstrip("/").split("/")[-1] - + f"-{SCHEME}-AutoRound" - + f"-iters{ITERS}-nsamples{NSAMPLES}" - + f"-DDP{world_size}" - ) - logger.info(f"Saving to {save_dir}...") - model.save_pretrained(save_dir, save_compressed=True) tokenizer.save_pretrained(save_dir) - logger.info(f"Saved to {save_dir}") +logger.info(f"[Rank {rank}] Saved to {save_dir}") + +if dist.is_initialized(): + dist.barrier() + dist.destroy_process_group() ###### SAMPLE GENERATION (rank 0 only) ##### if rank == 0: @@ -111,8 +114,4 @@ logger.info(tokenizer.decode(output[0])) logger.info("==========================================") -if dist.is_initialized(): - dist.barrier() - dist.destroy_process_group() - logger.info(f"[Rank {rank}] SUCCESS") diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index e62cd62494..2b6f297523 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -325,30 +325,13 @@ def apply_autoround(self, state, modules): ) if needs_multi_gpu: # Let AutoRound own placement within the rank-local GPU group. - # The incoming block may already be split across local devices, - # so anchoring to first_param.device can place residual modules - # (e.g. norms) on local cuda:1 while hidden states begin on - # local cuda:0, causing cross-device forward failures. device = get_main_device() - # Move decoding layer to CPU first, then the submodules - # will be re-dispatched by AutoRound. decoding_layer.to("cpu") auto_offload = True - # Move cached inputs to the anchor device — they may have - # been captured on different GPUs during calibration. - cur_inputs = [ - ( - tuple( - x.to(device) if isinstance(x, torch.Tensor) else x - for x in args - ), - { - k: v.to(device) if isinstance(v, torch.Tensor) else v - for k, v in kwargs.items() - }, - ) - for args, kwargs in cur_inputs - ] + + # Ensure cached inputs are on the same device as the block. + # Calibration forward may have run on a different GPU. + cur_inputs = self._move_inputs_to(cur_inputs, device) q_input, _ = ar.quantize_block( block=decoding_layer, @@ -440,6 +423,24 @@ def _remove_temporary_names(self, model: torch.nn.Module): if hasattr(mod, "_tmp_name"): del mod._tmp_name + @staticmethod + def _move_inputs_to( + inputs: list[tuple[tuple, dict]], device: torch.device + ) -> list[tuple[tuple, dict]]: + """Move all tensors in cached forward inputs to *device*.""" + return [ + ( + tuple( + x.to(device) if isinstance(x, torch.Tensor) else x for x in args + ), + { + k: v.to(device) if isinstance(v, torch.Tensor) else v + for k, v in kwargs.items() + }, + ) + for args, kwargs in inputs + ] + def _is_decoding_layer(self, module: torch.nn.Module) -> bool: return module.__class__.__name__ in self._sequential_targets From 56247c8da8ab2831e6a8583b9be3d3bbb15cad8c Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 25 Jun 2026 06:46:03 +0000 Subject: [PATCH 20/22] fix Signed-off-by: yiliu30 --- examples/autoround/ddp/DDP_FIXES.md | 237 ------------------ examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md | 113 --------- examples/autoround/ddp/reproduce.md | 99 -------- 3 files changed, 449 deletions(-) delete mode 100644 examples/autoround/ddp/DDP_FIXES.md delete mode 100644 examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md delete mode 100644 examples/autoround/ddp/reproduce.md diff --git a/examples/autoround/ddp/DDP_FIXES.md b/examples/autoround/ddp/DDP_FIXES.md deleted file mode 100644 index 50e9e7352d..0000000000 --- a/examples/autoround/ddp/DDP_FIXES.md +++ /dev/null @@ -1,237 +0,0 @@ -# DDP Multi-GPU AutoRound Fixes for Large MoE Models - -## Problem - -Running AutoRound quantization with DDP on large MoE models (e.g., Qwen3-235B) would hang or take hours due to `DistributedCPUCache` performing a `dist.broadcast_object_list()` + `dist.barrier()` **per parameter** during offload operations (~218ms × 45K params = ~163 minutes). - -## Root Cause - -When `dist.is_initialized()`, `OffloadCache.cls_from_device("cpu")` returns `DistributedCPUCache` instead of `CPUCache`. This cache broadcasts every tensor to all ranks — unnecessary when each rank loads the model independently via safetensors mmap. - -The bottleneck hits in two places: -1. `from_accelerate()` → `dispatch_with_map()` -2. `set_onload_device()` in SequentialPipeline - -## Fixes Applied - -### Fix 1: `src/llmcompressor/utils/dev.py` — `get_main_device()` - -**Bug**: Used `rank` as the CUDA device index, which is wrong when `GPUS_PER_GROUP > 1`. -**Fix**: Use `torch.accelerator.current_device_index()` which respects `torch.cuda.set_device()`. - -```python -# Before (line 140): -return torch.device(accel_type, rank) - -# After: -return torch.device(accel_type, torch.accelerator.current_device_index()) -``` - -### Fix 2: `src/llmcompressor/modifiers/autoround/base.py` — anchor device in `apply_autoround` - -**Bug**: Hardcoded `device = torch.device("cuda:0")` when `needs_multi_gpu` is true. Rank 1 with GPUs [2,3] would try to anchor on cuda:0 instead of cuda:2. -**Fix**: Use `get_main_device()` which returns the correct per-rank device. - -```python -# Before (line ~329): -device = torch.device("cuda:0") - -# After: -from llmcompressor.utils.dev import get_main_device -device = get_main_device() -``` - -### Fix 3: `src/llmcompressor/modifiers/autoround/base.py` — GPU partition in `_update_device_map_for_dp` - -**Bug**: Generated `"0,1"` for all ranks instead of per-rank GPU partitions. -**Fix**: Offset by `local_rank * gpus_per_group`. - -```python -# Before: -ar_kwargs["device_map"] = ",".join(str(i) for i in range(gpus_per_group)) - -# After: -local_rank = torch.distributed.get_rank() -start_gpu = local_rank * gpus_per_group -ar_kwargs["device_map"] = ",".join(str(start_gpu + i) for i in range(gpus_per_group)) -``` - -### Patch 4 (monkey-patch, needs upstream in compressed-tensors): Force local cache - -Patches `OffloadCache.cls_from_device` to return `CPUCache`/`DeviceCache` instead of `DistributedCPUCache`/`DistributedDeviceCache`. This is correct when each rank loads the model independently. - -See `patch_force_local_cache()` in `test_option3_fixed.py`. - -### Patch 5 (monkey-patch, needs upstream in compressed-tensors): Disable onloading during quant init - -Wraps `initialize_module_for_quantization` with `disable_onloading()` to avoid per-parameter broadcast+barrier when new quantization parameters are created. - -See `patch_disable_onloading_for_quant_init()` in `test_option3_fixed.py`. - -## Reproduce - -### Prerequisites - -```bash -# Environment -source /home/yiliu7/workspace/venvs/llmc/bin/activate - -# Working directory -cd /home/yiliu7/workspace/llm-compressor -``` - -### Run on Qwen3-8B (quick verification, ~2 minutes) - -```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ - --nproc_per_node=2 \ - examples/autoround/ddp/ddp_autoround.py \ - --model /storage/yiliu7/Qwen/Qwen3-8B \ - --iters 5 --nsamples 32 -``` - -### Run on Qwen3-235B (full test, ~47 minutes) - -```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS_PER_GROUP=2 torchrun \ - --nproc_per_node=2 \ - examples/autoround/ddp/ddp_autoround.py \ - --model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507/ \ - --iters 20 --nsamples 32 -``` - -### Expected behavior - -- Both ranks process all 94 layers in lockstep (~30s/layer on 235B) -- All 4 GPUs show active memory usage (~56-63 GB each) -- Each rank uses 2 GPUs: rank 0 → [0,1], rank 1 → [2,3] -- Small NCCL idle contexts (~614 MB) appear on non-owned GPUs — this is normal - -### Monitor progress - -```bash -# GPU utilization -nvidia-smi --query-gpu=index,utilization.gpu,memory.used --format=csv,noheader - -# Layer progress (from log) -grep "Applying AutoRound" /path/to/log | tail -6 -``` - -## Known Issues - -1. **8 GPU process entries in nvidia-smi**: Each of the 2 torchrun processes creates a small NCCL context (~614 MB) on all visible GPUs. Only 4 entries are doing real work (the ~56-63 GB ones). This is unavoidable without a pre-launch wrapper that restricts `CUDA_VISIBLE_DEVICES` before Python starts. - -2. **OOM on layer ~11 (235B)**: With 20 iters and the full 235B model, GPU memory may be tight. Reduce `--iters` or `--nsamples` if OOM occurs. - -## Upstream Plan - -### PR 1: llm-compressor — Multi-GPU DDP device fixes - -**Scope**: Fixes 1–3 above. Clean code changes, no monkey-patches. - -**Changes**: -- `src/llmcompressor/utils/dev.py`: `get_main_device()` uses `current_device_index()` instead of `rank` -- `src/llmcompressor/modifiers/autoround/base.py`: - - `apply_autoround` anchor device uses `get_main_device()` instead of hardcoded `cuda:0` - - `_update_device_map_for_dp` offsets GPU indices by `local_rank * gpus_per_group` - -**Testing**: Run DDP AutoRound on Qwen3-8B with 4 GPUs (2 per rank). Verify all GPUs participate and no device mismatch errors. - ---- - -### PR 2: compressed-tensors — Skip distributed cache when ranks have local parameters - -**Problem**: `OffloadCache.cls_from_device("cpu")` unconditionally returns `DistributedCPUCache` when `dist.is_initialized()`. This causes O(n_params) broadcast+barrier ops (~218ms each) even when all ranks already have parameters locally (via independent `from_pretrained` loading with safetensors mmap). - -**Proposed fix**: Add a `distributed` parameter to `cls_from_device` with auto-detection: - -```python -# compressed_tensors/offload/cache/base.py - -@classmethod -def cls_from_device(cls, device=None, distributed=None): - """ - Args: - distributed: If None (default), auto-detect based on whether - dist is initialized. If False, always return local cache. - If True, always return distributed cache. - """ - if distributed is None: - distributed = ( - torch.distributed.is_initialized() - and torch.distributed.get_world_size() > 1 - ) - - device_type = torch.device(device).type if device != "disk" else "disk" - if device_type == "cpu": - return DistributedCPUCache if distributed else CPUCache - elif is_accelerator_type(device_type): - return DistributedDeviceCache if distributed else DeviceCache - elif device_type == "disk": - return DiskCache - ... -``` - -**Callers that should pass `distributed=False`**: -- `set_onload_device()` when the model was loaded independently on each rank (no meta tensors) -- Any path where the caller knows parameters are already materialized locally - -**Alternative approach** — context manager: - -```python -# compressed_tensors/offload/cache/base.py - -_force_local_cache = threading.local() - -@contextlib.contextmanager -def force_local_cache(): - """Context under which cls_from_device always returns non-distributed caches.""" - _force_local_cache.active = True - try: - yield - finally: - _force_local_cache.active = False - -@classmethod -def cls_from_device(cls, device=None): - distributed = ( - torch.distributed.is_initialized() - and torch.distributed.get_world_size() > 1 - and not getattr(_force_local_cache, 'active', False) - ) - ... -``` - -This lets llm-compressor wrap its pipeline with `force_local_cache()` without modifying every callsite. - -**Testing**: -- Existing tests pass (distributed cache still used by default) -- DDP test with independent model loading uses local cache, no broadcast overhead - ---- - -### PR 3: compressed-tensors — Wrap quant init with `disable_onloading()` - -**Problem**: `initialize_module_for_quantization` creates new parameters (scale, zero_point, etc.) which immediately trigger `DistributedCPUCache.offload()` → broadcast+barrier. These parameters are created identically on every rank, so broadcasting is always redundant. - -**Proposed fix**: Wrap the function body with `disable_onloading()`: - -```python -# compressed_tensors/quantization/lifecycle/initialize.py - -def initialize_module_for_quantization(module, scheme=None, force_zero_point=True): - with disable_onloading(): - # ... existing implementation ... -``` - -**Rationale**: New quant parameters are initialized from the quantization scheme (not from model weights), so they're identical across ranks by construction. There's no information to broadcast. - -**Testing**: DDP quantization should show no broadcast calls during `initialize_module_for_quantization`. Single-process behavior unchanged. - ---- - -### Priority - -1. **PR 3** (highest): Universal fix, always correct, simple one-liner -2. **PR 2** (high): Eliminates the main bottleneck for independent-loading DDP -3. **PR 1** (medium): Required for multi-GPU-per-rank scenarios (GPUS_PER_GROUP > 1) diff --git a/examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md b/examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md deleted file mode 100644 index 40d0e9ebf9..0000000000 --- a/examples/autoround/ddp/DDP_HANG_ROOT_CAUSE.md +++ /dev/null @@ -1,113 +0,0 @@ -# AutoRound DDP Hang: Root Cause Analysis - -## Symptom - -AutoRound quantization hangs during `on_initialize` → `initialize_quantization` when -using `GPUS_PER_GROUP=2` (4 GPUs, 2 ranks). The same setup with `GPUS_PER_GROUP=1` -(2 GPUs, 2 ranks) completes in ~46 seconds. GPTQ does not exhibit the hang because -its examples default to `GPUS_PER_GROUP=1`. - -## Root Cause: Broadcast Deadlock in `DistributedCPUCache.offload()` - -### The call chain - -``` -initialize_quantization() - → apply_quantization_config() - → initialize_module_for_quantization() # per matched Linear module - → initialize_qparams() - → torch.empty(shape, device=get_execution_device(module)) - → module.register_parameter(name, param) # triggers: - → OffloadCache.__setitem__() - → DistributedCPUCache.offload() - → tensor.to("cpu") # ⚠️ GPU→CPU copy - → share_memory_() - → broadcast_object_list() # ⚠️ paired broadcast - → barrier() # ⚠️ deadlock point -``` - -### Why it deadlocks with GPUS_PER_GROUP=2 - -With 4 GPUs visible (`CUDA_VISIBLE_DEVICES=0,1,2,3`), `device_map="auto_offload"` -assigns different modules to different GPUs. `get_execution_device(module)` returns -varying devices (`cuda:0`, `cuda:1`, `cuda:2`, `cuda:3`). `initialize_qparams` -creates tensors on those devices. - -The `DistributedCPUCache.offload()` call chain first does a GPU→CPU copy of the -tensor. With tensors on different GPUs under different load conditions, the copy -timing varies per module. The two ranks drift out of lockstep: - -- Rank 0: finishes GPU→CPU copy for module N, enters `broadcast_object_list` -- Rank 1: still doing GPU→CPU copy for module N (different GPU, different load) - -`broadcast_object_list` is a paired operation — both ranks must enter it in the -same order. When timing varies, rank 0 enters broadcast N while rank 1 is still -at broadcast N-1 → **deadlock at barrier**. - -The broadcasts themselves are CPU-side and fast. The GPU→CPU copy *before* each -broadcast is what desynchronizes the ranks. - -### Why it works with GPUS_PER_GROUP=1 - -With only 2 GPUs visible (`CUDA_VISIBLE_DEVICES=1,3`), `device_map="auto_offload"` -sees limited aggregate GPU memory and assigns execution to CPU -(`onload_device=cpu`). `get_execution_device` returns `cpu` for all modules. -`initialize_qparams` creates params on CPU. `offload()` does a CPU→CPU copy — -uniform timing. The broadcasts stay paired, no deadlock. - -### Why GPTQ doesn't hit this - -GPTQ examples use `GPUS_PER_GROUP=1` (default). If GPTQ were run with -`GPUS_PER_GROUP=2`, it would hit the same deadlock. The hang is not specific to -AutoRound — it's a property of `DistributedCPUCache` + multi-GPU execution -devices + `initialize_quantization`. - -## The Fix: `disable_onloading()` in `on_initialize` - -### Mechanism - -`OffloadCache` has a class-level flag `onloading_disabled`. When set: - -- **`__getitem__`**: returns the offloaded (CPU) tensor directly — no CPU→GPU onload -- **`__setitem__`**: stores the value directly in `offloaded_values` — no `offload()`, - no GPU→CPU copy, no `broadcast_object_list`, no `barrier` - -This is a CT-provided escape hatch. It's already used *inside* -`initialize_module_for_quantization` (line 77 of `initialize.py`) to access -`module.weight` without triggering the distributed path. - -### Implementation - -```python -# llmcompressor/modifiers/autoround/base.py — on_initialize() -if QuantizationMixin.has_config(self): - from compressed_tensors.offload import disable_onloading - with disable_onloading(): - QuantizationMixin.initialize_quantization(self, state.model) -``` - -### Why this is safe - -1. **Quant params are deterministic.** Both ranks compute identical scale/zero_point - values from the same quantization scheme. No broadcast is needed — each rank - produces the same data independently. - -2. **Params stay on GPU, which is correct.** Calibration runs next — the params need - to be on GPU for forward/backward. When the block is later offloaded to CPU, the - params follow the normal offload path. - -3. **Precedent exists.** `initialize_module_for_quantization` already uses - `disable_onloading()` for exactly this purpose — accessing `module.weight` without - triggering the onload path. - -4. **Scoped and temporary.** The context manager restores normal behavior after - `initialize_quantization` completes. All subsequent operations use the standard - onload/offload path. - -### Why not `force_local_cache` - -`force_local_cache` only affects `cls_from_device` (new cache *creation*). During -`initialize_quantization`, the `DistributedCPUCache` instances already exist on -modules — params are added to existing caches via `__setitem__`. `force_local_cache` -has no effect on this path. The CT maintainer also rejected this approach because -it changes global cache creation semantics, which could affect model weight loading. diff --git a/examples/autoround/ddp/reproduce.md b/examples/autoround/ddp/reproduce.md deleted file mode 100644 index 099d45f523..0000000000 --- a/examples/autoround/ddp/reproduce.md +++ /dev/null @@ -1,99 +0,0 @@ -# Multi-GPU DDP AutoRound Reproduce - -## torchrun (recommended) - -### 8B - -```bash -cd /home/yiliu7/workspace/llm-compressor - -bash examples/autoround/ddp/launch_torchrun.sh \ - --model /storage/yiliu7/Qwen/Qwen3-8B \ - --scheme W4A16 \ - --nsamples 32 --iters 50 \ - --disable_torch_compile -``` - -### 235B - -```bash -cd /home/yiliu7/workspace/llm-compressor - -AR_DISABLE_DATASET_SUBPROCESS=1 GPUS_PER_GROUP=2 CUDA_VISIBLE_DEVICES=0,1,2,3 \ -/home/yiliu7/workspace/venvs/llmc/bin/torchrun --nproc_per_node=2 --master_port=29500 \ -examples/autoround/ddp/ddp_qwen3_multi_gpu_torchrun.py \ ---model /storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507 \ ---scheme W4A16 --nsamples 32 --iters 50 --disable_torch_compile -``` - -## bash wrapper (dedicated GPU isolation) - -```bash -cd /home/yiliu7/workspace/llm-compressor - -AR_DISABLE_DATASET_SUBPROCESS=1 CUDA_VISIBLE_DEVICES=0,1,6,7 GPUS_PER_GROUP=2 NPROC=2 MASTER_PORT=29501 \ - bash examples/autoround/ddp/launch_multi_gpu.sh \ - ddp_qwen3_multi_gpu_example.py \ - --model /storage/yiliu7/Qwen/Qwen3-8B \ - --scheme W4A16 \ - --nsamples 32 --iters 50 \ - --disable_torch_compile \ - > /tmp/multi_gpu_test.log 2>&1 & -``` - -## Monitor - -```bash -tail -f /tmp/multi_gpu_test.log -ps aux | grep ddp_qwen3_multi | grep -v grep -nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader -pkill -f ddp_qwen3_multi_gpu -``` - -## Verified - -### 8B (2026-06-18) -``` -quantized 7/7 layers in the block, loss iter 0: 19.067873 -> iter 0: 19.067873 -[Rank 0] Quantization completed -Hello my name is Mandy I am 20 years old... -``` -All 37 decoder layers quantized, identical loss across ranks, sample generation works. - -### 235B (2026-06-19) -``` -quantized 388/389 layers in the block, loss iter 0: 0.211156 -> iter 0: 0.211156 -... -[Rank 0] Quantization completed -``` -All 94 decoder layers quantized (388 Linear per MoE block), identical loss across ranks. ~25 min for 1 iter. - -## Key Files - -| File | Change | -|------|--------| -| `examples/autoround/ddp/ddp_qwen3_multi_gpu_torchrun.py` | torchrun example with patches | -| `examples/autoround/ddp/ddp_qwen3_multi_gpu_example.py` | bash wrapper example | -| `examples/autoround/ddp/fast_pipeline.py` | Replaces `SequentialPipeline.__call__` — no FX trace | -| `examples/autoround/ddp/launch_torchrun.sh` | torchrun launcher | -| `examples/autoround/ddp/launch_multi_gpu.sh` | bash wrapper (GPU partitioning) | -| `src/llmcompressor/modifiers/autoround/base.py` | `_get_local_gpu_group_size()` reads `GPUS_PER_GROUP` | -| `src/llmcompressor/pipelines/sequential/helpers.py` | Removed `disable_onloading()` from `trace_subgraphs` | -| `ar-py/auto_round/utils/distributed.py` | `setup_ddp_if_needed_` returns `(block, sync_fn)`; `current_device()` for NCCL | -| `ar-py/auto_round/algorithms/quantization/sign_round/quantizer.py` | Captures return, calls `sync_gradients()` before `_step()` | - -## Required env vars - -| Var | Value | Why | -|-----|-------|-----| -| `GPUS_PER_GROUP` | `2` | Triggers multi-GPU block dispatch + manual all_reduce sync | -| `AR_DISABLE_DATASET_SUBPROCESS` | `1` | Avoids `fork()` with CUDA context | -| `--disable_torch_compile` | flag | torch.compile can't handle cross-device tensors | - -## Known issue: FX trace bottleneck - -`trace_subgraphs` runs an FX trace on the full model — for 61K-module models (235B) it never finishes. The `fast_pipeline.py` module bypasses this by creating subgraphs directly from decoder layer names. This affects ALL models using `SequentialPipeline`, not just DDP. The AWQ example (`qwen3_moe_example_ddp.py`) with 30B MoE also hangs. - -## Venv - -Python: `/home/yiliu7/workspace/venvs/llmc/bin/python` From 44ee3b9a99d43cddc78ac49bac1d669c890588db Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 25 Jun 2026 06:46:41 +0000 Subject: [PATCH 21/22] update Signed-off-by: yiliu30 --- examples/autoround/ddp/ddp_qwen3_moe_example.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/autoround/ddp/ddp_qwen3_moe_example.py b/examples/autoround/ddp/ddp_qwen3_moe_example.py index b89ed6ccd0..a5330a92ff 100644 --- a/examples/autoround/ddp/ddp_qwen3_moe_example.py +++ b/examples/autoround/ddp/ddp_qwen3_moe_example.py @@ -20,10 +20,10 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot -MODEL = "/storage/yiliu7/Qwen/Qwen3-235B-A22B-Instruct-2507" +MODEL = "Qwen/Qwen3-235B-A22B-Instruct-2507" SCHEME = "W4A16" -ITERS = 1 -NSAMPLES = 4 +ITERS = 200 +NSAMPLES = 256 ###### DDP INIT ##### gpus_per_group = int(os.environ.get("GPUS_PER_GROUP", "1")) From 47c58bb71c1da969a5a778534c75f3d01030f0b2 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 25 Jun 2026 06:47:30 +0000 Subject: [PATCH 22/22] fix Signed-off-by: yiliu30 --- src/llmcompressor/modifiers/autoround/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/autoround/base.py b/src/llmcompressor/modifiers/autoround/base.py index 2b6f297523..ca850052cf 100644 --- a/src/llmcompressor/modifiers/autoround/base.py +++ b/src/llmcompressor/modifiers/autoround/base.py @@ -389,7 +389,7 @@ def _update_device_map_for_dp(self, ar_kwargs): return # user explicitly set device_ids, respect it gpus_per_group = _get_local_gpu_group_size() if gpus_per_group > 1: - local_rank = torch.distributed.get_rank() + local_rank = int(os.environ.get("LOCAL_RANK", "0")) start_gpu = local_rank * gpus_per_group ar_kwargs["device_map"] = ",".join( str(start_gpu + i) for i in range(gpus_per_group)