From c5fb6c7a406b9e6a0e4f3a81a8cc163db946e59c Mon Sep 17 00:00:00 2001
From: john-rocky <rockyshikoku@gmail.com>
Date: Sun, 3 May 2026 21:03:23 +0900
Subject: [PATCH] Fix palettize_weights with enable_per_channel_scale=True
 crashing on ANE (macOS 26)

When OpPalettizerConfig is configured with enable_per_channel_scale=True,
palettize_weights wraps the constexpr_lut_to_dense output in a
constexpr_blockwise_shift_scale op (data=<dense fp16 weight>, scale=<per-channel
fp16>). On macOS 26, the MPSGraph backend lowering for that constexpr op fails
verification when targeting the Apple Neural Engine:

    'mps.dequantize' op operand #2 must be tensor of quantized values,
    but got 'tensor<1xf16>'
    ... failed assertion `original module failed verification'

The MPSGraph lowering of constexpr_blockwise_shift_scale assumes the data
operand is a quantized integer tensor (it lowers to mps.dequantize); with
enable_per_channel_scale=True, the data is the dense fp16 weight, which fails
that assumption. CPU and GPU compute units accept the wrapper and predict
correctly; only the ANE-targeted MIL -> MPSGraph dispatch is broken.

Fix: bake per_channel_scale into the LUT entries at compile time and re-emit
constexpr_lut_to_dense, instead of leaving the scale as a runtime constexpr.
Both data and scale are fp16 and the wrapper's only effect is data * scale, so
the fold is mathematically identical. (The product is computed in fp32 and cast
back to the LUT dtype.) The failing MPSGraph dispatch is eliminated entirely,
and CPU / GPU numerics stay bit-identical with the prior behavior. The
resulting graph also has one fewer runtime constexpr op per palettized const.

Test updated: TestPalettizeWeights::test_palettization_pcs previously asserted
that the constexpr_blockwise_shift_scale wrapper was emitted; it now asserts
the wrapper is absent (the LUT is pre-scaled). Numerical equivalence vs the
unpalettized model is verified by the existing verify_model_outputs call on
macOS 15+.

Tested:
  - test_palettization_pcs:                                    PASS
  - All 155 TestPalettizeWeights / TestJointCompressWeights:   PASS
  - Manual: Qwen3-VL 2B stateful chunk on macOS 26 + M4 ANE:
    MPSGraph verification crash gone (was reproducible at every load).
---
 .../optimize/coreml/_quantization_passes.py   | 26 +++++++++++++++----
 .../coreml/test_post_training_quantization.py |  8 +++---
 2 files changed, 25 insertions(+), 9 deletions(-)
diff --git a/coremltools/optimize/coreml/_quantization_passes.py b/coremltools/optimize/coreml/_quantization_passes.py
index 849e2fe34..53ca42a2f 100644
--- a/coremltools/optimize/coreml/_quantization_passes.py
+++ b/coremltools/optimize/coreml/_quantization_passes.py
@@ -1139,12 +1139,28 @@ def transform_op(self, op: Operation):
                         "Palettization with per-channel-scale is only supported since "
                         "iOS18. Please set minimum_deployment_target accordingly."
                     )
-                new_var = mb.constexpr_blockwise_shift_scale(
-                    data=new_var,
-                    scale=per_channel_scale,
-                    offset=None,
-                    before_op=op,
+                # Bake per_channel_scale into the LUT entries instead of
+                # wrapping the dense weight in a runtime
+                # constexpr_blockwise_shift_scale: that wrapper fails MPSGraph
+                # verification on Apple Neural Engine (macOS 26+) because the
+                # mps.dequantize lowering expects an integer data operand.
+                # Folding is mathematically identical (output = data * scale).
+                lut = lut_params.lut.copy()
+                # LUT has trailing dims [group, num_palette, vector_size] that
+                # are not present in per_channel_scale; broadcast across those.
+                pcs_bcast = per_channel_scale.reshape(
+                    per_channel_scale.shape
+                    + (1,) * (lut.ndim - per_channel_scale.ndim)
+                )
+                lut = (
+                    lut.astype(np.float32) * pcs_bcast.astype(np.float32)
+                ).astype(lut.dtype)
+                new_var = frontend_utils._construct_constexpr_lut_op(
+                    lut_params.indices,
+                    lut,
+                    lut_params.vector_axis,
                     name=op.name + "_palettized_pcs",
+                    before_op=op,
                 )
         else:
             decompressed_val = self.decompress(lut_params)
diff --git a/coremltools/test/optimize/coreml/test_post_training_quantization.py b/coremltools/test/optimize/coreml/test_post_training_quantization.py
index 37148aba6..4478eb993 100644
--- a/coremltools/test/optimize/coreml/test_post_training_quantization.py
+++ b/coremltools/test/optimize/coreml/test_post_training_quantization.py
@@ -1683,13 +1683,13 @@ def test_palettization_pcs(self, compute_unit, backend):
             op_type="constexpr_lut_to_dense"
         )[0]
         assert types.builtin_to_string(palettize_op.indices.dtype) == "uint4"
-        # The per-channel-scale is represented by a quant op to do scaling.
+        # per_channel_scale is folded into the LUT entries at compile time, so
+        # no runtime constexpr_blockwise_shift_scale wrapper is emitted (see
+        # palettize_weights in _quantization_passes.py for the rationale).
         quantize_ops = mlmodel_palettized._mil_program.functions["main"].find_ops(
             op_type="constexpr_blockwise_shift_scale"
         )
-        assert len(quantize_ops) > 0
-        # Order of quant and lut op is determined by canonicalize_quantized_lut_pattern graph pass.
-        assert quantize_ops[0].outputs[0].child_ops[0].op_type == "constexpr_lut_to_dense"
+        assert len(quantize_ops) == 0
 
         if _macos_version() >= (15, 0):
             verify_model_outputs(mlmodel, mlmodel_palettized, coreml_input_values)