From c5fb6c7a406b9e6a0e4f3a81a8cc163db946e59c Mon Sep 17 00:00:00 2001 From: john-rocky Date: Sun, 3 May 2026 21:03:23 +0900 Subject: [PATCH] Fix palettize_weights with enable_per_channel_scale=True crashing on ANE (macOS 26) When OpPalettizerConfig is configured with enable_per_channel_scale=True, palettize_weights wraps the constexpr_lut_to_dense output in a constexpr_blockwise_shift_scale op (data=<dense fp16 weight>, scale=<per_channel_scale>). On macOS 26, the MPSGraph backend lowering for that constexpr op fails verification when targeting the Apple Neural Engine: 'mps.dequantize' op operand #2 must be tensor of quantized values, but got 'tensor<1xf16>' ... failed assertion `original module failed verification' The MPSGraph lowering of constexpr_blockwise_shift_scale assumes the data operand is a quantized integer tensor (it lowers to mps.dequantize); with enable_per_channel_scale=True, the data is the dense fp16 weight, which fails that assumption. CPU and GPU compute units accept the wrapper and predict correctly; only the ANE-targeted MIL -> MPSGraph dispatch is broken. Fix: bake per_channel_scale into the LUT entries at compile time and re-emit constexpr_lut_to_dense, instead of leaving the scale as a runtime constexpr. Both data and scale are fp16 and the wrapper's only effect is data * scale, so the fold is mathematically identical. The failing MPSGraph dispatch is eliminated entirely, and CPU / GPU numerics stay bit-identical with the prior behavior. Resulting graph also has one fewer runtime constexpr per palettized const. Test updated: TestPalettizeWeights::test_palettization_pcs previously asserted that the constexpr_blockwise_shift_scale wrapper was emitted; it now asserts the wrapper is absent (the LUT is pre-scaled). Numerical equivalence vs the unpalettized model is verified by the existing verify_model_outputs call on macOS 15+. 
Tested: - test_palettization_pcs: PASS - All 155 TestPalettizeWeights / TestJointCompressWeights: PASS - Manual: Qwen3-VL 2B stateful chunk on macOS 26 + M4 ANE: MPSGraph verification crash gone (was reproducible at every load). --- .../optimize/coreml/_quantization_passes.py | 26 +++++++++++++++---- .../coreml/test_post_training_quantization.py | 8 +++--- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/coremltools/optimize/coreml/_quantization_passes.py b/coremltools/optimize/coreml/_quantization_passes.py index 849e2fe34..53ca42a2f 100644 --- a/coremltools/optimize/coreml/_quantization_passes.py +++ b/coremltools/optimize/coreml/_quantization_passes.py @@ -1139,12 +1139,28 @@ def transform_op(self, op: Operation): "Palettization with per-channel-scale is only supported since " "iOS18. Please set minimum_deployment_target accordingly." ) - new_var = mb.constexpr_blockwise_shift_scale( - data=new_var, - scale=per_channel_scale, - offset=None, - before_op=op, + # Bake per_channel_scale into the LUT entries instead of + # wrapping the dense weight in a runtime + # constexpr_blockwise_shift_scale: that wrapper fails MPSGraph + # verification on Apple Neural Engine (macOS 26+) because the + # mps.dequantize lowering expects an integer data operand. + # Folding is mathematically identical (output = data * scale). + lut = lut_params.lut.copy() + # LUT has trailing dims [group, num_palette, vector_size] that + # are not present in per_channel_scale; broadcast across those. 
+ pcs_bcast = per_channel_scale.reshape( + per_channel_scale.shape + + (1,) * (lut.ndim - per_channel_scale.ndim) + ) + lut = ( + lut.astype(np.float32) * pcs_bcast.astype(np.float32) + ).astype(lut.dtype) + new_var = frontend_utils._construct_constexpr_lut_op( + lut_params.indices, + lut, + lut_params.vector_axis, name=op.name + "_palettized_pcs", + before_op=op, ) else: decompressed_val = self.decompress(lut_params) diff --git a/coremltools/test/optimize/coreml/test_post_training_quantization.py b/coremltools/test/optimize/coreml/test_post_training_quantization.py index 37148aba6..4478eb993 100644 --- a/coremltools/test/optimize/coreml/test_post_training_quantization.py +++ b/coremltools/test/optimize/coreml/test_post_training_quantization.py @@ -1683,13 +1683,13 @@ def test_palettization_pcs(self, compute_unit, backend): op_type="constexpr_lut_to_dense" )[0] assert types.builtin_to_string(palettize_op.indices.dtype) == "uint4" - # The per-channel-scale is represented by a quant op to do scaling. + # per_channel_scale is folded into the LUT entries at compile time, so + # no runtime constexpr_blockwise_shift_scale wrapper is emitted (see + # palettize_weights in _quantization_passes.py for the rationale). quantize_ops = mlmodel_palettized._mil_program.functions["main"].find_ops( op_type="constexpr_blockwise_shift_scale" ) - assert len(quantize_ops) > 0 - # Order of quant and lut op is determined by canonicalize_quantized_lut_pattern graph pass. - assert quantize_ops[0].outputs[0].child_ops[0].op_type == "constexpr_lut_to_dense" + assert len(quantize_ops) == 0 if _macos_version() >= (15, 0): verify_model_outputs(mlmodel, mlmodel_palettized, coreml_input_values)