huggingface · avihu111 · May 18, 2026 · May 18, 2026 · May 18, 2026 · May 18, 2026
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -1078,6 +1078,8 @@
         title: GLM-ASR
       - local: model_doc/granite_speech
         title: GraniteSpeech
+      - local: model_doc/granite_speech_nar
+        title: GraniteSpeechNar
       - local: model_doc/granite_speech_plus
         title: GraniteSpeechPlus
       - local: model_doc/higgs_audio_v2

diff --git a/docs/source/en/model_doc/granite_speech_nar.md b/docs/source/en/model_doc/granite_speech_nar.md
@@ -0,0 +1,72 @@
+<!--Copyright 2026 IBM and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+*This model was released on 2026-03-09 and added to Hugging Face Transformers on 2026-06-03.*
+
+# GraniteSpeechNar
+
+## Overview
+
+GraniteSpeechNar is a non-autoregressive (NAR) speech recognition model based on [NLE: Non-autoregressive LLM-based ASR by Transcript Editing](https://huggingface.co/papers/2603.08397). It formulates ASR as conditional transcript editing, achieving fully parallel prediction with significant speedups over autoregressive baselines.
+
+The model consists of:
+
+1. **Conformer Encoder**: A Conformer encoder trained with CTC on BPE targets, using block-attention and self-conditioned CTC from the middle layer.
+
+2. **QFormer Projector**: A windowed query-transformer that maps multi-layer encoder features to the LLM embedding space with temporal downsampling.
+
+3. **Bidirectional Granite LLM**: A Granite language model with bidirectional (non-causal) attention that refines CTC predictions in a single forward pass.
+
+The model performs inference in a single pass. The encoder produces initial CTC predictions, which are interleaved with blank insertion slots (exploiting the identity-mapping bias of Transformers). The interleaved sequence is then fed, alongside the projected audio embeddings, to the bidirectional LLM, which refines it via a latent alignment objective.
+
+This model was contributed by [Avihu Dekel](https://huggingface.co/Avihu).
+
+## GraniteSpeechNarConfig
+
+[[autodoc]] GraniteSpeechNarConfig
+
+## GraniteSpeechNarEncoderConfig
+
+[[autodoc]] GraniteSpeechNarEncoderConfig
+
+## GraniteSpeechNarProjectorConfig
+
+[[autodoc]] GraniteSpeechNarProjectorConfig
+
+## GraniteSpeechNarProcessor
+
+[[autodoc]] GraniteSpeechNarProcessor
+    - __call__
+    - batch_decode
+
+## GraniteSpeechNarFeatureExtractor
+
+[[autodoc]] GraniteSpeechNarFeatureExtractor
+
+## GraniteSpeechNarModel
+
+[[autodoc]] GraniteSpeechNarModel
+    - forward
+
+## GraniteSpeechNarLanguageModel
+
+[[autodoc]] GraniteSpeechNarLanguageModel
+    - forward
+
+## GraniteSpeechNarForCTC
+
+[[autodoc]] GraniteSpeechNarForCTC
+    - forward
+    - generate
diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py
@@ -92,6 +92,7 @@
     "audioflamingo3": "qwen2_audio",
     "glmasr": "qwen2_audio",
     "musicflamingo": "qwen2_audio",
+    "granite_speech_nar": "granite_speech",
     "granite_speech_plus": "granite_speech",
     "gemma3n_text": "qwen3_5_text",
     "qwen3_5_moe_text": "qwen3_5_text",
@@ -116,6 +117,7 @@
     "AudioFlamingo3Model": "Qwen2AudioModel",
     "GlmAsrModel": "Qwen2AudioModel",
     "MusicFlamingoModel": "Qwen2AudioModel",
+    "GraniteSpeechNarModel": "GraniteSpeechModel",
     "GraniteSpeechPlusModel": "GraniteSpeechModel",
     "MaskFormerDetrDecoder": "DetrModel",
     "Qwen2_5_VLForConditionalGeneration": "Qwen2VLForConditionalGeneration",

diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -184,6 +184,7 @@
     from .granite import *
     from .granite4_vision import *
     from .granite_speech import *
+    from .granite_speech_nar import *
     from .granite_speech_plus import *
     from .granitemoe import *
     from .granitemoehybrid import *

diff --git a/src/transformers/models/auto/auto_mappings.py b/src/transformers/models/auto/auto_mappings.py
@@ -244,6 +244,9 @@
         ("granite4_vision_text", "Granite4VisionTextConfig"),
         ("granite_speech", "GraniteSpeechConfig"),
         ("granite_speech_encoder", "GraniteSpeechEncoderConfig"),
+        ("granite_speech_nar", "GraniteSpeechNarConfig"),
+        ("granite_speech_nar_encoder", "GraniteSpeechNarEncoderConfig"),
+        ("granite_speech_nar_projector", "GraniteSpeechNarProjectorConfig"),
         ("granite_speech_plus", "GraniteSpeechPlusConfig"),
         ("granite_speech_plus_encoder", "GraniteSpeechPlusEncoderConfig"),
         ("granitemoe", "GraniteMoeConfig"),
@@ -733,6 +736,8 @@
         ("glmasr_encoder", "glmasr"),
         ("granite4_vision_text", "granite4_vision"),
         ("granite_speech_encoder", "granite_speech"),
+        ("granite_speech_nar_encoder", "granite_speech_nar"),
+        ("granite_speech_nar_projector", "granite_speech_nar"),
         ("granite_speech_plus_encoder", "granite_speech_plus"),
         ("grounding-dino", "grounding_dino"),
         ("groupvit_text_model", "groupvit"),

diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
@@ -49,6 +49,7 @@
         ("gemma4", "Gemma4AudioFeatureExtractor"),
         ("glmasr", "WhisperFeatureExtractor"),
         ("granite_speech", "GraniteSpeechFeatureExtractor"),
+        ("granite_speech_nar", "GraniteSpeechNarFeatureExtractor"),
         ("granite_speech_plus", "GraniteSpeechFeatureExtractor"),
         ("higgs_audio_v2_tokenizer", "DacFeatureExtractor"),
         ("hubert", "Wav2Vec2FeatureExtractor"),

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -217,6 +217,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("granite", "GraniteModel"),
         ("granite4_vision", "Granite4VisionModel"),
         ("granite_speech", "GraniteSpeechModel"),
+        ("granite_speech_nar", "GraniteSpeechNarForCTC"),
         ("granite_speech_plus", "GraniteSpeechPlusModel"),
         ("granitemoe", "GraniteMoeModel"),
         ("granitemoehybrid", "GraniteMoeHybridModel"),
@@ -1672,6 +1673,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
     [
         # Model for Connectionist temporal classification (CTC) mapping
         ("data2vec-audio", "Data2VecAudioForCTC"),
+        ("granite_speech_nar", "GraniteSpeechNarForCTC"),
         ("hubert", "HubertForCTC"),
         ("lasr_ctc", "LasrForCTC"),
         ("parakeet_ctc", "ParakeetForCTC"),

diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
@@ -93,6 +93,7 @@
             ("got_ocr2", "GotOcr2Processor"),
             ("granite4_vision", "Granite4VisionProcessor"),
             ("granite_speech", "GraniteSpeechProcessor"),
+            ("granite_speech_nar", "GraniteSpeechNarProcessor"),
             ("granite_speech_plus", "GraniteSpeechProcessor"),
             ("grounding-dino", "GroundingDinoProcessor"),
             ("groupvit", "CLIPProcessor"),

diff --git a/src/transformers/models/granite_speech_nar/__init__.py b/src/transformers/models/granite_speech_nar/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2026 IBM and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_granite_speech_nar import *
+    from .feature_extraction_granite_speech_nar import *
+    from .modeling_granite_speech_nar import *
+    from .processing_granite_speech_nar import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)