From 710b8c17c13bd24b839121608efd29fd5801e4f0 Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Fri, 22 Aug 2025 11:23:55 +0000
Subject: [PATCH 01/20] 1st draft

---
 Dockerfile.neuron                             |  43 +++++
 backends/Cargo.toml                           |   1 +
 backends/neuron/Cargo.toml                    |  16 ++
 backends/neuron/server/README.md              |   0
 .../server/text_embeddings_server/__init__.py |   0
 .../server/text_embeddings_server/cli.py      |  55 +++++++
 .../text_embeddings_server/models/__init__.py | 126 +++++++++++++++
 .../server/text_embeddings_server/server.py   |  92 +++++++++++
 backends/neuron/src/lib.rs                    | 132 ++++++++++++++++
 backends/neuron/src/logging.rs                |  61 ++++++++
 backends/neuron/src/management.rs             | 148 ++++++++++++++++++
 docs/source/en/_toctree.yml                   |   2 +
 docs/source/en/local_neuron.md                |   1 +
 integration_tests/neuron/conftest.py          |   0
 integration_tests/neuron/test_embed.py        |   0
 15 files changed, 677 insertions(+)
 create mode 100644 Dockerfile.neuron
 create mode 100644 backends/neuron/Cargo.toml
 create mode 100644 backends/neuron/server/README.md
 create mode 100644 backends/neuron/server/text_embeddings_server/__init__.py
 create mode 100644 backends/neuron/server/text_embeddings_server/cli.py
 create mode 100644 backends/neuron/server/text_embeddings_server/models/__init__.py
 create mode 100644 backends/neuron/server/text_embeddings_server/server.py
 create mode 100644 backends/neuron/src/lib.rs
 create mode 100644 backends/neuron/src/logging.rs
 create mode 100644 backends/neuron/src/management.rs
 create mode 100644 docs/source/en/local_neuron.md
 create mode 100644 integration_tests/neuron/conftest.py
 create mode 100644 integration_tests/neuron/test_embed.py

diff --git a/Dockerfile.neuron b/Dockerfile.neuron
new file mode 100644
index 000000000..f8b03ab26
--- /dev/null
+++ b/Dockerfile.neuron
@@ -0,0 +1,43 @@
+ARG PLATFORM=neuron
+FROM lukemathwalker/cargo-chef:latest-rust-1.85-bookworm AS chef
+WORKDIR /usr/src
+
+ENV SCCACHE=0.10.0
+ENV RUSTC_WRAPPER=/usr/local/bin/sccache
+
+# Download and configure sccache
+RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \
+    chmod +x /usr/local/bin/sccache
+
+FROM chef AS planner
+
+COPY backends backends
+COPY core core
+COPY router router
+COPY Cargo.toml ./
+COPY Cargo.lock ./
+
+RUN cargo chef prepare  --recipe-path recipe.json
+
+FROM chef AS builder
+
+ARG GIT_SHA
+ARG DOCKER_LABEL
+
+# sccache specific variables
+ARG SCCACHE_GHA_ENABLED
+
+COPY --from=planner /usr/src/recipe.json recipe.json
+
+RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    cargo chef cook --release --features python --no-default-features --recipe-path recipe.json && sccache -s
+
+COPY backends backends
+COPY core core
+COPY router router
+COPY Cargo.toml ./
+COPY Cargo.lock ./
+
+WORKDIR /usr/src
+
diff --git a/backends/Cargo.toml b/backends/Cargo.toml
index bb9d74191..7d821ff40 100644
--- a/backends/Cargo.toml
+++ b/backends/Cargo.toml
@@ -21,6 +21,7 @@ rand = { workspace = true }
 [features]
 clap = ["dep:clap", "text-embeddings-backend-core/clap"]
 python = ["dep:text-embeddings-backend-python"]
+neuron = ["dep:text-embeddings-backend-neuron"]
 ort = ["dep:text-embeddings-backend-ort"]
 candle = ["dep:text-embeddings-backend-candle"]
 cuda = ["text-embeddings-backend-candle?/cuda"]
diff --git a/backends/neuron/Cargo.toml b/backends/neuron/Cargo.toml
new file mode 100644
index 000000000..b38f350ed
--- /dev/null
+++ b/backends/neuron/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "text-embeddings-backend-python"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+homepage.workspace = true
+
+[dependencies]
+backend-grpc-client = { path = "../grpc-client" }
+nohash-hasher = "^0.2"
+serde = { version = "^1.0", features = ["derive"]  }
+serde_json = "^1.0"
+text-embeddings-backend-core = { path = "../core" }
+thiserror = "^1.0"
+tokio = { version = "^1.25", features = ["sync"] }
+tracing = "^0.1"
diff --git a/backends/neuron/server/README.md b/backends/neuron/server/README.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/backends/neuron/server/text_embeddings_server/__init__.py b/backends/neuron/server/text_embeddings_server/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/backends/neuron/server/text_embeddings_server/cli.py b/backends/neuron/server/text_embeddings_server/cli.py
new file mode 100644
index 000000000..c4dfaa4c1
--- /dev/null
+++ b/backends/neuron/server/text_embeddings_server/cli.py
@@ -0,0 +1,55 @@
+import sys
+import typer
+
+from pathlib import Path
+from loguru import logger
+from typing import Optional
+from enum import Enum
+
+app = typer.Typer()
+
+
+class Dtype(str, Enum):
+    float32 = "float32"
+    float16 = "float16"
+    bloat16 = "bfloat16"
+
+
+@app.command()
+def serve(
+    model_path: Path,
+    dtype: Dtype = "float32",
+    uds_path: Path = "/tmp/text-embeddings-server",
+    logger_level: str = "INFO",
+    json_output: bool = False,
+    otlp_endpoint: Optional[str] = None,
+    otlp_service_name: str = "text-embeddings-inference.server",
+    pool: str = "cls",
+):
+    # Remove default handler
+    logger.remove()
+    logger.add(
+        sys.stdout,
+        format="{message}",
+        filter="text_embeddings_server",
+        level=logger_level,
+        serialize=json_output,
+        backtrace=True,
+        diagnose=False,
+    )
+
+    # Import here after the logger is added to log potential import exceptions
+    from text_embeddings_server import server
+    from text_embeddings_server.utils.tracing import setup_tracing
+
+    # Setup OpenTelemetry distributed tracing
+    if otlp_endpoint is not None:
+        setup_tracing(otlp_endpoint=otlp_endpoint, otlp_service_name=otlp_service_name)
+
+    # Downgrade enum into str for easier management later on
+    dtype = None if dtype is None else dtype.value
+    server.serve(model_path, dtype, uds_path, pool)
+
+
+if __name__ == "__main__":
+    app()
diff --git a/backends/neuron/server/text_embeddings_server/models/__init__.py b/backends/neuron/server/text_embeddings_server/models/__init__.py
new file mode 100644
index 000000000..06c39832c
--- /dev/null
+++ b/backends/neuron/server/text_embeddings_server/models/__init__.py
@@ -0,0 +1,126 @@
+import os
+import torch
+
+from loguru import logger
+from pathlib import Path
+from typing import Optional
+from transformers import AutoConfig
+from transformers.models.bert import BertConfig
+
+from text_embeddings_server.models.model import Model
+from text_embeddings_server.models.masked_model import MaskedLanguageModel
+from text_embeddings_server.models.default_model import DefaultModel
+from text_embeddings_server.models.classification_model import ClassificationModel
+from text_embeddings_server.models.jinaBert_model import FlashJinaBert
+from text_embeddings_server.models.flash_mistral import FlashMistral
+from text_embeddings_server.models.flash_qwen3 import FlashQwen3
+from text_embeddings_server.utils.device import get_device, use_ipex
+
+__all__ = ["Model"]
+
+TRUST_REMOTE_CODE = os.getenv("TRUST_REMOTE_CODE", "false").lower() in ["true", "1"]
+DISABLE_TENSOR_CACHE = os.getenv("DISABLE_TENSOR_CACHE", "false").lower() in [
+    "true",
+    "1",
+]
+# Disable gradients
+torch.set_grad_enabled(False)
+
+FLASH_ATTENTION = True
+try:
+    from text_embeddings_server.models.flash_bert import FlashBert
+except ImportError as e:
+    logger.warning(f"Could not import Flash Attention enabled models: {e}")
+    FLASH_ATTENTION = False
+
+if FLASH_ATTENTION:
+    __all__.append(FlashBert)
+
+
+def create_model(model_class, model_path, device, datatype, pool="cls"):
+    """Create a model instance and load it into Neuron devices."""
+    model_handle = model_class(
+        model_path,
+        device,
+        datatype,
+        pool,
+        trust_remote=TRUST_REMOTE_CODE,
+    )
+    return model_handle
+
+
+def get_model(model_path: Path, dtype: Optional[str], pool: str):
+    if dtype == "float32":
+        datatype = torch.float32
+    elif dtype == "float16":
+        datatype = torch.float16
+    elif dtype == "bfloat16":
+        datatype = torch.bfloat16
+    else:
+        raise RuntimeError(f"Unknown dtype {dtype}")
+
+    device = get_device()
+    logger.info(f"backend device: {device}")
+
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=TRUST_REMOTE_CODE)
+
+    if (
+        hasattr(config, "auto_map")
+        and isinstance(config.auto_map, dict)
+        and "AutoModel" in config.auto_map
+        and config.auto_map["AutoModel"]
+        == "jinaai/jina-bert-v2-qk-post-norm--modeling_bert.JinaBertModel"
+    ):
+        # Add specific offline modeling for model "jinaai/jina-embeddings-v2-base-code" which uses "autoMap" to reference code in other repository
+        return create_model(FlashJinaBert, model_path, device, datatype)
+
+    if config.model_type == "bert":
+        config: BertConfig
+        if (
+            use_ipex()
+            or device.type in ["cuda", "hpu"]
+            and config.position_embedding_type == "absolute"
+            and datatype in [torch.float16, torch.bfloat16]
+            and FLASH_ATTENTION
+        ):
+            if pool != "cls":
+                if config.architectures[0].endswith("ForMaskedLM") and pool == "splade":
+                    return create_model(
+                        MaskedLanguageModel, model_path, device, datatype, pool
+                    )
+                return create_model(DefaultModel, model_path, device, datatype, pool)
+
+            try:
+                return create_model(FlashBert, model_path, device, datatype)
+            except FileNotFoundError:
+                logger.info(
+                    "Do not have safetensors file for this model, use default transformers model path instead"
+                )
+                return create_model(DefaultModel, model_path, device, datatype, pool)
+
+        if config.architectures[0].endswith("Classification"):
+            return create_model(ClassificationModel, model_path, device, datatype)
+        elif config.architectures[0].endswith("ForMaskedLM") and pool == "splade":
+            return create_model(MaskedLanguageModel, model_path, device, datatype)
+        else:
+            return create_model(DefaultModel, model_path, device, datatype, pool)
+
+    if config.model_type == "mistral" and device.type == "hpu":
+        try:
+            return create_model(FlashMistral, model_path, device, datatype, pool)
+        except FileNotFoundError:
+            return create_model(DefaultModel, model_path, device, datatype, pool)
+
+    if config.model_type == "qwen3" and device.type == "hpu":
+        try:
+            return create_model(FlashQwen3, model_path, device, datatype, pool)
+        except FileNotFoundError:
+            return create_model(DefaultModel, model_path, device, datatype, pool)
+
+    # Default case
+    if config.architectures[0].endswith("Classification"):
+        return create_model(ClassificationModel, model_path, device, datatype)
+    elif config.architectures[0].endswith("ForMaskedLM") and pool == "splade":
+        return create_model(MaskedLanguageModel, model_path, device, datatype)
+    else:
+        return create_model(DefaultModel, model_path, device, datatype, pool)
diff --git a/backends/neuron/server/text_embeddings_server/server.py b/backends/neuron/server/text_embeddings_server/server.py
new file mode 100644
index 000000000..646d79bc9
--- /dev/null
+++ b/backends/neuron/server/text_embeddings_server/server.py
@@ -0,0 +1,92 @@
+import asyncio
+import torch
+from grpc import aio
+from loguru import logger
+
+from grpc_reflection.v1alpha import reflection
+from pathlib import Path
+from typing import Optional
+
+from text_embeddings_server.models import Model, get_model
+from text_embeddings_server.pb import embed_pb2_grpc, embed_pb2
+from text_embeddings_server.utils.tracing import UDSOpenTelemetryAioServerInterceptor
+from text_embeddings_server.utils.interceptor import ExceptionInterceptor
+
+
+class EmbeddingService(embed_pb2_grpc.EmbeddingServiceServicer):
+    def __init__(self, model: Model):
+        self.model = model
+        # Force inference mode for the lifetime of EmbeddingService
+        self._inference_mode_raii_guard = torch._C._InferenceMode(True)
+
+    async def Health(self, request, context):
+        if self.model.device.type == "cuda":
+            torch.zeros((2, 2), device="cuda")
+        return embed_pb2.HealthResponse()
+
+    async def Embed(self, request, context):
+        max_input_length = self.model.max_input_length
+        batch = self.model.batch_type.from_pb(
+            request, self.model.device, max_input_length
+        )
+
+        embeddings = self.model.embed(batch)
+
+        return embed_pb2.EmbedResponse(embeddings=embeddings)
+
+    async def Predict(self, request, context):
+        max_input_length = self.model.max_input_length
+        batch = self.model.batch_type.from_pb(
+            request, self.model.device, max_input_length
+        )
+
+        scores = self.model.predict(batch)
+
+        return embed_pb2.PredictResponse(scores=scores)
+
+
+def serve(
+    model_path: Path,
+    dtype: Optional[str],
+    uds_path: Path,
+    pool: str,
+):
+    async def serve_inner(
+        model_path: Path,
+        dtype: Optional[str] = None,
+    ):
+        unix_socket = f"unix://{uds_path}"
+
+        try:
+            model = get_model(model_path, dtype, pool)
+        except Exception:
+            logger.exception("Error when initializing model")
+            raise
+
+        server = aio.server(
+            interceptors=[
+                ExceptionInterceptor(),
+                UDSOpenTelemetryAioServerInterceptor(),
+            ]
+        )
+        embed_pb2_grpc.add_EmbeddingServiceServicer_to_server(
+            EmbeddingService(model), server
+        )
+        SERVICE_NAMES = (
+            embed_pb2.DESCRIPTOR.services_by_name["EmbeddingService"].full_name,
+            reflection.SERVICE_NAME,
+        )
+        reflection.enable_server_reflection(SERVICE_NAMES, server)
+        server.add_insecure_port(unix_socket)
+
+        await server.start()
+
+        logger.info(f"Server started at {unix_socket}")
+
+        try:
+            await server.wait_for_termination()
+        except KeyboardInterrupt:
+            logger.info("Signal received. Shutting down")
+            await server.stop(0)
+
+    asyncio.run(serve_inner(model_path, dtype))
diff --git a/backends/neuron/src/lib.rs b/backends/neuron/src/lib.rs
new file mode 100644
index 000000000..53255b07d
--- /dev/null
+++ b/backends/neuron/src/lib.rs
@@ -0,0 +1,132 @@
+mod logging;
+mod management;
+
+use backend_grpc_client::Client;
+use nohash_hasher::BuildNoHashHasher;
+use std::collections::HashMap;
+use text_embeddings_backend_core::{
+    Backend, BackendError, Batch, Embedding, Embeddings, ModelType, Pool, Predictions,
+};
+use tokio::runtime::Runtime;
+
+pub struct PythonBackend {
+    _backend_process: management::BackendProcess,
+    tokio_runtime: Runtime,
+    backend_client: Client,
+}
+
+impl PythonBackend {
+    pub fn new(
+        model_path: String,
+        dtype: String,
+        model_type: ModelType,
+        uds_path: String,
+        otlp_endpoint: Option<String>,
+        otlp_service_name: String,
+    ) -> Result<Self, BackendError> {
+        let pool = match model_type {
+            ModelType::Classifier => Pool::Cls,
+            ModelType::Embedding(pool) => pool,
+        };
+
+        let backend_process = management::BackendProcess::new(
+            model_path,
+            dtype,
+            &uds_path,
+            otlp_endpoint,
+            otlp_service_name,
+            pool,
+        )?;
+        let tokio_runtime = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .map_err(|err| BackendError::Start(format!("Could not start Tokio runtime: {err}")))?;
+
+        let backend_client = tokio_runtime
+            .block_on(Client::connect_uds(uds_path))
+            .map_err(|err| {
+                BackendError::Start(format!("Could not connect to backend process: {err}"))
+            })?;
+
+        Ok(Self {
+            _backend_process: backend_process,
+            tokio_runtime,
+            backend_client,
+        })
+    }
+}
+
+impl Backend for PythonBackend {
+    fn health(&self) -> Result<(), BackendError> {
+        if self
+            .tokio_runtime
+            .block_on(self.backend_client.clone().health())
+            .is_err()
+        {
+            return Err(BackendError::Unhealthy);
+        }
+        Ok(())
+    }
+
+    fn is_padded(&self) -> bool {
+        false
+    }
+
+    fn embed(&self, batch: Batch) -> Result<Embeddings, BackendError> {
+        if !batch.raw_indices.is_empty() {
+            return Err(BackendError::Inference(
+                "raw embeddings are not supported for the Python backend.".to_string(),
+            ));
+        }
+        let batch_size = batch.len();
+
+        let results = self
+            .tokio_runtime
+            .block_on(self.backend_client.clone().embed(
+                batch.input_ids,
+                batch.token_type_ids,
+                batch.position_ids,
+                batch.cumulative_seq_lengths,
+                batch.max_length,
+            ))
+            .map_err(|err| BackendError::Inference(err.to_string()))?;
+        let pooled_embeddings: Vec<Vec<f32>> = results.into_iter().map(|r| r.values).collect();
+
+        let mut embeddings =
+            HashMap::with_capacity_and_hasher(batch_size, BuildNoHashHasher::default());
+        for (i, e) in pooled_embeddings.into_iter().enumerate() {
+            embeddings.insert(i, Embedding::Pooled(e));
+        }
+
+        Ok(embeddings)
+    }
+
+    fn predict(&self, batch: Batch) -> Result<Predictions, BackendError> {
+        if !batch.raw_indices.is_empty() {
+            return Err(BackendError::Inference(
+                "raw embeddings are not supported for the Python backend.".to_string(),
+            ));
+        }
+        let batch_size = batch.len();
+        let results = self
+            .tokio_runtime
+            .block_on(self.backend_client.clone().predict(
+                batch.input_ids,
+                batch.token_type_ids,
+                batch.position_ids,
+                batch.cumulative_seq_lengths,
+                batch.max_length,
+            ))
+            .map_err(|err| BackendError::Inference(err.to_string()))?;
+        let raw_results: Vec<Vec<f32>> = results.into_iter().map(|r| r.values).collect();
+
+        let mut predictions =
+            HashMap::with_capacity_and_hasher(batch_size, BuildNoHashHasher::default());
+
+        for (i, r) in raw_results.into_iter().enumerate() {
+            predictions.insert(i, r);
+        }
+
+        Ok(predictions)
+    }
+}
diff --git a/backends/neuron/src/logging.rs b/backends/neuron/src/logging.rs
new file mode 100644
index 000000000..8f55e8e6b
--- /dev/null
+++ b/backends/neuron/src/logging.rs
@@ -0,0 +1,61 @@
+use serde::Deserialize;
+use std::io::{BufRead, Lines};
+
+#[derive(Deserialize)]
+#[serde(rename_all = "UPPERCASE")]
+enum PythonLogLevelEnum {
+    Trace,
+    Debug,
+    Info,
+    Success,
+    Warning,
+    Error,
+    Critical,
+}
+
+#[derive(Deserialize)]
+struct PythonLogLevel {
+    name: PythonLogLevelEnum,
+}
+
+#[derive(Deserialize)]
+struct PythonLogRecord {
+    level: PythonLogLevel,
+}
+
+#[derive(Deserialize)]
+struct PythonLogMessage {
+    text: String,
+    record: PythonLogRecord,
+}
+
+impl PythonLogMessage {
+    fn trace(&self) {
+        match self.record.level.name {
+            PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text),
+            PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text),
+            PythonLogLevelEnum::Info => tracing::info!("{}", self.text),
+            PythonLogLevelEnum::Success => tracing::info!("{}", self.text),
+            PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text),
+            PythonLogLevelEnum::Error => tracing::error!("{}", self.text),
+            PythonLogLevelEnum::Critical => tracing::error!("{}", self.text),
+        }
+    }
+}
+
+impl TryFrom<&String> for PythonLogMessage {
+    type Error = serde_json::Error;
+
+    fn try_from(value: &String) -> Result<Self, Self::Error> {
+        serde_json::from_str::<Self>(value)
+    }
+}
+
+pub(crate) fn log_lines<S: Sized + BufRead>(lines: Lines<S>) {
+    for line in lines.map_while(Result::ok) {
+        match PythonLogMessage::try_from(&line) {
+            Ok(log) => log.trace(),
+            Err(_) => tracing::debug!("{line}"),
+        }
+    }
+}
diff --git a/backends/neuron/src/management.rs b/backends/neuron/src/management.rs
new file mode 100644
index 000000000..81c294a92
--- /dev/null
+++ b/backends/neuron/src/management.rs
@@ -0,0 +1,148 @@
+use crate::logging::log_lines;
+use std::ffi::OsString;
+use std::io::{BufRead, BufReader};
+use std::os::unix::process::{CommandExt, ExitStatusExt};
+use std::path::Path;
+use std::process::{Child, Command, Stdio};
+use std::sync::mpsc;
+use std::thread::sleep;
+use std::time::{Duration, Instant};
+use std::{env, fs, io, thread};
+use text_embeddings_backend_core::{BackendError, Pool};
+
+#[derive(Debug)]
+pub(crate) struct BackendProcess {
+    inner: Child,
+}
+
+impl BackendProcess {
+    pub(crate) fn new(
+        model_path: String,
+        dtype: String,
+        uds_path: &str,
+        otlp_endpoint: Option<String>,
+        otlp_service_name: String,
+        pool: Pool,
+    ) -> Result<Self, BackendError> {
+        // Get UDS path
+        let uds = Path::new(uds_path);
+
+        // Clean previous runs
+        if uds.exists() {
+            fs::remove_file(uds).expect("could not remove UDS file");
+        }
+
+        let pool = match pool {
+            Pool::Cls => "cls",
+            Pool::Mean => "mean",
+            Pool::LastToken => "lasttoken",
+            Pool::Splade => "splade",
+        };
+
+        // Process args
+        let mut python_server_args = vec![
+            model_path,
+            "--dtype".to_owned(),
+            dtype,
+            "--uds-path".to_owned(),
+            uds_path.to_owned(),
+            "--logger-level".to_owned(),
+            "INFO".to_owned(),
+            "--json-output".to_owned(),
+            "--pool".to_owned(),
+            pool.to_owned(),
+        ];
+
+        // OpenTelemetry
+        if let Some(otlp_endpoint) = otlp_endpoint {
+            python_server_args.push("--otlp-endpoint".to_owned());
+            python_server_args.push(otlp_endpoint);
+        }
+
+        python_server_args.push("--otlp-service-name".to_owned());
+        python_server_args.push(otlp_service_name);
+
+        // Copy current process env
+        let envs: Vec<(OsString, OsString)> = env::vars_os().collect();
+
+        tracing::info!("Starting Python backend");
+        let mut p = match Command::new("python-text-embeddings-server")
+            .args(python_server_args)
+            .envs(envs)
+            .stdout(Stdio::piped())
+            .stderr(Stdio::piped())
+            .process_group(0)
+            .spawn()
+        {
+            Ok(p) => p,
+            Err(err) => {
+                if err.kind() == io::ErrorKind::NotFound {
+                    return Err(BackendError::Start(
+                        "python-text-embeddings-server not found in PATH".to_owned(),
+                    ));
+                }
+                return Err(BackendError::Start(err.to_string()));
+            }
+        };
+
+        let stdout_reader = BufReader::new(p.stdout.take().unwrap());
+        let stderr_reader = BufReader::new(p.stderr.take().unwrap());
+
+        //stdout tracing thread
+        thread::spawn(move || {
+            let _span = tracing::span!(tracing::Level::INFO, "python-backend").entered();
+            log_lines(stdout_reader.lines());
+        });
+
+        let start_time = Instant::now();
+        let mut wait_time = Instant::now();
+
+        loop {
+            // Process exited
+            if let Some(exit_status) = p.try_wait().unwrap() {
+                // We read stderr in another thread as it seems that lines() can block in some cases
+                let (err_sender, err_receiver) = mpsc::channel();
+                thread::spawn(move || {
+                    for line in stderr_reader.lines().map_while(Result::ok) {
+                        err_sender.send(line).unwrap_or(());
+                    }
+                });
+                let mut err = String::new();
+                while let Ok(line) = err_receiver.recv_timeout(Duration::from_millis(10)) {
+                    err = err + "\n" + &line;
+                }
+
+                tracing::debug!("Python Backend complete standard error output:\n{err}");
+
+                if let Some(signal) = exit_status.signal() {
+                    return Err(BackendError::Start(format!(
+                        "Python Backend process was signaled to shutdown with signal {signal}"
+                    )));
+                }
+                return Err(BackendError::Start(
+                    "Python backend failed to start".to_string(),
+                ));
+            }
+
+            // Shard is ready
+            if uds.exists() {
+                tracing::info!("Python backend ready in {:?}", start_time.elapsed());
+                break;
+            } else if wait_time.elapsed() > Duration::from_secs(10) {
+                tracing::info!("Waiting for Python backend to be ready...");
+                wait_time = Instant::now();
+            }
+            sleep(Duration::from_millis(5));
+        }
+
+        Ok(Self { inner: p })
+    }
+}
+
+impl Drop for BackendProcess {
+    fn drop(&mut self) {
+        self.inner.kill().unwrap();
+        let _ = self.inner.wait();
+        tracing::info!("Python backend process terminated");
+    }
+}
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index fa6f21e63..b9eebac2c 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -19,6 +19,8 @@
     title: Build custom container for TEI
   - local: intel_container
     title: Using TEI container with Intel Hardware
+  - local: local_neuron
+    title: Using TEI container with AWS Neuron
   - local: examples
     title: Example uses
   title: Tutorials
diff --git a/docs/source/en/local_neuron.md b/docs/source/en/local_neuron.md
new file mode 100644
index 000000000..e0a2cf2ba
--- /dev/null
+++ b/docs/source/en/local_neuron.md
@@ -0,0 +1 @@
+# Neuron backend for AWS Trainium and Inferentia
\ No newline at end of file
diff --git a/integration_tests/neuron/conftest.py b/integration_tests/neuron/conftest.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/integration_tests/neuron/test_embed.py b/integration_tests/neuron/test_embed.py
new file mode 100644
index 000000000..e69de29bb

From 139b179f1cd346705fc3267b1f39162e438d8b21 Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Wed, 22 Oct 2025 16:29:32 +0000
Subject: [PATCH 02/20] feat: sentence transformer for neuron

---
 backends/neuron/Cargo.toml                    |  16 --
 backends/neuron/server/README.md              |   0
 .../server/text_embeddings_server/__init__.py |   0
 .../server/text_embeddings_server/cli.py      |  55 -------
 .../text_embeddings_server/models/__init__.py | 126 ---------------
 .../server/text_embeddings_server/server.py   |  92 -----------
 backends/neuron/src/lib.rs                    | 132 ----------------
 backends/neuron/src/logging.rs                |  61 --------
 backends/neuron/src/management.rs             | 148 ------------------
 .../text_embeddings_server/models/__init__.py |   9 +-
 .../models/neuron_models.py                   |  67 ++++++++
 .../text_embeddings_server/utils/device.py    |  19 +++
 12 files changed, 94 insertions(+), 631 deletions(-)
 delete mode 100644 backends/neuron/Cargo.toml
 delete mode 100644 backends/neuron/server/README.md
 delete mode 100644 backends/neuron/server/text_embeddings_server/__init__.py
 delete mode 100644 backends/neuron/server/text_embeddings_server/cli.py
 delete mode 100644 backends/neuron/server/text_embeddings_server/models/__init__.py
 delete mode 100644 backends/neuron/server/text_embeddings_server/server.py
 delete mode 100644 backends/neuron/src/lib.rs
 delete mode 100644 backends/neuron/src/logging.rs
 delete mode 100644 backends/neuron/src/management.rs
 create mode 100644 backends/python/server/text_embeddings_server/models/neuron_models.py

diff --git a/backends/neuron/Cargo.toml b/backends/neuron/Cargo.toml
deleted file mode 100644
index b38f350ed..000000000
--- a/backends/neuron/Cargo.toml
+++ /dev/null
@@ -1,16 +0,0 @@
-[package]
-name = "text-embeddings-backend-python"
-version.workspace = true
-edition.workspace = true
-authors.workspace = true
-homepage.workspace = true
-
-[dependencies]
-backend-grpc-client = { path = "../grpc-client" }
-nohash-hasher = "^0.2"
-serde = { version = "^1.0", features = ["derive"]  }
-serde_json = "^1.0"
-text-embeddings-backend-core = { path = "../core" }
-thiserror = "^1.0"
-tokio = { version = "^1.25", features = ["sync"] }
-tracing = "^0.1"
diff --git a/backends/neuron/server/README.md b/backends/neuron/server/README.md
deleted file mode 100644
index e69de29bb..000000000
diff --git a/backends/neuron/server/text_embeddings_server/__init__.py b/backends/neuron/server/text_embeddings_server/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/backends/neuron/server/text_embeddings_server/cli.py b/backends/neuron/server/text_embeddings_server/cli.py
deleted file mode 100644
index c4dfaa4c1..000000000
--- a/backends/neuron/server/text_embeddings_server/cli.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import sys
-import typer
-
-from pathlib import Path
-from loguru import logger
-from typing import Optional
-from enum import Enum
-
-app = typer.Typer()
-
-
-class Dtype(str, Enum):
-    float32 = "float32"
-    float16 = "float16"
-    bloat16 = "bfloat16"
-
-
-@app.command()
-def serve(
-    model_path: Path,
-    dtype: Dtype = "float32",
-    uds_path: Path = "/tmp/text-embeddings-server",
-    logger_level: str = "INFO",
-    json_output: bool = False,
-    otlp_endpoint: Optional[str] = None,
-    otlp_service_name: str = "text-embeddings-inference.server",
-    pool: str = "cls",
-):
-    # Remove default handler
-    logger.remove()
-    logger.add(
-        sys.stdout,
-        format="{message}",
-        filter="text_embeddings_server",
-        level=logger_level,
-        serialize=json_output,
-        backtrace=True,
-        diagnose=False,
-    )
-
-    # Import here after the logger is added to log potential import exceptions
-    from text_embeddings_server import server
-    from text_embeddings_server.utils.tracing import setup_tracing
-
-    # Setup OpenTelemetry distributed tracing
-    if otlp_endpoint is not None:
-        setup_tracing(otlp_endpoint=otlp_endpoint, otlp_service_name=otlp_service_name)
-
-    # Downgrade enum into str for easier management later on
-    dtype = None if dtype is None else dtype.value
-    server.serve(model_path, dtype, uds_path, pool)
-
-
-if __name__ == "__main__":
-    app()
diff --git a/backends/neuron/server/text_embeddings_server/models/__init__.py b/backends/neuron/server/text_embeddings_server/models/__init__.py
deleted file mode 100644
index 06c39832c..000000000
--- a/backends/neuron/server/text_embeddings_server/models/__init__.py
+++ /dev/null
@@ -1,126 +0,0 @@
-import os
-import torch
-
-from loguru import logger
-from pathlib import Path
-from typing import Optional
-from transformers import AutoConfig
-from transformers.models.bert import BertConfig
-
-from text_embeddings_server.models.model import Model
-from text_embeddings_server.models.masked_model import MaskedLanguageModel
-from text_embeddings_server.models.default_model import DefaultModel
-from text_embeddings_server.models.classification_model import ClassificationModel
-from text_embeddings_server.models.jinaBert_model import FlashJinaBert
-from text_embeddings_server.models.flash_mistral import FlashMistral
-from text_embeddings_server.models.flash_qwen3 import FlashQwen3
-from text_embeddings_server.utils.device import get_device, use_ipex
-
-__all__ = ["Model"]
-
-TRUST_REMOTE_CODE = os.getenv("TRUST_REMOTE_CODE", "false").lower() in ["true", "1"]
-DISABLE_TENSOR_CACHE = os.getenv("DISABLE_TENSOR_CACHE", "false").lower() in [
-    "true",
-    "1",
-]
-# Disable gradients
-torch.set_grad_enabled(False)
-
-FLASH_ATTENTION = True
-try:
-    from text_embeddings_server.models.flash_bert import FlashBert
-except ImportError as e:
-    logger.warning(f"Could not import Flash Attention enabled models: {e}")
-    FLASH_ATTENTION = False
-
-if FLASH_ATTENTION:
-    __all__.append(FlashBert)
-
-
-def create_model(model_class, model_path, device, datatype, pool="cls"):
-    """Create a model instance and load it into Neuron devices."""
-    model_handle = model_class(
-        model_path,
-        device,
-        datatype,
-        pool,
-        trust_remote=TRUST_REMOTE_CODE,
-    )
-    return model_handle
-
-
-def get_model(model_path: Path, dtype: Optional[str], pool: str):
-    if dtype == "float32":
-        datatype = torch.float32
-    elif dtype == "float16":
-        datatype = torch.float16
-    elif dtype == "bfloat16":
-        datatype = torch.bfloat16
-    else:
-        raise RuntimeError(f"Unknown dtype {dtype}")
-
-    device = get_device()
-    logger.info(f"backend device: {device}")
-
-    config = AutoConfig.from_pretrained(model_path, trust_remote_code=TRUST_REMOTE_CODE)
-
-    if (
-        hasattr(config, "auto_map")
-        and isinstance(config.auto_map, dict)
-        and "AutoModel" in config.auto_map
-        and config.auto_map["AutoModel"]
-        == "jinaai/jina-bert-v2-qk-post-norm--modeling_bert.JinaBertModel"
-    ):
-        # Add specific offline modeling for model "jinaai/jina-embeddings-v2-base-code" which uses "autoMap" to reference code in other repository
-        return create_model(FlashJinaBert, model_path, device, datatype)
-
-    if config.model_type == "bert":
-        config: BertConfig
-        if (
-            use_ipex()
-            or device.type in ["cuda", "hpu"]
-            and config.position_embedding_type == "absolute"
-            and datatype in [torch.float16, torch.bfloat16]
-            and FLASH_ATTENTION
-        ):
-            if pool != "cls":
-                if config.architectures[0].endswith("ForMaskedLM") and pool == "splade":
-                    return create_model(
-                        MaskedLanguageModel, model_path, device, datatype, pool
-                    )
-                return create_model(DefaultModel, model_path, device, datatype, pool)
-
-            try:
-                return create_model(FlashBert, model_path, device, datatype)
-            except FileNotFoundError:
-                logger.info(
-                    "Do not have safetensors file for this model, use default transformers model path instead"
-                )
-                return create_model(DefaultModel, model_path, device, datatype, pool)
-
-        if config.architectures[0].endswith("Classification"):
-            return create_model(ClassificationModel, model_path, device, datatype)
-        elif config.architectures[0].endswith("ForMaskedLM") and pool == "splade":
-            return create_model(MaskedLanguageModel, model_path, device, datatype)
-        else:
-            return create_model(DefaultModel, model_path, device, datatype, pool)
-
-    if config.model_type == "mistral" and device.type == "hpu":
-        try:
-            return create_model(FlashMistral, model_path, device, datatype, pool)
-        except FileNotFoundError:
-            return create_model(DefaultModel, model_path, device, datatype, pool)
-
-    if config.model_type == "qwen3" and device.type == "hpu":
-        try:
-            return create_model(FlashQwen3, model_path, device, datatype, pool)
-        except FileNotFoundError:
-            return create_model(DefaultModel, model_path, device, datatype, pool)
-
-    # Default case
-    if config.architectures[0].endswith("Classification"):
-        return create_model(ClassificationModel, model_path, device, datatype)
-    elif config.architectures[0].endswith("ForMaskedLM") and pool == "splade":
-        return create_model(MaskedLanguageModel, model_path, device, datatype)
-    else:
-        return create_model(DefaultModel, model_path, device, datatype, pool)
diff --git a/backends/neuron/server/text_embeddings_server/server.py b/backends/neuron/server/text_embeddings_server/server.py
deleted file mode 100644
index 646d79bc9..000000000
--- a/backends/neuron/server/text_embeddings_server/server.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import asyncio
-import torch
-from grpc import aio
-from loguru import logger
-
-from grpc_reflection.v1alpha import reflection
-from pathlib import Path
-from typing import Optional
-
-from text_embeddings_server.models import Model, get_model
-from text_embeddings_server.pb import embed_pb2_grpc, embed_pb2
-from text_embeddings_server.utils.tracing import UDSOpenTelemetryAioServerInterceptor
-from text_embeddings_server.utils.interceptor import ExceptionInterceptor
-
-
-class EmbeddingService(embed_pb2_grpc.EmbeddingServiceServicer):
-    def __init__(self, model: Model):
-        self.model = model
-        # Force inference mode for the lifetime of EmbeddingService
-        self._inference_mode_raii_guard = torch._C._InferenceMode(True)
-
-    async def Health(self, request, context):
-        if self.model.device.type == "cuda":
-            torch.zeros((2, 2), device="cuda")
-        return embed_pb2.HealthResponse()
-
-    async def Embed(self, request, context):
-        max_input_length = self.model.max_input_length
-        batch = self.model.batch_type.from_pb(
-            request, self.model.device, max_input_length
-        )
-
-        embeddings = self.model.embed(batch)
-
-        return embed_pb2.EmbedResponse(embeddings=embeddings)
-
-    async def Predict(self, request, context):
-        max_input_length = self.model.max_input_length
-        batch = self.model.batch_type.from_pb(
-            request, self.model.device, max_input_length
-        )
-
-        scores = self.model.predict(batch)
-
-        return embed_pb2.PredictResponse(scores=scores)
-
-
-def serve(
-    model_path: Path,
-    dtype: Optional[str],
-    uds_path: Path,
-    pool: str,
-):
-    async def serve_inner(
-        model_path: Path,
-        dtype: Optional[str] = None,
-    ):
-        unix_socket = f"unix://{uds_path}"
-
-        try:
-            model = get_model(model_path, dtype, pool)
-        except Exception:
-            logger.exception("Error when initializing model")
-            raise
-
-        server = aio.server(
-            interceptors=[
-                ExceptionInterceptor(),
-                UDSOpenTelemetryAioServerInterceptor(),
-            ]
-        )
-        embed_pb2_grpc.add_EmbeddingServiceServicer_to_server(
-            EmbeddingService(model), server
-        )
-        SERVICE_NAMES = (
-            embed_pb2.DESCRIPTOR.services_by_name["EmbeddingService"].full_name,
-            reflection.SERVICE_NAME,
-        )
-        reflection.enable_server_reflection(SERVICE_NAMES, server)
-        server.add_insecure_port(unix_socket)
-
-        await server.start()
-
-        logger.info(f"Server started at {unix_socket}")
-
-        try:
-            await server.wait_for_termination()
-        except KeyboardInterrupt:
-            logger.info("Signal received. Shutting down")
-            await server.stop(0)
-
-    asyncio.run(serve_inner(model_path, dtype))
diff --git a/backends/neuron/src/lib.rs b/backends/neuron/src/lib.rs
deleted file mode 100644
index 53255b07d..000000000
--- a/backends/neuron/src/lib.rs
+++ /dev/null
@@ -1,132 +0,0 @@
-mod logging;
-mod management;
-
-use backend_grpc_client::Client;
-use nohash_hasher::BuildNoHashHasher;
-use std::collections::HashMap;
-use text_embeddings_backend_core::{
-    Backend, BackendError, Batch, Embedding, Embeddings, ModelType, Pool, Predictions,
-};
-use tokio::runtime::Runtime;
-
-pub struct PythonBackend {
-    _backend_process: management::BackendProcess,
-    tokio_runtime: Runtime,
-    backend_client: Client,
-}
-
-impl PythonBackend {
-    pub fn new(
-        model_path: String,
-        dtype: String,
-        model_type: ModelType,
-        uds_path: String,
-        otlp_endpoint: Option<String>,
-        otlp_service_name: String,
-    ) -> Result<Self, BackendError> {
-        let pool = match model_type {
-            ModelType::Classifier => Pool::Cls,
-            ModelType::Embedding(pool) => pool,
-        };
-
-        let backend_process = management::BackendProcess::new(
-            model_path,
-            dtype,
-            &uds_path,
-            otlp_endpoint,
-            otlp_service_name,
-            pool,
-        )?;
-        let tokio_runtime = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .map_err(|err| BackendError::Start(format!("Could not start Tokio runtime: {err}")))?;
-
-        let backend_client = tokio_runtime
-            .block_on(Client::connect_uds(uds_path))
-            .map_err(|err| {
-                BackendError::Start(format!("Could not connect to backend process: {err}"))
-            })?;
-
-        Ok(Self {
-            _backend_process: backend_process,
-            tokio_runtime,
-            backend_client,
-        })
-    }
-}
-
-impl Backend for PythonBackend {
-    fn health(&self) -> Result<(), BackendError> {
-        if self
-            .tokio_runtime
-            .block_on(self.backend_client.clone().health())
-            .is_err()
-        {
-            return Err(BackendError::Unhealthy);
-        }
-        Ok(())
-    }
-
-    fn is_padded(&self) -> bool {
-        false
-    }
-
-    fn embed(&self, batch: Batch) -> Result<Embeddings, BackendError> {
-        if !batch.raw_indices.is_empty() {
-            return Err(BackendError::Inference(
-                "raw embeddings are not supported for the Python backend.".to_string(),
-            ));
-        }
-        let batch_size = batch.len();
-
-        let results = self
-            .tokio_runtime
-            .block_on(self.backend_client.clone().embed(
-                batch.input_ids,
-                batch.token_type_ids,
-                batch.position_ids,
-                batch.cumulative_seq_lengths,
-                batch.max_length,
-            ))
-            .map_err(|err| BackendError::Inference(err.to_string()))?;
-        let pooled_embeddings: Vec<Vec<f32>> = results.into_iter().map(|r| r.values).collect();
-
-        let mut embeddings =
-            HashMap::with_capacity_and_hasher(batch_size, BuildNoHashHasher::default());
-        for (i, e) in pooled_embeddings.into_iter().enumerate() {
-            embeddings.insert(i, Embedding::Pooled(e));
-        }
-
-        Ok(embeddings)
-    }
-
-    fn predict(&self, batch: Batch) -> Result<Predictions, BackendError> {
-        if !batch.raw_indices.is_empty() {
-            return Err(BackendError::Inference(
-                "raw embeddings are not supported for the Python backend.".to_string(),
-            ));
-        }
-        let batch_size = batch.len();
-        let results = self
-            .tokio_runtime
-            .block_on(self.backend_client.clone().predict(
-                batch.input_ids,
-                batch.token_type_ids,
-                batch.position_ids,
-                batch.cumulative_seq_lengths,
-                batch.max_length,
-            ))
-            .map_err(|err| BackendError::Inference(err.to_string()))?;
-        let raw_results: Vec<Vec<f32>> = results.into_iter().map(|r| r.values).collect();
-
-        let mut predictions =
-            HashMap::with_capacity_and_hasher(batch_size, BuildNoHashHasher::default());
-
-        for (i, r) in raw_results.into_iter().enumerate() {
-            predictions.insert(i, r);
-        }
-
-        Ok(predictions)
-    }
-}
diff --git a/backends/neuron/src/logging.rs b/backends/neuron/src/logging.rs
deleted file mode 100644
index 8f55e8e6b..000000000
--- a/backends/neuron/src/logging.rs
+++ /dev/null
@@ -1,61 +0,0 @@
-use serde::Deserialize;
-use std::io::{BufRead, Lines};
-
-#[derive(Deserialize)]
-#[serde(rename_all = "UPPERCASE")]
-enum PythonLogLevelEnum {
-    Trace,
-    Debug,
-    Info,
-    Success,
-    Warning,
-    Error,
-    Critical,
-}
-
-#[derive(Deserialize)]
-struct PythonLogLevel {
-    name: PythonLogLevelEnum,
-}
-
-#[derive(Deserialize)]
-struct PythonLogRecord {
-    level: PythonLogLevel,
-}
-
-#[derive(Deserialize)]
-struct PythonLogMessage {
-    text: String,
-    record: PythonLogRecord,
-}
-
-impl PythonLogMessage {
-    fn trace(&self) {
-        match self.record.level.name {
-            PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text),
-            PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text),
-            PythonLogLevelEnum::Info => tracing::info!("{}", self.text),
-            PythonLogLevelEnum::Success => tracing::info!("{}", self.text),
-            PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text),
-            PythonLogLevelEnum::Error => tracing::error!("{}", self.text),
-            PythonLogLevelEnum::Critical => tracing::error!("{}", self.text),
-        }
-    }
-}
-
-impl TryFrom<&String> for PythonLogMessage {
-    type Error = serde_json::Error;
-
-    fn try_from(value: &String) -> Result<Self, Self::Error> {
-        serde_json::from_str::<Self>(value)
-    }
-}
-
-pub(crate) fn log_lines<S: Sized + BufRead>(lines: Lines<S>) {
-    for line in lines.map_while(Result::ok) {
-        match PythonLogMessage::try_from(&line) {
-            Ok(log) => log.trace(),
-            Err(_) => tracing::debug!("{line}"),
-        }
-    }
-}
diff --git a/backends/neuron/src/management.rs b/backends/neuron/src/management.rs
deleted file mode 100644
index 81c294a92..000000000
--- a/backends/neuron/src/management.rs
+++ /dev/null
@@ -1,148 +0,0 @@
-use crate::logging::log_lines;
-use std::ffi::OsString;
-use std::io::{BufRead, BufReader};
-use std::os::unix::process::{CommandExt, ExitStatusExt};
-use std::path::Path;
-use std::process::{Child, Command, Stdio};
-use std::sync::mpsc;
-use std::thread::sleep;
-use std::time::{Duration, Instant};
-use std::{env, fs, io, thread};
-use text_embeddings_backend_core::{BackendError, Pool};
-
-#[derive(Debug)]
-pub(crate) struct BackendProcess {
-    inner: Child,
-}
-
-impl BackendProcess {
-    pub(crate) fn new(
-        model_path: String,
-        dtype: String,
-        uds_path: &str,
-        otlp_endpoint: Option<String>,
-        otlp_service_name: String,
-        pool: Pool,
-    ) -> Result<Self, BackendError> {
-        // Get UDS path
-        let uds = Path::new(uds_path);
-
-        // Clean previous runs
-        if uds.exists() {
-            fs::remove_file(uds).expect("could not remove UDS file");
-        }
-
-        let pool = match pool {
-            Pool::Cls => "cls",
-            Pool::Mean => "mean",
-            Pool::LastToken => "lasttoken",
-            Pool::Splade => "splade",
-        };
-
-        // Process args
-        let mut python_server_args = vec![
-            model_path,
-            "--dtype".to_owned(),
-            dtype,
-            "--uds-path".to_owned(),
-            uds_path.to_owned(),
-            "--logger-level".to_owned(),
-            "INFO".to_owned(),
-            "--json-output".to_owned(),
-            "--pool".to_owned(),
-            pool.to_owned(),
-        ];
-
-        // OpenTelemetry
-        if let Some(otlp_endpoint) = otlp_endpoint {
-            python_server_args.push("--otlp-endpoint".to_owned());
-            python_server_args.push(otlp_endpoint);
-        }
-
-        python_server_args.push("--otlp-service-name".to_owned());
-        python_server_args.push(otlp_service_name);
-
-        // Copy current process env
-        let envs: Vec<(OsString, OsString)> = env::vars_os().collect();
-
-        tracing::info!("Starting Python backend");
-        let mut p = match Command::new("python-text-embeddings-server")
-            .args(python_server_args)
-            .envs(envs)
-            .stdout(Stdio::piped())
-            .stderr(Stdio::piped())
-            .process_group(0)
-            .spawn()
-        {
-            Ok(p) => p,
-            Err(err) => {
-                if err.kind() == io::ErrorKind::NotFound {
-                    return Err(BackendError::Start(
-                        "python-text-embeddings-server not found in PATH".to_owned(),
-                    ));
-                }
-                return Err(BackendError::Start(err.to_string()));
-            }
-        };
-
-        let stdout_reader = BufReader::new(p.stdout.take().unwrap());
-        let stderr_reader = BufReader::new(p.stderr.take().unwrap());
-
-        //stdout tracing thread
-        thread::spawn(move || {
-            let _span = tracing::span!(tracing::Level::INFO, "python-backend").entered();
-            log_lines(stdout_reader.lines());
-        });
-
-        let start_time = Instant::now();
-        let mut wait_time = Instant::now();
-
-        loop {
-            // Process exited
-            if let Some(exit_status) = p.try_wait().unwrap() {
-                // We read stderr in another thread as it seems that lines() can block in some cases
-                let (err_sender, err_receiver) = mpsc::channel();
-                thread::spawn(move || {
-                    for line in stderr_reader.lines().map_while(Result::ok) {
-                        err_sender.send(line).unwrap_or(());
-                    }
-                });
-                let mut err = String::new();
-                while let Ok(line) = err_receiver.recv_timeout(Duration::from_millis(10)) {
-                    err = err + "\n" + &line;
-                }
-
-                tracing::debug!("Python Backend complete standard error output:\n{err}");
-
-                if let Some(signal) = exit_status.signal() {
-                    return Err(BackendError::Start(format!(
-                        "Python Backend process was signaled to shutdown with signal {signal}"
-                    )));
-                }
-                return Err(BackendError::Start(
-                    "Python backend failed to start".to_string(),
-                ));
-            }
-
-            // Shard is ready
-            if uds.exists() {
-                tracing::info!("Python backend ready in {:?}", start_time.elapsed());
-                break;
-            } else if wait_time.elapsed() > Duration::from_secs(10) {
-                tracing::info!("Waiting for Python backend to be ready...");
-                wait_time = Instant::now();
-            }
-            sleep(Duration::from_millis(5));
-        }
-
-        Ok(Self { inner: p })
-    }
-}
-
-impl Drop for BackendProcess {
-    fn drop(&mut self) {
-        self.inner.kill().unwrap();
-        let _ = self.inner.wait();
-        tracing::info!("Python backend process terminated");
-    }
-}
diff --git a/backends/python/server/text_embeddings_server/models/__init__.py b/backends/python/server/text_embeddings_server/models/__init__.py
index 1e919f233..8fb4076c0 100644
--- a/backends/python/server/text_embeddings_server/models/__init__.py
+++ b/backends/python/server/text_embeddings_server/models/__init__.py
@@ -14,7 +14,9 @@
 from text_embeddings_server.models.jinaBert_model import FlashJinaBert
 from text_embeddings_server.models.flash_mistral import FlashMistral
 from text_embeddings_server.models.flash_qwen3 import FlashQwen3
-from text_embeddings_server.utils.device import get_device, use_ipex
+from text_embeddings_server.models.neuron_models import NeuronSentenceTransformers
+
+from text_embeddings_server.utils.device import get_device, use_ipex, is_neuron
 
 __all__ = ["Model"]
 
@@ -74,6 +76,11 @@ def get_model(model_path: Path, dtype: Optional[str], pool: str):
     logger.info(f"backend device: {device}")
 
     config = AutoConfig.from_pretrained(model_path, trust_remote_code=TRUST_REMOTE_CODE)
+    
+    # Neuron: BERT-type checkpoints are served by the optimum-neuron sentence-transformers wrapper
+    if is_neuron():
+        if config.model_type == "bert":
+            return create_model(NeuronSentenceTransformers, model_path, device, datatype)
 
     if (
         hasattr(config, "auto_map")
diff --git a/backends/python/server/text_embeddings_server/models/neuron_models.py b/backends/python/server/text_embeddings_server/models/neuron_models.py
new file mode 100644
index 000000000..d795db071
--- /dev/null
+++ b/backends/python/server/text_embeddings_server/models/neuron_models.py
@@ -0,0 +1,67 @@
+import inspect
+import torch
+
+from pathlib import Path
+from typing import Type, List
+from optimum.neuron import NeuronModelForSentenceTransformers
+from opentelemetry import trace
+
+from text_embeddings_server.models import Model
+from text_embeddings_server.models.types import PaddedBatch, Embedding, Score
+
+tracer = trace.get_tracer(__name__)
+
+
+class NeuronSentenceTransformers(Model):
+    """Embedding model loaded with optimum-neuron's `NeuronModelForSentenceTransformers`."""
+
+    def __init__(
+        self,
+        model_path: Path,
+        device: torch.device,
+        dtype: torch.dtype,
+        pool: str = "cls",
+        trust_remote: bool = False,
+    ):
+        # NOTE(review): `pool`/`trust_remote` are unused; accepted assuming `create_model` passes them.
+        model = NeuronModelForSentenceTransformers.from_pretrained(model_path)
+
+        self.hidden_size = model.config.hidden_size
+        position_offset = 0
+        if model.config.model_type in ["xlm-roberta", "camembert", "roberta"]:
+            position_offset = model.config.pad_token_id + 1
+        if hasattr(model.config, "max_seq_length"):
+            self.max_input_length = model.config.max_seq_length
+        else:
+            self.max_input_length = model.config.max_position_embeddings - position_offset
+
+        forward_params = inspect.signature(model.forward).parameters
+        self.has_position_ids = "position_ids" in forward_params
+        self.has_token_type_ids = "token_type_ids" in forward_params
+
+        super().__init__(model=model, dtype=dtype, device=device)
+
+    @property
+    def batch_type(self) -> Type[PaddedBatch]:
+        return PaddedBatch
+
+    def _forward_kwargs(self, batch: PaddedBatch) -> dict:
+        """Build the keyword arguments for `self.model`, adding optional inputs it declares."""
+        kwargs = {"input_ids": batch.input_ids, "attention_mask": batch.attention_mask}
+        if self.has_token_type_ids:
+            kwargs["token_type_ids"] = batch.token_type_ids
+        if self.has_position_ids:
+            kwargs["position_ids"] = batch.position_ids
+        return kwargs
+
+    @tracer.start_as_current_span("embed")
+    def embed(self, batch: PaddedBatch) -> List[Embedding]:
+        # NOTE(review): assumes the output exposes a (batch, hidden) `sentence_embedding` - confirm.
+        output = self.model(**self._forward_kwargs(batch))
+        return [Embedding(values=row) for row in output.sentence_embedding.tolist()]
+
+    @tracer.start_as_current_span("predict")
+    def predict(self, batch: PaddedBatch) -> List[Score]:
+        # NOTE(review): assumes the output exposes `logits` - confirm for sentence-transformers.
+        output = self.model(**self._forward_kwargs(batch), return_dict=True)
+        return [Score(values=scores) for scores in output.logits.tolist()]
diff --git a/backends/python/server/text_embeddings_server/utils/device.py b/backends/python/server/text_embeddings_server/utils/device.py
index 3f3b04dd7..46b81370f 100644
--- a/backends/python/server/text_embeddings_server/utils/device.py
+++ b/backends/python/server/text_embeddings_server/utils/device.py
@@ -1,4 +1,6 @@
 import os
+import re
+import functools
 from loguru import logger
 import importlib.metadata
 import importlib.util
@@ -49,6 +51,21 @@ def is_hpu() -> bool:
         is_hpu_available = False
     return is_hpu_available
 
+@functools.cache
+def get_neuron_major() -> int:
+    """Return the major number of the `neuron` entry in /proc/devices, or -1 if there is none."""
+    devices_path = "/proc/devices"
+    if not os.path.exists(devices_path):
+        return -1
+    neuron_entry = re.compile(r"^\s*(\d+)\s+neuron\s*$")
+    with open(devices_path, "r") as devices:
+        for entry in devices:
+            if (found := neuron_entry.match(entry)) is not None:
+                return int(found.group(1))
+    return -1
+
+def is_neuron() -> bool:
+    return get_neuron_major() > -1  # call the probe; -1 is its "no neuron entry found" value
 
 def use_ipex() -> bool:
     value = os.environ.get("USE_IPEX", "True").lower()
@@ -72,5 +89,7 @@ def get_device():
 
         if hasattr(torch, "xpu") and torch.xpu.is_available():
             device = torch.device("xpu")
+    elif is_neuron():
+        device = torch.device("xla")
 
     return device

From dd0c08ddad7abe38caf76f720844e7438e42067a Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Mon, 27 Oct 2025 17:10:38 +0000
Subject: [PATCH 03/20] fix: neuron dockerfile

---
 Dockerfile-neuron                             | 187 ++++++++++++++++++
 Dockerfile.neuron                             |  43 ----
 backends/Cargo.toml                           |   1 -
 .../python/server/requirements-neuron.txt     |   1 +
 docs/source/en/ aws_neuron.md                 |  37 ++++
 docs/source/en/local_neuron.md                |   1 -
 6 files changed, 225 insertions(+), 45 deletions(-)
 create mode 100644 Dockerfile-neuron
 delete mode 100644 Dockerfile.neuron
 create mode 100644 backends/python/server/requirements-neuron.txt
 create mode 100644 docs/source/en/ aws_neuron.md
 delete mode 100644 docs/source/en/local_neuron.md

diff --git a/Dockerfile-neuron b/Dockerfile-neuron
new file mode 100644
index 000000000..52797d687
--- /dev/null
+++ b/Dockerfile-neuron
@@ -0,0 +1,187 @@
+ARG PLATFORM=neuron
+FROM lukemathwalker/cargo-chef:latest-rust-1.85-bookworm AS chef
+WORKDIR /usr/src
+
+ENV SCCACHE=0.10.0
+ENV RUSTC_WRAPPER=/usr/local/bin/sccache
+
+# Download and configure sccache
+RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \
+    chmod +x /usr/local/bin/sccache
+
+FROM chef AS planner
+
+COPY backends backends
+COPY core core
+COPY router router
+COPY Cargo.toml ./
+COPY Cargo.lock ./
+
+RUN cargo chef prepare  --recipe-path recipe.json
+
+FROM chef AS builder
+
+ARG GIT_SHA
+ARG DOCKER_LABEL
+
+# sccache specific variables
+ARG SCCACHE_GHA_ENABLED
+
+COPY --from=planner /usr/src/recipe.json recipe.json
+
+RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    cargo chef cook --release --features ort,candle,mkl,static-linking --no-default-features --recipe-path recipe.json && sccache -s
+
+COPY backends backends
+COPY core core
+COPY router router
+COPY Cargo.toml ./
+COPY Cargo.lock ./
+
+FROM builder AS http-builder
+
+RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    cargo build --release --bin text-embeddings-router --features ort,candle,mkl,static-linking,http --no-default-features && sccache -s
+
+FROM builder AS grpc-builder
+
+RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
+    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
+    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
+    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
+    rm -f $PROTOC_ZIP
+
+COPY proto proto
+
+RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    cargo build --release --bin text-embeddings-router --features ort,candle,mkl,static-linking,grpc --no-default-features && sccache -s
+
+FROM public.ecr.aws/docker/library/ubuntu:22.04 AS neuron
+
+ENV HUGGINGFACE_HUB_CACHE=/data \
+    PORT=80
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    python3 \
+    python3-pip \
+    python3-dev \
+    build-essential \
+    git \
+    curl \
+    cmake \
+    pkg-config \
+    protobuf-compiler \
+    ninja-build \
+ && rm -rf /var/lib/apt/lists/*
+
+RUN ln -s /usr/bin/python3 /usr/local/bin/python || true
+RUN ln -s /usr/bin/pip3 /usr/local/bin/pip || true
+
+WORKDIR /usr/src
+COPY backends backends
+COPY backends/python/server/text_embeddings_server/models/__init__.py backends/python/server/text_embeddings_server/models/__init__.py
+COPY backends/python/server/pyproject.toml backends/python/server/pyproject.toml
+RUN cd backends/python/server && \
+    make install
+
+ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58
+ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008
+ARG NEURONX_TOOLS_VERSION=2.26.14.0
+
+ARG NEURONX_CC_VERSION=2.21.18209.0+043b1bf7
+ARG NEURONX_FRAMEWORK_VERSION=2.8.0.2.10.13553+1e4dd6ca
+ARG NEURONX_DISTRIBUTED_VERSION=0.15.22404+1f27bddf
+ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.6.10598+a59fdc00
+
+RUN apt-get update \
+ && apt-get upgrade -y \
+ && apt-get install -y --no-install-recommends \
+    apt-transport-https \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    emacs \
+    git \
+    gnupg2 \
+    gpg-agent \
+    jq \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libcap-dev \
+    libhwloc-dev \
+    openjdk-11-jdk \
+    unzip \
+    vim \
+    wget \
+    zlib1g-dev \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
+RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
+
+RUN apt-get update \
+ && apt-get install -y \
+   aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
+   aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
+   aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+RUN pip install --index-url https://pip.repos.neuron.amazonaws.com \
+    --extra-index-url https://pypi.org/simple \
+    --trusted-host pip.repos.neuron.amazonaws.com \
+    neuronx-cc==$NEURONX_CC_VERSION \
+    torch-neuronx==$NEURONX_FRAMEWORK_VERSION \
+    neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \
+ && rm -rf ~/.cache/pip/*
+
+# HF ARGS
+ARG TRANSFORMERS_VERSION=4.55.4
+ARG DIFFUSERS_VERSION=0.35.2
+ARG HUGGINGFACE_HUB_VERSION=0.36.0
+ARG OPTIMUM_NEURON_VERSION=0.4.1
+ARG SENTENCE_TRANSFORMERS=5.1.2
+ARG PEFT_VERSION=0.17.0
+ARG DATASETS_VERSION=4.1.1
+
+# Install Hugging Face libraries and their dependencies
+RUN pip install --no-cache-dir -U \
+	networkx==2.8.8 \
+	transformers[sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \
+    diffusers==${DIFFUSERS_VERSION} \
+    compel \
+    controlnet-aux \
+    huggingface_hub==${HUGGINGFACE_HUB_VERSION} \
+    hf_transfer \
+    datasets==${DATASETS_VERSION} \
+    optimum-neuron==${OPTIMUM_NEURON_VERSION} \
+ 	sentence_transformers==${SENTENCE_TRANSFORMERS} \
+	peft==${PEFT_VERSION} \
+ && rm -rf ~/.cache/pip/*
+
+
+FROM neuron AS grpc
+
+COPY --from=grpc-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router
+
+ENTRYPOINT ["text-embeddings-router"]
+CMD ["--json-output"]
+
+FROM neuron
+
+COPY --from=http-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router
+
+ENTRYPOINT ["text-embeddings-router"]
+CMD ["--json-output"]
+
+
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
deleted file mode 100644
index f8b03ab26..000000000
--- a/Dockerfile.neuron
+++ /dev/null
@@ -1,43 +0,0 @@
-ARG PLATFORM=neuron
-FROM lukemathwalker/cargo-chef:latest-rust-1.85-bookworm AS chef
-WORKDIR /usr/src
-
-ENV SCCACHE=0.10.0
-ENV RUSTC_WRAPPER=/usr/local/bin/sccache
-
-# Donwload, configure sccache
-RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \
-    chmod +x /usr/local/bin/sccache
-
-FROM chef AS planner
-
-COPY backends backends
-COPY core core
-COPY router router
-COPY Cargo.toml ./
-COPY Cargo.lock ./
-
-RUN cargo chef prepare  --recipe-path recipe.json
-
-FROM chef AS builder
-
-ARG GIT_SHA
-ARG DOCKER_LABEL
-
-# sccache specific variables
-ARG SCCACHE_GHA_ENABLED
-
-COPY --from=planner /usr/src/recipe.json recipe.json
-
-RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
-    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    cargo chef cook --release --features python --no-default-features --recipe-path recipe.json && sccache -s
-
-COPY backends backends
-COPY core core
-COPY router router
-COPY Cargo.toml ./
-COPY Cargo.lock ./
-
-WORKDIR /usr/src
-
diff --git a/backends/Cargo.toml b/backends/Cargo.toml
index 7d821ff40..bb9d74191 100644
--- a/backends/Cargo.toml
+++ b/backends/Cargo.toml
@@ -21,7 +21,6 @@ rand = { workspace = true }
 [features]
 clap = ["dep:clap", "text-embeddings-backend-core/clap"]
 python = ["dep:text-embeddings-backend-python"]
-neuron = ["dep:text-embeddings-backend-neuron"]
 ort = ["dep:text-embeddings-backend-ort"]
 candle = ["dep:text-embeddings-backend-candle"]
 cuda = ["text-embeddings-backend-candle?/cuda"]
diff --git a/backends/python/server/requirements-neuron.txt b/backends/python/server/requirements-neuron.txt
new file mode 100644
index 000000000..b8ce3518e
--- /dev/null
+++ b/backends/python/server/requirements-neuron.txt
@@ -0,0 +1 @@
+transformers==4.55.4
\ No newline at end of file
diff --git a/docs/source/en/ aws_neuron.md b/docs/source/en/ aws_neuron.md
new file mode 100644
index 000000000..13ea7f86e
--- /dev/null
+++ b/docs/source/en/ aws_neuron.md	
@@ -0,0 +1,37 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+# Using TEI Container with AWS Trainium and Inferentia Instances
+
+## Build Docker Image
+
+To build a container optimized for AWS Neuron devices, run the following command:
+
+```shell
+platform="neuron"
+
+docker build . -f Dockerfile-neuron -t tei_neuron
+```
+
+### Deploy Docker Container
+
+To deploy your model on an AWS Trainium or Inferentia instance, use the following command:
+
+```shell
+model='Qwen/Qwen3-Embedding-0.6B'
+volume=$PWD/data
+
+docker run -p 8080:80 -v $volume:/data tei_neuron --model-id $model
+```
\ No newline at end of file
diff --git a/docs/source/en/local_neuron.md b/docs/source/en/local_neuron.md
deleted file mode 100644
index e0a2cf2ba..000000000
--- a/docs/source/en/local_neuron.md
+++ /dev/null
@@ -1 +0,0 @@
-# Neuron backend for AWS Trainium and Inferentia
\ No newline at end of file

From 1e4f3c92d03c9193c62f9d7d20e476b0f2f11dda Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Tue, 28 Oct 2025 17:23:49 +0000
Subject: [PATCH 04/20] remove useless blank lines and requirements-neuron.txt

---
 Dockerfile-neuron                              | 2 --
 backends/python/server/requirements-neuron.txt | 1 -
 2 files changed, 3 deletions(-)
 delete mode 100644 backends/python/server/requirements-neuron.txt

diff --git a/Dockerfile-neuron b/Dockerfile-neuron
index 52797d687..a536ab7dd 100644
--- a/Dockerfile-neuron
+++ b/Dockerfile-neuron
@@ -183,5 +183,3 @@ COPY --from=http-builder /usr/src/target/release/text-embeddings-router /usr/loc
 
 ENTRYPOINT ["text-embeddings-router"]
 CMD ["--json-output"]
-
-
diff --git a/backends/python/server/requirements-neuron.txt b/backends/python/server/requirements-neuron.txt
deleted file mode 100644
index b8ce3518e..000000000
--- a/backends/python/server/requirements-neuron.txt
+++ /dev/null
@@ -1 +0,0 @@
-transformers==4.55.4
\ No newline at end of file

From a25cf98d6d98135258ad5ec18549ebdeed02f02a Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Fri, 31 Oct 2025 13:11:12 +0000
Subject: [PATCH 05/20] fix dockerfile

---
 Dockerfile-neuron | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/Dockerfile-neuron b/Dockerfile-neuron
index a536ab7dd..e09c64915 100644
--- a/Dockerfile-neuron
+++ b/Dockerfile-neuron
@@ -1,4 +1,3 @@
-ARG PLATFORM=neuron
 FROM lukemathwalker/cargo-chef:latest-rust-1.85-bookworm AS chef
 WORKDIR /usr/src
 
@@ -31,7 +30,7 @@ COPY --from=planner /usr/src/recipe.json recipe.json
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    cargo chef cook --release --features ort,candle,mkl,static-linking --no-default-features --recipe-path recipe.json && sccache -s
+    cargo chef cook --release --features python --no-default-features --recipe-path recipe.json && sccache -s
 
 COPY backends backends
 COPY core core
@@ -39,25 +38,25 @@ COPY router router
 COPY Cargo.toml ./
 COPY Cargo.lock ./
 
+RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
+    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
+    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
+    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
+    rm -f $PROTOC_ZIP
+
 FROM builder AS http-builder
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    cargo build --release --bin text-embeddings-router --features ort,candle,mkl,static-linking,http --no-default-features && sccache -s
+    cargo build --release --bin text-embeddings-router -F python -F http --no-default-features && sccache -s
 
 FROM builder AS grpc-builder
 
-RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
-    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
-    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
-    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
-    rm -f $PROTOC_ZIP
-
 COPY proto proto
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    cargo build --release --bin text-embeddings-router --features ort,candle,mkl,static-linking,grpc --no-default-features && sccache -s
+    cargo build --release --bin text-embeddings-router -F grpc -F python --no-default-features && sccache -s
 
 FROM public.ecr.aws/docker/library/ubuntu:22.04 AS neuron
 

From 56c15d896b15341dd3656a04042bb790102d0205 Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Mon, 3 Nov 2025 10:53:09 +0000
Subject: [PATCH 06/20] neuron path

---
 Dockerfile-neuron | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/Dockerfile-neuron b/Dockerfile-neuron
index e09c64915..16005db2d 100644
--- a/Dockerfile-neuron
+++ b/Dockerfile-neuron
@@ -136,6 +136,9 @@ RUN apt-get update \
  && rm -rf /tmp/tmp* \
  && apt-get clean
 
+ENV PATH="/opt/aws/neuron/bin:${PATH}"
+ENV NEURON_RT_VISIBLE_CORES=ALL
+
 RUN pip install --index-url https://pip.repos.neuron.amazonaws.com \
     --extra-index-url https://pypi.org/simple \
     --trusted-host pip.repos.neuron.amazonaws.com \
@@ -145,7 +148,7 @@ RUN pip install --index-url https://pip.repos.neuron.amazonaws.com \
  && rm -rf ~/.cache/pip/*
 
 # HF ARGS
-ARG TRANSFORMERS_VERSION=4.55.4
+ARG TRANSFORMERS_VERSION=4.57.1
 ARG DIFFUSERS_VERSION=0.35.2
 ARG HUGGINGFACE_HUB_VERSION=0.36.0
 ARG OPTIMUM_NEURON_VERSION=0.4.1
@@ -154,6 +157,7 @@ ARG PEFT_VERSION=0.17.0
 ARG DATASETS_VERSION=4.1.1
 
 # install Hugging Face libraries and its dependencies
+# optimum-neuron==${OPTIMUM_NEURON_VERSION} \
 RUN pip install --no-cache-dir -U \
 	networkx==2.8.8 \
 	transformers[sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \
@@ -163,7 +167,7 @@ RUN pip install --no-cache-dir -U \
     huggingface_hub==${HUGGINGFACE_HUB_VERSION} \
     hf_transfer \
     datasets==${DATASETS_VERSION} \
-    optimum-neuron==${OPTIMUM_NEURON_VERSION} \
+    "optimum-neuron @ git+https://github.com/huggingface/optimum-neuron@main" \
  	sentence_transformers==${SENTENCE_TRANSFORMERS} \
 	peft==${PEFT_VERSION} \
  && rm -rf ~/.cache/pip/*

From 142520a5e829d2ac5018e98af5a373612634ae3c Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Mon, 3 Nov 2025 16:38:18 +0000
Subject: [PATCH 07/20] fix container env + Neuron related changes

---
 Dockerfile-neuron                             |  7 +--
 .../text_embeddings_server/models/__init__.py |  4 +-
 .../models/neuron_models.py                   | 30 ++++++----
 backends/src/lib.rs                           | 60 ++++++++++++++++---
 docs/source/en/ aws_neuron.md                 |  6 +-
 5 files changed, 78 insertions(+), 29 deletions(-)

diff --git a/Dockerfile-neuron b/Dockerfile-neuron
index 16005db2d..9f4b23740 100644
--- a/Dockerfile-neuron
+++ b/Dockerfile-neuron
@@ -90,10 +90,9 @@ ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58
 ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008
 ARG NEURONX_TOOLS_VERSION=2.26.14.0
 
-ARG NEURONX_CC_VERSION=2.21.18209.0+043b1bf7
-ARG NEURONX_FRAMEWORK_VERSION=2.8.0.2.10.13553+1e4dd6ca
+ARG NEURONX_CC_VERSION=2.21.33363.0+82129205
+ARG NEURONX_FRAMEWORK_VERSION=2.8.0.2.10.16998+e9bf8a50
 ARG NEURONX_DISTRIBUTED_VERSION=0.15.22404+1f27bddf
-ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.6.10598+a59fdc00
 
 RUN apt-get update \
  && apt-get upgrade -y \
@@ -137,13 +136,13 @@ RUN apt-get update \
  && apt-get clean
 
 ENV PATH="/opt/aws/neuron/bin:${PATH}"
-ENV NEURON_RT_VISIBLE_CORES=ALL
 
 RUN pip install --index-url https://pip.repos.neuron.amazonaws.com \
     --extra-index-url https://pypi.org/simple \
     --trusted-host pip.repos.neuron.amazonaws.com \
     neuronx-cc==$NEURONX_CC_VERSION \
     torch-neuronx==$NEURONX_FRAMEWORK_VERSION \
+    torchvision \
     neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \
  && rm -rf ~/.cache/pip/*
 
diff --git a/backends/python/server/text_embeddings_server/models/__init__.py b/backends/python/server/text_embeddings_server/models/__init__.py
index 8fb4076c0..0ca8b584c 100644
--- a/backends/python/server/text_embeddings_server/models/__init__.py
+++ b/backends/python/server/text_embeddings_server/models/__init__.py
@@ -14,7 +14,7 @@
 from text_embeddings_server.models.jinaBert_model import FlashJinaBert
 from text_embeddings_server.models.flash_mistral import FlashMistral
 from text_embeddings_server.models.flash_qwen3 import FlashQwen3
-from text_embeddings_server.models.neuron_models import NeuronSentenceTransformers
+from text_embeddings_server.models.neuron_models import NeuronSentenceTransformersModel
 
 from text_embeddings_server.utils.device import get_device, use_ipex, is_neuron
 
@@ -80,7 +80,7 @@ def get_model(model_path: Path, dtype: Optional[str], pool: str):
     # Neuron cases
     if is_neuron():
         if config.model_type == "bert":
-            return create_model(NeuronSentenceTransformers, model_path)
+            return create_model(NeuronSentenceTransformersModel, model_path, device, datatype)
 
     if (
         hasattr(config, "auto_map")
diff --git a/backends/python/server/text_embeddings_server/models/neuron_models.py b/backends/python/server/text_embeddings_server/models/neuron_models.py
index d795db071..e3b850c3e 100644
--- a/backends/python/server/text_embeddings_server/models/neuron_models.py
+++ b/backends/python/server/text_embeddings_server/models/neuron_models.py
@@ -3,7 +3,7 @@
 
 from pathlib import Path
 from typing import Type, List
-from optimum.neuron import NeuronModelForSentenceTransformers
+from optimum.neuron import NeuronSentenceTransformers
 from opentelemetry import trace
 
 from text_embeddings_server.models import Model
@@ -12,14 +12,14 @@
 tracer = trace.get_tracer(__name__)
 
 
-class NeuronSentenceTransformers(Model):
+class NeuronSentenceTransformersModel(Model):
     def __init__(
         self,
         model_path: Path,
         device: torch.device,
         dtype: torch.dtype,
     ):
-        model = NeuronModelForSentenceTransformers.from_pretrained(model_path)
+        model = NeuronSentenceTransformers.from_pretrained(model_path)
 
         self.hidden_size = model.config.hidden_size
         position_offset = 0
@@ -42,7 +42,7 @@ def __init__(
             is not None
         )
 
-        super(NeuronSentenceTransformers, self).__init__(
+        super(NeuronSentenceTransformersModel, self).__init__(
             model=model, dtype=dtype, device=device
         )
 
@@ -52,16 +52,20 @@ def batch_type(self) -> Type[PaddedBatch]:
 
     @tracer.start_as_current_span("embed")
     def embed(self, batch: PaddedBatch) -> List[Embedding]:
-        pass
-
-    @tracer.start_as_current_span("predict")
-    def predict(self, batch: PaddedBatch) -> List[Score]:
         kwargs = {"input_ids": batch.input_ids, "attention_mask": batch.attention_mask}
         if self.has_token_type_ids:
             kwargs["token_type_ids"] = batch.token_type_ids
-        if self.has_position_ids:
-            kwargs["position_ids"] = batch.position_ids
+        output = self.model(**kwargs)
+
+        sentence_embedding = output["sentence_embedding"]
 
-        output = self.model(**kwargs, return_dict=True)
-        all_scores = output.logits.tolist()
-        return [Score(values=scores) for scores in all_scores]
+        return [
+            Embedding(
+                values=sentence_embedding[i * self.hidden_size : (i + 1) * self.hidden_size]
+            )
+            for i in range(len(batch))
+        ]
+
+    @tracer.start_as_current_span("predict")
+    def predict(self, batch: PaddedBatch) -> List[Score]:
+        pass
diff --git a/backends/src/lib.rs b/backends/src/lib.rs
index 245715b38..b53067de1 100644
--- a/backends/src/lib.rs
+++ b/backends/src/lib.rs
@@ -67,6 +67,15 @@ fn is_hpu() -> bool {
     }
 }
 
+fn is_neuron() -> bool {
+    match Command::new("neuron-ls")
+        .output()
+    {
+        Ok(output) => output.status.success(),
+        Err(_) => false,
+    }
+}
+
 #[derive(Debug, Clone)]
 pub struct Backend {
     /// Channel to communicate with the background thread
@@ -409,16 +418,39 @@ async fn init_backend(
     if let Some(api_repo) = api_repo.as_ref() {
         if cfg!(feature = "python") || cfg!(feature = "candle") {
             let start = std::time::Instant::now();
-            if download_safetensors(api_repo).await.is_err() {
-                tracing::warn!("safetensors weights not found. Using `pytorch_model.bin` instead. Model loading will be significantly slower.");
-                tracing::info!("Downloading `pytorch_model.bin`");
-                api_repo
-                    .get("pytorch_model.bin")
+            if is_neuron() {
+                tracing::info!("Downloading `model.neuron`");
+                let model_files = download_neuron(api_repo)
                     .await
                     .map_err(|err| BackendError::WeightsNotFound(err.to_string()))?;
-            }
 
-            tracing::info!("Model weights downloaded in {:?}", start.elapsed());
+                if model_files.is_empty() {
+                    tracing::error!(
+                        "Neuron model files not found in the repository. \
+                        You can easily compile your model to neuron format following the guide: \
+                        https://huggingface.co/docs/optimum-neuron/en/model_doc/sentence_transformers/overview "
+                    );
+                    return Err(BackendError::WeightsNotFound(
+                        "No Neuron model files found".into(),
+                    ));
+                }
+
+                tracing::info!("Neuron model downloaded in {:?}", start.elapsed());
+            } else {
+                if download_safetensors(api_repo).await.is_err() {
+                    tracing::warn!(
+                        "safetensors weights not found. Using `pytorch_model.bin` instead. \
+                        Model loading will be significantly slower."
+                    );
+                    tracing::info!("Downloading `pytorch_model.bin`");
+                    api_repo
+                        .get("pytorch_model.bin")
+                        .await
+                        .map_err(|err| BackendError::WeightsNotFound(err.to_string()))?;
+                }
+
+                tracing::info!("Model weights downloaded in {:?}", start.elapsed());
+            }
         }
     }
 
@@ -655,6 +687,20 @@ async fn download_onnx(api: &ApiRepo) -> Result<Vec<PathBuf>, ApiError> {
     Ok(model_files)
 }
 
+async fn download_neuron(api: &ApiRepo) -> Result<Vec<PathBuf>, ApiError> {
+    let mut model_files: Vec<PathBuf> = Vec::new();
+
+    tracing::info!("Downloading `model.neuron`");
+    match api.get("model.neuron").await {
+        Ok(p) => model_files.push(p),
+        Err(err) => {
+            tracing::warn!("Could not download `model.neuron`: {err}");
+        }
+    };
+
+    Ok(model_files)
+}
+
 #[cfg(feature = "candle")]
 #[derive(Debug, Clone, Deserialize, PartialEq)]
 enum ModuleType {
diff --git a/docs/source/en/ aws_neuron.md b/docs/source/en/ aws_neuron.md
index 13ea7f86e..d383fdba8 100644
--- a/docs/source/en/ aws_neuron.md	
+++ b/docs/source/en/ aws_neuron.md	
@@ -22,7 +22,7 @@ To build a container optimized for AWS Neuron devices, run the following command
 ```shell
 platform="neuron"
 
-docker build . -f Dockerfile-neuron -t tei_neuron
+docker build . -f Dockerfile-neuron -t tei-neuron:main
 ```
 
 ### Deploy Docker Container
@@ -30,8 +30,8 @@ docker build . -f Dockerfile-neuron -t tei_neuron
 To deploy your model on an AWS Trainium or Inferentia instance, use the following command:
 
 ```shell
-model='Qwen/Qwen3-Embedding-0.6B'
+model='optimum/bge-base-en-v1.5-neuronx'
 volume=$PWD/data
 
-docker run -p 8080:80 -v $volume:/data tei_neuron --model-id $model
+docker run -p 8080:80 -v $volume:/data tei-neuron:main --model-id $model --dtype float32
 ```
\ No newline at end of file

From 7ada87700b2d994b742333468b5c33a2b4db2cff Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Tue, 3 Feb 2026 17:09:05 +0000
Subject: [PATCH 08/20] fix for neuron backend + tests

---
 Dockerfile-neuron                             |  25 +-
 .../text_embeddings_server/models/__init__.py |  63 ++-
 .../models/neuron_models.py                   | 420 +++++++++++++++++-
 .../text_embeddings_server/utils/device.py    |   2 +-
 backends/src/lib.rs                           |  19 +-
 integration_tests/README.md                   | 114 ++++-
 integration_tests/neuron/conftest.py          | 299 +++++++++++++
 integration_tests/neuron/test_embed.py        | 223 ++++++++++
 8 files changed, 1109 insertions(+), 56 deletions(-)

diff --git a/Dockerfile-neuron b/Dockerfile-neuron
index 9f4b23740..dbf1e9a29 100644
--- a/Dockerfile-neuron
+++ b/Dockerfile-neuron
@@ -4,7 +4,7 @@ WORKDIR /usr/src
 ENV SCCACHE=0.10.0
 ENV RUSTC_WRAPPER=/usr/local/bin/sccache
 
-# Donwload, configure sccache
+# Download, configure sccache
 RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \
     chmod +x /usr/local/bin/sccache
 
@@ -63,6 +63,8 @@ FROM public.ecr.aws/docker/library/ubuntu:22.04 AS neuron
 ENV HUGGINGFACE_HUB_CACHE=/data \
     PORT=80
 
+ENV PATH="/usr/local/bin:/root/.local/bin:${PATH}"
+
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     python3 \
     python3-pip \
@@ -123,8 +125,9 @@ RUN apt-get update \
  && rm -rf /tmp/tmp* \
  && apt-get clean
 
-RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
-RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
+# Ubuntu 22.04 = jammy; use signed-by (apt-key is deprecated)
+RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | gpg --dearmor -o /usr/share/keyrings/neuron-archive-keyring.gpg && \
+    echo "deb [signed-by=/usr/share/keyrings/neuron-archive-keyring.gpg] https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list
 
 RUN apt-get update \
  && apt-get install -y \
@@ -147,7 +150,8 @@ RUN pip install --index-url https://pip.repos.neuron.amazonaws.com \
  && rm -rf ~/.cache/pip/*
 
 # HF ARGS
-ARG TRANSFORMERS_VERSION=4.57.1
+# Note: optimum-neuron 0.4.1 requires transformers~=4.55.4
+ARG TRANSFORMERS_VERSION=4.55.4
 ARG DIFFUSERS_VERSION=0.35.2
 ARG HUGGINGFACE_HUB_VERSION=0.36.0
 ARG OPTIMUM_NEURON_VERSION=0.4.1
@@ -155,20 +159,19 @@ ARG SENTENCE_TRANSFORMERS=5.1.2
 ARG PEFT_VERSION=0.17.0
 ARG DATASETS_VERSION=4.1.1
 
-# install Hugging Face libraries and its dependencies
-# optimum-neuron==${OPTIMUM_NEURON_VERSION} \
+# Install Hugging Face libraries and dependencies for TEI on Neuron
 RUN pip install --no-cache-dir -U \
-	networkx==2.8.8 \
-	transformers[sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \
+    networkx==2.8.8 \
+    transformers[sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \
     diffusers==${DIFFUSERS_VERSION} \
     compel \
     controlnet-aux \
     huggingface_hub==${HUGGINGFACE_HUB_VERSION} \
     hf_transfer \
     datasets==${DATASETS_VERSION} \
-    "optimum-neuron @ git+https://github.com/huggingface/optimum-neuron@main" \
- 	sentence_transformers==${SENTENCE_TRANSFORMERS} \
-	peft==${PEFT_VERSION} \
+    optimum-neuron==${OPTIMUM_NEURON_VERSION} \
+    sentence_transformers==${SENTENCE_TRANSFORMERS} \
+    peft==${PEFT_VERSION} \
  && rm -rf ~/.cache/pip/*
 
 
diff --git a/backends/python/server/text_embeddings_server/models/__init__.py b/backends/python/server/text_embeddings_server/models/__init__.py
index 0ca8b584c..1de5f9b1b 100644
--- a/backends/python/server/text_embeddings_server/models/__init__.py
+++ b/backends/python/server/text_embeddings_server/models/__init__.py
@@ -11,10 +11,6 @@
 from text_embeddings_server.models.masked_model import MaskedLanguageModel
 from text_embeddings_server.models.default_model import DefaultModel
 from text_embeddings_server.models.classification_model import ClassificationModel
-from text_embeddings_server.models.jinaBert_model import FlashJinaBert
-from text_embeddings_server.models.flash_mistral import FlashMistral
-from text_embeddings_server.models.flash_qwen3 import FlashQwen3
-from text_embeddings_server.models.neuron_models import NeuronSentenceTransformersModel
 
 from text_embeddings_server.utils.device import get_device, use_ipex, is_neuron
 
@@ -25,12 +21,21 @@
     "true",
     "1",
 ]
-# Disable gradients
-torch.set_grad_enabled(False)
 
+# Flash Attention models - only available when flash_attn is installed
 FLASH_ATTENTION = True
+FlashBert = None
+FlashJinaBert = None
+FlashMistral = None
+FlashQwen3 = None
+
 try:
     from text_embeddings_server.models.flash_bert import FlashBert
+    from text_embeddings_server.models.jinaBert_model import FlashJinaBert
+    from text_embeddings_server.models.flash_mistral import FlashMistral
+    from text_embeddings_server.models.flash_qwen3 import FlashQwen3
+    # Disable gradients (NOTE: only reached if all Flash Attention imports above succeed)
+    torch.set_grad_enabled(False)
 except ImportError as e:
     logger.warning(f"Could not import Flash Attention enabled models: {e}")
     FLASH_ATTENTION = False
@@ -38,6 +43,25 @@
 if FLASH_ATTENTION:
     __all__.append(FlashBert)
 
+# Neuron models - only import when on Neuron device to avoid unnecessary dependencies
+NeuronSentenceTransformersModel = None
+NeuronEmbeddingModel = None
+NeuronClassificationModel = None
+NeuronMaskedLMModel = None
+create_neuron_model = None
+
+if is_neuron():
+    try:
+        from text_embeddings_server.models.neuron_models import (
+            NeuronSentenceTransformersModel,
+            NeuronEmbeddingModel,
+            NeuronClassificationModel,
+            NeuronMaskedLMModel,
+            create_neuron_model,
+        )
+    except ImportError as e:
+        logger.warning(f"Could not import Neuron models: {e}")
+
 
 def wrap_model_if_hpu(model_handle, device):
     """Wrap the model in HPU graph if the device is HPU."""
@@ -76,14 +100,27 @@ def get_model(model_path: Path, dtype: Optional[str], pool: str):
     logger.info(f"backend device: {device}")
 
     config = AutoConfig.from_pretrained(model_path, trust_remote_code=TRUST_REMOTE_CODE)
-    
-    # Neuron cases
+
+    # Neuron cases - use optimum-neuron for all supported model types
     if is_neuron():
-        if config.model_type == "bert":
-            return create_model(NeuronSentenceTransformersModel, model_path, device, datatype)
+        logger.info(f"Neuron device detected, using optimum-neuron backend for model type: {config.model_type}")
+        try:
+            return create_neuron_model(
+                model_path=model_path,
+                device=device,
+                dtype=datatype,
+                pool=pool,
+                trust_remote=TRUST_REMOTE_CODE,
+                config=config,
+            )
+        except Exception as e:
+            logger.warning(f"Failed to load model with optimum-neuron: {e}")
+            logger.warning("Falling back to default model loading path")
+            # Fall through to default model loading
 
     if (
-        hasattr(config, "auto_map")
+        FlashJinaBert is not None
+        and hasattr(config, "auto_map")
         and isinstance(config.auto_map, dict)
         and "AutoModel" in config.auto_map
         and config.auto_map["AutoModel"]
@@ -123,13 +160,13 @@ def get_model(model_path: Path, dtype: Optional[str], pool: str):
         else:
             return create_model(DefaultModel, model_path, device, datatype, pool)
 
-    if config.model_type == "mistral" and device.type == "hpu":
+    if config.model_type == "mistral" and device.type == "hpu" and FlashMistral is not None:
         try:
             return create_model(FlashMistral, model_path, device, datatype, pool)
         except FileNotFoundError:
             return create_model(DefaultModel, model_path, device, datatype, pool)
 
-    if config.model_type == "qwen3" and device.type == "hpu":
+    if config.model_type == "qwen3" and device.type == "hpu" and FlashQwen3 is not None:
         try:
             return create_model(FlashQwen3, model_path, device, datatype, pool)
         except FileNotFoundError:
diff --git a/backends/python/server/text_embeddings_server/models/neuron_models.py b/backends/python/server/text_embeddings_server/models/neuron_models.py
index e3b850c3e..4589f6b77 100644
--- a/backends/python/server/text_embeddings_server/models/neuron_models.py
+++ b/backends/python/server/text_embeddings_server/models/neuron_models.py
@@ -1,71 +1,443 @@
 import inspect
+import os
 import torch
 
+from abc import ABC
 from pathlib import Path
-from typing import Type, List
-from optimum.neuron import NeuronSentenceTransformers
+from typing import Type, List, Optional
 from opentelemetry import trace
+from loguru import logger
 
-from text_embeddings_server.models import Model
+from text_embeddings_server.models.model import Model
 from text_embeddings_server.models.types import PaddedBatch, Embedding, Score
 
 tracer = trace.get_tracer(__name__)
 
+# Neuron compilation parameters from environment variables
+NEURON_BATCH_SIZE = int(os.getenv("NEURON_BATCH_SIZE", "1"))
+NEURON_SEQUENCE_LENGTH = int(os.getenv("NEURON_SEQUENCE_LENGTH", "512"))
+
+
+class NeuronBaseModel(Model, ABC):
+    """Base class for all Neuron models with common functionality."""
 
-class NeuronSentenceTransformersModel(Model):
     def __init__(
         self,
+        model,
         model_path: Path,
         device: torch.device,
         dtype: torch.dtype,
     ):
-        model = NeuronSentenceTransformers.from_pretrained(model_path)
-
         self.hidden_size = model.config.hidden_size
+
+        # Calculate max input length based on model type
         position_offset = 0
         model_type = model.config.model_type
         if model_type in ["xlm-roberta", "camembert", "roberta"]:
-            position_offset = model.config.pad_token_id + 1
+            position_offset = getattr(model.config, "pad_token_id", 1) + 1
+
         if hasattr(model.config, "max_seq_length"):
             self.max_input_length = model.config.max_seq_length
+        elif hasattr(model.config, "n_positions"):
+            self.max_input_length = model.config.n_positions
         else:
             self.max_input_length = (
                 model.config.max_position_embeddings - position_offset
             )
 
-        self.has_position_ids = (
-            inspect.signature(model.forward).parameters.get("position_ids", None)
-            is not None
-        )
-        self.has_token_type_ids = (
-            inspect.signature(model.forward).parameters.get("token_type_ids", None)
-            is not None
-        )
+        # Check which inputs the model supports
+        self.has_position_ids = self._check_param_exists(model, "position_ids")
+        self.has_token_type_ids = self._check_param_exists(model, "token_type_ids")
 
-        super(NeuronSentenceTransformersModel, self).__init__(
-            model=model, dtype=dtype, device=device
-        )
+        super().__init__(model=model, dtype=dtype, device=device)
+
+    @staticmethod
+    def _check_param_exists(model, param_name: str) -> bool:
+        """Check if a parameter exists in the model's forward signature."""
+        try:
+            forward_fn = model.forward if hasattr(model, 'forward') else model.__call__
+            return (
+                inspect.signature(forward_fn).parameters.get(param_name, None)
+                is not None
+            )
+        except (ValueError, TypeError):
+            return False
 
     @property
     def batch_type(self) -> Type[PaddedBatch]:
         return PaddedBatch
 
+    def _prepare_inputs(self, batch: PaddedBatch) -> dict:
+        """Prepare input kwargs for model forward pass.
+
+        Note: Neuron models require int64 (long) tensors for inputs.
+        """
+        kwargs = {
+            "input_ids": batch.input_ids.to(torch.long),
+            "attention_mask": batch.attention_mask.to(torch.long),
+        }
+        if self.has_token_type_ids:
+            kwargs["token_type_ids"] = batch.token_type_ids.to(torch.long)
+        if self.has_position_ids:
+            kwargs["position_ids"] = batch.position_ids.to(torch.long)
+        return kwargs
+
+
+class NeuronSentenceTransformersModel(NeuronBaseModel):
+    """
+    Neuron-optimized model for sentence-transformers.
+
+    Uses optimum.neuron.NeuronModelForSentenceTransformers which is designed
+    for sentence embedding models that output sentence_embedding directly.
+    """
+
+    def __init__(
+        self,
+        model_path: Path,
+        device: torch.device,
+        dtype: torch.dtype,
+        pool: str = "cls",
+        trust_remote: bool = False,
+    ):
+        try:
+            from optimum.neuron import NeuronModelForSentenceTransformers
+            is_compiled = self._is_neuron_compiled(model_path)
+            export_kwargs = {}
+            if not is_compiled:
+                export_kwargs = {
+                    "export": True,
+                    "batch_size": NEURON_BATCH_SIZE,
+                    "sequence_length": NEURON_SEQUENCE_LENGTH,
+                }
+                logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}")
+            model = NeuronModelForSentenceTransformers.from_pretrained(
+                model_path,
+                **export_kwargs,
+            )
+        except ImportError:
+            # Fallback to legacy import
+            from optimum.neuron import NeuronSentenceTransformers
+            model = NeuronSentenceTransformers.from_pretrained(model_path)
+
+        super().__init__(model, model_path, device, dtype)
+        self.pool = pool
+        logger.info(f"Loaded NeuronSentenceTransformersModel with pool={pool}")
+
+    @staticmethod
+    def _is_neuron_compiled(model_path: Path) -> bool:
+        """Check if the model is already compiled for Neuron."""
+        neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else []
+        return len(neuron_files) > 0
+
     @tracer.start_as_current_span("embed")
     def embed(self, batch: PaddedBatch) -> List[Embedding]:
-        kwargs = {"input_ids": batch.input_ids, "attention_mask": batch.attention_mask}
-        if self.has_token_type_ids:
-            kwargs["token_type_ids"] = batch.token_type_ids
+        """Embed the batch using the model's sentence_embedding, pooling token_embeddings only as a fallback."""
+        kwargs = self._prepare_inputs(batch)
         output = self.model(**kwargs)
 
-        sentence_embedding = output["sentence_embedding"]
+        sentence_embedding = None
+        # Use sentence_embedding when the output provides one; an all-zero tensor is treated as invalid
+        if hasattr(output, "sentence_embedding") and output.sentence_embedding is not None:
+            candidate = output.sentence_embedding
+            if candidate.abs().sum() > 0:
+                sentence_embedding = candidate
+
+        # If sentence_embedding is invalid, fall back to manual pooling of token_embeddings
+        if sentence_embedding is None:
+            if hasattr(output, "token_embeddings") and output.token_embeddings is not None:
+                token_embeddings = output.token_embeddings
+            else:
+                raise ValueError(f"Cannot extract embeddings from model output: {type(output)}")
+
+            # Pool inside the fallback branch only: token_embeddings is not bound on the direct path
+            if self.pool == "cls":
+                sentence_embedding = token_embeddings[:, 0, :]
+            elif self.pool == "mean":
+                attention_mask = kwargs["attention_mask"].unsqueeze(-1).float()
+                sentence_embedding = (token_embeddings * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)
+            elif self.pool == "last_token":
+                seq_lengths = kwargs["attention_mask"].sum(dim=1) - 1
+                sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths]
+            else:
+                raise ValueError(f"Invalid pooling mode: {self.pool}")
+
+        # Convert to list format expected by the gRPC interface
+        cpu_results = sentence_embedding.view(-1).tolist()
 
         return [
             Embedding(
-                values=sentence_embedding[i * self.hidden_size : (i + 1) * self.hidden_size]
+                values=cpu_results[i * self.hidden_size : (i + 1) * self.hidden_size]
             )
             for i in range(len(batch))
         ]
 
     @tracer.start_as_current_span("predict")
     def predict(self, batch: PaddedBatch) -> List[Score]:
-        pass
+        raise NotImplementedError("Prediction not supported for sentence transformer models")
+
+
+class NeuronEmbeddingModel(NeuronBaseModel):
+    """
+    Neuron-optimized model for feature extraction / embeddings.
+
+    Uses optimum.neuron.NeuronModelForFeatureExtraction for models that
+    output hidden states which need to be pooled.
+    """
+
+    def __init__(
+        self,
+        model_path: Path,
+        device: torch.device,
+        dtype: torch.dtype,
+        pool: str = "cls",
+        trust_remote: bool = False,
+    ):
+        """Load the model (exporting it for Neuron when model_path has no *.neuron file) and build the pooling layer; trust_remote is accepted but unused here."""
+        from optimum.neuron import NeuronModelForFeatureExtraction
+
+        is_compiled = self._is_neuron_compiled(model_path)
+        export_kwargs = {}
+        if not is_compiled:
+            export_kwargs = {
+                "export": True,
+                "batch_size": NEURON_BATCH_SIZE,
+                "sequence_length": NEURON_SEQUENCE_LENGTH,
+            }
+            logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}")
+        model = NeuronModelForFeatureExtraction.from_pretrained(
+            model_path,
+            **export_kwargs,
+        )
+
+        # Diagnostic only: keep it at debug level so it does not show up in INFO logs
+        logger.debug(f"model type = {type(model)}")
+        super().__init__(model, model_path, device, dtype)
+        self.pool = pool
+
+        # Initialize pooling layer
+        from text_embeddings_server.models.pooling import DefaultPooling
+        self.pooling = DefaultPooling(self.hidden_size, pooling_mode=pool)
+
+        logger.info(f"Loaded NeuronEmbeddingModel with pool={pool}")
+
+    @staticmethod
+    def _is_neuron_compiled(model_path: Path) -> bool:
+        """Check if the model is already compiled for Neuron."""
+        neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else []
+        return len(neuron_files) > 0
+
+    @tracer.start_as_current_span("embed")
+    def embed(self, batch: PaddedBatch) -> List[Embedding]:
+        kwargs = self._prepare_inputs(batch)
+        output = self.model(**kwargs)
+
+        # Apply pooling to get sentence embeddings
+        embedding = self.pooling.forward(output, batch.attention_mask)
+
+        cpu_results = embedding.view(-1).tolist()
+
+        return [
+            Embedding(
+                values=cpu_results[i * self.hidden_size : (i + 1) * self.hidden_size]
+            )
+            for i in range(len(batch))
+        ]
+
+    @tracer.start_as_current_span("predict")
+    def predict(self, batch: PaddedBatch) -> List[Score]:
+        raise NotImplementedError("Prediction not supported for embedding models")
+
+
+class NeuronClassificationModel(NeuronBaseModel):
+    """
+    Neuron-optimized model for sequence classification.
+
+    Uses optimum.neuron.NeuronModelForSequenceClassification for classification tasks.
+    """
+
+    def __init__(
+        self,
+        model_path: Path,
+        device: torch.device,
+        dtype: torch.dtype,
+        pool: str = "cls",
+        trust_remote: bool = False,
+    ):
+        from optimum.neuron import NeuronModelForSequenceClassification
+
+        is_compiled = self._is_neuron_compiled(model_path)
+        export_kwargs = {}
+        if not is_compiled:
+            export_kwargs = {
+                "export": True,
+                "batch_size": NEURON_BATCH_SIZE,
+                "sequence_length": NEURON_SEQUENCE_LENGTH,
+            }
+            logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}")
+        model = NeuronModelForSequenceClassification.from_pretrained(
+            model_path,
+            **export_kwargs,
+        )
+
+        super().__init__(model, model_path, device, dtype)
+        logger.info("Loaded NeuronClassificationModel")
+
+    @staticmethod
+    def _is_neuron_compiled(model_path: Path) -> bool:
+        """Check if the model is already compiled for Neuron."""
+        neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else []
+        return len(neuron_files) > 0
+
+    @tracer.start_as_current_span("embed")
+    def embed(self, batch: PaddedBatch) -> List[Embedding]:
+        raise NotImplementedError("Embedding not supported for classification models")
+
+    @tracer.start_as_current_span("predict")
+    def predict(self, batch: PaddedBatch) -> List[Score]:
+        kwargs = self._prepare_inputs(batch)
+        output = self.model(**kwargs)
+
+        # Get logits from output
+        if hasattr(output, "logits"):
+            logits = output.logits
+        else:
+            logits = output[0]
+
+        all_scores = logits.tolist()
+        return [Score(values=scores) for scores in all_scores]
+
+
+class NeuronMaskedLMModel(NeuronBaseModel):
+    """
+    Neuron-optimized model for Masked Language Modeling (SPLADE).
+
+    Uses optimum.neuron.NeuronModelForMaskedLM for SPLADE-style sparse embeddings.
+    """
+
+    def __init__(
+        self,
+        model_path: Path,
+        device: torch.device,
+        dtype: torch.dtype,
+        pool: str = "splade",
+        trust_remote: bool = False,
+    ):
+        from optimum.neuron import NeuronModelForMaskedLM
+
+        is_compiled = self._is_neuron_compiled(model_path)
+        export_kwargs = {}
+        if not is_compiled:
+            export_kwargs = {
+                "export": True,
+                "batch_size": NEURON_BATCH_SIZE,
+                "sequence_length": NEURON_SEQUENCE_LENGTH,
+            }
+            logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}")
+        model = NeuronModelForMaskedLM.from_pretrained(
+            model_path,
+            **export_kwargs,
+        )
+
+        super().__init__(model, model_path, device, dtype)
+
+        # Get vocab size for SPLADE output
+        self.vocab_size = model.config.vocab_size
+        logger.info(f"Loaded NeuronMaskedLMModel with vocab_size={self.vocab_size}")
+
+    @staticmethod
+    def _is_neuron_compiled(model_path: Path) -> bool:
+        """Check if the model is already compiled for Neuron."""
+        neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else []
+        return len(neuron_files) > 0
+
+    @tracer.start_as_current_span("embed")
+    def embed(self, batch: PaddedBatch) -> List[Embedding]:
+        kwargs = self._prepare_inputs(batch)
+        output = self.model(**kwargs)
+
+        # Get logits for SPLADE pooling
+        if hasattr(output, "logits"):
+            hidden_states = output.logits
+        else:
+            hidden_states = output[0]
+
+        # SPLADE pooling: ReLU -> log(1+x) -> max pooling
+        hidden_states = torch.relu(hidden_states)
+        hidden_states = (1 + hidden_states).log()
+        hidden_states = torch.mul(hidden_states, batch.attention_mask.unsqueeze(-1))
+        sparse_embedding = hidden_states.max(dim=1).values
+
+        cpu_results = sparse_embedding.view(-1).tolist()
+
+        return [
+            Embedding(
+                values=cpu_results[i * self.vocab_size : (i + 1) * self.vocab_size]
+            )
+            for i in range(len(batch))
+        ]
+
+    @tracer.start_as_current_span("predict")
+    def predict(self, batch: PaddedBatch) -> List[Score]:
+        raise NotImplementedError("Prediction not supported for masked LM models")
+
+
+def create_neuron_model(
+    model_path: Path,
+    device: torch.device,
+    dtype: torch.dtype,
+    pool: str = "cls",
+    trust_remote: bool = False,
+    config=None,
+) -> Model:
+    """
+    Factory function to create the appropriate Neuron model based on the model config.
+
+    Args:
+        model_path: Path to the model
+        device: Target device (should be xla for Neuron)
+        dtype: Data type for the model
+        pool: Pooling strategy (cls, mean, lasttoken, splade)
+        trust_remote: Whether to trust remote code
+        config: Pre-loaded model config (optional)
+
+    Returns:
+        Appropriate Neuron model instance
+    """
+    from transformers import AutoConfig
+
+    if config is None:
+        config = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote)
+
+    architectures = getattr(config, "architectures", []) or []
+    architecture = architectures[0] if architectures else ""
+
+    logger.info(f"Creating Neuron model for architecture: {architecture}, pool: {pool}")
+
+    # Check for classification models
+    if architecture.endswith("ForSequenceClassification") or architecture.endswith("Classification"):
+        return NeuronClassificationModel(model_path, device, dtype, pool, trust_remote)
+
+    # SPLADE (masked LM) models. NOTE(review): "or" also sends non-splade pools on *ForMaskedLM checkpoints here - confirm intended
+    if pool == "splade" or architecture.endswith("ForMaskedLM"):
+        return NeuronMaskedLMModel(model_path, device, dtype, pool, trust_remote)
+
+    # Check for sentence-transformers models: each test below is an independent "or" operand,
+    # and the is_dir() guard applies only to the sentence_bert_config.json lookup.
+    is_sentence_transformer = (
+        hasattr(config, "sentence_transformers_config")
+        or "sentence-transformers" in str(getattr(config, "_name_or_path", "")).lower()
+        or hasattr(config, "pooling_mode")
+        or (model_path.is_dir() and (model_path / "sentence_bert_config.json").exists())
+    )
+
+    if is_sentence_transformer:
+        try:
+            return NeuronSentenceTransformersModel(model_path, device, dtype, pool, trust_remote)
+        except Exception as e:
+            logger.warning(f"Failed to load as SentenceTransformer, falling back to FeatureExtraction: {e}")
+
+    # Default to feature extraction model
+    try:
+        return NeuronEmbeddingModel(model_path, device, dtype, pool, trust_remote)
+    except Exception as e:
+        logger.warning(f"Failed to load NeuronEmbeddingModel, trying NeuronSentenceTransformersModel: {e}")
+        return NeuronSentenceTransformersModel(model_path, device, dtype, pool, trust_remote)
diff --git a/backends/python/server/text_embeddings_server/utils/device.py b/backends/python/server/text_embeddings_server/utils/device.py
index 46b81370f..4963b012c 100644
--- a/backends/python/server/text_embeddings_server/utils/device.py
+++ b/backends/python/server/text_embeddings_server/utils/device.py
@@ -65,7 +65,7 @@ def get_neuron_major() -> int:
     return -1
 
 def is_neuron() -> bool:
-    return get_neuron_major > -1
+    return get_neuron_major() > -1
 
 def use_ipex() -> bool:
     value = os.environ.get("USE_IPEX", "True").lower()
diff --git a/backends/src/lib.rs b/backends/src/lib.rs
index b53067de1..4d45a5b02 100644
--- a/backends/src/lib.rs
+++ b/backends/src/lib.rs
@@ -425,14 +425,23 @@ async fn init_backend(
                     .map_err(|err| BackendError::WeightsNotFound(err.to_string()))?;
 
                 if model_files.is_empty() {
-                    tracing::error!(
+                    tracing::warn!(
                         "Neuron model files not found in the repository. \
-                        You can easily compile your model to neuron format following the guide: \
+                        The Python backend will attempt to compile the model on-the-fly using optimum-neuron. \
+                        This may take several minutes. For faster startup, consider pre-compiling your model: \
                         https://huggingface.co/docs/optimum-neuron/en/model_doc/sentence_transformers/overview "
                     );
-                    return Err(BackendError::WeightsNotFound(
-                        "No Neuron model files found".into(),
-                    ));
+                    // Fall back to downloading regular model files for on-the-fly compilation
+                    if download_safetensors(api_repo).await.is_err() {
+                        tracing::warn!(
+                            "safetensors weights not found. Using `pytorch_model.bin` instead."
+                        );
+                        tracing::info!("Downloading `pytorch_model.bin`");
+                        api_repo
+                            .get("pytorch_model.bin")
+                            .await
+                            .map_err(|err| BackendError::WeightsNotFound(err.to_string()))?;
+                    }
                 }
 
                 tracing::info!("Neuron model downloaded in {:?}", start.elapsed());
diff --git a/integration_tests/README.md b/integration_tests/README.md
index 641d8fce3..18b9232ad 100644
--- a/integration_tests/README.md
+++ b/integration_tests/README.md
@@ -1,8 +1,18 @@
 # Integration Tests
 
-This directory contains integration tests for the project. This starts the TEI server and run an /embed request to it while checking the output is as expected.
+This directory contains integration tests for the project. This starts the TEI server and runs an /embed request to it while checking the output is as expected.
 
-## Running the tests for HPU
+## How Tests Work
+
+The tests use pytest fixtures to:
+1. Start a Docker container with the TEI server
+2. Wait for the server to become healthy
+3. Send embedding requests and validate responses
+4. Stop and remove the container after tests complete
+
+The Docker image must be built before running tests. The `uv run pytest` command will start containers automatically using the pre-built image.
+
+## Running the tests for HPU (Habana Gaudi)
 
 First you have to build the docker image.
 ```bash
@@ -13,5 +23,105 @@ docker build . -f Dockerfile-intel --build-arg PLATFORM=$platform -t tei_hpu
 
 Then you can run the tests.
 ```bash
+cd integration_tests/gaudi
+uv run pytest --durations=0 -sv .
+```
+
+### Environment Variables (HPU)
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `DOCKER_IMAGE` | Docker image to use | `tei_hpu` |
+| `DOCKER_VOLUME` | Volume for model cache (recommended) | None |
+| `HF_TOKEN` | HuggingFace token for gated models | None |
+| `LOG_LEVEL` | Server log level | `info` |
+
+## Running the tests for Neuron (AWS Inferentia/Trainium)
+
+### Prerequisites
+
+1. **AWS Neuron instance**: Tests must run on an EC2 instance with Neuron devices (inf1, inf2, or trn1)
+2. **Neuron drivers**: Ensure Neuron drivers are installed and `/dev/neuron*` devices are available
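+3. **Pre-compiled models** (recommended): Neuron runs models compiled to `.neuron` format; when no `.neuron` files are found, the server compiles the model on-the-fly at startup, which may take several minutes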
+
+### Building the Docker Image
+
+```bash
+docker build . -f Dockerfile-neuron -t tei-neuron
+```
+
+### Running the Tests
+
+```bash
+cd integration_tests/neuron
 uv run pytest --durations=0 -sv .
 ```
+
+### Environment Variables (Neuron)
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `DOCKER_IMAGE` | Docker image to use | `tei-neuron` |
+| `DOCKER_VOLUME` | Volume for model cache (recommended) | None |
+| `HF_TOKEN` | HuggingFace token for gated models | None |
+| `LOG_LEVEL` | Server log level | `info` |
+| `NEURON_RT_NUM_CORES` | Number of Neuron cores to use | `1` |
+| `NEURON_RT_VISIBLE_CORES` | Which Neuron cores are visible | `0` |
+
+### Using Pre-compiled Neuron Models
+
+Pre-compiling Neuron models avoids the on-the-fly compilation that otherwise happens at startup. You have two options:
+
+1. **Use models with pre-compiled Neuron artifacts**: Some models on HuggingFace Hub have `.neuron` files available
+
+2. **Compile models yourself**: Follow the [Optimum Neuron guide](https://huggingface.co/docs/optimum-neuron/en/model_doc/sentence_transformers/overview) to compile your models
+
+Example compilation:
+```python
+from optimum.neuron import NeuronModelForSentenceTransformers
+
+# Compile and save
+model = NeuronModelForSentenceTransformers.from_pretrained(
+    "sentence-transformers/all-MiniLM-L6-v2",
+    export=True,
+    batch_size=1,
+    sequence_length=512,
+)
+model.save_pretrained("./all-MiniLM-L6-v2-neuron")
+model.push_to_hub("your-username/all-MiniLM-L6-v2-neuron")
+```
+
+### Troubleshooting Neuron Tests
+
+**Container exits immediately**:
+- Check if Neuron devices are available: `ls /dev/neuron*`
+- Check container logs for "Neuron model files not found": this is a warning that the model will be compiled on-the-fly, so look for a later weight-download or compilation error
+- Ensure the Docker image was built with Neuron support
+
+**Long startup times**:
+- Neuron models may take several minutes to load due to compilation
+- The test timeout is set to 600 seconds (10 minutes) by default
+
+**Permission errors**:
+- Ensure Docker has access to Neuron devices
+- The tests add `IPC_LOCK` capability and mount `/dev/neuron*` devices
+
+## Adding New Test Models
+
+To add a new model to test, update the `TEST_CONFIGS` dictionary in `test_embed.py`:
+
+```python
+TEST_CONFIGS = {
+    "your-model/name": {
+        "model_id": "your-model/name",
+        "input": "Test input text",
+        "batch_inputs": ["Text 1", "Text 2"],
+        "args": ["--dtype", "float32"],
+        "env_config": {
+            "MAX_WARMUP_SEQUENCE_LENGTH": "512",
+        },
+    },
+}
+```
+
+For Habana tests, you can also add `expected_output` to validate exact embedding values.
diff --git a/integration_tests/neuron/conftest.py b/integration_tests/neuron/conftest.py
index e69de29bb..40d16b05a 100644
--- a/integration_tests/neuron/conftest.py
+++ b/integration_tests/neuron/conftest.py
@@ -0,0 +1,299 @@
+import asyncio
+import contextlib
+import os
+import shlex
+import subprocess
+import sys
+import threading
+import time
+from tempfile import TemporaryDirectory
+
+import docker
+import pytest
+from docker.errors import NotFound
+import logging
+from test_embed import TEST_CONFIGS
+import aiohttp
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s | %(levelname)-8s | %(name)s:%(funcName)s:%(lineno)d - %(message)s",
+    stream=sys.stdout,
+)
+logger = logging.getLogger(__file__)
+
+# Use the latest image from the local docker build
+DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tei-neuron")
+DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None)
+
+if DOCKER_VOLUME is None:
+    logger.warning(
+        "DOCKER_VOLUME is not set, this will lead to the tests redownloading the models on each run, consider setting it to speed up testing"
+    )
+
+LOG_LEVEL = os.getenv("LOG_LEVEL", "info")
+
+BASE_ENV = {
+    "HF_HUB_ENABLE_HF_TRANSFER": "1",
+    "LOG_LEVEL": LOG_LEVEL,
+    # Neuron-specific environment variables
+    "NEURON_RT_NUM_CORES": os.getenv("NEURON_RT_NUM_CORES", "1"),
+    "NEURON_RT_VISIBLE_CORES": os.getenv("NEURON_RT_VISIBLE_CORES", "0"),
+}
+
+# Neuron requires privileged mode for OCI hook to work
+NEURON_RUN_ARGS = {
+    "privileged": True,
+}
+
+
+def stream_container_logs(container, test_name):
+    """Stream container logs in a separate thread."""
+    try:
+        for log in container.logs(stream=True, follow=True):
+            print(
+                f"[TEI Server Logs - {test_name}] {log.decode('utf-8')}",
+                end="",
+                file=sys.stderr,
+                flush=True,
+            )
+    except Exception as e:
+        logger.error(f"Error streaming container logs: {str(e)}")
+
+
+class LauncherHandle:
+    def __init__(self, port: int):
+        self.port = port
+        self.base_url = f"http://127.0.0.1:{port}"
+
+    async def generate(self, prompt: str):
+        """Send embed request to the TEI server (alias for embed)."""
+        return await self.embed(prompt)
+
+    async def embed(self, text: str):
+        """Send embed request to the TEI server."""
+        async with aiohttp.ClientSession() as session:
+            async with session.post(
+                f"{self.base_url}/embed",
+                json={"inputs": text},
+                headers={"Content-Type": "application/json"}
+            ) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    raise RuntimeError(f"Request failed with status {response.status}: {error_text}")
+                return await response.json()
+
+    async def embed_batch(self, texts: list):
+        """Send batch embed request to the TEI server."""
+        async with aiohttp.ClientSession() as session:
+            async with session.post(
+                f"{self.base_url}/embed",
+                json={"inputs": texts},
+                headers={"Content-Type": "application/json"}
+            ) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    raise RuntimeError(f"Request failed with status {response.status}: {error_text}")
+                return await response.json()
+
+    async def predict(self, text: str):
+        """Send predict request to the TEI server (for classification models)."""
+        async with aiohttp.ClientSession() as session:
+            async with session.post(
+                f"{self.base_url}/predict",
+                json={"inputs": text},
+                headers={"Content-Type": "application/json"}
+            ) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    raise RuntimeError(f"Request failed with status {response.status}: {error_text}")
+                return await response.json()
+
+    def _inner_health(self):
+        raise NotImplementedError
+
+    async def health(self, timeout: int = 300):
+        """Wait for the server to be healthy.
+
+        Neuron models may take longer to compile/load, so default timeout is higher.
+        """
+        assert timeout > 0
+        start_time = time.time()
+        logger.info(f"Starting health check with timeout of {timeout}s")
+
+        for attempt in range(timeout):
+            if not self._inner_health():
+                logger.error("Launcher crashed during health check")
+                raise RuntimeError("Launcher crashed")
+
+            try:
+                # Try to make a request using generate (like Habana tests)
+                await self.generate("test")
+                elapsed = time.time() - start_time
+                logger.info(f"Health check passed after {elapsed:.1f}s")
+                return
+            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
+                if attempt == timeout - 1:
+                    logger.error(f"Health check failed after {timeout}s: {str(e)}")
+                    raise RuntimeError(f"Health check failed: {str(e)}")
+                if attempt % 10 == 0 and attempt != 0:  # Only log every 10th attempt
+                    logger.debug(f"Connection attempt {attempt}/{timeout} failed: {str(e)}")
+                await asyncio.sleep(1)
+            except Exception as e:
+                logger.error(f"Unexpected error during health check: {str(e)}")
+                import traceback
+                logger.error(f"Full traceback:\n{traceback.format_exc()}")
+                raise
+
+
+class ContainerLauncherHandle(LauncherHandle):
+    def __init__(self, docker_client, container_name, port: int):
+        super().__init__(port)
+        self.docker_client = docker_client
+        self.container_name = container_name
+
+    def _inner_health(self) -> bool:
+        try:
+            container = self.docker_client.containers.get(self.container_name)
+            status = container.status
+            if status not in ["running", "created"]:
+                logger.warning(f"Container status is {status}")
+                # Get container logs for debugging
+                logs = container.logs().decode("utf-8")
+                logger.debug(f"Container logs:\n{logs}")
+                return False
+            return True
+        except Exception as e:
+            logger.error(f"Error checking container health: {str(e)}")
+            return False
+
+
+class ProcessLauncherHandle(LauncherHandle):  # health follows the liveness of `process`
+    def __init__(self, process, port: int):
+        super().__init__(port)
+        self.process = process  # any object exposing poll(); presumably a subprocess.Popen
+
+    def _inner_health(self) -> bool:
+        return self.process.poll() is None  # healthy while poll() reports no exit code
+
+
+@pytest.fixture(scope="module")
+def data_volume():
+    """Yield a temp directory path; it is removed with sudo since the container leaves root-owned files."""
+    tmpdir = TemporaryDirectory()
+    yield tmpdir.name
+    try:
+        subprocess.run(["sudo", "rm", "-rf", tmpdir.name], check=True)  # argv list: path is never re-split
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Error cleaning up temporary directory: {str(e)}")
+
+
+@pytest.fixture(scope="function")
+def neuron_launcher():
+    """Return a context-manager factory that runs DOCKER_IMAGE as a test container.
+
+    The factory takes ``model_id`` and ``test_name`` (a key of TEST_CONFIGS), yields a
+    ContainerLauncherHandle on host port 8080, and stops and removes the container
+    on exit, printing its logs to stderr first.
+    """
+
+    @contextlib.contextmanager
+    def docker_launcher(model_id: str, test_name: str):
+        logger.info(f"Starting docker launcher for model {model_id} and test {test_name}")
+
+        port = 8080
+        client = docker.from_env()
+        container_name = f"tei-neuron-test-{test_name.replace('/', '-').replace('_', '-')}"
+
+        try:
+            container = client.containers.get(container_name)
+            logger.info(f"Stopping existing container {container_name} for test {test_name}")
+            container.stop()
+            # A stopped container still owns its name; remove it so run() below can reuse the name.
+            container.remove(force=True)
+        except NotFound:
+            pass
+        except Exception as e:
+            logger.error(f"Error handling existing container: {str(e)}")
+
+        tei_args = TEST_CONFIGS[test_name]["args"].copy()
+
+        # add model_id to tei args
+        tei_args.append("--model-id")
+        tei_args.append(model_id)
+
+        env = BASE_ENV.copy()
+        env["HF_TOKEN"] = os.getenv("HF_TOKEN")
+
+        # Add env config that is defined in the fixture parameter
+        if "env_config" in TEST_CONFIGS[test_name]:
+            env.update(TEST_CONFIGS[test_name]["env_config"].copy())
+
+        volumes = [f"{DOCKER_VOLUME}:/data"] if DOCKER_VOLUME else []
+        logger.debug(f"Using volume {volumes}")
+
+        try:
+            logger.info(f"Creating container with name {container_name}")
+
+            # Build run arguments - use privileged mode for Neuron OCI hook
+            run_args = NEURON_RUN_ARGS.copy()
+
+            container = client.containers.run(
+                DOCKER_IMAGE,
+                command=tei_args,
+                name=container_name,
+                environment=env,
+                detach=True,
+                volumes=volumes if volumes else None,
+                ports={"80/tcp": port},
+                **run_args,
+            )
+
+            logger.info(f"Container {container_name} started successfully")
+
+            # Start log streaming in a background thread
+            log_thread = threading.Thread(
+                target=stream_container_logs,
+                args=(container, test_name),
+                daemon=True,  # This ensures the thread will be killed when the main program exits
+            )
+            log_thread.start()
+
+            # Add a small delay to allow container to initialize
+            time.sleep(2)
+
+            # container.status is cached from creation; reload() fetches the current state
+            container.reload()
+            status = container.status
+            logger.debug(f"Initial container status: {status}")
+            if status not in ["running", "created"]:
+                logs = container.logs().decode("utf-8")
+                logger.error(f"Container failed to start properly. Logs:\n{logs}")
+
+            yield ContainerLauncherHandle(client, container.name, port)
+
+        except Exception as e:
+            logger.error(f"Error starting container: {str(e)}")
+            # Get full traceback for debugging
+            import traceback
+
+            logger.error(f"Full traceback:\n{traceback.format_exc()}")
+            raise
+        finally:
+            try:
+                container = client.containers.get(container_name)
+                logger.info(f"Stopping container {container_name}")
+                container.stop()
+                container.wait()
+
+                container_output = container.logs().decode("utf-8")
+                print(container_output, file=sys.stderr)
+
+                container.remove()
+                logger.info(f"Container {container_name} removed successfully")
+            except NotFound:
+                pass
+            except Exception as e:
+                logger.warning(f"Error cleaning up container: {str(e)}")
+
+    return docker_launcher
diff --git a/integration_tests/neuron/test_embed.py b/integration_tests/neuron/test_embed.py
index e69de29bb..69b0fee7a 100644
--- a/integration_tests/neuron/test_embed.py
+++ b/integration_tests/neuron/test_embed.py
@@ -0,0 +1,223 @@
+from typing import Any, Dict, Generator
+from _pytest.fixtures import SubRequest
+
+import pytest
+import pytest_asyncio
+import numpy as np
+
+
+# Test configurations for Neuron backend
+# The "args" values in TEST_CONFIGS are not optimized for speed but only check that the inference is working for the different models architectures.
+TEST_CONFIGS = {
+    # BERT-based embedding model - commonly used and well-supported on Neuron
+    "sentence-transformers/all-MiniLM-L6-v2": {
+        "model_id": "sentence-transformers/all-MiniLM-L6-v2",
+        "input": "What is Deep Learning?",
+        "batch_inputs": [
+            "What is Deep Learning?",
+            "How does machine learning work?",
+            "Tell me about neural networks.",
+        ],
+        # Expected output for first 50 dimensions (to keep config manageable)
+        # These values should be generated from a known-good run
+        "expected_output_prefix": None,  # Will validate structure only if None
+        "args": [
+            "--dtype", "float32",
+            "--max-batch-requests", "1",
+        ],
+        "env_config": {
+            "MAX_WARMUP_SEQUENCE_LENGTH": "512",
+        },
+    },
+}
+
+
+@pytest.fixture(scope="module", params=TEST_CONFIGS.keys())
+def test_config(request: SubRequest) -> Dict[str, Any]:
+    """Fixture that provides model configurations for testing."""
+    model_name = request.param
+    test_config = TEST_CONFIGS[model_name].copy()
+    test_config["test_name"] = model_name
+    return test_config
+
+
+@pytest.fixture(scope="module")
+def model_id(test_config: Dict[str, Any]) -> Generator[str, None, None]:
+    yield test_config["model_id"]
+
+
+@pytest.fixture(scope="module")
+def test_name(test_config: Dict[str, Any]) -> Generator[str, None, None]:
+    yield test_config["test_name"]
+
+
+@pytest.fixture(scope="module")
+def input_text(test_config: Dict[str, Any]) -> str:
+    return test_config["input"]
+
+
+@pytest.fixture(scope="module")
+def batch_inputs(test_config: Dict[str, Any]) -> list:
+    return test_config.get("batch_inputs", [test_config["input"]])
+
+
+@pytest.fixture(scope="module")
+def expected_outputs(test_config: Dict[str, Any]) -> Dict[str, Any]:
+    return {
+        "expected_output_prefix": test_config.get("expected_output_prefix"),
+    }
+
+
+@pytest.fixture(scope="function")
+def tei_service(neuron_launcher, model_id: str, test_name: str):
+    with neuron_launcher(model_id, test_name) as tei_service:
+        yield tei_service
+
+
+@pytest_asyncio.fixture(scope="function")
+async def tei_client(tei_service):
+    # Neuron models may take longer to load due to compilation
+    await tei_service.health(600)  # 10 minute timeout for Neuron compilation
+    return tei_service
+
+
+@pytest.mark.asyncio
+async def test_model_single_request(
+    tei_client, expected_outputs: Dict[str, Any], input_text: str
+):
+    """Test single embedding request."""
+    response = await tei_client.embed(input_text)
+
+    # Verify response structure
+    assert isinstance(response, list), f"Expected list, got {type(response)}"
+    assert len(response) > 0, "Embedding should not be empty"
+
+    response_array = np.array(response)
+
+    # Check that values are numeric
+    assert response_array.dtype in [np.float32, np.float64, np.float16], \
+        f"Expected float array, got {response_array.dtype}"
+
+    # If expected output is provided, validate against it
+    expected_prefix = expected_outputs.get("expected_output_prefix")
+    if expected_prefix is not None:
+        expected_array = np.array(eval(expected_prefix) if isinstance(expected_prefix, str) else expected_prefix)
+        prefix_len = len(expected_array.flatten())
+        response_flat = response_array.flatten()[:prefix_len]
+
+        if not np.allclose(response_flat, expected_array.flatten(), rtol=1e-4, atol=1e-4):
+            print("\nExpected output (prefix):")
+            print(f"{expected_array.tolist()}")
+            print("\nReceived output (prefix):")
+            print(f"{response_flat.tolist()}")
+            raise AssertionError("Response array does not match expected array within tolerance")
+
+    # Check embedding dimensions are reasonable (typically 384, 768, 1024, etc.)
+    embedding_dim = response_array.shape[-1] if response_array.ndim > 1 else len(response_array)
+    assert embedding_dim > 0, "Embedding dimension should be positive"
+
+    print(f"Single request embedding shape: {response_array.shape}")
+    print(f"Embedding dimension: {embedding_dim}")
+
+
+@pytest.mark.asyncio
+async def test_model_batch_request(tei_client, batch_inputs: list):
+    """Test batch embedding request."""
+    response = await tei_client.embed_batch(batch_inputs)
+
+    # Verify response is a list of embeddings
+    assert isinstance(response, list), f"Expected list, got {type(response)}"
+    assert len(response) == len(batch_inputs), \
+        f"Expected {len(batch_inputs)} embeddings, got {len(response)}"
+
+    response_array = np.array(response)
+    print(f"Batch request response shape: {response_array.shape}")
+
+    # Check each embedding
+    for i, embedding in enumerate(response):
+        assert isinstance(embedding, list), f"Embedding {i} should be a list"
+        assert len(embedding) > 0, f"Embedding {i} should not be empty"
+
+
+@pytest.mark.asyncio
+async def test_model_embedding_consistency(tei_client, input_text: str):
+    """Test that the same input produces consistent embeddings."""
+    response1 = await tei_client.embed(input_text)
+    response2 = await tei_client.embed(input_text)
+
+    array1 = np.array(response1)
+    array2 = np.array(response2)
+
+    # Embeddings for the same input should be identical (or very close)
+    assert np.allclose(array1, array2, rtol=1e-4, atol=1e-4), \
+        "Same input should produce consistent embeddings"
+
+
+@pytest.mark.asyncio
+async def test_model_different_inputs_different_embeddings(tei_client):
+    """Test that different inputs produce different embeddings."""
+    input1 = "The weather is sunny today."
+    input2 = "Machine learning is a subset of artificial intelligence."
+
+    response1 = await tei_client.embed(input1)
+    response2 = await tei_client.embed(input2)
+
+    array1 = np.array(response1)
+    array2 = np.array(response2)
+
+    # Different inputs should produce different embeddings
+    assert not np.allclose(array1, array2, rtol=1e-2, atol=1e-2), \
+        "Different inputs should produce different embeddings"
+
+
+@pytest.mark.asyncio
+async def test_model_embedding_normalization(tei_client, input_text: str):
+    """Test embedding properties (optional - some models normalize, some don't)."""
+    response = await tei_client.embed(input_text)
+    array = np.array(response)
+
+    # Flatten if needed
+    if array.ndim > 1:
+        array = array.flatten()
+
+    # Check L2 norm - many sentence transformers normalize to unit length
+    l2_norm = np.linalg.norm(array)
+    print(f"Embedding L2 norm: {l2_norm}")
+
+    # Just verify the norm is reasonable (not zero, not extremely large)
+    assert l2_norm > 0.1, "Embedding norm should be positive"
+    assert l2_norm < 1000, "Embedding norm should not be extremely large"
+
+
+@pytest.mark.asyncio
+async def test_model_long_input(tei_client):
+    """Test handling of longer input text."""
+    # Create a longer input (but still within typical model limits)
+    long_input = "This is a test sentence. " * 20  # ~100 tokens
+
+    response = await tei_client.embed(long_input)
+
+    assert isinstance(response, list), f"Expected list, got {type(response)}"
+    assert len(response) > 0, "Embedding should not be empty"
+
+
+@pytest.mark.asyncio
+async def test_model_special_characters(tei_client):
+    """Test handling of special characters in input."""
+    special_input = "Hello! How are you? I'm fine, thanks. #test @user $100"
+
+    response = await tei_client.embed(special_input)
+
+    assert isinstance(response, list), f"Expected list, got {type(response)}"
+    assert len(response) > 0, "Embedding should not be empty"
+
+
+@pytest.mark.asyncio
+async def test_model_unicode_input(tei_client):
+    """Test handling of unicode characters."""
+    unicode_input = "Hello world! Bonjour le monde!"
+
+    response = await tei_client.embed(unicode_input)
+
+    assert isinstance(response, list), f"Expected list, got {type(response)}"
+    assert len(response) > 0, "Embedding should not be empty"

From 976b71c617fa58279957939b0719ff6670f88610 Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Wed, 4 Feb 2026 14:24:30 +0000
Subject: [PATCH 09/20] add to CI & add pre-compiled test

---
 ...test.yaml => integration-test-habana.yaml} |   4 +-
 .../workflows/integration-test-neuron.yaml    |  33 +++++
 .../models/neuron_models.py                   |  26 ++--
 integration_tests/neuron/test_embed.py        | 115 ++++++------------
 4 files changed, 87 insertions(+), 91 deletions(-)
 rename .github/workflows/{integration-test.yaml => integration-test-habana.yaml} (90%)
 create mode 100644 .github/workflows/integration-test-neuron.yaml

diff --git a/.github/workflows/integration-test.yaml b/.github/workflows/integration-test-habana.yaml
similarity index 90%
rename from .github/workflows/integration-test.yaml
rename to .github/workflows/integration-test-habana.yaml
index b6f042179..d17a9cb14 100644
--- a/.github/workflows/integration-test.yaml
+++ b/.github/workflows/integration-test-habana.yaml
@@ -1,4 +1,4 @@
-name: Run integration tests
+name: Run Habana integration tests
 
 on:
   workflow_dispatch:
@@ -28,4 +28,4 @@ jobs:
         working-directory: integration_tests
         run: |
           uv sync --locked --all-extras --dev
-          uv run pytest --durations=0 -sv .
+          uv run pytest --durations=0 -sv gaudi/
diff --git a/.github/workflows/integration-test-neuron.yaml b/.github/workflows/integration-test-neuron.yaml
new file mode 100644
index 000000000..8be3630e2
--- /dev/null
+++ b/.github/workflows/integration-test-neuron.yaml
@@ -0,0 +1,33 @@
+name: Run Neuron integration tests
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 0 * * *'  # Run the workflow nightly to check Neuron integration is working
+
+jobs:
+  tests:
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true
+    runs-on:
+      group: aws-inf2-8xlarge
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+
+      - name: Build Docker image for Neuron
+        run: |
+          docker build . -f Dockerfile-neuron -t tei-neuron
+
+      - name: Run integration tests
+        working-directory: integration_tests
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          DOCKER_IMAGE: tei-neuron
+        run: |
+          uv sync --locked --all-extras --dev
+          uv run pytest --durations=0 -sv neuron/
diff --git a/backends/python/server/text_embeddings_server/models/neuron_models.py b/backends/python/server/text_embeddings_server/models/neuron_models.py
index 4589f6b77..f430a35e7 100644
--- a/backends/python/server/text_embeddings_server/models/neuron_models.py
+++ b/backends/python/server/text_embeddings_server/models/neuron_models.py
@@ -4,7 +4,7 @@
 
 from abc import ABC
 from pathlib import Path
-from typing import Type, List, Optional
+from typing import Type, List
 from opentelemetry import trace
 from loguru import logger
 
@@ -148,18 +148,18 @@ def embed(self, batch: PaddedBatch) -> List[Embedding]:
                 token_embeddings = output.token_embeddings
             else:
                 raise ValueError(f"Cannot extract embeddings from model output: {type(output)}")
-        
-        # Apply pooling based on self.pool setting
-        if self.pool == "cls":
-            sentence_embedding = token_embeddings[:, 0, :]
-        elif self.pool == "mean":
-            attention_mask = kwargs["attention_mask"].unsqueeze(-1).float()
-            sentence_embedding = (token_embeddings * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)
-        elif self.pool == "last_token":
-            seq_lengths = kwargs["attention_mask"].sum(dim=1) - 1
-            sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths]
-        else:
-            raise ValueError(f"Invalid pooling mode: {self.pool}")
+
+            # Apply pooling based on self.pool setting
+            if self.pool == "cls":
+                sentence_embedding = token_embeddings[:, 0, :]
+            elif self.pool == "mean":
+                attention_mask = kwargs["attention_mask"].unsqueeze(-1).float()
+                sentence_embedding = (token_embeddings * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)
+            elif self.pool == "last_token":
+                seq_lengths = kwargs["attention_mask"].sum(dim=1) - 1
+                sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths]
+            else:
+                raise ValueError(f"Invalid pooling mode: {self.pool}")
 
         # Convert to list format expected by the gRPC interface
         cpu_results = sentence_embedding.view(-1).tolist()
diff --git a/integration_tests/neuron/test_embed.py b/integration_tests/neuron/test_embed.py
index 69b0fee7a..03da9d494 100644
--- a/integration_tests/neuron/test_embed.py
+++ b/integration_tests/neuron/test_embed.py
@@ -7,20 +7,52 @@
 
 
 # Test configurations for Neuron backend
-# The "args" values in TEST_CONFIGS are not optimized for speed but only check that the inference is working for the different models architectures.
 TEST_CONFIGS = {
-    # BERT-based embedding model - commonly used and well-supported on Neuron
-    "sentence-transformers/all-MiniLM-L6-v2": {
-        "model_id": "sentence-transformers/all-MiniLM-L6-v2",
+    # # On-the-fly Neuron compilation
+    # "sentence-transformers/all-MiniLM-L6-v2": {
+    #     "model_id": "sentence-transformers/all-MiniLM-L6-v2",
+    #     "input": "What is Deep Learning?",
+    #     "batch_inputs": [
+    #         "What is Deep Learning?",
+    #         "How does machine learning work?",
+    #         "Tell me about neural networks.",
+    #     ],
+    #     "expected_output_prefix": None,
+    #     "args": [
+    #         "--dtype", "float32",
+    #         "--max-batch-requests", "1",
+    #     ],
+    #     "env_config": {
+    #         "MAX_WARMUP_SEQUENCE_LENGTH": "512",
+    #     },
+    # },
+    # "BAAI/bge-base-en-v1.5": {
+    #     "model_id": "BAAI/bge-base-en-v1.5",
+    #     "input": "What is Deep Learning?",
+    #     "batch_inputs": [
+    #         "What is Deep Learning?",
+    #         "How does machine learning work?",
+    #         "Tell me about neural networks.",
+    #     ],
+    #     "expected_output_prefix": None,
+    #     "args": [
+    #         "--dtype", "float32",
+    #         "--max-batch-requests", "1",
+    #     ],
+    #     "env_config": {
+    #         "MAX_WARMUP_SEQUENCE_LENGTH": "512",
+    #     },
+    # },
+    # Pre-compiled Neuron model
+    "optimum/bge-base-en-v1.5-neuronx": {
+        "model_id": "optimum/bge-base-en-v1.5-neuronx",
         "input": "What is Deep Learning?",
         "batch_inputs": [
             "What is Deep Learning?",
             "How does machine learning work?",
             "Tell me about neural networks.",
         ],
-        # Expected output for first 50 dimensions (to keep config manageable)
-        # These values should be generated from a known-good run
-        "expected_output_prefix": None,  # Will validate structure only if None
+        "expected_output_prefix": None,
         "args": [
             "--dtype", "float32",
             "--max-batch-requests", "1",
@@ -152,72 +184,3 @@ async def test_model_embedding_consistency(tei_client, input_text: str):
     assert np.allclose(array1, array2, rtol=1e-4, atol=1e-4), \
         "Same input should produce consistent embeddings"
 
-
-@pytest.mark.asyncio
-async def test_model_different_inputs_different_embeddings(tei_client):
-    """Test that different inputs produce different embeddings."""
-    input1 = "The weather is sunny today."
-    input2 = "Machine learning is a subset of artificial intelligence."
-
-    response1 = await tei_client.embed(input1)
-    response2 = await tei_client.embed(input2)
-
-    array1 = np.array(response1)
-    array2 = np.array(response2)
-
-    # Different inputs should produce different embeddings
-    assert not np.allclose(array1, array2, rtol=1e-2, atol=1e-2), \
-        "Different inputs should produce different embeddings"
-
-
-@pytest.mark.asyncio
-async def test_model_embedding_normalization(tei_client, input_text: str):
-    """Test embedding properties (optional - some models normalize, some don't)."""
-    response = await tei_client.embed(input_text)
-    array = np.array(response)
-
-    # Flatten if needed
-    if array.ndim > 1:
-        array = array.flatten()
-
-    # Check L2 norm - many sentence transformers normalize to unit length
-    l2_norm = np.linalg.norm(array)
-    print(f"Embedding L2 norm: {l2_norm}")
-
-    # Just verify the norm is reasonable (not zero, not extremely large)
-    assert l2_norm > 0.1, "Embedding norm should be positive"
-    assert l2_norm < 1000, "Embedding norm should not be extremely large"
-
-
-@pytest.mark.asyncio
-async def test_model_long_input(tei_client):
-    """Test handling of longer input text."""
-    # Create a longer input (but still within typical model limits)
-    long_input = "This is a test sentence. " * 20  # ~100 tokens
-
-    response = await tei_client.embed(long_input)
-
-    assert isinstance(response, list), f"Expected list, got {type(response)}"
-    assert len(response) > 0, "Embedding should not be empty"
-
-
-@pytest.mark.asyncio
-async def test_model_special_characters(tei_client):
-    """Test handling of special characters in input."""
-    special_input = "Hello! How are you? I'm fine, thanks. #test @user $100"
-
-    response = await tei_client.embed(special_input)
-
-    assert isinstance(response, list), f"Expected list, got {type(response)}"
-    assert len(response) > 0, "Embedding should not be empty"
-
-
-@pytest.mark.asyncio
-async def test_model_unicode_input(tei_client):
-    """Test handling of unicode characters."""
-    unicode_input = "Hello world! Bonjour le monde!"
-
-    response = await tei_client.embed(unicode_input)
-
-    assert isinstance(response, list), f"Expected list, got {type(response)}"
-    assert len(response) > 0, "Embedding should not be empty"

From dc3edc2c51ca28a7713e6b3fafe1e85992e39cea Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Wed, 4 Feb 2026 22:32:05 +0000
Subject: [PATCH 10/20] fix tests

---
 Dockerfile-neuron                             |   6 +-
 .../text_embeddings_server/models/__init__.py |   2 -
 .../models/neuron_models.py                   | 224 +++++++-----------
 docs/source/en/ aws_neuron.md                 |  37 ---
 docs/source/en/aws_neuron.md                  | 105 ++++++++
 integration_tests/neuron/test_embed.py        |  70 +++---
 6 files changed, 232 insertions(+), 212 deletions(-)
 delete mode 100644 docs/source/en/ aws_neuron.md
 create mode 100644 docs/source/en/aws_neuron.md

diff --git a/Dockerfile-neuron b/Dockerfile-neuron
index dbf1e9a29..741084c8b 100644
--- a/Dockerfile-neuron
+++ b/Dockerfile-neuron
@@ -150,11 +150,11 @@ RUN pip install --index-url https://pip.repos.neuron.amazonaws.com \
  && rm -rf ~/.cache/pip/*
 
 # HF ARGS
-# Note: optimum-neuron 0.4.1 requires transformers~=4.55.4
-ARG TRANSFORMERS_VERSION=4.55.4
+# Note: optimum-neuron 0.4.4 requires transformers~=4.57.1
+ARG TRANSFORMERS_VERSION=4.57.1
 ARG DIFFUSERS_VERSION=0.35.2
 ARG HUGGINGFACE_HUB_VERSION=0.36.0
-ARG OPTIMUM_NEURON_VERSION=0.4.1
+ARG OPTIMUM_NEURON_VERSION=0.4.4
 ARG SENTENCE_TRANSFORMERS=5.1.2
 ARG PEFT_VERSION=0.17.0
 ARG DATASETS_VERSION=4.1.1
diff --git a/backends/python/server/text_embeddings_server/models/__init__.py b/backends/python/server/text_embeddings_server/models/__init__.py
index 1de5f9b1b..8a48510d6 100644
--- a/backends/python/server/text_embeddings_server/models/__init__.py
+++ b/backends/python/server/text_embeddings_server/models/__init__.py
@@ -45,7 +45,6 @@
 
 # Neuron models - only import when on Neuron device to avoid unnecessary dependencies
 NeuronSentenceTransformersModel = None
-NeuronEmbeddingModel = None
 NeuronClassificationModel = None
 NeuronMaskedLMModel = None
 create_neuron_model = None
@@ -54,7 +53,6 @@
     try:
         from text_embeddings_server.models.neuron_models import (
             NeuronSentenceTransformersModel,
-            NeuronEmbeddingModel,
             NeuronClassificationModel,
             NeuronMaskedLMModel,
             create_neuron_model,
diff --git a/backends/python/server/text_embeddings_server/models/neuron_models.py b/backends/python/server/text_embeddings_server/models/neuron_models.py
index f430a35e7..f95c2b3c5 100644
--- a/backends/python/server/text_embeddings_server/models/neuron_models.py
+++ b/backends/python/server/text_embeddings_server/models/neuron_models.py
@@ -13,13 +13,13 @@
 
 tracer = trace.get_tracer(__name__)
 
-# Neuron compilation parameters from environment variables
+# Neuron static shapes compilation parameters
 NEURON_BATCH_SIZE = int(os.getenv("NEURON_BATCH_SIZE", "1"))
 NEURON_SEQUENCE_LENGTH = int(os.getenv("NEURON_SEQUENCE_LENGTH", "512"))
 
 
 class NeuronBaseModel(Model, ABC):
-    """Base class for all Neuron models with common functionality."""
+    """Base class for all Neuron models."""
 
     def __init__(
         self,
@@ -83,12 +83,12 @@ def _prepare_inputs(self, batch: PaddedBatch) -> dict:
         return kwargs
 
 
-class NeuronSentenceTransformersModel(NeuronBaseModel):
+class NeuronSentenceTransformersModel(Model):
     """
-    Neuron-optimized model for sentence-transformers.
+    Neuron model for sentence-transformers.
 
-    Uses optimum.neuron.NeuronModelForSentenceTransformers which is designed
-    for sentence embedding models that output sentence_embedding directly.
+    Uses optimum.neuron.NeuronSentenceTransformers which is designed
+    for sentence embedding models.
     """
 
     def __init__(
@@ -99,29 +99,43 @@ def __init__(
         pool: str = "cls",
         trust_remote: bool = False,
     ):
-        try:
-            from optimum.neuron import NeuronModelForSentenceTransformers
-            is_compiled = self._is_neuron_compiled(model_path)
-            export_kwargs = {}
-            if not is_compiled:
-                export_kwargs = {
-                    "export": True,
-                    "batch_size": NEURON_BATCH_SIZE,
-                    "sequence_length": NEURON_SEQUENCE_LENGTH,
-                }
-                logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}")
-            model = NeuronModelForSentenceTransformers.from_pretrained(
+        from optimum.neuron import NeuronSentenceTransformers
+        from transformers import AutoConfig
+
+        # Load config separately for reliable access
+        config = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote)
+        self.hidden_size = config.hidden_size
+
+        # Calculate max input length
+        position_offset = 0
+        model_type = config.model_type
+        if model_type in ["xlm-roberta", "camembert", "roberta"]:
+            position_offset = getattr(config, "pad_token_id", 1) + 1
+
+        if hasattr(config, "max_seq_length"):
+            self.max_input_length = config.max_seq_length
+        elif hasattr(config, "n_positions"):
+            self.max_input_length = config.n_positions
+        else:
+            self.max_input_length = (
+                config.max_position_embeddings - position_offset
+            )
+
+        is_compiled = self._is_neuron_compiled(model_path)
+        if not is_compiled:
+            logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}")
+            model = NeuronSentenceTransformers.from_pretrained(
                 model_path,
-                **export_kwargs,
+                export=True,
+                batch_size=NEURON_BATCH_SIZE,
+                sequence_length=NEURON_SEQUENCE_LENGTH,
             )
-        except ImportError:
-            # Fallback to legacy import
-            from optimum.neuron import NeuronSentenceTransformers
+        else:
             model = NeuronSentenceTransformers.from_pretrained(model_path)
 
-        super().__init__(model, model_path, device, dtype)
         self.pool = pool
-        logger.info(f"Loaded NeuronSentenceTransformersModel with pool={pool}")
+        super().__init__(model=model, dtype=dtype, device=device)
+        logger.info(f"Loaded NeuronSentenceTransformersModel with pool={pool}, hidden_size={self.hidden_size}")
 
     @staticmethod
     def _is_neuron_compiled(model_path: Path) -> bool:
@@ -129,37 +143,67 @@ def _is_neuron_compiled(model_path: Path) -> bool:
         neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else []
         return len(neuron_files) > 0
 
+    @property
+    def batch_type(self) -> Type[PaddedBatch]:
+        return PaddedBatch
+
     @tracer.start_as_current_span("embed")
     def embed(self, batch: PaddedBatch) -> List[Embedding]:
-        kwargs = self._prepare_inputs(batch)
-        output = self.model(**kwargs)
+        # Prepare inputs
+        input_ids = batch.input_ids.to(torch.long)
+        attention_mask = batch.attention_mask.to(torch.long)
+
+        # NeuronSentenceTransformers forward pass expects positional arguments
+        output = self.model(input_ids, attention_mask)
 
+        # Get sentence embeddings from output
         sentence_embedding = None
-        # NeuronModelForSentenceTransformers returns sentence_embedding directly
-        if hasattr(output, "sentence_embedding") and output.sentence_embedding is not None:
-            candidate = output.sentence_embedding
-            if candidate.abs().sum() > 0:
-                sentence_embedding = candidate
-        
-        # If sentence_embedding is invalid, fall back to manual pooling of token_embeddings
-        if sentence_embedding is None:
-            # Get token embeddings
-            if hasattr(output, "token_embeddings") and output.token_embeddings is not None:
-                token_embeddings = output.token_embeddings
+        if isinstance(output, dict):
+            # Check if sentence_embedding exists and has non-zero values
+            # NeuronSentenceTransformers may return zeros for sentence_embedding when pooling fails
+            has_valid_sentence_embedding = (
+                "sentence_embedding" in output
+                and output["sentence_embedding"] is not None
+                and output["sentence_embedding"].abs().sum() > 0
+            )
+            if has_valid_sentence_embedding:
+                sentence_embedding = output["sentence_embedding"]
+            elif "token_embeddings" in output and output["token_embeddings"] is not None:
+                # Apply manual pooling when sentence_embedding is not valid
+                logger.debug(f"Using token_embeddings with manual {self.pool} pooling")
+                token_embeddings = output["token_embeddings"]
+
+                if self.pool == "cls":
+                    sentence_embedding = token_embeddings[:, 0, :]
+                elif self.pool == "mean":
+                    mask = attention_mask.unsqueeze(-1).float()
+                    sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)
+                elif self.pool == "last_token":
+                    seq_lengths = attention_mask.sum(dim=1) - 1
+                    sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths]
+                else:
+                    raise ValueError(f"Invalid pooling mode: {self.pool}")
             else:
-                raise ValueError(f"Cannot extract embeddings from model output: {type(output)}")
-
-            # Apply pooling based on self.pool setting
+                raise ValueError(f"Cannot extract embeddings from model output dict: {output.keys()}")
+        elif hasattr(output, "sentence_embedding") and output.sentence_embedding is not None:
+            sentence_embedding = output.sentence_embedding
+        elif hasattr(output, "token_embeddings") and output.token_embeddings is not None:
+            token_embeddings = output.token_embeddings
             if self.pool == "cls":
                 sentence_embedding = token_embeddings[:, 0, :]
             elif self.pool == "mean":
-                attention_mask = kwargs["attention_mask"].unsqueeze(-1).float()
-                sentence_embedding = (token_embeddings * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)
+                mask = attention_mask.unsqueeze(-1).float()
+                sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)
             elif self.pool == "last_token":
-                seq_lengths = kwargs["attention_mask"].sum(dim=1) - 1
+                seq_lengths = attention_mask.sum(dim=1) - 1
                 sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths]
             else:
                 raise ValueError(f"Invalid pooling mode: {self.pool}")
+        elif torch.is_tensor(output):
+            # Assume output is the sentence embedding tensor directly
+            sentence_embedding = output
+        else:
+            raise ValueError(f"Cannot extract embeddings from model output: type={type(output)}")
 
         # Convert to list format expected by the gRPC interface
         cpu_results = sentence_embedding.view(-1).tolist()
@@ -176,77 +220,6 @@ def predict(self, batch: PaddedBatch) -> List[Score]:
         raise NotImplementedError("Prediction not supported for sentence transformer models")
 
 
-class NeuronEmbeddingModel(NeuronBaseModel):
-    """
-    Neuron-optimized model for feature extraction / embeddings.
-
-    Uses optimum.neuron.NeuronModelForFeatureExtraction for models that
-    output hidden states which need to be pooled.
-    """
-
-    def __init__(
-        self,
-        model_path: Path,
-        device: torch.device,
-        dtype: torch.dtype,
-        pool: str = "cls",
-        trust_remote: bool = False,
-    ):
-        from optimum.neuron import NeuronModelForFeatureExtraction
-
-        is_compiled = self._is_neuron_compiled(model_path)
-        export_kwargs = {}
-        if not is_compiled:
-            export_kwargs = {
-                "export": True,
-                "batch_size": NEURON_BATCH_SIZE,
-                "sequence_length": NEURON_SEQUENCE_LENGTH,
-            }
-            logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}")
-        model = NeuronModelForFeatureExtraction.from_pretrained(
-            model_path,
-            **export_kwargs,
-        )
-
-        logger.info(f"DEBUG: model type = {type(model)}")
-
-        super().__init__(model, model_path, device, dtype)
-        self.pool = pool
-
-        # Initialize pooling layer
-        from text_embeddings_server.models.pooling import DefaultPooling
-        self.pooling = DefaultPooling(self.hidden_size, pooling_mode=pool)
-
-        logger.info(f"Loaded NeuronEmbeddingModel with pool={pool}")
-
-    @staticmethod
-    def _is_neuron_compiled(model_path: Path) -> bool:
-        """Check if the model is already compiled for Neuron."""
-        neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else []
-        return len(neuron_files) > 0
-
-    @tracer.start_as_current_span("embed")
-    def embed(self, batch: PaddedBatch) -> List[Embedding]:
-        kwargs = self._prepare_inputs(batch)
-        output = self.model(**kwargs)
-
-        # Apply pooling to get sentence embeddings
-        embedding = self.pooling.forward(output, batch.attention_mask)
-
-        cpu_results = embedding.view(-1).tolist()
-
-        return [
-            Embedding(
-                values=cpu_results[i * self.hidden_size : (i + 1) * self.hidden_size]
-            )
-            for i in range(len(batch))
-        ]
-
-    @tracer.start_as_current_span("predict")
-    def predict(self, batch: PaddedBatch) -> List[Score]:
-        raise NotImplementedError("Prediction not supported for embedding models")
-
-
 class NeuronClassificationModel(NeuronBaseModel):
     """
     Neuron-optimized model for sequence classification.
@@ -420,24 +393,5 @@ def create_neuron_model(
     if pool == "splade" or architecture.endswith("ForMaskedLM"):
         return NeuronMaskedLMModel(model_path, device, dtype, pool, trust_remote)
 
-    # Check for sentence-transformers models
-    # These typically have specific config attributes or are in specific repositories
-    is_sentence_transformer = (
-        hasattr(config, "sentence_transformers_config") or
-        hasattr(config, "_name_or_path") and "sentence-transformers" in str(config._name_or_path).lower() or
-        hasattr(config, "pooling_mode") or
-        (model_path / "sentence_bert_config.json").exists() if model_path.is_dir() else False
-    )
-
-    if is_sentence_transformer:
-        try:
-            return NeuronSentenceTransformersModel(model_path, device, dtype, pool, trust_remote)
-        except Exception as e:
-            logger.warning(f"Failed to load as SentenceTransformer, falling back to FeatureExtraction: {e}")
-
-    # Default to feature extraction model
-    try:
-        return NeuronEmbeddingModel(model_path, device, dtype, pool, trust_remote)
-    except Exception as e:
-        logger.warning(f"Failed to load NeuronEmbeddingModel, trying NeuronSentenceTransformersModel: {e}")
-        return NeuronSentenceTransformersModel(model_path, device, dtype, pool, trust_remote)
+    # Default to NeuronSentenceTransformers for all embedding models
+    return NeuronSentenceTransformersModel(model_path, device, dtype, pool, trust_remote)
diff --git a/docs/source/en/ aws_neuron.md b/docs/source/en/ aws_neuron.md
deleted file mode 100644
index d383fdba8..000000000
--- a/docs/source/en/ aws_neuron.md	
+++ /dev/null
@@ -1,37 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
--->
-# Using TEI Container with AWS Trainium and Inferentia Instances
-
-## Build Docker Image
-
-To build a container optimized for AWS Neuron devices, run the following command:
-
-```shell
-platform="neuron"
-
-docker build . -f Dockerfile-neuron -t tei-neuron:main
-```
-
-### Deploy Docker Container
-
-To deploy your model on an AWS Trainium or Inferentia instance, use the following command:
-
-```shell
-model='optimum/bge-base-en-v1.5-neuronx'
-volume=$PWD/data
-
-docker run -p 8080:80 -v $volume:/data tei-neuron:main --model-id $model --dtype float32
-```
\ No newline at end of file
diff --git a/docs/source/en/aws_neuron.md b/docs/source/en/aws_neuron.md
new file mode 100644
index 000000000..d4d056141
--- /dev/null
+++ b/docs/source/en/aws_neuron.md
@@ -0,0 +1,105 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+# Using TEI with AWS Trainium and Inferentia
+
+Text Embeddings Inference (TEI) supports AWS Trainium and Inferentia accelerators through the [optimum-neuron](https://huggingface.co/docs/optimum-neuron) library. This integration uses `NeuronSentenceTransformers` to run embedding models efficiently on AWS Neuron devices.
+
+## Supported Model Types
+
+- **Embedding models**: Uses `NeuronSentenceTransformers` for sentence embeddings (e.g., BGE, sentence-transformers models)
+- **Classification models**: Uses `NeuronModelForSequenceClassification` for sequence classification tasks
+- **SPLADE models**: Uses `NeuronModelForMaskedLM` for sparse embeddings
+
+## Build Docker Image
+
+To build a container optimized for AWS Neuron devices:
+
+```shell
+docker build . -f Dockerfile-neuron -t tei-neuron:main
+```
+
+## Deploy with Pre-compiled Models
+
+Pre-compiled models are recommended for production use as they skip the compilation step and start faster.
+
+```shell
+model='optimum/bge-base-en-v1.5-neuronx'
+volume=$PWD/data
+
+docker run --privileged \
+    -p 8080:80 \
+    -v $volume:/data \
+    tei-neuron:main \
+    --model-id $model \
+    --dtype float32
+```
+
+> **Note**: The `--privileged` flag is required for the Neuron OCI hook to work properly.
+
+## Deploy with On-the-fly Compilation
+
+You can also use models that have not been pre-compiled. TEI will compile the model for Neuron automatically on first load. This takes additional time but allows you to use any compatible model.
+
+```shell
+model='BAAI/bge-base-en-v1.5'
+volume=$PWD/data
+
+docker run --privileged \
+    -p 8080:80 \
+    -v $volume:/data \
+    -e NEURON_BATCH_SIZE=1 \
+    -e NEURON_SEQUENCE_LENGTH=512 \
+    tei-neuron:main \
+    --model-id $model \
+    --dtype float32
+```
+
+### Compilation Environment Variables
+
+When using on-the-fly compilation, you can configure the following environment variables:
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `NEURON_BATCH_SIZE` | 1 | Batch size for Neuron compilation (static shape) |
+| `NEURON_SEQUENCE_LENGTH` | 512 | Maximum sequence length for Neuron compilation (static shape) |
+
+> **Note**: Neuron requires static shapes for compilation. The batch size and sequence length are fixed at compilation time.
+
+## Runtime Environment Variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `NEURON_RT_NUM_CORES` | 1 | Number of Neuron cores to use |
+| `NEURON_RT_VISIBLE_CORES` | 0 | Which Neuron cores are visible to the runtime |
+
+## Pre-compiled Models
+
+For faster startup, use pre-compiled Neuron models from the Hugging Face Hub:
+
+- [optimum/bge-base-en-v1.5-neuronx](https://huggingface.co/optimum/bge-base-en-v1.5-neuronx)
+
+You can also compile your own models using the [Optimum Neuron guide](https://huggingface.co/docs/optimum-neuron/en/model_doc/sentence_transformers/overview).
+
+## Testing Your Deployment
+
+Once the container is running, you can test the embedding endpoint:
+
+```shell
+curl 127.0.0.1:8080/embed \
+    -X POST \
+    -H 'Content-Type: application/json' \
+    -d '{"inputs": "What is Deep Learning?"}'
+```
diff --git a/integration_tests/neuron/test_embed.py b/integration_tests/neuron/test_embed.py
index 03da9d494..171c19fee 100644
--- a/integration_tests/neuron/test_embed.py
+++ b/integration_tests/neuron/test_embed.py
@@ -8,41 +8,41 @@
 
 # Test configurations for Neuron backend
 TEST_CONFIGS = {
-    # # On-the-fly Neuron compilation
-    # "sentence-transformers/all-MiniLM-L6-v2": {
-    #     "model_id": "sentence-transformers/all-MiniLM-L6-v2",
-    #     "input": "What is Deep Learning?",
-    #     "batch_inputs": [
-    #         "What is Deep Learning?",
-    #         "How does machine learning work?",
-    #         "Tell me about neural networks.",
-    #     ],
-    #     "expected_output_prefix": None,
-    #     "args": [
-    #         "--dtype", "float32",
-    #         "--max-batch-requests", "1",
-    #     ],
-    #     "env_config": {
-    #         "MAX_WARMUP_SEQUENCE_LENGTH": "512",
-    #     },
-    # },
-    # "BAAI/bge-base-en-v1.5": {
-    #     "model_id": "BAAI/bge-base-en-v1.5",
-    #     "input": "What is Deep Learning?",
-    #     "batch_inputs": [
-    #         "What is Deep Learning?",
-    #         "How does machine learning work?",
-    #         "Tell me about neural networks.",
-    #     ],
-    #     "expected_output_prefix": None,
-    #     "args": [
-    #         "--dtype", "float32",
-    #         "--max-batch-requests", "1",
-    #     ],
-    #     "env_config": {
-    #         "MAX_WARMUP_SEQUENCE_LENGTH": "512",
-    #     },
-    # },
+    # On-the-fly Neuron compilation
+    "sentence-transformers/all-MiniLM-L6-v2": {
+        "model_id": "sentence-transformers/all-MiniLM-L6-v2",
+        "input": "What is Deep Learning?",
+        "batch_inputs": [
+            "What is Deep Learning?",
+            "How does machine learning work?",
+            "Tell me about neural networks.",
+        ],
+        "expected_output_prefix": None,
+        "args": [
+            "--dtype", "float32",
+            "--max-batch-requests", "1",
+        ],
+        "env_config": {
+            "MAX_WARMUP_SEQUENCE_LENGTH": "512",
+        },
+    },
+    "BAAI/bge-base-en-v1.5": {
+        "model_id": "BAAI/bge-base-en-v1.5",
+        "input": "What is Deep Learning?",
+        "batch_inputs": [
+            "What is Deep Learning?",
+            "How does machine learning work?",
+            "Tell me about neural networks.",
+        ],
+        "expected_output_prefix": None,
+        "args": [
+            "--dtype", "float32",
+            "--max-batch-requests", "1",
+        ],
+        "env_config": {
+            "MAX_WARMUP_SEQUENCE_LENGTH": "512",
+        },
+    },
     # Pre-compiled Neuron model
     "optimum/bge-base-en-v1.5-neuronx": {
         "model_id": "optimum/bge-base-en-v1.5-neuronx",

From b80356699f40b86c59b3f16e561fdb9bd903301f Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Thu, 5 Feb 2026 10:51:28 +0000
Subject: [PATCH 11/20] smol fix

---
 backends/src/lib.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backends/src/lib.rs b/backends/src/lib.rs
index 3d7e083af..e0ad5b4f8 100644
--- a/backends/src/lib.rs
+++ b/backends/src/lib.rs
@@ -442,7 +442,7 @@ async fn init_backend(
                         https://huggingface.co/docs/optimum-neuron/en/model_doc/sentence_transformers/overview "
                     );
                     // Fall back to downloading regular model files for on-the-fly compilation
-                    if download_safetensors(api_repo).await.is_err() {
+                    if download_safetensors(api_repo.clone()).await.is_err() {
                         tracing::warn!(
                             "safetensors weights not found. Using `pytorch_model.bin` instead."
                         );
@@ -456,7 +456,7 @@ async fn init_backend(
 
                 tracing::info!("Neuron model downloaded in {:?}", start.elapsed());
             } else {
-                if download_safetensors(api_repo).await.is_err() {
+                if download_safetensors(api_repo.clone()).await.is_err() {
                     tracing::warn!(
                         "safetensors weights not found. Using `pytorch_model.bin` instead. \
                         Model loading will be significantly slower."

From 81c57d35f3507a9f5e243f8ab69f073ce3a42fc4 Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Thu, 5 Feb 2026 10:56:41 +0000
Subject: [PATCH 12/20] fix doc index

---
 docs/source/en/_toctree.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index b9eebac2c..69ace4e17 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -19,8 +19,8 @@
     title: Build custom container for TEI
   - local: intel_container
     title: Using TEI container with Intel Hardware
-  - local: local_neuron
-    title: Using TEI container with AWS Neuron
+  - local: aws_neuron
+    title: Using TEI with AWS Trainium and Inferentia
   - local: examples
     title: Example uses
   title: Tutorials

From 7f517b996431d6e8c501d6e1d1b9ebfb4ee4ae2d Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Thu, 5 Feb 2026 11:14:23 +0000
Subject: [PATCH 13/20] fix style

---
 backends/src/lib.rs                    | 4 +---
 integration_tests/neuron/test_embed.py | 1 -
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/backends/src/lib.rs b/backends/src/lib.rs
index e0ad5b4f8..3471c344c 100644
--- a/backends/src/lib.rs
+++ b/backends/src/lib.rs
@@ -68,9 +68,7 @@ fn is_hpu() -> bool {
 }
 
 fn is_neuron() -> bool {
-    match Command::new("neuron-ls")
-        .output()
-    {
+    match Command::new("neuron-ls").output() {
         Ok(output) => output.status.success(),
         Err(_) => false,
     }
diff --git a/integration_tests/neuron/test_embed.py b/integration_tests/neuron/test_embed.py
index 171c19fee..4ca4aadb9 100644
--- a/integration_tests/neuron/test_embed.py
+++ b/integration_tests/neuron/test_embed.py
@@ -183,4 +183,3 @@ async def test_model_embedding_consistency(tei_client, input_text: str):
     # Embeddings for the same input should be identical (or very close)
     assert np.allclose(array1, array2, rtol=1e-4, atol=1e-4), \
         "Same input should produce consistent embeddings"
-

From 975299802c7ee3b657f7c108b194825cab5407bc Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Thu, 5 Feb 2026 11:18:28 +0000
Subject: [PATCH 14/20] build and push neuron docker images in CI

---
 .github/workflows/build.yaml  | 1 +
 .github/workflows/matrix.json | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 52352c7f7..3b9032614 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -18,6 +18,7 @@ on:
       - "Cargo.lock"
       - "rust-toolchain.toml"
       - "Dockerfile"
+      - "Dockerfile-neuron"
     branches:
       - 'main'
 
diff --git a/.github/workflows/matrix.json b/.github/workflows/matrix.json
index a7f6660b7..92430a4a2 100644
--- a/.github/workflows/matrix.json
+++ b/.github/workflows/matrix.json
@@ -87,5 +87,13 @@
     "extraBuildArgs": "PLATFORM=hpu",
     "grpc": true,
     "dockerfile": "Dockerfile-intel"
+  },
+  {
+    "name": "neuron",
+    "imageNamePrefix": "neuron-",
+    "runOn": "always",
+    "sccache": true,
+    "grpc": true,
+    "dockerfile": "Dockerfile-neuron"
   }
 ]

From c517aa227582ba571e21ada9a2d5fcaf66a9f1a5 Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Thu, 5 Feb 2026 13:40:25 +0000
Subject: [PATCH 15/20] smol changes

---
 .../models/neuron_models.py                   |  2 -
 docs/source/en/aws_neuron.md                  |  4 +-
 integration_tests/README.md                   | 79 +------------------
 3 files changed, 3 insertions(+), 82 deletions(-)

diff --git a/backends/python/server/text_embeddings_server/models/neuron_models.py b/backends/python/server/text_embeddings_server/models/neuron_models.py
index f95c2b3c5..80745edc8 100644
--- a/backends/python/server/text_embeddings_server/models/neuron_models.py
+++ b/backends/python/server/text_embeddings_server/models/neuron_models.py
@@ -106,7 +106,6 @@ def __init__(
         config = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote)
         self.hidden_size = config.hidden_size
 
-        # Calculate max input length
         position_offset = 0
         model_type = config.model_type
         if model_type in ["xlm-roberta", "camembert", "roberta"]:
@@ -153,7 +152,6 @@ def embed(self, batch: PaddedBatch) -> List[Embedding]:
         input_ids = batch.input_ids.to(torch.long)
         attention_mask = batch.attention_mask.to(torch.long)
 
-        # NeuronSentenceTransformers forward pass expects positional arguments
         output = self.model(input_ids, attention_mask)
 
         # Get sentence embeddings from output
diff --git a/docs/source/en/aws_neuron.md b/docs/source/en/aws_neuron.md
index d4d056141..2d02999a6 100644
--- a/docs/source/en/aws_neuron.md
+++ b/docs/source/en/aws_neuron.md
@@ -15,7 +15,7 @@ rendered properly in your Markdown viewer.
 -->
 # Using TEI with AWS Trainium and Inferentia
 
-Text Embeddings Inference (TEI) supports AWS Trainium and Inferentia accelerators through the [optimum-neuron](https://huggingface.co/docs/optimum-neuron) library. This integration uses `NeuronSentenceTransformers` to run embedding models efficiently on AWS Neuron devices.
+Text Embeddings Inference (TEI) supports AWS Trainium and Inferentia accelerators through the [optimum-neuron](https://huggingface.co/docs/optimum-neuron) library.
 
 ## Supported Model Types
 
@@ -87,7 +87,7 @@ When using on-the-fly compilation, you can configure the following environment v
 
 ## Pre-compiled Models
 
-For faster startup, use pre-compiled Neuron models from the Hugging Face Hub:
+For faster startup, use pre-compiled Neuron models from the Hugging Face Hub, such as:
 
 - [optimum/bge-base-en-v1.5-neuronx](https://huggingface.co/optimum/bge-base-en-v1.5-neuronx)
 
diff --git a/integration_tests/README.md b/integration_tests/README.md
index 18b9232ad..69679a95e 100644
--- a/integration_tests/README.md
+++ b/integration_tests/README.md
@@ -27,20 +27,11 @@ cd integration_tests/gaudi
 uv run pytest --durations=0 -sv .
 ```
 
-### Environment Variables (HPU)
-
-| Variable | Description | Default |
-|----------|-------------|---------|
-| `DOCKER_IMAGE` | Docker image to use | `tei_hpu` |
-| `DOCKER_VOLUME` | Volume for model cache (recommended) | None |
-| `HF_TOKEN` | HuggingFace token for gated models | None |
-| `LOG_LEVEL` | Server log level | `info` |
-
 ## Running the tests for Neuron (AWS Inferentia/Trainium)
 
 ### Prerequisites
 
-1. **AWS Neuron instance**: Tests must run on an EC2 instance with Neuron devices (inf1, inf2, or trn1)
+1. **AWS Neuron instance**: Tests must run on an EC2 instance with Neuron devices (inf2, trn1 or trn2)
 2. **Neuron drivers**: Ensure Neuron drivers are installed and `/dev/neuron*` devices are available
 3. **Pre-compiled models**: Neuron requires models to be pre-compiled to `.neuron` format
 
@@ -57,71 +48,3 @@ cd integration_tests/neuron
 uv run pytest --durations=0 -sv .
 ```
 
-### Environment Variables (Neuron)
-
-| Variable | Description | Default |
-|----------|-------------|---------|
-| `DOCKER_IMAGE` | Docker image to use | `tei-neuron` |
-| `DOCKER_VOLUME` | Volume for model cache (recommended) | None |
-| `HF_TOKEN` | HuggingFace token for gated models | None |
-| `LOG_LEVEL` | Server log level | `info` |
-| `NEURON_RT_NUM_CORES` | Number of Neuron cores to use | `1` |
-| `NEURON_RT_VISIBLE_CORES` | Which Neuron cores are visible | `0` |
-
-### Using Pre-compiled Neuron Models
-
-Neuron models must be pre-compiled before use. You have two options:
-
-1. **Use models with pre-compiled Neuron artifacts**: Some models on HuggingFace Hub have `.neuron` files available
-
-2. **Compile models yourself**: Follow the [Optimum Neuron guide](https://huggingface.co/docs/optimum-neuron/en/model_doc/sentence_transformers/overview) to compile your models
-
-Example compilation:
-```python
-from optimum.neuron import NeuronModelForSentenceTransformers
-
-# Compile and save
-model = NeuronModelForSentenceTransformers.from_pretrained(
-    "sentence-transformers/all-MiniLM-L6-v2",
-    export=True,
-    batch_size=1,
-    sequence_length=512,
-)
-model.save_pretrained("./all-MiniLM-L6-v2-neuron")
-model.push_to_hub("your-username/all-MiniLM-L6-v2-neuron")
-```
-
-### Troubleshooting Neuron Tests
-
-**Container exits immediately**:
-- Check if Neuron devices are available: `ls /dev/neuron*`
-- Check container logs for "Neuron model files not found" - model needs compilation
-- Ensure the Docker image was built with Neuron support
-
-**Long startup times**:
-- Neuron models may take several minutes to load due to compilation
-- The test timeout is set to 600 seconds (10 minutes) by default
-
-**Permission errors**:
-- Ensure Docker has access to Neuron devices
-- The tests add `IPC_LOCK` capability and mount `/dev/neuron*` devices
-
-## Adding New Test Models
-
-To add a new model to test, update the `TEST_CONFIGS` dictionary in `test_embed.py`:
-
-```python
-TEST_CONFIGS = {
-    "your-model/name": {
-        "model_id": "your-model/name",
-        "input": "Test input text",
-        "batch_inputs": ["Text 1", "Text 2"],
-        "args": ["--dtype", "float32"],
-        "env_config": {
-            "MAX_WARMUP_SEQUENCE_LENGTH": "512",
-        },
-    },
-}
-```
-
-For Habana tests, you can also add `expected_output` to validate exact embedding values.

From 533d8538fceb3cfd92c84b1344049a266fc51c15 Mon Sep 17 00:00:00 2001
From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com>
Date: Fri, 20 Feb 2026 16:02:20 +0100
Subject: [PATCH 16/20] Update Dockerfile-neuron

Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
---
 Dockerfile-neuron | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile-neuron b/Dockerfile-neuron
index 741084c8b..044bbf596 100644
--- a/Dockerfile-neuron
+++ b/Dockerfile-neuron
@@ -1,4 +1,4 @@
-FROM lukemathwalker/cargo-chef:latest-rust-1.85-bookworm AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.92-bookworm AS chef
 WORKDIR /usr/src
 
 ENV SCCACHE=0.10.0

From 0829b6f239e429199873e618fca36797ee8aa2cf Mon Sep 17 00:00:00 2001
From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com>
Date: Mon, 23 Feb 2026 16:31:54 +0100
Subject: [PATCH 17/20] Apply suggestions from code review

Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
---
 Dockerfile-neuron | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Dockerfile-neuron b/Dockerfile-neuron
index 044bbf596..be2427140 100644
--- a/Dockerfile-neuron
+++ b/Dockerfile-neuron
@@ -174,7 +174,6 @@ RUN pip install --no-cache-dir -U \
     peft==${PEFT_VERSION} \
  && rm -rf ~/.cache/pip/*
 
-
 FROM neuron AS grpc
 
 COPY --from=grpc-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router
@@ -182,7 +181,7 @@ COPY --from=grpc-builder /usr/src/target/release/text-embeddings-router /usr/loc
 ENTRYPOINT ["text-embeddings-router"]
 CMD ["--json-output"]
 
-FROM neuron
+FROM neuron AS http
 
 COPY --from=http-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router
 

From 1464cc3a8cd84fa91cf9efa35d426e4e5c05e15e Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Mon, 23 Feb 2026 16:05:15 +0000
Subject: [PATCH 18/20] review:suggestions

---
 Dockerfile-neuron                             |  6 +--
 backends/Cargo.toml                           |  1 +
 .../text_embeddings_server/models/__init__.py | 24 +--------
 .../models/habana/__init__.py                 | 14 +++++
 .../{neuron_models.py => neuron/__init__.py}  |  0
 backends/src/dtype.rs                         | 25 ++++++---
 backends/src/lib.rs                           | 51 +++++++++----------
 integration_tests/README.md                   |  1 -
 router/Cargo.toml                             |  1 +
 9 files changed, 63 insertions(+), 60 deletions(-)
 create mode 100644 backends/python/server/text_embeddings_server/models/habana/__init__.py
 rename backends/python/server/text_embeddings_server/models/{neuron_models.py => neuron/__init__.py} (100%)

diff --git a/Dockerfile-neuron b/Dockerfile-neuron
index 044bbf596..6900b72f8 100644
--- a/Dockerfile-neuron
+++ b/Dockerfile-neuron
@@ -30,7 +30,7 @@ COPY --from=planner /usr/src/recipe.json recipe.json
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    cargo chef cook --release --features python --no-default-features --recipe-path recipe.json && sccache -s
+    cargo chef cook --release --features python-neuron --no-default-features --recipe-path recipe.json && sccache -s
 
 COPY backends backends
 COPY core core
@@ -48,7 +48,7 @@ FROM builder AS http-builder
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    cargo build --release --bin text-embeddings-router -F python -F http --no-default-features && sccache -s
+    cargo build --release --bin text-embeddings-router -F python-neuron -F http --no-default-features && sccache -s
 
 FROM builder AS grpc-builder
 
@@ -56,7 +56,7 @@ COPY proto proto
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    cargo build --release --bin text-embeddings-router -F grpc -F python --no-default-features && sccache -s
+    cargo build --release --bin text-embeddings-router -F grpc -F python-neuron --no-default-features && sccache -s
 
 FROM public.ecr.aws/docker/library/ubuntu:22.04 AS neuron
 
diff --git a/backends/Cargo.toml b/backends/Cargo.toml
index bb9d74191..fd0ab74ae 100644
--- a/backends/Cargo.toml
+++ b/backends/Cargo.toml
@@ -21,6 +21,7 @@ rand = { workspace = true }
 [features]
 clap = ["dep:clap", "text-embeddings-backend-core/clap"]
 python = ["dep:text-embeddings-backend-python"]
+python-neuron = ["dep:text-embeddings-backend-python"]
 ort = ["dep:text-embeddings-backend-ort"]
 candle = ["dep:text-embeddings-backend-candle"]
 cuda = ["text-embeddings-backend-candle?/cuda"]
diff --git a/backends/python/server/text_embeddings_server/models/__init__.py b/backends/python/server/text_embeddings_server/models/__init__.py
index 8a48510d6..8845163eb 100644
--- a/backends/python/server/text_embeddings_server/models/__init__.py
+++ b/backends/python/server/text_embeddings_server/models/__init__.py
@@ -11,16 +11,13 @@
 from text_embeddings_server.models.masked_model import MaskedLanguageModel
 from text_embeddings_server.models.default_model import DefaultModel
 from text_embeddings_server.models.classification_model import ClassificationModel
+from text_embeddings_server.models.habana import wrap_model_if_hpu
 
 from text_embeddings_server.utils.device import get_device, use_ipex, is_neuron
 
 __all__ = ["Model"]
 
 TRUST_REMOTE_CODE = os.getenv("TRUST_REMOTE_CODE", "false").lower() in ["true", "1"]
-DISABLE_TENSOR_CACHE = os.getenv("DISABLE_TENSOR_CACHE", "false").lower() in [
-    "true",
-    "1",
-]
 
 # Flash Attention models - only available when flash_attn is installed
 FLASH_ATTENTION = True
@@ -44,34 +41,17 @@
     __all__.append(FlashBert)
 
 # Neuron models - only import when on Neuron device to avoid unnecessary dependencies
-NeuronSentenceTransformersModel = None
-NeuronClassificationModel = None
-NeuronMaskedLMModel = None
 create_neuron_model = None
 
 if is_neuron():
     try:
-        from text_embeddings_server.models.neuron_models import (
-            NeuronSentenceTransformersModel,
-            NeuronClassificationModel,
-            NeuronMaskedLMModel,
+        from text_embeddings_server.models.neuron import (
             create_neuron_model,
         )
     except ImportError as e:
         logger.warning(f"Could not import Neuron models: {e}")
 
 
-def wrap_model_if_hpu(model_handle, device):
-    """Wrap the model in HPU graph if the device is HPU."""
-    if device.type == "hpu":
-        from habana_frameworks.torch.hpu import wrap_in_hpu_graph
-
-        model_handle.model = wrap_in_hpu_graph(
-            model_handle.model, disable_tensor_cache=DISABLE_TENSOR_CACHE
-        )
-    return model_handle
-
-
 def create_model(model_class, model_path, device, datatype, pool="cls"):
     """Create a model instance and wrap it if needed."""
     model_handle = model_class(
diff --git a/backends/python/server/text_embeddings_server/models/habana/__init__.py b/backends/python/server/text_embeddings_server/models/habana/__init__.py
new file mode 100644
index 000000000..267830de1
--- /dev/null
+++ b/backends/python/server/text_embeddings_server/models/habana/__init__.py
@@ -0,0 +1,14 @@
+import os
+
+DISABLE_TENSOR_CACHE = os.getenv("DISABLE_TENSOR_CACHE", "false").lower() in ["true", "1"]
+
+
+def wrap_model_if_hpu(model_handle, device):
+    """Wrap the model in HPU graph if the device is HPU."""
+    if device.type == "hpu":
+        from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+
+        model_handle.model = wrap_in_hpu_graph(
+            model_handle.model, disable_tensor_cache=DISABLE_TENSOR_CACHE
+        )
+    return model_handle
diff --git a/backends/python/server/text_embeddings_server/models/neuron_models.py b/backends/python/server/text_embeddings_server/models/neuron/__init__.py
similarity index 100%
rename from backends/python/server/text_embeddings_server/models/neuron_models.py
rename to backends/python/server/text_embeddings_server/models/neuron/__init__.py
diff --git a/backends/src/dtype.rs b/backends/src/dtype.rs
index 80292be79..ef16ca556 100644
--- a/backends/src/dtype.rs
+++ b/backends/src/dtype.rs
@@ -9,12 +9,18 @@ pub enum DType {
     // Float16 is not available on accelerate
     #[cfg(any(
         feature = "python",
+        feature = "python-neuron",
         all(feature = "candle", not(feature = "accelerate"))
     ))]
     Float16,
-    #[cfg(any(feature = "python", feature = "candle", feature = "ort"))]
+    #[cfg(any(
+        feature = "python",
+        feature = "python-neuron",
+        feature = "candle",
+        feature = "ort"
+    ))]
     Float32,
-    #[cfg(feature = "python")]
+    #[cfg(any(feature = "python", feature = "python-neuron"))]
     Bfloat16,
 }
 
@@ -24,12 +30,18 @@ impl fmt::Display for DType {
             // Float16 is not available on accelerate
             #[cfg(any(
                 feature = "python",
+                feature = "python-neuron",
                 all(feature = "candle", not(feature = "accelerate"))
             ))]
             DType::Float16 => write!(f, "float16"),
-            #[cfg(any(feature = "python", feature = "candle", feature = "ort"))]
+            #[cfg(any(
+                feature = "python",
+                feature = "python-neuron",
+                feature = "candle",
+                feature = "ort"
+            ))]
             DType::Float32 => write!(f, "float32"),
-            #[cfg(feature = "python")]
+            #[cfg(any(feature = "python", feature = "python-neuron"))]
             DType::Bfloat16 => write!(f, "bfloat16"),
         }
     }
@@ -46,12 +58,13 @@ impl Default for DType {
             feature = "accelerate",
             feature = "mkl",
             feature = "ort",
-            feature = "python"
+            feature = "python",
+            feature = "python-neuron"
         )))]
         {
             DType::Float16
         }
-        #[cfg(feature = "python")]
+        #[cfg(any(feature = "python", feature = "python-neuron"))]
         {
             DType::Bfloat16
         }
diff --git a/backends/src/lib.rs b/backends/src/lib.rs
index c6a5e3a27..8f9ee2838 100644
--- a/backends/src/lib.rs
+++ b/backends/src/lib.rs
@@ -28,7 +28,7 @@ use text_embeddings_backend_candle::CandleBackend;
 #[cfg(feature = "ort")]
 use text_embeddings_backend_ort::OrtBackend;
 
-#[cfg(feature = "python")]
+#[cfg(any(feature = "python", feature = "python-neuron"))]
 use text_embeddings_backend_python::PythonBackend;
 
 fn powers_of_two(max_value: usize) -> Vec<usize> {
@@ -68,13 +68,6 @@ fn is_hpu() -> bool {
     }
 }
 
-fn is_neuron() -> bool {
-    match Command::new("neuron-ls").output() {
-        Ok(output) => output.status.success(),
-        Err(_) => false,
-    }
-}
-
 #[derive(Debug, Clone)]
 pub struct Backend {
     /// Channel to communicate with the background thread
@@ -423,9 +416,10 @@ async fn init_backend(
     }
 
     if let Some(api_repo) = api_repo.as_ref() {
-        if cfg!(feature = "python") || cfg!(feature = "candle") {
-            let start = std::time::Instant::now();
-            if is_neuron() {
+        let start = std::time::Instant::now();
+        if cfg!(feature = "python-neuron") {
+            #[cfg(feature = "python-neuron")]
+            {
                 tracing::info!("Downloading `model.neuron`");
                 let model_files = download_neuron(api_repo)
                     .await
@@ -436,7 +430,7 @@ async fn init_backend(
                         "Neuron model files not found in the repository. \
                         The Python backend will attempt to compile the model on-the-fly using optimum-neuron. \
                         This may take several minutes. For faster startup, consider pre-compiling your model: \
-                        https://huggingface.co/docs/optimum-neuron/en/model_doc/sentence_transformers/overview "
+                        https://huggingface.co/docs/optimum-neuron/en/model_doc/sentence_transformers/overview"
                     );
                     // Fall back to downloading regular model files for on-the-fly compilation
                     if download_safetensors(api_repo.clone()).await.is_err() {
@@ -452,21 +446,21 @@ async fn init_backend(
                 }
 
                 tracing::info!("Neuron model downloaded in {:?}", start.elapsed());
-            } else {
-                if download_safetensors(api_repo.clone()).await.is_err() {
-                    tracing::warn!(
-                        "safetensors weights not found. Using `pytorch_model.bin` instead. \
-                        Model loading will be significantly slower."
-                    );
-                    tracing::info!("Downloading `pytorch_model.bin`");
-                    api_repo
-                        .get("pytorch_model.bin")
-                        .await
-                        .map_err(|err| BackendError::WeightsNotFound(err.to_string()))?;
-                }
-
-                tracing::info!("Model weights downloaded in {:?}", start.elapsed());
             }
+        } else if cfg!(feature = "python") || cfg!(feature = "candle") {
+            if download_safetensors(api_repo.clone()).await.is_err() {
+                tracing::warn!(
+                    "safetensors weights not found. Using `pytorch_model.bin` instead. \
+                    Model loading will be significantly slower."
+                );
+                tracing::info!("Downloading `pytorch_model.bin`");
+                api_repo
+                    .get("pytorch_model.bin")
+                    .await
+                    .map_err(|err| BackendError::WeightsNotFound(err.to_string()))?;
+            }
+
+            tracing::info!("Model weights downloaded in {:?}", start.elapsed());
         }
     }
 
@@ -533,8 +527,8 @@ async fn init_backend(
         }
     }
 
-    if cfg!(feature = "python") {
-        #[cfg(feature = "python")]
+    if cfg!(feature = "python") || cfg!(feature = "python-neuron") {
+        #[cfg(any(feature = "python", feature = "python-neuron"))]
         {
             let backend = std::thread::spawn(move || {
                 PythonBackend::new(
@@ -775,6 +769,7 @@ async fn download_onnx(api: Arc<ApiRepo>) -> Result<Vec<PathBuf>, ApiError> {
     }
 }
 
+#[cfg(feature = "python-neuron")]
 async fn download_neuron(api: &ApiRepo) -> Result<Vec<PathBuf>, ApiError> {
     let mut model_files: Vec<PathBuf> = Vec::new();
 
diff --git a/integration_tests/README.md b/integration_tests/README.md
index 69679a95e..ca20fbb9c 100644
--- a/integration_tests/README.md
+++ b/integration_tests/README.md
@@ -47,4 +47,3 @@ docker build . -f Dockerfile-neuron -t tei-neuron
 cd integration_tests/neuron
 uv run pytest --durations=0 -sv .
 ```
-
diff --git a/router/Cargo.toml b/router/Cargo.toml
index 381d611c0..605fa4dc3 100644
--- a/router/Cargo.toml
+++ b/router/Cargo.toml
@@ -86,6 +86,7 @@ metal = ["text-embeddings-backend/metal"]
 mkl = ["text-embeddings-backend/mkl", "dep:intel-mkl-src"]
 accelerate = ["text-embeddings-backend/accelerate"]
 python = ["text-embeddings-backend/python"]
+python-neuron = ["text-embeddings-backend/python-neuron"]
 ort = ["text-embeddings-backend/ort"]
 candle = ["text-embeddings-backend/candle"]
 candle-cuda = ["candle", "text-embeddings-backend/flash-attn", "dep:cudarc"]

From 3b48cbf33b78928769fd5f370c3f005bf85d4e37 Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Fri, 17 Apr 2026 16:45:01 +0000
Subject: [PATCH 19/20] draft:support in TorchNeuron way

---
 Dockerfile-neuron                             |  13 +-
 .../models/neuron/__init__.py                 | 402 ++++++------------
 .../text_embeddings_server/utils/device.py    |   3 +-
 3 files changed, 145 insertions(+), 273 deletions(-)

diff --git a/Dockerfile-neuron b/Dockerfile-neuron
index 112c742b7..b4dceed65 100644
--- a/Dockerfile-neuron
+++ b/Dockerfile-neuron
@@ -150,26 +150,17 @@ RUN pip install --index-url https://pip.repos.neuron.amazonaws.com \
  && rm -rf ~/.cache/pip/*
 
 # HF ARGS
-# Note: optimum-neuron 0.4.4 requires transformers~=4.57.1
-ARG TRANSFORMERS_VERSION=4.57.1
-ARG DIFFUSERS_VERSION=0.35.2
+ARG TRANSFORMERS_VERSION=4.47.0
 ARG HUGGINGFACE_HUB_VERSION=0.36.0
-ARG OPTIMUM_NEURON_VERSION=0.4.4
 ARG SENTENCE_TRANSFORMERS=5.1.2
 ARG PEFT_VERSION=0.17.0
-ARG DATASETS_VERSION=4.1.1
 
 # Install Hugging Face libraries and dependencies for TEI on Neuron
 RUN pip install --no-cache-dir -U \
     networkx==2.8.8 \
-    transformers[sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \
-    diffusers==${DIFFUSERS_VERSION} \
-    compel \
-    controlnet-aux \
+    transformers[sentencepiece]==${TRANSFORMERS_VERSION} \
     huggingface_hub==${HUGGINGFACE_HUB_VERSION} \
     hf_transfer \
-    datasets==${DATASETS_VERSION} \
-    optimum-neuron==${OPTIMUM_NEURON_VERSION} \
     sentence_transformers==${SENTENCE_TRANSFORMERS} \
     peft==${PEFT_VERSION} \
  && rm -rf ~/.cache/pip/*
diff --git a/backends/python/server/text_embeddings_server/models/neuron/__init__.py b/backends/python/server/text_embeddings_server/models/neuron/__init__.py
index 80745edc8..297e05eef 100644
--- a/backends/python/server/text_embeddings_server/models/neuron/__init__.py
+++ b/backends/python/server/text_embeddings_server/models/neuron/__init__.py
@@ -1,95 +1,107 @@
 import inspect
 import os
 import torch
+import torch.nn.functional as F
 
-from abc import ABC
+from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Type, List
+from typing import Type, List, Tuple
 from opentelemetry import trace
 from loguru import logger
 
 from text_embeddings_server.models.model import Model
+from text_embeddings_server.models.pooling import DefaultPooling
 from text_embeddings_server.models.types import PaddedBatch, Embedding, Score
 
 tracer = trace.get_tracer(__name__)
 
-# Neuron static shapes compilation parameters
+NEURON_MODE = os.getenv("NEURON_MODE", "eager")  # "eager" | "compile"
 NEURON_BATCH_SIZE = int(os.getenv("NEURON_BATCH_SIZE", "1"))
 NEURON_SEQUENCE_LENGTH = int(os.getenv("NEURON_SEQUENCE_LENGTH", "512"))
 
 
+def _get_orig_module(model) -> torch.nn.Module:
+    """Return the unwrapped module whether or not it has been torch.compiled."""
+    return getattr(model, "_orig_mod", model)
+
+
+def _check_param(model, param_name: str) -> bool:
+    try:
+        fn = model.forward if hasattr(model, "forward") else model.__call__
+        return inspect.signature(fn).parameters.get(param_name) is not None
+    except (ValueError, TypeError):
+        return False
+
+
 class NeuronBaseModel(Model, ABC):
-    """Base class for all Neuron models."""
+    """Base class for Neuron models using torch-native eager or torch.compile mode."""
 
-    def __init__(
-        self,
-        model,
-        model_path: Path,
-        device: torch.device,
-        dtype: torch.dtype,
-    ):
-        self.hidden_size = model.config.hidden_size
+    def __init__(self, model, device: torch.device, dtype: torch.dtype):
+        orig = _get_orig_module(model)
+        config = orig.config
+
+        self.hidden_size = config.hidden_size
 
-        # Calculate max input length based on model type
         position_offset = 0
-        model_type = model.config.model_type
-        if model_type in ["xlm-roberta", "camembert", "roberta"]:
-            position_offset = getattr(model.config, "pad_token_id", 1) + 1
-
-        if hasattr(model.config, "max_seq_length"):
-            self.max_input_length = model.config.max_seq_length
-        elif hasattr(model.config, "n_positions"):
-            self.max_input_length = model.config.n_positions
+        if config.model_type in ["xlm-roberta", "camembert", "roberta"]:
+            position_offset = getattr(config, "pad_token_id", 1) + 1
+
+        if hasattr(config, "max_seq_length"):
+            self.max_input_length = config.max_seq_length
+        elif hasattr(config, "n_positions"):
+            self.max_input_length = config.n_positions
         else:
-            self.max_input_length = (
-                model.config.max_position_embeddings - position_offset
-            )
+            self.max_input_length = config.max_position_embeddings - position_offset
 
-        # Check which inputs the model supports
-        self.has_position_ids = self._check_param_exists(model, "position_ids")
-        self.has_token_type_ids = self._check_param_exists(model, "token_type_ids")
+        self.has_position_ids = _check_param(orig, "position_ids")
+        self.has_token_type_ids = _check_param(orig, "token_type_ids")
 
         super().__init__(model=model, dtype=dtype, device=device)
 
-    @staticmethod
-    def _check_param_exists(model, param_name: str) -> bool:
-        """Check if a parameter exists in the model's forward signature."""
-        try:
-            forward_fn = model.forward if hasattr(model, 'forward') else model.__call__
-            return (
-                inspect.signature(forward_fn).parameters.get(param_name, None)
-                is not None
-            )
-        except (ValueError, TypeError):
-            return False
-
     @property
     def batch_type(self) -> Type[PaddedBatch]:
         return PaddedBatch
 
-    def _prepare_inputs(self, batch: PaddedBatch) -> dict:
-        """Prepare input kwargs for model forward pass.
+    def _pad_to_static_shape(self, batch: PaddedBatch) -> Tuple[dict, int]:
+        """Pad all inputs to (NEURON_BATCH_SIZE, NEURON_SEQUENCE_LENGTH).
 
-        Note: Neuron models require int64 (long) tensors for inputs.
+        Neuron requires static shapes; padding to fixed dims avoids recompilation
+        on every distinct (batch, seq) pair seen in production.
+        Returns (padded_kwargs_on_cpu, actual_batch_size).
         """
-        kwargs = {
-            "input_ids": batch.input_ids.to(torch.long),
-            "attention_mask": batch.attention_mask.to(torch.long),
-        }
+        actual_bs = batch.input_ids.shape[0]
+        actual_seq = batch.input_ids.shape[1]
+
+        if actual_bs > NEURON_BATCH_SIZE:
+            raise ValueError(
+                f"Batch size {actual_bs} exceeds NEURON_BATCH_SIZE={NEURON_BATCH_SIZE}. "
+                f"Set NEURON_BATCH_SIZE>={actual_bs} to serve this batch."
+            )
+
+        seq_pad = max(0, NEURON_SEQUENCE_LENGTH - actual_seq)
+        batch_pad = max(0, NEURON_BATCH_SIZE - actual_bs)
+
+        def _pad(t: torch.Tensor) -> torch.Tensor:
+            if seq_pad > 0:
+                t = F.pad(t, (0, seq_pad), value=0)
+            if batch_pad > 0:
+                t = F.pad(t, (0, 0, 0, batch_pad), value=0)
+            return t
+
+        input_ids = _pad(batch.input_ids.to(torch.long))
+        attention_mask = _pad(batch.attention_mask.to(torch.long))
+        kwargs: dict = {"input_ids": input_ids, "attention_mask": attention_mask}
+
         if self.has_token_type_ids:
-            kwargs["token_type_ids"] = batch.token_type_ids.to(torch.long)
+            kwargs["token_type_ids"] = _pad(batch.token_type_ids.to(torch.long))
         if self.has_position_ids:
-            kwargs["position_ids"] = batch.position_ids.to(torch.long)
-        return kwargs
+            kwargs["position_ids"] = _pad(batch.position_ids.to(torch.long))
 
+        return kwargs, actual_bs
 
-class NeuronSentenceTransformersModel(Model):
-    """
-    Neuron model for sentence-transformers.
 
-    Uses optimum.neuron.NeuronSentenceTransformers which is designed
-    for sentence embedding models.
-    """
+class NeuronDefaultModel(NeuronBaseModel):
+    """Neuron model for dense sentence embeddings."""
 
     def __init__(
         self,
@@ -99,131 +111,48 @@ def __init__(
         pool: str = "cls",
         trust_remote: bool = False,
     ):
-        from optimum.neuron import NeuronSentenceTransformers
-        from transformers import AutoConfig
+        from transformers import AutoModel
 
-        # Load config separately for reliable access
-        config = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote)
-        self.hidden_size = config.hidden_size
+        model = AutoModel.from_pretrained(
+            model_path, trust_remote_code=trust_remote
+        ).to(dtype).to(device)
 
-        position_offset = 0
-        model_type = config.model_type
-        if model_type in ["xlm-roberta", "camembert", "roberta"]:
-            position_offset = getattr(config, "pad_token_id", 1) + 1
+        # Extract before optional compile so DefaultPooling gets the hidden size
+        self.pooling = DefaultPooling(model.config.hidden_size, pooling_mode=pool)
 
-        if hasattr(config, "max_seq_length"):
-            self.max_input_length = config.max_seq_length
-        elif hasattr(config, "n_positions"):
-            self.max_input_length = config.n_positions
-        else:
-            self.max_input_length = (
-                config.max_position_embeddings - position_offset
-            )
-
-        is_compiled = self._is_neuron_compiled(model_path)
-        if not is_compiled:
-            logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}")
-            model = NeuronSentenceTransformers.from_pretrained(
-                model_path,
-                export=True,
-                batch_size=NEURON_BATCH_SIZE,
-                sequence_length=NEURON_SEQUENCE_LENGTH,
-            )
-        else:
-            model = NeuronSentenceTransformers.from_pretrained(model_path)
-
-        self.pool = pool
-        super().__init__(model=model, dtype=dtype, device=device)
-        logger.info(f"Loaded NeuronSentenceTransformersModel with pool={pool}, hidden_size={self.hidden_size}")
-
-    @staticmethod
-    def _is_neuron_compiled(model_path: Path) -> bool:
-        """Check if the model is already compiled for Neuron."""
-        neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else []
-        return len(neuron_files) > 0
+        if NEURON_MODE == "compile":
+            logger.info("Wrapping NeuronDefaultModel with torch.compile(backend='neuron')")
+            model = torch.compile(model, backend="neuron", fullgraph=False)
 
-    @property
-    def batch_type(self) -> Type[PaddedBatch]:
-        return PaddedBatch
+        super().__init__(model, device, dtype)
+        logger.info(f"NeuronDefaultModel ready (mode={NEURON_MODE}, pool={pool})")
 
     @tracer.start_as_current_span("embed")
     def embed(self, batch: PaddedBatch) -> List[Embedding]:
-        # Prepare inputs
-        input_ids = batch.input_ids.to(torch.long)
-        attention_mask = batch.attention_mask.to(torch.long)
-
-        output = self.model(input_ids, attention_mask)
-
-        # Get sentence embeddings from output
-        sentence_embedding = None
-        if isinstance(output, dict):
-            # Check if sentence_embedding exists and has non-zero values
-            # NeuronSentenceTransformers may return zeros for sentence_embedding when pooling fails
-            has_valid_sentence_embedding = (
-                "sentence_embedding" in output
-                and output["sentence_embedding"] is not None
-                and output["sentence_embedding"].abs().sum() > 0
-            )
-            if has_valid_sentence_embedding:
-                sentence_embedding = output["sentence_embedding"]
-            elif "token_embeddings" in output and output["token_embeddings"] is not None:
-                # Apply manual pooling when sentence_embedding is not valid
-                logger.debug(f"Using token_embeddings with manual {self.pool} pooling")
-                token_embeddings = output["token_embeddings"]
-
-                if self.pool == "cls":
-                    sentence_embedding = token_embeddings[:, 0, :]
-                elif self.pool == "mean":
-                    mask = attention_mask.unsqueeze(-1).float()
-                    sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)
-                elif self.pool == "last_token":
-                    seq_lengths = attention_mask.sum(dim=1) - 1
-                    sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths]
-                else:
-                    raise ValueError(f"Invalid pooling mode: {self.pool}")
-            else:
-                raise ValueError(f"Cannot extract embeddings from model output dict: {output.keys()}")
-        elif hasattr(output, "sentence_embedding") and output.sentence_embedding is not None:
-            sentence_embedding = output.sentence_embedding
-        elif hasattr(output, "token_embeddings") and output.token_embeddings is not None:
-            token_embeddings = output.token_embeddings
-            if self.pool == "cls":
-                sentence_embedding = token_embeddings[:, 0, :]
-            elif self.pool == "mean":
-                mask = attention_mask.unsqueeze(-1).float()
-                sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)
-            elif self.pool == "last_token":
-                seq_lengths = attention_mask.sum(dim=1) - 1
-                sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths]
-            else:
-                raise ValueError(f"Invalid pooling mode: {self.pool}")
-        elif torch.is_tensor(output):
-            # Assume output is the sentence embedding tensor directly
-            sentence_embedding = output
-        else:
-            raise ValueError(f"Cannot extract embeddings from model output: type={type(output)}")
+        kwargs, actual_bs = self._pad_to_static_shape(batch)
+
+        output = self.model(**{k: v.to(self.device) for k, v in kwargs.items()})
 
-        # Convert to list format expected by the gRPC interface
-        cpu_results = sentence_embedding.view(-1).tolist()
+        # Move token embeddings back to CPU; pooling runs on CPU
+        token_embeddings = output[0][:actual_bs].to("cpu")
+        pool_mask = kwargs["attention_mask"][:actual_bs]  # already on CPU
+
+        # DefaultPooling.forward accepts list[tensor] so it can index [0]
+        embedding = self.pooling.forward([token_embeddings], pool_mask)
+        cpu_results = embedding.view(-1).tolist()
 
         return [
-            Embedding(
-                values=cpu_results[i * self.hidden_size : (i + 1) * self.hidden_size]
-            )
-            for i in range(len(batch))
+            Embedding(values=cpu_results[i * self.hidden_size : (i + 1) * self.hidden_size])
+            for i in range(actual_bs)
         ]
 
     @tracer.start_as_current_span("predict")
     def predict(self, batch: PaddedBatch) -> List[Score]:
-        raise NotImplementedError("Prediction not supported for sentence transformer models")
+        raise NotImplementedError("predict not supported for embedding models")
 
 
 class NeuronClassificationModel(NeuronBaseModel):
-    """
-    Neuron-optimized model for sequence classification.
-
-    Uses optimum.neuron.NeuronModelForSequenceClassification for classification tasks.
-    """
+    """Neuron model for sequence classification."""
 
     def __init__(
         self,
@@ -233,56 +162,37 @@ def __init__(
         pool: str = "cls",
         trust_remote: bool = False,
     ):
-        from optimum.neuron import NeuronModelForSequenceClassification
-
-        is_compiled = self._is_neuron_compiled(model_path)
-        export_kwargs = {}
-        if not is_compiled:
-            export_kwargs = {
-                "export": True,
-                "batch_size": NEURON_BATCH_SIZE,
-                "sequence_length": NEURON_SEQUENCE_LENGTH,
-            }
-            logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}")
-        model = NeuronModelForSequenceClassification.from_pretrained(
-            model_path,
-            **export_kwargs,
-        )
-
-        super().__init__(model, model_path, device, dtype)
-        logger.info("Loaded NeuronClassificationModel")
-
-    @staticmethod
-    def _is_neuron_compiled(model_path: Path) -> bool:
-        """Check if the model is already compiled for Neuron."""
-        neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else []
-        return len(neuron_files) > 0
+        from transformers import AutoModelForSequenceClassification
+
+        model = AutoModelForSequenceClassification.from_pretrained(
+            model_path, trust_remote_code=trust_remote
+        ).to(dtype).to(device)
+
+        if NEURON_MODE == "compile":
+            logger.info("Wrapping NeuronClassificationModel with torch.compile(backend='neuron')")
+            model = torch.compile(model, backend="neuron", fullgraph=False)
+
+        super().__init__(model, device, dtype)
+        logger.info(f"NeuronClassificationModel ready (mode={NEURON_MODE})")
 
     @tracer.start_as_current_span("embed")
     def embed(self, batch: PaddedBatch) -> List[Embedding]:
-        raise NotImplementedError("Embedding not supported for classification models")
+        raise NotImplementedError("embed not supported for classification models")
 
     @tracer.start_as_current_span("predict")
     def predict(self, batch: PaddedBatch) -> List[Score]:
-        kwargs = self._prepare_inputs(batch)
-        output = self.model(**kwargs)
+        kwargs, actual_bs = self._pad_to_static_shape(batch)
 
-        # Get logits from output
-        if hasattr(output, "logits"):
-            logits = output.logits
-        else:
-            logits = output[0]
+        output = self.model(**{k: v.to(self.device) for k, v in kwargs.items()})
 
-        all_scores = logits.tolist()
-        return [Score(values=scores) for scores in all_scores]
+        logits = output.logits if hasattr(output, "logits") else output[0]
+        logits_cpu = logits[:actual_bs].to("cpu").tolist()
 
+        return [Score(values=scores) for scores in logits_cpu]
 
-class NeuronMaskedLMModel(NeuronBaseModel):
-    """
-    Neuron-optimized model for Masked Language Modeling (SPLADE).
 
-    Uses optimum.neuron.NeuronModelForMaskedLM for SPLADE-style sparse embeddings.
-    """
+class NeuronMaskedLMModel(NeuronBaseModel):
+    """Neuron model for masked language modeling (SPLADE sparse embeddings)."""
 
     def __init__(
         self,
@@ -292,63 +202,47 @@ def __init__(
         pool: str = "splade",
         trust_remote: bool = False,
     ):
-        from optimum.neuron import NeuronModelForMaskedLM
-
-        is_compiled = self._is_neuron_compiled(model_path)
-        export_kwargs = {}
-        if not is_compiled:
-            export_kwargs = {
-                "export": True,
-                "batch_size": NEURON_BATCH_SIZE,
-                "sequence_length": NEURON_SEQUENCE_LENGTH,
-            }
-            logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}")
-        model = NeuronModelForMaskedLM.from_pretrained(
-            model_path,
-            **export_kwargs,
-        )
-
-        super().__init__(model, model_path, device, dtype)
-
-        # Get vocab size for SPLADE output
+        from transformers import AutoModelForMaskedLM
+
+        model = AutoModelForMaskedLM.from_pretrained(
+            model_path, trust_remote_code=trust_remote
+        ).to(dtype).to(device)
+
+        # Extract before optional compile
         self.vocab_size = model.config.vocab_size
-        logger.info(f"Loaded NeuronMaskedLMModel with vocab_size={self.vocab_size}")
 
-    @staticmethod
-    def _is_neuron_compiled(model_path: Path) -> bool:
-        """Check if the model is already compiled for Neuron."""
-        neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else []
-        return len(neuron_files) > 0
+        if NEURON_MODE == "compile":
+            logger.info("Wrapping NeuronMaskedLMModel with torch.compile(backend='neuron')")
+            model = torch.compile(model, backend="neuron", fullgraph=False)
+
+        super().__init__(model, device, dtype)
+        logger.info(f"NeuronMaskedLMModel ready (mode={NEURON_MODE}, vocab_size={self.vocab_size})")
 
     @tracer.start_as_current_span("embed")
     def embed(self, batch: PaddedBatch) -> List[Embedding]:
-        kwargs = self._prepare_inputs(batch)
-        output = self.model(**kwargs)
+        kwargs, actual_bs = self._pad_to_static_shape(batch)
 
-        # Get logits for SPLADE pooling
-        if hasattr(output, "logits"):
-            hidden_states = output.logits
-        else:
-            hidden_states = output[0]
+        output = self.model(**{k: v.to(self.device) for k, v in kwargs.items()})
+
+        hidden_states = output.logits if hasattr(output, "logits") else output[0]
+        hidden_states = hidden_states[:actual_bs].to("cpu")
+        mask = kwargs["attention_mask"][:actual_bs].unsqueeze(-1).float()
 
-        # SPLADE pooling: ReLU -> log(1+x) -> max pooling
+        # SPLADE pooling: ReLU → log(1+x) → mask → max over sequence
         hidden_states = torch.relu(hidden_states)
         hidden_states = (1 + hidden_states).log()
-        hidden_states = torch.mul(hidden_states, batch.attention_mask.unsqueeze(-1))
+        hidden_states = hidden_states * mask
         sparse_embedding = hidden_states.max(dim=1).values
 
         cpu_results = sparse_embedding.view(-1).tolist()
-
         return [
-            Embedding(
-                values=cpu_results[i * self.vocab_size : (i + 1) * self.vocab_size]
-            )
-            for i in range(len(batch))
+            Embedding(values=cpu_results[i * self.vocab_size : (i + 1) * self.vocab_size])
+            for i in range(actual_bs)
         ]
 
     @tracer.start_as_current_span("predict")
     def predict(self, batch: PaddedBatch) -> List[Score]:
-        raise NotImplementedError("Prediction not supported for masked LM models")
+        raise NotImplementedError("predict not supported for masked LM models")
 
 
 def create_neuron_model(
@@ -359,20 +253,7 @@ def create_neuron_model(
     trust_remote: bool = False,
     config=None,
 ) -> Model:
-    """
-    Factory function to create the appropriate Neuron model based on the model config.
-
-    Args:
-        model_path: Path to the model
-        device: Target device (should be xla for Neuron)
-        dtype: Data type for the model
-        pool: Pooling strategy (cls, mean, lasttoken, splade)
-        trust_remote: Whether to trust remote code
-        config: Pre-loaded model config (optional)
-
-    Returns:
-        Appropriate Neuron model instance
-    """
+    """Factory: pick the right Neuron model class from the model architecture."""
     from transformers import AutoConfig
 
     if config is None:
@@ -381,15 +262,14 @@ def create_neuron_model(
     architectures = getattr(config, "architectures", []) or []
     architecture = architectures[0] if architectures else ""
 
-    logger.info(f"Creating Neuron model for architecture: {architecture}, pool: {pool}")
+    logger.info(
+        f"Creating Neuron model: architecture={architecture}, pool={pool}, mode={NEURON_MODE}"
+    )
 
-    # Check for classification models
     if architecture.endswith("ForSequenceClassification") or architecture.endswith("Classification"):
         return NeuronClassificationModel(model_path, device, dtype, pool, trust_remote)
 
-    # Check for SPLADE (masked LM) models
     if pool == "splade" or architecture.endswith("ForMaskedLM"):
         return NeuronMaskedLMModel(model_path, device, dtype, pool, trust_remote)
 
-    # Default to NeuronSentenceTransformers for all embedding models
-    return NeuronSentenceTransformersModel(model_path, device, dtype, pool, trust_remote)
+    return NeuronDefaultModel(model_path, device, dtype, pool, trust_remote)
diff --git a/backends/python/server/text_embeddings_server/utils/device.py b/backends/python/server/text_embeddings_server/utils/device.py
index 4963b012c..0ec69260e 100644
--- a/backends/python/server/text_embeddings_server/utils/device.py
+++ b/backends/python/server/text_embeddings_server/utils/device.py
@@ -90,6 +90,7 @@ def get_device():
         if hasattr(torch, "xpu") and torch.xpu.is_available():
             device = torch.device("xpu")
     elif is_neuron():
-        device = torch.device("xla")
+        import torch_neuronx  # noqa: F401 — registers torch.device("neuron") as PrivateUse1
+        device = torch.device("neuron")
 
     return device

From 4d39a8bd0cadbdf63fc317dfcdfd5ab7dcae424a Mon Sep 17 00:00:00 2001
From: JingyaHuang <huang_jingya@outlook.com>
Date: Fri, 17 Apr 2026 16:46:45 +0000
Subject: [PATCH 20/20] Revert "draft:support in TorchNeuron way"

This reverts commit 3b48cbf33b78928769fd5f370c3f005bf85d4e37.
---
 Dockerfile-neuron                             |  13 +-
 .../models/neuron/__init__.py                 | 402 ++++++++++++------
 .../text_embeddings_server/utils/device.py    |   3 +-
 3 files changed, 273 insertions(+), 145 deletions(-)

diff --git a/Dockerfile-neuron b/Dockerfile-neuron
index b4dceed65..112c742b7 100644
--- a/Dockerfile-neuron
+++ b/Dockerfile-neuron
@@ -150,17 +150,26 @@ RUN pip install --index-url https://pip.repos.neuron.amazonaws.com \
  && rm -rf ~/.cache/pip/*
 
 # HF ARGS
-ARG TRANSFORMERS_VERSION=4.47.0
+# Note: optimum-neuron 0.4.4 requires transformers~=4.57.1
+ARG TRANSFORMERS_VERSION=4.57.1
+ARG DIFFUSERS_VERSION=0.35.2
 ARG HUGGINGFACE_HUB_VERSION=0.36.0
+ARG OPTIMUM_NEURON_VERSION=0.4.4
 ARG SENTENCE_TRANSFORMERS=5.1.2
 ARG PEFT_VERSION=0.17.0
+ARG DATASETS_VERSION=4.1.1
 
 # Install Hugging Face libraries and dependencies for TEI on Neuron
 RUN pip install --no-cache-dir -U \
     networkx==2.8.8 \
-    transformers[sentencepiece]==${TRANSFORMERS_VERSION} \
+    transformers[sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \
+    diffusers==${DIFFUSERS_VERSION} \
+    compel \
+    controlnet-aux \
     huggingface_hub==${HUGGINGFACE_HUB_VERSION} \
     hf_transfer \
+    datasets==${DATASETS_VERSION} \
+    optimum-neuron==${OPTIMUM_NEURON_VERSION} \
     sentence_transformers==${SENTENCE_TRANSFORMERS} \
     peft==${PEFT_VERSION} \
  && rm -rf ~/.cache/pip/*
diff --git a/backends/python/server/text_embeddings_server/models/neuron/__init__.py b/backends/python/server/text_embeddings_server/models/neuron/__init__.py
index 297e05eef..80745edc8 100644
--- a/backends/python/server/text_embeddings_server/models/neuron/__init__.py
+++ b/backends/python/server/text_embeddings_server/models/neuron/__init__.py
@@ -1,107 +1,95 @@
 import inspect
 import os
 import torch
-import torch.nn.functional as F
 
-from abc import ABC, abstractmethod
+from abc import ABC
 from pathlib import Path
-from typing import Type, List, Tuple
+from typing import Type, List
 from opentelemetry import trace
 from loguru import logger
 
 from text_embeddings_server.models.model import Model
-from text_embeddings_server.models.pooling import DefaultPooling
 from text_embeddings_server.models.types import PaddedBatch, Embedding, Score
 
 tracer = trace.get_tracer(__name__)
 
-NEURON_MODE = os.getenv("NEURON_MODE", "eager")  # "eager" | "compile"
+# Neuron static shapes compilation parameters
 NEURON_BATCH_SIZE = int(os.getenv("NEURON_BATCH_SIZE", "1"))
 NEURON_SEQUENCE_LENGTH = int(os.getenv("NEURON_SEQUENCE_LENGTH", "512"))
 
 
-def _get_orig_module(model) -> torch.nn.Module:
-    """Return the unwrapped module whether or not it has been torch.compiled."""
-    return getattr(model, "_orig_mod", model)
-
-
-def _check_param(model, param_name: str) -> bool:
-    try:
-        fn = model.forward if hasattr(model, "forward") else model.__call__
-        return inspect.signature(fn).parameters.get(param_name) is not None
-    except (ValueError, TypeError):
-        return False
-
-
 class NeuronBaseModel(Model, ABC):
-    """Base class for Neuron models using torch-native eager or torch.compile mode."""
-
-    def __init__(self, model, device: torch.device, dtype: torch.dtype):
-        orig = _get_orig_module(model)
-        config = orig.config
+    """Base class for all Neuron models."""
 
-        self.hidden_size = config.hidden_size
+    def __init__(
+        self,
+        model,
+        model_path: Path,
+        device: torch.device,
+        dtype: torch.dtype,
+    ):
+        self.hidden_size = model.config.hidden_size
 
+        # Calculate max input length based on model type
         position_offset = 0
-        if config.model_type in ["xlm-roberta", "camembert", "roberta"]:
-            position_offset = getattr(config, "pad_token_id", 1) + 1
-
-        if hasattr(config, "max_seq_length"):
-            self.max_input_length = config.max_seq_length
-        elif hasattr(config, "n_positions"):
-            self.max_input_length = config.n_positions
+        model_type = model.config.model_type
+        if model_type in ["xlm-roberta", "camembert", "roberta"]:
+            position_offset = getattr(model.config, "pad_token_id", 1) + 1
+
+        if hasattr(model.config, "max_seq_length"):
+            self.max_input_length = model.config.max_seq_length
+        elif hasattr(model.config, "n_positions"):
+            self.max_input_length = model.config.n_positions
         else:
-            self.max_input_length = config.max_position_embeddings - position_offset
+            self.max_input_length = (
+                model.config.max_position_embeddings - position_offset
+            )
 
-        self.has_position_ids = _check_param(orig, "position_ids")
-        self.has_token_type_ids = _check_param(orig, "token_type_ids")
+        # Check which inputs the model supports
+        self.has_position_ids = self._check_param_exists(model, "position_ids")
+        self.has_token_type_ids = self._check_param_exists(model, "token_type_ids")
 
         super().__init__(model=model, dtype=dtype, device=device)
 
+    @staticmethod
+    def _check_param_exists(model, param_name: str) -> bool:
+        """Check if a parameter exists in the model's forward signature."""
+        try:
+            forward_fn = model.forward if hasattr(model, 'forward') else model.__call__
+            return (
+                inspect.signature(forward_fn).parameters.get(param_name, None)
+                is not None
+            )
+        except (ValueError, TypeError):
+            return False
+
     @property
     def batch_type(self) -> Type[PaddedBatch]:
         return PaddedBatch
 
-    def _pad_to_static_shape(self, batch: PaddedBatch) -> Tuple[dict, int]:
-        """Pad all inputs to (NEURON_BATCH_SIZE, NEURON_SEQUENCE_LENGTH).
+    def _prepare_inputs(self, batch: PaddedBatch) -> dict:
+        """Prepare input kwargs for model forward pass.
 
-        Neuron requires static shapes; padding to fixed dims avoids recompilation
-        on every distinct (batch, seq) pair seen in production.
-        Returns (padded_kwargs_on_cpu, actual_batch_size).
+        Note: Neuron models require int64 (long) tensors for inputs.
         """
-        actual_bs = batch.input_ids.shape[0]
-        actual_seq = batch.input_ids.shape[1]
-
-        if actual_bs > NEURON_BATCH_SIZE:
-            raise ValueError(
-                f"Batch size {actual_bs} exceeds NEURON_BATCH_SIZE={NEURON_BATCH_SIZE}. "
-                f"Set NEURON_BATCH_SIZE>={actual_bs} to serve this batch."
-            )
-
-        seq_pad = max(0, NEURON_SEQUENCE_LENGTH - actual_seq)
-        batch_pad = max(0, NEURON_BATCH_SIZE - actual_bs)
-
-        def _pad(t: torch.Tensor) -> torch.Tensor:
-            if seq_pad > 0:
-                t = F.pad(t, (0, seq_pad), value=0)
-            if batch_pad > 0:
-                t = F.pad(t, (0, 0, 0, batch_pad), value=0)
-            return t
-
-        input_ids = _pad(batch.input_ids.to(torch.long))
-        attention_mask = _pad(batch.attention_mask.to(torch.long))
-        kwargs: dict = {"input_ids": input_ids, "attention_mask": attention_mask}
-
+        kwargs = {
+            "input_ids": batch.input_ids.to(torch.long),
+            "attention_mask": batch.attention_mask.to(torch.long),
+        }
         if self.has_token_type_ids:
-            kwargs["token_type_ids"] = _pad(batch.token_type_ids.to(torch.long))
+            kwargs["token_type_ids"] = batch.token_type_ids.to(torch.long)
         if self.has_position_ids:
-            kwargs["position_ids"] = _pad(batch.position_ids.to(torch.long))
+            kwargs["position_ids"] = batch.position_ids.to(torch.long)
+        return kwargs
 
-        return kwargs, actual_bs
 
+class NeuronSentenceTransformersModel(Model):
+    """
+    Neuron model for sentence-transformers.
 
-class NeuronDefaultModel(NeuronBaseModel):
-    """Neuron model for dense sentence embeddings."""
+    Uses optimum.neuron.NeuronSentenceTransformers which is designed
+    for sentence embedding models.
+    """
 
     def __init__(
         self,
@@ -111,48 +99,131 @@ def __init__(
         pool: str = "cls",
         trust_remote: bool = False,
     ):
-        from transformers import AutoModel
+        from optimum.neuron import NeuronSentenceTransformers
+        from transformers import AutoConfig
 
-        model = AutoModel.from_pretrained(
-            model_path, trust_remote_code=trust_remote
-        ).to(dtype).to(device)
+        # Load config separately for reliable access
+        config = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote)
+        self.hidden_size = config.hidden_size
 
-        # Extract before optional compile so DefaultPooling gets the hidden size
-        self.pooling = DefaultPooling(model.config.hidden_size, pooling_mode=pool)
+        position_offset = 0
+        model_type = config.model_type
+        if model_type in ["xlm-roberta", "camembert", "roberta"]:
+            position_offset = getattr(config, "pad_token_id", 1) + 1
 
-        if NEURON_MODE == "compile":
-            logger.info("Wrapping NeuronDefaultModel with torch.compile(backend='neuron')")
-            model = torch.compile(model, backend="neuron", fullgraph=False)
+        if hasattr(config, "max_seq_length"):
+            self.max_input_length = config.max_seq_length
+        elif hasattr(config, "n_positions"):
+            self.max_input_length = config.n_positions
+        else:
+            self.max_input_length = (
+                config.max_position_embeddings - position_offset
+            )
 
-        super().__init__(model, device, dtype)
-        logger.info(f"NeuronDefaultModel ready (mode={NEURON_MODE}, pool={pool})")
+        is_compiled = self._is_neuron_compiled(model_path)
+        if not is_compiled:
+            logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}")
+            model = NeuronSentenceTransformers.from_pretrained(
+                model_path,
+                export=True,
+                batch_size=NEURON_BATCH_SIZE,
+                sequence_length=NEURON_SEQUENCE_LENGTH,
+            )
+        else:
+            model = NeuronSentenceTransformers.from_pretrained(model_path)
 
-    @tracer.start_as_current_span("embed")
-    def embed(self, batch: PaddedBatch) -> List[Embedding]:
-        kwargs, actual_bs = self._pad_to_static_shape(batch)
+        self.pool = pool
+        super().__init__(model=model, dtype=dtype, device=device)
+        logger.info(f"Loaded NeuronSentenceTransformersModel with pool={pool}, hidden_size={self.hidden_size}")
+
+    @staticmethod
+    def _is_neuron_compiled(model_path: Path) -> bool:
+        """Check if the model is already compiled for Neuron."""
+        neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else []
+        return len(neuron_files) > 0
 
-        output = self.model(**{k: v.to(self.device) for k, v in kwargs.items()})
+    @property
+    def batch_type(self) -> Type[PaddedBatch]:
+        return PaddedBatch
 
-        # Move token embeddings back to CPU; pooling runs on CPU
-        token_embeddings = output[0][:actual_bs].to("cpu")
-        pool_mask = kwargs["attention_mask"][:actual_bs]  # already on CPU
+    @tracer.start_as_current_span("embed")
+    def embed(self, batch: PaddedBatch) -> List[Embedding]:
+        # Prepare inputs
+        input_ids = batch.input_ids.to(torch.long)
+        attention_mask = batch.attention_mask.to(torch.long)
+
+        output = self.model(input_ids, attention_mask)
+
+        # Get sentence embeddings from output
+        sentence_embedding = None
+        if isinstance(output, dict):
+            # Check if sentence_embedding exists and has non-zero values
+            # NeuronSentenceTransformers may return zeros for sentence_embedding when pooling fails
+            has_valid_sentence_embedding = (
+                "sentence_embedding" in output
+                and output["sentence_embedding"] is not None
+                and output["sentence_embedding"].abs().sum() > 0
+            )
+            if has_valid_sentence_embedding:
+                sentence_embedding = output["sentence_embedding"]
+            elif "token_embeddings" in output and output["token_embeddings"] is not None:
+                # Apply manual pooling when sentence_embedding is not valid
+                logger.debug(f"Using token_embeddings with manual {self.pool} pooling")
+                token_embeddings = output["token_embeddings"]
+
+                if self.pool == "cls":
+                    sentence_embedding = token_embeddings[:, 0, :]
+                elif self.pool == "mean":
+                    mask = attention_mask.unsqueeze(-1).float()
+                    sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)
+                elif self.pool == "last_token":
+                    seq_lengths = attention_mask.sum(dim=1) - 1
+                    sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths]
+                else:
+                    raise ValueError(f"Invalid pooling mode: {self.pool}")
+            else:
+                raise ValueError(f"Cannot extract embeddings from model output dict: {output.keys()}")
+        elif hasattr(output, "sentence_embedding") and output.sentence_embedding is not None:
+            sentence_embedding = output.sentence_embedding
+        elif hasattr(output, "token_embeddings") and output.token_embeddings is not None:
+            token_embeddings = output.token_embeddings
+            if self.pool == "cls":
+                sentence_embedding = token_embeddings[:, 0, :]
+            elif self.pool == "mean":
+                mask = attention_mask.unsqueeze(-1).float()
+                sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)
+            elif self.pool == "last_token":
+                seq_lengths = attention_mask.sum(dim=1) - 1
+                sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths]
+            else:
+                raise ValueError(f"Invalid pooling mode: {self.pool}")
+        elif torch.is_tensor(output):
+            # Assume output is the sentence embedding tensor directly
+            sentence_embedding = output
+        else:
+            raise ValueError(f"Cannot extract embeddings from model output: type={type(output)}")
 
-        # DefaultPooling.forward accepts list[tensor] so it can index [0]
-        embedding = self.pooling.forward([token_embeddings], pool_mask)
-        cpu_results = embedding.view(-1).tolist()
+        # Convert to list format expected by the gRPC interface
+        cpu_results = sentence_embedding.view(-1).tolist()
 
         return [
-            Embedding(values=cpu_results[i * self.hidden_size : (i + 1) * self.hidden_size])
-            for i in range(actual_bs)
+            Embedding(
+                values=cpu_results[i * self.hidden_size : (i + 1) * self.hidden_size]
+            )
+            for i in range(len(batch))
         ]
 
     @tracer.start_as_current_span("predict")
     def predict(self, batch: PaddedBatch) -> List[Score]:
-        raise NotImplementedError("predict not supported for embedding models")
+        raise NotImplementedError("Prediction not supported for sentence transformer models")
 
 
 class NeuronClassificationModel(NeuronBaseModel):
-    """Neuron model for sequence classification."""
+    """
+    Neuron-optimized model for sequence classification.
+
+    Uses optimum.neuron.NeuronModelForSequenceClassification for classification tasks.
+    """
 
     def __init__(
         self,
@@ -162,37 +233,56 @@ def __init__(
         pool: str = "cls",
         trust_remote: bool = False,
     ):
-        from transformers import AutoModelForSequenceClassification
-
-        model = AutoModelForSequenceClassification.from_pretrained(
-            model_path, trust_remote_code=trust_remote
-        ).to(dtype).to(device)
-
-        if NEURON_MODE == "compile":
-            logger.info("Wrapping NeuronClassificationModel with torch.compile(backend='neuron')")
-            model = torch.compile(model, backend="neuron", fullgraph=False)
-
-        super().__init__(model, device, dtype)
-        logger.info(f"NeuronClassificationModel ready (mode={NEURON_MODE})")
+        from optimum.neuron import NeuronModelForSequenceClassification
+
+        is_compiled = self._is_neuron_compiled(model_path)
+        export_kwargs = {}
+        if not is_compiled:
+            export_kwargs = {
+                "export": True,
+                "batch_size": NEURON_BATCH_SIZE,
+                "sequence_length": NEURON_SEQUENCE_LENGTH,
+            }
+            logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}")
+        model = NeuronModelForSequenceClassification.from_pretrained(
+            model_path,
+            **export_kwargs,
+        )
+
+        super().__init__(model, model_path, device, dtype)
+        logger.info("Loaded NeuronClassificationModel")
+
+    @staticmethod
+    def _is_neuron_compiled(model_path: Path) -> bool:
+        """Check if the model is already compiled for Neuron."""
+        neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else []
+        return len(neuron_files) > 0
 
     @tracer.start_as_current_span("embed")
     def embed(self, batch: PaddedBatch) -> List[Embedding]:
-        raise NotImplementedError("embed not supported for classification models")
+        raise NotImplementedError("Embedding not supported for classification models")
 
     @tracer.start_as_current_span("predict")
     def predict(self, batch: PaddedBatch) -> List[Score]:
-        kwargs, actual_bs = self._pad_to_static_shape(batch)
+        kwargs = self._prepare_inputs(batch)
+        output = self.model(**kwargs)
 
-        output = self.model(**{k: v.to(self.device) for k, v in kwargs.items()})
-
-        logits = output.logits if hasattr(output, "logits") else output[0]
-        logits_cpu = logits[:actual_bs].to("cpu").tolist()
+        # Get logits from output
+        if hasattr(output, "logits"):
+            logits = output.logits
+        else:
+            logits = output[0]
 
-        return [Score(values=scores) for scores in logits_cpu]
+        all_scores = logits.tolist()
+        return [Score(values=scores) for scores in all_scores]
 
 
 class NeuronMaskedLMModel(NeuronBaseModel):
-    """Neuron model for masked language modeling (SPLADE sparse embeddings)."""
+    """
+    Neuron-optimized model for Masked Language Modeling (SPLADE).
+
+    Uses optimum.neuron.NeuronModelForMaskedLM for SPLADE-style sparse embeddings.
+    """
 
     def __init__(
         self,
@@ -202,47 +292,63 @@ def __init__(
         pool: str = "splade",
         trust_remote: bool = False,
     ):
-        from transformers import AutoModelForMaskedLM
-
-        model = AutoModelForMaskedLM.from_pretrained(
-            model_path, trust_remote_code=trust_remote
-        ).to(dtype).to(device)
-
-        # Extract before optional compile
+        from optimum.neuron import NeuronModelForMaskedLM
+
+        is_compiled = self._is_neuron_compiled(model_path)
+        export_kwargs = {}
+        if not is_compiled:
+            export_kwargs = {
+                "export": True,
+                "batch_size": NEURON_BATCH_SIZE,
+                "sequence_length": NEURON_SEQUENCE_LENGTH,
+            }
+            logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}")
+        model = NeuronModelForMaskedLM.from_pretrained(
+            model_path,
+            **export_kwargs,
+        )
+
+        super().__init__(model, model_path, device, dtype)
+
+        # Get vocab size for SPLADE output
         self.vocab_size = model.config.vocab_size
+        logger.info(f"Loaded NeuronMaskedLMModel with vocab_size={self.vocab_size}")
 
-        if NEURON_MODE == "compile":
-            logger.info("Wrapping NeuronMaskedLMModel with torch.compile(backend='neuron')")
-            model = torch.compile(model, backend="neuron", fullgraph=False)
-
-        super().__init__(model, device, dtype)
-        logger.info(f"NeuronMaskedLMModel ready (mode={NEURON_MODE}, vocab_size={self.vocab_size})")
+    @staticmethod
+    def _is_neuron_compiled(model_path: Path) -> bool:
+        """Check if the model is already compiled for Neuron."""
+        neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else []
+        return len(neuron_files) > 0
 
     @tracer.start_as_current_span("embed")
     def embed(self, batch: PaddedBatch) -> List[Embedding]:
-        kwargs, actual_bs = self._pad_to_static_shape(batch)
+        kwargs = self._prepare_inputs(batch)
+        output = self.model(**kwargs)
 
-        output = self.model(**{k: v.to(self.device) for k, v in kwargs.items()})
-
-        hidden_states = output.logits if hasattr(output, "logits") else output[0]
-        hidden_states = hidden_states[:actual_bs].to("cpu")
-        mask = kwargs["attention_mask"][:actual_bs].unsqueeze(-1).float()
+        # Get logits for SPLADE pooling
+        if hasattr(output, "logits"):
+            hidden_states = output.logits
+        else:
+            hidden_states = output[0]
 
-        # SPLADE pooling: ReLU → log(1+x) → mask → max over sequence
+        # SPLADE pooling: ReLU -> log(1+x) -> max pooling
         hidden_states = torch.relu(hidden_states)
         hidden_states = (1 + hidden_states).log()
-        hidden_states = hidden_states * mask
+        hidden_states = torch.mul(hidden_states, batch.attention_mask.unsqueeze(-1))
         sparse_embedding = hidden_states.max(dim=1).values
 
         cpu_results = sparse_embedding.view(-1).tolist()
+
         return [
-            Embedding(values=cpu_results[i * self.vocab_size : (i + 1) * self.vocab_size])
-            for i in range(actual_bs)
+            Embedding(
+                values=cpu_results[i * self.vocab_size : (i + 1) * self.vocab_size]
+            )
+            for i in range(len(batch))
         ]
 
     @tracer.start_as_current_span("predict")
     def predict(self, batch: PaddedBatch) -> List[Score]:
-        raise NotImplementedError("predict not supported for masked LM models")
+        raise NotImplementedError("Prediction not supported for masked LM models")
 
 
 def create_neuron_model(
@@ -253,7 +359,20 @@ def create_neuron_model(
     trust_remote: bool = False,
     config=None,
 ) -> Model:
-    """Factory: pick the right Neuron model class from the model architecture."""
+    """
+    Factory function to create the appropriate Neuron model based on the model config.
+
+    Args:
+        model_path: Path to the model
+        device: Target device (should be xla for Neuron)
+        dtype: Data type for the model
+        pool: Pooling strategy (cls, mean, last_token, splade)
+        trust_remote: Whether to trust remote code
+        config: Pre-loaded model config (optional)
+
+    Returns:
+        Appropriate Neuron model instance
+    """
     from transformers import AutoConfig
 
     if config is None:
@@ -262,14 +381,15 @@ def create_neuron_model(
     architectures = getattr(config, "architectures", []) or []
     architecture = architectures[0] if architectures else ""
 
-    logger.info(
-        f"Creating Neuron model: architecture={architecture}, pool={pool}, mode={NEURON_MODE}"
-    )
+    logger.info(f"Creating Neuron model for architecture: {architecture}, pool: {pool}")
 
+    # Check for classification models
     if architecture.endswith("ForSequenceClassification") or architecture.endswith("Classification"):
         return NeuronClassificationModel(model_path, device, dtype, pool, trust_remote)
 
+    # Check for SPLADE (masked LM) models
     if pool == "splade" or architecture.endswith("ForMaskedLM"):
         return NeuronMaskedLMModel(model_path, device, dtype, pool, trust_remote)
 
-    return NeuronDefaultModel(model_path, device, dtype, pool, trust_remote)
+    # Default to NeuronSentenceTransformers for all embedding models
+    return NeuronSentenceTransformersModel(model_path, device, dtype, pool, trust_remote)
diff --git a/backends/python/server/text_embeddings_server/utils/device.py b/backends/python/server/text_embeddings_server/utils/device.py
index 0ec69260e..4963b012c 100644
--- a/backends/python/server/text_embeddings_server/utils/device.py
+++ b/backends/python/server/text_embeddings_server/utils/device.py
@@ -90,7 +90,6 @@ def get_device():
         if hasattr(torch, "xpu") and torch.xpu.is_available():
             device = torch.device("xpu")
     elif is_neuron():
-        import torch_neuronx  # noqa: F401 — registers torch.device("neuron") as PrivateUse1
-        device = torch.device("neuron")
+        device = torch.device("xla")
 
     return device