From 710b8c17c13bd24b839121608efd29fd5801e4f0 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Fri, 22 Aug 2025 11:23:55 +0000 Subject: [PATCH 01/20] 1st draft --- Dockerfile.neuron | 43 +++++ backends/Cargo.toml | 1 + backends/neuron/Cargo.toml | 16 ++ backends/neuron/server/README.md | 0 .../server/text_embeddings_server/__init__.py | 0 .../server/text_embeddings_server/cli.py | 55 +++++++ .../text_embeddings_server/models/__init__.py | 126 +++++++++++++++ .../server/text_embeddings_server/server.py | 92 +++++++++++ backends/neuron/src/lib.rs | 132 ++++++++++++++++ backends/neuron/src/logging.rs | 61 ++++++++ backends/neuron/src/management.rs | 148 ++++++++++++++++++ docs/source/en/_toctree.yml | 2 + docs/source/en/local_neuron.md | 1 + integration_tests/neuron/conftest.py | 0 integration_tests/neuron/test_embed.py | 0 15 files changed, 677 insertions(+) create mode 100644 Dockerfile.neuron create mode 100644 backends/neuron/Cargo.toml create mode 100644 backends/neuron/server/README.md create mode 100644 backends/neuron/server/text_embeddings_server/__init__.py create mode 100644 backends/neuron/server/text_embeddings_server/cli.py create mode 100644 backends/neuron/server/text_embeddings_server/models/__init__.py create mode 100644 backends/neuron/server/text_embeddings_server/server.py create mode 100644 backends/neuron/src/lib.rs create mode 100644 backends/neuron/src/logging.rs create mode 100644 backends/neuron/src/management.rs create mode 100644 docs/source/en/local_neuron.md create mode 100644 integration_tests/neuron/conftest.py create mode 100644 integration_tests/neuron/test_embed.py diff --git a/Dockerfile.neuron b/Dockerfile.neuron new file mode 100644 index 000000000..f8b03ab26 --- /dev/null +++ b/Dockerfile.neuron @@ -0,0 +1,43 @@ +ARG PLATFORM=neuron +FROM lukemathwalker/cargo-chef:latest-rust-1.85-bookworm AS chef +WORKDIR /usr/src + +ENV SCCACHE=0.10.0 +ENV RUSTC_WRAPPER=/usr/local/bin/sccache + +# Donwload, configure sccache +RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \ + chmod +x /usr/local/bin/sccache + +FROM chef AS planner + +COPY backends backends +COPY core core +COPY router router +COPY Cargo.toml ./ +COPY Cargo.lock ./ + +RUN cargo chef prepare --recipe-path recipe.json + +FROM chef AS builder + +ARG GIT_SHA +ARG DOCKER_LABEL + +# sccache specific variables +ARG SCCACHE_GHA_ENABLED + +COPY --from=planner /usr/src/recipe.json recipe.json + +RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ + --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ + cargo chef cook --release --features python --no-default-features --recipe-path recipe.json && sccache -s + +COPY backends backends +COPY core core +COPY router router +COPY Cargo.toml ./ +COPY Cargo.lock ./ + +WORKDIR /usr/src + diff --git a/backends/Cargo.toml b/backends/Cargo.toml index bb9d74191..7d821ff40 100644 --- a/backends/Cargo.toml +++ b/backends/Cargo.toml @@ -21,6 +21,7 @@ rand = { workspace = true } [features] clap = ["dep:clap", "text-embeddings-backend-core/clap"] python = ["dep:text-embeddings-backend-python"] +neuron = ["dep:text-embeddings-backend-neuron"] ort = ["dep:text-embeddings-backend-ort"] candle = ["dep:text-embeddings-backend-candle"] cuda = ["text-embeddings-backend-candle?/cuda"] diff --git a/backends/neuron/Cargo.toml b/backends/neuron/Cargo.toml new file mode 100644 index 000000000..b38f350ed --- /dev/null +++ b/backends/neuron/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "text-embeddings-backend-python" +version.workspace = true +edition.workspace = true +authors.workspace = true +homepage.workspace = true + +[dependencies] +backend-grpc-client = { path = "../grpc-client" } +nohash-hasher = "^0.2" +serde = { version = "^1.0", features = ["derive"] } +serde_json = "^1.0" +text-embeddings-backend-core = { path = "../core" } +thiserror = "^1.0" +tokio = { version = "^1.25", features = ["sync"] } +tracing = "^0.1" diff --git a/backends/neuron/server/README.md b/backends/neuron/server/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/backends/neuron/server/text_embeddings_server/__init__.py b/backends/neuron/server/text_embeddings_server/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backends/neuron/server/text_embeddings_server/cli.py b/backends/neuron/server/text_embeddings_server/cli.py new file mode 100644 index 000000000..c4dfaa4c1 --- /dev/null +++ b/backends/neuron/server/text_embeddings_server/cli.py @@ -0,0 +1,55 @@ +import sys +import typer + +from pathlib import Path +from loguru import logger +from typing import Optional +from enum import Enum + +app = typer.Typer() + + +class Dtype(str, Enum): + float32 = "float32" + float16 = "float16" + bloat16 = "bfloat16" + + +@app.command() +def serve( + model_path: Path, + dtype: Dtype = "float32", + uds_path: Path = "/tmp/text-embeddings-server", + logger_level: str = "INFO", + json_output: bool = False, + otlp_endpoint: Optional[str] = None, + otlp_service_name: str = "text-embeddings-inference.server", + pool: str = "cls", +): + # Remove default handler + logger.remove() + logger.add( + sys.stdout, + format="{message}", + filter="text_embeddings_server", + level=logger_level, + serialize=json_output, + backtrace=True, + diagnose=False, + ) + + # Import here after the logger is added to log potential import exceptions + from text_embeddings_server import server + from text_embeddings_server.utils.tracing import setup_tracing + + # Setup OpenTelemetry distributed tracing + if otlp_endpoint is not None: + setup_tracing(otlp_endpoint=otlp_endpoint, otlp_service_name=otlp_service_name) + + # Downgrade enum into str for easier management later on + dtype = None if dtype is None else dtype.value + server.serve(model_path, dtype, uds_path, pool) + + +if __name__ == "__main__": + app() diff --git a/backends/neuron/server/text_embeddings_server/models/__init__.py b/backends/neuron/server/text_embeddings_server/models/__init__.py new file mode 100644 index 000000000..06c39832c --- /dev/null +++ b/backends/neuron/server/text_embeddings_server/models/__init__.py @@ -0,0 +1,126 @@ +import os +import torch + +from loguru import logger +from pathlib import Path +from typing import Optional +from transformers import AutoConfig +from transformers.models.bert import BertConfig + +from text_embeddings_server.models.model import Model +from text_embeddings_server.models.masked_model import MaskedLanguageModel +from text_embeddings_server.models.default_model import DefaultModel +from text_embeddings_server.models.classification_model import ClassificationModel +from text_embeddings_server.models.jinaBert_model import FlashJinaBert +from text_embeddings_server.models.flash_mistral import FlashMistral +from text_embeddings_server.models.flash_qwen3 import FlashQwen3 +from text_embeddings_server.utils.device import get_device, use_ipex + +__all__ = ["Model"] + +TRUST_REMOTE_CODE = os.getenv("TRUST_REMOTE_CODE", "false").lower() in ["true", "1"] +DISABLE_TENSOR_CACHE = os.getenv("DISABLE_TENSOR_CACHE", "false").lower() in [ + "true", + "1", +] +# Disable gradients +torch.set_grad_enabled(False) + +FLASH_ATTENTION = True +try: + from text_embeddings_server.models.flash_bert import FlashBert +except ImportError as e: + logger.warning(f"Could not import Flash Attention enabled models: {e}") + FLASH_ATTENTION = False + +if FLASH_ATTENTION: + __all__.append(FlashBert) + + +def create_model(model_class, model_path, device, datatype, pool="cls"): + """Create a model instance and load it into Neuron devices.""" + model_handle = model_class( + model_path, + device, + datatype, + pool, + trust_remote=TRUST_REMOTE_CODE, + ) + return model_handle + + +def get_model(model_path: Path, dtype: Optional[str], pool: str): + if dtype == "float32": + datatype = torch.float32 + elif dtype == "float16": + datatype = torch.float16 + elif dtype == "bfloat16": + datatype = torch.bfloat16 + else: + raise RuntimeError(f"Unknown dtype {dtype}") + + device = get_device() + logger.info(f"backend device: {device}") + + config = AutoConfig.from_pretrained(model_path, trust_remote_code=TRUST_REMOTE_CODE) + + if ( + hasattr(config, "auto_map") + and isinstance(config.auto_map, dict) + and "AutoModel" in config.auto_map + and config.auto_map["AutoModel"] + == "jinaai/jina-bert-v2-qk-post-norm--modeling_bert.JinaBertModel" + ): + # Add specific offline modeling for model "jinaai/jina-embeddings-v2-base-code" which uses "autoMap" to reference code in other repository + return create_model(FlashJinaBert, model_path, device, datatype) + + if config.model_type == "bert": + config: BertConfig + if ( + use_ipex() + or device.type in ["cuda", "hpu"] + and config.position_embedding_type == "absolute" + and datatype in [torch.float16, torch.bfloat16] + and FLASH_ATTENTION + ): + if pool != "cls": + if config.architectures[0].endswith("ForMaskedLM") and pool == "splade": + return create_model( + MaskedLanguageModel, model_path, device, datatype, pool + ) + return create_model(DefaultModel, model_path, device, datatype, pool) + + try: + return create_model(FlashBert, model_path, device, datatype) + except FileNotFoundError: + logger.info( + "Do not have safetensors file for this model, use default transformers model path instead" + ) + return create_model(DefaultModel, model_path, device, datatype, pool) + + if config.architectures[0].endswith("Classification"): + return create_model(ClassificationModel, model_path, device, datatype) + elif config.architectures[0].endswith("ForMaskedLM") and pool == "splade": + return create_model(MaskedLanguageModel, model_path, device, datatype) + else: + return create_model(DefaultModel, model_path, device, datatype, pool) + + if config.model_type == "mistral" and device.type == "hpu": + try: + return create_model(FlashMistral, model_path, device, datatype, pool) + except FileNotFoundError: + return create_model(DefaultModel, model_path, device, datatype, pool) + + if config.model_type == "qwen3" and device.type == "hpu": + try: + return create_model(FlashQwen3, model_path, device, datatype, pool) + except FileNotFoundError: + return create_model(DefaultModel, model_path, device, datatype, pool) + + # Default case + if config.architectures[0].endswith("Classification"): + return create_model(ClassificationModel, model_path, device, datatype) + elif config.architectures[0].endswith("ForMaskedLM") and pool == "splade": + return create_model(MaskedLanguageModel, model_path, device, datatype) + else: + return create_model(DefaultModel, model_path, device, datatype, pool) diff --git a/backends/neuron/server/text_embeddings_server/server.py b/backends/neuron/server/text_embeddings_server/server.py new file mode 100644 index 000000000..646d79bc9 --- /dev/null +++ b/backends/neuron/server/text_embeddings_server/server.py @@ -0,0 +1,92 @@ +import asyncio +import torch +from grpc import aio +from loguru import logger + +from grpc_reflection.v1alpha import reflection +from pathlib import Path +from typing import Optional + +from text_embeddings_server.models import Model, get_model +from text_embeddings_server.pb import embed_pb2_grpc, embed_pb2 +from text_embeddings_server.utils.tracing import UDSOpenTelemetryAioServerInterceptor +from text_embeddings_server.utils.interceptor import ExceptionInterceptor + + +class EmbeddingService(embed_pb2_grpc.EmbeddingServiceServicer): + def __init__(self, model: Model): + self.model = model + # Force inference mode for the lifetime of EmbeddingService + self._inference_mode_raii_guard = torch._C._InferenceMode(True) + + async def Health(self, request, context): + if self.model.device.type == "cuda": + torch.zeros((2, 2), device="cuda") + return embed_pb2.HealthResponse() + + async def Embed(self, request, context): + max_input_length = self.model.max_input_length + batch = self.model.batch_type.from_pb( + request, self.model.device, max_input_length + ) + + embeddings = self.model.embed(batch) + + return embed_pb2.EmbedResponse(embeddings=embeddings) + + async def Predict(self, request, context): + max_input_length = self.model.max_input_length + batch = self.model.batch_type.from_pb( + request, self.model.device, max_input_length + ) + + scores = self.model.predict(batch) + + return embed_pb2.PredictResponse(scores=scores) + + +def serve( + model_path: Path, + dtype: Optional[str], + uds_path: Path, + pool: str, +): + async def serve_inner( + model_path: Path, + dtype: Optional[str] = None, + ): + unix_socket = f"unix://{uds_path}" + + try: + model = get_model(model_path, dtype, pool) + except Exception: + logger.exception("Error when initializing model") + raise + + server = aio.server( + interceptors=[ + ExceptionInterceptor(), + UDSOpenTelemetryAioServerInterceptor(), + ] + ) + embed_pb2_grpc.add_EmbeddingServiceServicer_to_server( + EmbeddingService(model), server + ) + SERVICE_NAMES = ( + embed_pb2.DESCRIPTOR.services_by_name["EmbeddingService"].full_name, + reflection.SERVICE_NAME, + ) + reflection.enable_server_reflection(SERVICE_NAMES, server) + server.add_insecure_port(unix_socket) + + await server.start() + + logger.info(f"Server started at {unix_socket}") + + try: + await server.wait_for_termination() + except KeyboardInterrupt: + logger.info("Signal received. Shutting down") + await server.stop(0) + + asyncio.run(serve_inner(model_path, dtype)) diff --git a/backends/neuron/src/lib.rs b/backends/neuron/src/lib.rs new file mode 100644 index 000000000..53255b07d --- /dev/null +++ b/backends/neuron/src/lib.rs @@ -0,0 +1,132 @@ +mod logging; +mod management; + +use backend_grpc_client::Client; +use nohash_hasher::BuildNoHashHasher; +use std::collections::HashMap; +use text_embeddings_backend_core::{ + Backend, BackendError, Batch, Embedding, Embeddings, ModelType, Pool, Predictions, +}; +use tokio::runtime::Runtime; + +pub struct PythonBackend { + _backend_process: management::BackendProcess, + tokio_runtime: Runtime, + backend_client: Client, +} + +impl PythonBackend { + pub fn new( + model_path: String, + dtype: String, + model_type: ModelType, + uds_path: String, + otlp_endpoint: Option, + otlp_service_name: String, + ) -> Result { + let pool = match model_type { + ModelType::Classifier => Pool::Cls, + ModelType::Embedding(pool) => pool, + }; + + let backend_process = management::BackendProcess::new( + model_path, + dtype, + &uds_path, + otlp_endpoint, + otlp_service_name, + pool, + )?; + let tokio_runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .map_err(|err| BackendError::Start(format!("Could not start Tokio runtime: {err}")))?; + + let backend_client = tokio_runtime + .block_on(Client::connect_uds(uds_path)) + .map_err(|err| { + BackendError::Start(format!("Could not connect to backend process: {err}")) + })?; + + Ok(Self { + _backend_process: backend_process, + tokio_runtime, + backend_client, + }) + } +} + +impl Backend for PythonBackend { + fn health(&self) -> Result<(), BackendError> { + if self + .tokio_runtime + .block_on(self.backend_client.clone().health()) + .is_err() + { + return Err(BackendError::Unhealthy); + } + Ok(()) + } + + fn is_padded(&self) -> bool { + false + } + + fn embed(&self, batch: Batch) -> Result { + if !batch.raw_indices.is_empty() { + return Err(BackendError::Inference( + "raw embeddings are not supported for the Python backend.".to_string(), + )); + } + let batch_size = batch.len(); + + let results = self + .tokio_runtime + .block_on(self.backend_client.clone().embed( + batch.input_ids, + batch.token_type_ids, + batch.position_ids, + batch.cumulative_seq_lengths, + batch.max_length, + )) + .map_err(|err| BackendError::Inference(err.to_string()))?; + let pooled_embeddings: Vec> = results.into_iter().map(|r| r.values).collect(); + + let mut embeddings = + HashMap::with_capacity_and_hasher(batch_size, BuildNoHashHasher::default()); + for (i, e) in pooled_embeddings.into_iter().enumerate() { + embeddings.insert(i, Embedding::Pooled(e)); + } + + Ok(embeddings) + } + + fn predict(&self, batch: Batch) -> Result { + if !batch.raw_indices.is_empty() { + return Err(BackendError::Inference( + "raw embeddings are not supported for the Python backend.".to_string(), + )); + } + let batch_size = batch.len(); + let results = self + .tokio_runtime + .block_on(self.backend_client.clone().predict( + batch.input_ids, + batch.token_type_ids, + batch.position_ids, + batch.cumulative_seq_lengths, + batch.max_length, + )) + .map_err(|err| BackendError::Inference(err.to_string()))?; + let raw_results: Vec> = results.into_iter().map(|r| r.values).collect(); + + let mut predictions = + HashMap::with_capacity_and_hasher(batch_size, BuildNoHashHasher::default()); + + for (i, r) in raw_results.into_iter().enumerate() { + predictions.insert(i, r); + } + + Ok(predictions) + } +} diff --git a/backends/neuron/src/logging.rs b/backends/neuron/src/logging.rs new file mode 100644 index 000000000..8f55e8e6b --- /dev/null +++ b/backends/neuron/src/logging.rs @@ -0,0 +1,61 @@ +use serde::Deserialize; +use std::io::{BufRead, Lines}; + +#[derive(Deserialize)] +#[serde(rename_all = "UPPERCASE")] +enum PythonLogLevelEnum { + Trace, + Debug, + Info, + Success, + Warning, + Error, + Critical, +} + +#[derive(Deserialize)] +struct PythonLogLevel { + name: PythonLogLevelEnum, +} + +#[derive(Deserialize)] +struct PythonLogRecord { + level: PythonLogLevel, +} + +#[derive(Deserialize)] +struct PythonLogMessage { + text: String, + record: PythonLogRecord, +} + +impl PythonLogMessage { + fn trace(&self) { + match self.record.level.name { + PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text), + PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text), + PythonLogLevelEnum::Info => tracing::info!("{}", self.text), + PythonLogLevelEnum::Success => tracing::info!("{}", self.text), + PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text), + PythonLogLevelEnum::Error => tracing::error!("{}", self.text), + PythonLogLevelEnum::Critical => tracing::error!("{}", self.text), + } + } +} + +impl TryFrom<&String> for PythonLogMessage { + type Error = serde_json::Error; + + fn try_from(value: &String) -> Result { + serde_json::from_str::(value) + } +} + +pub(crate) fn log_lines(lines: Lines) { + for line in lines.map_while(Result::ok) { + match PythonLogMessage::try_from(&line) { + Ok(log) => log.trace(), + Err(_) => tracing::debug!("{line}"), + } + } +} diff --git a/backends/neuron/src/management.rs b/backends/neuron/src/management.rs new file mode 100644 index 000000000..81c294a92 --- /dev/null +++ b/backends/neuron/src/management.rs @@ -0,0 +1,148 @@ +use crate::logging::log_lines; +use std::ffi::OsString; +use std::io::{BufRead, BufReader}; +use std::os::unix::process::{CommandExt, ExitStatusExt}; +use std::path::Path; +use std::process::{Child, Command, Stdio}; +use std::sync::mpsc; +use std::thread::sleep; +use std::time::{Duration, Instant}; +use std::{env, fs, io, thread}; +use text_embeddings_backend_core::{BackendError, Pool}; + +#[derive(Debug)] +pub(crate) struct BackendProcess { + inner: Child, +} + +impl BackendProcess { + pub(crate) fn new( + model_path: String, + dtype: String, + uds_path: &str, + otlp_endpoint: Option, + otlp_service_name: String, + pool: Pool, + ) -> Result { + // Get UDS path + let uds = Path::new(uds_path); + + // Clean previous runs + if uds.exists() { + fs::remove_file(uds).expect("could not remove UDS file"); + } + + let pool = match pool { + Pool::Cls => "cls", + Pool::Mean => "mean", + Pool::LastToken => "lasttoken", + Pool::Splade => "splade", + }; + + // Process args + let mut python_server_args = vec![ + model_path, + "--dtype".to_owned(), + dtype, + "--uds-path".to_owned(), + uds_path.to_owned(), + "--logger-level".to_owned(), + "INFO".to_owned(), + "--json-output".to_owned(), + "--pool".to_owned(), + pool.to_owned(), + ]; + + // OpenTelemetry + if let Some(otlp_endpoint) = otlp_endpoint { + python_server_args.push("--otlp-endpoint".to_owned()); + python_server_args.push(otlp_endpoint); + } + + python_server_args.push("--otlp-service-name".to_owned()); + python_server_args.push(otlp_service_name); + + // Copy current process env + let envs: Vec<(OsString, OsString)> = env::vars_os().collect(); + + tracing::info!("Starting Python backend"); + let mut p = match Command::new("python-text-embeddings-server") + .args(python_server_args) + .envs(envs) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .process_group(0) + .spawn() + { + Ok(p) => p, + Err(err) => { + if err.kind() == io::ErrorKind::NotFound { + return Err(BackendError::Start( + "python-text-embeddings-server not found in PATH".to_owned(), + )); + } + return Err(BackendError::Start(err.to_string())); + } + }; + + let stdout_reader = BufReader::new(p.stdout.take().unwrap()); + let stderr_reader = BufReader::new(p.stderr.take().unwrap()); + + //stdout tracing thread + thread::spawn(move || { + let _span = tracing::span!(tracing::Level::INFO, "python-backend").entered(); + log_lines(stdout_reader.lines()); + }); + + let start_time = Instant::now(); + let mut wait_time = Instant::now(); + + loop { + // Process exited + if let Some(exit_status) = p.try_wait().unwrap() { + // We read stderr in another thread as it seems that lines() can block in some cases + let (err_sender, err_receiver) = mpsc::channel(); + thread::spawn(move || { + for line in stderr_reader.lines().map_while(Result::ok) { + err_sender.send(line).unwrap_or(()); + } + }); + let mut err = String::new(); + while let Ok(line) = err_receiver.recv_timeout(Duration::from_millis(10)) { + err = err + "\n" + &line; + } + + tracing::debug!("Python Backend complete standard error output:\n{err}"); + + if let Some(signal) = exit_status.signal() { + return Err(BackendError::Start(format!( + "Python Backend process was signaled to shutdown with signal {signal}" + ))); + } + return Err(BackendError::Start( + "Python backend failed to start".to_string(), + )); + } + + // Shard is ready + if uds.exists() { + tracing::info!("Python backend ready in {:?}", start_time.elapsed()); + break; + } else if wait_time.elapsed() > Duration::from_secs(10) { + tracing::info!("Waiting for Python backend to be ready..."); + wait_time = Instant::now(); + } + sleep(Duration::from_millis(5)); + } + + Ok(Self { inner: p }) + } +} + +impl Drop for BackendProcess { + fn drop(&mut self) { + self.inner.kill().unwrap(); + let _ = self.inner.wait(); + tracing::info!("Python backend process terminated"); + } +} diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index fa6f21e63..b9eebac2c 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -19,6 +19,8 @@ title: Build custom container for TEI - local: intel_container title: Using TEI container with Intel Hardware + - local: local_neuron + title: Using TEI container with AWS Neuron - local: examples title: Example uses title: Tutorials diff --git a/docs/source/en/local_neuron.md b/docs/source/en/local_neuron.md new file mode 100644 index 000000000..e0a2cf2ba --- /dev/null +++ b/docs/source/en/local_neuron.md @@ -0,0 +1 @@ +# Neuron backend for AWS Trainium and Inferentia \ No newline at end of file diff --git a/integration_tests/neuron/conftest.py b/integration_tests/neuron/conftest.py new file mode 100644 index 000000000..e69de29bb diff --git a/integration_tests/neuron/test_embed.py b/integration_tests/neuron/test_embed.py new file mode 100644 index 000000000..e69de29bb From 139b179f1cd346705fc3267b1f39162e438d8b21 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 22 Oct 2025 16:29:32 +0000 Subject: [PATCH 02/20] feat: sentence transformer for neuron --- backends/neuron/Cargo.toml | 16 -- backends/neuron/server/README.md | 0 .../server/text_embeddings_server/__init__.py | 0 .../server/text_embeddings_server/cli.py | 55 ------- .../text_embeddings_server/models/__init__.py | 126 --------------- .../server/text_embeddings_server/server.py | 92 ----------- backends/neuron/src/lib.rs | 132 ---------------- backends/neuron/src/logging.rs | 61 -------- backends/neuron/src/management.rs | 148 ------------------ .../text_embeddings_server/models/__init__.py | 9 +- .../models/neuron_models.py | 67 ++++++++ .../text_embeddings_server/utils/device.py | 19 +++ 12 files changed, 94 insertions(+), 631 deletions(-) delete mode 100644 backends/neuron/Cargo.toml delete mode 100644 backends/neuron/server/README.md delete mode 100644 backends/neuron/server/text_embeddings_server/__init__.py delete mode 100644 backends/neuron/server/text_embeddings_server/cli.py delete mode 100644 backends/neuron/server/text_embeddings_server/models/__init__.py delete mode 100644 backends/neuron/server/text_embeddings_server/server.py delete mode 100644 backends/neuron/src/lib.rs delete mode 100644 backends/neuron/src/logging.rs delete mode 100644 backends/neuron/src/management.rs create mode 100644 backends/python/server/text_embeddings_server/models/neuron_models.py diff --git a/backends/neuron/Cargo.toml b/backends/neuron/Cargo.toml deleted file mode 100644 index b38f350ed..000000000 --- a/backends/neuron/Cargo.toml +++ /dev/null @@ -1,16 +0,0 @@ -[package] -name = "text-embeddings-backend-python" -version.workspace = true -edition.workspace = true -authors.workspace = true -homepage.workspace = true - -[dependencies] -backend-grpc-client = { path = "../grpc-client" } -nohash-hasher = "^0.2" -serde = { version = "^1.0", features = ["derive"] } -serde_json = "^1.0" -text-embeddings-backend-core = { path = "../core" } -thiserror = "^1.0" -tokio = { version = "^1.25", features = ["sync"] } -tracing = "^0.1" diff --git a/backends/neuron/server/README.md b/backends/neuron/server/README.md deleted file mode 100644 index e69de29bb..000000000 diff --git a/backends/neuron/server/text_embeddings_server/__init__.py b/backends/neuron/server/text_embeddings_server/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/backends/neuron/server/text_embeddings_server/cli.py b/backends/neuron/server/text_embeddings_server/cli.py deleted file mode 100644 index c4dfaa4c1..000000000 --- a/backends/neuron/server/text_embeddings_server/cli.py +++ /dev/null @@ -1,55 +0,0 @@ -import sys -import typer - -from pathlib import Path -from loguru import logger -from typing import Optional -from enum import Enum - -app = typer.Typer() - - -class Dtype(str, Enum): - float32 = "float32" - float16 = "float16" - bloat16 = "bfloat16" - - -@app.command() -def serve( - model_path: Path, - dtype: Dtype = "float32", - uds_path: Path = "/tmp/text-embeddings-server", - logger_level: str = "INFO", - json_output: bool = False, - otlp_endpoint: Optional[str] = None, - otlp_service_name: str = "text-embeddings-inference.server", - pool: str = "cls", -): - # Remove default handler - logger.remove() - logger.add( - sys.stdout, - format="{message}", - filter="text_embeddings_server", - level=logger_level, - serialize=json_output, - backtrace=True, - diagnose=False, - ) - - # Import here after the logger is added to log potential import exceptions - from text_embeddings_server import server - from text_embeddings_server.utils.tracing import setup_tracing - - # Setup OpenTelemetry distributed tracing - if otlp_endpoint is not None: - setup_tracing(otlp_endpoint=otlp_endpoint, otlp_service_name=otlp_service_name) - - # Downgrade enum into str for easier management later on - dtype = None if dtype is None else dtype.value - server.serve(model_path, dtype, uds_path, pool) - - -if __name__ == "__main__": - app() diff --git a/backends/neuron/server/text_embeddings_server/models/__init__.py b/backends/neuron/server/text_embeddings_server/models/__init__.py deleted file mode 100644 index 06c39832c..000000000 --- a/backends/neuron/server/text_embeddings_server/models/__init__.py +++ /dev/null @@ -1,126 +0,0 @@ -import os -import torch - -from loguru import logger -from pathlib import Path -from typing import Optional -from transformers import AutoConfig -from transformers.models.bert import BertConfig - -from text_embeddings_server.models.model import Model -from text_embeddings_server.models.masked_model import MaskedLanguageModel -from text_embeddings_server.models.default_model import DefaultModel -from text_embeddings_server.models.classification_model import ClassificationModel -from text_embeddings_server.models.jinaBert_model import FlashJinaBert -from text_embeddings_server.models.flash_mistral import FlashMistral -from text_embeddings_server.models.flash_qwen3 import FlashQwen3 -from text_embeddings_server.utils.device import get_device, use_ipex - -__all__ = ["Model"] - -TRUST_REMOTE_CODE = os.getenv("TRUST_REMOTE_CODE", "false").lower() in ["true", "1"] -DISABLE_TENSOR_CACHE = os.getenv("DISABLE_TENSOR_CACHE", "false").lower() in [ - "true", - "1", -] -# Disable gradients -torch.set_grad_enabled(False) - -FLASH_ATTENTION = True -try: - from text_embeddings_server.models.flash_bert import FlashBert -except ImportError as e: - logger.warning(f"Could not import Flash Attention enabled models: {e}") - FLASH_ATTENTION = False - -if FLASH_ATTENTION: - __all__.append(FlashBert) - - -def create_model(model_class, model_path, device, datatype, pool="cls"): - """Create a model instance and load it into Neuron devices.""" - model_handle = model_class( - model_path, - device, - datatype, - pool, - trust_remote=TRUST_REMOTE_CODE, - ) - return model_handle - - -def get_model(model_path: Path, dtype: Optional[str], pool: str): - if dtype == "float32": - datatype = torch.float32 - elif dtype == "float16": - datatype = torch.float16 - elif dtype == "bfloat16": - datatype = torch.bfloat16 - else: - raise RuntimeError(f"Unknown dtype {dtype}") - - device = get_device() - logger.info(f"backend device: {device}") - - config = AutoConfig.from_pretrained(model_path, trust_remote_code=TRUST_REMOTE_CODE) - - if ( - hasattr(config, "auto_map") - and isinstance(config.auto_map, dict) - and "AutoModel" in config.auto_map - and config.auto_map["AutoModel"] - == "jinaai/jina-bert-v2-qk-post-norm--modeling_bert.JinaBertModel" - ): - # Add specific offline modeling for model "jinaai/jina-embeddings-v2-base-code" which uses "autoMap" to reference code in other repository - return create_model(FlashJinaBert, model_path, device, datatype) - - if config.model_type == "bert": - config: BertConfig - if ( - use_ipex() - or device.type in ["cuda", "hpu"] - and config.position_embedding_type == "absolute" - and datatype in [torch.float16, torch.bfloat16] - and FLASH_ATTENTION - ): - if pool != "cls": - if config.architectures[0].endswith("ForMaskedLM") and pool == "splade": - return create_model( - MaskedLanguageModel, model_path, device, datatype, pool - ) - return create_model(DefaultModel, model_path, device, datatype, pool) - - try: - return create_model(FlashBert, model_path, device, datatype) - except FileNotFoundError: - logger.info( - "Do not have safetensors file for this model, use default transformers model path instead" - ) - return create_model(DefaultModel, model_path, device, datatype, pool) - - if config.architectures[0].endswith("Classification"): - return create_model(ClassificationModel, model_path, device, datatype) - elif config.architectures[0].endswith("ForMaskedLM") and pool == "splade": - return create_model(MaskedLanguageModel, model_path, device, datatype) - else: - return create_model(DefaultModel, model_path, device, datatype, pool) - - if config.model_type == "mistral" and device.type == "hpu": - try: - return create_model(FlashMistral, model_path, device, datatype, pool) - except FileNotFoundError: - return create_model(DefaultModel, model_path, device, datatype, pool) - - if config.model_type == "qwen3" and device.type == "hpu": - try: - return create_model(FlashQwen3, model_path, device, datatype, pool) - except FileNotFoundError: - return create_model(DefaultModel, model_path, device, datatype, pool) - - # Default case - if config.architectures[0].endswith("Classification"): - return create_model(ClassificationModel, model_path, device, datatype) - elif config.architectures[0].endswith("ForMaskedLM") and pool == "splade": - return create_model(MaskedLanguageModel, model_path, device, datatype) - else: - return create_model(DefaultModel, model_path, device, datatype, pool) diff --git a/backends/neuron/server/text_embeddings_server/server.py b/backends/neuron/server/text_embeddings_server/server.py deleted file mode 100644 index 646d79bc9..000000000 --- a/backends/neuron/server/text_embeddings_server/server.py +++ /dev/null @@ -1,92 +0,0 @@ -import asyncio -import torch -from grpc import aio -from loguru import logger - -from grpc_reflection.v1alpha import reflection -from pathlib import Path -from typing import Optional - -from text_embeddings_server.models import Model, get_model -from text_embeddings_server.pb import embed_pb2_grpc, embed_pb2 -from text_embeddings_server.utils.tracing import UDSOpenTelemetryAioServerInterceptor -from text_embeddings_server.utils.interceptor import ExceptionInterceptor - - -class EmbeddingService(embed_pb2_grpc.EmbeddingServiceServicer): - def __init__(self, model: Model): - self.model = model - # Force inference mode for the lifetime of EmbeddingService - self._inference_mode_raii_guard = torch._C._InferenceMode(True) - - async def Health(self, request, context): - if self.model.device.type == "cuda": - torch.zeros((2, 2), device="cuda") - return embed_pb2.HealthResponse() - - async def Embed(self, request, context): - max_input_length = self.model.max_input_length - batch = self.model.batch_type.from_pb( - request, self.model.device, max_input_length - ) - - embeddings = self.model.embed(batch) - - return embed_pb2.EmbedResponse(embeddings=embeddings) - - async def Predict(self, request, context): - max_input_length = self.model.max_input_length - batch = self.model.batch_type.from_pb( - request, self.model.device, max_input_length - ) - - scores = self.model.predict(batch) - - return embed_pb2.PredictResponse(scores=scores) - - -def serve( - model_path: Path, - dtype: Optional[str], - uds_path: Path, - pool: str, -): - async def serve_inner( - model_path: Path, - dtype: Optional[str] = None, - ): - unix_socket = f"unix://{uds_path}" - - try: - model = get_model(model_path, dtype, pool) - except Exception: - logger.exception("Error when initializing model") - raise - - server = aio.server( - interceptors=[ - ExceptionInterceptor(), - UDSOpenTelemetryAioServerInterceptor(), - ] - ) - embed_pb2_grpc.add_EmbeddingServiceServicer_to_server( - EmbeddingService(model), server - ) - SERVICE_NAMES = ( - embed_pb2.DESCRIPTOR.services_by_name["EmbeddingService"].full_name, - reflection.SERVICE_NAME, - ) - reflection.enable_server_reflection(SERVICE_NAMES, server) - server.add_insecure_port(unix_socket) - - await server.start() - - logger.info(f"Server started at {unix_socket}") - - try: - await server.wait_for_termination() - except KeyboardInterrupt: - logger.info("Signal received. Shutting down") - await server.stop(0) - - asyncio.run(serve_inner(model_path, dtype)) diff --git a/backends/neuron/src/lib.rs b/backends/neuron/src/lib.rs deleted file mode 100644 index 53255b07d..000000000 --- a/backends/neuron/src/lib.rs +++ /dev/null @@ -1,132 +0,0 @@ -mod logging; -mod management; - -use backend_grpc_client::Client; -use nohash_hasher::BuildNoHashHasher; -use std::collections::HashMap; -use text_embeddings_backend_core::{ - Backend, BackendError, Batch, Embedding, Embeddings, ModelType, Pool, Predictions, -}; -use tokio::runtime::Runtime; - -pub struct PythonBackend { - _backend_process: management::BackendProcess, - tokio_runtime: Runtime, - backend_client: Client, -} - -impl PythonBackend { - pub fn new( - model_path: String, - dtype: String, - model_type: ModelType, - uds_path: String, - otlp_endpoint: Option, - otlp_service_name: String, - ) -> Result { - let pool = match model_type { - ModelType::Classifier => Pool::Cls, - ModelType::Embedding(pool) => pool, - }; - - let backend_process = management::BackendProcess::new( - model_path, - dtype, - &uds_path, - otlp_endpoint, - otlp_service_name, - pool, - )?; - let tokio_runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .map_err(|err| BackendError::Start(format!("Could not start Tokio runtime: {err}")))?; - - let backend_client = tokio_runtime - .block_on(Client::connect_uds(uds_path)) - .map_err(|err| { - BackendError::Start(format!("Could not connect to backend process: {err}")) - })?; - - Ok(Self { - _backend_process: backend_process, - tokio_runtime, - backend_client, - }) - } -} - -impl Backend for PythonBackend { - fn health(&self) -> Result<(), BackendError> { - if self - .tokio_runtime - .block_on(self.backend_client.clone().health()) - .is_err() - { - return Err(BackendError::Unhealthy); - } - Ok(()) - } - - fn is_padded(&self) -> bool { - false - } - - fn embed(&self, batch: Batch) -> Result { - if !batch.raw_indices.is_empty() { - return Err(BackendError::Inference( - "raw embeddings are not supported for the Python backend.".to_string(), - )); - } - let batch_size = batch.len(); - - let results = self - .tokio_runtime - .block_on(self.backend_client.clone().embed( - batch.input_ids, - batch.token_type_ids, - batch.position_ids, - batch.cumulative_seq_lengths, - batch.max_length, - )) - .map_err(|err| BackendError::Inference(err.to_string()))?; - let pooled_embeddings: Vec> = results.into_iter().map(|r| r.values).collect(); - - let mut embeddings = - HashMap::with_capacity_and_hasher(batch_size, BuildNoHashHasher::default()); - for (i, e) in pooled_embeddings.into_iter().enumerate() { - embeddings.insert(i, Embedding::Pooled(e)); - } - - Ok(embeddings) - } - - fn predict(&self, batch: Batch) -> Result { - if !batch.raw_indices.is_empty() { - return Err(BackendError::Inference( - "raw embeddings are not supported for the Python backend.".to_string(), - )); - } - let batch_size = batch.len(); - let results = self - .tokio_runtime - .block_on(self.backend_client.clone().predict( - batch.input_ids, - batch.token_type_ids, - batch.position_ids, - batch.cumulative_seq_lengths, - batch.max_length, - )) - .map_err(|err| BackendError::Inference(err.to_string()))?; - let raw_results: Vec> = results.into_iter().map(|r| r.values).collect(); - - let mut predictions = - HashMap::with_capacity_and_hasher(batch_size, BuildNoHashHasher::default()); - - for (i, r) in raw_results.into_iter().enumerate() { - predictions.insert(i, r); - } - - Ok(predictions) - } -} diff --git a/backends/neuron/src/logging.rs b/backends/neuron/src/logging.rs deleted file mode 100644 index 8f55e8e6b..000000000 --- a/backends/neuron/src/logging.rs +++ /dev/null @@ -1,61 +0,0 @@ -use serde::Deserialize; -use std::io::{BufRead, Lines}; - -#[derive(Deserialize)] -#[serde(rename_all = "UPPERCASE")] -enum PythonLogLevelEnum { - Trace, - Debug, - Info, - Success, - Warning, - Error, - Critical, -} - -#[derive(Deserialize)] -struct PythonLogLevel { - name: PythonLogLevelEnum, -} - -#[derive(Deserialize)] -struct PythonLogRecord { - level: PythonLogLevel, -} - -#[derive(Deserialize)] -struct PythonLogMessage { - text: String, - record: PythonLogRecord, -} - -impl PythonLogMessage { - fn trace(&self) { - match self.record.level.name { - PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text), - PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text), - PythonLogLevelEnum::Info => tracing::info!("{}", self.text), - PythonLogLevelEnum::Success => tracing::info!("{}", self.text), - PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text), - PythonLogLevelEnum::Error => tracing::error!("{}", self.text), - PythonLogLevelEnum::Critical => tracing::error!("{}", self.text), - } - } -} - -impl TryFrom<&String> for PythonLogMessage { - type Error = serde_json::Error; - - fn try_from(value: &String) -> Result { - serde_json::from_str::(value) - } -} - -pub(crate) fn log_lines(lines: Lines) { - for line in lines.map_while(Result::ok) { - match PythonLogMessage::try_from(&line) { - Ok(log) => log.trace(), - Err(_) => tracing::debug!("{line}"), - } - } -} diff --git a/backends/neuron/src/management.rs b/backends/neuron/src/management.rs deleted file mode 100644 index 81c294a92..000000000 --- a/backends/neuron/src/management.rs +++ /dev/null @@ -1,148 +0,0 @@ -use crate::logging::log_lines; -use std::ffi::OsString; -use std::io::{BufRead, BufReader}; -use std::os::unix::process::{CommandExt, ExitStatusExt}; -use std::path::Path; -use std::process::{Child, Command, Stdio}; -use std::sync::mpsc; -use std::thread::sleep; -use std::time::{Duration, Instant}; -use std::{env, fs, io, thread}; -use text_embeddings_backend_core::{BackendError, Pool}; - -#[derive(Debug)] -pub(crate) struct BackendProcess { - inner: Child, -} - -impl BackendProcess { - pub(crate) fn new( - model_path: String, - dtype: String, - uds_path: &str, - otlp_endpoint: Option, - otlp_service_name: String, - pool: Pool, - ) -> Result { - // Get UDS path - let uds = Path::new(uds_path); - - // Clean previous runs - if uds.exists() { - fs::remove_file(uds).expect("could not remove UDS file"); - } - - let pool = match pool { - Pool::Cls => "cls", - Pool::Mean => "mean", - Pool::LastToken => "lasttoken", - Pool::Splade => "splade", - }; - - // Process args - let mut python_server_args = vec![ - model_path, - "--dtype".to_owned(), - dtype, - "--uds-path".to_owned(), - uds_path.to_owned(), - "--logger-level".to_owned(), - "INFO".to_owned(), - "--json-output".to_owned(), - "--pool".to_owned(), - pool.to_owned(), - ]; - - // OpenTelemetry - if let Some(otlp_endpoint) = otlp_endpoint { - python_server_args.push("--otlp-endpoint".to_owned()); - python_server_args.push(otlp_endpoint); - } - - python_server_args.push("--otlp-service-name".to_owned()); - python_server_args.push(otlp_service_name); - - // Copy current process env - let envs: Vec<(OsString, OsString)> = env::vars_os().collect(); - - tracing::info!("Starting Python backend"); - let mut p = match Command::new("python-text-embeddings-server") - .args(python_server_args) - .envs(envs) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .process_group(0) - .spawn() - { - Ok(p) => p, - Err(err) => { - if err.kind() == io::ErrorKind::NotFound { - return Err(BackendError::Start( - "python-text-embeddings-server not found in PATH".to_owned(), - )); - } - return Err(BackendError::Start(err.to_string())); - } - }; - - let stdout_reader = BufReader::new(p.stdout.take().unwrap()); - let stderr_reader = BufReader::new(p.stderr.take().unwrap()); - - //stdout tracing thread - thread::spawn(move || { - let _span = tracing::span!(tracing::Level::INFO, "python-backend").entered(); - log_lines(stdout_reader.lines()); - }); - - let start_time = Instant::now(); - let mut wait_time = Instant::now(); - - loop { - // Process exited - if let Some(exit_status) = p.try_wait().unwrap() { - // We read stderr in another thread as it seems that lines() can block in some cases - let (err_sender, err_receiver) = mpsc::channel(); - thread::spawn(move || { - for line in stderr_reader.lines().map_while(Result::ok) { - err_sender.send(line).unwrap_or(()); - } - }); - let mut err = String::new(); - while let Ok(line) = err_receiver.recv_timeout(Duration::from_millis(10)) { - err = err + "\n" + &line; - } - - tracing::debug!("Python Backend complete standard error output:\n{err}"); - - if let Some(signal) = exit_status.signal() { - return Err(BackendError::Start(format!( - "Python Backend process was signaled to shutdown with signal {signal}" - ))); - } - return Err(BackendError::Start( - "Python backend failed to start".to_string(), - )); - } - - // Shard is ready - if uds.exists() { - tracing::info!("Python backend ready in {:?}", start_time.elapsed()); - break; - } else if wait_time.elapsed() > Duration::from_secs(10) { - tracing::info!("Waiting for Python backend to be ready..."); - wait_time = Instant::now(); - } - sleep(Duration::from_millis(5)); - } - - Ok(Self { inner: p }) - } -} - -impl Drop for BackendProcess { - fn drop(&mut self) { - self.inner.kill().unwrap(); - let _ = self.inner.wait(); - tracing::info!("Python backend process terminated"); - } -} diff --git a/backends/python/server/text_embeddings_server/models/__init__.py b/backends/python/server/text_embeddings_server/models/__init__.py index 1e919f233..8fb4076c0 100644 --- a/backends/python/server/text_embeddings_server/models/__init__.py +++ b/backends/python/server/text_embeddings_server/models/__init__.py @@ -14,7 +14,9 @@ from text_embeddings_server.models.jinaBert_model import FlashJinaBert from text_embeddings_server.models.flash_mistral import FlashMistral from text_embeddings_server.models.flash_qwen3 import FlashQwen3 -from text_embeddings_server.utils.device import get_device, use_ipex +from text_embeddings_server.models.neuron_models import NeuronSentenceTransformers + +from text_embeddings_server.utils.device import get_device, use_ipex, is_neuron __all__ = ["Model"] @@ -74,6 +76,11 @@ def get_model(model_path: Path, dtype: Optional[str], pool: str): logger.info(f"backend device: {device}") config = AutoConfig.from_pretrained(model_path, trust_remote_code=TRUST_REMOTE_CODE) + + # Neuron cases + if is_neuron(): + if config.model_type == "bert": + return create_model(NeuronSentenceTransformers, model_path) if ( hasattr(config, "auto_map") diff --git a/backends/python/server/text_embeddings_server/models/neuron_models.py b/backends/python/server/text_embeddings_server/models/neuron_models.py new file mode 100644 index 000000000..d795db071 --- /dev/null +++ b/backends/python/server/text_embeddings_server/models/neuron_models.py @@ -0,0 +1,67 @@ +import inspect +import torch + +from pathlib import Path +from typing import Type, List +from optimum.neuron import NeuronModelForSentenceTransformers +from opentelemetry import trace + +from text_embeddings_server.models import Model +from text_embeddings_server.models.types import PaddedBatch, Embedding, Score + +tracer = trace.get_tracer(__name__) + + +class NeuronSentenceTransformers(Model): + def __init__( + self, + model_path: Path, + device: torch.device, + dtype: torch.dtype, + ): + model = NeuronModelForSentenceTransformers.from_pretrained(model_path) + + self.hidden_size = model.config.hidden_size + position_offset = 0 + model_type = model.config.model_type + if model_type in ["xlm-roberta", "camembert", "roberta"]: + position_offset = model.config.pad_token_id + 1 + if hasattr(model.config, "max_seq_length"): + self.max_input_length = model.config.max_seq_length + else: + self.max_input_length = ( + model.config.max_position_embeddings - position_offset + ) + + self.has_position_ids = ( + inspect.signature(model.forward).parameters.get("position_ids", None) + is not None + ) + self.has_token_type_ids = ( + inspect.signature(model.forward).parameters.get("token_type_ids", None) + is not None + ) + + super(NeuronSentenceTransformers, self).__init__( + model=model, dtype=dtype, device=device + ) + + @property + def batch_type(self) -> Type[PaddedBatch]: + return PaddedBatch + + @tracer.start_as_current_span("embed") + def embed(self, batch: PaddedBatch) -> List[Embedding]: + pass + + @tracer.start_as_current_span("predict") + def predict(self, batch: PaddedBatch) -> List[Score]: + kwargs = {"input_ids": batch.input_ids, "attention_mask": batch.attention_mask} + if self.has_token_type_ids: + kwargs["token_type_ids"] = batch.token_type_ids + if self.has_position_ids: + kwargs["position_ids"] = batch.position_ids + + output = self.model(**kwargs, return_dict=True) + all_scores = output.logits.tolist() + return [Score(values=scores) for scores in all_scores] diff --git a/backends/python/server/text_embeddings_server/utils/device.py b/backends/python/server/text_embeddings_server/utils/device.py index 3f3b04dd7..46b81370f 100644 --- a/backends/python/server/text_embeddings_server/utils/device.py +++ b/backends/python/server/text_embeddings_server/utils/device.py @@ -1,4 +1,6 @@ import os +import re +import functools from loguru import logger import importlib.metadata import importlib.util @@ -49,6 +51,21 @@ def is_hpu() -> bool: is_hpu_available = False return is_hpu_available +@functools.cache +def get_neuron_major() -> int: + MAJORS_FILE = "/proc/devices" + NEURON_MAJOR_LINE = re.compile(r"^\s*(\d+)\s+neuron\s*$") + if not os.path.exists(MAJORS_FILE): + return -1 + with open(MAJORS_FILE, "r") as f: + for l in f.readlines(): + m = NEURON_MAJOR_LINE.match(l) + if m: + return int(m.group(1)) + return -1 + +def is_neuron() -> bool: + return get_neuron_major > -1 def use_ipex() -> bool: value = os.environ.get("USE_IPEX", "True").lower() @@ -72,5 +89,7 @@ def get_device(): if hasattr(torch, "xpu") and torch.xpu.is_available(): device = torch.device("xpu") + elif is_neuron(): + device = torch.device("xla") return device From dd0c08ddad7abe38caf76f720844e7438e42067a Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Mon, 27 Oct 2025 17:10:38 +0000 Subject: [PATCH 03/20] fix: neuron dockerfile --- Dockerfile-neuron | 187 ++++++++++++++++++ Dockerfile.neuron | 43 ---- backends/Cargo.toml | 1 - .../python/server/requirements-neuron.txt | 1 + docs/source/en/ aws_neuron.md | 37 ++++ docs/source/en/local_neuron.md | 1 - 6 files changed, 225 insertions(+), 45 deletions(-) create mode 100644 Dockerfile-neuron delete mode 100644 Dockerfile.neuron create mode 100644 backends/python/server/requirements-neuron.txt create mode 100644 docs/source/en/ aws_neuron.md delete mode 100644 docs/source/en/local_neuron.md diff --git a/Dockerfile-neuron b/Dockerfile-neuron new file mode 100644 index 000000000..52797d687 --- /dev/null +++ b/Dockerfile-neuron @@ -0,0 +1,187 @@ +ARG PLATFORM=neuron +FROM lukemathwalker/cargo-chef:latest-rust-1.85-bookworm AS chef +WORKDIR /usr/src + +ENV SCCACHE=0.10.0 +ENV RUSTC_WRAPPER=/usr/local/bin/sccache + +# Donwload, configure sccache +RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \ + chmod +x /usr/local/bin/sccache + +FROM chef AS planner + +COPY backends backends +COPY core core +COPY router router +COPY Cargo.toml ./ +COPY Cargo.lock ./ + +RUN cargo chef prepare --recipe-path recipe.json + +FROM chef AS builder + +ARG GIT_SHA +ARG DOCKER_LABEL + +# sccache specific variables +ARG SCCACHE_GHA_ENABLED + +COPY --from=planner /usr/src/recipe.json recipe.json + +RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ + --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ + cargo chef cook --release --features ort,candle,mkl,static-linking --no-default-features --recipe-path recipe.json && sccache -s + +COPY backends backends +COPY core core +COPY router router +COPY Cargo.toml ./ +COPY Cargo.lock ./ + +FROM builder AS http-builder + +RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ + --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ + cargo build --release --bin text-embeddings-router --features ort,candle,mkl,static-linking,http --no-default-features && sccache -s + +FROM builder AS grpc-builder + +RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \ + curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \ + unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \ + unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \ + rm -f $PROTOC_ZIP + +COPY proto proto + +RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ + --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ + cargo build --release --bin text-embeddings-router --features ort,candle,mkl,static-linking,grpc --no-default-features && sccache -s + +FROM public.ecr.aws/docker/library/ubuntu:22.04 AS neuron + +ENV HUGGINGFACE_HUB_CACHE=/data \ + PORT=80 + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + python3 \ + python3-pip \ + python3-dev \ + build-essential \ + git \ + curl \ + cmake \ + pkg-config \ + protobuf-compiler \ + ninja-build \ + && rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python3 /usr/local/bin/python || true +RUN ln -s /usr/bin/pip3 /usr/local/bin/pip || true + +WORKDIR /usr/src +COPY backends backends +COPY backends/python/server/text_embeddings_server/models/__init__.py backends/python/server/text_embeddings_server/models/__init__.py +COPY backends/python/server/pyproject.toml backends/python/server/pyproject.toml +RUN cd backends/python/server && \ + make install + +ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58 +ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008 +ARG NEURONX_TOOLS_VERSION=2.26.14.0 + +ARG NEURONX_CC_VERSION=2.21.18209.0+043b1bf7 +ARG NEURONX_FRAMEWORK_VERSION=2.8.0.2.10.13553+1e4dd6ca +ARG NEURONX_DISTRIBUTED_VERSION=0.15.22404+1f27bddf +ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.6.10598+a59fdc00 + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get install -y --no-install-recommends \ + apt-transport-https \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + emacs \ + git \ + gnupg2 \ + gpg-agent \ + jq \ + libgl1-mesa-glx \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender-dev \ + libcap-dev \ + libhwloc-dev \ + openjdk-11-jdk \ + unzip \ + vim \ + wget \ + zlib1g-dev \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list +RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - + +RUN apt-get update \ + && apt-get install -y \ + aws-neuronx-tools=$NEURONX_TOOLS_VERSION \ + aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \ + aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +RUN pip install --index-url https://pip.repos.neuron.amazonaws.com \ + --extra-index-url https://pypi.org/simple \ + --trusted-host pip.repos.neuron.amazonaws.com \ + neuronx-cc==$NEURONX_CC_VERSION \ + torch-neuronx==$NEURONX_FRAMEWORK_VERSION \ + neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \ + && rm -rf ~/.cache/pip/* + +# HF ARGS +ARG TRANSFORMERS_VERSION=4.55.4 +ARG DIFFUSERS_VERSION=0.35.2 +ARG HUGGINGFACE_HUB_VERSION=0.36.0 +ARG OPTIMUM_NEURON_VERSION=0.4.1 +ARG SENTENCE_TRANSFORMERS=5.1.2 +ARG PEFT_VERSION=0.17.0 +ARG DATASETS_VERSION=4.1.1 + +# install Hugging Face libraries and its dependencies +RUN pip install --no-cache-dir -U \ + networkx==2.8.8 \ + transformers[sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \ + diffusers==${DIFFUSERS_VERSION} \ + compel \ + controlnet-aux \ + huggingface_hub==${HUGGINGFACE_HUB_VERSION} \ + hf_transfer \ + datasets==${DATASETS_VERSION} \ + optimum-neuron==${OPTIMUM_NEURON_VERSION} \ + sentence_transformers==${SENTENCE_TRANSFORMERS} \ + peft==${PEFT_VERSION} \ + && rm -rf ~/.cache/pip/* + + +FROM neuron AS grpc + +COPY --from=grpc-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router + +ENTRYPOINT ["text-embeddings-router"] +CMD ["--json-output"] + +FROM neuron + +COPY --from=http-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router + +ENTRYPOINT ["text-embeddings-router"] +CMD ["--json-output"] + + diff --git a/Dockerfile.neuron b/Dockerfile.neuron deleted file mode 100644 index f8b03ab26..000000000 --- a/Dockerfile.neuron +++ /dev/null @@ -1,43 +0,0 @@ -ARG PLATFORM=neuron -FROM lukemathwalker/cargo-chef:latest-rust-1.85-bookworm AS chef -WORKDIR /usr/src - -ENV SCCACHE=0.10.0 -ENV RUSTC_WRAPPER=/usr/local/bin/sccache - -# Donwload, configure sccache -RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \ - chmod +x /usr/local/bin/sccache - -FROM chef AS planner - -COPY backends backends -COPY core core -COPY router router -COPY Cargo.toml ./ -COPY Cargo.lock ./ - -RUN cargo chef prepare --recipe-path recipe.json - -FROM chef AS builder - -ARG GIT_SHA -ARG DOCKER_LABEL - -# sccache specific variables -ARG SCCACHE_GHA_ENABLED - -COPY --from=planner /usr/src/recipe.json recipe.json - -RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ - --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ - cargo chef cook --release --features python --no-default-features --recipe-path recipe.json && sccache -s - -COPY backends backends -COPY core core -COPY router router -COPY Cargo.toml ./ -COPY Cargo.lock ./ - -WORKDIR /usr/src - diff --git a/backends/Cargo.toml b/backends/Cargo.toml index 7d821ff40..bb9d74191 100644 --- a/backends/Cargo.toml +++ b/backends/Cargo.toml @@ -21,7 +21,6 @@ rand = { workspace = true } [features] clap = ["dep:clap", "text-embeddings-backend-core/clap"] python = ["dep:text-embeddings-backend-python"] -neuron = ["dep:text-embeddings-backend-neuron"] ort = ["dep:text-embeddings-backend-ort"] candle = ["dep:text-embeddings-backend-candle"] cuda = ["text-embeddings-backend-candle?/cuda"] diff --git a/backends/python/server/requirements-neuron.txt b/backends/python/server/requirements-neuron.txt new file mode 100644 index 000000000..b8ce3518e --- /dev/null +++ b/backends/python/server/requirements-neuron.txt @@ -0,0 +1 @@ +transformers==4.55.4 \ No newline at end of file diff --git a/docs/source/en/ aws_neuron.md b/docs/source/en/ aws_neuron.md new file mode 100644 index 000000000..13ea7f86e --- /dev/null +++ b/docs/source/en/ aws_neuron.md @@ -0,0 +1,37 @@ + +# Using TEI Container with AWS Trainium and Inferentia Instances + +## Build Docker Image + +To build a container optimized for AWS Neuron devices, run the following command: + +```shell +platform="neuron" + +docker build . -f Dockerfile-neuron -t tei_neuron +``` + +### Deploy Docker Container + +To deploy your model on an AWS Trainium or Inferentia instance, use the following command: + +```shell +model='Qwen/Qwen3-Embedding-0.6B' +volume=$PWD/data + +docker run -p 8080:80 -v $volume:/data tei_neuron --model-id $model +``` \ No newline at end of file diff --git a/docs/source/en/local_neuron.md b/docs/source/en/local_neuron.md deleted file mode 100644 index e0a2cf2ba..000000000 --- a/docs/source/en/local_neuron.md +++ /dev/null @@ -1 +0,0 @@ -# Neuron backend for AWS Trainium and Inferentia \ No newline at end of file From 1e4f3c92d03c9193c62f9d7d20e476b0f2f11dda Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Tue, 28 Oct 2025 17:23:49 +0000 Subject: [PATCH 04/20] remove useless --- Dockerfile-neuron | 2 -- backends/python/server/requirements-neuron.txt | 1 - 2 files changed, 3 deletions(-) delete mode 100644 backends/python/server/requirements-neuron.txt diff --git a/Dockerfile-neuron b/Dockerfile-neuron index 52797d687..a536ab7dd 100644 --- a/Dockerfile-neuron +++ b/Dockerfile-neuron @@ -183,5 +183,3 @@ COPY --from=http-builder /usr/src/target/release/text-embeddings-router /usr/loc ENTRYPOINT ["text-embeddings-router"] CMD ["--json-output"] - - diff --git a/backends/python/server/requirements-neuron.txt b/backends/python/server/requirements-neuron.txt deleted file mode 100644 index b8ce3518e..000000000 --- a/backends/python/server/requirements-neuron.txt +++ /dev/null @@ -1 +0,0 @@ -transformers==4.55.4 \ No newline at end of file From a25cf98d6d98135258ad5ec18549ebdeed02f02a Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Fri, 31 Oct 2025 13:11:12 +0000 Subject: [PATCH 05/20] fix dockerfile --- Dockerfile-neuron | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/Dockerfile-neuron b/Dockerfile-neuron index a536ab7dd..e09c64915 100644 --- a/Dockerfile-neuron +++ b/Dockerfile-neuron @@ -1,4 +1,3 @@ -ARG PLATFORM=neuron FROM lukemathwalker/cargo-chef:latest-rust-1.85-bookworm AS chef WORKDIR /usr/src @@ -31,7 +30,7 @@ COPY --from=planner /usr/src/recipe.json recipe.json RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ - cargo chef cook --release --features ort,candle,mkl,static-linking --no-default-features --recipe-path recipe.json && sccache -s + cargo chef cook --release --features python --no-default-features --recipe-path recipe.json && sccache -s COPY backends backends COPY core core @@ -39,25 +38,25 @@ COPY router router COPY Cargo.toml ./ COPY Cargo.lock ./ +RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \ + curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \ + unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \ + unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \ + rm -f $PROTOC_ZIP + FROM builder AS http-builder RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ - cargo build --release --bin text-embeddings-router --features ort,candle,mkl,static-linking,http --no-default-features && sccache -s + cargo build --release --bin text-embeddings-router -F python -F http --no-default-features && sccache -s FROM builder AS grpc-builder -RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \ - curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \ - unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \ - unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \ - rm -f $PROTOC_ZIP - COPY proto proto RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ - cargo build --release --bin text-embeddings-router --features ort,candle,mkl,static-linking,grpc --no-default-features && sccache -s + cargo build --release --bin text-embeddings-router -F grpc -F python --no-default-features && sccache -s FROM public.ecr.aws/docker/library/ubuntu:22.04 AS neuron From 56c15d896b15341dd3656a04042bb790102d0205 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Mon, 3 Nov 2025 10:53:09 +0000 Subject: [PATCH 06/20] neuron path --- Dockerfile-neuron | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Dockerfile-neuron b/Dockerfile-neuron index e09c64915..16005db2d 100644 --- a/Dockerfile-neuron +++ b/Dockerfile-neuron @@ -136,6 +136,9 @@ RUN apt-get update \ && rm -rf /tmp/tmp* \ && apt-get clean +ENV PATH="/opt/aws/neuron/bin:${PATH}" +ENV NEURON_RT_VISIBLE_CORES=ALL + RUN pip install --index-url https://pip.repos.neuron.amazonaws.com \ --extra-index-url https://pypi.org/simple \ --trusted-host pip.repos.neuron.amazonaws.com \ @@ -145,7 +148,7 @@ RUN pip install --index-url https://pip.repos.neuron.amazonaws.com \ && rm -rf ~/.cache/pip/* # HF ARGS -ARG TRANSFORMERS_VERSION=4.55.4 +ARG TRANSFORMERS_VERSION=4.57.1 ARG DIFFUSERS_VERSION=0.35.2 ARG HUGGINGFACE_HUB_VERSION=0.36.0 ARG OPTIMUM_NEURON_VERSION=0.4.1 @@ -154,6 +157,7 @@ ARG PEFT_VERSION=0.17.0 ARG DATASETS_VERSION=4.1.1 # install Hugging Face libraries and its dependencies +# optimum-neuron==${OPTIMUM_NEURON_VERSION} \ RUN pip install --no-cache-dir -U \ networkx==2.8.8 \ transformers[sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \ @@ -163,7 +167,7 @@ RUN pip install --no-cache-dir -U \ huggingface_hub==${HUGGINGFACE_HUB_VERSION} \ hf_transfer \ datasets==${DATASETS_VERSION} \ - optimum-neuron==${OPTIMUM_NEURON_VERSION} \ + "optimum-neuron @ git+https://github.com/huggingface/optimum-neuron@main" \ sentence_transformers==${SENTENCE_TRANSFORMERS} \ peft==${PEFT_VERSION} \ && rm -rf ~/.cache/pip/* From 142520a5e829d2ac5018e98af5a373612634ae3c Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Mon, 3 Nov 2025 16:38:18 +0000 Subject: [PATCH 07/20] fix container env + Neuron related changes --- Dockerfile-neuron | 7 +-- .../text_embeddings_server/models/__init__.py | 4 +- .../models/neuron_models.py | 30 ++++++---- backends/src/lib.rs | 60 ++++++++++++++++--- docs/source/en/ aws_neuron.md | 6 +- 5 files changed, 78 insertions(+), 29 deletions(-) diff --git a/Dockerfile-neuron b/Dockerfile-neuron index 16005db2d..9f4b23740 100644 --- a/Dockerfile-neuron +++ b/Dockerfile-neuron @@ -90,10 +90,9 @@ ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58 ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008 ARG NEURONX_TOOLS_VERSION=2.26.14.0 -ARG NEURONX_CC_VERSION=2.21.18209.0+043b1bf7 -ARG NEURONX_FRAMEWORK_VERSION=2.8.0.2.10.13553+1e4dd6ca +ARG NEURONX_CC_VERSION=2.21.33363.0+82129205 +ARG NEURONX_FRAMEWORK_VERSION=2.8.0.2.10.16998+e9bf8a50 ARG NEURONX_DISTRIBUTED_VERSION=0.15.22404+1f27bddf -ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.6.10598+a59fdc00 RUN apt-get update \ && apt-get upgrade -y \ @@ -137,13 +136,13 @@ RUN apt-get update \ && apt-get clean ENV PATH="/opt/aws/neuron/bin:${PATH}" -ENV NEURON_RT_VISIBLE_CORES=ALL RUN pip install --index-url https://pip.repos.neuron.amazonaws.com \ --extra-index-url https://pypi.org/simple \ --trusted-host pip.repos.neuron.amazonaws.com \ neuronx-cc==$NEURONX_CC_VERSION \ torch-neuronx==$NEURONX_FRAMEWORK_VERSION \ + torchvision \ neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \ && rm -rf ~/.cache/pip/* diff --git a/backends/python/server/text_embeddings_server/models/__init__.py b/backends/python/server/text_embeddings_server/models/__init__.py index 8fb4076c0..0ca8b584c 100644 --- a/backends/python/server/text_embeddings_server/models/__init__.py +++ b/backends/python/server/text_embeddings_server/models/__init__.py @@ -14,7 +14,7 @@ from text_embeddings_server.models.jinaBert_model import FlashJinaBert from text_embeddings_server.models.flash_mistral import FlashMistral from text_embeddings_server.models.flash_qwen3 import FlashQwen3 -from text_embeddings_server.models.neuron_models import NeuronSentenceTransformers +from text_embeddings_server.models.neuron_models import NeuronSentenceTransformersModel from text_embeddings_server.utils.device import get_device, use_ipex, is_neuron @@ -80,7 +80,7 @@ def get_model(model_path: Path, dtype: Optional[str], pool: str): # Neuron cases if is_neuron(): if config.model_type == "bert": - return create_model(NeuronSentenceTransformers, model_path) + return create_model(NeuronSentenceTransformersModel, model_path, device, datatype) if ( hasattr(config, "auto_map") diff --git a/backends/python/server/text_embeddings_server/models/neuron_models.py b/backends/python/server/text_embeddings_server/models/neuron_models.py index d795db071..e3b850c3e 100644 --- a/backends/python/server/text_embeddings_server/models/neuron_models.py +++ b/backends/python/server/text_embeddings_server/models/neuron_models.py @@ -3,7 +3,7 @@ from pathlib import Path from typing import Type, List -from optimum.neuron import NeuronModelForSentenceTransformers +from optimum.neuron import NeuronSentenceTransformers from opentelemetry import trace from text_embeddings_server.models import Model @@ -12,14 +12,14 @@ tracer = trace.get_tracer(__name__) -class NeuronSentenceTransformers(Model): +class NeuronSentenceTransformersModel(Model): def __init__( self, model_path: Path, device: torch.device, dtype: torch.dtype, ): - model = NeuronModelForSentenceTransformers.from_pretrained(model_path) + model = NeuronSentenceTransformers.from_pretrained(model_path) self.hidden_size = model.config.hidden_size position_offset = 0 @@ -42,7 +42,7 @@ def __init__( is not None ) - super(NeuronSentenceTransformers, self).__init__( + super(NeuronSentenceTransformersModel, self).__init__( model=model, dtype=dtype, device=device ) @@ -52,16 +52,20 @@ def batch_type(self) -> Type[PaddedBatch]: @tracer.start_as_current_span("embed") def embed(self, batch: PaddedBatch) -> List[Embedding]: - pass - - @tracer.start_as_current_span("predict") - def predict(self, batch: PaddedBatch) -> List[Score]: kwargs = {"input_ids": batch.input_ids, "attention_mask": batch.attention_mask} if self.has_token_type_ids: kwargs["token_type_ids"] = batch.token_type_ids - if self.has_position_ids: - kwargs["position_ids"] = batch.position_ids + output = self.model(**kwargs) + + sentence_embedding = output["sentence_embedding"] - output = self.model(**kwargs, return_dict=True) - all_scores = output.logits.tolist() - return [Score(values=scores) for scores in all_scores] + return [ + Embedding( + values=sentence_embedding[i * self.hidden_size : (i + 1) * self.hidden_size] + ) + for i in range(len(batch)) + ] + + @tracer.start_as_current_span("predict") + def predict(self, batch: PaddedBatch) -> List[Score]: + pass diff --git a/backends/src/lib.rs b/backends/src/lib.rs index 245715b38..b53067de1 100644 --- a/backends/src/lib.rs +++ b/backends/src/lib.rs @@ -67,6 +67,15 @@ fn is_hpu() -> bool { } } +fn is_neuron() -> bool { + match Command::new("neuron-ls") + .output() + { + Ok(output) => output.status.success(), + Err(_) => false, + } +} + #[derive(Debug, Clone)] pub struct Backend { /// Channel to communicate with the background thread @@ -409,16 +418,39 @@ async fn init_backend( if let Some(api_repo) = api_repo.as_ref() { if cfg!(feature = "python") || cfg!(feature = "candle") { let start = std::time::Instant::now(); - if download_safetensors(api_repo).await.is_err() { - tracing::warn!("safetensors weights not found. Using `pytorch_model.bin` instead. Model loading will be significantly slower."); - tracing::info!("Downloading `pytorch_model.bin`"); - api_repo - .get("pytorch_model.bin") + if is_neuron() { + tracing::info!("Downloading `model.neuron`"); + let model_files = download_neuron(api_repo) .await .map_err(|err| BackendError::WeightsNotFound(err.to_string()))?; - } - tracing::info!("Model weights downloaded in {:?}", start.elapsed()); + if model_files.is_empty() { + tracing::error!( + "Neuron model files not found in the repository. \ + You can easily compile your model to neuron format following the guide: \ + https://huggingface.co/docs/optimum-neuron/en/model_doc/sentence_transformers/overview " + ); + return Err(BackendError::WeightsNotFound( + "No Neuron model files found".into(), + )); + } + + tracing::info!("Neuron model downloaded in {:?}", start.elapsed()); + } else { + if download_safetensors(api_repo).await.is_err() { + tracing::warn!( + "safetensors weights not found. Using `pytorch_model.bin` instead. \ + Model loading will be significantly slower." + ); + tracing::info!("Downloading `pytorch_model.bin`"); + api_repo + .get("pytorch_model.bin") + .await + .map_err(|err| BackendError::WeightsNotFound(err.to_string()))?; + } + + tracing::info!("Model weights downloaded in {:?}", start.elapsed()); + } } } @@ -655,6 +687,20 @@ async fn download_onnx(api: &ApiRepo) -> Result, ApiError> { Ok(model_files) } +async fn download_neuron(api: &ApiRepo) -> Result, ApiError> { + let mut model_files: Vec = Vec::new(); + + tracing::info!("Downloading `model.neuron`"); + match api.get("model.neuron").await { + Ok(p) => model_files.push(p), + Err(err) => { + tracing::warn!("Could not download `model.neuron`: {err}"); + } + }; + + Ok(model_files) +} + #[cfg(feature = "candle")] #[derive(Debug, Clone, Deserialize, PartialEq)] enum ModuleType { diff --git a/docs/source/en/ aws_neuron.md b/docs/source/en/ aws_neuron.md index 13ea7f86e..d383fdba8 100644 --- a/docs/source/en/ aws_neuron.md +++ b/docs/source/en/ aws_neuron.md @@ -22,7 +22,7 @@ To build a container optimized for AWS Neuron devices, run the following command ```shell platform="neuron" -docker build . -f Dockerfile-neuron -t tei_neuron +docker build . -f Dockerfile-neuron -t tei-neuron:main ``` ### Deploy Docker Container @@ -30,8 +30,8 @@ docker build . -f Dockerfile-neuron -t tei_neuron To deploy your model on an AWS Trainium or Inferentia instance, use the following command: ```shell -model='Qwen/Qwen3-Embedding-0.6B' +model='optimum/bge-base-en-v1.5-neuronx' volume=$PWD/data -docker run -p 8080:80 -v $volume:/data tei_neuron --model-id $model +docker run -p 8080:80 -v $volume:/data tei-neuron:main --model-id $model --dtype float32 ``` \ No newline at end of file From 7ada87700b2d994b742333468b5c33a2b4db2cff Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Tue, 3 Feb 2026 17:09:05 +0000 Subject: [PATCH 08/20] fix for neuron backend + tests --- Dockerfile-neuron | 25 +- .../text_embeddings_server/models/__init__.py | 63 ++- .../models/neuron_models.py | 420 +++++++++++++++++- .../text_embeddings_server/utils/device.py | 2 +- backends/src/lib.rs | 19 +- integration_tests/README.md | 114 ++++- integration_tests/neuron/conftest.py | 299 +++++++++++++ integration_tests/neuron/test_embed.py | 223 ++++++++++ 8 files changed, 1109 insertions(+), 56 deletions(-) diff --git a/Dockerfile-neuron b/Dockerfile-neuron index 9f4b23740..dbf1e9a29 100644 --- a/Dockerfile-neuron +++ b/Dockerfile-neuron @@ -4,7 +4,7 @@ WORKDIR /usr/src ENV SCCACHE=0.10.0 ENV RUSTC_WRAPPER=/usr/local/bin/sccache -# Donwload, configure sccache +# Download, configure sccache RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \ chmod +x /usr/local/bin/sccache @@ -63,6 +63,8 @@ FROM public.ecr.aws/docker/library/ubuntu:22.04 AS neuron ENV HUGGINGFACE_HUB_CACHE=/data \ PORT=80 +ENV PATH="/usr/local/bin:/root/.local/bin:${PATH}" + RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ python3 \ python3-pip \ @@ -123,8 +125,9 @@ RUN apt-get update \ && rm -rf /tmp/tmp* \ && apt-get clean -RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list -RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - +# Ubuntu 22.04 = jammy; use signed-by (apt-key is deprecated) +RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | gpg --dearmor -o /usr/share/keyrings/neuron-archive-keyring.gpg && \ + echo "deb [signed-by=/usr/share/keyrings/neuron-archive-keyring.gpg] https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list RUN apt-get update \ && apt-get install -y \ @@ -147,7 +150,8 @@ RUN pip install --index-url https://pip.repos.neuron.amazonaws.com \ && rm -rf ~/.cache/pip/* # HF ARGS -ARG TRANSFORMERS_VERSION=4.57.1 +# Note: optimum-neuron 0.4.1 requires transformers~=4.55.4 +ARG TRANSFORMERS_VERSION=4.55.4 ARG DIFFUSERS_VERSION=0.35.2 ARG HUGGINGFACE_HUB_VERSION=0.36.0 ARG OPTIMUM_NEURON_VERSION=0.4.1 @@ -155,20 +159,19 @@ ARG SENTENCE_TRANSFORMERS=5.1.2 ARG PEFT_VERSION=0.17.0 ARG DATASETS_VERSION=4.1.1 -# install Hugging Face libraries and its dependencies -# optimum-neuron==${OPTIMUM_NEURON_VERSION} \ +# Install Hugging Face libraries and dependencies for TEI on Neuron RUN pip install --no-cache-dir -U \ - networkx==2.8.8 \ - transformers[sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \ + networkx==2.8.8 \ + transformers[sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \ diffusers==${DIFFUSERS_VERSION} \ compel \ controlnet-aux \ huggingface_hub==${HUGGINGFACE_HUB_VERSION} \ hf_transfer \ datasets==${DATASETS_VERSION} \ - "optimum-neuron @ git+https://github.com/huggingface/optimum-neuron@main" \ - sentence_transformers==${SENTENCE_TRANSFORMERS} \ - peft==${PEFT_VERSION} \ + optimum-neuron==${OPTIMUM_NEURON_VERSION} \ + sentence_transformers==${SENTENCE_TRANSFORMERS} \ + peft==${PEFT_VERSION} \ && rm -rf ~/.cache/pip/* diff --git a/backends/python/server/text_embeddings_server/models/__init__.py b/backends/python/server/text_embeddings_server/models/__init__.py index 0ca8b584c..1de5f9b1b 100644 --- a/backends/python/server/text_embeddings_server/models/__init__.py +++ b/backends/python/server/text_embeddings_server/models/__init__.py @@ -11,10 +11,6 @@ from text_embeddings_server.models.masked_model import MaskedLanguageModel from text_embeddings_server.models.default_model import DefaultModel from text_embeddings_server.models.classification_model import ClassificationModel -from text_embeddings_server.models.jinaBert_model import FlashJinaBert -from text_embeddings_server.models.flash_mistral import FlashMistral -from text_embeddings_server.models.flash_qwen3 import FlashQwen3 -from text_embeddings_server.models.neuron_models import NeuronSentenceTransformersModel from text_embeddings_server.utils.device import get_device, use_ipex, is_neuron @@ -25,12 +21,21 @@ "true", "1", ] -# Disable gradients -torch.set_grad_enabled(False) +# Flash Attention models - only available when flash_attn is installed FLASH_ATTENTION = True +FlashBert = None +FlashJinaBert = None +FlashMistral = None +FlashQwen3 = None + try: from text_embeddings_server.models.flash_bert import FlashBert + from text_embeddings_server.models.jinaBert_model import FlashJinaBert + from text_embeddings_server.models.flash_mistral import FlashMistral + from text_embeddings_server.models.flash_qwen3 import FlashQwen3 + # Disable gradients + torch.set_grad_enabled(False) except ImportError as e: logger.warning(f"Could not import Flash Attention enabled models: {e}") FLASH_ATTENTION = False @@ -38,6 +43,25 @@ if FLASH_ATTENTION: __all__.append(FlashBert) +# Neuron models - only import when on Neuron device to avoid unnecessary dependencies +NeuronSentenceTransformersModel = None +NeuronEmbeddingModel = None +NeuronClassificationModel = None +NeuronMaskedLMModel = None +create_neuron_model = None + +if is_neuron(): + try: + from text_embeddings_server.models.neuron_models import ( + NeuronSentenceTransformersModel, + NeuronEmbeddingModel, + NeuronClassificationModel, + NeuronMaskedLMModel, + create_neuron_model, + ) + except ImportError as e: + logger.warning(f"Could not import Neuron models: {e}") + def wrap_model_if_hpu(model_handle, device): """Wrap the model in HPU graph if the device is HPU.""" @@ -76,14 +100,27 @@ def get_model(model_path: Path, dtype: Optional[str], pool: str): logger.info(f"backend device: {device}") config = AutoConfig.from_pretrained(model_path, trust_remote_code=TRUST_REMOTE_CODE) - - # Neuron cases + + # Neuron cases - use optimum-neuron for all supported model types if is_neuron(): - if config.model_type == "bert": - return create_model(NeuronSentenceTransformersModel, model_path, device, datatype) + logger.info(f"Neuron device detected, using optimum-neuron backend for model type: {config.model_type}") + try: + return create_neuron_model( + model_path=model_path, + device=device, + dtype=datatype, + pool=pool, + trust_remote=TRUST_REMOTE_CODE, + config=config, + ) + except Exception as e: + logger.warning(f"Failed to load model with optimum-neuron: {e}") + logger.warning("Falling back to default model loading path") + # Fall through to default model loading if ( - hasattr(config, "auto_map") + FlashJinaBert is not None + and hasattr(config, "auto_map") and isinstance(config.auto_map, dict) and "AutoModel" in config.auto_map and config.auto_map["AutoModel"] @@ -123,13 +160,13 @@ def get_model(model_path: Path, dtype: Optional[str], pool: str): else: return create_model(DefaultModel, model_path, device, datatype, pool) - if config.model_type == "mistral" and device.type == "hpu": + if config.model_type == "mistral" and device.type == "hpu" and FlashMistral is not None: try: return create_model(FlashMistral, model_path, device, datatype, pool) except FileNotFoundError: return create_model(DefaultModel, model_path, device, datatype, pool) - if config.model_type == "qwen3" and device.type == "hpu": + if config.model_type == "qwen3" and device.type == "hpu" and FlashQwen3 is not None: try: return create_model(FlashQwen3, model_path, device, datatype, pool) except FileNotFoundError: diff --git a/backends/python/server/text_embeddings_server/models/neuron_models.py b/backends/python/server/text_embeddings_server/models/neuron_models.py index e3b850c3e..4589f6b77 100644 --- a/backends/python/server/text_embeddings_server/models/neuron_models.py +++ b/backends/python/server/text_embeddings_server/models/neuron_models.py @@ -1,71 +1,443 @@ import inspect +import os import torch +from abc import ABC from pathlib import Path -from typing import Type, List -from optimum.neuron import NeuronSentenceTransformers +from typing import Type, List, Optional from opentelemetry import trace +from loguru import logger -from text_embeddings_server.models import Model +from text_embeddings_server.models.model import Model from text_embeddings_server.models.types import PaddedBatch, Embedding, Score tracer = trace.get_tracer(__name__) +# Neuron compilation parameters from environment variables +NEURON_BATCH_SIZE = int(os.getenv("NEURON_BATCH_SIZE", "1")) +NEURON_SEQUENCE_LENGTH = int(os.getenv("NEURON_SEQUENCE_LENGTH", "512")) + + +class NeuronBaseModel(Model, ABC): + """Base class for all Neuron models with common functionality.""" -class NeuronSentenceTransformersModel(Model): def __init__( self, + model, model_path: Path, device: torch.device, dtype: torch.dtype, ): - model = NeuronSentenceTransformers.from_pretrained(model_path) - self.hidden_size = model.config.hidden_size + + # Calculate max input length based on model type position_offset = 0 model_type = model.config.model_type if model_type in ["xlm-roberta", "camembert", "roberta"]: - position_offset = model.config.pad_token_id + 1 + position_offset = getattr(model.config, "pad_token_id", 1) + 1 + if hasattr(model.config, "max_seq_length"): self.max_input_length = model.config.max_seq_length + elif hasattr(model.config, "n_positions"): + self.max_input_length = model.config.n_positions else: self.max_input_length = ( model.config.max_position_embeddings - position_offset ) - self.has_position_ids = ( - inspect.signature(model.forward).parameters.get("position_ids", None) - is not None - ) - self.has_token_type_ids = ( - inspect.signature(model.forward).parameters.get("token_type_ids", None) - is not None - ) + # Check which inputs the model supports + self.has_position_ids = self._check_param_exists(model, "position_ids") + self.has_token_type_ids = self._check_param_exists(model, "token_type_ids") - super(NeuronSentenceTransformersModel, self).__init__( - model=model, dtype=dtype, device=device - ) + super().__init__(model=model, dtype=dtype, device=device) + + @staticmethod + def _check_param_exists(model, param_name: str) -> bool: + """Check if a parameter exists in the model's forward signature.""" + try: + forward_fn = model.forward if hasattr(model, 'forward') else model.__call__ + return ( + inspect.signature(forward_fn).parameters.get(param_name, None) + is not None + ) + except (ValueError, TypeError): + return False @property def batch_type(self) -> Type[PaddedBatch]: return PaddedBatch + def _prepare_inputs(self, batch: PaddedBatch) -> dict: + """Prepare input kwargs for model forward pass. + + Note: Neuron models require int64 (long) tensors for inputs. + """ + kwargs = { + "input_ids": batch.input_ids.to(torch.long), + "attention_mask": batch.attention_mask.to(torch.long), + } + if self.has_token_type_ids: + kwargs["token_type_ids"] = batch.token_type_ids.to(torch.long) + if self.has_position_ids: + kwargs["position_ids"] = batch.position_ids.to(torch.long) + return kwargs + + +class NeuronSentenceTransformersModel(NeuronBaseModel): + """ + Neuron-optimized model for sentence-transformers. + + Uses optimum.neuron.NeuronModelForSentenceTransformers which is designed + for sentence embedding models that output sentence_embedding directly. + """ + + def __init__( + self, + model_path: Path, + device: torch.device, + dtype: torch.dtype, + pool: str = "cls", + trust_remote: bool = False, + ): + try: + from optimum.neuron import NeuronModelForSentenceTransformers + is_compiled = self._is_neuron_compiled(model_path) + export_kwargs = {} + if not is_compiled: + export_kwargs = { + "export": True, + "batch_size": NEURON_BATCH_SIZE, + "sequence_length": NEURON_SEQUENCE_LENGTH, + } + logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}") + model = NeuronModelForSentenceTransformers.from_pretrained( + model_path, + **export_kwargs, + ) + except ImportError: + # Fallback to legacy import + from optimum.neuron import NeuronSentenceTransformers + model = NeuronSentenceTransformers.from_pretrained(model_path) + + super().__init__(model, model_path, device, dtype) + self.pool = pool + logger.info(f"Loaded NeuronSentenceTransformersModel with pool={pool}") + + @staticmethod + def _is_neuron_compiled(model_path: Path) -> bool: + """Check if the model is already compiled for Neuron.""" + neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else [] + return len(neuron_files) > 0 + @tracer.start_as_current_span("embed") def embed(self, batch: PaddedBatch) -> List[Embedding]: - kwargs = {"input_ids": batch.input_ids, "attention_mask": batch.attention_mask} - if self.has_token_type_ids: - kwargs["token_type_ids"] = batch.token_type_ids + kwargs = self._prepare_inputs(batch) output = self.model(**kwargs) - sentence_embedding = output["sentence_embedding"] + sentence_embedding = None + # NeuronModelForSentenceTransformers returns sentence_embedding directly + if hasattr(output, "sentence_embedding") and output.sentence_embedding is not None: + candidate = output.sentence_embedding + if candidate.abs().sum() > 0: + sentence_embedding = candidate + + # If sentence_embedding is invalid, fall back to manual pooling of token_embeddings + if sentence_embedding is None: + # Get token embeddings + if hasattr(output, "token_embeddings") and output.token_embeddings is not None: + token_embeddings = output.token_embeddings + else: + raise ValueError(f"Cannot extract embeddings from model output: {type(output)}") + + # Apply pooling based on self.pool setting + if self.pool == "cls": + sentence_embedding = token_embeddings[:, 0, :] + elif self.pool == "mean": + attention_mask = kwargs["attention_mask"].unsqueeze(-1).float() + sentence_embedding = (token_embeddings * attention_mask).sum(dim=1) / attention_mask.sum(dim=1) + elif self.pool == "last_token": + seq_lengths = kwargs["attention_mask"].sum(dim=1) - 1 + sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths] + else: + raise ValueError(f"Invalid pooling mode: {self.pool}") + + # Convert to list format expected by the gRPC interface + cpu_results = sentence_embedding.view(-1).tolist() return [ Embedding( - values=sentence_embedding[i * self.hidden_size : (i + 1) * self.hidden_size] + values=cpu_results[i * self.hidden_size : (i + 1) * self.hidden_size] ) for i in range(len(batch)) ] @tracer.start_as_current_span("predict") def predict(self, batch: PaddedBatch) -> List[Score]: - pass + raise NotImplementedError("Prediction not supported for sentence transformer models") + + +class NeuronEmbeddingModel(NeuronBaseModel): + """ + Neuron-optimized model for feature extraction / embeddings. + + Uses optimum.neuron.NeuronModelForFeatureExtraction for models that + output hidden states which need to be pooled. + """ + + def __init__( + self, + model_path: Path, + device: torch.device, + dtype: torch.dtype, + pool: str = "cls", + trust_remote: bool = False, + ): + from optimum.neuron import NeuronModelForFeatureExtraction + + is_compiled = self._is_neuron_compiled(model_path) + export_kwargs = {} + if not is_compiled: + export_kwargs = { + "export": True, + "batch_size": NEURON_BATCH_SIZE, + "sequence_length": NEURON_SEQUENCE_LENGTH, + } + logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}") + model = NeuronModelForFeatureExtraction.from_pretrained( + model_path, + **export_kwargs, + ) + + logger.info(f"DEBUG: model type = {type(model)}") + + super().__init__(model, model_path, device, dtype) + self.pool = pool + + # Initialize pooling layer + from text_embeddings_server.models.pooling import DefaultPooling + self.pooling = DefaultPooling(self.hidden_size, pooling_mode=pool) + + logger.info(f"Loaded NeuronEmbeddingModel with pool={pool}") + + @staticmethod + def _is_neuron_compiled(model_path: Path) -> bool: + """Check if the model is already compiled for Neuron.""" + neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else [] + return len(neuron_files) > 0 + + @tracer.start_as_current_span("embed") + def embed(self, batch: PaddedBatch) -> List[Embedding]: + kwargs = self._prepare_inputs(batch) + output = self.model(**kwargs) + + # Apply pooling to get sentence embeddings + embedding = self.pooling.forward(output, batch.attention_mask) + + cpu_results = embedding.view(-1).tolist() + + return [ + Embedding( + values=cpu_results[i * self.hidden_size : (i + 1) * self.hidden_size] + ) + for i in range(len(batch)) + ] + + @tracer.start_as_current_span("predict") + def predict(self, batch: PaddedBatch) -> List[Score]: + raise NotImplementedError("Prediction not supported for embedding models") + + +class NeuronClassificationModel(NeuronBaseModel): + """ + Neuron-optimized model for sequence classification. + + Uses optimum.neuron.NeuronModelForSequenceClassification for classification tasks. + """ + + def __init__( + self, + model_path: Path, + device: torch.device, + dtype: torch.dtype, + pool: str = "cls", + trust_remote: bool = False, + ): + from optimum.neuron import NeuronModelForSequenceClassification + + is_compiled = self._is_neuron_compiled(model_path) + export_kwargs = {} + if not is_compiled: + export_kwargs = { + "export": True, + "batch_size": NEURON_BATCH_SIZE, + "sequence_length": NEURON_SEQUENCE_LENGTH, + } + logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}") + model = NeuronModelForSequenceClassification.from_pretrained( + model_path, + **export_kwargs, + ) + + super().__init__(model, model_path, device, dtype) + logger.info("Loaded NeuronClassificationModel") + + @staticmethod + def _is_neuron_compiled(model_path: Path) -> bool: + """Check if the model is already compiled for Neuron.""" + neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else [] + return len(neuron_files) > 0 + + @tracer.start_as_current_span("embed") + def embed(self, batch: PaddedBatch) -> List[Embedding]: + raise NotImplementedError("Embedding not supported for classification models") + + @tracer.start_as_current_span("predict") + def predict(self, batch: PaddedBatch) -> List[Score]: + kwargs = self._prepare_inputs(batch) + output = self.model(**kwargs) + + # Get logits from output + if hasattr(output, "logits"): + logits = output.logits + else: + logits = output[0] + + all_scores = logits.tolist() + return [Score(values=scores) for scores in all_scores] + + +class NeuronMaskedLMModel(NeuronBaseModel): + """ + Neuron-optimized model for Masked Language Modeling (SPLADE). + + Uses optimum.neuron.NeuronModelForMaskedLM for SPLADE-style sparse embeddings. + """ + + def __init__( + self, + model_path: Path, + device: torch.device, + dtype: torch.dtype, + pool: str = "splade", + trust_remote: bool = False, + ): + from optimum.neuron import NeuronModelForMaskedLM + + is_compiled = self._is_neuron_compiled(model_path) + export_kwargs = {} + if not is_compiled: + export_kwargs = { + "export": True, + "batch_size": NEURON_BATCH_SIZE, + "sequence_length": NEURON_SEQUENCE_LENGTH, + } + logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}") + model = NeuronModelForMaskedLM.from_pretrained( + model_path, + **export_kwargs, + ) + + super().__init__(model, model_path, device, dtype) + + # Get vocab size for SPLADE output + self.vocab_size = model.config.vocab_size + logger.info(f"Loaded NeuronMaskedLMModel with vocab_size={self.vocab_size}") + + @staticmethod + def _is_neuron_compiled(model_path: Path) -> bool: + """Check if the model is already compiled for Neuron.""" + neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else [] + return len(neuron_files) > 0 + + @tracer.start_as_current_span("embed") + def embed(self, batch: PaddedBatch) -> List[Embedding]: + kwargs = self._prepare_inputs(batch) + output = self.model(**kwargs) + + # Get logits for SPLADE pooling + if hasattr(output, "logits"): + hidden_states = output.logits + else: + hidden_states = output[0] + + # SPLADE pooling: ReLU -> log(1+x) -> max pooling + hidden_states = torch.relu(hidden_states) + hidden_states = (1 + hidden_states).log() + hidden_states = torch.mul(hidden_states, batch.attention_mask.unsqueeze(-1)) + sparse_embedding = hidden_states.max(dim=1).values + + cpu_results = sparse_embedding.view(-1).tolist() + + return [ + Embedding( + values=cpu_results[i * self.vocab_size : (i + 1) * self.vocab_size] + ) + for i in range(len(batch)) + ] + + @tracer.start_as_current_span("predict") + def predict(self, batch: PaddedBatch) -> List[Score]: + raise NotImplementedError("Prediction not supported for masked LM models") + + +def create_neuron_model( + model_path: Path, + device: torch.device, + dtype: torch.dtype, + pool: str = "cls", + trust_remote: bool = False, + config=None, +) -> Model: + """ + Factory function to create the appropriate Neuron model based on the model config. + + Args: + model_path: Path to the model + device: Target device (should be xla for Neuron) + dtype: Data type for the model + pool: Pooling strategy (cls, mean, lasttoken, splade) + trust_remote: Whether to trust remote code + config: Pre-loaded model config (optional) + + Returns: + Appropriate Neuron model instance + """ + from transformers import AutoConfig + + if config is None: + config = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote) + + architectures = getattr(config, "architectures", []) or [] + architecture = architectures[0] if architectures else "" + + logger.info(f"Creating Neuron model for architecture: {architecture}, pool: {pool}") + + # Check for classification models + if architecture.endswith("ForSequenceClassification") or architecture.endswith("Classification"): + return NeuronClassificationModel(model_path, device, dtype, pool, trust_remote) + + # Check for SPLADE (masked LM) models + if pool == "splade" or architecture.endswith("ForMaskedLM"): + return NeuronMaskedLMModel(model_path, device, dtype, pool, trust_remote) + + # Check for sentence-transformers models + # These typically have specific config attributes or are in specific repositories + is_sentence_transformer = ( + hasattr(config, "sentence_transformers_config") or + hasattr(config, "_name_or_path") and "sentence-transformers" in str(config._name_or_path).lower() or + hasattr(config, "pooling_mode") or + (model_path / "sentence_bert_config.json").exists() if model_path.is_dir() else False + ) + + if is_sentence_transformer: + try: + return NeuronSentenceTransformersModel(model_path, device, dtype, pool, trust_remote) + except Exception as e: + logger.warning(f"Failed to load as SentenceTransformer, falling back to FeatureExtraction: {e}") + + # Default to feature extraction model + try: + return NeuronEmbeddingModel(model_path, device, dtype, pool, trust_remote) + except Exception as e: + logger.warning(f"Failed to load NeuronEmbeddingModel, trying NeuronSentenceTransformersModel: {e}") + return NeuronSentenceTransformersModel(model_path, device, dtype, pool, trust_remote) diff --git a/backends/python/server/text_embeddings_server/utils/device.py b/backends/python/server/text_embeddings_server/utils/device.py index 46b81370f..4963b012c 100644 --- a/backends/python/server/text_embeddings_server/utils/device.py +++ b/backends/python/server/text_embeddings_server/utils/device.py @@ -65,7 +65,7 @@ def get_neuron_major() -> int: return -1 def is_neuron() -> bool: - return get_neuron_major > -1 + return get_neuron_major() > -1 def use_ipex() -> bool: value = os.environ.get("USE_IPEX", "True").lower() diff --git a/backends/src/lib.rs b/backends/src/lib.rs index b53067de1..4d45a5b02 100644 --- a/backends/src/lib.rs +++ b/backends/src/lib.rs @@ -425,14 +425,23 @@ async fn init_backend( .map_err(|err| BackendError::WeightsNotFound(err.to_string()))?; if model_files.is_empty() { - tracing::error!( + tracing::warn!( "Neuron model files not found in the repository. \ - You can easily compile your model to neuron format following the guide: \ + The Python backend will attempt to compile the model on-the-fly using optimum-neuron. \ + This may take several minutes. For faster startup, consider pre-compiling your model: \ https://huggingface.co/docs/optimum-neuron/en/model_doc/sentence_transformers/overview " ); - return Err(BackendError::WeightsNotFound( - "No Neuron model files found".into(), - )); + // Fall back to downloading regular model files for on-the-fly compilation + if download_safetensors(api_repo).await.is_err() { + tracing::warn!( + "safetensors weights not found. Using `pytorch_model.bin` instead." + ); + tracing::info!("Downloading `pytorch_model.bin`"); + api_repo + .get("pytorch_model.bin") + .await + .map_err(|err| BackendError::WeightsNotFound(err.to_string()))?; + } } tracing::info!("Neuron model downloaded in {:?}", start.elapsed()); diff --git a/integration_tests/README.md b/integration_tests/README.md index 641d8fce3..18b9232ad 100644 --- a/integration_tests/README.md +++ b/integration_tests/README.md @@ -1,8 +1,18 @@ # Integration Tests -This directory contains integration tests for the project. This starts the TEI server and run an /embed request to it while checking the output is as expected. +This directory contains integration tests for the project. This starts the TEI server and runs an /embed request to it while checking the output is as expected. -## Running the tests for HPU +## How Tests Work + +The tests use pytest fixtures to: +1. Start a Docker container with the TEI server +2. Wait for the server to become healthy +3. Send embedding requests and validate responses +4. Stop and remove the container after tests complete + +The Docker image must be built before running tests. The `uv run pytest` command will start containers automatically using the pre-built image. + +## Running the tests for HPU (Habana Gaudi) First you have to build the docker image. ```bash @@ -13,5 +23,105 @@ docker build . -f Dockerfile-intel --build-arg PLATFORM=$platform -t tei_hpu Then you can run the tests. ```bash +cd integration_tests/gaudi +uv run pytest --durations=0 -sv . +``` + +### Environment Variables (HPU) + +| Variable | Description | Default | +|----------|-------------|---------| +| `DOCKER_IMAGE` | Docker image to use | `tei_hpu` | +| `DOCKER_VOLUME` | Volume for model cache (recommended) | None | +| `HF_TOKEN` | HuggingFace token for gated models | None | +| `LOG_LEVEL` | Server log level | `info` | + +## Running the tests for Neuron (AWS Inferentia/Trainium) + +### Prerequisites + +1. **AWS Neuron instance**: Tests must run on an EC2 instance with Neuron devices (inf1, inf2, or trn1) +2. **Neuron drivers**: Ensure Neuron drivers are installed and `/dev/neuron*` devices are available +3. **Pre-compiled models**: Neuron requires models to be pre-compiled to `.neuron` format + +### Building the Docker Image + +```bash +docker build . -f Dockerfile-neuron -t tei-neuron +``` + +### Running the Tests + +```bash +cd integration_tests/neuron uv run pytest --durations=0 -sv . ``` + +### Environment Variables (Neuron) + +| Variable | Description | Default | +|----------|-------------|---------| +| `DOCKER_IMAGE` | Docker image to use | `tei-neuron` | +| `DOCKER_VOLUME` | Volume for model cache (recommended) | None | +| `HF_TOKEN` | HuggingFace token for gated models | None | +| `LOG_LEVEL` | Server log level | `info` | +| `NEURON_RT_NUM_CORES` | Number of Neuron cores to use | `1` | +| `NEURON_RT_VISIBLE_CORES` | Which Neuron cores are visible | `0` | + +### Using Pre-compiled Neuron Models + +Neuron models must be pre-compiled before use. You have two options: + +1. **Use models with pre-compiled Neuron artifacts**: Some models on HuggingFace Hub have `.neuron` files available + +2. **Compile models yourself**: Follow the [Optimum Neuron guide](https://huggingface.co/docs/optimum-neuron/en/model_doc/sentence_transformers/overview) to compile your models + +Example compilation: +```python +from optimum.neuron import NeuronModelForSentenceTransformers + +# Compile and save +model = NeuronModelForSentenceTransformers.from_pretrained( + "sentence-transformers/all-MiniLM-L6-v2", + export=True, + batch_size=1, + sequence_length=512, +) +model.save_pretrained("./all-MiniLM-L6-v2-neuron") +model.push_to_hub("your-username/all-MiniLM-L6-v2-neuron") +``` + +### Troubleshooting Neuron Tests + +**Container exits immediately**: +- Check if Neuron devices are available: `ls /dev/neuron*` +- Check container logs for "Neuron model files not found" - model needs compilation +- Ensure the Docker image was built with Neuron support + +**Long startup times**: +- Neuron models may take several minutes to load due to compilation +- The test timeout is set to 600 seconds (10 minutes) by default + +**Permission errors**: +- Ensure Docker has access to Neuron devices +- The tests add `IPC_LOCK` capability and mount `/dev/neuron*` devices + +## Adding New Test Models + +To add a new model to test, update the `TEST_CONFIGS` dictionary in `test_embed.py`: + +```python +TEST_CONFIGS = { + "your-model/name": { + "model_id": "your-model/name", + "input": "Test input text", + "batch_inputs": ["Text 1", "Text 2"], + "args": ["--dtype", "float32"], + "env_config": { + "MAX_WARMUP_SEQUENCE_LENGTH": "512", + }, + }, +} +``` + +For Habana tests, you can also add `expected_output` to validate exact embedding values. diff --git a/integration_tests/neuron/conftest.py b/integration_tests/neuron/conftest.py index e69de29bb..40d16b05a 100644 --- a/integration_tests/neuron/conftest.py +++ b/integration_tests/neuron/conftest.py @@ -0,0 +1,299 @@ +import asyncio +import contextlib +import os +import shlex +import subprocess +import sys +import threading +import time +from tempfile import TemporaryDirectory + +import docker +import pytest +from docker.errors import NotFound +import logging +from test_embed import TEST_CONFIGS +import aiohttp + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s | %(levelname)-8s | %(name)s:%(funcName)s:%(lineno)d - %(message)s", + stream=sys.stdout, +) +logger = logging.getLogger(__file__) + +# Use the latest image from the local docker build +DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tei-neuron") +DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None) + +if DOCKER_VOLUME is None: + logger.warning( + "DOCKER_VOLUME is not set, this will lead to the tests redownloading the models on each run, consider setting it to speed up testing" + ) + +LOG_LEVEL = os.getenv("LOG_LEVEL", "info") + +BASE_ENV = { + "HF_HUB_ENABLE_HF_TRANSFER": "1", + "LOG_LEVEL": LOG_LEVEL, + # Neuron-specific environment variables + "NEURON_RT_NUM_CORES": os.getenv("NEURON_RT_NUM_CORES", "1"), + "NEURON_RT_VISIBLE_CORES": os.getenv("NEURON_RT_VISIBLE_CORES", "0"), +} + +# Neuron requires privileged mode for OCI hook to work +NEURON_RUN_ARGS = { + "privileged": True, +} + + +def stream_container_logs(container, test_name): + """Stream container logs in a separate thread.""" + try: + for log in container.logs(stream=True, follow=True): + print( + f"[TEI Server Logs - {test_name}] {log.decode('utf-8')}", + end="", + file=sys.stderr, + flush=True, + ) + except Exception as e: + logger.error(f"Error streaming container logs: {str(e)}") + + +class LauncherHandle: + def __init__(self, port: int): + self.port = port + self.base_url = f"http://127.0.0.1:{port}" + + async def generate(self, prompt: str): + """Send embed request to the TEI server (alias for embed).""" + return await self.embed(prompt) + + async def embed(self, text: str): + """Send embed request to the TEI server.""" + async with aiohttp.ClientSession() as session: + async with session.post( + f"{self.base_url}/embed", + json={"inputs": text}, + headers={"Content-Type": "application/json"} + ) as response: + if response.status != 200: + error_text = await response.text() + raise RuntimeError(f"Request failed with status {response.status}: {error_text}") + return await response.json() + + async def embed_batch(self, texts: list): + """Send batch embed request to the TEI server.""" + async with aiohttp.ClientSession() as session: + async with session.post( + f"{self.base_url}/embed", + json={"inputs": texts}, + headers={"Content-Type": "application/json"} + ) as response: + if response.status != 200: + error_text = await response.text() + raise RuntimeError(f"Request failed with status {response.status}: {error_text}") + return await response.json() + + async def predict(self, text: str): + """Send predict request to the TEI server (for classification models).""" + async with aiohttp.ClientSession() as session: + async with session.post( + f"{self.base_url}/predict", + json={"inputs": text}, + headers={"Content-Type": "application/json"} + ) as response: + if response.status != 200: + error_text = await response.text() + raise RuntimeError(f"Request failed with status {response.status}: {error_text}") + return await response.json() + + def _inner_health(self): + raise NotImplementedError + + async def health(self, timeout: int = 300): + """Wait for the server to be healthy. + + Neuron models may take longer to compile/load, so default timeout is higher. + """ + assert timeout > 0 + start_time = time.time() + logger.info(f"Starting health check with timeout of {timeout}s") + + for attempt in range(timeout): + if not self._inner_health(): + logger.error("Launcher crashed during health check") + raise RuntimeError("Launcher crashed") + + try: + # Try to make a request using generate (like Habana tests) + await self.generate("test") + elapsed = time.time() - start_time + logger.info(f"Health check passed after {elapsed:.1f}s") + return + except (aiohttp.ClientError, asyncio.TimeoutError) as e: + if attempt == timeout - 1: + logger.error(f"Health check failed after {timeout}s: {str(e)}") + raise RuntimeError(f"Health check failed: {str(e)}") + if attempt % 10 == 0 and attempt != 0: # Only log every 10th attempt + logger.debug(f"Connection attempt {attempt}/{timeout} failed: {str(e)}") + await asyncio.sleep(1) + except Exception as e: + logger.error(f"Unexpected error during health check: {str(e)}") + import traceback + logger.error(f"Full traceback:\n{traceback.format_exc()}") + raise + + +class ContainerLauncherHandle(LauncherHandle): + def __init__(self, docker_client, container_name, port: int): + super().__init__(port) + self.docker_client = docker_client + self.container_name = container_name + + def _inner_health(self) -> bool: + try: + container = self.docker_client.containers.get(self.container_name) + status = container.status + if status not in ["running", "created"]: + logger.warning(f"Container status is {status}") + # Get container logs for debugging + logs = container.logs().decode("utf-8") + logger.debug(f"Container logs:\n{logs}") + return False + return True + except Exception as e: + logger.error(f"Error checking container health: {str(e)}") + return False + + +class ProcessLauncherHandle(LauncherHandle): + def __init__(self, process, port: int): + super(ProcessLauncherHandle, self).__init__(port) + self.process = process + + def _inner_health(self) -> bool: + return self.process.poll() is None + + +@pytest.fixture(scope="module") +def data_volume(): + tmpdir = TemporaryDirectory() + yield tmpdir.name + try: + # Cleanup the temporary directory using sudo as it contains root files created by the container + subprocess.run(shlex.split(f"sudo rm -rf {tmpdir.name}"), check=True) + except subprocess.CalledProcessError as e: + logger.error(f"Error cleaning up temporary directory: {str(e)}") + + +@pytest.fixture(scope="function") +def neuron_launcher(): + @contextlib.contextmanager + def docker_launcher( + model_id: str, + test_name: str, + ): + logger.info( + f"Starting docker launcher for model {model_id} and test {test_name}" + ) + + port = 8080 + + client = docker.from_env() + + container_name = f"tei-neuron-test-{test_name.replace('/', '-').replace('_', '-')}" + + try: + container = client.containers.get(container_name) + logger.info( + f"Stopping existing container {container_name} for test {test_name}" + ) + container.stop() + container.wait() + except NotFound: + pass + except Exception as e: + logger.error(f"Error handling existing container: {str(e)}") + + tei_args = TEST_CONFIGS[test_name]["args"].copy() + + # add model_id to tei args + tei_args.append("--model-id") + tei_args.append(model_id) + + env = BASE_ENV.copy() + env["HF_TOKEN"] = os.getenv("HF_TOKEN") + + # Add env config that is defined in the fixture parameter + if "env_config" in TEST_CONFIGS[test_name]: + env.update(TEST_CONFIGS[test_name]["env_config"].copy()) + + volumes = [f"{DOCKER_VOLUME}:/data"] if DOCKER_VOLUME else [] + logger.debug(f"Using volume {volumes}") + + try: + logger.info(f"Creating container with name {container_name}") + + # Build run arguments - use privileged mode for Neuron OCI hook + run_args = NEURON_RUN_ARGS.copy() + + container = client.containers.run( + DOCKER_IMAGE, + command=tei_args, + name=container_name, + environment=env, + detach=True, + volumes=volumes if volumes else None, + ports={"80/tcp": port}, + **run_args, + ) + + logger.info(f"Container {container_name} started successfully") + + # Start log streaming in a background thread + log_thread = threading.Thread( + target=stream_container_logs, + args=(container, test_name), + daemon=True, # This ensures the thread will be killed when the main program exits + ) + log_thread.start() + + # Add a small delay to allow container to initialize + time.sleep(2) + + # Check container status after creation + status = container.status + logger.debug(f"Initial container status: {status}") + if status not in ["running", "created"]: + logs = container.logs().decode("utf-8") + logger.error(f"Container failed to start properly. Logs:\n{logs}") + + yield ContainerLauncherHandle(client, container.name, port) + + except Exception as e: + logger.error(f"Error starting container: {str(e)}") + # Get full traceback for debugging + import traceback + + logger.error(f"Full traceback:\n{traceback.format_exc()}") + raise + finally: + try: + container = client.containers.get(container_name) + logger.info(f"Stopping container {container_name}") + container.stop() + container.wait() + + container_output = container.logs().decode("utf-8") + print(container_output, file=sys.stderr) + + container.remove() + logger.info(f"Container {container_name} removed successfully") + except NotFound: + pass + except Exception as e: + logger.warning(f"Error cleaning up container: {str(e)}") + + return docker_launcher diff --git a/integration_tests/neuron/test_embed.py b/integration_tests/neuron/test_embed.py index e69de29bb..69b0fee7a 100644 --- a/integration_tests/neuron/test_embed.py +++ b/integration_tests/neuron/test_embed.py @@ -0,0 +1,223 @@ +from typing import Any, Dict, Generator +from _pytest.fixtures import SubRequest + +import pytest +import pytest_asyncio +import numpy as np + + +# Test configurations for Neuron backend +# The "args" values in TEST_CONFIGS are not optimized for speed but only check that the inference is working for the different models architectures. +TEST_CONFIGS = { + # BERT-based embedding model - commonly used and well-supported on Neuron + "sentence-transformers/all-MiniLM-L6-v2": { + "model_id": "sentence-transformers/all-MiniLM-L6-v2", + "input": "What is Deep Learning?", + "batch_inputs": [ + "What is Deep Learning?", + "How does machine learning work?", + "Tell me about neural networks.", + ], + # Expected output for first 50 dimensions (to keep config manageable) + # These values should be generated from a known-good run + "expected_output_prefix": None, # Will validate structure only if None + "args": [ + "--dtype", "float32", + "--max-batch-requests", "1", + ], + "env_config": { + "MAX_WARMUP_SEQUENCE_LENGTH": "512", + }, + }, +} + + +@pytest.fixture(scope="module", params=TEST_CONFIGS.keys()) +def test_config(request: SubRequest) -> Dict[str, Any]: + """Fixture that provides model configurations for testing.""" + model_name = request.param + test_config = TEST_CONFIGS[model_name].copy() + test_config["test_name"] = model_name + return test_config + + +@pytest.fixture(scope="module") +def model_id(test_config: Dict[str, Any]) -> Generator[str, None, None]: + yield test_config["model_id"] + + +@pytest.fixture(scope="module") +def test_name(test_config: Dict[str, Any]) -> Generator[str, None, None]: + yield test_config["test_name"] + + +@pytest.fixture(scope="module") +def input_text(test_config: Dict[str, Any]) -> str: + return test_config["input"] + + +@pytest.fixture(scope="module") +def batch_inputs(test_config: Dict[str, Any]) -> list: + return test_config.get("batch_inputs", [test_config["input"]]) + + +@pytest.fixture(scope="module") +def expected_outputs(test_config: Dict[str, Any]) -> Dict[str, Any]: + return { + "expected_output_prefix": test_config.get("expected_output_prefix"), + } + + +@pytest.fixture(scope="function") +def tei_service(neuron_launcher, model_id: str, test_name: str): + with neuron_launcher(model_id, test_name) as tei_service: + yield tei_service + + +@pytest_asyncio.fixture(scope="function") +async def tei_client(tei_service): + # Neuron models may take longer to load due to compilation + await tei_service.health(600) # 10 minute timeout for Neuron compilation + return tei_service + + +@pytest.mark.asyncio +async def test_model_single_request( + tei_client, expected_outputs: Dict[str, Any], input_text: str +): + """Test single embedding request.""" + response = await tei_client.embed(input_text) + + # Verify response structure + assert isinstance(response, list), f"Expected list, got {type(response)}" + assert len(response) > 0, "Embedding should not be empty" + + response_array = np.array(response) + + # Check that values are numeric + assert response_array.dtype in [np.float32, np.float64, np.float16], \ + f"Expected float array, got {response_array.dtype}" + + # If expected output is provided, validate against it + expected_prefix = expected_outputs.get("expected_output_prefix") + if expected_prefix is not None: + expected_array = np.array(eval(expected_prefix) if isinstance(expected_prefix, str) else expected_prefix) + prefix_len = len(expected_array.flatten()) + response_flat = response_array.flatten()[:prefix_len] + + if not np.allclose(response_flat, expected_array.flatten(), rtol=1e-4, atol=1e-4): + print("\nExpected output (prefix):") + print(f"{expected_array.tolist()}") + print("\nReceived output (prefix):") + print(f"{response_flat.tolist()}") + raise AssertionError("Response array does not match expected array within tolerance") + + # Check embedding dimensions are reasonable (typically 384, 768, 1024, etc.) + embedding_dim = response_array.shape[-1] if response_array.ndim > 1 else len(response_array) + assert embedding_dim > 0, "Embedding dimension should be positive" + + print(f"Single request embedding shape: {response_array.shape}") + print(f"Embedding dimension: {embedding_dim}") + + +@pytest.mark.asyncio +async def test_model_batch_request(tei_client, batch_inputs: list): + """Test batch embedding request.""" + response = await tei_client.embed_batch(batch_inputs) + + # Verify response is a list of embeddings + assert isinstance(response, list), f"Expected list, got {type(response)}" + assert len(response) == len(batch_inputs), \ + f"Expected {len(batch_inputs)} embeddings, got {len(response)}" + + response_array = np.array(response) + print(f"Batch request response shape: {response_array.shape}") + + # Check each embedding + for i, embedding in enumerate(response): + assert isinstance(embedding, list), f"Embedding {i} should be a list" + assert len(embedding) > 0, f"Embedding {i} should not be empty" + + +@pytest.mark.asyncio +async def test_model_embedding_consistency(tei_client, input_text: str): + """Test that the same input produces consistent embeddings.""" + response1 = await tei_client.embed(input_text) + response2 = await tei_client.embed(input_text) + + array1 = np.array(response1) + array2 = np.array(response2) + + # Embeddings for the same input should be identical (or very close) + assert np.allclose(array1, array2, rtol=1e-4, atol=1e-4), \ + "Same input should produce consistent embeddings" + + +@pytest.mark.asyncio +async def test_model_different_inputs_different_embeddings(tei_client): + """Test that different inputs produce different embeddings.""" + input1 = "The weather is sunny today." + input2 = "Machine learning is a subset of artificial intelligence." + + response1 = await tei_client.embed(input1) + response2 = await tei_client.embed(input2) + + array1 = np.array(response1) + array2 = np.array(response2) + + # Different inputs should produce different embeddings + assert not np.allclose(array1, array2, rtol=1e-2, atol=1e-2), \ + "Different inputs should produce different embeddings" + + +@pytest.mark.asyncio +async def test_model_embedding_normalization(tei_client, input_text: str): + """Test embedding properties (optional - some models normalize, some don't).""" + response = await tei_client.embed(input_text) + array = np.array(response) + + # Flatten if needed + if array.ndim > 1: + array = array.flatten() + + # Check L2 norm - many sentence transformers normalize to unit length + l2_norm = np.linalg.norm(array) + print(f"Embedding L2 norm: {l2_norm}") + + # Just verify the norm is reasonable (not zero, not extremely large) + assert l2_norm > 0.1, "Embedding norm should be positive" + assert l2_norm < 1000, "Embedding norm should not be extremely large" + + +@pytest.mark.asyncio +async def test_model_long_input(tei_client): + """Test handling of longer input text.""" + # Create a longer input (but still within typical model limits) + long_input = "This is a test sentence. " * 20 # ~100 tokens + + response = await tei_client.embed(long_input) + + assert isinstance(response, list), f"Expected list, got {type(response)}" + assert len(response) > 0, "Embedding should not be empty" + + +@pytest.mark.asyncio +async def test_model_special_characters(tei_client): + """Test handling of special characters in input.""" + special_input = "Hello! How are you? I'm fine, thanks. #test @user $100" + + response = await tei_client.embed(special_input) + + assert isinstance(response, list), f"Expected list, got {type(response)}" + assert len(response) > 0, "Embedding should not be empty" + + +@pytest.mark.asyncio +async def test_model_unicode_input(tei_client): + """Test handling of unicode characters.""" + unicode_input = "Hello world! Bonjour le monde!" + + response = await tei_client.embed(unicode_input) + + assert isinstance(response, list), f"Expected list, got {type(response)}" + assert len(response) > 0, "Embedding should not be empty" From 976b71c617fa58279957939b0719ff6670f88610 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 4 Feb 2026 14:24:30 +0000 Subject: [PATCH 09/20] add to CI & add pre-compiled test --- ...test.yaml => integration-test-habana.yaml} | 4 +- .../workflows/integration-test-neuron.yaml | 33 +++++ .../models/neuron_models.py | 26 ++-- integration_tests/neuron/test_embed.py | 115 ++++++------------ 4 files changed, 87 insertions(+), 91 deletions(-) rename .github/workflows/{integration-test.yaml => integration-test-habana.yaml} (90%) create mode 100644 .github/workflows/integration-test-neuron.yaml diff --git a/.github/workflows/integration-test.yaml b/.github/workflows/integration-test-habana.yaml similarity index 90% rename from .github/workflows/integration-test.yaml rename to .github/workflows/integration-test-habana.yaml index b6f042179..d17a9cb14 100644 --- a/.github/workflows/integration-test.yaml +++ b/.github/workflows/integration-test-habana.yaml @@ -1,4 +1,4 @@ -name: Run integration tests +name: Run Habana integration tests on: workflow_dispatch: @@ -28,4 +28,4 @@ jobs: working-directory: integration_tests run: | uv sync --locked --all-extras --dev - uv run pytest --durations=0 -sv . + uv run pytest --durations=0 -sv gaudi/ diff --git a/.github/workflows/integration-test-neuron.yaml b/.github/workflows/integration-test-neuron.yaml new file mode 100644 index 000000000..8be3630e2 --- /dev/null +++ b/.github/workflows/integration-test-neuron.yaml @@ -0,0 +1,33 @@ +name: Run Neuron integration tests + +on: + workflow_dispatch: + schedule: + - cron: '0 0 * * *' # Run the workflow nightly to check Neuron integration is working + +jobs: + tests: + concurrency: + group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + runs-on: + group: aws-inf2-8xlarge + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + + - name: Build Docker image for Neuron + run: | + docker build . -f Dockerfile-neuron -t tei-neuron + + - name: Run integration tests + working-directory: integration_tests + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + DOCKER_IMAGE: tei-neuron + run: | + uv sync --locked --all-extras --dev + uv run pytest --durations=0 -sv neuron/ diff --git a/backends/python/server/text_embeddings_server/models/neuron_models.py b/backends/python/server/text_embeddings_server/models/neuron_models.py index 4589f6b77..f430a35e7 100644 --- a/backends/python/server/text_embeddings_server/models/neuron_models.py +++ b/backends/python/server/text_embeddings_server/models/neuron_models.py @@ -4,7 +4,7 @@ from abc import ABC from pathlib import Path -from typing import Type, List, Optional +from typing import Type, List from opentelemetry import trace from loguru import logger @@ -148,18 +148,18 @@ def embed(self, batch: PaddedBatch) -> List[Embedding]: token_embeddings = output.token_embeddings else: raise ValueError(f"Cannot extract embeddings from model output: {type(output)}") - - # Apply pooling based on self.pool setting - if self.pool == "cls": - sentence_embedding = token_embeddings[:, 0, :] - elif self.pool == "mean": - attention_mask = kwargs["attention_mask"].unsqueeze(-1).float() - sentence_embedding = (token_embeddings * attention_mask).sum(dim=1) / attention_mask.sum(dim=1) - elif self.pool == "last_token": - seq_lengths = kwargs["attention_mask"].sum(dim=1) - 1 - sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths] - else: - raise ValueError(f"Invalid pooling mode: {self.pool}") + + # Apply pooling based on self.pool setting + if self.pool == "cls": + sentence_embedding = token_embeddings[:, 0, :] + elif self.pool == "mean": + attention_mask = kwargs["attention_mask"].unsqueeze(-1).float() + sentence_embedding = (token_embeddings * attention_mask).sum(dim=1) / attention_mask.sum(dim=1) + elif self.pool == "last_token": + seq_lengths = kwargs["attention_mask"].sum(dim=1) - 1 + sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths] + else: + raise ValueError(f"Invalid pooling mode: {self.pool}") # Convert to list format expected by the gRPC interface cpu_results = sentence_embedding.view(-1).tolist() diff --git a/integration_tests/neuron/test_embed.py b/integration_tests/neuron/test_embed.py index 69b0fee7a..03da9d494 100644 --- a/integration_tests/neuron/test_embed.py +++ b/integration_tests/neuron/test_embed.py @@ -7,20 +7,52 @@ # Test configurations for Neuron backend -# The "args" values in TEST_CONFIGS are not optimized for speed but only check that the inference is working for the different models architectures. TEST_CONFIGS = { - # BERT-based embedding model - commonly used and well-supported on Neuron - "sentence-transformers/all-MiniLM-L6-v2": { - "model_id": "sentence-transformers/all-MiniLM-L6-v2", + # # On-the-fly Neuron compilation + # "sentence-transformers/all-MiniLM-L6-v2": { + # "model_id": "sentence-transformers/all-MiniLM-L6-v2", + # "input": "What is Deep Learning?", + # "batch_inputs": [ + # "What is Deep Learning?", + # "How does machine learning work?", + # "Tell me about neural networks.", + # ], + # "expected_output_prefix": None, + # "args": [ + # "--dtype", "float32", + # "--max-batch-requests", "1", + # ], + # "env_config": { + # "MAX_WARMUP_SEQUENCE_LENGTH": "512", + # }, + # }, + # "BAAI/bge-base-en-v1.5": { + # "model_id": "BAAI/bge-base-en-v1.5", + # "input": "What is Deep Learning?", + # "batch_inputs": [ + # "What is Deep Learning?", + # "How does machine learning work?", + # "Tell me about neural networks.", + # ], + # "expected_output_prefix": None, + # "args": [ + # "--dtype", "float32", + # "--max-batch-requests", "1", + # ], + # "env_config": { + # "MAX_WARMUP_SEQUENCE_LENGTH": "512", + # }, + # }, + # Pre-compiled Neuron model + "optimum/bge-base-en-v1.5-neuronx": { + "model_id": "optimum/bge-base-en-v1.5-neuronx", "input": "What is Deep Learning?", "batch_inputs": [ "What is Deep Learning?", "How does machine learning work?", "Tell me about neural networks.", ], - # Expected output for first 50 dimensions (to keep config manageable) - # These values should be generated from a known-good run - "expected_output_prefix": None, # Will validate structure only if None + "expected_output_prefix": None, "args": [ "--dtype", "float32", "--max-batch-requests", "1", @@ -152,72 +184,3 @@ async def test_model_embedding_consistency(tei_client, input_text: str): assert np.allclose(array1, array2, rtol=1e-4, atol=1e-4), \ "Same input should produce consistent embeddings" - -@pytest.mark.asyncio -async def test_model_different_inputs_different_embeddings(tei_client): - """Test that different inputs produce different embeddings.""" - input1 = "The weather is sunny today." - input2 = "Machine learning is a subset of artificial intelligence." - - response1 = await tei_client.embed(input1) - response2 = await tei_client.embed(input2) - - array1 = np.array(response1) - array2 = np.array(response2) - - # Different inputs should produce different embeddings - assert not np.allclose(array1, array2, rtol=1e-2, atol=1e-2), \ - "Different inputs should produce different embeddings" - - -@pytest.mark.asyncio -async def test_model_embedding_normalization(tei_client, input_text: str): - """Test embedding properties (optional - some models normalize, some don't).""" - response = await tei_client.embed(input_text) - array = np.array(response) - - # Flatten if needed - if array.ndim > 1: - array = array.flatten() - - # Check L2 norm - many sentence transformers normalize to unit length - l2_norm = np.linalg.norm(array) - print(f"Embedding L2 norm: {l2_norm}") - - # Just verify the norm is reasonable (not zero, not extremely large) - assert l2_norm > 0.1, "Embedding norm should be positive" - assert l2_norm < 1000, "Embedding norm should not be extremely large" - - -@pytest.mark.asyncio -async def test_model_long_input(tei_client): - """Test handling of longer input text.""" - # Create a longer input (but still within typical model limits) - long_input = "This is a test sentence. " * 20 # ~100 tokens - - response = await tei_client.embed(long_input) - - assert isinstance(response, list), f"Expected list, got {type(response)}" - assert len(response) > 0, "Embedding should not be empty" - - -@pytest.mark.asyncio -async def test_model_special_characters(tei_client): - """Test handling of special characters in input.""" - special_input = "Hello! How are you? I'm fine, thanks. #test @user $100" - - response = await tei_client.embed(special_input) - - assert isinstance(response, list), f"Expected list, got {type(response)}" - assert len(response) > 0, "Embedding should not be empty" - - -@pytest.mark.asyncio -async def test_model_unicode_input(tei_client): - """Test handling of unicode characters.""" - unicode_input = "Hello world! Bonjour le monde!" - - response = await tei_client.embed(unicode_input) - - assert isinstance(response, list), f"Expected list, got {type(response)}" - assert len(response) > 0, "Embedding should not be empty" From dc3edc2c51ca28a7713e6b3fafe1e85992e39cea Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Wed, 4 Feb 2026 22:32:05 +0000 Subject: [PATCH 10/20] fix tests --- Dockerfile-neuron | 6 +- .../text_embeddings_server/models/__init__.py | 2 - .../models/neuron_models.py | 224 +++++++----------- docs/source/en/ aws_neuron.md | 37 --- docs/source/en/aws_neuron.md | 105 ++++++++ integration_tests/neuron/test_embed.py | 70 +++--- 6 files changed, 232 insertions(+), 212 deletions(-) delete mode 100644 docs/source/en/ aws_neuron.md create mode 100644 docs/source/en/aws_neuron.md diff --git a/Dockerfile-neuron b/Dockerfile-neuron index dbf1e9a29..741084c8b 100644 --- a/Dockerfile-neuron +++ b/Dockerfile-neuron @@ -150,11 +150,11 @@ RUN pip install --index-url https://pip.repos.neuron.amazonaws.com \ && rm -rf ~/.cache/pip/* # HF ARGS -# Note: optimum-neuron 0.4.1 requires transformers~=4.55.4 -ARG TRANSFORMERS_VERSION=4.55.4 +# Note: optimum-neuron 0.4.4 requires transformers~=4.57.1 +ARG TRANSFORMERS_VERSION=4.57.1 ARG DIFFUSERS_VERSION=0.35.2 ARG HUGGINGFACE_HUB_VERSION=0.36.0 -ARG OPTIMUM_NEURON_VERSION=0.4.1 +ARG OPTIMUM_NEURON_VERSION=0.4.4 ARG SENTENCE_TRANSFORMERS=5.1.2 ARG PEFT_VERSION=0.17.0 ARG DATASETS_VERSION=4.1.1 diff --git a/backends/python/server/text_embeddings_server/models/__init__.py b/backends/python/server/text_embeddings_server/models/__init__.py index 1de5f9b1b..8a48510d6 100644 --- a/backends/python/server/text_embeddings_server/models/__init__.py +++ b/backends/python/server/text_embeddings_server/models/__init__.py @@ -45,7 +45,6 @@ # Neuron models - only import when on Neuron device to avoid unnecessary dependencies NeuronSentenceTransformersModel = None -NeuronEmbeddingModel = None NeuronClassificationModel = None NeuronMaskedLMModel = None create_neuron_model = None @@ -54,7 +53,6 @@ try: from text_embeddings_server.models.neuron_models import ( NeuronSentenceTransformersModel, - NeuronEmbeddingModel, NeuronClassificationModel, NeuronMaskedLMModel, create_neuron_model, diff --git a/backends/python/server/text_embeddings_server/models/neuron_models.py b/backends/python/server/text_embeddings_server/models/neuron_models.py index f430a35e7..f95c2b3c5 100644 --- a/backends/python/server/text_embeddings_server/models/neuron_models.py +++ b/backends/python/server/text_embeddings_server/models/neuron_models.py @@ -13,13 +13,13 @@ tracer = trace.get_tracer(__name__) -# Neuron compilation parameters from environment variables +# Neuron static shapes compilation parameters NEURON_BATCH_SIZE = int(os.getenv("NEURON_BATCH_SIZE", "1")) NEURON_SEQUENCE_LENGTH = int(os.getenv("NEURON_SEQUENCE_LENGTH", "512")) class NeuronBaseModel(Model, ABC): - """Base class for all Neuron models with common functionality.""" + """Base class for all Neuron models.""" def __init__( self, @@ -83,12 +83,12 @@ def _prepare_inputs(self, batch: PaddedBatch) -> dict: return kwargs -class NeuronSentenceTransformersModel(NeuronBaseModel): +class NeuronSentenceTransformersModel(Model): """ - Neuron-optimized model for sentence-transformers. + Neuron model for sentence-transformers. - Uses optimum.neuron.NeuronModelForSentenceTransformers which is designed - for sentence embedding models that output sentence_embedding directly. + Uses optimum.neuron.NeuronSentenceTransformers which is designed + for sentence embedding models. """ def __init__( @@ -99,29 +99,43 @@ def __init__( pool: str = "cls", trust_remote: bool = False, ): - try: - from optimum.neuron import NeuronModelForSentenceTransformers - is_compiled = self._is_neuron_compiled(model_path) - export_kwargs = {} - if not is_compiled: - export_kwargs = { - "export": True, - "batch_size": NEURON_BATCH_SIZE, - "sequence_length": NEURON_SEQUENCE_LENGTH, - } - logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}") - model = NeuronModelForSentenceTransformers.from_pretrained( + from optimum.neuron import NeuronSentenceTransformers + from transformers import AutoConfig + + # Load config separately for reliable access + config = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote) + self.hidden_size = config.hidden_size + + # Calculate max input length + position_offset = 0 + model_type = config.model_type + if model_type in ["xlm-roberta", "camembert", "roberta"]: + position_offset = getattr(config, "pad_token_id", 1) + 1 + + if hasattr(config, "max_seq_length"): + self.max_input_length = config.max_seq_length + elif hasattr(config, "n_positions"): + self.max_input_length = config.n_positions + else: + self.max_input_length = ( + config.max_position_embeddings - position_offset + ) + + is_compiled = self._is_neuron_compiled(model_path) + if not is_compiled: + logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}") + model = NeuronSentenceTransformers.from_pretrained( model_path, - **export_kwargs, + export=True, + batch_size=NEURON_BATCH_SIZE, + sequence_length=NEURON_SEQUENCE_LENGTH, ) - except ImportError: - # Fallback to legacy import - from optimum.neuron import NeuronSentenceTransformers + else: model = NeuronSentenceTransformers.from_pretrained(model_path) - super().__init__(model, model_path, device, dtype) self.pool = pool - logger.info(f"Loaded NeuronSentenceTransformersModel with pool={pool}") + super().__init__(model=model, dtype=dtype, device=device) + logger.info(f"Loaded NeuronSentenceTransformersModel with pool={pool}, hidden_size={self.hidden_size}") @staticmethod def _is_neuron_compiled(model_path: Path) -> bool: @@ -129,37 +143,67 @@ def _is_neuron_compiled(model_path: Path) -> bool: neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else [] return len(neuron_files) > 0 + @property + def batch_type(self) -> Type[PaddedBatch]: + return PaddedBatch + @tracer.start_as_current_span("embed") def embed(self, batch: PaddedBatch) -> List[Embedding]: - kwargs = self._prepare_inputs(batch) - output = self.model(**kwargs) + # Prepare inputs + input_ids = batch.input_ids.to(torch.long) + attention_mask = batch.attention_mask.to(torch.long) + + # NeuronSentenceTransformers forward pass expects positional arguments + output = self.model(input_ids, attention_mask) + # Get sentence embeddings from output sentence_embedding = None - # NeuronModelForSentenceTransformers returns sentence_embedding directly - if hasattr(output, "sentence_embedding") and output.sentence_embedding is not None: - candidate = output.sentence_embedding - if candidate.abs().sum() > 0: - sentence_embedding = candidate - - # If sentence_embedding is invalid, fall back to manual pooling of token_embeddings - if sentence_embedding is None: - # Get token embeddings - if hasattr(output, "token_embeddings") and output.token_embeddings is not None: - token_embeddings = output.token_embeddings + if isinstance(output, dict): + # Check if sentence_embedding exists and has non-zero values + # NeuronSentenceTransformers may return zeros for sentence_embedding when pooling fails + has_valid_sentence_embedding = ( + "sentence_embedding" in output + and output["sentence_embedding"] is not None + and output["sentence_embedding"].abs().sum() > 0 + ) + if has_valid_sentence_embedding: + sentence_embedding = output["sentence_embedding"] + elif "token_embeddings" in output and output["token_embeddings"] is not None: + # Apply manual pooling when sentence_embedding is not valid + logger.debug(f"Using token_embeddings with manual {self.pool} pooling") + token_embeddings = output["token_embeddings"] + + if self.pool == "cls": + sentence_embedding = token_embeddings[:, 0, :] + elif self.pool == "mean": + mask = attention_mask.unsqueeze(-1).float() + sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1) + elif self.pool == "last_token": + seq_lengths = attention_mask.sum(dim=1) - 1 + sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths] + else: + raise ValueError(f"Invalid pooling mode: {self.pool}") else: - raise ValueError(f"Cannot extract embeddings from model output: {type(output)}") - - # Apply pooling based on self.pool setting + raise ValueError(f"Cannot extract embeddings from model output dict: {output.keys()}") + elif hasattr(output, "sentence_embedding") and output.sentence_embedding is not None: + sentence_embedding = output.sentence_embedding + elif hasattr(output, "token_embeddings") and output.token_embeddings is not None: + token_embeddings = output.token_embeddings if self.pool == "cls": sentence_embedding = token_embeddings[:, 0, :] elif self.pool == "mean": - attention_mask = kwargs["attention_mask"].unsqueeze(-1).float() - sentence_embedding = (token_embeddings * attention_mask).sum(dim=1) / attention_mask.sum(dim=1) + mask = attention_mask.unsqueeze(-1).float() + sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1) elif self.pool == "last_token": - seq_lengths = kwargs["attention_mask"].sum(dim=1) - 1 + seq_lengths = attention_mask.sum(dim=1) - 1 sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths] else: raise ValueError(f"Invalid pooling mode: {self.pool}") + elif torch.is_tensor(output): + # Assume output is the sentence embedding tensor directly + sentence_embedding = output + else: + raise ValueError(f"Cannot extract embeddings from model output: type={type(output)}") # Convert to list format expected by the gRPC interface cpu_results = sentence_embedding.view(-1).tolist() @@ -176,77 +220,6 @@ def predict(self, batch: PaddedBatch) -> List[Score]: raise NotImplementedError("Prediction not supported for sentence transformer models") -class NeuronEmbeddingModel(NeuronBaseModel): - """ - Neuron-optimized model for feature extraction / embeddings. - - Uses optimum.neuron.NeuronModelForFeatureExtraction for models that - output hidden states which need to be pooled. - """ - - def __init__( - self, - model_path: Path, - device: torch.device, - dtype: torch.dtype, - pool: str = "cls", - trust_remote: bool = False, - ): - from optimum.neuron import NeuronModelForFeatureExtraction - - is_compiled = self._is_neuron_compiled(model_path) - export_kwargs = {} - if not is_compiled: - export_kwargs = { - "export": True, - "batch_size": NEURON_BATCH_SIZE, - "sequence_length": NEURON_SEQUENCE_LENGTH, - } - logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}") - model = NeuronModelForFeatureExtraction.from_pretrained( - model_path, - **export_kwargs, - ) - - logger.info(f"DEBUG: model type = {type(model)}") - - super().__init__(model, model_path, device, dtype) - self.pool = pool - - # Initialize pooling layer - from text_embeddings_server.models.pooling import DefaultPooling - self.pooling = DefaultPooling(self.hidden_size, pooling_mode=pool) - - logger.info(f"Loaded NeuronEmbeddingModel with pool={pool}") - - @staticmethod - def _is_neuron_compiled(model_path: Path) -> bool: - """Check if the model is already compiled for Neuron.""" - neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else [] - return len(neuron_files) > 0 - - @tracer.start_as_current_span("embed") - def embed(self, batch: PaddedBatch) -> List[Embedding]: - kwargs = self._prepare_inputs(batch) - output = self.model(**kwargs) - - # Apply pooling to get sentence embeddings - embedding = self.pooling.forward(output, batch.attention_mask) - - cpu_results = embedding.view(-1).tolist() - - return [ - Embedding( - values=cpu_results[i * self.hidden_size : (i + 1) * self.hidden_size] - ) - for i in range(len(batch)) - ] - - @tracer.start_as_current_span("predict") - def predict(self, batch: PaddedBatch) -> List[Score]: - raise NotImplementedError("Prediction not supported for embedding models") - - class NeuronClassificationModel(NeuronBaseModel): """ Neuron-optimized model for sequence classification. @@ -420,24 +393,5 @@ def create_neuron_model( if pool == "splade" or architecture.endswith("ForMaskedLM"): return NeuronMaskedLMModel(model_path, device, dtype, pool, trust_remote) - # Check for sentence-transformers models - # These typically have specific config attributes or are in specific repositories - is_sentence_transformer = ( - hasattr(config, "sentence_transformers_config") or - hasattr(config, "_name_or_path") and "sentence-transformers" in str(config._name_or_path).lower() or - hasattr(config, "pooling_mode") or - (model_path / "sentence_bert_config.json").exists() if model_path.is_dir() else False - ) - - if is_sentence_transformer: - try: - return NeuronSentenceTransformersModel(model_path, device, dtype, pool, trust_remote) - except Exception as e: - logger.warning(f"Failed to load as SentenceTransformer, falling back to FeatureExtraction: {e}") - - # Default to feature extraction model - try: - return NeuronEmbeddingModel(model_path, device, dtype, pool, trust_remote) - except Exception as e: - logger.warning(f"Failed to load NeuronEmbeddingModel, trying NeuronSentenceTransformersModel: {e}") - return NeuronSentenceTransformersModel(model_path, device, dtype, pool, trust_remote) + # Default to NeuronSentenceTransformers for all embedding models + return NeuronSentenceTransformersModel(model_path, device, dtype, pool, trust_remote) diff --git a/docs/source/en/ aws_neuron.md b/docs/source/en/ aws_neuron.md deleted file mode 100644 index d383fdba8..000000000 --- a/docs/source/en/ aws_neuron.md +++ /dev/null @@ -1,37 +0,0 @@ - -# Using TEI Container with AWS Trainium and Inferentia Instances - -## Build Docker Image - -To build a container optimized for AWS Neuron devices, run the following command: - -```shell -platform="neuron" - -docker build . -f Dockerfile-neuron -t tei-neuron:main -``` - -### Deploy Docker Container - -To deploy your model on an AWS Trainium or Inferentia instance, use the following command: - -```shell -model='optimum/bge-base-en-v1.5-neuronx' -volume=$PWD/data - -docker run -p 8080:80 -v $volume:/data tei-neuron:main --model-id $model --dtype float32 -``` \ No newline at end of file diff --git a/docs/source/en/aws_neuron.md b/docs/source/en/aws_neuron.md new file mode 100644 index 000000000..d4d056141 --- /dev/null +++ b/docs/source/en/aws_neuron.md @@ -0,0 +1,105 @@ + +# Using TEI with AWS Trainium and Inferentia + +Text Embeddings Inference (TEI) supports AWS Trainium and Inferentia accelerators through the [optimum-neuron](https://huggingface.co/docs/optimum-neuron) library. This integration uses `NeuronSentenceTransformers` to run embedding models efficiently on AWS Neuron devices. + +## Supported Model Types + +- **Embedding models**: Uses `NeuronSentenceTransformers` for sentence embeddings (e.g., BGE, sentence-transformers models) +- **Classification models**: Uses `NeuronModelForSequenceClassification` for sequence classification tasks +- **SPLADE models**: Uses `NeuronModelForMaskedLM` for sparse embeddings + +## Build Docker Image + +To build a container optimized for AWS Neuron devices: + +```shell +docker build . -f Dockerfile-neuron -t tei-neuron:main +``` + +## Deploy with Pre-compiled Models + +Pre-compiled models are recommended for production use as they skip the compilation step and start faster. + +```shell +model='optimum/bge-base-en-v1.5-neuronx' +volume=$PWD/data + +docker run --privileged \ + -p 8080:80 \ + -v $volume:/data \ + tei-neuron:main \ + --model-id $model \ + --dtype float32 +``` + +> **Note**: The `--privileged` flag is required for the Neuron OCI hook to work properly. + +## Deploy with On-the-fly Compilation + +You can also use non-pre-compiled models. TEI will compile the model for Neuron automatically on first load. This takes additional time but allows you to use any compatible model. + +```shell +model='BAAI/bge-base-en-v1.5' +volume=$PWD/data + +docker run --privileged \ + -p 8080:80 \ + -v $volume:/data \ + -e NEURON_BATCH_SIZE=1 \ + -e NEURON_SEQUENCE_LENGTH=512 \ + tei-neuron:main \ + --model-id $model \ + --dtype float32 +``` + +### Compilation Environment Variables + +When using on-the-fly compilation, you can configure the following environment variables: + +| Variable | Default | Description | +|----------|---------|-------------| +| `NEURON_BATCH_SIZE` | 1 | Batch size for Neuron compilation (static shape) | +| `NEURON_SEQUENCE_LENGTH` | 512 | Maximum sequence length for Neuron compilation (static shape) | + +> **Note**: Neuron requires static shapes for compilation. The batch size and sequence length are fixed at compilation time. + +## Runtime Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `NEURON_RT_NUM_CORES` | 1 | Number of Neuron cores to use | +| `NEURON_RT_VISIBLE_CORES` | 0 | Which Neuron cores are visible to the runtime | + +## Pre-compiled Models + +For faster startup, use pre-compiled Neuron models from the Hugging Face Hub: + +- [optimum/bge-base-en-v1.5-neuronx](https://huggingface.co/optimum/bge-base-en-v1.5-neuronx) + +You can also compile your own models using the [Optimum Neuron guide](https://huggingface.co/docs/optimum-neuron/en/model_doc/sentence_transformers/overview). + +## Testing Your Deployment + +Once the container is running, you can test the embedding endpoint: + +```shell +curl 127.0.0.1:8080/embed \ + -X POST \ + -H 'Content-Type: application/json' \ + -d '{"inputs": "What is Deep Learning?"}' +``` diff --git a/integration_tests/neuron/test_embed.py b/integration_tests/neuron/test_embed.py index 03da9d494..171c19fee 100644 --- a/integration_tests/neuron/test_embed.py +++ b/integration_tests/neuron/test_embed.py @@ -8,41 +8,41 @@ # Test configurations for Neuron backend TEST_CONFIGS = { - # # On-the-fly Neuron compilation - # "sentence-transformers/all-MiniLM-L6-v2": { - # "model_id": "sentence-transformers/all-MiniLM-L6-v2", - # "input": "What is Deep Learning?", - # "batch_inputs": [ - # "What is Deep Learning?", - # "How does machine learning work?", - # "Tell me about neural networks.", - # ], - # "expected_output_prefix": None, - # "args": [ - # "--dtype", "float32", - # "--max-batch-requests", "1", - # ], - # "env_config": { - # "MAX_WARMUP_SEQUENCE_LENGTH": "512", - # }, - # }, - # "BAAI/bge-base-en-v1.5": { - # "model_id": "BAAI/bge-base-en-v1.5", - # "input": "What is Deep Learning?", - # "batch_inputs": [ - # "What is Deep Learning?", - # "How does machine learning work?", - # "Tell me about neural networks.", - # ], - # "expected_output_prefix": None, - # "args": [ - # "--dtype", "float32", - # "--max-batch-requests", "1", - # ], - # "env_config": { - # "MAX_WARMUP_SEQUENCE_LENGTH": "512", - # }, - # }, + # On-the-fly Neuron compilation + "sentence-transformers/all-MiniLM-L6-v2": { + "model_id": "sentence-transformers/all-MiniLM-L6-v2", + "input": "What is Deep Learning?", + "batch_inputs": [ + "What is Deep Learning?", + "How does machine learning work?", + "Tell me about neural networks.", + ], + "expected_output_prefix": None, + "args": [ + "--dtype", "float32", + "--max-batch-requests", "1", + ], + "env_config": { + "MAX_WARMUP_SEQUENCE_LENGTH": "512", + }, + }, + "BAAI/bge-base-en-v1.5": { + "model_id": "BAAI/bge-base-en-v1.5", + "input": "What is Deep Learning?", + "batch_inputs": [ + "What is Deep Learning?", + "How does machine learning work?", + "Tell me about neural networks.", + ], + "expected_output_prefix": None, + "args": [ + "--dtype", "float32", + "--max-batch-requests", "1", + ], + "env_config": { + "MAX_WARMUP_SEQUENCE_LENGTH": "512", + }, + }, # Pre-compiled Neuron model "optimum/bge-base-en-v1.5-neuronx": { "model_id": "optimum/bge-base-en-v1.5-neuronx", From b80356699f40b86c59b3f16e561fdb9bd903301f Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 5 Feb 2026 10:51:28 +0000 Subject: [PATCH 11/20] snol fix --- backends/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/src/lib.rs b/backends/src/lib.rs index 3d7e083af..e0ad5b4f8 100644 --- a/backends/src/lib.rs +++ b/backends/src/lib.rs @@ -442,7 +442,7 @@ async fn init_backend( https://huggingface.co/docs/optimum-neuron/en/model_doc/sentence_transformers/overview " ); // Fall back to downloading regular model files for on-the-fly compilation - if download_safetensors(api_repo).await.is_err() { + if download_safetensors(api_repo.clone()).await.is_err() { tracing::warn!( "safetensors weights not found. Using `pytorch_model.bin` instead." ); @@ -456,7 +456,7 @@ async fn init_backend( tracing::info!("Neuron model downloaded in {:?}", start.elapsed()); } else { - if download_safetensors(api_repo).await.is_err() { + if download_safetensors(api_repo.clone()).await.is_err() { tracing::warn!( "safetensors weights not found. Using `pytorch_model.bin` instead. \ Model loading will be significantly slower." From 81c57d35f3507a9f5e243f8ab69f073ce3a42fc4 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 5 Feb 2026 10:56:41 +0000 Subject: [PATCH 12/20] fix doc index --- docs/source/en/_toctree.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index b9eebac2c..69ace4e17 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -19,8 +19,8 @@ title: Build custom container for TEI - local: intel_container title: Using TEI container with Intel Hardware - - local: local_neuron - title: Using TEI container with AWS Neuron + - local: aws_neuron + title: Using TEI with AWS Trainium and Inferentia - local: examples title: Example uses title: Tutorials From 7f517b996431d6e8c501d6e1d1b9ebfb4ee4ae2d Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 5 Feb 2026 11:14:23 +0000 Subject: [PATCH 13/20] fix style --- backends/src/lib.rs | 4 +--- integration_tests/neuron/test_embed.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/backends/src/lib.rs b/backends/src/lib.rs index e0ad5b4f8..3471c344c 100644 --- a/backends/src/lib.rs +++ b/backends/src/lib.rs @@ -68,9 +68,7 @@ fn is_hpu() -> bool { } fn is_neuron() -> bool { - match Command::new("neuron-ls") - .output() - { + match Command::new("neuron-ls").output() { Ok(output) => output.status.success(), Err(_) => false, } diff --git a/integration_tests/neuron/test_embed.py b/integration_tests/neuron/test_embed.py index 171c19fee..4ca4aadb9 100644 --- a/integration_tests/neuron/test_embed.py +++ b/integration_tests/neuron/test_embed.py @@ -183,4 +183,3 @@ async def test_model_embedding_consistency(tei_client, input_text: str): # Embeddings for the same input should be identical (or very close) assert np.allclose(array1, array2, rtol=1e-4, atol=1e-4), \ "Same input should produce consistent embeddings" - From 975299802c7ee3b657f7c108b194825cab5407bc Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 5 Feb 2026 11:18:28 +0000 Subject: [PATCH 14/20] build and push neuron docker images in CI --- .github/workflows/build.yaml | 1 + .github/workflows/matrix.json | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 52352c7f7..3b9032614 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -18,6 +18,7 @@ on: - "Cargo.lock" - "rust-toolchain.toml" - "Dockerfile" + - "Dockerfile-neuron" branches: - 'main' diff --git a/.github/workflows/matrix.json b/.github/workflows/matrix.json index a7f6660b7..92430a4a2 100644 --- a/.github/workflows/matrix.json +++ b/.github/workflows/matrix.json @@ -87,5 +87,13 @@ "extraBuildArgs": "PLATFORM=hpu", "grpc": true, "dockerfile": "Dockerfile-intel" + }, + { + "name": "neuron", + "imageNamePrefix": "neuron-", + "runOn": "always", + "sccache": true, + "grpc": true, + "dockerfile": "Dockerfile-neuron" } ] From c517aa227582ba571e21ada9a2d5fcaf66a9f1a5 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Thu, 5 Feb 2026 13:40:25 +0000 Subject: [PATCH 15/20] smol changes --- .../models/neuron_models.py | 2 - docs/source/en/aws_neuron.md | 4 +- integration_tests/README.md | 79 +------------------ 3 files changed, 3 insertions(+), 82 deletions(-) diff --git a/backends/python/server/text_embeddings_server/models/neuron_models.py b/backends/python/server/text_embeddings_server/models/neuron_models.py index f95c2b3c5..80745edc8 100644 --- a/backends/python/server/text_embeddings_server/models/neuron_models.py +++ b/backends/python/server/text_embeddings_server/models/neuron_models.py @@ -106,7 +106,6 @@ def __init__( config = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote) self.hidden_size = config.hidden_size - # Calculate max input length position_offset = 0 model_type = config.model_type if model_type in ["xlm-roberta", "camembert", "roberta"]: @@ -153,7 +152,6 @@ def embed(self, batch: PaddedBatch) -> List[Embedding]: input_ids = batch.input_ids.to(torch.long) attention_mask = batch.attention_mask.to(torch.long) - # NeuronSentenceTransformers forward pass expects positional arguments output = self.model(input_ids, attention_mask) # Get sentence embeddings from output diff --git a/docs/source/en/aws_neuron.md b/docs/source/en/aws_neuron.md index d4d056141..2d02999a6 100644 --- a/docs/source/en/aws_neuron.md +++ b/docs/source/en/aws_neuron.md @@ -15,7 +15,7 @@ rendered properly in your Markdown viewer. --> # Using TEI with AWS Trainium and Inferentia -Text Embeddings Inference (TEI) supports AWS Trainium and Inferentia accelerators through the [optimum-neuron](https://huggingface.co/docs/optimum-neuron) library. This integration uses `NeuronSentenceTransformers` to run embedding models efficiently on AWS Neuron devices. +Text Embeddings Inference (TEI) supports AWS Trainium and Inferentia accelerators through the [optimum-neuron](https://huggingface.co/docs/optimum-neuron) library. ## Supported Model Types @@ -87,7 +87,7 @@ When using on-the-fly compilation, you can configure the following environment v ## Pre-compiled Models -For faster startup, use pre-compiled Neuron models from the Hugging Face Hub: +For faster startup, use pre-compiled Neuron models from the Hugging Face Hub like: - [optimum/bge-base-en-v1.5-neuronx](https://huggingface.co/optimum/bge-base-en-v1.5-neuronx) diff --git a/integration_tests/README.md b/integration_tests/README.md index 18b9232ad..69679a95e 100644 --- a/integration_tests/README.md +++ b/integration_tests/README.md @@ -27,20 +27,11 @@ cd integration_tests/gaudi uv run pytest --durations=0 -sv . ``` -### Environment Variables (HPU) - -| Variable | Description | Default | -|----------|-------------|---------| -| `DOCKER_IMAGE` | Docker image to use | `tei_hpu` | -| `DOCKER_VOLUME` | Volume for model cache (recommended) | None | -| `HF_TOKEN` | HuggingFace token for gated models | None | -| `LOG_LEVEL` | Server log level | `info` | - ## Running the tests for Neuron (AWS Inferentia/Trainium) ### Prerequisites -1. **AWS Neuron instance**: Tests must run on an EC2 instance with Neuron devices (inf1, inf2, or trn1) +1. **AWS Neuron instance**: Tests must run on an EC2 instance with Neuron devices (inf2, trn1 or trn2) 2. **Neuron drivers**: Ensure Neuron drivers are installed and `/dev/neuron*` devices are available 3. **Pre-compiled models**: Neuron requires models to be pre-compiled to `.neuron` format @@ -57,71 +48,3 @@ cd integration_tests/neuron uv run pytest --durations=0 -sv . ``` -### Environment Variables (Neuron) - -| Variable | Description | Default | -|----------|-------------|---------| -| `DOCKER_IMAGE` | Docker image to use | `tei-neuron` | -| `DOCKER_VOLUME` | Volume for model cache (recommended) | None | -| `HF_TOKEN` | HuggingFace token for gated models | None | -| `LOG_LEVEL` | Server log level | `info` | -| `NEURON_RT_NUM_CORES` | Number of Neuron cores to use | `1` | -| `NEURON_RT_VISIBLE_CORES` | Which Neuron cores are visible | `0` | - -### Using Pre-compiled Neuron Models - -Neuron models must be pre-compiled before use. You have two options: - -1. **Use models with pre-compiled Neuron artifacts**: Some models on HuggingFace Hub have `.neuron` files available - -2. **Compile models yourself**: Follow the [Optimum Neuron guide](https://huggingface.co/docs/optimum-neuron/en/model_doc/sentence_transformers/overview) to compile your models - -Example compilation: -```python -from optimum.neuron import NeuronModelForSentenceTransformers - -# Compile and save -model = NeuronModelForSentenceTransformers.from_pretrained( - "sentence-transformers/all-MiniLM-L6-v2", - export=True, - batch_size=1, - sequence_length=512, -) -model.save_pretrained("./all-MiniLM-L6-v2-neuron") -model.push_to_hub("your-username/all-MiniLM-L6-v2-neuron") -``` - -### Troubleshooting Neuron Tests - -**Container exits immediately**: -- Check if Neuron devices are available: `ls /dev/neuron*` -- Check container logs for "Neuron model files not found" - model needs compilation -- Ensure the Docker image was built with Neuron support - -**Long startup times**: -- Neuron models may take several minutes to load due to compilation -- The test timeout is set to 600 seconds (10 minutes) by default - -**Permission errors**: -- Ensure Docker has access to Neuron devices -- The tests add `IPC_LOCK` capability and mount `/dev/neuron*` devices - -## Adding New Test Models - -To add a new model to test, update the `TEST_CONFIGS` dictionary in `test_embed.py`: - -```python -TEST_CONFIGS = { - "your-model/name": { - "model_id": "your-model/name", - "input": "Test input text", - "batch_inputs": ["Text 1", "Text 2"], - "args": ["--dtype", "float32"], - "env_config": { - "MAX_WARMUP_SEQUENCE_LENGTH": "512", - }, - }, -} -``` - -For Habana tests, you can also add `expected_output` to validate exact embedding values. From 533d8538fceb3cfd92c84b1344049a266fc51c15 Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Fri, 20 Feb 2026 16:02:20 +0100 Subject: [PATCH 16/20] Update Dockerfile-neuron Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> --- Dockerfile-neuron | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile-neuron b/Dockerfile-neuron index 741084c8b..044bbf596 100644 --- a/Dockerfile-neuron +++ b/Dockerfile-neuron @@ -1,4 +1,4 @@ -FROM lukemathwalker/cargo-chef:latest-rust-1.85-bookworm AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.92-bookworm AS chef WORKDIR /usr/src ENV SCCACHE=0.10.0 From 0829b6f239e429199873e618fca36797ee8aa2cf Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Mon, 23 Feb 2026 16:31:54 +0100 Subject: [PATCH 17/20] Apply suggestions from code review Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> --- Dockerfile-neuron | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile-neuron b/Dockerfile-neuron index 044bbf596..be2427140 100644 --- a/Dockerfile-neuron +++ b/Dockerfile-neuron @@ -174,7 +174,6 @@ RUN pip install --no-cache-dir -U \ peft==${PEFT_VERSION} \ && rm -rf ~/.cache/pip/* - FROM neuron AS grpc COPY --from=grpc-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router @@ -182,7 +181,7 @@ COPY --from=grpc-builder /usr/src/target/release/text-embeddings-router /usr/loc ENTRYPOINT ["text-embeddings-router"] CMD ["--json-output"] -FROM neuron +FROM neuron AS http COPY --from=http-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router From 1464cc3a8cd84fa91cf9efa35d426e4e5c05e15e Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Mon, 23 Feb 2026 16:05:15 +0000 Subject: [PATCH 18/20] review:suggestions --- Dockerfile-neuron | 6 +-- backends/Cargo.toml | 1 + .../text_embeddings_server/models/__init__.py | 24 +-------- .../models/habana/__init__.py | 14 +++++ .../{neuron_models.py => neuron/__init__.py} | 0 backends/src/dtype.rs | 25 ++++++--- backends/src/lib.rs | 51 +++++++++---------- integration_tests/README.md | 1 - router/Cargo.toml | 1 + 9 files changed, 63 insertions(+), 60 deletions(-) create mode 100644 backends/python/server/text_embeddings_server/models/habana/__init__.py rename backends/python/server/text_embeddings_server/models/{neuron_models.py => neuron/__init__.py} (100%) diff --git a/Dockerfile-neuron b/Dockerfile-neuron index 044bbf596..6900b72f8 100644 --- a/Dockerfile-neuron +++ b/Dockerfile-neuron @@ -30,7 +30,7 @@ COPY --from=planner /usr/src/recipe.json recipe.json RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ - cargo chef cook --release --features python --no-default-features --recipe-path recipe.json && sccache -s + cargo chef cook --release --features python-neuron --no-default-features --recipe-path recipe.json && sccache -s COPY backends backends COPY core core @@ -48,7 +48,7 @@ FROM builder AS http-builder RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ - cargo build --release --bin text-embeddings-router -F python -F http --no-default-features && sccache -s + cargo build --release --bin text-embeddings-router -F python-neuron -F http --no-default-features && sccache -s FROM builder AS grpc-builder @@ -56,7 +56,7 @@ COPY proto proto RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ - cargo build --release --bin text-embeddings-router -F grpc -F python --no-default-features && sccache -s + cargo build --release --bin text-embeddings-router -F grpc -F python-neuron --no-default-features && sccache -s FROM public.ecr.aws/docker/library/ubuntu:22.04 AS neuron diff --git a/backends/Cargo.toml b/backends/Cargo.toml index bb9d74191..fd0ab74ae 100644 --- a/backends/Cargo.toml +++ b/backends/Cargo.toml @@ -21,6 +21,7 @@ rand = { workspace = true } [features] clap = ["dep:clap", "text-embeddings-backend-core/clap"] python = ["dep:text-embeddings-backend-python"] +python-neuron = ["dep:text-embeddings-backend-python"] ort = ["dep:text-embeddings-backend-ort"] candle = ["dep:text-embeddings-backend-candle"] cuda = ["text-embeddings-backend-candle?/cuda"] diff --git a/backends/python/server/text_embeddings_server/models/__init__.py b/backends/python/server/text_embeddings_server/models/__init__.py index 8a48510d6..8845163eb 100644 --- a/backends/python/server/text_embeddings_server/models/__init__.py +++ b/backends/python/server/text_embeddings_server/models/__init__.py @@ -11,16 +11,13 @@ from text_embeddings_server.models.masked_model import MaskedLanguageModel from text_embeddings_server.models.default_model import DefaultModel from text_embeddings_server.models.classification_model import ClassificationModel +from text_embeddings_server.models.habana import wrap_model_if_hpu from text_embeddings_server.utils.device import get_device, use_ipex, is_neuron __all__ = ["Model"] TRUST_REMOTE_CODE = os.getenv("TRUST_REMOTE_CODE", "false").lower() in ["true", "1"] -DISABLE_TENSOR_CACHE = os.getenv("DISABLE_TENSOR_CACHE", "false").lower() in [ - "true", - "1", -] # Flash Attention models - only available when flash_attn is installed FLASH_ATTENTION = True @@ -44,34 +41,17 @@ __all__.append(FlashBert) # Neuron models - only import when on Neuron device to avoid unnecessary dependencies -NeuronSentenceTransformersModel = None -NeuronClassificationModel = None -NeuronMaskedLMModel = None create_neuron_model = None if is_neuron(): try: - from text_embeddings_server.models.neuron_models import ( - NeuronSentenceTransformersModel, - NeuronClassificationModel, - NeuronMaskedLMModel, + from text_embeddings_server.models.neuron import ( create_neuron_model, ) except ImportError as e: logger.warning(f"Could not import Neuron models: {e}") -def wrap_model_if_hpu(model_handle, device): - """Wrap the model in HPU graph if the device is HPU.""" - if device.type == "hpu": - from habana_frameworks.torch.hpu import wrap_in_hpu_graph - - model_handle.model = wrap_in_hpu_graph( - model_handle.model, disable_tensor_cache=DISABLE_TENSOR_CACHE - ) - return model_handle - - def create_model(model_class, model_path, device, datatype, pool="cls"): """Create a model instance and wrap it if needed.""" model_handle = model_class( diff --git a/backends/python/server/text_embeddings_server/models/habana/__init__.py b/backends/python/server/text_embeddings_server/models/habana/__init__.py new file mode 100644 index 000000000..267830de1 --- /dev/null +++ b/backends/python/server/text_embeddings_server/models/habana/__init__.py @@ -0,0 +1,14 @@ +import os + +DISABLE_TENSOR_CACHE = os.getenv("DISABLE_TENSOR_CACHE", "false").lower() in ["true", "1"] + + +def wrap_model_if_hpu(model_handle, device): + """Wrap the model in HPU graph if the device is HPU.""" + if device.type == "hpu": + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + + model_handle.model = wrap_in_hpu_graph( + model_handle.model, disable_tensor_cache=DISABLE_TENSOR_CACHE + ) + return model_handle diff --git a/backends/python/server/text_embeddings_server/models/neuron_models.py b/backends/python/server/text_embeddings_server/models/neuron/__init__.py similarity index 100% rename from backends/python/server/text_embeddings_server/models/neuron_models.py rename to backends/python/server/text_embeddings_server/models/neuron/__init__.py diff --git a/backends/src/dtype.rs b/backends/src/dtype.rs index 80292be79..ef16ca556 100644 --- a/backends/src/dtype.rs +++ b/backends/src/dtype.rs @@ -9,12 +9,18 @@ pub enum DType { // Float16 is not available on accelerate #[cfg(any( feature = "python", + feature = "python-neuron", all(feature = "candle", not(feature = "accelerate")) ))] Float16, - #[cfg(any(feature = "python", feature = "candle", feature = "ort"))] + #[cfg(any( + feature = "python", + feature = "python-neuron", + feature = "candle", + feature = "ort" + ))] Float32, - #[cfg(feature = "python")] + #[cfg(any(feature = "python", feature = "python-neuron"))] Bfloat16, } @@ -24,12 +30,18 @@ impl fmt::Display for DType { // Float16 is not available on accelerate #[cfg(any( feature = "python", + feature = "python-neuron", all(feature = "candle", not(feature = "accelerate")) ))] DType::Float16 => write!(f, "float16"), - #[cfg(any(feature = "python", feature = "candle", feature = "ort"))] + #[cfg(any( + feature = "python", + feature = "python-neuron", + feature = "candle", + feature = "ort" + ))] DType::Float32 => write!(f, "float32"), - #[cfg(feature = "python")] + #[cfg(any(feature = "python", feature = "python-neuron"))] DType::Bfloat16 => write!(f, "bfloat16"), } } @@ -46,12 +58,13 @@ impl Default for DType { feature = "accelerate", feature = "mkl", feature = "ort", - feature = "python" + feature = "python", + feature = "python-neuron" )))] { DType::Float16 } - #[cfg(feature = "python")] + #[cfg(any(feature = "python", feature = "python-neuron"))] { DType::Bfloat16 } diff --git a/backends/src/lib.rs b/backends/src/lib.rs index c6a5e3a27..8f9ee2838 100644 --- a/backends/src/lib.rs +++ b/backends/src/lib.rs @@ -28,7 +28,7 @@ use text_embeddings_backend_candle::CandleBackend; #[cfg(feature = "ort")] use text_embeddings_backend_ort::OrtBackend; -#[cfg(feature = "python")] +#[cfg(any(feature = "python", feature = "python-neuron"))] use text_embeddings_backend_python::PythonBackend; fn powers_of_two(max_value: usize) -> Vec { @@ -68,13 +68,6 @@ fn is_hpu() -> bool { } } -fn is_neuron() -> bool { - match Command::new("neuron-ls").output() { - Ok(output) => output.status.success(), - Err(_) => false, - } -} - #[derive(Debug, Clone)] pub struct Backend { /// Channel to communicate with the background thread @@ -423,9 +416,10 @@ async fn init_backend( } if let Some(api_repo) = api_repo.as_ref() { - if cfg!(feature = "python") || cfg!(feature = "candle") { - let start = std::time::Instant::now(); - if is_neuron() { + let start = std::time::Instant::now(); + if cfg!(feature = "python-neuron") { + #[cfg(feature = "python-neuron")] + { tracing::info!("Downloading `model.neuron`"); let model_files = download_neuron(api_repo) .await @@ -436,7 +430,7 @@ async fn init_backend( "Neuron model files not found in the repository. \ The Python backend will attempt to compile the model on-the-fly using optimum-neuron. \ This may take several minutes. For faster startup, consider pre-compiling your model: \ - https://huggingface.co/docs/optimum-neuron/en/model_doc/sentence_transformers/overview " + https://huggingface.co/docs/optimum-neuron/en/model_doc/sentence_transformers/overview" ); // Fall back to downloading regular model files for on-the-fly compilation if download_safetensors(api_repo.clone()).await.is_err() { @@ -452,21 +446,21 @@ async fn init_backend( } tracing::info!("Neuron model downloaded in {:?}", start.elapsed()); - } else { - if download_safetensors(api_repo.clone()).await.is_err() { - tracing::warn!( - "safetensors weights not found. Using `pytorch_model.bin` instead. \ - Model loading will be significantly slower." - ); - tracing::info!("Downloading `pytorch_model.bin`"); - api_repo - .get("pytorch_model.bin") - .await - .map_err(|err| BackendError::WeightsNotFound(err.to_string()))?; - } - - tracing::info!("Model weights downloaded in {:?}", start.elapsed()); } + } else if cfg!(feature = "python") || cfg!(feature = "candle") { + if download_safetensors(api_repo.clone()).await.is_err() { + tracing::warn!( + "safetensors weights not found. Using `pytorch_model.bin` instead. \ + Model loading will be significantly slower." + ); + tracing::info!("Downloading `pytorch_model.bin`"); + api_repo + .get("pytorch_model.bin") + .await + .map_err(|err| BackendError::WeightsNotFound(err.to_string()))?; + } + + tracing::info!("Model weights downloaded in {:?}", start.elapsed()); } } @@ -533,8 +527,8 @@ async fn init_backend( } } - if cfg!(feature = "python") { - #[cfg(feature = "python")] + if cfg!(feature = "python") || cfg!(feature = "python-neuron") { + #[cfg(any(feature = "python", feature = "python-neuron"))] { let backend = std::thread::spawn(move || { PythonBackend::new( @@ -775,6 +769,7 @@ async fn download_onnx(api: Arc) -> Result, ApiError> { } } +#[cfg(feature = "python-neuron")] async fn download_neuron(api: &ApiRepo) -> Result, ApiError> { let mut model_files: Vec = Vec::new(); diff --git a/integration_tests/README.md b/integration_tests/README.md index 69679a95e..ca20fbb9c 100644 --- a/integration_tests/README.md +++ b/integration_tests/README.md @@ -47,4 +47,3 @@ docker build . -f Dockerfile-neuron -t tei-neuron cd integration_tests/neuron uv run pytest --durations=0 -sv . ``` - diff --git a/router/Cargo.toml b/router/Cargo.toml index 381d611c0..605fa4dc3 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -86,6 +86,7 @@ metal = ["text-embeddings-backend/metal"] mkl = ["text-embeddings-backend/mkl", "dep:intel-mkl-src"] accelerate = ["text-embeddings-backend/accelerate"] python = ["text-embeddings-backend/python"] +python-neuron = ["text-embeddings-backend/python-neuron"] ort = ["text-embeddings-backend/ort"] candle = ["text-embeddings-backend/candle"] candle-cuda = ["candle", "text-embeddings-backend/flash-attn", "dep:cudarc"] From 3b48cbf33b78928769fd5f370c3f005bf85d4e37 Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Fri, 17 Apr 2026 16:45:01 +0000 Subject: [PATCH 19/20] draft:support in TorchNeuron way --- Dockerfile-neuron | 13 +- .../models/neuron/__init__.py | 402 ++++++------------ .../text_embeddings_server/utils/device.py | 3 +- 3 files changed, 145 insertions(+), 273 deletions(-) diff --git a/Dockerfile-neuron b/Dockerfile-neuron index 112c742b7..b4dceed65 100644 --- a/Dockerfile-neuron +++ b/Dockerfile-neuron @@ -150,26 +150,17 @@ RUN pip install --index-url https://pip.repos.neuron.amazonaws.com \ && rm -rf ~/.cache/pip/* # HF ARGS -# Note: optimum-neuron 0.4.4 requires transformers~=4.57.1 -ARG TRANSFORMERS_VERSION=4.57.1 -ARG DIFFUSERS_VERSION=0.35.2 +ARG TRANSFORMERS_VERSION=4.47.0 ARG HUGGINGFACE_HUB_VERSION=0.36.0 -ARG OPTIMUM_NEURON_VERSION=0.4.4 ARG SENTENCE_TRANSFORMERS=5.1.2 ARG PEFT_VERSION=0.17.0 -ARG DATASETS_VERSION=4.1.1 # Install Hugging Face libraries and dependencies for TEI on Neuron RUN pip install --no-cache-dir -U \ networkx==2.8.8 \ - transformers[sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \ - diffusers==${DIFFUSERS_VERSION} \ - compel \ - controlnet-aux \ + transformers[sentencepiece]==${TRANSFORMERS_VERSION} \ huggingface_hub==${HUGGINGFACE_HUB_VERSION} \ hf_transfer \ - datasets==${DATASETS_VERSION} \ - optimum-neuron==${OPTIMUM_NEURON_VERSION} \ sentence_transformers==${SENTENCE_TRANSFORMERS} \ peft==${PEFT_VERSION} \ && rm -rf ~/.cache/pip/* diff --git a/backends/python/server/text_embeddings_server/models/neuron/__init__.py b/backends/python/server/text_embeddings_server/models/neuron/__init__.py index 80745edc8..297e05eef 100644 --- a/backends/python/server/text_embeddings_server/models/neuron/__init__.py +++ b/backends/python/server/text_embeddings_server/models/neuron/__init__.py @@ -1,95 +1,107 @@ import inspect import os import torch +import torch.nn.functional as F -from abc import ABC +from abc import ABC, abstractmethod from pathlib import Path -from typing import Type, List +from typing import Type, List, Tuple from opentelemetry import trace from loguru import logger from text_embeddings_server.models.model import Model +from text_embeddings_server.models.pooling import DefaultPooling from text_embeddings_server.models.types import PaddedBatch, Embedding, Score tracer = trace.get_tracer(__name__) -# Neuron static shapes compilation parameters +NEURON_MODE = os.getenv("NEURON_MODE", "eager") # "eager" | "compile" NEURON_BATCH_SIZE = int(os.getenv("NEURON_BATCH_SIZE", "1")) NEURON_SEQUENCE_LENGTH = int(os.getenv("NEURON_SEQUENCE_LENGTH", "512")) +def _get_orig_module(model) -> torch.nn.Module: + """Return the unwrapped module whether or not it has been torch.compiled.""" + return getattr(model, "_orig_mod", model) + + +def _check_param(model, param_name: str) -> bool: + try: + fn = model.forward if hasattr(model, "forward") else model.__call__ + return inspect.signature(fn).parameters.get(param_name) is not None + except (ValueError, TypeError): + return False + + class NeuronBaseModel(Model, ABC): - """Base class for all Neuron models.""" + """Base class for Neuron models using torch-native eager or torch.compile mode.""" - def __init__( - self, - model, - model_path: Path, - device: torch.device, - dtype: torch.dtype, - ): - self.hidden_size = model.config.hidden_size + def __init__(self, model, device: torch.device, dtype: torch.dtype): + orig = _get_orig_module(model) + config = orig.config + + self.hidden_size = config.hidden_size - # Calculate max input length based on model type position_offset = 0 - model_type = model.config.model_type - if model_type in ["xlm-roberta", "camembert", "roberta"]: - position_offset = getattr(model.config, "pad_token_id", 1) + 1 - - if hasattr(model.config, "max_seq_length"): - self.max_input_length = model.config.max_seq_length - elif hasattr(model.config, "n_positions"): - self.max_input_length = model.config.n_positions + if config.model_type in ["xlm-roberta", "camembert", "roberta"]: + position_offset = getattr(config, "pad_token_id", 1) + 1 + + if hasattr(config, "max_seq_length"): + self.max_input_length = config.max_seq_length + elif hasattr(config, "n_positions"): + self.max_input_length = config.n_positions else: - self.max_input_length = ( - model.config.max_position_embeddings - position_offset - ) + self.max_input_length = config.max_position_embeddings - position_offset - # Check which inputs the model supports - self.has_position_ids = self._check_param_exists(model, "position_ids") - self.has_token_type_ids = self._check_param_exists(model, "token_type_ids") + self.has_position_ids = _check_param(orig, "position_ids") + self.has_token_type_ids = _check_param(orig, "token_type_ids") super().__init__(model=model, dtype=dtype, device=device) - @staticmethod - def _check_param_exists(model, param_name: str) -> bool: - """Check if a parameter exists in the model's forward signature.""" - try: - forward_fn = model.forward if hasattr(model, 'forward') else model.__call__ - return ( - inspect.signature(forward_fn).parameters.get(param_name, None) - is not None - ) - except (ValueError, TypeError): - return False - @property def batch_type(self) -> Type[PaddedBatch]: return PaddedBatch - def _prepare_inputs(self, batch: PaddedBatch) -> dict: - """Prepare input kwargs for model forward pass. + def _pad_to_static_shape(self, batch: PaddedBatch) -> Tuple[dict, int]: + """Pad all inputs to (NEURON_BATCH_SIZE, NEURON_SEQUENCE_LENGTH). - Note: Neuron models require int64 (long) tensors for inputs. + Neuron requires static shapes; padding to fixed dims avoids recompilation + on every distinct (batch, seq) pair seen in production. + Returns (padded_kwargs_on_cpu, actual_batch_size). """ - kwargs = { - "input_ids": batch.input_ids.to(torch.long), - "attention_mask": batch.attention_mask.to(torch.long), - } + actual_bs = batch.input_ids.shape[0] + actual_seq = batch.input_ids.shape[1] + + if actual_bs > NEURON_BATCH_SIZE: + raise ValueError( + f"Batch size {actual_bs} exceeds NEURON_BATCH_SIZE={NEURON_BATCH_SIZE}. " + f"Set NEURON_BATCH_SIZE>={actual_bs} to serve this batch." + ) + + seq_pad = max(0, NEURON_SEQUENCE_LENGTH - actual_seq) + batch_pad = max(0, NEURON_BATCH_SIZE - actual_bs) + + def _pad(t: torch.Tensor) -> torch.Tensor: + if seq_pad > 0: + t = F.pad(t, (0, seq_pad), value=0) + if batch_pad > 0: + t = F.pad(t, (0, 0, 0, batch_pad), value=0) + return t + + input_ids = _pad(batch.input_ids.to(torch.long)) + attention_mask = _pad(batch.attention_mask.to(torch.long)) + kwargs: dict = {"input_ids": input_ids, "attention_mask": attention_mask} + if self.has_token_type_ids: - kwargs["token_type_ids"] = batch.token_type_ids.to(torch.long) + kwargs["token_type_ids"] = _pad(batch.token_type_ids.to(torch.long)) if self.has_position_ids: - kwargs["position_ids"] = batch.position_ids.to(torch.long) - return kwargs + kwargs["position_ids"] = _pad(batch.position_ids.to(torch.long)) + return kwargs, actual_bs -class NeuronSentenceTransformersModel(Model): - """ - Neuron model for sentence-transformers. - Uses optimum.neuron.NeuronSentenceTransformers which is designed - for sentence embedding models. - """ +class NeuronDefaultModel(NeuronBaseModel): + """Neuron model for dense sentence embeddings.""" def __init__( self, @@ -99,131 +111,48 @@ def __init__( pool: str = "cls", trust_remote: bool = False, ): - from optimum.neuron import NeuronSentenceTransformers - from transformers import AutoConfig + from transformers import AutoModel - # Load config separately for reliable access - config = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote) - self.hidden_size = config.hidden_size + model = AutoModel.from_pretrained( + model_path, trust_remote_code=trust_remote + ).to(dtype).to(device) - position_offset = 0 - model_type = config.model_type - if model_type in ["xlm-roberta", "camembert", "roberta"]: - position_offset = getattr(config, "pad_token_id", 1) + 1 + # Extract before optional compile so DefaultPooling gets the hidden size + self.pooling = DefaultPooling(model.config.hidden_size, pooling_mode=pool) - if hasattr(config, "max_seq_length"): - self.max_input_length = config.max_seq_length - elif hasattr(config, "n_positions"): - self.max_input_length = config.n_positions - else: - self.max_input_length = ( - config.max_position_embeddings - position_offset - ) - - is_compiled = self._is_neuron_compiled(model_path) - if not is_compiled: - logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}") - model = NeuronSentenceTransformers.from_pretrained( - model_path, - export=True, - batch_size=NEURON_BATCH_SIZE, - sequence_length=NEURON_SEQUENCE_LENGTH, - ) - else: - model = NeuronSentenceTransformers.from_pretrained(model_path) - - self.pool = pool - super().__init__(model=model, dtype=dtype, device=device) - logger.info(f"Loaded NeuronSentenceTransformersModel with pool={pool}, hidden_size={self.hidden_size}") - - @staticmethod - def _is_neuron_compiled(model_path: Path) -> bool: - """Check if the model is already compiled for Neuron.""" - neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else [] - return len(neuron_files) > 0 + if NEURON_MODE == "compile": + logger.info("Wrapping NeuronDefaultModel with torch.compile(backend='neuron')") + model = torch.compile(model, backend="neuron", fullgraph=False) - @property - def batch_type(self) -> Type[PaddedBatch]: - return PaddedBatch + super().__init__(model, device, dtype) + logger.info(f"NeuronDefaultModel ready (mode={NEURON_MODE}, pool={pool})") @tracer.start_as_current_span("embed") def embed(self, batch: PaddedBatch) -> List[Embedding]: - # Prepare inputs - input_ids = batch.input_ids.to(torch.long) - attention_mask = batch.attention_mask.to(torch.long) - - output = self.model(input_ids, attention_mask) - - # Get sentence embeddings from output - sentence_embedding = None - if isinstance(output, dict): - # Check if sentence_embedding exists and has non-zero values - # NeuronSentenceTransformers may return zeros for sentence_embedding when pooling fails - has_valid_sentence_embedding = ( - "sentence_embedding" in output - and output["sentence_embedding"] is not None - and output["sentence_embedding"].abs().sum() > 0 - ) - if has_valid_sentence_embedding: - sentence_embedding = output["sentence_embedding"] - elif "token_embeddings" in output and output["token_embeddings"] is not None: - # Apply manual pooling when sentence_embedding is not valid - logger.debug(f"Using token_embeddings with manual {self.pool} pooling") - token_embeddings = output["token_embeddings"] - - if self.pool == "cls": - sentence_embedding = token_embeddings[:, 0, :] - elif self.pool == "mean": - mask = attention_mask.unsqueeze(-1).float() - sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1) - elif self.pool == "last_token": - seq_lengths = attention_mask.sum(dim=1) - 1 - sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths] - else: - raise ValueError(f"Invalid pooling mode: {self.pool}") - else: - raise ValueError(f"Cannot extract embeddings from model output dict: {output.keys()}") - elif hasattr(output, "sentence_embedding") and output.sentence_embedding is not None: - sentence_embedding = output.sentence_embedding - elif hasattr(output, "token_embeddings") and output.token_embeddings is not None: - token_embeddings = output.token_embeddings - if self.pool == "cls": - sentence_embedding = token_embeddings[:, 0, :] - elif self.pool == "mean": - mask = attention_mask.unsqueeze(-1).float() - sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1) - elif self.pool == "last_token": - seq_lengths = attention_mask.sum(dim=1) - 1 - sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths] - else: - raise ValueError(f"Invalid pooling mode: {self.pool}") - elif torch.is_tensor(output): - # Assume output is the sentence embedding tensor directly - sentence_embedding = output - else: - raise ValueError(f"Cannot extract embeddings from model output: type={type(output)}") + kwargs, actual_bs = self._pad_to_static_shape(batch) + + output = self.model(**{k: v.to(self.device) for k, v in kwargs.items()}) - # Convert to list format expected by the gRPC interface - cpu_results = sentence_embedding.view(-1).tolist() + # Move token embeddings back to CPU; pooling runs on CPU + token_embeddings = output[0][:actual_bs].to("cpu") + pool_mask = kwargs["attention_mask"][:actual_bs] # already on CPU + + # DefaultPooling.forward accepts list[tensor] so it can index [0] + embedding = self.pooling.forward([token_embeddings], pool_mask) + cpu_results = embedding.view(-1).tolist() return [ - Embedding( - values=cpu_results[i * self.hidden_size : (i + 1) * self.hidden_size] - ) - for i in range(len(batch)) + Embedding(values=cpu_results[i * self.hidden_size : (i + 1) * self.hidden_size]) + for i in range(actual_bs) ] @tracer.start_as_current_span("predict") def predict(self, batch: PaddedBatch) -> List[Score]: - raise NotImplementedError("Prediction not supported for sentence transformer models") + raise NotImplementedError("predict not supported for embedding models") class NeuronClassificationModel(NeuronBaseModel): - """ - Neuron-optimized model for sequence classification. - - Uses optimum.neuron.NeuronModelForSequenceClassification for classification tasks. - """ + """Neuron model for sequence classification.""" def __init__( self, @@ -233,56 +162,37 @@ def __init__( pool: str = "cls", trust_remote: bool = False, ): - from optimum.neuron import NeuronModelForSequenceClassification - - is_compiled = self._is_neuron_compiled(model_path) - export_kwargs = {} - if not is_compiled: - export_kwargs = { - "export": True, - "batch_size": NEURON_BATCH_SIZE, - "sequence_length": NEURON_SEQUENCE_LENGTH, - } - logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}") - model = NeuronModelForSequenceClassification.from_pretrained( - model_path, - **export_kwargs, - ) - - super().__init__(model, model_path, device, dtype) - logger.info("Loaded NeuronClassificationModel") - - @staticmethod - def _is_neuron_compiled(model_path: Path) -> bool: - """Check if the model is already compiled for Neuron.""" - neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else [] - return len(neuron_files) > 0 + from transformers import AutoModelForSequenceClassification + + model = AutoModelForSequenceClassification.from_pretrained( + model_path, trust_remote_code=trust_remote + ).to(dtype).to(device) + + if NEURON_MODE == "compile": + logger.info("Wrapping NeuronClassificationModel with torch.compile(backend='neuron')") + model = torch.compile(model, backend="neuron", fullgraph=False) + + super().__init__(model, device, dtype) + logger.info(f"NeuronClassificationModel ready (mode={NEURON_MODE})") @tracer.start_as_current_span("embed") def embed(self, batch: PaddedBatch) -> List[Embedding]: - raise NotImplementedError("Embedding not supported for classification models") + raise NotImplementedError("embed not supported for classification models") @tracer.start_as_current_span("predict") def predict(self, batch: PaddedBatch) -> List[Score]: - kwargs = self._prepare_inputs(batch) - output = self.model(**kwargs) + kwargs, actual_bs = self._pad_to_static_shape(batch) - # Get logits from output - if hasattr(output, "logits"): - logits = output.logits - else: - logits = output[0] + output = self.model(**{k: v.to(self.device) for k, v in kwargs.items()}) - all_scores = logits.tolist() - return [Score(values=scores) for scores in all_scores] + logits = output.logits if hasattr(output, "logits") else output[0] + logits_cpu = logits[:actual_bs].to("cpu").tolist() + return [Score(values=scores) for scores in logits_cpu] -class NeuronMaskedLMModel(NeuronBaseModel): - """ - Neuron-optimized model for Masked Language Modeling (SPLADE). - Uses optimum.neuron.NeuronModelForMaskedLM for SPLADE-style sparse embeddings. - """ +class NeuronMaskedLMModel(NeuronBaseModel): + """Neuron model for masked language modeling (SPLADE sparse embeddings).""" def __init__( self, @@ -292,63 +202,47 @@ def __init__( pool: str = "splade", trust_remote: bool = False, ): - from optimum.neuron import NeuronModelForMaskedLM - - is_compiled = self._is_neuron_compiled(model_path) - export_kwargs = {} - if not is_compiled: - export_kwargs = { - "export": True, - "batch_size": NEURON_BATCH_SIZE, - "sequence_length": NEURON_SEQUENCE_LENGTH, - } - logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}") - model = NeuronModelForMaskedLM.from_pretrained( - model_path, - **export_kwargs, - ) - - super().__init__(model, model_path, device, dtype) - - # Get vocab size for SPLADE output + from transformers import AutoModelForMaskedLM + + model = AutoModelForMaskedLM.from_pretrained( + model_path, trust_remote_code=trust_remote + ).to(dtype).to(device) + + # Extract before optional compile self.vocab_size = model.config.vocab_size - logger.info(f"Loaded NeuronMaskedLMModel with vocab_size={self.vocab_size}") - @staticmethod - def _is_neuron_compiled(model_path: Path) -> bool: - """Check if the model is already compiled for Neuron.""" - neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else [] - return len(neuron_files) > 0 + if NEURON_MODE == "compile": + logger.info("Wrapping NeuronMaskedLMModel with torch.compile(backend='neuron')") + model = torch.compile(model, backend="neuron", fullgraph=False) + + super().__init__(model, device, dtype) + logger.info(f"NeuronMaskedLMModel ready (mode={NEURON_MODE}, vocab_size={self.vocab_size})") @tracer.start_as_current_span("embed") def embed(self, batch: PaddedBatch) -> List[Embedding]: - kwargs = self._prepare_inputs(batch) - output = self.model(**kwargs) + kwargs, actual_bs = self._pad_to_static_shape(batch) - # Get logits for SPLADE pooling - if hasattr(output, "logits"): - hidden_states = output.logits - else: - hidden_states = output[0] + output = self.model(**{k: v.to(self.device) for k, v in kwargs.items()}) + + hidden_states = output.logits if hasattr(output, "logits") else output[0] + hidden_states = hidden_states[:actual_bs].to("cpu") + mask = kwargs["attention_mask"][:actual_bs].unsqueeze(-1).float() - # SPLADE pooling: ReLU -> log(1+x) -> max pooling + # SPLADE pooling: ReLU → log(1+x) → mask → max over sequence hidden_states = torch.relu(hidden_states) hidden_states = (1 + hidden_states).log() - hidden_states = torch.mul(hidden_states, batch.attention_mask.unsqueeze(-1)) + hidden_states = hidden_states * mask sparse_embedding = hidden_states.max(dim=1).values cpu_results = sparse_embedding.view(-1).tolist() - return [ - Embedding( - values=cpu_results[i * self.vocab_size : (i + 1) * self.vocab_size] - ) - for i in range(len(batch)) + Embedding(values=cpu_results[i * self.vocab_size : (i + 1) * self.vocab_size]) + for i in range(actual_bs) ] @tracer.start_as_current_span("predict") def predict(self, batch: PaddedBatch) -> List[Score]: - raise NotImplementedError("Prediction not supported for masked LM models") + raise NotImplementedError("predict not supported for masked LM models") def create_neuron_model( @@ -359,20 +253,7 @@ def create_neuron_model( trust_remote: bool = False, config=None, ) -> Model: - """ - Factory function to create the appropriate Neuron model based on the model config. - - Args: - model_path: Path to the model - device: Target device (should be xla for Neuron) - dtype: Data type for the model - pool: Pooling strategy (cls, mean, lasttoken, splade) - trust_remote: Whether to trust remote code - config: Pre-loaded model config (optional) - - Returns: - Appropriate Neuron model instance - """ + """Factory: pick the right Neuron model class from the model architecture.""" from transformers import AutoConfig if config is None: @@ -381,15 +262,14 @@ def create_neuron_model( architectures = getattr(config, "architectures", []) or [] architecture = architectures[0] if architectures else "" - logger.info(f"Creating Neuron model for architecture: {architecture}, pool: {pool}") + logger.info( + f"Creating Neuron model: architecture={architecture}, pool={pool}, mode={NEURON_MODE}" + ) - # Check for classification models if architecture.endswith("ForSequenceClassification") or architecture.endswith("Classification"): return NeuronClassificationModel(model_path, device, dtype, pool, trust_remote) - # Check for SPLADE (masked LM) models if pool == "splade" or architecture.endswith("ForMaskedLM"): return NeuronMaskedLMModel(model_path, device, dtype, pool, trust_remote) - # Default to NeuronSentenceTransformers for all embedding models - return NeuronSentenceTransformersModel(model_path, device, dtype, pool, trust_remote) + return NeuronDefaultModel(model_path, device, dtype, pool, trust_remote) diff --git a/backends/python/server/text_embeddings_server/utils/device.py b/backends/python/server/text_embeddings_server/utils/device.py index 4963b012c..0ec69260e 100644 --- a/backends/python/server/text_embeddings_server/utils/device.py +++ b/backends/python/server/text_embeddings_server/utils/device.py @@ -90,6 +90,7 @@ def get_device(): if hasattr(torch, "xpu") and torch.xpu.is_available(): device = torch.device("xpu") elif is_neuron(): - device = torch.device("xla") + import torch_neuronx # noqa: F401 — registers torch.device("neuron") as PrivateUse1 + device = torch.device("neuron") return device From 4d39a8bd0cadbdf63fc317dfcdfd5ab7dcae424a Mon Sep 17 00:00:00 2001 From: JingyaHuang Date: Fri, 17 Apr 2026 16:46:45 +0000 Subject: [PATCH 20/20] Revert "draft:support in TorchNeuron way" This reverts commit 3b48cbf33b78928769fd5f370c3f005bf85d4e37. --- Dockerfile-neuron | 13 +- .../models/neuron/__init__.py | 402 ++++++++++++------ .../text_embeddings_server/utils/device.py | 3 +- 3 files changed, 273 insertions(+), 145 deletions(-) diff --git a/Dockerfile-neuron b/Dockerfile-neuron index b4dceed65..112c742b7 100644 --- a/Dockerfile-neuron +++ b/Dockerfile-neuron @@ -150,17 +150,26 @@ RUN pip install --index-url https://pip.repos.neuron.amazonaws.com \ && rm -rf ~/.cache/pip/* # HF ARGS -ARG TRANSFORMERS_VERSION=4.47.0 +# Note: optimum-neuron 0.4.4 requires transformers~=4.57.1 +ARG TRANSFORMERS_VERSION=4.57.1 +ARG DIFFUSERS_VERSION=0.35.2 ARG HUGGINGFACE_HUB_VERSION=0.36.0 +ARG OPTIMUM_NEURON_VERSION=0.4.4 ARG SENTENCE_TRANSFORMERS=5.1.2 ARG PEFT_VERSION=0.17.0 +ARG DATASETS_VERSION=4.1.1 # Install Hugging Face libraries and dependencies for TEI on Neuron RUN pip install --no-cache-dir -U \ networkx==2.8.8 \ - transformers[sentencepiece]==${TRANSFORMERS_VERSION} \ + transformers[sentencepiece,audio,vision]==${TRANSFORMERS_VERSION} \ + diffusers==${DIFFUSERS_VERSION} \ + compel \ + controlnet-aux \ huggingface_hub==${HUGGINGFACE_HUB_VERSION} \ hf_transfer \ + datasets==${DATASETS_VERSION} \ + optimum-neuron==${OPTIMUM_NEURON_VERSION} \ sentence_transformers==${SENTENCE_TRANSFORMERS} \ peft==${PEFT_VERSION} \ && rm -rf ~/.cache/pip/* diff --git a/backends/python/server/text_embeddings_server/models/neuron/__init__.py b/backends/python/server/text_embeddings_server/models/neuron/__init__.py index 297e05eef..80745edc8 100644 --- a/backends/python/server/text_embeddings_server/models/neuron/__init__.py +++ b/backends/python/server/text_embeddings_server/models/neuron/__init__.py @@ -1,107 +1,95 @@ import inspect import os import torch -import torch.nn.functional as F -from abc import ABC, abstractmethod +from abc import ABC from pathlib import Path -from typing import Type, List, Tuple +from typing import Type, List from opentelemetry import trace from loguru import logger from text_embeddings_server.models.model import Model -from text_embeddings_server.models.pooling import DefaultPooling from text_embeddings_server.models.types import PaddedBatch, Embedding, Score tracer = trace.get_tracer(__name__) -NEURON_MODE = os.getenv("NEURON_MODE", "eager") # "eager" | "compile" +# Neuron static shapes compilation parameters NEURON_BATCH_SIZE = int(os.getenv("NEURON_BATCH_SIZE", "1")) NEURON_SEQUENCE_LENGTH = int(os.getenv("NEURON_SEQUENCE_LENGTH", "512")) -def _get_orig_module(model) -> torch.nn.Module: - """Return the unwrapped module whether or not it has been torch.compiled.""" - return getattr(model, "_orig_mod", model) - - -def _check_param(model, param_name: str) -> bool: - try: - fn = model.forward if hasattr(model, "forward") else model.__call__ - return inspect.signature(fn).parameters.get(param_name) is not None - except (ValueError, TypeError): - return False - - class NeuronBaseModel(Model, ABC): - """Base class for Neuron models using torch-native eager or torch.compile mode.""" - - def __init__(self, model, device: torch.device, dtype: torch.dtype): - orig = _get_orig_module(model) - config = orig.config + """Base class for all Neuron models.""" - self.hidden_size = config.hidden_size + def __init__( + self, + model, + model_path: Path, + device: torch.device, + dtype: torch.dtype, + ): + self.hidden_size = model.config.hidden_size + # Calculate max input length based on model type position_offset = 0 - if config.model_type in ["xlm-roberta", "camembert", "roberta"]: - position_offset = getattr(config, "pad_token_id", 1) + 1 - - if hasattr(config, "max_seq_length"): - self.max_input_length = config.max_seq_length - elif hasattr(config, "n_positions"): - self.max_input_length = config.n_positions + model_type = model.config.model_type + if model_type in ["xlm-roberta", "camembert", "roberta"]: + position_offset = getattr(model.config, "pad_token_id", 1) + 1 + + if hasattr(model.config, "max_seq_length"): + self.max_input_length = model.config.max_seq_length + elif hasattr(model.config, "n_positions"): + self.max_input_length = model.config.n_positions else: - self.max_input_length = config.max_position_embeddings - position_offset + self.max_input_length = ( + model.config.max_position_embeddings - position_offset + ) - self.has_position_ids = _check_param(orig, "position_ids") - self.has_token_type_ids = _check_param(orig, "token_type_ids") + # Check which inputs the model supports + self.has_position_ids = self._check_param_exists(model, "position_ids") + self.has_token_type_ids = self._check_param_exists(model, "token_type_ids") super().__init__(model=model, dtype=dtype, device=device) + @staticmethod + def _check_param_exists(model, param_name: str) -> bool: + """Check if a parameter exists in the model's forward signature.""" + try: + forward_fn = model.forward if hasattr(model, 'forward') else model.__call__ + return ( + inspect.signature(forward_fn).parameters.get(param_name, None) + is not None + ) + except (ValueError, TypeError): + return False + @property def batch_type(self) -> Type[PaddedBatch]: return PaddedBatch - def _pad_to_static_shape(self, batch: PaddedBatch) -> Tuple[dict, int]: - """Pad all inputs to (NEURON_BATCH_SIZE, NEURON_SEQUENCE_LENGTH). + def _prepare_inputs(self, batch: PaddedBatch) -> dict: + """Prepare input kwargs for model forward pass. - Neuron requires static shapes; padding to fixed dims avoids recompilation - on every distinct (batch, seq) pair seen in production. - Returns (padded_kwargs_on_cpu, actual_batch_size). + Note: Neuron models require int64 (long) tensors for inputs. """ - actual_bs = batch.input_ids.shape[0] - actual_seq = batch.input_ids.shape[1] - - if actual_bs > NEURON_BATCH_SIZE: - raise ValueError( - f"Batch size {actual_bs} exceeds NEURON_BATCH_SIZE={NEURON_BATCH_SIZE}. " - f"Set NEURON_BATCH_SIZE>={actual_bs} to serve this batch." - ) - - seq_pad = max(0, NEURON_SEQUENCE_LENGTH - actual_seq) - batch_pad = max(0, NEURON_BATCH_SIZE - actual_bs) - - def _pad(t: torch.Tensor) -> torch.Tensor: - if seq_pad > 0: - t = F.pad(t, (0, seq_pad), value=0) - if batch_pad > 0: - t = F.pad(t, (0, 0, 0, batch_pad), value=0) - return t - - input_ids = _pad(batch.input_ids.to(torch.long)) - attention_mask = _pad(batch.attention_mask.to(torch.long)) - kwargs: dict = {"input_ids": input_ids, "attention_mask": attention_mask} - + kwargs = { + "input_ids": batch.input_ids.to(torch.long), + "attention_mask": batch.attention_mask.to(torch.long), + } if self.has_token_type_ids: - kwargs["token_type_ids"] = _pad(batch.token_type_ids.to(torch.long)) + kwargs["token_type_ids"] = batch.token_type_ids.to(torch.long) if self.has_position_ids: - kwargs["position_ids"] = _pad(batch.position_ids.to(torch.long)) + kwargs["position_ids"] = batch.position_ids.to(torch.long) + return kwargs - return kwargs, actual_bs +class NeuronSentenceTransformersModel(Model): + """ + Neuron model for sentence-transformers. -class NeuronDefaultModel(NeuronBaseModel): - """Neuron model for dense sentence embeddings.""" + Uses optimum.neuron.NeuronSentenceTransformers which is designed + for sentence embedding models. + """ def __init__( self, @@ -111,48 +99,131 @@ def __init__( pool: str = "cls", trust_remote: bool = False, ): - from transformers import AutoModel + from optimum.neuron import NeuronSentenceTransformers + from transformers import AutoConfig - model = AutoModel.from_pretrained( - model_path, trust_remote_code=trust_remote - ).to(dtype).to(device) + # Load config separately for reliable access + config = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote) + self.hidden_size = config.hidden_size - # Extract before optional compile so DefaultPooling gets the hidden size - self.pooling = DefaultPooling(model.config.hidden_size, pooling_mode=pool) + position_offset = 0 + model_type = config.model_type + if model_type in ["xlm-roberta", "camembert", "roberta"]: + position_offset = getattr(config, "pad_token_id", 1) + 1 - if NEURON_MODE == "compile": - logger.info("Wrapping NeuronDefaultModel with torch.compile(backend='neuron')") - model = torch.compile(model, backend="neuron", fullgraph=False) + if hasattr(config, "max_seq_length"): + self.max_input_length = config.max_seq_length + elif hasattr(config, "n_positions"): + self.max_input_length = config.n_positions + else: + self.max_input_length = ( + config.max_position_embeddings - position_offset + ) - super().__init__(model, device, dtype) - logger.info(f"NeuronDefaultModel ready (mode={NEURON_MODE}, pool={pool})") + is_compiled = self._is_neuron_compiled(model_path) + if not is_compiled: + logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}") + model = NeuronSentenceTransformers.from_pretrained( + model_path, + export=True, + batch_size=NEURON_BATCH_SIZE, + sequence_length=NEURON_SEQUENCE_LENGTH, + ) + else: + model = NeuronSentenceTransformers.from_pretrained(model_path) - @tracer.start_as_current_span("embed") - def embed(self, batch: PaddedBatch) -> List[Embedding]: - kwargs, actual_bs = self._pad_to_static_shape(batch) + self.pool = pool + super().__init__(model=model, dtype=dtype, device=device) + logger.info(f"Loaded NeuronSentenceTransformersModel with pool={pool}, hidden_size={self.hidden_size}") + + @staticmethod + def _is_neuron_compiled(model_path: Path) -> bool: + """Check if the model is already compiled for Neuron.""" + neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else [] + return len(neuron_files) > 0 - output = self.model(**{k: v.to(self.device) for k, v in kwargs.items()}) + @property + def batch_type(self) -> Type[PaddedBatch]: + return PaddedBatch - # Move token embeddings back to CPU; pooling runs on CPU - token_embeddings = output[0][:actual_bs].to("cpu") - pool_mask = kwargs["attention_mask"][:actual_bs] # already on CPU + @tracer.start_as_current_span("embed") + def embed(self, batch: PaddedBatch) -> List[Embedding]: + # Prepare inputs + input_ids = batch.input_ids.to(torch.long) + attention_mask = batch.attention_mask.to(torch.long) + + output = self.model(input_ids, attention_mask) + + # Get sentence embeddings from output + sentence_embedding = None + if isinstance(output, dict): + # Check if sentence_embedding exists and has non-zero values + # NeuronSentenceTransformers may return zeros for sentence_embedding when pooling fails + has_valid_sentence_embedding = ( + "sentence_embedding" in output + and output["sentence_embedding"] is not None + and output["sentence_embedding"].abs().sum() > 0 + ) + if has_valid_sentence_embedding: + sentence_embedding = output["sentence_embedding"] + elif "token_embeddings" in output and output["token_embeddings"] is not None: + # Apply manual pooling when sentence_embedding is not valid + logger.debug(f"Using token_embeddings with manual {self.pool} pooling") + token_embeddings = output["token_embeddings"] + + if self.pool == "cls": + sentence_embedding = token_embeddings[:, 0, :] + elif self.pool == "mean": + mask = attention_mask.unsqueeze(-1).float() + sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1) + elif self.pool == "last_token": + seq_lengths = attention_mask.sum(dim=1) - 1 + sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths] + else: + raise ValueError(f"Invalid pooling mode: {self.pool}") + else: + raise ValueError(f"Cannot extract embeddings from model output dict: {output.keys()}") + elif hasattr(output, "sentence_embedding") and output.sentence_embedding is not None: + sentence_embedding = output.sentence_embedding + elif hasattr(output, "token_embeddings") and output.token_embeddings is not None: + token_embeddings = output.token_embeddings + if self.pool == "cls": + sentence_embedding = token_embeddings[:, 0, :] + elif self.pool == "mean": + mask = attention_mask.unsqueeze(-1).float() + sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1) + elif self.pool == "last_token": + seq_lengths = attention_mask.sum(dim=1) - 1 + sentence_embedding = token_embeddings[torch.arange(token_embeddings.size(0)), seq_lengths] + else: + raise ValueError(f"Invalid pooling mode: {self.pool}") + elif torch.is_tensor(output): + # Assume output is the sentence embedding tensor directly + sentence_embedding = output + else: + raise ValueError(f"Cannot extract embeddings from model output: type={type(output)}") - # DefaultPooling.forward accepts list[tensor] so it can index [0] - embedding = self.pooling.forward([token_embeddings], pool_mask) - cpu_results = embedding.view(-1).tolist() + # Convert to list format expected by the gRPC interface + cpu_results = sentence_embedding.view(-1).tolist() return [ - Embedding(values=cpu_results[i * self.hidden_size : (i + 1) * self.hidden_size]) - for i in range(actual_bs) + Embedding( + values=cpu_results[i * self.hidden_size : (i + 1) * self.hidden_size] + ) + for i in range(len(batch)) ] @tracer.start_as_current_span("predict") def predict(self, batch: PaddedBatch) -> List[Score]: - raise NotImplementedError("predict not supported for embedding models") + raise NotImplementedError("Prediction not supported for sentence transformer models") class NeuronClassificationModel(NeuronBaseModel): - """Neuron model for sequence classification.""" + """ + Neuron-optimized model for sequence classification. + + Uses optimum.neuron.NeuronModelForSequenceClassification for classification tasks. + """ def __init__( self, @@ -162,37 +233,56 @@ def __init__( pool: str = "cls", trust_remote: bool = False, ): - from transformers import AutoModelForSequenceClassification - - model = AutoModelForSequenceClassification.from_pretrained( - model_path, trust_remote_code=trust_remote - ).to(dtype).to(device) - - if NEURON_MODE == "compile": - logger.info("Wrapping NeuronClassificationModel with torch.compile(backend='neuron')") - model = torch.compile(model, backend="neuron", fullgraph=False) - - super().__init__(model, device, dtype) - logger.info(f"NeuronClassificationModel ready (mode={NEURON_MODE})") + from optimum.neuron import NeuronModelForSequenceClassification + + is_compiled = self._is_neuron_compiled(model_path) + export_kwargs = {} + if not is_compiled: + export_kwargs = { + "export": True, + "batch_size": NEURON_BATCH_SIZE, + "sequence_length": NEURON_SEQUENCE_LENGTH, + } + logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}") + model = NeuronModelForSequenceClassification.from_pretrained( + model_path, + **export_kwargs, + ) + + super().__init__(model, model_path, device, dtype) + logger.info("Loaded NeuronClassificationModel") + + @staticmethod + def _is_neuron_compiled(model_path: Path) -> bool: + """Check if the model is already compiled for Neuron.""" + neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else [] + return len(neuron_files) > 0 @tracer.start_as_current_span("embed") def embed(self, batch: PaddedBatch) -> List[Embedding]: - raise NotImplementedError("embed not supported for classification models") + raise NotImplementedError("Embedding not supported for classification models") @tracer.start_as_current_span("predict") def predict(self, batch: PaddedBatch) -> List[Score]: - kwargs, actual_bs = self._pad_to_static_shape(batch) + kwargs = self._prepare_inputs(batch) + output = self.model(**kwargs) - output = self.model(**{k: v.to(self.device) for k, v in kwargs.items()}) - - logits = output.logits if hasattr(output, "logits") else output[0] - logits_cpu = logits[:actual_bs].to("cpu").tolist() + # Get logits from output + if hasattr(output, "logits"): + logits = output.logits + else: + logits = output[0] - return [Score(values=scores) for scores in logits_cpu] + all_scores = logits.tolist() + return [Score(values=scores) for scores in all_scores] class NeuronMaskedLMModel(NeuronBaseModel): - """Neuron model for masked language modeling (SPLADE sparse embeddings).""" + """ + Neuron-optimized model for Masked Language Modeling (SPLADE). + + Uses optimum.neuron.NeuronModelForMaskedLM for SPLADE-style sparse embeddings. + """ def __init__( self, @@ -202,47 +292,63 @@ def __init__( pool: str = "splade", trust_remote: bool = False, ): - from transformers import AutoModelForMaskedLM - - model = AutoModelForMaskedLM.from_pretrained( - model_path, trust_remote_code=trust_remote - ).to(dtype).to(device) - - # Extract before optional compile + from optimum.neuron import NeuronModelForMaskedLM + + is_compiled = self._is_neuron_compiled(model_path) + export_kwargs = {} + if not is_compiled: + export_kwargs = { + "export": True, + "batch_size": NEURON_BATCH_SIZE, + "sequence_length": NEURON_SEQUENCE_LENGTH, + } + logger.info(f"Compiling model for Neuron with batch_size={NEURON_BATCH_SIZE}, sequence_length={NEURON_SEQUENCE_LENGTH}") + model = NeuronModelForMaskedLM.from_pretrained( + model_path, + **export_kwargs, + ) + + super().__init__(model, model_path, device, dtype) + + # Get vocab size for SPLADE output self.vocab_size = model.config.vocab_size + logger.info(f"Loaded NeuronMaskedLMModel with vocab_size={self.vocab_size}") - if NEURON_MODE == "compile": - logger.info("Wrapping NeuronMaskedLMModel with torch.compile(backend='neuron')") - model = torch.compile(model, backend="neuron", fullgraph=False) - - super().__init__(model, device, dtype) - logger.info(f"NeuronMaskedLMModel ready (mode={NEURON_MODE}, vocab_size={self.vocab_size})") + @staticmethod + def _is_neuron_compiled(model_path: Path) -> bool: + """Check if the model is already compiled for Neuron.""" + neuron_files = list(model_path.glob("*.neuron")) if model_path.is_dir() else [] + return len(neuron_files) > 0 @tracer.start_as_current_span("embed") def embed(self, batch: PaddedBatch) -> List[Embedding]: - kwargs, actual_bs = self._pad_to_static_shape(batch) + kwargs = self._prepare_inputs(batch) + output = self.model(**kwargs) - output = self.model(**{k: v.to(self.device) for k, v in kwargs.items()}) - - hidden_states = output.logits if hasattr(output, "logits") else output[0] - hidden_states = hidden_states[:actual_bs].to("cpu") - mask = kwargs["attention_mask"][:actual_bs].unsqueeze(-1).float() + # Get logits for SPLADE pooling + if hasattr(output, "logits"): + hidden_states = output.logits + else: + hidden_states = output[0] - # SPLADE pooling: ReLU → log(1+x) → mask → max over sequence + # SPLADE pooling: ReLU -> log(1+x) -> max pooling hidden_states = torch.relu(hidden_states) hidden_states = (1 + hidden_states).log() - hidden_states = hidden_states * mask + hidden_states = torch.mul(hidden_states, batch.attention_mask.unsqueeze(-1)) sparse_embedding = hidden_states.max(dim=1).values cpu_results = sparse_embedding.view(-1).tolist() + return [ - Embedding(values=cpu_results[i * self.vocab_size : (i + 1) * self.vocab_size]) - for i in range(actual_bs) + Embedding( + values=cpu_results[i * self.vocab_size : (i + 1) * self.vocab_size] + ) + for i in range(len(batch)) ] @tracer.start_as_current_span("predict") def predict(self, batch: PaddedBatch) -> List[Score]: - raise NotImplementedError("predict not supported for masked LM models") + raise NotImplementedError("Prediction not supported for masked LM models") def create_neuron_model( @@ -253,7 +359,20 @@ def create_neuron_model( trust_remote: bool = False, config=None, ) -> Model: - """Factory: pick the right Neuron model class from the model architecture.""" + """ + Factory function to create the appropriate Neuron model based on the model config. + + Args: + model_path: Path to the model + device: Target device (should be xla for Neuron) + dtype: Data type for the model + pool: Pooling strategy (cls, mean, lasttoken, splade) + trust_remote: Whether to trust remote code + config: Pre-loaded model config (optional) + + Returns: + Appropriate Neuron model instance + """ from transformers import AutoConfig if config is None: @@ -262,14 +381,15 @@ def create_neuron_model( architectures = getattr(config, "architectures", []) or [] architecture = architectures[0] if architectures else "" - logger.info( - f"Creating Neuron model: architecture={architecture}, pool={pool}, mode={NEURON_MODE}" - ) + logger.info(f"Creating Neuron model for architecture: {architecture}, pool: {pool}") + # Check for classification models if architecture.endswith("ForSequenceClassification") or architecture.endswith("Classification"): return NeuronClassificationModel(model_path, device, dtype, pool, trust_remote) + # Check for SPLADE (masked LM) models if pool == "splade" or architecture.endswith("ForMaskedLM"): return NeuronMaskedLMModel(model_path, device, dtype, pool, trust_remote) - return NeuronDefaultModel(model_path, device, dtype, pool, trust_remote) + # Default to NeuronSentenceTransformers for all embedding models + return NeuronSentenceTransformersModel(model_path, device, dtype, pool, trust_remote) diff --git a/backends/python/server/text_embeddings_server/utils/device.py b/backends/python/server/text_embeddings_server/utils/device.py index 0ec69260e..4963b012c 100644 --- a/backends/python/server/text_embeddings_server/utils/device.py +++ b/backends/python/server/text_embeddings_server/utils/device.py @@ -90,7 +90,6 @@ def get_device(): if hasattr(torch, "xpu") and torch.xpu.is_available(): device = torch.device("xpu") elif is_neuron(): - import torch_neuronx # noqa: F401 — registers torch.device("neuron") as PrivateUse1 - device = torch.device("neuron") + device = torch.device("xla") return device