Draft
37 commits
4009d62
Add support for BF16 w/ `candle` (WIP)
alvarobartt Jan 20, 2026
4a9400e
Add support for `voyageai/voyage-4-*` (WIP)
alvarobartt Jan 20, 2026
e9b9332
Remove `half` crate dependency
alvarobartt Jan 20, 2026
d4f66bf
Add `tracing::warn!` on `DType::Bfloat16`
alvarobartt Jan 21, 2026
db7df49
Update `channel` to 1.92 in `rust-toolchain.toml`
alvarobartt Jan 21, 2026
6b2a1f1
Constrain `Bfloat16` on `candle` for Metal and CUDA
alvarobartt Jan 31, 2026
cec5de8
Update `tracing::warn!` for `DType::Bfloat16`
alvarobartt Jan 31, 2026
8889979
Fix `DType` enum and impl (never default to BF16)
alvarobartt Jan 31, 2026
35dc4d7
Add minimum finite value for `DType::BF16`
alvarobartt Jan 31, 2026
d7962ed
Restore `qwen3.rs` prior Voyage AI related changes
alvarobartt Jan 31, 2026
92c6ae7
Use `actions-rust-lang` and fix typo in "linting"
alvarobartt Jan 31, 2026
3838c04
Fix formatting in YAML files
alvarobartt Jan 31, 2026
04f5d63
Set default minimum finite value to FP16 value
alvarobartt Jan 31, 2026
b1a1b23
Calculate distance in Gemma3 w/ `abs_diff` instead (clippy)
alvarobartt Jan 31, 2026
b547b0b
Fix feature-gating for `DType::Bfloat16` (and exclude Turing and Volta)
alvarobartt Jan 31, 2026
43a8e99
Add note on lack of BF16 support for Turing (and earlier)
alvarobartt Jan 31, 2026
66b6a7b
Use `into_iter` for `Sequence` as `get_pre_tokenizers` unavailable
alvarobartt Jan 31, 2026
07fe2e7
Add runtime validation on CUDA compute cap for BF16
alvarobartt Jan 31, 2026
e506c93
Update `rustc` to latest stable 1.92
alvarobartt Jan 31, 2026
e3e34b8
Merge branch 'main' into add-bfloat16-support
alvarobartt Jan 31, 2026
eb4f13b
Revert `Default` impl for `DType` when `feature = "python"`
alvarobartt Jan 31, 2026
9745b0e
Add missing `cuda` feature flag to `candle-cuda` and `candle-cuda-tur…
alvarobartt Feb 1, 2026
9387804
Add BF16 support for `FlashQwen3`
alvarobartt Feb 1, 2026
29fe799
Remove `feature = "python"` from `tracing::warn!` on BF16
alvarobartt Feb 1, 2026
df60701
Add `supports_flash_attn` to remove duplicated code
alvarobartt Feb 1, 2026
b83102a
Update `index_select` to exclude CUDA + BF16
alvarobartt Feb 2, 2026
f6880c7
Skip BF16 support for CUDA (only Metal)
alvarobartt Feb 2, 2026
c19087e
Fix `index_select` feature gating
alvarobartt Feb 2, 2026
1d97366
Merge branch 'main' into add-bfloat16-support
alvarobartt Mar 30, 2026
a5ba1e4
Merge branch 'main' into add-bfloat16-support
alvarobartt Apr 29, 2026
65080b3
Bring `candle` related progress from `update-candle-wo-linking`
alvarobartt Apr 29, 2026
2ae1832
Update `Cargo.lock` and fix build on CUDA
alvarobartt Apr 29, 2026
a7272be
Fix default `{dynamic,static}-linking`
alvarobartt Apr 29, 2026
7fddd78
Add missing `candle-flash-attn-v1`
alvarobartt Apr 29, 2026
8419116
Set `default-features` to false for `candle-*`
alvarobartt Apr 29, 2026
5bc43f0
Run `cargo update`
alvarobartt Apr 29, 2026
f9fa7a2
Fix `static-linking` in `Dockerfile-cuda` (WIP)
alvarobartt Apr 30, 2026
2,803 changes: 1,601 additions & 1,202 deletions Cargo.lock

Large diffs are not rendered by default.

33 changes: 17 additions & 16 deletions Cargo.toml
@@ -42,25 +42,26 @@ serde_json = "1.0"
thiserror = "1.0"
rand = "0.9"
serial_test = "2.0.0"
cudarc = { version = "0.13", features =["cuda-12020"], default-features = false }
cudarc = { version = "0.19", features = ["cuda-version-from-build-system"], default-features = false }
intel-mkl-src = { version = "0.8", default-features = false }
candle = { version = "0.8", package = "candle-core" }
candle-nn = { version = "0.8" }
candle-transformers = { version = "0.8" }
candle-flash-attn = { version = "0.8" }
candle-cublaslt = { version = "0.0.1" }
candle-layer-norm = { version = "0.0.1" }
candle-index-select-cu = { version = "0.0.1", features = ["cuda-11"], default-features = false }
candle-rotary = { version = "0.0.1" }
candle-flash-attn-v1 = { version = "0.0.1" }
half = { version = "2.3.1", features = ["num-traits"] }
candle = { version = "0.9.2", package = "candle-core" }
candle-nn = { version = "0.9.2" }
candle-transformers = { version = "0.9.2" }
candle-flash-attn = { version = "0.9.2" }
candle-cublaslt = { version = "0.0.1", default-features = false }
candle-layer-norm = { version = "0.0.1", default-features = false }
candle-rotary = { version = "0.0.1", default-features = false }
candle-flash-attn-v1 = { version = "0.0.1", default-features = false }

[patch.crates-io]
cudarc = { git = "https://github.com/Narsil/cudarc" , rev = "8b4f18b4bcd5e4b1a9daf40abc3a2e27f83f06e9"}
candle = { git = "https://github.com/huggingface/candle", rev = "6381023982251959a2c9bab7378b3013304e192b", package = "candle-core" }
candle-nn = { git = "https://github.com/huggingface/candle", rev = "6381023982251959a2c9bab7378b3013304e192b", package = "candle-nn" }
candle-transformers = { git = "https://github.com/huggingface/candle", rev = "6381023982251959a2c9bab7378b3013304e192b", package = "candle-transformers" }
candle-flash-attn = { git = "https://github.com/huggingface/candle", rev = "6381023982251959a2c9bab7378b3013304e192b", package = "candle-flash-attn" }
candle = { git = "https://github.com/huggingface/candle", branch = "no-default-linking", package = "candle-core" }
candle-nn = { git = "https://github.com/huggingface/candle", branch = "no-default-linking", package = "candle-nn" }
candle-transformers = { git = "https://github.com/huggingface/candle", branch = "no-default-linking", package = "candle-transformers" }
candle-flash-attn = { git = "https://github.com/huggingface/candle", branch = "no-default-linking", package = "candle-flash-attn" }
candle-cublaslt = { git = "https://github.com/huggingface/candle-extensions", branch = "allow-static-linking" }
candle-layer-norm = { git = "https://github.com/huggingface/candle-extensions", branch = "allow-static-linking" }
candle-rotary = { git = "https://github.com/huggingface/candle-extensions", branch = "allow-static-linking" }
candle-flash-attn-v1 = { git = "https://github.com/huggingface/candle-extensions", branch = "allow-static-linking" }

[profile.release]
debug = 0
5 changes: 4 additions & 1 deletion Dockerfile-cuda
@@ -8,9 +8,12 @@ ENV CARGO_CHEF=0.1.73

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
curl \
git \
libssl-dev \
libstdc++-13-dev \
pkg-config \
&& rm -rf /var/lib/apt/lists/*
&& rm -rf /var/lib/apt/lists/* \
&& ln -sf "$(gcc --print-file-name=libstdc++.a)" "/usr/lib/$(gcc -print-multiarch)/libstdc++.a"

# Download and configure sccache (multi-arch)
ARG TARGETARCH
2 changes: 2 additions & 0 deletions backends/Cargo.toml
@@ -29,3 +29,5 @@ mkl = ["text-embeddings-backend-candle?/mkl"]
accelerate = ["text-embeddings-backend-candle?/accelerate"]
flash-attn = ["text-embeddings-backend-candle?/flash-attn"]
flash-attn-v1 = ["text-embeddings-backend-candle?/flash-attn-v1"]
static-linking = ["text-embeddings-backend-candle?/static-linking"]
dynamic-linking = ["text-embeddings-backend-candle?/dynamic-linking"]
7 changes: 4 additions & 3 deletions backends/candle/Cargo.toml
@@ -15,7 +15,6 @@ candle-transformers = { workspace = true }
candle-flash-attn = { workspace = true, optional = true}
candle-flash-attn-v1 = { workspace = true, optional = true }
candle-cublaslt = { workspace = true, optional = true }
candle-index-select-cu = { workspace = true, optional = true, features = ["cuda-11"], default-features = false}
candle-layer-norm = { workspace = true, optional = true }
candle-rotary = { workspace = true, optional = true }
nohash-hasher = { workspace = true }
@@ -41,7 +40,9 @@ anyhow = { version = "1", features = ["backtrace"] }
[features]
accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate"]
metal = ["candle/metal", "candle-nn/metal"]
mkl = ["dep:intel-mkl-src", "candle/_mkl"]
cuda = ["candle/_cuda", "candle-nn/_cuda", "dep:candle-cublaslt", "dep:candle-layer-norm", "dep:candle-rotary", "dep:candle-index-select-cu"]
mkl = ["dep:intel-mkl-src", "candle/mkl-enabled"]
cuda = ["candle/cuda-enabled", "candle-nn/cuda-enabled", "dep:candle-cublaslt", "dep:candle-layer-norm", "dep:candle-rotary"]
flash-attn-v1 = ["dep:candle-flash-attn-v1", "cuda"]
flash-attn = ["dep:candle-flash-attn", "cuda"]
static-linking = ["candle-cublaslt?/static-linking", "candle-layer-norm?/static-linking", "candle-rotary?/static-linking", "candle-flash-attn-v1?/static-linking"]
dynamic-linking = ["candle-cublaslt?/dynamic-linking", "candle-layer-norm?/dynamic-linking", "candle-rotary?/dynamic-linking", "candle-flash-attn-v1?/dynamic-linking"]
8 changes: 4 additions & 4 deletions backends/candle/src/compute_cap.rs
@@ -1,9 +1,9 @@
use anyhow::Context;

use candle::cuda_backend::cudarc::driver;
use candle::cuda_backend::cudarc::driver::sys::CUdevice_attribute::{
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
};
use candle::cuda_backend::cudarc::driver::CudaDevice;

pub fn get_compile_compute_cap() -> Result<usize, anyhow::Error> {
env!("CUDA_COMPUTE_CAP")
@@ -13,11 +13,11 @@ pub fn get_compile_compute_cap() -> Result<usize, anyhow::Error> {

pub fn get_runtime_compute_cap() -> Result<usize, anyhow::Error> {
driver::result::init().context("CUDA is not available")?;
let device = CudaDevice::new(0).context("CUDA is not available")?;
let major = device
let context = driver::CudaContext::new(0).context("CUDA is not available")?;
let major = context
.attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)
.context("Could not retrieve device compute capability major")?;
let minor = device
let minor = context
.attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)
.context("Could not retrieve device compute capability minor")?;
Ok((major * 10 + minor) as usize)
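
The runtime compute-capability helper above is what the BF16 gating in `lib.rs` is meant to build on (the CUDA path there is currently commented out). A minimal sketch of such a gate, reusing `get_runtime_compute_cap` as defined above; the helper name `validate_bf16_compute_cap` is only illustrative:

```rust
use anyhow::bail;

// Sketch only: reject BF16 on pre-Ampere GPUs, reusing the
// `get_runtime_compute_cap()` helper from compute_cap.rs above.
fn validate_bf16_compute_cap() -> Result<(), anyhow::Error> {
    let compute_cap = get_runtime_compute_cap()?;
    // BF16 support starts at compute capability 8.0 (Ampere); Volta (7.0)
    // and Turing (7.5) only provide FP16 tensor cores.
    if compute_cap < 80 {
        bail!(
            "BFloat16 requires CUDA compute capability >= 8.0, found {}.{}",
            compute_cap / 10,
            compute_cap % 10
        );
    }
    Ok(())
}
```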
1 change: 1 addition & 0 deletions backends/candle/src/flash_attn.rs
@@ -3,6 +3,7 @@ use std::sync::Once;

static INIT: Once = Once::new();
static mut RUNTIME_COMPUTE_CAP: usize = 0;

fn init_runtime_compute_cap() {
unsafe {
INIT.call_once(|| {
14 changes: 1 addition & 13 deletions backends/candle/src/layers/index_select.rs
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: MIT or Apache-2.0
// First Published under RadixMLP and https://github.com/michaelfeil/candle-index-select-cu by Michael Feil

use candle::{Result, Tensor};
#[cfg(feature = "cuda")]
use candle_index_select_cu;

#[inline]
#[allow(dead_code)]
pub fn index_select(tensor: &Tensor, ids: &Tensor, dim: usize) -> Result<Tensor> {
#[cfg(not(feature = "cuda"))]
{
tensor.index_select(ids, dim)
}
#[cfg(feature = "cuda")]
{
candle_index_select_cu::index_select(tensor, ids, dim)
}
tensor.index_select(ids, dim)
}
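
With the CUDA-specific `candle-index-select-cu` kernel dropped, the wrapper above is now a thin pass-through to candle's built-in `Tensor::index_select` on every device. A small usage sketch (shapes and values are illustrative):

```rust
use candle::{DType, Device, Result, Tensor};

// Illustrative only: gather rows 0, 3 and 7 from a lookup table; this is
// exactly what the `index_select` wrapper above delegates to.
fn gather_rows() -> Result<Tensor> {
    let table = Tensor::zeros((10, 4), DType::F32, &Device::Cpu)?;
    let ids = Tensor::new(&[0u32, 3, 7], &Device::Cpu)?;
    table.index_select(&ids, 0) // result shape: (3, 4)
}
```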
42 changes: 41 additions & 1 deletion backends/candle/src/lib.rs
@@ -261,11 +261,42 @@ impl CandleBackend {
}
.map_err(|err| BackendError::Start(err.to_string()))?;

// Get candle dtype
let dtype = if &dtype == "float32" {
Ok(DType::F32)
} else if &dtype == "float16" {
Ok(DType::F16)
} else if &dtype == "bfloat16" {
match &device {
Device::Cpu => {
return Err(BackendError::Start(
"BFloat16 is not supported on CPU. Use float16 or float32 instead."
.to_string(),
));
}
Device::Cuda(_) => {
return Err(BackendError::Start(
"CUDA feature is not enabled".to_string(),
));
}
// NOTE: Temporarily left out given that supporting BF16 w/ Flash Attn requires an
// update on `candle` and `candle-extensions` which is still in progress
// #[cfg(feature = "cuda")]
// Device::Cuda(_) => {
// let compute_cap = get_runtime_compute_cap().map_err(|e| {
// BackendError::Start(format!("Failed to get CUDA compute capability: {e:?}"))
// })?;
// if compute_cap < 80 {
// return Err(BackendError::Start(format!(
// "BFloat16 requires CUDA compute capability >= 8.0 (Ampere or newer), \
// but found {}.{}. Use float16 or float32 instead.",
// compute_cap / 10,
// compute_cap % 10
// )));
// }
// }
Device::Metal(_) => (),
}
Ok(DType::BF16)
} else {
Err(BackendError::Start(format!(
"DType {dtype} is not supported"
@@ -377,6 +408,7 @@ impl CandleBackend {
}
#[cfg(feature = "cuda")]
(Config::Bert(config), Device::Cuda(_)) => {
// TODO(alvarobartt): Include the `dtype` as an arg in `use_flash_attn`
if dtype == DType::F16 && use_flash_attn(&[FlashAttn::V1, FlashAttn::V2]) {
match config {
BertConfigWrapper::JinaBert(config) => {
@@ -420,6 +452,7 @@ impl CandleBackend {
Config::Camembert(config) | Config::Roberta(config) | Config::XlmRoberta(config),
Device::Cuda(_),
) => {
// TODO(alvarobartt): Include the `dtype` as an arg in `use_flash_attn`
if dtype == DType::F16 && use_flash_attn(&[FlashAttn::V1, FlashAttn::V2]) {
tracing::info!("Starting FlashBert model on {:?}", device);
Ok(Box::new(
@@ -439,6 +472,7 @@ impl CandleBackend {
}
#[cfg(feature = "cuda")]
(Config::DistilBert(config), Device::Cuda(_)) => {
// TODO(alvarobartt): Include the `dtype` as an arg in `use_flash_attn`
if dtype == DType::F16 && use_flash_attn(&[FlashAttn::V2]) {
tracing::info!("Starting FlashDistilBert model on {:?}", device);
Ok(Box::new(
@@ -465,6 +499,7 @@ impl CandleBackend {
}
#[cfg(feature = "cuda")]
(Config::Gte(config), Device::Cuda(_)) => {
// TODO(alvarobartt): Include the `dtype` as an arg in `use_flash_attn`
if dtype == DType::F16 && use_flash_attn(&[FlashAttn::V1, FlashAttn::V2]) {
tracing::info!("Starting FlashGTE model on {:?}", device);
Ok(Box::new(FlashGTEModel::load(vb, &config, model_type).s()?))
@@ -475,6 +510,7 @@ impl CandleBackend {
}
#[cfg(feature = "cuda")]
(Config::Mistral(config), Device::Cuda(_)) => {
// TODO(alvarobartt): Include the `dtype` as an arg in `use_flash_attn`
if !(dtype == DType::F16 && use_flash_attn(&[FlashAttn::V2])) {
return Err(BackendError::Start("Mistral is only supported on Cuda devices in fp16 with flash attention v2 enabled".to_string()));
}
@@ -485,6 +521,7 @@ impl CandleBackend {
}
#[cfg(feature = "cuda")]
(Config::ModernBert(config), Device::Cuda(_)) => {
// TODO(alvarobartt): Include the `dtype` as an arg in `use_flash_attn`
if dtype == DType::F16 && use_flash_attn(&[FlashAttn::V2]) {
tracing::info!("Starting FlashModernBert model on {:?}", device);
Ok(Box::new(
@@ -501,6 +538,7 @@ impl CandleBackend {
}
#[cfg(feature = "cuda")]
(Config::NomicBert(config), Device::Cuda(_)) => {
// TODO(alvarobartt): Include the `dtype` as an arg in `use_flash_attn`
if dtype == DType::F16 && use_flash_attn(&[FlashAttn::V2]) {
tracing::info!("Starting FlashNomicBert model on {:?}", device);
Ok(Box::new(
@@ -513,6 +551,7 @@ impl CandleBackend {
}
#[cfg(feature = "cuda")]
(Config::Qwen2(config), Device::Cuda(_)) => {
// TODO(alvarobartt): Include the `dtype` as an arg in `use_flash_attn`
if !(dtype == DType::F16 && use_flash_attn(&[FlashAttn::V1, FlashAttn::V2])) {
return Err(BackendError::Start("Qwen2 is only supported on Cuda devices in fp16 with flash attention v2 enabled".to_string()));
}
@@ -523,6 +562,7 @@ impl CandleBackend {
}
#[cfg(feature = "cuda")]
(Config::Qwen3(config), Device::Cuda(_)) => {
// TODO(alvarobartt): Include the `dtype` as an arg in `use_flash_attn`
if dtype == DType::F16 && use_flash_attn(&[FlashAttn::V1, FlashAttn::V2]) {
tracing::info!("Starting FlashQwen3 model on {:?}", device);
Ok(Box::new(
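
Several of the hunks above add the same TODO about passing `dtype` into the flash-attention check. Commit df60701 introduces a `supports_flash_attn` helper for exactly this; its actual signature is not visible in this diff, so the following is only a sketch of the idea, reusing the `use_flash_attn` and `FlashAttn` items already referenced above:

```rust
// Sketch only: fold the repeated `dtype == DType::F16 && use_flash_attn(...)`
// guard into a single helper so the dtype check lives in one place.
fn supports_flash_attn(dtype: DType, variants: &[FlashAttn]) -> bool {
    // Flash attention is currently only wired up for F16 weights; BF16 here
    // still depends on pending `candle` / `candle-extensions` updates.
    dtype == DType::F16 && use_flash_attn(variants)
}
```

Each call site would then reduce to `if supports_flash_attn(dtype, &[FlashAttn::V1, FlashAttn::V2]) { ... }`.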
6 changes: 5 additions & 1 deletion backends/candle/src/models/gemma3.rs
@@ -260,7 +260,11 @@ impl Gemma3Attention {
) -> Result<Tensor> {
let min_value = match dtype {
DType::F32 => f32::MIN,
_ => -65504.0, // f16 minimum value
DType::BF16 => -3.3895314e38_f32,
DType::F16 => -65504.0_f32,
// SAFETY: Default to the F16 minimum finite value, even though `dtype` should
// always match one of the variants above
_ => -65504.0_f32,
};

let mask: Vec<u8> = (0..seq_len)
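
The `-3.3895314e38` constant used above (and in the modernbert, mpnet and qwen3 hunks below) is BF16's minimum finite value: BF16 keeps f32's 8-bit exponent but only 7 mantissa bits, so its largest finite magnitude is (2 - 2^-7) * 2^127 ≈ 3.39e38, versus 65504 for FP16. A quick standalone check (illustrative only; the crate itself no longer depends on the `half` crate):

```rust
// Illustrative sanity check for the BF16 minimum-finite constant above.
fn main() {
    // BF16: sign bit, 8 exponent bits (same as f32), 7 mantissa bits.
    let bf16_min_finite = -(2.0 - 2f32.powi(-7)) * 2f32.powi(127);
    assert_eq!(bf16_min_finite, -3.3895314e38_f32);
    // FP16 for comparison: (2 - 2^-10) * 2^15 = 65504.
    let f16_min_finite = -(2.0 - 2f32.powi(-10)) * 2f32.powi(15);
    assert_eq!(f16_min_finite, -65504.0_f32);
}
```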
6 changes: 5 additions & 1 deletion backends/candle/src/models/modernbert.rs
@@ -681,7 +681,11 @@ impl ModernBertModel {

let min_value = match self.dtype {
DType::F32 => f32::MIN as f64,
_ => -65504.0, // f16 minimum value
DType::BF16 => -3.3895314e38_f64,
DType::F16 => -65504.0_f64,
// SAFETY: Default to the F16 minimum finite value, even though `dtype` should
// always match one of the variants above
_ => -65504.0_f64,
};

let global_attention_mask = ((1.0 - global_attention_mask)? * min_value)?;
6 changes: 5 additions & 1 deletion backends/candle/src/models/mpnet.rs
@@ -495,7 +495,11 @@ impl MPNetModel {

let min_value = match self.dtype {
DType::F32 => f32::MIN as f64,
_ => -65504.0_f64, // f16 minumum value
DType::BF16 => -3.3895314e38_f64,
DType::F16 => -65504.0_f64,
// SAFETY: Default to the F16 minimum finite value, even though `dtype` should
// always match one of the variants above
_ => -65504.0_f64,
};

let extended_attention_mask = ((1.0 - extended_attention_mask)? * min_value)?;
6 changes: 5 additions & 1 deletion backends/candle/src/models/qwen3.rs
@@ -501,7 +501,11 @@ impl Qwen3Model {

let min_value = match self.dtype {
DType::F32 => f32::MIN,
_ => -65504.0, // f16 minimum value
DType::BF16 => -3.3895314e38_f32,
DType::F16 => -65504.0_f32,
// SAFETY: Default to the F16 minimum finite value, even though `dtype` should
// always match one of the variants above
_ => -65504.0_f32,
};

let negatives =
4 changes: 2 additions & 2 deletions backends/candle/tests/common.rs
@@ -290,14 +290,14 @@ pub fn load_tokenizer(model_root: &Path) -> Result<Tokenizer> {
m.set_prepend_scheme(PrependScheme::First);
tokenizer.with_pre_tokenizer(Some(PreTokenizerWrapper::Metaspace(m)));
} else if let PreTokenizerWrapper::Sequence(s) = pre_tokenizer {
let pre_tokenizers = s.get_pre_tokenizers();
let pre_tokenizers: Vec<_> = s.clone().into_iter().collect();
// Check if we have a Metaspace pre tokenizer in the sequence
let has_metaspace = pre_tokenizers
.iter()
.any(|t| matches!(t, PreTokenizerWrapper::Metaspace(_)));

if has_metaspace {
let mut new_pre_tokenizers = Vec::with_capacity(s.get_pre_tokenizers().len());
let mut new_pre_tokenizers = Vec::with_capacity(pre_tokenizers.len());

for pre_tokenizer in pre_tokenizers {
if let PreTokenizerWrapper::WhitespaceSplit(_) = pre_tokenizer {
36 changes: 31 additions & 5 deletions backends/src/dtype.rs
@@ -1,35 +1,61 @@
use std::fmt;
use std::{fmt, str::FromStr};

#[cfg(feature = "clap")]
use clap::ValueEnum;

#[derive(Debug, PartialEq)]
#[cfg_attr(feature = "clap", derive(Clone, ValueEnum))]
pub enum DType {
// Float16 is not available on accelerate
#[cfg(any(
feature = "python",
all(feature = "candle", not(feature = "accelerate"))
))]
Float16,
#[cfg(any(feature = "python", feature = "candle", feature = "ort"))]
Float32,
#[cfg(feature = "python")]
// NOTE: For CUDA, BF16 requires Ampere (SM 80) or newer, which is validated at runtime, as
// there are no specific features for the different CUDA compute capabilities to filter out
// Turing and Volta from having `DType::Bfloat16`.
// NOTE: At the moment only Intel HPU and Metal are supported, given that there are still a few
// missing pieces to update `candle` and `candle-extensions` w/ support for BF16 Flash Attn
#[cfg(any(feature = "python", all(feature = "candle", feature = "metal")))]
Bfloat16,
}

#[derive(Debug, PartialEq, Eq)]
pub struct DTypeParseError;

impl FromStr for DType {
type Err = DTypeParseError;

fn from_str(s: &str) -> Result<Self, Self::Err> {
let dtype = match s {
"float32" => DType::Float32,
#[cfg(any(
feature = "python",
all(feature = "candle", not(feature = "accelerate"))
))]
"float16" => DType::Float16,
#[cfg(any(feature = "python", all(feature = "candle", feature = "metal")))]
"bfloat16" => DType::Bfloat16,
_ => return Err(DTypeParseError),
};

Ok(dtype)
}
}

impl fmt::Display for DType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
// Float16 is not available on accelerate
#[cfg(any(
feature = "python",
all(feature = "candle", not(feature = "accelerate"))
))]
DType::Float16 => write!(f, "float16"),
#[cfg(any(feature = "python", feature = "candle", feature = "ort"))]
DType::Float32 => write!(f, "float32"),
#[cfg(feature = "python")]
#[cfg(any(feature = "python", all(feature = "candle", feature = "metal")))]
DType::Bfloat16 => write!(f, "bfloat16"),
}
}
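
The new `FromStr` impl lets the dtype strings accepted on the CLI be parsed directly. A usage sketch, assuming a build where the `Float16` and `Bfloat16` variants are compiled in (i.e. the feature gates above are satisfied):

```rust
// Illustrative only: parsing CLI dtype strings with the FromStr impl above.
fn parse_dtypes() {
    assert_eq!("float32".parse::<DType>(), Ok(DType::Float32));
    assert_eq!("float16".parse::<DType>(), Ok(DType::Float16));
    assert_eq!("bfloat16".parse::<DType>(), Ok(DType::Bfloat16));
    // Anything else maps to the dedicated error type.
    assert_eq!("int8".parse::<DType>(), Err(DTypeParseError));
    // Display round-trips the accepted spellings.
    assert_eq!(DType::Bfloat16.to_string(), "bfloat16");
}
```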
4 changes: 2 additions & 2 deletions router/Cargo.toml
@@ -91,6 +91,6 @@ candle = ["text-embeddings-backend/candle"]
candle-cuda = ["candle", "text-embeddings-backend/flash-attn", "dep:cudarc"]
candle-cuda-turing = ["candle", "text-embeddings-backend/flash-attn-v1", "dep:cudarc"]
candle-cuda-volta = ["candle", "text-embeddings-backend/cuda", "dep:cudarc"]
static-linking = ["cudarc?/static-linking", "intel-mkl-src?/mkl-static-lp64-iomp"]
dynamic-linking = ["cudarc?/dynamic-linking", "intel-mkl-src?/mkl-dynamic-lp64-iomp"]
static-linking = ["cudarc?/static-linking", "intel-mkl-src?/mkl-static-lp64-iomp", "text-embeddings-backend/static-linking"]
dynamic-linking = ["cudarc?/dynamic-linking", "intel-mkl-src?/mkl-dynamic-lp64-iomp", "text-embeddings-backend/dynamic-linking"]
google = []