@@ -101,7 +101,7 @@ model=BAAI/bge-large-en-v1.5
 revision=refs/pr/5
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model --revision $revision
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.5 --model-id $model --revision $revision
 ```
 
 And then you can make requests like
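The request example itself lies outside this hunk and is elided by the diff. As a sketch, an embedding request against the server started above (assuming it is listening on `127.0.0.1:8080` as per the `-p 8080:80` mapping) would look like:

```shell
curl 127.0.0.1:8080/embed \
    -X POST \
    -d '{"inputs":"What is Deep Learning?"}' \
    -H 'Content-Type: application/json'
```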
@@ -242,15 +242,15 @@ Options:
 
 Text Embeddings Inference ships with multiple Docker images that you can use to target a specific backend:
 
-| Architecture                        | Image                                                                       |
-| ----------------------------------- | --------------------------------------------------------------------------- |
-| CPU                                 | ghcr.io/huggingface/text-embeddings-inference:cpu-0.4.0                     |
-| Volta                               | NOT SUPPORTED                                                               |
-| Turing (T4, RTX 2000 series, ...)   | ghcr.io/huggingface/text-embeddings-inference:turing-0.4.0 (experimental)   |
-| Ampere 80 (A100, A30)               | ghcr.io/huggingface/text-embeddings-inference:0.4.0                         |
-| Ampere 86 (A10, A40, ...)           | ghcr.io/huggingface/text-embeddings-inference:86-0.4.0                      |
-| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-0.4.0                      |
-| Hopper (H100)                       | ghcr.io/huggingface/text-embeddings-inference:hopper-0.4.0 (experimental)   |
+| Architecture                        | Image                                                                     |
+| ----------------------------------- | ------------------------------------------------------------------------- |
+| CPU                                 | ghcr.io/huggingface/text-embeddings-inference:cpu-0.5                     |
+| Volta                               | NOT SUPPORTED                                                             |
+| Turing (T4, RTX 2000 series, ...)   | ghcr.io/huggingface/text-embeddings-inference:turing-0.5 (experimental)   |
+| Ampere 80 (A100, A30)               | ghcr.io/huggingface/text-embeddings-inference:0.5                         |
+| Ampere 86 (A10, A40, ...)           | ghcr.io/huggingface/text-embeddings-inference:86-0.5                      |
+| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-0.5                      |
+| Hopper (H100)                       | ghcr.io/huggingface/text-embeddings-inference:hopper-0.5 (experimental)   |
 
 **Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues.
 You can turn Flash Attention v1 ON by using the `USE_FLASH_ATTENTION=True` environment variable.
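For instance, combining the environment variable with the Turing image tag introduced in this diff (a sketch; the model name here is just an example, not part of this section):

```shell
model=BAAI/bge-large-en-v1.5
volume=$PWD/data

# USE_FLASH_ATTENTION=True re-enables Flash Attention v1 on the Turing image
docker run --gpus all -e USE_FLASH_ATTENTION=True -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:turing-0.5 --model-id $model
```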
@@ -279,7 +279,7 @@ model=<your private model>
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 token=<your cli READ token>
 
-docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model
+docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.5 --model-id $model
 ```
 
 ### Using Re-rankers models
@@ -297,7 +297,7 @@ model=BAAI/bge-reranker-large
 revision=refs/pr/4
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model --revision $revision
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.5 --model-id $model --revision $revision
 ```
 
 And then you can rank the similarity between a query and a list of passages with:
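That example is also elided by the diff. A sketch of a re-ranking request, assuming the `/rerank` endpoint of the TEI API and a server on port 8080 as above (the query and passages are illustrative):

```shell
curl 127.0.0.1:8080/rerank \
    -X POST \
    -d '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not fun.", "Deep learning is a subset of machine learning."]}' \
    -H 'Content-Type: application/json'
```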
@@ -317,7 +317,7 @@ You can also use classic Sequence Classification models like `SamLowe/roberta-ba
 model=SamLowe/roberta-base-go_emotions
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.5 --model-id $model
 ```
 
 Once you have deployed the model you can use the `predict` endpoint to get the emotions most associated with an input:
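The `predict` call itself falls outside this hunk. As a sketch, assuming a server on port 8080 as above (the input text is illustrative):

```shell
curl 127.0.0.1:8080/predict \
    -X POST \
    -d '{"inputs":"I like you."}' \
    -H 'Content-Type: application/json'
```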