
Commit 1cd6eab

Support encoder-only models without KV-Cache (#21270)

Signed-off-by: Max de Bayser <[email protected]>
Co-authored-by: Russell Bryant <[email protected]>

1 parent f27fdfc commit 1cd6eab

17 files changed: +352 −99 lines

examples/offline_inference/prithvi_geospatial_mae.py
Lines changed: 1 addition & 1 deletion

@@ -3,12 +3,12 @@
 import argparse
 import datetime
 import os
-import re
 from typing import Union
 
 import albumentations
 import numpy as np
 import rasterio
+import regex as re
 import torch
 from einops import rearrange
 from terratorch.datamodules import Sen1Floods11NonGeoDataModule
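Note: for the patterns used in this file, the third-party regex package is a drop-in replacement for the stdlib re module, so aliasing it as re leaves all call sites unchanged. A minimal sketch of the equivalence (the pattern below is illustrative, not from this file):

    import regex as re

    # Same API surface as the stdlib module for common operations.
    match = re.search(r"(\d+)", "epoch 42")
    assert match is not None and match.group(1) == "42"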

tests/conftest.py
Lines changed: 11 additions & 2 deletions

@@ -1062,8 +1062,17 @@ def score(
         return [req_output.outputs.score for req_output in req_outputs]
 
     def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
-        executor = self.llm.llm_engine.model_executor
-        return executor.apply_model(func)
+        if hasattr(self.llm.llm_engine, "model_executor"):
+            # This works either in V0 or in V1 with
+            # VLLM_ENABLE_V1_MULTIPROCESSING=0
+            executor = self.llm.llm_engine.model_executor
+            return executor.apply_model(func)
+
+        # This works in V1 with VLLM_ALLOW_INSECURE_SERIALIZATION=1
+        def _apply_model(self):
+            return func(self.get_model())
+
+        return self.llm.llm_engine.collective_rpc(_apply_model)
 
     def __enter__(self):
         return self
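When V1 runs with engine multiprocessing (the default), the in-process engine object has no model_executor attribute, so the callable is shipped to the worker processes via collective_rpc; serializing an arbitrary callable is exactly what VLLM_ALLOW_INSECURE_SERIALIZATION=1 gates. A hedged usage sketch inside a pytest test (model name and the lambda are illustrative, not from this commit):

    def test_module_count(vllm_runner, monkeypatch):
        # Required so apply_model can serialize the callable for the workers.
        monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
        with vllm_runner("BAAI/bge-base-en-v1.5", dtype="float16") as m:
            # apply_model returns one result per worker, hence a list.
            counts = m.apply_model(
                lambda model: sum(1 for _ in model.named_modules()))
            assert all(c > 0 for c in counts)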

tests/model_executor/test_model_load_with_params.py
Lines changed: 9 additions & 3 deletions

@@ -22,10 +22,12 @@
 
 @pytest.mark.skipif(current_platform.is_rocm(),
                     reason="Xformers backend is not supported on ROCm.")
-def test_model_loading_with_params(vllm_runner):
+def test_model_loading_with_params(vllm_runner, monkeypatch):
     """
     Test parameter weight loading with tp>1.
     """
+    # to use apply_model
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
     with vllm_runner(model_name=MODEL_NAME,
                      revision=REVISION,
                      dtype="float16",
@@ -61,10 +63,12 @@ def check_model(model):
 
 @pytest.mark.skipif(current_platform.is_rocm(),
                     reason="Xformers backend is not supported on ROCm.")
-def test_roberta_model_loading_with_params(vllm_runner):
+def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
     """
     Test parameter weight loading with tp>1.
     """
+    # to use apply_model
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
     with vllm_runner(model_name=MODEL_NAME_ROBERTA,
                      revision=REVISION_ROBERTA,
                      dtype="float16",
@@ -101,10 +105,12 @@ def check_model(model):
 
 @pytest.mark.skipif(current_platform.is_rocm(),
                     reason="Xformers backend is not supported on ROCm.")
-def test_facebook_roberta_model_loading_with_params(vllm_runner):
+def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch):
     """
     Test loading roberta-base model with no lm_head.
     """
+    # to use apply_model
+    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
     model_name = "FacebookAI/roberta-base"
     with vllm_runner(model_name=model_name,
                      dtype="float16",
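The hunk headers show each of these tests defines a check_model callback; with the env var set, apply_model can ship it to the workers and run assertions against the nn.Module that was actually loaded. A minimal sketch of the pattern (the callback body is illustrative, not this file's exact checks):

    def check_model(model):
        # Runs on each worker against the loaded torch module.
        assert model is not None
        assert hasattr(model, "load_weights")

    vllm_model.apply_model(check_model)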

tests/models/language/pooling/test_embedding.py
Lines changed: 3 additions & 11 deletions

@@ -39,17 +39,9 @@ def v1(run_with_both_engines):
     pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
                  marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
     # [Encoder-only]
-    pytest.param(
-        "BAAI/bge-base-en-v1.5",
-        marks=[
-            # CPU only supports V1
-            pytest.mark.core_model,
-            pytest.mark.skip_v1
-        ]),
-    pytest.param("sentence-transformers/all-MiniLM-L12-v2",
-                 marks=[pytest.mark.skip_v1]),
-    pytest.param("intfloat/multilingual-e5-small",
-                 marks=[pytest.mark.skip_v1]),
+    pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
+    pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
+    pytest.param("intfloat/multilingual-e5-small"),
     pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
                  marks=[pytest.mark.skip_v1]),
     # [Cross-Encoder]

tests/models/language/pooling/test_jina.py
Lines changed: 8 additions & 0 deletions

@@ -23,6 +23,14 @@
 ]
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 @pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
 def test_embed_models_mteb(hf_runner, vllm_runner,
                            model_info: EmbedModelInfo) -> None:
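run_with_both_engines is a shared fixture from the test suite. A plausible sketch of what it does, not vLLM's exact conftest code: parametrize each test over the V0 and V1 engines via the VLLM_USE_V1 environment variable, honoring the skip_v0/skip_v1 marks seen in the file above:

    import pytest

    @pytest.fixture(params=[False, True], ids=["v0", "v1"])
    def run_with_both_engines(request, monkeypatch):
        use_v1 = request.param
        skip_mark = "skip_v1" if use_v1 else "skip_v0"
        if request.node.get_closest_marker(skip_mark):
            pytest.skip(f"marked {skip_mark}")
        monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
        yield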

tests/v1/attention/utils.py
Lines changed: 1 addition & 0 deletions

@@ -93,6 +93,7 @@ def create_common_attn_metadata(
         max_query_len=max_query_len,
         block_table_tensor=block_table_tensor,
         slot_mapping=slot_mapping,
+        causal=True,
     )
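The new causal field distinguishes decoder-style attention from the bidirectional attention that encoder-only models need; the test helper pins it to True so existing decoder-attention tests keep their semantics. An illustrative sketch of what the flag selects, assuming the usual additive-bias formulation rather than vLLM's internal kernels:

    import torch

    def attn_bias(seq_len: int, causal: bool) -> torch.Tensor:
        if not causal:
            # Encoder-only: every token attends to every token.
            return torch.zeros(seq_len, seq_len)
        # Decoder: token i attends only to positions <= i.
        return torch.triu(
            torch.full((seq_len, seq_len), float("-inf")), diagonal=1)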

tests/v1/test_oracle.py
Lines changed: 0 additions & 1 deletion

@@ -13,7 +13,6 @@
     "openai/whisper-large-v3",  # transcription
     "facebook/bart-large-cnn",  # encoder decoder
     "state-spaces/mamba-130m-hf",  # mamba1
-    "BAAI/bge-m3",  # embedding
 ]
 
 MODEL = "meta-llama/Llama-3.2-1B-Instruct"

tests/v1/test_utils.py
Lines changed: 1 addition & 2 deletions

@@ -1,9 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import re
-
 import pytest
+import regex as re
 import requests
 import torch

vllm/engine/arg_utils.py
Lines changed: 2 additions & 1 deletion

@@ -1649,7 +1649,8 @@ def _set_default_args_v1(self, usage_context: UsageContext,
 
         if (self.max_num_seqs is None
                 and usage_context in default_max_num_seqs):
-            self.max_num_seqs = default_max_num_seqs[usage_context]
+            self.max_num_seqs = min(default_max_num_seqs[usage_context],
+                                    self.max_num_batched_tokens or sys.maxsize)
 
         logger.debug("Setting max_num_seqs to %d for %s usage context.",
                      self.max_num_seqs, use_context_value)
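Worked example of the new default: each running sequence contributes at least one token to a batch, so a max_num_seqs above max_num_batched_tokens could never be reached; the `or sys.maxsize` keeps the old behavior when no token budget is set. A small standalone sketch of the same expression (names are local to the example):

    import sys

    def resolve_max_num_seqs(default_seqs, max_num_batched_tokens):
        return min(default_seqs, max_num_batched_tokens or sys.maxsize)

    assert resolve_max_num_seqs(1024, 512) == 512    # clamped by token budget
    assert resolve_max_num_seqs(1024, None) == 1024  # budget unset: keep default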

vllm/model_executor/models/bert.py
Lines changed: 7 additions & 11 deletions

@@ -12,7 +12,6 @@
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, PoolerConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
@@ -60,7 +59,6 @@ def __init__(self, config: BertConfig):
     def forward(
         self,
         input_ids: torch.Tensor,
-        seq_lens: torch.Tensor,
         position_ids: torch.Tensor,
         token_type_ids: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
@@ -119,7 +117,6 @@ def forward(
         return pooled_output
 
 
-@support_torch_compile
 class BertEncoder(nn.Module):
 
     def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
@@ -337,6 +334,7 @@ def forward(self, hidden_states: torch.Tensor,
         return hidden_states
 
 
+@support_torch_compile
 class BertModel(nn.Module, SupportsQuant):
 
     is_pooling_model = True
@@ -368,13 +366,9 @@ def forward(
         if inputs_embeds is not None:
             hidden_states = inputs_embeds
         else:
-            attn_metadata = get_forward_context().attn_metadata
-            assert hasattr(attn_metadata, "seq_lens_tensor")
-            hidden_states = self.embeddings(
-                input_ids=input_ids,
-                seq_lens=attn_metadata.seq_lens_tensor,
-                position_ids=position_ids,
-                token_type_ids=token_type_ids)
+            hidden_states = self.embeddings(input_ids=input_ids,
+                                            position_ids=position_ids,
+                                            token_type_ids=token_type_ids)
         return self.encoder(hidden_states)
 
     def _load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
@@ -447,7 +441,7 @@ def load_weights(self, weights: Iterable[tuple[str,
         return loaded_params
 
 
-class BertEmbeddingModel(nn.Module, SupportsV0Only, SupportsQuant):
+class BertEmbeddingModel(nn.Module, SupportsQuant):
     """A model that uses Bert to provide embedding functionalities.
 
     This class encapsulates the BertModel and provides an interface for
@@ -474,11 +468,13 @@ def forward(
         self,
         input_ids: Optional[torch.Tensor],
         positions: torch.Tensor,
+        token_type_ids: Optional[torch.Tensor] = None,
         intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         return self.model(input_ids=input_ids,
                           position_ids=positions,
+                          token_type_ids=token_type_ids,
                           inputs_embeds=inputs_embeds,
                           intermediate_tensors=intermediate_tensors)