From 48d9510543aa90ebbe6679995b3583121a49040f Mon Sep 17 00:00:00 2001
From: Tarun Paparaju
Date: Tue, 15 Jul 2025 16:55:21 -0700
Subject: [PATCH 01/13] Update meetups.md

---
 docs/community/meetups.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/community/meetups.md b/docs/community/meetups.md
index e8b3a9c9c8e6..8fab11b16f14 100644
--- a/docs/community/meetups.md
+++ b/docs/community/meetups.md
@@ -1,6 +1,6 @@
 # Meetups
 
-We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
+We host regular meetups in the San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
 
 - [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing)
 - [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
@@ -17,4 +17,4 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share
 - [The second vLLM meetup](https://lu.ma/ygxbpzhl), with IBM Research, January 31st 2024. [[Slides]](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing) [[Video (vLLM Update)]](https://youtu.be/Y0C-DUvEnZQ) [[Video (IBM Research & torch.compile)]](https://youtu.be/m0dMtFLI-dg)
 - [The first vLLM meetup](https://lu.ma/first-vllm-meetup), with a16z, October 5th 2023. [[Slides]](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing)
 
-We are always looking for speakers and sponsors at San Francisco Bay Area and potentially other locations. If you are interested in speaking or sponsoring, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu).
+We are always looking for speakers and sponsors in the San Francisco Bay Area and potentially other locations. If you are interested in speaking or sponsoring, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu).

From aa5e71797cc9966a3d24f6963f88713736f3688e Mon Sep 17 00:00:00 2001
From: Tarun Paparaju
Date: Tue, 15 Jul 2025 16:56:17 -0700
Subject: [PATCH 02/13] Update fastsafetensor.md

---
 docs/models/extensions/fastsafetensor.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/models/extensions/fastsafetensor.md b/docs/models/extensions/fastsafetensor.md
index 531d58690014..17d9356d94fc 100644
--- a/docs/models/extensions/fastsafetensor.md
+++ b/docs/models/extensions/fastsafetensor.md
@@ -2,4 +2,4 @@ Loading Model weights with fastsafetensors
 ===================================================================
 Using fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details.
 
-For enabling this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true``
+To enable this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true``

From 850bfced09079efed1e6034ed2b1b0d4aa02d87c Mon Sep 17 00:00:00 2001
From: Tarun Paparaju
Date: Tue, 15 Jul 2025 18:34:51 -0700
Subject: [PATCH 03/13] Update output_processor.py

---
 vllm/v1/engine/output_processor.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 2bcd61d1f0aa..503d69430175 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
+import time
 from collections.abc import Iterable
 from dataclasses import dataclass
@@ -10,6 +11,7 @@
 from vllm.outputs import (CompletionOutput, PoolingOutput,
                           PoolingRequestOutput, RequestOutput)
 from vllm.sampling_params import RequestOutputKind
+from vllm.sequence import RequestMetrics
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
@@ -410,6 +412,14 @@ def process_outputs(
         if request_output := req_state.make_request_output(
                 new_token_ids, pooling_output, finish_reason, stop_reason,
                 kv_transfer_params, num_cached_tokens):
+            request_output.metrics = RequestMetrics(
+                arrival_time=req_state.stats.arrival_time,
+                last_token_time=req_state.stats.last_token_ts,
+                first_scheduled_time=req_state.stats.scheduled_ts,
+                first_token_time=req_state.stats.first_token_ts,
+                time_in_queue=req_state.stats.scheduled_ts - req_state.stats.arrival_time,
+                finished_time=time.monotonic(),
+            )
             if req_state.queue is not None:
                 # AsyncLLM: put into queue for handling by generate().
                 req_state.queue.put(request_output)

From f6e5ebd09cd7b6a16e41bb38e472a5e019d20519 Mon Sep 17 00:00:00 2001
From: Tarun Paparaju
Date: Tue, 15 Jul 2025 18:35:18 -0700
Subject: [PATCH 04/13] Update processor.py

---
 vllm/v1/engine/processor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 7af4ed54a220..549b3574bfac 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -247,7 +247,7 @@ def process_inputs(
                 f"is out of range [0, {data_parallel_size}).")
 
         if arrival_time is None:
-            arrival_time = time.time()
+            arrival_time = time.monotonic()
 
         # Process inputs, which includes:
         # 1. Tokenize text prompt, with LoRA request if one exists.
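Why patches 03 and 04 switch to time.monotonic(): time.time() follows the wall clock, which can be stepped backward or forward (for example by NTP), so deltas such as time_in_queue can come out negative; time.monotonic() is guaranteed never to go backwards. The two timestamps in a subtraction must also come from the same clock, which is why patch 04 aligns process_inputs() with the monotonic timestamps that the RequestMetrics fields in patch 03 consume. A minimal sketch of that interval arithmetic (the variable names are illustrative, not the engine's own):

    import time

    arrival_time = time.monotonic()   # request enters the engine
    time.sleep(0.01)                  # stand-in for time spent waiting in the queue
    scheduled_ts = time.monotonic()   # request is first scheduled

    # Meaningful only because both timestamps come from the same monotonic
    # clock; mixing time.time() and time.monotonic() would yield garbage.
    time_in_queue = scheduled_ts - arrival_time
    print(f"time_in_queue = {time_in_queue:.4f} s")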
From 183115f58c52d6416782455ebde2d4e6489f9f47 Mon Sep 17 00:00:00 2001
From: Tarun Paparaju
Date: Mon, 21 Jul 2025 17:36:35 -0700
Subject: [PATCH 05/13] Update api_server.py

---
 vllm/entrypoints/openai/api_server.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 19d0110ff371..7ec6f32c055b 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -106,6 +106,10 @@
 from vllm.v1.metrics.prometheus import get_prometheus_registry
 from vllm.version import __version__ as VLLM_VERSION
 
+from fastapi import APIRouter, HTTPException, Request
+from uuid import uuid4
+from .schemas_anthropic import AnthropicMessagesRequest, AnthropicMessagesResponse
+
 prometheus_multiproc_dir: tempfile.TemporaryDirectory
 
 # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)

From b4ea8e668e887269dfa5d4d85c616680ea4b10f3 Mon Sep 17 00:00:00 2001
From: Tarun Paparaju
Date: Mon, 21 Jul 2025 17:38:47 -0700
Subject: [PATCH 06/13] Update api_server.py

---
 vllm/entrypoints/openai/api_server.py | 37 +++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 7ec6f32c055b..5a5d73d9ba91 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -117,6 +117,43 @@
 
 _running_tasks: set[asyncio.Task] = set()
 
+router = APIRouter()
+
+@router.post("/v1/messages")
+async def anthropic_messages(request: Request):
+    body = await request.json()
+    # Validate Anthropic headers and fields
+    api_key = request.headers.get("x-api-key")
+    version = request.headers.get("anthropic-version")
+    if not api_key or not version:
+        raise HTTPException(status_code=400, detail="Missing required Anthropic headers.")
+
+    # Convert messages to prompt
+    prompt = convert_messages_to_prompt(body["messages"])
+
+    # Call existing vLLM generation logic
+    llm_response = await vllm_generate(
+        model=body["model"],
+        prompt=prompt,
+        max_tokens=body.get("max_tokens", 1024)
+    )
+
+    # Return response in Anthropic format
+    output = {
+        "id": f"msg_{uuid4().hex[:24]}",
+        "type": "message",
+        "role": "assistant",
+        "content": [{"type": "text", "text": llm_response["text"]}],
+        "model": body["model"],
+        "stop_reason": llm_response.get("stop_reason", "end_turn"),
+        "stop_sequence": None,
+        "usage": {
+            "input_tokens": llm_response["prompt_tokens"],
+            "output_tokens": llm_response["completion_tokens"],
+        }
+    }
+    return output
+
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):

From 905d958bdb6950305f48402c86c9409033fa7435 Mon Sep 17 00:00:00 2001
From: Tarun Paparaju
Date: Mon, 21 Jul 2025 17:39:56 -0700
Subject: [PATCH 07/13] Create schemas_anthropic.py

---
 vllm/entrypoints/openai/schemas_anthropic.py | 23 ++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 vllm/entrypoints/openai/schemas_anthropic.py

diff --git a/vllm/entrypoints/openai/schemas_anthropic.py b/vllm/entrypoints/openai/schemas_anthropic.py
new file mode 100644
index 000000000000..5520d860c27c
--- /dev/null
+++ b/vllm/entrypoints/openai/schemas_anthropic.py
@@ -0,0 +1,23 @@
+from pydantic import BaseModel
+from typing import Any, Dict, List, Optional
+
+class AnthropicMessageBlock(BaseModel):
+    role: str  # "user" | "assistant"
+    content: Any
+
+class AnthropicMessagesRequest(BaseModel):
+    model: str
+    messages: List[AnthropicMessageBlock]
+    max_tokens: int
+    system: Optional[str] = None
+    # Add further optional fields per API docs
+
+class AnthropicMessagesResponse(BaseModel):
+    id: str
+    type: str = "message"
+    role: str = "assistant"
+    content: List[Dict[str, Any]]
+    model: str
+    stop_reason: Optional[str] = None
+    stop_sequence: Optional[str] = None
+    usage: Dict[str, int]

From ce891c414c67cc37a1ee1383d35452537537c4ae Mon Sep 17 00:00:00 2001
From: Tarun Paparaju
Date: Mon, 21 Jul 2025 17:41:15 -0700
Subject: [PATCH 08/13] Update api_server.py

---
 vllm/entrypoints/openai/api_server.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 5a5d73d9ba91..57ffe5768e60 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -155,6 +155,17 @@ async def anthropic_messages(request: Request):
     return output
 
+
+def convert_messages_to_prompt(messages):
+    # Converts an Anthropic-style conversation to a plain prompt string.
+    prompt = ""
+    for msg in messages:
+        if msg["role"] == "user":
+            prompt += f"Human: {msg['content']}\n"
+        elif msg["role"] == "assistant":
+            prompt += f"Assistant: {msg['content']}\n"
+    return prompt
+
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     try:

From 0189bed140327dd6541049edb9ca65fec7b47f33 Mon Sep 17 00:00:00 2001
From: Tarun Paparaju
Date: Mon, 21 Jul 2025 17:42:18 -0700
Subject: [PATCH 09/13] Update utils.py

---
 vllm/entrypoints/openai/tool_parsers/utils.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py
index aa41cd6dc53e..edc5810e91d2 100644
--- a/vllm/entrypoints/openai/tool_parsers/utils.py
+++ b/vllm/entrypoints/openai/tool_parsers/utils.py
@@ -122,3 +122,14 @@ def consume_space(i: int, s: str) -> int:
     while i < len(s) and s[i].isspace():
         i += 1
     return i
+
+def convert_messages_to_prompt(messages):
+    # Converts an Anthropic-style conversation to a plain prompt string.
+    prompt = ""
+    for msg in messages:
+        if msg["role"] == "user":
+            prompt += f"Human: {msg['content']}\n"
+        elif msg["role"] == "assistant":
+            prompt += f"Assistant: {msg['content']}\n"
+    return prompt
+

From f527b5853218ce2523a5cc567a979da71cbc0041 Mon Sep 17 00:00:00 2001
From: Tarun Paparaju
Date: Mon, 21 Jul 2025 17:43:05 -0700
Subject: [PATCH 10/13] Update api_server.py

---
 vllm/entrypoints/openai/api_server.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 57ffe5768e60..5a5d73d9ba91 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -155,17 +155,6 @@ async def anthropic_messages(request: Request):
     return output
 
-
-def convert_messages_to_prompt(messages):
-    # Converts an Anthropic-style conversation to a plain prompt string.
- prompt = "" - for msg in messages: - if msg["role"] == "user": - prompt += f"Human: {msg['content']}\n" - elif msg["role"] == "assistant": - prompt += f"Assistant: {msg['content']}\n" - return prompt - - @asynccontextmanager async def lifespan(app: FastAPI): try: From 56433bfbf4d7522429e7fc16989326097bf30108 Mon Sep 17 00:00:00 2001 From: Tarun Paparaju Date: Mon, 21 Jul 2025 17:44:39 -0700 Subject: [PATCH 11/13] Update api_server.py From a28bff219e2e490243f26d62a4cc442821ee2b83 Mon Sep 17 00:00:00 2001 From: Tarun Paparaju Date: Wed, 30 Jul 2025 18:08:07 -0700 Subject: [PATCH 12/13] Rename schemas_anthropic.py to protocol_anthropic.py --- .../openai/{schemas_anthropic.py => protocol_anthropic.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename vllm/entrypoints/openai/{schemas_anthropic.py => protocol_anthropic.py} (100%) diff --git a/vllm/entrypoints/openai/schemas_anthropic.py b/vllm/entrypoints/openai/protocol_anthropic.py similarity index 100% rename from vllm/entrypoints/openai/schemas_anthropic.py rename to vllm/entrypoints/openai/protocol_anthropic.py From 057e7f66079c3cedba899e230ba537c8848a7cc9 Mon Sep 17 00:00:00 2001 From: Tarun Paparaju Date: Wed, 30 Jul 2025 18:09:21 -0700 Subject: [PATCH 13/13] Update meetups.md --- docs/community/meetups.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/community/meetups.md b/docs/community/meetups.md index 8fab11b16f14..03a9a9edf66b 100644 --- a/docs/community/meetups.md +++ b/docs/community/meetups.md @@ -1,6 +1,6 @@ # Meetups -We host regular meetups in the San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +We host regular meetups at San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: - [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing) - [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). @@ -17,4 +17,4 @@ We host regular meetups in the San Francisco Bay Area every 2 months. We will sh - [The second vLLM meetup](https://lu.ma/ygxbpzhl), with IBM Research, January 31st 2024. [[Slides]](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing) [[Video (vLLM Update)]](https://youtu.be/Y0C-DUvEnZQ) [[Video (IBM Research & torch.compile)]](https://youtu.be/m0dMtFLI-dg) - [The first vLLM meetup](https://lu.ma/first-vllm-meetup), with a16z, October 5th 2023. [[Slides]](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing) -We are always looking for speakers and sponsors in the San Francisco Bay Area and potentially other locations. If you are interested in speaking or sponsoring, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu). +We are always looking for speakers and sponsors at San Francisco Bay Area and potentially other locations. 
If you are interested in speaking or sponsoring, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu).
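For reference, a minimal client-side sketch of how the /v1/messages route added in patches 05-10 could be exercised. Two caveats from the series as posted: the hunks shown never call app.include_router(router), and the vllm_generate helper the handler awaits is not defined in any patch, so both would need to be wired up first. Every value below (host, port, model name, API key) is a placeholder, not something taken from the series:

    import json
    import urllib.request

    req = urllib.request.Request(
        "http://localhost:8000/v1/messages",   # placeholder host/port
        data=json.dumps({
            "model": "placeholder-model",      # placeholder model name
            "max_tokens": 128,
            "messages": [{"role": "user", "content": "Hello!"}],
        }).encode("utf-8"),
        headers={
            "content-type": "application/json",
            # The handler in patch 06 returns 400 unless both of these
            # headers are present; their values are not otherwise checked.
            "x-api-key": "placeholder-key",
            "anthropic-version": "2023-06-01",
        },
    )
    with urllib.request.urlopen(req) as resp:
        reply = json.load(resp)
    print(reply["content"][0]["text"])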