diff --git a/docs/community/meetups.md b/docs/community/meetups.md
index e8b3a9c9c8e6..03a9a9edf66b 100644
--- a/docs/community/meetups.md
+++ b/docs/community/meetups.md
@@ -1,6 +1,6 @@
 # Meetups
 
-We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
+We host regular meetups in the San Francisco Bay Area every two months. We share project updates from the vLLM team and invite guest speakers from industry to share their experience and insights. Materials from our previous meetups are listed below:
 
 - [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing)
 - [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
diff --git a/docs/models/extensions/fastsafetensor.md b/docs/models/extensions/fastsafetensor.md
index 531d58690014..17d9356d94fc 100644
--- a/docs/models/extensions/fastsafetensor.md
+++ b/docs/models/extensions/fastsafetensor.md
@@ -2,4 +2,4 @@ Loading Model weights with fastsafetensors
 ===================================================================
 
 Using fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details.
-For enabling this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true``
+To enable this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true``.
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 19d0110ff371..5a5d73d9ba91 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -106,6 +106,13 @@
 from vllm.v1.metrics.prometheus import get_prometheus_registry
 from vllm.version import __version__ as VLLM_VERSION
 
+from fastapi import APIRouter, HTTPException, Request
+from uuid import uuid4
+from vllm.entrypoints.openai.protocol_anthropic import (
+    AnthropicMessagesRequest, AnthropicMessagesResponse)
+from vllm.entrypoints.openai.tool_parsers.utils import (
+    convert_messages_to_prompt)
+
 prometheus_multiproc_dir: tempfile.TemporaryDirectory
 
 # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
@@ -113,6 +120,43 @@
 _running_tasks: set[asyncio.Task] = set()
 
+
+router = APIRouter()
+
+@router.post("/v1/messages")
+async def anthropic_messages(request: Request):
+    body = await request.json()
+    # Validate required Anthropic headers before doing any work.
+    api_key = request.headers.get("x-api-key")
+    version = request.headers.get("anthropic-version")
+    if not api_key or not version:
+        raise HTTPException(status_code=400, detail="Missing required Anthropic headers.")
+
+    # Flatten the Anthropic-style message list into a plain prompt string.
+    prompt = convert_messages_to_prompt(body["messages"])
+
+    # Call existing vLLM generation logic.
+    llm_response = await vllm_generate(
+        model=body["model"],
+        prompt=prompt,
+        max_tokens=body.get("max_tokens", 1024)
+    )
+
+    # Return the response in Anthropic Messages format.
+    output = {
+        "id": f"msg_{uuid4().hex[:24]}",
+        "type": "message",
+        "role": "assistant",
+        "content": [{"type": "text", "text": llm_response["text"]}],
+        "model": body["model"],
+        "stop_reason": llm_response.get("stop_reason", "end_turn"),
"stop_sequence": None, + "usage": { + "input_tokens": llm_response["prompt_tokens"], + "output_tokens": llm_response["completion_tokens"], + } + } + return output + @asynccontextmanager async def lifespan(app: FastAPI): diff --git a/vllm/entrypoints/openai/protocol_anthropic.py b/vllm/entrypoints/openai/protocol_anthropic.py new file mode 100644 index 000000000000..5520d860c27c --- /dev/null +++ b/vllm/entrypoints/openai/protocol_anthropic.py @@ -0,0 +1,23 @@ +from pydantic import BaseModel +from typing import List, Dict, Any, Optional + +class AnthropicMessageBlock(BaseModel): + role: str # "user" | "assistant" + content: Any + +class AnthropicMessagesRequest(BaseModel): + model: str + messages: List[AnthropicMessageBlock] + max_tokens: int + system: Optional[str] = None + # Add further optional fields per API docs + +class AnthropicMessagesResponse(BaseModel): + id: str + type: str = "message" + role: str = "assistant" + content: List[Dict[str, Any]] + model: str + stop_reason: Optional[str] + stop_sequence: Optional[str] + usage: Dict[str, int] diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py index aa41cd6dc53e..edc5810e91d2 100644 --- a/vllm/entrypoints/openai/tool_parsers/utils.py +++ b/vllm/entrypoints/openai/tool_parsers/utils.py @@ -122,3 +122,14 @@ def consume_space(i: int, s: str) -> int: while i < len(s) and s[i].isspace(): i += 1 return i + +def convert_messages_to_prompt(messages): + # Converts an Anthropic-style conversation to a plain prompt string. + prompt = "" + for msg in messages: + if msg["role"] == "user": + prompt += f"Human: {msg['content']}\n" + elif msg["role"] == "assistant": + prompt += f"Assistant: {msg['content']}\n" + return prompt + diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 2bcd61d1f0aa..503d69430175 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import time import asyncio from collections.abc import Iterable from dataclasses import dataclass @@ -10,6 +11,7 @@ from vllm.outputs import (CompletionOutput, PoolingOutput, PoolingRequestOutput, RequestOutput) +from vllm.sequence import RequestMetrics from vllm.sampling_params import RequestOutputKind from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import TokenizerGroup @@ -410,6 +412,14 @@ def process_outputs( if request_output := req_state.make_request_output( new_token_ids, pooling_output, finish_reason, stop_reason, kv_transfer_params, num_cached_tokens): + request_output.metrics = RequestMetrics( + arrival_time=req_state.stats.arrival_time, + last_token_time=req_state.stats.last_token_ts, + first_scheduled_time=req_state.stats.scheduled_ts, + first_token_time=req_state.stats.first_token_ts, + time_in_queue=req_state.stats.scheduled_ts - req_state.stats.arrival_time, + finished_time=time.monotonic() + ) if req_state.queue is not None: # AsyncLLM: put into queue for handling by generate(). 
                     req_state.queue.put(request_output)
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 7af4ed54a220..549b3574bfac 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -247,7 +247,7 @@ def process_inputs(
                 f"is out of range [0, {data_parallel_size}).")
 
         if arrival_time is None:
-            arrival_time = time.time()
+            arrival_time = time.monotonic()
 
         # Process inputs, which includes:
         # 1. Tokenize text prompt, with LoRA request if one exists.
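
For reviewers, a minimal sketch of how a client could exercise the `/v1/messages` route added in `api_server.py` above. It assumes the new `router` is actually registered on the FastAPI app (for example via `app.include_router(router)`, which this diff does not show) and that a server is listening on `localhost:8000`; the model name and API key are placeholders, while the header and field names mirror the handler.

```python
# Sketch only: assumes a local vLLM server with the /v1/messages route enabled.
import json
import urllib.request

payload = {
    "model": "my-model",  # placeholder; use the model the server was started with
    "max_tokens": 128,
    "messages": [{"role": "user", "content": "Hello, who are you?"}],
}

req = urllib.request.Request(
    "http://localhost:8000/v1/messages",
    data=json.dumps(payload).encode("utf-8"),
    headers={
        # Both headers are required by the handler above; omitting either
        # one returns HTTP 400.
        "x-api-key": "dummy-key",
        "anthropic-version": "2023-06-01",
        "Content-Type": "application/json",
    },
    method="POST",
)

with urllib.request.urlopen(req) as resp:
    body = json.load(resp)

print(body["content"][0]["text"])
print(body["usage"])  # {"input_tokens": ..., "output_tokens": ...}
```

The response mirrors the `AnthropicMessagesResponse` schema: a `msg_`-prefixed id, a single text content block, a stop reason, and token usage counts.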
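
And a small sketch of how the new schema and prompt helper fit together, assuming the modules land at the import paths used in this diff:

```python
# Sketch only: the import paths below assume the files added in this diff.
from vllm.entrypoints.openai.protocol_anthropic import AnthropicMessagesRequest
from vllm.entrypoints.openai.tool_parsers.utils import convert_messages_to_prompt

messages = [
    {"role": "user", "content": "What is vLLM?"},
    {"role": "assistant", "content": "A fast LLM inference engine."},
    {"role": "user", "content": "How do I install it?"},
]

# The Pydantic model validates the request shape (model/messages/max_tokens)...
request = AnthropicMessagesRequest(model="my-model", max_tokens=64,
                                   messages=messages)

# ...while the helper flattens the raw message dicts into the prompt string
# that the /v1/messages handler passes to the generation path.
print(convert_messages_to_prompt(messages))
# Human: What is vLLM?
# Assistant: A fast LLM inference engine.
# Human: How do I install it?
```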