
Add anthropic endpoint #21341


Status: Open. Wants to merge 13 commits into base: main.
2 changes: 1 addition & 1 deletion docs/community/meetups.md
Member comment: Doc changes persist.

```diff
@@ -1,6 +1,6 @@
 # Meetups
 
-We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
+We host regular meetups at San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
 
 - [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing)
 - [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
```
2 changes: 1 addition & 1 deletion docs/models/extensions/fastsafetensor.md
Member comment: Doc changes persist.

```diff
@@ -2,4 +2,4 @@ Loading Model weights with fastsafetensors
 ===================================================================
 
 Using fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details.
-For enabling this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true``
+To enable this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true``
```
41 changes: 41 additions & 0 deletions vllm/entrypoints/openai/api_server.py
```diff
@@ -106,13 +106,54 @@
 from vllm.v1.metrics.prometheus import get_prometheus_registry
 from vllm.version import __version__ as VLLM_VERSION
 
+from fastapi import APIRouter, Request, HTTPException
+from uuid import uuid4
+from .schemas import AnthropicMessagesRequest, AnthropicMessagesResponse
```
Contributor comment (critical): The import path for the Anthropic schemas appears to be incorrect. The new schemas are defined in `vllm/entrypoints/openai/schemas_anthropic.py`, but you are importing from `.schemas`. This will likely cause an `ImportError` at runtime.

Suggested change:

```diff
-from .schemas import AnthropicMessagesRequest, AnthropicMessagesResponse
+from .schemas_anthropic import AnthropicMessagesRequest, AnthropicMessagesResponse
```


```diff
 prometheus_multiproc_dir: tempfile.TemporaryDirectory
 
 # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
 logger = init_logger('vllm.entrypoints.openai.api_server')
 
 _running_tasks: set[asyncio.Task] = set()
 
 router = APIRouter()
 
+@router.post("/v1/messages")
+async def anthropic_messages(request: Request):
+    body = await request.json()
+    # Validate Anthropic headers and fields
+    api_key = request.headers.get("x-api-key")
+    version = request.headers.get("anthropic-version")
+    if not api_key or not version:
+        raise HTTPException(status_code=400, detail="Missing required Anthropic headers.")
+
+    # Convert messages to prompt
+    prompt = convert_messages_to_prompt(body["messages"])
+
+    # Call existing vLLM generation logic
+    llm_response = await vllm_generate(
+        model=body["model"],
+        prompt=prompt,
+        max_tokens=body.get("max_tokens", 1024)
+    )
+
+    # Return response in Anthropic format
+    output = {
+        "id": f"msg_{uuid4().hex[:24]}",
+        "type": "message",
+        "role": "assistant",
+        "content": [{"type": "text", "text": llm_response["text"]}],
+        "model": body["model"],
+        "stop_reason": llm_response.get("stop_reason", "end_turn"),
+        "stop_sequence": None,
+        "usage": {
+            "input_tokens": llm_response["prompt_tokens"],
+            "output_tokens": llm_response["completion_tokens"],
+        }
+    }
+    return output
```

CI (GitHub Actions / pre-commit) failures reported on this hunk:

- Line 132: Name "convert_messages_to_prompt" is not defined [name-defined]
- Line 133 (Ruff F821): undefined name `convert_messages_to_prompt`
- Line 135: Name "vllm_generate" is not defined [name-defined]
- Line 136 (Ruff F821): undefined name `vllm_generate`
Contributor comment on lines +122 to +155 (critical):

This endpoint implementation has several critical issues that will prevent it from working correctly:

1. Undefined Functions: `convert_messages_to_prompt` and `vllm_generate` are not defined or imported in this file, which will cause a `NameError` at runtime.
2. Unsafe Request Handling: The request body is parsed manually with `request.json()`, and dictionary keys are accessed directly (e.g., `body["messages"]`). This is unsafe and can lead to `KeyError` exceptions and 500 errors if the request is malformed. You should leverage FastAPI's Pydantic integration for automatic request validation and parsing.
3. Not Reusing Existing Logic: The PR description mentions reusing existing logic, but the call to the undefined `vllm_generate` function doesn't do that. The server already has a robust completion generation pipeline that should be used here.

I suggest rewriting this function to address these points by using the `AnthropicMessagesRequest` Pydantic model for validation and calling the existing completion handler. This will make the implementation robust and consistent with the rest of the API server.

You will need to add the following imports at the top of the file:

```python
from vllm.entrypoints.openai.protocol import (CompletionRequest,
                                              CompletionResponse, ErrorResponse)
from vllm.entrypoints.openai.tool_parsers.utils import (
    convert_messages_to_prompt)
```

Here is the suggested implementation for the `anthropic_messages` function:

```python
@router.post("/v1/messages", response_model=AnthropicMessagesResponse)
async def anthropic_messages(anthropic_request: AnthropicMessagesRequest,
                             raw_request: Request):
    # Validate Anthropic headers
    api_key = raw_request.headers.get("x-api-key")
    version = raw_request.headers.get("anthropic-version")
    if not api_key or not version:
        raise HTTPException(status_code=400,
                            detail="Missing required Anthropic headers.")

    # Convert messages to prompt
    prompt = convert_messages_to_prompt(anthropic_request.messages)

    # Create a vLLM CompletionRequest
    completion_request = CompletionRequest(
        model=anthropic_request.model,
        prompt=prompt,
        max_tokens=anthropic_request.max_tokens,
        stream=False,  # This endpoint is non-streaming.
    )

    # Get the completion handler and call it
    completion_handler = completion(raw_request)
    if completion_handler is None:
        raise HTTPException(status_code=500,
                            detail="Completion handler is not available.")

    result = await completion_handler.create_completion(completion_request,
                                                        raw_request)

    if isinstance(result, ErrorResponse):
        raise HTTPException(status_code=result.code, detail=result.message)

    assert isinstance(result, CompletionResponse)

    # Return response in Anthropic format
    return AnthropicMessagesResponse(
        id=f"msg_{uuid4().hex[:24]}",
        type="message",
        role="assistant",
        content=[{
            "type": "text",
            "text": result.choices[0].text
        }],
        model=anthropic_request.model,
        stop_reason=result.choices[0].finish_reason,
        stop_sequence=None,
        usage={
            "input_tokens": result.usage.prompt_tokens,
            "output_tokens": result.usage.completion_tokens,
        })
```



```diff
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
```
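For orientation, a client call against the new endpoint would look roughly like the sketch below. This is a hypothetical example, not part of the PR: the host, port, API key value, and model name are placeholders; the path and the two required headers are taken from the endpoint code above, which only checks that the headers are present.

```python
import requests

# Hypothetical call against a locally running vLLM server with this PR applied.
response = requests.post(
    "http://localhost:8000/v1/messages",
    headers={
        "x-api-key": "dummy-key",           # placeholder; only presence is checked
        "anthropic-version": "2023-06-01",  # any non-empty value passes the check
    },
    json={
        "model": "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model name
        "max_tokens": 256,
        "messages": [{"role": "user", "content": "Hello!"}],
    },
)
print(response.json())  # Anthropic-style message object on success
```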
23 changes: 23 additions & 0 deletions vllm/entrypoints/openai/protocol_anthropic.py
Member comment: Sorry to ask you to move this again, could it instead be moved to vllm/entrypoints/anthropic/protocol.py?

```diff
@@ -0,0 +1,23 @@
+from pydantic import BaseModel
+from typing import List, Dict, Any, Optional
+
+class AnthropicMessageBlock(BaseModel):
+    role: str  # "user" | "assistant"
+    content: Any
+
+class AnthropicMessagesRequest(BaseModel):
+    model: str
+    messages: List[AnthropicMessageBlock]
+    max_tokens: int
+    system: Optional[str] = None
+    # Add further optional fields per API docs
+
+class AnthropicMessagesResponse(BaseModel):
+    id: str
+    type: str = "message"
+    role: str = "assistant"
+    content: List[Dict[str, Any]]
+    model: str
+    stop_reason: Optional[str]
+    stop_sequence: Optional[str]
+    usage: Dict[str, int]
```

CI (GitHub Actions / pre-commit) failures reported on this file:

- Line 2 (Ruff UP035): `typing.Dict` is deprecated, use `dict` instead; `typing.List` is deprecated, use `list` instead
- Line 12 (Ruff UP006): use `list` instead of `List` for type annotation
- Line 22 (Ruff UP006): use `dict` instead of `Dict` and `list` instead of `List` for type annotations
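As a quick illustration of what these models buy over manual dict access, here is a minimal standalone sketch. It mirrors the request schema above (rewritten with built-in `list` generics, which would also satisfy the Ruff UP035/UP006 findings) and validates a made-up payload; a malformed request fails fast with a `ValidationError` instead of a later `KeyError`.

```python
from typing import Any, Optional

from pydantic import BaseModel, ValidationError


class AnthropicMessageBlock(BaseModel):
    role: str  # "user" | "assistant"
    content: Any


class AnthropicMessagesRequest(BaseModel):
    model: str
    messages: list[AnthropicMessageBlock]
    max_tokens: int
    system: Optional[str] = None


# A well-formed payload validates and coerces message dicts into blocks.
req = AnthropicMessagesRequest.model_validate({
    "model": "example-model",
    "max_tokens": 128,
    "messages": [{"role": "user", "content": "Hi there"}],
})
print(req.messages[0].role)  # -> "user"

# A payload missing a required field is rejected up front.
try:
    AnthropicMessagesRequest.model_validate({"model": "example-model"})
except ValidationError as e:
    print(e.error_count(), "validation errors")
```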
11 changes: 11 additions & 0 deletions vllm/entrypoints/openai/tool_parsers/utils.py
```diff
@@ -122,3 +122,14 @@ def consume_space(i: int, s: str) -> int:
     while i < len(s) and s[i].isspace():
         i += 1
     return i
+
+def convert_messages_to_prompt(messages):
+    # Converts an Anthropic-style conversation to a plain prompt string.
+    prompt = ""
+    for msg in messages:
+        if msg["role"] == "user":
+            prompt += f"Human: {msg['content']}\n"
+        elif msg["role"] == "assistant":
+            prompt += f"Assistant: {msg['content']}\n"
+    return prompt
```
Contributor comment on lines +126 to +134 (high):

This function is not robust enough for the Anthropic API and has several issues:

1. No Type Hints: The function signature lacks type hints, which makes it harder to understand and use correctly.
2. Unsafe Dictionary Access: It uses direct dictionary access (e.g., `msg["role"]`), which is unsafe and will raise a `KeyError` if a message is malformed, leading to a 500 error.
3. Incorrect Content Handling: It assumes `msg['content']` is always a string. However, the Anthropic API allows content to be a string or a list of content blocks (e.g., `[{"type": "text", "text": "..."}]`). Simply stringifying a list of blocks will result in an incorrect prompt.

I suggest a more robust implementation that handles these cases gracefully. You will need to add `from typing import Any, Dict, List` to the imports at the top of the file.

```python
def convert_messages_to_prompt(messages: List[Dict[str, Any]]) -> str:
    # Converts an Anthropic-style conversation to a plain prompt string.
    prompt = ""
    for msg in messages:
        role = msg.get("role")
        content = msg.get("content")

        if role == "user":
            role_str = "Human"
        elif role == "assistant":
            role_str = "Assistant"
        else:
            # Skip unknown roles
            continue

        text_content = ""
        if isinstance(content, str):
            text_content = content
        elif isinstance(content, list):
            for block in content:
                if isinstance(block, dict) and block.get("type") == "text":
                    text_content += block.get("text", "")

        if text_content:
            prompt += f"{role_str}: {text_content}\n"
    return prompt
```
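Assuming the suggested implementation above is adopted, usage would look like the following sketch (the import path follows this PR's diff; the messages are made up):

```python
from vllm.entrypoints.openai.tool_parsers.utils import convert_messages_to_prompt

messages = [
    {"role": "user", "content": "What is vLLM?"},
    # Block-list content is flattened to its text parts.
    {"role": "assistant",
     "content": [{"type": "text", "text": "A fast LLM serving engine."}]},
    {"role": "system", "content": "ignored"},  # unknown role is skipped
]
print(convert_messages_to_prompt(messages))
# Human: What is vLLM?
# Assistant: A fast LLM serving engine.
```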

Member comment on lines +126 to +134: Not sure this is a tool parser.


10 changes: 10 additions & 0 deletions vllm/v1/engine/output_processor.py
```diff
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import time
 import asyncio
 from collections.abc import Iterable
 from dataclasses import dataclass
@@ -10,6 +11,7 @@
 
 from vllm.outputs import (CompletionOutput, PoolingOutput,
                           PoolingRequestOutput, RequestOutput)
+from vllm.sequence import RequestMetrics
 from vllm.sampling_params import RequestOutputKind
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
@@ -410,6 +412,14 @@ def process_outputs(
             if request_output := req_state.make_request_output(
                     new_token_ids, pooling_output, finish_reason, stop_reason,
                     kv_transfer_params, num_cached_tokens):
+                request_output.metrics = RequestMetrics(
+                    arrival_time=req_state.stats.arrival_time,
+                    last_token_time=req_state.stats.last_token_ts,
+                    first_scheduled_time=req_state.stats.scheduled_ts,
+                    first_token_time=req_state.stats.first_token_ts,
+                    time_in_queue=req_state.stats.scheduled_ts - req_state.stats.arrival_time,
+                    finished_time=time.monotonic()
+                )
                 if req_state.queue is not None:
                     # AsyncLLM: put into queue for handling by generate().
                     req_state.queue.put(request_output)
```
2 changes: 1 addition & 1 deletion vllm/v1/engine/processor.py
```diff
@@ -247,7 +247,7 @@ def process_inputs(
                 f"is out of range [0, {data_parallel_size}).")
 
         if arrival_time is None:
-            arrival_time = time.time()
+            arrival_time = time.monotonic()
 
         # Process inputs, which includes:
         # 1. Tokenize text prompt, with LoRA request if one exists.
```
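This change keeps `arrival_time` on the same clock as the monotonic timestamps used for the new `RequestMetrics` fields in `output_processor.py` above: subtracting a `time.time()` value from a `time.monotonic()` value (as `time_in_queue` would otherwise do) yields a meaningless number. A quick standalone illustration of why the two clocks cannot be mixed:

```python
import time

wall = time.time()       # seconds since the Unix epoch
mono = time.monotonic()  # seconds from an arbitrary, platform-dependent origin

# Cross-clock differences are meaningless (typically a huge value):
print(wall - mono)

# Durations are only valid within a single clock:
start = time.monotonic()
time.sleep(0.01)
print(time.monotonic() - start)  # ~0.01 s, unaffected by wall-clock adjustments
```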