From 60c60621b4fdcd5ee1df5453589787cbb14ce67b Mon Sep 17 00:00:00 2001
From: Chen Zhang
Date: Tue, 29 Jul 2025 13:41:53 -0700
Subject: [PATCH 1/3] fix doc

Signed-off-by: Chen Zhang
---
 .../design/{v1 => }/metrics/intervals-1.png     | Bin
 .../design/{v1 => }/metrics/intervals-2.png     | Bin
 .../design/{v1 => }/metrics/intervals-3.png     | Bin
 .../{v1 => }/prefix_caching/example-time-1.png  | Bin
 .../{v1 => }/prefix_caching/example-time-3.png  | Bin
 .../{v1 => }/prefix_caching/example-time-4.png  | Bin
 .../{v1 => }/prefix_caching/example-time-5.png  | Bin
 .../{v1 => }/prefix_caching/example-time-6.png  | Bin
 .../{v1 => }/prefix_caching/example-time-7.png  | Bin
 .../design/{v1 => }/prefix_caching/free.png     | Bin
 .../design/{v1 => }/prefix_caching/overview.png | Bin
 .../design/{v1 => }/tpu/most_model_len.png      | Bin
 docs/configuration/tpu.md                       |  2 +-
 docs/design/metrics.md                          |  6 +++---
 docs/design/prefix_caching.md                   | 16 ++++++++--------
 examples/offline_inference/basic/basic.py       |  2 +-
 16 files changed, 13 insertions(+), 13 deletions(-)
 rename docs/assets/design/{v1 => }/metrics/intervals-1.png (100%)
 rename docs/assets/design/{v1 => }/metrics/intervals-2.png (100%)
 rename docs/assets/design/{v1 => }/metrics/intervals-3.png (100%)
 rename docs/assets/design/{v1 => }/prefix_caching/example-time-1.png (100%)
 rename docs/assets/design/{v1 => }/prefix_caching/example-time-3.png (100%)
 rename docs/assets/design/{v1 => }/prefix_caching/example-time-4.png (100%)
 rename docs/assets/design/{v1 => }/prefix_caching/example-time-5.png (100%)
 rename docs/assets/design/{v1 => }/prefix_caching/example-time-6.png (100%)
 rename docs/assets/design/{v1 => }/prefix_caching/example-time-7.png (100%)
 rename docs/assets/design/{v1 => }/prefix_caching/free.png (100%)
 rename docs/assets/design/{v1 => }/prefix_caching/overview.png (100%)
 rename docs/assets/design/{v1 => }/tpu/most_model_len.png (100%)

diff --git a/docs/assets/design/v1/metrics/intervals-1.png b/docs/assets/design/metrics/intervals-1.png
similarity index 100%
rename from docs/assets/design/v1/metrics/intervals-1.png
rename to docs/assets/design/metrics/intervals-1.png
diff --git a/docs/assets/design/v1/metrics/intervals-2.png b/docs/assets/design/metrics/intervals-2.png
similarity index 100%
rename from docs/assets/design/v1/metrics/intervals-2.png
rename to docs/assets/design/metrics/intervals-2.png
diff --git a/docs/assets/design/v1/metrics/intervals-3.png b/docs/assets/design/metrics/intervals-3.png
similarity index 100%
rename from docs/assets/design/v1/metrics/intervals-3.png
rename to docs/assets/design/metrics/intervals-3.png
diff --git a/docs/assets/design/v1/prefix_caching/example-time-1.png b/docs/assets/design/prefix_caching/example-time-1.png
similarity index 100%
rename from docs/assets/design/v1/prefix_caching/example-time-1.png
rename to docs/assets/design/prefix_caching/example-time-1.png
diff --git a/docs/assets/design/v1/prefix_caching/example-time-3.png b/docs/assets/design/prefix_caching/example-time-3.png
similarity index 100%
rename from docs/assets/design/v1/prefix_caching/example-time-3.png
rename to docs/assets/design/prefix_caching/example-time-3.png
diff --git a/docs/assets/design/v1/prefix_caching/example-time-4.png b/docs/assets/design/prefix_caching/example-time-4.png
similarity index 100%
rename from docs/assets/design/v1/prefix_caching/example-time-4.png
rename to docs/assets/design/prefix_caching/example-time-4.png
diff --git a/docs/assets/design/v1/prefix_caching/example-time-5.png b/docs/assets/design/prefix_caching/example-time-5.png
similarity index 100%
rename from docs/assets/design/v1/prefix_caching/example-time-5.png
rename to docs/assets/design/prefix_caching/example-time-5.png
diff --git a/docs/assets/design/v1/prefix_caching/example-time-6.png b/docs/assets/design/prefix_caching/example-time-6.png
similarity index 100%
rename from docs/assets/design/v1/prefix_caching/example-time-6.png
rename to docs/assets/design/prefix_caching/example-time-6.png
diff --git a/docs/assets/design/v1/prefix_caching/example-time-7.png b/docs/assets/design/prefix_caching/example-time-7.png
similarity index 100%
rename from docs/assets/design/v1/prefix_caching/example-time-7.png
rename to docs/assets/design/prefix_caching/example-time-7.png
diff --git a/docs/assets/design/v1/prefix_caching/free.png b/docs/assets/design/prefix_caching/free.png
similarity index 100%
rename from docs/assets/design/v1/prefix_caching/free.png
rename to docs/assets/design/prefix_caching/free.png
diff --git a/docs/assets/design/v1/prefix_caching/overview.png b/docs/assets/design/prefix_caching/overview.png
similarity index 100%
rename from docs/assets/design/v1/prefix_caching/overview.png
rename to docs/assets/design/prefix_caching/overview.png
diff --git a/docs/assets/design/v1/tpu/most_model_len.png b/docs/assets/design/tpu/most_model_len.png
similarity index 100%
rename from docs/assets/design/v1/tpu/most_model_len.png
rename to docs/assets/design/tpu/most_model_len.png
diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md
index 005b7f78f440..d255fb36a381 100644
--- a/docs/configuration/tpu.md
+++ b/docs/configuration/tpu.md
@@ -46,7 +46,7 @@ This initial compilation time ranges significantly and is impacted by many of th
 
 #### max model len vs. most model len
 
-![most_model_len](../assets/design/v1/tpu/most_model_len.png)
+![most_model_len](../assets/design/tpu/most_model_len.png)
 
 If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most model len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable.
diff --git a/docs/design/metrics.md b/docs/design/metrics.md
index 52cd320dd4e1..0fe865e0d950 100644
--- a/docs/design/metrics.md
+++ b/docs/design/metrics.md
@@ -223,7 +223,7 @@ And the calculated intervals are:
 
 Put another way:
 
-![Interval calculations - common case](../../assets/design/v1/metrics/intervals-1.png)
+![Interval calculations - common case](../../assets/design/metrics/intervals-1.png)
 
 We explored the possibility of having the frontend calculate these
 intervals using the timing of events visible by the frontend. However,
@@ -238,13 +238,13 @@ When a preemption occurs during decode, since any already generated
 tokens are reused, we consider the preemption as affecting the
 inter-token, decode, and inference intervals.
 
-![Interval calculations - preempted decode](../../assets/design/v1/metrics/intervals-2.png)
+![Interval calculations - preempted decode](../../assets/design/metrics/intervals-2.png)
 
 When a preemption occurs during prefill (assuming such an event is
 possible), we consider the preemption as affecting the
 time-to-first-token and prefill intervals.
 
-![Interval calculations - preempted prefill](../../assets/design/v1/metrics/intervals-3.png)
+![Interval calculations - preempted prefill](../../assets/design/metrics/intervals-3.png)
 
 ### Frontend Stats Collection
 
diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md
index 2d3c8412894a..785566895a78 100644
--- a/docs/design/prefix_caching.md
+++ b/docs/design/prefix_caching.md
@@ -122,7 +122,7 @@ There are two design points to highlight:
 
 As a result, we will have the following components when the KV cache manager is initialized:
 
-![Component Overview](../../assets/design/v1/prefix_caching/overview.png)
+![Component Overview](../../assets/design/prefix_caching/overview.png)
 
 * Block Pool: A list of KVCacheBlock.
 * Free Block Queue: Only store the pointers of head and tail blocks for manipulations.
@@ -192,7 +192,7 @@ As can be seen, block 3 is a new full block and is cached. However, it is redund
 When a request is finished, we free all its blocks if no other requests are using them (reference count = 0). In this example, we free request 1 and block 2, 3, 4, 8 associated with it. We can see that the freed blocks are added to the tail of the free queue in the *reverse* order. This is because the last block of a request must hash more tokens and is less likely to be reused by other requests. As a result, it should be evicted first.
 
-![Free queue after a request us freed](../../assets/design/v1/prefix_caching/free.png)
+![Free queue after a request us freed](../../assets/design/prefix_caching/free.png)
 
 ### Eviction (LRU)
 
@@ -208,24 +208,24 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),
 
 **Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens.
 
-![Example Time 1](../../assets/design/v1/prefix_caching/example-time-1.png)
+![Example Time 1](../../assets/design/prefix_caching/example-time-1.png)
 
 **Time 3: Request 0 makes the block 3 full and asks for a new block to keep decoding.** We cache block 3 and allocate block 4.
 
-![Example Time 3](../../assets/design/v1/prefix_caching/example-time-3.png)
+![Example Time 3](../../assets/design/prefix_caching/example-time-3.png)
 
 **Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.
 
-![Example Time 4](../../assets/design/v1/prefix_caching/example-time-4.png)
+![Example Time 4](../../assets/design/prefix_caching/example-time-4.png)
 
 **Time 5: Request 0 is finished and free.** Blocks 2, 3 and 4 are added to the free queue in the reverse order (but block 2 and 3 are still cached). Block 0 and 1 are not added to the free queue because they are being used by Request 1.
-![Example Time 5](../../assets/design/v1/prefix_caching/example-time-5.png)
+![Example Time 5](../../assets/design/prefix_caching/example-time-5.png)
 
 **Time 6: Request 1 is finished and free.**
 
-![Example Time 6](../../assets/design/v1/prefix_caching/example-time-6.png)
+![Example Time 6](../../assets/design/prefix_caching/example-time-6.png)
 
 **Time 7: Request 2 comes in with the 29 prompt tokens, where the first 12 tokens are the same as request 0\.** Note that even the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted).
 
-![Example Time 7](../../assets/design/v1/prefix_caching/example-time-7.png)
+![Example Time 7](../../assets/design/prefix_caching/example-time-7.png)
diff --git a/examples/offline_inference/basic/basic.py b/examples/offline_inference/basic/basic.py
index 78bfda9bcf4e..445b97630e3a 100644
--- a/examples/offline_inference/basic/basic.py
+++ b/examples/offline_inference/basic/basic.py
@@ -16,7 +16,7 @@ def main():
 
     # Create an LLM.
-    llm = LLM(model="facebook/opt-125m")
+    llm = LLM(model="google/gemma-3-1b-it")
     # Generate texts from the prompts.
     # The output is a list of RequestOutput objects
     # that contain the prompt, generated text, and other information.
From dbc1f0148a95808fec8f5fd13b9bbabecce65d7a Mon Sep 17 00:00:00 2001
From: Chen Zhang
Date: Tue, 29 Jul 2025 13:48:53 -0700
Subject: [PATCH 2/3] Update examples/offline_inference/basic/basic.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Chen Zhang
---
 examples/offline_inference/basic/basic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/offline_inference/basic/basic.py b/examples/offline_inference/basic/basic.py
index 445b97630e3a..78bfda9bcf4e 100644
--- a/examples/offline_inference/basic/basic.py
+++ b/examples/offline_inference/basic/basic.py
@@ -16,7 +16,7 @@ def main():
 
     # Create an LLM.
-    llm = LLM(model="google/gemma-3-1b-it")
+    llm = LLM(model="facebook/opt-125m")
     # Generate texts from the prompts.
     # The output is a list of RequestOutput objects
     # that contain the prompt, generated text, and other information.

From 09a0a869b72b656554cfc2a4aa4bd1e91b9800c2 Mon Sep 17 00:00:00 2001
From: Chen Zhang
Date: Tue, 29 Jul 2025 15:10:25 -0700
Subject: [PATCH 3/3] fix doc

Signed-off-by: Chen Zhang
---
 docs/design/metrics.md        |  6 +++---
 docs/design/prefix_caching.md | 16 ++++++++--------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/docs/design/metrics.md b/docs/design/metrics.md
index 0fe865e0d950..f4018d56bab8 100644
--- a/docs/design/metrics.md
+++ b/docs/design/metrics.md
@@ -223,7 +223,7 @@ And the calculated intervals are:
 
 Put another way:
 
-![Interval calculations - common case](../../assets/design/metrics/intervals-1.png)
+![Interval calculations - common case](../assets/design/metrics/intervals-1.png)
 
 We explored the possibility of having the frontend calculate these
 intervals using the timing of events visible by the frontend. However,
@@ -238,13 +238,13 @@ When a preemption occurs during decode, since any already generated
 tokens are reused, we consider the preemption as affecting the
 inter-token, decode, and inference intervals.
-![Interval calculations - preempted decode](../../assets/design/metrics/intervals-2.png)
+![Interval calculations - preempted decode](../assets/design/metrics/intervals-2.png)
 
 When a preemption occurs during prefill (assuming such an event is
 possible), we consider the preemption as affecting the
 time-to-first-token and prefill intervals.
 
-![Interval calculations - preempted prefill](../../assets/design/metrics/intervals-3.png)
+![Interval calculations - preempted prefill](../assets/design/metrics/intervals-3.png)
 
 ### Frontend Stats Collection
 
diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md
index 785566895a78..77e583328bee 100644
--- a/docs/design/prefix_caching.md
+++ b/docs/design/prefix_caching.md
@@ -122,7 +122,7 @@ There are two design points to highlight:
 
 As a result, we will have the following components when the KV cache manager is initialized:
 
-![Component Overview](../../assets/design/prefix_caching/overview.png)
+![Component Overview](../assets/design/prefix_caching/overview.png)
 
 * Block Pool: A list of KVCacheBlock.
 * Free Block Queue: Only store the pointers of head and tail blocks for manipulations.
@@ -192,7 +192,7 @@ As can be seen, block 3 is a new full block and is cached. However, it is redund
 When a request is finished, we free all its blocks if no other requests are using them (reference count = 0). In this example, we free request 1 and block 2, 3, 4, 8 associated with it. We can see that the freed blocks are added to the tail of the free queue in the *reverse* order. This is because the last block of a request must hash more tokens and is less likely to be reused by other requests. As a result, it should be evicted first.
-![Free queue after a request us freed](../../assets/design/prefix_caching/free.png)
+![Free queue after a request is freed](../assets/design/prefix_caching/free.png)
 
 ### Eviction (LRU)
 
@@ -208,24 +208,24 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),
 
 **Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens.
 
-![Example Time 1](../../assets/design/prefix_caching/example-time-1.png)
+![Example Time 1](../assets/design/prefix_caching/example-time-1.png)
 
 **Time 3: Request 0 makes the block 3 full and asks for a new block to keep decoding.** We cache block 3 and allocate block 4.
 
-![Example Time 3](../../assets/design/prefix_caching/example-time-3.png)
+![Example Time 3](../assets/design/prefix_caching/example-time-3.png)
 
 **Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.
 
-![Example Time 4](../../assets/design/prefix_caching/example-time-4.png)
+![Example Time 4](../assets/design/prefix_caching/example-time-4.png)
 
 **Time 5: Request 0 is finished and free.** Blocks 2, 3 and 4 are added to the free queue in the reverse order (but block 2 and 3 are still cached). Block 0 and 1 are not added to the free queue because they are being used by Request 1.
-![Example Time 5](../../assets/design/prefix_caching/example-time-5.png)
+![Example Time 5](../assets/design/prefix_caching/example-time-5.png)
 
 **Time 6: Request 1 is finished and free.**
 
-![Example Time 6](../../assets/design/prefix_caching/example-time-6.png)
+![Example Time 6](../assets/design/prefix_caching/example-time-6.png)
 
 **Time 7: Request 2 comes in with the 29 prompt tokens, where the first 12 tokens are the same as request 0\.** Note that even the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted).
 
-![Example Time 7](../../assets/design/prefix_caching/example-time-7.png)
+![Example Time 7](../assets/design/prefix_caching/example-time-7.png)
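
As a sanity check on the Time 5–7 free-queue walkthrough in the docs/design/prefix_caching.md hunks above, here is a minimal Python sketch of the behavior the text describes: freed blocks are appended tail-first (last block of a request is evicted first), cache-hit blocks are touched out of the queue on reuse, and new blocks are evicted from the head. The helper names (`free_request`, `touch`, `allocate`) and the starting queue contents are illustrative assumptions for this example only, not vLLM's actual KV cache manager API.

```python
from collections import deque

# Illustrative free queue; blocks 7, 8, 9 were never allocated, so they
# sit at the head, as in the doc's example (not vLLM's real data structure).
free_queue = deque([7, 8, 9])

def free_request(blocks):
    # Append a finished request's blocks in *reverse* order (Time 5/6):
    # the last block hashes more tokens and should be evicted first.
    free_queue.extend(reversed(blocks))

def touch(blocks):
    # Cache-hit blocks are removed from the free queue before allocation.
    for b in blocks:
        free_queue.remove(b)

def allocate(n):
    # New blocks are evicted from the head (LRU side) of the queue.
    return [free_queue.popleft() for _ in range(n)]

free_request([2, 3, 4])        # Time 5: request 0 frees blocks 2, 3, 4
free_request([0, 1, 5, 6])     # Time 6: request 1 frees blocks 0, 1, 5, 6
assert list(free_queue) == [7, 8, 9, 4, 3, 2, 6, 5, 1, 0]

touch([0, 1, 2])               # Time 7: blocks 0, 1, 2 hit the cache
assert list(free_queue) == [7, 8, 9, 4, 3, 6, 5]

new_blocks = allocate(5)       # remaining 5 blocks for the 29-token prompt
assert new_blocks == [7, 8, 9, 4, 3]
```

The three assertions reproduce the queue states and the eviction order (`7, 8, 9, 4, 3`) stated in the Time 7 step of the doc.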