vllm-project · youkaichao · Oct 28, 2025 · Oct 26, 2025 · Oct 27, 2025 · Oct 27, 2025
diff --git a/_posts/2025-10-26-sleep-mode.md b/_posts/2025-10-26-sleep-mode.md
diff --git a/assets/figures/2025-vllm-sleep-mode/plotly-ablation-inference.js b/assets/figures/2025-vllm-sleep-mode/plotly-ablation-inference.js
@@ -0,0 +1,87 @@
+// Renders a grouped bar chart comparing BF16 and FP8 inference times per model.
+// Bar height is the mean of the recorded runs; the whiskers reach the fastest
+// and slowest run.
+document.addEventListener('DOMContentLoaded', () => {
+  // Recorded inference times in seconds, one entry per run.
+  const inferenceRuns = {
+    "ModelA": {
+      name: "Qwen3-0.6B",
+      bf16: [0.41, 0.4, 0.41],
+      fp8: [0.43, 0.43, 0.45]
+    },
+    "ModelB": {
+      name: "Phi-3-vision-128k",
+      bf16: [0.9, 0.74, 0.8],
+      fp8: [0.69, 0.59, 0.44]
+    }
+  };
+
+  /**
+   * Summarises a list of samples for an asymmetric error bar.
+   * @param {number[]} values - Samples to summarise.
+   * @returns {{mean: number, errorMinus: number, errorPlus: number}} The mean,
+   *   its distance down to the smallest sample and up to the largest sample.
+   */
+  function calcStatsAblInf(values) {
+    let total = 0;
+    for (const value of values) {
+      total += value;
+    }
+    const mean = total / values.length;
+    return {
+      mean,
+      errorMinus: mean - Math.min(...values),
+      errorPlus: Math.max(...values) - mean
+    };
+  }
+
+  const entries = Object.values(inferenceRuns);
+  const bf16StatsInf = entries.map(({ bf16 }) => calcStatsAblInf(bf16));
+  const fp8StatsInf = entries.map(({ fp8 }) => calcStatsAblInf(fp8));
+
+  // Builds one bar trace; the two precisions differ only in label, data and colours.
+  const buildTrace = (label, stats, barColor, whiskerColor) => ({
+    type: "bar",
+    name: label,
+    x: entries.map(({ name }) => name),
+    y: stats.map(({ mean }) => mean),
+    marker: { color: barColor },
+    error_y: {
+      type: "data",
+      symmetric: false,
+      array: stats.map(({ errorPlus }) => errorPlus),
+      arrayminus: stats.map(({ errorMinus }) => errorMinus),
+      color: whiskerColor,
+      thickness: 2,
+      width: 6
+    },
+    text: stats.map(({ mean }) => `${mean.toFixed(2)}s`),
+    textposition: "outside",
+    textfont: { size: 12, color: barColor, weight: "bold" },
+    hovertemplate: `<b>%{x}</b><br>${label}: %{y:.2f}s<extra></extra>`
+  });
+
+  const bf16TraceInf = buildTrace("BF16", bf16StatsInf, "#1f77b4", "#0d4a6e");
+  const fp8TraceInf = buildTrace("FP8", fp8StatsInf, "#ff7f0e", "#cc6600");
+
+  // Highest whisker tip across both precisions; the axis extends 25% beyond it.
+  const whiskerTips = [...bf16StatsInf, ...fp8StatsInf].map(({ mean, errorPlus }) => mean + errorPlus);
+  const yAxisTop = Math.max(...whiskerTips) * 1.25;
+
+  const layout = {
+    barmode: "group",
+    bargap: 0.15,
+    bargroupgap: 0.1,
+    margin: { l: 60, r: 30, t: 40, b: 50 },
+    xaxis: { title: "", tickangle: 0 },
+    yaxis: { title: "Inference Time (seconds)", range: [0, yAxisTop] },
+    hovermode: "closest",
+    // Horizontal legend, centre-anchored at x = 0.5.
+    legend: { x: 0.5, y: 1.15, xanchor: "center", yanchor: "top", orientation: "h" }
+  };
+
+  Plotly.newPlot("plotly-ablation-inference", [bf16TraceInf, fp8TraceInf], layout, {
+    displayModeBar: true,
+    responsive: true
+  });
+});
diff --git a/assets/figures/2025-vllm-sleep-mode/plotly-ablation-quant.js b/assets/figures/2025-vllm-sleep-mode/plotly-ablation-quant.js
@@ -0,0 +1,164 @@
+// Renders a timeline of load, warm-up, wake, prompt and sleep phases for two
+// models, one horizontal lane per sleep-mode precision (BF16 and FP8).
+document.addEventListener('DOMContentLoaded', () => {
+  // Phases listed in the order they are laid out; durations are in seconds.
+  const timingDataAblation = {
+    "Sleep Mode (BF16)": [
+      { event: "A Model Load", duration: 32.56 },
+      { event: "A Model Warm Up", duration: 2.69 },
+      { event: "B Model Load", duration: 57.96 },
+      { event: "B Model Warm Up", duration: 5.92 },
+      { event: "A Model Wake up", duration: 0.28 },
+      { event: "A Model Prompt", duration: 0.41 },
+      { event: "A Model Sleep", duration: 0.09 },
+      { event: "B Model Wake Up", duration: 0.89 },
+      { event: "B Model Prompt", duration: 0.9 },
+      { event: "B Model Sleep", duration: 0.48 },
+      { event: "A Model Wake up", duration: 0.27 },
+      { event: "A Model Prompt", duration: 0.4 },
+      { event: "A Model Sleep", duration: 0.1 },
+      { event: "B Model Wake Up", duration: 0.93 },
+      { event: "B Model Prompt", duration: 0.74 },
+      { event: "B Model Sleep", duration: 0.5 },
+      { event: "A Model Wake up", duration: 0.27 },
+      { event: "A Model Prompt", duration: 0.41 },
+      { event: "A Model Sleep", duration: 0.1 },
+      { event: "B Model Wake Up", duration: 0.88 },
+      { event: "B Model Prompt", duration: 0.8 }
+    ],
+    "Sleep Mode (FP8)": [
+      { event: "A Model Load", duration: 37.71 },
+      { event: "A Model Warm Up", duration: 2.34 },
+      { event: "B Model Load", duration: 57.79 },
+      { event: "B Model Warm Up", duration: 6.37 },
+      { event: "A Model Wake up", duration: 0.18 },
+      { event: "A Model Prompt", duration: 0.43 },
+      { event: "A Model Sleep", duration: 0.06 },
+      { event: "B Model Wake Up", duration: 0.79 },
+      { event: "B Model Prompt", duration: 0.69 },
+      { event: "B Model Sleep", duration: 0.31 },
+      { event: "A Model Wake up", duration: 0.19 },
+      { event: "A Model Prompt", duration: 0.43 },
+      { event: "A Model Sleep", duration: 0.06 },
+      { event: "B Model Wake Up", duration: 0.77 },
+      { event: "B Model Prompt", duration: 0.59 },
+      { event: "B Model Sleep", duration: 0.31 },
+      { event: "A Model Wake up", duration: 0.16 },
+      { event: "A Model Prompt", duration: 0.45 },
+      { event: "A Model Sleep", duration: 0.07 },
+      { event: "B Model Wake Up", duration: 0.78 },
+      { event: "B Model Prompt", duration: 0.44 }
+    ]
+  };
+
+  /**
+   * Lays the recorded phases of every scenario end to end on a time axis.
+   * @param {Object<string, Array<{event: string, duration: number}>>} timingData
+   *   Phase list per scenario; an event reads "<who> <stage>".
+   * @returns {Array<Object>} One segment per phase carrying its scenario, who,
+   *   stage, action, start, end, duration and category. action and category
+   *   are undefined when the stage matches no known keyword.
+   */
+  function createSegmentsAblation(timingData) {
+    // Keyword -> action, checked in this order; the first keyword found in the
+    // stage text wins. Warm-up phases are filed under "Load".
+    const stageRules = [
+      ["Load", "Load"],
+      ["Wake", "Wake"],
+      ["Prompt", "Prompt"],
+      ["Sleep", "Sleep"],
+      ["Warm", "Load"]
+    ];
+    const segments = [];
+
+    for (const [scenario, events] of Object.entries(timingData)) {
+      // Running clock for this scenario; each phase starts where the previous ended.
+      let elapsed = 0;
+
+      for (const { event, duration } of events) {
+        // The first word identifies who the phase belongs to; the rest is the stage.
+        const words = event.split(' ');
+        const who = words[0];
+        const stage = words.slice(1).join(' ');
+
+        const rule = stageRules.find(([keyword]) => stage.includes(keyword));
+        const action = rule ? rule[1] : undefined;
+        const category = rule ? `${who} ${action}` : undefined;
+
+        const start = elapsed;
+        elapsed += duration;
+        segments.push({ scenario, who, stage, action, start, end: elapsed, duration, category });
+      }
+    }
+
+    return segments;
+  }
+
+  const segmentsAblation = createSegmentsAblation(timingDataAblation);
+
+  // Fill colour per category; legend entries are generated in this key order.
+  const colorMapAblation = {
+    "A Load": "#1f77b4",
+    "B Load": "#ff7f0e",
+    "A Wake": "#2ca02c",
+    "B Wake": "#17becf",
+    "A Sleep": "#9467bd",
+    "B Sleep": "#8c564b",
+    "A Prompt": "#e377c2",
+    "B Prompt": "#7f7f7f"
+  };
+
+  // A single bar trace draws every segment: `base` is where the segment starts
+  // and `x` is how long it lasts.
+  const barsAblation = {
+    type: "bar",
+    orientation: "h",
+    x: segmentsAblation.map(({ duration }) => duration),
+    base: segmentsAblation.map(({ start }) => start),
+    y: segmentsAblation.map(({ scenario }) => scenario),
+    marker: {
+      color: segmentsAblation.map(({ category }) => colorMapAblation[category]),
+      line: { width: 1, color: "rgba(0,0,0,0.35)" }
+    },
+    customdata: segmentsAblation.map(
+      ({ scenario, category, stage, start, end }) => [scenario, category, stage, start, end]
+    ),
+    hovertemplate: [
+      "<b>%{customdata[0]}</b><br>%{customdata[1]} — %{customdata[2]}<br>",
+      "Start %{customdata[3]:.2f}s → End %{customdata[4]:.2f}s<br>",
+      "<b>%{x:.2f}s</b><extra></extra>"
+    ].join(""),
+    showlegend: false
+  };
+
+  // The bar trace is hidden from the legend, so each category gets an empty
+  // marker trace that exists only to contribute a legend entry.
+  const legendTracesAblation = Object.entries(colorMapAblation).map(([name, color]) => ({
+    type: "scatter",
+    mode: "markers",
+    x: [null],
+    y: [null],
+    name,
+    marker: { color, size: 10 },
+    hoverinfo: "skip",
+    showlegend: true
+  }));
+
+  const layout = {
+    barmode: "overlay",
+    bargap: 0.05,
+    margin: { l: 140, r: 30, t: 20, b: 40 },
+    xaxis: { title: "Time (seconds)", range: [0, 115] },
+    yaxis: {
+      categoryorder: "array",
+      categoryarray: ["Sleep Mode (FP8)", "Sleep Mode (BF16)"]
+    },
+    hovermode: "closest",
+    dragmode: "pan"
+  };
+
+  Plotly.newPlot("plotly-ablation-quant", [barsAblation, ...legendTracesAblation], layout, {
+    displayModeBar: true,
+    responsive: true
+  });
+});
diff --git a/assets/figures/2025-vllm-sleep-mode/plotly-ablation-switching.js b/assets/figures/2025-vllm-sleep-mode/plotly-ablation-switching.js
@@ -0,0 +1,129 @@
+// Renders a grouped bar chart comparing BF16 and FP8 wake times per model and
+// labels each model with the relative change FP8 brings.
+document.addEventListener('DOMContentLoaded', function() {
+  // Ablation switching data: wake times in seconds, one entry per run, BF16 vs FP8
+  const ablationSwitchingData = {
+    "ModelA": {
+      name: "Qwen3-0.6B",
+      bf16: [0.28, 0.27, 0.27],
+      fp8: [0.18, 0.19, 0.16]
+    },
+    "ModelB": {
+      name: "Phi-3-vision-128k",
+      bf16: [0.89, 0.93, 0.88],
+      fp8: [0.79, 0.77, 0.78]
+    }
+  };
+
+  /**
+   * Summarises a list of samples for an asymmetric error bar.
+   * @param {number[]} values - Samples to summarise.
+   * @returns {{mean: number, errorMinus: number, errorPlus: number}} The mean,
+   *   its distance down to the smallest sample and up to the largest sample.
+   */
+  function calcStatsAblSwitch(values) {
+    const mean = values.reduce((a, b) => a + b, 0) / values.length;
+    const min = Math.min(...values);
+    const max = Math.max(...values);
+    return { mean, errorMinus: mean - min, errorPlus: max - mean };
+  }
+
+  const modelsAblSwitch = Object.keys(ablationSwitchingData);
+  const bf16StatsSwitch = modelsAblSwitch.map(m => calcStatsAblSwitch(ablationSwitchingData[m].bf16));
+  const fp8StatsSwitch = modelsAblSwitch.map(m => calcStatsAblSwitch(ablationSwitchingData[m].fp8));
+
+  const bf16TraceSwitch = {
+    x: modelsAblSwitch.map(m => ablationSwitchingData[m].name),
+    y: bf16StatsSwitch.map(s => s.mean),
+    name: "BF16",
+    type: "bar",
+    marker: { color: "#1f77b4" },
+    // Asymmetric whiskers: up to the largest sample, down to the smallest.
+    error_y: {
+      type: "data",
+      symmetric: false,
+      array: bf16StatsSwitch.map(s => s.errorPlus),
+      arrayminus: bf16StatsSwitch.map(s => s.errorMinus),
+      color: "#0d4a6e",
+      thickness: 2,
+      width: 6
+    },
+    text: bf16StatsSwitch.map(s => s.mean.toFixed(2) + "s"),
+    textposition: "outside",
+    textfont: { size: 12, color: "#1f77b4", weight: "bold" },
+    hovertemplate: "<b>%{x}</b><br>BF16: %{y:.2f}s<extra></extra>"
+  };
+
+  const fp8TraceSwitch = {
+    x: modelsAblSwitch.map(m => ablationSwitchingData[m].name),
+    y: fp8StatsSwitch.map(s => s.mean),
+    name: "FP8",
+    type: "bar",
+    marker: { color: "#ff7f0e" },
+    error_y: {
+      type: "data",
+      symmetric: false,
+      array: fp8StatsSwitch.map(s => s.errorPlus),
+      arrayminus: fp8StatsSwitch.map(s => s.errorMinus),
+      color: "#cc6600",
+      thickness: 2,
+      width: 6
+    },
+    text: fp8StatsSwitch.map(s => s.mean.toFixed(2) + "s"),
+    textposition: "outside",
+    textfont: { size: 12, color: "#ff7f0e", weight: "bold" },
+    hovertemplate: "<b>%{x}</b><br>FP8: %{y:.2f}s<extra></extra>"
+  };
+
+  // Annotation text per model: the relative change in mean wake time going from
+  // BF16 to FP8. A regression reads "slower" rather than a negative "faster",
+  // and a non-finite ratio (zero or missing BF16 mean) yields no label.
+  const speedupsSwitchAbl = bf16StatsSwitch.map((bf16, i) => {
+    const reduction = (bf16.mean - fp8StatsSwitch[i].mean) / bf16.mean * 100;
+    if (!Number.isFinite(reduction)) {
+      return "";
+    }
+    const direction = reduction < 0 ? "slower" : "faster";
+    return `<b>${Math.abs(reduction).toFixed(0)}% ${direction}</b>`;
+  });
+
+  // Top of the taller error bar in each model group. Both precisions count, so
+  // an FP8 bar that rises above its BF16 neighbour is neither cut off by the
+  // axis range nor overdrawn by the annotation.
+  const groupTopsSwitch = bf16StatsSwitch.map((bf16, i) => {
+    const fp8 = fp8StatsSwitch[i];
+    return Math.max(bf16.mean + bf16.errorPlus, fp8.mean + fp8.errorPlus);
+  });
+
+  Plotly.newPlot("plotly-ablation-switching", [bf16TraceSwitch, fp8TraceSwitch], {
+    barmode: "group",
+    bargap: 0.15,
+    bargroupgap: 0.1,
+    margin: { l: 60, r: 30, t: 40, b: 50 },
+    xaxis: {
+      title: "",
+      tickangle: 0
+    },
+    yaxis: {
+      title: "Wake Time (seconds)",
+      range: [0, Math.max(...groupTopsSwitch) * 1.3]
+    },
+    hovermode: "closest",
+    legend: {
+      x: 0.5,
+      y: 1.15,
+      xanchor: "center",
+      yanchor: "top",
+      orientation: "h"
+    },
+    annotations: modelsAblSwitch.map((m, i) => ({
+      x: ablationSwitchingData[m].name,
+      // Sits 0.07 above the taller error bar of the group.
+      y: groupTopsSwitch[i] + 0.07,
+      text: speedupsSwitchAbl[i],
+      showarrow: false,
+      font: { size: 11, color: "#ff7f0e", weight: "bold" },
+      xanchor: "center"
+    }))
+  }, {displayModeBar: true, responsive: true});
+});