Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
471 changes: 471 additions & 0 deletions _posts/2025-10-26-sleep-mode.md
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please change the filename to 2025-10-26-sleep-mode.md; the resulting URL will be better.

Large diffs are not rendered by default.

91 changes: 91 additions & 0 deletions assets/figures/2025-vllm-sleep-mode/plotly-ablation-inference.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
document.addEventListener('DOMContentLoaded', function() {
// Ablation inference data: BF16 vs FP8.
// Each entry holds the display name used as the x-axis label plus three
// timing samples per precision. The samples are averaged per model and drawn
// on an axis titled "Inference Time (seconds)"; the min/max of each sample
// list become the asymmetric error bars.
const ablationInferenceData = {
"ModelA": {
name: "Qwen3-0.6B",
bf16: [0.41, 0.4, 0.41],
fp8: [0.43, 0.43, 0.45]
},
"ModelB": {
name: "Phi-3-vision-128k",
bf16: [0.9, 0.74, 0.8],
fp8: [0.69, 0.59, 0.44]
}
};

/**
 * Summarize a list of samples for an asymmetric error bar.
 *
 * @param {number[]} values - Samples to summarize.
 * @returns {{mean: number, errorMinus: number, errorPlus: number}} The
 *   arithmetic mean, the distance from the mean down to the smallest sample,
 *   and the distance from the mean up to the largest sample. An empty list
 *   yields NaN for all three fields.
 */
function calcStatsAblInf(values) {
  let total = 0;
  values.forEach((sample) => {
    total += sample;
  });
  const mean = total / values.length;

  const lowest = Math.min.apply(null, values);
  const highest = Math.max.apply(null, values);

  return {
    mean,
    errorMinus: mean - lowest,
    errorPlus: highest - mean,
  };
}

// Model keys in declaration order; every per-model array below follows it.
const modelsAblInf = Object.keys(ablationInferenceData);

// Summary statistics for one precision ("bf16" or "fp8"), one entry per model.
const statsForInf = (precision) =>
  modelsAblInf.map((key) => calcStatsAblInf(ablationInferenceData[key][precision]));

const bf16StatsInf = statsForInf("bf16");
const fp8StatsInf = statsForInf("fp8");

/**
 * Build one bar trace for the grouped inference-time chart.
 *
 * @param {string} label - Legend name, also used in the hover text.
 * @param {Array<{mean: number, errorMinus: number, errorPlus: number}>} stats
 *   Per-model statistics, in the same order as modelsAblInf.
 * @param {string} barColor - Fill color of the bars and of the value labels.
 * @param {string} whiskerColor - Color of the error bars.
 * @returns {object} Plotly bar trace with asymmetric error bars.
 */
function buildTraceInf(label, stats, barColor, whiskerColor) {
  return {
    x: modelsAblInf.map((key) => ablationInferenceData[key].name),
    y: stats.map(({ mean }) => mean),
    name: label,
    type: "bar",
    marker: { color: barColor },
    error_y: {
      type: "data",
      symmetric: false,
      // Upper whisker reaches the largest sample, lower the smallest.
      array: stats.map(({ errorPlus }) => errorPlus),
      arrayminus: stats.map(({ errorMinus }) => errorMinus),
      color: whiskerColor,
      thickness: 2,
      width: 6,
    },
    text: stats.map(({ mean }) => `${mean.toFixed(2)}s`),
    textposition: "outside",
    textfont: { size: 12, color: barColor, weight: "bold" },
    hovertemplate: `<b>%{x}</b><br>${label}: %{y:.2f}s<extra></extra>`,
  };
}

const bf16TraceInf = buildTraceInf("BF16", bf16StatsInf, "#1f77b4", "#0d4a6e");
const fp8TraceInf = buildTraceInf("FP8", fp8StatsInf, "#ff7f0e", "#cc6600");

// Top of every bar including its upper error bar, across both precisions.
const upperEdgesInf = [...bf16StatsInf, ...fp8StatsInf].map(
  ({ mean, errorPlus }) => mean + errorPlus
);

const layoutAblInf = {
  barmode: "group",
  bargap: 0.15,
  bargroupgap: 0.1,
  margin: { l: 60, r: 30, t: 40, b: 50 },
  xaxis: {
    title: "",
    tickangle: 0,
  },
  yaxis: {
    title: "Inference Time (seconds)",
    // 25% headroom above the tallest bar; presumably leaves room for the
    // "outside" value labels.
    range: [0, Math.max(...upperEdgesInf) * 1.25],
  },
  hovermode: "closest",
  // Horizontal legend centered above the plot area.
  legend: {
    x: 0.5,
    y: 1.15,
    xanchor: "center",
    yanchor: "top",
    orientation: "h",
  },
};

const configAblInf = { displayModeBar: true, responsive: true };

// Render into the element with id "plotly-ablation-inference".
Plotly.newPlot(
  "plotly-ablation-inference",
  [bf16TraceInf, fp8TraceInf],
  layoutAblInf,
  configAblInf
);
});
140 changes: 140 additions & 0 deletions assets/figures/2025-vllm-sleep-mode/plotly-ablation-quant.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
document.addEventListener('DOMContentLoaded', function() {
// Ablation study: BF16 vs FP8 quantization.
// Maps a scenario label (used as the y-axis category of the timeline chart)
// to its ordered list of { event, duration } steps. createSegmentsAblation
// lays the durations end to end, and the x-axis is titled "Time (seconds)".
// Event strings follow the pattern "<A|B> Model <stage>": the first word
// becomes the model id and the remainder is classified by keyword
// (Load / Wake / Prompt / Sleep / Warm) when segments are built.
// NOTE(review): model A events spell "Wake up" while model B events spell
// "Wake Up". Classification matches on "Wake" so both work, but the raw stage
// text appears in the hover tooltip with whichever casing is used here.
const timingDataAblation = {
"Sleep Mode (BF16)": [
{ event: "A Model Load", duration: 32.56 },
{ event: "A Model Warm Up", duration: 2.69 },
{ event: "B Model Load", duration: 57.96 },
{ event: "B Model Warm Up", duration: 5.92 },
{ event: "A Model Wake up", duration: 0.28 },
{ event: "A Model Prompt", duration: 0.41 },
{ event: "A Model Sleep", duration: 0.09 },
{ event: "B Model Wake Up", duration: 0.89 },
{ event: "B Model Prompt", duration: 0.9 },
{ event: "B Model Sleep", duration: 0.48 },
{ event: "A Model Wake up", duration: 0.27 },
{ event: "A Model Prompt", duration: 0.4 },
{ event: "A Model Sleep", duration: 0.1 },
{ event: "B Model Wake Up", duration: 0.93 },
{ event: "B Model Prompt", duration: 0.74 },
{ event: "B Model Sleep", duration: 0.5 },
{ event: "A Model Wake up", duration: 0.27 },
{ event: "A Model Prompt", duration: 0.41 },
{ event: "A Model Sleep", duration: 0.1 },
{ event: "B Model Wake Up", duration: 0.88 },
{ event: "B Model Prompt", duration: 0.8 }
],
"Sleep Mode (FP8)": [
{ event: "A Model Load", duration: 37.71 },
{ event: "A Model Warm Up", duration: 2.34 },
{ event: "B Model Load", duration: 57.79 },
{ event: "B Model Warm Up", duration: 6.37 },
{ event: "A Model Wake up", duration: 0.18 },
{ event: "A Model Prompt", duration: 0.43 },
{ event: "A Model Sleep", duration: 0.06 },
{ event: "B Model Wake Up", duration: 0.79 },
{ event: "B Model Prompt", duration: 0.69 },
{ event: "B Model Sleep", duration: 0.31 },
{ event: "A Model Wake up", duration: 0.19 },
{ event: "A Model Prompt", duration: 0.43 },
{ event: "A Model Sleep", duration: 0.06 },
{ event: "B Model Wake Up", duration: 0.77 },
{ event: "B Model Prompt", duration: 0.59 },
{ event: "B Model Sleep", duration: 0.31 },
{ event: "A Model Wake up", duration: 0.16 },
{ event: "A Model Prompt", duration: 0.45 },
{ event: "A Model Sleep", duration: 0.07 },
{ event: "B Model Wake Up", duration: 0.78 },
{ event: "B Model Prompt", duration: 0.44 }
]
};

/**
 * Convert per-scenario event lists into timeline segments laid end to end.
 *
 * Each event string is split on spaces: the first word is the actor (`who`)
 * and the remaining words form the `stage`. The stage is classified by the
 * first matching keyword, checked in this order: Load, Wake, Prompt, Sleep,
 * Warm. "Warm" stages are reported under the Load action. A stage matching no
 * keyword gets `action` and `category` set to undefined.
 *
 * @param {Object<string, Array<{event: string, duration: number}>>} timingData
 *   Scenario label mapped to its ordered events.
 * @returns {Array<{scenario: string, who: string, stage: string,
 *   action: (string|undefined), start: number, end: number, duration: number,
 *   category: (string|undefined)}>} One segment per event, in input order;
 *   `start` restarts at 0 for every scenario.
 */
function createSegmentsAblation(timingData) {
  // [keyword searched for in the stage, action it maps to]; first hit wins.
  const stageRules = [
    ['Load', 'Load'],
    ['Wake', 'Wake'],
    ['Prompt', 'Prompt'],
    ['Sleep', 'Sleep'],
    ['Warm', 'Load'],
  ];

  const collected = [];

  for (const [scenario, events] of Object.entries(timingData)) {
    let elapsed = 0;

    events.forEach(({ event, duration }) => {
      const words = event.split(' ');
      const who = words[0];
      const stage = words.slice(1).join(' ');

      const rule = stageRules.find(([keyword]) => stage.includes(keyword));
      const action = rule ? rule[1] : undefined;
      const category = rule ? `${who} ${action}` : undefined;

      const start = elapsed;
      const end = start + duration;
      collected.push({ scenario, who, stage, action, start, end, duration, category });

      // The next segment begins where this one ends.
      elapsed = end;
    });
  }

  return collected;
}

const segmentsAblation = createSegmentsAblation(timingDataAblation);

// Segment category -> bar color. One legend trace is generated per key, in
// this order.
const colorMapAblation = {
  "A Load": "#1f77b4",
  "B Load": "#ff7f0e",
  "A Wake": "#2ca02c",
  "B Wake": "#17becf",
  "A Sleep": "#9467bd",
  "B Sleep": "#8c564b",
  "A Prompt": "#e377c2",
  "B Prompt": "#7f7f7f",
};
const categoriesAblation = Object.keys(colorMapAblation);

// Single horizontal bar trace holding every segment: each bar starts at the
// segment's cumulative start time (base) and extends by its duration (x).
const barsAblation = {
  type: "bar",
  orientation: "h",
  x: segmentsAblation.map(({ duration }) => duration),
  base: segmentsAblation.map(({ start }) => start),
  y: segmentsAblation.map(({ scenario }) => scenario),
  marker: {
    color: segmentsAblation.map(({ category }) => colorMapAblation[category]),
    line: { width: 1, color: "rgba(0,0,0,0.35)" },
  },
  // customdata layout: [scenario, category, stage, start, end].
  hovertemplate: [
    "<b>%{customdata[0]}</b>",
    "%{customdata[1]} — %{customdata[2]}",
    "Start %{customdata[3]:.2f}s → End %{customdata[4]:.2f}s",
    "<b>%{x:.2f}s</b><extra></extra>",
  ].join("<br>"),
  customdata: segmentsAblation.map(
    ({ scenario, category, stage, start, end }) => [scenario, category, stage, start, end]
  ),
  showlegend: false,
};

// The bar trace is hidden from the legend, so each category gets its own
// scatter trace with a single null point, intended to contribute only a
// colored legend entry.
const legendTracesAblation = categoriesAblation.map((name) => ({
  type: "scatter",
  mode: "markers",
  x: [null],
  y: [null],
  name,
  marker: { color: colorMapAblation[name], size: 10 },
  hoverinfo: "skip",
  showlegend: true,
}));

const layoutAblation = {
  // Overlay mode: bars are positioned by their explicit base, not stacked.
  barmode: "overlay",
  bargap: 0.05,
  margin: { l: 140, r: 30, t: 20, b: 40 },
  // Fixed x-range; NOTE(review): must be raised by hand if a scenario's
  // total duration ever exceeds 115.
  xaxis: { title: "Time (seconds)", range: [0, 115] },
  yaxis: {
    categoryorder: "array",
    categoryarray: ["Sleep Mode (FP8)", "Sleep Mode (BF16)"],
  },
  hovermode: "closest",
  dragmode: "pan",
};

const configAblation = { displayModeBar: true, responsive: true };

// Bars first, then the legend-only traces.
const tracesAblation = [barsAblation].concat(legendTracesAblation);

// Render into the element with id "plotly-ablation-quant".
Plotly.newPlot("plotly-ablation-quant", tracesAblation, layoutAblation, configAblation);
});
105 changes: 105 additions & 0 deletions assets/figures/2025-vllm-sleep-mode/plotly-ablation-switching.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
document.addEventListener('DOMContentLoaded', function() {
// Ablation switching data: BF16 vs FP8.
// Each entry holds the display name used as the x-axis label plus three
// timing samples per precision. The samples are averaged per model and drawn
// on an axis titled "Wake Time (seconds)"; the min/max of each sample list
// become the asymmetric error bars.
const ablationSwitchingData = {
"ModelA": {
name: "Qwen3-0.6B",
bf16: [0.28, 0.27, 0.27],
fp8: [0.18, 0.19, 0.16]
},
"ModelB": {
name: "Phi-3-vision-128k",
bf16: [0.89, 0.93, 0.88],
fp8: [0.79, 0.77, 0.78]
}
};

/**
 * Reduce a list of samples to a mean plus asymmetric error-bar extents.
 *
 * @param {number[]} values - Samples to summarize.
 * @returns {{mean: number, errorMinus: number, errorPlus: number}} `mean` is
 *   the arithmetic mean, `errorMinus` is mean minus the smallest sample, and
 *   `errorPlus` is the largest sample minus the mean. An empty list yields
 *   NaN for all three fields.
 */
function calcStatsAblSwitch(values) {
  let sum = 0;
  values.forEach((v) => {
    sum += v;
  });
  const mean = sum / values.length;

  const smallest = Math.min.apply(null, values);
  const largest = Math.max.apply(null, values);

  const errorMinus = mean - smallest;
  const errorPlus = largest - mean;
  return { mean, errorMinus, errorPlus };
}

// Model keys in declaration order; every per-model array below follows it.
const modelsAblSwitch = Object.keys(ablationSwitchingData);

// Summary statistics for one precision ("bf16" or "fp8"), one entry per model.
const statsForSwitch = (precision) =>
  modelsAblSwitch.map((key) => calcStatsAblSwitch(ablationSwitchingData[key][precision]));

const bf16StatsSwitch = statsForSwitch("bf16");
const fp8StatsSwitch = statsForSwitch("fp8");

/**
 * Build one bar trace for the grouped wake-time chart.
 *
 * @param {string} label - Legend name, also used in the hover text.
 * @param {Array<{mean: number, errorMinus: number, errorPlus: number}>} stats
 *   Per-model statistics, in the same order as modelsAblSwitch.
 * @param {string} barColor - Fill color of the bars and of the value labels.
 * @param {string} whiskerColor - Color of the error bars.
 * @returns {object} Plotly bar trace with asymmetric error bars.
 */
function buildTraceSwitch(label, stats, barColor, whiskerColor) {
  return {
    x: modelsAblSwitch.map((key) => ablationSwitchingData[key].name),
    y: stats.map(({ mean }) => mean),
    name: label,
    type: "bar",
    marker: { color: barColor },
    error_y: {
      type: "data",
      symmetric: false,
      // Upper whisker reaches the largest sample, lower the smallest.
      array: stats.map(({ errorPlus }) => errorPlus),
      arrayminus: stats.map(({ errorMinus }) => errorMinus),
      color: whiskerColor,
      thickness: 2,
      width: 6,
    },
    text: stats.map(({ mean }) => `${mean.toFixed(2)}s`),
    textposition: "outside",
    textfont: { size: 12, color: barColor, weight: "bold" },
    hovertemplate: `<b>%{x}</b><br>${label}: %{y:.2f}s<extra></extra>`,
  };
}

const bf16TraceSwitch = buildTraceSwitch("BF16", bf16StatsSwitch, "#1f77b4", "#0d4a6e");
const fp8TraceSwitch = buildTraceSwitch("FP8", fp8StatsSwitch, "#ff7f0e", "#cc6600");

// Per-model reduction of the FP8 mean relative to the BF16 mean, as a
// whole-number percentage string (toFixed returns a string); used as the
// annotation text.
const speedupsSwitchAbl = bf16StatsSwitch.map(({ mean: bf16Mean }, i) => {
  const fp8Mean = fp8StatsSwitch[i].mean;
  const percent = (bf16Mean - fp8Mean) / bf16Mean * 100;
  return percent.toFixed(0);
});

// Top of the taller bar (mean plus upper error bar) for each model, taken
// across BOTH precisions. Using only the BF16 stats would clip the FP8 bar
// and let the annotation overlap it whenever FP8 is the slower of the two.
// For data where BF16 is taller this equals the BF16-only value.
const groupTopsSwitchAbl = modelsAblSwitch.map((_, i) => Math.max(
  bf16StatsSwitch[i].mean + bf16StatsSwitch[i].errorPlus,
  fp8StatsSwitch[i].mean + fp8StatsSwitch[i].errorPlus
));

// Render into the element with id "plotly-ablation-switching".
Plotly.newPlot("plotly-ablation-switching", [bf16TraceSwitch, fp8TraceSwitch], {
  barmode: "group",
  bargap: 0.15,
  bargroupgap: 0.1,
  margin: { l: 60, r: 30, t: 40, b: 50 },
  xaxis: {
    title: "",
    tickangle: 0
  },
  yaxis: {
    title: "Wake Time (seconds)",
    // 30% headroom above the tallest bar in the chart so the value labels
    // and the annotations below stay inside the plot area.
    range: [0, Math.max(...groupTopsSwitchAbl) * 1.3]
  },
  hovermode: "closest",
  // Horizontal legend centered above the plot area.
  legend: {
    x: 0.5,
    y: 1.15,
    xanchor: "center",
    yanchor: "top",
    orientation: "h"
  },
  // One "<N>% faster" label per model, centered on the model's bar group and
  // placed 0.07 axis units above the taller bar of that group.
  annotations: modelsAblSwitch.map((m, i) => ({
    x: ablationSwitchingData[m].name,
    y: groupTopsSwitchAbl[i] + 0.07,
    text: `<b>${speedupsSwitchAbl[i]}% faster</b>`,
    showarrow: false,
    font: { size: 11, color: "#ff7f0e", weight: "bold" },
    xanchor: "center"
  }))
}, {displayModeBar: true, responsive: true});
});
Loading