diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md
index 13c3bc2c7e03..4d3aba4e4a6a 100644
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@@ -5,7 +5,12 @@
 
 ## Profile with PyTorch Profiler
 
-We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`
+We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`. Additionally, you can control what the profiler records by setting the following environment variables:
+
+- `VLLM_TORCH_PROFILER_RECORD_SHAPES=1` to record tensor shapes (off by default)
+- `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to profile memory usage (off by default)
+- `VLLM_TORCH_PROFILER_WITH_STACK=1` to record stack information (on by default; set to `0` to disable)
+- `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to estimate FLOPs (off by default)
 
 The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
diff --git a/vllm/envs.py b/vllm/envs.py
index 0eff741519ae..190bad7a61e2 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -79,6 +79,10 @@
     VLLM_PLUGINS: Optional[list[str]] = None
     VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None
     VLLM_TORCH_PROFILER_DIR: Optional[str] = None
+    VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False
+    VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False
+    VLLM_TORCH_PROFILER_WITH_STACK: bool = True
+    VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False
     VLLM_USE_TRITON_AWQ: bool = False
     VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
     VLLM_SKIP_P2P_CHECK: bool = False
@@ -621,6 +625,31 @@ def get_vllm_port() -> Optional[int]:
     lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os
              .path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))),
 
+    # Enable torch profiler to record tensor shapes if
+    # VLLM_TORCH_PROFILER_RECORD_SHAPES=1 is set. Shape recording is off
+    # by default.
+    "VLLM_TORCH_PROFILER_RECORD_SHAPES":
+    lambda: bool(os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0") != "0"),
+
+    # Enable torch profiler to profile memory if
+    # VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1 is set. Memory profiling
+    # is off by default.
+    "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY":
+    lambda: bool(
+        os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0") != "0"),
+
+    # Enable torch profiler to record stack information. Stack recording
+    # is on by default; set VLLM_TORCH_PROFILER_WITH_STACK=0 to disable
+    # it.
+    "VLLM_TORCH_PROFILER_WITH_STACK":
+    lambda: bool(os.getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0"),
+
+    # Enable torch profiler to estimate FLOPs if
+    # VLLM_TORCH_PROFILER_WITH_FLOPS=1 is set. FLOPs estimation is off
+    # by default.
+    "VLLM_TORCH_PROFILER_WITH_FLOPS":
+    lambda: bool(os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"),
+
     # If set, vLLM will use Triton implementations of AWQ.
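As a review note on the `vllm/envs.py` hunk above: all four new flags share one parsing rule, sketched below. The `env_flag` helper is illustrative only (the diff uses inline lambdas); an unset variable falls back to its per-flag default, and any value other than `"0"` counts as enabled.

```python
import os

# Illustrative stand-in for the inline lambdas added to vllm/envs.py above;
# env_flag is not a helper the diff introduces.
def env_flag(name: str, default: str) -> bool:
    # Unset -> default; any value other than "0" -> True.
    return os.getenv(name, default) != "0"

# Assuming none of these variables are set in the current environment:
print(env_flag("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0"))  # False (opt-in flag)
print(env_flag("VLLM_TORCH_PROFILER_WITH_STACK", "1"))     # True (opt-out flag)

os.environ["VLLM_TORCH_PROFILER_WITH_STACK"] = "0"
print(env_flag("VLLM_TORCH_PROFILER_WITH_STACK", "1"))     # False (explicit opt-out)
```

One consequence worth flagging: only the literal string `0` disables a flag, so `VLLM_TORCH_PROFILER_WITH_FLOPS=false` would enable FLOPs estimation rather than disable it.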
"VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))), diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 50618c9ce8b8..ab84c72caeaf 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -71,12 +71,23 @@ def __init__( torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR logger.info("Profiling enabled. Traces will be saved to: %s", torch_profiler_trace_dir) + logger.debug( + "Profiler config: record_shapes=%s," + "profile_memory=%s,with_stack=%s,with_flops=%s", + envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, + envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, + envs.VLLM_TORCH_PROFILER_WITH_STACK, + envs.VLLM_TORCH_PROFILER_WITH_FLOPS, + ) self.profiler = torch.profiler.profile( activities=[ torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA, ], - with_stack=True, + record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, + profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, + with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, + with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, on_trace_ready=torch.profiler.tensorboard_trace_handler( torch_profiler_trace_dir, use_gzip=True)) else: @@ -209,7 +220,7 @@ def reload_weights(self) -> None: @torch.inference_mode() def determine_available_memory(self) -> int: - """Profiles the peak memory usage of the model to determine how much + """Profiles the peak memory usage of the model to determine how much memory can be used for KV cache without OOMs. The engine will first conduct a profiling of the existing memory usage. diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index c7885694f7a3..2a7e0625b2f8 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -41,12 +41,23 @@ def __init__( torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR logger.info("Profiling enabled. Traces will be saved to: %s", torch_profiler_trace_dir) + logger.debug( + "Profiler config: record_shapes=%s," + "profile_memory=%s,with_stack=%s,with_flops=%s", + envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, + envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, + envs.VLLM_TORCH_PROFILER_WITH_STACK, + envs.VLLM_TORCH_PROFILER_WITH_FLOPS, + ) self.profiler = torch.profiler.profile( activities=[ torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.XPU, ], - with_stack=True, + record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, + profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, + with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, + with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, on_trace_ready=torch.profiler.tensorboard_trace_handler( torch_profiler_trace_dir, use_gzip=True)) else: