diff --git a/README.md b/README.md
index 0066988..433415c 100644
--- a/README.md
+++ b/README.md
@@ -208,6 +208,43 @@ for more information.
 Additional vLLM outputs may be requested optionally on a per-request basis. See
 [this docs](docs/additional_outputs.md) for more information.
 
+## Priority Requests
+
+The vLLM backend supports priority-based request scheduling when the engine is configured with a scheduler policy set to `priority`. This allows you to prioritize certain requests over others, with lower priority numbers being processed first.
+
+### Configuration
+
+To enable priority scheduling, set the `scheduler_policy` parameter to `priority` in your `model.json`:
+
+```json
+{
+    "scheduler_policy": "priority",
+    // ... other engine args ...
+}
+```
+
+### Usage
+
+You can specify the priority of a request using the optional `priority` input tensor:
+
+```python
+inputs = []
+inputs.append(grpcclient.InferInput("text_input", [1], "BYTES"))
+inputs[-1].set_data_from_numpy(np.array([prompt.encode("utf-8")], dtype=np.object_))
+
+# Add priority input (optional)
+inputs.append(grpcclient.InferInput("priority", [1], "INT32"))
+inputs[-1].set_data_from_numpy(np.array([priority_value], dtype=np.int32))
+```
+
+If the priority input is not provided, it defaults to 0. Lower priority numbers are processed first.
+
+### Example Use Cases
+
+- Prioritize real-time user requests over background tasks
+- Implement different service level agreements (SLAs)
+- Manage system resources by processing high-priority requests first
+
 ## Triton Metrics
 Starting with the 24.08 release of Triton, users can now obtain specific
 vLLM metrics by querying the Triton metrics endpoint (see complete vLLM metrics
diff --git a/src/model.py b/src/model.py
index d201244..5b1efd4 100644
--- a/src/model.py
+++ b/src/model.py
@@ -129,6 +129,12 @@ def _auto_complete_inputs_and_outputs(auto_complete_model_config):
                 "dims": [1],
                 "optional": True,
             },
+            {
+                "name": "priority",
+                "data_type": "TYPE_INT32",
+                "dims": [1],
+                "optional": True,
+            },
         ]
         # Outputs expected by the backend.
         outputs = [
@@ -426,6 +432,7 @@ async def _generate(self, request):
                 prepend_input,
                 parameters,
                 additional_outputs,
+                priority,
             ) = self._get_input_tensors(request)
 
             sampling_params = TritonSamplingParams.from_dict(parameters, self.logger)
@@ -438,7 +445,11 @@ async def _generate(self, request):
                 lora_request = LoRARequest(lora_id, lora_int_id, lora_local_path)
 
             response_iterator = self._llm_engine.generate(
-                prompt, sampling_params, request_id, lora_request=lora_request
+                prompt,
+                sampling_params,
+                request_id,
+                lora_request=lora_request,
+                priority=priority,
             )
 
             request_output_state = {}
@@ -587,7 +598,14 @@ def _get_input_tensors(self, request):
                     tensor = False
                 additional_outputs[tensor_name] = tensor
 
-        return prompt, stream, prepend_input, parameters, additional_outputs
+        # priority
+        priority = pb_utils.get_input_tensor_by_name(request, "priority")
+        if priority:
+            priority = int(priority.as_numpy()[0])
+        else:
+            priority = 0
+
+        return prompt, stream, prepend_input, parameters, additional_outputs, priority
 
     def _create_response(
         self, request_output_state, request_output, prepend_input, additional_outputs
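For context, the `// ... other engine args ...` placeholder in the README hunk above can be read against the sample `model.json` already shown in this repo's README. A complete config with the new field might look like the sketch below; every engine argument other than `scheduler_policy` is illustrative, not part of this change.

```json
{
    "model": "facebook/opt-125m",
    "disable_log_requests": true,
    "gpu_memory_utilization": 0.5,
    "enforce_eager": true,
    "scheduler_policy": "priority"
}
```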
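The Usage snippet in the README hunk only shows how the `priority` tensor is constructed. Below is a minimal end-to-end client sketch that sends a priority request over Triton's streaming gRPC API (which decoupled models require), assuming a vLLM model named `vllm_model` served at `localhost:8001`; the model name, URL, prompt, and priority value are illustrative, not part of this change.

```python
# Minimal sketch: send one request with an explicit priority to an assumed
# model "vllm_model" at localhost:8001 and print the generated text.
import queue
from functools import partial

import numpy as np
import tritonclient.grpc as grpcclient


def callback(result_queue, result, error):
    # Forward either the response or the error to the main thread.
    result_queue.put(error if error is not None else result)


results = queue.Queue()
prompt = "What is Triton Inference Server?"

with grpcclient.InferenceServerClient("localhost:8001") as client:
    inputs = [
        grpcclient.InferInput("text_input", [1], "BYTES"),
        grpcclient.InferInput("stream", [1], "BOOL"),
        grpcclient.InferInput("priority", [1], "INT32"),
    ]
    inputs[0].set_data_from_numpy(np.array([prompt.encode("utf-8")], dtype=np.object_))
    inputs[1].set_data_from_numpy(np.array([False], dtype=bool))
    # Lower values are scheduled first; 0 is the default when the tensor is omitted.
    inputs[2].set_data_from_numpy(np.array([1], dtype=np.int32))

    client.start_stream(callback=partial(callback, results))
    client.async_stream_infer(model_name="vllm_model", inputs=inputs)
    client.stop_stream()  # closes the stream and waits for the response handler

response = results.get()
if isinstance(response, Exception):
    raise response
print(response.as_numpy("text_output")[0].decode("utf-8"))
```

With `stream` set to `False`, the backend returns a single final response, so draining one item from the queue is enough; the same pattern extends to multiple concurrent requests with different priority values.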