Skip to content

Commit 74d4416

Browse files
authored
Use new structural output API (#3729)
1 parent 7eb1ce5 commit 74d4416

11 files changed

+165
-87
lines changed

docs/model_server_rest_api_chat.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ Some parameters, especially related to sampling (like `temperature`, `top_p` etc
166166
| logprobs | ⚠️ ||| bool (default: `false`) | Include the log probabilities of the returned output tokens. **_In stream mode logprobs are not returned; only info about selected tokens is returned._** |
167167
| tools |||| array | A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of functions the model may generate JSON inputs for. See [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create#chat-create-tools) for more details. |
168168
| tool_choice |||| string or object | Controls which (if any) tool is called by the model. `none` means the model will not call any tool and instead generates a message. `auto` means the model can pick between generating a message or calling one or more tools. `required` means that model should call at least one tool. Specifying a particular tool via `{"type": "function", "function": {"name": "my_function"}}` forces the model to call that tool. See [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice) for more details. |
169-
| response_format |||| object | An object specifying the format that the model must output. Setting to { "type": "json_schema", "json_schema": {...} } enables Structured Outputs which ensures the model will match your supplied JSON schema. Learn more in the [Structured Outputs demo](../demos/continuous_batching/structured_output/README.md). **Note** that if model server fails to load the schema, the request will still be processed, but the schema will not be applied. |
169+
| response_format |||| object | An object specifying the format that the model must output. Setting to `{ "type": "json_schema", "json_schema": {...} }` enables Structured Outputs which ensures the model will match your supplied JSON schema according to [OpenAI reference](https://platform.openai.com/docs/api-reference/chat/create#chat-create-response_format). Learn more in the [Structured Outputs demo](../demos/continuous_batching/structured_output/README.md). Additionally, `response_format` can accept [XGrammar structural tags format](https://github.com/mlc-ai/xgrammar/blob/main/docs/tutorials/structural_tag.md#format-types) (not part of OpenAI API). For example: `{ "type": "const_string", "value": "Hello World!" }`. **Note** that if model server fails to process the format, the request will still be processed, but the format will not be imposed. |
170170
| chat_template_kwargs |||| object | Enables passing additional parameters to chat template engine. Example `{"enable_thinking": false}`. Note that values like `messages`, `eos_token`, `bos_token` etc. are provided natively to the template engine, so including them in `chat_template_kwargs` will cause error. **Effective only in configuration with Python support**. |
171171

172172
#### Beam search sampling specific

src/llm/apis/openai_completions.cpp

Lines changed: 45 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -427,8 +427,49 @@ std::optional<int> OpenAIChatCompletionsHandler::getMaxTokens() const {
427427
return request.maxTokens;
428428
}
429429

430-
std::optional<std::string> OpenAIChatCompletionsHandler::getResponseSchema() const {
431-
return request.responseSchema;
430+
std::optional<std::string> OpenAIChatCompletionsHandler::getResponseFormat() const {
431+
return request.responseFormat;
432+
}
433+
434+
std::string convertOpenAIResponseFormatToStructuralTagStringFormat(const rapidjson::Value& openAIFormat) {
435+
// Build the new object: {"type": "structural_tag", "format": <openAIFormat>}
436+
// If response_format has {"json_schema": {"schema": {...}}}, flatten it to {"json_schema": {...}}
437+
rapidjson::Document flatFormatDoc;
438+
flatFormatDoc.CopyFrom(openAIFormat, flatFormatDoc.GetAllocator());
439+
440+
if (flatFormatDoc.HasMember("json_schema") && flatFormatDoc["json_schema"].IsObject()) {
441+
auto& jsonSchema = flatFormatDoc["json_schema"];
442+
if (jsonSchema.HasMember("schema") && jsonSchema["schema"].IsObject()) {
443+
// Move all members from "schema" to "json_schema"
444+
rapidjson::Value schemaObjCopy;
445+
schemaObjCopy.CopyFrom(jsonSchema["schema"], flatFormatDoc.GetAllocator()); // Make a copy as we will modify jsonSchema
446+
for (auto itr = schemaObjCopy.MemberBegin(); itr != schemaObjCopy.MemberEnd(); ++itr) {
447+
rapidjson::Value key;
448+
key.CopyFrom(itr->name, flatFormatDoc.GetAllocator());
449+
rapidjson::Value value;
450+
value.CopyFrom(itr->value, flatFormatDoc.GetAllocator());
451+
jsonSchema.AddMember(key, value, flatFormatDoc.GetAllocator());
452+
}
453+
// Remove the "schema" member
454+
jsonSchema.RemoveMember("schema");
455+
}
456+
}
457+
458+
// Serialize the flattened response_format object
459+
rapidjson::StringBuffer formatBuffer;
460+
rapidjson::Writer<rapidjson::StringBuffer> formatWriter(formatBuffer);
461+
flatFormatDoc.Accept(formatWriter);
462+
463+
// Build the new object: {"type": "structural_tag", "format": <flattened>}
464+
rapidjson::StringBuffer buffer;
465+
rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
466+
writer.StartObject();
467+
writer.Key("type");
468+
writer.String("structural_tag");
469+
writer.Key("format");
470+
writer.RawValue(formatBuffer.GetString(), formatBuffer.GetSize(), rapidjson::kObjectType);
471+
writer.EndObject();
472+
return buffer.GetString();
432473
}
433474

434475
absl::Status OpenAIChatCompletionsHandler::parseChatCompletionsPart(std::optional<uint32_t> maxTokensLimit, std::optional<std::string> allowedLocalMediaPath) {
@@ -475,36 +516,8 @@ absl::Status OpenAIChatCompletionsHandler::parseChatCompletionsPart(std::optiona
475516
return absl::OkStatus();
476517
if (!it->value.IsObject())
477518
return absl::InvalidArgumentError("response_format is not an object");
478-
auto responseFormat = it->value.GetObject();
479-
auto typeIt = responseFormat.FindMember("type");
480-
if (typeIt != responseFormat.MemberEnd()) {
481-
if (!typeIt->value.IsString())
482-
return absl::InvalidArgumentError("response_format.type is not a string");
483-
if (std::string(typeIt->value.GetString()) != "json_schema") {
484-
return absl::InvalidArgumentError("response_format.type can be only json_schema");
485-
} else {
486-
auto jsonSchemaIt = responseFormat.FindMember("json_schema");
487-
if (jsonSchemaIt != responseFormat.MemberEnd()) {
488-
if (!jsonSchemaIt->value.IsObject())
489-
return absl::InvalidArgumentError("response_format.json_schema is not an object");
490-
auto jsonSchema = jsonSchemaIt->value.GetObject();
491-
auto schemaIt = jsonSchema.FindMember("schema");
492-
if (schemaIt == jsonSchema.MemberEnd())
493-
return absl::InvalidArgumentError("response_format.json_schema.schema is missing");
494-
if (!schemaIt->value.IsObject())
495-
return absl::InvalidArgumentError("response_format.json_schema.schema is not an object");
496-
// Convert schema value to a JSON string and assign to optional string responseSchema
497-
StringBuffer schemaBuffer;
498-
Writer<StringBuffer> schemaWriter(schemaBuffer);
499-
schemaIt->value.Accept(schemaWriter);
500-
request.responseSchema = std::make_optional<std::string>(schemaBuffer.GetString());
501-
} else {
502-
return absl::InvalidArgumentError("response_format.json_schema is missing");
503-
}
504-
}
505-
} else {
506-
return absl::InvalidArgumentError("response_format.type is missing");
507-
}
519+
const rapidjson::Value& responseFormat = it->value;
520+
request.responseFormat = convertOpenAIResponseFormatToStructuralTagStringFormat(responseFormat);
508521
}
509522

510523
return absl::OkStatus();

src/llm/apis/openai_completions.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ class OpenAIChatCompletionsHandler {
103103
// User input might be modified by the servable logic, so it is not const
104104
ov::genai::ChatHistory& getChatHistory();
105105
std::optional<int> getMaxTokens() const;
106-
std::optional<std::string> getResponseSchema() const;
106+
std::optional<std::string> getResponseFormat() const;
107107

108108
bool isStream() const;
109109
std::string getModel() const;

src/llm/apis/openai_request.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,8 @@ struct OpenAIChatCompletionsRequest {
7474
std::optional<uint32_t> maxModelLength;
7575

7676
// Guided generation specific
77-
// Schema for response_format handling
78-
std::optional<std::string> responseSchema{std::nullopt};
77+
// String representation of response format object
78+
std::optional<std::string> responseFormat{std::nullopt};
7979
// Map that holds tool names and schemas for their arguments
8080
ToolsSchemas_t toolNameSchemaMap;
8181
// Holds value for tool_choice field as described in https://platform.openai.com/docs/api-reference/chat/create#chat_create-tool_choice

src/llm/io_processing/base_generation_config_builder.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,12 @@
2020
#include "base_generation_config_builder.hpp"
2121

2222
namespace ovms {
23-
void BaseGenerationConfigBuilder::setStructuralTagsConfig(const ov::genai::StructuralTagsConfig& structuralTagsConfig) {
23+
void BaseGenerationConfigBuilder::setStructuralTagsConfig(const ov::genai::StructuredOutputConfig::StructuralTag& structuralTag) {
2424
if (config.structured_output_config) {
25-
config.structured_output_config->structural_tags_config = structuralTagsConfig;
25+
config.structured_output_config->structural_tags_config = structuralTag;
2626
} else {
2727
ov::genai::StructuredOutputConfig structuredOutputConfig;
28-
structuredOutputConfig.structural_tags_config = structuralTagsConfig;
28+
structuredOutputConfig.structural_tags_config = structuralTag;
2929
config.structured_output_config = structuredOutputConfig;
3030
}
3131
}
@@ -104,9 +104,9 @@ void BaseGenerationConfigBuilder::parseConfigFromRequest(const OpenAIChatComplet
104104
config.max_ngram_size = request.maxNgramSize.value();
105105

106106
// Response format handling
107-
if (request.responseSchema.has_value()) {
107+
if (request.responseFormat.has_value()) {
108108
ov::genai::StructuredOutputConfig structuredOutputConfig;
109-
structuredOutputConfig.json_schema = request.responseSchema.value();
109+
structuredOutputConfig.structural_tags_config = request.responseFormat.value();
110110
config.structured_output_config = structuredOutputConfig;
111111
config.stop_strings.insert("#");
112112
}

src/llm/io_processing/base_generation_config_builder.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ namespace ovms {
3131
class BaseGenerationConfigBuilder {
3232
protected:
3333
ov::genai::GenerationConfig config;
34-
void setStructuralTagsConfig(const ov::genai::StructuralTagsConfig& structuralTagsConfig);
34+
void setStructuralTagsConfig(const ov::genai::StructuredOutputConfig::StructuralTag& structuralTag);
3535

3636
public:
3737
BaseGenerationConfigBuilder() = delete;

src/llm/io_processing/hermes3/generation_config_builder.cpp

Lines changed: 10 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
// limitations under the License.
1515
//*****************************************************************************
1616

17+
#include <memory>
1718
#include <string>
1819
#include <utility>
1920
#include <openvino/genai/generation_config.hpp>
@@ -33,33 +34,19 @@ void Hermes3GenerationConfigBuilder::parseConfigFromRequest(const OpenAIChatComp
3334
}
3435

3536
// Set tool guided generation config specific to Hermes3 and Qwen3 models
36-
ov::genai::StructuralTagsConfig structuralTagsConfig;
37-
static const std::string toolCallTrigger = "<tool_call>";
38-
structuralTagsConfig.triggers.push_back(toolCallTrigger);
37+
auto triggeredTags = std::make_shared<ov::genai::StructuredOutputConfig::TriggeredTags>();
38+
triggeredTags->triggers.push_back("<tool_call>");
3939

4040
for (const auto& [toolName, toolSchemaWrapper] : request.toolNameSchemaMap) {
4141
const auto& toolSchema = toolSchemaWrapper.stringRepr;
42-
ov::genai::StructuralTagItem tagItem;
43-
tagItem.begin = toolCallTrigger;
44-
tagItem.schema = R"({
45-
"type": "object",
46-
"properties": {
47-
"name": {
48-
"type": "string",
49-
"enum": [")" +
50-
toolName + R"("]
51-
},
52-
"arguments": )" +
53-
toolSchema + R"(
54-
},
55-
"required": [
56-
"name",
57-
"arguments"
58-
]
59-
})";
60-
structuralTagsConfig.structural_tags.push_back(tagItem);
42+
ov::genai::StructuredOutputConfig::Tag tagItem;
43+
tagItem.begin = "<tool_call>\n{\"name\": \"" + toolName + "\", \"arguments\": ";
44+
tagItem.end = "}\n</tool_call>";
45+
tagItem.content = ov::genai::StructuredOutputConfig::JSONSchema(toolSchema);
46+
triggeredTags->tags.push_back(tagItem);
6147
}
62-
setStructuralTagsConfig(structuralTagsConfig);
48+
ov::genai::StructuredOutputConfig::StructuralTag structuralTag = triggeredTags;
49+
setStructuralTagsConfig(structuralTag);
6350
}
6451

6552
} // namespace ovms

src/llm/io_processing/llama3/generation_config_builder.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
// limitations under the License.
1515
//*****************************************************************************
1616

17+
#include <memory>
1718
#include <string>
1819
#include <utility>
1920
#include <openvino/genai/generation_config.hpp>
@@ -33,20 +34,19 @@ void Llama3GenerationConfigBuilder::parseConfigFromRequest(const OpenAIChatCompl
3334
}
3435

3536
// Set tool guided generation config specific to Llama-3 model
36-
ov::genai::StructuralTagsConfig structuralTagsConfig;
37-
static const std::string beginOfToolsString = "<|python_tag|>";
38-
structuralTagsConfig.triggers.push_back(beginOfToolsString);
37+
auto triggeredTags = std::make_shared<ov::genai::StructuredOutputConfig::TriggeredTags>();
38+
triggeredTags->triggers.push_back("{\"name\":");
3939

4040
for (const auto& [toolName, toolSchemaWrapper] : request.toolNameSchemaMap) {
4141
const auto& toolSchema = toolSchemaWrapper.stringRepr;
42-
ov::genai::StructuralTagItem tagItem;
43-
std::string toolCallTrigger = "{\"name\": \"" + toolName + "\", \"parameters\": ";
44-
structuralTagsConfig.triggers.push_back(toolCallTrigger);
45-
tagItem.begin = toolCallTrigger;
46-
tagItem.schema = toolSchema;
47-
structuralTagsConfig.structural_tags.push_back(tagItem);
42+
ov::genai::StructuredOutputConfig::Tag tagItem;
43+
tagItem.begin = "{\"name\": \"" + toolName + "\", \"parameters\": ";
44+
tagItem.end = "}";
45+
tagItem.content = ov::genai::StructuredOutputConfig::JSONSchema(toolSchema);
46+
triggeredTags->tags.push_back(tagItem);
4847
}
49-
setStructuralTagsConfig(structuralTagsConfig);
48+
ov::genai::StructuredOutputConfig::StructuralTag structuralTag = triggeredTags;
49+
setStructuralTagsConfig(structuralTag);
5050
}
5151

5252
} // namespace ovms

src/llm/io_processing/phi4/generation_config_builder.cpp

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
// limitations under the License.
1515
//*****************************************************************************
1616

17+
#include <memory>
1718
#include <string>
1819
#include <utility>
1920
#include <openvino/genai/generation_config.hpp>
@@ -34,10 +35,11 @@ void Phi4GenerationConfigBuilder::parseConfigFromRequest(const OpenAIChatComplet
3435

3536
// Set tool guided generation config specific to Phi-4 model as described in template from:
3637
// https://github.com/vllm-project/vllm/blob/v0.9.2/examples/tool_chat_template_phi4_mini.jinja
37-
ov::genai::StructuralTagsConfig structuralTagsConfig;
38+
3839
static const std::string beginOfToolsString = "functools";
39-
structuralTagsConfig.triggers.push_back(beginOfToolsString);
40-
ov::genai::StructuralTagItem tagItem;
40+
auto triggeredTags = std::make_shared<ov::genai::StructuredOutputConfig::TriggeredTags>();
41+
triggeredTags->triggers.push_back(beginOfToolsString);
42+
ov::genai::StructuredOutputConfig::Tag tagItem;
4143
tagItem.begin = beginOfToolsString;
4244

4345
// Build the "anyOf" array for each tool
@@ -68,16 +70,18 @@ void Phi4GenerationConfigBuilder::parseConfigFromRequest(const OpenAIChatComplet
6870
}
6971
anyOfArray += "]";
7072

71-
tagItem.schema = R"({
73+
std::string schema = R"({
7274
"type": "array",
7375
"items": {
7476
"anyOf": )" +
75-
anyOfArray + R"(
77+
anyOfArray + R"(
7678
}
7779
})";
7880

79-
structuralTagsConfig.structural_tags.push_back(tagItem);
80-
setStructuralTagsConfig(structuralTagsConfig);
81+
tagItem.content = ov::genai::StructuredOutputConfig::JSONSchema(schema);
82+
triggeredTags->tags.push_back(tagItem);
83+
ov::genai::StructuredOutputConfig::StructuralTag structuralTag = triggeredTags;
84+
setStructuralTagsConfig(structuralTag);
8185
}
8286

8387
} // namespace ovms

src/test/http_openai_handler_test.cpp

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1200,14 +1200,25 @@ TEST_F(HttpOpenAIHandlerParsingTest, responseFormatValid) {
12001200
})";
12011201
doc.Parse(json.c_str());
12021202
ASSERT_FALSE(doc.HasParseError());
1203-
std::string expectedReponseFormatSchema = R"({"type":"object","properties":{"text":{"type":"string"}},"required":["text"]})";
1203+
// Response format is converted from OpenAI compatible to XGrammar compatible
1204+
std::string expectedResponseFormat = R"({"type":"structural_tag","format":{"type":"json_schema","json_schema":{"type":"object","properties":{"text":{"type":"string"}},"required":["text"]}}})";
12041205
uint32_t bestOfLimit = 0;
12051206
uint32_t maxTokensLimit = 30;
12061207
std::optional<uint32_t> maxModelLength;
12071208
std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
12081209
EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
1209-
EXPECT_TRUE(apiHandler->getResponseSchema().has_value());
1210-
EXPECT_EQ(apiHandler->getResponseSchema().value(), expectedReponseFormatSchema);
1210+
EXPECT_TRUE(apiHandler->getResponseFormat().has_value());
1211+
1212+
// Compare JSONs
1213+
rapidjson::Document expectedDoc;
1214+
expectedDoc.Parse(expectedResponseFormat.c_str());
1215+
ASSERT_FALSE(expectedDoc.HasParseError());
1216+
1217+
rapidjson::Document actualDoc;
1218+
actualDoc.Parse(apiHandler->getResponseFormat().value().c_str());
1219+
ASSERT_FALSE(actualDoc.HasParseError());
1220+
1221+
EXPECT_TRUE(expectedDoc == actualDoc);
12111222
}
12121223

12131224
TEST_F(HttpOpenAIHandlerParsingTest, responseFormatMissingSchema) {
@@ -1230,7 +1241,8 @@ TEST_F(HttpOpenAIHandlerParsingTest, responseFormatMissingSchema) {
12301241
uint32_t maxTokensLimit = 10;
12311242
std::optional<uint32_t> maxModelLength;
12321243
std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
1233-
EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("response_format.json_schema is not an object"));
1244+
// Response format content is not validated by OVMS. Any error would be raised by XGrammar during generation config validation which happens after request parsing.
1245+
EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
12341246
}
12351247

12361248
TEST_F(HttpOpenAIHandlerParsingTest, responseFormatNullValue) {
@@ -1251,5 +1263,5 @@ TEST_F(HttpOpenAIHandlerParsingTest, responseFormatNullValue) {
12511263
std::optional<uint32_t> maxModelLength;
12521264
std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
12531265
EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
1254-
EXPECT_FALSE(apiHandler->getResponseSchema().has_value());
1266+
EXPECT_FALSE(apiHandler->getResponseFormat().has_value());
12551267
}

0 commit comments

Comments
 (0)