Skip to content

Commit 74d4416

Browse files
authored
Use new structural output API (#3729)
1 parent 7eb1ce5 commit 74d4416

11 files changed

+165
-87
lines changed

docs/model_server_rest_api_chat.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ Some parameters, especially related to sampling (like `temperature`, `top_p` etc
166166
| logprobs | ⚠️ ||| bool (default: `false`) | Include the log probabilities of the returned output tokens. **_In stream mode logprobs are not returned; only info about selected tokens is returned._** |
167167
| tools |||| array | A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of functions the model may generate JSON inputs for. See [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create#chat-create-tools) for more details. |
168168
| tool_choice |||| string or object | Controls which (if any) tool is called by the model. `none` means the model will not call any tool and instead generates a message. `auto` means the model can pick between generating a message or calling one or more tools. `required` means that model should call at least one tool. Specifying a particular tool via `{"type": "function", "function": {"name": "my_function"}}` forces the model to call that tool. See [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice) for more details. |
169-
| response_format |||| object | An object specifying the format that the model must output. Setting to { "type": "json_schema", "json_schema": {...} } enables Structured Outputs which ensures the model will match your supplied JSON schema. Learn more in the [Structured Outputs demo](../demos/continuous_batching/structured_output/README.md). **Note** that if model server fails to load the schema, the request will still be processed, but the schema will not be applied. |
169+
| response_format |||| object | An object specifying the format that the model must output. Setting to `{ "type": "json_schema", "json_schema": {...} }` enables Structured Outputs which ensures the model will match your supplied JSON schema according to [OpenAI reference](https://platform.openai.com/docs/api-reference/chat/create#chat-create-response_format). Learn more in the [Structured Outputs demo](../demos/continuous_batching/structured_output/README.md). Additionally, `response_format` can accept [XGrammar structural tags format](https://github.com/mlc-ai/xgrammar/blob/main/docs/tutorials/structural_tag.md#format-types) (not part of OpenAI API). For example: `{ "type": "const_string", "value": "Hello World!" }`. **Note** that if model server fails to process the format, the request will still be processed, but the format will not be imposed. |
170170
| chat_template_kwargs |||| object | Enables passing additional parameters to chat template engine. Example `{"enable_thinking": false}`. Note that values like `messages`, `eos_token`, `bos_token` etc. are provided natively to the template engine, so including them in `chat_template_kwargs` will cause error. **Effective only in configuration with Python support**. |
171171

172172
#### Beam search sampling specific

src/llm/apis/openai_completions.cpp

Lines changed: 45 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -427,8 +427,49 @@ std::optional<int> OpenAIChatCompletionsHandler::getMaxTokens() const {
427427
return request.maxTokens;
428428
}
429429

430-
std::optional<std::string> OpenAIChatCompletionsHandler::getResponseSchema() const {
431-
return request.responseSchema;
430+
std::optional<std::string> OpenAIChatCompletionsHandler::getResponseFormat() const {
431+
return request.responseFormat;
432+
}
433+
434+
std::string convertOpenAIResponseFormatToStructuralTagStringFormat(const rapidjson::Value& openAIFormat) {
435+
// Build the new object: {"type": "structural_tag", "format": <openAIFormat>}
436+
// If response_format has {"json_schema": {"schema": {...}}}, flatten it to {"json_schema": {...}}
437+
rapidjson::Document flatFormatDoc;
438+
flatFormatDoc.CopyFrom(openAIFormat, flatFormatDoc.GetAllocator());
439+
440+
if (flatFormatDoc.HasMember("json_schema") && flatFormatDoc["json_schema"].IsObject()) {
441+
auto& jsonSchema = flatFormatDoc["json_schema"];
442+
if (jsonSchema.HasMember("schema") && jsonSchema["schema"].IsObject()) {
443+
// Move all members from "schema" to "json_schema"
444+
rapidjson::Value schemaObjCopy;
445+
schemaObjCopy.CopyFrom(jsonSchema["schema"], flatFormatDoc.GetAllocator()); // Make a copy as we will modify jsonSchema
446+
for (auto itr = schemaObjCopy.MemberBegin(); itr != schemaObjCopy.MemberEnd(); ++itr) {
447+
rapidjson::Value key;
448+
key.CopyFrom(itr->name, flatFormatDoc.GetAllocator());
449+
rapidjson::Value value;
450+
value.CopyFrom(itr->value, flatFormatDoc.GetAllocator());
451+
jsonSchema.AddMember(key, value, flatFormatDoc.GetAllocator());
452+
}
453+
// Remove the "schema" member
454+
jsonSchema.RemoveMember("schema");
455+
}
456+
}
457+
458+
// Serialize the flattened response_format object
459+
rapidjson::StringBuffer formatBuffer;
460+
rapidjson::Writer<rapidjson::StringBuffer> formatWriter(formatBuffer);
461+
flatFormatDoc.Accept(formatWriter);
462+
463+
// Build the new object: {"type": "structural_tag", "format": <flattened>}
464+
rapidjson::StringBuffer buffer;
465+
rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
466+
writer.StartObject();
467+
writer.Key("type");
468+
writer.String("structural_tag");
469+
writer.Key("format");
470+
writer.RawValue(formatBuffer.GetString(), formatBuffer.GetSize(), rapidjson::kObjectType);
471+
writer.EndObject();
472+
return buffer.GetString();
432473
}
433474

434475
absl::Status OpenAIChatCompletionsHandler::parseChatCompletionsPart(std::optional<uint32_t> maxTokensLimit, std::optional<std::string> allowedLocalMediaPath) {
@@ -475,36 +516,8 @@ absl::Status OpenAIChatCompletionsHandler::parseChatCompletionsPart(std::optiona
475516
return absl::OkStatus();
476517
if (!it->value.IsObject())
477518
return absl::InvalidArgumentError("response_format is not an object");
478-
auto responseFormat = it->value.GetObject();
479-
auto typeIt = responseFormat.FindMember("type");
480-
if (typeIt != responseFormat.MemberEnd()) {
481-
if (!typeIt->value.IsString())
482-
return absl::InvalidArgumentError("response_format.type is not a string");
483-
if (std::string(typeIt->value.GetString()) != "json_schema") {
484-
return absl::InvalidArgumentError("response_format.type can be only json_schema");
485-
} else {
486-
auto jsonSchemaIt = responseFormat.FindMember("json_schema");
487-
if (jsonSchemaIt != responseFormat.MemberEnd()) {
488-
if (!jsonSchemaIt->value.IsObject())
489-
return absl::InvalidArgumentError("response_format.json_schema is not an object");
490-
auto jsonSchema = jsonSchemaIt->value.GetObject();
491-
auto schemaIt = jsonSchema.FindMember("schema");
492-
if (schemaIt == jsonSchema.MemberEnd())
493-
return absl::InvalidArgumentError("response_format.json_schema.schema is missing");
494-
if (!schemaIt->value.IsObject())
495-
return absl::InvalidArgumentError("response_format.json_schema.schema is not an object");
496-
// Convert schema value to a JSON string and assign to optional string responseSchema
497-
StringBuffer schemaBuffer;
498-
Writer<StringBuffer> schemaWriter(schemaBuffer);
499-
schemaIt->value.Accept(schemaWriter);
500-
request.responseSchema = std::make_optional<std::string>(schemaBuffer.GetString());
501-
} else {
502-
return absl::InvalidArgumentError("response_format.json_schema is missing");
503-
}
504-
}
505-
} else {
506-
return absl::InvalidArgumentError("response_format.type is missing");
507-
}
519+
const rapidjson::Value& responseFormat = it->value;
520+
request.responseFormat = convertOpenAIResponseFormatToStructuralTagStringFormat(responseFormat);
508521
}
509522

510523
return absl::OkStatus();

src/llm/apis/openai_completions.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ class OpenAIChatCompletionsHandler {
103103
// User input might be modified by the servable logic, so it is not const
104104
ov::genai::ChatHistory& getChatHistory();
105105
std::optional<int> getMaxTokens() const;
106-
std::optional<std::string> getResponseSchema() const;
106+
std::optional<std::string> getResponseFormat() const;
107107

108108
bool isStream() const;
109109
std::string getModel() const;

src/llm/apis/openai_request.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,8 @@ struct OpenAIChatCompletionsRequest {
7474
std::optional<uint32_t> maxModelLength;
7575

7676
// Guided generation specific
77-
// Schema for response_format handling
78-
std::optional<std::string> responseSchema{std::nullopt};
77+
// String representation of response format object
78+
std::optional<std::string> responseFormat{std::nullopt};
7979
// Map that holds tool names and schemas for their arguments
8080
ToolsSchemas_t toolNameSchemaMap;
8181
// Holds value for tool_choice field as described in https://platform.openai.com/docs/api-reference/chat/create#chat_create-tool_choice

src/llm/io_processing/base_generation_config_builder.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,12 @@
2020
#include "base_generation_config_builder.hpp"
2121

2222
namespace ovms {
23-
void BaseGenerationConfigBuilder::setStructuralTagsConfig(const ov::genai::StructuralTagsConfig& structuralTagsConfig) {
23+
void BaseGenerationConfigBuilder::setStructuralTagsConfig(const ov::genai::StructuredOutputConfig::StructuralTag& structuralTag) {
2424
if (config.structured_output_config) {
25-
config.structured_output_config->structural_tags_config = structuralTagsConfig;
25+
config.structured_output_config->structural_tags_config = structuralTag;
2626
} else {
2727
ov::genai::StructuredOutputConfig structuredOutputConfig;
28-
structuredOutputConfig.structural_tags_config = structuralTagsConfig;
28+
structuredOutputConfig.structural_tags_config = structuralTag;
2929
config.structured_output_config = structuredOutputConfig;
3030
}
3131
}
@@ -104,9 +104,9 @@ void BaseGenerationConfigBuilder::parseConfigFromRequest(const OpenAIChatComplet
104104
config.max_ngram_size = request.maxNgramSize.value();
105105

106106
// Response format handling
107-
if (request.responseSchema.has_value()) {
107+
if (request.responseFormat.has_value()) {
108108
ov::genai::StructuredOutputConfig structuredOutputConfig;
109-
structuredOutputConfig.json_schema = request.responseSchema.value();
109+
structuredOutputConfig.structural_tags_config = request.responseFormat.value();
110110
config.structured_output_config = structuredOutputConfig;
111111
config.stop_strings.insert("#");
112112
}

src/llm/io_processing/base_generation_config_builder.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ namespace ovms {
3131
class BaseGenerationConfigBuilder {
3232
protected:
3333
ov::genai::GenerationConfig config;
34-
void setStructuralTagsConfig(const ov::genai::StructuralTagsConfig& structuralTagsConfig);
34+
void setStructuralTagsConfig(const ov::genai::StructuredOutputConfig::StructuralTag& structuralTag);
3535

3636
public:
3737
BaseGenerationConfigBuilder() = delete;

src/llm/io_processing/hermes3/generation_config_builder.cpp

Lines changed: 10 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
// limitations under the License.
1515
//*****************************************************************************
1616

17+
#include <memory>
1718
#include <string>
1819
#include <utility>
1920
#include <openvino/genai/generation_config.hpp>
@@ -33,33 +34,19 @@ void Hermes3GenerationConfigBuilder::parseConfigFromRequest(const OpenAIChatComp
3334
}
3435

3536
// Set tool guided generation config specific to Hermes3 and Qwen3 models
36-
ov::genai::StructuralTagsConfig structuralTagsConfig;
37-
static const std::string toolCallTrigger = "<tool_call>";
38-
structuralTagsConfig.triggers.push_back(toolCallTrigger);
37+
auto triggeredTags = std::make_shared<ov::genai::StructuredOutputConfig::TriggeredTags>();
38+
triggeredTags->triggers.push_back("<tool_call>");
3939

4040
for (const auto& [toolName, toolSchemaWrapper] : request.toolNameSchemaMap) {
4141
const auto& toolSchema = toolSchemaWrapper.stringRepr;
42-
ov::genai::StructuralTagItem tagItem;
43-
tagItem.begin = toolCallTrigger;
44-
tagItem.schema = R"({
45-
"type": "object",
46-
"properties": {
47-
"name": {
48-
"type": "string",
49-
"enum": [")" +
50-
toolName + R"("]
51-
},
52-
"arguments": )" +
53-
toolSchema + R"(
54-
},
55-
"required": [
56-
"name",
57-
"arguments"
58-
]
59-
})";
60-
structuralTagsConfig.structural_tags.push_back(tagItem);
42+
ov::genai::StructuredOutputConfig::Tag tagItem;
43+
tagItem.begin = "<tool_call>\n{\"name\": \"" + toolName + "\", \"arguments\": ";
44+
tagItem.end = "}\n</tool_call>";
45+
tagItem.content = ov::genai::StructuredOutputConfig::JSONSchema(toolSchema);
46+
triggeredTags->tags.push_back(tagItem);
6147
}
62-
setStructuralTagsConfig(structuralTagsConfig);
48+
ov::genai::StructuredOutputConfig::StructuralTag structuralTag = triggeredTags;
49+
setStructuralTagsConfig(structuralTag);
6350
}
6451

6552
} // namespace ovms

src/llm/io_processing/llama3/generation_config_builder.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
// limitations under the License.
1515
//*****************************************************************************
1616

17+
#include <memory>
1718
#include <string>
1819
#include <utility>
1920
#include <openvino/genai/generation_config.hpp>
@@ -33,20 +34,19 @@ void Llama3GenerationConfigBuilder::parseConfigFromRequest(const OpenAIChatCompl
3334
}
3435

3536
// Set tool guided generation config specific to Llama-3 model
36-
ov::genai::StructuralTagsConfig structuralTagsConfig;
37-
static const std::string beginOfToolsString = "<|python_tag|>";
38-
structuralTagsConfig.triggers.push_back(beginOfToolsString);
37+
auto triggeredTags = std::make_shared<ov::genai::StructuredOutputConfig::TriggeredTags>();
38+
triggeredTags->triggers.push_back("{\"name\":");
3939

4040
for (const auto& [toolName, toolSchemaWrapper] : request.toolNameSchemaMap) {
4141
const auto& toolSchema = toolSchemaWrapper.stringRepr;
42-
ov::genai::StructuralTagItem tagItem;
43-
std::string toolCallTrigger = "{\"name\": \"" + toolName + "\", \"parameters\": ";
44-
structuralTagsConfig.triggers.push_back(toolCallTrigger);
45-
tagItem.begin = toolCallTrigger;
46-
tagItem.schema = toolSchema;
47-
structuralTagsConfig.structural_tags.push_back(tagItem);
42+
ov::genai::StructuredOutputConfig::Tag tagItem;
43+
tagItem.begin = "{\"name\": \"" + toolName + "\", \"parameters\": ";
44+
tagItem.end = "}";
45+
tagItem.content = ov::genai::StructuredOutputConfig::JSONSchema(toolSchema);
46+
triggeredTags->tags.push_back(tagItem);
4847
}
49-
setStructuralTagsConfig(structuralTagsConfig);
48+
ov::genai::StructuredOutputConfig::StructuralTag structuralTag = triggeredTags;
49+
setStructuralTagsConfig(structuralTag);
5050
}
5151

5252
} // namespace ovms

src/llm/io_processing/phi4/generation_config_builder.cpp

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
// limitations under the License.
1515
//*****************************************************************************
1616

17+
#include <memory>
1718
#include <string>
1819
#include <utility>
1920
#include <openvino/genai/generation_config.hpp>
@@ -34,10 +35,11 @@ void Phi4GenerationConfigBuilder::parseConfigFromRequest(const OpenAIChatComplet
3435

3536
// Set tool guided generation config specific to Phi-4 model as described in template from:
3637
// https://github.com/vllm-project/vllm/blob/v0.9.2/examples/tool_chat_template_phi4_mini.jinja
37-
ov::genai::StructuralTagsConfig structuralTagsConfig;
38+
3839
static const std::string beginOfToolsString = "functools";
39-
structuralTagsConfig.triggers.push_back(beginOfToolsString);
40-
ov::genai::StructuralTagItem tagItem;
40+
auto triggeredTags = std::make_shared<ov::genai::StructuredOutputConfig::TriggeredTags>();
41+
triggeredTags->triggers.push_back(beginOfToolsString);
42+
ov::genai::StructuredOutputConfig::Tag tagItem;
4143
tagItem.begin = beginOfToolsString;
4244

4345
// Build the "anyOf" array for each tool
@@ -68,16 +70,18 @@ void Phi4GenerationConfigBuilder::parseConfigFromRequest(const OpenAIChatComplet
6870
}
6971
anyOfArray += "]";
7072

71-
tagItem.schema = R"({
73+
std::string schema = R"({
7274
"type": "array",
7375
"items": {
7476
"anyOf": )" +
75-
anyOfArray + R"(
77+
anyOfArray + R"(
7678
}
7779
})";
7880

79-
structuralTagsConfig.structural_tags.push_back(tagItem);
80-
setStructuralTagsConfig(structuralTagsConfig);
81+
tagItem.content = ov::genai::StructuredOutputConfig::JSONSchema(schema);
82+
triggeredTags->tags.push_back(tagItem);
83+
ov::genai::StructuredOutputConfig::StructuralTag structuralTag = triggeredTags;
84+
setStructuralTagsConfig(structuralTag);
8185
}
8286

8387
} // namespace ovms

src/test/http_openai_handler_test.cpp

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1200,14 +1200,25 @@ TEST_F(HttpOpenAIHandlerParsingTest, responseFormatValid) {
12001200
})";
12011201
doc.Parse(json.c_str());
12021202
ASSERT_FALSE(doc.HasParseError());
1203-
std::string expectedReponseFormatSchema = R"({"type":"object","properties":{"text":{"type":"string"}},"required":["text"]})";
1203+
// Response format is converted from OpenAI compatible to XGrammar compatible
1204+
std::string expectedResponseFormat = R"({"type":"structural_tag","format":{"type":"json_schema","json_schema":{"type":"object","properties":{"text":{"type":"string"}},"required":["text"]}}})";
12041205
uint32_t bestOfLimit = 0;
12051206
uint32_t maxTokensLimit = 30;
12061207
std::optional<uint32_t> maxModelLength;
12071208
std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
12081209
EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
1209-
EXPECT_TRUE(apiHandler->getResponseSchema().has_value());
1210-
EXPECT_EQ(apiHandler->getResponseSchema().value(), expectedReponseFormatSchema);
1210+
EXPECT_TRUE(apiHandler->getResponseFormat().has_value());
1211+
1212+
// Compare JSONs
1213+
rapidjson::Document expectedDoc;
1214+
expectedDoc.Parse(expectedResponseFormat.c_str());
1215+
ASSERT_FALSE(expectedDoc.HasParseError());
1216+
1217+
rapidjson::Document actualDoc;
1218+
actualDoc.Parse(apiHandler->getResponseFormat().value().c_str());
1219+
ASSERT_FALSE(actualDoc.HasParseError());
1220+
1221+
EXPECT_TRUE(expectedDoc == actualDoc);
12111222
}
12121223

12131224
TEST_F(HttpOpenAIHandlerParsingTest, responseFormatMissingSchema) {
@@ -1230,7 +1241,8 @@ TEST_F(HttpOpenAIHandlerParsingTest, responseFormatMissingSchema) {
12301241
uint32_t maxTokensLimit = 10;
12311242
std::optional<uint32_t> maxModelLength;
12321243
std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
1233-
EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("response_format.json_schema is not an object"));
1244+
// Response format content is not validated by OVMS. Any error would be raised by XGrammar during generation config validation which happens after request parsing.
1245+
EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
12341246
}
12351247

12361248
TEST_F(HttpOpenAIHandlerParsingTest, responseFormatNullValue) {
@@ -1251,5 +1263,5 @@ TEST_F(HttpOpenAIHandlerParsingTest, responseFormatNullValue) {
12511263
std::optional<uint32_t> maxModelLength;
12521264
std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
12531265
EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
1254-
EXPECT_FALSE(apiHandler->getResponseSchema().has_value());
1266+
EXPECT_FALSE(apiHandler->getResponseFormat().has_value());
12551267
}

0 commit comments

Comments
 (0)