openvinotoolkit
diff --git a/src/llm/io_processing/base_generation_config_builder.hpp b/src/llm/io_processing/base_generation_config_builder.hpp
Lines changed: 4 additions & 2 deletions
diff --git a/src/llm/io_processing/base_output_parser.cpp b/src/llm/io_processing/base_output_parser.cpp
Lines changed: 0 additions & 8 deletions
diff --git a/src/llm/io_processing/base_output_parser.hpp b/src/llm/io_processing/base_output_parser.hpp
Lines changed: 0 additions & 8 deletions
diff --git a/src/llm/io_processing/generation_config_builder.hpp b/src/llm/io_processing/generation_config_builder.hpp
Lines changed: 9 additions & 12 deletions
diff --git a/src/llm/io_processing/hermes3/generation_config_builder.cpp b/src/llm/io_processing/hermes3/generation_config_builder.cpp
Lines changed: 18 additions & 13 deletions
diff --git a/src/llm/io_processing/hermes3/generation_config_builder.hpp b/src/llm/io_processing/hermes3/generation_config_builder.hpp
Lines changed: 2 additions & 2 deletions
diff --git a/src/llm/io_processing/hermes3/tool_parser.cpp b/src/llm/io_processing/hermes3/tool_parser.cpp
Lines changed: 2 additions & 21 deletions
diff --git a/src/llm/io_processing/llama3/generation_config_builder.cpp b/src/llm/io_processing/llama3/generation_config_builder.cpp
Lines changed: 18 additions & 13 deletions
diff --git a/src/llm/io_processing/llama3/generation_config_builder.hpp b/src/llm/io_processing/llama3/generation_config_builder.hpp
Lines changed: 2 additions & 2 deletions
diff --git a/src/llm/io_processing/llama3/tool_parser.cpp b/src/llm/io_processing/llama3/tool_parser.cpp
Lines changed: 16 additions & 21 deletions
@@ -31,13 +31,15 @@ namespace ovms {
 class BaseGenerationConfigBuilder {
 protected:
     ov::genai::GenerationConfig config;
+    const bool enableToolGuidedGeneration;
     void setStructuralTagsConfig(const ov::genai::StructuredOutputConfig::StructuralTag& structuralTag);
 
 public:
     BaseGenerationConfigBuilder() = delete;
     // Initializes the builder with a base generation config read from model generation_config.json
-    explicit BaseGenerationConfigBuilder(ov::genai::GenerationConfig& baseConfig) :
-        config(baseConfig) {}
+    explicit BaseGenerationConfigBuilder(ov::genai::GenerationConfig& baseConfig, bool enableToolGuidedGeneration) :
+        config(baseConfig),
+        enableToolGuidedGeneration(enableToolGuidedGeneration) {}
     virtual ~BaseGenerationConfigBuilder() = default;
 
     ov::genai::GenerationConfig& getConfig() { return config; }
 
@@ -72,12 +72,4 @@ rapidjson::Document BaseOutputParser::wrapDelta(const rapidjson::Document& delta
     return wrappedDelta;
 }
 
-void BaseOutputParser::enableImmediateParsing() {
-    immediateParsingEnabled = true;
-}
-
-bool BaseOutputParser::isImmediateParsingEnabled() const {
-    return immediateParsingEnabled;
-}
-
 }  // namespace ovms
@@ -64,9 +64,6 @@ using ToolsParameterTypeMap_t = std::unordered_map<std::string, ParametersTypeMa
 class BaseOutputParser {
 protected:
     ov::genai::Tokenizer tokenizer;
-    // Flag indicating whether parsing start tag has been injected into the prompt
-    // if true, parser should assume start tag already appeared and start parsing immediately
-    bool immediateParsingEnabled = false;
 
 public:
     BaseOutputParser() = delete;
@@ -81,11 +78,6 @@ class BaseOutputParser {
     // {"tool_calls":[{"index":0,"function":<delta>}]}
     static rapidjson::Document wrapDelta(const rapidjson::Document& delta, int toolCallIndex);
 
-    // Calling this method should put parser into immediate parsing mode where it starts parsing immediately, without seeking the start tag.
-    void enableImmediateParsing();
-
-    bool isImmediateParsingEnabled() const;
-
     // --- Specialized output parsers interface ---
 
     // Parse model output and extract relevant information to parsedOutput fields. Raw generated tokens are provided as an argument.
 
@@ -34,24 +34,21 @@ class GenerationConfigBuilder {
 public:
     GenerationConfigBuilder() = delete;
     // Using tool parser name to select appropriate builder implementation to avoid introducing additional parameters. Might be insufficient in the future.
-    explicit GenerationConfigBuilder(ov::genai::GenerationConfig baseConfig, std::string toolParserName = "", bool enableToolGuidedGeneration = false) {
-        if (!enableToolGuidedGeneration) {
-            builder_impl = std::make_unique<BaseGenerationConfigBuilder>(baseConfig);
-            return;
-        }
-
+    explicit GenerationConfigBuilder(ov::genai::GenerationConfig baseConfig, bool enableToolGuidedGeneration, std::string toolParserName = "") {
         if (toolParserName == "llama3") {
-            builder_impl = std::make_unique<Llama3GenerationConfigBuilder>(baseConfig);
+            builder_impl = std::make_unique<Llama3GenerationConfigBuilder>(baseConfig, enableToolGuidedGeneration);
         } else if (toolParserName == "qwen3") {
             // Qwen3 and Hermes3 share the same mechanism for generating tool calls, so we can use Hermes3GenerationConfigBuilder
-            builder_impl = std::make_unique<Hermes3GenerationConfigBuilder>(baseConfig);
+            builder_impl = std::make_unique<Hermes3GenerationConfigBuilder>(baseConfig, enableToolGuidedGeneration);
         } else if (toolParserName == "hermes3") {
-            builder_impl = std::make_unique<Hermes3GenerationConfigBuilder>(baseConfig);
+            builder_impl = std::make_unique<Hermes3GenerationConfigBuilder>(baseConfig, enableToolGuidedGeneration);
         } else if (toolParserName == "phi4") {
-            builder_impl = std::make_unique<Phi4GenerationConfigBuilder>(baseConfig);
+            builder_impl = std::make_unique<Phi4GenerationConfigBuilder>(baseConfig, enableToolGuidedGeneration);
         } else {
-            SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Option enable_tool_guided_generation is set, but will not be effective since no valid tool parser has been provided.");
-            builder_impl = std::make_unique<BaseGenerationConfigBuilder>(baseConfig);
+            if (enableToolGuidedGeneration) {
+                SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Option enable_tool_guided_generation is set, but will not be effective since no valid tool parser has been provided.");
+            }
+            builder_impl = std::make_unique<BaseGenerationConfigBuilder>(baseConfig, enableToolGuidedGeneration);
         }
     }
 
 
@@ -33,20 +33,25 @@ void Hermes3GenerationConfigBuilder::parseConfigFromRequest(const OpenAIChatComp
         return;
     }
 
-    // Set tool guided generation config specific to Hermes3 and Qwen3 models
-    auto triggeredTags = std::make_shared<ov::genai::StructuredOutputConfig::TriggeredTags>();
-    triggeredTags->triggers.push_back("<tool_call>");
-
-    for (const auto& [toolName, toolSchemaWrapper] : request.toolNameSchemaMap) {
-        const auto& toolSchema = toolSchemaWrapper.stringRepr;
-        ov::genai::StructuredOutputConfig::Tag tagItem;
-        tagItem.begin = "<tool_call>\n{\"name\": \"" + toolName + "\", \"arguments\": ";
-        tagItem.end = "}\n</tool_call>";
-        tagItem.content = ov::genai::StructuredOutputConfig::JSONSchema(toolSchema);
-        triggeredTags->tags.push_back(tagItem);
+    if (enableToolGuidedGeneration || request.toolChoice == "required") {
+        // Set tool guided generation config specific to Hermes3 and Qwen3 models
+        auto triggeredTags = std::make_shared<ov::genai::StructuredOutputConfig::TriggeredTags>();
+        triggeredTags->triggers.push_back("<tool_call>");
+
+        for (const auto& [toolName, toolSchemaWrapper] : request.toolNameSchemaMap) {
+            const auto& toolSchema = toolSchemaWrapper.stringRepr;
+            ov::genai::StructuredOutputConfig::Tag tagItem;
+            tagItem.begin = "<tool_call>\n{\"name\": \"" + toolName + "\", \"arguments\": ";
+            tagItem.end = "}\n</tool_call>";
+            tagItem.content = ov::genai::StructuredOutputConfig::JSONSchema(toolSchema);
+            triggeredTags->tags.push_back(tagItem);
+        }
+        if (request.toolChoice == "required") {
+            triggeredTags->at_least_one = true;
+        }
+        ov::genai::StructuredOutputConfig::StructuralTag structuralTag = triggeredTags;
+        setStructuralTagsConfig(structuralTag);
     }
-    ov::genai::StructuredOutputConfig::StructuralTag structuralTag = triggeredTags;
-    setStructuralTagsConfig(structuralTag);
 }
 
 }  // namespace ovms
@@ -25,8 +25,8 @@ namespace ovms {
 class Hermes3GenerationConfigBuilder : public BaseGenerationConfigBuilder {
 public:
     Hermes3GenerationConfigBuilder() = delete;
-    explicit Hermes3GenerationConfigBuilder(ov::genai::GenerationConfig& baseConfig) :
-        BaseGenerationConfigBuilder(baseConfig) {}
+    explicit Hermes3GenerationConfigBuilder(ov::genai::GenerationConfig& baseConfig, bool enableToolGuidedGeneration) :
+        BaseGenerationConfigBuilder(baseConfig, enableToolGuidedGeneration) {}
 
     void parseConfigFromRequest(const OpenAIChatCompletionsRequest& request) override;
 };
 
@@ -123,27 +123,8 @@ void Hermes3ToolParser::parse(ParsedOutput& parsedOutput, const std::vector<int6
     size_t pos = 0;
     size_t firstToolCallPos;
 
-    // If immediate parsing is enabled, we assume tool calls start from the beginning of the content.
-    // Otherwise, we search for the first occurrence of the tool call start tag.
-    if (!immediateParsingEnabled) {
-        firstToolCallPos = parsedOutput.content.find(startTag, pos);
-    } else {
-        // Read first tool call without opening tag
-        firstToolCallPos = 0;
-        size_t end = parsedOutput.content.find(endTag, firstToolCallPos);
-        std::string tool;
-        if (end != std::string::npos) {
-            tool = parsedOutput.content.substr(0, end);
-            pos = end + endTag.length();
-        } else {
-            tool = parsedOutput.content;
-            pos = parsedOutput.content.length();
-        }
-        if (!tool.empty()) {
-            tools.push_back(tool);
-        }
-    }
-
+    // Save position of the first tool call start tag to properly clear content after parsing.
+    firstToolCallPos = parsedOutput.content.find(startTag, pos);
     while (true) {
         size_t start = parsedOutput.content.find(startTag, pos);
         if (start == std::string::npos) {
 
@@ -33,20 +33,25 @@ void Llama3GenerationConfigBuilder::parseConfigFromRequest(const OpenAIChatCompl
         return;
     }
 
-    // Set tool guided generation config specific to Llama-3 model
-    auto triggeredTags = std::make_shared<ov::genai::StructuredOutputConfig::TriggeredTags>();
-    triggeredTags->triggers.push_back("{\"name\":");
-
-    for (const auto& [toolName, toolSchemaWrapper] : request.toolNameSchemaMap) {
-        const auto& toolSchema = toolSchemaWrapper.stringRepr;
-        ov::genai::StructuredOutputConfig::Tag tagItem;
-        tagItem.begin = "{\"name\": \"" + toolName + "\", \"parameters\": ";
-        tagItem.end = "}";
-        tagItem.content = ov::genai::StructuredOutputConfig::JSONSchema(toolSchema);
-        triggeredTags->tags.push_back(tagItem);
+    if (enableToolGuidedGeneration || request.toolChoice == "required") {
+        // Set tool guided generation config specific to Llama-3 model
+        auto triggeredTags = std::make_shared<ov::genai::StructuredOutputConfig::TriggeredTags>();
+        triggeredTags->triggers.push_back("{\"name\":");
+
+        for (const auto& [toolName, toolSchemaWrapper] : request.toolNameSchemaMap) {
+            const auto& toolSchema = toolSchemaWrapper.stringRepr;
+            ov::genai::StructuredOutputConfig::Tag tagItem;
+            tagItem.begin = "{\"name\": \"" + toolName + "\", \"parameters\": ";
+            tagItem.end = "}";
+            tagItem.content = ov::genai::StructuredOutputConfig::JSONSchema(toolSchema);
+            triggeredTags->tags.push_back(tagItem);
+        }
+        if (request.toolChoice == "required") {
+            triggeredTags->at_least_one = true;
+        }
+        ov::genai::StructuredOutputConfig::StructuralTag structuralTag = triggeredTags;
+        setStructuralTagsConfig(structuralTag);
     }
-    ov::genai::StructuredOutputConfig::StructuralTag structuralTag = triggeredTags;
-    setStructuralTagsConfig(structuralTag);
 }
 
 }  // namespace ovms
@@ -25,8 +25,8 @@ namespace ovms {
 class Llama3GenerationConfigBuilder : public BaseGenerationConfigBuilder {
 public:
     Llama3GenerationConfigBuilder() = delete;
-    explicit Llama3GenerationConfigBuilder(ov::genai::GenerationConfig& baseConfig) :
-        BaseGenerationConfigBuilder(baseConfig) {}
+    explicit Llama3GenerationConfigBuilder(ov::genai::GenerationConfig& baseConfig, bool enableToolGuidedGeneration) :
+        BaseGenerationConfigBuilder(baseConfig, enableToolGuidedGeneration) {}
 
     void parseConfigFromRequest(const OpenAIChatCompletionsRequest& request) override;
 };
 
@@ -31,30 +31,25 @@ void Llama3ToolParser::parse(ParsedOutput& parsedOutput, const std::vector<int64
     // TODO: check if we can rely on decoded <|python_tag|> token to be present in the content, so we can drop multiple detokenizations and copies
     // and just extract substrings from the content and modify content in-place
 
-    // If immediate trigger parsing is enabled, we assume botTokenId has been injected into the prompt and whole output are tool calls,
-    // otherwise we search for botTokenId in the generatedTokens to find tool calls start or check if the content starts with "{" (llama3 sometimes does not generate botTokenId)
+    // We search for botTokenId in the generatedTokens to find tool calls start or check if the content starts with "{" (llama3 sometimes does not generate botTokenId)
     auto toolCallsStartPosition = generatedTokens.begin();
-    if (!immediateParsingEnabled) {
-        toolCallsStartPosition = generatedTokens.end();
-        // Find botTokenId in generated_ids
-        auto botTokenIt = std::find(generatedTokens.begin(), generatedTokens.end(), botTokenId);
+    toolCallsStartPosition = generatedTokens.end();
+    // Find botTokenId in generated_ids
+    auto botTokenIt = std::find(generatedTokens.begin(), generatedTokens.end(), botTokenId);
 
-        if (botTokenIt != generatedTokens.end()) {
-            // Decode the content before botTokenId
-            std::vector<int64_t> contentTokens(generatedTokens.begin(), botTokenIt);
-            parsedOutput.content = tokenizer.decode(contentTokens);
-            // Tokens after botTokenId will be treated as tool calls
-            toolCallsStartPosition = botTokenIt + 1;
-        } else {
-            // If botTokenId is not found, check if model output starts with "{" and if so, assume it's a tool call"
-            if (!parsedOutput.content.empty() && parsedOutput.content[0] == '{') {
-                // If model output starts with "{", treat it as a tool call
-                toolCallsStartPosition = generatedTokens.begin();
-                parsedOutput.content.clear();
-            }
-        }
+    if (botTokenIt != generatedTokens.end()) {
+        // Decode the content before botTokenId
+        std::vector<int64_t> contentTokens(generatedTokens.begin(), botTokenIt);
+        parsedOutput.content = tokenizer.decode(contentTokens);
+        // Tokens after botTokenId will be treated as tool calls
+        toolCallsStartPosition = botTokenIt + 1;
     } else {
-        parsedOutput.content.clear();
+        // If botTokenId is not found, check if model output starts with "{" and if so, assume it's a tool call
+        if (!parsedOutput.content.empty() && parsedOutput.content[0] == '{') {
+            // If model output starts with "{", treat it as a tool call
+            toolCallsStartPosition = generatedTokens.begin();
+            parsedOutput.content.clear();
+        }
     }
 
     if (toolCallsStartPosition != generatedTokens.end()) {