1 change: 0 additions & 1 deletion llm_bench/python/requirements.txt
@@ -1,7 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
numpy
--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
openvino
openvino-tokenizers
openvino_genai
auto-gptq>=0.5.1 # for gptq
109 changes: 96 additions & 13 deletions samples/cpp/chat_sample/chat_sample.cpp
@@ -2,19 +2,100 @@
// SPDX-License-Identifier: Apache-2.0

#include "openvino/genai/llm_pipeline.hpp"
#include <openvino/openvino.hpp>

int main(int argc, char* argv[]) try {
if (2 != argc) {
throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR>");
#ifdef _WIN32
#include <codecvt>
#include <fcntl.h>
#include <io.h>
#include <windows.h>
#endif

const std::string prompts[] =
{
"Hello there! How are you doing?",
"What is OpenVINO?",
"Who are you?",
"Can you explain to me briefly what is Python programming language?",
"Explain the plot of Cinderella in a sentence.",
"What are some common mistakes to avoid when writing code?",
"Write a 100-word blog post on “Benefits of Artificial Intelligence and OpenVINO“",
};


struct Args {
std::string model_path = "";
std::string device = "GPU";
int max_new_tokens = 100;
};

static void usage(const std::string& prog) {
std::cout << "Usage: " << prog << " [options]\n"
<< "\n"
<< "options:\n"
<< " -h, --help show this help message and exit\n"
<< " -m, --model PATH Chatglm OpenVINO model path (default: openvino_model.xml)\n"
<< " -d, --device Device (default: GPU)\n"
<< " --max_new_tokens max_new_tokens (default: 100)\n";
}

static Args parse_args(const std::vector<std::string>& argv) {
Args args;

for (size_t i = 1; i < argv.size(); i++) {
const std::string& arg = argv[i];

if (arg == "-h" || arg == "--help") {
usage(argv[0]);
exit(EXIT_SUCCESS);
}
else if (arg == "-m" || arg == "--model") {
args.model_path = argv[++i];
}
else if (arg == "-d" || arg == "--device") {
args.device = argv[++i];
}
else if (arg == "--max_new_tokens") {
args.max_new_tokens = std::stoi(argv[++i]);
}
else {
std::cerr << "Unknown argument: " << arg << std::endl;
usage(argv[0]);
exit(EXIT_FAILURE);
}
}
std::string prompt;
std::string model_path = argv[1];

std::string device = "CPU"; // GPU, NPU can be used as well
ov::genai::LLMPipeline pipe(model_path, device);

return args;
}

static Args parse_args(int argc, char** argv) {
std::vector<std::string> argv_vec;
argv_vec.reserve(argc);

#ifdef _WIN32
LPWSTR* wargs = CommandLineToArgvW(GetCommandLineW(), &argc);

std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
for (int i = 0; i < argc; i++) {
argv_vec.emplace_back(converter.to_bytes(wargs[i]));
}

LocalFree(wargs);
#else
for (int i = 0; i < argc; i++) {
argv_vec.emplace_back(argv[i]);
}
#endif

return parse_args(argv_vec);
}

int main(int argc, char* argv[]) try {
Args args = parse_args(argc, argv);
ov::genai::LLMPipeline pipe(args.model_path, args.device);

ov::genai::GenerationConfig config;
config.max_new_tokens = 100;
config.max_new_tokens = args.max_new_tokens;
std::function<bool(std::string)> streamer = [](std::string word) {
std::cout << word << std::flush;
// Return flag corresponds whether generation should be stopped.
@@ -23,11 +104,13 @@ int main(int argc, char* argv[]) try {
};

pipe.start_chat();
std::cout << "question:\n";
while (std::getline(std::cin, prompt)) {
for (std::string prompt : prompts) {
std::cout << "question:\n";
std::cout << prompt << std::endl;

pipe.generate(prompt, config, streamer);
std::cout << "\n----------\n"
"question:\n";

std::cout << "\n----------\n";
}
pipe.finish_chat();
} catch (const std::exception& error) {
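A note on the Windows branch of parse_args above: std::wstring_convert and std::codecvt_utf8_utf16 are deprecated since C++17, so this conversion will raise deprecation warnings on newer toolchains. A minimal alternative sketch using the Win32 WideCharToMultiByte API is shown below; the helper name to_utf8 is hypothetical and not part of this PR.

```cpp
#ifdef _WIN32
#include <windows.h>
#include <string>

// Hypothetical helper: convert one wide argument to UTF-8 without <codecvt>.
static std::string to_utf8(const std::wstring& wide) {
    if (wide.empty())
        return {};
    // First call computes the required buffer size, second call does the conversion.
    int size = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), static_cast<int>(wide.size()),
                                   nullptr, 0, nullptr, nullptr);
    std::string utf8(static_cast<size_t>(size), '\0');
    WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), static_cast<int>(wide.size()),
                        utf8.data(), size, nullptr, nullptr);
    return utf8;
}
#endif
```

With such a helper, the loop in parse_args(int, char**) could call to_utf8(wargs[i]) instead of converter.to_bytes(wargs[i]); behavior is otherwise unchanged.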
29 changes: 21 additions & 8 deletions samples/python/chat_sample/chat_sample.py
@@ -15,21 +15,34 @@ def streamer(subword):

def main():
parser = argparse.ArgumentParser()
parser.add_argument('model_dir')
parser.add_argument("-m", "--model", type=str, help="Path to model")
parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")

args = parser.parse_args()

device = 'CPU' # GPU can be used as well
pipe = openvino_genai.LLMPipeline(args.model_dir, device)
models_path = args.model
device = args.device

pipe = openvino_genai.LLMPipeline(models_path, device)

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100

# Predefined list of prompts
prompts = [
"Hello there! How are you doing?",
"What is OpenVINO?",
"Who are you?",
"Can you explain to me briefly what is Python programming language?",
"Explain the plot of Cinderella in a sentence.",
"What are some common mistakes to avoid when writing code?",
"Write a 100-word blog post on “Benefits of Artificial Intelligence and OpenVINO“",
]

pipe.start_chat()
while True:
try:
prompt = input('question:\n')
except EOFError:
break

for prompt in prompts:
print(f"question:\n{prompt}")
pipe.generate(prompt, config, streamer)
print('\n----------')
pipe.finish_chat()
26 changes: 7 additions & 19 deletions src/cpp/src/llm_pipeline.cpp
@@ -77,21 +77,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
LLMPipelineImplBase(tokenizer, utils::from_config_json_if_exists(model_path))
{
ov::Core core;
if(auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) {
auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(*filtered_plugin_config);
core.set_property(core_plugin_config);
auto model = core.read_model(model_path / "openvino_model.xml");
m_adapter_controller = AdapterController(model, m_generation_config.adapters, "base_model.model.model.", device); // TODO: Make the prefix name configurable
utils::slice_matmul_statefull_model(model);
m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request();
m_adapter_controller->apply(m_model_runner, m_generation_config.adapters);
} else {
auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config);
core.set_property(core_plugin_config);
auto model = core.read_model(model_path / "openvino_model.xml");
utils::slice_matmul_statefull_model(model);
m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request();
}
core.set_property(device, plugin_config);
std::cout << "Compiling the model to " << device << std::endl;
m_model_runner = core.compile_model(model_path / "openvino_model.xml", device).create_infer_request();

// If eos_token_id was not provided, take value
if (m_generation_config.eos_token_id == -1)
@@ -131,10 +119,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {

m_history.push_back({{"role", "user"}, {"content", prompt}});
constexpr bool add_generation_prompt = true;
auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
// Do not add special tokens in chat scenario to be aligned with HF.
bool add_special_tokens = false;
auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens));
std::string default_chat_template = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content + ' ' + eos_token }}{% endif %}{% endfor %}";
auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, default_chat_template);
auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history);
if (m_is_cache_empty) {
encoded_input = new_chat_tokens;
} else {
@@ -579,6 +566,7 @@ ov::genai::Tokenizer ov::genai::LLMPipeline::get_tokenizer() {
}

void ov::genai::LLMPipeline::start_chat(const std::string& system_message) {
std::cout << ov::get_openvino_version() << std::endl;
m_pimpl->start_chat(system_message);
}

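For readers following the constructor change above: device properties are now set once on ov::Core for the target device and the IR is compiled directly from openvino_model.xml, replacing the previous split between core-level and compile-time config and the LoRA adapter wiring. A self-contained sketch of that simplified flow is below; the function name make_infer_request is illustrative only, and the inputs mirror the model_path, device, and plugin_config used in the diff.

```cpp
#include <filesystem>
#include <string>

#include <openvino/openvino.hpp>

// Sketch of the simplified compile path: apply device-scoped properties once,
// then compile the serialized model and create an infer request.
ov::InferRequest make_infer_request(const std::filesystem::path& model_path,
                                    const std::string& device,
                                    const ov::AnyMap& plugin_config) {
    ov::Core core;
    core.set_property(device, plugin_config);  // e.g. cache dir, performance hints
    ov::CompiledModel compiled =
        core.compile_model((model_path / "openvino_model.xml").string(), device);
    return compiled.create_infer_request();
}
```

Note that this path no longer distinguishes compile-time properties from core properties, and the AdapterController branch removed above is not reachable through this constructor.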