diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt
index 6139bf843c..0612cf16b8 100644
--- a/llm_bench/python/requirements.txt
+++ b/llm_bench/python/requirements.txt
@@ -1,7 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 numpy
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-openvino
 openvino-tokenizers
 openvino_genai
 auto-gptq>=0.5.1 # for gptq
diff --git a/samples/cpp/chat_sample/chat_sample.cpp b/samples/cpp/chat_sample/chat_sample.cpp
index 827c08ae57..82746fc4a8 100644
--- a/samples/cpp/chat_sample/chat_sample.cpp
+++ b/samples/cpp/chat_sample/chat_sample.cpp
@@ -2,19 +2,100 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include "openvino/genai/llm_pipeline.hpp"
+#include <iostream>
 
-int main(int argc, char* argv[]) try {
-    if (2 != argc) {
-        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR>");
+#ifdef _WIN32
+#include <windows.h>
+#include <shellapi.h>
+#include <codecvt>
+#include <locale>
+#endif
+
+const std::string prompts[] =
+{
+    "Hello there! How are you doing?",
+    "What is OpenVINO?",
+    "Who are you?",
+    "Can you explain to me briefly what is Python programming language?",
+    "Explain the plot of Cinderella in a sentence.",
+    "What are some common mistakes to avoid when writing code?",
+    "Write a 100-word blog post on “Benefits of Artificial Intelligence and OpenVINO“",
+};
+
+
+struct Args {
+    std::string model_path = "";
+    std::string device = "GPU";
+    int max_new_tokens = 100;
+};
+
+static void usage(const std::string& prog) {
+    std::cout << "Usage: " << prog << " [options]\n"
+              << "\n"
+              << "options:\n"
+              << "  -h, --help        show this help message and exit\n"
+              << "  -m, --model PATH  Chatglm OpenVINO model path (default: openvino_model.xml)\n"
+              << "  -d, --device      Device (default: GPU)\n"
+              << "  --max_new_tokens  max_new_tokens (default: 100)\n";
+}
+
+static Args parse_args(const std::vector<std::string>& argv) {
+    Args args;
+
+    for (size_t i = 1; i < argv.size(); i++) {
+        const std::string& arg = argv[i];
+
+        if (arg == "-h" || arg == "--help") {
+            usage(argv[0]);
+            exit(EXIT_SUCCESS);
+        }
+        else if (arg == "-m" || arg == "--model") {
+            args.model_path = argv[++i];
+        }
+        else if (arg == "-d" || arg == "--device") {
+            args.device = argv[++i];
+        }
+        else if (arg == "--max_new_tokens") {
+            args.max_new_tokens = std::stoi(argv[++i]);
+        }
+        else {
+            std::cerr << "Unknown argument: " << arg << std::endl;
+            usage(argv[0]);
+            exit(EXIT_FAILURE);
+        }
     }
-    std::string prompt;
-    std::string model_path = argv[1];
-
-    std::string device = "CPU";  // GPU, NPU can be used as well
-    ov::genai::LLMPipeline pipe(model_path, device);
-
+
+    return args;
+}
+
+static Args parse_args(int argc, char** argv) {
+    std::vector<std::string> argv_vec;
+    argv_vec.reserve(argc);
+
+#ifdef _WIN32
+    LPWSTR* wargs = CommandLineToArgvW(GetCommandLineW(), &argc);
+
+    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+    for (int i = 0; i < argc; i++) {
+        argv_vec.emplace_back(converter.to_bytes(wargs[i]));
+    }
+
+    LocalFree(wargs);
+#else
+    for (int i = 0; i < argc; i++) {
+        argv_vec.emplace_back(argv[i]);
+    }
+#endif
+
+    return parse_args(argv_vec);
+}
+
+int main(int argc, char* argv[]) try {
+    Args args = parse_args(argc, argv);
+    ov::genai::LLMPipeline pipe(args.model_path, args.device);
+
     ov::genai::GenerationConfig config;
-    config.max_new_tokens = 100;
+    config.max_new_tokens = args.max_new_tokens;
     std::function<bool(std::string)> streamer = [](std::string word) {
         std::cout << word << std::flush;
         // Return flag corresponds whether generation should be stopped.
@@ -23,11 +104,13 @@ int main(int argc, char* argv[]) try {
     };
 
     pipe.start_chat();
-    std::cout << "question:\n";
-    while (std::getline(std::cin, prompt)) {
+    for (std::string prompt : prompts) {
+        std::cout << "question:\n";
+        std::cout << prompt << std::endl;
+
         pipe.generate(prompt, config, streamer);
-        std::cout << "\n----------\n"
-            "question:\n";
+
+        std::cout << "\n----------\n";
     }
     pipe.finish_chat();
 } catch (const std::exception& error) {
diff --git a/samples/python/chat_sample/chat_sample.py b/samples/python/chat_sample/chat_sample.py
index eee66fb71d..9477206145 100755
--- a/samples/python/chat_sample/chat_sample.py
+++ b/samples/python/chat_sample/chat_sample.py
@@ -15,21 +15,34 @@ def streamer(subword):
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('model_dir')
+    parser.add_argument("-m", "--model", type=str, help="Path to model")
+    parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")
+
     args = parser.parse_args()
 
-    device = 'CPU'  # GPU can be used as well
-    pipe = openvino_genai.LLMPipeline(args.model_dir, device)
+    models_path = args.model
+    device = args.device
+
+    pipe = openvino_genai.LLMPipeline(models_path, device)
 
     config = openvino_genai.GenerationConfig()
     config.max_new_tokens = 100
 
+    # Predefined list of prompts
+    prompts = [
+        "Hello there! How are you doing?",
+        "What is OpenVINO?",
+        "Who are you?",
+        "Can you explain to me briefly what is Python programming language?",
+        "Explain the plot of Cinderella in a sentence.",
+        "What are some common mistakes to avoid when writing code?",
+        "Write a 100-word blog post on “Benefits of Artificial Intelligence and OpenVINO“",
+    ]
+
     pipe.start_chat()
-    while True:
-        try:
-            prompt = input('question:\n')
-        except EOFError:
-            break
+
+    for prompt in prompts:
+        print(f"question:\n{prompt}")
         pipe.generate(prompt, config, streamer)
         print('\n----------')
     pipe.finish_chat()
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index e3815e5944..6a632d1f0f 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -77,21 +77,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         LLMPipelineImplBase(tokenizer, utils::from_config_json_if_exists(model_path))
     {
         ov::Core core;
-        if(auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) {
-            auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(*filtered_plugin_config);
-            core.set_property(core_plugin_config);
-            auto model = core.read_model(model_path / "openvino_model.xml");
-            m_adapter_controller = AdapterController(model, m_generation_config.adapters, "base_model.model.model.", device);  // TODO: Make the prefix name configurable
-            utils::slice_matmul_statefull_model(model);
-            m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request();
-            m_adapter_controller->apply(m_model_runner, m_generation_config.adapters);
-        } else {
-            auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config);
-            core.set_property(core_plugin_config);
-            auto model = core.read_model(model_path / "openvino_model.xml");
-            utils::slice_matmul_statefull_model(model);
-            m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request();
-        }
+        core.set_property(device, plugin_config);
+        std::cout << "Compiling the model to " << device << std::endl;
+        m_model_runner = core.compile_model(model_path / "openvino_model.xml", device).create_infer_request();
 
         // If eos_token_id was not provided, take value
         if (m_generation_config.eos_token_id == -1)
@@ -131,10 +119,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
 
             m_history.push_back({{"role", "user"}, {"content", prompt}});
             constexpr bool add_generation_prompt = true;
-            auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
-            // Do not add special tokens in chat scenario to be aligned with HF.
-            bool add_special_tokens = false;
-            auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens));
+            std::string default_chat_template = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content + ' ' + eos_token }}{% endif %}{% endfor %}";
+            auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, default_chat_template);
+            auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history);
             if (m_is_cache_empty) {
                 encoded_input = new_chat_tokens;
             } else {
@@ -579,6 +566,7 @@ ov::genai::Tokenizer ov::genai::LLMPipeline::get_tokenizer() {
 }
 
 void ov::genai::LLMPipeline::start_chat(const std::string& system_message) {
+    std::cout << ov::get_openvino_version() << std::endl;
     m_pimpl->start_chat(system_message);
 }