1 change: 0 additions & 1 deletion llm_bench/python/requirements.txt
@@ -1,7 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
numpy
--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
openvino
openvino-tokenizers
openvino_genai
auto-gptq>=0.5.1 # for gptq
109 changes: 96 additions & 13 deletions samples/cpp/chat_sample/chat_sample.cpp
@@ -2,19 +2,100 @@
// SPDX-License-Identifier: Apache-2.0

#include "openvino/genai/llm_pipeline.hpp"
#include <openvino/openvino.hpp>

int main(int argc, char* argv[]) try {
if (2 != argc) {
throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR>");
#ifdef _WIN32
#include <codecvt>
#include <fcntl.h>
#include <io.h>
#include <windows.h>
#endif

const std::string prompts[] =
{
"Hello there! How are you doing?",
"What is OpenVINO?",
"Who are you?",
"Can you explain to me briefly what is Python programming language?",
"Explain the plot of Cinderella in a sentence.",
"What are some common mistakes to avoid when writing code?",
"Write a 100-word blog post on “Benefits of Artificial Intelligence and OpenVINO“",
};


struct Args {
std::string model_path = "";
std::string device = "GPU";
int max_new_tokens = 100;
};

static void usage(const std::string& prog) {
std::cout << "Usage: " << prog << " [options]\n"
<< "\n"
<< "options:\n"
<< " -h, --help show this help message and exit\n"
<< " -m, --model PATH Chatglm OpenVINO model path (default: openvino_model.xml)\n"
<< " -d, --device Device (default: GPU)\n"
<< " --max_new_tokens max_new_tokens (default: 100)\n";
}

static Args parse_args(const std::vector<std::string>& argv) {
Args args;

for (size_t i = 1; i < argv.size(); i++) {
const std::string& arg = argv[i];

if (arg == "-h" || arg == "--help") {
usage(argv[0]);
exit(EXIT_SUCCESS);
}
else if (arg == "-m" || arg == "--model") {
args.model_path = argv[++i];
}
else if (arg == "-d" || arg == "--device") {
args.device = argv[++i];
}
else if (arg == "--max_new_tokens") {
args.max_new_tokens = std::stoi(argv[++i]);
}
else {
std::cerr << "Unknown argument: " << arg << std::endl;
usage(argv[0]);
exit(EXIT_FAILURE);
}
}
std::string prompt;
std::string model_path = argv[1];

std::string device = "CPU"; // GPU, NPU can be used as well
ov::genai::LLMPipeline pipe(model_path, device);

return args;
}

static Args parse_args(int argc, char** argv) {
std::vector<std::string> argv_vec;
argv_vec.reserve(argc);

#ifdef _WIN32
LPWSTR* wargs = CommandLineToArgvW(GetCommandLineW(), &argc);

std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
for (int i = 0; i < argc; i++) {
argv_vec.emplace_back(converter.to_bytes(wargs[i]));
}

LocalFree(wargs);
#else
for (int i = 0; i < argc; i++) {
argv_vec.emplace_back(argv[i]);
}
#endif

return parse_args(argv_vec);
}

int main(int argc, char* argv[]) try {
Args args = parse_args(argc, argv);
ov::genai::LLMPipeline pipe(args.model_path, args.device);

ov::genai::GenerationConfig config;
config.max_new_tokens = 100;
config.max_new_tokens = args.max_new_tokens;
std::function<bool(std::string)> streamer = [](std::string word) {
std::cout << word << std::flush;
// Return flag corresponds whether generation should be stopped.
@@ -23,11 +104,13 @@ int main(int argc, char* argv[]) try {
};

pipe.start_chat();
std::cout << "question:\n";
while (std::getline(std::cin, prompt)) {
for (std::string prompt : prompts) {
std::cout << "question:\n";
std::cout << prompt << std::endl;

pipe.generate(prompt, config, streamer);
std::cout << "\n----------\n"
"question:\n";

std::cout << "\n----------\n";
}
pipe.finish_chat();
} catch (const std::exception& error) {
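A note on the Windows branch of parse_args above: std::wstring_convert and std::codecvt_utf8_utf16 are deprecated since C++17, so this conversion will raise deprecation warnings on newer toolchains. A minimal alternative sketch using the Win32 WideCharToMultiByte API is shown below; the helper name to_utf8 is hypothetical and not part of this PR.

```cpp
#ifdef _WIN32
#include <windows.h>
#include <string>

// Hypothetical helper: convert one wide argument to UTF-8 without <codecvt>.
static std::string to_utf8(const std::wstring& wide) {
    if (wide.empty())
        return {};
    // First call computes the required buffer size, second call does the conversion.
    int size = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), static_cast<int>(wide.size()),
                                   nullptr, 0, nullptr, nullptr);
    std::string utf8(static_cast<size_t>(size), '\0');
    WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), static_cast<int>(wide.size()),
                        utf8.data(), size, nullptr, nullptr);
    return utf8;
}
#endif
```

With such a helper, the loop in parse_args(int, char**) could call to_utf8(wargs[i]) instead of converter.to_bytes(wargs[i]); behavior is otherwise unchanged.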
29 changes: 21 additions & 8 deletions samples/python/chat_sample/chat_sample.py
@@ -15,21 +15,34 @@ def streamer(subword):

def main():
parser = argparse.ArgumentParser()
parser.add_argument('model_dir')
parser.add_argument("-m", "--model", type=str, help="Path to model")
parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")

args = parser.parse_args()

device = 'CPU' # GPU can be used as well
pipe = openvino_genai.LLMPipeline(args.model_dir, device)
models_path = args.model
device = args.device

pipe = openvino_genai.LLMPipeline(models_path, device)

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100

# Predefined list of prompts
prompts = [
"Hello there! How are you doing?",
"What is OpenVINO?",
"Who are you?",
"Can you explain to me briefly what is Python programming language?",
"Explain the plot of Cinderella in a sentence.",
"What are some common mistakes to avoid when writing code?",
"Write a 100-word blog post on “Benefits of Artificial Intelligence and OpenVINO“",
]

pipe.start_chat()
while True:
try:
prompt = input('question:\n')
except EOFError:
break

for prompt in prompts:
print(f"question:\n{prompt}")
pipe.generate(prompt, config, streamer)
print('\n----------')
pipe.finish_chat()
26 changes: 7 additions & 19 deletions src/cpp/src/llm_pipeline.cpp
@@ -77,21 +77,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
LLMPipelineImplBase(tokenizer, utils::from_config_json_if_exists(model_path))
{
ov::Core core;
if(auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) {
auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(*filtered_plugin_config);
core.set_property(core_plugin_config);
auto model = core.read_model(model_path / "openvino_model.xml");
m_adapter_controller = AdapterController(model, m_generation_config.adapters, "base_model.model.model.", device); // TODO: Make the prefix name configurable
utils::slice_matmul_statefull_model(model);
m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request();
m_adapter_controller->apply(m_model_runner, m_generation_config.adapters);
} else {
auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config);
core.set_property(core_plugin_config);
auto model = core.read_model(model_path / "openvino_model.xml");
utils::slice_matmul_statefull_model(model);
m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request();
}
core.set_property(device, plugin_config);
std::cout << "Compiling the model to " << device << std::endl;
m_model_runner = core.compile_model(model_path / "openvino_model.xml", device).create_infer_request();

// If eos_token_id was not provided, take value
if (m_generation_config.eos_token_id == -1)
@@ -131,10 +119,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {

m_history.push_back({{"role", "user"}, {"content", prompt}});
constexpr bool add_generation_prompt = true;
auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
// Do not add special tokens in chat scenario to be aligned with HF.
bool add_special_tokens = false;
auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens));
std::string default_chat_template = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content + ' ' + eos_token }}{% endif %}{% endfor %}";
auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, default_chat_template);
auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history);
if (m_is_cache_empty) {
encoded_input = new_chat_tokens;
} else {
@@ -579,6 +566,7 @@ ov::genai::Tokenizer ov::genai::LLMPipeline::get_tokenizer() {
}

void ov::genai::LLMPipeline::start_chat(const std::string& system_message) {
std::cout << ov::get_openvino_version() << std::endl;
m_pimpl->start_chat(system_message);
}

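For readers following the constructor change above: device properties are now set once on ov::Core for the target device and the IR is compiled directly from openvino_model.xml, replacing the previous split between core-level and compile-time config and the LoRA adapter wiring. A self-contained sketch of that simplified flow is below; the function name make_infer_request is illustrative only, and the inputs mirror the model_path, device, and plugin_config used in the diff.

```cpp
#include <filesystem>
#include <string>

#include <openvino/openvino.hpp>

// Sketch of the simplified compile path: apply device-scoped properties once,
// then compile the serialized model and create an infer request.
ov::InferRequest make_infer_request(const std::filesystem::path& model_path,
                                    const std::string& device,
                                    const ov::AnyMap& plugin_config) {
    ov::Core core;
    core.set_property(device, plugin_config);  // e.g. cache dir, performance hints
    ov::CompiledModel compiled =
        core.compile_model((model_path / "openvino_model.xml").string(), device);
    return compiled.create_infer_request();
}
```

Note that this path no longer distinguishes compile-time properties from core properties, and the AdapterController branch removed above is not reachable through this constructor.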