@@ -21,7 +21,7 @@
import argparse

parser = argparse.ArgumentParser(description='Client for age gender recognition')
parser.add_argument('--rest_address', required=False, default='localhost', help='Specify url to REST API service. default:localhost')
parser.add_argument('--rest_address', required=False, default='127.0.0.1', help='Specify url to REST API service. default:127.0.0.1')
parser.add_argument('--rest_port', required=False, default=9001, help='Specify port to REST API service. default: 9001')
parser.add_argument('--model_name', required=False, default='age_gender', help='Model name to request. default: age_gender')
parser.add_argument('--image_input_path', required=True, help='Input image path.')
8 changes: 4 additions & 4 deletions demos/continuous_batching/structured_output/README.md
@@ -120,7 +120,7 @@ payload = {
}

headers = {"Content-Type": "application/json", "Authorization": "not used"}
response = requests.post("http://localhost:8000/v3/chat/completions", json=payload, headers=headers)
response = requests.post("http://127.0.0.1:8000/v3/chat/completions", json=payload, headers=headers)
json_response = response.json()

print(json_response["choices"][0]["message"]["content"])
@@ -138,7 +138,7 @@ pip install openai
```python
from openai import OpenAI
from pydantic import BaseModel
base_url = "http://localhost:8000/v3"
base_url = "http://127.0.0.1:8000/v3"
model_name = "OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov"
client = OpenAI(base_url=base_url, api_key="unused")
class CalendarEvent(BaseModel):
@@ -174,14 +174,14 @@ It will be executed with the response_format request field including the schema
```console
pip install datasets tqdm openai jsonschema
curl -L https://raw.githubusercontent.com/openvinotoolkit/model_server/main/demos/continuous_batching/structured_output/accuracy_test.py -O
python accuracy_test.py --base_url http://localhost:8000/v3 --model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --concurrency 50 --limit 1000
python accuracy_test.py --base_url http://127.0.0.1:8000/v3 --model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --concurrency 50 --limit 1000
```
```
Requests: 1000, Successful responses: 1000, Exact matches: 135, Schema matches: 435 Invalid inputs: 0
```

```console
python accuracy_test.py --base_url http://localhost:8000/v3 --model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --enable_response_format --concurrency 50 --limit 1000
python accuracy_test.py --base_url http://127.0.0.1:8000/v3 --model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --enable_response_format --concurrency 50 --limit 1000
```
```
Requests: 1000, Successful responses: 1000, Exact matches: 217, Schema matches: 828 Invalid inputs: 0
2 changes: 1 addition & 1 deletion demos/continuous_batching/vlm/README.md
@@ -239,7 +239,7 @@ curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/m
```python
import requests
import base64
base_url='http://localhost:8000/v3'
base_url='http://127.0.0.1:8000/v3'
model_name = "OpenGVLab/InternVL2-2B"

def convert_image(Image):
12 changes: 6 additions & 6 deletions demos/rerank/README.md
@@ -130,7 +130,7 @@ pip3 install cohere
```bash
echo '
import cohere
client = cohere.Client(base_url="http://localhost:8000/v3", api_key="not_used")
client = cohere.Client(base_url="http://127.0.0.1:8000/v3", api_key="not_used")
responses = client.rerank(query="hello",documents=["welcome","farewell"], model="BAAI/bge-reranker-large")
for response in responses.results:
print(f"index {response.index}, relevance_score {response.relevance_score}")' > rerank_client.py
@@ -178,7 +178,7 @@ documents = [
document_template.format(doc=doc, suffix=suffix) for doc in documents
]

response = requests.post("http://localhost:8125/v3/rerank",
response = requests.post("http://127.0.0.1:8000/v3/rerank",
json={
"model": "tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
"query": query,
@@ -199,7 +199,7 @@ It will return response similar to:

```bash
git clone https://github.com/openvinotoolkit/model_server
python model_server/demos/rerank/compare_results.py --query "hello" --document "welcome" --document "farewell" --base_url http://localhost:8000/v3/
python model_server/demos/rerank/compare_results.py --query "hello" --document "welcome" --document "farewell" --base_url http://127.0.0.1:8000/v3/
query hello
documents ['welcome', 'farewell']
HF Duration: 145.731 ms
@@ -214,7 +214,7 @@ An asynchronous benchmarking client can be used to access the model server perfo
```bash
cd model_server/demos/benchmark/embeddings/
pip install -r requirements.txt
python benchmark_embeddings.py --api_url http://localhost:8000/v3/rerank --backend ovms_rerank --dataset synthetic --synthetic_length 500 --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
python benchmark_embeddings.py --api_url http://127.0.0.1:8000/v3/rerank --backend ovms_rerank --dataset synthetic --synthetic_length 500 --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
Number of documents: 1000
100%|██████████████████████████████████████| 50/50 [00:19<00:00, 2.53it/s]
Tokens: 501000
@@ -224,7 +224,7 @@ Mean latency: 10268 ms
Median latency: 10249 ms
Average document length: 501.0 tokens

python benchmark_embeddings.py --api_url http://localhost:8000/v3/rerank --backend ovms_rerank --dataset synthetic --synthetic_length 500 --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
python benchmark_embeddings.py --api_url http://127.0.0.1:8000/v3/rerank --backend ovms_rerank --dataset synthetic --synthetic_length 500 --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
Number of documents: 1000
100%|██████████████████████████████████████| 50/50 [00:19<00:00, 2.53it/s]
Tokens: 501000
@@ -234,7 +234,7 @@ Mean latency: 10268 ms
Median latency: 10249 ms
Average document length: 501.0 tokens

python benchmark_embeddings.py --api_url http://localhost:8000/v3/rerank --backend ovms_rerank --dataset Cohere/wikipedia-22-12-simple-embeddings --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
python benchmark_embeddings.py --api_url http://127.0.0.1:8000/v3/rerank --backend ovms_rerank --dataset Cohere/wikipedia-22-12-simple-embeddings --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
Number of documents: 1000
100%|██████████████████████████████████████| 50/50 [00:09<00:00, 5.55it/s]
Tokens: 92248
2 changes: 1 addition & 1 deletion demos/vlm_npu/README.md
@@ -168,7 +168,7 @@ curl http://localhost:8000/v3/chat/completions -H "Content-Type: application/js
```python
import requests
import base64
base_url='http://localhost:8000/v3'
base_url='http://127.0.0.1:8000/v3'
model_name = "microsoft/Phi-3.5-vision-instruct"

def convert_image(Image):
8 changes: 4 additions & 4 deletions docs/clients_genai.md
@@ -67,7 +67,7 @@ print(response.choices[0].message)
import requests
payload = {"model": "meta-llama/Llama-2-7b-chat-hf", "messages": [ {"role": "user","content": "Say this is a test" }]}
headers = {"Content-Type": "application/json", "Authorization": "not used"}
response = requests.post("http://localhost:8000/v3/chat/completions", json=payload, headers=headers)
response = requests.post("http://127.0.0.1:8000/v3/chat/completions", json=payload, headers=headers)
print(response.text)
```
:::
@@ -147,7 +147,7 @@ print(response.choices[0].text)
import requests
payload = {"model": "meta-llama/Llama-2-7b", "prompt": "Say this is a test"}
headers = {"Content-Type": "application/json", "Authorization": "not used"}
response = requests.post("http://localhost:8000/v3/completions", json=payload, headers=headers)
response = requests.post("http://127.0.0.1:8000/v3/completions", json=payload, headers=headers)
print(response.text)
```
:::
@@ -280,7 +280,7 @@ for data in responses.data:
import requests
payload = {"model": "Alibaba-NLP/gte-large-en-v1.5", "input": "hello world"}
headers = {"Content-Type": "application/json", "Authorization": "not used"}
response = requests.post("http://localhost:8000/v3/embeddings", json=payload, headers=headers)
response = requests.post("http://127.0.0.1:8000/v3/embeddings", json=payload, headers=headers)
print(response.text)
```
:::
@@ -435,7 +435,7 @@ for res in responses.results:
import requests
payload = {"model": "BAAI/bge-reranker-large", "query": "Hello", "documents":["Welcome","Farewell"]}
headers = {"Content-Type": "application/json", "Authorization": "not used"}
response = requests.post("http://localhost:8000/v3/rerank", json=payload, headers=headers)
response = requests.post("http://127.0.0.1:8000/v3/rerank", json=payload, headers=headers)
print(response.text)
```
:::
4 changes: 2 additions & 2 deletions docs/parameters.md
@@ -36,8 +36,8 @@ Configuration options for the server are defined only via command-line options a
|---|---|---|
| `port` | `integer` | Number of the port used by gRPC server. |
| `rest_port` | `integer` | Number of the port used by HTTP server (if not provided or set to 0, HTTP server will not be launched). |
| `grpc_bind_address` | `string` | Network interface address or a hostname, to which gRPC server will bind to. Default: all interfaces: 0.0.0.0 |
| `rest_bind_address` | `string` | Network interface address or a hostname, to which REST server will bind to. Default: all interfaces: 0.0.0.0 |
| `grpc_bind_address` | `string` | Comma-separated list of IPv4/IPv6 network interface addresses or hostnames to which the gRPC server will bind. Default: all interfaces: 0.0.0.0 |
| `rest_bind_address` | `string` | Comma-separated list of IPv4/IPv6 network interface addresses or hostnames to which the REST server will bind. Default: all interfaces: 0.0.0.0 |
| `grpc_workers` | `integer` | Number of the gRPC server instances (must be from 1 to CPU core count). Default value is 1 and it's optimal for most use cases. Consider setting higher value while expecting heavy load. |
| `rest_workers` | `integer` | Number of HTTP server threads. Effective when `rest_port` > 0. Default value is set based on the number of CPUs. |
| `file_system_poll_wait_seconds` | `integer` | Time interval between config and model versions changes detection in seconds. Default value is 1. Zero value disables changes monitoring. |
32 changes: 32 additions & 0 deletions docs/performance_tuning.md
@@ -144,6 +144,38 @@ To save power, the OS can decrease the CPU frequency and increase a volatility o
$ cpupower frequency-set --min 3.1GHz
```

## Network Configuration for Optimal Performance

When clients connect to the server using hostname resolution (particularly "localhost"), the system may attempt IPv6 resolution first before falling back to IPv4. If IPv6 is disabled, misconfigured, or unavailable, this can cause connection timeouts and delays before the IPv4 fallback occurs. The effect is especially noticeable in latency-sensitive scenarios, such as minimizing time to first token in generative AI applications.
Collaborator:

By default, OVMS endpoints are bound to all IPv4 addresses. On some systems, which route the localhost name to an IPv6 address, this might cause extra time on the client side to switch to IPv4. It can effectively result in an extra 1-2 s of latency.
It can be overcome by switching the API URL to http://127.0.0.1 instead.

Alternatively, IPv6 can be enabled in the model server using --grpc_bind_address and --rest_bind_address.
For example:
--grpc_bind_address 127.0.0.1,::1 --rest_bind_address 127.0.0.1,::1
or
--grpc_bind_address 0.0.0.0,:: --rest_bind_address 0.0.0.0,::


To optimize network connection performance:

**For local secured environments (restricted to localhost only):**
- If dual-stack networking is configured properly, binding to IPv6 localhost is sufficient: `--grpc_bind_address ::1 --rest_bind_address ::1` (both IPv4 and IPv6 will work)
- For systems without proper dual-stack support, specify both addresses to avoid resolution delays when clients use "localhost": `--grpc_bind_address 127.0.0.1,::1 --rest_bind_address 127.0.0.1,::1`
- If IPv6 is disabled or not available in the environment, bind only to IPv4: `--grpc_bind_address 127.0.0.1 --rest_bind_address 127.0.0.1`

**For public deployments:**
- If dual-stack networking is configured properly, binding to IPv6 is sufficient: `--grpc_bind_address :: --rest_bind_address ::` (both IPv4 and IPv6 will work)
- For systems without proper dual-stack support, specify both addresses: `--grpc_bind_address 0.0.0.0,:: --rest_bind_address 0.0.0.0,::`
- Alternatively, configure clients to connect directly to specific IP addresses (127.0.0.1 or ::1) rather than using the "localhost" hostname

Example for local secured access using OpenVINO Model Server binary:

Linux/macOS:
```bash
./ovms --model_path /path/to/model --model_name resnet --port 9001 \
--grpc_bind_address 127.0.0.1,::1 --rest_bind_address 127.0.0.1,::1 \
--rest_port 8001
```

Windows:
```cmd
ovms.exe --model_path C:\path\to\model --model_name resnet --port 9001 ^
--grpc_bind_address 127.0.0.1,::1 --rest_bind_address 127.0.0.1,::1 ^
--rest_port 8001
```
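
To verify from the client side whether hostname resolution is adding latency, a quick check can be run against the REST endpoint. The sketch below is a minimal example, assuming the server started with the commands above is listening on REST port 8001 and that the KServe `/v2/health/ready` endpoint is available; adjust the port and path to your deployment:

```python
# Minimal sketch (assumptions: REST port 8001, KServe health endpoint available).
# Shows how "localhost" resolves and compares request latency against 127.0.0.1.
import socket
import time

import requests

PORT = 8001

# Show what "localhost" resolves to on this machine (::1, 127.0.0.1, or both).
for family, _type, _proto, _canon, sockaddr in socket.getaddrinfo("localhost", PORT):
    print("localhost resolves to:", sockaddr[0])

def timed_get(url):
    start = time.perf_counter()
    try:
        requests.get(url, timeout=10)
        print(f"{url}: {(time.perf_counter() - start) * 1000:.1f} ms")
    except requests.RequestException as err:
        print(f"{url}: request failed ({err})")

timed_get(f"http://localhost:{PORT}/v2/health/ready")
timed_get(f"http://127.0.0.1:{PORT}/v2/health/ready")
```

If the request to "localhost" is noticeably slower than the one to 127.0.0.1, the bind address options described above, or switching clients to an explicit IP address, remove the delay.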

## Tuning Model Server configuration parameters

The OpenVINO Model Server C++ implementation uses a scalable, multithreaded gRPC and REST interface; however, in some hardware configurations it might become a bottleneck for a high-performance OpenVINO backend.
5 changes: 5 additions & 0 deletions docs/security_considerations.md
@@ -13,6 +13,11 @@ docker run --rm -d --user $(id -u):$(id -g) --read-only --tmpfs /tmp -p 9000:900
---
OpenVINO Model Server currently does not provide access restrictions and traffic encryption on gRPC and REST API endpoints. The endpoints can be secured using network settings such as Docker network configuration or a network firewall on the host. The recommended configuration is to place OpenVINO Model Server behind a reverse proxy component or load balancer that provides traffic encryption and user authorization.

When deploying in environments where only local access is required, administrators can configure the server to bind exclusively to localhost addresses. This can be achieved by setting the bind address to `127.0.0.1` for IPv4 or `::1` for IPv6, which restricts incoming connections to the local machine only. This configuration prevents external network access to the server endpoints, providing an additional layer of security for local development or testing environments.
Collaborator (Author):

  • performance guideline doc about the potential 2 s issue on Windows

```
--grpc_bind_address 127.0.0.1,::1 --rest_bind_address 127.0.0.1,::1
```
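
When the server runs in a container, the same local-only restriction can be enforced from the host side by publishing the container ports on the loopback interface only. The command below is a sketch; the image tag, mounted model path, and port numbers are illustrative assumptions and should be adapted to the deployment (inside the container the server keeps its default 0.0.0.0 bind so the published ports stay reachable):

```bash
# Publish gRPC (9000) and REST (8000) ports on the host loopback interface only.
# Image, model path and ports are illustrative assumptions.
docker run --rm -d --read-only --tmpfs /tmp \
  -v $(pwd)/models/resnet:/models/resnet:ro \
  -p 127.0.0.1:9000:9000 -p 127.0.0.1:8000:8000 \
  openvino/model_server:latest \
  --model_path /models/resnet --model_name resnet \
  --port 9000 --rest_port 8000
```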

See also:
- [Securing OVMS with NGINX](../extras/nginx-mtls-auth/README.md)
- [Securing models with OVSA](https://docs.openvino.ai/2025/about-openvino/openvino-ecosystem/openvino-project/openvino-security-add-on.html)
2 changes: 1 addition & 1 deletion docs/stateful_models.md
@@ -259,7 +259,7 @@ signature = "serving_default"
request_body = json.dumps({"signature_name": signature,'inputs': inputs})

# Send request to OVMS and get response
response = requests.post("localhost:5555/v1/models/stateful_model:predict", data=request_body)
response = requests.post("127.0.0.1:5555/v1/models/stateful_model:predict", data=request_body)

# Parse response
response_body = json.loads(response.text)
3 changes: 2 additions & 1 deletion src/BUILD
@@ -2278,7 +2278,8 @@
"libovmslogging",
"libovmsstatus",
"libhttp_status_code",
"libovms_config"
"libovms_config",
"libovmsstring_utils",
],
visibility = ["//visibility:public",],
)
32 changes: 29 additions & 3 deletions src/config.cpp
@@ -21,6 +21,12 @@
#include <thread>
#include <vector>

#ifdef _WIN32
#include <ws2tcpip.h>
#else
#include <netdb.h>
#endif

#include "logging.hpp"
#include "ovms_exit_codes.hpp"

@@ -59,7 +65,29 @@ bool Config::parse(ServerSettingsImpl* serverSettings, ModelsSettingsImpl* model
return validate();
}

bool Config::is_ipv6(const std::string& s) {
addrinfo hints{};
hints.ai_family = AF_INET6;
hints.ai_flags = AI_NUMERICHOST;
addrinfo* res = nullptr;
const int rc = getaddrinfo(s.c_str(), nullptr, &hints, &res);
if (res) {
freeaddrinfo(res);
}
return rc == 0;
}

bool Config::check_hostname_or_ip(const std::string& input) {
auto split = ovms::tokenize(input, ',');
if (split.size() > 1) {
for (const auto& part : split) {
if (!check_hostname_or_ip(part)) {
return false;
}
}
return true;
}

if (input.size() > 255) {
return false;
}
@@ -74,9 +102,7 @@ bool Config::check_hostname_or_ip(const std::string& input) {
}
if (all_numeric) {
static const std::regex valid_ipv4_regex("^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$");
static const std::regex valid_ipv6_regex(R"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))");
return std::regex_match(input, valid_ipv4_regex) ||
std::regex_match(input, valid_ipv6_regex);
return std::regex_match(input, valid_ipv4_regex) || is_ipv6(input);
} else {
std::regex valid_hostname_regex("^(([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\\-]*[a-zA-Z0-9])\\.)*([A-Za-z0-9]|[A-Za-z0-9][A-Za-z0-9\\-]*[A-Za-z0-9])$");
return std::regex_match(input, valid_hostname_regex);
1 change: 1 addition & 0 deletions src/config.hpp
@@ -92,6 +92,7 @@ class Config {
* @return bool
*/
static bool check_hostname_or_ip(const std::string& input);
static bool is_ipv6(const std::string& input);

/**
* @brief Get the config path
12 changes: 9 additions & 3 deletions src/drogon_http_server.cpp
@@ -28,6 +28,7 @@
#include "logging.hpp"
#include "mediapipe/framework/port/threadpool.h"
#include "timer.hpp"
#include "stringutils.hpp"

namespace ovms {

@@ -129,9 +130,14 @@ Status DrogonHttpServer::startAcceptingRequests() {
if (allowedHeaders.size()) {
resp->addHeader("Access-Control-Allow-Headers", allowedHeaders);
}
})
.addListener(this->address, this->port)
.run();
});

auto ips = ovms::tokenize(this->address, ',');
for (const auto& ip : ips) {
SPDLOG_INFO("Binding REST server to address: {}:{}", ip, this->port);
drogon::app().addListener(ip, this->port);
}
drogon::app().run();
} catch (...) {
SPDLOG_ERROR("Exception occurred during drogon::run()");
}
15 changes: 14 additions & 1 deletion src/grpcservermodule.cpp
@@ -97,6 +97,14 @@ GRPCServerModule::GRPCServerModule(Server& server) :
tfsModelService(this->server),
kfsGrpcInferenceService(this->server) {}

static std::string host_with_port(const std::string& host, int port) {
if (Config::is_ipv6(host)) {
return "[" + host + "]:" + std::to_string(port);
} else {
return host + ":" + std::to_string(port);
}
}

Status GRPCServerModule::start(const ovms::Config& config) {
state = ModuleState::STARTED_INITIALIZE;
SPDLOG_INFO("{} starting", GRPC_SERVER_MODULE_NAME);
@@ -123,7 +131,12 @@ Status GRPCServerModule::start(const ovms::Config& config) {
ServerBuilder builder;
builder.SetMaxReceiveMessageSize(GIGABYTE);
builder.SetMaxSendMessageSize(GIGABYTE);
builder.AddListeningPort(config.grpcBindAddress() + ":" + std::to_string(config.port()), grpc::InsecureServerCredentials());
auto ips = ovms::tokenize(config.grpcBindAddress(), ',');
for (const auto& ip : ips) {
auto hostWithPort = host_with_port(ip, config.port());
SPDLOG_INFO("Binding gRPC server to address: {}", hostWithPort);
builder.AddListeningPort(hostWithPort, grpc::InsecureServerCredentials());
}
builder.RegisterService(&tfsPredictService);
builder.RegisterService(&tfsModelService);
builder.RegisterService(&kfsGrpcInferenceService);