@@ -21,7 +21,7 @@
import argparse

parser = argparse.ArgumentParser(description='Client for age gender recognition')
parser.add_argument('--rest_address', required=False, default='localhost', help='Specify url to REST API service. default:localhost')
parser.add_argument('--rest_address', required=False, default='127.0.0.1', help='Specify url to REST API service. default:127.0.0.1')
parser.add_argument('--rest_port', required=False, default=9001, help='Specify port to REST API service. default: 9001')
parser.add_argument('--model_name', required=False, default='age_gender', help='Model name to request. default: age_gender')
parser.add_argument('--image_input_path', required=True, help='Input image path.')
8 changes: 4 additions & 4 deletions demos/continuous_batching/structured_output/README.md
@@ -120,7 +120,7 @@ payload = {
}

headers = {"Content-Type": "application/json", "Authorization": "not used"}
response = requests.post("http://localhost:8000/v3/chat/completions", json=payload, headers=headers)
response = requests.post("http://127.0.0.1:8000/v3/chat/completions", json=payload, headers=headers)
json_response = response.json()

print(json_response["choices"][0]["message"]["content"])
@@ -138,7 +138,7 @@ pip install openai
```python
from openai import OpenAI
from pydantic import BaseModel
base_url = "http://localhost:8000/v3"
base_url = "http://127.0.0.1:8000/v3"
model_name = "OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov"
client = OpenAI(base_url=base_url, api_key="unused")
class CalendarEvent(BaseModel):
@@ -174,14 +174,14 @@ It will be executed with the response_format request field including the schema
```console
pip install datasets tqdm openai jsonschema
curl -L https://raw.githubusercontent.com/openvinotoolkit/model_server/main/demos/continuous_batching/structured_output/accuracy_test.py -O
python accuracy_test.py --base_url http://localhost:8000/v3 --model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --concurrency 50 --limit 1000
python accuracy_test.py --base_url http://127.0.0.1:8000/v3 --model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --concurrency 50 --limit 1000
```
```
Requests: 1000, Successful responses: 1000, Exact matches: 135, Schema matches: 435 Invalid inputs: 0
```

```console
python accuracy_test.py --base_url http://localhost:8000/v3 --model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --enable_response_format --concurrency 50 --limit 1000
python accuracy_test.py --base_url http://127.0.0.1:8000/v3 --model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --enable_response_format --concurrency 50 --limit 1000
```
```
Requests: 1000, Successful responses: 1000, Exact matches: 217, Schema matches: 828 Invalid inputs: 0
2 changes: 1 addition & 1 deletion demos/continuous_batching/vlm/README.md
@@ -239,7 +239,7 @@ curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/m
```python
import requests
import base64
base_url='http://localhost:8000/v3'
base_url='http://127.0.0.1:8000/v3'
model_name = "OpenGVLab/InternVL2-2B"

def convert_image(Image):
12 changes: 6 additions & 6 deletions demos/rerank/README.md
@@ -130,7 +130,7 @@ pip3 install cohere
```bash
echo '
import cohere
client = cohere.Client(base_url="http://localhost:8000/v3", api_key="not_used")
client = cohere.Client(base_url="http://127.0.0.1:8000/v3", api_key="not_used")
responses = client.rerank(query="hello",documents=["welcome","farewell"], model="BAAI/bge-reranker-large")
for response in responses.results:
print(f"index {response.index}, relevance_score {response.relevance_score}")' > rerank_client.py
@@ -178,7 +178,7 @@ documents = [
document_template.format(doc=doc, suffix=suffix) for doc in documents
]

response = requests.post("http://localhost:8125/v3/rerank",
response = requests.post("http://127.0.0.1:8000/v3/rerank",
json={
"model": "tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
"query": query,
@@ -199,7 +199,7 @@ It will return response similar to:

```bash
git clone https://github.com/openvinotoolkit/model_server
python model_server/demos/rerank/compare_results.py --query "hello" --document "welcome" --document "farewell" --base_url http://localhost:8000/v3/
python model_server/demos/rerank/compare_results.py --query "hello" --document "welcome" --document "farewell" --base_url http://127.0.0.1:8000/v3/
query hello
documents ['welcome', 'farewell']
HF Duration: 145.731 ms
@@ -214,7 +214,7 @@ An asynchronous benchmarking client can be used to access the model server perfo
```bash
cd model_server/demos/benchmark/embeddings/
pip install -r requirements.txt
python benchmark_embeddings.py --api_url http://localhost:8000/v3/rerank --backend ovms_rerank --dataset synthetic --synthetic_length 500 --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
python benchmark_embeddings.py --api_url http://127.0.0.1:8000/v3/rerank --backend ovms_rerank --dataset synthetic --synthetic_length 500 --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
Number of documents: 1000
100%|██████████████████████████████████████| 50/50 [00:19<00:00, 2.53it/s]
Tokens: 501000
@@ -224,7 +224,7 @@ Mean latency: 10268 ms
Median latency: 10249 ms
Average document length: 501.0 tokens

python benchmark_embeddings.py --api_url http://localhost:8000/v3/rerank --backend ovms_rerank --dataset synthetic --synthetic_length 500 --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
python benchmark_embeddings.py --api_url http://127.0.0.1:8000/v3/rerank --backend ovms_rerank --dataset synthetic --synthetic_length 500 --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
Number of documents: 1000
100%|██████████████████████████████████████| 50/50 [00:19<00:00, 2.53it/s]
Tokens: 501000
@@ -234,7 +234,7 @@ Mean latency: 10268 ms
Median latency: 10249 ms
Average document length: 501.0 tokens

python benchmark_embeddings.py --api_url http://localhost:8000/v3/rerank --backend ovms_rerank --dataset Cohere/wikipedia-22-12-simple-embeddings --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
python benchmark_embeddings.py --api_url http://127.0.0.1:8000/v3/rerank --backend ovms_rerank --dataset Cohere/wikipedia-22-12-simple-embeddings --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
Number of documents: 1000
100%|██████████████████████████████████████| 50/50 [00:09<00:00, 5.55it/s]
Tokens: 92248
2 changes: 1 addition & 1 deletion demos/vlm_npu/README.md
@@ -168,7 +168,7 @@ curl http://localhost:8000/v3/chat/completions -H "Content-Type: application/js
```python
import requests
import base64
base_url='http://localhost:8000/v3'
base_url='http://127.0.0.1:8000/v3'
model_name = "microsoft/Phi-3.5-vision-instruct"

def convert_image(Image):
8 changes: 4 additions & 4 deletions docs/clients_genai.md
@@ -67,7 +67,7 @@ print(response.choices[0].message)
import requests
payload = {"model": "meta-llama/Llama-2-7b-chat-hf", "messages": [ {"role": "user","content": "Say this is a test" }]}
headers = {"Content-Type": "application/json", "Authorization": "not used"}
response = requests.post("http://localhost:8000/v3/chat/completions", json=payload, headers=headers)
response = requests.post("http://127.0.0.1:8000/v3/chat/completions", json=payload, headers=headers)
print(response.text)
```
:::
@@ -147,7 +147,7 @@ print(response.choices[0].text)
import requests
payload = {"model": "meta-llama/Llama-2-7b", "prompt": "Say this is a test"}
headers = {"Content-Type": "application/json", "Authorization": "not used"}
response = requests.post("http://localhost:8000/v3/completions", json=payload, headers=headers)
response = requests.post("http://127.0.0.1:8000/v3/completions", json=payload, headers=headers)
print(response.text)
```
:::
@@ -280,7 +280,7 @@ for data in responses.data:
import requests
payload = {"model": "Alibaba-NLP/gte-large-en-v1.5", "input": "hello world"}
headers = {"Content-Type": "application/json", "Authorization": "not used"}
response = requests.post("http://localhost:8000/v3/embeddings", json=payload, headers=headers)
response = requests.post("http://127.0.0.1:8000/v3/embeddings", json=payload, headers=headers)
print(response.text)
```
:::
@@ -435,7 +435,7 @@ for res in responses.results:
import requests
payload = {"model": "BAAI/bge-reranker-large", "query": "Hello", "documents":["Welcome","Farewell"]}
headers = {"Content-Type": "application/json", "Authorization": "not used"}
response = requests.post("http://localhost:8000/v3/rerank", json=payload, headers=headers)
response = requests.post("http://127.0.0.1:8000/v3/rerank", json=payload, headers=headers)
print(response.text)
```
:::
4 changes: 2 additions & 2 deletions docs/parameters.md
@@ -36,8 +36,8 @@ Configuration options for the server are defined only via command-line options a
|---|---|---|
| `port` | `integer` | Number of the port used by gRPC server. |
| `rest_port` | `integer` | Number of the port used by HTTP server (if not provided or set to 0, HTTP server will not be launched). |
| `grpc_bind_address` | `string` | Network interface address or a hostname, to which gRPC server will bind to. Default: all interfaces: 0.0.0.0 |
| `rest_bind_address` | `string` | Network interface address or a hostname, to which REST server will bind to. Default: all interfaces: 0.0.0.0 |
| `grpc_bind_address` | `string` | Comma-separated list of IPv4/IPv6 network interface addresses or hostnames to which the gRPC server will bind. Default: all interfaces: 0.0.0.0 |
| `rest_bind_address` | `string` | Comma-separated list of IPv4/IPv6 network interface addresses or hostnames to which the REST server will bind. Default: all interfaces: 0.0.0.0 |
| `grpc_workers` | `integer` | Number of the gRPC server instances (must be from 1 to CPU core count). Default value is 1 and it's optimal for most use cases. Consider setting higher value while expecting heavy load. |
| `rest_workers` | `integer` | Number of HTTP server threads. Effective when `rest_port` > 0. Default value is set based on the number of CPUs. |
| `file_system_poll_wait_seconds` | `integer` | Time interval between config and model versions changes detection in seconds. Default value is 1. Zero value disables changes monitoring. |
32 changes: 32 additions & 0 deletions docs/performance_tuning.md
@@ -144,6 +144,38 @@ To save power, the OS can decrease the CPU frequency and increase a volatility o
$ cpupower frequency-set --min 3.1GHz
```

## Network Configuration for Optimal Performance

When clients connect to the server using hostname resolution (particularly "localhost"), the system may attempt IPv6 resolution first before falling back to IPv4. If IPv6 is disabled, misconfigured, or unavailable, this can cause connection timeouts and delays before the IPv4 fallback occurs. The effect is especially noticeable in latency-sensitive scenarios, such as minimizing time to first token in generative AI applications.
Collaborator:

By default, OVMS endpoints are bound to all IPv4 addresses. On some systems, which route the localhost name to an IPv6 address, this might cause extra time on the client side to switch to IPv4. It can effectively result in an extra 1-2 s of latency.
It can be overcome by switching the API URL to http://127.0.0.1 instead.

Alternatively, IPv6 can be enabled in the model server using --grpc_bind_address and --rest_bind_address.
For example:
--grpc_bind_address 127.0.0.1,::1 --rest_bind_address 127.0.0.1,::1
or
--grpc_bind_address 0.0.0.0,:: --rest_bind_address 0.0.0.0,::


To optimize network connection performance:

**For local secured environments (restricted to localhost only):**
- If dual-stack networking is configured properly, binding to IPv6 localhost is sufficient: `--grpc_bind_address ::1 --rest_bind_address ::1` (both IPv4 and IPv6 will work)
- For systems without proper dual-stack support, specify both addresses to avoid resolution delays when clients use "localhost": `--grpc_bind_address 127.0.0.1,::1 --rest_bind_address 127.0.0.1,::1`
- If IPv6 is disabled or not available in the environment, bind only to IPv4: `--grpc_bind_address 127.0.0.1 --rest_bind_address 127.0.0.1`

**For public deployments:**
- If dual-stack networking is configured properly, binding to IPv6 is sufficient: `--grpc_bind_address :: --rest_bind_address ::` (both IPv4 and IPv6 will work)
- For systems without proper dual-stack support, specify both addresses: `--grpc_bind_address 0.0.0.0,:: --rest_bind_address 0.0.0.0,::`
- Alternatively, configure clients to connect directly to specific IP addresses (127.0.0.1 or ::1) rather than using the "localhost" hostname

Example for local secured access using OpenVINO Model Server binary:

Linux/macOS:
```bash
./ovms --model_path /path/to/model --model_name resnet --port 9001 \
--grpc_bind_address 127.0.0.1,::1 --rest_bind_address 127.0.0.1,::1 \
--rest_port 8001
```

Windows:
```cmd
ovms.exe --model_path C:\path\to\model --model_name resnet --port 9001 ^
--grpc_bind_address 127.0.0.1,::1 --rest_bind_address 127.0.0.1,::1 ^
--rest_port 8001
```
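
To verify from the client side whether hostname resolution is adding latency, a quick check can be run against the REST endpoint. The sketch below is a minimal example, assuming the server started with the commands above is listening on REST port 8001 and that the KServe `/v2/health/ready` endpoint is available; adjust the port and path to your deployment:

```python
# Minimal sketch (assumptions: REST port 8001, KServe health endpoint available).
# Shows how "localhost" resolves and compares request latency against 127.0.0.1.
import socket
import time

import requests

PORT = 8001

# Show what "localhost" resolves to on this machine (::1, 127.0.0.1, or both).
for family, _type, _proto, _canon, sockaddr in socket.getaddrinfo("localhost", PORT):
    print("localhost resolves to:", sockaddr[0])

def timed_get(url):
    start = time.perf_counter()
    try:
        requests.get(url, timeout=10)
        print(f"{url}: {(time.perf_counter() - start) * 1000:.1f} ms")
    except requests.RequestException as err:
        print(f"{url}: request failed ({err})")

timed_get(f"http://localhost:{PORT}/v2/health/ready")
timed_get(f"http://127.0.0.1:{PORT}/v2/health/ready")
```

If the request to "localhost" is noticeably slower than the one to 127.0.0.1, the bind address options described above, or switching clients to an explicit IP address, remove the delay.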

## Tuning Model Server configuration parameters

The OpenVINO Model Server C++ implementation uses a scalable, multithreaded gRPC and REST interface; however, in some hardware configurations it might become a bottleneck for a high-performance OpenVINO backend.
5 changes: 5 additions & 0 deletions docs/security_considerations.md
@@ -13,6 +13,11 @@ docker run --rm -d --user $(id -u):$(id -g) --read-only --tmpfs /tmp -p 9000:900
---
OpenVINO Model Server currently does not provide access restrictions and traffic encryption on gRPC and REST API endpoints. The endpoints can be secured using network settings such as Docker network configuration or a network firewall on the host. The recommended configuration is to place OpenVINO Model Server behind a reverse proxy component or load balancer that provides traffic encryption and user authorization.

When deploying in environments where only local access is required, administrators can configure the server to bind exclusively to localhost addresses. This can be achieved by setting the bind address to `127.0.0.1` for IPv4 or `::1` for IPv6, which restricts incoming connections to the local machine only. This configuration prevents external network access to the server endpoints, providing an additional layer of security for local development or testing environments.
Collaborator (Author):

  • performance guideline doc about the potential 2 s issue on Windows

```
--grpc_bind_address 127.0.0.1,::1 --rest_bind_address 127.0.0.1,::1
```
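
When the server runs in a container, the same local-only restriction can be enforced from the host side by publishing the container ports on the loopback interface only. The command below is a sketch; the image tag, mounted model path, and port numbers are illustrative assumptions and should be adapted to the deployment (inside the container the server keeps its default 0.0.0.0 bind so the published ports stay reachable):

```bash
# Publish gRPC (9000) and REST (8000) ports on the host loopback interface only.
# Image, model path and ports are illustrative assumptions.
docker run --rm -d --read-only --tmpfs /tmp \
  -v $(pwd)/models/resnet:/models/resnet:ro \
  -p 127.0.0.1:9000:9000 -p 127.0.0.1:8000:8000 \
  openvino/model_server:latest \
  --model_path /models/resnet --model_name resnet \
  --port 9000 --rest_port 8000
```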

See also:
- [Securing OVMS with NGINX](../extras/nginx-mtls-auth/README.md)
- [Securing models with OVSA](https://docs.openvino.ai/2025/about-openvino/openvino-ecosystem/openvino-project/openvino-security-add-on.html)
2 changes: 1 addition & 1 deletion docs/stateful_models.md
@@ -259,7 +259,7 @@ signature = "serving_default"
request_body = json.dumps({"signature_name": signature,'inputs': inputs})

# Send request to OVMS and get response
response = requests.post("localhost:5555/v1/models/stateful_model:predict", data=request_body)
response = requests.post("127.0.0.1:5555/v1/models/stateful_model:predict", data=request_body)

# Parse response
response_body = json.loads(response.text)
3 changes: 2 additions & 1 deletion src/BUILD
@@ -2278,7 +2278,8 @@
"libovmslogging",
"libovmsstatus",
"libhttp_status_code",
"libovms_config"
"libovms_config",
"libovmsstring_utils",
],
visibility = ["//visibility:public",],
)
32 changes: 29 additions & 3 deletions src/config.cpp
@@ -21,6 +21,12 @@
#include <thread>
#include <vector>

#ifdef _WIN32
#include <ws2tcpip.h>
#else
#include <netdb.h>
#endif

#include "logging.hpp"
#include "ovms_exit_codes.hpp"

@@ -59,7 +65,29 @@ bool Config::parse(ServerSettingsImpl* serverSettings, ModelsSettingsImpl* model
return validate();
}

bool Config::is_ipv6(const std::string& s) {
addrinfo hints{};
hints.ai_family = AF_INET6;
hints.ai_flags = AI_NUMERICHOST;
addrinfo* res = nullptr;
const int rc = getaddrinfo(s.c_str(), nullptr, &hints, &res);
if (res) {
freeaddrinfo(res);
}
return rc == 0;
}

bool Config::check_hostname_or_ip(const std::string& input) {
auto split = ovms::tokenize(input, ',');
if (split.size() > 1) {
for (const auto& part : split) {
if (!check_hostname_or_ip(part)) {
return false;
}
}
return true;
}

if (input.size() > 255) {
return false;
}
@@ -74,9 +102,7 @@ bool Config::check_hostname_or_ip(const std::string& input) {
}
if (all_numeric) {
static const std::regex valid_ipv4_regex("^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$");
static const std::regex valid_ipv6_regex(R"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))");
return std::regex_match(input, valid_ipv4_regex) ||
std::regex_match(input, valid_ipv6_regex);
return std::regex_match(input, valid_ipv4_regex) || is_ipv6(input);
} else {
std::regex valid_hostname_regex("^(([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\\-]*[a-zA-Z0-9])\\.)*([A-Za-z0-9]|[A-Za-z0-9][A-Za-z0-9\\-]*[A-Za-z0-9])$");
return std::regex_match(input, valid_hostname_regex);
1 change: 1 addition & 0 deletions src/config.hpp
@@ -92,6 +92,7 @@ class Config {
* @return bool
*/
static bool check_hostname_or_ip(const std::string& input);
static bool is_ipv6(const std::string& input);

/**
* @brief Get the config path
12 changes: 9 additions & 3 deletions src/drogon_http_server.cpp
@@ -28,6 +28,7 @@
#include "logging.hpp"
#include "mediapipe/framework/port/threadpool.h"
#include "timer.hpp"
#include "stringutils.hpp"

namespace ovms {

@@ -129,9 +130,14 @@ Status DrogonHttpServer::startAcceptingRequests() {
if (allowedHeaders.size()) {
resp->addHeader("Access-Control-Allow-Headers", allowedHeaders);
}
})
.addListener(this->address, this->port)
.run();
});

auto ips = ovms::tokenize(this->address, ',');
for (const auto& ip : ips) {
SPDLOG_INFO("Binding REST server to address: {}:{}", ip, this->port);
drogon::app().addListener(ip, this->port);
}
drogon::app().run();
} catch (...) {
SPDLOG_ERROR("Exception occurred during drogon::run()");
}
15 changes: 14 additions & 1 deletion src/grpcservermodule.cpp
@@ -97,6 +97,14 @@ GRPCServerModule::GRPCServerModule(Server& server) :
tfsModelService(this->server),
kfsGrpcInferenceService(this->server) {}

static std::string host_with_port(const std::string& host, int port) {
if (Config::is_ipv6(host)) {
return "[" + host + "]:" + std::to_string(port);
} else {
return host + ":" + std::to_string(port);
}
}

Status GRPCServerModule::start(const ovms::Config& config) {
state = ModuleState::STARTED_INITIALIZE;
SPDLOG_INFO("{} starting", GRPC_SERVER_MODULE_NAME);
@@ -123,7 +131,12 @@ Status GRPCServerModule::start(const ovms::Config& config) {
ServerBuilder builder;
builder.SetMaxReceiveMessageSize(GIGABYTE);
builder.SetMaxSendMessageSize(GIGABYTE);
builder.AddListeningPort(config.grpcBindAddress() + ":" + std::to_string(config.port()), grpc::InsecureServerCredentials());
auto ips = ovms::tokenize(config.grpcBindAddress(), ',');
for (const auto& ip : ips) {
auto hostWithPort = host_with_port(ip, config.port());
SPDLOG_INFO("Binding gRPC server to address: {}", hostWithPort);
builder.AddListeningPort(hostWithPort, grpc::InsecureServerCredentials());
}
builder.RegisterService(&tfsPredictService);
builder.RegisterService(&tfsModelService);
builder.RegisterService(&kfsGrpcInferenceService);