Commit b80c17d

Allow more than one HTTP REST & gRPC listener (#3749)

### 🛠 Summary

CVS-170537

- Added support for a comma-separated bind address list via the CLI/C-API.
- Changed `localhost` to `127.0.0.1` in docs where the `requests` pip package is used, because requesting `localhost` can introduce an initial 2 s delay on Windows systems due to an IPv6 connection attempt before the actual IPv4 connection is established.
- Updated the security considerations doc.
- Updated the performance optimization doc.
1 parent 30c6761 commit b80c17d
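The multi-listener syntax introduced by this change can be sketched as below; the model name and path are placeholders for illustration, not taken from this commit:

```shell
# Hypothetical launch line showing the new comma-separated bind address
# lists: the gRPC and REST listeners each bind to both the IPv4 and
# IPv6 loopback addresses.
ovms --model_name my_model --model_path /models/my_model \
     --port 9000 --grpc_bind_address 127.0.0.1,::1 \
     --rest_port 8000 --rest_bind_address 127.0.0.1,::1
```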

File tree

16 files changed

+122
-28
lines changed


demos/age_gender_recognition/python/age_gender_recognition.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
 import argparse
 
 parser = argparse.ArgumentParser(description='Client for age gender recognition')
-parser.add_argument('--rest_address', required=False, default='localhost', help='Specify url to REST API service. default:localhost')
+parser.add_argument('--rest_address', required=False, default='127.0.0.1', help='Specify url to REST API service. default:127.0.0.1')
 parser.add_argument('--rest_port', required=False, default=9001, help='Specify port to REST API service. default: 9178')
 parser.add_argument('--model_name', required=False, default='age_gender', help='Model name to request. default: age_gender')
 parser.add_argument('--image_input_path', required=True, help='Input image path.')

demos/continuous_batching/structured_output/README.md

Lines changed: 4 additions & 4 deletions
@@ -120,7 +120,7 @@ payload = {
 }
 
 headers = {"Content-Type": "application/json", "Authorization": "not used"}
-response = requests.post("http://localhost:8000/v3/chat/completions", json=payload, headers=headers)
+response = requests.post("http://127.0.0.1:8000/v3/chat/completions", json=payload, headers=headers)
 json_response = response.json()
 
 print(json_response["choices"][0]["message"]["content"])
@@ -138,7 +138,7 @@ pip install openai
 ```python
 from openai import OpenAI
 from pydantic import BaseModel
-base_url = "http://localhost:8000/v3"
+base_url = "http://127.0.0.1:8000/v3"
 model_name = "OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov"
 client = OpenAI(base_url=base_url, api_key="unused")
 class CalendarEvent(BaseModel):
@@ -174,14 +174,14 @@ It will be executed with the response_format request field including the schema
 ```console
 pip install datasets tqdm openai jsonschema
 curl -L https://raw.githubusercontent.com/openvinotoolkit/model_server/main/demos/continuous_batching/structured_output/accuracy_test.py -O
-python accuracy_test.py --base_url http://localhost:8000/v3 --model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --concurrency 50 --limit 1000
+python accuracy_test.py --base_url http://127.0.0.1:8000/v3 --model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --concurrency 50 --limit 1000
 ```
 ```
 Requests: 1000, Successful responses: 1000, Exact matches: 135, Schema matches: 435 Invalid inputs: 0
 ```
 
 ```console
-python accuracy_test.py --base_url http://localhost:8000/v3 --model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --enable_response_format --concurrency 50 --limit 1000
+python accuracy_test.py --base_url http://127.0.0.1:8000/v3 --model OpenVINO/Mistral-7B-Instruct-v0.3-int4-cw-ov --enable_response_format --concurrency 50 --limit 1000
 ```
 ```
 Requests: 1000, Successful responses: 1000, Exact matches: 217, Schema matches: 828 Invalid inputs: 0

demos/continuous_batching/vlm/README.md

Lines changed: 1 addition & 1 deletion
@@ -239,7 +239,7 @@ curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/m
 ```python
 import requests
 import base64
-base_url='http://localhost:8000/v3'
+base_url='http://127.0.0.1:8000/v3'
 model_name = "OpenGVLab/InternVL2-2B"
 
 def convert_image(Image):

demos/rerank/README.md

Lines changed: 6 additions & 6 deletions
@@ -130,7 +130,7 @@ pip3 install cohere
 ```bash
 echo '
 import cohere
-client = cohere.Client(base_url="http://localhost:8000/v3", api_key="not_used")
+client = cohere.Client(base_url="http://127.0.0.1:8000/v3", api_key="not_used")
 responses = client.rerank(query="hello",documents=["welcome","farewell"], model="BAAI/bge-reranker-large")
 for response in responses.results:
     print(f"index {response.index}, relevance_score {response.relevance_score}")' > rerank_client.py
@@ -178,7 +178,7 @@ documents = [
     document_template.format(doc=doc, suffix=suffix) for doc in documents
 ]
 
-response = requests.post("http://localhost:8125/v3/rerank",
+response = requests.post("http://127.0.0.1:8000/v3/rerank",
     json={
         "model": "tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
         "query": query,
@@ -199,7 +199,7 @@ It will return response similar to:
 
 ```bash
 git clone https://github.com/openvinotoolkit/model_server
-python model_server/demos/rerank/compare_results.py --query "hello" --document "welcome" --document "farewell" --base_url http://localhost:8000/v3/
+python model_server/demos/rerank/compare_results.py --query "hello" --document "welcome" --document "farewell" --base_url http://127.0.0.1:8000/v3/
 query hello
 documents ['welcome', 'farewell']
 HF Duration: 145.731 ms
@@ -214,7 +214,7 @@ An asynchronous benchmarking client can be used to access the model server perfo
 ```bash
 cd model_server/demos/benchmark/embeddings/
 pip install -r requirements.txt
-python benchmark_embeddings.py --api_url http://localhost:8000/v3/rerank --backend ovms_rerank --dataset synthetic --synthetic_length 500 --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
+python benchmark_embeddings.py --api_url http://127.0.0.1:8000/v3/rerank --backend ovms_rerank --dataset synthetic --synthetic_length 500 --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
 Number of documents: 1000
 100%|██████████████████████████████████████| 50/50 [00:19<00:00, 2.53it/s]
 Tokens: 501000
@@ -224,7 +224,7 @@ Mean latency: 10268 ms
 Median latency: 10249 ms
 Average document length: 501.0 tokens
 
-python benchmark_embeddings.py --api_url http://localhost:8000/v3/rerank --backend ovms_rerank --dataset synthetic --synthetic_length 500 --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
+python benchmark_embeddings.py --api_url http://127.0.0.1:8000/v3/rerank --backend ovms_rerank --dataset synthetic --synthetic_length 500 --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
 Number of documents: 1000
 100%|██████████████████████████████████████| 50/50 [00:19<00:00, 2.53it/s]
 Tokens: 501000
@@ -234,7 +234,7 @@ Mean latency: 10268 ms
 Median latency: 10249 ms
 Average document length: 501.0 tokens
 
-python benchmark_embeddings.py --api_url http://localhost:8000/v3/rerank --backend ovms_rerank --dataset Cohere/wikipedia-22-12-simple-embeddings --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
+python benchmark_embeddings.py --api_url http://127.0.0.1:8000/v3/rerank --backend ovms_rerank --dataset Cohere/wikipedia-22-12-simple-embeddings --request_rate inf --batch_size 20 --model BAAI/bge-reranker-large
 Number of documents: 1000
 100%|██████████████████████████████████████| 50/50 [00:09<00:00, 5.55it/s]
 Tokens: 92248

demos/vlm_npu/README.md

Lines changed: 1 addition & 1 deletion
@@ -168,7 +168,7 @@ curl http://localhost:8000/v3/chat/completions -H "Content-Type: application/js
 ```python
 import requests
 import base64
-base_url='http://localhost:8000/v3'
+base_url='http://127.0.0.1:8000/v3'
 model_name = "microsoft/Phi-3.5-vision-instruct"
 
 def convert_image(Image):

docs/clients_genai.md

Lines changed: 4 additions & 4 deletions
@@ -67,7 +67,7 @@ print(response.choices[0].message)
 import requests
 payload = {"model": "meta-llama/Llama-2-7b-chat-hf", "messages": [ {"role": "user","content": "Say this is a test" }]}
 headers = {"Content-Type": "application/json", "Authorization": "not used"}
-response = requests.post("http://localhost:8000/v3/chat/completions", json=payload, headers=headers)
+response = requests.post("http://127.0.0.1:8000/v3/chat/completions", json=payload, headers=headers)
 print(response.text)
 ```
 :::
@@ -147,7 +147,7 @@ print(response.choices[0].text)
 import requests
 payload = {"model": "meta-llama/Llama-2-7b", "prompt": "Say this is a test"}
 headers = {"Content-Type": "application/json", "Authorization": "not used"}
-response = requests.post("http://localhost:8000/v3/completions", json=payload, headers=headers)
+response = requests.post("http://127.0.0.1:8000/v3/completions", json=payload, headers=headers)
 print(response.text)
 ```
 :::
@@ -280,7 +280,7 @@ for data in responses.data:
 import requests
 payload = {"model": "Alibaba-NLP/gte-large-en-v1.5", "input": "hello world"}
 headers = {"Content-Type": "application/json", "Authorization": "not used"}
-response = requests.post("http://localhost:8000/v3/embeddings", json=payload, headers=headers)
+response = requests.post("http://127.0.0.1:8000/v3/embeddings", json=payload, headers=headers)
 print(response.text)
 ```
 :::
@@ -435,7 +435,7 @@ for res in responses.results:
 import requests
 payload = {"model": "BAAI/bge-reranker-large", "query": "Hello", "documents":["Welcome","Farewell"]}
 headers = {"Content-Type": "application/json", "Authorization": "not used"}
-response = requests.post("http://localhost:8000/v3/rerank", json=payload, headers=headers)
+response = requests.post("http://127.0.0.1:8000/v3/rerank", json=payload, headers=headers)
 print(response.text)
 ```
 :::

docs/parameters.md

Lines changed: 2 additions & 2 deletions
@@ -36,8 +36,8 @@ Configuration options for the server are defined only via command-line options a
 |---|---|---|
 | `port` | `integer` | Number of the port used by gRPC sever. |
 | `rest_port` | `integer` | Number of the port used by HTTP server (if not provided or set to 0, HTTP server will not be launched). |
-| `grpc_bind_address` | `string` | Network interface address or a hostname, to which gRPC server will bind to. Default: all interfaces: 0.0.0.0 |
-| `rest_bind_address` | `string` | Network interface address or a hostname, to which REST server will bind to. Default: all interfaces: 0.0.0.0 |
+| `grpc_bind_address` | `string` | Comma separated list of ipv4/ipv6 network interface addresses or hostnames, to which gRPC server will bind to. Default: all interfaces: 0.0.0.0 |
+| `rest_bind_address` | `string` | Comma separated list of ipv4/ipv6 network interface addresses or hostnames, to which REST server will bind to. Default: all interfaces: 0.0.0.0 |
 | `grpc_workers` | `integer` | Number of the gRPC server instances (must be from 1 to CPU core count). Default value is 1 and it's optimal for most use cases. Consider setting higher value while expecting heavy load. |
 | `rest_workers` | `integer` | Number of HTTP server threads. Effective when `rest_port` > 0. Default value is set based on the number of CPUs. |
 | `file_system_poll_wait_seconds` | `integer` | Time interval between config and model versions changes detection in seconds. Default value is 1. Zero value disables changes monitoring. |
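The new comma-separated semantics of these options can be illustrated with a short sketch; `parse_bind_addresses` is a hypothetical helper for illustration, not OVMS code:

```python
import ipaddress

def parse_bind_addresses(value):
    """Split a comma-separated bind address list (the format accepted by
    --grpc_bind_address / --rest_bind_address) and classify each entry
    as an IPv4 literal, an IPv6 literal, or a hostname."""
    entries = [e.strip() for e in value.split(",") if e.strip()]
    result = []
    for entry in entries:
        try:
            ip = ipaddress.ip_address(entry)
            kind = "ipv4" if ip.version == 4 else "ipv6"
        except ValueError:
            kind = "hostname"  # hostnames are allowed alongside literals
        result.append((entry, kind))
    return result

print(parse_bind_addresses("127.0.0.1,::1"))
# [('127.0.0.1', 'ipv4'), ('::1', 'ipv6')]
```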

docs/performance_tuning.md

Lines changed: 17 additions & 0 deletions
@@ -144,6 +144,23 @@ To save power, the OS can decrease the CPU frequency and increase a volatility o
 $ cpupower frequency-set --min 3.1GHz
 ```
 
+## Network Configuration for Optimal Performance
+
+By default, OVMS endpoints are bound to all IPv4 addresses. On some systems, which resolve the localhost name to an IPv6 address, the client may need extra time to fall back to IPv4. This can effectively add 1-2 s of latency.
+It can be avoided by switching the API URL to `http://127.0.0.1` on the client side.
+
+Alternatively, IPv6 can be enabled in the model server using `--grpc_bind_address` and `--rest_bind_address`.
+For example:
+```
+--grpc_bind_address 127.0.0.1,::1 --rest_bind_address 127.0.0.1,::1
+```
+or
+```
+--grpc_bind_address 0.0.0.0,:: --rest_bind_address 0.0.0.0,::
+```
+
 ## Tuning Model Server configuration parameters
 
 OpenVINO Model Server in C++ implementation is using scalable multithreaded gRPC and REST interface, however in some hardware configuration it might become a bottleneck for high performance backend with OpenVINO.
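The localhost resolution behavior behind the delay described above can be inspected with the Python standard library; this is an illustration, not OVMS code:

```python
import socket

def resolved_families(host, port=8000):
    """Return the address families getaddrinfo yields for a host, in
    order. Clients typically try the first entry first, so an IPv6
    entry ahead of IPv4 is what triggers the fallback delay."""
    infos = socket.getaddrinfo(host, port, proto=socket.IPPROTO_TCP)
    return [info[0] for info in infos]

# On systems where localhost resolves to ::1 first, the first list
# starts with AF_INET6; the literal address yields AF_INET only.
print(resolved_families("localhost"))
print(resolved_families("127.0.0.1"))
```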

docs/security_considerations.md

Lines changed: 5 additions & 0 deletions
@@ -13,6 +13,11 @@ docker run --rm -d --user $(id -u):$(id -g) --read-only --tmpfs /tmp -p 9000:900
 ---
 OpenVINO Model Server currently does not provide access restrictions and traffic encryption on gRPC and REST API endpoints. The endpoints can be secured using network settings like docker network settings or network firewall on the host. The recommended configuration is to place OpenVINO Model Server behind any reverse proxy component or load balancer, which provides traffic encryption and user authorization.
 
+When deploying in environments where only local access is required, administrators can configure the server to bind exclusively to localhost addresses. This can be achieved by setting the bind address to `127.0.0.1` for IPv4 or `::1` for IPv6, which restricts incoming connections to the local machine only. This configuration prevents external network access to the server endpoints, providing an additional layer of security for local development or testing environments.
+```
+--grpc_bind_address 127.0.0.1,::1 --rest_bind_address 127.0.0.1,::1
+```
+
 See also:
 - [Securing OVMS with NGINX](../extras/nginx-mtls-auth/README.md)
 - [Securing models with OVSA](https://docs.openvino.ai/2025/about-openvino/openvino-ecosystem/openvino-project/openvino-security-add-on.html)
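The effect of loopback-only binding can be illustrated with a plain socket; this is a sketch of the concept, not of the model server's implementation:

```python
import socket

def open_loopback_listener():
    """Bind a TCP listener to the IPv4 loopback only, mirroring what
    --rest_bind_address 127.0.0.1 achieves: the OS refuses connections
    arriving on any other interface, so only local clients can reach it."""
    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    srv.bind(("127.0.0.1", 0))  # port 0 lets the OS pick a free port
    srv.listen()
    return srv

listener = open_loopback_listener()
host, port = listener.getsockname()
print(f"listening only on {host}:{port}")
listener.close()
```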

docs/stateful_models.md

Lines changed: 1 addition & 1 deletion
@@ -259,7 +259,7 @@ signature = "serving_default"
 request_body = json.dumps({"signature_name": signature,'inputs': inputs})
 
 # Send request to OVMS and get response
-response = requests.post("localhost:5555/v1/models/stateful_model:predict", data=request_body)
+response = requests.post("127.0.0.1:5555/v1/models/stateful_model:predict", data=request_body)
 
 # Parse response
 response_body = json.loads(response.text)
