Skip to content

Commit 2f654ad

Browse files
kouroshHakha, cursoragent, and aslonnie
authored
[serve] Downgrade multiplex model loading/unloading logs from INFO to DEBUG (#60985)
## Why are these changes needed?

The "Loading model '...'" / "Successfully loaded model '...'" / "Unloading model '...'" / "Successfully unloaded model '...'" messages fire on every request when using multiplexed model IDs (e.g., session-aware routing). At high concurrency this floods stdout with hundreds of lines per second, drowning out application logs. Since model loading/unloading metrics are already tracked via counters (`models_load_counter`, `models_unload_counter`) and histograms (`model_load_latency_ms`, `model_unload_latency_ms`), these log lines are only useful for debugging. Downgrade them to DEBUG.

---------

Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: Lonnie Liu <95255098+aslonnie@users.noreply.github.com>
1 parent b14f098 commit 2f654ad

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

python/ray/serve/multiplex.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ async def load_model(self, model_id: str) -> Any:
212212
self._push_multiplexed_replica_info = True
213213

214214
# Load the model.
215-
logger.info(f"Loading model '{model_id}'.")
215+
logger.debug(f"Loading model '{model_id}'.")
216216
self.models_load_counter.inc()
217217
load_start_time = time.time()
218218
if self.self_arg is None:
@@ -222,7 +222,7 @@ async def load_model(self, model_id: str) -> Any:
222222
self.self_arg, model_id
223223
)
224224
load_latency_ms = (time.time() - load_start_time) * 1000.0
225-
logger.info(
225+
logger.debug(
226226
f"Successfully loaded model '{model_id}' in "
227227
f"{load_latency_ms:.1f}ms."
228228
)
@@ -242,7 +242,7 @@ async def unload_model_lru(self) -> None:
242242
self.models_unload_counter.inc()
243243
unload_start_time = time.time()
244244
model_id, model = self.models.popitem(last=False)
245-
logger.info(f"Unloading model '{model_id}'.")
245+
logger.debug(f"Unloading model '{model_id}'.")
246246

247247
# If the model has __del__ attribute, call it.
248248
# This is to clean up the model resources eagerly.
@@ -254,7 +254,7 @@ async def unload_model_lru(self) -> None:
254254
model.__del__ = lambda _: None
255255
unload_latency_ms = (time.time() - unload_start_time) * 1000.0
256256
self.model_unload_latency_ms.observe(unload_latency_ms)
257-
logger.info(
257+
logger.debug(
258258
f"Successfully unloaded model '{model_id}' in {unload_latency_ms:.1f}ms."
259259
)
260260
self.registered_model_gauge.set(0, tags={"model_id": model_id})

0 commit comments

Comments
 (0)