Skip to content

Commit 2f654ad

Browse files
kouroshHakha, cursoragent, and aslonnie
authored
[serve] Downgrade multiplex model loading/unloading logs from INFO to DEBUG (#60985)
## Why are these changes needed?

The "Loading model '...'" / "Successfully loaded model '...'" / "Unloading model '...'" / "Successfully unloaded model '...'" messages fire on every request when using multiplexed model IDs (e.g., session-aware routing). At high concurrency this floods stdout with hundreds of lines per second, drowning out application logs. Since model loading/unloading metrics are already tracked via counters (`models_load_counter`, `models_unload_counter`) and histograms (`model_load_latency_ms`, `model_unload_latency_ms`), these log lines are only useful for debugging. Downgrade them to DEBUG.

---------

Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: Lonnie Liu <95255098+aslonnie@users.noreply.github.com>
1 parent b14f098 commit 2f654ad

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

python/ray/serve/multiplex.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ async def load_model(self, model_id: str) -> Any:
212212
self._push_multiplexed_replica_info = True
213213

214214
# Load the model.
215-
logger.info(f"Loading model '{model_id}'.")
215+
logger.debug(f"Loading model '{model_id}'.")
216216
self.models_load_counter.inc()
217217
load_start_time = time.time()
218218
if self.self_arg is None:
@@ -222,7 +222,7 @@ async def load_model(self, model_id: str) -> Any:
222222
self.self_arg, model_id
223223
)
224224
load_latency_ms = (time.time() - load_start_time) * 1000.0
225-
logger.info(
225+
logger.debug(
226226
f"Successfully loaded model '{model_id}' in "
227227
f"{load_latency_ms:.1f}ms."
228228
)
@@ -242,7 +242,7 @@ async def unload_model_lru(self) -> None:
242242
self.models_unload_counter.inc()
243243
unload_start_time = time.time()
244244
model_id, model = self.models.popitem(last=False)
245-
logger.info(f"Unloading model '{model_id}'.")
245+
logger.debug(f"Unloading model '{model_id}'.")
246246

247247
# If the model has __del__ attribute, call it.
248248
# This is to clean up the model resources eagerly.
@@ -254,7 +254,7 @@ async def unload_model_lru(self) -> None:
254254
model.__del__ = lambda _: None
255255
unload_latency_ms = (time.time() - unload_start_time) * 1000.0
256256
self.model_unload_latency_ms.observe(unload_latency_ms)
257-
logger.info(
257+
logger.debug(
258258
f"Successfully unloaded model '{model_id}' in {unload_latency_ms:.1f}ms."
259259
)
260260
self.registered_model_gauge.set(0, tags={"model_id": model_id})

0 commit comments

Comments
 (0)