3 changes: 3 additions & 0 deletions src/lightning/pytorch/CHANGELOG.md
@@ -67,6 +67,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed `LightningCLI` loading of hyperparameters from `ckpt_path` failing for subclass model mode ([#21246](https://github.com/Lightning-AI/pytorch-lightning/pull/21246))


- Fixed redundant host-device sync in progress bar printing ([#21233](https://github.com/Lightning-AI/pytorch-lightning/pull/21233))


- Fixed checking of init args so it only happens when the given frames are in an `__init__` method ([#21227](https://github.com/Lightning-AI/pytorch-lightning/pull/21227))


16 changes: 11 additions & 5 deletions src/lightning/pytorch/core/module.py
@@ -656,11 +656,17 @@ def __check_allowed(v: Any, name: str, value: Any) -> None:
        raise ValueError(f"`self.log({name}, {value})` was called, but `{type(v).__name__}` values cannot be logged")

    def __to_tensor(self, value: Union[Tensor, numbers.Number], name: str) -> Tensor:
        value = (
            value.clone().detach()
            if isinstance(value, Tensor)
            else torch.tensor(value, device=self.device, dtype=_get_default_dtype())
        )
        if isinstance(value, Tensor):
            # Keep the tensor on its original device to avoid unnecessary transfers
            value = value.clone().detach()
        else:
            # Place scalar metrics on the CPU to avoid a CPU-GPU transfer and synchronization.
            # `torch.tensor(value, device="cuda")` incurs such a synchronization, while the metric
            # itself is only consumed on the CPU side, so placing scalar inputs on the CPU is more efficient.
            # For non-CUDA devices, keep the original behavior.
            device = "cpu" if self.device.type == "cuda" else self.device
            value = torch.tensor(value, device=device, dtype=_get_default_dtype())

        if not torch.numel(value) == 1:
            raise ValueError(
                f"`self.log({name}, {value})` was called, but the tensor must have a single element."
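To make the effect of the `__to_tensor` change concrete, here is a minimal, self-contained sketch. The helper `to_metric_tensor` is hypothetical and only mirrors the new branch above; it is not part of the Lightning API. It uses `torch.cuda.set_sync_debug_mode`, the same mechanism the new tests below rely on, to check that neither path issues a synchronizing CUDA call:

```python
import torch


def to_metric_tensor(value) -> torch.Tensor:
    """Hypothetical helper mirroring the patched branch of `__to_tensor`: tensors keep
    their original device, while Python scalars are materialized on the CPU."""
    if isinstance(value, torch.Tensor):
        return value.clone().detach()
    return torch.tensor(value, device="cpu", dtype=torch.get_default_dtype())


if torch.cuda.is_available():
    # Create the CUDA tensor before enabling the debug mode, as the tests below do.
    cuda_loss = torch.tensor(0.7, device="cuda")
    # "error" raises on any synchronizing CUDA call issued inside this region.
    torch.cuda.set_sync_debug_mode("error")
    try:
        scalar_metric = to_metric_tensor(0.5)        # CPU tensor, no host-device copy
        tensor_metric = to_metric_tensor(cuda_loss)  # stays on the GPU untouched
    finally:
        torch.cuda.set_sync_debug_mode("default")
```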
@@ -234,7 +234,9 @@ def metrics(self) -> _METRICS:
        """This function returns either batch or epoch metrics."""
        on_step = self._first_loop_iter is not None
        assert self.trainer._results is not None
        return self.trainer._results.metrics(on_step)
        # Only include progress bar metrics if a progress bar callback is present
        include_pbar_metrics = self.trainer.progress_bar_callback is not None
        return self.trainer._results.metrics(on_step, include_pbar_metrics=include_pbar_metrics)

    @property
    def callback_metrics(self) -> _OUT_DICT:
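The connector change above can be summarized with a small stand-in: progress-bar metrics are only requested when a progress bar callback exists, so the tensor-to-scalar conversion that blocks on CUDA tensors is skipped entirely when nothing would display it. The `collect_metrics` function below is illustrative only and not the Lightning API:

```python
import torch


def collect_metrics(raw: dict, has_progress_bar: bool) -> dict:
    """Illustrative stand-in for the connector/result interaction above: the tensor ->
    Python scalar conversion (which blocks on CUDA tensors) is skipped when no progress
    bar callback will consume the values."""
    metrics = {"log": dict(raw), "pbar": {}}
    if has_progress_bar:
        metrics["pbar"] = {name: t.item() for name, t in raw.items()}
    return metrics


# With the progress bar disabled, no blocking `.item()` calls are issued.
raw = {"train_loss": torch.tensor(0.42)}
assert collect_metrics(raw, has_progress_bar=False)["pbar"] == {}
```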
@@ -467,7 +467,7 @@ def _forked_name(self, result_metric: _ResultMetric, on_step: bool) -> tuple[str
            forked_name += dataloader_suffix
        return name, forked_name

    def metrics(self, on_step: bool) -> _METRICS:
    def metrics(self, on_step: bool, include_pbar_metrics: bool = True) -> _METRICS:
        metrics = _METRICS(callback={}, log={}, pbar={})

        for _, result_metric in self.valid_items():
@@ -488,7 +488,7 @@ def metrics(self, on_step: bool) -> _METRICS:
                metrics["callback"][forked_name] = value

            # populate progress_bar metrics. convert tensors to numbers
            if result_metric.meta.prog_bar:
            if result_metric.meta.prog_bar and include_pbar_metrics:
                metrics["pbar"][forked_name] = convert_tensors_to_scalars(value)

        return metrics
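For reference, the guarded line matters because converting a single-element CUDA tensor to a Python number blocks on a device-to-host copy; `convert_tensors_to_scalars` ultimately performs that conversion for each progress-bar metric. A quick check of that assumption (requires a CUDA device):

```python
import torch

if torch.cuda.is_available():
    value = torch.tensor(0.42, device="cuda")
    torch.cuda.set_sync_debug_mode("error")
    try:
        value.item()  # expected to be flagged as a synchronizing CUDA operation
    except RuntimeError as err:
        print(f"sync detected: {err}")
    finally:
        torch.cuda.set_sync_debug_mode("default")
```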
50 changes: 50 additions & 0 deletions tests/tests_pytorch/core/test_lightning_module.py
@@ -594,3 +594,53 @@ def __init__(self):
        fabric.clip_gradients.assert_called_once_with(orig_model, optimizer, clip_val=1e-3, max_norm=None)
    else:
        fabric.clip_gradients.assert_called_once_with(orig_model, optimizer, clip_val=None, max_norm=1e-3)


@RunIf(min_cuda_gpus=1)
def test_log_no_cuda_sync():
    """Test logging scalars and tensors doesn't introduce CUDA sync."""

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.to("cuda")

        def training_step(self, batch, batch_idx):
            # Create tensors before enabling sync debug mode to avoid sync
            cuda_tensor = torch.tensor(0.7, device=self.device)
            cpu_tensor = torch.tensor(1.0, device="cpu")

            # Enable sync debug mode to catch any synchronization
            torch.cuda.set_sync_debug_mode("error")
            try:
                # Test scalar value (should be placed on CPU to avoid sync)
                self.log("scalar_loss", 0.5)

                # Test CUDA tensor (should stay on original device)
                self.log("cuda_tensor", cuda_tensor)

                # Test CPU tensor (should stay on CPU)
                self.log("cpu_tensor", cpu_tensor)

            except RuntimeError as e:
                if "called a synchronizing CUDA operation" in str(e):
                    msg = f"Unexpected CUDA synchronization: {e}"
                    pytest.fail(msg)
                else:
                    raise
            finally:
                torch.cuda.set_sync_debug_mode("default")

            return super().training_step(batch, batch_idx)

    model = TestModel()
    trainer = Trainer(
        max_epochs=1,
        limit_train_batches=1,
        limit_val_batches=0,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=False,
        enable_checkpointing=False,
    )
    trainer.fit(model)
100 changes: 100 additions & 0 deletions tests/tests_pytorch/trainer/logging_/test_logger_connector.py
@@ -660,3 +660,103 @@ def test_result_collection_changes_device():
    # same device as the new tensor
    results.log(fx, name, log_val, on_step=True, on_epoch=False, reduce_fx="mean")
    assert results[f"{fx}.{name}"].cumulated_batch_size.device == log_val.device


@RunIf(min_cuda_gpus=1)
def test_logger_connector_no_sync_without_progress_bar():
    """Test that the logger connector doesn't sync when no progress bar is present."""
    from lightning.pytorch import Trainer
    from lightning.pytorch.demos.boring_classes import BoringModel

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.to("cuda")

        def training_step(self, batch, batch_idx):
            # Compute the loss, then log metrics flagged for the progress bar
            loss = super().training_step(batch, batch_idx)["loss"]

            # Enable sync debug mode to catch any synchronization
            torch.cuda.set_sync_debug_mode("error")
            try:
                # These logs have prog_bar=True but should not sync
                # when no progress bar callback is present
                self.log("train_loss", loss, prog_bar=True)
                self.log("train_acc", 0.95, prog_bar=True)

            except RuntimeError as e:
                if "called a synchronizing CUDA operation" in str(e):
                    msg = f"Unexpected CUDA synchronization: {e}"
                    pytest.fail(msg)
                else:
                    raise
            finally:
                torch.cuda.set_sync_debug_mode("default")

            return loss

    model = TestModel()
    trainer = Trainer(
        max_epochs=1,
        limit_train_batches=1,
        limit_val_batches=0,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=False,  # key: no progress bar callback
        enable_checkpointing=False,
    )
    trainer.fit(model)


def test_result_collection_metrics_include_pbar_parameter():
    """Test that the `metrics` method handles the `include_pbar_metrics` parameter."""
    from lightning.pytorch.trainer.connectors.logger_connector.result import (
        _ResultCollection,
    )

    results = _ResultCollection(training=True)

    # Log some metrics with different prog_bar settings
    results.log(
        "training_step",
        "regular_metric",
        torch.tensor(1.0),
        on_step=True,
        on_epoch=False,
        prog_bar=False,
    )
    results.log(
        "training_step",
        "pbar_metric",
        torch.tensor(2.0),
        on_step=True,
        on_epoch=False,
        prog_bar=True,
    )
    results.log(
        "training_step",
        "both_metric",
        torch.tensor(3.0),
        on_step=True,
        on_epoch=False,
        prog_bar=True,
        logger=True,
    )

    # Test with include_pbar_metrics=True (default behavior)
    metrics_with_pbar = results.metrics(on_step=True, include_pbar_metrics=True)
    assert "pbar_metric" in metrics_with_pbar["pbar"]
    assert "both_metric" in metrics_with_pbar["pbar"]
    assert "both_metric" in metrics_with_pbar["log"]

    # Test with include_pbar_metrics=False (optimization)
    metrics_without_pbar = results.metrics(on_step=True, include_pbar_metrics=False)
    # No progress bar metrics should be included
    assert len(metrics_without_pbar["pbar"]) == 0
    # Logger metrics should still be included
    assert "both_metric" in metrics_without_pbar["log"]

    # Verify callback metrics are not affected
    assert "regular_metric" in metrics_with_pbar["callback"]
    assert "regular_metric" in metrics_without_pbar["callback"]