3 changes: 3 additions & 0 deletions src/lightning/pytorch/CHANGELOG.md
@@ -67,6 +67,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed `LightningCLI` loading of hyperparameters from `ckpt_path` failing for subclass model mode ([#21246](https://github.com/Lightning-AI/pytorch-lightning/pull/21246))


- Fixed redundant host-device sync in progress bar printing ([#21233](https://github.com/Lightning-AI/pytorch-lightning/pull/21233))


- Fixed checking of init args so it only happens when the given frames are in an `__init__` method ([#21227](https://github.com/Lightning-AI/pytorch-lightning/pull/21227))


16 changes: 11 additions & 5 deletions src/lightning/pytorch/core/module.py
@@ -656,11 +656,17 @@ def __check_allowed(v: Any, name: str, value: Any) -> None:
        raise ValueError(f"`self.log({name}, {value})` was called, but `{type(v).__name__}` values cannot be logged")

    def __to_tensor(self, value: Union[Tensor, numbers.Number], name: str) -> Tensor:
        value = (
            value.clone().detach()
            if isinstance(value, Tensor)
            else torch.tensor(value, device=self.device, dtype=_get_default_dtype())
        )
        if isinstance(value, Tensor):
            # Keep the tensor on its original device to avoid unnecessary transfers
            value = value.clone().detach()
        else:
            # Place scalar metrics on the CPU to avoid a CPU-GPU transfer and synchronization.
            # `torch.tensor(value, device="cuda")` incurs such a synchronization, while the metric
            # itself is only consumed on the CPU side, so placing scalar inputs on the CPU is more efficient.
            # For non-CUDA devices, keep the original behavior.
            device = "cpu" if self.device.type == "cuda" else self.device
            value = torch.tensor(value, device=device, dtype=_get_default_dtype())

        if not torch.numel(value) == 1:
            raise ValueError(
                f"`self.log({name}, {value})` was called, but the tensor must have a single element."
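To make the effect of the `__to_tensor` change concrete, here is a minimal, self-contained sketch. The helper `to_metric_tensor` is hypothetical and only mirrors the new branch above; it is not part of the Lightning API. It uses `torch.cuda.set_sync_debug_mode`, the same mechanism the new tests below rely on, to check that neither path issues a synchronizing CUDA call:

```python
import torch


def to_metric_tensor(value) -> torch.Tensor:
    """Hypothetical helper mirroring the patched branch of `__to_tensor`: tensors keep
    their original device, while Python scalars are materialized on the CPU."""
    if isinstance(value, torch.Tensor):
        return value.clone().detach()
    return torch.tensor(value, device="cpu", dtype=torch.get_default_dtype())


if torch.cuda.is_available():
    # Create the CUDA tensor before enabling the debug mode, as the tests below do.
    cuda_loss = torch.tensor(0.7, device="cuda")
    # "error" raises on any synchronizing CUDA call issued inside this region.
    torch.cuda.set_sync_debug_mode("error")
    try:
        scalar_metric = to_metric_tensor(0.5)        # CPU tensor, no host-device copy
        tensor_metric = to_metric_tensor(cuda_loss)  # stays on the GPU untouched
    finally:
        torch.cuda.set_sync_debug_mode("default")
```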
@@ -234,7 +234,9 @@ def metrics(self) -> _METRICS:
        """This function returns either batch or epoch metrics."""
        on_step = self._first_loop_iter is not None
        assert self.trainer._results is not None
        return self.trainer._results.metrics(on_step)
        # Only include progress bar metrics if a progress bar callback is present
        include_pbar_metrics = self.trainer.progress_bar_callback is not None
        return self.trainer._results.metrics(on_step, include_pbar_metrics=include_pbar_metrics)

    @property
    def callback_metrics(self) -> _OUT_DICT:
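The connector change above can be summarized with a small stand-in: progress-bar metrics are only requested when a progress bar callback exists, so the tensor-to-scalar conversion that blocks on CUDA tensors is skipped entirely when nothing would display it. The `collect_metrics` function below is illustrative only and not the Lightning API:

```python
import torch


def collect_metrics(raw: dict, has_progress_bar: bool) -> dict:
    """Illustrative stand-in for the connector/result interaction above: the tensor ->
    Python scalar conversion (which blocks on CUDA tensors) is skipped when no progress
    bar callback will consume the values."""
    metrics = {"log": dict(raw), "pbar": {}}
    if has_progress_bar:
        metrics["pbar"] = {name: t.item() for name, t in raw.items()}
    return metrics


# With the progress bar disabled, no blocking `.item()` calls are issued.
raw = {"train_loss": torch.tensor(0.42)}
assert collect_metrics(raw, has_progress_bar=False)["pbar"] == {}
```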
@@ -467,7 +467,7 @@ def _forked_name(self, result_metric: _ResultMetric, on_step: bool) -> tuple[str
            forked_name += dataloader_suffix
        return name, forked_name

    def metrics(self, on_step: bool) -> _METRICS:
    def metrics(self, on_step: bool, include_pbar_metrics: bool = True) -> _METRICS:
        metrics = _METRICS(callback={}, log={}, pbar={})

        for _, result_metric in self.valid_items():
@@ -488,7 +488,7 @@ def metrics(self, on_step: bool) -> _METRICS:
                metrics["callback"][forked_name] = value

            # populate progress_bar metrics. convert tensors to numbers
            if result_metric.meta.prog_bar:
            if result_metric.meta.prog_bar and include_pbar_metrics:
                metrics["pbar"][forked_name] = convert_tensors_to_scalars(value)

        return metrics
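For reference, the guarded line matters because converting a single-element CUDA tensor to a Python number blocks on a device-to-host copy; `convert_tensors_to_scalars` ultimately performs that conversion for each progress-bar metric. A quick check of that assumption (requires a CUDA device):

```python
import torch

if torch.cuda.is_available():
    value = torch.tensor(0.42, device="cuda")
    torch.cuda.set_sync_debug_mode("error")
    try:
        value.item()  # expected to be flagged as a synchronizing CUDA operation
    except RuntimeError as err:
        print(f"sync detected: {err}")
    finally:
        torch.cuda.set_sync_debug_mode("default")
```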
50 changes: 50 additions & 0 deletions tests/tests_pytorch/core/test_lightning_module.py
@@ -594,3 +594,53 @@ def __init__(self):
        fabric.clip_gradients.assert_called_once_with(orig_model, optimizer, clip_val=1e-3, max_norm=None)
    else:
        fabric.clip_gradients.assert_called_once_with(orig_model, optimizer, clip_val=None, max_norm=1e-3)


@RunIf(min_cuda_gpus=1)
def test_log_no_cuda_sync():
    """Test logging scalars and tensors doesn't introduce CUDA sync."""

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.to("cuda")

        def training_step(self, batch, batch_idx):
            # Create tensors before enabling sync debug mode to avoid sync
            cuda_tensor = torch.tensor(0.7, device=self.device)
            cpu_tensor = torch.tensor(1.0, device="cpu")

            # Enable sync debug mode to catch any synchronization
            torch.cuda.set_sync_debug_mode("error")
            try:
                # Test scalar value (should be placed on CPU to avoid sync)
                self.log("scalar_loss", 0.5)

                # Test CUDA tensor (should stay on original device)
                self.log("cuda_tensor", cuda_tensor)

                # Test CPU tensor (should stay on CPU)
                self.log("cpu_tensor", cpu_tensor)

            except RuntimeError as e:
                if "called a synchronizing CUDA operation" in str(e):
                    msg = f"Unexpected CUDA synchronization: {e}"
                    pytest.fail(msg)
                else:
                    raise
            finally:
                torch.cuda.set_sync_debug_mode("default")

            return super().training_step(batch, batch_idx)

    model = TestModel()
    trainer = Trainer(
        max_epochs=1,
        limit_train_batches=1,
        limit_val_batches=0,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=False,
        enable_checkpointing=False,
    )
    trainer.fit(model)
100 changes: 100 additions & 0 deletions tests/tests_pytorch/trainer/logging_/test_logger_connector.py
@@ -660,3 +660,103 @@ def test_result_collection_changes_device():
    # same device as the new tensor
    results.log(fx, name, log_val, on_step=True, on_epoch=False, reduce_fx="mean")
    assert results[f"{fx}.{name}"].cumulated_batch_size.device == log_val.device


@RunIf(min_cuda_gpus=1)
def test_logger_connector_no_sync_without_progress_bar():
    """Test that the logger connector doesn't sync when no progress bar is present."""
    from lightning.pytorch import Trainer
    from lightning.pytorch.demos.boring_classes import BoringModel

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.to("cuda")

        def training_step(self, batch, batch_idx):
            # Compute the loss, then log metrics flagged for the progress bar
            loss = super().training_step(batch, batch_idx)["loss"]

            # Enable sync debug mode to catch any synchronization
            torch.cuda.set_sync_debug_mode("error")
            try:
                # These logs have prog_bar=True but should not sync
                # when no progress bar callback is present
                self.log("train_loss", loss, prog_bar=True)
                self.log("train_acc", 0.95, prog_bar=True)

            except RuntimeError as e:
                if "called a synchronizing CUDA operation" in str(e):
                    msg = f"Unexpected CUDA synchronization: {e}"
                    pytest.fail(msg)
                else:
                    raise
            finally:
                torch.cuda.set_sync_debug_mode("default")

            return loss

    model = TestModel()
    trainer = Trainer(
        max_epochs=1,
        limit_train_batches=1,
        limit_val_batches=0,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=False,  # key: no progress bar callback
        enable_checkpointing=False,
    )
    trainer.fit(model)


def test_result_collection_metrics_include_pbar_parameter():
    """Test that the `metrics` method handles the `include_pbar_metrics` parameter."""
    from lightning.pytorch.trainer.connectors.logger_connector.result import (
        _ResultCollection,
    )

    results = _ResultCollection(training=True)

    # Log some metrics with different prog_bar settings
    results.log(
        "training_step",
        "regular_metric",
        torch.tensor(1.0),
        on_step=True,
        on_epoch=False,
        prog_bar=False,
    )
    results.log(
        "training_step",
        "pbar_metric",
        torch.tensor(2.0),
        on_step=True,
        on_epoch=False,
        prog_bar=True,
    )
    results.log(
        "training_step",
        "both_metric",
        torch.tensor(3.0),
        on_step=True,
        on_epoch=False,
        prog_bar=True,
        logger=True,
    )

    # Test with include_pbar_metrics=True (default behavior)
    metrics_with_pbar = results.metrics(on_step=True, include_pbar_metrics=True)
    assert "pbar_metric" in metrics_with_pbar["pbar"]
    assert "both_metric" in metrics_with_pbar["pbar"]
    assert "both_metric" in metrics_with_pbar["log"]

    # Test with include_pbar_metrics=False (optimization)
    metrics_without_pbar = results.metrics(on_step=True, include_pbar_metrics=False)
    # No progress bar metrics should be included
    assert len(metrics_without_pbar["pbar"]) == 0
    # Logger metrics should still be included
    assert "both_metric" in metrics_without_pbar["log"]

    # Verify callback metrics are not affected
    assert "regular_metric" in metrics_with_pbar["callback"]
    assert "regular_metric" in metrics_without_pbar["callback"]