[MLH Phase 2] Refactor Logging System #284
base: main
Changes from all commits: d5598fd, 80da0bb, ff54875, 30e434b, fef448c, 9fdff75, 5a6ece8, f407ebe, aa1654d, f969ac2, 3b7d4f4, e7ebcb3, 5bfd568, 8232297, f586bf1, 61296cf, a9747a7, 496cd70, 41877d6
@@ -52,7 +52,7 @@ class SSLClassyHookFunctions(Enum):
    on_end = auto()


def default_hook_generator(cfg: AttrDict) -> List[ClassyHook]:
def default_hook_generator(cfg: AttrDict, event_storage) -> List[ClassyHook]:
Reviewer comment: nit: Type hint for event_storage. (A hinted signature is sketched after this file's diff.)
""" | ||
The utility function that prepares all the hoooks that will be used in training | ||
based on user selection. Some basic hooks are used by default. | ||
|
@@ -114,7 +114,7 @@ def default_hook_generator(cfg: AttrDict) -> List[ClassyHook]: | |
"If using conda and you prefer conda install of tensorboard: " | ||
"`conda install -c conda-forge tensorboard`" | ||
) | ||
tb_hook = get_tensorboard_hook(cfg) | ||
tb_hook = get_tensorboard_hook(cfg, event_storage) | ||
hooks.extend([tb_hook]) | ||
if cfg.MODEL.GRAD_CLIP.USE_GRAD_CLIP: | ||
hooks.extend( | ||
|
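To illustrate the type-hint nit above, a minimal sketch of the hinted signature. It assumes `VisslEventStorage` is the storage type exported by `vissl.utils.events` (as imported elsewhere in this PR); the body is elided.

```python
from typing import List

from classy_vision.hooks import ClassyHook
from vissl.config import AttrDict
from vissl.utils.events import VisslEventStorage


def default_hook_generator(
    cfg: AttrDict, event_storage: VisslEventStorage
) -> List[ClassyHook]:
    """Prepare the hooks used in training; some basic hooks are added by default."""
    hooks: List[ClassyHook] = []
    # ... build the hooks exactly as before, forwarding event_storage
    # to get_tensorboard_hook(cfg, event_storage) where needed ...
    return hooks
```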
@@ -5,8 +5,6 @@
"""

import atexit
import datetime
import json
import logging
import time
from typing import Optional

@@ -19,6 +17,7 @@
from fvcore.common.file_io import PathManager
from vissl.utils.checkpoint import CheckpointWriter, is_checkpoint_phase
from vissl.utils.env import get_machine_local_and_dist_rank
from vissl.utils.events import VisslEventStorage
from vissl.utils.io import save_file
from vissl.utils.logger import log_gpu_stats
from vissl.utils.perf_stats import PerfStats
@@ -165,6 +164,7 @@ def on_update(self, task: "tasks.ClassyTask") -> None:
        train_phase_idx = task.train_phase_idx
        log_freq = task.config["LOG_FREQUENCY"]
        iteration = task.iteration
        evt_stg: VisslEventStorage = task.event_storage

        if torch.cuda.is_available():
            peak_mem_used = int(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0)
@@ -184,49 +184,35 @@ def on_update(self, task: "tasks.ClassyTask") -> None:
            avg_time = sum(batch_times) / len(batch_times)

            eta_secs = avg_time * (task.max_iteration - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_secs)))
            if isinstance(task.optimizer.options_view.lr, set):
                lr_val = list(task.optimizer.options_view.lr)
            else:
                lr_val = round(task.optimizer.options_view.lr, 5)
            batch_time = int(1000.0 * avg_time)
            rank = get_rank()
            log_data = {
                "Rank": rank,
                "ep": train_phase_idx,
                "iter": iteration,
                "lr": lr_val,
                "loss": loss_val,
                "btime(ms)": batch_time,
                "eta": eta_string,
                "peak_mem(M)": peak_mem_used,
            }
            evt_stg.put_scalars(
                rank=rank,
                epoch=train_phase_idx,
                iteration=iteration,
                lr=lr_val,
                loss=loss_val,
                batch_time=batch_time,
                eta=eta_secs,
                peak_mem_used=peak_mem_used,
            )
            if self.btime_freq and len(batch_times) >= self.btime_freq:
                rolling_avg_time = (
                    sum(batch_times[-self.btime_freq :]) / self.btime_freq
                )
                rolling_eta_secs = int(
                    rolling_avg_time * (task.max_iteration - iteration)
                )
                rolling_eta_str = str(
                    datetime.timedelta(seconds=int(rolling_eta_secs))
                )
                rolling_btime = int(1000.0 * rolling_avg_time)
                log_data[f"btime({self.btime_freq}iters)(ms)"] = rolling_btime
                log_data["rolling_eta"] = rolling_eta_str

            # to maintain the backwards compatibility with the log.txt
            # logs, we convert the json to the previous format.
            # the stdout.json can be used to use the json format of logs.
            stdout_data = ""
            for key, value in log_data.items():
                stdout_data = (
                    f"{stdout_data}[{key}: {value}] "
                    if key == "ep"
                    else f"{stdout_data}{key}: {value}; "
                evt_stg.put_scalars(
                    rolling_btime=rolling_btime, rolling_eta=rolling_eta_secs
Reviewer comment: nit: I prefer keeping all times in human readable strings. We may want to add a VisslEventStorage#put_string or change #put_scalars to #put_values and make it type agnostic. (A sketch of this idea follows this hunk.)
                )
            logging.info(stdout_data.strip())
Reviewer comment: So I believe we lose the logging to stdout here, which imo we should definitely keep -- super helpful for debugging. We could create another EventStorageWriter called StdOutWriter and add it to event_storage_writers. (A StdOutWriter sketch also follows this hunk.)
            self.json_stdout_logger.write(json.dumps(log_data) + "\n")
            for writer in task.event_storage_writers:
                writer.write()
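On the put_string/put_values suggestion above: a minimal sketch of what such a type-agnostic storage API could look like. This only illustrates the reviewer's idea; `EventStorageSketch`, `put_value`, and the internal buffers are hypothetical names, not the PR's actual `VisslEventStorage` implementation.

```python
from collections import defaultdict
from typing import Any


class EventStorageSketch:
    """Illustrative only: accepts any value type, not just scalars."""

    def __init__(self):
        self._latest = {}
        self._history = defaultdict(list)

    def put_value(self, name: str, value: Any):
        self._latest[name] = value
        self._history[name].append(value)

    def put_string(self, name: str, value: str):
        self.put_value(name, value)

    def put_values(self, **kwargs: Any):
        # Type-agnostic variant of put_scalars: strings such as a human
        # readable ETA ("1 day, 2:34:56") are stored as-is.
        for name, value in kwargs.items():
            self.put_value(name, value)
```

With something like this, the hook could keep the ETA human readable, e.g. `storage.put_string("eta", str(datetime.timedelta(seconds=int(eta_secs))))`.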
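For the stdout concern above, a sketch of the suggested StdOutWriter. It assumes writers follow a simple write()/close() protocol and that the storage exposes its most recent values; `StdOutWriterSketch` and `latest()` are hypothetical names, since the PR's event-writer interface is not shown here.

```python
import logging


class StdOutWriterSketch:
    """Illustrative writer that restores the old log.txt-style stdout line."""

    def __init__(self, storage):
        self._storage = storage  # a VisslEventStorage-like object

    def write(self):
        # `latest()` is a hypothetical accessor returning {name: value}
        # for everything put into the storage since the last write.
        latest = self._storage.latest()
        line = " ".join(f"{key}: {value};" for key, value in latest.items())
        logging.info(line)

    def close(self):
        pass


# Hypothetical hookup in build_event_storage_writers():
# self.event_storage_writers.append(StdOutWriterSketch(self.event_storage))
```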
class LogLossMetricsCheckpointHook(ClassyHook):
@@ -11,6 +11,7 @@
    ActivationStatisticsMonitor,
    ActivationStatisticsObserver,
)
from vissl.utils.events import VisslEventStorage


try:
@@ -62,6 +63,7 @@ class SSLTensorboardHook(ClassyHook):
    def __init__(
        self,
        tb_writer: SummaryWriter,
        event_storage: VisslEventStorage,
        log_params: bool = False,
        log_params_every_n_iterations: int = -1,
        log_params_gradients: bool = False,
@@ -86,6 +88,7 @@ def __init__(
        )
        logging.info("Setting up SSL Tensorboard Hook...")
        self.tb_writer = tb_writer
Reviewer comment: Do we need tb_writer anymore in this class? Can we refactor it out?
Reviewer comment: confirming: none of these methods write to tensorboard anymore and that it's only handled in TensorBoardWriter#Write. (A sketch of such a writer follows this hunk.)
        self.event_storage = event_storage
        self.log_params = log_params
        self.log_params_every_n_iterations = log_params_every_n_iterations
        self.log_params_gradients = log_params_gradients
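As context for the comment above (all TensorBoard output now funneled through the writer), here is a rough sketch of how a TensorboardWriter.write() could flush the event storage into a SummaryWriter. It follows the detectron2-style writer pattern; the class name, constructor, and `latest_with_iteration()` accessor are assumptions, not the PR's actual vissl.utils.events code.

```python
from torch.utils.tensorboard import SummaryWriter


class TensorboardWriterSketch:
    """Illustrative only: flushes buffered events to TensorBoard."""

    def __init__(self, log_dir: str, flush_secs: int, storage):
        self._writer = SummaryWriter(log_dir=log_dir, flush_secs=flush_secs)
        self._storage = storage  # a VisslEventStorage-like object

    def write(self):
        # `latest_with_iteration()` is a hypothetical accessor returning
        # {name: (value, iteration)} for everything buffered since last write.
        for name, (value, iteration) in self._storage.latest_with_iteration().items():
            self._writer.add_scalar(name, value, global_step=iteration)

    def close(self):
        self._writer.close()
```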
@@ -137,9 +140,7 @@ def on_forward(self, task: "tasks.ClassyTask") -> None:
            and task.iteration % self.log_params_every_n_iterations == 0
        ):
            for name, parameter in task.base_model.named_parameters():
                self.tb_writer.add_histogram(
                    f"Parameters/{name}", parameter, global_step=task.iteration
                )
                self.event_storage.put_histogram(f"Parameters/{name}", parameter)

    def on_phase_start(self, task: "tasks.ClassyTask") -> None:
        """

@@ -153,9 +154,7 @@ def on_phase_start(self, task: "tasks.ClassyTask") -> None:
        # log the parameters just once, before training starts
        if is_primary() and task.train and task.train_phase_idx == 0:
            for name, parameter in task.base_model.named_parameters():
                self.tb_writer.add_histogram(
                    f"Parameters/{name}", parameter, global_step=-1
                )
                self.event_storage.put_histogram(f"Parameters/{name}", parameter)

    def on_phase_end(self, task: "tasks.ClassyTask") -> None:
        """
@@ -172,7 +171,7 @@ def on_phase_end(self, task: "tasks.ClassyTask") -> None:
            for top_n, accuracies in meter.value.items():
                for i, acc in accuracies.items():
                    tag_name = f"{phase_type}/Accuracy_" f" {top_n}_Output_{i}"
                    self.tb_writer.add_scalar(
                    self.event_storage.put_scalars(
                        tag=tag_name,
                        scalar_value=round(acc, 5),
                        global_step=task.train_phase_idx,
@@ -184,20 +183,16 @@ def on_phase_end(self, task: "tasks.ClassyTask") -> None:
        # Log the weights and bias at the end of the epoch
        if self.log_params:
            for name, parameter in task.base_model.named_parameters():
                self.tb_writer.add_histogram(
                    f"Parameters/{name}",
                    parameter,
                    global_step=task.train_phase_idx,
                self.event_storage.put_histogram(
                    f"Parameters/{name}", parameter,
                )
        # Log the parameter gradients at the end of the epoch
        if self.log_params_gradients:
            for name, parameter in task.base_model.named_parameters():
                if parameter.grad is not None:
                    try:
                        self.tb_writer.add_histogram(
                            f"Gradients/{name}",
                            parameter.grad,
                            global_step=task.train_phase_idx,
                        self.event_storage.put_histogram(
                            f"Gradients/{name}", parameter.grad,
                        )
                    except ValueError:
                        logging.info(
@@ -235,10 +230,8 @@ def on_update(self, task: "tasks.ClassyTask") -> None:
            for name, parameter in task.base_model.named_parameters():
                if parameter.grad is not None:
                    try:
                        self.tb_writer.add_histogram(
                            f"Gradients/{name}",
                            parameter.grad,
                            global_step=task.iteration,
                        self.event_storage.put_histogram(
                            f"Gradients/{name}", parameter.grad,
                        )
                    except ValueError:
                        logging.info(
@@ -251,13 +244,13 @@ def on_update(self, task: "tasks.ClassyTask") -> None:
            iteration <= 100 and iteration % 5 == 0
        ):
            logging.info(f"Logging metrics. Iteration {iteration}")
            self.tb_writer.add_scalar(
            self.event_storage.put_scalars(
                tag="Training/Loss",
                scalar_value=round(task.last_batch.loss.data.cpu().item(), 5),
                global_step=iteration,
            )

            self.tb_writer.add_scalar(
            self.event_storage.put_scalars(
                tag="Training/Learning_rate",
                scalar_value=round(task.optimizer.options_view.lr, 5),
                global_step=iteration,
@@ -270,7 +263,7 @@ def on_update(self, task: "tasks.ClassyTask") -> None:
                batch_times = [0]

            batch_time_avg_s = sum(batch_times) / max(len(batch_times), 1)
            self.tb_writer.add_scalar(
            self.event_storage.put_scalars(
                tag="Speed/Batch_processing_time_ms",
                scalar_value=int(1000.0 * batch_time_avg_s),
                global_step=iteration,
@@ -285,7 +278,7 @@ def on_update(self, task: "tasks.ClassyTask") -> None:
                if batch_time_avg_s > 0
                else 0.0
            )
            self.tb_writer.add_scalar(
            self.event_storage.put_scalars(
                tag="Speed/img_per_sec_per_gpu",
                scalar_value=pic_per_batch_per_gpu_per_sec,
                global_step=iteration,
@@ -294,7 +287,7 @@ def on_update(self, task: "tasks.ClassyTask") -> None:
            # ETA
            avg_time = sum(batch_times) / len(batch_times)
            eta_secs = avg_time * (task.max_iteration - iteration)
            self.tb_writer.add_scalar(
            self.event_storage.put_scalars(
                tag="Speed/ETA_hours",
                scalar_value=eta_secs / 3600.0,
                global_step=iteration,
@@ -303,21 +296,21 @@ def on_update(self, task: "tasks.ClassyTask") -> None:
            # GPU Memory
            if torch.cuda.is_available():
                # Memory actually being used
                self.tb_writer.add_scalar(
                self.event_storage.put_scalars(
                    tag="Memory/Peak_GPU_Memory_allocated_MiB",
                    scalar_value=torch.cuda.max_memory_allocated() / BYTE_TO_MiB,
                    global_step=iteration,
                )

                # Memory reserved by PyTorch's memory allocator
                self.tb_writer.add_scalar(
                self.event_storage.put_scalars(
                    tag="Memory/Peak_GPU_Memory_reserved_MiB",
                    scalar_value=torch.cuda.max_memory_reserved()
                    / BYTE_TO_MiB,  # byte to MiB
                    global_step=iteration,
                )

                self.tb_writer.add_scalar(
                self.event_storage.put_scalars(
                    tag="Memory/Current_GPU_Memory_reserved_MiB",
                    scalar_value=torch.cuda.memory_reserved()
                    / BYTE_TO_MiB,  # byte to MiB
@@ -116,6 +116,13 @@ def standard_train_step(task):
    sample = next(task.data_iterator)

    sample = construct_sample_for_model(sample, task)
    vis_period = task.config.HOOKS.TENSORBOARD_SETUP.VISUALIZATION_SAMPLE_PERIOD
Reviewer comment: nit: Can we comment this code block? (A commented version is sketched after this hunk.)
    if vis_period > 0 and task.iteration % vis_period == 0:
        storage = task.event_storage
        storage.put_images()
        name = f"Model input sample: iteration: {task.iteration_num}"
        for idx, vis_img in enumerate(sample["input"]):
Reviewer comment: thought: If batch size is large, this will upload a lot of images -- do we want to offer the option to only sample a small portion of the inputs? (The sketch after this hunk shows one way to cap the count.)
            storage.put_image(name + f" ({idx})", vis_img)

    # Only need gradients during training
    grad_context = torch.enable_grad() if task.train else torch.no_grad()
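Putting the two comments above together, a possible commented version of the visualization block that also caps how many inputs are logged. `maybe_log_input_images` and `max_images` are made-up names for illustration; the cap is a local constant here rather than a config option in this PR.

```python
def maybe_log_input_images(task, sample) -> None:
    """Sketch: commented version of the block above, with a cap on images."""
    # Periodically push a few raw input images into the event storage so the
    # TensorBoard writer can later visualize what the model is being fed.
    vis_period = task.config.HOOKS.TENSORBOARD_SETUP.VISUALIZATION_SAMPLE_PERIOD
    if vis_period <= 0 or task.iteration % vis_period != 0:
        return
    storage = task.event_storage
    name = f"Model input sample: iteration: {task.iteration_num}"
    max_images = 8  # hypothetical cap so a large batch does not flood the log
    for idx, vis_img in enumerate(sample["input"][:max_images]):
        storage.put_image(name + f" ({idx})", vis_img)
```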
@@ -12,6 +12,7 @@
from classy_vision.tasks.classification_task import AmpType, BroadcastBuffersMode
from fvcore.common.file_io import PathManager
from torch.cuda.amp import GradScaler as TorchGradScaler

from vissl.config import AttrDict
from vissl.data import (
    build_dataset,
@@ -88,6 +89,7 @@ def __init__(self, config: AttrDict):
        self.phase_idx = -1
        # id of the current training phase training is at. Starts from 0
        self.train_phase_idx = -1  # set by trainer
        self._event_storage = None
        # metrics stored during the training.
        self.metrics = {}  # set by the trainer
        self.start_time = -1  # set by trainer
@@ -116,6 +118,28 @@ def __init__(self, config: AttrDict):
        # communication as much as possible
        self.set_ddp_bucket_cap_mb()
        self.use_gpu = self.device.type == "cuda"
        self.event_storage_writers = []

    def initiate_vissl_event_storage(self):
        from vissl.utils.events import create_event_storage, get_event_storage
Reviewer comment: thought: I prefer keeping imports up top usually unless necessary.
        create_event_storage()
        self._event_storage = get_event_storage()
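The PR does not show vissl/utils/events.py itself; purely for orientation, here is a minimal sketch of what a create_event_storage / get_event_storage pair could look like, modeled on detectron2's global EventStorage pattern. Every name and detail below is an assumption; the actual implementation may differ.

```python
# Hypothetical sketch of vissl/utils/events.py helpers (not the PR's actual code).
from collections import defaultdict
from typing import Optional

_CURRENT_STORAGE: Optional["VisslEventStorage"] = None


class VisslEventStorage:
    """Accumulates scalars (and similar events) until a writer flushes them."""

    def __init__(self):
        self._scalars = defaultdict(list)

    def put_scalars(self, **kwargs):
        for name, value in kwargs.items():
            self._scalars[name].append(value)


def create_event_storage():
    # Assumed to install a process-wide storage instance.
    global _CURRENT_STORAGE
    _CURRENT_STORAGE = VisslEventStorage()


def get_event_storage() -> VisslEventStorage:
    assert _CURRENT_STORAGE is not None, "create_event_storage() must be called first"
    return _CURRENT_STORAGE
```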
    def build_event_storage_writers(self):
        from vissl.utils.events import JsonWriter, TensorboardWriter

        flush_secs = self.config.HOOKS.TENSORBOARD_SETUP.FLUSH_EVERY_N_MIN * 60
        checkpoint_dir = self.config.CHECKPOINT.DIR

        self.event_storage_writers = [
            JsonWriter(f"{self.checkpoint_folder}/stdout.json"),
            TensorboardWriter(checkpoint_dir, flush_secs),
Reviewer comment: From my reading of this, TensorBoardWriter will always write, regardless of whether Tensorboard is enabled in the config. (A sketch gating the writer on that flag follows this hunk.)
        ]
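One way to address the gating concern above, assuming the existing cfg.HOOKS.TENSORBOARD_SETUP.USE_TENSORBOARD flag is the right switch. This is a sketch, not the PR's code.

```python
# Sketch: a gated replacement body for build_event_storage_writers on the task class.
def build_event_storage_writers(self):
    from vissl.utils.events import JsonWriter, TensorboardWriter

    tb_cfg = self.config.HOOKS.TENSORBOARD_SETUP
    self.event_storage_writers = [
        JsonWriter(f"{self.checkpoint_folder}/stdout.json")
    ]
    # Only attach the TensorBoard writer when the user actually enabled it.
    if tb_cfg.USE_TENSORBOARD:
        flush_secs = tb_cfg.FLUSH_EVERY_N_MIN * 60
        self.event_storage_writers.append(
            TensorboardWriter(self.config.CHECKPOINT.DIR, flush_secs)
        )
```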
    @property
    def event_storage(self):
Reviewer comment: I think we can just use self.event_storage internally and externally.
        return self._event_storage

    def set_device(self):
        """
Reviewer comment: nit: comment what this is.
Reviewer comment: nit: move comment below USE_TENSORBOARD