2 files changed: +13 −12 lines changed

First changed file (training config, YAML):

```diff
@@ -62,10 +62,10 @@ optimizer:
 optimizer_in_bwd: True  # True saves memory. Requires gradient_accumulation_steps=1
 loss:
   _component_: torchtune.modules.loss.LinearCrossEntropyLoss
-max_steps_per_epoch: null
+max_steps_per_epoch: 110
 gradient_accumulation_steps: 1  # Use to increase effective batch size
 clip_grad_norm: null
-compile: False  # torch.compile the model + loss, True increases speed + decreases memory
+compile: True  # torch.compile the model + loss, True increases speed + decreases memory

 # Training environment
 device: cuda
```
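The two config edits work together: `max_steps_per_epoch: 110` caps each epoch at 110 steps instead of running the full dataloader, and `compile: True` turns on torch.compile for the model and loss. A minimal sketch of how a recipe loop might consume these fields (assumes omegaconf is available; `run_epoch` and `step_fn` are illustrative stand-ins, not torchtune's actual recipe code):

```python
from omegaconf import OmegaConf

# Illustrative config mirroring the changed fields above.
cfg = OmegaConf.create(
    {
        "max_steps_per_epoch": 110,  # was null (no cap) before this change
        "gradient_accumulation_steps": 1,
        "compile": True,  # gates torch.compile of model + loss
    }
)


def run_epoch(dataloader, step_fn):
    """Run one epoch, stopping early once the configured step budget is hit."""
    for idx, batch in enumerate(dataloader):
        if cfg.max_steps_per_epoch is not None and idx >= cfg.max_steps_per_epoch:
            break
        step_fn(batch)
```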
Second changed file (model compilation utility, Python):

```diff
@@ -41,16 +41,17 @@ def compile_model(
     backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")
     if isinstance(model, DeepFusionModel):
         model = model.decoder
-    # Per-layer compilation by default
-    if verbose:
-        log.info(
-            "Compiling model layers with torch.compile. Expect a relatively slower first step."
-        )
-    for m in reversed(list(model.modules())):
-        if isinstance(m, TransformerSelfAttentionLayer) or isinstance(
-            m, TransformerCrossAttentionLayer
-        ):
-            m.compile(backend=backend)
+    model.compile(backend=backend)
+    # # Per-layer compilation by default
+    # if verbose:
+    #     log.info(
+    #         "Compiling model layers with torch.compile. Expect a relatively slower first step."
+    #     )
+    # for m in reversed(list(model.modules())):
+    #     if isinstance(m, TransformerSelfAttentionLayer) or isinstance(
+    #         m, TransformerCrossAttentionLayer
+    #     ):
+            m.compile(backend=backend)


 def compile_loss(loss: nn.Module, verbose: bool = True) -> nn.Module:
```
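The functional change above swaps per-layer compilation (each TransformerSelfAttentionLayer / TransformerCrossAttentionLayer compiled on its own) for a single whole-model `compile` call. A self-contained sketch of the contrast using a toy module rather than torchtune's DeepFusionModel (assumes a PyTorch build where `torch.compile` and `nn.Module.compile` are available; expect the first forward call to be slow while the backend compiles):

```python
import os

import torch
import torch.nn as nn


class TinyBlock(nn.Module):
    """Stand-in for a transformer layer."""

    def __init__(self, dim: int = 64):
        super().__init__()
        self.linear = nn.Linear(dim, dim)

    def forward(self, x):
        return torch.relu(self.linear(x))


class TinyModel(nn.Module):
    """Stand-in for the decoder being compiled."""

    def __init__(self, dim: int = 64, n_layers: int = 4):
        super().__init__()
        self.layers = nn.ModuleList(TinyBlock(dim) for _ in range(n_layers))

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x


backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")
model = TinyModel()

# New path in the diff: compile the whole forward as one region, in place.
model.compile(backend=backend)

# Old (now commented-out) path: compile each layer separately instead.
# for block in model.layers:
#     block.compile(backend=backend)

out = model(torch.randn(2, 64))  # first call triggers compilation
```

Per-layer compilation tends to keep the first-step compile cost down because structurally identical layers can reuse compiled artifacts, while whole-model compilation hands the backend one larger graph; that tradeoff is presumably what this change is probing.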