
Commit 434d559

Debug branch for Qwen3.
Use the instructions at the beginning of this config file: https://github.com/pytorch/torchtune/blob/main/recipes/configs/qwen3/8B_full_single_device.yaml
Parent: 2344509

2 files changed: 13 additions, 12 deletions

recipes/configs/qwen3/8B_full_single_device.yaml (2 additions, 2 deletions)

@@ -62,10 +62,10 @@ optimizer:
 optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
 loss:
   _component_: torchtune.modules.loss.LinearCrossEntropyLoss
-max_steps_per_epoch: null
+max_steps_per_epoch: 110
 gradient_accumulation_steps: 1 # Use to increase effective batch size
 clip_grad_norm: null
-compile: False # torch.compile the model + loss, True increases speed + decreases memory
+compile: True # torch.compile the model + loss, True increases speed + decreases memory

 # Training environment
 device: cuda
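
For context on the config change: max_steps_per_epoch caps the number of optimizer steps the recipe runs per epoch, so pinning it to 110 gives the debug run a fixed, short length, and compile: True enables torch.compile for the model and loss. The sketch below is illustrative only, not torchtune's recipe code (that lives in recipes/full_finetune_single_device.py); the dataloader and loss are dummies, and the exact step-counting in the real recipe may differ.

    # Illustrative sketch: how a max_steps_per_epoch cap bounds one epoch's inner loop.
    # The dataloader and "loss" below are stand-ins, not torchtune internals.
    import torch

    max_steps_per_epoch = 110        # this commit pins the previously uncapped (null) value
    gradient_accumulation_steps = 1  # optimizer_in_bwd=True requires this to stay at 1

    dataloader = (torch.randn(4, 16) for _ in range(100_000))  # dummy batches

    optimizer_steps = 0
    for idx, batch in enumerate(dataloader):
        _ = batch.pow(2).mean()  # stand-in for the forward pass + loss
        if (idx + 1) % gradient_accumulation_steps == 0:
            optimizer_steps += 1  # one optimizer step per N accumulated batches
        if optimizer_steps == max_steps_per_epoch:
            break  # end the epoch early once the cap is reached

    print(f"ran {optimizer_steps} optimizer steps")  # -> ran 110 optimizer steps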

torchtune/training/_compile.py (11 additions, 10 deletions)

@@ -41,16 +41,17 @@ def compile_model(
     backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")
     if isinstance(model, DeepFusionModel):
         model = model.decoder
-    # Per-layer compilation by default
-    if verbose:
-        log.info(
-            "Compiling model layers with torch.compile. Expect a relatively slower first step."
-        )
-    for m in reversed(list(model.modules())):
-        if isinstance(m, TransformerSelfAttentionLayer) or isinstance(
-            m, TransformerCrossAttentionLayer
-        ):
-            m.compile(backend=backend)
+    model.compile(backend=backend)
+    # # Per-layer compilation by default
+    # if verbose:
+    #     log.info(
+    #         "Compiling model layers with torch.compile. Expect a relatively slower first step."
+    #     )
+    # for m in reversed(list(model.modules())):
+    #     if isinstance(m, TransformerSelfAttentionLayer) or isinstance(
+    #         m, TransformerCrossAttentionLayer
+    #     ):
+    #         m.compile(backend=backend)


 def compile_loss(loss: nn.Module, verbose: bool = True) -> nn.Module:
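
For reference on the code change: the commit replaces torchtune's default per-layer compilation (compiling each TransformerSelfAttentionLayer / TransformerCrossAttentionLayer) with a single whole-model compile call. The sketch below contrasts the two approaches on a toy nn.TransformerEncoder; it assumes PyTorch 2.x, where nn.Module.compile() forwards its keyword arguments to torch.compile(), and the toy model merely stands in for torchtune's TransformerDecoder.

    # Minimal sketch (assumes PyTorch 2.x): whole-model compile vs. per-layer compile.
    # nn.TransformerEncoder is only a stand-in for torchtune's TransformerDecoder.
    import os

    import torch
    import torch.nn as nn

    backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")

    model = nn.TransformerEncoder(
        nn.TransformerEncoderLayer(d_model=64, nhead=4, batch_first=True),
        num_layers=2,
    )

    # Option A (what this commit switches to): compile the whole module as one graph.
    # nn.Module.compile() compiles the module in place via torch.compile().
    model.compile(backend=backend)

    # Option B (the previous torchtune default): compile each transformer layer
    # separately, which keeps graphs small and localizes any recompilation.
    # for layer in model.layers:
    #     layer.compile(backend=backend)

    x = torch.randn(1, 8, 64)
    out = model(x)  # the first call triggers compilation, so expect it to be slow
    print(out.shape)  # torch.Size([1, 8, 64])

Whole-model compilation can capture a larger graph (and potentially more fusion) at the cost of a longer first step, while the per-layer default trades some of that away for smaller, more incremental compilations.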
