2 files changed: +13 −12 lines changed

First changed file (training config, YAML):

```diff
@@ -62,10 +62,10 @@ optimizer:
 optimizer_in_bwd: True  # True saves memory. Requires gradient_accumulation_steps=1
 loss:
   _component_: torchtune.modules.loss.LinearCrossEntropyLoss
-max_steps_per_epoch: null
+max_steps_per_epoch: 110
 gradient_accumulation_steps: 1  # Use to increase effective batch size
 clip_grad_norm: null
-compile: False  # torch.compile the model + loss, True increases speed + decreases memory
+compile: True  # torch.compile the model + loss, True increases speed + decreases memory

 # Training environment
 device: cuda
```
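The two config edits work together: `max_steps_per_epoch: 110` caps each epoch at 110 steps instead of running the full dataloader, and `compile: True` turns on torch.compile for the model and loss. A minimal sketch of how a recipe loop might consume these fields (assumes omegaconf is available; `run_epoch` and `step_fn` are illustrative stand-ins, not torchtune's actual recipe code):

```python
from omegaconf import OmegaConf

# Illustrative config mirroring the changed fields above.
cfg = OmegaConf.create(
    {
        "max_steps_per_epoch": 110,  # was null (no cap) before this change
        "gradient_accumulation_steps": 1,
        "compile": True,  # gates torch.compile of model + loss
    }
)


def run_epoch(dataloader, step_fn):
    """Run one epoch, stopping early once the configured step budget is hit."""
    for idx, batch in enumerate(dataloader):
        if cfg.max_steps_per_epoch is not None and idx >= cfg.max_steps_per_epoch:
            break
        step_fn(batch)
```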
Second changed file (model compilation utility, Python):

```diff
@@ -41,16 +41,17 @@ def compile_model(
     backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")
     if isinstance(model, DeepFusionModel):
         model = model.decoder
-    # Per-layer compilation by default
-    if verbose:
-        log.info(
-            "Compiling model layers with torch.compile. Expect a relatively slower first step."
-        )
-    for m in reversed(list(model.modules())):
-        if isinstance(m, TransformerSelfAttentionLayer) or isinstance(
-            m, TransformerCrossAttentionLayer
-        ):
-            m.compile(backend=backend)
+    model.compile(backend=backend)
+    # # Per-layer compilation by default
+    # if verbose:
+    #     log.info(
+    #         "Compiling model layers with torch.compile. Expect a relatively slower first step."
+    #     )
+    # for m in reversed(list(model.modules())):
+    #     if isinstance(m, TransformerSelfAttentionLayer) or isinstance(
+    #         m, TransformerCrossAttentionLayer
+    #     ):
+            m.compile(backend=backend)


 def compile_loss(loss: nn.Module, verbose: bool = True) -> nn.Module:
```
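The functional change above swaps per-layer compilation (each TransformerSelfAttentionLayer / TransformerCrossAttentionLayer compiled on its own) for a single whole-model `compile` call. A self-contained sketch of the contrast using a toy module rather than torchtune's DeepFusionModel (assumes a PyTorch build where `torch.compile` and `nn.Module.compile` are available; expect the first forward call to be slow while the backend compiles):

```python
import os

import torch
import torch.nn as nn


class TinyBlock(nn.Module):
    """Stand-in for a transformer layer."""

    def __init__(self, dim: int = 64):
        super().__init__()
        self.linear = nn.Linear(dim, dim)

    def forward(self, x):
        return torch.relu(self.linear(x))


class TinyModel(nn.Module):
    """Stand-in for the decoder being compiled."""

    def __init__(self, dim: int = 64, n_layers: int = 4):
        super().__init__()
        self.layers = nn.ModuleList(TinyBlock(dim) for _ in range(n_layers))

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x


backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")
model = TinyModel()

# New path in the diff: compile the whole forward as one region, in place.
model.compile(backend=backend)

# Old (now commented-out) path: compile each layer separately instead.
# for block in model.layers:
#     block.compile(backend=backend)

out = model(torch.randn(2, 64))  # first call triggers compilation
```

Per-layer compilation tends to keep the first-step compile cost down because structurally identical layers can reuse compiled artifacts, while whole-model compilation hands the backend one larger graph; that tradeoff is presumably what this change is probing.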