
Commit 6bda90b

Merge branch 'main' into release/3.10

2 parents: d20dbc3 + 7507db0

File tree: 4 files changed (+4, -4 lines)

docs/source/Megatron-SWIFT/Quick-start.md

Lines changed: 1 addition & 1 deletion (content translated from Chinese)

@@ -161,7 +161,7 @@ I am a language model developed by swift, you can call me swift-robot. How can I
 
 
 ## Training tips
-- Ways to increase training throughput: use packing, increase DP, reduce recomputation, and increase compute-communication overlap. MoE can additionally be accelerated by dropping tokens.
+- Ways to increase training throughput: use packing (do not enable streaming), increase DP, reduce recomputation, and increase compute-communication overlap. MoE can additionally be accelerated by dropping tokens.
 - Choosing parallelism techniques:
   - Megatron-SWIFT combines ZeRO-1 (use_distributed_optimizer is enabled by default) with the various parallelism techniques.
   - DP is the fastest but uses more memory; use other parallelism techniques to reduce memory usage.

docs/source_en/Megatron-SWIFT/Quick-start.md

Lines changed: 1 addition & 1 deletion

@@ -164,7 +164,7 @@ I am a language model developed by swift, you can call me swift-robot. How can I
 
 
 ## Training Tips
-- Methods to increase training throughput: use packing, increase data parallelism (DP), reduce recomputation, and increase compute-communication overlap. MoE models can also be accelerated by dropping tokens.
+- Methods to increase training throughput: use packing (do not enable streaming), increase data parallelism (DP), reduce recomputation, and increase compute-communication overlap. MoE models can also be accelerated by dropping tokens.
 - Parallelism choices:
   - Megatron-SWIFT uses ZeRO-1 (use_distributed_optimizer enabled by default) combined with various parallelism techniques.
   - DP is the fastest but consumes the most memory; use other parallel techniques to reduce memory usage.
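The packing tip above can be illustrated with a small sketch. This is a generic first-fit packing routine, not Megatron-SWIFT's implementation: it greedily concatenates variable-length samples into fixed-capacity rows to reduce padding waste. Packing like this needs the sample lengths up front, which a streaming dataset does not provide — plausibly why the tip says not to enable streaming together with packing.

```python
def pack_sequences(lengths, max_len):
    """Greedy first-fit packing: assign each sample (by index) to the
    first row with enough remaining capacity, opening a new row if none
    fits. Returns a list of rows, each a list of sample indices."""
    rows, row_lens = [], []
    for i, n in enumerate(lengths):
        for r, used in enumerate(row_lens):
            if used + n <= max_len:
                rows[r].append(i)
                row_lens[r] += n
                break
        else:
            # No existing row can hold this sample; start a new one.
            rows.append([i])
            row_lens.append(n)
    return rows
```

For example, samples of lengths [5, 3, 4, 2] packed into rows of capacity 8 fill two rows completely instead of wasting four rows' worth of padding.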

swift/llm/dataset/preprocessor/core.py

Lines changed: 1 addition & 1 deletion

@@ -375,7 +375,7 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
         if isinstance(response, (list, tuple)):
             from transformers.utils import strtobool
             # sometimes response is a list, pick one randomly
-            if strtobool(os.environ.get('RANDOM_DATASET_RESPONSE', 'True')):
+            if strtobool(os.environ.get('RANDOM_DATASET_RESPONSE', 'False')):
                 response = self.random_state.choice(response)
             else:
                 response = response[0]

swift/megatron/model/gpt_model.py

Lines changed: 1 addition & 1 deletion

@@ -217,7 +217,7 @@ def _preprocess(
             rotary_seq_len,
             packed_seq=packed_seq,
         )
-        if packed_seq:
+        if packed_seq and not self.config.apply_rope_fusion:
             assert position_ids.shape[0] == 1, f'position_ids.shape: {position_ids.shape}'
             rotary_pos_emb = rotary_pos_emb[position_ids[0]]
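This change restricts the per-token rotary-embedding gather to the case where fused RoPE is disabled; presumably the fused kernel consumes position information itself, so pre-indexing `rotary_pos_emb` would be redundant when `apply_rope_fusion` is on. A numpy sketch of the gather itself, with illustrative shapes (the array sizes and the two-subsequence layout are assumptions for the example, not the real tensors):

```python
import numpy as np

# Assumed layout: rotary embeddings precomputed per absolute position,
# shape [max_seq_len, dim]. With packing, position_ids restart at 0 for
# each packed sub-sequence, so we gather one row per token.
max_seq_len, dim = 8, 4
rotary_pos_emb = np.arange(max_seq_len * dim, dtype=np.float32).reshape(max_seq_len, dim)

# One packed batch row holding two sub-sequences of lengths 3 and 2.
position_ids = np.array([[0, 1, 2, 0, 1]])
assert position_ids.shape[0] == 1  # same invariant the patch asserts

# Fancy indexing maps each token to the embedding for its local position.
gathered = rotary_pos_emb[position_ids[0]]  # shape [total_tokens, dim]
```

Tokens at the start of each packed sub-sequence (indices 0 and 3 here) receive identical embeddings, which is exactly what restarting positions per sub-sequence is meant to achieve.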