# Pretraining LLaMA-1/2/3: best practices for building LLaMA-1/2/3-like base models

### LLaMA3
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/images/LLaMA3-70B-H100.png" width=600/>
</p>

- 70-billion-parameter LLaMA3 model training accelerated by 18%

### LLaMA2
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/llama2_pretraining.png" width=600/>
</p>

- 70-billion-parameter LLaMA2 model training accelerated by 195%
[[blog]](https://www.hpc-ai.tech/blog/70b-llama2-training)

### LLaMA1
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/images/LLaMA_pretraining.png" width=600/>
</p>

- 65-billion-parameter large model pretraining accelerated by 38%
[[blog]](https://www.hpc-ai.tech/blog/large-model-pretraining)

## Usage

> ⚠ This example only contains a benchmarking script. For training/fine-tuning, please refer to [applications/Colossal-LLaMA](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Colossal-LLaMA).

### 1. Installation

Please install the latest ColossalAI from source.

```bash
BUILD_EXT=1 pip install -U git+https://github.com/hpcaitech/ColossalAI
```

Then install other dependencies.

```bash
pip install -r requirements.txt
```

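Before launching a multi-node job, it can be worth a quick sanity check of the installation. The snippet below is a minimal sketch; the `__version__` attribute and the `colossalai check -i` subcommand are assumed to be available in the version you installed.

```bash
# Confirm the package imports and print its version
# (the __version__ attribute is assumed to exist).
python -c "import colossalai; print(colossalai.__version__)"

# Print an installation report, including whether the CUDA extensions were built
# (subcommand availability may vary across ColossalAI versions).
colossalai check -i
```
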
### 2. Shell Script Examples

For your convenience, we provide shell scripts that run the benchmark with various configurations.

You can find them in the `scripts/benchmark_7B` and `scripts/benchmark_70B` directories. The main command has the following format:
```bash
colossalai run --nproc_per_node YOUR_GPU_PER_NODE --hostfile YOUR_HOST_FILE \
benchmark.py --OTHER_CONFIGURATIONS
```
Below we show an example of running LLaMA pretraining with `gemini, batch_size=16, sequence_length=4096, gradient_checkpoint=True, flash_attn=True`.

#### a. Running environment
This experiment was performed on 4 compute nodes with 32 A800/H800 80GB GPUs in total for LLaMA-1 65B or LLaMA-2 70B. The nodes are connected via RDMA, and the GPUs within each node are fully connected via NVLink.

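If you want to confirm that your own nodes match this topology before launching, standard NVIDIA/RDMA tooling can report it. The commands below are generic utilities rather than part of this example, and assume the corresponding drivers and tools are installed.

```bash
# Show the GPU-to-GPU interconnect matrix; NV* entries indicate NVLink paths.
nvidia-smi topo -m

# List RDMA-capable devices and their link state (requires ibverbs utilities).
ibv_devinfo | grep -E "hca_id|state"
```
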
#### b. Running command

```bash
cd scripts/benchmark_7B
```

First, put your host file (`hosts.txt`) in this directory, listing the real IP addresses or hostnames of your nodes.

Here is a sample `hosts.txt`:
```text
hostname1
hostname2
hostname3
hostname4
```

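Multi-node launching with `colossalai run` relies on SSH access to the hosts in the hostfile, so each entry should be reachable without a password prompt. A quick check, as a sketch (adjust the user and SSH options to your cluster):

```bash
# Verify passwordless SSH to every host listed in the hostfile.
# -n keeps ssh from consuming the loop's stdin.
while read -r host; do
  ssh -n -o BatchMode=yes "$host" hostname \
    || echo "cannot reach $host without a password"
done < hosts.txt
```
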
Then add any environment variables your cluster needs to the script, as in the example below.

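Multi-node NCCL jobs often need the network interfaces spelled out explicitly. The variables below are standard NCCL/Gloo environment variables rather than anything defined by this example, and the interface and HCA names are placeholders for your cluster:

```bash
# NIC used for TCP rendezvous / Gloo (placeholder interface name).
export GLOO_SOCKET_IFNAME=eth0
export NCCL_SOCKET_IFNAME=eth0
# Restrict NCCL to the RDMA-capable HCAs (placeholder device names).
export NCCL_IB_HCA=mlx5_0,mlx5_1
# Uncomment for verbose NCCL logs when debugging connectivity.
# export NCCL_DEBUG=INFO
```
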
Finally, run the following command to start training:

```bash
bash gemini.sh
```

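For reference, `gemini.sh` essentially wraps a `colossalai run` invocation of `benchmark.py` with the settings listed above. The sketch below is illustrative only; the flag names are assumptions, so check the actual script and `benchmark.py --help` for the real argument names.

```bash
# Illustrative only: flag names are assumptions; see scripts/benchmark_7B/gemini.sh
# for the exact command. Settings correspond to: gemini plugin, batch_size=16,
# sequence_length=4096, gradient checkpointing and flash attention enabled.
colossalai run --nproc_per_node 8 --hostfile hosts.txt benchmark.py \
  --plugin gemini --batch_size 16 --max_length 4096 \
  --grad_checkpoint --xformers
```
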
If you encounter an out-of-memory (OOM) error during training with `gemini.sh`, switching to `gemini_auto.sh` may help: the `gemini_auto` configuration caps GPU memory usage by offloading part of the model parameters and optimizer states to CPU memory. The trade-off is that `gemini_auto.sh` is somewhat slower, since more data is transferred between CPU and GPU.

#### c. Results
If the above command runs successfully, you should see results similar to:
`max memory usage: 55491.10 MB, throughput: 24.26 samples/s, TFLOPS/GPU: 167.43`.
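
As a rough sanity check, throughput in samples/s can be converted to tokens/s by multiplying by the sequence length; note that whether the reported number is per GPU or aggregated over all GPUs is not stated by the log line above, so interpret the conversion accordingly.

```bash
# 24.26 samples/s at sequence_length=4096 is roughly 99k tokens/s.
python3 -c "print(round(24.26 * 4096))"
```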

## Reference
```bibtex
@article{bian2021colossal,
  title={Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training},
  author={Bian, Zhengda and Liu, Hongxin and Wang, Boxiang and Huang, Haichen and Li, Yongbin and Wang, Chuanrui and Cui, Fan and You, Yang},
  journal={arXiv preprint arXiv:2110.14883},
  year={2021}
}
```

```bibtex
@software{openlm2023openllama,
  author = {Geng, Xinyang and Liu, Hao},
  title = {OpenLLaMA: An Open Reproduction of LLaMA},
  month = {May},
  year = {2023},
  url = {https://github.com/openlm-research/open_llama}
}
```

```bibtex
@software{together2023redpajama,
  author = {Together Computer},
  title = {RedPajama-Data: An Open Source Recipe to Reproduce LLaMA training dataset},
  month = {April},
  year = {2023},
  url = {https://github.com/togethercomputer/RedPajama-Data}
}
```

```bibtex
@article{touvron2023llama,
  title={LLaMA: Open and Efficient Foundation Language Models},
  author={Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and others},
  journal={arXiv preprint arXiv:2302.13971},
  year={2023}
}
```