Commit 9f32ec2

Update OH to v1.18.0
1 parent d1cada3 · commit 9f32ec2

2 files changed (+26 -22 lines)


docs/hpu.md

Lines changed: 18 additions & 5 deletions
@@ -10,7 +10,7 @@ Next changes are required to enable training on HPU:
 
 It is also recommended to use HPU optimized versions of transformers:
 
-```python
+```Python
 from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
 adapt_transformers_to_gaudi()
 ```
@@ -29,8 +29,21 @@ To compute bucked size, we use next algorithm:
 This approach limits overhead of the bucketing to 1/16 th of the longest sample and allows us to significantly reduce number of recompilations.
 
 ## How to run
-To run training make next changes to config file:
-```json
+To run training build docker using next dockerfile:
+```Dockerfile
+FROM vault.habana.ai/gaudi-docker/1.21.0/rhel9.4/habanalabs/pytorch-installer-2.6.0:1.21.0-555
+
+ARG CMAKE_ARGS="-DGGML_NATIVE=off"
+
+WORKDIR /app
+RUN pip install git+https://github.com/instructlab/[email protected]
+
+WORKDIR /app
+RUN pip install git+https://github.com/huggingface/[email protected]
+```
+
+Then make next changes to config file:
+```YAML
 train:
   device: hpu
   distributed_backend: fsdp
@@ -40,8 +53,8 @@ train:
   disable_flash_attn: true
 ```
 
-And use this command line:
-```bash
+And finally run this command line:
+```BASH
 ilab --config=./config.yaml model train --pipeline accelerated --data-path ./data.jsonl
 ```
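
The hunk above only quotes the tail of the bucketing explanation, so here is a minimal illustrative sketch of a padding rule that is consistent with the quoted "1/16 th of the longest sample" bound. The helper name `bucketed_length`, the fixed 16-way step, and the round-up-to-a-multiple strategy are assumptions for illustration; the repository's actual algorithm lives in the unchanged portion of docs/hpu.md and the training code and may differ.

```python
# Hypothetical sketch, not the repo's implementation: pad each sample up to
# the next multiple of (longest_sample / 16).  Per-sample padding overhead is
# then bounded by roughly 1/16 of the longest sample, while the number of
# distinct shapes seen by the HPU graph compiler stays small, which reduces
# recompilations.
import math


def bucketed_length(sample_len: int, longest_sample: int, num_buckets: int = 16) -> int:
    """Round sample_len up to the next bucket boundary (illustrative helper)."""
    step = max(1, math.ceil(longest_sample / num_buckets))
    return min(longest_sample, math.ceil(sample_len / step) * step)


# Example: with a 2048-token longest sample the step is 128, so a 1000-token
# sample pads to 1024 and a 2000-token sample pads to 2048.
print(bucketed_length(1000, 2048))  # 1024
print(bucketed_length(2000, 2048))  # 2048
```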

src/instructlab/training/accelerator.py

Lines changed: 8 additions & 17 deletions
@@ -132,6 +132,7 @@ def get_fsdp_config(self):
         from functools import partial
 
         # Third Party
+        from accelerate.utils import FullyShardedDataParallelPlugin
         from peft.utils.other import fsdp_auto_wrap_policy
         from torch.distributed.fsdp import BackwardPrefetch, ShardingStrategy
         from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload
@@ -159,27 +160,17 @@ def get_fsdp_config(self):
         prefetch_policy = (
             BackwardPrefetch.BACKWARD_POST if is_lora else BackwardPrefetch.BACKWARD_PRE
         )
+        fsdp_plugin = FullyShardedDataParallelPlugin(
+            auto_wrap_policy=wrap_policy,
+            limit_all_gathers=True,
+            backward_prefetch=prefetch_policy,
+            sharding_strategy=ShardingStrategy[self.fsdp_sharding_strategy],
+            cpu_offload=CPUOffload(self.fsdp_cpu_offload_params),
+        )
 
         if self.device_str == "hpu":
-            from optimum.habana.accelerate.utils import GaudiFullyShardedDataParallelPlugin
-            fsdp_plugin = GaudiFullyShardedDataParallelPlugin(
-                auto_wrap_policy=wrap_policy,
-                limit_all_gathers=True,
-                backward_prefetch=prefetch_policy,
-                sharding_strategy=ShardingStrategy[self.fsdp_sharding_strategy],
-                cpu_offload=CPUOffload(self.fsdp_cpu_offload_params),
-            )
             fsdp_plugin.use_orig_params=True
             fsdp_plugin.sync_module_states=True
-        else:
-            from accelerate.utils import FullyShardedDataParallelPlugin
-            fsdp_plugin = FullyShardedDataParallelPlugin(
-                auto_wrap_policy=wrap_policy,
-                limit_all_gathers=True,
-                backward_prefetch=prefetch_policy,
-                sharding_strategy=ShardingStrategy[self.fsdp_sharding_strategy],
-                cpu_offload=CPUOffload(self.fsdp_cpu_offload_params),
-            )
 
         # `use_orig_params` must be disabled when using LoRA and FSDP together
         # Source: https://huggingface.co/docs/peft/en/accelerate/fsdp#the-important-parts
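
Taken together, the change above drops the HPU-only GaudiFullyShardedDataParallelPlugin and builds a single FullyShardedDataParallelPlugin the same way on every backend, toggling two Gaudi-specific flags afterwards. Below is a condensed, self-contained sketch of that resulting flow; the free-standing function, its parameter names, and the default values are simplified stand-ins for the real Accelerator class in src/instructlab/training/accelerator.py.

```python
# Condensed sketch of the post-commit FSDP plugin construction.  The function
# signature and defaults are stand-ins; the real code reads these values from
# the Accelerator instance (self.fsdp_sharding_strategy, self.device_str, ...).
from accelerate.utils import FullyShardedDataParallelPlugin
from torch.distributed.fsdp import BackwardPrefetch, ShardingStrategy
from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload


def build_fsdp_plugin(wrap_policy, device_str: str, sharding_strategy: str = "FULL_SHARD",
                      cpu_offload_params: bool = False, is_lora: bool = False):
    # LoRA uses BACKWARD_POST prefetching, full fine-tuning BACKWARD_PRE, as in the diff.
    prefetch_policy = (
        BackwardPrefetch.BACKWARD_POST if is_lora else BackwardPrefetch.BACKWARD_PRE
    )
    # One plugin type for every device; before this commit the HPU path built
    # a separate GaudiFullyShardedDataParallelPlugin from optimum-habana.
    fsdp_plugin = FullyShardedDataParallelPlugin(
        auto_wrap_policy=wrap_policy,
        limit_all_gathers=True,
        backward_prefetch=prefetch_policy,
        sharding_strategy=ShardingStrategy[sharding_strategy],
        cpu_offload=CPUOffload(cpu_offload_params),
    )
    if device_str == "hpu":
        # Gaudi still needs these two toggles on the shared plugin.
        fsdp_plugin.use_orig_params = True
        fsdp_plugin.sync_module_states = True
    return fsdp_plugin
```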
