
Commit 9b8097e

SW publisher authored and Jenkins committed
DeepSpeed content for 1.22.0
Signed-off-by: SW publisher <[email protected]>
1 parent 95ead2a commit 9b8097e

File tree

154 files changed: 11720 additions, 1197 deletions


.pre-commit-config.yaml

Lines changed: 0 additions & 89 deletions
This file was deleted.

CODEOWNERS

Lines changed: 2 additions & 48 deletions
@@ -5,52 +5,6 @@
 # Learn more about CODEOWNERS syntax here:
 # https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners

+

-# top-level repo folders
-/.github/ @loadams
-/azure/ @awan-10
-/benchmarks/ @awan-10 @tjruwase
-/bin/ @loadams
-/csrc/ @awan-10
-/deepspeed/ @loadams @tjruwase
-/docker/ @awan-10
-/docs/ @loadams @tjruwase
-/examples/ @awan-10 @tohtana
-/op_builder/ @loadams @tjruwase @jomayeri
-/release/ @loadams
-/requirements/ @loadams
-/scripts/ @awan-10
-/tests/ @tjruwase @loadams @tohtana
-
-# deepspeed
-/deepspeed/autotuning/ @loadams
-/deepspeed/checkpoint/ @tjruwase
-/deepspeed/comm/ @awan-10
-/deepspeed/compression/ @tjruwase
-/deepspeed/elasticity/ @awan-10
-/deepspeed/launcher/ @loadams
-/deepspeed/module_inject/ @awan-10
-/deepspeed/moe/ @tohtana
-/deepspeed/monitor/ @awan-10
-/deepspeed/nebula/ @tjruwase
-/deepspeed/ops/ @tohtana
-/deepspeed/pipe/ @tohtana @loadams
-/deepspeed/profiling/ @loadams
-/deepspeed/utils/ @tjruwase @awan-10
-
-# inference
-/deepspeed/inference/ @awan-10
-/deepspeed/model_implementations/ @awan-10
-
-# training
-/deepspeed/runtime/ @tjruwase @tohtana
-/deepspeed/runtime/activation_checkpointing/ @tjruwase
-/deepspeed/runtime/checkpoint_engine/ @tjruwase
-/deepspeed/runtime/comm/ @awan-10
-/deepspeed/runtime/compression/ @awan-10
-/deepspeed/runtime/data_pipeline/ @tjruwase
-/deepspeed/runtime/fp16/ @tjruwase
-/deepspeed/runtime/fp16/onebit/ @awan-10
-/deepspeed/runtime/pipe/ @loadams
-/deepspeed/runtime/swap_tensor/ @tjruwase
-/deepspeed/runtime/zero/ @tjruwase
+
accelerator/hpu_accelerator.py

Lines changed: 13 additions & 1 deletion
@@ -21,8 +21,11 @@ def __init__(self):
         self.apply_hpu_workarounds()
         try:
             import habana_frameworks.torch.hpu as hpu
-            hpu.setDeterministic(True)
             self.hpu = hpu
+            torch.use_deterministic_algorithms(True)
+            # TODO: [SW-215614] remove this WA when SW-208658 is resolved.
+            torch.utils.deterministic.fill_uninitialized_memory = False
+
         except ImportError as e:
             raise ValueError(
                 f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.")
@@ -299,6 +302,14 @@ def get_op_builder(self, class_name):
         else:
             return self.class_dict['NotImplementedBuilder'] if 'NotImplementedBuilder' in self.class_dict else None

+    # shall be removed once moving to torch.compile
+    def wrap_in_hpu_graph(self, module):
+        if self.hpu.is_lazy():
+            module = self.hpu.wrap_in_hpu_graph(module)
+        else:
+            print("Warning: hpu graphs in eager mode is not supported, ignoring")
+        return module
+
     def build_extension(self):
         from torch.utils.cpp_extension import BuildExtension
         return BuildExtension
@@ -307,6 +318,7 @@ def export_envs(self):
         return []

     def visible_devices_envs(self):
+        # TODO SW-195658: remove WA to not return HABANA_VISIBLE_MODULES once SW-195657 is resolved
         # Current way deepspeed set this env var is not applicable with all HPU instances
         # User has to follow instructions in:
         # https://docs.habana.ai/en/latest/PyTorch/Reference/PT_Multiple_Tenants_on_HPU/Multiple_Workloads_Single_Docker.html
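For context, a minimal sketch of how the new wrap_in_hpu_graph helper might be called; the model and the get_accelerator import are illustrative assumptions, not part of this diff:

import torch
from deepspeed.accelerator import get_accelerator

accelerator = get_accelerator()  # resolves to HPU_Accelerator on Gaudi systems
model = torch.nn.Linear(16, 16)

# In lazy mode the module is captured as an HPU graph to reduce host launch
# overhead; in eager mode the helper warns and returns the module unchanged.
# The method is slated for removal once torch.compile is adopted.
if hasattr(accelerator, "wrap_in_hpu_graph"):
    model = accelerator.wrap_in_hpu_graph(model)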

accelerator/real_accelerator.py

Lines changed: 8 additions & 2 deletions
@@ -67,7 +67,7 @@ def get_accelerator():
                     f"XPU_Accelerator requires intel_extension_for_pytorch, which is not installed on this system.")
         elif accelerator_name == "xpu.external":
             try:
-                import intel_extension_for_deepspeed  # noqa: F401 # type: ignore
+                from intel_extension_for_deepspeed import XPU_Accelerator  # noqa: F401 # type: ignore
             except ImportError as e:
                 raise ValueError(
                     f"XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system."
@@ -193,6 +193,12 @@ def get_accelerator():
         ds_accelerator = CPU_Accelerator()
     elif accelerator_name == "xpu.external":
         # XPU_Accelerator is already imported in detection stage
+        try:
+            from intel_extension_for_deepspeed import XPU_Accelerator  # noqa: F811
+        except ImportError as e:
+            raise ValueError(
+                f"XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system."
+            )
         ds_accelerator = XPU_Accelerator()
     elif accelerator_name == "xpu":
         from .xpu_accelerator import XPU_Accelerator
@@ -223,7 +229,7 @@ def get_accelerator():
 def set_accelerator(accel_obj):
     global ds_accelerator
     _validate_accelerator(accel_obj)
-    if accel_logger is not None:
+    if accel_logger is not None and accel_obj is not None:
         accel_logger.info(f"Setting ds_accelerator to {accel_obj._name} (model specified)")
     ds_accelerator = accel_obj
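A hedged usage sketch of the external XPU path this change hardens; DS_ACCELERATOR is DeepSpeed's standard accelerator override, while the printed name is an assumption:

import os

# Select the externally packaged XPU accelerator; get_accelerator() now
# re-imports XPU_Accelerator at instantiation time (hence the noqa: F811)
# instead of relying on the name bound during the detection stage.
os.environ["DS_ACCELERATOR"] = "xpu.external"

from deepspeed.accelerator import get_accelerator

ds_accel = get_accelerator()  # raises ValueError if intel_extension_for_deepspeed is missing
print(ds_accel._name)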

build.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+hpu.synapse.v1.22.0

csrc/fp_quantizer/fp_quantize.cpp

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,7 @@

 at::Tensor quantize(torch::Tensor& out,
                     torch::Tensor& val,
+                    torch::Tensor& scale,
                     int group_size,
                     int stochastic_rounding,
                     int q_bits,
@@ -59,6 +60,7 @@ at::Tensor quantize(torch::Tensor& out,

 void dequantize(torch::Tensor& val,
                 torch::Tensor& val_q,
+                torch::Tensor& scale,
                 int group_size,
                 int q_mantisa_bits,
                 int q_exponent_bits)
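Both kernels now take the per-group scale tensor explicitly, so callers allocate it rather than the op deriving it internally. A hedged sketch of a call site; the Python binding name and the scale layout are assumptions, not taken from the diff:

import torch

group_size, stochastic_rounding, q_bits, q_mantisa_bits = 128, 0, 8, 3
val = torch.randn(4096, dtype=torch.bfloat16, device="cuda")
out = torch.empty(4096, dtype=torch.uint8, device="cuda")
# One scale value per quantization group, now owned by the caller.
scale = torch.empty(val.numel() // group_size, dtype=torch.float32, device="cuda")
# Hypothetical binding (module handle assumed), arguments in the diff's order:
# fp_quantizer.quantize(out, val, scale, group_size, stochastic_rounding, q_bits, q_mantisa_bits)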

deepspeed/autotuning/autotuner.py

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ def __init__(self, args, active_resources):
         if not os.path.exists(self.results_dir):
             try:
                 os.makedirs(self.results_dir, exist_ok=True)
-                logger.info(f"Created autotuning results directory: {self.exps_dir}")
+                logger.info(f"Created autotuning results directory: {self.results_dir}")
             except:
                 logger.error(
                     f"Failed to create {self.results_dir}, please check `results_dir` in the autotuning config file is accessible by all the nodes in the job."

deepspeed/autotuning/constants.py

Lines changed: 1 addition & 1 deletion
@@ -144,7 +144,7 @@
     "zero_optimization": {
         "stage": 3
     },
-    "memory_break_down": False
+    "memory_breakdown": False
 }

 DEFAULT_TUNING_SPACE_ZERO_0 = {"zero_optimization": {"stage": 0}}
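The rename matters because DeepSpeed's config schema reads the key memory_breakdown; the misspelled memory_break_down entry was silently ignored. A minimal config sketch with illustrative values:

ds_config = {
    "train_batch_size": 8,  # illustrative
    "zero_optimization": {"stage": 3},
    "memory_breakdown": False,  # key as DeepSpeed's config actually spells it
}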

deepspeed/comm/ccl.py

Lines changed: 5 additions & 20 deletions
@@ -77,27 +77,12 @@ def run_collective(self, name, **kwargs):
         return CCLHandler(self.ccl_comm_op)

     def all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False):
-        use_caching = False
-        if use_caching:
-            match_id = f"{tensor.size()}-{op}"
-            name = "all_reduce_caching"
-            if name in self.available_coll:
-                group = self.get_all_ranks_from_group(group)
-                return self.ccl_comm_op.all_reduce_caching(tensor, op, match_id, group, async_op)
-            else:
-                return self.run_collective(name=name,
-                                           tensor=tensor,
-                                           op=op,
-                                           match_id=match_id,
-                                           group=group,
-                                           async_op=async_op)
+        name = "all_reduce"
+        if name in self.available_coll:
+            group = self.get_all_ranks_from_group(group)
+            return self.ccl_comm_op.all_reduce(tensor, op, group, async_op)
         else:
-            name = "all_reduce"
-            if name in self.available_coll:
-                group = self.get_all_ranks_from_group(group)
-                return self.ccl_comm_op.all_reduce(tensor, op, group, async_op)
-            else:
-                return self.run_collective(name=name, tensor=tensor, op=op, group=group, async_op=async_op)
+            return self.run_collective(name=name, tensor=tensor, op=op, group=group, async_op=async_op)

     def inference_all_reduce(self, tensor, op=ReduceOp.SUM, group=None):
         name = "inference_all_reduce"
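The deleted branch was dead code: use_caching was hard-coded to False, so the all_reduce_caching path could never execute, and the surviving else-branch becomes the whole method body. A hedged usage sketch; the ccl backend initialization reflects an assumed multi-rank oneCCL setup:

import torch
import deepspeed
import deepspeed.comm as dist

# Assumes a multi-rank launch with oneCCL available; this routes through
# CCLBackend.all_reduce above, falling back to run_collective when the
# fused op is not in available_coll.
deepspeed.init_distributed(dist_backend="ccl")
t = torch.ones(4)
dist.all_reduce(t)  # defaults to ReduceOp.SUM across all ranks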
