
Commit c679690

enable TP

1 parent fd57737 · commit c679690

20 files changed with 620 additions and 608 deletions.
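The diffs below apply one mechanical substitution across the FSDP test suite: `torch.cuda` calls, CUDA streams/events, and `device="cuda"` strings become their `torch.xpu` counterparts, and CUDA-only skip conditions switch to `TEST_XPU`. For orientation, here is a minimal device-agnostic sketch of the same idea; it is not part of this commit, and the helper name `_get_device_type` and the `DEVICE_MODULE` constant are illustrative only.

import torch


def _get_device_type() -> str:
    # Prefer XPU when available (the backend these tests target), then CUDA, then CPU.
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"


DEVICE_TYPE = _get_device_type()
DEVICE_MODULE = getattr(torch, DEVICE_TYPE)  # torch.xpu, torch.cuda, or torch.cpu

# A world_size property could then read:
#     return min(2, DEVICE_MODULE.device_count())
# instead of hard-coding torch.xpu.device_count().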

test/distributed/_composable/fsdp/test_fully_shard_init.py

Lines changed: 80 additions & 80 deletions (large diff not rendered by default)

test/distributed/_composable/fsdp/test_fully_shard_logging.py

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@
 from torch._dynamo.test_case import run_tests
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.inductor_utils import HAS_CUDA
+from torch.testing._internal.common_utils import TEST_XPU
 from torch.testing._internal.logging_utils import LoggingTestCase

test/distributed/_composable/fsdp/test_fully_shard_memory.py

Lines changed: 9 additions & 9 deletions
@@ -18,7 +18,7 @@
 class TestFullyShardMemory(FSDPTest):
     @property
     def world_size(self) -> int:
-        return min(2, torch.cuda.device_count())
+        return min(2, torch.xpu.device_count())
 
     @skip_if_lt_x_gpu(2)
     def test_fully_shard_training_memory(self):
@@ -56,10 +56,10 @@ def _test_fully_shard_training_memory(
         # Pre-run a linear forward (gemm and bias) and backward (gemm) to
         # allocate the cuBLAS workspaces before measuring the memory usage
         # since the workspace size can differ between hardwares
-        lin = torch.nn.Linear(768, 768, device="cuda")
-        inp = torch.randn(1, 768, device="cuda")
+        lin = torch.nn.Linear(768, 768, device="xpu")
+        inp = torch.randn(1, 768, device="xpu")
         lin(inp).sum().backward()
-        torch.cuda.empty_cache()
+        torch.xpu.empty_cache()
         base_mem_mb = self._get_peak_active_memory_mb()
         vocab_size = 32
         model_args = ModelArgs(
@@ -108,7 +108,7 @@ def _test_fully_shard_training_memory(
         self.assertLessEqual(curr_mem_mb - base_mem_mb, init_mem_mb)
 
         # Use a small input to minimize activation memory usage
-        inp = torch.randint(0, vocab_size, (1, 4), device="cuda")
+        inp = torch.randint(0, vocab_size, (1, 4), device="xpu")
 
         # Forward:
         loss = model(inp)
@@ -166,7 +166,7 @@ def _test_fully_shard_training_memory(
         ) * 4 / 1e6 + buffer_mb
         self.assertLessEqual(mem_mb - base_mem_mb, expected_mem_mb)
         del loss
-        torch.cuda.reset_peak_memory_stats()
+        torch.xpu.reset_peak_memory_stats()
 
         # Optimizer step: unsharded parameters/gradients freed
         if not run_optim_in_backward:
@@ -184,7 +184,7 @@ def _test_fully_shard_training_memory(
         # Zero grad: sharded gradients freed
         if not run_optim_in_backward:
             optim.zero_grad()
-        torch.cuda.reset_peak_memory_stats()  # reset after freeing
+        torch.xpu.reset_peak_memory_stats()  # reset after freeing
         mem_mb = self._get_peak_active_memory_mb()
         expected_mem_mb = 0
         if not use_cpu_offload:
@@ -225,11 +225,11 @@ def test_fully_shard_del_memory(self):
         self.assertEqual(mem_mb, base_mem_mb)
 
     def _get_peak_active_memory_mb(self) -> int:
-        mem_stats = torch.cuda.memory_stats()
+        mem_stats = torch.xpu.memory_stats()
         return round(mem_stats["active_bytes.all.peak"] / 1e6)
 
     def _get_curr_active_memory_mb(self) -> int:
-        mem_stats = torch.cuda.memory_stats()
+        mem_stats = torch.xpu.memory_stats()
         return round(mem_stats["active_bytes.all.current"] / 1e6)
 
     def _register_optim_in_backward(
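The memory assertions above all funnel through the allocator statistics API, which the XPU backend exposes under the same names. A standalone sketch of that measurement recipe, assuming an XPU build of PyTorch (the workload in the middle is a placeholder):

import torch

# Warm up a linear layer so one-time GEMM workspace allocations do not skew the numbers.
lin = torch.nn.Linear(768, 768, device="xpu")
inp = torch.randn(1, 768, device="xpu")
lin(inp).sum().backward()
torch.xpu.empty_cache()
torch.xpu.reset_peak_memory_stats()

# ... run the workload being measured ...

stats = torch.xpu.memory_stats()
peak_mb = round(stats["active_bytes.all.peak"] / 1e6)
curr_mb = round(stats["active_bytes.all.current"] / 1e6)
print(f"peak active: {peak_mb} MB, current active: {curr_mb} MB")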

test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py

Lines changed: 15 additions & 15 deletions
@@ -32,7 +32,7 @@
 class TestFullyShardMixedPrecisionTraining(FSDPTest):
     @property
     def world_size(self) -> int:
-        return min(4, torch.cuda.device_count())
+        return min(4, torch.xpu.device_count())
 
     def _init_models_and_optims(
         self,
@@ -43,7 +43,7 @@ def _init_models_and_optims(
     ):
         torch.manual_seed(42)
         model = nn.Sequential(*[MLP(16, torch.device("cpu")) for _ in range(3)])
-        ref_model = copy.deepcopy(model).cuda()
+        ref_model = copy.deepcopy(model).xpu()
         ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
 
         def _shard_placement_fn(param: nn.Parameter) -> Optional[Shard]:
@@ -122,7 +122,7 @@ def assert_fn(output: torch.Tensor):
         )
 
         torch.manual_seed(42 + self.rank + 1)
-        inp = torch.randn((4, 16), device="cuda", dtype=param_dtype)
+        inp = torch.randn((4, 16), device="xpu", dtype=param_dtype)
         for iter_idx in range(10):
             optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
             fsdp_loss = model(inp).sum()
@@ -207,7 +207,7 @@ def assert_fn(output: torch.Tensor):
             reduce_scatter_with_assert, self, orig_reduce_scatter, assert_fn
         )
         torch.manual_seed(42 + self.rank + 1)
-        inp = torch.randn((4, 16), device="cuda", dtype=param_dtype)
+        inp = torch.randn((4, 16), device="xpu", dtype=param_dtype)
         for iter_idx in range(10):
             optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
             fsdp_loss = model(inp).sum()
@@ -256,7 +256,7 @@ def assert_fn(output: torch.Tensor):
             reduce_scatter_with_assert, self, orig_reduce_scatter, assert_fn
         )
         torch.manual_seed(42 + self.rank + 1)
-        inp = torch.randn((4, 16), device="cuda", dtype=param_dtype)
+        inp = torch.randn((4, 16), device="xpu", dtype=param_dtype)
         for iter_idx in range(10):
             optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
             fsdp_loss = model(inp).sum()
@@ -307,7 +307,7 @@ def _test_grad_acc_with_reduce_dtype(self, reshard_after_forward: bool):
         # To emulate the mixed precision implementation where forward/backward
         # compute use bf16 and optimizer uses fp32, we maintain both an fp32
         # and a bf16 copy of the reference model
-        ref_model = copy.deepcopy(model).cuda()
+        ref_model = copy.deepcopy(model).xpu()
         ref_model_compute = copy.deepcopy(ref_model).to(param_dtype)
         ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
         for mlp in model:
@@ -327,7 +327,7 @@ def assert_fn(output: torch.Tensor):
             reduce_scatter_with_assert, self, orig_reduce_scatter, assert_fn
         )
         torch.manual_seed(42 + self.rank + 1)
-        device = torch.device("cuda")
+        device = torch.device("xpu")
         # Train on the same input to avoid loss explosion
         num_microbatches = 4
         inp = torch.randn((2 * num_microbatches, 16), device=device, dtype=param_dtype)
@@ -387,15 +387,15 @@ def world_size(self) -> int:
 
     @skip_if_lt_x_gpu(1)
     def test_float16_on_one_submodule(self):
-        x = torch.zeros(2, 100, device="cuda")
+        x = torch.zeros(2, 100, device="xpu")
 
         # Subtest 1: use fp16 on the second child submodule -- does not require
         # any additional casting logic
         forward_inputs: dict[str, nn.Module] = {}
         model = SaveForwardInputsModel(
             forward_inputs,
             cast_forward_inputs=False,
-        ).cuda()
+        ).xpu()
         fully_shard(model.c2, mp_policy=MixedPrecisionPolicy(param_dtype=torch.float16))
         fully_shard(model)
         model(x).sum().backward()
@@ -408,7 +408,7 @@ def test_float16_on_one_submodule(self):
         forward_inputs: dict[nn.Module, torch.Tensor] = {}
         model = SaveForwardInputsModel(
             forward_inputs=forward_inputs, cast_forward_inputs=True
-        ).cuda()
+        ).xpu()
         fully_shard(
             model.c2,
             mp_policy=MixedPrecisionPolicy(
@@ -426,7 +426,7 @@ def test_float16_on_one_submodule(self):
         forward_inputs: dict[nn.Module, torch.Tensor] = {}
         model = SaveForwardInputsModel(
             forward_inputs=forward_inputs, cast_forward_inputs=False
-        ).cuda()
+        ).xpu()
         fully_shard(
             model.c1,
             mp_policy=MixedPrecisionPolicy(
@@ -468,13 +468,13 @@ def __init__(self, forward_inputs: dict[str, torch.Tensor]) -> None:
             def forward(self, x: torch.Tensor) -> torch.Tensor:
                 self.forward_inputs["model_input_x"] = x
                 y = torch.ones(
-                    2, 100, device="cuda", dtype=torch.float32
+                    2, 100, device="xpu", dtype=torch.float32
                 )  # external input
                 return self.l2(self.l1(x), y)
 
         forward_inputs: dict[str, torch.Tensor] = {}
-        model = ToyModel(forward_inputs).cuda()
-        x = torch.zeros(2, 100, device="cuda", dtype=torch.float32)
+        model = ToyModel(forward_inputs).xpu()
+        x = torch.zeros(2, 100, device="xpu", dtype=torch.float32)
         fully_shard(
             model.l2,
             mp_policy=MixedPrecisionPolicy(
@@ -577,7 +577,7 @@ def assert_fn(output: torch.Tensor):
             reduce_scatter_with_assert, self, orig_reduce_scatter, assert_fn
         )
         with patch_reduce_scatter(reduce_scatter):
-            inp = torch.randn((4, 32), device="cuda")
+            inp = torch.randn((4, 32), device="xpu")
             loss = model(inp).sum()
             loss.backward()
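These hunks change only the device the mixed-precision tests run on; the policy wiring is untouched. A compressed sketch of that wiring on XPU, assuming a single-rank process group can be created (the model, dimensions, and the `xccl` backend choice are illustrative assumptions, not taken from the test):

import os

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard

# Single-rank setup so the sketch is self-contained; the real tests use FSDPTest.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("xccl", rank=0, world_size=1)  # backend assumed for XPU
torch.xpu.set_device(0)

model = nn.Sequential(nn.Linear(16, 16), nn.Linear(16, 16)).to("xpu")
# Compute the second block in fp16 while keeping the sharded parameters in fp32.
fully_shard(model[1], mp_policy=MixedPrecisionPolicy(param_dtype=torch.float16))
fully_shard(model)

inp = torch.randn(4, 16, device="xpu")
model(inp).sum().backward()  # model[1] gathers/casts to fp16 for forward/backward
dist.destroy_process_group()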

test/distributed/_composable/fsdp/test_fully_shard_overlap.py

Lines changed: 18 additions & 18 deletions
@@ -35,7 +35,7 @@ class TestFullyShardOverlap(FSDPTest):
 
     @property
     def world_size(self) -> int:
-        return min(2, torch.cuda.device_count())
+        return min(2, torch.xpu.device_count())
 
     @skip_if_lt_x_gpu(2)
     def test_fully_shard_training_overlap(self):
@@ -46,23 +46,23 @@ def test_fully_shard_training_overlap(self):
         model = nn.Sequential(
             *[LinearWithSleep(dim, compute_sleep_ms) for _ in range(num_linears)]
         )
-        ref_model = copy.deepcopy(model).cuda()
+        ref_model = copy.deepcopy(model).xpu()
         for lin in model:
             assert len(list(lin.parameters())) == 1, "Expects only one weight"
             fully_shard(lin, reshard_after_forward=True)
         fully_shard(model, reshard_after_forward=True)
 
         orig_all_gather_into_tensor = dist.all_gather_into_tensor
         orig_reduce_scatter_tensor = dist.reduce_scatter_tensor
-        comm_stream = torch.cuda.Stream()
+        comm_stream = torch.xpu.Stream()
 
         def delay_collective():
             # Share a stream so that all-gather and reduce-scatter block each
             # other like in `ProcessGroupNCCL`
-            comm_stream.wait_stream(torch.cuda.current_stream())
-            with torch.cuda.stream(comm_stream):
-                torch.cuda._sleep(int(comm_sleep_ms * get_cycles_per_ms()))
-            torch.cuda.current_stream().wait_stream(comm_stream)
+            comm_stream.wait_stream(torch.xpu.current_stream())
+            with torch.xpu.stream(comm_stream):
+                torch.xpu._sleep(int(comm_sleep_ms * get_cycles_per_ms()))
+            torch.xpu.current_stream().wait_stream(comm_stream)
 
         def delayed_all_gather(*args, **kwargs):
             delay_collective()
@@ -72,7 +72,7 @@ def delayed_reduce_scatter(*args, **kwargs):
             delay_collective()
             return orig_reduce_scatter_tensor(*args, **kwargs)
 
-        inp = torch.randn((2, dim), device="cuda")
+        inp = torch.randn((2, dim), device="xpu")
         loss = model(inp).sum()  # warmup CUDA and allocator
         loss.backward()
 
@@ -153,17 +153,17 @@ def test_fully_shard_post_optim_event_overlap(self):
         # low-compute linear, where only the low-compute linear uses FSDP
         model = nn.Sequential(
             LinearWithSleep(dim, compute_sleep_ms), nn.Linear(dim, dim)
-        ).cuda()
+        ).xpu()
         fully_shard(model[1], reshard_after_forward=False)
         optim = torch.optim.AdamW(model.parameters(), lr=1e-2)
 
         orig_all_gather_into_tensor = dist.all_gather_into_tensor
 
         def delayed_all_gather(*args, **kwargs):
-            torch.cuda._sleep(int(comm_sleep_ms * get_cycles_per_ms()))
+            torch.xpu._sleep(int(comm_sleep_ms * get_cycles_per_ms()))
             return orig_all_gather_into_tensor(*args, **kwargs)
 
-        inp = torch.randn((2, dim), device="cuda")
+        inp = torch.randn((2, dim), device="xpu")
 
         def run_train_steps(num_iters: int, use_post_optim_event: bool):
             for _ in range(num_iters):
@@ -174,7 +174,7 @@ def run_train_steps(num_iters: int, use_post_optim_event: bool):
                 with implicit_replication():
                     optim.step()
                 if use_post_optim_event:
-                    post_optim_event = torch.cuda.current_stream().record_event()
+                    post_optim_event = torch.xpu.current_stream().record_event()
                     model[1].set_post_optim_event(post_optim_event)
 
         run_train_steps(1, False)  # warmup CUDA and allocator
@@ -205,14 +205,14 @@ def run_train_steps(num_iters: int, use_post_optim_event: bool):
         self.assertGreater(baseline_time, test_time)
 
     def _time_fn(self, fn: Callable):
-        start_event = torch.cuda.Event(enable_timing=True)
-        end_event = torch.cuda.Event(enable_timing=True)
+        start_event = torch.xpu.Event(enable_timing=True)
+        end_event = torch.xpu.Event(enable_timing=True)
         dist.barrier()
-        torch.cuda.synchronize()
+        torch.xpu.synchronize()
         start_event.record()
         fn()
         end_event.record()
-        torch.cuda.synchronize()
+        torch.xpu.synchronize()
         elapsed_time = start_event.elapsed_time(end_event)
         return elapsed_time
 
@@ -223,13 +223,13 @@ class Matmul(torch.autograd.Function):
     def forward(ctx, input: torch.Tensor, weight: torch.Tensor, sleep_ms: int):
         ctx.save_for_backward(input, weight)
         ctx.sleep_ms = sleep_ms
-        torch.cuda._sleep(int(sleep_ms * get_cycles_per_ms()))
+        torch.xpu._sleep(int(sleep_ms * get_cycles_per_ms()))
         return input @ weight
 
     @staticmethod
     def backward(ctx, grad_output: torch.Tensor):
         (input, weight) = ctx.saved_tensors
-        torch.cuda._sleep(int(2 * ctx.sleep_ms * get_cycles_per_ms()))
+        torch.xpu._sleep(int(2 * ctx.sleep_ms * get_cycles_per_ms()))
         grad_input = grad_output @ weight.T
         grad_weight = input.T @ grad_output
         return grad_input, grad_weight, None
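The `_time_fn` and `_sleep` changes above keep the usual event-based timing recipe, only on XPU events. A single-process sketch of that recipe (no `dist.barrier`), assuming an XPU build of PyTorch; the matmul workload is a placeholder:

import torch

start_event = torch.xpu.Event(enable_timing=True)
end_event = torch.xpu.Event(enable_timing=True)

torch.xpu.synchronize()  # drain prior work so it is not counted
start_event.record()
x = torch.randn(4096, 4096, device="xpu")
y = (x @ x).sum()        # the work being timed
end_event.record()
torch.xpu.synchronize()  # wait until end_event has actually been recorded

print(f"elapsed: {start_event.elapsed_time(end_event):.3f} ms")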

test/distributed/_composable/fsdp/test_fully_shard_state.py

Lines changed: 6 additions & 6 deletions
@@ -7,15 +7,15 @@
 from torch.distributed.fsdp import FSDPModule, fully_shard
 from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_fsdp import FSDPTestMultiThread, MLP
-from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.common_utils import run_tests, TEST_XPU
 
 
 class TestFullyShardState(FSDPTestMultiThread):
     @property
     def world_size(self) -> int:
         return 1
 
-    @unittest.skipIf(not TEST_CUDA, "no cuda")
+    @unittest.skipIf(not TEST_XPU, "no xpu")
     def test_fully_shard_state(self):
         """
         Tests the ability to get the state object from a fully sharded module.
@@ -31,7 +31,7 @@ def test_fully_shard_state(self):
         # Check that each `fully_shard` call constructs a distinct state object
         self.assertEqual(len(set(all_states)), num_mlps + 1)
 
-    @unittest.skipIf(not TEST_CUDA, "no cuda")
+    @unittest.skipIf(not TEST_XPU, "no xpu")
     def test_fully_shard_reapply(self):
         model = MLP(8)
         fully_shard(model)
@@ -41,7 +41,7 @@ def test_fully_shard_reapply(self):
         ):
             fully_shard(model)
 
-    @unittest.skipIf(not TEST_CUDA, "no cuda")
+    @unittest.skipIf(not TEST_XPU, "no xpu")
     def test_fully_shard_cls(self):
         # Check that we only swap class for the module passed to `fully_shard`
         model = MLP(8)
@@ -64,7 +64,7 @@ def test_fully_shard_cls(self):
         self.assertTrue(isinstance(sliced_model, nn.Sequential))
         self.assertFalse(isinstance(sliced_model, FSDPModule))
 
-    @unittest.skipIf(not TEST_CUDA, "no cuda")
+    @unittest.skipIf(not TEST_XPU, "no xpu")
     def test_fully_shard_unsupported_module_cls(self):
         regex = (
             r"fully\_shard does not support containers that do not implement forward"
@@ -76,7 +76,7 @@ def test_fully_shard_unsupported_module_cls(self):
         with self.assertRaisesRegex(ValueError, regex):
             fully_shard(model)
 
-    @unittest.skipIf(not TEST_CUDA, "no cuda")
+    @unittest.skipIf(not TEST_XPU, "no xpu")
     def test_fully_shard_deepcopy(self):
         model = MLP(8)
         fully_shard(model)
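The skip decorators above gate each test on a single backend flag. A hedged sketch of the equivalent guard written to accept either backend (the test class name and the combined condition are illustrative, not part of this commit):

import unittest

import torch
from torch.testing._internal.common_cuda import TEST_CUDA
from torch.testing._internal.common_utils import TEST_XPU, run_tests


class ExampleGuardTest(unittest.TestCase):
    @unittest.skipIf(not (TEST_CUDA or TEST_XPU), "no accelerator")
    def test_device_count(self):
        # Pick whichever backend flag is set; both modules expose device_count().
        device_module = torch.xpu if TEST_XPU else torch.cuda
        self.assertGreaterEqual(device_module.device_count(), 1)


if __name__ == "__main__":
    run_tests()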
