
Commit cc68b1b

Migrate torch_xla.device() to torch.device('xla')
1 parent 78cff03 commit cc68b1b


140 files changed (+701 -437 lines)

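The change itself is mechanical: every documented call site that obtained the current device via `torch_xla.device()` now spells it with the stock `torch.device('xla')` constructor. A minimal before/after sketch (the linear layer and tensors below are illustrative placeholders, not taken from this commit):

``` python
import torch
import torch_xla  # importing torch_xla registers the 'xla' device type

# Old spelling (before this commit):
# device = torch_xla.device()

# New spelling (after this commit):
device = torch.device('xla')

# Usage downstream is unchanged: modules and tensors move to the device as before.
model = torch.nn.Linear(4, 2).to(device)
x = torch.randn(8, 4, device=device)
y = model(x)
```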

API_GUIDE.md

Lines changed: 5 additions & 5 deletions

@@ -22,7 +22,7 @@ print(t)

 This code should look familiar. PyTorch/XLA uses the same interface as regular
 PyTorch with a few additions. Importing `torch_xla` initializes PyTorch/XLA, and
-`torch_xla.device()` returns the current XLA device. This may be a CPU or TPU
+`torch.device('xla')` returns the current XLA device. This may be a CPU or TPU
 depending on your environment.

 ## XLA Tensors are PyTorch Tensors

@@ -112,7 +112,7 @@ train_loader = xu.SampleGenerator(
 torch.zeros(batch_size, dtype=torch.int64)),
 sample_count=60000 // batch_size // xr.world_size())

-device = torch_xla.device() # Get the XLA device (TPU).
+device = torch.device('xla') # Get the XLA device (TPU).
 model = MNIST().train().to(device) # Create a model and move it to the device.
 loss_fn = nn.NLLLoss()
 optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

@@ -169,7 +169,7 @@ def _mp_fn(index):
 index: Index of the process.
 """

-device = torch_xla.device() # Get the device assigned to this process.
+device = torch.device('xla') # Get the device assigned to this process.
 # Wrap the loader for multi-device.
 mp_device_loader = pl.MpDeviceLoader(train_loader, device)

@@ -197,7 +197,7 @@ single device snippet. Let's go over then one by one.
 - `torch_xla.launch()`
 - Creates the processes that each run an XLA device.
 - This function is a wrapper of multithreading spawn to allow user run the script with torchrun command line also. Each process will only be able to access the device assigned to the current process. For example on a TPU v4-8, there will be 4 processes being spawn up and each process will own a TPU device.
-- Note that if you print the `torch_xla.device()` on each process you will see `xla:0` on all devices. This is because each process can only see one device. This does not mean multi-process is not functioning. The only exeption is with PJRT runtime on TPU v2 and TPU v3 since there will be `#devices/2` processes and each process will have 2 threads (check this [doc](https://github.com/pytorch/xla/blob/master/docs/pjrt.md#tpus-v2v3-vs-v4) for more details).
+- Note that if you print the `torch.device('xla')` on each process you will see `xla:0` on all devices. This is because each process can only see one device. This does not mean multi-process is not functioning. The only exeption is with PJRT runtime on TPU v2 and TPU v3 since there will be `#devices/2` processes and each process will have 2 threads (check this [doc](https://github.com/pytorch/xla/blob/master/docs/pjrt.md#tpus-v2v3-vs-v4) for more details).
 - `MpDeviceLoader`
 - Loads the training data onto each device.
 - `MpDeviceLoader` can wrap on a torch dataloader. It can preload the data to the device and overlap the dataloading with device execution to improve the performance.

@@ -290,7 +290,7 @@ import torch
 import torch_xla
 import torch_xla.core.xla_model as xm

-device = torch_xla.device()
+device = torch.device('xla')

 t0 = torch.randn(2, 2, device=device)
 t1 = torch.randn(2, 2, device=device)
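Since this file's multi-process section is the densest cluster of renames, here is a condensed, self-contained sketch of that flow with the new spelling; the tiny model and synthetic loader are stand-ins for the guide's MNIST example, not code from this commit:

``` python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl


def _mp_fn(index):
  # Each spawned process sees exactly one XLA device, so torch.device('xla')
  # resolves to the device assigned to this process.
  device = torch.device('xla')

  # Toy stand-ins for the guide's MNIST model and train_loader.
  model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10),
                        nn.LogSoftmax(dim=1)).train().to(device)
  loss_fn = nn.NLLLoss()
  optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
  dataset = TensorDataset(torch.randn(256, 1, 28, 28),
                          torch.randint(0, 10, (256,)))
  train_loader = DataLoader(dataset, batch_size=32)

  # MpDeviceLoader preloads batches onto the device and overlaps data loading
  # with device execution.
  mp_device_loader = pl.MpDeviceLoader(train_loader, device)
  for data, target in mp_device_loader:
    optimizer.zero_grad()
    loss = loss_fn(model(data), target)
    loss.backward()
    xm.optimizer_step(optimizer)  # reduces gradients across replicas and steps


if __name__ == '__main__':
  torch_xla.launch(_mp_fn, args=())
```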

benchmarks/experiment_runner.py

Lines changed: 1 addition & 1 deletion

@@ -255,7 +255,7 @@ def _default_iter_fn(self, benchmark_experiment: BenchmarkExperiment,

 def _pure_wall_time_iter_fn(self, benchmark_experiment: BenchmarkExperiment,
 benchmark_model: BenchmarkModel, input_tensor):
-device = torch_xla.device() if benchmark_experiment.xla else 'cuda'
+device = torch.device('xla') if benchmark_experiment.xla else 'cuda'
 sync_fn = xm.wait_device_ops if benchmark_experiment.xla else torch.cuda.synchronize
 timing, output = bench.do_bench(
 lambda: benchmark_model.model_iter_fn(
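The pattern being touched here, choosing a device and a matching "wait until the device is idle" call depending on the backend, can be sketched in isolation. The `use_xla` flag is illustrative; the real runner reads it from the experiment configuration:

``` python
import torch
import torch_xla
import torch_xla.core.xla_model as xm

use_xla = True  # illustrative stand-in for benchmark_experiment.xla

# Pick the device and the call that blocks until queued device work finishes.
device = torch.device('xla') if use_xla else torch.device('cuda')
sync_fn = xm.wait_device_ops if use_xla else torch.cuda.synchronize

x = torch.randn(1024, 1024, device=device)
y = x @ x
if use_xla:
  torch_xla.sync()  # dispatch the lazily recorded graph to the device
sync_fn()           # wait for the device before reading timings or results
```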

contrib/kaggle/distributed-pytorch-xla-basics-with-pjrt.ipynb

Lines changed: 3 additions & 3 deletions

@@ -193,7 +193,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": null,
 "metadata": {
 "execution": {
 "iopub.execute_input": "2024-01-10T19:30:28.607393Z",

@@ -210,7 +210,7 @@
 "lock = mp.Manager().Lock()\n",
 "\n",
 "def print_device(i, lock):\n",
-" device = torch_xla.device()\n",
+" device = torch.device('xla')\n",
 " with lock:\n",
 " print('process', i, device)"
 ]

@@ -454,7 +454,7 @@
 "import torch_xla.experimental.pjrt_backend # Required for torch.distributed on TPU v2 and v3\n",
 "\n",
 "def toy_model(index, lock):\n",
-" device = torch_xla.device()\n",
+" device = torch.device('xla')\n",
 " dist.init_process_group('xla', init_method='xla://')\n",
 "\n",
 " # Initialize a basic toy model\n",

docs/source/learn/_pjrt.md

Lines changed: 1 addition & 1 deletion

@@ -73,7 +73,7 @@ import torch_xla.distributed.xla_backend


 def _mp_fn(index):
-device = torch_xla.device()
+device = torch.device('xla')
 - dist.init_process_group('xla', rank=xr.global_ordinal(), world_size=xr.world_size())
 + dist.init_process_group('xla', init_method='xla://')

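The edited snippet pairs the new device spelling with the `xla://` init method, which derives rank and world size from the runtime instead of taking them as arguments. A minimal sketch of that combination, assuming the processes are started with `torch_xla.launch` (the all-reduce is illustrative):

``` python
import torch
import torch.distributed as dist
import torch_xla
import torch_xla.distributed.xla_backend  # registers the 'xla' dist backend


def _mp_fn(index):
  device = torch.device('xla')
  # Rank and world size are inferred from the XLA runtime via xla://.
  dist.init_process_group('xla', init_method='xla://')

  t = torch.ones(2, 2, device=device) * dist.get_rank()
  dist.all_reduce(t)  # illustrative collective across all processes


if __name__ == '__main__':
  torch_xla.launch(_mp_fn, args=())
```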
docs/source/learn/eager.md

Lines changed: 2 additions & 2 deletions

@@ -13,7 +13,7 @@ import torch
 import torch_xla
 import torchvision

-device = torch_xla.device()
+device = torch.device('xla')
 model = torchvision.models.resnet18().to(device)
 input = torch.randn(64, 3, 224, 224).to(device)

@@ -71,7 +71,7 @@ import torchvision
 # Run ops eagerly by default
 torch_xla.experimental.eager_mode(True)

-device = torch_xla.device()
+device = torch.device('xla')
 model = torchvision.models.resnet18().to(device)

 # Mark the function to be compiled
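Both hunks in this file sit inside the eager-mode walkthrough. A condensed sketch of the surrounding example with the new spelling; the `torch_xla.compile` decorator stands in for the part the doc introduces with "Mark the function to be compiled":

``` python
import torch
import torch_xla
import torchvision

# Run ops eagerly by default instead of accumulating a lazy graph.
torch_xla.experimental.eager_mode(True)

device = torch.device('xla')
model = torchvision.models.resnet18().to(device)


# Mark the hot path to be compiled; everything outside it stays eager.
@torch_xla.compile
def run(inputs):
  return model(inputs)


output = run(torch.randn(64, 3, 224, 224, device=device))
```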

docs/source/learn/pytorch-on-xla-devices.md

Lines changed: 5 additions & 5 deletions

@@ -21,7 +21,7 @@ print(t)

 This code should look familiar. PyTorch/XLA uses the same interface as
 regular PyTorch with a few additions. Importing `torch_xla` initializes
-PyTorch/XLA, and `torch_xla.device()` returns the current XLA device. This
+PyTorch/XLA, and `torch.device('xla')` returns the current XLA device. This
 may be a CPU or TPU depending on your environment.

 ## XLA Tensors are PyTorch Tensors

@@ -81,7 +81,7 @@ The following snippet shows a network training on a single XLA device:
 ``` python
 import torch_xla.core.xla_model as xm

-device = torch_xla.device()
+device = torch.device('xla')
 model = MNIST().train().to(device)
 loss_fn = nn.NLLLoss()
 optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

@@ -120,7 +120,7 @@ import torch_xla.core.xla_model as xm
 import torch_xla.distributed.parallel_loader as pl

 def _mp_fn(index):
-device = torch_xla.device()
+device = torch.device('xla')
 mp_device_loader = pl.MpDeviceLoader(train_loader, device)

 model = MNIST().train().to(device)

@@ -148,7 +148,7 @@ previous single device snippet. Let's go over then one by one.
 will only be able to access the device assigned to the current
 process. For example on a TPU v4-8, there will be 4 processes
 being spawn up and each process will own a TPU device.
-- Note that if you print the `torch_xla.device()` on each process you
+- Note that if you print the `torch.device('xla')` on each process you
 will see `xla:0` on all devices. This is because each process
 can only see one device. This does not mean multi-process is not
 functioning. The only execution is with PJRT runtime on TPU v2

@@ -283,7 +283,7 @@ import torch
 import torch_xla
 import torch_xla.core.xla_model as xm

-device = torch_xla.device()
+device = torch.device('xla')

 t0 = torch.randn(2, 2, device=device)
 t1 = torch.randn(2, 2, device=device)
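The tensors in the last hunk are a convenient hook for recalling the execution model behind them: operations on XLA tensors are recorded lazily and only dispatched when the graph is synced or a value is needed on the host. A short sketch (the explicit `torch_xla.sync()` call is illustrative, not part of this commit):

``` python
import torch
import torch_xla

device = torch.device('xla')

t0 = torch.randn(2, 2, device=device)
t1 = torch.randn(2, 2, device=device)

# The addition is recorded into a graph rather than executed immediately.
t2 = t0 + t1
torch_xla.sync()  # dispatch the recorded graph to the device
print(t2)         # fetches the materialized result back to the host
```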

docs/source/learn/xla-overview.md

Lines changed: 4 additions & 4 deletions

@@ -184,7 +184,7 @@ repo. contains examples for training and serving many LLM and diffusion models.

 General guidelines to modify your code:

-- Replace `cuda` with `torch_xla.device()`
+- Replace `cuda` with `torch.device('xla')`
 - Remove progress bar, printing that would access the XLA tensor
 values
 - Reduce logging and callbacks that would access the XLA tensor values

@@ -227,7 +227,7 @@ tutorial, but you can pass the `device` value to the function as well.

 ``` python
 import torch_xla.core.xla_model as xm
-self.device = torch_xla.device()
+self.device = torch.device('xla')
 ```

 Another place in the code that has cuda specific code is DDIM scheduler.

@@ -244,7 +244,7 @@ if attr.device != torch.device("cuda"):
 with

 ``` python
-device = torch_xla.device()
+device = torch.device('xla')
 attr = attr.to(torch.device(device))
 ```

@@ -339,7 +339,7 @@ with the following lines:

 ``` python
 import torch_xla.core.xla_model as xm
-device = torch_xla.device()
+device = torch.device('xla')
 pipe.to(device)
 ```

docs/source/perf/amp.md

Lines changed: 5 additions & 5 deletions

@@ -27,7 +27,7 @@ for input, target in data:
 optimizer.zero_grad()

 # Enables autocasting for the forward pass
-with autocast(torch_xla.device()):
+with autocast(torch.device('xla')):
 output = model(input)
 loss = loss_fn(output, target)

@@ -36,7 +36,7 @@ for input, target in data:
 xm.optimizer_step.(optimizer)
 ```

-`autocast(torch_xla.device())` aliases `torch.autocast('xla')` when the XLA
+`autocast(torch.device('xla'))` aliases `torch.autocast('xla')` when the XLA
 Device is a TPU. Alternatively, if a script is only used with TPUs, then
 `torch.autocast('xla', dtype=torch.bfloat16)` can be directly used.

@@ -115,7 +115,7 @@ for input, target in data:
 optimizer.zero_grad()

 # Enables autocasting for the forward pass
-with autocast(torch_xla.device()):
+with autocast(torch.device('xla')):
 output = model(input)
 loss = loss_fn(output, target)

@@ -127,12 +127,12 @@ for input, target in data:
 scaler.update()
 ```

-`autocast(torch_xla.device())` aliases `torch.cuda.amp.autocast()` when the
+`autocast(torch.device('xla'))` aliases `torch.cuda.amp.autocast()` when the
 XLA Device is a CUDA device (XLA:GPU). Alternatively, if a script is
 only used with CUDA devices, then `torch.cuda.amp.autocast` can be
 directly used, but requires `torch` is compiled with `cuda` support for
 datatype of `torch.bfloat16`. We recommend using
-`autocast(torch_xla.device())` on XLA:GPU as it does not require
+`autocast(torch.device('xla'))` on XLA:GPU as it does not require
 `torch.cuda` support for any datatypes, including `torch.bfloat16`.

 ### AMP for XLA:GPU Best Practices
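For a runnable view of the TPU autocast usage this file documents, here is a condensed AMP step using the `torch.autocast('xla', dtype=torch.bfloat16)` form the doc names as the TPU-only alternative; the model and data are toy stand-ins:

``` python
import torch
import torch.nn as nn
import torch.optim as optim
import torch_xla
import torch_xla.core.xla_model as xm

device = torch.device('xla')
model = nn.Linear(128, 10).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

inputs = torch.randn(32, 128, device=device)
targets = torch.randint(0, 10, (32,), device=device)

optimizer.zero_grad()
# Autocast the forward pass to bfloat16 on the XLA device.
with torch.autocast('xla', dtype=torch.bfloat16):
  outputs = model(inputs)
  loss = loss_fn(outputs, targets)
loss.backward()
xm.optimizer_step(optimizer)  # all-reduces gradients (if replicated) and steps
torch_xla.sync()              # dispatch the step's graph to the device
```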

docs/source/perf/ddp.md

Lines changed: 1 addition & 1 deletion

@@ -105,7 +105,7 @@ def demo_basic(rank):
 setup(rank, world_size)

 # create model and move it to XLA device
-device = torch_xla.device()
+device = torch.device('xla')
 model = ToyModel().to(device)
 ddp_model = DDP(model, gradient_as_bucket_view=True)

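A self-contained sketch of the DDP flow this file documents, with the new device spelling and the xla backend; the linear layer and random batch stand in for the doc's `ToyModel` and training data:

``` python
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch_xla
import torch_xla.distributed.xla_backend  # registers the 'xla' dist backend
from torch.nn.parallel import DistributedDataParallel as DDP


def demo_basic(rank):
  dist.init_process_group('xla', init_method='xla://')

  # Create the model and move it to the XLA device owned by this process.
  device = torch.device('xla')
  model = nn.Linear(10, 10).to(device)  # stand-in for ToyModel
  ddp_model = DDP(model, gradient_as_bucket_view=True)

  optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
  loss_fn = nn.MSELoss()

  optimizer.zero_grad()
  outputs = ddp_model(torch.randn(20, 10, device=device))
  loss_fn(outputs, torch.randn(20, 10, device=device)).backward()
  optimizer.step()
  torch_xla.sync()  # dispatch the recorded step to the device


if __name__ == '__main__':
  torch_xla.launch(demo_basic)
```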
docs/source/perf/dynamo.md

Lines changed: 2 additions & 2 deletions

@@ -41,7 +41,7 @@ import torchvision
 import torch_xla.core.xla_model as xm

 def eval_model(loader):
-device = torch_xla.device()
+device = torch.device('xla')
 xla_resnet18 = torchvision.models.resnet18().to(device)
 xla_resnet18.eval()
 dynamo_resnet18 = torch.compile(

@@ -129,7 +129,7 @@ def train_model(model, data, target, optimizer):
 return pred

 def train_model_main(loader):
-device = torch_xla.device()
+device = torch.device('xla')
 xla_resnet18 = torchvision.models.resnet18().to(device)
 xla_resnet18.train()
 dynamo_train_model = torch.compile(
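The inference path updated above can be exercised end to end with a short sketch using the `openxla` Dynamo backend; the batch shape and ResNet-18 mirror the doc's example, while the rest is illustrative:

``` python
import torch
import torch_xla
import torchvision

device = torch.device('xla')

xla_resnet18 = torchvision.models.resnet18().to(device)
xla_resnet18.eval()

# Compile with the openxla backend so Dynamo traces through to XLA.
dynamo_resnet18 = torch.compile(xla_resnet18, backend='openxla')

with torch.no_grad():
  batch = torch.randn(8, 3, 224, 224, device=device)
  output = dynamo_resnet18(batch)
```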
