
Commit 022ecde

Migrate runtime.xla_device in favor of core.xla_model.xla_device (#9200)
1 parent: a159c9c · commit 022ecde

165 files changed: +985 −1008 lines

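Throughout the diff, calls to `xm.xla_device()` are replaced with `torch_xla.device()` or with the `'xla'` device string. A minimal before/after sketch of the pattern, assuming a PyTorch/XLA installation where both spellings are still available:

```python
import torch
import torch_xla
import torch_xla.core.xla_model as xm

# Old spelling, being migrated away from in this commit:
t_old = torch.randn(2, 2, device=xm.xla_device())

# New spellings used by the diff: ask torch_xla for the current device,
# or pass the 'xla' device string directly to the tensor factory.
device = torch_xla.device()
t_new = torch.randn(2, 2, device='xla')

print(t_old.device, t_new.device, device)
```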

API_GUIDE.md

Lines changed: 12 additions & 12 deletions
@@ -15,14 +15,14 @@ import torch
 import torch_xla
 import torch_xla.core.xla_model as xm
 
-t = torch.randn(2, 2, device=xm.xla_device())
+t = torch.randn(2, 2, device='xla')
 print(t.device)
 print(t)
 ```
 
 This code should look familiar. PyTorch/XLA uses the same interface as regular
 PyTorch with a few additions. Importing `torch_xla` initializes PyTorch/XLA, and
-`xm.xla_device()` returns the current XLA device. This may be a CPU or TPU
+`torch_xla.device()` returns the current XLA device. This may be a CPU or TPU
 depending on your environment.
 
 ## XLA Tensors are PyTorch Tensors
@@ -32,8 +32,8 @@ PyTorch operations can be performed on XLA tensors just like CPU or CUDA tensors
 For example, XLA tensors can be added together:
 
 ```python
-t0 = torch.randn(2, 2, device=xm.xla_device())
-t1 = torch.randn(2, 2, device=xm.xla_device())
+t0 = torch.randn(2, 2, device='xla')
+t1 = torch.randn(2, 2, device='xla')
 print(t0 + t1)
 ```
 
@@ -46,8 +46,8 @@ print(t0.mm(t1))
 Or used with neural network modules:
 
 ```python
-l_in = torch.randn(10, device=xm.xla_device())
-linear = torch.nn.Linear(10, 20).to(xm.xla_device())
+l_in = torch.randn(10, device='xla')
+linear = torch.nn.Linear(10, 20).to(torch_xla.device())
 l_out = linear(l_in)
 print(l_out)
 ```
@@ -56,7 +56,7 @@ Like other device types, XLA tensors only work with other XLA tensors on the
 same device. So code like
 
 ```python
-l_in = torch.randn(10, device=xm.xla_device())
+l_in = torch.randn(10, device='xla')
 linear = torch.nn.Linear(10, 20)
 l_out = linear(l_in)
 print(l_out)
@@ -109,10 +109,10 @@ class MNIST(nn.Module):
 batch_size = 128
 train_loader = xu.SampleGenerator(
     data=(torch.zeros(batch_size, 1, 28, 28),
-          torch.zeros(batch_size, dtype=torch.int64)),
+        torch.zeros(batch_size, dtype=torch.int64)),
     sample_count=60000 // batch_size // xr.world_size())
 
-device = xm.xla_device()  # Get the XLA device (TPU).
+device = torch_xla.device()  # Get the XLA device (TPU).
 model = MNIST().train().to(device)  # Create a model and move it to the device.
 loss_fn = nn.NLLLoss()
 optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
@@ -169,7 +169,7 @@ def _mp_fn(index):
       index: Index of the process.
   """
 
-  device = xm.xla_device()  # Get the device assigned to this process.
+  device = torch_xla.device()  # Get the device assigned to this process.
   # Wrap the loader for multi-device.
   mp_device_loader = pl.MpDeviceLoader(train_loader, device)
 
@@ -197,7 +197,7 @@ single device snippet. Let's go over then one by one.
 - `torch_xla.launch()`
   - Creates the processes that each run an XLA device.
   - This function is a wrapper of multithreading spawn to allow user run the script with torchrun command line also. Each process will only be able to access the device assigned to the current process. For example on a TPU v4-8, there will be 4 processes being spawn up and each process will own a TPU device.
-  - Note that if you print the `xm.xla_device()` on each process you will see `xla:0` on all devices. This is because each process can only see one device. This does not mean multi-process is not functioning. The only exeption is with PJRT runtime on TPU v2 and TPU v3 since there will be `#devices/2` processes and each process will have 2 threads (check this [doc](https://github.com/pytorch/xla/blob/master/docs/pjrt.md#tpus-v2v3-vs-v4) for more details).
+  - Note that if you print the `torch_xla.device()` on each process you will see `xla:0` on all devices. This is because each process can only see one device. This does not mean multi-process is not functioning. The only exeption is with PJRT runtime on TPU v2 and TPU v3 since there will be `#devices/2` processes and each process will have 2 threads (check this [doc](https://github.com/pytorch/xla/blob/master/docs/pjrt.md#tpus-v2v3-vs-v4) for more details).
 - `MpDeviceLoader`
   - Loads the training data onto each device.
   - `MpDeviceLoader` can wrap on a torch dataloader. It can preload the data to the device and overlap the dataloading with device execution to improve the performance.
@@ -290,7 +290,7 @@ import torch
 import torch_xla
 import torch_xla.core.xla_model as xm
 
-device = xm.xla_device()
+device = torch_xla.device()
 
 t0 = torch.randn(2, 2, device=device)
 t1 = torch.randn(2, 2, device=device)
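The API_GUIDE hunks above describe `torch_xla.launch()` and `MpDeviceLoader` in prose. A minimal runnable sketch of that multi-process entry point, assuming a machine with XLA devices and using the same `xu.SampleGenerator` helper shown in the guide (batch size and sample count here are illustrative):

```python
import torch
import torch_xla
import torch_xla.distributed.parallel_loader as pl
import torch_xla.utils.utils as xu


def _mp_fn(index):
  # Each spawned process sees exactly one XLA device, so this usually
  # prints `xla:0` in every process (TPU v2/v3 are the exception noted above).
  device = torch_xla.device()
  train_loader = xu.SampleGenerator(
      data=(torch.zeros(8, 1, 28, 28), torch.zeros(8, dtype=torch.int64)),
      sample_count=10)
  # MpDeviceLoader preloads batches onto the device and overlaps data
  # loading with device execution.
  mp_device_loader = pl.MpDeviceLoader(train_loader, device)
  for step, (data, target) in enumerate(mp_device_loader):
    print(index, step, data.device)


if __name__ == '__main__':
  torch_xla.launch(_mp_fn, args=())
```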

README.md

Lines changed: 1 addition & 1 deletion
@@ -196,7 +196,7 @@ If you're using `DistributedDataParallel`, make the following changes:
 + # Rank and world size are inferred from the XLA device runtime
 + dist.init_process_group("xla", init_method='xla://')
 +
-+ model.to(xm.xla_device())
++ model.to(torch_xla.device())
 + ddp_model = DDP(model, gradient_as_bucket_view=True)
 
 - model = model.to(rank)
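For context, a hedged sketch of the full `DistributedDataParallel` pattern the README fragment above comes from, assuming a multi-process XLA launch; the model, data, and optimizer are placeholders:

```python
import torch
import torch.distributed as dist
import torch.optim as optim
import torch_xla
import torch_xla.distributed.xla_backend  # registers the 'xla' process-group backend
from torch.nn.parallel import DistributedDataParallel as DDP


def _mp_fn(index):
  # Rank and world size are inferred from the XLA device runtime.
  dist.init_process_group("xla", init_method='xla://')

  model = torch.nn.Linear(128, 10).to(torch_xla.device())
  ddp_model = DDP(model, gradient_as_bucket_view=True)
  optimizer = optim.SGD(ddp_model.parameters(), lr=0.01)

  inputs = torch.randn(32, 128, device='xla')
  loss = ddp_model(inputs).sum()
  loss.backward()
  optimizer.step()
  torch_xla.sync()  # run the pending graph


if __name__ == '__main__':
  torch_xla.launch(_mp_fn, args=())
```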

benchmarks/benchmark_experiment.py

Lines changed: 1 addition & 1 deletion
@@ -208,7 +208,7 @@ def update_process_env(self, process_env: Dict[str, str]):
   def get_device(self):
     if self.torch_xla2:
       # Initiate the model in CPU first for xla2. We will move the model to jax device later.
-      # This is because we don't have xm.xla_device() function in torch_xla2.
+      # This is because we don't have torch_xla.device() function in torch_xla2.
       return torch.device("cpu")
     if self.xla:
       return xm.xla_device(devkind=self.accelerator.upper())

benchmarks/experiment_runner.py

Lines changed: 1 addition & 1 deletion
@@ -255,7 +255,7 @@ def _default_iter_fn(self, benchmark_experiment: BenchmarkExperiment,
 
   def _pure_wall_time_iter_fn(self, benchmark_experiment: BenchmarkExperiment,
                               benchmark_model: BenchmarkModel, input_tensor):
-    device = xm.xla_device() if benchmark_experiment.xla else 'cuda'
+    device = torch_xla.device() if benchmark_experiment.xla else 'cuda'
     sync_fn = xm.wait_device_ops if benchmark_experiment.xla else torch.cuda.synchronize
     timing, output = bench.do_bench(
         lambda: benchmark_model.model_iter_fn(
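The hunk above picks `xm.wait_device_ops` as the synchronization function when timing XLA runs. A small sketch of why that matters for wall-clock measurement, assuming an XLA device is available; the tensor sizes are illustrative and this is not the repo's `do_bench` helper:

```python
import time

import torch
import torch_xla
import torch_xla.core.xla_model as xm

device = torch_xla.device()
x = torch.randn(1024, 1024, device=device)

start = time.perf_counter()
y = x @ x
torch_xla.sync()      # dispatch the pending lazy graph
xm.wait_device_ops()  # block until the device has actually finished executing
print('wall time (s):', time.perf_counter() - start)
```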

benchmarks/matmul_bench.py

Lines changed: 2 additions & 5 deletions
@@ -39,10 +39,7 @@ def main():
   """
 
   xla_bench_fn = lambda fn: do_bench(
-      fn,
-      return_mode='min',
-      sync_fn=lambda: xm.wait_device_ops(),
-      device=xm.xla_device())
+      fn, return_mode='min', sync_fn=lambda: xm.wait_device_ops(), device='xla')
   ind_bench_fn = lambda fn: do_bench(
       fn,
       return_mode='min',
@@ -53,7 +50,7 @@ def main():
   for dtype in dtypes:
     for inductor_matmul, xla_matmul in zip(
         get_matmuls(device='cuda', dtype=dtype, backend='inductor'),
-        get_matmuls(device=xm.xla_device(), dtype=dtype, backend='openxla')):
+        get_matmuls(device='xla', dtype=dtype, backend='openxla')):
       ind_lhs_shape, ind_rhs_shape, ind_fn = inductor_matmul
       xla_lhs_shape, xla_rhs_shape, xla_fn = xla_matmul
       assert ind_lhs_shape == xla_lhs_shape, f"Expect matmul shapes to match for benchmarking. Mismatch lhs: {ind_lhs_shape}, rhs: {xla_rhs_shape}"

contrib/kaggle/distributed-pytorch-xla-basics-with-pjrt.ipynb

Lines changed: 12 additions & 12 deletions
@@ -188,7 +188,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"To get the current process/thread's default XLA device, use `xm.xla_device()`. XLA devices are numbered as `xla:i`, where `i` is the index of the device within the current process. Since each process has two devices on a TPU v3, this will be `xla:0` or `xla:1`."
+"To get the current process/thread's default XLA device, use `torch_xla.device()`. XLA devices are numbered as `xla:i`, where `i` is the index of the device within the current process. Since each process has two devices on a TPU v3, this will be `xla:0` or `xla:1`."
 ]
 },
 {
@@ -210,7 +210,7 @@
 "lock = mp.Manager().Lock()\n",
 "\n",
 "def print_device(i, lock):\n",
-"    device = xm.xla_device()\n",
+"    device = torch_xla.device()\n",
 "    with lock:\n",
 "        print('process', i, device)"
 ]
@@ -273,7 +273,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 9,
+"execution_count": null,
 "metadata": {
 "execution": {
 "iopub.execute_input": "2024-01-10T19:30:33.219878Z",
@@ -318,12 +318,12 @@
 ],
 "source": [
 "def add_ones(i, lock):\n",
-"    x = torch.ones((3, 3), device=xm.xla_device())\n",
+"    x = torch.ones((3, 3), device='xla')\n",
 "    y = x + x\n",
-"    \n",
+"\n",
 "    # Run graph to compute `y` before printing\n",
 "    torch_xla.sync()\n",
-"    \n",
+"\n",
 "    with lock:\n",
 "        print(i, y)\n",
 "\n",
@@ -340,7 +340,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 10,
+"execution_count": null,
 "metadata": {
 "execution": {
 "iopub.execute_input": "2024-01-10T19:30:35.656796Z",
@@ -378,10 +378,10 @@
 "source": [
 "def gather_ids(i, lock):\n",
 "    # Create a tensor on each device with the device ID\n",
-"    t = torch.tensor([i], device=xm.xla_device())\n",
+"    t = torch.tensor([i], device='xla')\n",
 "    with lock:\n",
 "        print(i, t)\n",
-"    \n",
+"\n",
 "    # Collect and concatenate the IDs\n",
 "    ts = xm.all_gather(t)\n",
 "    torch_xla.sync()\n",
@@ -402,7 +402,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 11,
+"execution_count": null,
 "metadata": {
 "execution": {
 "iopub.execute_input": "2024-01-10T19:30:38.315927Z",
@@ -454,7 +454,7 @@
 "import torch_xla.experimental.pjrt_backend # Required for torch.distributed on TPU v2 and v3\n",
 "\n",
 "def toy_model(index, lock):\n",
-"    device = xm.xla_device()\n",
+"    device = torch_xla.device()\n",
 "    dist.init_process_group('xla', init_method='xla://')\n",
 "\n",
 "    # Initialize a basic toy model\n",
@@ -479,7 +479,7 @@
 "    loss.backward()\n",
 "\n",
 "    optimizer.step()\n",
-"    \n",
+"\n",
 "    # Run the pending graph\n",
 "    torch_xla.sync()\n",
 "\n",

contrib/kaggle/pytorch-xla-2-0-on-kaggle.ipynb

Lines changed: 1 addition & 1 deletion
@@ -172,7 +172,7 @@
 "\n",
 "pipeline = DiffusionPipeline.from_pretrained(\"runwayml/stable-diffusion-v1-5\")\n",
 "# Move the model to the first TPU core\n",
-"pipeline = pipeline.to(xm.xla_device())"
+"pipeline = pipeline.to(torch_xla.device())"
 ]
 },
 {
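A short usage sketch of the notebook change above, assuming the `diffusers` package and an XLA/TPU runtime are available; the prompt and output filename are illustrative:

```python
import torch_xla
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
# Move the pipeline to the first TPU core.
pipeline = pipeline.to(torch_xla.device())

# Generate one image; .images[0] is a PIL image in the diffusers API.
image = pipeline("a photograph of an astronaut riding a horse").images[0]
image.save("astronaut.png")
```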

docs/source/features/pallas.md

Lines changed: 3 additions & 3 deletions
@@ -40,9 +40,9 @@ jax will lock the TPU and torch-xla cannot access it.
 Example usage:
 
 ``` python3
-q = torch.randn(3, 2, 128, 4).to("xla")
-k = torch.randn(3, 2, 128, 4).to("xla")
-v = torch.randn(3, 2, 128, 4).to("xla")
+q = torch.randn(3, 2, 128, 4).to('xla')
+k = torch.randn(3, 2, 128, 4).to('xla')
+v = torch.randn(3, 2, 128, 4).to('xla')
 
 # Adopts any Pallas kernel
 from torch_xla.experimental.custom_kernel import make_kernel_from_pallas

docs/source/features/triton.md

Lines changed: 2 additions & 2 deletions
@@ -42,8 +42,8 @@ import triton
 import triton.language as tl
 
 size = 16
-x = torch.arange(size, dtype=torch.int64).to("xla")
-y = torch.arange(size, dtype=torch.int64).to("xla")
+x = torch.arange(size, dtype=torch.int64).to('xla')
+y = torch.arange(size, dtype=torch.int64).to('xla')
 output = torch.empty_like(x)
 block_size = 8
 grid = (triton.cdiv(size, block_size),)

docs/source/learn/_pjrt.md

Lines changed: 2 additions & 2 deletions
@@ -73,7 +73,7 @@ import torch_xla.distributed.xla_backend
 
 
 def _mp_fn(index):
-  device = xm.xla_device()
+  device = torch_xla.device()
 - dist.init_process_group('xla', rank=xr.global_ordinal(), world_size=xr.world_size())
 + dist.init_process_group('xla', init_method='xla://')
 
@@ -377,7 +377,7 @@ def _all_gather(index: int):
   # No need to pass in `rank` or `world_size`
   dist.init_process_group('xla', init_method='xla://')
 
-  t = torch.tensor([index], dtype=torch.int32, device=xm.xla_device())
+  t = torch.tensor([index], dtype=torch.int32, device='xla')
   output = [torch.zeros_like(t) for _ in range(dist.get_world_size())]
   dist.all_gather(output, t)
 
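The `_all_gather` hunk above is nearly self-contained. A hedged sketch of running it end to end, assuming a multi-process XLA launch:

```python
import torch
import torch.distributed as dist
import torch_xla
import torch_xla.distributed.xla_backend  # registers the 'xla' backend


def _all_gather(index: int):
  # No need to pass in `rank` or `world_size`.
  dist.init_process_group('xla', init_method='xla://')

  t = torch.tensor([index], dtype=torch.int32, device='xla')
  output = [torch.zeros_like(t) for _ in range(dist.get_world_size())]
  dist.all_gather(output, t)
  torch_xla.sync()
  print(index, output)


if __name__ == '__main__':
  torch_xla.launch(_all_gather, args=())
```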
