Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
eda5176
Fix ModelParallel OOM issue during weight loading
amitsrivastava78 Oct 3, 2025
5da9108
Fix PyTorch backend tensor conversion and refactor variable loading
amitsrivastava78 Oct 7, 2025
9886e40
Optimize JAX backend variable initialization and memory management
amitsrivastava78 Oct 7, 2025
92bf1ed
Remove JAX reference-holding functionality and related tests
amitsrivastava78 Oct 8, 2025
250c19c
Remove quantization/saving changes that conflict with PR #21713
amitsrivastava78 Oct 8, 2025
6e222c9
Remove remaining variable_loading imports and use inline code
amitsrivastava78 Oct 8, 2025
6197d21
Revert embedding.py quantization logic while preserving _direct_assign
amitsrivastava78 Oct 8, 2025
cfc95da
Fix Dense and EinsumDense layers to use _direct_assign for consistency
amitsrivastava78 Oct 8, 2025
0f02b80
Update all layer load_own_variables methods to use _direct_assign
amitsrivastava78 Oct 9, 2025
2bb83c6
Add shape validation to layer load_own_variables methods for consistency
amitsrivastava78 Oct 9, 2025
10d0d0f
Add OrbaxCheckpoint callback for backend-agnostic checkpointing
amitsrivastava78 Oct 21, 2025
5dfa590
Add save_metadata support to OrbaxCheckpoint
amitsrivastava78 Oct 21, 2025
e03bcee
Add save_data_iterator support to OrbaxCheckpoint
amitsrivastava78 Oct 21, 2025
10f04f7
Added data iterator test for specific backend
amitsrivastava78 Oct 21, 2025
ac7d4e8
Add missing Orbax checkpoint features to Keras 3.0
amitsrivastava78 Oct 21, 2025
db78464
Remove direct orbax.checkpoint imports from test file
amitsrivastava78 Oct 21, 2025
0c56129
Add test for custom Orbax checkpoint handlers and registry functionality
amitsrivastava78 Oct 21, 2025
2e61b6e
Added few fixes
amitsrivastava78 Oct 21, 2025
a31c8e9
Added bridge design
amitsrivastava78 Oct 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions keras/api/_tf_keras/keras/callbacks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
from keras.src.callbacks.model_checkpoint import (
ModelCheckpoint as ModelCheckpoint,
)
from keras.src.callbacks.orbax_checkpoint import (
OrbaxCheckpoint as OrbaxCheckpoint,
)
from keras.src.callbacks.progbar_logger import ProgbarLogger as ProgbarLogger
from keras.src.callbacks.reduce_lr_on_plateau import (
ReduceLROnPlateau as ReduceLROnPlateau,
Expand Down
3 changes: 3 additions & 0 deletions keras/api/callbacks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
from keras.src.callbacks.model_checkpoint import (
ModelCheckpoint as ModelCheckpoint,
)
from keras.src.callbacks.orbax_checkpoint import (
OrbaxCheckpoint as OrbaxCheckpoint,
)
from keras.src.callbacks.progbar_logger import ProgbarLogger as ProgbarLogger
from keras.src.callbacks.reduce_lr_on_plateau import (
ReduceLROnPlateau as ReduceLROnPlateau,
Expand Down
35 changes: 35 additions & 0 deletions keras/src/backend/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,38 @@ class name_scope(backend_name_scope):
@keras_export("keras.device")
def device(device_name):
return device_scope(device_name) # noqa: F405


def get_process_index():
    """Get the index of the current process in a distributed setup.

    Returns:
        int: The process index (0 for the primary process, >0 for others).
            Returns 0 when not running in a distributed setup, or when the
            backend's distribution runtime is unavailable or uninitialized.
    """
    backend_name = backend()
    if backend_name == "jax":
        try:
            import jax

            return jax.process_index()
        except (ImportError, AttributeError):
            return 0
    elif backend_name == "tensorflow":
        try:
            import tensorflow as tf

            # NOTE(review): `replica_id_in_sync_group` is the replica index
            # within a sync group and may be a scalar Tensor inside a
            # strategy scope -- it is not the same notion as a worker /
            # process index like `jax.process_index()`. Confirm this is the
            # intended semantic for the TF backend.
            replica_id = (
                tf.distribute.get_replica_context().replica_id_in_sync_group
            )
            # Coerce to a plain int so the return type matches the docstring
            # even when TF hands back a scalar Tensor.
            return int(replica_id)
        except (
            ImportError,
            AttributeError,
            RuntimeError,
            TypeError,
            ValueError,
        ):
            return 0
    elif backend_name == "torch":
        try:
            import torch.distributed as dist

            # `get_rank()` is only valid once the default process group has
            # been initialized; otherwise fall back to single-process (0).
            if dist.is_available() and dist.is_initialized():
                return dist.get_rank()
            return 0
        except (ImportError, AttributeError):
            return 0
    else:
        return 0
14 changes: 12 additions & 2 deletions keras/src/backend/jax/core_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@
from keras.src.backend.jax.core import NnxVariable


class JaxCoreTest(testing.TestCase):
    def _require_min_devices(self, min_devices):
        """Skip the running test unless `min_devices` JAX devices exist."""
        if min_devices > len(jax.devices()):
            pytest.skip(
                f"Test requires at least {min_devices} devices, "
                f"but only {len(jax.devices())} available"
            )


@pytest.mark.skipif(
backend.backend() != "jax",
reason="JAX backend specific test for core Variable integration with NNX.",
Expand All @@ -25,8 +35,8 @@
reason="Test requires NNX backend to be enabled by default for setup.",
)
class NnxVariableTest(testing.TestCase):
def setup(self):
super().setup()
def setUp(self):
super().setUp()

class NNXModel(nnx.Module):
def __init__(self, rngs):
Expand Down
21 changes: 21 additions & 0 deletions keras/src/backend/jax/distribution_lib_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@
reason="Backend specific test",
)
class JaxDistributionLibTest(testing.TestCase):
def _require_min_devices(self, min_devices):
    """Skip the running test unless `min_devices` JAX devices exist.

    Args:
        min_devices: Minimum number of devices the test needs; when fewer
            are visible to JAX, the test is skipped via `pytest.skip`.
    """
    if len(jax.devices()) < min_devices:
        pytest.skip(
            f"Test requires at least {min_devices} devices, "
            f"but only {len(jax.devices())} available"
        )

def _create_jax_layout(self, sharding):
# Use jax_layout.Format or jax_layout.Layout if available.
if hasattr(jax_layout, "Format"):
Expand All @@ -43,6 +51,7 @@ def _create_jax_layout(self, sharding):
return sharding

def test_list_devices(self):
self._require_min_devices(8)
self.assertEqual(len(distribution_lib.list_devices()), 8)
self.assertEqual(len(distribution_lib.list_devices("cpu")), 8)
self.assertEqual(len(distribution_lib.list_devices("cpu")), 8)
Expand Down Expand Up @@ -77,6 +86,7 @@ def test_initialize_with_coordinator_address(self, mock_jax_initialize):
)

def test_distribute_tensor(self):
self._require_min_devices(8)
jax_mesh = jax.sharding.Mesh(
np.array(jax.devices()).reshape(2, 4), ("batch", "model")
)
Expand All @@ -101,6 +111,7 @@ def test_function(inputs, target_layout):
self.assertTrue(result.sharding.is_equivalent_to(target_layout, ndim=2))

def test_distribute_variable(self):
self._require_min_devices(8)
# This test only verify the single worker/process behavior.
jax_mesh = jax.sharding.Mesh(
np.array(jax.devices()).reshape(2, 4), ("batch", "model")
Expand All @@ -118,6 +129,7 @@ def test_distribute_variable(self):
self.assertTrue(result.sharding.is_equivalent_to(target_layout, ndim=2))

def test_distribute_input_data(self):
self._require_min_devices(8)
# This test only verify the single worker/process behavior.
# The multi-process test lives in g3.
jax_mesh = jax.sharding.Mesh(
Expand All @@ -136,6 +148,7 @@ def test_distribute_input_data(self):
self.assertTrue(result.sharding.is_equivalent_to(target_layout, ndim=2))

def test_distribute_tensor_with_jax_layout(self):
self._require_min_devices(8)
jax_mesh = jax.sharding.Mesh(
np.array(jax.devices()).reshape(2, 4), ("batch", "model")
)
Expand Down Expand Up @@ -166,6 +179,7 @@ def test_function(inputs, target_layout):
)

def test_distribute_variable_with_jax_layout(self):
self._require_min_devices(8)
# This test only verify the single worker/process behavior.
jax_mesh = jax.sharding.Mesh(
np.array(jax.devices()).reshape(2, 4), ("batch", "model")
Expand All @@ -187,6 +201,7 @@ def test_distribute_variable_with_jax_layout(self):
)

def test_distribute_input_data_with_jax_layout(self):
self._require_min_devices(8)
# This test only verify the single worker/process behavior.
jax_mesh = jax.sharding.Mesh(
np.array(jax.devices()).reshape(2, 4), ("batch", "model")
Expand All @@ -212,6 +227,7 @@ def test_processes(self):
self.assertEqual(backend_dlib.num_processes(), 1)

def test_to_backend_mesh(self):
self._require_min_devices(8)
devices = [f"cpu:{i}" for i in range(8)]
shape = (4, 2)
axis_names = ["batch", "model"]
Expand All @@ -224,6 +240,7 @@ def test_to_backend_mesh(self):
self.assertEqual(jax_mesh.axis_names, ("batch", "model"))

def test_to_backend_layout(self):
self._require_min_devices(8)
axes = ["data", None]
mesh = distribution_lib.DeviceMesh(
(4, 2), ["data", "model"], [f"cpu:{i}" for i in range(8)]
Expand All @@ -248,6 +265,7 @@ def test_validation_for_device_mesh(self):
backend_dlib._to_backend_layout(layout)

def test_variable_assignment_reuse_layout(self):
self._require_min_devices(8)
shape = (4, 2)
axis_names = ["batch", "model"]
device_mesh = distribution_lib.DeviceMesh(
Expand Down Expand Up @@ -310,6 +328,7 @@ def test_e2e_data_parallel_model(self):
model.fit(inputs, labels)

def test_e2e_model_parallel_model(self):
self._require_min_devices(8)
shape = (4, 2)
axis_names = ["batch", "model"]
device_mesh = distribution_lib.DeviceMesh(
Expand Down Expand Up @@ -349,6 +368,7 @@ def test_e2e_model_parallel_model(self):
model.fit(inputs, labels)

def test_e2e_model_parallel_with_output_sharding(self):
self._require_min_devices(8)
shape = (4, 2)
axis_names = ["batch", "model"]
device_mesh = distribution_lib.DeviceMesh(
Expand Down Expand Up @@ -405,6 +425,7 @@ def test_e2e_model_parallel_with_output_sharding(self):
)

def test_distribute_data_input(self):
self._require_min_devices(4)
per_process_batch = jax.numpy.arange(24).reshape(
6, 4
) # Example input array
Expand Down
1 change: 1 addition & 0 deletions keras/src/backend/torch/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def _initialize(self, value):
).to(get_device())

def _direct_assign(self, value):
    """Copy `value` into the backing tensor without autograd tracking."""
    converted = convert_to_tensor(value, dtype=self._dtype)
    with torch.no_grad():
        self.value.copy_(converted)

Expand Down
6 changes: 6 additions & 0 deletions keras/src/callbacks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@
from keras.src.callbacks.learning_rate_scheduler import LearningRateScheduler
from keras.src.callbacks.model_checkpoint import ModelCheckpoint
from keras.src.callbacks.monitor_callback import MonitorCallback

# Orbax is an optional dependency: expose `OrbaxCheckpoint` as None when it
# (or its own dependencies) cannot be imported, so `keras.src.callbacks`
# still imports cleanly without orbax installed.
try:
    from keras.src.callbacks.orbax_checkpoint import OrbaxCheckpoint
except ImportError:
    OrbaxCheckpoint = None

from keras.src.callbacks.progbar_logger import ProgbarLogger
from keras.src.callbacks.reduce_lr_on_plateau import ReduceLROnPlateau
from keras.src.callbacks.remote_monitor import RemoteMonitor
Expand Down
Loading