
Commit 12d0924

fix: job transition after failed recovery (#330)
When a worker cannot be recovered, the recovery logic removes that worker's pipeline and checks whether the job is still able to run. If the data loop is incomplete after the pipeline removal, the job transitions to the Failing state, which waits for every worker to report a terminal status before transitioning to Failed. The bug was in how worker statuses were updated in the job context: _reconcile_wrk_status iterated over workers from the wrong source, the class attribute 'self._new_cfg', instead of the 'new_cfg' received as an argument.
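A minimal sketch of the bug pattern described above: a reconcile method receives a fresh config as an argument but iterates a stale copy stored on the instance, so removed workers keep reappearing in the status map. All class and helper names besides `_reconcile_wrk_status`, `wrk_status`, and `WorkerStatus` are hypothetical, and the real method also diffs and removes stale entries, which is simplified away here.

```python
# Simplified illustration of the fix; not the project's actual classes.
from dataclasses import dataclass, field
from enum import Enum


class WorkerStatus(Enum):
    READY = "ready"
    FAILED = "failed"
    TERMINATED = "terminated"


@dataclass
class Worker:
    id: str


@dataclass
class JobConfig:
    workers: list = field(default_factory=list)


class JobContext:
    def __init__(self, cfg: JobConfig):
        self._new_cfg = cfg  # becomes stale once a pipeline is removed
        self.wrk_status = {w.id: WorkerStatus.READY for w in cfg.workers}

    def _reconcile_wrk_status(self, cur_cfg: JobConfig, new_cfg: JobConfig) -> None:
        # The buggy version iterated self._new_cfg.workers here, so a worker
        # that existed only in the stale config was re-added to wrk_status.
        for w in new_cfg.workers:  # fixed: use the argument, not the attribute
            if w.id not in self.wrk_status:
                self.wrk_status[w.id] = WorkerStatus.READY


ctx = JobContext(JobConfig([Worker("w1"), Worker("w2")]))
# w2's pipeline is removed, then we reconcile against the trimmed config
ctx.wrk_status.pop("w2")
ctx._reconcile_wrk_status(JobConfig([Worker("w1"), Worker("w2")]),
                          JobConfig([Worker("w1")]))
assert "w2" not in ctx.wrk_status  # with the bug, w2 would reappear as READY
```

With the stale `self._new_cfg`, the removed worker would be re-added as READY and the Failing state would wait forever for a status that never arrives.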
1 parent 3d3cd7d commit 12d0924

File tree

1 file changed (+10 −1)


infscale/controller/job_context.py

Lines changed: 10 additions & 1 deletion
@@ -410,6 +410,15 @@ async def start(self):
         self.context.set_state(JobStateEnum.STARTING)
 
+    def cond_stopped(self):
+        """Handle the transition to stopped."""
+        # when a worker fails and is unrecoverable, we need to
+        # remove that worker and the pipeline of that worker.
+        # This means that some workers will send Failed status,
+        # others will send Terminated status. This will avoid any
+        # unnecessary exceptions since the job is already in Failed.
+        pass
+
 
 class RecoveryState(BaseJobState):
     """RecoveryState class."""
@@ -836,7 +845,7 @@ def _reconcile_wrk_status(self, cur_cfg: JobConfig, new_cfg: JobConfig) -> None:
         worker_diff = JobConfig.get_workers_diff(cur_cfg, new_cfg)
         self.remove_wrk_status(worker_diff)
 
-        for w in self._new_cfg.workers:
+        for w in new_cfg.workers:
             if w.id not in self.wrk_status:
                 self.wrk_status[w.id] = WorkerStatus.READY