single controller: add train controller

daihao · daihao · commit 28af941bc2f8 · 2025-10-02T14:25:56.000+08:00
diff --git a/areal/api/engine_api.py b/areal/api/engine_api.py
@@ -25,6 +25,8 @@ class Scheduling:
     cpu: int
     gpu: int
     mem: int
+    port_count: int
+    cmd: str | None = None
     nodelist: str | None = None
     exclude: str | None = None
     partition: str | None = None
@@ -138,7 +140,7 @@ def parallelism_group(self) -> dist.ProcessGroup:
         """
         raise NotImplementedError()
 
-    def get_scheduling_config(self) -> Scheduling:
+    def get_scheduling_config(self) -> List[Scheduling]:
         """Get the scheduling configuration for the engine.
 
         This includes configuration such as container image, CPU/GPU/memory size.
diff --git a/areal/api/scheduler_api.py b/areal/api/scheduler_api.py
@@ -1,47 +1,40 @@
 import abc
 from dataclasses import dataclass, field
-from typing import Dict, List
+from typing import List, Literal
+
+from areal.api.engine_api import Scheduling
 
 
 @dataclass
 class Worker:
+    # One scheduled worker, as returned in the list from Scheduler.get_workers.
+    # id is the value passed to scheduler.create_engine / call_engine by the
+    # controller code; ip is the worker's address.
     id: str
     ip: str
-    ports: List[str] = field(default_factory=list)
-
-
-@dataclass
-class ContainerSpec:
-    cpu: int = 0
-    gpu: int = 0
-    mem: int = 0
-    container_image: str = ""
-    cmd: str = ""
-    env_vars: Dict[str, str] = field(default_factory=dict)
-    port_count: int = 2
+    # NOTE(review): presumably the port the worker's service listens on — confirm.
+    serve_port: str
+    # Further ports assigned to the worker; defaults to an empty list.
+    extra_ports: List[str] = field(default_factory=list)
 
 
 @dataclass
 class ScheduleStrategy:
-    type: str = ""
+    # Restricted to "colocation", "separation", or "" (the default).
+    type: Literal["colocation", "separation", ""] = ""
+    # NOTE(review): presumably identifies the job/group the strategy refers
+    # to — confirm against the scheduler implementation.
     uid: str = ""
 
 
 @dataclass
-class SchedulingConfig:
+class Job:
+    # Request object passed to Scheduler.create_workers.
+    # replicas: number of workers requested (the train controller passes the
+    # train world size).
     replicas: int = 0
-    specs: List[ContainerSpec] = field(default_factory=list)
+    # Scheduling entries for the job (the train controller passes the engine's
+    # get_scheduling_config() result).
+    tasks: List[Scheduling] = field(default_factory=list)
     schedule_strategy: ScheduleStrategy | None = None
+    # Role name; the train controller later passes the same string to
+    # Scheduler.get_workers.
     role: str = ""
 
 
 class Scheduler(abc.ABC):
-    def create_workers(self, worker_key, scheduler_config, *args, **kwargs) -> str:
+    def create_workers(self, job: Job, *args, **kwargs):
         """
-        Start workers, return job id
+        Start the workers described by ``job``.
+
+        Subclasses must override this method; the base implementation only
+        raises NotImplementedError.
+
+        NOTE(review): no return value is specified here, but
+        DistributedTrainController.initialize stores the result as ``uid`` --
+        confirm what implementations are expected to return.
         """
+        raise NotImplementedError()
 
-    def get_workers(self, worker_key, timeout=None) -> List[Worker]:
+    def get_workers(self, role: str, timeout=None) -> List[Worker]:
         """
         Wait and return worker list, including scheduling results such as ip and engine ports
         (worker id, ip, ports)
diff --git a/areal/controller/train_controller.py b/areal/controller/train_controller.py
@@ -0,0 +1,144 @@
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from typing import Any, Callable, Dict, List
+
+import torch
+
+from areal.api.alloc_mode import ParallelStrategy
+from areal.api.cli_args import TrainEngineConfig
+from areal.api.controller_api import DistributedBatch, TrainController
+from areal.api.engine_api import TrainEngine
+from areal.api.io_struct import (
+    AllocationMode,
+    FinetuneSpec,
+    ParamSpec,
+    SaveLoadMeta,
+    WeightUpdateMeta,
+)
+from areal.api.scheduler_api import Job, Scheduler, ScheduleStrategy, Worker
+from areal.controller.utils import create_engine_with_retry, rpc_call
+from areal.utils import logging
+
+logger = logging.getLogger("DistributedTrainController")
+
+
+class DistributedTrainController(TrainController):
+    """Train controller that creates a train engine on scheduler-managed workers.
+
+    ``initialize`` asks the scheduler for workers and then creates an engine
+    on each of them. Most of the remaining methods are placeholders that
+    currently do nothing and return None.
+    """
+
+    # Retry settings passed to create_engine_with_retry. They equal that
+    # helper's defaults but must be given explicitly (see initialize).
+    _CREATE_ENGINE_MAX_RETRIES = 60
+    _CREATE_ENGINE_RETRY_DELAY = 10
+
+    def __init__(
+        self, train_engine: TrainEngine, config: TrainEngineConfig, scheduler: Scheduler
+    ):
+        super().__init__(train_engine, config, scheduler)
+
+        self.role: str = "train"
+        self.group_size = 0
+        # Declared here, assigned in initialize().
+        self.alloc_mode: AllocationMode
+        self.uid: str
+        self.workers: List[Worker]
+        # initialize() stores one entry per worker in rank_info. Create the
+        # mapping only when the base class has not already provided one.
+        if not hasattr(self, "rank_info"):
+            self.rank_info = {}
+
+    # TODO: delete this method
+    def create_process_group(self, parallel_strategy: ParallelStrategy | None = None):
+        """Invoke ``create_process_group`` on every worker through rpc_call.
+
+        ``self.workers`` must have been populated by ``initialize`` first.
+        """
+        assert self.workers is not None
+        rpc_call(
+            self.scheduler, self.workers, "create_process_group", parallel_strategy
+        )
+
+    def initialize(
+        self,
+        alloc_mode_str: str,
+        ft_spec: FinetuneSpec,
+        schedule_strategy: ScheduleStrategy,
+    ):
+        """Initialize environments for distributed training and load models.
+
+        Parses the allocation mode, submits a job with
+        ``alloc_mode.train.world_size`` replicas to the scheduler, waits for
+        the workers (timeout 1800), and then creates an engine on every
+        worker concurrently. The value returned for each worker is stored in
+        ``self.rank_info`` under the worker's index.
+
+        Raises:
+            RuntimeError: if creating the engine fails for any worker; the
+                original exception is attached as the cause.
+        """
+        self.alloc_mode = AllocationMode.from_str(alloc_mode_str)
+        self.ft_spec = ft_spec
+
+        job = Job(
+            replicas=self.alloc_mode.train.world_size,
+            tasks=self.train_engine.get_scheduling_config(),
+            schedule_strategy=schedule_strategy,
+            role=self.role,
+        )
+
+        logger.info(f"Start to create job: {job}")
+
+        self.uid = self.scheduler.create_workers(job)
+
+        self.workers = self.scheduler.get_workers(self.role, timeout=1800)
+
+        with ThreadPoolExecutor(max_workers=len(self.workers)) as executor:
+            futures = [
+                executor.submit(
+                    partial(
+                        create_engine_with_retry,
+                        self.scheduler.create_engine,
+                        # create_engine_with_retry declares max_retries and
+                        # retry_delay before *args, so both must be passed
+                        # explicitly. Otherwise worker.id and the engine
+                        # would be bound to them instead of being forwarded
+                        # to create_engine.
+                        self._CREATE_ENGINE_MAX_RETRIES,
+                        self._CREATE_ENGINE_RETRY_DELAY,
+                        worker.id,
+                        self.train_engine,
+                        None,
+                        self.ft_spec,
+                    )
+                )
+                for worker in self.workers
+            ]
+            try:
+                # Collect in submission order so worker_index matches the
+                # position of the worker in self.workers.
+                for worker_index, future in enumerate(futures):
+                    rank_info = future.result()
+                    self.rank_info[worker_index] = rank_info
+                    logger.info(f"worker_index: {worker_index}, rank_info: {rank_info}")
+            except Exception as e:
+                raise RuntimeError(
+                    f"Failed to initialize worker_index: {worker_index}, error: {e}"
+                ) from e
+
+    def destroy(self):
+        """Ask the scheduler to delete its workers."""
+        self.scheduler.delete_workers()
+
+    def train(self, mode: bool = True):
+        """Placeholder: not implemented yet; does nothing and returns None."""
+        pass
+
+    def upload_weights(self, meta: WeightUpdateMeta):
+        """Placeholder: not implemented yet; does nothing and returns None."""
+        pass
+
+    def get_param_specs(
+        self, weight_chunked_mem_mb: int = 1024
+    ) -> List[List[ParamSpec]]:
+        """Placeholder: not implemented yet; returns None despite the annotation."""
+        pass
+
+    def set_version(self, version: int):
+        """Placeholder: not implemented yet; does nothing and returns None."""
+        pass
+
+    def get_version(self) -> int:
+        """Placeholder: not implemented yet; returns None despite the annotation."""
+        pass
+
+    def save(self, meta: SaveLoadMeta):
+        """Placeholder: not implemented yet; does nothing and returns None."""
+        pass
+
+    def load(self, meta: SaveLoadMeta):
+        """Placeholder: not implemented yet; does nothing and returns None."""
+        pass
+
+    def step_lr_scheduler(self):
+        """Placeholder: not implemented yet; does nothing and returns None."""
+        pass
+
+    def train_batch(
+        self,
+        input_: DistributedBatch,
+        loss_fn: Callable[[torch.Tensor, Dict[str, Any]], torch.Tensor],
+        loss_weight_fn: Callable[[Dict[str, Any]], torch.Tensor],
+    ) -> Dict[str, float]:
+        """Placeholder: not implemented yet; returns None despite the annotation."""
+        pass
+
+    def eval_batch(
+        self,
+        input_: DistributedBatch,
+        loss_fn: Callable[[torch.Tensor, Dict[str, Any]], torch.Tensor],
+        loss_weight_fn: Callable[[Dict[str, Any]], torch.Tensor],
+    ) -> torch.Tensor | None:
+        """Placeholder: not implemented yet; does nothing and returns None."""
+        pass
+
+    def forward(
+        self,
+        input_: DistributedBatch,
+        output_seqlens: List[int] | None = None,
+        post_hook: Callable[[torch.Tensor, Dict[str, Any]], Any] | None = None,
+        aggregate_fn: Callable[[List[Any]], Any] = torch.cat,
+    ) -> Any | None:
+        """Placeholder: not implemented yet; does nothing and returns None."""
+        pass
diff --git a/areal/controller/utils.py b/areal/controller/utils.py
@@ -0,0 +1,61 @@
+import time
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, List
+
+from requests.exceptions import ConnectionError
+
+from areal.api.scheduler_api import Scheduler, Worker
+from areal.utils import logging
+from areal.utils.http import wait_future_ordered
+
+logger = logging.getLogger("ControllerUtil")
+
+
+def create_engine_with_retry(
+    create_engine_func, max_retries=60, retry_delay=10, *args, **kwargs
+):
+    """Call ``create_engine_func(*args, **kwargs)``, retrying on ConnectionError.
+
+    ``max_retries`` and ``retry_delay`` come before ``*args`` in the
+    signature. A caller that forwards positional arguments to
+    ``create_engine_func`` must therefore pass both retry settings explicitly
+    first; otherwise the first two forwarded values are consumed as the retry
+    settings.
+
+    Args:
+        create_engine_func: Callable invoked on every attempt.
+        max_retries: Maximum number of attempts.
+        retry_delay: Seconds to sleep after a failed attempt.
+        *args: Positional arguments forwarded to ``create_engine_func``.
+        **kwargs: Keyword arguments forwarded to ``create_engine_func``.
+
+    Returns:
+        Whatever ``create_engine_func`` returns on its first successful call.
+
+    Raises:
+        RuntimeError: if every attempt failed with ConnectionError; the last
+            ConnectionError is attached as the cause.
+        Exception: any other exception raised by ``create_engine_func`` is
+            logged and re-raised immediately, without retrying.
+    """
+    logger.info(
+        f"Create engine with retry: {max_retries}, {retry_delay}, {args}, {kwargs}"
+    )
+    retries = 0
+    # Remember the most recent connection failure so it can be reported as the
+    # cause once the retry budget is exhausted.
+    last_error = None
+    while retries < max_retries:
+        try:
+            return create_engine_func(*args, **kwargs)
+        except ConnectionError as e:
+            last_error = e
+            logger.info(
+                f"Worker is not ready, exception: {e}, retrying in {retry_delay} seconds..."
+            )
+            time.sleep(retry_delay)
+            retries += 1
+        except Exception as e:
+            logger.error(f"Connection failed: {e}. unknown exception")
+            # Bare raise keeps the original traceback intact.
+            raise
+
+    raise RuntimeError(
+        "Failed to connect to remote service after maximum retries."
+    ) from last_error
+
+
+def rpc_call(
+    scheduler: Scheduler, workers: List[Worker], method: str, *args, **kwargs
+) -> List[Any]:
+    """Call ``method`` on the engine of every worker concurrently.
+
+    Each call is ``scheduler.call_engine(worker.id, method, *args, **kwargs)``,
+    submitted to a thread pool sized to the number of workers.
+
+    Args:
+        scheduler: Scheduler object; must provide
+            ``call_engine(worker_id, method, *args, **kwargs)``.
+        workers: Workers to call; each must have an ``id`` attribute.
+        method: Name of the method to invoke.
+        *args: Positional arguments forwarded to ``call_engine``.
+        **kwargs: Keyword arguments forwarded to ``call_engine``.
+
+    Returns:
+        The per-worker results, ordered like ``workers``. An empty ``workers``
+        list yields an empty list.
+
+    Raises:
+        RuntimeError: if waiting for the results raises; the original
+            exception is attached as the cause.
+    """
+    logger.info(f"Start to rpc call, method: {method}, args: {args}, kwargs: {kwargs}")
+    if not workers:
+        # ThreadPoolExecutor rejects max_workers=0, and there is nothing to call.
+        return []
+    with ThreadPoolExecutor(max_workers=len(workers)) as executor:
+        futures = [
+            executor.submit(scheduler.call_engine, worker.id, method, *args, **kwargs)
+            for worker in workers
+        ]
+        try:
+            # With exit_on_exception=True, wait_future_ordered (as defined in
+            # areal/utils/http.py) logs a failing call and sends SIGTERM to
+            # this process instead of raising the error here.
+            results = wait_future_ordered(futures, exit_on_exception=True)
+        except Exception as e:
+            raise RuntimeError(f"{method} failed, error: {e}") from e
+
+    return results
diff --git a/areal/utils/http.py b/areal/utils/http.py
@@ -1,6 +1,10 @@
 import asyncio
+import os
+import signal
+import traceback
+from concurrent.futures import Future, as_completed
 from http import HTTPStatus
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional
 
 import aiohttp
 
@@ -96,3 +100,26 @@ def response_ok(http_code: int) -> bool:
 
 def response_retryable(http_code: int) -> bool:
     return http_code == HTTPStatus.REQUEST_TIMEOUT
+
+
+def wait_future_ordered(
+    futures: List[Future], exit_on_exception: bool = False
+) -> List[Any]:
+    """
+    Wait for all futures and return their results ordered like ``futures``.
+
+    Futures are consumed as they complete, and each result is stored at the
+    index its future has in ``futures``. When a future raised, the exception
+    and its traceback are logged as warnings; then, if ``exit_on_exception``
+    is true, SIGTERM is sent to the current process, otherwise the exception
+    is re-raised.
+    """
+    ordered = [None] * len(futures)
+    # Map each future back to its submission position.
+    position_of = dict(zip(futures, range(len(futures))))
+    for done in as_completed(futures):
+        slot = position_of[done]
+        try:
+            ordered[slot] = done.result()
+        except Exception as e:
+            logger.warning(f"Exception caught when waiting for future: {e}")
+            logger.warning(traceback.format_exc())
+            if not exit_on_exception:
+                raise e
+            logger.info("Exiting due to exception in future.")
+            os.kill(os.getpid(), signal.SIGTERM)
+    return ordered