Support multiple quorums on a single LighthouseServer using gRPC metadata-based room assignment #189
Changes from 2 commits
src/lib.rs
```diff
@@ -8,8 +8,11 @@ pub mod lighthouse;
 pub mod manager;
 mod net;
 mod retry;
+mod router;
 mod timeout;
 
+pub use crate::router::Router;
+
 use anyhow::Result;
 use atty::Stream;
 use core::time::Duration;
@@ -21,6 +24,7 @@ use std::thread::available_parallelism;
 use structopt::StructOpt;
 use tokio::runtime::Runtime;
 use tokio::task::JoinHandle;
+use tokio_stream::wrappers::TcpListenerStream;
 use tonic::transport::Channel;
 use tonic::Status;
@@ -33,7 +37,9 @@ pub mod torchftpb {
 }
 
 use crate::torchftpb::lighthouse_service_client::LighthouseServiceClient;
+use crate::torchftpb::lighthouse_service_server::LighthouseServiceServer;
 use crate::torchftpb::manager_service_client::ManagerServiceClient;
+use crate::torchftpb::LighthouseHeartbeatRequest;
 use crate::torchftpb::{
     CheckpointMetadataRequest, LighthouseHeartbeatRequest, LighthouseQuorumRequest,
     ManagerQuorumRequest, ShouldCommitRequest,
```
```diff
@@ -339,9 +345,12 @@ fn lighthouse_main(py: Python<'_>) -> PyResult<()> {
 }
 
 async fn lighthouse_main_async(opt: lighthouse::LighthouseOpt) -> Result<()> {
-    let lighthouse = lighthouse::Lighthouse::new(opt).await?;
+    let router = Router::new(opt.clone());
 
-    lighthouse.run().await?;
+    tonic::transport::Server::builder()
+        .add_service(LighthouseServiceServer::new(router))
+        .serve(opt.bind.parse::<std::net::SocketAddr>()?)
+        .await?;
 
     Ok(())
 }
```
```diff
@@ -479,13 +488,19 @@ fn convert_quorum(py: Python, q: &torchftpb::Quorum) -> PyResult<Quorum> {
 struct LighthouseClient {
     client: LighthouseServiceClient<Channel>,
     runtime: Runtime,
+    room_id: Option<String>,
 }
 
 #[pymethods]
 impl LighthouseClient {
-    #[pyo3(signature = (addr, connect_timeout))]
+    #[pyo3(signature = (addr, connect_timeout, room_id = None))]
     #[new]
-    fn new(py: Python<'_>, addr: String, connect_timeout: Duration) -> PyResult<Self> {
+    fn new(
+        py: Python<'_>,
+        addr: String,
+        connect_timeout: Duration,
+        room_id: Option<String>,
+    ) -> PyResult<Self> {
         py.allow_threads(move || {
             let runtime = tokio::runtime::Builder::new_multi_thread()
                 .worker_threads(num_threads())
@@ -498,6 +513,7 @@ impl LighthouseClient {
             Ok(Self {
                 client: client,
                 runtime: runtime,
+                room_id: room_id,
             })
         })
     }
```
```diff
@@ -553,6 +569,8 @@ impl LighthouseClient {
             }),
         });
 
+        let mut request = self.add_room_header(request);
+
         // This timeout is processed on the server side so we also enable
         // keep alives to detect server health.
         request.set_timeout(timeout);
```
```diff
@@ -581,13 +599,29 @@ impl LighthouseClient {
     ) -> Result<(), StatusError> {
         py.allow_threads(move || {
             let mut req = tonic::Request::new(LighthouseHeartbeatRequest { replica_id });
+            let mut req = self.add_room_header(req);
             req.set_timeout(timeout);
             self.runtime.block_on(self.client.clone().heartbeat(req))?;
             Ok(())
         })
     }
 }
 
+impl LighthouseClient {
+    /// Attach `"room-id"` header if `self.room_id` is Some(_)
+    fn add_room_header<T>(&self, mut req: tonic::Request<T>) -> tonic::Request<T> {
+        if let Some(ref id) = self.room_id {
+            use tonic::metadata::MetadataValue;
+            req.metadata_mut().insert(
+                crate::router::ROOM_ID_HEADER,
+                MetadataValue::try_from(id.as_str()).expect("room-id ascii"),
+            );
+        }
+        req
+    }
+
+}
+
 /// LighthouseServer is a GRPC server for the lighthouse service.
 ///
 /// It is used to coordinate the ManagerServer for each replica group.
```
```diff
@@ -603,7 +637,7 @@ impl LighthouseClient {
 /// heartbeat_timeout_ms (int): The timeout for heartbeats.
 #[pyclass]
 struct LighthouseServer {
-    lighthouse: Arc<lighthouse::Lighthouse>,
+    bind: String,
     handle: JoinHandle<Result<()>>,
     _runtime: Runtime,
 }
@@ -631,19 +665,30 @@ impl LighthouseServer {
             .enable_all()
             .build()?;
 
-        let lighthouse = rt
-            .block_on(lighthouse::Lighthouse::new(lighthouse::LighthouseOpt {
-                bind: bind,
-                min_replicas: min_replicas,
-                join_timeout_ms: join_timeout_ms,
-                quorum_tick_ms: quorum_tick_ms,
-                heartbeat_timeout_ms: heartbeat_timeout_ms,
-            }))
-            .map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
+        let opt = lighthouse::LighthouseOpt {
+            bind: bind.clone(),
+            min_replicas,
+            join_timeout_ms,
+            quorum_tick_ms,
+            heartbeat_timeout_ms,
+        };
+
+        let listener = rt.block_on(tokio::net::TcpListener::bind(&bind))?;
+        let bound_sock = listener.local_addr()?;
+        let bound = format!("http://{}", bound_sock);
+        let incoming = TcpListenerStream::new(listener);
+
+        let handle = rt.spawn(async move {
+            tonic::transport::Server::builder()
+                .add_service(LighthouseServiceServer::new(Router::new(opt.clone())))
+                .serve_with_incoming(incoming)
+                .await
+                .map_err(|e: tonic::transport::Error| anyhow::anyhow!(e))
+        });
 
         Ok(Self {
-            handle: rt.spawn(lighthouse.clone().run()),
-            lighthouse: lighthouse,
+            bind: bound,
+            handle,
             _runtime: rt,
         })
     })
```
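The construction above binds the listener eagerly so that a wildcard bind like `0.0.0.0:0` resolves to a concrete port before serving, which is why `serve_with_incoming` replaces `serve(addr)`. A minimal standalone sketch of that bind-first pattern (independent of this PR's code):

```rust
use tokio::net::TcpListener;

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // Port 0 asks the kernel for any free port.
    let listener = TcpListener::bind("127.0.0.1:0").await?;
    // local_addr() reports the address actually bound, e.g. 127.0.0.1:54321.
    let bound = listener.local_addr()?;
    println!("serving at http://{}", bound);
    // A tonic server would then consume the listener via
    // serve_with_incoming(TcpListenerStream::new(listener)).
    Ok(())
}
```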
```diff
@@ -654,7 +699,7 @@ impl LighthouseServer {
     /// Returns:
     ///     str: The address of the lighthouse server.
     fn address(&self) -> PyResult<String> {
-        Ok(self.lighthouse.address().to_string())
+        Ok(self.bind.clone())
     }
 
     /// shutdown shuts down the lighthouse server.
```
Member: this unfortunately isn't sufficient -- bind could be something like "0.0.0.0:0", which will bind to a random port. The address needs to be the routable http address, i.e. …

Contributor (author): Hmm, perhaps we could use calls similar to the ones the Lighthouse class uses to resolve the host IP and address? I'll include a version of this in the next commit, though I'm also open to changing it.
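One way that suggestion could look, sketched here with the `gethostname` crate standing in for whatever resolution the Lighthouse class actually performs (the crate choice and the `routable_address` helper are assumptions for illustration, not code from this PR):

```rust
use std::net::SocketAddr;

// Hypothetical helper: turn a possibly-wildcard bound address into a
// routable URL by substituting the machine's hostname for 0.0.0.0/[::].
fn routable_address(bound: SocketAddr) -> String {
    let host = gethostname::gethostname()
        .into_string()
        .unwrap_or_else(|_| "localhost".to_string());
    if bound.ip().is_unspecified() {
        format!("http://{}:{}", host, bound.port())
    } else {
        format!("http://{}", bound)
    }
}
```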
src/router.rs (new file)

```rust
use std::sync::Arc;

use dashmap::{mapref::entry::Entry, DashMap};
use tonic::{Request, Response, Status};

use crate::{
    lighthouse::{Lighthouse, LighthouseOpt},
    torchftpb::{
        lighthouse_service_server::LighthouseService, LighthouseHeartbeatRequest,
        LighthouseHeartbeatResponse, LighthouseQuorumRequest, LighthouseQuorumResponse,
    },
};

/// Metadata header used by both client and router
pub const ROOM_ID_HEADER: &str = "room-id";

/// Top-level service registered with tonic's `Server::builder()`
#[derive(Clone)]
```
Member: why does Router need to be Cloneable?

Contributor (author): I mainly made Router Cloneable so that the calls to tonic's add_service would compile when constructing the LighthouseServer in src/bin/lighthouse.rs and src/lib.rs.
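For context, the Clone is cheap here: the room table lives behind an `Arc`, so every clone shares the same map. A toy example of the same shape (not part of this PR):

```rust
use dashmap::DashMap;
use std::sync::Arc;

#[derive(Clone)]
struct Shared {
    rooms: Arc<DashMap<String, u32>>, // Arc: clones share one map
}

fn main() {
    let a = Shared { rooms: Arc::new(DashMap::new()) };
    let b = a.clone(); // bumps a refcount; no deep copy of the map
    a.rooms.insert("jobA".to_string(), 1);
    assert_eq!(*b.rooms.get("jobA").unwrap(), 1); // b sees a's insert
}
```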
```rust
pub struct Router {
    rooms: Arc<DashMap<String, Arc<Lighthouse>>>,
    tmpl_opt: LighthouseOpt, // cloned for each new room
}

/// Multiplexes a single tonic gRPC server into many logical "rooms."
/// Inspects the `room-id` metadata header on each request, then
/// lazily creates or reuses an `Arc<Lighthouse>` for that namespace.
impl Router {
    /// Create a new router given the CLI/config options that are
    /// normally passed straight to `Lighthouse::new`.
    pub fn new(tmpl_opt: LighthouseOpt) -> Self {
        Self {
            rooms: Arc::new(DashMap::new()),
            tmpl_opt,
        }
    }

    /// Room lookup: creates the room if it doesn't exist, reuses it if it does.
    async fn room(&self, id: &str) -> Arc<Lighthouse> {
        // 1. Quick optimistic read (no locking contention).
        if let Some(handle) = self.rooms.get(id) {
            return handle.clone();
        }

        // 2. Build the Lighthouse instance *off the map* so
        //    we don't hold any guard across `.await`.
        let new_room = Lighthouse::new(self.tmpl_opt.clone())
            .await
            .expect("failed to create Lighthouse");

        // 3. Second pass: insert if still vacant, otherwise reuse
        //    whatever another task inserted first.
        match self.rooms.entry(id.to_owned()) {
            Entry::Occupied(entry) => entry.get().clone(),
            Entry::Vacant(entry) => {
                entry.insert(new_room.clone());
                new_room
            }
        }
    }

    /// Extracts `"room-id"` from metadata, defaulting to `"default"`.
    fn extract_room_id(meta: &tonic::metadata::MetadataMap) -> &str {
        meta.get(ROOM_ID_HEADER)
            .and_then(|v| v.to_str().ok())
            .unwrap_or("default")
    }
}

#[tonic::async_trait]
impl LighthouseService for Router {
    async fn quorum(
        &self,
        req: Request<LighthouseQuorumRequest>,
    ) -> Result<Response<LighthouseQuorumResponse>, Status> {
        let id = Self::extract_room_id(req.metadata()).to_owned();
        let room = self.room(&id).await;
        <Arc<Lighthouse> as LighthouseService>::quorum(&room, req).await
    }

    async fn heartbeat(
        &self,
        req: Request<LighthouseHeartbeatRequest>,
    ) -> Result<Response<LighthouseHeartbeatResponse>, Status> {
        let id = Self::extract_room_id(req.metadata()).to_owned();
        let room = self.room(&id).await;
        <Arc<Lighthouse> as LighthouseService>::heartbeat(&room, req).await
    }
}
```
Member: I think this is fine as is, since this is fairly minimal boilerplate per request, but I think we can do even better. By doing this at the Service layer instead of the LighthouseService layer, we can have it automatically work for all endpoints on the LighthouseService. Can you look into this and see how feasible it is? If it's not any cleaner, we can land this as is. Some pointers: there's also https://github.com/teimuraz/tonic-middleware, which might be useful.

Contributor (author): I made an initial attempt to do the routing at the Service layer rather than the LighthouseService layer, but had trouble adapting between the initial tonic message types (…). If I were to keep at this, I'd see if I could get something working that relies more on …

Member: mixing the two is a bit tricky -- we probably need to stay at the tower layer. Why do you need to access the tonic::Request/Response objects? It's all HTTP at the end of the day, so it seems like we should be able to operate at the tower/http layer and view the metadata as a header? Middleware might work, though it may be too high level.

Contributor (author): Ah I see, it became easier when I had router.rs operate entirely at the tower layer rather than trying to mix Service and tonic. The most recent commit has router.rs at the tower level, which lets us start the lighthouse server with a call to …
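As a rough illustration of the tower-layer idea discussed above: a `tower::Service` wrapper sees gRPC metadata as plain HTTP/2 headers, so the room id can be read before tonic's generated code ever runs. The `RoomRouter` name is invented for this sketch; it is not the code from the later commit:

```rust
use http::Request;
use std::task::{Context, Poll};
use tower::Service;

/// Hypothetical tower-layer wrapper around an inner HTTP service.
#[derive(Clone)]
struct RoomRouter<S> {
    inner: S,
}

impl<S, B> Service<Request<B>> for RoomRouter<S>
where
    S: Service<Request<B>>,
{
    type Response = S::Response;
    type Error = S::Error;
    type Future = S::Future;

    fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
        self.inner.poll_ready(cx)
    }

    fn call(&mut self, req: Request<B>) -> Self::Future {
        // gRPC metadata travels as HTTP/2 headers, so no tonic types are
        // needed to inspect it here.
        let room = req
            .headers()
            .get("room-id")
            .and_then(|v| v.to_str().ok())
            .unwrap_or("default")
            .to_owned();
        // A real implementation would pick or create the per-room service
        // based on `room`; this sketch just forwards to one inner service.
        let _ = room;
        self.inner.call(req)
    }
}
```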
New test file (Python)

```python
| """ | ||
| Validate that one Lighthouse server can host isolated quorums | ||
| for multiple logical rooms (job IDs) via `room-id` metadata header. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
||
|
|
||
| import datetime as _dt | ||
|
||
|
|
||
| import pytest | ||
|
|
||
| import torchft._torchft as ext | ||
|
||
|
|
||
| _TIMEOUT = _dt.timedelta(seconds=3) # connect + RPC timeout | ||
|
|
||
|
|
||
| def _client(addr: str, room: str) -> ext.LighthouseClient: | ||
| """Utility: create a client with a logical room-id.""" | ||
| return ext.LighthouseClient(addr, _TIMEOUT, room) | ||
|
|
||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_multi_room_quorums() -> None: | ||
| # 1) one server, any free port | ||
| server = ext.LighthouseServer("[::]:0", 1) | ||
| addr = server.address() | ||
|
|
||
| # 2) two clients in two separate rooms | ||
| a = _client(addr, "jobA") | ||
| b = _client(addr, "jobB") | ||
|
|
||
| # 3) explicit heartbeats (exercises RPC path) | ||
| a.heartbeat("a0") | ||
| b.heartbeat("b0") | ||
|
|
||
| # 4) ask for a quorum from each room | ||
| qa = a.quorum("a0", _TIMEOUT) | ||
| qb = b.quorum("b0", _TIMEOUT) | ||
|
|
||
| # 5) verify the rooms are independent | ||
| assert qa.quorum_id == qb.quorum_id == 1 | ||
| assert len(qa.participants) == 1 and qa.participants[0].replica_id == "a0" | ||
| assert len(qb.participants) == 1 and qb.participants[0].replica_id == "b0" | ||
|
|
||
| # 6) shutdown | ||
| server.shutdown() | ||