Skip to content

Commit 2ea1d3c

Browse files
committed
using filelock
1 parent 0af1296 commit 2ea1d3c

6 files changed

Lines changed: 44 additions & 474 deletions

File tree

.github/workflows/build-test-linux-x86_64.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -611,9 +611,11 @@ jobs:
611611
python -m pytest -ra -v --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml \
612612
distributed/test_nccl_ops.py \
613613
distributed/test_native_nccl.py \
614-
distributed/test_export_save_load.py
614+
distributed/test_export_save_load.py \
615+
distributed/test_distributed_engine_cache.py
615616
python -m torch_tensorrt.distributed.run --nproc_per_node=2 distributed/test_native_nccl.py --multirank
616617
python -m torch_tensorrt.distributed.run --nproc_per_node=2 distributed/test_export_save_load.py --multirank
618+
python -m torch_tensorrt.distributed.run --nproc_per_node=2 distributed/test_distributed_engine_cache.py --multirank
617619
popd
618620
619621
concurrency:

py/torch_tensorrt/distributed/__init__.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,7 @@
22
distributed_context,
33
is_distributed_caching_enabled,
44
set_distributed_mode,
5-
signal_distributed_engine_build_complete,
6-
wait_for_distributed_engine_build,
75
)
8-
from torch_tensorrt.distributed._lock import DistributedFileLock # noqa: F401
96
from torch_tensorrt.distributed._nccl_utils import ( # noqa: F401
107
setup_nccl_for_torch_tensorrt,
118
)

py/torch_tensorrt/distributed/_distributed.py

Lines changed: 0 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -230,71 +230,3 @@ def is_distributed_caching_enabled(
230230
and dist.is_initialized()
231231
and dist.get_world_size() > 1
232232
)
233-
234-
235-
def wait_for_distributed_engine_build(
236-
pull_fn: Any,
237-
cache_dir: str,
238-
hash_val: str,
239-
poll_interval: float = 0.5,
240-
timeout: float = 600.0,
241-
) -> Any:
242-
"""Non-building rank: poll for cached engine file, then load from cache.
243-
244-
Called when this rank failed to acquire the build lock, meaning another
245-
rank is building the engine. Polls the filesystem for the cached engine
246-
file instead of using NCCL collectives (which are unreliable inside
247-
the TRT compilation path due to aot_autograd/CUDA stream conflicts).
248-
249-
Args:
250-
pull_fn: Zero-arg callable (e.g. functools.partial) that loads the
251-
engine from cache. Returns SerializedInterpreterResult on
252-
hit, None on miss.
253-
cache_dir: Shared engine cache directory path.
254-
hash_val: Engine hash for this compilation.
255-
poll_interval: Seconds between filesystem checks (default 0.5s).
256-
timeout: Maximum seconds to wait before giving up (default 600s).
257-
258-
Returns:
259-
SerializedInterpreterResult on cache hit, None on timeout.
260-
"""
261-
import logging
262-
import os
263-
import time
264-
265-
logger = logging.getLogger(__name__)
266-
267-
blob_path = os.path.join(cache_dir, hash_val, "blob.bin")
268-
logger.info(f"Polling for cached engine: {blob_path}")
269-
270-
elapsed = 0.0
271-
while not os.path.exists(blob_path):
272-
time.sleep(poll_interval)
273-
elapsed += poll_interval
274-
if elapsed >= timeout:
275-
logger.warning(
276-
f"Polling timed out after {timeout:.0f}s — building engine locally"
277-
)
278-
return None
279-
280-
logger.info(f"Cached engine found after {elapsed:.1f}s — loading from cache")
281-
cached = pull_fn()
282-
if cached is not None:
283-
return cached
284-
285-
logger.warning("Cache file exists but pull_cached_engine failed — building locally")
286-
return None
287-
288-
289-
def signal_distributed_engine_build_complete(lock: Any) -> None:
290-
"""Building rank: release the file lock after caching the engine.
291-
292-
Called after the building rank has inserted the engine into the shared
293-
cache. Releases the file lock so other ranks' stale lock detection
294-
works correctly. No NCCL collective needed — waiter ranks poll the
295-
filesystem directly.
296-
297-
Args:
298-
lock: DistributedFileLock instance that was acquired by this rank.
299-
"""
300-
lock.release()

py/torch_tensorrt/distributed/_lock.py

Lines changed: 0 additions & 189 deletions
This file was deleted.

py/torch_tensorrt/dynamo/conversion/_conversion.py

Lines changed: 26 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,14 @@
22

33
import io
44
import logging
5-
from functools import partial
65
from typing import Any, Dict, List, NamedTuple, Optional, Sequence
76

87
import tensorrt as trt
98
import torch
109
from torch_tensorrt._enums import dtype
1110
from torch_tensorrt._features import ENABLED_FEATURES
1211
from torch_tensorrt._Input import Input
13-
from torch_tensorrt.distributed._distributed import (
14-
is_distributed_caching_enabled,
15-
signal_distributed_engine_build_complete,
16-
wait_for_distributed_engine_build,
17-
)
18-
from torch_tensorrt.distributed._lock import DistributedFileLock
12+
from torch_tensorrt.distributed._distributed import is_distributed_caching_enabled
1913
from torch_tensorrt.dynamo._engine_cache import BaseEngineCache
2014
from torch_tensorrt.dynamo._settings import CompilationSettings, settings_are_compatible
2115
from torch_tensorrt.dynamo.conversion._symbolic_shape_capture import (
@@ -275,33 +269,33 @@ def interpret_module_to_result(
275269
settings.cache_built_engines,
276270
settings.reuse_cached_engines,
277271
)
278-
_build_lock = None
272+
_lock: Optional[Any] = None
279273

280274
if _distributed_caching:
275+
import os as _os
276+
277+
from filelock import FileLock
278+
281279
# is_distributed_caching_enabled guarantees engine_cache and hash_val are set.
282280
assert engine_cache is not None
283281
assert hash_val is not None
284-
_build_lock = DistributedFileLock(engine_cache.engine_cache_dir, hash_val)
285-
if _build_lock.acquire():
286-
logger.info("Acquired engine build lock — this rank builds")
287-
else:
288-
logger.info("Lock held by another rank — polling for cached engine")
289-
_pull_fn = partial(
290-
pull_cached_engine,
291-
hash_val,
292-
module,
293-
engine_cache,
294-
settings,
295-
inputs,
296-
symbolic_shape_expressions,
297-
)
298-
cached: Optional[SerializedInterpreterResult] = (
299-
wait_for_distributed_engine_build(
300-
_pull_fn, engine_cache.engine_cache_dir, hash_val
301-
)
302-
)
303-
if cached is not None:
304-
return cached
282+
283+
_lock_path = _os.path.join(engine_cache.engine_cache_dir, f".{hash_val}.lock")
284+
_lock = FileLock(_lock_path, timeout=600)
285+
_lock.acquire()
286+
287+
# Check cache again — another rank may have built while we waited
288+
cached = pull_cached_engine(
289+
hash_val,
290+
module,
291+
engine_cache,
292+
settings,
293+
inputs,
294+
symbolic_shape_expressions,
295+
)
296+
if cached is not None:
297+
_lock.release()
298+
return cached
305299

306300
output_dtypes = infer_module_output_dtypes(
307301
module, truncate_double=settings.truncate_double
@@ -348,9 +342,9 @@ def interpret_module_to_result(
348342
hash_val, interpreter_result, engine_cache, settings, inputs
349343
)
350344

351-
# Signal other ranks that the engine is cached and ready
352-
if _build_lock is not None and _build_lock.acquired:
353-
signal_distributed_engine_build_complete(_build_lock)
345+
# Release the filelock so other ranks can proceed
346+
if _distributed_caching and _lock is not None:
347+
_lock.release()
354348

355349
serialized_engine = interpreter_result.engine.serialize()
356350
with io.BytesIO() as engine_bytes:

0 commit comments

Comments (0)