From c4fce46b35808464897980bbdbf39fd47da54682 Mon Sep 17 00:00:00 2001
From: apbose <apbose694@gmail.com>
Date: Thu, 28 May 2026 13:18:24 -0700
Subject: [PATCH 1/2] setup_nccl_for_torch_tensorrt: symlink libnccl.so into ld.so's default path

---
 py/torch_tensorrt/distributed/_nccl_utils.py  | 150 ++++++++----------
 .../py/dynamo/distributed/test_native_nccl.py |  36 -----
 2 files changed, 68 insertions(+), 118 deletions(-)
diff --git a/py/torch_tensorrt/distributed/_nccl_utils.py b/py/torch_tensorrt/distributed/_nccl_utils.py
index 5271d87725..ea491541d9 100644
--- a/py/torch_tensorrt/distributed/_nccl_utils.py
+++ b/py/torch_tensorrt/distributed/_nccl_utils.py
@@ -28,9 +28,9 @@
 symlink workarounds.
 """
 
-import ctypes
 import logging
 import os
+import subprocess
 from typing import Optional
 
 import torch.distributed as dist
@@ -101,88 +101,92 @@ def ensure_nccl_symlink(nccl_lib_dir: str) -> bool:
         return False
 
 
-def check_nccl_library_path() -> bool:
-    """
-    Check if LD_LIBRARY_PATH includes PyTorch's NCCL directory.
+def _sys_libdir_on_ldso_path() -> str:
+    """Pick a system library directory that ld.so searches by default.
 
-    Returns:
-        True if configuration is correct, False if LD_LIBRARY_PATH needs updating.
+    Returns the first existing directory from a portability-ordered list:
+    Debian/Ubuntu x86_64 multiarch → ARM64 multiarch → RHEL/CentOS lib64 →
+    bare /usr/lib (always on ld.so's search path as a final fallback).
     """
-    nccl_lib_dir = get_nccl_library_path()
-
-    if nccl_lib_dir is None:
-        # System NCCL - no action needed
-        return True
-
-    ld_library_path = os.environ.get("LD_LIBRARY_PATH", "")
-    return nccl_lib_dir in ld_library_path
+    for d in (
+        "/usr/lib/x86_64-linux-gnu",  # Debian / Ubuntu x86_64
+        "/usr/lib/aarch64-linux-gnu",  # Debian / Ubuntu ARM64 (Jetson)
+        "/usr/lib64",  # RHEL / CentOS / Fedora x86_64
+    ):
+        if os.path.isdir(d):
+            return d
+    return "/usr/lib"
 
 
 def setup_nccl_for_torch_tensorrt() -> None:
     """
-    Setup NCCL library for TensorRT distributed inference.
-
-    This function:
-    1. Detects if nvidia.nccl pip package is installed
-    2. Creates libnccl.so symlink if needed
-    3. Pre-loads libnccl.so via ctypes (helps Python runtime path)
-    4. Updates LD_LIBRARY_PATH for dynamic loaders
-
-    Note: TRT's internal loader (libLoader.cpp) reads LD_LIBRARY_PATH at
-    process launch time, not when updated via os.environ. For the C++ TRT
-    runtime path, LD_LIBRARY_PATH must be set before the process starts:
-
-        NCCL_LIB=$(python -c "from torch_tensorrt.distributed._nccl_utils import get_nccl_library_path; print(get_nccl_library_path())")
-        LD_LIBRARY_PATH="$NCCL_LIB:$LD_LIBRARY_PATH" python script.py
-
-    For NGC containers (system NCCL), this is a no-op.
+    Point a `libnccl.so` symlink on ld.so's default search path at PyTorch's
+    libnccl.so.2 so TRT and PyTorch share a single NCCL library in the process.
+
+    What this function does:
+      1. Locate the nvidia.nccl pip package's libnccl.so.2 via
+         get_nccl_library_path().  If pip's nccl isn't installed (NGC /
+         system-NCCL environments) it returns immediately — no action needed.
+      2. Pick a system library directory that ld.so already searches by
+         default via _sys_libdir_on_ldso_path() (Debian/Ubuntu multiarch,
+         RHEL lib64, or /usr/lib fallback).
+      3. If <sys_libdir>/libnccl.so already points at that libnccl.so.2,
+         return.
+      4. Otherwise, remove any existing libnccl.so at that path and create
+         a fresh symlink:
+             <sys_libdir>/libnccl.so → <pip>/libnccl.so.2
+      5. Run `ldconfig` to refresh /etc/ld.so.cache.
+      6. Guarded by a module-global flag so subsequent calls in the same
+         process are a no-op.
+
+    Requires write access to the chosen sys_libdir (root inside Docker is
+    the common case).  On OSError the function raises RuntimeError with
+    documented LD_PRELOAD / LD_LIBRARY_PATH workarounds for non-root setups.
     """
     global _nccl_setup_checked
-
-    # Only check once per process
     if _nccl_setup_checked:
         return
     _nccl_setup_checked = True
 
     nccl_lib_dir = get_nccl_library_path()
-
     if nccl_lib_dir is None:
-        # NGC container or system NCCL - no action needed
         logger.debug(
-            "nvidia.nccl package not found. "
-            "Assuming system NCCL is used by both PyTorch and TensorRT."
+            "nvidia.nccl package not found; assuming system NCCL is shared by PyTorch and TensorRT."
         )
         return
 
-    logger.debug(f"Found nvidia.nccl package at: {nccl_lib_dir}")
-
-    # Ensure symlink exists
-    symlink_ok = ensure_nccl_symlink(nccl_lib_dir)
-
-    # Ensure LD_LIBRARY_PATH includes the NCCL directory so TRT's dlopen("libnccl.so")
-    # finds the same library PyTorch already loaded.  dlopen() reads LD_LIBRARY_PATH
-    # dynamically, so updating os.environ here takes effect for subsequent loads.
-    ld_library_path = os.environ.get("LD_LIBRARY_PATH", "")
-    if nccl_lib_dir not in ld_library_path:
-        os.environ["LD_LIBRARY_PATH"] = (
-            f"{nccl_lib_dir}:{ld_library_path}" if ld_library_path else nccl_lib_dir
+    nccl_so_2 = os.path.join(nccl_lib_dir, "libnccl.so.2")
+    if not os.path.isfile(nccl_so_2):
+        logger.warning(
+            f"Expected {nccl_so_2} to exist but it doesn't; skipping NCCL setup."
         )
-        logger.debug(f"Added NCCL directory to LD_LIBRARY_PATH: {nccl_lib_dir}")
-    else:
-        logger.debug(f"LD_LIBRARY_PATH already includes NCCL directory: {nccl_lib_dir}")
+        return
 
-    if symlink_ok:
-        # Pre-load libnccl.so into the process with RTLD_GLOBAL so that TRT's
-        # subsequent dlopen("libnccl.so") inside setCommunicator() finds the
-        # already-loaded library rather than searching LD_LIBRARY_PATH again.
-        nccl_so = os.path.join(nccl_lib_dir, "libnccl.so")
-        try:
-            ctypes.CDLL(nccl_so, mode=ctypes.RTLD_GLOBAL)
-            logger.debug(f"Pre-loaded NCCL library: {nccl_so}")
-        except OSError as e:
-            logger.warning(f"Failed to pre-load NCCL library {nccl_so}: {e}")
+    sys_libdir = _sys_libdir_on_ldso_path()
+    target = os.path.join(sys_libdir, "libnccl.so")
 
-        logger.debug("NCCL library setup complete")
+    try:
+        if os.path.lexists(target):
+            existing = os.readlink(target) if os.path.islink(target) else None
+            if existing == nccl_so_2:
+                logger.debug(f"{target} already points at {nccl_so_2}; nothing to do.")
+                return
+            os.remove(target)
+        os.symlink(nccl_so_2, target)
+        subprocess.run(["ldconfig"], check=False)
+        logger.info(
+            f"NCCL: linked {target} -> {nccl_so_2} so TRT and PyTorch share one libnccl."
+        )
+    except OSError as e:
+        raise RuntimeError(
+            f"setup_nccl_for_torch_tensorrt(): cannot write {target} "
+            f"(needed so TRT's dlopen('libnccl.so') resolves to PyTorch's libnccl.so.2). "
+            f"Workarounds without root: relaunch python with "
+            f"LD_PRELOAD={nccl_so_2} ; or pre-set "
+            f"LD_LIBRARY_PATH={nccl_lib_dir}:$LD_LIBRARY_PATH before python starts "
+            f"(and create a libnccl.so symlink in that dir first). "
+            f"Original error: {e}"
+        ) from e
 
 
 def initialize_nccl_comm(device: Optional[int] = None) -> None:
@@ -253,29 +257,11 @@ def initialize_nccl_comm(device: Optional[int] = None) -> None:
 
 
 def check_nccl_engine_requirements() -> None:
-    """Warn if an requires_native_multidevice TRT engine's NCCL prerequisites are not satisfied.
-
-    Checks two conditions and logs a warning for each:
-    1. LD_LIBRARY_PATH does not include PyTorch's NCCL lib dir (too late to fix,
-       must be set before process launch — use torchtrtrun).
-    2. torch.distributed is not initialized or world_size == 1.
+    """Warn if a requires_native_multidevice TRT engine's NCCL prerequisites are not satisfied.
 
-    Call this from both TorchTensorRTModule and PythonTorchTensorRTModule after
+    Called from TorchTensorRTModule and PythonTorchTensorRTModule after
     confirming the engine has NCCL collective ops.
     """
-    if get_nccl_library_path() is not None and not check_nccl_library_path():
-        logger.warning(
-            "This TRT engine contains NCCL collective ops but "
-            "LD_LIBRARY_PATH does not include PyTorch's NCCL library directory. "
-            "TRT may load a different NCCL instance than PyTorch, causing "
-            "communicator sharing to fail. Use torchtrtrun to launch distributed "
-            "scripts, or set LD_PRELOAD and LD_LIBRARY_PATH before process start:\n"
-            "  NCCL_LIB=$(python -c 'from torch_tensorrt.distributed._nccl_utils "
-            "import get_nccl_library_path; print(get_nccl_library_path())')\n"
-            "  LD_PRELOAD=$NCCL_LIB/libnccl.so.2 "
-            "LD_LIBRARY_PATH=$NCCL_LIB:$LD_LIBRARY_PATH python ..."
-        )
-
     if not (
         dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1
     ):
diff --git a/tests/py/dynamo/distributed/test_native_nccl.py b/tests/py/dynamo/distributed/test_native_nccl.py
index 231d9ab6d3..22422bf495 100644
--- a/tests/py/dynamo/distributed/test_native_nccl.py
+++ b/tests/py/dynamo/distributed/test_native_nccl.py
@@ -713,22 +713,6 @@ def test_get_nccl_library_path_returns_none_or_string(self) -> None:
                 f"libnccl.so.2 not found in {result}",
             )
 
-    def test_check_nccl_library_path_system_nccl(self) -> None:
-        """check_nccl_library_path returns True when nvidia.nccl not installed."""
-        from torch_tensorrt.distributed._nccl_utils import (
-            check_nccl_library_path,
-            get_nccl_library_path,
-        )
-
-        nccl_lib_dir = get_nccl_library_path()
-        if nccl_lib_dir is None:
-            # System NCCL path — must return True
-            self.assertTrue(check_nccl_library_path())
-        else:
-            # nvidia.nccl installed — result depends on LD_LIBRARY_PATH
-            result = check_nccl_library_path()
-            self.assertIsInstance(result, bool)
-
     def test_setup_nccl_for_torch_tensorrt_idempotent(self) -> None:
         """Calling setup_nccl_for_torch_tensorrt() multiple times is safe."""
         from torch_tensorrt.distributed import _nccl_utils
@@ -749,26 +733,6 @@ def test_ensure_nccl_symlink_nonexistent_dir(self) -> None:
         # libnccl.so.2 doesn't exist there → returns False
         self.assertFalse(result)
 
-    def test_check_nccl_library_path_detects_missing_ld_path(self) -> None:
-        """check_nccl_library_path returns False when LD_LIBRARY_PATH is absent."""
-        from torch_tensorrt.distributed._nccl_utils import get_nccl_library_path
-
-        nccl_lib_dir = get_nccl_library_path()
-        if nccl_lib_dir is None:
-            self.skipTest("nvidia.nccl not installed; system NCCL path is always OK")
-
-        from torch_tensorrt.distributed._nccl_utils import check_nccl_library_path
-
-        original = os.environ.get("LD_LIBRARY_PATH", "")
-        # Remove nccl_lib_dir from LD_LIBRARY_PATH
-        paths = [p for p in original.split(":") if p and p != nccl_lib_dir]
-        os.environ["LD_LIBRARY_PATH"] = ":".join(paths)
-        try:
-            result = check_nccl_library_path()
-            self.assertFalse(result)
-        finally:
-            os.environ["LD_LIBRARY_PATH"] = original
-
 
 # ============================================================================
 # Section 4 — fuse_distributed_ops graph pass (no GPU, no dist)   [was Section 3]

From 55b3ea7d81ce79282bb677f8c787ead6680c718b Mon Sep 17 00:00:00 2001
From: apbose <apbose694@gmail.com>
Date: Mon, 1 Jun 2026 13:04:49 -0700
Subject: [PATCH 2/2] Install libnccl.so symlink atomically (temp link + os.replace)

---
 py/torch_tensorrt/distributed/_nccl_utils.py | 48 ++++++++++++++++----
 1 file changed, 39 insertions(+), 9 deletions(-)

diff --git a/py/torch_tensorrt/distributed/_nccl_utils.py b/py/torch_tensorrt/distributed/_nccl_utils.py
index ea491541d9..ab56d06b71 100644
--- a/py/torch_tensorrt/distributed/_nccl_utils.py
+++ b/py/torch_tensorrt/distributed/_nccl_utils.py
@@ -132,9 +132,13 @@ def setup_nccl_for_torch_tensorrt() -> None:
          RHEL lib64, or /usr/lib fallback).
       3. If <sys_libdir>/libnccl.so already points at that libnccl.so.2,
          return.
-      4. Otherwise, remove any existing libnccl.so at that path and create
-         a fresh symlink:
+      4. Otherwise, atomically install a fresh symlink:
              <sys_libdir>/libnccl.so → <pip>/libnccl.so.2
+         via "symlink to a unique per-pid temp name, then os.replace onto
+         the target."  This is multi-process safe: when several ranks of a
+         distributed test call this function concurrently, none of them
+         crash on FileExistsError, and the final on-disk state is the same
+         regardless of execution order.
       5. Run `ldconfig` to refresh /etc/ld.so.cache.
       6. Guarded by a module-global flag so subsequent calls in the same
          process are a no-op.
@@ -165,19 +169,45 @@ def setup_nccl_for_torch_tensorrt() -> None:
     sys_libdir = _sys_libdir_on_ldso_path()
     target = os.path.join(sys_libdir, "libnccl.so")
 
+    # Fast path: already set up by a prior process or rank.
+    if os.path.islink(target) and os.readlink(target) == nccl_so_2:
+        logger.debug(f"{target} already points at {nccl_so_2}; nothing to do.")
+        return
+
+    # Race-safe symlink swap.  Multiple ranks may enter this function
+    # concurrently (e.g. MultiProcessTestCase forks 2 children that each call
+    # setup_nccl_for_torch_tensorrt simultaneously).  Using `os.remove` +
+    # `os.symlink` opens a window where one rank's symlink call races with
+    # another's, raising FileExistsError on the loser.
+    #
+    # Instead: create the new symlink under a unique per-pid temp name
+    # (no contention possible — different filenames), then atomically rename
+    # it onto `target`.  `os.replace` is a single POSIX rename(2) call:
+    # it overwrites unconditionally and is observable as a single transition,
+    # never a missing or half-written state.  All ranks converge on the same
+    # final symlink without any of them crashing.
+    tmp = f"{target}.torchtrt-{os.getpid()}"
     try:
-        if os.path.lexists(target):
-            existing = os.readlink(target) if os.path.islink(target) else None
-            if existing == nccl_so_2:
-                logger.debug(f"{target} already points at {nccl_so_2}; nothing to do.")
-                return
-            os.remove(target)
-        os.symlink(nccl_so_2, target)
+        if os.path.lexists(tmp):
+            os.remove(tmp)
+        os.symlink(nccl_so_2, tmp)
+        os.replace(tmp, target)
         subprocess.run(["ldconfig"], check=False)
         logger.info(
             f"NCCL: linked {target} -> {nccl_so_2} so TRT and PyTorch share one libnccl."
         )
     except OSError as e:
+        # Clean up our temp link if we left one behind.
+        if os.path.lexists(tmp):
+            try:
+                os.remove(tmp)
+            except OSError:
+                pass
+        # If another rank already produced the correct final symlink while
+        # we were failing, accept that as success — the end state we wanted
+        # is in place.
+        if os.path.islink(target) and os.readlink(target) == nccl_so_2:
+            return
         raise RuntimeError(
             f"setup_nccl_for_torch_tensorrt(): cannot write {target} "
             f"(needed so TRT's dlopen('libnccl.so') resolves to PyTorch's libnccl.so.2). "