From 7c05733344788124db3186efc4515d98b932c1bb Mon Sep 17 00:00:00 2001
From: Time4Mind <119820237+Time4Mind@users.noreply.github.com>
Date: Sun, 17 May 2026 00:52:31 +0300
Subject: [PATCH] fix(startup): refuse to start when another ccbot already
 holds the lock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Telegram's ``getUpdates`` long-poll is exclusive per bot token: a second
instance silently steals updates and the original starts spamming
``telegram.error.Conflict: terminated by other getUpdates request`` on
every poll. Today we hit this with two ccbot processes running side by
side — one under ``ccbot-supervisor.sh`` inside ``tmux ccbot:__main__``,
the other started ~1d earlier from a NetHunter-terminal interactive
shell outside tmux. Both kept retrying for hours; user-visible behaviour
was message delays, ghost responses, and Card edits failing.

Fix: ``main.py`` now opens ``$CCBOT_DIR/ccbot.lock`` and holds an
exclusive ``fcntl.flock(LOCK_EX | LOCK_NB)`` for the process lifetime
BEFORE any tmux / bot startup work runs. The handle lives at module
scope so the lock survives until the interpreter exits; ``FD_CLOEXEC``
prevents the lock from leaking into any ``subprocess`` /
``asyncio.subprocess`` child (a stray child outliving the parent would
otherwise hold the lock and block future starts).

A contending instance hits ``OSError`` on ``flock``, logs the path and
the reason, prints to stderr (for supervisor capture), closes the
handle, and ``sys.exit(1)``. The supervisor's restart-backoff loop then
just waits for the existing instance to die naturally — no more
silent ``getUpdates`` cross-fire.

+4 unit tests cover happy-path acquire, contention → SystemExit(1),
re-acquire after release (process death is enough — no stale-lock
sweep needed), and parent-directory creation for first-launch
``$CCBOT_DIR``.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 src/ccbot/main.py                  | 51 ++++++++++++++++++++++++
 tests/ccbot/test_singleton_lock.py | 63 ++++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+)
 create mode 100644 tests/ccbot/test_singleton_lock.py

diff --git a/src/ccbot/main.py b/src/ccbot/main.py
index c4f695ba..074a1f4f 100644
--- a/src/ccbot/main.py
+++ b/src/ccbot/main.py
@@ -4,10 +4,53 @@
   1. `ccbot hook` — delegates to hook.hook_main() for Claude Code hook processing.
   2. Default — configures logging, initializes tmux session, and starts the
      Telegram bot polling loop via bot.create_bot().
+
+Also enforces a single-bot mutex via flock on ``$CCBOT_DIR/ccbot.lock``
+— Telegram's getUpdates long-poll is exclusive per token, so a second
+instance silently steals updates and the original starts logging
+``Conflict: terminated by other getUpdates request`` until one dies.
+The flock makes the second instance refuse to start instead.
 """
 
+import fcntl
 import logging
 import sys
+from pathlib import Path
+from typing import IO, Any
+
+# Held at module scope so the OS keeps the flock for the whole process
+# lifetime. Local-scope file handles would be GC-closed once main()
+# returns from acquiring them.
+_singleton_lock_handle: IO[Any] | None = None
+
+
+def _acquire_singleton_lock(lock_path: Path) -> IO[Any]:
+    """Acquire an exclusive flock on ``lock_path`` or ``sys.exit(1)``.
+
+    Returns the file handle holding the lock; callers MUST keep the
+    handle alive for the process lifetime (we assign it to
+    ``_singleton_lock_handle`` for this). ``FD_CLOEXEC`` is set so the
+    lock doesn't leak into ``subprocess`` / ``asyncio.subprocess``
+    children — a stray child outliving the parent would otherwise hold
+    the lock and block future bot starts.
+    """
+    lock_path.parent.mkdir(parents=True, exist_ok=True)
+    fh = open(lock_path, "w")
+    fcntl.fcntl(fh.fileno(), fcntl.F_SETFD, fcntl.FD_CLOEXEC)
+    try:
+        fcntl.flock(fh, fcntl.LOCK_EX | fcntl.LOCK_NB)
+    except OSError:
+        # Logging may not be configured yet on this path, so go via
+        # stderr too — the supervisor wrapper captures it either way.
+        msg = (
+            f"Another ccbot instance holds {lock_path}. "
+            "Refusing to start to avoid Telegram getUpdates conflict."
+        )
+        logging.getLogger(__name__).error(msg)
+        print(f"Error: {msg}", file=sys.stderr)
+        fh.close()
+        sys.exit(1)
+    return fh
 
 
 def main() -> None:
@@ -40,6 +83,14 @@ def main() -> None:
         print("Get your user ID from @userinfobot on Telegram.")
         sys.exit(1)
 
+    # Singleton lock has to land BEFORE we touch tmux / create_bot /
+    # run_polling — otherwise a second instance would still race the
+    # getUpdates handshake before discovering it can't hold the lock.
+    global _singleton_lock_handle
+    from .utils import ccbot_dir
+
+    _singleton_lock_handle = _acquire_singleton_lock(ccbot_dir() / "ccbot.lock")
+
     logging.getLogger("ccbot").setLevel(logging.DEBUG)
     # AIORateLimiter (max_retries=5) handles retries itself; keep INFO for visibility
     logging.getLogger("telegram.ext.AIORateLimiter").setLevel(logging.INFO)
diff --git a/tests/ccbot/test_singleton_lock.py b/tests/ccbot/test_singleton_lock.py
new file mode 100644
index 00000000..d232252b
--- /dev/null
+++ b/tests/ccbot/test_singleton_lock.py
@@ -0,0 +1,63 @@
+"""Tests for the singleton flock that prevents two bot instances from
+both calling Telegram's exclusive ``getUpdates``."""
+
+from __future__ import annotations
+
+import fcntl
+from pathlib import Path
+
+import pytest
+
+from ccbot.main import _acquire_singleton_lock
+
+
+def test_fresh_path_locks_and_returns_handle(tmp_path: Path) -> None:
+    lock = tmp_path / "ccbot.lock"
+    fh = _acquire_singleton_lock(lock)
+    try:
+        assert lock.exists()
+        assert not fh.closed
+        # FD_CLOEXEC is set so the lock doesn't leak into subprocess children.
+        flags = fcntl.fcntl(fh.fileno(), fcntl.F_GETFD)
+        assert flags & fcntl.FD_CLOEXEC
+    finally:
+        fh.close()
+
+
+def test_second_acquirer_exits(tmp_path: Path) -> None:
+    lock = tmp_path / "ccbot.lock"
+    held = _acquire_singleton_lock(lock)
+    try:
+        # The first call held the lock; the second must hit sys.exit(1)
+        # because LOCK_NB returns OSError immediately when contended.
+        with pytest.raises(SystemExit) as exc:
+            _acquire_singleton_lock(lock)
+        assert exc.value.code == 1
+    finally:
+        held.close()
+
+
+def test_released_lock_can_be_reacquired(tmp_path: Path) -> None:
+    # Holder dies → fcntl releases the lock automatically when the fd
+    # closes. A fresh start (next supervisor cycle, etc.) should be
+    # able to come up cleanly without a "stale lock file" sweep.
+    lock = tmp_path / "ccbot.lock"
+    first = _acquire_singleton_lock(lock)
+    first.close()
+    second = _acquire_singleton_lock(lock)
+    try:
+        assert not second.closed
+    finally:
+        second.close()
+
+
+def test_creates_parent_directory(tmp_path: Path) -> None:
+    # CCBOT_DIR may not exist on first launch; the lock acquire shouldn't
+    # crash with FileNotFoundError before the rest of bootstrap creates
+    # state.json.
+    nested = tmp_path / "fresh" / "subdir" / "ccbot.lock"
+    fh = _acquire_singleton_lock(nested)
+    try:
+        assert nested.exists()
+    finally:
+        fh.close()