From 7c05733344788124db3186efc4515d98b932c1bb Mon Sep 17 00:00:00 2001 From: Time4Mind <119820237+Time4Mind@users.noreply.github.com> Date: Sun, 17 May 2026 00:52:31 +0300 Subject: [PATCH] fix(startup): refuse to start when another ccbot already holds the lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Telegram's ``getUpdates`` long-poll is exclusive per bot token: a second instance silently steals updates and the original starts spamming ``telegram.error.Conflict: terminated by other getUpdates request`` on every poll. Today we hit this with two ccbot processes running side by side — one under ``ccbot-supervisor.sh`` inside ``tmux ccbot:__main__``, the other started ~1d earlier from a NetHunter-terminal interactive shell outside tmux. Both kept retrying for hours; user-visible behaviour was message delays, ghost responses, and Card edits failing. Fix: ``main.py`` now opens ``$CCBOT_DIR/ccbot.lock`` and holds an exclusive ``fcntl.flock(LOCK_EX | LOCK_NB)`` for the process lifetime BEFORE any tmux / bot startup work runs. The handle lives at module scope so the lock survives until the interpreter exits; ``FD_CLOEXEC`` prevents the lock from leaking into any ``subprocess`` / ``asyncio.subprocess`` child (a stray child outliving the parent would otherwise hold the lock and block future starts). A contending instance hits ``OSError`` on ``flock``, logs the path and the reason, prints to stderr (for supervisor capture), closes the handle, and ``sys.exit(1)``. The supervisor's restart-backoff loop then just waits for the existing instance to die naturally — no more silent ``getUpdates`` cross-fire. +4 unit tests cover happy-path acquire, contention → SystemExit(1), re-acquire after release (process death is enough — no stale-lock sweep needed), and parent-directory creation for first-launch ``$CCBOT_DIR``. Co-Authored-By: Claude Opus 4.7 --- src/ccbot/main.py | 51 ++++++++++++++++++++++++ tests/ccbot/test_singleton_lock.py | 63 ++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 tests/ccbot/test_singleton_lock.py diff --git a/src/ccbot/main.py b/src/ccbot/main.py index c4f695ba..074a1f4f 100644 --- a/src/ccbot/main.py +++ b/src/ccbot/main.py @@ -4,10 +4,53 @@ 1. `ccbot hook` — delegates to hook.hook_main() for Claude Code hook processing. 2. Default — configures logging, initializes tmux session, and starts the Telegram bot polling loop via bot.create_bot(). + +Also enforces a single-bot mutex via flock on ``$CCBOT_DIR/ccbot.lock`` +— Telegram's getUpdates long-poll is exclusive per token, so a second +instance silently steals updates and the original starts logging +``Conflict: terminated by other getUpdates request`` until one dies. +The flock makes the second instance refuse to start instead. """ +import fcntl import logging import sys +from pathlib import Path +from typing import IO, Any + +# Held at module scope so the OS keeps the flock for the whole process +# lifetime. Local-scope file handles would be GC-closed once main() +# returns from acquiring them. +_singleton_lock_handle: IO[Any] | None = None + + +def _acquire_singleton_lock(lock_path: Path) -> IO[Any]: + """Acquire an exclusive flock on ``lock_path`` or ``sys.exit(1)``. + + Returns the file handle holding the lock; callers MUST keep the + handle alive for the process lifetime (we assign it to + ``_singleton_lock_handle`` for this). ``FD_CLOEXEC`` is set so the + lock doesn't leak into ``subprocess`` / ``asyncio.subprocess`` + children — a stray child outliving the parent would otherwise hold + the lock and block future bot starts. + """ + lock_path.parent.mkdir(parents=True, exist_ok=True) + fh = open(lock_path, "w") + fcntl.fcntl(fh.fileno(), fcntl.F_SETFD, fcntl.FD_CLOEXEC) + try: + fcntl.flock(fh, fcntl.LOCK_EX | fcntl.LOCK_NB) + except OSError: + # Logging may not be configured yet on this path, so go via + # stderr too — the supervisor wrapper captures it either way. + msg = ( + f"Another ccbot instance holds {lock_path}. " + "Refusing to start to avoid Telegram getUpdates conflict." + ) + logging.getLogger(__name__).error(msg) + print(f"Error: {msg}", file=sys.stderr) + fh.close() + sys.exit(1) + return fh def main() -> None: @@ -40,6 +83,14 @@ def main() -> None: print("Get your user ID from @userinfobot on Telegram.") sys.exit(1) + # Singleton lock has to land BEFORE we touch tmux / create_bot / + # run_polling — otherwise a second instance would still race the + # getUpdates handshake before discovering it can't hold the lock. + global _singleton_lock_handle + from .utils import ccbot_dir + + _singleton_lock_handle = _acquire_singleton_lock(ccbot_dir() / "ccbot.lock") + logging.getLogger("ccbot").setLevel(logging.DEBUG) # AIORateLimiter (max_retries=5) handles retries itself; keep INFO for visibility logging.getLogger("telegram.ext.AIORateLimiter").setLevel(logging.INFO) diff --git a/tests/ccbot/test_singleton_lock.py b/tests/ccbot/test_singleton_lock.py new file mode 100644 index 00000000..d232252b --- /dev/null +++ b/tests/ccbot/test_singleton_lock.py @@ -0,0 +1,63 @@ +"""Tests for the singleton flock that prevents two bot instances from +both calling Telegram's exclusive ``getUpdates``.""" + +from __future__ import annotations + +import fcntl +from pathlib import Path + +import pytest + +from ccbot.main import _acquire_singleton_lock + + +def test_fresh_path_locks_and_returns_handle(tmp_path: Path) -> None: + lock = tmp_path / "ccbot.lock" + fh = _acquire_singleton_lock(lock) + try: + assert lock.exists() + assert not fh.closed + # FD_CLOEXEC is set so the lock doesn't leak into subprocess children. + flags = fcntl.fcntl(fh.fileno(), fcntl.F_GETFD) + assert flags & fcntl.FD_CLOEXEC + finally: + fh.close() + + +def test_second_acquirer_exits(tmp_path: Path) -> None: + lock = tmp_path / "ccbot.lock" + held = _acquire_singleton_lock(lock) + try: + # The first call held the lock; the second must hit sys.exit(1) + # because LOCK_NB returns OSError immediately when contended. + with pytest.raises(SystemExit) as exc: + _acquire_singleton_lock(lock) + assert exc.value.code == 1 + finally: + held.close() + + +def test_released_lock_can_be_reacquired(tmp_path: Path) -> None: + # Holder dies → fcntl releases the lock automatically when the fd + # closes. A fresh start (next supervisor cycle, etc.) should be + # able to come up cleanly without a "stale lock file" sweep. + lock = tmp_path / "ccbot.lock" + first = _acquire_singleton_lock(lock) + first.close() + second = _acquire_singleton_lock(lock) + try: + assert not second.closed + finally: + second.close() + + +def test_creates_parent_directory(tmp_path: Path) -> None: + # CCBOT_DIR may not exist on first launch; the lock acquire shouldn't + # crash with FileNotFoundError before the rest of bootstrap creates + # state.json. + nested = tmp_path / "fresh" / "subdir" / "ccbot.lock" + fh = _acquire_singleton_lock(nested) + try: + assert nested.exists() + finally: + fh.close()