diff --git a/src/ccbot/main.py b/src/ccbot/main.py index c4f695ba..074a1f4f 100644 --- a/src/ccbot/main.py +++ b/src/ccbot/main.py @@ -4,10 +4,53 @@ 1. `ccbot hook` — delegates to hook.hook_main() for Claude Code hook processing. 2. Default — configures logging, initializes tmux session, and starts the Telegram bot polling loop via bot.create_bot(). + +Also enforces a single-bot mutex via flock on ``$CCBOT_DIR/ccbot.lock`` +— Telegram's getUpdates long-poll is exclusive per token, so a second +instance silently steals updates and the original starts logging +``Conflict: terminated by other getUpdates request`` until one dies. +The flock makes the second instance refuse to start instead. """ +import fcntl import logging import sys +from pathlib import Path +from typing import IO, Any + +# Held at module scope so the OS keeps the flock for the whole process +# lifetime. Local-scope file handles would be GC-closed once main() +# returns from acquiring them. +_singleton_lock_handle: IO[Any] | None = None + + +def _acquire_singleton_lock(lock_path: Path) -> IO[Any]: + """Acquire an exclusive flock on ``lock_path`` or ``sys.exit(1)``. + + Returns the file handle holding the lock; callers MUST keep the + handle alive for the process lifetime (we assign it to + ``_singleton_lock_handle`` for this). ``FD_CLOEXEC`` is set so the + lock doesn't leak into ``subprocess`` / ``asyncio.subprocess`` + children — a stray child outliving the parent would otherwise hold + the lock and block future bot starts. + """ + lock_path.parent.mkdir(parents=True, exist_ok=True) + fh = open(lock_path, "w") + fcntl.fcntl(fh.fileno(), fcntl.F_SETFD, fcntl.FD_CLOEXEC) + try: + fcntl.flock(fh, fcntl.LOCK_EX | fcntl.LOCK_NB) + except OSError: + # Logging may not be configured yet on this path, so go via + # stderr too — the supervisor wrapper captures it either way. + msg = ( + f"Another ccbot instance holds {lock_path}. " + "Refusing to start to avoid Telegram getUpdates conflict." + ) + logging.getLogger(__name__).error(msg) + print(f"Error: {msg}", file=sys.stderr) + fh.close() + sys.exit(1) + return fh def main() -> None: @@ -40,6 +83,14 @@ def main() -> None: print("Get your user ID from @userinfobot on Telegram.") sys.exit(1) + # Singleton lock has to land BEFORE we touch tmux / create_bot / + # run_polling — otherwise a second instance would still race the + # getUpdates handshake before discovering it can't hold the lock. + global _singleton_lock_handle + from .utils import ccbot_dir + + _singleton_lock_handle = _acquire_singleton_lock(ccbot_dir() / "ccbot.lock") + logging.getLogger("ccbot").setLevel(logging.DEBUG) # AIORateLimiter (max_retries=5) handles retries itself; keep INFO for visibility logging.getLogger("telegram.ext.AIORateLimiter").setLevel(logging.INFO) diff --git a/tests/ccbot/test_singleton_lock.py b/tests/ccbot/test_singleton_lock.py new file mode 100644 index 00000000..d232252b --- /dev/null +++ b/tests/ccbot/test_singleton_lock.py @@ -0,0 +1,63 @@ +"""Tests for the singleton flock that prevents two bot instances from +both calling Telegram's exclusive ``getUpdates``.""" + +from __future__ import annotations + +import fcntl +from pathlib import Path + +import pytest + +from ccbot.main import _acquire_singleton_lock + + +def test_fresh_path_locks_and_returns_handle(tmp_path: Path) -> None: + lock = tmp_path / "ccbot.lock" + fh = _acquire_singleton_lock(lock) + try: + assert lock.exists() + assert not fh.closed + # FD_CLOEXEC is set so the lock doesn't leak into subprocess children. + flags = fcntl.fcntl(fh.fileno(), fcntl.F_GETFD) + assert flags & fcntl.FD_CLOEXEC + finally: + fh.close() + + +def test_second_acquirer_exits(tmp_path: Path) -> None: + lock = tmp_path / "ccbot.lock" + held = _acquire_singleton_lock(lock) + try: + # The first call held the lock; the second must hit sys.exit(1) + # because LOCK_NB returns OSError immediately when contended. + with pytest.raises(SystemExit) as exc: + _acquire_singleton_lock(lock) + assert exc.value.code == 1 + finally: + held.close() + + +def test_released_lock_can_be_reacquired(tmp_path: Path) -> None: + # Holder dies → fcntl releases the lock automatically when the fd + # closes. A fresh start (next supervisor cycle, etc.) should be + # able to come up cleanly without a "stale lock file" sweep. + lock = tmp_path / "ccbot.lock" + first = _acquire_singleton_lock(lock) + first.close() + second = _acquire_singleton_lock(lock) + try: + assert not second.closed + finally: + second.close() + + +def test_creates_parent_directory(tmp_path: Path) -> None: + # CCBOT_DIR may not exist on first launch; the lock acquire shouldn't + # crash with FileNotFoundError before the rest of bootstrap creates + # state.json. + nested = tmp_path / "fresh" / "subdir" / "ccbot.lock" + fh = _acquire_singleton_lock(nested) + try: + assert nested.exists() + finally: + fh.close()