From 7dded37b2781f45901e08622672d4b8d209b371f Mon Sep 17 00:00:00 2001
From: Jason Tung <jtung1027@gmail.com>
Date: Sun, 24 May 2026 05:45:08 -0400
Subject: [PATCH 1/8] fix: harden against subagent-triggered crash cascade

Cursor subagents are short-lived processes that disconnect mid-stream as
normal lifecycle behaviour.  The rapid churn of broken-pipe events was
destabilising the proxy, causing a cascade of crashes and failed restarts.

Changes:
- server.py: add SO_REUSEPORT (with graceful fallback) so a freshly-started
  proxy can bind even while a dying predecessor holds the socket in CLOSE_WAIT
- server.py: downgrade client-disconnect log messages from WARNING to INFO;
  broken pipes are expected noise with subagents, not errors
- server.py: abort upstream socket drain immediately on client disconnect
  instead of continuing to read DeepSeek tokens that will never be delivered;
  also skip the final reasoning-cache write when there is nothing to cache
- reasoning_store.py: add PRAGMA busy_timeout=5000 so SQLite waits up to
  5 s for a concurrent write lock instead of immediately raising
  OperationalError("database is locked")
- reasoning_store.py: enable WAL journal mode for better concurrent read/write
  throughput under parallel subagent requests
- reasoning_store.py: retry startup prune up to 3 times with 0.5 s wait;
  degrade gracefully (log warning, skip prune) rather than crashing if the
  DB is still locked after retries
- tests/test_server.py: update two tests that asserted WARNING-level logs for
  client-disconnect events to assert INFO-level instead

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/deepseek_cursor_proxy/reasoning_store.py | 38 +++++++++++-
 src/deepseek_cursor_proxy/server.py          | 65 ++++++++++++++------
 tests/test_server.py                         |  4 +-
 3 files changed, 86 insertions(+), 21 deletions(-)

diff --git a/src/deepseek_cursor_proxy/reasoning_store.py b/src/deepseek_cursor_proxy/reasoning_store.py
index 58ca933..e27f851 100644
--- a/src/deepseek_cursor_proxy/reasoning_store.py
+++ b/src/deepseek_cursor_proxy/reasoning_store.py
@@ -177,6 +177,10 @@ def portable_reasoning_keys(
     return keys
 
 
+_STARTUP_PRUNE_BUSY_RETRIES = 3
+_STARTUP_PRUNE_BUSY_WAIT = 0.5  # seconds between retries
+
+
 class ReasoningStore:
     def __init__(
         self,
@@ -197,6 +201,15 @@ def __init__(
         self._conn = sqlite3.connect(
             self.reasoning_content_path, check_same_thread=False
         )
+        # busy_timeout lets SQLite wait up to 5 s for a concurrent writer to
+        # finish instead of immediately raising OperationalError("database is
+        # locked"). This prevents crash-restart races when multiple proxy
+        # instances start at nearly the same time (e.g. several terminals open
+        # in quick succession or after a subagent-triggered crash).
+        self._conn.execute("PRAGMA busy_timeout = 5000")
+        # WAL mode allows one writer and multiple concurrent readers without
+        # locking, which is far better for the threading model of this proxy.
+        self._conn.execute("PRAGMA journal_mode = WAL")
         if isinstance(self.reasoning_content_path, Path):
             self.reasoning_content_path.chmod(0o600)
         self._conn.execute(
@@ -210,7 +223,30 @@ def __init__(
             """
         )
         self._conn.commit()
-        self.prune()
+        self._startup_prune()
+
+    def _startup_prune(self) -> None:
+        """Prune on startup with retry + graceful degradation.
+
+        Another proxy instance starting simultaneously can hold a write lock
+        for a short window.  busy_timeout handles most of this, but if the
+        lock persists longer than busy_timeout we catch the error and skip
+        the prune rather than crashing — stale rows left behind are harmless
+        and will be cleaned up on the next successful prune.
+        """
+        for attempt in range(_STARTUP_PRUNE_BUSY_RETRIES):
+            try:
+                self.prune()
+                return
+            except sqlite3.OperationalError as exc:
+                if "locked" not in str(exc).lower():
+                    raise
+                if attempt < _STARTUP_PRUNE_BUSY_RETRIES - 1:
+                    time.sleep(_STARTUP_PRUNE_BUSY_WAIT)
+        import logging as _logging
+        _logging.getLogger("deepseek_cursor_proxy").warning(
+            "reasoning store: skipped startup prune (database is locked)"
+        )
 
     def close(self) -> None:
         with self._lock:
diff --git a/src/deepseek_cursor_proxy/server.py b/src/deepseek_cursor_proxy/server.py
index 0202062..82e110a 100644
--- a/src/deepseek_cursor_proxy/server.py
+++ b/src/deepseek_cursor_proxy/server.py
@@ -7,6 +7,7 @@
 import json
 from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
 from pathlib import Path
+import socket
 import sys
 import time
 from typing import Any
@@ -51,6 +52,19 @@ class DeepSeekProxyServer(ThreadingHTTPServer):
     reasoning_store: ReasoningStore
     trace_writer: TraceWriter | None
 
+    def server_bind(self) -> None:
+        # SO_REUSEADDR is already set by HTTPServer.allow_reuse_address=True, but
+        # SO_REUSEPORT (Linux/BSD) lets a freshly-started proxy bind immediately even
+        # while a dying predecessor still holds the socket in CLOSE_WAIT, which is the
+        # most common cause of the "Address already in use" cascade after a crash.
+        try:
+            self.socket.setsockopt(
+                socket.SOL_SOCKET, socket.SO_REUSEPORT, 1  # type: ignore[attr-defined]
+            )
+        except AttributeError:
+            pass  # Windows / older BSD — graceful degradation
+        super().server_bind()
+
 
 class DeepSeekProxyHandler(BaseHTTPRequestHandler):
     server_version = "DeepSeekPythonProxy/0.1"
@@ -426,7 +440,8 @@ def _send_response_headers(
                 self.send_header(name, value)
             self.end_headers()
         except (BrokenPipeError, ConnectionError) as exc:
-            LOG.warning("client disconnected while %s: %s", disconnect_context, exc)
+            # Cursor subagents are short-lived; mid-stream disconnects are normal.
+            LOG.info("client disconnected while %s: %s", disconnect_context, exc)
             return False
         return True
 
@@ -442,7 +457,8 @@ def _write_to_client(
             if flush:
                 self.wfile.flush()
         except (BrokenPipeError, ConnectionError) as exc:
-            LOG.warning("client disconnected while %s: %s", disconnect_context, exc)
+            # Cursor subagents are short-lived; mid-stream disconnects are normal.
+            LOG.info("client disconnected while %s: %s", disconnect_context, exc)
             return False
         return True
 
@@ -693,6 +709,7 @@ def _proxy_streaming_response(
             else [(scope, response_prior_messages)]
         )
         finalized = False
+        client_disconnected = False
         pending_recovery_notice = recovery_notice
         try:
             while True:
@@ -725,33 +742,45 @@ def _proxy_streaming_response(
                 if not self._write_to_client(
                     rewritten, "sending streaming response chunk", flush=True
                 ):
+                    # Client disconnected (expected for short-lived subagents).
+                    # Stop draining the upstream socket immediately — there is no
+                    # point in reading more DeepSeek tokens that nobody will receive.
+                    client_disconnected = True
                     return ProxyResponseResult(False, usage)
                 if finalized:
                     break
         finally:
-            # Store partial reasoning whenever the stream exits without
-            # the upstream's [DONE] terminator (client disconnect, upstream
-            # read failure, exception). Without this, a Stop pressed mid-stream
+            # Store partial reasoning whenever the stream exits without the
+            # upstream's [DONE] terminator (client disconnect, upstream read
+            # failure, exception). Without this, a Stop pressed mid-stream
             # would discard any reasoning the proxy received but never cached.
+            # Skip the store when the client disconnected early AND the
+            # accumulator has nothing worth caching (no reasoning content yet)
+            # to avoid touching the DB unnecessarily on subagent churn.
             if not finalized:
                 if self.config.verbose:
                     log_json(
                         "model streaming assistant messages", accumulator.messages()
                     )
-                stored = sum(
-                    accumulator.store_reasoning(
-                        self.reasoning_store,
-                        ctx_scope,
-                        cache_namespace,
-                        prior_messages,
-                    )
-                    for ctx_scope, prior_messages in response_contexts
-                )
-                if self.config.verbose and stored:
-                    LOG.info(
-                        "stored %s streaming reasoning cache key(s) before exit",
-                        stored,
+                messages_with_reasoning = [
+                    m for m in accumulator.messages()
+                    if m.get("reasoning_content")
+                ]
+                if not client_disconnected or messages_with_reasoning:
+                    stored = sum(
+                        accumulator.store_reasoning(
+                            self.reasoning_store,
+                            ctx_scope,
+                            cache_namespace,
+                            prior_messages,
+                        )
+                        for ctx_scope, prior_messages in response_contexts
                     )
+                    if self.config.verbose and stored:
+                        LOG.info(
+                            "stored %s streaming reasoning cache key(s) before exit",
+                            stored,
+                        )
         return ProxyResponseResult(True, usage)
 
     def _rewrite_sse_line(
diff --git a/tests/test_server.py b/tests/test_server.py
index db89f0f..cff056c 100644
--- a/tests/test_server.py
+++ b/tests/test_server.py
@@ -264,7 +264,7 @@ def test_regular_response_handles_client_disconnect(self) -> None:
             }
         ).encode("utf-8")
         try:
-            with self.assertLogs("deepseek_cursor_proxy", level="WARNING") as captured:
+            with self.assertLogs("deepseek_cursor_proxy", level="INFO") as captured:
                 result = handler._proxy_regular_response(
                     _FakeResponse(body),
                     "deepseek-v4-pro",
@@ -290,7 +290,7 @@ def test_streaming_response_stops_on_client_disconnect(self) -> None:
             ]
         )
         try:
-            with self.assertLogs("deepseek_cursor_proxy", level="WARNING") as captured:
+            with self.assertLogs("deepseek_cursor_proxy", level="INFO") as captured:
                 result = handler._proxy_streaming_response(
                     response,
                     "deepseek-v4-pro",

From 9195fa9e2e28074f3240d255ce2d9dc38b49c690 Mon Sep 17 00:00:00 2001
From: Jason Tung <jtung1027@gmail.com>
Date: Sun, 24 May 2026 05:49:34 -0400
Subject: [PATCH 2/8] fix: remove SO_REUSEPORT (wrong approach, requires both
 sockets to set it)

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/deepseek_cursor_proxy/server.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/src/deepseek_cursor_proxy/server.py b/src/deepseek_cursor_proxy/server.py
index 82e110a..6fe31da 100644
--- a/src/deepseek_cursor_proxy/server.py
+++ b/src/deepseek_cursor_proxy/server.py
@@ -7,7 +7,6 @@
 import json
 from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
 from pathlib import Path
-import socket
 import sys
 import time
 from typing import Any
@@ -52,18 +51,6 @@ class DeepSeekProxyServer(ThreadingHTTPServer):
     reasoning_store: ReasoningStore
     trace_writer: TraceWriter | None
 
-    def server_bind(self) -> None:
-        # SO_REUSEADDR is already set by HTTPServer.allow_reuse_address=True, but
-        # SO_REUSEPORT (Linux/BSD) lets a freshly-started proxy bind immediately even
-        # while a dying predecessor still holds the socket in CLOSE_WAIT, which is the
-        # most common cause of the "Address already in use" cascade after a crash.
-        try:
-            self.socket.setsockopt(
-                socket.SOL_SOCKET, socket.SO_REUSEPORT, 1  # type: ignore[attr-defined]
-            )
-        except AttributeError:
-            pass  # Windows / older BSD — graceful degradation
-        super().server_bind()
 
 
 class DeepSeekProxyHandler(BaseHTTPRequestHandler):

From 9ff08150a4ce5afc9508a5d53be772ea72357487 Mon Sep 17 00:00:00 2001
From: Jason Tung <jtung1027@gmail.com>
Date: Sun, 24 May 2026 15:54:33 -0400
Subject: [PATCH 3/8] fix: apply black formatting

---
 src/deepseek_cursor_proxy/reasoning_store.py | 1 +
 src/deepseek_cursor_proxy/server.py          | 4 +---
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/deepseek_cursor_proxy/reasoning_store.py b/src/deepseek_cursor_proxy/reasoning_store.py
index e27f851..7f9e03e 100644
--- a/src/deepseek_cursor_proxy/reasoning_store.py
+++ b/src/deepseek_cursor_proxy/reasoning_store.py
@@ -244,6 +244,7 @@ def _startup_prune(self) -> None:
                 if attempt < _STARTUP_PRUNE_BUSY_RETRIES - 1:
                     time.sleep(_STARTUP_PRUNE_BUSY_WAIT)
         import logging as _logging
+
         _logging.getLogger("deepseek_cursor_proxy").warning(
             "reasoning store: skipped startup prune (database is locked)"
         )
diff --git a/src/deepseek_cursor_proxy/server.py b/src/deepseek_cursor_proxy/server.py
index 6fe31da..c20012f 100644
--- a/src/deepseek_cursor_proxy/server.py
+++ b/src/deepseek_cursor_proxy/server.py
@@ -52,7 +52,6 @@ class DeepSeekProxyServer(ThreadingHTTPServer):
     trace_writer: TraceWriter | None
 
 
-
 class DeepSeekProxyHandler(BaseHTTPRequestHandler):
     server_version = "DeepSeekPythonProxy/0.1"
 
@@ -750,8 +749,7 @@ def _proxy_streaming_response(
                         "model streaming assistant messages", accumulator.messages()
                     )
                 messages_with_reasoning = [
-                    m for m in accumulator.messages()
-                    if m.get("reasoning_content")
+                    m for m in accumulator.messages() if m.get("reasoning_content")
                 ]
                 if not client_disconnected or messages_with_reasoning:
                     stored = sum(

From 24f74b3d1d4be63c41b3da80b0796fef3e63986e Mon Sep 17 00:00:00 2001
From: Jason Tung <jtung1027@gmail.com>
Date: Sun, 24 May 2026 17:01:29 -0400
Subject: [PATCH 4/8] perf: eliminate read/write lock contention under
 concurrent subagent load
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The core issue causing multi-second freezes was that every streaming write
called _prune_locked() — a full-table O(N log N) sort — while holding the
only lock in the store, blocking all concurrent reads.

Changes:
- Thread-local read connections: each handler thread gets its own SQLite
  connection; WAL mode handles file-level concurrency so reads never wait
  on writes and vice versa.
- created_at index: turns age and row-count prune queries from O(N log N)
  to O(log N) / O(max_rows).
- Throttled row-count prune: skip the expensive NOT-IN subquery when the
  DB is well under capacity; always prune when over the limit.
- SQLite pragma tuning: 32 MiB write cache, 8 MiB per read connection,
  synchronous=NORMAL (safe under WAL).
- _proxy_already_running() health check in main(): new startup attempts
  exit cleanly if the proxy is already serving, eliminating the
  "Address already in use" cascade.
- autostart.bash rewritten as a status-check-only script: the proxy is
  started once manually; agents never trigger auto-start logic.
- CONCURRENT_DESIGN.md: full architecture + root-cause + fix narrative.

All 96 tests pass.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 CONCURRENT_DESIGN.md                         | 248 +++++++++++++++++++
 src/deepseek_cursor_proxy/reasoning_store.py | 199 ++++++++++++---
 src/deepseek_cursor_proxy/server.py          |  35 +++
 3 files changed, 444 insertions(+), 38 deletions(-)
 create mode 100644 CONCURRENT_DESIGN.md

diff --git a/CONCURRENT_DESIGN.md b/CONCURRENT_DESIGN.md
new file mode 100644
index 0000000..90c8b02
--- /dev/null
+++ b/CONCURRENT_DESIGN.md
@@ -0,0 +1,248 @@
+# Concurrency Design & Performance Guide
+
+This document explains how the proxy handles concurrent requests, what went wrong
+under multi-agent Cursor load, what the `fix/subagent-resilience` branch changed,
+and what the performance optimisations added on top of that branch do.
+
+---
+
+## 1. What the proxy does
+
+Cursor treats this proxy as a standard OpenAI-compatible API endpoint.  The
+proxy's unique job is to solve a **DeepSeek-specific API requirement**: when you
+send a conversation history that contains an assistant turn with tool calls,
+DeepSeek's API requires that turn to also include the original `reasoning_content`
+(chain-of-thought tokens) from when it was first generated.  Cursor strips that
+field before echoing the history back, so the proxy must:
+
+1. **Cache** every `reasoning_content` it receives from DeepSeek into SQLite.
+2. **Restore** cached reasoning into outgoing messages before forwarding to DeepSeek.
+
+Everything else — CORS handling, model name rewriting, ngrok tunnelling, recovery
+notices — is scaffolding around that core loop.
+
+### Per-request flow
+
+```
+Cursor IDE
+  │
+  │  POST /v1/chat/completions  (OpenAI-compatible)
+  ▼
+DeepSeekProxyHandler.do_POST()
+  │
+  ├─ parse & validate request body
+  │
+  ├─ prepare_upstream_request()
+  │    ├─ normalize messages (strip unknown fields, fix content types)
+  │    └─ for each assistant message with tool_calls:
+  │         └─ ReasoningStore.get() ← look up cached reasoning_content
+  │
+  ├─ forward to DeepSeek API (urlopen, streaming or non-streaming)
+  │
+  ├─ [streaming path]
+  │    ├─ for each SSE chunk:
+  │    │    ├─ rewrite model name, mirror reasoning into content display
+  │    │    └─ StreamAccumulator.store_ready_reasoning()
+  │    │         └─ ReasoningStore.put() ← cache reasoning when tool_calls complete
+  │    └─ ReasoningStore.put() ← cache on [DONE] or client disconnect
+  │
+  └─ [non-streaming path]
+       └─ rewrite_response_body() → ReasoningStore.put() ← cache the response
+```
+
+---
+
+## 2. Concurrency model
+
+The server is a Python `ThreadingHTTPServer`: each incoming connection spawns a
+new OS thread.  When Cursor runs multiple agents or subagents simultaneously,
+several threads are active at once — each reading from the SQLite cache before
+forwarding its request upstream, and each writing back after receiving a response.
+
+### The SQLite layer
+
+SQLite in its default (journal) mode allows only one writer at a time and blocks
+readers during writes.  This made the original code prone to lock contention under
+parallel requests.
+
+---
+
+## 3. Bug history
+
+### Bug 1 — Startup crash cascade (pre-branch)
+
+**Symptom:** Multiple "database is locked" tracebacks in `autostart.log`, followed
+by repeated "Address already in use" errors.  Agents froze because the proxy
+crashed in a loop and never became available.
+
+**Root cause:**
+
+1. Every new Cursor terminal (including agent tool-call shells) sources `~/.bashrc`,
+   which runs the proxy autostart script.
+2. The script uses `flock` + PID check to be idempotent, but has a startup window.
+3. When several agents opened simultaneously, multiple proxy processes started.
+4. Each called `ReasoningStore.__init__()` → `self.prune()`.
+5. Without `WAL` mode or `busy_timeout`, the second process immediately received
+   `sqlite3.OperationalError: database is locked` and crashed.
+6. The crash triggered another autostart.  Loop.
+7. Meanwhile the first healthy process held the port, so restarted processes hit
+   `OSError: [Errno 98] Address already in use` and crashed again.
+
+**Fix (in this branch):**
+
+- `PRAGMA journal_mode = WAL` — readers never block writers; writers never block
+  readers at the SQLite file level.
+- `PRAGMA busy_timeout = 5000` — wait up to 5 s for a lock instead of crashing.
+- `_startup_prune()` — retries + gracefully skips if still locked after retries.
+- `_proxy_already_running()` health check in `main()` — if the proxy is already
+  responding on `/healthz`, a new startup attempt exits `0` immediately without
+  touching the port or the database.
+
+### Bug 2 — Multi-second freezes under concurrent agent load (pre- and post-branch)
+
+**Symptom:** Log lines showing 10+ seconds between "┌ cursor request" and
+"├ context filled", followed by Cursor dropping the connection with a broken-pipe
+warning.
+
+**Root cause:**
+
+The `ReasoningStore` used a single `threading.RLock` that serialised **all**
+database operations — reads and writes — across all handler threads.  The
+`put()` method called `_prune_locked()` on **every single write**:
+
+```python
+# OLD code — runs a full-table sort on every write
+def _prune_locked(self):
+    self._conn.execute("""
+        DELETE FROM reasoning_cache
+        WHERE key NOT IN (
+            SELECT key FROM reasoning_cache
+            ORDER BY created_at DESC   -- ← full-table sort, no index
+            LIMIT 100000
+        )
+    """)
+```
+
+With 55,925 rows and no index on `created_at`, this was an O(N log N) sort on
+every write.  While one thread held the lock for this sort:
+
+- Every other thread's `store.get()` call (which normalises 15–30 assistant
+  messages per request) queued behind the lock.
+- Each blocked `get()` added to the "context filled" latency visible in the logs.
+
+**Fix (this PR):** — see Section 4.
+
+---
+
+## 4. Performance optimisations
+
+### 4a — Thread-local read connections
+
+**Before:** One shared `sqlite3.Connection`, one `threading.RLock`.  Reads blocked
+on writes and vice versa.
+
+**After:** Two connection layers:
+
+| Layer | Object | Protected by | Used for |
+|---|---|---|---|
+| Write | `self._write_conn` | `self._write_lock` (plain `Lock`) | INSERT / UPDATE / DELETE / COMMIT |
+| Read | `self._local.conn` (per-thread) | Nothing — WAL handles it | SELECT |
+
+In WAL mode SQLite guarantees that concurrent readers never see a partial write and
+writers never wait for readers.  Each request handler thread opens its own read
+connection the first time it calls `get()`.  Reads across multiple concurrent
+requests never contend with each other or with the writer.
+
+The special case `":memory:"` (used in tests) cannot share data across multiple
+connections, so it falls back to the write connection for reads.
+
+### 4b — `created_at` index
+
+```sql
+CREATE INDEX IF NOT EXISTS idx_reasoning_cache_created_at
+ON reasoning_cache(created_at)
+```
+
+Applied on first startup against existing databases (`CREATE INDEX IF NOT EXISTS`
+is idempotent).  This index turns:
+
+| Query | Before | After |
+|---|---|---|
+| Age prune (`WHERE created_at < ?`) | O(N) full scan | O(log N) range seek |
+| Row-count prune (top-N keep) | O(N log N) sort | O(max\_rows) index walk |
+
+### 4c — Throttled row-count prune
+
+The age-based prune (`DELETE WHERE created_at < cutoff`) is now fast with the
+index and still runs on every write.
+
+The row-count prune (keeping at most `max_rows` entries) only runs when the
+approximate row count exceeds `max_rows` **or** every `_ROWCOUNT_PRUNE_INTERVAL`
+(default 50) writes.  The row count is seeded from the real DB size at startup so
+the threshold is accurate from the first write.
+
+When the DB is well under the limit (the common case — 55k rows vs 100k limit),
+the row-count DELETE skips entirely, saving a subquery evaluation on every request.
+
+### 4d — SQLite PRAGMA tuning
+
+| PRAGMA | Value | Effect |
+|---|---|---|
+| `journal_mode` | `WAL` | Concurrent readers + writer |
+| `busy_timeout` | `5000` ms | Wait instead of crash on contention |
+| `synchronous` | `NORMAL` | Skip extra fsync; safe under WAL |
+| `cache_size` (write conn) | `-32768` (32 MiB) | Hot rows stay in RAM |
+| `cache_size` (read conns) | `-8192` (8 MiB) | Per-thread read cache |
+
+---
+
+## 5. Autostart design change
+
+The old `autostart.bash` automatically launched the proxy inside every new bash
+session (including Cursor agent tool-call shells) using `flock` to prevent
+duplicates.  The flock race window was the original source of the cascade.
+
+**New behaviour:** The script is a **status check only**.
+
+- Interactive shell, proxy running → silent (no noise on every tab open).
+- Interactive shell, proxy **not** running → one clear warning + start command.
+- Non-interactive shell (agent tool calls) → does nothing at all.
+
+The proxy is started manually once.  Agents just talk to it; they never try to
+start it.
+
+---
+
+## 6. Reasoning cache lifetime
+
+Entries are pruned by two independent policies (both configurable):
+
+| Policy | Default | Purpose |
+|---|---|---|
+| `reasoning_cache_max_age_seconds` | 30 days | Remove old conversation history |
+| `reasoning_cache_max_rows` | 100,000 | Cap total DB size |
+
+**Active conversations are never pruned** as long as entries are younger than
+`max_age_seconds`.  If an entry is pruned (old conversation, restarted machine,
+cache cleared), the proxy falls back to "recover" mode: it truncates the
+conversation to the latest user message, adds a recovery notice, and continues.
+
+On Cursor restart, reasoning_content for recent tool calls is restored from the
+SQLite cache transparently, so the conversation resumes correctly.
+
+---
+
+## 7. Summary of changes in `fix/subagent-resilience`
+
+| File | Change | Fixes |
+|---|---|---|
+| `reasoning_store.py` | WAL mode + busy\_timeout | Startup crash cascade |
+| `reasoning_store.py` | `_startup_prune()` with retry | Startup crash cascade |
+| `reasoning_store.py` | `created_at` index | Multi-second read latency |
+| `reasoning_store.py` | Thread-local read connections | Read/write contention |
+| `reasoning_store.py` | Throttled row-count prune | Per-write lock contention |
+| `reasoning_store.py` | 32 MiB write + 8 MiB read caches | I/O reduction |
+| `server.py` | `_proxy_already_running()` health check | "Address already in use" cascade |
+| `server.py` | `LOG.info` instead of `LOG.warning` for broken pipes | Log noise |
+| `server.py` | `client_disconnected` flag | Unnecessary DB writes on subagent churn |
+| `autostart.bash` | Status check only, no auto-start | Cascade crash root cause |
diff --git a/src/deepseek_cursor_proxy/reasoning_store.py b/src/deepseek_cursor_proxy/reasoning_store.py
index 7f9e03e..04d3db6 100644
--- a/src/deepseek_cursor_proxy/reasoning_store.py
+++ b/src/deepseek_cursor_proxy/reasoning_store.py
@@ -180,8 +180,39 @@ def portable_reasoning_keys(
 _STARTUP_PRUNE_BUSY_RETRIES = 3
 _STARTUP_PRUNE_BUSY_WAIT = 0.5  # seconds between retries
 
+# Prune the max-rows limit at most once every N writes.  Running the full-table
+# DELETE on every write was the primary cause of multi-second lock contention
+# under concurrent subagent load.  Age-based pruning is cheap (index range
+# scan) and still happens on every write; row-count pruning is expensive
+# (table scan) and only needs to run occasionally.
+_ROWCOUNT_PRUNE_INTERVAL = 50
+
 
 class ReasoningStore:
+    """Thread-safe reasoning_content cache backed by SQLite.
+
+    Concurrency model
+    -----------------
+    The proxy runs as a ThreadingHTTPServer: each request is handled on its
+    own thread.  To allow concurrent reads without blocking on writes we use
+    two connection layers:
+
+    * **Write connection** (``self._write_conn``) — one shared connection,
+      serialised by ``self._write_lock`` (a plain ``threading.Lock``).  All
+      INSERT/UPDATE/DELETE/COMMIT operations go through this path.
+
+    * **Thread-local read connections** (``self._local.conn``) — each handler
+      thread opens its own SQLite connection the first time it calls ``get()``.
+      In WAL mode SQLite guarantees that readers never block writers and
+      writers never block readers at the *file* level; the Python-level
+      per-thread connections ensure there is no intra-process contention
+      either.
+
+    The special case ``":memory:"`` (used in tests) cannot have multiple
+    connections pointing at the same data, so it falls back to the write
+    connection for reads (still serialised by ``_write_lock``).
+    """
+
     def __init__(
         self,
         reasoning_content_path: str | Path,
@@ -190,29 +221,19 @@ def __init__(
     ) -> None:
         self.max_age_seconds = max_age_seconds
         self.max_rows = max_rows
-        if str(reasoning_content_path) == ":memory:":
+        self._in_memory = str(reasoning_content_path) == ":memory:"
+        if self._in_memory:
             self.reasoning_content_path: str | Path = ":memory:"
         else:
             self.reasoning_content_path = Path(reasoning_content_path).expanduser()
             self.reasoning_content_path.parent.mkdir(
                 mode=0o700, parents=True, exist_ok=True
             )
-        self._lock = threading.RLock()
-        self._conn = sqlite3.connect(
-            self.reasoning_content_path, check_same_thread=False
-        )
-        # busy_timeout lets SQLite wait up to 5 s for a concurrent writer to
-        # finish instead of immediately raising OperationalError("database is
-        # locked"). This prevents crash-restart races when multiple proxy
-        # instances start at nearly the same time (e.g. several terminals open
-        # in quick succession or after a subagent-triggered crash).
-        self._conn.execute("PRAGMA busy_timeout = 5000")
-        # WAL mode allows one writer and multiple concurrent readers without
-        # locking, which is far better for the threading model of this proxy.
-        self._conn.execute("PRAGMA journal_mode = WAL")
+        self._write_lock = threading.Lock()
+        self._write_conn = self._open_conn(writer=True)
         if isinstance(self.reasoning_content_path, Path):
             self.reasoning_content_path.chmod(0o600)
-        self._conn.execute(
+        self._write_conn.execute(
             """
             CREATE TABLE IF NOT EXISTS reasoning_cache (
                 key TEXT PRIMARY KEY,
@@ -222,9 +243,72 @@ def __init__(
             )
             """
         )
-        self._conn.commit()
+        # Index on created_at powers both the age-based DELETE (range scan) and
+        # the max-rows DELETE (top-N scan), turning O(N log N) full-table sorts
+        # into O(log N) index seeks.  CREATE INDEX IF NOT EXISTS is a no-op on
+        # databases that already have the index, so it is safe to run every
+        # startup and will transparently add the index to existing DBs.
+        self._write_conn.execute(
+            """
+            CREATE INDEX IF NOT EXISTS idx_reasoning_cache_created_at
+            ON reasoning_cache(created_at)
+            """
+        )
+        self._write_conn.commit()
+        if not self._in_memory:
+            self._local: threading.local = threading.local()
+        # Seed the write counter with the current row count so the rowcount
+        # prune triggers correctly: once _write_count exceeds max_rows we know
+        # the DB could be over capacity and prune on every write; below that
+        # threshold we only prune every _ROWCOUNT_PRUNE_INTERVAL writes.
+        row = self._write_conn.execute(
+            "SELECT COUNT(*) FROM reasoning_cache"
+        ).fetchone()
+        self._write_count: int = int(row[0]) if row else 0
         self._startup_prune()
 
+    # ------------------------------------------------------------------
+    # Connection helpers
+    # ------------------------------------------------------------------
+
+    def _open_conn(self, *, writer: bool = False) -> sqlite3.Connection:
+        """Open a new SQLite connection with appropriate PRAGMAs."""
+        conn = sqlite3.connect(
+            str(self.reasoning_content_path), check_same_thread=False
+        )
+        # busy_timeout: wait up to 5 s for a concurrent writer instead of
+        # immediately raising OperationalError("database is locked").
+        conn.execute("PRAGMA busy_timeout = 5000")
+        # WAL mode: readers never block writers; writers never block readers.
+        conn.execute("PRAGMA journal_mode = WAL")
+        # NORMAL durability is safe under WAL (no data loss on OS crash) and
+        # avoids the extra fsync that FULL mode adds on every commit.
+        conn.execute("PRAGMA synchronous = NORMAL")
+        if writer:
+            # Give the single write connection a large page cache so hot rows
+            # stay in RAM during high-concurrency bursts from parallel subagents.
+            conn.execute("PRAGMA cache_size = -32768")  # 32 MiB
+        else:
+            # Read connections share a smaller per-thread cache.
+            conn.execute("PRAGMA cache_size = -8192")   # 8 MiB per thread
+        return conn
+
+    def _read_conn(self) -> sqlite3.Connection:
+        """Return the thread-local read connection, creating it if needed."""
+        if self._in_memory:
+            # All connections to :memory: are independent DBs; share the write
+            # connection (serialised by _write_lock in the caller).
+            return self._write_conn
+        conn: sqlite3.Connection | None = getattr(self._local, "conn", None)
+        if conn is None:
+            conn = self._open_conn(writer=False)
+            self._local.conn = conn
+        return conn
+
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+
     def _startup_prune(self) -> None:
         """Prune on startup with retry + graceful degradation.
 
@@ -250,15 +334,21 @@ def _startup_prune(self) -> None:
         )
 
     def close(self) -> None:
-        with self._lock:
-            self._conn.close()
+        with self._write_lock:
+            self._write_conn.close()
+        # Thread-local read connections are closed when their threads exit or
+        # when the process terminates; the OS reclaims file handles either way.
+
+    # ------------------------------------------------------------------
+    # Core read / write
+    # ------------------------------------------------------------------
 
     def put(self, key: str, reasoning: str, message: dict[str, Any]) -> None:
         if not isinstance(reasoning, str):
             return
         message_json = json.dumps(message, ensure_ascii=False, sort_keys=True)
-        with self._lock:
-            self._conn.execute(
+        with self._write_lock:
+            self._write_conn.execute(
                 """
                 INSERT INTO reasoning_cache(key, reasoning, message_json, created_at)
                 VALUES (?, ?, ?, ?)
@@ -269,12 +359,37 @@ def put(self, key: str, reasoning: str, message: dict[str, Any]) -> None:
                 """,
                 (key, reasoning, message_json, time.time()),
             )
-            self._prune_locked()
-            self._conn.commit()
+            self._write_count += 1
+            # Skip the expensive rowcount prune when we're well under
+            # capacity, running it at most every _ROWCOUNT_PRUNE_INTERVAL
+            # writes.  Once _write_count exceeds max_rows the DB could be
+            # over capacity, so prune on every write until we're safely
+            # back under the limit.
+            at_or_over_limit = (
+                self.max_rows is not None
+                and self._write_count > self.max_rows
+            )
+            skip_rowcount = (
+                not at_or_over_limit
+                and self._write_count % _ROWCOUNT_PRUNE_INTERVAL != 0
+            )
+            deleted = self._prune_write_locked(skip_rowcount=skip_rowcount)
+            self._write_count -= deleted
+            self._write_conn.commit()
 
     def get(self, key: str) -> str | None:
-        with self._lock:
-            row = self._conn.execute(
+        # File-based DB: no lock needed — WAL allows concurrent reads even
+        # while the write connection is mid-transaction.  Each thread has its
+        # own connection so there is no intra-process sharing either.
+        # :memory: DB: falls back to the write connection (see _read_conn).
+        if self._in_memory:
+            with self._write_lock:
+                row = self._read_conn().execute(
+                    "SELECT reasoning FROM reasoning_cache WHERE key = ?",
+                    (key,),
+                ).fetchone()
+        else:
+            row = self._read_conn().execute(
                 "SELECT reasoning FROM reasoning_cache WHERE key = ?",
                 (key,),
             ).fetchone()
@@ -342,36 +457,44 @@ def backfill_portable_aliases(
         return len(keys)
 
     def clear(self) -> int:
-        with self._lock:
-            row = self._conn.execute("SELECT COUNT(*) FROM reasoning_cache").fetchone()
+        with self._write_lock:
+            row = self._write_conn.execute(
+                "SELECT COUNT(*) FROM reasoning_cache"
+            ).fetchone()
             count = int(row[0] if row else 0)
-            self._conn.execute("DELETE FROM reasoning_cache")
-            self._conn.commit()
+            self._write_conn.execute("DELETE FROM reasoning_cache")
+            self._write_conn.commit()
         return count
 
     def prune(self) -> int:
-        with self._lock:
-            deleted = self._prune_locked()
-            self._conn.commit()
+        with self._write_lock:
+            deleted = self._prune_write_locked()
+            self._write_conn.commit()
         return deleted
 
-    def _prune_locked(self) -> int:
+    def _prune_write_locked(self, *, skip_rowcount: bool = False) -> int:
+        """Run pruning queries.  Must be called with ``_write_lock`` held."""
         deleted = 0
         if self.max_age_seconds is not None and self.max_age_seconds > 0:
             cutoff = time.time() - self.max_age_seconds
-            cursor = self._conn.execute(
+            # idx_reasoning_cache_created_at makes this a fast index range scan.
+            cursor = self._write_conn.execute(
                 "DELETE FROM reasoning_cache WHERE created_at < ?",
                 (cutoff,),
             )
             deleted += cursor.rowcount if cursor.rowcount != -1 else 0
 
-        if self.max_rows is not None and self.max_rows > 0:
-            cursor = self._conn.execute(
+        if not skip_rowcount and self.max_rows is not None and self.max_rows > 0:
+            # Keep the top max_rows rows by created_at; delete everything else.
+            # The subquery walks the created_at index and materialises exactly
+            # max_rows rowids — O(max_rows) index seek.  The outer DELETE is
+            # O(rows deleted).  Using rowid NOT IN handles equal-timestamp ties
+            # correctly (rowid is unique; created_at is not).
+            cursor = self._write_conn.execute(
                 """
                 DELETE FROM reasoning_cache
-                WHERE key NOT IN (
-                    SELECT key
-                    FROM reasoning_cache
+                WHERE rowid NOT IN (
+                    SELECT rowid FROM reasoning_cache
                     ORDER BY created_at DESC
                     LIMIT ?
                 )
diff --git a/src/deepseek_cursor_proxy/server.py b/src/deepseek_cursor_proxy/server.py
index c20012f..2ffcffc 100644
--- a/src/deepseek_cursor_proxy/server.py
+++ b/src/deepseek_cursor_proxy/server.py
@@ -1257,6 +1257,28 @@ def warn_if_insecure_upstream(url: str) -> None:
     LOG.warning("upstream base_url uses plain HTTP; bearer tokens may be exposed")
 
 
+def _proxy_already_running(host: str, port: int) -> bool:
+    """Return True if a healthy proxy is already listening at host:port.
+
+    Called before we attempt to bind the server socket.  When Cursor spawns
+    multiple agent / subagent processes simultaneously, each one may trigger
+    the shell autostart hook.  Rather than crashing with 'Address already in
+    use', a new startup attempt that finds the proxy healthy should just exit
+    cleanly — the existing process handles requests just fine.
+    """
+    try:
+        import urllib.request as _urlreq
+
+        req = _urlreq.Request(
+            f"http://{host}:{port}/healthz",
+            headers={"User-Agent": "deepseek-cursor-proxy/startup-check"},
+        )
+        with _urlreq.urlopen(req, timeout=2.0) as resp:
+            return resp.status == 200
+    except Exception:
+        return False
+
+
 def main(argv: list[str] | None = None) -> int:
     args = build_arg_parser().parse_args(argv)
     try:
@@ -1330,6 +1352,19 @@ def main(argv: list[str] | None = None) -> int:
             LOG.error("failed to initialize trace directory: %s", exc)
             store.close()
             return 2
+    if _proxy_already_running(config.host, config.port):
+        # Another proxy instance is already healthy on this address.  This can
+        # happen when the shell autostart hook fires in multiple Cursor agent
+        # terminals in quick succession.  Exit cleanly so the running instance
+        # keeps serving requests undisturbed.
+        LOG.info(
+            "proxy already running at %s:%s — exiting cleanly",
+            config.host,
+            config.port,
+        )
+        store.close()
+        return 0
+
     server = DeepSeekProxyServer((config.host, config.port), DeepSeekProxyHandler)
     server.config = config
     server.reasoning_store = store

From 1e1715266d65fa709ceae1f7a5f924eee2eb5188 Mon Sep 17 00:00:00 2001
From: Jason Tung <jtung1027@gmail.com>
Date: Sun, 24 May 2026 17:04:42 -0400
Subject: [PATCH 5/8] perf: increase SQLite cache + add mmap for 128 GB RAM
 machine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Write connection cache_size: 8 MiB → 512 MiB (holds entire 300 MB DB)
- Read connection cache_size: 8 MiB → 64 MiB per thread
- mmap_size = 2 GiB: maps the DB file into virtual address space so all
  connections share OS page-cache pages; reads become zero-copy and the
  300 MB DB stays warm in RAM indefinitely on a 128 GB machine

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/deepseek_cursor_proxy/reasoning_store.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/deepseek_cursor_proxy/reasoning_store.py b/src/deepseek_cursor_proxy/reasoning_store.py
index 04d3db6..f77ce71 100644
--- a/src/deepseek_cursor_proxy/reasoning_store.py
+++ b/src/deepseek_cursor_proxy/reasoning_store.py
@@ -285,12 +285,21 @@ def _open_conn(self, *, writer: bool = False) -> sqlite3.Connection:
         # avoids the extra fsync that FULL mode adds on every commit.
         conn.execute("PRAGMA synchronous = NORMAL")
         if writer:
-            # Give the single write connection a large page cache so hot rows
-            # stay in RAM during high-concurrency bursts from parallel subagents.
-            conn.execute("PRAGMA cache_size = -32768")  # 32 MiB
+            # Large page cache for the write connection.  The negative value
+            # is in kibibytes; 524288 = 512 MiB, enough to hold the entire
+            # 300 MB database in Python heap and absorb large write bursts.
+            conn.execute("PRAGMA cache_size = -524288")  # 512 MiB
         else:
-            # Read connections share a smaller per-thread cache.
-            conn.execute("PRAGMA cache_size = -8192")   # 8 MiB per thread
+            # Read connections each get their own cache, but mmap_size (below)
+            # means most reads never touch this cache at all.
+            conn.execute("PRAGMA cache_size = -65536")   # 64 MiB per thread
+        # Memory-map the database file directly into the process address space.
+        # With mmap enabled the OS page cache becomes the working set — all
+        # connections share the same physical pages, reads are zero-copy, and
+        # the entire 300 MB DB can stay warm in RAM indefinitely as long as
+        # there is free memory.  2 GiB ceiling is far above the current DB
+        # size; SQLite only maps what actually exists.
+        conn.execute("PRAGMA mmap_size = 2147483648")    # 2 GiB ceiling
         return conn
 
     def _read_conn(self) -> sqlite3.Connection:

From 2cb86469710fd4cf57c767e7709f459524a462a1 Mon Sep 17 00:00:00 2001
From: Jason Tung <jtung1027@gmail.com>
Date: Sun, 24 May 2026 17:16:44 -0400
Subject: [PATCH 6/8] perf: raise limits to 5M rows / 1yr age, mmap ceiling to
 32 GiB
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Row limit: 100k → 5M  (theoretical max ~27 GB at current row size)
Max age:   30d  → 1yr (conversations from the last year always restore)
mmap_size: 2 GiB → 32 GiB ceiling (covers full 5M-row DB in virtual space)

The row-count prune fires only when rows actually exceed 5M; at current
usage (55,925 rows) it will not trigger for years.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/deepseek_cursor_proxy/reasoning_store.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/deepseek_cursor_proxy/reasoning_store.py b/src/deepseek_cursor_proxy/reasoning_store.py
index f77ce71..c3fdcbd 100644
--- a/src/deepseek_cursor_proxy/reasoning_store.py
+++ b/src/deepseek_cursor_proxy/reasoning_store.py
@@ -296,10 +296,11 @@ def _open_conn(self, *, writer: bool = False) -> sqlite3.Connection:
         # Memory-map the database file directly into the process address space.
         # With mmap enabled the OS page cache becomes the working set — all
         # connections share the same physical pages, reads are zero-copy, and
-        # the entire 300 MB DB can stay warm in RAM indefinitely as long as
-        # there is free memory.  2 GiB ceiling is far above the current DB
-        # size; SQLite only maps what actually exists.
-        conn.execute("PRAGMA mmap_size = 2147483648")    # 2 GiB ceiling
+        # the entire DB can stay warm in RAM indefinitely on a 128 GB machine.
+        # 32 GiB ceiling covers the ~27 GB theoretical max at 5M rows;
+        # SQLite only maps pages that actually exist, so unused address space
+        # costs nothing.
+        conn.execute("PRAGMA mmap_size = 34359738368")   # 32 GiB ceiling
         return conn
 
     def _read_conn(self) -> sqlite3.Connection:

From 57f14e87ddd258bd43e5e5d9ce449518778e1bc1 Mon Sep 17 00:00:00 2001
From: Jason Tung <jtung1027@gmail.com>
Date: Sun, 24 May 2026 17:17:44 -0400
Subject: [PATCH 7/8] style: apply black formatting

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/deepseek_cursor_proxy/reasoning_store.py | 29 ++++++++++++--------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/src/deepseek_cursor_proxy/reasoning_store.py b/src/deepseek_cursor_proxy/reasoning_store.py
index c3fdcbd..1c62fb4 100644
--- a/src/deepseek_cursor_proxy/reasoning_store.py
+++ b/src/deepseek_cursor_proxy/reasoning_store.py
@@ -292,7 +292,7 @@ def _open_conn(self, *, writer: bool = False) -> sqlite3.Connection:
         else:
             # Read connections each get their own cache, but mmap_size (below)
             # means most reads never touch this cache at all.
-            conn.execute("PRAGMA cache_size = -65536")   # 64 MiB per thread
+            conn.execute("PRAGMA cache_size = -65536")  # 64 MiB per thread
         # Memory-map the database file directly into the process address space.
         # With mmap enabled the OS page cache becomes the working set — all
         # connections share the same physical pages, reads are zero-copy, and
@@ -300,7 +300,7 @@ def _open_conn(self, *, writer: bool = False) -> sqlite3.Connection:
         # 32 GiB ceiling covers the ~27 GB theoretical max at 5M rows;
         # SQLite only maps pages that actually exist, so unused address space
         # costs nothing.
-        conn.execute("PRAGMA mmap_size = 34359738368")   # 32 GiB ceiling
+        conn.execute("PRAGMA mmap_size = 34359738368")  # 32 GiB ceiling
         return conn
 
     def _read_conn(self) -> sqlite3.Connection:
@@ -376,8 +376,7 @@ def put(self, key: str, reasoning: str, message: dict[str, Any]) -> None:
             # over capacity, so prune on every write until we're safely
             # back under the limit.
             at_or_over_limit = (
-                self.max_rows is not None
-                and self._write_count > self.max_rows
+                self.max_rows is not None and self._write_count > self.max_rows
             )
             skip_rowcount = (
                 not at_or_over_limit
@@ -394,15 +393,23 @@ def get(self, key: str) -> str | None:
         # :memory: DB: falls back to the write connection (see _read_conn).
         if self._in_memory:
             with self._write_lock:
-                row = self._read_conn().execute(
+                row = (
+                    self._read_conn()
+                    .execute(
+                        "SELECT reasoning FROM reasoning_cache WHERE key = ?",
+                        (key,),
+                    )
+                    .fetchone()
+                )
+        else:
+            row = (
+                self._read_conn()
+                .execute(
                     "SELECT reasoning FROM reasoning_cache WHERE key = ?",
                     (key,),
-                ).fetchone()
-        else:
-            row = self._read_conn().execute(
-                "SELECT reasoning FROM reasoning_cache WHERE key = ?",
-                (key,),
-            ).fetchone()
+                )
+                .fetchone()
+            )
         if row is None:
             return None
         return str(row[0])

From 1c691dfb552157d80f042e77079a1a2ca1d2aed5 Mon Sep 17 00:00:00 2001
From: Jason Tung <jtung1027@gmail.com>
Date: Sun, 24 May 2026 17:33:41 -0400
Subject: [PATCH 8/8] feat(logging): timestamps on every line + startup log
 trim to 5k lines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- All log lines now include a timestamp regardless of verbose mode.
  Before: INFO lines had no timestamp (message only).
  After:  2026-05-24 17:31:04,821 INFO ┌ cursor model=...
- Verbose mode adds [ThreadName] for concurrent request debugging.
- _trim_log_file() trims autostart.log to 5,000 most-recent lines at
  each proxy startup, keeping the same filename so open file handles
  in Cursor stay valid and always show the latest content.
- Update tests to match new format (timestamps always present).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 src/deepseek_cursor_proxy/logging.py | 19 ++++++++-----------
 src/deepseek_cursor_proxy/server.py  | 27 +++++++++++++++++++++++++++
 tests/test_server.py                 | 17 ++++++++++++-----
 3 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/src/deepseek_cursor_proxy/logging.py b/src/deepseek_cursor_proxy/logging.py
index 13ccf0e..1829ae1 100644
--- a/src/deepseek_cursor_proxy/logging.py
+++ b/src/deepseek_cursor_proxy/logging.py
@@ -8,25 +8,22 @@
 
 LOG = stdlib_logging.getLogger("deepseek_cursor_proxy")
 
-DEFAULT_INFO_LOG_FORMAT = "%(message)s"
-DEFAULT_WARNING_LOG_FORMAT = "%(levelname)s %(message)s"
-VERBOSE_LOG_FORMAT = "%(asctime)s %(levelname)s %(message)s"
+# All log lines include a timestamp regardless of verbose mode.  The verbose
+# flag controls whether full request/response payloads are printed, not
+# whether basic observability (when did this happen?) is available.
+LOG_FORMAT = "%(asctime)s %(levelname)s %(message)s"
+LOG_FORMAT_VERBOSE = "%(asctime)s %(levelname)s [%(threadName)s] %(message)s"
 
 
 class ConsoleLogFormatter(stdlib_logging.Formatter):
     def __init__(self, *, verbose: bool) -> None:
         super().__init__()
         self.verbose = verbose
-        self._verbose_formatter = stdlib_logging.Formatter(VERBOSE_LOG_FORMAT)
-        self._info_formatter = stdlib_logging.Formatter(DEFAULT_INFO_LOG_FORMAT)
-        self._warning_formatter = stdlib_logging.Formatter(DEFAULT_WARNING_LOG_FORMAT)
+        fmt = LOG_FORMAT_VERBOSE if verbose else LOG_FORMAT
+        self._formatter = stdlib_logging.Formatter(fmt)
 
     def format(self, record: stdlib_logging.LogRecord) -> str:
-        if self.verbose:
-            return self._verbose_formatter.format(record)
-        if record.levelno <= stdlib_logging.INFO:
-            return self._info_formatter.format(record)
-        return self._warning_formatter.format(record)
+        return self._formatter.format(record)
 
 
 def configure_logging(*, verbose: bool) -> None:
diff --git a/src/deepseek_cursor_proxy/server.py b/src/deepseek_cursor_proxy/server.py
index 2ffcffc..86015b6 100644
--- a/src/deepseek_cursor_proxy/server.py
+++ b/src/deepseek_cursor_proxy/server.py
@@ -1257,6 +1257,28 @@ def warn_if_insecure_upstream(url: str) -> None:
     LOG.warning("upstream base_url uses plain HTTP; bearer tokens may be exposed")
 
 
+_LOG_TRIM_LINES = 5_000  # keep this many most-recent lines on startup
+
+
+def _trim_log_file(log_path: Path, max_lines: int = _LOG_TRIM_LINES) -> None:
+    """Trim *log_path* to at most *max_lines* most-recent lines, in-place.
+
+    Called once at startup so the log file never grows without bound across
+    long-running sessions.  Silently skips if the file is missing, too small
+    to need trimming, or cannot be read/written (e.g. permission error).
+    """
+    try:
+        if not log_path.exists():
+            return
+        text = log_path.read_text(encoding="utf-8", errors="replace")
+        lines = text.splitlines(keepends=True)
+        if len(lines) <= max_lines:
+            return
+        log_path.write_text("".join(lines[-max_lines:]), encoding="utf-8")
+    except OSError:
+        pass
+
+
 def _proxy_already_running(host: str, port: int) -> bool:
     """Return True if a healthy proxy is already listening at host:port.
 
@@ -1333,6 +1355,11 @@ def main(argv: list[str] | None = None) -> int:
         config = replace(config, **updates)
 
     configure_logging(verbose=config.verbose)
+    # Trim the default log file so it never grows without bound.  The proxy
+    # is started via `nohup ... >> autostart.log 2>&1`, so we know exactly
+    # where logs are going.  We trim *before* emitting any new lines so the
+    # file stays bounded even across many restart cycles.
+    _trim_log_file(Path.home() / ".deepseek-cursor-proxy" / "autostart.log")
     warn_if_insecure_upstream(config.upstream_base_url)
     store = ReasoningStore(
         config.reasoning_content_path,
diff --git a/tests/test_server.py b/tests/test_server.py
index cff056c..f7cdac3 100644
--- a/tests/test_server.py
+++ b/tests/test_server.py
@@ -167,12 +167,18 @@ def test_default_console_logging_hides_info_prefix_and_timestamp(self) -> None:
             None,
         )
 
-        self.assertEqual(
+        # All log lines now include a timestamp regardless of level or verbosity.
+        self.assertRegex(
             formatter.format(info_record),
-            "listening on http://127.0.0.1:9000/v1",
+            re.compile(
+                r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} INFO listening on "
+            ),
         )
-        self.assertEqual(
-            formatter.format(warning_record), "WARNING trace logging enabled"
+        self.assertRegex(
+            formatter.format(warning_record),
+            re.compile(
+                r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} WARNING trace logging"
+            ),
         )
 
     def test_verbose_console_logging_shows_timestamp_and_level(self) -> None:
@@ -187,10 +193,11 @@ def test_verbose_console_logging_shows_timestamp_and_level(self) -> None:
             None,
         )
 
+        # Verbose mode includes thread name between level and message.
         self.assertRegex(
             formatter.format(record),
             re.compile(
-                r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} INFO listening on "
+                r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} INFO \[.*\] listening on "
             ),
         )