From 886f4a5e2b0e3c733616fe287fd32d64008e29f4 Mon Sep 17 00:00:00 2001
From: psl75011 <257030628+psl75011@users.noreply.github.com>
Date: Wed, 13 May 2026 03:46:42 +0200
Subject: [PATCH 1/2] fix(parser): skip invalid entries from JSON exports
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some bookmark JSON exports contain entries the AI pipeline can never enrich
(they produce 0 semantic tags and get force-classified as "general"). On one
real import such entries made up ~25% of the batch, versus the usual ~5% baseline.
Four kinds of entry are now skipped in parseSingleTweet():

- preview cards captured as tweets (id_str prefixed "card://")
- entries whose id is a URL string (typically a t.co link) instead of a numeric id
  (extraction error in the export)
- deleted/inaccessible tweets where the export only kept an id: empty text AND no
  user data (some exports write the literal "unknown" as screen_name; that value is
  treated as a missing handle)
- tweets (typically quote-tweets) whose text is nothing but a bare t.co URL and
  that carry no media — no usable signal at all

No behaviour change for valid tweets.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 lib/parser.ts | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/lib/parser.ts b/lib/parser.ts
index e536722..f144e49 100644
--- a/lib/parser.ts
+++ b/lib/parser.ts
@@ -148,15 +148,34 @@ function parseSingleTweet(tweet: RawTweet): ParsedBookmark | null {
   const tweetId = extractTweetId(tweet)
   if (!tweetId) return null
 
+  // Skip entries that some JSON exports include but that the AI pipeline can
+  // never enrich (0 semantic tags -> they end up force-classified as "general"):
+  //  - preview cards captured as tweets (id_str prefixed "card://")
+  //  - the id is a t.co URL string instead of a numeric id (extraction error in the export)
+  //  - deleted/inaccessible tweets where the export only kept an id (empty text AND no user data;
+  //    note some exports write the literal "unknown" as screen_name)
+  //  - quote-tweets that are just a bare t.co URL with no media (no usable signal at all)
+  if (tweetId.startsWith('card://')) return null
+  if (tweetId.startsWith('http://') || tweetId.startsWith('https://') || tweetId.startsWith('t.co/')) return null
+
+  const text = extractText(tweet)
+  const handle = tweet.user?.screen_name
+  const hasUser = !!((handle && handle !== 'unknown') || tweet.user?.name)
+  if (text === '' && !hasUser) return null
+
+  const media = extractMedia(tweet)
+  const isUrlOnly = /^https?:\/\/t\.co\/\S+\s*$/.test(text)
+  if (isUrlOnly && media.length === 0) return null
+
   return {
     tweetId,
-    text: extractText(tweet),
+    text,
     authorHandle: extractAuthorHandle(tweet),
     authorName: extractAuthorName(tweet),
     tweetCreatedAt: extractCreatedAt(tweet),
     hashtags: extractHashtags(tweet),
     urls: extractUrls(tweet),
-    media: extractMedia(tweet),
+    media,
     rawJson: JSON.stringify(tweet),
   }
 }

From 05e58ccf6d6df2f34652a3ea80f146e965ff5a79 Mon Sep 17 00:00:00 2001
From: psl75011 <257030628+psl75011@users.noreply.github.com>
Date: Wed, 13 May 2026 03:46:43 +0200
Subject: [PATCH 2/2] fix(categorizer): bump CLI timeout 60s -> 180s for large
 prompts

A 20-bookmark categorization batch routinely needs more than 60s on the CLI
path once the prompt grows (many categories, image OCR text, non-English
bookmarks). When the call times out, the whole batch silently drops to the SDK
fallback. This raises the timeout to 180s for both the Codex and Claude CLI calls.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 lib/categorizer.ts | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/categorizer.ts b/lib/categorizer.ts
index 4cfd377..cd4a9cb 100644
--- a/lib/categorizer.ts
+++ b/lib/categorizer.ts
@@ -244,7 +244,10 @@ export async function categorizeBatch(
   // Prefer CLI over SDK (avoids OAuth token extraction, uses CLI directly)
   if (provider === 'openai') {
     if (await getCodexCliAvailability()) {
-      const result = await codexPrompt(prompt, { timeoutMs: 60_000 })
+      // 60s is too short once the prompt grows (many categories, image OCR text,
+      // non-English bookmarks). A 20-bookmark batch routinely needs >60s on the
+      // CLI path -> hitting the timeout drops the whole batch to the SDK fallback.
+      const result = await codexPrompt(prompt, { timeoutMs: 180_000 })
       if (result.success && result.data) {
         try {
           return parseCategorizationResponse(result.data, new Set(allSlugs))
@@ -260,7 +263,8 @@ export async function categorizeBatch(
       const model = await getActiveModel()
       const cliModel = modelNameToCliAlias(model)
 
-      const result = await claudePrompt(prompt, { model: cliModel, timeoutMs: 60_000 })
+      // See note above on codexPrompt — same reasoning for the Claude CLI path.
+      const result = await claudePrompt(prompt, { model: cliModel, timeoutMs: 180_000 })
       if (result.success && result.data) {
         try {
           return parseCategorizationResponse(result.data, new Set(allSlugs))