From 886f4a5e2b0e3c733616fe287fd32d64008e29f4 Mon Sep 17 00:00:00 2001 From: psl75011 <257030628+psl75011@users.noreply.github.com> Date: Wed, 13 May 2026 03:46:42 +0200 Subject: [PATCH 1/2] fix(parser): skip invalid entries from JSON exports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some bookmark JSON exports contain entries the AI pipeline can never enrich (they produce 0 semantic tags and get force-classified as "general"). On one real import this was ~25% of the batch instead of the usual ~5% floor. Four profiles, now skipped in parseSingleTweet(): - preview cards captured as tweets (id_str prefixed "card://") - entries whose id is a URL string (typically a t.co link) instead of a numeric id (extraction error in the export) - deleted/inaccessible tweets where the export only kept an id: empty text AND no user data (some exports write the literal "unknown" as screen_name; that value is treated as a missing handle) - tweets (in practice, quote-tweets) whose text is just a bare t.co URL and that carry no media — no usable signal at all Tweets that match none of these profiles are parsed exactly as before. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/parser.ts | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/lib/parser.ts b/lib/parser.ts index e536722..f144e49 100644 --- a/lib/parser.ts +++ b/lib/parser.ts @@ -148,15 +148,34 @@ function parseSingleTweet(tweet: RawTweet): ParsedBookmark | null { const tweetId = extractTweetId(tweet) if (!tweetId) return null + // Skip entries that some JSON exports include but that the AI pipeline can + // never enrich (0 semantic tags -> they end up force-classified as "general"): + // - preview cards captured as tweets (id_str prefixed "card://") + // - the id is a t.co URL string instead of a numeric id (extraction error in the export) + // - deleted/inaccessible tweets where the export only kept an id (empty text AND no user data; + // note some exports write the literal "unknown" as screen_name) + // - quote-tweets that are just a bare t.co URL with no media (no usable signal at all) + if (tweetId.startsWith('card://')) return null + if (tweetId.startsWith('http://') || tweetId.startsWith('https://') || tweetId.startsWith('t.co/')) return null + + const text = extractText(tweet) + const handle = tweet.user?.screen_name + const hasUser = !!((handle && handle !== 'unknown') || tweet.user?.name) + if (text === '' && !hasUser) return null + + const media = extractMedia(tweet) + const isUrlOnly = /^https?:\/\/t\.co\/\S+\s*$/.test(text) + if (isUrlOnly && media.length === 0) return null + return { tweetId, - text: extractText(tweet), + text, authorHandle: extractAuthorHandle(tweet), authorName: extractAuthorName(tweet), tweetCreatedAt: extractCreatedAt(tweet), hashtags: extractHashtags(tweet), urls: extractUrls(tweet), - media: extractMedia(tweet), + media, rawJson: JSON.stringify(tweet), } } From 05e58ccf6d6df2f34652a3ea80f146e965ff5a79 Mon Sep 17 00:00:00 2001 From: psl75011 <257030628+psl75011@users.noreply.github.com> Date: Wed, 13 May 2026 03:46:43 +0200 Subject: [PATCH 2/2] 
fix(categorizer): bump CLI timeout 60s -> 180s for large prompts A 20-bookmark categorization batch routinely needs more than 60s on the CLI path once the prompt grows (many categories, image OCR text, non-English bookmarks). When it times out the whole batch silently drops to the SDK fallback. Raised to 180s for both the Codex and Claude CLI calls. Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/categorizer.ts | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/categorizer.ts b/lib/categorizer.ts index 4cfd377..cd4a9cb 100644 --- a/lib/categorizer.ts +++ b/lib/categorizer.ts @@ -244,7 +244,10 @@ export async function categorizeBatch( // Prefer CLI over SDK (avoids OAuth token extraction, uses CLI directly) if (provider === 'openai') { if (await getCodexCliAvailability()) { - const result = await codexPrompt(prompt, { timeoutMs: 60_000 }) + // 60s is too short once the prompt grows (many categories, image OCR text, + // non-English bookmarks). A 20-bookmark batch routinely needs >60s on the + // CLI path -> hitting the timeout drops the whole batch to the SDK fallback. + const result = await codexPrompt(prompt, { timeoutMs: 180_000 }) if (result.success && result.data) { try { return parseCategorizationResponse(result.data, new Set(allSlugs)) @@ -260,7 +263,8 @@ export async function categorizeBatch( const model = await getActiveModel() const cliModel = modelNameToCliAlias(model) - const result = await claudePrompt(prompt, { model: cliModel, timeoutMs: 60_000 }) + // See note above on codexPrompt — same reasoning for the Claude CLI path. + const result = await claudePrompt(prompt, { model: cliModel, timeoutMs: 180_000 }) if (result.success && result.data) { try { return parseCategorizationResponse(result.data, new Set(allSlugs))