diff --git a/lib/categorizer.ts b/lib/categorizer.ts index 4cfd377..cd4a9cb 100644 --- a/lib/categorizer.ts +++ b/lib/categorizer.ts @@ -244,7 +244,10 @@ export async function categorizeBatch( // Prefer CLI over SDK (avoids OAuth token extraction, uses CLI directly) if (provider === 'openai') { if (await getCodexCliAvailability()) { - const result = await codexPrompt(prompt, { timeoutMs: 60_000 }) + // 60s is too short once the prompt grows (many categories, image OCR text, + // non-English bookmarks). A 20-bookmark batch routinely needs >60s on the + // CLI path -> hitting the timeout drops the whole batch to the SDK fallback. + const result = await codexPrompt(prompt, { timeoutMs: 180_000 }) if (result.success && result.data) { try { return parseCategorizationResponse(result.data, new Set(allSlugs)) @@ -260,7 +263,8 @@ export async function categorizeBatch( const model = await getActiveModel() const cliModel = modelNameToCliAlias(model) - const result = await claudePrompt(prompt, { model: cliModel, timeoutMs: 60_000 }) + // See note above on codexPrompt — same reasoning for the Claude CLI path. 
+ const result = await claudePrompt(prompt, { model: cliModel, timeoutMs: 180_000 }) if (result.success && result.data) { try { return parseCategorizationResponse(result.data, new Set(allSlugs)) diff --git a/lib/parser.ts b/lib/parser.ts index e536722..f144e49 100644 --- a/lib/parser.ts +++ b/lib/parser.ts @@ -148,15 +148,34 @@ function parseSingleTweet(tweet: RawTweet): ParsedBookmark | null { const tweetId = extractTweetId(tweet) if (!tweetId) return null + // Skip entries that some JSON exports include but that the AI pipeline can + // never enrich (0 semantic tags -> they end up force-classified as "general"): + // - preview cards captured as tweets (id_str prefixed "card://") + // - entries whose id is a URL string (http://, https:// or t.co/...) instead of a numeric id (extraction error in the export) + // - deleted/inaccessible tweets where the export only kept an id (empty text AND no user data; + // note some exports write the literal "unknown" as screen_name) + // - entries (typically quote-tweets) whose text is just a bare t.co URL with no media (no usable signal at all) + if (tweetId.startsWith('card://')) return null + if (tweetId.startsWith('http://') || tweetId.startsWith('https://') || tweetId.startsWith('t.co/')) return null + + const text = extractText(tweet) + const handle = tweet.user?.screen_name + const hasUser = !!((handle && handle !== 'unknown') || tweet.user?.name) + if (text === '' && !hasUser) return null + + const media = extractMedia(tweet) + const isUrlOnly = /^https?:\/\/t\.co\/\S+\s*$/.test(text) + if (isUrlOnly && media.length === 0) return null + return { tweetId, - text: extractText(tweet), + text, authorHandle: extractAuthorHandle(tweet), authorName: extractAuthorName(tweet), tweetCreatedAt: extractCreatedAt(tweet), hashtags: extractHashtags(tweet), urls: extractUrls(tweet), - media: extractMedia(tweet), + media, rawJson: JSON.stringify(tweet), } }