diff --git a/nerve/sources/gmail.py b/nerve/sources/gmail.py
index 0b96c05..6ea29e0 100644
--- a/nerve/sources/gmail.py
+++ b/nerve/sources/gmail.py
@@ -386,8 +386,28 @@ def _strip_boilerplate(body: str) -> str:
         # table-based layouts into a single block.
         if _BOILERPLATE_RE.search(para) and len(para) < 300:
             continue
-        # Strip standalone URL lines within the paragraph
-        para = _STANDALONE_URL_RE.sub('', para).strip()
+        # Drop standalone URL lines only when they are clearly boilerplate
+        # (unsubscribe/preferences, click-tracking redirectors, open pixels,
+        # social media).  Every other URL is kept, since it may be actionable
+        # (booking, messaging, payment, shipment tracking, ...).
+        def _drop_boilerplate_url(m: re.Match) -> str:
+            """Return '' for a boilerplate URL match, else the match unchanged."""
+            url_lower = m.group(0).strip().strip('<>').lower()
+            # NOTE: a bare 'tracking' marker is deliberately absent: it would
+            # also strip shipment/order tracking pages, which must be kept.
+            boilerplate_markers = (
+                'unsubscribe', 'optout', 'opt-out', 'manage-preferences',
+                'email-preferences', 'notification-settings',
+                'click.', 'trk.', 'sng.link',
+                'tiktok.com', 'instagram.com', 'twitter.com', 'facebook.com',
+                'youtube.com', 'linkedin.com',
+                'account-settings/notification', 'email-unsubscribe',
+                '%opentrack%',
+            )
+            if any(marker in url_lower for marker in boilerplate_markers):
+                return ''
+            return m.group(0)
+        para = _STANDALONE_URL_RE.sub(_drop_boilerplate_url, para).strip()
         if para:
             cleaned.append(para)
 
diff --git a/nerve/sources/runner.py b/nerve/sources/runner.py
index 37660c5..c4e3d91 100644
--- a/nerve/sources/runner.py
+++ b/nerve/sources/runner.py
@@ -90,8 +90,10 @@ def is_backed_off(self) -> bool:
     "Extract the essential information from this source record content.\n"
     "Rules:\n"
     "- Keep: key facts, amounts, dates, names, identifiers, action items, deadlines\n"
+    "- Keep ALL actionable URLs: booking links, messaging threads, payment links, "
+    "reply links, tracking pages, confirmation links\n"
     "- Remove: legal disclaimers, marketing copy, boilerplate, footer text, "
-    "tracking links, unsubscribe text\n"
+    "unsubscribe links, social media links, tracking pixels\n"
     "- Preserve the original structure and key details\n"
     "- Return ONLY the cleaned content, no preamble or commentary\n"
     "- If the message is mostly noise, return just the 1-2 core facts"