From ce178e1836ee5d6f14a99b7fd5afbe32af24883f Mon Sep 17 00:00:00 2001 From: Konstantin Kolesnyak Date: Fri, 3 Apr 2026 23:15:32 +0200 Subject: [PATCH] Preserve actionable URLs through email preprocessing and condensation Gmail preprocessing was stripping all standalone URLs indiscriminately. Now only removes boilerplate (unsubscribe, social media, tracking pixels) and keeps actionable links (booking, messaging, payment, reply threads). Condense prompts updated to explicitly preserve actionable URLs. Co-Authored-By: Claude Opus 4.6 --- nerve/sources/gmail.py | 21 +++++++++++++++++++-- nerve/sources/runner.py | 4 +++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/nerve/sources/gmail.py b/nerve/sources/gmail.py index 0b96c05..6ea29e0 100644 --- a/nerve/sources/gmail.py +++ b/nerve/sources/gmail.py @@ -386,8 +386,25 @@ def _strip_boilerplate(body: str) -> str: # table-based layouts into a single block. if _BOILERPLATE_RE.search(para) and len(para) < 300: continue - # Strip standalone URL lines within the paragraph - para = _STANDALONE_URL_RE.sub('', para).strip() + # Strip standalone URL lines that are clearly boilerplate + # (tracking pixels, unsubscribe, social media, etc.) + # but keep actionable URLs (booking, messaging, payment, etc.) + def _is_boilerplate_url(m: re.Match) -> str: + url = m.group(0).strip().strip('<>') + _boilerplate_url_patterns = ( + 'unsubscribe', 'optout', 'opt-out', 'manage-preferences', + 'email-preferences', 'notification-settings', + 'tracking', 'click.', 'trk.', 'sng.link', + 'tiktok.com', 'instagram.com', 'twitter.com', 'facebook.com', + 'youtube.com', 'linkedin.com', + 'account-settings/notification', 'email-unsubscribe', + '%opentrack%', + ) + url_lower = url.lower() + if any(p in url_lower for p in _boilerplate_url_patterns): + return '' + return m.group(0) + para = _STANDALONE_URL_RE.sub(_is_boilerplate_url, para).strip() if para: cleaned.append(para) diff --git a/nerve/sources/runner.py b/nerve/sources/runner.py index 37660c5..c4e3d91 100644 --- a/nerve/sources/runner.py +++ b/nerve/sources/runner.py @@ -90,8 +90,10 @@ def is_backed_off(self) -> bool: "Extract the essential information from this source record content.\n" "Rules:\n" "- Keep: key facts, amounts, dates, names, identifiers, action items, deadlines\n" + "- Keep ALL actionable URLs: booking links, messaging threads, payment links, " + "reply links, tracking pages, confirmation links\n" "- Remove: legal disclaimers, marketing copy, boilerplate, footer text, " - "tracking links, unsubscribe text\n" + "unsubscribe links, social media links, tracking pixels\n" "- Preserve the original structure and key details\n" "- Return ONLY the cleaned content, no preamble or commentary\n" "- If the message is mostly noise, return just the 1-2 core facts"