Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions nerve/sources/gmail.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,8 +386,25 @@ def _strip_boilerplate(body: str) -> str:
# table-based layouts into a single block.
if _BOILERPLATE_RE.search(para) and len(para) < 300:
continue
# Strip standalone URL lines within the paragraph
para = _STANDALONE_URL_RE.sub('', para).strip()
# Strip standalone URL lines that are clearly boilerplate
# (tracking pixels, unsubscribe, social media, etc.)
# but keep actionable URLs (booking, messaging, payment, etc.)
def _is_boilerplate_url(m: re.Match) -> str:
    """Return the replacement text for one standalone-URL regex match.

    Intended as the ``repl`` callback of ``re.sub``: returns ``''`` to drop
    a URL that looks like boilerplate (unsubscribe / preference links,
    social-media links, e-mail click trackers) and returns the matched text
    unchanged for everything else, so actionable links survive.

    Args:
        m: Regex match whose ``group(0)`` is the URL text, optionally
            surrounded by whitespace and/or ``<`` ``>``.

    Returns:
        ``''`` when the URL is classified as boilerplate, otherwise
        ``m.group(0)`` exactly as matched.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlsplit

    url = m.group(0).strip().strip('<>')
    url_lower = url.lower()

    # Markers that identify boilerplate wherever they occur in the URL.
    _boilerplate_url_patterns = (
        'unsubscribe', 'optout', 'opt-out', 'manage-preferences',
        'email-preferences', 'notification-settings',
        'sng.link',
        'tiktok.com', 'instagram.com', 'twitter.com', 'facebook.com',
        'youtube.com', 'linkedin.com',
        'account-settings/notification', 'email-unsubscribe',
        '%opentrack%',
    )
    if any(p in url_lower for p in _boilerplate_url_patterns):
        return ''

    # Click-tracker markers are only meaningful in the hostname
    # (e.g. "tracking.mail.example.com", "click.e.example.com").  Matching
    # them against the path/query would also strip shipment or order
    # tracking pages ("https://carrier.example/tracking?id=..."), which are
    # actionable and must be kept.
    _tracker_host_patterns = ('tracking', 'click.', 'trk.')
    try:
        host = urlsplit(url_lower).netloc
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets).
        host = ''
    # No parseable host (scheme-less URL): fall back to the whole string.
    haystack = host or url_lower
    if any(p in haystack for p in _tracker_host_patterns):
        return ''

    return m.group(0)
para = _STANDALONE_URL_RE.sub(_is_boilerplate_url, para).strip()
if para:
cleaned.append(para)

Expand Down
4 changes: 3 additions & 1 deletion nerve/sources/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,10 @@ def is_backed_off(self) -> bool:
"Extract the essential information from this source record content.\n"
"Rules:\n"
"- Keep: key facts, amounts, dates, names, identifiers, action items, deadlines\n"
"- Keep ALL actionable URLs: booking links, messaging threads, payment links, "
"reply links, tracking pages, confirmation links\n"
"- Remove: legal disclaimers, marketing copy, boilerplate, footer text, "
"tracking links, unsubscribe text\n"
"unsubscribe links, social media links, tracking pixels\n"
"- Preserve the original structure and key details\n"
"- Return ONLY the cleaned content, no preamble or commentary\n"
"- If the message is mostly noise, return just the 1-2 core facts"
Expand Down