Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 20 additions & 11 deletions collectors/nlp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,28 @@

Today's free-tier implementations:

loughran_mcdonald.LoughranMcDonaldScorer — finance-domain dictionary
sentiment, the academic
gold standard
event_extraction.AnthropicEventExtractor — Haiku-tier structured
event flag extraction
(we already pay Anthropic)
loughran_mcdonald.LoughranMcDonaldScorer — finance-domain
dictionary sentiment
(academic standard)
rule_based_event_extraction.RuleBasedEventExtractor — deterministic event
classification from
vendor tags (Polygon
keywords, GDELT codes)
+ title-keyword regex.
Replaced the Haiku-
backed
AnthropicEventExtractor
on 2026-05-25 per the
"LLM calls confined
to research module"
architectural rule.

Heavier free upgrades that drop in as new adapter classes (Phase 3+):

finbert.FinBERTScorer — HF yiyanghkust/finbert-tone
spacy_ner.SpacyEntityExtractor — en_core_web_sm or larger
"""

from collectors.nlp.event_extraction import (
AnthropicEventExtractor,
DEFAULT_EVENT_CATEGORIES,
)
from collectors.nlp.loughran_mcdonald import (
LoughranMcDonaldScorer,
load_lm_master_dict,
Expand All @@ -48,6 +53,10 @@
SentimentScore,
SentimentScorer,
)
from collectors.nlp.rule_based_event_extraction import (
DEFAULT_EVENT_CATEGORIES,
RuleBasedEventExtractor,
)

__all__ = [
"EntityMention",
Expand All @@ -58,7 +67,7 @@
"SentimentScorer",
"LoughranMcDonaldScorer",
"load_lm_master_dict",
"AnthropicEventExtractor",
"RuleBasedEventExtractor",
"DEFAULT_EVENT_CATEGORIES",
"NewsNLPPipeline",
]
233 changes: 0 additions & 233 deletions collectors/nlp/event_extraction.py

This file was deleted.

9 changes: 9 additions & 0 deletions collectors/nlp/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,20 @@ def process(
extractor.name, fp, e,
)

# Union the vendor tags across all variants (Polygon keywords,
# GDELT event codes, Benzinga channels) of the same wire
# story. Rule-based extractors use this as the primary
# classification signal; LLM extractors (if any are reactivated)
# can ignore the kwarg, which defaults to () in the
# EventExtractor Protocol.
article_tags: tuple[str, ...] = tuple({
t for v in article.variants for t in v.tags
})
for extractor in self._event_extractors:
try:
events.extend(extractor.extract(
text=text, article_fingerprint=fp,
article_tickers=article.tickers,
article_tags=article_tags,
))
except Exception as e:
logger.warning(
Expand Down
1 change: 1 addition & 0 deletions collectors/nlp/protocols.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,4 +170,5 @@ class EventExtractor(Protocol):
def extract(
    self,
    *,
    text: str,
    article_fingerprint: str,
    article_tickers: tuple[str, ...],
    article_tags: tuple[str, ...] = (),
) -> list[EventFlag]:
    """Return the event flags extracted for a single article.

    Every argument is keyword-only.

    Args:
        text: Article text to scan for events.
        article_fingerprint: Fingerprint of the article being processed.
        article_tickers: Tickers attached to the article.
        article_tags: Vendor-supplied tags for the article. Optional;
            defaults to an empty tuple when the caller has none to pass.

    Returns:
        A list of EventFlag objects.
    """
    ...
Loading
Loading