From 4c86133727b3131da95b0cf576ec7db3ffa3f7ef Mon Sep 17 00:00:00 2001 From: Stephan Deibel Date: Wed, 8 Apr 2026 23:01:57 -0400 Subject: [PATCH] Add layered spam defense with Bayesian filter and first-post confirmation Adds a multi-layered spam defense system: - Dual Bayesian classifier (spam + ham models) with lazy loading, thread-safe operation, and fail-open design - First-post email confirmation for watched users: post is held until the user clicks a confirmation link - Optional moderator queue after email confirmation - Silent deletion mode for obvious spam (spam-only, no ham match) - Incremental learning: spam model updated when posts are marked as spam, ham model updated when posts are approved by moderators - Management commands for training and cleanup All features are opt-in via livesettings with safe defaults (disabled). Replaces the existing spam checker calls in ask/answer/comment views with the dual Bayesian check while preserving the original spam checker as a fallback. --- askbot/conf/__init__.py | 1 + askbot/conf/spam_defense.py | 66 ++++++ .../jinja2/email/post_confirmation/body.html | 17 ++ .../email/post_confirmation/subject.txt | 1 + askbot/jinja2/post_confirmation.html | 53 +++++ askbot/mail/messages.py | 16 ++ .../askbot_delete_expired_confirmations.py | 18 ++ .../commands/askbot_train_spam_filter.py | 137 ++++++++++++ askbot/migrations/0036_postconfirmation.py | 43 ++++ askbot/models/__init__.py | 29 +++ askbot/models/post_confirmation.py | 127 +++++++++++ askbot/spam_checker/bayesian_spam_checker.py | 155 ++++++++++++++ askbot/tests/test_spam_defense.py | 201 ++++++++++++++++++ askbot/urls.py | 7 +- askbot/views/__init__.py | 1 + askbot/views/post_confirmation.py | 51 +++++ askbot/views/writers.py | 157 ++++++++++++-- pyproject.toml | 1 + 18 files changed, 1058 insertions(+), 23 deletions(-) create mode 100644 askbot/conf/spam_defense.py create mode 100644 askbot/jinja2/email/post_confirmation/body.html create mode 100644 
askbot/jinja2/email/post_confirmation/subject.txt create mode 100644 askbot/jinja2/post_confirmation.html create mode 100644 askbot/management/commands/askbot_delete_expired_confirmations.py create mode 100644 askbot/management/commands/askbot_train_spam_filter.py create mode 100644 askbot/migrations/0036_postconfirmation.py create mode 100644 askbot/models/post_confirmation.py create mode 100644 askbot/spam_checker/bayesian_spam_checker.py create mode 100644 askbot/tests/test_spam_defense.py create mode 100644 askbot/views/post_confirmation.py diff --git a/askbot/conf/__init__.py b/askbot/conf/__init__.py index 9de9acb6eb..f4e2dbc35b 100644 --- a/askbot/conf/__init__.py +++ b/askbot/conf/__init__.py @@ -33,6 +33,7 @@ def init(): import askbot.conf.access_control import askbot.conf.site_modes import askbot.conf.words + import askbot.conf.spam_defense #import main settings object from askbot.conf.settings_wrapper import settings diff --git a/askbot/conf/spam_defense.py b/askbot/conf/spam_defense.py new file mode 100644 index 0000000000..5697b493e1 --- /dev/null +++ b/askbot/conf/spam_defense.py @@ -0,0 +1,66 @@ +"""Livesettings for the spam defense system (Bayesian filter + first-post confirmation).""" +from django.utils.translation import gettext_lazy as _ +from livesettings import values as livesettings +from askbot.conf.settings_wrapper import settings +from askbot.conf.super_groups import EXTERNAL_SERVICES + +SPAM_DEFENSE = livesettings.ConfigurationGroup( + 'SPAM_DEFENSE', + _('Spam defense settings'), + super_group=EXTERNAL_SERVICES +) + +settings.register( + livesettings.BooleanValue( + SPAM_DEFENSE, + 'FIRST_POST_EMAIL_CONFIRMATION', + description=_('Require email confirmation for first post'), + help_text=_( + 'When enabled, watched users must confirm their first post ' + 'via an email link before it goes live.' 
+ ), + default=False + ) +) + +settings.register( + livesettings.BooleanValue( + SPAM_DEFENSE, + 'FIRST_POST_MODERATE_AFTER_CONFIRMATION', + description=_('Require moderator approval after email confirmation'), + help_text=_( + 'When enabled, first posts that pass email confirmation are ' + 'placed in the moderator queue for approval. When disabled, ' + 'confirmed posts go live immediately.' + ), + default=True + ) +) + +settings.register( + livesettings.BooleanValue( + SPAM_DEFENSE, + 'DELETE_BLOCKED_USERS', + description=_('Delete blocked spammer accounts entirely'), + help_text=_( + 'When enabled, blocking a spammer deletes the user account ' + 'along with all their content, preventing accumulation of ' + 'dead accounts. When disabled, the account is kept with ' + 'blocked status (original behavior).' + ), + default=True + ) +) + +settings.register( + livesettings.BooleanValue( + SPAM_DEFENSE, + 'BAYESIAN_SPAM_SILENT_DELETE', + description=_('Silently delete obvious spam from new users'), + help_text=_( + 'When enabled, first posts that are flagged as spam (but not ham) ' + 'result in silent deletion of the user and post.' + ), + default=False + ) +) diff --git a/askbot/jinja2/email/post_confirmation/body.html b/askbot/jinja2/email/post_confirmation/body.html new file mode 100644 index 0000000000..ddf344c975 --- /dev/null +++ b/askbot/jinja2/email/post_confirmation/body.html @@ -0,0 +1,17 @@ +{% extends "email/base_mail.html"%} +{% block title %}{% trans %}Please confirm your post{% endtrans %}{% endblock %} +{% block headline %}{% trans %}Please confirm your post{% endtrans %}{% endblock %} + +{% block content %} +

{% trans %}Thank you for posting on {{ site_name }}. To prevent spam, we ask new users to confirm their first post.{% endtrans %}

+ +

{% trans %}Please follow the link below to confirm and publish your post:{% endtrans %}

+ +

{{ confirmation_link }}

+ +

{% trans %}This link will expire in 3 days. Posts that are not confirmed will be removed.{% endtrans %}

+{% endblock %} + +{% block footer %} +{% include "email/footer.html" %} +{% endblock %} diff --git a/askbot/jinja2/email/post_confirmation/subject.txt b/askbot/jinja2/email/post_confirmation/subject.txt new file mode 100644 index 0000000000..67b7691c69 --- /dev/null +++ b/askbot/jinja2/email/post_confirmation/subject.txt @@ -0,0 +1 @@ +{% trans %}Please confirm your post on {{ site_name }}{% endtrans %} diff --git a/askbot/jinja2/post_confirmation.html b/askbot/jinja2/post_confirmation.html new file mode 100644 index 0000000000..ff5cfa2b44 --- /dev/null +++ b/askbot/jinja2/post_confirmation.html @@ -0,0 +1,53 @@ +{% extends "base.html" %} +{% block title %}{% trans %}Confirm your post{% endtrans %}{% endblock %} + +{% block body %} +
+ +{% if error %} +

{% trans %}Post confirmation{% endtrans %}

+

{{ error }}

+{% elif confirmed and pending_moderation %} +

{% trans %}Post confirmed{% endtrans %}

+

{% trans %}Thank you for confirming your post. It has been submitted for moderator review and will appear on the site once approved.{% endtrans %}

+{% elif confirmed %} +

{% trans %}Post published!{% endtrans %}

+

{% trans %}Your post has been published. Thank you!{% endtrans %}

+

{% trans %}View your post{% endtrans %}

+{% else %} +

{% trans %}Confirm your post{% endtrans %}

+ +
+

{% trans %}Your post preview:{% endtrans %}

+ {{ post_html|safe }} +
+ +
+ {{ csrf_input }} +

+ +

+

+ +

+
+ + +{% endif %} + +
+{% endblock %} diff --git a/askbot/mail/messages.py b/askbot/mail/messages.py index 5006e96726..5af0f46c6d 100644 --- a/askbot/mail/messages.py +++ b/askbot/mail/messages.py @@ -841,6 +841,22 @@ def process_context(self, context): # 'recipient_user': get_user() # } +class PostConfirmationEmail(BaseEmail): + template_path = 'email/post_confirmation' + title = _('Post confirmation') + description = _('Sent to new users to confirm their first post via email') + mock_contexts = ({'key': 'abc123def456'},) + + def process_context(self, context): + context.update({ + 'site_name': askbot_settings.APP_SHORT_NAME, + 'recipient_user': None, + 'confirmation_link': site_url(reverse('confirm_post', + kwargs={'key': context['key']})) + }) + return context + + class FeedbackEmail(BaseEmail): template_path = 'email/feedback' title = _('Feedback email') diff --git a/askbot/management/commands/askbot_delete_expired_confirmations.py b/askbot/management/commands/askbot_delete_expired_confirmations.py new file mode 100644 index 0000000000..80c30ab458 --- /dev/null +++ b/askbot/management/commands/askbot_delete_expired_confirmations.py @@ -0,0 +1,18 @@ +"""Delete expired unconfirmed first-post confirmations. + +Removes the post and (if the user has no other posts) the user. +Intended to be run periodically via cron, e.g. daily. 
+ +Usage: + python manage.py askbot_delete_expired_confirmations +""" +from django.core.management.base import BaseCommand + + +class Command(BaseCommand): + help = 'Delete expired unconfirmed first-post confirmations and their users' + + def handle(self, *args, **kwargs): + from askbot.models.post_confirmation import PostConfirmation + count = PostConfirmation.delete_expired_unconfirmed() + self.stdout.write(f'Deleted {count} expired unconfirmed confirmation(s).') diff --git a/askbot/management/commands/askbot_train_spam_filter.py b/askbot/management/commands/askbot_train_spam_filter.py new file mode 100644 index 0000000000..d23f5c5eb5 --- /dev/null +++ b/askbot/management/commands/askbot_train_spam_filter.py @@ -0,0 +1,137 @@ +"""Train the dual Bayesian spam filter models. + +Usage: + python manage.py askbot_train_spam_filter --from-db + python manage.py askbot_train_spam_filter --input spam-ham.json + python manage.py askbot_train_spam_filter --input spam-ham.json --model spam +""" +import json +import os + +from django.conf import settings as django_settings +from django.core.management.base import BaseCommand, CommandError + + +class Command(BaseCommand): + help = 'Train the Bayesian spam filter (spam and/or ham models)' + + def add_arguments(self, parser): + parser.add_argument( + '--from-db', action='store_true', default=False, + help='Train from live database (blocked users = spam, approved users = ham)' + ) + parser.add_argument( + '--input', dest='input_file', default=None, + help='Path to JSON file from askbot_get_spam_training_set' + ) + parser.add_argument( + '--model', choices=['spam', 'ham', 'both'], default='both', + help='Which model(s) to train (default: both)' + ) + + def handle(self, *args, **kwargs): + from_db = kwargs['from_db'] + input_file = kwargs['input_file'] + model_choice = kwargs['model'] + + if not from_db and not input_file: + raise CommandError('Specify --from-db or --input FILE') + + if from_db and input_file: + raise 
CommandError('Specify only one of --from-db or --input') + + if input_file: + spam_texts, ham_texts = self._load_from_file(input_file) + else: + spam_texts, ham_texts = self._load_from_db() + + self.stdout.write(f'Spam examples: {len(spam_texts)}') + self.stdout.write(f'Ham examples: {len(ham_texts)}') + + if model_choice in ('spam', 'both'): + if len(spam_texts) < 10: + self.stderr.write('WARNING: Very few spam examples, model may be unreliable') + self._train_model('spam', spam_texts, ham_texts) + + if model_choice in ('ham', 'both'): + if len(ham_texts) < 10: + self.stderr.write('WARNING: Very few ham examples, model may be unreliable') + self._train_model('ham', ham_texts, spam_texts) + + self.stdout.write(self.style.SUCCESS('Training complete.')) + + def _load_from_file(self, path): + if not os.path.exists(path): + raise CommandError(f'File not found: {path}') + with open(path, 'r', encoding='utf-8') as f: + data = json.load(f) + return data.get('spam', []), data.get('ham', []) + + def _load_from_db(self): + from askbot.models import Post, User + from askbot import const + + self.stdout.write('Loading training data from database...') + + from django.db.models import Count + spam_users = User.objects.filter( + askbot_profile__reputation=const.MIN_REPUTATION, + askbot_profile__status='b' + ).annotate(post_count=Count('posts')).filter(post_count=1) + spam_posts = Post.objects.filter( + author__in=spam_users, + post_type__in=('question', 'answer', 'comment') + ).only('text') + spam_texts = [p.text for p in spam_posts] + + ham_users = User.objects.filter( + askbot_profile__reputation__gte=10 + ).order_by('-askbot_profile__reputation') + ham_posts = Post.objects.filter( + author__in=ham_users, + post_type__in=('question', 'answer', 'comment') + ).only('text')[:3000] + ham_texts = [p.text for p in ham_posts] + + return spam_texts, ham_texts + + def _train_model(self, model_type, positive_texts, negative_texts): + import numpy as np + from 
sklearn.feature_extraction.text import TfidfVectorizer + from sklearn.model_selection import train_test_split + from sklearn.naive_bayes import MultinomialNB + from sklearn.metrics import classification_report + + self.stdout.write(f'\nTraining {model_type} model...') + + all_texts = positive_texts + negative_texts + labels = np.array([1] * len(positive_texts) + [0] * len(negative_texts)) + + X_train_text, X_test_text, y_train, y_test = train_test_split( + all_texts, labels, test_size=0.2, random_state=42, stratify=labels + ) + + vectorizer = TfidfVectorizer( + max_features=50000, + ngram_range=(1, 2), + min_df=2 + ) + X_train = vectorizer.fit_transform(X_train_text) + X_test = vectorizer.transform(X_test_text) + + classifier = MultinomialNB(alpha=0.1) + classifier.fit(X_train, y_train) + + y_pred = classifier.predict(X_test) + report = classification_report(y_test, y_pred, target_names=['negative', 'positive']) + self.stdout.write(f'\n{model_type.upper()} model evaluation:\n{report}') + + import joblib + model_dir = os.path.join(django_settings.MEDIA_ROOT, 'spam_filter') + os.makedirs(model_dir, exist_ok=True) + + vec_path = os.path.join(model_dir, f'{model_type}_vectorizer.joblib') + clf_path = os.path.join(model_dir, f'{model_type}_classifier.joblib') + joblib.dump(vectorizer, vec_path) + joblib.dump(classifier, clf_path) + self.stdout.write(f'Saved {model_type} model to {model_dir}') diff --git a/askbot/migrations/0036_postconfirmation.py b/askbot/migrations/0036_postconfirmation.py new file mode 100644 index 0000000000..4e05eaa762 --- /dev/null +++ b/askbot/migrations/0036_postconfirmation.py @@ -0,0 +1,43 @@ +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion +import django.utils.timezone +import askbot.models.post_confirmation + + +class Migration(migrations.Migration): + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ('askbot', 
'0035_set_global_group_used_for_analytics'), + ] + + operations = [ + migrations.CreateModel( + name='PostConfirmation', + fields=[ + ('key', models.CharField( + default=askbot.models.post_confirmation._make_key, + max_length=64, + primary_key=True, + serialize=False, + )), + ('created_at', models.DateTimeField(default=django.utils.timezone.now)), + ('confirmed_at', models.DateTimeField(blank=True, null=True)), + ('expires_on', models.DateTimeField(blank=True)), + ('post', models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name='confirmations', + to='askbot.post', + )), + ('user', models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name='post_confirmations', + to=settings.AUTH_USER_MODEL, + )), + ], + options={ + 'app_label': 'askbot', + }, + ), + ] diff --git a/askbot/models/__init__.py b/askbot/models/__init__.py index fad41eba3e..f727cecf11 100644 --- a/askbot/models/__init__.py +++ b/askbot/models/__init__.py @@ -37,6 +37,7 @@ from askbot.const import message_keys from askbot.conf import settings as askbot_settings from askbot.models.question import Thread +from askbot.models.post_confirmation import PostConfirmation import askbot.models.analytics from askbot.skins import utils as skin_utils from askbot.mail.messages import (WelcomeEmail, @@ -3497,6 +3498,14 @@ def user_approve_post_revision(user, post_revision, timestamp = None): was_approved=True ) + # Incremental ham learning: feed approved post text to the ham model + if askbot_settings.SPAM_FILTER_ENABLED and post_revision.text: + try: + from askbot.spam_checker.bayesian_spam_checker import retrain_ham_incremental + retrain_ham_incremental([post_revision.text]) + except Exception: + logging.exception('Error in Bayesian ham incremental training') + @auto_now_timestamp def flag_post( user, post, timestamp=None, cancel=False, cancel_all=False, force=False @@ -4790,6 +4799,26 @@ def handle_posts_marked_as_spam(sender, post_ids, **kwargs): 
dispatch_uid='handle_posts_marked_as_spam' ) + +def handle_spam_for_bayesian_training(sender, post_ids, **kwargs): + """Feed spam posts to the Bayesian spam model for incremental learning.""" + if not askbot_settings.SPAM_FILTER_ENABLED: + return + try: + from askbot.spam_checker.bayesian_spam_checker import retrain_spam_incremental + posts = Post.objects.filter(pk__in=post_ids).only('text') + texts = [p.text for p in posts if p.text] + if texts: + retrain_spam_incremental(texts) + except Exception: + logging.exception('Error in Bayesian spam incremental training') + + +signals.posts_marked_as_spam.connect( + handle_spam_for_bayesian_training, + dispatch_uid='handle_spam_for_bayesian_training' +) + #set up a possibility for the users to follow others #try: # import followit diff --git a/askbot/models/post_confirmation.py b/askbot/models/post_confirmation.py new file mode 100644 index 0000000000..05bda70046 --- /dev/null +++ b/askbot/models/post_confirmation.py @@ -0,0 +1,127 @@ +"""Model for first-post email confirmation. + +When a watched user makes their first post, we save the post as unapproved +and send a confirmation email. The user must click the link and confirm +to publish the post and get promoted to approved status. + +Unconfirmed posts are deleted after expiry (3 days) along with the user +if they have no other posts. 
+""" +import logging +import uuid +from datetime import timedelta + +from django.conf import settings +from django.db import models +from django.utils import timezone + +LOG = logging.getLogger(__name__) + + +def _make_key(): + return uuid.uuid4().hex + + +class PostConfirmation(models.Model): + """Tracks pending first-post confirmations.""" + key = models.CharField(max_length=64, primary_key=True, default=_make_key) + post = models.ForeignKey( + 'askbot.Post', + on_delete=models.CASCADE, + related_name='confirmations' + ) + user = models.ForeignKey( + settings.AUTH_USER_MODEL, + on_delete=models.CASCADE, + related_name='post_confirmations' + ) + created_at = models.DateTimeField(default=timezone.now) + confirmed_at = models.DateTimeField(null=True, blank=True) + expires_on = models.DateTimeField(blank=True) + + class Meta: + app_label = 'askbot' + + def save(self, *args, **kwargs): + if not self.expires_on: + self.expires_on = (self.created_at or timezone.now()) + timedelta(days=3) + super().save(*args, **kwargs) + + @property + def is_expired(self): + return timezone.now() > self.expires_on + + @property + def is_confirmed(self): + return self.confirmed_at is not None + + def confirm(self): + """Handle email confirmation of a first post.""" + if self.is_confirmed: + return + + self.confirmed_at = timezone.now() + self.save(update_fields=['confirmed_at']) + + from askbot.conf import settings as askbot_settings + if askbot_settings.FIRST_POST_MODERATE_AFTER_CONFIRMATION: + return + + self._approve_post() + + def _approve_post(self): + """Approve the post and promote the user to approved status.""" + post = self.post + post.approved = True + post.save(update_fields=['approved']) + + revision = post.get_latest_revision() + if revision and not revision.approved: + revision.approved = True + revision.approved_by = self.user + revision.approved_at = self.confirmed_at + revision.save(update_fields=['approved', 'approved_by', 'approved_at']) + + if post.post_type == 
'question': + post.thread.approved = True + post.thread.save(update_fields=['approved']) + + post.thread.reset_cached_data() + + profile = self.user.askbot_profile + if profile.status == 'w': + profile.status = 'a' + profile.save(update_fields=['status']) + + @classmethod + def delete_expired_unconfirmed(cls): + """Delete expired unconfirmed posts and their users.""" + from askbot.models.post import Post + expired = cls.objects.filter( + confirmed_at__isnull=True, + expires_on__lt=timezone.now() + ).select_related('post', 'user') + + count = 0 + for confirmation in expired: + user = confirmation.user + post = confirmation.post + try: + post.delete() + has_other_posts = Post.objects.filter( + author=user + ).exists() + if not has_other_posts: + user.delete() + LOG.info('Deleted unconfirmed user %s (no posts)', user.id) + confirmation.delete() + count += 1 + except Exception: + LOG.exception( + 'Error cleaning up expired confirmation %s', + confirmation.key[:8] + ) + return count + + def __str__(self): + return f'PostConfirmation(key={self.key[:8]}..., user={self.user_id})' diff --git a/askbot/spam_checker/bayesian_spam_checker.py b/askbot/spam_checker/bayesian_spam_checker.py new file mode 100644 index 0000000000..184890fb58 --- /dev/null +++ b/askbot/spam_checker/bayesian_spam_checker.py @@ -0,0 +1,155 @@ +"""Dual Bayesian spam filter with independent spam and ham models. + +The spam model detects spam content; the ham model detects legitimate content. +Together they provide a dual-check: content that matches spam but also matches +ham gets the benefit of the doubt (the ham model "rescues" the post). + +Models are loaded lazily from disk (joblib) and cached at module level. +Thread-safe via threading.Lock. + +Fails open: if models are not found, is_spam returns False and is_ham returns False. 
+""" +import logging +import os +import threading + +from django.conf import settings as django_settings + +LOG = logging.getLogger(__name__) + +_lock = threading.Lock() +_spam_vectorizer = None +_spam_classifier = None +_ham_vectorizer = None +_ham_classifier = None +_models_loaded = False + + +def _get_model_dir(): + """Return the directory where model files are stored.""" + media_root = getattr(django_settings, 'MEDIA_ROOT', '') + return os.path.join(media_root, 'spam_filter') + + +def _load_models(): + """Load all models from disk. Called once, lazily.""" + global _spam_vectorizer, _spam_classifier + global _ham_vectorizer, _ham_classifier + global _models_loaded + + model_dir = _get_model_dir() + try: + import joblib + spam_vec_path = os.path.join(model_dir, 'spam_vectorizer.joblib') + spam_clf_path = os.path.join(model_dir, 'spam_classifier.joblib') + ham_vec_path = os.path.join(model_dir, 'ham_vectorizer.joblib') + ham_clf_path = os.path.join(model_dir, 'ham_classifier.joblib') + + if os.path.exists(spam_vec_path) and os.path.exists(spam_clf_path): + _spam_vectorizer = joblib.load(spam_vec_path) + _spam_classifier = joblib.load(spam_clf_path) + LOG.info('Spam model loaded from %s', model_dir) + else: + LOG.warning('Spam model files not found in %s', model_dir) + + if os.path.exists(ham_vec_path) and os.path.exists(ham_clf_path): + _ham_vectorizer = joblib.load(ham_vec_path) + _ham_classifier = joblib.load(ham_clf_path) + LOG.info('Ham model loaded from %s', model_dir) + else: + LOG.warning('Ham model files not found in %s', model_dir) + + except Exception: + LOG.exception('Error loading spam filter models from %s', model_dir) + + _models_loaded = True + + +def _ensure_loaded(): + """Ensure models are loaded (thread-safe lazy init).""" + global _models_loaded + if not _models_loaded: + with _lock: + if not _models_loaded: + _load_models() + + +def reload_models(): + """Force re-read models from disk.""" + global _models_loaded + with _lock: + _models_loaded = 
False + _ensure_loaded() + + +def is_spam(text, **kwargs): + """Check if text is spam. Conforms to the spam checker plugin interface. + + Returns True if the spam model classifies the text as spam. + Returns False if the model is not loaded (fails open). + """ + _ensure_loaded() + if _spam_vectorizer is None or _spam_classifier is None: + return False + try: + features = _spam_vectorizer.transform([text]) + prediction = _spam_classifier.predict(features)[0] + return bool(prediction == 1) + except Exception: + LOG.exception('Error in spam classification') + return False + + +def is_ham(text): + """Check if text matches the ham (legitimate content) model. + + Returns True if the ham model classifies the text as ham. + Returns False if the model is not loaded (fails open). + """ + _ensure_loaded() + if _ham_vectorizer is None or _ham_classifier is None: + return False + try: + features = _ham_vectorizer.transform([text]) + prediction = _ham_classifier.predict(features)[0] + return bool(prediction == 1) + except Exception: + LOG.exception('Error in ham classification') + return False + + +def check_content(text): + """Dual-check: returns (spam_detected, ham_detected) tuple.""" + return (is_spam(text), is_ham(text)) + + +def retrain_spam_incremental(texts): + """Incrementally train the spam model with new spam examples using partial_fit.""" + _ensure_loaded() + if _spam_vectorizer is None or _spam_classifier is None: + LOG.warning('Cannot retrain spam model: model not loaded') + return + try: + import numpy as np + features = _spam_vectorizer.transform(texts) + labels = np.ones(len(texts), dtype=int) + _spam_classifier.partial_fit(features, labels) + LOG.info('Spam model incrementally updated with %d examples', len(texts)) + except Exception: + LOG.exception('Error in incremental spam retraining') + + +def retrain_ham_incremental(texts): + """Incrementally train the ham model with new ham examples using partial_fit.""" + _ensure_loaded() + if _ham_vectorizer is None or 
_ham_classifier is None: + LOG.warning('Cannot retrain ham model: model not loaded') + return + try: + import numpy as np + features = _ham_vectorizer.transform(texts) + labels = np.ones(len(texts), dtype=int) + _ham_classifier.partial_fit(features, labels) + LOG.info('Ham model incrementally updated with %d examples', len(texts)) + except Exception: + LOG.exception('Error in incremental ham retraining') diff --git a/askbot/tests/test_spam_defense.py b/askbot/tests/test_spam_defense.py new file mode 100644 index 0000000000..e06075455a --- /dev/null +++ b/askbot/tests/test_spam_defense.py @@ -0,0 +1,201 @@ +"""Tests for spam defense: Bayesian check and first-post confirmation.""" +import datetime +from unittest.mock import patch, MagicMock + +from django.core.exceptions import PermissionDenied +from django.test import RequestFactory +from django.utils import timezone + +from askbot.tests.utils import AskbotTestCase, with_settings +from askbot.views.writers import _check_spam_and_act, _SilentSpamDeletion +from askbot.models.post_confirmation import PostConfirmation + + +class CheckSpamTests(AskbotTestCase): + """Tests for _check_spam_and_act().""" + + def setUp(self): + self.factory = RequestFactory() + self.watched = self.create_user('watched_spam', status='w') + self.approved = self.create_user('approved_spam', status='a') + + def _make_request(self, user=None): + request = self.factory.get('/') + request.user = user or self.watched + return request + + @with_settings(SPAM_FILTER_ENABLED=False) + def test_disabled_allows_all(self): + """When spam filter is disabled, _check_spam_and_act returns None.""" + request = self._make_request() + result = _check_spam_and_act(request, self.watched, 'buy viagra now') + self.assertIsNone(result) + + @with_settings(SPAM_FILTER_ENABLED=True) + @patch('askbot.spam_checker.bayesian_spam_checker.check_content', + return_value=(False, False)) + def test_no_spam_allows(self, mock_check): + """When no spam detected, returns None.""" + 
request = self._make_request() + result = _check_spam_and_act(request, self.watched, 'normal text') + self.assertIsNone(result) + + @with_settings(SPAM_FILTER_ENABLED=True) + @patch('askbot.spam_checker.bayesian_spam_checker.check_content', return_value=(True, True)) + def test_spam_ham_first_post_watched_held(self, mock_check): + """Spam+ham, watched user, no prior posts -> PermissionDenied.""" + request = self._make_request() + with self.assertRaises(PermissionDenied): + _check_spam_and_act(request, self.watched, 'spam text') + + @with_settings(SPAM_FILTER_ENABLED=True) + @patch('askbot.spam_checker.bayesian_spam_checker.check_content', return_value=(True, True)) + def test_spam_ham_approved_user_allows(self, mock_check): + """Spam+ham, user with prior posts -> returns None.""" + self.post_question(user=self.approved) + request = self._make_request(user=self.approved) + result = _check_spam_and_act(request, self.approved, 'spam text') + self.assertIsNone(result) + + @with_settings(SPAM_FILTER_ENABLED=True, BAYESIAN_SPAM_SILENT_DELETE=True) + @patch('askbot.spam_checker.bayesian_spam_checker.check_content', return_value=(True, False)) + def test_spam_only_silent_delete(self, mock_check): + """Spam only, watched, silent delete enabled -> user deleted.""" + request = self._make_request() + user_id = self.watched.id + with self.assertRaises(_SilentSpamDeletion): + _check_spam_and_act(request, self.watched, 'buy stuff') + from django.contrib.auth.models import User + self.assertFalse(User.objects.filter(id=user_id).exists()) + + @with_settings(SPAM_FILTER_ENABLED=True, BAYESIAN_SPAM_SILENT_DELETE=False) + @patch('askbot.spam_checker.bayesian_spam_checker.check_content', return_value=(True, False)) + def test_spam_only_held(self, mock_check): + """Spam only, watched, silent delete disabled -> PermissionDenied.""" + request = self._make_request() + with self.assertRaises(PermissionDenied): + _check_spam_and_act(request, self.watched, 'buy stuff') + + +class 
SendFirstPostConfirmationTests(AskbotTestCase): + """Tests for _send_first_post_confirmation().""" + + def setUp(self): + self.factory = RequestFactory() + self.watched = self.create_user('watched_conf', status='w') + self.approved = self.create_user('approved_conf', status='a') + + def _make_request(self, user): + request = self.factory.get('/') + request.user = user + return request + + @with_settings(FIRST_POST_EMAIL_CONFIRMATION=False) + def test_disabled_returns_false(self): + """When disabled, returns False.""" + from askbot.views.writers import _send_first_post_confirmation + question = self.post_question(user=self.watched) + request = self._make_request(self.watched) + result = _send_first_post_confirmation(request, self.watched, question) + self.assertFalse(result) + + @with_settings(FIRST_POST_EMAIL_CONFIRMATION=True) + def test_non_watched_returns_false(self): + """Approved user should return False.""" + from askbot.views.writers import _send_first_post_confirmation + question = self.post_question(user=self.approved) + request = self._make_request(self.approved) + result = _send_first_post_confirmation(request, self.approved, question) + self.assertFalse(result) + + @with_settings(FIRST_POST_EMAIL_CONFIRMATION=True) + def test_has_prior_posts_returns_false(self): + """Watched user with prior posts should return False.""" + from askbot.views.writers import _send_first_post_confirmation + prior = self.post_question(user=self.watched, title='prior question') + new_q = self.post_question(user=self.watched, title='new question') + request = self._make_request(self.watched) + result = _send_first_post_confirmation(request, self.watched, new_q) + self.assertFalse(result) + + @with_settings(FIRST_POST_EMAIL_CONFIRMATION=True) + @patch('askbot.mail.send_mail') + def test_first_post_creates_confirmation(self, mock_send): + """First post by watched user should create PostConfirmation.""" + from askbot.views.writers import _send_first_post_confirmation + question = 
self.post_question(user=self.watched) + request = self._make_request(self.watched) + + with patch('askbot.mail.messages.PostConfirmationEmail'): + result = _send_first_post_confirmation( + request, self.watched, question + ) + + self.assertTrue(result) + question.refresh_from_db() + self.assertFalse(question.approved) + self.assertTrue(PostConfirmation.objects.filter( + user=self.watched, post=question + ).exists()) + + +class PostConfirmationModelTests(AskbotTestCase): + """Tests for the PostConfirmation model.""" + + def setUp(self): + self.watched = self.create_user('pc_watched', status='w') + self.question = self.post_question(user=self.watched) + + def test_auto_expiry_date(self): + """expires_on should be approximately created_at + 3 days.""" + conf = PostConfirmation(post=self.question, user=self.watched) + conf.save() + expected = conf.created_at + datetime.timedelta(days=3) + diff = abs((conf.expires_on - expected).total_seconds()) + self.assertLess(diff, 1) + + def test_is_expired(self): + """is_expired should return True after expiry.""" + conf = PostConfirmation(post=self.question, user=self.watched) + conf.save() + + future = conf.expires_on + datetime.timedelta(seconds=1) + with patch('askbot.models.post_confirmation.timezone.now', + return_value=future): + self.assertTrue(conf.is_expired) + + @with_settings(FIRST_POST_MODERATE_AFTER_CONFIRMATION=False) + def test_confirm_approves_post_when_moderation_disabled(self): + """confirm() should approve the post and promote user.""" + conf = PostConfirmation(post=self.question, user=self.watched) + conf.save() + + self.question.approved = False + self.question.save(update_fields=['approved']) + + conf.confirm() + + self.question.refresh_from_db() + self.assertTrue(self.question.approved) + self.assertIsNotNone(conf.confirmed_at) + + self.watched.askbot_profile.refresh_from_db() + self.assertEqual(self.watched.askbot_profile.status, 'a') + + @with_settings(FIRST_POST_MODERATE_AFTER_CONFIRMATION=True) + def 
test_confirm_leaves_post_unapproved_when_moderation_enabled(self): + """confirm() should leave post unapproved when moderation is on.""" + conf = PostConfirmation(post=self.question, user=self.watched) + conf.save() + + self.question.approved = False + self.question.save(update_fields=['approved']) + + conf.confirm() + + self.question.refresh_from_db() + self.assertFalse(self.question.approved) + self.assertIsNotNone(conf.confirmed_at) + + self.watched.askbot_profile.refresh_from_db() + self.assertEqual(self.watched.askbot_profile.status, 'w') diff --git a/askbot/urls.py b/askbot/urls.py index 2b41807871..2d5a0ba6cd 100644 --- a/askbot/urls.py +++ b/askbot/urls.py @@ -641,7 +641,12 @@ re_path('^api/v1/questions/$', views.api_v1.questions, name='api_v1_questions'), re_path('^api/v1/questions/(?P\d+)/$', views.api_v1.question, name='api_v1_question'), re_path('^api/v1/answers/(?P\d+)/$', views.api_v1.answer, name='api_v1_answer'), - re_path('^colors/', views.meta.colors, name='colors') + re_path('^colors/', views.meta.colors, name='colors'), + service_url( + r'^confirm-post/(?P<key>[a-f0-9]+)/$', + views.post_confirmation.confirm_post_view, + name='confirm_post' + ), ] if 'askbot.deps.django_authopenid' in settings.INSTALLED_APPS: diff --git a/askbot/views/__init__.py b/askbot/views/__init__.py index 9469df011c..b31058c688 100644 --- a/askbot/views/__init__.py +++ b/askbot/views/__init__.py @@ -13,6 +13,7 @@ from askbot.views import sharing from askbot.views import users from askbot.views import writers +from askbot.views import post_confirmation from django.conf import settings as django_settings if 'avatar' in django_settings.INSTALLED_APPS: from askbot.views import avatar_views diff --git a/askbot/views/post_confirmation.py b/askbot/views/post_confirmation.py new file mode 100644 index 0000000000..578c4c8e13 --- /dev/null +++ b/askbot/views/post_confirmation.py @@ -0,0 +1,51 @@ +"""Views for first-post email confirmation.""" +from django.shortcuts import render +from
django.http import HttpResponseRedirect +from django.utils.translation import gettext as _ + +from askbot.conf import settings as askbot_settings +from askbot.models.post_confirmation import PostConfirmation + +
+def confirm_post_view(request, key):
+    """Render the confirmation page for the held post identified by ``key``.
+
+    Unknown or expired keys get an error page; an already confirmed key gets
+    the outcome page. POST with ``confirm_checkbox`` set confirms the post,
+    any other request method previews it.
+    """
+    template = 'post_confirmation.html'
+
+    def error_page(message):
+        return render(request, template, {'error': message})
+
+    lookup = PostConfirmation.objects.select_related('post', 'user')
+    try:
+        confirmation = lookup.get(key=key)
+    except PostConfirmation.DoesNotExist:
+        return error_page(_('This confirmation link is not valid.'))
+
+    post = confirmation.post
+    if confirmation.is_confirmed:
+        # Link reused: report the state the post is in right now.
+        if post.approved:
+            outcome = {'confirmed': True, 'post_url': post.get_absolute_url()}
+        else:
+            outcome = {'confirmed': True, 'pending_moderation': True}
+        return render(request, template, outcome)
+
+    if confirmation.is_expired:
+        return error_page(_('This confirmation link has expired.'))
+
+    if request.method != 'POST':
+        preview = {'post_html': post.html or post.text, 'confirmation': confirmation}
+        return render(request, template, preview)
+
+    if not request.POST.get('confirm_checkbox'):
+        return error_page(_('Please check the confirmation checkbox.'))
+
+    confirmation.confirm()
+    if askbot_settings.FIRST_POST_MODERATE_AFTER_CONFIRMATION:
+        return render(request, template, {'confirmed': True, 'pending_moderation': True})
+    post_url = confirmation.post.get_absolute_url()
+    return render(request, template, {'confirmed': True, 'post_url': post_url})
diff --git a/askbot/views/writers.py b/askbot/views/writers.py index a970b36d23..e32130823f 100644 --- a/askbot/views/writers.py +++ b/askbot/views/writers.py @@ -51,6 +51,108 @@ from askbot.utils.slug import slugify from askbot import spam_checker + + +class _SilentSpamDeletion(Exception): + """Raised when a spam user is silently deleted.
+ Caught in the view to return a redirect without error message.""" + pass + +
+def _check_spam_and_act(request, user, text):
+    """Run the dual Bayesian spam check on ``text`` before it is saved.
+
+    Returns None when the post may proceed: the filter is disabled, no spam
+    was detected, or the ham model also matched for a user who is not a
+    watched first-time poster. ``request`` is accepted but not used.
+
+    Raises:
+        _SilentSpamDeletion: spam-only match on a watched user's first post
+            with BAYESIAN_SPAM_SILENT_DELETE on; ``user`` is deleted first.
+        PermissionDenied: every other spam verdict. The ask/answer/comment
+            views call this before creating the post, so nothing is stored.
+    """
+    if not askbot_settings.SPAM_FILTER_ENABLED:
+        return
+
+    from askbot.spam_checker.bayesian_spam_checker import check_content
+    spam_detected, ham_detected = check_content(text)
+    if not spam_detected:
+        return
+
+    # Only watched users need the "first post" lookup, so test the status
+    # first and skip the database query for everybody else.
+    watched_first_post = user.is_watched() and not models.Post.objects.filter(
+        author=user, deleted=False
+    ).exists()
+
+    if ham_detected and not watched_first_post:
+        return  # ham model also matched: benefit of the doubt
+
+    if (
+        watched_first_post
+        and not ham_detected
+        and askbot_settings.BAYESIAN_SPAM_SILENT_DELETE
+    ):
+        user.delete()
+        raise _SilentSpamDeletion()
+
+    raise exceptions.PermissionDenied(_('Your post is held for review'))
+ + +def _send_first_post_confirmation(request, user, post): + """If first-post email confirmation is enabled and user is watched, + create a PostConfirmation, mark post unapproved, send email, and + return True.
Otherwise return False."""
+    if not askbot_settings.FIRST_POST_EMAIL_CONFIRMATION:
+        return False
+
+    if not user.is_watched():
+        return False
+
+    # Skip when the user has any other non-deleted post, approved or not
+    has_prior = models.Post.objects.filter(
+        author=user, deleted=False
+    ).exclude(pk=post.pk).exists()
+    if has_prior:
+        return False
+
+    from askbot.models.post_confirmation import PostConfirmation
+    from askbot.mail.messages import PostConfirmationEmail
+    from askbot.mail import send_mail
+
+    confirmation = PostConfirmation(post=post, user=user)
+    confirmation.save()
+
+    post.approved = False
+    post.save(update_fields=['approved'])
+
+    # For questions, mark the thread unapproved as well
+    if post.post_type == 'question':
+        post.thread.approved = False
+        post.thread.save(update_fields=['approved'])
+
+    # Mark the latest revision (when there is one) unapproved too
+    revision = post.get_latest_revision()
+    if revision:
+        revision.approved = False
+        revision.save(update_fields=['approved'])
+
+    email = PostConfirmationEmail({'key': confirmation.key})
+    context = email.get_context()  # NOTE(review): result unused; confirm the call is needed
+    subject = email.render_subject()
+    body = email.render_body()
+    send_mail(
+        subject_line=subject,
+        body_text=body,
+        recipient_list=[user.email],
+    )
+
+    request.user.message_set.create(
+        message=_('Please check your email to confirm your post.')
+    )
+    return True
+ + #todo: make this work with csrf @csrf.csrf_exempt def upload(request):#ajax upload file to a question or answer @@ -228,13 +330,6 @@ def ask(request):#view used to ask a new question #group_id = form.cleaned_data.get('group_id', None) language = form.cleaned_data.get('language', None) - content = '{}\n\n{}\n\n{}'.format(title, tagnames, text) - spam_checker_params = spam_checker.get_params_from_request(request) - enabled = askbot_settings.SPAM_FILTER_ENABLED - if enabled and spam_checker.is_spam(content, **spam_checker_params): - message = _('Spam was detected in the post') - raise exceptions.PermissionDenied(message) - if
request.user.is_authenticated: drafts = models.DraftQuestion.objects.filter(author=request.user) drafts.delete() @@ -247,6 +342,9 @@ def ask(request):#view used to ask a new question if user: try: + content = '{}\n\n{}\n\n{}'.format(title, tagnames, text) + _check_spam_and_act(request, user, content) + question = user.post_question( title=title, body_text=text, @@ -259,12 +357,18 @@ def ask(request):#view used to ask a new question language=language, ip_addr=request.META.get('REMOTE_ADDR') ) + + if _send_first_post_confirmation(request, user, question): + return HttpResponseRedirect(reverse('index')) + signals.new_question_posted.send(None, question=question, user=user, form_data=form.cleaned_data ) return HttpResponseRedirect(question.get_absolute_url()) + except _SilentSpamDeletion: + return HttpResponseRedirect(reverse('index')) except exceptions.PermissionDenied as e: request.user.message_set.create(message = str(e)) return HttpResponseRedirect(reverse('index')) @@ -529,11 +633,7 @@ def answer(request, id, form_class=forms.AnswerForm):#process a new answer user = form.get_post_user(request.user) try: text = form.cleaned_data['text'] - spam_checker_params = spam_checker.get_params_from_request(request) - enabled = askbot_settings.SPAM_FILTER_ENABLED - if enabled and spam_checker.is_spam(text, **spam_checker_params): - message = _('Spam was detected in the post') - raise exceptions.PermissionDenied(message) + _check_spam_and_act(request, user, text) answer = form.save( question, @@ -541,6 +641,9 @@ def answer(request, id, form_class=forms.AnswerForm):#process a new answer ip_addr=request.META.get('REMOTE_ADDR') ) + if _send_first_post_confirmation(request, user, answer): + return HttpResponseRedirect(question.get_absolute_url()) + signals.new_answer_posted.send(None, answer=answer, user=user, @@ -548,6 +651,8 @@ def answer(request, id, form_class=forms.AnswerForm):#process a new answer ) return HttpResponseRedirect(answer.get_absolute_url()) + except 
_SilentSpamDeletion: + return HttpResponseRedirect(question.get_absolute_url()) except askbot_exceptions.AnswerAlreadyGiven as e: request.user.message_set.create(message = str(e)) answer = question.thread.get_answers_by_user(user)[0] @@ -678,23 +783,31 @@ def post_comments(request):#generic ajax handler to load comments to an object raise exceptions.PermissionDenied(askbot_settings.READ_ONLY_MESSAGE) text = form.cleaned_data['comment'] - spam_checker_params = spam_checker.get_params_from_request(request) - enabled = askbot_settings.SPAM_FILTER_ENABLED - if enabled and spam_checker.is_spam(text, **spam_checker_params): - message = _('Spam was detected in the post') - raise exceptions.PermissionDenied(message) + _check_spam_and_act(request, user, text) comment = user.post_comment( parent_post=post, body_text=text, ip_addr=request.META.get('REMOTE_ADDR') ) - signals.new_comment_posted.send(None, - comment=comment, - user=user, - form_data=form.cleaned_data + + if _send_first_post_confirmation(request, user, comment): + response = HttpResponse( + json.dumps([]), + content_type="application/json" + ) + else: + signals.new_comment_posted.send(None, + comment=comment, + user=user, + form_data=form.cleaned_data + ) + response = __generate_comments_json(post, user, avatar_size) + except _SilentSpamDeletion: + response = HttpResponse( + json.dumps([]), + content_type="application/json" ) - response = __generate_comments_json(post, user, avatar_size) except exceptions.PermissionDenied as e: response = HttpResponseForbidden(str(e), content_type="application/json") diff --git a/pyproject.toml b/pyproject.toml index fa925700a6..c2a720cbab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,6 +70,7 @@ dependencies = [ "requests-oauthlib>=1.2.0", "requirements-parser>=0.2.0", "responses>=0.9.0,<=0.23.1", + "scikit-learn>=1.3.0", "tldextract==5.1.2", "unidecode", "urllib3>=2,<3",