From 723151b8f22c827e2f2aab8081794be0d168c9e6 Mon Sep 17 00:00:00 2001 From: Bryan Gin-ge Chen Date: Wed, 25 Mar 2026 15:19:30 -0400 Subject: [PATCH 01/17] doc(analytics): refine 031 living plan with pre-implementation decisions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Resolve site config open question: SITE_ANALYTICS_ALLOWED_SITES env var - Resolve auth open question: no per-site tokens in v1 - Document IP extraction strategy (X-Forwarded-For → REMOTE_ADDR fallback) - Clarify CSRF handling via DRF authentication_classes pattern - Specify backup_policy.py table placement for all three new tables - Split chunk plan: A1 includes backup_policy + AGENTS.md, A3/A4 split by daily/monthly Co-Authored-By: Claude Sonnet 4.6 --- .../031-analytics-ingestion-design.md | 50 ++++++++++++------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/docs/design-decisions/031-analytics-ingestion-design.md b/docs/design-decisions/031-analytics-ingestion-design.md index 822eb228..1b1c71d9 100644 --- a/docs/design-decisions/031-analytics-ingestion-design.md +++ b/docs/design-decisions/031-analytics-ingestion-design.md @@ -49,14 +49,13 @@ ### HTTP ingestion contract - Endpoint: `POST /api/v1/analytics/collect` - Request payload (v1): - - `site` (slug; required) + - `site` (slug; required; must be in `SITE_ANALYTICS_ALLOWED_SITES`) - `path` (required) - `referrer` (optional) - - `token` (optional; validated when site is configured as token-required) - Behavior: - - CSRF-exempt endpoint for third-party/static-site calls. + - Uses DRF `APIView` with `authentication_classes = []` and `permission_classes = []`; this implicitly bypasses CSRF enforcement without needing `@csrf_exempt`. - Minimal synchronous work: validate, compute hash fields, write one row, return `204`. - - Reject malformed payloads with `400`; unauthorized token with `403`. + - Reject unknown `site` or malformed payloads with `400`. 
### Data model (planned) - `AnalyticsPageView` (raw) @@ -78,6 +77,7 @@ - Compute `visitor_month_hash = sha256(ip + normalized_user_agent + month_key + secret_salt)`. - `month_key` is UTC `YYYY-MM`; this intentionally prevents cross-month correlation. - `secret_salt` comes from env/config (e.g., `SITE_ANALYTICS_HASH_SALT`). +- IP extraction: read `X-Forwarded-For` first (set by Heroku's routing layer and most reverse proxies); fall back to `REMOTE_ADDR`. Take the first (leftmost) address from `X-Forwarded-For` to get the client IP. - Retain raw rows only for bounded backfill/debug windows (target: 12-18 months). ### Aggregation strategy @@ -88,7 +88,8 @@ - Wire schedules in `qb_site/qb_site/settings/base.py` with env-overridable intervals and retention days. ### Security and abuse controls -- Lightweight allowlist/denylist for `site` identifiers. +- Site allowlist via `SITE_ANALYTICS_ALLOWED_SITES` env var (comma-separated slugs); unknown slugs rejected with `400`. + - Per-site tokens deferred to v1.1. - Basic bot filtering: - denylist common bot user-agent substrings. - optional reject when user-agent is empty. @@ -97,7 +98,10 @@ ### Public sanitized backup compatibility - This data will flow through the public backup pipeline in `.github/workflows/upload_backup.yaml`. - Keep analytics tables and fields compatible with sanitization/export scripts (`scripts/sanitize_backup.py`, `scripts/export_for_analysis.py`). -- If analytics introduces potentially sensitive columns, update sanitization rules and manifest outputs in the same change. +- Backup policy for analytics tables: + - `site_analytics_analyticspageview` → `TRUNCATE_TABLES` (raw rows contain visitor hashes; exclude from public backup). + - `site_analytics_analyticsdailymetric`, `site_analytics_analyticsmonthlymetric` → `RETAIN_TABLES` (aggregate-only, safe to share). +- `scripts/backup_policy.py` must be updated in A1 alongside the migration; `repo_check_compose.sh` enforces coverage. 
- Treat this as a release gate for analytics schema changes, consistent with `docs/design-decisions/016-sanitized-backups.md`. ## Invariants / Subtleties @@ -110,21 +114,25 @@ ## Implementation Plan (Chunks) 1. `A1` App scaffold + settings wiring. - Create `site_analytics` app and add to `INSTALLED_APPS`. - - Add env settings for hash salt, retention days, and task periods. + - Add `AnalyticsPageView` raw model and initial migration. + - Add env settings: `SITE_ANALYTICS_HASH_SALT`, `SITE_ANALYTICS_ALLOWED_SITES` (comma-separated slugs), `SITE_ANALYTICS_RETENTION_DAYS`, task period vars. + - Update `scripts/backup_policy.py` with the three new tables. + - Create `qb_site/site_analytics/AGENTS.md` (and `CLAUDE.md`). 2. `A2` Raw ingestion endpoint + validation. - Add `POST /api/v1/analytics/collect` route and view. - - Implement payload validation, token check, hashing service, and raw insert. -3. `A3` Aggregate models + aggregation services. - - Add daily/monthly metric models and upsert services. - - Add management command entrypoints for manual runs. -4. `A4` Celery task scheduling. - - Add periodic tasks + beat schedule keys in base settings. - - Ensure retry-safe/idempotent task behavior. + - Implement payload validation, site allowlist check, IP extraction (X-Forwarded-For → REMOTE_ADDR fallback), hashing service, and raw insert. + - Per-site tokens deferred to v1.1. +3. `A3` Daily aggregate model + service + Celery task. + - Add `AnalyticsDailyMetric` model, upsert service, and `site_analytics.aggregate_daily_metrics` Celery task. + - Wire beat schedule entry in `base.py`. +4. `A4` Monthly aggregate model + service + Celery task + prune task. + - Add `AnalyticsMonthlyMetric` model, upsert service, and `site_analytics.aggregate_monthly_metrics` Celery task. + - Add `site_analytics.prune_old_pageviews` task. + - Wire beat schedule entries. 5. `A5` Admin + operational visibility. - Register admin for raw/aggregate models. 
- Add concise task result payloads/counters. 6. `A6` Retention and hardening. - - Add pruning task and tests. - Tune bot filtering and optional throttle policy. ## Validation Plan @@ -156,12 +164,20 @@ - Converted this file from generic guidance to a repo-specific living plan. - Anchored implementation to existing `qb_site` boundaries and task patterns. - No code implementation started yet; all chunks currently pending. +- 2026-03-25: + - Pre-implementation design review; resolved open questions and refined plan: + - CSRF: implicit via DRF `authentication_classes = []`, no decorator needed. + - Site config: `SITE_ANALYTICS_ALLOWED_SITES` comma-separated env var; per-site tokens deferred to v1.1. + - IP extraction: `X-Forwarded-For` (Heroku/proxy) → `REMOTE_ADDR` fallback in hashing service. + - Backup policy: raw pageviews truncated, daily/monthly aggregates retained; `backup_policy.py` update required in A1. + - `tasks/__init__.py` re-export pattern (consistent with syncer/analyzer) added to A1 scaffold. + - `AGENTS.md` creation required in A1. ## Open Questions -- Should `site` configuration live in DB (admin-editable) or settings/env (static)? +- ~~Should `site` configuration live in DB (admin-editable) or settings/env (static)?~~ Resolved: settings/env (`SITE_ANALYTICS_ALLOWED_SITES`) for v1. - Do we need per-path monthly aggregates in v1, or can we defer to v1.1? - What default retention window is acceptable for privacy/compliance expectations? -- Should endpoint auth be required for all sites from day one, or optional during bootstrap? +- ~~Should endpoint auth be required for all sites from day one, or optional during bootstrap?~~ Resolved: no per-site tokens in v1; `SITE_ANALYTICS_ALLOWED_SITES` allowlist is the only gate. 
## References - `docs/design-decisions/README.md` From 120bc75c85b845db6660f153628d016fd69cf285 Mon Sep 17 00:00:00 2001 From: Bryan Gin-ge Chen Date: Wed, 25 Mar 2026 15:30:37 -0400 Subject: [PATCH 02/17] =?UTF-8?q?feat(site=5Fanalytics):=20A1=20=E2=80=94?= =?UTF-8?q?=20app=20scaffold,=20AnalyticsPageView=20model,=20settings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - New `site_analytics` Django app with models/, services/, tasks/, tests/ layout - AnalyticsPageView raw event model: site, path, referrer, user_agent, occurred_at, visitor_month_hash; indexes on (site, occurred_at) and (occurred_at); initial migration generated - Settings: SITE_ANALYTICS_HASH_SALT, SITE_ANALYTICS_ALLOWED_SITES, SITE_ANALYTICS_RETENTION_DAYS, and task period vars in base.py - backup_policy.py: site_analytics_analyticspageview → TRUNCATE_TABLES (raw rows contain visitor hashes; excluded from public backup) - repo_check_compose.sh: add step 12/12 for site_analytics test suite - AGENTS.md files created/updated for new app and root/qb_site indexes Co-Authored-By: Claude Sonnet 4.6 --- AGENTS.md | 4 +- qb_site/AGENTS.md | 2 + qb_site/qb_site/settings/base.py | 11 +++++ qb_site/site_analytics/AGENTS.md | 47 +++++++++++++++++++ qb_site/site_analytics/CLAUDE.md | 1 + qb_site/site_analytics/__init__.py | 0 qb_site/site_analytics/apps.py | 7 +++ .../site_analytics/migrations/0001_initial.py | 30 ++++++++++++ qb_site/site_analytics/migrations/__init__.py | 0 qb_site/site_analytics/models/__init__.py | 3 ++ qb_site/site_analytics/models/pageview.py | 30 ++++++++++++ qb_site/site_analytics/services/__init__.py | 0 qb_site/site_analytics/tasks/__init__.py | 4 ++ qb_site/site_analytics/tests/__init__.py | 0 scripts/backup_policy.py | 3 ++ scripts/repo_check_compose.sh | 30 +++++++----- 16 files changed, 157 insertions(+), 15 deletions(-) create mode 100644 qb_site/site_analytics/AGENTS.md create mode 100644 qb_site/site_analytics/CLAUDE.md create mode 
100644 qb_site/site_analytics/__init__.py create mode 100644 qb_site/site_analytics/apps.py create mode 100644 qb_site/site_analytics/migrations/0001_initial.py create mode 100644 qb_site/site_analytics/migrations/__init__.py create mode 100644 qb_site/site_analytics/models/__init__.py create mode 100644 qb_site/site_analytics/models/pageview.py create mode 100644 qb_site/site_analytics/services/__init__.py create mode 100644 qb_site/site_analytics/tasks/__init__.py create mode 100644 qb_site/site_analytics/tests/__init__.py diff --git a/AGENTS.md b/AGENTS.md index d434c60c..9b18f595 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,7 +2,7 @@ ## Project Structure & Module Organization - `src/queueboard/` contains the legacy Python data pipeline: GraphQL queries under `queries/`, HTML assets in `static/`, and scripts like `dashboard.py`, `process.py`, and `suggest_reviewer.py`. -- `qb_site/` hosts the Django codebase; apps live in `qb_site/{core,syncer,analyzer,api,zulip_bot}/` and share settings from `qb_site/qb_site/settings/`. +- `qb_site/` hosts the Django codebase; apps live in `qb_site/{core,syncer,analyzer,api,zulip_bot,site_analytics}/` and share settings from `qb_site/qb_site/settings/`. - `scripts/` provides operational helpers; `test/` stores fixture JSON for dashboard regression checks; `docs/` captures architecture plans/decisions. ## Build, Test, and Development Commands @@ -61,7 +61,7 @@ Notes ## Keeping AGENTS.md Files Updated - Every directory with significant logic has its own `AGENTS.md` (mirrored as `CLAUDE.md`). Current locations: root, `qb_site/`, `qb_site/syncer/`, `qb_site/analyzer/`, - `qb_site/zulip_bot/`, `src/queueboard/`. + `qb_site/zulip_bot/`, `qb_site/site_analytics/`, `src/queueboard/`. - When you add, rename, or remove management commands, Celery tasks, key services, or directory structure, update the relevant `AGENTS.md` in the same commit/PR. 
- When you add a new app or significant sub-directory, create a matching `AGENTS.md` diff --git a/qb_site/AGENTS.md b/qb_site/AGENTS.md index 592f1b06..b7781659 100644 --- a/qb_site/AGENTS.md +++ b/qb_site/AGENTS.md @@ -8,11 +8,13 @@ - `analyzer`: derived queue/revision/dependency state and snapshots. - `api`: DRF views/serializers for queueboard surfaces. - `zulip_bot`: Zulip webhook/command integration and policies. + - `site_analytics`: privacy-preserving pageview ingestion and aggregation for static/funder-facing sites. - Keep new modules inside the owning app (`models/`, `services/`, `tasks/`, `management/commands/`, `tests/`). - App-specific guidance: - `qb_site/syncer/AGENTS.md` for ingestion, discovery/backfill, and sync admin workflows. - `qb_site/analyzer/AGENTS.md` for revision/queue/dependency sweeps and analytics models. - `qb_site/zulip_bot/AGENTS.md` for webhook/command/policy/registration behavior. + - `qb_site/site_analytics/AGENTS.md` for pageview ingestion, aggregation tasks, and privacy rules. 
## Core Commands ```bash diff --git a/qb_site/qb_site/settings/base.py b/qb_site/qb_site/settings/base.py index 38bcbb64..cd06a0ce 100644 --- a/qb_site/qb_site/settings/base.py +++ b/qb_site/qb_site/settings/base.py @@ -60,6 +60,7 @@ def env_optional_bounded_int(name: str, *, minimum: int, maximum: int) -> int | "analyzer", "api", "zulip_bot", + "site_analytics", ] MIDDLEWARE = [ @@ -362,6 +363,16 @@ def env_optional_bounded_int(name: str, *, minimum: int, maximum: int) -> int | ANALYZER_AREA_STATS_TTL_SECONDS = int(os.getenv("ANALYZER_AREA_STATS_TTL_SECONDS", ANALYZER_QUEUEBOARD_SNAPSHOT_TTL_SECONDS)) ANALYTICS_CONVERGENCE_PERIOD_SECONDS = int(os.getenv("ANALYTICS_CONVERGENCE_PERIOD_SECONDS", 900)) +# Site analytics settings +SITE_ANALYTICS_HASH_SALT = os.getenv("SITE_ANALYTICS_HASH_SALT", "") +SITE_ANALYTICS_ALLOWED_SITES: list[str] = [ + s.strip() for s in os.getenv("SITE_ANALYTICS_ALLOWED_SITES", "").split(",") if s.strip() +] +SITE_ANALYTICS_RETENTION_DAYS = int(os.getenv("SITE_ANALYTICS_RETENTION_DAYS", 540)) +SITE_ANALYTICS_DAILY_AGGREGATE_PERIOD_SECONDS = int(os.getenv("SITE_ANALYTICS_DAILY_AGGREGATE_PERIOD_SECONDS", 3600)) +SITE_ANALYTICS_MONTHLY_AGGREGATE_PERIOD_SECONDS = int(os.getenv("SITE_ANALYTICS_MONTHLY_AGGREGATE_PERIOD_SECONDS", 86400)) +SITE_ANALYTICS_PRUNE_PERIOD_SECONDS = int(os.getenv("SITE_ANALYTICS_PRUNE_PERIOD_SECONDS", 86400)) + # CI filter (opt-in allowlist mode) # Set mode to 'allowlist' to enable filtering by the following substrings; otherwise all contexts are ingested. SYNCER_CI_FILTER_MODE = os.getenv("SYNCER_CI_FILTER_MODE", "all").lower() diff --git a/qb_site/site_analytics/AGENTS.md b/qb_site/site_analytics/AGENTS.md new file mode 100644 index 00000000..fcd1694a --- /dev/null +++ b/qb_site/site_analytics/AGENTS.md @@ -0,0 +1,47 @@ +# Site Analytics Guidelines + +## Scope +- `qb_site/site_analytics/` implements privacy-preserving pageview ingestion and aggregation for static/funder-facing sites. 
+- Raw events in `AnalyticsPageView`; aggregate reporting in `AnalyticsDailyMetric` and `AnalyticsMonthlyMetric` (added in A3/A4). +- Design record: `docs/design-decisions/031-analytics-ingestion-design.md`. + +## Module Layout +- `models/pageview.py` — `AnalyticsPageView` raw event rows (immutable after insert). +- `models/daily_metric.py` — `AnalyticsDailyMetric` (added in A3). +- `models/monthly_metric.py` — `AnalyticsMonthlyMetric` (added in A4). +- `services/` — hashing, bot filtering, aggregation logic. +- `tasks/` — periodic Celery tasks for aggregation and pruning. +- `tests/` — unit and integration tests. +- API ingestion view: `qb_site/api/views/analytics_collect.py` (added in A2). + +## Key Settings (all env-overridable) +- `SITE_ANALYTICS_HASH_SALT` — required secret for visitor hash; empty string disables hashing safety checks in dev. +- `SITE_ANALYTICS_ALLOWED_SITES` — comma-separated site slugs; unknown slugs rejected with `400`. +- `SITE_ANALYTICS_RETENTION_DAYS` — raw pageview retention window (default 540 days / ~18 months). +- `SITE_ANALYTICS_DAILY_AGGREGATE_PERIOD_SECONDS` — beat period for daily aggregation task (default 3600). +- `SITE_ANALYTICS_MONTHLY_AGGREGATE_PERIOD_SECONDS` — beat period for monthly aggregation task (default 86400). +- `SITE_ANALYTICS_PRUNE_PERIOD_SECONDS` — beat period for retention pruning task (default 86400). + +## Task Surface +Celery task names (as registered via `@shared_task(name=…)`): + +- `site_analytics.aggregate_daily_metrics` — idempotent upsert of daily pageview/unique-visitor counts (added in A3). +- `site_analytics.aggregate_monthly_metrics` — idempotent upsert of monthly metrics; recomputes current + previous month (added in A4). +- `site_analytics.prune_old_pageviews` — deletes raw rows older than `SITE_ANALYTICS_RETENTION_DAYS` (added in A4). + +## Privacy Invariants +- Raw IP addresses are never stored. +- `visitor_month_hash = sha256(ip + normalized_user_agent + YYYY-MM + salt)`. 
+- IP is extracted from `X-Forwarded-For` (Heroku / reverse-proxy header) falling back to `REMOTE_ADDR`. +- Changing hashing semantics requires an explicit migration/versioning note in the design doc. + +## Backup Policy +- `site_analytics_analyticspageview` → TRUNCATE (raw rows contain visitor hashes; excluded from public backup). +- `site_analytics_analyticsdailymetric`, `site_analytics_analyticsmonthlymetric` → RETAIN (aggregate-only, safe to share). +- Update `scripts/backup_policy.py` whenever adding or removing tables. + +## Testing +```bash +uv run python qb_site/manage.py test site_analytics +bash scripts/repo_check_compose.sh +``` diff --git a/qb_site/site_analytics/CLAUDE.md b/qb_site/site_analytics/CLAUDE.md new file mode 100644 index 00000000..43c994c2 --- /dev/null +++ b/qb_site/site_analytics/CLAUDE.md @@ -0,0 +1 @@ +@AGENTS.md diff --git a/qb_site/site_analytics/__init__.py b/qb_site/site_analytics/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qb_site/site_analytics/apps.py b/qb_site/site_analytics/apps.py new file mode 100644 index 00000000..3f3b5c63 --- /dev/null +++ b/qb_site/site_analytics/apps.py @@ -0,0 +1,7 @@ +from django.apps import AppConfig + + +class SiteAnalyticsConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "site_analytics" + verbose_name = "Site Analytics" diff --git a/qb_site/site_analytics/migrations/0001_initial.py b/qb_site/site_analytics/migrations/0001_initial.py new file mode 100644 index 00000000..2884c7cd --- /dev/null +++ b/qb_site/site_analytics/migrations/0001_initial.py @@ -0,0 +1,30 @@ +# Generated by Django 5.2.6 on 2026-03-25 19:22 + +import django.utils.timezone +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='AnalyticsPageView', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, 
verbose_name='ID')), + ('site', models.CharField(max_length=100)), + ('path', models.CharField(max_length=2000)), + ('referrer', models.CharField(blank=True, default='', max_length=2000)), + ('user_agent', models.CharField(blank=True, default='', max_length=1000)), + ('occurred_at', models.DateTimeField(default=django.utils.timezone.now)), + ('visitor_month_hash', models.CharField(max_length=64)), + ], + options={ + 'indexes': [models.Index(fields=['site', 'occurred_at'], name='sa_pv_site_occurred_idx'), models.Index(fields=['occurred_at'], name='sa_pv_occurred_idx')], + }, + ), + ] diff --git a/qb_site/site_analytics/migrations/__init__.py b/qb_site/site_analytics/migrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qb_site/site_analytics/models/__init__.py b/qb_site/site_analytics/models/__init__.py new file mode 100644 index 00000000..2db40d15 --- /dev/null +++ b/qb_site/site_analytics/models/__init__.py @@ -0,0 +1,3 @@ +"""Site analytics models: raw events and aggregate reporting tables.""" + +from .pageview import AnalyticsPageView # noqa: F401 diff --git a/qb_site/site_analytics/models/pageview.py b/qb_site/site_analytics/models/pageview.py new file mode 100644 index 00000000..f6ae1704 --- /dev/null +++ b/qb_site/site_analytics/models/pageview.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +from django.db import models +from django.utils import timezone + + +class AnalyticsPageView(models.Model): + """Raw pageview event row. + + Rows are immutable after insert; never updated. + Raw IP is not stored; privacy-preserving monthly hash is used instead. + Retained for a bounded window (SITE_ANALYTICS_RETENTION_DAYS) then pruned. 
+ """ + + site = models.CharField(max_length=100) + path = models.CharField(max_length=2000) + referrer = models.CharField(max_length=2000, blank=True, default="") + user_agent = models.CharField(max_length=1000, blank=True, default="") + occurred_at = models.DateTimeField(default=timezone.now) + # sha256(ip + normalized_user_agent + YYYY-MM + salt) — no raw IP stored + visitor_month_hash = models.CharField(max_length=64) + + class Meta: + indexes = [ + models.Index(fields=["site", "occurred_at"], name="sa_pv_site_occurred_idx"), + models.Index(fields=["occurred_at"], name="sa_pv_occurred_idx"), + ] + + def __str__(self) -> str: + return f"{self.site}:{self.path} @ {self.occurred_at}" diff --git a/qb_site/site_analytics/services/__init__.py b/qb_site/site_analytics/services/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/qb_site/site_analytics/tasks/__init__.py b/qb_site/site_analytics/tasks/__init__.py new file mode 100644 index 00000000..2d7b3bd9 --- /dev/null +++ b/qb_site/site_analytics/tasks/__init__.py @@ -0,0 +1,4 @@ +"""Celery tasks for site analytics aggregation and retention. + +Populated in A3 (daily aggregate) and A4 (monthly aggregate + prune). +""" diff --git a/qb_site/site_analytics/tests/__init__.py b/qb_site/site_analytics/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/backup_policy.py b/scripts/backup_policy.py index 49490e27..06ec1600 100644 --- a/scripts/backup_policy.py +++ b/scripts/backup_policy.py @@ -5,6 +5,7 @@ # Tables expected to exist in backups and managed by this policy. # We include implicit Django auth M2M tables and django_migrations explicitly. BACKUP_TABLES: tuple[str, ...] 
= ( + "site_analytics_analyticspageview", "analyzer_analyzerconvergencesnapshot", "analyzer_areastatssnapshot", "analyzer_prdependency", @@ -88,6 +89,8 @@ "syncer_cishafetchstate", "syncer_repodiscoverystate", "syncer_githubwebhookdelivery", + # Raw analytics pageviews — contain visitor hashes; exclude from public backup + "site_analytics_analyticspageview", ) # Tables retained in sanitized dump. diff --git a/scripts/repo_check_compose.sh b/scripts/repo_check_compose.sh index 9dd8cdb0..a36edab3 100755 --- a/scripts/repo_check_compose.sh +++ b/scripts/repo_check_compose.sh @@ -51,16 +51,16 @@ PY fi if [ "${SKIP_COMPOSE_BUILD:-0}" != "1" ]; then - echo "[0/11] Building compose images (web/migrate/worker/beat) to pick up dependency changes" + echo "[0/12] Building compose images (web/migrate/worker/beat) to pick up dependency changes" docker compose build web migrate worker beat else - echo "[0/11] Skipping compose build (SKIP_COMPOSE_BUILD=1)" + echo "[0/12] Skipping compose build (SKIP_COMPOSE_BUILD=1)" fi -echo "[1/11] Reset compose services/networks to avoid stale startup state" +echo "[1/12] Reset compose services/networks to avoid stale startup state" docker compose down --remove-orphans >/dev/null 2>&1 || true -echo "[2/11] Validate GitHub GraphQL queries (host)" +echo "[2/12] Validate GitHub GraphQL queries (host)" if [ "${SKIP_GRAPHQL_VALIDATE:-0}" = "1" ]; then echo "Skipping GraphQL validation (SKIP_GRAPHQL_VALIDATE=1)" else @@ -71,7 +71,7 @@ else fi fi -echo "[3/11] Starting web (waits on db:healthy via depends_on)" +echo "[3/12] Starting web (waits on db:healthy via depends_on)" if ! docker compose up -d web; then echo "Compose failed to start services. Dumping service status and migrate logs..." >&2 docker compose ps || true @@ -79,33 +79,37 @@ if ! 
docker compose up -d web; then exit 1 fi -echo "[4/11] Validate backup policy coverage (compose)" +echo "[4/12] Validate backup policy coverage (compose)" docker compose exec -T web python scripts/validate_backup_policy.py -echo "[5/11] Django system checks (compose)" +echo "[5/12] Django system checks (compose)" docker compose exec -T web python qb_site/manage.py check -echo "[6/11] Dry-run makemigrations (compose)" +echo "[6/12] Dry-run makemigrations (compose)" docker compose exec -T web python qb_site/manage.py makemigrations --dry-run --check -echo "[7/11] Run core tests (compose)" +echo "[7/12] Run core tests (compose)" # Use higher verbosity to list skipped tests with reasons. docker compose exec -T web env DJANGO_SETTINGS_MODULE=qb_site.settings.ci python qb_site/manage.py test core # --verbosity 2 -echo "[8/11] Run syncer tests (compose)" +echo "[8/12] Run syncer tests (compose)" # Use higher verbosity to list skipped tests with reasons. docker compose exec -T web env DJANGO_SETTINGS_MODULE=qb_site.settings.ci python qb_site/manage.py test syncer # --verbosity 2 -echo "[9/11] Run analyzer tests (compose)" +echo "[9/12] Run analyzer tests (compose)" # Use higher verbosity to list skipped tests with reasons. docker compose exec -T web env DJANGO_SETTINGS_MODULE=qb_site.settings.ci python qb_site/manage.py test analyzer # --verbosity 2 -echo "[10/11] Run api tests (compose)" +echo "[10/12] Run api tests (compose)" # Use higher verbosity to list skipped tests with reasons. docker compose exec -T web env DJANGO_SETTINGS_MODULE=qb_site.settings.ci python qb_site/manage.py test api # --verbosity 2 -echo "[11/11] Run zulip_bot tests (compose)" +echo "[11/12] Run zulip_bot tests (compose)" # Use higher verbosity to list skipped tests with reasons. 
docker compose exec -T web env DJANGO_SETTINGS_MODULE=qb_site.settings.ci python qb_site/manage.py test zulip_bot # --verbosity 2 +echo "[12/12] Run site_analytics tests (compose)" +# Use higher verbosity to list skipped tests with reasons. +docker compose exec -T web env DJANGO_SETTINGS_MODULE=qb_site.settings.ci python qb_site/manage.py test site_analytics # --verbosity 2 + echo "Compose checks completed." From 15239856d9ae4afdadb7181dc908188846829f32 Mon Sep 17 00:00:00 2001 From: Bryan Gin-ge Chen Date: Wed, 25 Mar 2026 15:56:42 -0400 Subject: [PATCH 03/17] =?UTF-8?q?feat(site=5Fanalytics):=20A2=20=E2=80=94?= =?UTF-8?q?=20ingestion=20endpoint,=20hashing,=20bot-filter,=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - POST /api/v1/analytics/collect: validate site/path, check allowlist, drop bots silently (204), insert AnalyticsPageView row, return 204 - services/hashing.py: get_client_ip (XFF → REMOTE_ADDR fallback) and compute_visitor_month_hash (pipe-separated fields, UA lowercased) - services/bot_filter.py: substring denylist for known bots/crawlers - .env.example: SITE_ANALYTICS_HASH_SALT, ALLOWED_SITES, RETENTION_DAYS - tests/test_services.py: IP extraction, hash determinism/isolation, bot detection - tests/test_collect_view.py: 204 success, 400 validation, bot drop, no raw IP in row, XFF hash isolation, field truncation, empty allowlist - design doc: A1+A2 progress notes, three implementation subtleties recorded Co-Authored-By: Claude Sonnet 4.6 --- .env.example | 8 ++ .../031-analytics-ingestion-design.md | 11 +- qb_site/api/urls.py | 2 + qb_site/api/views/analytics_collect.py | 65 +++++++++ qb_site/site_analytics/services/bot_filter.py | 38 ++++++ qb_site/site_analytics/services/hashing.py | 34 +++++ .../site_analytics/tests/test_collect_view.py | 125 ++++++++++++++++++ qb_site/site_analytics/tests/test_services.py | 112 ++++++++++++++++ 8 files changed, 393 insertions(+), 2 deletions(-) create mode 
100644 qb_site/api/views/analytics_collect.py create mode 100644 qb_site/site_analytics/services/bot_filter.py create mode 100644 qb_site/site_analytics/services/hashing.py create mode 100644 qb_site/site_analytics/tests/test_collect_view.py create mode 100644 qb_site/site_analytics/tests/test_services.py diff --git a/.env.example b/.env.example index 0a48a121..0c61f90e 100644 --- a/.env.example +++ b/.env.example @@ -242,6 +242,14 @@ ANALYZER_AREA_STATS_TTL_SECONDS=300 # Convergence snapshots: how often to collect syncer/analyzer convergence counts (seconds) ANALYTICS_CONVERGENCE_PERIOD_SECONDS=900 +# Site analytics (pageview ingestion) +# Secret salt for visitor monthly hash (sha256-based; required in production). +SITE_ANALYTICS_HASH_SALT= +# Comma-separated slugs of sites allowed to post events; unknown slugs → 400. +SITE_ANALYTICS_ALLOWED_SITES= +# Raw pageview retention window in days (default 540 ≈ 18 months). +SITE_ANALYTICS_RETENTION_DAYS=540 + # CI filter (opt-in allowlist) # Set SYNCER_CI_FILTER_MODE=allowlist to enable filtering by the allow lists below. # Substrings matched case-insensitively against CheckRun.name and StatusContext.context. diff --git a/docs/design-decisions/031-analytics-ingestion-design.md b/docs/design-decisions/031-analytics-ingestion-design.md index 1b1c71d9..d122b3f8 100644 --- a/docs/design-decisions/031-analytics-ingestion-design.md +++ b/docs/design-decisions/031-analytics-ingestion-design.md @@ -112,13 +112,13 @@ - `site` taxonomy must remain stable; renames need explicit backfill/mapping handling. ## Implementation Plan (Chunks) -1. `A1` App scaffold + settings wiring. +1. `A1` ✓ App scaffold + settings wiring. - Create `site_analytics` app and add to `INSTALLED_APPS`. - Add `AnalyticsPageView` raw model and initial migration. - Add env settings: `SITE_ANALYTICS_HASH_SALT`, `SITE_ANALYTICS_ALLOWED_SITES` (comma-separated slugs), `SITE_ANALYTICS_RETENTION_DAYS`, task period vars. 
- Update `scripts/backup_policy.py` with the three new tables. - Create `qb_site/site_analytics/AGENTS.md` (and `CLAUDE.md`). -2. `A2` Raw ingestion endpoint + validation. +2. `A2` ✓ Raw ingestion endpoint + validation. - Add `POST /api/v1/analytics/collect` route and view. - Implement payload validation, site allowlist check, IP extraction (X-Forwarded-For → REMOTE_ADDR fallback), hashing service, and raw insert. - Per-site tokens deferred to v1.1. @@ -172,6 +172,13 @@ - Backup policy: raw pageviews truncated, daily/monthly aggregates retained; `backup_policy.py` update required in A1. - `tasks/__init__.py` re-export pattern (consistent with syncer/analyzer) added to A1 scaffold. - `AGENTS.md` creation required in A1. + - `A1` implemented: app scaffold, `AnalyticsPageView` model, settings, backup policy entry, `repo_check_compose.sh` wiring, AGENTS.md. + - No significant deviations from plan. `validate_backup_policy.py` passes without a live DB (uses Django app registry). + - `A2` implemented: hashing service, bot-filter service, `POST /api/v1/analytics/collect` view, URL wiring, `.env.example` entries, service + endpoint tests. + - Subtlety — field separator in hash: the design specified `sha256(ip + ua + month + salt)` with implicit concatenation, which is collision-prone (e.g. `"ab" + "c"` vs `"a" + "bc"`). Implemented with `|` pipe separators between all four fields. This is a privacy-contract detail: any future reimplementation must use the same separator or old and new hashes will diverge for the same visitor. + - Subtlety — bot traffic response: bots return `204` (same as success) rather than a distinct status code so detection heuristics are not leaked to callers. + - Subtlety — field truncation: `path`, `referrer`, and `user_agent` are silently truncated to their model `max_length` before insert rather than rejected, since truncation is preferable to dropping the event entirely for slightly over-long paths. 
+ - Subtlety — empty `SITE_ANALYTICS_ALLOWED_SITES`: an unconfigured (empty) allowlist rejects all requests with `400`; this is intentional opt-in behaviour that prevents accidental data collection before sites are explicitly registered. ## Open Questions - ~~Should `site` configuration live in DB (admin-editable) or settings/env (static)?~~ Resolved: settings/env (`SITE_ANALYTICS_ALLOWED_SITES`) for v1. diff --git a/qb_site/api/urls.py b/qb_site/api/urls.py index 8c82119d..b02db278 100644 --- a/qb_site/api/urls.py +++ b/qb_site/api/urls.py @@ -2,12 +2,14 @@ from django.urls import path from api.views import index +from api.views.analytics_collect import AnalyticsCollectView from api.views.queueboard_dependency_graph import QueueboardDependencyGraphView from api.views.queueboard_snapshot import QueueboardSnapshotView from api.views.reviewer_assignment import AreaStatsView, ReviewerAssignmentsView urlpatterns: list = [ path("", index, name="index"), + path("v1/analytics/collect", AnalyticsCollectView.as_view(), name="analytics-collect"), path("v1/queueboard/snapshot", QueueboardSnapshotView.as_view(), name="queueboard-snapshot"), path( "v1/queueboard/dependency-graph", diff --git a/qb_site/api/views/analytics_collect.py b/qb_site/api/views/analytics_collect.py new file mode 100644 index 00000000..636fe78c --- /dev/null +++ b/qb_site/api/views/analytics_collect.py @@ -0,0 +1,65 @@ +"""POST /api/v1/analytics/collect — lightweight pageview ingestion endpoint.""" + +from __future__ import annotations + +from django.conf import settings +from django.utils import timezone +from rest_framework import status +from rest_framework.request import Request +from rest_framework.response import Response +from rest_framework.views import APIView + +from site_analytics.models import AnalyticsPageView +from site_analytics.services.bot_filter import is_bot +from site_analytics.services.hashing import compute_visitor_month_hash, get_client_ip + +# Hard caps to guard against oversized 
payloads hitting DB column limits. +_PATH_MAX = 2000 +_REFERRER_MAX = 2000 +_UA_MAX = 1000 + + +class AnalyticsCollectView(APIView): + """Ingest a single pageview event. + + Intentionally minimal: validate, hash, insert, return 204. + All heavier work (aggregation, reporting) happens in periodic tasks. + """ + + authentication_classes: list = [] + permission_classes: list = [] + + def post(self, request: Request, *args: object, **kwargs: object) -> Response: + site = (request.data.get("site") or "").strip() + path = (request.data.get("path") or "").strip() + referrer = (request.data.get("referrer") or "").strip() + user_agent = request.META.get("HTTP_USER_AGENT", "").strip() + + if not site: + return Response({"detail": "site is required"}, status=status.HTTP_400_BAD_REQUEST) + if not path: + return Response({"detail": "path is required"}, status=status.HTTP_400_BAD_REQUEST) + + allowed_sites = settings.SITE_ANALYTICS_ALLOWED_SITES + if site not in allowed_sites: + return Response({"detail": "unknown site"}, status=status.HTTP_400_BAD_REQUEST) + + # Silently drop bot traffic rather than returning an error, to avoid + # leaking information about detection heuristics. 
+ if is_bot(user_agent): + return Response(status=status.HTTP_204_NO_CONTENT) + + now = timezone.now() + month_key = now.strftime("%Y-%m") + visitor_month_hash = compute_visitor_month_hash(get_client_ip(request), user_agent, month_key) + + AnalyticsPageView.objects.create( + site=site, + path=path[:_PATH_MAX], + referrer=referrer[:_REFERRER_MAX], + user_agent=user_agent[:_UA_MAX], + occurred_at=now, + visitor_month_hash=visitor_month_hash, + ) + + return Response(status=status.HTTP_204_NO_CONTENT) diff --git a/qb_site/site_analytics/services/bot_filter.py b/qb_site/site_analytics/services/bot_filter.py new file mode 100644 index 00000000..e6e01564 --- /dev/null +++ b/qb_site/site_analytics/services/bot_filter.py @@ -0,0 +1,38 @@ +"""Bot/crawler user-agent filtering for analytics ingestion.""" + +from __future__ import annotations + +# Case-insensitive substrings that identify known bots/crawlers/tools. +# Extend conservatively; false positives silently drop legitimate pageviews. +_BOT_UA_SUBSTRINGS: tuple[str, ...] = ( + "bot", + "crawler", + "spider", + "scraper", + "curl/", + "wget/", + "python-requests", + "python-urllib", + "go-http-client", + "java/", + "libwww", + "httpclient", + "okhttp", + "axios/", + "node-fetch", + "got/", + "undici", + "vercel", +) + + +def is_bot(user_agent: str) -> bool: + """Return True if the user-agent matches a known bot/crawler pattern. + + Empty user-agents are allowed (not treated as bots) in v1; that behaviour + can be tightened via a settings flag in a later chunk. 
+ """ + if not user_agent: + return False + ua_lower = user_agent.lower() + return any(sub in ua_lower for sub in _BOT_UA_SUBSTRINGS) diff --git a/qb_site/site_analytics/services/hashing.py b/qb_site/site_analytics/services/hashing.py new file mode 100644 index 00000000..71305a90 --- /dev/null +++ b/qb_site/site_analytics/services/hashing.py @@ -0,0 +1,34 @@ +"""Visitor hashing service for privacy-preserving pageview identity.""" + +from __future__ import annotations + +import hashlib + +from django.conf import settings +from django.http import HttpRequest + + +def get_client_ip(request: HttpRequest) -> str: + """Return the client IP address. + + Prefers the leftmost address in X-Forwarded-For (set by Heroku's routing + layer and most reverse proxies). Falls back to REMOTE_ADDR for direct + connections (local dev, tests). + """ + xff = request.META.get("HTTP_X_FORWARDED_FOR", "").strip() + if xff: + return xff.split(",")[0].strip() + return request.META.get("REMOTE_ADDR", "") + + +def compute_visitor_month_hash(ip: str, user_agent: str, month_key: str) -> str: + """Return sha256(ip | normalized_ua | month_key | salt) as a hex digest. + + Fields are joined with ``|`` to prevent cross-field collisions. + ``month_key`` must be a UTC ``YYYY-MM`` string so cross-month correlation + is impossible by construction. 
+ """ + salt = settings.SITE_ANALYTICS_HASH_SALT + normalized_ua = user_agent.strip().lower() + payload = f"{ip}|{normalized_ua}|{month_key}|{salt}" + return hashlib.sha256(payload.encode()).hexdigest() diff --git a/qb_site/site_analytics/tests/test_collect_view.py b/qb_site/site_analytics/tests/test_collect_view.py new file mode 100644 index 00000000..07c5162c --- /dev/null +++ b/qb_site/site_analytics/tests/test_collect_view.py @@ -0,0 +1,125 @@ +"""Endpoint tests for POST /api/v1/analytics/collect.""" + +from __future__ import annotations + +from django.test import TestCase, override_settings +from rest_framework.test import APIClient + +from site_analytics.models import AnalyticsPageView + +_ALLOWED = override_settings( + SITE_ANALYTICS_ALLOWED_SITES=["test-site"], + SITE_ANALYTICS_HASH_SALT="test-salt", +) + +URL = "/api/v1/analytics/collect" + + +@_ALLOWED +class AnalyticsCollectViewTests(TestCase): + def setUp(self) -> None: + self.client = APIClient() + + def _post(self, data: dict, **kwargs) -> object: + return self.client.post(URL, data, format="json", **kwargs) + + # --- success path --- + + def test_valid_payload_returns_204(self): + resp = self._post({"site": "test-site", "path": "/about"}) + self.assertEqual(resp.status_code, 204) + + def test_valid_payload_creates_pageview_row(self): + self._post({"site": "test-site", "path": "/about", "referrer": "https://example.com"}) + pv = AnalyticsPageView.objects.get() + self.assertEqual(pv.site, "test-site") + self.assertEqual(pv.path, "/about") + self.assertEqual(pv.referrer, "https://example.com") + self.assertEqual(len(pv.visitor_month_hash), 64) + + def test_referrer_optional(self): + resp = self._post({"site": "test-site", "path": "/home"}) + self.assertEqual(resp.status_code, 204) + self.assertEqual(AnalyticsPageView.objects.get().referrer, "") + + def test_user_agent_captured_from_header(self): + self._post( + {"site": "test-site", "path": "/"}, + HTTP_USER_AGENT="Mozilla/5.0 (Test)", + ) + 
self.assertEqual(AnalyticsPageView.objects.get().user_agent, "Mozilla/5.0 (Test)") + + # --- validation errors --- + + def test_missing_site_returns_400(self): + resp = self._post({"path": "/about"}) + self.assertEqual(resp.status_code, 400) + self.assertIn("site", resp.json()["detail"]) + + def test_missing_path_returns_400(self): + resp = self._post({"site": "test-site"}) + self.assertEqual(resp.status_code, 400) + self.assertIn("path", resp.json()["detail"]) + + def test_unknown_site_returns_400(self): + resp = self._post({"site": "unknown-site", "path": "/about"}) + self.assertEqual(resp.status_code, 400) + self.assertIn("site", resp.json()["detail"]) + + def test_empty_site_returns_400(self): + resp = self._post({"site": "", "path": "/about"}) + self.assertEqual(resp.status_code, 400) + + # --- bot filtering --- + + def test_bot_ua_returns_204_but_no_row(self): + resp = self._post( + {"site": "test-site", "path": "/about"}, + HTTP_USER_AGENT="Googlebot/2.1", + ) + self.assertEqual(resp.status_code, 204) + self.assertEqual(AnalyticsPageView.objects.count(), 0) + + # --- privacy: no raw IP stored --- + + def test_ip_not_stored_in_row(self): + self._post( + {"site": "test-site", "path": "/"}, + REMOTE_ADDR="1.2.3.4", + ) + pv = AnalyticsPageView.objects.get() + row_values = [str(v) for v in [pv.site, pv.path, pv.referrer, pv.user_agent, pv.visitor_month_hash]] + self.assertFalse(any("1.2.3.4" in v for v in row_values)) + + # --- XFF extraction --- + + def test_xff_used_for_hash_differs_from_remote_addr(self): + """Two requests with different XFF but same REMOTE_ADDR → different hashes.""" + self._post( + {"site": "test-site", "path": "/"}, + REMOTE_ADDR="10.0.0.1", + HTTP_X_FORWARDED_FOR="1.1.1.1", + ) + self._post( + {"site": "test-site", "path": "/"}, + REMOTE_ADDR="10.0.0.1", + HTTP_X_FORWARDED_FOR="2.2.2.2", + ) + hashes = list(AnalyticsPageView.objects.values_list("visitor_month_hash", flat=True)) + self.assertEqual(len(hashes), 2) + 
self.assertNotEqual(hashes[0], hashes[1]) + + # --- field truncation --- + + def test_oversized_path_is_truncated(self): + long_path = "/" + "a" * 3000 + resp = self._post({"site": "test-site", "path": long_path}) + self.assertEqual(resp.status_code, 204) + self.assertEqual(len(AnalyticsPageView.objects.get().path), 2000) + + # --- empty allowed list --- + + @override_settings(SITE_ANALYTICS_ALLOWED_SITES=[]) + def test_empty_allowed_sites_rejects_all(self): + resp = self._post({"site": "test-site", "path": "/about"}) + self.assertEqual(resp.status_code, 400) diff --git a/qb_site/site_analytics/tests/test_services.py b/qb_site/site_analytics/tests/test_services.py new file mode 100644 index 00000000..6d5fa953 --- /dev/null +++ b/qb_site/site_analytics/tests/test_services.py @@ -0,0 +1,112 @@ +"""Unit tests for site_analytics hashing and bot-filter services.""" + +from __future__ import annotations + +from unittest.mock import MagicMock + +from django.test import TestCase, override_settings + +from site_analytics.services.bot_filter import is_bot +from site_analytics.services.hashing import compute_visitor_month_hash, get_client_ip + + +class GetClientIpTests(TestCase): + def _make_request(self, remote_addr: str = "", xff: str = "") -> MagicMock: + req = MagicMock() + meta: dict[str, str] = {} + if remote_addr: + meta["REMOTE_ADDR"] = remote_addr + if xff: + meta["HTTP_X_FORWARDED_FOR"] = xff + req.META = meta + return req + + def test_remote_addr_used_when_no_xff(self): + req = self._make_request(remote_addr="1.2.3.4") + self.assertEqual(get_client_ip(req), "1.2.3.4") + + def test_xff_takes_precedence_over_remote_addr(self): + req = self._make_request(remote_addr="10.0.0.1", xff="5.6.7.8, 10.0.0.1") + self.assertEqual(get_client_ip(req), "5.6.7.8") + + def test_xff_single_address(self): + req = self._make_request(remote_addr="10.0.0.1", xff="9.9.9.9") + self.assertEqual(get_client_ip(req), "9.9.9.9") + + def test_xff_strips_whitespace(self): + req = 
self._make_request(xff=" 203.0.113.5 , 10.0.0.1") + self.assertEqual(get_client_ip(req), "203.0.113.5") + + def test_empty_xff_falls_back_to_remote_addr(self): + req = self._make_request(remote_addr="1.2.3.4", xff="") + self.assertEqual(get_client_ip(req), "1.2.3.4") + + def test_missing_both_returns_empty_string(self): + req = self._make_request() + self.assertEqual(get_client_ip(req), "") + + +@override_settings(SITE_ANALYTICS_HASH_SALT="test-salt") +class ComputeVisitorMonthHashTests(TestCase): + def test_returns_64_char_hex(self): + result = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-03") + self.assertEqual(len(result), 64) + self.assertTrue(all(c in "0123456789abcdef" for c in result)) + + def test_deterministic(self): + a = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-03") + b = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-03") + self.assertEqual(a, b) + + def test_different_month_keys_produce_different_hashes(self): + a = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-03") + b = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-04") + self.assertNotEqual(a, b) + + def test_different_ips_produce_different_hashes(self): + a = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-03") + b = compute_visitor_month_hash("1.2.3.5", "Mozilla/5.0", "2026-03") + self.assertNotEqual(a, b) + + def test_ua_normalized_case_insensitive(self): + a = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-03") + b = compute_visitor_month_hash("1.2.3.4", "MOZILLA/5.0", "2026-03") + self.assertEqual(a, b) + + @override_settings(SITE_ANALYTICS_HASH_SALT="other-salt") + def test_different_salts_produce_different_hashes(self): + a = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-03") + with self.settings(SITE_ANALYTICS_HASH_SALT="test-salt"): + b = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-03") + self.assertNotEqual(a, b) + + +class IsBotTests(TestCase): + def 
test_known_bot_substrings(self): + bot_uas = [ + "Googlebot/2.1", + "Mozilla/5.0 (compatible; bingbot/2.0)", + "curl/7.68.0", + "wget/1.20", + "python-requests/2.28.0", + "Go-http-client/1.1", + "Java/11.0", + "okhttp/4.9.0", + "axios/1.3.0", + ] + for ua in bot_uas: + with self.subTest(ua=ua): + self.assertTrue(is_bot(ua), f"Expected {ua!r} to be detected as bot") + + def test_legitimate_browser_uas(self): + browser_uas = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15", + "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0", + ] + for ua in browser_uas: + with self.subTest(ua=ua): + self.assertFalse(is_bot(ua), f"Expected {ua!r} not to be detected as bot") + + def test_empty_ua_is_not_bot(self): + self.assertFalse(is_bot("")) From 802d73ee87343d8484c20586e813a7c546f88971 Mon Sep 17 00:00:00 2001 From: Bryan Gin-ge Chen Date: Wed, 25 Mar 2026 15:59:12 -0400 Subject: [PATCH 04/17] =?UTF-8?q?feat(site=5Fanalytics):=20A3=20=E2=80=94?= =?UTF-8?q?=20daily=20aggregate=20model,=20service,=20Celery=20task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - AnalyticsDailyMetric model: site, date (UTC), pageviews, unique_visitors; unique constraint on (site, date); migration 0002 - aggregate_daily_metrics service: idempotent upsert over a rolling days_back window; preserves existing aggregates when raw rows have been pruned - site_analytics.aggregate_daily_metrics Celery task + beat schedule entry - backup_policy.py: site_analytics_analyticsdailymetric → RETAIN_TABLES - tests: basic count, idempotency, multi-site, UTC date boundary, prune-safe - design doc: A3 progress notes and three implementation subtleties recorded Co-Authored-By: Claude Sonnet 4.6 --- .../031-analytics-ingestion-design.md | 6 +- qb_site/qb_site/settings/base.py | 5 + .../migrations/0002_analyticsdailymetric.py | 27 ++++ 
qb_site/site_analytics/models/__init__.py | 1 + qb_site/site_analytics/models/daily_metric.py | 28 ++++ .../site_analytics/services/aggregation.py | 71 ++++++++++ qb_site/site_analytics/tasks/__init__.py | 9 +- .../site_analytics/tasks/aggregate_daily.py | 20 +++ .../site_analytics/tests/test_aggregation.py | 126 ++++++++++++++++++ scripts/backup_policy.py | 2 + 10 files changed, 291 insertions(+), 4 deletions(-) create mode 100644 qb_site/site_analytics/migrations/0002_analyticsdailymetric.py create mode 100644 qb_site/site_analytics/models/daily_metric.py create mode 100644 qb_site/site_analytics/services/aggregation.py create mode 100644 qb_site/site_analytics/tasks/aggregate_daily.py create mode 100644 qb_site/site_analytics/tests/test_aggregation.py diff --git a/docs/design-decisions/031-analytics-ingestion-design.md b/docs/design-decisions/031-analytics-ingestion-design.md index d122b3f8..72095826 100644 --- a/docs/design-decisions/031-analytics-ingestion-design.md +++ b/docs/design-decisions/031-analytics-ingestion-design.md @@ -122,7 +122,7 @@ - Add `POST /api/v1/analytics/collect` route and view. - Implement payload validation, site allowlist check, IP extraction (X-Forwarded-For → REMOTE_ADDR fallback), hashing service, and raw insert. - Per-site tokens deferred to v1.1. -3. `A3` Daily aggregate model + service + Celery task. +3. `A3` ✓ Daily aggregate model + service + Celery task. - Add `AnalyticsDailyMetric` model, upsert service, and `site_analytics.aggregate_daily_metrics` Celery task. - Wire beat schedule entry in `base.py`. 4. `A4` Monthly aggregate model + service + Celery task + prune task. @@ -179,6 +179,10 @@ - Subtlety — bot traffic response: bots return `204` (same as success) rather than a distinct status code so detection heuristics are not leaked to callers. 
- Subtlety — field truncation: `path`, `referrer`, and `user_agent` are silently truncated to their model `max_length` before insert rather than rejected, since truncation is preferable to dropping the event entirely for slightly over-long paths. - Subtlety — empty `SITE_ANALYTICS_ALLOWED_SITES`: an unconfigured (empty) allowlist rejects all requests with `400`; this is intentional opt-in behaviour that prevents accidental data collection before sites are explicitly registered. + - `A3` implemented: `AnalyticsDailyMetric` model, `aggregate_daily_metrics` service, `site_analytics.aggregate_daily_metrics` Celery task, beat schedule entry, backup policy entry (`RETAIN_TABLES`), tests. + - Subtlety — rolling window default (`days_back=2`): recomputes today and yesterday on every run so events arriving near UTC midnight or during a prior task run are never missed. The task is fully idempotent: each call overwrites aggregates with a fresh count, so retries and overlapping runs are harmless. + - Subtlety — preserve aggregates when raw rows are gone: if raw pageviews have been pruned (by the A4 retention task) but an aggregate row already exists, the service leaves the existing row in place rather than zeroing it. Zeroing would silently destroy reporting data after the retention window passes. + - Subtlety — `__date` lookup and UTC: Django's `occurred_at__date=d` with `USE_TZ=True` and `TIME_ZONE=UTC` correctly evaluates the date boundary at UTC midnight in PostgreSQL. Covered by `test_date_boundary_is_utc`. ## Open Questions - ~~Should `site` configuration live in DB (admin-editable) or settings/env (static)?~~ Resolved: settings/env (`SITE_ANALYTICS_ALLOWED_SITES`) for v1. 
diff --git a/qb_site/qb_site/settings/base.py b/qb_site/qb_site/settings/base.py index cd06a0ce..82474d19 100644 --- a/qb_site/qb_site/settings/base.py +++ b/qb_site/qb_site/settings/base.py @@ -550,3 +550,8 @@ def env_optional_bounded_int(name: str, *, minimum: int, maximum: int) -> int | "fanout": True, }, } +if SITE_ANALYTICS_DAILY_AGGREGATE_PERIOD_SECONDS > 0: + CELERY_BEAT_SCHEDULE["site_analytics_aggregate_daily"] = { + "task": "site_analytics.aggregate_daily_metrics", + "schedule": SITE_ANALYTICS_DAILY_AGGREGATE_PERIOD_SECONDS, + } diff --git a/qb_site/site_analytics/migrations/0002_analyticsdailymetric.py b/qb_site/site_analytics/migrations/0002_analyticsdailymetric.py new file mode 100644 index 00000000..5af80ab8 --- /dev/null +++ b/qb_site/site_analytics/migrations/0002_analyticsdailymetric.py @@ -0,0 +1,27 @@ +# Generated by Django 5.2.6 on 2026-03-25 19:58 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('site_analytics', '0001_initial'), + ] + + operations = [ + migrations.CreateModel( + name='AnalyticsDailyMetric', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('site', models.CharField(max_length=100)), + ('date', models.DateField()), + ('pageviews', models.PositiveIntegerField(default=0)), + ('unique_visitors', models.PositiveIntegerField(default=0)), + ], + options={ + 'indexes': [models.Index(fields=['site', 'date'], name='sa_dailymetric_site_date_idx')], + 'constraints': [models.UniqueConstraint(fields=('site', 'date'), name='sa_dailymetric_site_date_unique')], + }, + ), + ] diff --git a/qb_site/site_analytics/models/__init__.py b/qb_site/site_analytics/models/__init__.py index 2db40d15..7e0b4a57 100644 --- a/qb_site/site_analytics/models/__init__.py +++ b/qb_site/site_analytics/models/__init__.py @@ -1,3 +1,4 @@ """Site analytics models: raw events and aggregate reporting tables.""" +from .daily_metric import 
AnalyticsDailyMetric # noqa: F401 from .pageview import AnalyticsPageView # noqa: F401 diff --git a/qb_site/site_analytics/models/daily_metric.py b/qb_site/site_analytics/models/daily_metric.py new file mode 100644 index 00000000..be0bd490 --- /dev/null +++ b/qb_site/site_analytics/models/daily_metric.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from django.db import models + + +class AnalyticsDailyMetric(models.Model): + """Aggregated daily pageview and unique-visitor counts per site. + + Rows are upserted by the ``site_analytics.aggregate_daily_metrics`` task; + never written by the ingestion endpoint. + Reporting queries must use this table, not raw ``AnalyticsPageView`` scans. + """ + + site = models.CharField(max_length=100) + date = models.DateField() # UTC calendar date + pageviews = models.PositiveIntegerField(default=0) + unique_visitors = models.PositiveIntegerField(default=0) + + class Meta: + constraints = [ + models.UniqueConstraint(fields=["site", "date"], name="sa_dailymetric_site_date_unique"), + ] + indexes = [ + models.Index(fields=["site", "date"], name="sa_dailymetric_site_date_idx"), + ] + + def __str__(self) -> str: + return f"{self.site} {self.date}: {self.pageviews} pv / {self.unique_visitors} uv" diff --git a/qb_site/site_analytics/services/aggregation.py b/qb_site/site_analytics/services/aggregation.py new file mode 100644 index 00000000..f2279dd8 --- /dev/null +++ b/qb_site/site_analytics/services/aggregation.py @@ -0,0 +1,71 @@ +"""Aggregation services for site analytics.""" + +from __future__ import annotations + +import datetime +from typing import Any + +from django.db.models import Count +from django.utils import timezone + +from site_analytics.models import AnalyticsDailyMetric, AnalyticsPageView + + +def aggregate_daily_metrics( + *, + date: datetime.date | None = None, + days_back: int = 2, +) -> dict[str, Any]: + """Idempotent upsert of AnalyticsDailyMetric for a rolling date window. 
+ + By default recomputes today and yesterday (``days_back=2``) so that events + arriving near midnight or during a prior task run are not missed. The task + is safe to retry: each call overwrites the aggregate with a fresh count from + raw rows, so running it multiple times on the same window is harmless. + + Returns a summary dict suitable for Celery task result storage. + """ + if date is None: + date = timezone.now().date() + + target_dates = [date - datetime.timedelta(days=i) for i in range(days_back)] + upserted = 0 + skipped = 0 + + for d in target_dates: + # COUNT(*) and COUNT(DISTINCT visitor_month_hash) per site for this UTC date. + # Django's __date lookup respects USE_TZ and the configured TIME_ZONE (UTC), + # so the date boundary is always UTC midnight. + rows = ( + AnalyticsPageView.objects.filter(occurred_at__date=d) + .values("site") + .annotate( + pageviews=Count("id"), + unique_visitors=Count("visitor_month_hash", distinct=True), + ) + ) + + for row in rows: + AnalyticsDailyMetric.objects.update_or_create( + site=row["site"], + date=d, + defaults={ + "pageviews": row["pageviews"], + "unique_visitors": row["unique_visitors"], + }, + ) + upserted += 1 + + # If no raw rows exist for this date and site, we leave any existing + # aggregate row in place rather than zeroing it out. This avoids + # accidental data loss if the raw rows were pruned before the aggregate + # was read. 
+ sites_with_existing = set(AnalyticsDailyMetric.objects.filter(date=d).values_list("site", flat=True)) + sites_in_raw = {r["site"] for r in rows} + skipped += len(sites_with_existing - sites_in_raw) + + return { + "dates_processed": [str(d) for d in target_dates], + "upserted": upserted, + "skipped_existing_no_raw": skipped, + } diff --git a/qb_site/site_analytics/tasks/__init__.py b/qb_site/site_analytics/tasks/__init__.py index 2d7b3bd9..6f8dd265 100644 --- a/qb_site/site_analytics/tasks/__init__.py +++ b/qb_site/site_analytics/tasks/__init__.py @@ -1,4 +1,7 @@ -"""Celery tasks for site analytics aggregation and retention. +"""Celery tasks for site analytics aggregation and retention.""" -Populated in A3 (daily aggregate) and A4 (monthly aggregate + prune). -""" +from __future__ import annotations + +from site_analytics.tasks.aggregate_daily import aggregate_daily_metrics_task # noqa: F401 + +__all__ = ["aggregate_daily_metrics_task"] diff --git a/qb_site/site_analytics/tasks/aggregate_daily.py b/qb_site/site_analytics/tasks/aggregate_daily.py new file mode 100644 index 00000000..676f32a9 --- /dev/null +++ b/qb_site/site_analytics/tasks/aggregate_daily.py @@ -0,0 +1,20 @@ +"""Celery task: aggregate daily pageview metrics.""" + +from __future__ import annotations + +from typing import Any + +from celery import shared_task + +from site_analytics.services.aggregation import aggregate_daily_metrics + + +@shared_task(name="site_analytics.aggregate_daily_metrics") +def aggregate_daily_metrics_task(*, days_back: int = 2) -> dict[str, Any]: + """Idempotent upsert of daily pageview and unique-visitor counts. + + Recomputes the rolling ``days_back`` UTC calendar days so that events + arriving near midnight or during a prior task run are never missed. + Safe to retry: each run overwrites aggregates with a fresh count. 
+ """ + return aggregate_daily_metrics(days_back=days_back) diff --git a/qb_site/site_analytics/tests/test_aggregation.py b/qb_site/site_analytics/tests/test_aggregation.py new file mode 100644 index 00000000..4f13bd85 --- /dev/null +++ b/qb_site/site_analytics/tests/test_aggregation.py @@ -0,0 +1,126 @@ +"""Tests for daily metric aggregation service and task.""" + +from __future__ import annotations + +import datetime + +from django.test import TestCase, override_settings + +from site_analytics.models import AnalyticsDailyMetric, AnalyticsPageView +from site_analytics.services.aggregation import aggregate_daily_metrics +from site_analytics.tasks.aggregate_daily import aggregate_daily_metrics_task + +_SALT = override_settings(SITE_ANALYTICS_HASH_SALT="test-salt") + +TODAY = datetime.date(2026, 3, 25) +YESTERDAY = TODAY - datetime.timedelta(days=1) + + +def _pv(site: str, path: str, date: datetime.date, visitor_hash: str = "abc") -> AnalyticsPageView: + return AnalyticsPageView.objects.create( + site=site, + path=path, + occurred_at=datetime.datetime(date.year, date.month, date.day, 12, 0, 0, tzinfo=datetime.timezone.utc), + visitor_month_hash=visitor_hash, + ) + + +@_SALT +class AggregateDailyMetricsServiceTests(TestCase): + def test_basic_count(self): + _pv("s1", "/a", TODAY, "h1") + _pv("s1", "/b", TODAY, "h2") + _pv("s1", "/c", TODAY, "h1") # same hash — same visitor + + result = aggregate_daily_metrics(date=TODAY, days_back=1) + + metric = AnalyticsDailyMetric.objects.get(site="s1", date=TODAY) + self.assertEqual(metric.pageviews, 3) + self.assertEqual(metric.unique_visitors, 2) + self.assertEqual(result["upserted"], 1) + self.assertEqual(result["dates_processed"], [str(TODAY)]) + + def test_idempotent_rerun(self): + _pv("s1", "/a", TODAY, "h1") + aggregate_daily_metrics(date=TODAY, days_back=1) + aggregate_daily_metrics(date=TODAY, days_back=1) + + self.assertEqual(AnalyticsDailyMetric.objects.filter(site="s1", date=TODAY).count(), 1) + 
self.assertEqual(AnalyticsDailyMetric.objects.get(site="s1", date=TODAY).pageviews, 1) + + def test_rerun_after_new_events_updates_counts(self): + _pv("s1", "/a", TODAY, "h1") + aggregate_daily_metrics(date=TODAY, days_back=1) + _pv("s1", "/b", TODAY, "h2") + aggregate_daily_metrics(date=TODAY, days_back=1) + + metric = AnalyticsDailyMetric.objects.get(site="s1", date=TODAY) + self.assertEqual(metric.pageviews, 2) + self.assertEqual(metric.unique_visitors, 2) + + def test_days_back_covers_yesterday(self): + _pv("s1", "/a", TODAY, "h1") + _pv("s1", "/b", YESTERDAY, "h2") + + aggregate_daily_metrics(date=TODAY, days_back=2) + + self.assertEqual(AnalyticsDailyMetric.objects.get(site="s1", date=TODAY).pageviews, 1) + self.assertEqual(AnalyticsDailyMetric.objects.get(site="s1", date=YESTERDAY).pageviews, 1) + + def test_multiple_sites_aggregated_separately(self): + _pv("site-a", "/", TODAY, "h1") + _pv("site-a", "/", TODAY, "h2") + _pv("site-b", "/", TODAY, "h3") + + aggregate_daily_metrics(date=TODAY, days_back=1) + + self.assertEqual(AnalyticsDailyMetric.objects.get(site="site-a", date=TODAY).pageviews, 2) + self.assertEqual(AnalyticsDailyMetric.objects.get(site="site-b", date=TODAY).pageviews, 1) + + def test_no_raw_rows_does_not_create_metric(self): + aggregate_daily_metrics(date=TODAY, days_back=1) + self.assertEqual(AnalyticsDailyMetric.objects.count(), 0) + + def test_no_raw_rows_preserves_existing_metric(self): + # Existing aggregate from a previous run; no raw rows remain (pruned). 
+ AnalyticsDailyMetric.objects.create(site="s1", date=TODAY, pageviews=5, unique_visitors=3) + aggregate_daily_metrics(date=TODAY, days_back=1) + + metric = AnalyticsDailyMetric.objects.get(site="s1", date=TODAY) + self.assertEqual(metric.pageviews, 5) # preserved, not zeroed + + def test_unique_visitors_uses_distinct_hash(self): + for _ in range(10): + _pv("s1", "/", TODAY, "same-hash") + + aggregate_daily_metrics(date=TODAY, days_back=1) + + metric = AnalyticsDailyMetric.objects.get(site="s1", date=TODAY) + self.assertEqual(metric.pageviews, 10) + self.assertEqual(metric.unique_visitors, 1) + + def test_date_boundary_is_utc(self): + # Event at 23:59 UTC on YESTERDAY must count for YESTERDAY, not TODAY. + AnalyticsPageView.objects.create( + site="s1", + path="/", + occurred_at=datetime.datetime( + YESTERDAY.year, YESTERDAY.month, YESTERDAY.day, 23, 59, 0, tzinfo=datetime.timezone.utc + ), + visitor_month_hash="h1", + ) + + aggregate_daily_metrics(date=TODAY, days_back=2) + + self.assertEqual(AnalyticsDailyMetric.objects.get(site="s1", date=YESTERDAY).pageviews, 1) + self.assertFalse(AnalyticsDailyMetric.objects.filter(site="s1", date=TODAY).exists()) + + +@_SALT +class AggregateDailyMetricsTaskTests(TestCase): + def test_task_returns_summary_dict(self): + _pv("s1", "/", TODAY, "h1") + result = aggregate_daily_metrics_task(days_back=1) + self.assertIn("upserted", result) + self.assertIn("dates_processed", result) + self.assertEqual(result["upserted"], 1) diff --git a/scripts/backup_policy.py b/scripts/backup_policy.py index 06ec1600..216cf60f 100644 --- a/scripts/backup_policy.py +++ b/scripts/backup_policy.py @@ -6,6 +6,7 @@ # We include implicit Django auth M2M tables and django_migrations explicitly. BACKUP_TABLES: tuple[str, ...] = ( "site_analytics_analyticspageview", + "site_analytics_analyticsdailymetric", "analyzer_analyzerconvergencesnapshot", "analyzer_areastatssnapshot", "analyzer_prdependency", @@ -95,6 +96,7 @@ # Tables retained in sanitized dump. 
RETAIN_TABLES: tuple[str, ...] = ( + "site_analytics_analyticsdailymetric", "core_repository", "core_user", "syncer_pullrequest", From 52199f8638dfa1abef56f85601d3526fa3fe57c8 Mon Sep 17 00:00:00 2001 From: Bryan Gin-ge Chen Date: Wed, 25 Mar 2026 16:01:46 -0400 Subject: [PATCH 05/17] =?UTF-8?q?feat(site=5Fanalytics):=20A4=20=E2=80=94?= =?UTF-8?q?=20monthly=20aggregate=20model,=20prune=20task,=20beat=20schedu?= =?UTF-8?q?les?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - AnalyticsMonthlyMetric model: site, month (UTC first-of-month DateField), pageviews, unique_visitors; unique constraint on (site, month); migration 0003 - aggregate_monthly_metrics service: idempotent upsert over rolling months_back window; preserves existing aggregates when raw rows have been pruned - prune_old_pageviews service: deletes AnalyticsPageView rows older than SITE_ANALYTICS_RETENTION_DAYS; aggregate tables are never pruned - site_analytics.aggregate_monthly_metrics and site_analytics.prune_old_pageviews Celery tasks + beat schedule entries for monthly aggregate and prune - backup_policy.py: site_analytics_analyticsmonthlymetric → RETAIN_TABLES - tests: monthly count, idempotency, month boundary, prune boundaries, settings default retention, prune-safe aggregate preservation - design doc: A4 progress notes; index name length limit subtlety noted Co-Authored-By: Claude Sonnet 4.6 --- .../031-analytics-ingestion-design.md | 6 +- qb_site/qb_site/settings/base.py | 11 ++ .../migrations/0003_analyticsmonthlymetric.py | 27 ++++ qb_site/site_analytics/models/__init__.py | 1 + .../site_analytics/models/monthly_metric.py | 30 ++++ .../site_analytics/services/aggregation.py | 102 +++++++++++++- qb_site/site_analytics/tasks/__init__.py | 10 +- .../site_analytics/tasks/aggregate_monthly.py | 30 ++++ .../tests/test_monthly_aggregation.py | 130 ++++++++++++++++++ scripts/backup_policy.py | 2 + 10 files changed, 345 insertions(+), 4 deletions(-) create 
mode 100644 qb_site/site_analytics/migrations/0003_analyticsmonthlymetric.py create mode 100644 qb_site/site_analytics/models/monthly_metric.py create mode 100644 qb_site/site_analytics/tasks/aggregate_monthly.py create mode 100644 qb_site/site_analytics/tests/test_monthly_aggregation.py diff --git a/docs/design-decisions/031-analytics-ingestion-design.md b/docs/design-decisions/031-analytics-ingestion-design.md index 72095826..cb87da04 100644 --- a/docs/design-decisions/031-analytics-ingestion-design.md +++ b/docs/design-decisions/031-analytics-ingestion-design.md @@ -125,7 +125,7 @@ 3. `A3` ✓ Daily aggregate model + service + Celery task. - Add `AnalyticsDailyMetric` model, upsert service, and `site_analytics.aggregate_daily_metrics` Celery task. - Wire beat schedule entry in `base.py`. -4. `A4` Monthly aggregate model + service + Celery task + prune task. +4. `A4` ✓ Monthly aggregate model + service + Celery task + prune task. - Add `AnalyticsMonthlyMetric` model, upsert service, and `site_analytics.aggregate_monthly_metrics` Celery task. - Add `site_analytics.prune_old_pageviews` task. - Wire beat schedule entries. @@ -183,6 +183,10 @@ - Subtlety — rolling window default (`days_back=2`): recomputes today and yesterday on every run so events arriving near UTC midnight or during a prior task run are never missed. The task is fully idempotent: each call overwrites aggregates with a fresh count, so retries and overlapping runs are harmless. - Subtlety — preserve aggregates when raw rows are gone: if raw pageviews have been pruned (by the A4 retention task) but an aggregate row already exists, the service leaves the existing row in place rather than zeroing it. Zeroing would silently destroy reporting data after the retention window passes. - Subtlety — `__date` lookup and UTC: Django's `occurred_at__date=d` with `USE_TZ=True` and `TIME_ZONE=UTC` correctly evaluates the date boundary at UTC midnight in PostgreSQL. Covered by `test_date_boundary_is_utc`. 
+ - `A4` implemented: `AnalyticsMonthlyMetric` model, monthly aggregation service, `site_analytics.aggregate_monthly_metrics` and `site_analytics.prune_old_pageviews` Celery tasks, beat schedule entries, backup policy entries, tests. + - Subtlety — `month` stored as first-of-month `DateField`: avoids a separate `YearMonthField` or string; sorts and filters naturally, and is unambiguous regardless of timezone. Any query for a given month uses `month=date(year, month, 1)`. + - Subtlety — index name length limit: Django enforces a 30-character limit on `Index` names. `sa_monthlymetric_site_month_idx` (31 chars) triggered `models.E034` at `makemigrations` time. Renamed to `sa_monthly_site_month_idx`. The daily metric constraint name `sa_dailymetric_site_date_unique` (31 chars) is a `UniqueConstraint` name, not an `Index` name, so Django does not enforce the same limit for it. + - Subtlety — prune task does not touch aggregate tables: `prune_old_pageviews` only deletes `AnalyticsPageView` rows. The daily/monthly aggregate tables are never pruned automatically; they are the durable reporting record. ## Open Questions - ~~Should `site` configuration live in DB (admin-editable) or settings/env (static)?~~ Resolved: settings/env (`SITE_ANALYTICS_ALLOWED_SITES`) for v1. 
diff --git a/qb_site/qb_site/settings/base.py b/qb_site/qb_site/settings/base.py index 82474d19..3ec0e8a5 100644 --- a/qb_site/qb_site/settings/base.py +++ b/qb_site/qb_site/settings/base.py @@ -555,3 +555,14 @@ def env_optional_bounded_int(name: str, *, minimum: int, maximum: int) -> int | "task": "site_analytics.aggregate_daily_metrics", "schedule": SITE_ANALYTICS_DAILY_AGGREGATE_PERIOD_SECONDS, } +if SITE_ANALYTICS_MONTHLY_AGGREGATE_PERIOD_SECONDS > 0: + CELERY_BEAT_SCHEDULE["site_analytics_aggregate_monthly"] = { + "task": "site_analytics.aggregate_monthly_metrics", + "schedule": SITE_ANALYTICS_MONTHLY_AGGREGATE_PERIOD_SECONDS, + } +if SITE_ANALYTICS_PRUNE_PERIOD_SECONDS > 0: + CELERY_BEAT_SCHEDULE["site_analytics_prune_pageviews"] = { + "task": "site_analytics.prune_old_pageviews", + "schedule": SITE_ANALYTICS_PRUNE_PERIOD_SECONDS, + "kwargs": {"retention_days": SITE_ANALYTICS_RETENTION_DAYS}, + } diff --git a/qb_site/site_analytics/migrations/0003_analyticsmonthlymetric.py b/qb_site/site_analytics/migrations/0003_analyticsmonthlymetric.py new file mode 100644 index 00000000..aaf12554 --- /dev/null +++ b/qb_site/site_analytics/migrations/0003_analyticsmonthlymetric.py @@ -0,0 +1,27 @@ +# Generated by Django 5.2.6 on 2026-03-25 20:00 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('site_analytics', '0002_analyticsdailymetric'), + ] + + operations = [ + migrations.CreateModel( + name='AnalyticsMonthlyMetric', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('site', models.CharField(max_length=100)), + ('month', models.DateField()), + ('pageviews', models.PositiveIntegerField(default=0)), + ('unique_visitors', models.PositiveIntegerField(default=0)), + ], + options={ + 'indexes': [models.Index(fields=['site', 'month'], name='sa_monthly_site_month_idx')], + 'constraints': [models.UniqueConstraint(fields=('site', 'month'), 
name='sa_monthly_site_month_uniq')], + }, + ), + ] diff --git a/qb_site/site_analytics/models/__init__.py b/qb_site/site_analytics/models/__init__.py index 7e0b4a57..5bacc5b2 100644 --- a/qb_site/site_analytics/models/__init__.py +++ b/qb_site/site_analytics/models/__init__.py @@ -1,4 +1,5 @@ """Site analytics models: raw events and aggregate reporting tables.""" from .daily_metric import AnalyticsDailyMetric # noqa: F401 +from .monthly_metric import AnalyticsMonthlyMetric # noqa: F401 from .pageview import AnalyticsPageView # noqa: F401 diff --git a/qb_site/site_analytics/models/monthly_metric.py b/qb_site/site_analytics/models/monthly_metric.py new file mode 100644 index 00000000..5d27e23a --- /dev/null +++ b/qb_site/site_analytics/models/monthly_metric.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +from django.db import models + + +class AnalyticsMonthlyMetric(models.Model): + """Aggregated monthly pageview and unique-visitor counts per site. + + Rows are upserted by the ``site_analytics.aggregate_monthly_metrics`` task. + Reporting queries must use this table, not raw ``AnalyticsPageView`` scans. + + ``month`` is stored as the first day of the UTC month (e.g. 2026-03-01) + so it is a plain ``DateField`` with natural ordering and easy filtering. + """ + + site = models.CharField(max_length=100) + month = models.DateField() # UTC first-of-month, e.g. 
2026-03-01 + pageviews = models.PositiveIntegerField(default=0) + unique_visitors = models.PositiveIntegerField(default=0) + + class Meta: + constraints = [ + models.UniqueConstraint(fields=["site", "month"], name="sa_monthly_site_month_uniq"), + ] + indexes = [ + models.Index(fields=["site", "month"], name="sa_monthly_site_month_idx"), + ] + + def __str__(self) -> str: + return f"{self.site} {self.month:%Y-%m}: {self.pageviews} pv / {self.unique_visitors} uv" diff --git a/qb_site/site_analytics/services/aggregation.py b/qb_site/site_analytics/services/aggregation.py index f2279dd8..24b18283 100644 --- a/qb_site/site_analytics/services/aggregation.py +++ b/qb_site/site_analytics/services/aggregation.py @@ -1,14 +1,15 @@ -"""Aggregation services for site analytics.""" +"""Aggregation and retention services for site analytics.""" from __future__ import annotations import datetime from typing import Any +from django.conf import settings from django.db.models import Count from django.utils import timezone -from site_analytics.models import AnalyticsDailyMetric, AnalyticsPageView +from site_analytics.models import AnalyticsDailyMetric, AnalyticsMonthlyMetric, AnalyticsPageView def aggregate_daily_metrics( @@ -69,3 +70,100 @@ def aggregate_daily_metrics( "upserted": upserted, "skipped_existing_no_raw": skipped, } + + +def _month_start(date: datetime.date) -> datetime.date: + """Return the first day of the month containing ``date``.""" + return date.replace(day=1) + + +def aggregate_monthly_metrics( + *, + date: datetime.date | None = None, + months_back: int = 2, +) -> dict[str, Any]: + """Idempotent upsert of AnalyticsMonthlyMetric for a rolling month window. + + By default recomputes the current month and the previous month + (``months_back=2``) so that late-arriving events and month-boundary races + are always captured. Safe to retry. + + ``month`` values are stored as the first day of the UTC month so they sort + and filter naturally as ``DateField`` values. 
+ """ + if date is None: + date = timezone.now().date() + + # Build the list of first-of-month dates to recompute. + target_months: list[datetime.date] = [] + current = _month_start(date) + for _ in range(months_back): + target_months.append(current) + # Step back one month: subtract enough days to land in the previous month + # then take the first of that month. + current = _month_start(current - datetime.timedelta(days=1)) + + upserted = 0 + skipped = 0 + + for month_start in target_months: + # Last day of the month: go to first of next month, subtract one day. + if month_start.month == 12: + month_end = datetime.date(month_start.year + 1, 1, 1) - datetime.timedelta(days=1) + else: + month_end = datetime.date(month_start.year, month_start.month + 1, 1) - datetime.timedelta(days=1) + + rows = ( + AnalyticsPageView.objects.filter(occurred_at__date__gte=month_start, occurred_at__date__lte=month_end) + .values("site") + .annotate( + pageviews=Count("id"), + unique_visitors=Count("visitor_month_hash", distinct=True), + ) + ) + + for row in rows: + AnalyticsMonthlyMetric.objects.update_or_create( + site=row["site"], + month=month_start, + defaults={ + "pageviews": row["pageviews"], + "unique_visitors": row["unique_visitors"], + }, + ) + upserted += 1 + + # Preserve existing rows where raw data has been pruned (same logic as daily). + sites_with_existing = set(AnalyticsMonthlyMetric.objects.filter(month=month_start).values_list("site", flat=True)) + sites_in_raw = {r["site"] for r in rows} + skipped += len(sites_with_existing - sites_in_raw) + + return { + "months_processed": [str(m) for m in target_months], + "upserted": upserted, + "skipped_existing_no_raw": skipped, + } + + +def prune_old_pageviews( + *, + retention_days: int | None = None, +) -> dict[str, Any]: + """Delete AnalyticsPageView rows older than the retention window. + + ``retention_days`` defaults to ``SITE_ANALYTICS_RETENTION_DAYS`` from + settings. 
Rows whose ``occurred_at`` is strictly before the cutoff are + deleted in a single query; Postgres will handle the index scan efficiently + given the index on ``occurred_at``. + """ + if retention_days is None: + retention_days = settings.SITE_ANALYTICS_RETENTION_DAYS + + cutoff = timezone.now() - datetime.timedelta(days=retention_days) + deleted, _ = AnalyticsPageView.objects.filter(occurred_at__lt=cutoff).delete() + + return { + "deleted": deleted, + "cutoff": cutoff.isoformat(), + "retention_days": retention_days, + } diff --git a/qb_site/site_analytics/tasks/__init__.py b/qb_site/site_analytics/tasks/__init__.py index 6f8dd265..f7ecbf1f 100644 --- a/qb_site/site_analytics/tasks/__init__.py +++ b/qb_site/site_analytics/tasks/__init__.py @@ -3,5 +3,13 @@ from __future__ import annotations from site_analytics.tasks.aggregate_daily import aggregate_daily_metrics_task # noqa: F401 +from site_analytics.tasks.aggregate_monthly import ( # noqa: F401 + aggregate_monthly_metrics_task, + prune_old_pageviews_task, +) -__all__ = ["aggregate_daily_metrics_task"] +__all__ = [ + "aggregate_daily_metrics_task", + "aggregate_monthly_metrics_task", + "prune_old_pageviews_task", +] diff --git a/qb_site/site_analytics/tasks/aggregate_monthly.py b/qb_site/site_analytics/tasks/aggregate_monthly.py new file mode 100644 index 00000000..fed96ebb --- /dev/null +++ b/qb_site/site_analytics/tasks/aggregate_monthly.py @@ -0,0 +1,30 @@ +"""Celery tasks: monthly aggregate and raw pageview pruning.""" + +from __future__ import annotations + +from typing import Any + +from celery import shared_task + +from site_analytics.services.aggregation import aggregate_monthly_metrics, prune_old_pageviews + + +@shared_task(name="site_analytics.aggregate_monthly_metrics") +def aggregate_monthly_metrics_task(*, months_back: int = 2) -> dict[str, Any]: + """Idempotent upsert of monthly pageview and unique-visitor counts. 
+ + Recomputes the current month and the previous ``months_back - 1`` months + so late-arriving events and month-boundary races are always captured. + Safe to retry. + """ + return aggregate_monthly_metrics(months_back=months_back) + + +@shared_task(name="site_analytics.prune_old_pageviews") +def prune_old_pageviews_task(*, retention_days: int | None = None) -> dict[str, Any]: + """Delete raw AnalyticsPageView rows older than the retention window. + + Uses ``SITE_ANALYTICS_RETENTION_DAYS`` from settings when ``retention_days`` + is not provided. Aggregate rows (daily/monthly) are never pruned by this task. + """ + return prune_old_pageviews(retention_days=retention_days) diff --git a/qb_site/site_analytics/tests/test_monthly_aggregation.py b/qb_site/site_analytics/tests/test_monthly_aggregation.py new file mode 100644 index 00000000..c06fea84 --- /dev/null +++ b/qb_site/site_analytics/tests/test_monthly_aggregation.py @@ -0,0 +1,130 @@ +"""Tests for monthly metric aggregation service and prune service/task.""" + +from __future__ import annotations + +import datetime + +from django.test import TestCase, override_settings + +from site_analytics.models import AnalyticsMonthlyMetric, AnalyticsPageView +from site_analytics.services.aggregation import aggregate_monthly_metrics, prune_old_pageviews +from site_analytics.tasks.aggregate_monthly import aggregate_monthly_metrics_task, prune_old_pageviews_task + +_SALT = override_settings(SITE_ANALYTICS_HASH_SALT="test-salt") + +# Fixed reference dates +MAR_2026 = datetime.date(2026, 3, 25) +MAR_START = datetime.date(2026, 3, 1) +FEB_START = datetime.date(2026, 2, 1) +JAN_START = datetime.date(2026, 1, 1) + + +def _pv( + site: str, + path: str, + date: datetime.date, + visitor_hash: str = "h1", +) -> AnalyticsPageView: + return AnalyticsPageView.objects.create( + site=site, + path=path, + occurred_at=datetime.datetime(date.year, date.month, date.day, 12, 0, 0, tzinfo=datetime.timezone.utc), + visitor_month_hash=visitor_hash, + 
) + + +@_SALT +class AggregateMonthlyMetricsServiceTests(TestCase): + def test_basic_monthly_count(self): + _pv("s1", "/a", MAR_2026, "h1") + _pv("s1", "/b", MAR_2026, "h2") + _pv("s1", "/c", MAR_2026, "h1") # same visitor + + result = aggregate_monthly_metrics(date=MAR_2026, months_back=1) + + metric = AnalyticsMonthlyMetric.objects.get(site="s1", month=MAR_START) + self.assertEqual(metric.pageviews, 3) + self.assertEqual(metric.unique_visitors, 2) + self.assertEqual(result["upserted"], 1) + self.assertEqual(result["months_processed"], [str(MAR_START)]) + + def test_idempotent_rerun(self): + _pv("s1", "/a", MAR_2026, "h1") + aggregate_monthly_metrics(date=MAR_2026, months_back=1) + aggregate_monthly_metrics(date=MAR_2026, months_back=1) + + self.assertEqual(AnalyticsMonthlyMetric.objects.filter(site="s1", month=MAR_START).count(), 1) + self.assertEqual(AnalyticsMonthlyMetric.objects.get(site="s1", month=MAR_START).pageviews, 1) + + def test_months_back_covers_previous_month(self): + _pv("s1", "/a", MAR_2026, "h1") + _pv("s1", "/b", datetime.date(2026, 2, 15), "h2") + + aggregate_monthly_metrics(date=MAR_2026, months_back=2) + + self.assertEqual(AnalyticsMonthlyMetric.objects.get(site="s1", month=MAR_START).pageviews, 1) + self.assertEqual(AnalyticsMonthlyMetric.objects.get(site="s1", month=FEB_START).pageviews, 1) + + def test_month_stored_as_first_of_month(self): + _pv("s1", "/", datetime.date(2026, 3, 31), "h1") + aggregate_monthly_metrics(date=MAR_2026, months_back=1) + self.assertTrue(AnalyticsMonthlyMetric.objects.filter(month=MAR_START).exists()) + + def test_events_spanning_month_boundary_separated(self): + _pv("s1", "/", datetime.date(2026, 2, 28), "h1") + _pv("s1", "/", datetime.date(2026, 3, 1), "h2") + + aggregate_monthly_metrics(date=MAR_2026, months_back=2) + + self.assertEqual(AnalyticsMonthlyMetric.objects.get(site="s1", month=FEB_START).pageviews, 1) + self.assertEqual(AnalyticsMonthlyMetric.objects.get(site="s1", month=MAR_START).pageviews, 1) + + 
def test_no_raw_rows_preserves_existing_metric(self): + AnalyticsMonthlyMetric.objects.create(site="s1", month=MAR_START, pageviews=99, unique_visitors=50) + aggregate_monthly_metrics(date=MAR_2026, months_back=1) + + metric = AnalyticsMonthlyMetric.objects.get(site="s1", month=MAR_START) + self.assertEqual(metric.pageviews, 99) # preserved, not zeroed + + def test_task_returns_summary_dict(self): + _pv("s1", "/", MAR_2026, "h1") + result = aggregate_monthly_metrics_task(months_back=1) + self.assertIn("upserted", result) + self.assertIn("months_processed", result) + self.assertEqual(result["upserted"], 1) + + +@_SALT +class PruneOldPageviewsTests(TestCase): + def test_prunes_rows_older_than_retention(self): + old = _pv("s1", "/old", datetime.date(2023, 1, 1)) + recent = _pv("s1", "/new", datetime.date(2026, 3, 1)) + + result = prune_old_pageviews(retention_days=30) + + self.assertFalse(AnalyticsPageView.objects.filter(pk=old.pk).exists()) + self.assertTrue(AnalyticsPageView.objects.filter(pk=recent.pk).exists()) + self.assertEqual(result["deleted"], 1) + + def test_no_rows_deleted_when_all_recent(self): + _pv("s1", "/", datetime.date(2026, 3, 24)) + result = prune_old_pageviews(retention_days=30) + self.assertEqual(result["deleted"], 0) + self.assertEqual(AnalyticsPageView.objects.count(), 1) + + def test_returns_cutoff_and_retention_days(self): + result = prune_old_pageviews(retention_days=90) + self.assertIn("cutoff", result) + self.assertEqual(result["retention_days"], 90) + + @override_settings(SITE_ANALYTICS_RETENTION_DAYS=7) + def test_uses_settings_default_when_not_specified(self): + _pv("s1", "/", datetime.date(2026, 3, 1)) + result = prune_old_pageviews() + self.assertEqual(result["retention_days"], 7) + self.assertEqual(result["deleted"], 1) + + def test_prune_task_returns_summary(self): + result = prune_old_pageviews_task(retention_days=365) + self.assertIn("deleted", result) + self.assertIn("cutoff", result) diff --git a/scripts/backup_policy.py 
b/scripts/backup_policy.py index 216cf60f..a3f60410 100644 --- a/scripts/backup_policy.py +++ b/scripts/backup_policy.py @@ -7,6 +7,7 @@ BACKUP_TABLES: tuple[str, ...] = ( "site_analytics_analyticspageview", "site_analytics_analyticsdailymetric", + "site_analytics_analyticsmonthlymetric", "analyzer_analyzerconvergencesnapshot", "analyzer_areastatssnapshot", "analyzer_prdependency", @@ -97,6 +98,7 @@ # Tables retained in sanitized dump. RETAIN_TABLES: tuple[str, ...] = ( "site_analytics_analyticsdailymetric", + "site_analytics_analyticsmonthlymetric", "core_repository", "core_user", "syncer_pullrequest", From 43335e72d0fc5079de75462ae9128b0a1dd0b2e7 Mon Sep 17 00:00:00 2001 From: Bryan Gin-ge Chen Date: Wed, 25 Mar 2026 16:02:15 -0400 Subject: [PATCH 06/17] =?UTF-8?q?feat(site=5Fanalytics):=20A5=20=E2=80=94?= =?UTF-8?q?=20admin=20registrations=20for=20all=20three=20models?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All three admin classes (AnalyticsPageView, AnalyticsDailyMetric, AnalyticsMonthlyMetric) are read-only: add/change/delete permissions disabled to enforce immutability of analytics data through the admin UI. PageView admin shows hash prefix and truncated referrer for readability. Co-Authored-By: Claude Sonnet 4.6 --- .../031-analytics-ingestion-design.md | 3 +- qb_site/site_analytics/admin.py | 72 +++++++++++++++++++ 2 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 qb_site/site_analytics/admin.py diff --git a/docs/design-decisions/031-analytics-ingestion-design.md b/docs/design-decisions/031-analytics-ingestion-design.md index cb87da04..246b2747 100644 --- a/docs/design-decisions/031-analytics-ingestion-design.md +++ b/docs/design-decisions/031-analytics-ingestion-design.md @@ -129,7 +129,7 @@ - Add `AnalyticsMonthlyMetric` model, upsert service, and `site_analytics.aggregate_monthly_metrics` Celery task. - Add `site_analytics.prune_old_pageviews` task. - Wire beat schedule entries. -5. 
`A5` Admin + operational visibility. +5. `A5` ✓ Admin + operational visibility. - Register admin for raw/aggregate models. - Add concise task result payloads/counters. 6. `A6` Retention and hardening. @@ -187,6 +187,7 @@ - Subtlety — `month` stored as first-of-month `DateField`: avoids a separate `YearMonthField` or string; sorts and filters naturally, and is unambiguous regardless of timezone. Any query for a given month uses `month=date(year, month, 1)`. - Subtlety — index name length limit: Django enforces a 30-character limit on `Index` names. `sa_monthlymetric_site_month_idx` (31 chars) triggered `models.E034` at `makemigrations` time. Renamed to `sa_monthly_site_month_idx`. The daily metric constraint name `sa_dailymetric_site_date_unique` (31 chars) is a `UniqueConstraint` name, not an `Index` name, so Django does not enforce the same limit for it. - Subtlety — prune task does not touch aggregate tables: `prune_old_pageviews` only deletes `AnalyticsPageView` rows. The daily/monthly aggregate tables are never pruned automatically; they are the durable reporting record. + - `A5` implemented: Django admin for all three models. All three admin classes disable add/change/delete permissions to enforce immutability of analytics data through the admin UI. ## Open Questions - ~~Should `site` configuration live in DB (admin-editable) or settings/env (static)?~~ Resolved: settings/env (`SITE_ANALYTICS_ALLOWED_SITES`) for v1. 
diff --git a/qb_site/site_analytics/admin.py b/qb_site/site_analytics/admin.py new file mode 100644 index 00000000..dbfd7e3b --- /dev/null +++ b/qb_site/site_analytics/admin.py @@ -0,0 +1,72 @@ +"""Django admin registrations for site analytics models.""" + +from __future__ import annotations + +from django.contrib import admin + +from site_analytics.models import AnalyticsDailyMetric, AnalyticsMonthlyMetric, AnalyticsPageView + + +@admin.register(AnalyticsPageView) +class AnalyticsPageViewAdmin(admin.ModelAdmin): + list_display = ("site", "path", "occurred_at", "visitor_month_hash_short", "referrer_short") + list_filter = ("site",) + search_fields = ("site", "path", "referrer") + readonly_fields = ("site", "path", "referrer", "user_agent", "occurred_at", "visitor_month_hash") + ordering = ("-occurred_at",) + date_hierarchy = "occurred_at" + + # Raw rows are immutable; disable add/change/delete to enforce that. + def has_add_permission(self, request): + return False + + def has_change_permission(self, request, obj=None): + return False + + def has_delete_permission(self, request, obj=None): + return False + + @admin.display(description="hash (prefix)") + def visitor_month_hash_short(self, obj: AnalyticsPageView) -> str: + return obj.visitor_month_hash[:12] + "…" + + @admin.display(description="referrer") + def referrer_short(self, obj: AnalyticsPageView) -> str: + return (obj.referrer[:60] + "…") if len(obj.referrer) > 60 else obj.referrer + + +@admin.register(AnalyticsDailyMetric) +class AnalyticsDailyMetricAdmin(admin.ModelAdmin): + list_display = ("site", "date", "pageviews", "unique_visitors") + list_filter = ("site",) + search_fields = ("site",) + readonly_fields = ("site", "date", "pageviews", "unique_visitors") + ordering = ("-date", "site") + date_hierarchy = "date" + + def has_add_permission(self, request): + return False + + def has_change_permission(self, request, obj=None): + return False + + def has_delete_permission(self, request, obj=None): + return 
False + + +@admin.register(AnalyticsMonthlyMetric) +class AnalyticsMonthlyMetricAdmin(admin.ModelAdmin): + list_display = ("site", "month", "pageviews", "unique_visitors") + list_filter = ("site",) + search_fields = ("site",) + readonly_fields = ("site", "month", "pageviews", "unique_visitors") + ordering = ("-month", "site") + + def has_add_permission(self, request): + return False + + def has_change_permission(self, request, obj=None): + return False + + def has_delete_permission(self, request, obj=None): + return False From acac5e38018394464615307197743faa4391969e Mon Sep 17 00:00:00 2001 From: Bryan Gin-ge Chen Date: Wed, 25 Mar 2026 16:03:10 -0400 Subject: [PATCH 07/17] =?UTF-8?q?feat(site=5Fanalytics):=20A6=20=E2=80=94?= =?UTF-8?q?=20empty-UA=20hardening=20flag?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add SITE_ANALYTICS_REJECT_EMPTY_UA setting (default off) to optionally drop requests with no User-Agent header before the existing bot-filter check. Returns 204 (same as bot drop) to avoid leaking detection logic. Tests cover default-allow, flag-enabled drop, and flag-enabled accept. Co-Authored-By: Claude Sonnet 4.6 --- .env.example | 2 ++ .../031-analytics-ingestion-design.md | 3 ++- qb_site/api/views/analytics_collect.py | 4 ++++ qb_site/qb_site/settings/base.py | 2 ++ .../site_analytics/tests/test_collect_view.py | 19 +++++++++++++++++++ 5 files changed, 29 insertions(+), 1 deletion(-) diff --git a/.env.example b/.env.example index 0c61f90e..c292db68 100644 --- a/.env.example +++ b/.env.example @@ -249,6 +249,8 @@ SITE_ANALYTICS_HASH_SALT= SITE_ANALYTICS_ALLOWED_SITES= # Raw pageview retention window in days (default 540 ≈ 18 months). SITE_ANALYTICS_RETENTION_DAYS=540 +# Reject requests with an empty User-Agent (stricter bot hardening; default off). +SITE_ANALYTICS_REJECT_EMPTY_UA=0 # CI filter (opt-in allowlist) # Set SYNCER_CI_FILTER_MODE=allowlist to enable filtering by the allow lists below. 
diff --git a/docs/design-decisions/031-analytics-ingestion-design.md b/docs/design-decisions/031-analytics-ingestion-design.md index 246b2747..eaa62539 100644 --- a/docs/design-decisions/031-analytics-ingestion-design.md +++ b/docs/design-decisions/031-analytics-ingestion-design.md @@ -132,7 +132,7 @@ 5. `A5` ✓ Admin + operational visibility. - Register admin for raw/aggregate models. - Add concise task result payloads/counters. -6. `A6` Retention and hardening. +6. `A6` ✓ Retention and hardening. - Tune bot filtering and optional throttle policy. ## Validation Plan @@ -188,6 +188,7 @@ - Subtlety — index name length limit: Django enforces a 30-character limit on `Index` names. `sa_monthlymetric_site_month_idx` (31 chars) triggered `models.E034` at `makemigrations` time. Renamed to `sa_monthly_site_month_idx`. The daily metric constraint name `sa_dailymetric_site_date_unique` (31 chars) is a `UniqueConstraint` name, not an `Index` name, so Django does not enforce the same limit for it. - Subtlety — prune task does not touch aggregate tables: `prune_old_pageviews` only deletes `AnalyticsPageView` rows. The daily/monthly aggregate tables are never pruned automatically; they are the durable reporting record. - `A5` implemented: Django admin for all three models. All three admin classes disable add/change/delete permissions to enforce immutability of analytics data through the admin UI. + - `A6` implemented: `SITE_ANALYTICS_REJECT_EMPTY_UA` settings flag (default off) to drop requests with no `User-Agent` header; wired into the collect view before the existing bot-filter check. ## Open Questions - ~~Should `site` configuration live in DB (admin-editable) or settings/env (static)?~~ Resolved: settings/env (`SITE_ANALYTICS_ALLOWED_SITES`) for v1. 
diff --git a/qb_site/api/views/analytics_collect.py b/qb_site/api/views/analytics_collect.py index 636fe78c..b6875565 100644 --- a/qb_site/api/views/analytics_collect.py +++ b/qb_site/api/views/analytics_collect.py @@ -44,6 +44,10 @@ def post(self, request: Request, *args: object, **kwargs: object) -> Response: if site not in allowed_sites: return Response({"detail": "unknown site"}, status=status.HTTP_400_BAD_REQUEST) + # Reject empty UA when the stricter hardening flag is enabled. + if not user_agent and settings.SITE_ANALYTICS_REJECT_EMPTY_UA: + return Response(status=status.HTTP_204_NO_CONTENT) + # Silently drop bot traffic rather than returning an error, to avoid # leaking information about detection heuristics. if is_bot(user_agent): diff --git a/qb_site/qb_site/settings/base.py b/qb_site/qb_site/settings/base.py index 3ec0e8a5..347b8b6a 100644 --- a/qb_site/qb_site/settings/base.py +++ b/qb_site/qb_site/settings/base.py @@ -372,6 +372,8 @@ def env_optional_bounded_int(name: str, *, minimum: int, maximum: int) -> int | SITE_ANALYTICS_DAILY_AGGREGATE_PERIOD_SECONDS = int(os.getenv("SITE_ANALYTICS_DAILY_AGGREGATE_PERIOD_SECONDS", 3600)) SITE_ANALYTICS_MONTHLY_AGGREGATE_PERIOD_SECONDS = int(os.getenv("SITE_ANALYTICS_MONTHLY_AGGREGATE_PERIOD_SECONDS", 86400)) SITE_ANALYTICS_PRUNE_PERIOD_SECONDS = int(os.getenv("SITE_ANALYTICS_PRUNE_PERIOD_SECONDS", 86400)) +# Reject requests with an empty User-Agent header (stricter bot hardening). +SITE_ANALYTICS_REJECT_EMPTY_UA = env_bool(os.getenv("SITE_ANALYTICS_REJECT_EMPTY_UA"), False) # CI filter (opt-in allowlist mode) # Set mode to 'allowlist' to enable filtering by the following substrings; otherwise all contexts are ingested. 
diff --git a/qb_site/site_analytics/tests/test_collect_view.py b/qb_site/site_analytics/tests/test_collect_view.py index 07c5162c..ba59d4bd 100644 --- a/qb_site/site_analytics/tests/test_collect_view.py +++ b/qb_site/site_analytics/tests/test_collect_view.py @@ -123,3 +123,22 @@ def test_oversized_path_is_truncated(self): def test_empty_allowed_sites_rejects_all(self): resp = self._post({"site": "test-site", "path": "/about"}) self.assertEqual(resp.status_code, 400) + + # --- empty UA hardening flag --- + + def test_empty_ua_allowed_by_default(self): + resp = self._post({"site": "test-site", "path": "/"}) # no HTTP_USER_AGENT + self.assertEqual(resp.status_code, 204) + self.assertEqual(AnalyticsPageView.objects.count(), 1) + + @override_settings(SITE_ANALYTICS_REJECT_EMPTY_UA=True) + def test_empty_ua_dropped_when_flag_enabled(self): + resp = self._post({"site": "test-site", "path": "/"}) + self.assertEqual(resp.status_code, 204) + self.assertEqual(AnalyticsPageView.objects.count(), 0) + + @override_settings(SITE_ANALYTICS_REJECT_EMPTY_UA=True) + def test_non_empty_ua_accepted_when_flag_enabled(self): + resp = self._post({"site": "test-site", "path": "/"}, HTTP_USER_AGENT="Mozilla/5.0") + self.assertEqual(resp.status_code, 204) + self.assertEqual(AnalyticsPageView.objects.count(), 1) From 2b4a3c64fc1b746454438bd5442bddbca09bb71e Mon Sep 17 00:00:00 2001 From: Bryan Gin-ge Chen Date: Wed, 25 Mar 2026 16:46:46 -0400 Subject: [PATCH 08/17] feat(site_analytics): add CORS support; convert 031 living plan to final ADR CORS: - AnalyticsCollectView now returns Access-Control-Allow-Origin: * on all responses and handles OPTIONS preflight so browsers on third-party static sites can call the endpoint directly without a server-side proxy - Two new tests: POST includes CORS header, OPTIONS preflight returns 204 ADR: - Convert 031 from living implementation plan to concise final decision record - Covers architecture, models, privacy invariants, operational notes, consequences, 
and deferred v1.1 items - Includes static-site tracking snippet (sendBeacon + fetch fallback) with onboarding instructions for new sites Co-Authored-By: Claude Sonnet 4.6 --- .../031-analytics-ingestion-design.md | 338 +++++++----------- qb_site/api/views/analytics_collect.py | 34 +- .../site_analytics/tests/test_collect_view.py | 13 + 3 files changed, 177 insertions(+), 208 deletions(-) diff --git a/docs/design-decisions/031-analytics-ingestion-design.md b/docs/design-decisions/031-analytics-ingestion-design.md index eaa62539..1203872f 100644 --- a/docs/design-decisions/031-analytics-ingestion-design.md +++ b/docs/design-decisions/031-analytics-ingestion-design.md @@ -1,208 +1,142 @@ -# Analytics Ingestion for `qb_site` (Living Plan) +# Site Analytics Ingestion (`site_analytics`) ## Context -- We want lightweight website analytics for funder-facing growth reporting. -- Current `qb_site` structure already has clear boundaries: - - `api/` for HTTP surface area. - - `syncer/` for raw GitHub ingestion. - - `analyzer/` for derived analytics built from stored facts. -- Existing Celery beat and snapshot patterns in `qb_site/qb_site/settings/base.py` and `analyzer/tasks/*` are good templates. -- The previous version of this doc was generic and not mapped to repo-specific modules, rollout, or tests. - -## Problem Statement -- Add privacy-preserving pageview ingestion for static properties without introducing new infrastructure. -- Keep implementation operationally simple and consistent with the current Django/Celery architecture. -- Make implementation trackable as a chunked, testable plan that can be updated during delivery. - -## Goals / Non-Goals -- Goals: - - Collect coarse pageview/referrer signals for multiple sites. - - Produce daily and monthly aggregates suitable for funder reporting. - - Enforce privacy constraints (no raw IP retention, no persistent cross-month identifiers). - - Keep ingestion endpoint lightweight and resilient. 
-- Non-goals (v1): - - Sessionization, funnels, attribution modeling. - - Cookie-based or long-lived user identity. - - Real-time dashboarding. - - New data infrastructure (Kafka/ClickHouse/etc.). - -## Decision (Current Plan) -- Implement a new Django app: `site_analytics`. -- Expose ingestion under existing API namespace via `qb_site/api/urls.py`. -- Store raw event rows in `site_analytics` (bounded retention), and store reporting reads in aggregated tables. -- Run periodic aggregation with Celery beat, following existing analyzer/syncer task style. -- Keep auth simple in v1: optional per-site shared token + bot/user-agent filtering. - -## Proposed Design - -### App and module placement -- New app: `qb_site/site_analytics/` - - `models/`: raw + aggregate tables. - - `services/`: hashing, bot filtering, aggregation logic. - - `tasks/`: periodic aggregation + retention. - - `tests/`: model/service/task coverage. -- API entrypoint: - - Add endpoint in `qb_site/api/urls.py`. - - View implementation in `qb_site/api/views/analytics_collect.py`. - - This keeps external endpoints discoverable in one API module. - -### HTTP ingestion contract -- Endpoint: `POST /api/v1/analytics/collect` -- Request payload (v1): - - `site` (slug; required; must be in `SITE_ANALYTICS_ALLOWED_SITES`) - - `path` (required) - - `referrer` (optional) -- Behavior: - - Uses DRF `APIView` with `authentication_classes = []` and `permission_classes = []`; this implicitly bypasses CSRF enforcement without needing `@csrf_exempt`. - - Minimal synchronous work: validate, compute hash fields, write one row, return `204`. - - Reject unknown `site` or malformed payloads with `400`. 
- -### Data model (planned) -- `AnalyticsPageView` (raw) - - `site`, `path`, `referrer`, `user_agent` - - `occurred_at` (event time, default `timezone.now`) - - `visitor_month_hash` (privacy-preserving monthly hash) - - indexes on `(site, occurred_at)`, `(occurred_at)`, and optionally `(site, path, occurred_at)` -- `AnalyticsDailyMetric` (aggregate) - - `site`, `date` - - `pageviews`, `unique_visitors` - - unique constraint on `(site, date)` -- `AnalyticsMonthlyMetric` (aggregate/reporting convenience) - - `site`, `month` - - `pageviews`, `unique_visitors`, `top_referrers_json`, `top_paths_json` (optional in v1) - - unique constraint on `(site, month)` - -### Privacy and identity strategy -- Do not persist raw IP addresses. -- Compute `visitor_month_hash = sha256(ip + normalized_user_agent + month_key + secret_salt)`. -- `month_key` is UTC `YYYY-MM`; this intentionally prevents cross-month correlation. -- `secret_salt` comes from env/config (e.g., `SITE_ANALYTICS_HASH_SALT`). -- IP extraction: read `X-Forwarded-For` first (set by Heroku's routing layer and most reverse proxies); fall back to `REMOTE_ADDR`. Take the first (leftmost) address from `X-Forwarded-For` to get the client IP. -- Retain raw rows only for bounded backfill/debug windows (target: 12-18 months). - -### Aggregation strategy -- Add periodic tasks: - - `site_analytics.aggregate_daily_metrics` (hourly or nightly; idempotent upsert). - - `site_analytics.aggregate_monthly_metrics` (daily; recompute current + previous month). - - `site_analytics.prune_old_pageviews` (daily retention cleanup). -- Wire schedules in `qb_site/qb_site/settings/base.py` with env-overridable intervals and retention days. - -### Security and abuse controls -- Site allowlist via `SITE_ANALYTICS_ALLOWED_SITES` env var (comma-separated slugs); unknown slugs rejected with `400`. - - Per-site tokens deferred to v1.1. -- Basic bot filtering: - - denylist common bot user-agent substrings. - - optional reject when user-agent is empty. 
-- Optional DRF/Django rate limiting for collection endpoint (can begin with app-level simple limits). - -### Public sanitized backup compatibility -- This data will flow through the public backup pipeline in `.github/workflows/upload_backup.yaml`. -- Keep analytics tables and fields compatible with sanitization/export scripts (`scripts/sanitize_backup.py`, `scripts/export_for_analysis.py`). -- Backup policy for analytics tables: - - `site_analytics_analyticspageview` → `TRUNCATE_TABLES` (raw rows contain visitor hashes; exclude from public backup). - - `site_analytics_analyticsdailymetric`, `site_analytics_analyticsmonthlymetric` → `RETAIN_TABLES` (aggregate-only, safe to share). -- `scripts/backup_policy.py` must be updated in A1 alongside the migration; `repo_check_compose.sh` enforces coverage. -- Treat this as a release gate for analytics schema changes, consistent with `docs/design-decisions/016-sanitized-backups.md`. - -## Invariants / Subtleties +- Lightweight, privacy-preserving pageview analytics for funder-facing growth reporting on static community sites. +- No new infrastructure: implemented as a Django app inside the existing `qb_site` stack (Postgres, Celery, Redis). +- Privacy constraint: no raw IP retention; visitor identity approximated by a monthly-rotating hash. +- Operational constraint: ingestion endpoint must be fast, resilient, and callable from third-party static sites (CORS required). + +## Decision +- New Django app `site_analytics` with three models, a REST ingestion endpoint, and three Celery periodic tasks. +- Site allowlist (`SITE_ANALYTICS_ALLOWED_SITES`) gates ingestion; no per-site auth tokens in v1. +- Reporting reads from aggregate tables only; raw rows are bounded-retention scratch space. + +## Architecture + +### Models +- `AnalyticsPageView` — raw event rows; immutable after insert; pruned after `SITE_ANALYTICS_RETENTION_DAYS` (default 540). 
+ - Fields: `site`, `path`, `referrer`, `user_agent`, `occurred_at`, `visitor_month_hash`. + - Indexes: `(site, occurred_at)`, `(occurred_at)`. +- `AnalyticsDailyMetric` — daily aggregate per site; unique on `(site, date)`. + - Fields: `site`, `date` (UTC), `pageviews`, `unique_visitors`. +- `AnalyticsMonthlyMetric` — monthly aggregate per site; unique on `(site, month)`. + - Fields: `site`, `month` (UTC first-of-month `DateField`, e.g. `2026-03-01`), `pageviews`, `unique_visitors`. + +### Ingestion endpoint +- `POST /api/v1/analytics/collect` — view in `api/views/analytics_collect.py`. +- Required fields: `site` (must be in `SITE_ANALYTICS_ALLOWED_SITES`), `path`. +- Optional field: `referrer`. +- `User-Agent` read from HTTP header (not payload). +- Returns `204` on success, bot drop, and empty-UA drop; `400` on validation failure. +- CORS headers (`Access-Control-Allow-Origin: *`) on all responses; `OPTIONS` preflight handled. +- No CSRF enforcement: DRF `authentication_classes = []` / `permission_classes = []`. + +### Privacy +- Raw IP not stored. `visitor_month_hash = sha256(ip | normalized_ua | YYYY-MM | salt)`. +- Fields joined with `|` to prevent cross-field hash collisions. +- UA is lowercased before hashing so casing variation in the same browser does not inflate unique-visitor counts. +- IP extracted from `X-Forwarded-For` (leftmost address; set by Heroku routing and most reverse proxies), falling back to `REMOTE_ADDR` for direct connections. +- `month_key` is UTC `YYYY-MM`; cross-month correlation is impossible by construction. +- Salt from `SITE_ANALYTICS_HASH_SALT` env var. **Changing the salt or separator invalidates all historical hashes.** + +### Bot filtering +- Substring denylist in `site_analytics/services/bot_filter.py` matched against lowercased UA. +- Bots return `204` (not a distinct error code) to avoid leaking detection heuristics. +- Optional empty-UA rejection via `SITE_ANALYTICS_REJECT_EMPTY_UA=1` (default off); also returns `204`. 
+ +### Aggregation +- `site_analytics.aggregate_daily_metrics` (default: hourly) — upserts `AnalyticsDailyMetric` for a rolling `days_back=2` window so events near UTC midnight are never missed. Fully idempotent. +- `site_analytics.aggregate_monthly_metrics` (default: daily) — upserts `AnalyticsMonthlyMetric` for a rolling `months_back=2` window. +- `site_analytics.prune_old_pageviews` (default: daily) — deletes `AnalyticsPageView` rows older than the retention cutoff. Aggregate tables are never pruned. +- All three tasks preserve existing aggregate rows when the corresponding raw rows have been pruned, to avoid silently zeroing out reporting data after the retention window. + +### Backup policy +- `site_analytics_analyticspageview` → `TRUNCATE_TABLES` (raw rows contain visitor hashes). +- `site_analytics_analyticsdailymetric`, `site_analytics_analyticsmonthlymetric` → `RETAIN_TABLES` (aggregate-only, safe to share publicly). + +## Invariants - Reporting reads must come from aggregate tables, not raw pageview scans. -- Hashing semantics are part of the privacy contract; any change requires explicit migration/versioning note. -- Aggregation tasks must be idempotent and safe under retries/overlap. -- Time boundary semantics must use UTC consistently for day/month buckets. -- `site` taxonomy must remain stable; renames need explicit backfill/mapping handling. - -## Implementation Plan (Chunks) -1. `A1` ✓ App scaffold + settings wiring. - - Create `site_analytics` app and add to `INSTALLED_APPS`. - - Add `AnalyticsPageView` raw model and initial migration. - - Add env settings: `SITE_ANALYTICS_HASH_SALT`, `SITE_ANALYTICS_ALLOWED_SITES` (comma-separated slugs), `SITE_ANALYTICS_RETENTION_DAYS`, task period vars. - - Update `scripts/backup_policy.py` with the three new tables. - - Create `qb_site/site_analytics/AGENTS.md` (and `CLAUDE.md`). -2. `A2` ✓ Raw ingestion endpoint + validation. - - Add `POST /api/v1/analytics/collect` route and view. 
- - Implement payload validation, site allowlist check, IP extraction (X-Forwarded-For → REMOTE_ADDR fallback), hashing service, and raw insert. - - Per-site tokens deferred to v1.1. -3. `A3` ✓ Daily aggregate model + service + Celery task. - - Add `AnalyticsDailyMetric` model, upsert service, and `site_analytics.aggregate_daily_metrics` Celery task. - - Wire beat schedule entry in `base.py`. -4. `A4` ✓ Monthly aggregate model + service + Celery task + prune task. - - Add `AnalyticsMonthlyMetric` model, upsert service, and `site_analytics.aggregate_monthly_metrics` Celery task. - - Add `site_analytics.prune_old_pageviews` task. - - Wire beat schedule entries. -5. `A5` ✓ Admin + operational visibility. - - Register admin for raw/aggregate models. - - Add concise task result payloads/counters. -6. `A6` ✓ Retention and hardening. - - Tune bot filtering and optional throttle policy. - -## Validation Plan -- Unit tests: - - hashing/privacy rules, month rotation behavior. - - bot-filter decisions and payload validation. - - aggregation correctness (`COUNT(*)`, distinct hash counts). -- API tests (`qb_site/api/tests/`): - - `204` success path. - - `400` invalid payload. - - `403` invalid token (when enabled). -- Task tests: - - idempotent reruns. - - retry-safe partial failure behavior. - - retention pruning boundaries. -- Integration checks: - - `uv run ruff check qb_site` - - `uv run ruff format qb_site` - - Compose-backed tests via `bash scripts/repo_check_compose.sh` when available. - -## Rollout Plan -- Phase 0: dark launch ingestion in one low-risk site. -- Phase 1: enable daily/monthly aggregation and validate numbers manually for 1-2 weeks. -- Phase 2: onboard remaining static sites and publish recurring reporting output. -- Phase 3: tighten retention and evaluate need for partitioning at higher volumes. - -## Progress Notes -- 2026-02-26: - - Converted this file from generic guidance to a repo-specific living plan. 
- - Anchored implementation to existing `qb_site` boundaries and task patterns. - - No code implementation started yet; all chunks currently pending. -- 2026-03-25: - - Pre-implementation design review; resolved open questions and refined plan: - - CSRF: implicit via DRF `authentication_classes = []`, no decorator needed. - - Site config: `SITE_ANALYTICS_ALLOWED_SITES` comma-separated env var; per-site tokens deferred to v1.1. - - IP extraction: `X-Forwarded-For` (Heroku/proxy) → `REMOTE_ADDR` fallback in hashing service. - - Backup policy: raw pageviews truncated, daily/monthly aggregates retained; `backup_policy.py` update required in A1. - - `tasks/__init__.py` re-export pattern (consistent with syncer/analyzer) added to A1 scaffold. - - `AGENTS.md` creation required in A1. - - `A1` implemented: app scaffold, `AnalyticsPageView` model, settings, backup policy entry, `repo_check_compose.sh` wiring, AGENTS.md. - - No significant deviations from plan. `validate_backup_policy.py` passes without a live DB (uses Django app registry). - - `A2` implemented: hashing service, bot-filter service, `POST /api/v1/analytics/collect` view, URL wiring, `.env.example` entries, service + endpoint tests. - - Subtlety — field separator in hash: the design specified `sha256(ip + ua + month + salt)` with implicit concatenation, which is collision-prone (e.g. `"ab" + "c"` vs `"a" + "bc"`). Implemented with `|` pipe separators between all four fields. This is a privacy-contract detail: any future reimplementation must use the same separator or old and new hashes will diverge for the same visitor. - - Subtlety — bot traffic response: bots return `204` (same as success) rather than a distinct status code so detection heuristics are not leaked to callers. 
- - Subtlety — field truncation: `path`, `referrer`, and `user_agent` are silently truncated to their model `max_length` before insert rather than rejected, since truncation is preferable to dropping the event entirely for slightly over-long paths. - - Subtlety — empty `SITE_ANALYTICS_ALLOWED_SITES`: an unconfigured (empty) allowlist rejects all requests with `400`; this is intentional opt-in behaviour that prevents accidental data collection before sites are explicitly registered. - - `A3` implemented: `AnalyticsDailyMetric` model, `aggregate_daily_metrics` service, `site_analytics.aggregate_daily_metrics` Celery task, beat schedule entry, backup policy entry (`RETAIN_TABLES`), tests. - - Subtlety — rolling window default (`days_back=2`): recomputes today and yesterday on every run so events arriving near UTC midnight or during a prior task run are never missed. The task is fully idempotent: each call overwrites aggregates with a fresh count, so retries and overlapping runs are harmless. - - Subtlety — preserve aggregates when raw rows are gone: if raw pageviews have been pruned (by the A4 retention task) but an aggregate row already exists, the service leaves the existing row in place rather than zeroing it. Zeroing would silently destroy reporting data after the retention window passes. - - Subtlety — `__date` lookup and UTC: Django's `occurred_at__date=d` with `USE_TZ=True` and `TIME_ZONE=UTC` correctly evaluates the date boundary at UTC midnight in PostgreSQL. Covered by `test_date_boundary_is_utc`. - - `A4` implemented: `AnalyticsMonthlyMetric` model, monthly aggregation service, `site_analytics.aggregate_monthly_metrics` and `site_analytics.prune_old_pageviews` Celery tasks, beat schedule entries, backup policy entries, tests. - - Subtlety — `month` stored as first-of-month `DateField`: avoids a separate `YearMonthField` or string; sorts and filters naturally, and is unambiguous regardless of timezone. 
Any query for a given month uses `month=date(year, month, 1)`. - - Subtlety — index name length limit: Django enforces a 30-character limit on `Index` names. `sa_monthlymetric_site_month_idx` (31 chars) triggered `models.E034` at `makemigrations` time. Renamed to `sa_monthly_site_month_idx`. The daily metric constraint name `sa_dailymetric_site_date_unique` (31 chars) is a `UniqueConstraint` name, not an `Index` name, so Django does not enforce the same limit for it. - - Subtlety — prune task does not touch aggregate tables: `prune_old_pageviews` only deletes `AnalyticsPageView` rows. The daily/monthly aggregate tables are never pruned automatically; they are the durable reporting record. - - `A5` implemented: Django admin for all three models. All three admin classes disable add/change/delete permissions to enforce immutability of analytics data through the admin UI. - - `A6` implemented: `SITE_ANALYTICS_REJECT_EMPTY_UA` settings flag (default off) to drop requests with no `User-Agent` header; wired into the collect view before the existing bot-filter check. - -## Open Questions -- ~~Should `site` configuration live in DB (admin-editable) or settings/env (static)?~~ Resolved: settings/env (`SITE_ANALYTICS_ALLOWED_SITES`) for v1. -- Do we need per-path monthly aggregates in v1, or can we defer to v1.1? -- What default retention window is acceptable for privacy/compliance expectations? -- ~~Should endpoint auth be required for all sites from day one, or optional during bootstrap?~~ Resolved: no per-site tokens in v1; `SITE_ANALYTICS_ALLOWED_SITES` allowlist is the only gate. +- Hashing semantics (field separator `|`, UA normalization, month key format, salt) are part of the privacy contract; any change requires an explicit migration note and bumps all historical hashes. +- Aggregation tasks must remain idempotent and safe under retries and overlapping runs. +- All date/time boundaries use UTC. 
`occurred_at__date=d` with `USE_TZ=True` and `TIME_ZONE=UTC` evaluates at UTC midnight in PostgreSQL. +- `site` slugs must remain stable; renames require an explicit backfill of both raw rows and aggregate tables. + +## Operational Notes + +### Key settings (all env-overridable) +| Setting | Default | Notes | +|---|---|---| +| `SITE_ANALYTICS_HASH_SALT` | `""` | Required in production; empty disables hash safety in dev | +| `SITE_ANALYTICS_ALLOWED_SITES` | `""` | Comma-separated slugs; empty list rejects all traffic | +| `SITE_ANALYTICS_RETENTION_DAYS` | `540` | ~18 months of raw row retention | +| `SITE_ANALYTICS_DAILY_AGGREGATE_PERIOD_SECONDS` | `3600` | Set to `0` to disable | +| `SITE_ANALYTICS_MONTHLY_AGGREGATE_PERIOD_SECONDS` | `86400` | Set to `0` to disable | +| `SITE_ANALYTICS_PRUNE_PERIOD_SECONDS` | `86400` | Set to `0` to disable | +| `SITE_ANALYTICS_REJECT_EMPTY_UA` | `0` | Set to `1` for stricter bot hardening | + +### Onboarding a new site +1. Add the site slug to `SITE_ANALYTICS_ALLOWED_SITES` (comma-separated, no spaces). +2. Deploy/restart the web dyno so the new slug is live. +3. Add the tracking snippet to the site (see below). +4. Verify events appear in the Django admin under `AnalyticsPageView`. +5. After one aggregation cycle, check `AnalyticsDailyMetric` for counts. + +### Static-site tracking snippet + +Place this snippet at the bottom of each page (or in a shared layout template). +Replace `YOUR_QUEUEBOARD_HOST` and `YOUR_SITE_SLUG` before deploying. + +```html + +``` + +**Notes:** +- The snippet is fire-and-forget; errors are silently swallowed so a tracking failure never affects page load. +- `sendBeacon` is preferred: it survives page unload and does not block navigation. +- No cookies, no persistent identifiers, no third-party scripts. +- The endpoint returns `204` for all non-error outcomes (success, bot drop, unknown UA) so the response body is never read. 
+ +### Migrations +- `0001_initial` — `AnalyticsPageView` +- `0002_analyticsdailymetric` — `AnalyticsDailyMetric` +- `0003_analyticsmonthlymetric` — `AnalyticsMonthlyMetric` + +## Consequences +- Adds ~48 DB tables to backup scope (three new, classified in policy). +- Ingestion is synchronous (one DB write per request); at high volume a write buffer or async insert could be considered. +- Unique-visitor counts are approximate: same visitor across different browsers or after a UA update will be counted separately; this is acceptable for coarse funder reporting. +- Monthly hash rotation means a visitor who spans a month boundary is counted as two unique visitors. This is by design to prevent cross-month correlation. + +## Deferred (v1.1) +- Per-site auth tokens. +- Per-path monthly aggregates (`top_paths_json` field on `AnalyticsMonthlyMetric`). +- Top-referrer aggregates (`top_referrers_json`). +- DRF throttle policy on the collection endpoint. +- Partitioning `AnalyticsPageView` by month at higher volumes. 
## References -- `docs/design-decisions/README.md` -- `.github/workflows/upload_backup.yaml` +- `qb_site/site_analytics/` — app source +- `qb_site/api/views/analytics_collect.py` — ingestion view +- `qb_site/qb_site/settings/base.py` — beat schedule and settings +- `scripts/backup_policy.py` — table classification - `docs/design-decisions/016-sanitized-backups.md` -- `docs/design-decisions/030-sync-task-dedupe-strategy.md` -- `docs/design-decisions/029-updatedat-discovery-watermark-and-catchup.md` -- `qb_site/qb_site/urls.py` -- `qb_site/api/urls.py` -- `qb_site/qb_site/settings/base.py` -- `qb_site/analyzer/tasks/collect_convergence.py` diff --git a/qb_site/api/views/analytics_collect.py b/qb_site/api/views/analytics_collect.py index b6875565..2b217982 100644 --- a/qb_site/api/views/analytics_collect.py +++ b/qb_site/api/views/analytics_collect.py @@ -18,17 +18,39 @@ _REFERRER_MAX = 2000 _UA_MAX = 1000 +# CORS headers added to every response so browsers on third-party/static sites +# can call this endpoint without a server-side proxy. +_CORS_HEADERS = { + "Access-Control-Allow-Origin": "*", + "Access-Control-Allow-Methods": "POST, OPTIONS", + "Access-Control-Allow-Headers": "Content-Type", + "Access-Control-Max-Age": "86400", +} + + +def _cors(response: Response) -> Response: + for key, value in _CORS_HEADERS.items(): + response[key] = value + return response + class AnalyticsCollectView(APIView): """Ingest a single pageview event. Intentionally minimal: validate, hash, insert, return 204. All heavier work (aggregation, reporting) happens in periodic tasks. + + CORS headers are always returned so browsers on third-party static sites + can call this endpoint directly. 
""" authentication_classes: list = [] permission_classes: list = [] + def options(self, request: Request, *args: object, **kwargs: object) -> Response: + """Handle CORS preflight requests.""" + return _cors(Response(status=status.HTTP_204_NO_CONTENT)) + def post(self, request: Request, *args: object, **kwargs: object) -> Response: site = (request.data.get("site") or "").strip() path = (request.data.get("path") or "").strip() @@ -36,22 +58,22 @@ def post(self, request: Request, *args: object, **kwargs: object) -> Response: user_agent = request.META.get("HTTP_USER_AGENT", "").strip() if not site: - return Response({"detail": "site is required"}, status=status.HTTP_400_BAD_REQUEST) + return _cors(Response({"detail": "site is required"}, status=status.HTTP_400_BAD_REQUEST)) if not path: - return Response({"detail": "path is required"}, status=status.HTTP_400_BAD_REQUEST) + return _cors(Response({"detail": "path is required"}, status=status.HTTP_400_BAD_REQUEST)) allowed_sites = settings.SITE_ANALYTICS_ALLOWED_SITES if site not in allowed_sites: - return Response({"detail": "unknown site"}, status=status.HTTP_400_BAD_REQUEST) + return _cors(Response({"detail": "unknown site"}, status=status.HTTP_400_BAD_REQUEST)) # Reject empty UA when the stricter hardening flag is enabled. if not user_agent and settings.SITE_ANALYTICS_REJECT_EMPTY_UA: - return Response(status=status.HTTP_204_NO_CONTENT) + return _cors(Response(status=status.HTTP_204_NO_CONTENT)) # Silently drop bot traffic rather than returning an error, to avoid # leaking information about detection heuristics. 
if is_bot(user_agent): - return Response(status=status.HTTP_204_NO_CONTENT) + return _cors(Response(status=status.HTTP_204_NO_CONTENT)) now = timezone.now() month_key = now.strftime("%Y-%m") @@ -66,4 +88,4 @@ def post(self, request: Request, *args: object, **kwargs: object) -> Response: visitor_month_hash=visitor_month_hash, ) - return Response(status=status.HTTP_204_NO_CONTENT) + return _cors(Response(status=status.HTTP_204_NO_CONTENT)) diff --git a/qb_site/site_analytics/tests/test_collect_view.py b/qb_site/site_analytics/tests/test_collect_view.py index ba59d4bd..abfbbfc4 100644 --- a/qb_site/site_analytics/tests/test_collect_view.py +++ b/qb_site/site_analytics/tests/test_collect_view.py @@ -142,3 +142,16 @@ def test_non_empty_ua_accepted_when_flag_enabled(self): resp = self._post({"site": "test-site", "path": "/"}, HTTP_USER_AGENT="Mozilla/5.0") self.assertEqual(resp.status_code, 204) self.assertEqual(AnalyticsPageView.objects.count(), 1) + + # --- CORS --- + + def test_post_response_includes_cors_header(self): + resp = self._post({"site": "test-site", "path": "/"}, HTTP_USER_AGENT="Mozilla/5.0") + self.assertEqual(resp["Access-Control-Allow-Origin"], "*") + + def test_options_preflight_returns_204_with_cors_headers(self): + resp = self.client.options(URL) + self.assertEqual(resp.status_code, 204) + self.assertEqual(resp["Access-Control-Allow-Origin"], "*") + self.assertIn("POST", resp["Access-Control-Allow-Methods"]) + self.assertIn("Content-Type", resp["Access-Control-Allow-Headers"]) From 63159553267c6baf2dac3a73fd8831c5f11c0832 Mon Sep 17 00:00:00 2001 From: Bryan Gin-ge Chen Date: Wed, 25 Mar 2026 17:22:30 -0400 Subject: [PATCH 09/17] fix(dashboard): tolerate missing Dashboard keys in prs_to_list write_dashboard crashed with KeyError when rendering fixtures that predate newer Dashboard enum members (e.g. NotFromFork). Use .get() with an empty-list default so old snapshots produce an empty table rather than aborting page generation. 
Co-Authored-By: Claude Sonnet 4.6 --- src/queueboard/dashboard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/queueboard/dashboard.py b/src/queueboard/dashboard.py index 5bbc04ec..31ee4b46 100755 --- a/src/queueboard/dashboard.py +++ b/src/queueboard/dashboard.py @@ -428,7 +428,7 @@ def _inner( if extra_settings is None: extra_settings = ExtraColumnSettings.default() - return _inner(prs[kind], kind, aggregate_info, extra_settings, header) + return _inner(prs.get(kind, []), kind, aggregate_info, extra_settings, header) # Specific code for writing the actual webpage files. From bc10070788fad59a2ee773ea7c33e67ae44a27cd Mon Sep 17 00:00:00 2001 From: Bryan Gin-ge Chen Date: Wed, 25 Mar 2026 17:26:00 -0400 Subject: [PATCH 10/17] doc: update queueboard_main_workflow.md --- docs/queueboard_main_workflow.md | 140 +++++-------------------------- 1 file changed, 20 insertions(+), 120 deletions(-) diff --git a/docs/queueboard_main_workflow.md b/docs/queueboard_main_workflow.md index d493e699..eded9157 100644 --- a/docs/queueboard_main_workflow.md +++ b/docs/queueboard_main_workflow.md @@ -1,6 +1,4 @@ -Here is the main workflow in the `queueboard` repo, which queries data and generates the dashboard using the code in this repo (`queueboard-core`). For the planned v2 ingestion that replaces these ad‑hoc scripts with a database‑backed syncer, see docs/syncer_ingestion_plan.md. -Note that in the `queueboard` repo, the JSON files in `data/` and `processed-data/` are persisted from run to run by git pushes in this workflow. -This is also true of a few auxiliary text files: `closed_prs_to_backfill.txt`, `missing_prs.txt`, `redownload.txt`, `stubborn_prs.txt`. +Here is the main workflow in the `queueboard` repo, which queries data and generates the dashboard from an API endpoint in a deployed version of the `qb_site/` Django project in this repo (`queueboard-core`). 
```yaml name: Update PR metadata @@ -30,13 +28,8 @@ jobs: url: ${{ steps.deployment.outputs.page_url }} steps: - - name: Checkout repository - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - with: - ref: master - - name: "Checkout queueboard-core" - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: repository: leanprover-community/queueboard-core ref: master @@ -50,12 +43,12 @@ jobs: cp queueboard-core/reviewer-topics.json . - name: "Setup Python" - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: "3.12" - name: "Setup uv" - uses: astral-sh/setup-uv@1e862dfacbd1d6d858c55d9b792c756523627244 # v7.1.4 + uses: astral-sh/setup-uv@e06108dd0aef18192324c70427afc47652e63a82 # v7.5.0 - name: Install queueboard-core (editable) run: | @@ -63,140 +56,47 @@ jobs: cd queueboard-core uv pip install -e . - - name: "Run scripts/gather_stats.sh" - shell: bash -euo pipefail {0} - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - scripts/gather_stats.sh 12 2>&1 | tee gather_stats.log # Log output to gather_stats.log - - - name: "Update aggregate data file" - if: ${{ !cancelled() }} - run: | - # Write files with aggregate PR data, to "processed_data/{all_pr,open_pr,assignment}_data.json". 
- uv run python -m queueboard.process - - - name: "Download .json files for all open PRs" - id: "download-json" - if: ${{ !cancelled() }} - env: - GH_TOKEN: ${{ github.token }} - run: | - bash scripts/dashboard.sh - - - name: "Check data integrity" - if: ${{ !cancelled() && (steps.download-json.outcome == 'success') }} - run: | - uv run python -m queueboard.check_data_integrity - - - name: "(Re-)download missing or outdated PR data" - if: ${{ !cancelled() && (steps.download-json.outcome == 'success') }} - env: - GH_TOKEN: ${{ github.token }} - run: | - scripts/download_missing_outdated_PRs.sh - - - name: "Update aggregate data file (again)" - id: update-aggregate-again - if: ${{ !cancelled() }} - run: | - # Write files with aggregate PR data, to "processed_data/{all_pr,open_pr,assignment}_data.json". - uv run python -m queueboard.process - - - name: Commit changes - if: ${{ !cancelled() }} - id: "commit" - run: | - git config --global user.email 'github-actions[bot]@users.noreply.github.com' - git config --global user.name 'github-actions[bot]' - git add data - # Split the large file before committing and remove original - # Can be reconstructed with `cat processed_data/all_pr_data.json.* > processed_data/all_pr_data.json` - split -b 10M processed_data/all_pr_data.json processed_data/all_pr_data.json. - rm -f processed_data/all_pr_data.json - git add processed_data - # These files may not exist, if there was no broken download to begin with resp. all metadata is up to date. - rm -f broken_pr_data.txt - rm -f outdated_prs.txt - git add *.txt - git commit -m 'Update data; redownload missing and outdated data; regenerate aggregate files' - - - name: Push changes - if: ${{ !cancelled() && steps.commit.outcome == 'success' }} - run: | - # FIXME: make this more robust about incoming changes - # The other workflow does not push to this branch, so this should be fine. 
- git push - - - name: Upload artifact containing files used to generate API - id: upload-pre-api-artifact - if: ${{ !cancelled() && (steps.download-json.outcome == 'success') && (steps.update-aggregate-again.outcome == 'success') }} - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 - with: - name: pre-api-artifact - path: | - queue.json - all-open-PRs-1.json - all-open-PRs-2a.json - all-open-PRs-2b.json - all-open-PRs-3.json - processed_data/open_pr_data.json - processed_data/assignment_data.json - - - name: "Generate the data used in dashboard generation" - id: generate-dashboard-data - if: ${{ !cancelled() && (steps.download-json.outcome == 'success') }} - run: | - uv run python -m queueboard.dashboard_data "all-open-PRs-1.json" "all-open-PRs-2a.json" "all-open-PRs-2b.json" "all-open-PRs-3.json" - rm all-open-PRs-*.json queue.json - - - name: Upload artifact containing API files - id: upload-api-artifact - if: ${{ !cancelled() && (steps.generate-dashboard-data.outcome == 'success') }} - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 - with: - name: api-artifact - path: api/ - - - name: "Regenerate the dashboard webpages" - id: generate-dashboard - if: ${{ !cancelled() && (steps.generate-dashboard-data.outcome == 'success') }} - run: | - uv run python -m queueboard.dashboard - - name: "Generate dashboard from API (rule set 1)" id: generate-dashboard-api-rs1 - if: ${{ !cancelled() && (steps.generate-dashboard-data.outcome == 'success') }} env: QUEUEBOARD_API_BASE_URL: ${{ secrets.QUEUEBOARD_API_BASE_URL }} run: | uv run python -m queueboard.dashboard \ --api \ --rule-set-id 1 \ - --gh-pages-dir gh-pages/test-rule-set-1 \ - --api-dir api-rule-set-1 + --api-dir api-rule-set-1 \ + --gh-pages-dir gh-pages/test-rule-set-1 - name: "Generate dashboard from API (rule set 2)" id: generate-dashboard-api-rs2 - if: ${{ !cancelled() && (steps.generate-dashboard-data.outcome == 'success') }} env: 
QUEUEBOARD_API_BASE_URL: ${{ secrets.QUEUEBOARD_API_BASE_URL }} run: | uv run python -m queueboard.dashboard \ --api \ --rule-set-id 2 \ - --gh-pages-dir gh-pages/test-rule-set-2 \ - --api-dir api-rule-set-2 + --api-dir api-rule-set-2 \ + --gh-pages-dir gh-pages/test-rule-set-2 + + - name: "Generate dashboard from API (rule set 3)" + id: generate-dashboard-api-rs3 + env: + QUEUEBOARD_API_BASE_URL: ${{ secrets.QUEUEBOARD_API_BASE_URL }} + run: | + uv run python -m queueboard.dashboard \ + --api \ + --rule-set-id 3 \ + --api-dir api-rule-set-3 - name: Upload artifact id: pages-artifact - if: ${{ !cancelled() && (steps.generate-dashboard.outcome == 'success') && (steps.generate-dashboard-api-rs1.outcome == 'success') && (steps.generate-dashboard-api-rs2.outcome == 'success') }} + if: ${{ (steps.generate-dashboard-api-rs1.outcome == 'success') && (steps.generate-dashboard-api-rs2.outcome == 'success') && (steps.generate-dashboard-api-rs3.outcome == 'success') }} uses: actions/upload-pages-artifact@7b1f4a764d45c48632c6b24a0339c27f5614fb0b # v4.0.0 with: path: gh-pages - name: Deploy to GitHub Pages - if: ${{ github.ref == 'refs/heads/master' && !cancelled() && (steps.pages-artifact.outcome == 'success') }} + if: ${{ github.ref == 'refs/heads/master' && (steps.pages-artifact.outcome == 'success') }} id: deployment uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4.0.5 ``` From 19eb5e25f3066c803474f25c8c81bfc9b894b081 Mon Sep 17 00:00:00 2001 From: Bryan Gin-ge Chen Date: Wed, 25 Mar 2026 17:29:07 -0400 Subject: [PATCH 11/17] feat(dashboard): inject analytics tracking snippet into generated pages Adds --analytics-site / QUEUEBOARD_ANALYTICS_SITE support to dashboard.py: - Widens CSP connect-src when an analytics host is configured. - Injects a sendBeacon/fetch snippet before </body> on every generated page, including the static area_stats.html and dependency_dashboard.html.
- Snippet is omitted entirely when the flag is absent, so existing deployments are unaffected. Updates docs/queueboard_main_workflow.md to pass QUEUEBOARD_ANALYTICS_SITE (new repo secret) alongside the existing QUEUEBOARD_API_BASE_URL in all three dashboard-generation steps. Co-Authored-By: Claude Sonnet 4.6 --- docs/queueboard_main_workflow.md | 3 ++ src/queueboard/dashboard.py | 64 ++++++++++++++++++++++++++++---- 2 files changed, 60 insertions(+), 7 deletions(-) diff --git a/docs/queueboard_main_workflow.md b/docs/queueboard_main_workflow.md index eded9157..36f30131 100644 --- a/docs/queueboard_main_workflow.md +++ b/docs/queueboard_main_workflow.md @@ -60,6 +60,7 @@ jobs: id: generate-dashboard-api-rs1 env: QUEUEBOARD_API_BASE_URL: ${{ secrets.QUEUEBOARD_API_BASE_URL }} + QUEUEBOARD_ANALYTICS_SITE: ${{ secrets.QUEUEBOARD_ANALYTICS_SITE }} run: | uv run python -m queueboard.dashboard \ --api \ @@ -71,6 +72,7 @@ jobs: id: generate-dashboard-api-rs2 env: QUEUEBOARD_API_BASE_URL: ${{ secrets.QUEUEBOARD_API_BASE_URL }} + QUEUEBOARD_ANALYTICS_SITE: ${{ secrets.QUEUEBOARD_ANALYTICS_SITE }} run: | uv run python -m queueboard.dashboard \ --api \ @@ -82,6 +84,7 @@ jobs: id: generate-dashboard-api-rs3 env: QUEUEBOARD_API_BASE_URL: ${{ secrets.QUEUEBOARD_API_BASE_URL }} + QUEUEBOARD_ANALYTICS_SITE: ${{ secrets.QUEUEBOARD_ANALYTICS_SITE }} run: | uv run python -m queueboard.dashboard \ --api \ diff --git a/src/queueboard/dashboard.py b/src/queueboard/dashboard.py index 31ee4b46..b05ca744 100755 --- a/src/queueboard/dashboard.py +++ b/src/queueboard/dashboard.py @@ -433,12 +433,15 @@ def _inner( # Specific code for writing the actual webpage files. 
-HTML_HEADER = """ - + +def _make_html_header(analytics_host: str = "") -> str: + """Return the HTML header, optionally widening the CSP to allow analytics beacons.""" + connect_src = f" connect-src 'self' {analytics_host};" if analytics_host else "" + return f""" - + Mathlib review and triage dashboard " + ) GH_PAGES_DIR = "gh-pages" API_DIR = "api" +ANALYTICS_HOST: str = "" # set by main() when --analytics-site is provided +ANALYTICS_SNIPPET: str = "" # pre-rendered snippet injected before # Write a webpage with body out a file called 'outfile'. @@ -467,8 +493,9 @@ def write_webpage(body: str, outfile: str, use_tables: bool = True, standard: bo if use_tables else "" ) - footer = f"{script}\n" - print(f"{HTML_HEADER}\n{body}\n{footer}", file=fi) + analytics = f"\n{ANALYTICS_SNIPPET}" if ANALYTICS_SNIPPET else "" + footer = f"{script}{analytics}\n" + print(f"{_make_html_header(ANALYTICS_HOST)}\n{body}\n{footer}", file=fi) def _parse_args(argv: List[str]) -> argparse.Namespace: @@ -518,6 +545,14 @@ def _parse_args(argv: List[str]) -> argparse.Namespace: action="store_true", help="Ask the server to refresh cached payloads before serving (API fetch only).", ) + parser.add_argument( + "--analytics-site", + default=os.environ.get("QUEUEBOARD_ANALYTICS_SITE"), + help=( + "Site slug for pageview analytics (must be in SITE_ANALYTICS_ALLOWED_SITES on the server). " + "Requires --api-base-url. When set, a tracking snippet is injected into every generated page." + ), + ) # Legacy positional arguments remain accepted for compatibility. 
parser.add_argument("legacy_gh_pages_dir", nargs="?", help="Legacy gh-pages directory positional argument.") parser.add_argument("legacy_api_dir", nargs="?", help="Legacy api directory positional argument.") @@ -1122,8 +1157,15 @@ def main() -> None: args = _parse_args(sys.argv[1:]) global GH_PAGES_DIR global API_DIR + global ANALYTICS_HOST + global ANALYTICS_SNIPPET GH_PAGES_DIR = args.gh_pages_dir or args.legacy_gh_pages_dir or GH_PAGES_DIR # "gh-pages" by default API_DIR = args.api_dir or args.legacy_api_dir or API_DIR # "api" by default + if args.analytics_site and args.api_base_url: + ANALYTICS_HOST = args.api_base_url.rstrip("/") + ANALYTICS_SNIPPET = _make_analytics_snippet(ANALYTICS_HOST, args.analytics_site) + elif args.analytics_site: + print("Warning: --analytics-site requires --api-base-url; analytics snippet will be omitted.", file=sys.stderr) if args.api: _fetch_api_payloads(args, API_DIR) @@ -1177,6 +1219,14 @@ def main() -> None: with as_file(files("queueboard").joinpath("static")) as tmp: shutil.copytree(tmp, GH_PAGES_DIR, dirs_exist_ok=True) + # Inject analytics snippet into the static HTML pages (which bypass write_webpage). + if ANALYTICS_SNIPPET: + for static_html in ("area_stats.html", "dependency_dashboard.html"): + dest = path.join(GH_PAGES_DIR, static_html) + if path.exists(dest): + content = open(dest).read() + open(dest, "w").write(content.replace("</body>", f"{ANALYTICS_SNIPPET}\n</body>", 1)) + if __name__ == "__main__": main() From 92b422f3809acdd97414c0663bfc49cb3ad0d2ab Mon Sep 17 00:00:00 2001 From: Bryan Gin-ge Chen Date: Wed, 25 Mar 2026 17:32:04 -0400 Subject: [PATCH 12/17] doc: expand queueboard_main_workflow.md with context and secrets table Adds a prose overview of how the workflow operates, a table of the two required repo secrets (QUEUEBOARD_API_BASE_URL and the new QUEUEBOARD_ANALYTICS_SITE), and a note that omitting the analytics secret silently disables snippet injection.
Co-Authored-By: Claude Sonnet 4.6 --- docs/queueboard_main_workflow.md | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/docs/queueboard_main_workflow.md b/docs/queueboard_main_workflow.md index 36f30131..db40b49c 100644 --- a/docs/queueboard_main_workflow.md +++ b/docs/queueboard_main_workflow.md @@ -1,4 +1,32 @@ -Here is the main workflow in the `queueboard` repo, which queries data and generates the dashboard from an API endpoint in a deployed version of the `qb_site/` Django project in this repo (`queueboard-core`). +This document describes the main GitHub Actions workflow used in the sibling +[`queueboard`](https://github.com/leanprover-community/queueboard) repo. +The workflow runs every 8 minutes, fetches fresh PR metadata from a deployed +instance of `qb_site/` (the Django backend in this repo), generates static +dashboard HTML, and publishes it to GitHub Pages. + +## How it works + +1. **Checkout** — checks out `queueboard-core` (this repo) to get scripts, + GraphQL query templates, and the `queueboard` Python package. +2. **Fetch + generate** — calls `python -m queueboard.dashboard --api` three + times, once per rule set (different queue-classification rules for + experimentation). Each run downloads JSON payloads from the backend API and + renders a set of HTML dashboard pages into `gh-pages/<rule-set-dir>/`. +3. **Deploy** — uploads the `gh-pages/` tree as a Pages artifact and deploys + it if the run is on the `master` branch and all three generation steps + succeeded. + +## Required repository secrets + +| Secret | Purpose | |---|---| | `QUEUEBOARD_API_BASE_URL` | Base URL of the deployed `qb_site` instance (e.g. `https://queueboard.example.com`). Used both to fetch API payloads and as the analytics endpoint host. | | `QUEUEBOARD_ANALYTICS_SITE` | Site slug registered in `SITE_ANALYTICS_ALLOWED_SITES` on the server (e.g. `queueboard`).
When set, a privacy-preserving analytics snippet is injected into every generated page. Omit to disable analytics. | + +If `QUEUEBOARD_ANALYTICS_SITE` is absent (secret not configured), the snippet +is silently omitted and all other workflow behaviour is unchanged. + +## Workflow YAML ```yaml name: Update PR metadata From 8de812ab51e141c6ca7b261ca1cda502865b6c3b Mon Sep 17 00:00:00 2001 From: Bryan Gin-ge Chen Date: Wed, 25 Mar 2026 17:32:54 -0400 Subject: [PATCH 13/17] feat(dashboard): add privacy notice alongside analytics snippet Injects a visible one-line notice ("no cookies, no IP addresses stored") before </body> on every analytics-enabled page, styled via a new .analytics-notice CSS rule. The notice is part of the same injection block as the tracking script, so it appears whenever analytics is active and is absent otherwise. Updates the ADR (031) to: - Add disclosure as an explicit onboarding step. - Document the required notice wording and note that site adopters are responsible for adding equivalent disclosure to their own pages. Co-Authored-By: Claude Sonnet 4.6 --- docs/design-decisions/031-analytics-ingestion-design.md | 6 ++++-- src/queueboard/dashboard.py | 6 ++++-- src/queueboard/static/style.css | 6 ++++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/design-decisions/031-analytics-ingestion-design.md b/docs/design-decisions/031-analytics-ingestion-design.md index 1203872f..d406c194 100644 --- a/docs/design-decisions/031-analytics-ingestion-design.md +++ b/docs/design-decisions/031-analytics-ingestion-design.md @@ -78,8 +78,9 @@ 1. Add the site slug to `SITE_ANALYTICS_ALLOWED_SITES` (comma-separated, no spaces). 2. Deploy/restart the web dyno so the new slug is live. 3. Add the tracking snippet to the site (see below). -4. Verify events appear in the Django admin under `AnalyticsPageView`. -5. After one aggregation cycle, check `AnalyticsDailyMetric` for counts. +4.
Add a visible privacy notice to the site informing visitors that anonymous visit counts are collected (no cookies, no IP addresses stored). See the snippet notes below for the recommended wording. +5. Verify events appear in the Django admin under `AnalyticsPageView`. +6. After one aggregation cycle, check `AnalyticsDailyMetric` for counts. ### Static-site tracking snippet @@ -115,6 +116,7 @@ Replace `YOUR_QUEUEBOARD_HOST` and `YOUR_SITE_SLUG` before deploying. - `sendBeacon` is preferred: it survives page unload and does not block navigation. - No cookies, no persistent identifiers, no third-party scripts. - The endpoint returns `204` for all non-error outcomes (success, bot drop, unknown UA) so the response body is never read. +- **Disclosure requirement:** Sites using this snippet must inform visitors that anonymous visit counts are collected. The recommended notice text is: *"This page collects anonymous visit counts for usage reporting (no cookies, no IP addresses stored)."* The queueboard dashboard injects this notice automatically alongside the snippet; other sites should add equivalent wording to their footer or privacy statement. 
### Migrations - `0001_initial` — `AnalyticsPageView` diff --git a/src/queueboard/dashboard.py b/src/queueboard/dashboard.py index b05ca744..6b38f43b 100755 --- a/src/queueboard/dashboard.py +++ b/src/queueboard/dashboard.py @@ -457,9 +457,10 @@ def _make_html_header(analytics_host: str = "") -> str: def _make_analytics_snippet(host: str, site: str) -> str: - """Return the pageview tracking " ) + return f"{notice}\n{script}" GH_PAGES_DIR = "gh-pages" diff --git a/src/queueboard/static/style.css b/src/queueboard/static/style.css index 95568e2c..9516d5be 100644 --- a/src/queueboard/static/style.css +++ b/src/queueboard/static/style.css @@ -174,3 +174,9 @@ span.label { .qb-filter-option span { font-size: 0.95rem; } + +.analytics-notice { + margin-top: 2rem; + font-size: 0.8rem; + color: #888; +} From 433e7c60139079262641ace375c146bdb5e61bad Mon Sep 17 00:00:00 2001 From: Bryan Gin-ge Chen Date: Wed, 25 Mar 2026 17:36:05 -0400 Subject: [PATCH 14/17] Revert "feat(dashboard): add privacy notice alongside analytics snippet" This reverts commit 8de812ab51e141c6ca7b261ca1cda502865b6c3b. --- docs/design-decisions/031-analytics-ingestion-design.md | 6 ++---- src/queueboard/dashboard.py | 6 ++---- src/queueboard/static/style.css | 6 ------ 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/docs/design-decisions/031-analytics-ingestion-design.md b/docs/design-decisions/031-analytics-ingestion-design.md index d406c194..1203872f 100644 --- a/docs/design-decisions/031-analytics-ingestion-design.md +++ b/docs/design-decisions/031-analytics-ingestion-design.md @@ -78,9 +78,8 @@ 1. Add the site slug to `SITE_ANALYTICS_ALLOWED_SITES` (comma-separated, no spaces). 2. Deploy/restart the web dyno so the new slug is live. 3. Add the tracking snippet to the site (see below). -4. Add a visible privacy notice to the site informing visitors that anonymous visit counts are collected (no cookies, no IP addresses stored). See the snippet notes below for the recommended wording. -5. 
Verify events appear in the Django admin under `AnalyticsPageView`. -6. After one aggregation cycle, check `AnalyticsDailyMetric` for counts. +4. Verify events appear in the Django admin under `AnalyticsPageView`. +5. After one aggregation cycle, check `AnalyticsDailyMetric` for counts. ### Static-site tracking snippet @@ -116,7 +115,6 @@ Replace `YOUR_QUEUEBOARD_HOST` and `YOUR_SITE_SLUG` before deploying. - `sendBeacon` is preferred: it survives page unload and does not block navigation. - No cookies, no persistent identifiers, no third-party scripts. - The endpoint returns `204` for all non-error outcomes (success, bot drop, unknown UA) so the response body is never read. -- **Disclosure requirement:** Sites using this snippet must inform visitors that anonymous visit counts are collected. The recommended notice text is: *"This page collects anonymous visit counts for usage reporting (no cookies, no IP addresses stored)."* The queueboard dashboard injects this notice automatically alongside the snippet; other sites should add equivalent wording to their footer or privacy statement. 
### Migrations - `0001_initial` — `AnalyticsPageView` diff --git a/src/queueboard/dashboard.py b/src/queueboard/dashboard.py index 6b38f43b..b05ca744 100755 --- a/src/queueboard/dashboard.py +++ b/src/queueboard/dashboard.py @@ -457,10 +457,9 @@ def _make_html_header(analytics_host: str = "") -> str: def _make_analytics_snippet(host: str, site: str) -> str: - """Return the privacy notice paragraph and pageview tracking " ) - return f"{notice}\n{script}" GH_PAGES_DIR = "gh-pages" diff --git a/src/queueboard/static/style.css b/src/queueboard/static/style.css index 9516d5be..95568e2c 100644 --- a/src/queueboard/static/style.css +++ b/src/queueboard/static/style.css @@ -174,9 +174,3 @@ span.label { .qb-filter-option span { font-size: 0.95rem; } - -.analytics-notice { - margin-top: 2rem; - font-size: 0.8rem; - color: #888; -} From 05846da0495c320ab9e76a4d0069865cc503382a Mon Sep 17 00:00:00 2001 From: Bryan Gin-ge Chen Date: Wed, 25 Mar 2026 21:24:32 -0400 Subject: [PATCH 15/17] feat(dashboard): add privacy notice and document disclosure rationale MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Injects a visible one-line notice ("no cookies, no IP addresses stored") before on every analytics-enabled page, styled via a new .analytics-notice CSS rule. The notice appears whenever analytics is active and is absent otherwise. Adds a "Disclosure and privacy regulations" section to the ADR (031) covering ePrivacy Art. 5(3), GDPR Recital 26 / Art. 13, CJEU Breyer (C-582/14), EDPB Guidelines 2/2023, and CNIL consent-exempt analytics guidance — all with verified source URLs. Documents the practical position: no consent banner required, but a brief privacy notice is recommended as good practice and to satisfy GDPR Art. 13 under the cautious reading. 
Co-Authored-By: Claude Sonnet 4.6 --- .../031-analytics-ingestion-design.md | 17 +++++++++++++++-- src/queueboard/dashboard.py | 6 ++++-- src/queueboard/static/style.css | 6 ++++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/docs/design-decisions/031-analytics-ingestion-design.md b/docs/design-decisions/031-analytics-ingestion-design.md index 1203872f..9a399094 100644 --- a/docs/design-decisions/031-analytics-ingestion-design.md +++ b/docs/design-decisions/031-analytics-ingestion-design.md @@ -78,8 +78,9 @@ 1. Add the site slug to `SITE_ANALYTICS_ALLOWED_SITES` (comma-separated, no spaces). 2. Deploy/restart the web dyno so the new slug is live. 3. Add the tracking snippet to the site (see below). -4. Verify events appear in the Django admin under `AnalyticsPageView`. -5. After one aggregation cycle, check `AnalyticsDailyMetric` for counts. +4. Add a visible privacy notice to the site informing visitors that anonymous visit counts are collected (no cookies, no IP addresses stored). See disclosure notes below. +5. Verify events appear in the Django admin under `AnalyticsPageView`. +6. After one aggregation cycle, check `AnalyticsDailyMetric` for counts. ### Static-site tracking snippet @@ -116,6 +117,18 @@ Replace `YOUR_QUEUEBOARD_HOST` and `YOUR_SITE_SLUG` before deploying. - No cookies, no persistent identifiers, no third-party scripts. - The endpoint returns `204` for all non-error outcomes (success, bot drop, unknown UA) so the response body is never read. +### Disclosure and privacy regulations + +This system is designed to minimise regulatory obligations, but the picture is nuanced enough to warrant documentation. *This section is informational, not legal advice.* + +**ePrivacy Directive (EU, [2002/58/EC](https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:02002L0058-20091219) Art. 5(3)) and UK PECR (SI 2003/2426 Reg. 
6)** — these rules require consent for *storing information in, or gaining access to information already stored in, the user's terminal equipment*. This system performs server-side hash computation on data transmitted in the HTTP request (IP address from the network layer, `User-Agent` header) and writes nothing to the user's device (no cookies, no localStorage, no fingerprinting scripts). The [EDPB Guidelines 2/2023 on the Technical Scope of Art. 5(3)](https://www.edpb.europa.eu/our-work-tools/our-documents/guidelines/guidelines-22023-technical-scope-art-53-eprivacy-directive_en) (adopted October 2024) take a broad view: they state that gaining access to IP addresses triggers Art. 5(3) "in cases where this information originates from the terminal equipment of a subscriber or user." Whether passively transmitted HTTP request metadata (as opposed to data actively read from device storage) falls under this scope is not conclusively settled. The [ICO's guidance on storage and access technologies](https://ico.org.uk/for-organisations/direct-marketing-and-privacy-and-electronic-communications/guidance-on-the-use-of-storage-and-access-technologies/) defines PECR Regulation 6 as covering technologies that "store information on a user's device or gain access to information on a user's device." On balance, a system that neither stores nor reads from the device is likely outside the scope of Art. 5(3) / Reg. 6, but this is an area of evolving regulatory interpretation. + +**GDPR / UK GDPR ([Regulation 2016/679](https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:32016R0679))** — whether GDPR applies turns on whether the monthly rotating hash constitutes "personal data" under Art. 4(1). 
Recital 26 excludes "anonymous information" from GDPR's scope, and provides a "means reasonably likely" test: whether identification is feasible given "all objective factors, such as the costs of and the amount of time required for identification, taking into consideration the available technology." The CJEU's ruling in [*Breyer v. Bundesrepublik Deutschland* (C-582/14, 2016)](https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:62014CJ0582) established that dynamic IP addresses can constitute personal data, but only where the controller has "the legal means which enable it to identify the data subject with additional data which the internet service provider has about that person." In this system the raw IP is never stored and the salt is secret and rotated monthly, which is a strong argument for anonymity under Recital 26. The cautious position treats the hash as pseudonymous personal data; in that case, Art. 6(1)(f) legitimate interests is the appropriate lawful basis for coarse usage analytics — **no consent is required** — but Art. 13 transparency obligations apply, meaning visitors must be able to find information about the processing (e.g., via a linked privacy statement or footer notice). + +**[CNIL guidance](https://www.cnil.fr/fr/cookies-solutions-pour-les-outils-de-mesure-daudience) (France)** — the CNIL's framework for consent-exempt analytics covers tools that use short-lived identifiers with immediate IP anonymisation, provided the data is used solely for audience measurement, does not enable cross-site tracking, and is retained for no more than 25 months. This system satisfies those conditions. The CNIL still expects users to be informed of the tracking (e.g., via the site's privacy policy), even for consent-exempt tools. + +**Practical position:** No consent banner is required. As a matter of good practice — and to satisfy GDPR Art. 
13 under the cautious reading that the hash is personal data — sites using this snippet should make a brief privacy notice accessible to visitors. The recommended notice text is: *"This page collects anonymous visit counts for usage reporting (no cookies, no IP addresses stored)."* The queueboard dashboard injects this notice automatically alongside the snippet; other sites should add equivalent wording to their footer or privacy statement. + ### Migrations - `0001_initial` — `AnalyticsPageView` - `0002_analyticsdailymetric` — `AnalyticsDailyMetric` diff --git a/src/queueboard/dashboard.py b/src/queueboard/dashboard.py index b05ca744..6b38f43b 100755 --- a/src/queueboard/dashboard.py +++ b/src/queueboard/dashboard.py @@ -457,9 +457,10 @@ def _make_html_header(analytics_host: str = "") -> str: def _make_analytics_snippet(host: str, site: str) -> str: - """Return the pageview tracking " ) + return f"{notice}\n{script}" GH_PAGES_DIR = "gh-pages" diff --git a/src/queueboard/static/style.css b/src/queueboard/static/style.css index 95568e2c..9516d5be 100644 --- a/src/queueboard/static/style.css +++ b/src/queueboard/static/style.css @@ -174,3 +174,9 @@ span.label { .qb-filter-option span { font-size: 0.95rem; } + +.analytics-notice { + margin-top: 2rem; + font-size: 0.8rem; + color: #888; +} From aff020557573795c22a3934119aa2a2da806db33 Mon Sep 17 00:00:00 2001 From: Bryan Gin-ge Chen Date: Wed, 25 Mar 2026 21:42:13 -0400 Subject: [PATCH 16/17] fix(site_analytics): pin clock in time-dependent aggregation/prune tests Mock django.utils.timezone.now in tests that compare hardcoded fixture dates against the real clock. Without this, task tests flake near midnight boundaries and prune service tests rot as the hardcoded dates age past their retention windows. 
Co-Authored-By: Claude Sonnet 4.6 --- .../site_analytics/tests/test_aggregation.py | 9 ++++++- .../tests/test_monthly_aggregation.py | 24 +++++++++++++++---- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/qb_site/site_analytics/tests/test_aggregation.py b/qb_site/site_analytics/tests/test_aggregation.py index 4f13bd85..241031ec 100644 --- a/qb_site/site_analytics/tests/test_aggregation.py +++ b/qb_site/site_analytics/tests/test_aggregation.py @@ -3,6 +3,7 @@ from __future__ import annotations import datetime +from unittest import mock from django.test import TestCase, override_settings @@ -118,7 +119,13 @@ def test_date_boundary_is_utc(self): @_SALT class AggregateDailyMetricsTaskTests(TestCase): - def test_task_returns_summary_dict(self): + @mock.patch("django.utils.timezone.now") + def test_task_returns_summary_dict(self, mock_now): + # Pin the clock so the task's default date=timezone.now().date() matches + # the pageview date, avoiding midnight-boundary flakiness. + mock_now.return_value = datetime.datetime( + TODAY.year, TODAY.month, TODAY.day, 12, 0, 0, tzinfo=datetime.timezone.utc + ) _pv("s1", "/", TODAY, "h1") result = aggregate_daily_metrics_task(days_back=1) self.assertIn("upserted", result) diff --git a/qb_site/site_analytics/tests/test_monthly_aggregation.py b/qb_site/site_analytics/tests/test_monthly_aggregation.py index c06fea84..bc6a2e4f 100644 --- a/qb_site/site_analytics/tests/test_monthly_aggregation.py +++ b/qb_site/site_analytics/tests/test_monthly_aggregation.py @@ -3,6 +3,7 @@ from __future__ import annotations import datetime +from unittest import mock from django.test import TestCase, override_settings @@ -86,7 +87,13 @@ def test_no_raw_rows_preserves_existing_metric(self): metric = AnalyticsMonthlyMetric.objects.get(site="s1", month=MAR_START) self.assertEqual(metric.pageviews, 99) # preserved, not zeroed - def test_task_returns_summary_dict(self): + @mock.patch("django.utils.timezone.now") + def 
test_task_returns_summary_dict(self, mock_now): + # Pin the clock so the task's default date=timezone.now().date() matches + # the pageview date, avoiding midnight-boundary flakiness. + mock_now.return_value = datetime.datetime( + MAR_2026.year, MAR_2026.month, MAR_2026.day, 12, 0, 0, tzinfo=datetime.timezone.utc + ) _pv("s1", "/", MAR_2026, "h1") result = aggregate_monthly_metrics_task(months_back=1) self.assertIn("upserted", result) @@ -96,7 +103,12 @@ def test_task_returns_summary_dict(self): @_SALT class PruneOldPageviewsTests(TestCase): - def test_prunes_rows_older_than_retention(self): + # Pin the clock so hardcoded dates remain "old" or "recent" as intended. + _NOW = datetime.datetime(2026, 3, 25, 12, 0, 0, tzinfo=datetime.timezone.utc) + + @mock.patch("django.utils.timezone.now") + def test_prunes_rows_older_than_retention(self, mock_now): + mock_now.return_value = self._NOW old = _pv("s1", "/old", datetime.date(2023, 1, 1)) recent = _pv("s1", "/new", datetime.date(2026, 3, 1)) @@ -106,7 +118,9 @@ def test_prunes_rows_older_than_retention(self): self.assertTrue(AnalyticsPageView.objects.filter(pk=recent.pk).exists()) self.assertEqual(result["deleted"], 1) - def test_no_rows_deleted_when_all_recent(self): + @mock.patch("django.utils.timezone.now") + def test_no_rows_deleted_when_all_recent(self, mock_now): + mock_now.return_value = self._NOW _pv("s1", "/", datetime.date(2026, 3, 24)) result = prune_old_pageviews(retention_days=30) self.assertEqual(result["deleted"], 0) @@ -117,8 +131,10 @@ def test_returns_cutoff_and_retention_days(self): self.assertIn("cutoff", result) self.assertEqual(result["retention_days"], 90) + @mock.patch("django.utils.timezone.now") @override_settings(SITE_ANALYTICS_RETENTION_DAYS=7) - def test_uses_settings_default_when_not_specified(self): + def test_uses_settings_default_when_not_specified(self, mock_now): + mock_now.return_value = self._NOW _pv("s1", "/", datetime.date(2026, 3, 1)) result = prune_old_pageviews() 
self.assertEqual(result["retention_days"], 7) From c077649ac0dd2c12b5d0aa0608760fba323ab120 Mon Sep 17 00:00:00 2001 From: Bryan Gin-ge Chen Date: Thu, 26 Mar 2026 23:47:39 -0400 Subject: [PATCH 17/17] feat(site_analytics): rotate visitor-hash salt monthly via DB Replaces the fixed SITE_ANALYTICS_HASH_SALT env var with a randomly-generated per-month salt stored in a new SiteAnalyticsSalt model. A new Celery beat task (site_analytics.rotate_salt) runs at midnight UTC on the 1st of each month, creates a fresh salt, and deletes the previous row atomically, providing forward secrecy: past hashes cannot be re-derived even if the current salt leaks. Changes: - New SiteAnalyticsSalt model + migration 0004. - compute_visitor_hash() replaces compute_visitor_month_hash(): drops the explicit YYYY-MM argument; cross-month isolation is now provided by the rotating salt instead. - 60-second in-process salt cache with _reset_salt_cache() helper for test isolation; falls back to SITE_ANALYTICS_HASH_SALT until the first rotation task runs. - rotate_salt_task registered in beat schedule (monthly crontab). - SiteAnalyticsSalt added to TRUNCATE_TABLES in backup_policy.py (contains the live secret; excluded from sanitized backups). - Updated tests: ComputeVisitorHashTests, cache reset in collect-view setUp, new test_salt.py covering rotation task behaviour. - ADR and AGENTS.md updated to reflect new privacy model. 
Co-Authored-By: Claude Sonnet 4.6 --- .../031-analytics-ingestion-design.md | 14 +++-- qb_site/api/views/analytics_collect.py | 5 +- qb_site/qb_site/settings/base.py | 7 +++ qb_site/site_analytics/AGENTS.md | 9 ++- .../migrations/0004_siteanalyticssalt.py | 21 +++++++ qb_site/site_analytics/models/__init__.py | 1 + qb_site/site_analytics/models/salt.py | 21 +++++++ qb_site/site_analytics/services/hashing.py | 44 ++++++++++++-- qb_site/site_analytics/tasks/__init__.py | 2 + qb_site/site_analytics/tasks/rotate_salt.py | 26 ++++++++ .../site_analytics/tests/test_collect_view.py | 2 + qb_site/site_analytics/tests/test_salt.py | 48 +++++++++++++++ qb_site/site_analytics/tests/test_services.py | 60 +++++++++++++------ scripts/backup_policy.py | 3 + 14 files changed, 227 insertions(+), 36 deletions(-) create mode 100644 qb_site/site_analytics/migrations/0004_siteanalyticssalt.py create mode 100644 qb_site/site_analytics/models/salt.py create mode 100644 qb_site/site_analytics/tasks/rotate_salt.py create mode 100644 qb_site/site_analytics/tests/test_salt.py diff --git a/docs/design-decisions/031-analytics-ingestion-design.md b/docs/design-decisions/031-analytics-ingestion-design.md index 9a399094..c007f6bb 100644 --- a/docs/design-decisions/031-analytics-ingestion-design.md +++ b/docs/design-decisions/031-analytics-ingestion-design.md @@ -7,7 +7,7 @@ - Operational constraint: ingestion endpoint must be fast, resilient, and callable from third-party static sites (CORS required). ## Decision -- New Django app `site_analytics` with three models, a REST ingestion endpoint, and three Celery periodic tasks. +- New Django app `site_analytics` with four models, a REST ingestion endpoint, and four Celery periodic tasks. - Site allowlist (`SITE_ANALYTICS_ALLOWED_SITES`) gates ingestion; no per-site auth tokens in v1. - Reporting reads from aggregate tables only; raw rows are bounded-retention scratch space. 
@@ -21,6 +21,7 @@ - Fields: `site`, `date` (UTC), `pageviews`, `unique_visitors`. - `AnalyticsMonthlyMetric` — monthly aggregate per site; unique on `(site, month)`. - Fields: `site`, `month` (UTC first-of-month `DateField`, e.g. `2026-03-01`), `pageviews`, `unique_visitors`. +- `SiteAnalyticsSalt` — single live row holding the current month's visitor-hash salt; previous row deleted on rotation. ### Ingestion endpoint - `POST /api/v1/analytics/collect` — view in `api/views/analytics_collect.py`. @@ -32,12 +33,12 @@ - No CSRF enforcement: DRF `authentication_classes = []` / `permission_classes = []`. ### Privacy -- Raw IP not stored. `visitor_month_hash = sha256(ip | normalized_ua | YYYY-MM | salt)`. +- Raw IP not stored. `visitor_month_hash = sha256(ip | normalized_ua | salt)`. - Fields joined with `|` to prevent cross-field hash collisions. - UA is lowercased before hashing so casing variation in the same browser does not inflate unique-visitor counts. - IP extracted from `X-Forwarded-For` (leftmost address; set by Heroku routing and most reverse proxies), falling back to `REMOTE_ADDR` for direct connections. -- `month_key` is UTC `YYYY-MM`; cross-month correlation is impossible by construction. -- Salt from `SITE_ANALYTICS_HASH_SALT` env var. **Changing the salt or separator invalidates all historical hashes.** +- Cross-month correlation is prevented by monthly salt rotation: the salt is replaced at the start of each month and the old value discarded, so hashes from different months are unlinkable even with knowledge of the current salt (forward secrecy against salt leakage). +- Salt is stored in `SiteAnalyticsSalt` (DB, one live row). `SITE_ANALYTICS_HASH_SALT` env var is the fallback until the first rotation task runs. **Changing the separator invalidates all historical hashes.** ### Bot filtering - Substring denylist in `site_analytics/services/bot_filter.py` matched against lowercased UA. 
@@ -48,15 +49,17 @@ - `site_analytics.aggregate_daily_metrics` (default: hourly) — upserts `AnalyticsDailyMetric` for a rolling `days_back=2` window so events near UTC midnight are never missed. Fully idempotent. - `site_analytics.aggregate_monthly_metrics` (default: daily) — upserts `AnalyticsMonthlyMetric` for a rolling `months_back=2` window. - `site_analytics.prune_old_pageviews` (default: daily) — deletes `AnalyticsPageView` rows older than the retention cutoff. Aggregate tables are never pruned. +- `site_analytics.rotate_salt` (monthly, midnight UTC on the 1st) — generates a new random salt, saves it to `SiteAnalyticsSalt`, and deletes the previous row atomically. -- All three tasks preserve existing aggregate rows when the corresponding raw rows have been pruned, to avoid silently zeroing out reporting data after the retention window. +- The three aggregation and pruning tasks preserve existing aggregate rows when the corresponding raw rows have been pruned, to avoid silently zeroing out reporting data after the retention window. ### Backup policy - `site_analytics_analyticspageview` → `TRUNCATE_TABLES` (raw rows contain visitor hashes). - `site_analytics_analyticsdailymetric`, `site_analytics_analyticsmonthlymetric` → `RETAIN_TABLES` (aggregate-only, safe to share publicly). +- `site_analytics_siteanalyticssalt` → `TRUNCATE_TABLES` (contains the live secret; must never appear in sanitized backups). ## Invariants - Reporting reads must come from aggregate tables, not raw pageview scans. -- Hashing semantics (field separator `|`, UA normalization, month key format, salt) are part of the privacy contract; any change requires an explicit migration note and bumps all historical hashes. +- Hashing semantics (field separator `|`, UA normalization, salt source) are part of the privacy contract; any change requires an explicit migration note and bumps all historical hashes. - Aggregation tasks must remain idempotent and safe under retries and overlapping runs. - All date/time boundaries use UTC. `occurred_at__date=d` with `USE_TZ=True` and `TIME_ZONE=UTC` evaluates at UTC midnight in PostgreSQL.
- `site` slugs must remain stable; renames require an explicit backfill of both raw rows and aggregate tables. @@ -133,6 +136,7 @@ This system is designed to minimise regulatory obligations, but the picture is n - `0001_initial` — `AnalyticsPageView` - `0002_analyticsdailymetric` — `AnalyticsDailyMetric` - `0003_analyticsmonthlymetric` — `AnalyticsMonthlyMetric` +- `0004_siteanalyticssalt` — `SiteAnalyticsSalt` ## Consequences -- Adds ~48 DB tables to backup scope (three new, classified in policy). +- Adds ~49 DB tables to backup scope (four new, classified in policy). diff --git a/qb_site/api/views/analytics_collect.py b/qb_site/api/views/analytics_collect.py index 2b217982..f9782912 100644 --- a/qb_site/api/views/analytics_collect.py +++ b/qb_site/api/views/analytics_collect.py @@ -11,7 +11,7 @@ from site_analytics.models import AnalyticsPageView from site_analytics.services.bot_filter import is_bot -from site_analytics.services.hashing import compute_visitor_month_hash, get_client_ip +from site_analytics.services.hashing import compute_visitor_hash, get_client_ip # Hard caps to guard against oversized payloads hitting DB column limits.
_PATH_MAX = 2000 @@ -76,8 +76,7 @@ def post(self, request: Request, *args: object, **kwargs: object) -> Response: return _cors(Response(status=status.HTTP_204_NO_CONTENT)) now = timezone.now() - month_key = now.strftime("%Y-%m") - visitor_month_hash = compute_visitor_month_hash(get_client_ip(request), user_agent, month_key) + visitor_month_hash = compute_visitor_hash(get_client_ip(request), user_agent) AnalyticsPageView.objects.create( site=site, diff --git a/qb_site/qb_site/settings/base.py b/qb_site/qb_site/settings/base.py index 347b8b6a..a9509f3d 100644 --- a/qb_site/qb_site/settings/base.py +++ b/qb_site/qb_site/settings/base.py @@ -364,6 +364,8 @@ def env_optional_bounded_int(name: str, *, minimum: int, maximum: int) -> int | ANALYTICS_CONVERGENCE_PERIOD_SECONDS = int(os.getenv("ANALYTICS_CONVERGENCE_PERIOD_SECONDS", 900)) # Site analytics settings +# Fallback salt used until the first rotate_salt task runs and writes a DB salt. +# Required in production on first deploy; thereafter the DB salt takes precedence. SITE_ANALYTICS_HASH_SALT = os.getenv("SITE_ANALYTICS_HASH_SALT", "") SITE_ANALYTICS_ALLOWED_SITES: list[str] = [ s.strip() for s in os.getenv("SITE_ANALYTICS_ALLOWED_SITES", "").split(",") if s.strip() @@ -568,3 +570,8 @@ def env_optional_bounded_int(name: str, *, minimum: int, maximum: int) -> int | "schedule": SITE_ANALYTICS_PRUNE_PERIOD_SECONDS, "kwargs": {"retention_days": SITE_ANALYTICS_RETENTION_DAYS}, } +# Rotate the visitor-hash salt at midnight UTC on the 1st of each month. +CELERY_BEAT_SCHEDULE["site_analytics_rotate_salt"] = { + "task": "site_analytics.rotate_salt", + "schedule": crontab(minute=0, hour=0, day_of_month=1), +} diff --git a/qb_site/site_analytics/AGENTS.md b/qb_site/site_analytics/AGENTS.md index fcd1694a..5bc0639b 100644 --- a/qb_site/site_analytics/AGENTS.md +++ b/qb_site/site_analytics/AGENTS.md @@ -9,13 +9,14 @@ - `models/pageview.py` — `AnalyticsPageView` raw event rows (immutable after insert). 
- `models/daily_metric.py` — `AnalyticsDailyMetric` (added in A3). - `models/monthly_metric.py` — `AnalyticsMonthlyMetric` (added in A4). +- `models/salt.py` — `SiteAnalyticsSalt` single-row table holding the current month's hash salt. - `services/` — hashing, bot filtering, aggregation logic. -- `tasks/` — periodic Celery tasks for aggregation and pruning. +- `tasks/` — periodic Celery tasks for aggregation, pruning, and salt rotation. - `tests/` — unit and integration tests. - API ingestion view: `qb_site/api/views/analytics_collect.py` (added in A2). ## Key Settings (all env-overridable) -- `SITE_ANALYTICS_HASH_SALT` — required secret for visitor hash; empty string disables hashing safety checks in dev. +- `SITE_ANALYTICS_HASH_SALT` — fallback salt used until the first `rotate_salt` task runs and writes a DB salt. Required on first deploy; thereafter the `SiteAnalyticsSalt` DB row takes precedence. - `SITE_ANALYTICS_ALLOWED_SITES` — comma-separated site slugs; unknown slugs rejected with `400`. - `SITE_ANALYTICS_RETENTION_DAYS` — raw pageview retention window (default 540 days / ~18 months). - `SITE_ANALYTICS_DAILY_AGGREGATE_PERIOD_SECONDS` — beat period for daily aggregation task (default 3600). @@ -28,10 +29,12 @@ Celery task names (as registered via `@shared_task(name=…)`): - `site_analytics.aggregate_daily_metrics` — idempotent upsert of daily pageview/unique-visitor counts (added in A3). - `site_analytics.aggregate_monthly_metrics` — idempotent upsert of monthly metrics; recomputes current + previous month (added in A4). - `site_analytics.prune_old_pageviews` — deletes raw rows older than `SITE_ANALYTICS_RETENTION_DAYS` (added in A4). +- `site_analytics.rotate_salt` — generates a new random visitor-hash salt and discards the previous one; runs at midnight UTC on the 1st of each month. ## Privacy Invariants - Raw IP addresses are never stored. -- `visitor_month_hash = sha256(ip + normalized_user_agent + YYYY-MM + salt)`. 
+- `visitor_month_hash = sha256(ip | normalized_user_agent | salt)` where `salt` is the current month's randomly generated value from `SiteAnalyticsSalt`. +- The salt is replaced at month start and the old value deleted, so hashes from different months are unlinkable even with knowledge of the current salt (forward secrecy). - IP is extracted from `X-Forwarded-For` (Heroku / reverse-proxy header) falling back to `REMOTE_ADDR`. - Changing hashing semantics requires an explicit migration/versioning note in the design doc. diff --git a/qb_site/site_analytics/migrations/0004_siteanalyticssalt.py b/qb_site/site_analytics/migrations/0004_siteanalyticssalt.py new file mode 100644 index 00000000..db397b7a --- /dev/null +++ b/qb_site/site_analytics/migrations/0004_siteanalyticssalt.py @@ -0,0 +1,21 @@ +# Generated by Django 5.2.6 on 2026-03-27 03:42 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('site_analytics', '0003_analyticsmonthlymetric'), + ] + + operations = [ + migrations.CreateModel( + name='SiteAnalyticsSalt', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('salt', models.CharField(max_length=64)), + ('created_at', models.DateTimeField(auto_now_add=True)), + ], + ), + ] diff --git a/qb_site/site_analytics/models/__init__.py b/qb_site/site_analytics/models/__init__.py index 5bacc5b2..b2ee1c5b 100644 --- a/qb_site/site_analytics/models/__init__.py +++ b/qb_site/site_analytics/models/__init__.py @@ -3,3 +3,4 @@ from .daily_metric import AnalyticsDailyMetric # noqa: F401 from .monthly_metric import AnalyticsMonthlyMetric # noqa: F401 from .pageview import AnalyticsPageView # noqa: F401 +from .salt import SiteAnalyticsSalt # noqa: F401 diff --git a/qb_site/site_analytics/models/salt.py b/qb_site/site_analytics/models/salt.py new file mode 100644 index 00000000..49561fd9 --- /dev/null +++ b/qb_site/site_analytics/models/salt.py @@ -0,0 
+1,21 @@ +"""Monthly rotating salt for visitor hashing.""" + +from __future__ import annotations + +from django.db import models + + +class SiteAnalyticsSalt(models.Model): + """Holds the current month's salt used to compute visitor_month_hash. + + Only one row is live at a time. The ``rotate_salt`` Celery task creates a + new row at the start of each month and deletes the previous one. The old + salt is intentionally discarded so past hashes cannot be re-derived even if + the current salt is ever leaked. + """ + + salt = models.CharField(max_length=64) + created_at = models.DateTimeField(auto_now_add=True) + + class Meta: + app_label = "site_analytics" diff --git a/qb_site/site_analytics/services/hashing.py b/qb_site/site_analytics/services/hashing.py index 71305a90..84b5e354 100644 --- a/qb_site/site_analytics/services/hashing.py +++ b/qb_site/site_analytics/services/hashing.py @@ -3,10 +3,40 @@ from __future__ import annotations import hashlib +import time from django.conf import settings from django.http import HttpRequest +from site_analytics.models.salt import SiteAnalyticsSalt + +# Simple in-process cache so we don't hit the DB on every request. +# Each dyno/worker caches independently; a 60-second TTL means the new salt +# is picked up within a minute of rotation, which is acceptable. +_cached_salt: str = "" +_cache_expires: float = 0.0 + + +def _reset_salt_cache() -> None: + """Invalidate the in-process salt cache. Intended for use in tests only.""" + global _cached_salt, _cache_expires + _cached_salt = "" + _cache_expires = 0.0 + + +def _get_current_salt() -> str: + global _cached_salt, _cache_expires + now = time.monotonic() + if now < _cache_expires and _cached_salt: + return _cached_salt + try: + _cached_salt = SiteAnalyticsSalt.objects.latest("created_at").salt + except SiteAnalyticsSalt.DoesNotExist: + # Fall back to the static env-var salt until the first rotation task runs. 
+ _cached_salt = settings.SITE_ANALYTICS_HASH_SALT + _cache_expires = now + 60.0 + return _cached_salt + def get_client_ip(request: HttpRequest) -> str: """Return the client IP address. @@ -21,14 +51,16 @@ def get_client_ip(request: HttpRequest) -> str: return request.META.get("REMOTE_ADDR", "") -def compute_visitor_month_hash(ip: str, user_agent: str, month_key: str) -> str: - """Return sha256(ip | normalized_ua | month_key | salt) as a hex digest. +def compute_visitor_hash(ip: str, user_agent: str) -> str: + """Return sha256(ip | normalized_ua | salt) as a hex digest. Fields are joined with ``|`` to prevent cross-field collisions. - ``month_key`` must be a UTC ``YYYY-MM`` string so cross-month correlation - is impossible by construction. + Cross-month correlation is prevented by the monthly salt rotation: the salt + is replaced at the start of each month and the old value discarded, so + hashes from different months are unlinkable even with knowledge of the + current salt. """ - salt = settings.SITE_ANALYTICS_HASH_SALT + salt = _get_current_salt() normalized_ua = user_agent.strip().lower() - payload = f"{ip}|{normalized_ua}|{month_key}|{salt}" + payload = f"{ip}|{normalized_ua}|{salt}" return hashlib.sha256(payload.encode()).hexdigest() diff --git a/qb_site/site_analytics/tasks/__init__.py b/qb_site/site_analytics/tasks/__init__.py index f7ecbf1f..3966cc92 100644 --- a/qb_site/site_analytics/tasks/__init__.py +++ b/qb_site/site_analytics/tasks/__init__.py @@ -7,9 +7,11 @@ aggregate_monthly_metrics_task, prune_old_pageviews_task, ) +from site_analytics.tasks.rotate_salt import rotate_salt_task # noqa: F401 __all__ = [ "aggregate_daily_metrics_task", "aggregate_monthly_metrics_task", "prune_old_pageviews_task", + "rotate_salt_task", ] diff --git a/qb_site/site_analytics/tasks/rotate_salt.py b/qb_site/site_analytics/tasks/rotate_salt.py new file mode 100644 index 00000000..f5d06281 --- /dev/null +++ b/qb_site/site_analytics/tasks/rotate_salt.py @@ -0,0 +1,26 @@ 
+"""Celery task: rotate the monthly visitor-hash salt.""" + +from __future__ import annotations + +import secrets +from typing import Any + +from celery import shared_task +from django.db import transaction + +from site_analytics.models.salt import SiteAnalyticsSalt + + +@shared_task(name="site_analytics.rotate_salt") +def rotate_salt_task() -> dict[str, Any]: + """Generate a fresh random salt and discard the previous one. + + Runs at the start of each calendar month. The old salt is deleted so past + visitor hashes cannot be re-derived even if the new salt is ever leaked + (forward secrecy). + """ + new_salt = secrets.token_hex(32) + with transaction.atomic(): + obj = SiteAnalyticsSalt.objects.create(salt=new_salt) + deleted, _ = SiteAnalyticsSalt.objects.exclude(pk=obj.pk).delete() + return {"rotated": True, "old_deleted": deleted} diff --git a/qb_site/site_analytics/tests/test_collect_view.py b/qb_site/site_analytics/tests/test_collect_view.py index abfbbfc4..91a697cc 100644 --- a/qb_site/site_analytics/tests/test_collect_view.py +++ b/qb_site/site_analytics/tests/test_collect_view.py @@ -6,6 +6,7 @@ from rest_framework.test import APIClient from site_analytics.models import AnalyticsPageView +from site_analytics.services.hashing import _reset_salt_cache _ALLOWED = override_settings( SITE_ANALYTICS_ALLOWED_SITES=["test-site"], @@ -19,6 +20,7 @@ class AnalyticsCollectViewTests(TestCase): def setUp(self) -> None: self.client = APIClient() + _reset_salt_cache() def _post(self, data: dict, **kwargs) -> object: return self.client.post(URL, data, format="json", **kwargs) diff --git a/qb_site/site_analytics/tests/test_salt.py b/qb_site/site_analytics/tests/test_salt.py new file mode 100644 index 00000000..ac0747ea --- /dev/null +++ b/qb_site/site_analytics/tests/test_salt.py @@ -0,0 +1,48 @@ +"""Tests for the monthly salt rotation task.""" + +from __future__ import annotations + +from django.test import TestCase + +from site_analytics.models.salt import 
SiteAnalyticsSalt +from site_analytics.services.hashing import _reset_salt_cache +from site_analytics.tasks.rotate_salt import rotate_salt_task + + +class RotateSaltTaskTests(TestCase): + def setUp(self): + _reset_salt_cache() + + def test_creates_salt_row_when_none_exists(self): + self.assertEqual(SiteAnalyticsSalt.objects.count(), 0) + rotate_salt_task() + self.assertEqual(SiteAnalyticsSalt.objects.count(), 1) + + def test_created_salt_is_nonempty(self): + rotate_salt_task() + self.assertTrue(SiteAnalyticsSalt.objects.get().salt) + + def test_replaces_existing_salt(self): + SiteAnalyticsSalt.objects.create(salt="old-salt") + rotate_salt_task() + # Only one row should remain after rotation. + self.assertEqual(SiteAnalyticsSalt.objects.count(), 1) + self.assertNotEqual(SiteAnalyticsSalt.objects.get().salt, "old-salt") + + def test_new_salt_differs_from_previous(self): + SiteAnalyticsSalt.objects.create(salt="old-salt") + rotate_salt_task() + self.assertNotEqual(SiteAnalyticsSalt.objects.get().salt, "old-salt") + + def test_returns_summary_dict(self): + result = rotate_salt_task() + self.assertTrue(result["rotated"]) + self.assertIn("old_deleted", result) + + def test_old_deleted_count_is_accurate(self): + SiteAnalyticsSalt.objects.create(salt="first") + SiteAnalyticsSalt.objects.create(salt="second") + result = rotate_salt_task() + # Both pre-existing rows should have been deleted. 
+ self.assertEqual(result["old_deleted"], 2) + self.assertEqual(SiteAnalyticsSalt.objects.count(), 1) diff --git a/qb_site/site_analytics/tests/test_services.py b/qb_site/site_analytics/tests/test_services.py index 6d5fa953..d3f7e79b 100644 --- a/qb_site/site_analytics/tests/test_services.py +++ b/qb_site/site_analytics/tests/test_services.py @@ -6,8 +6,9 @@ from django.test import TestCase, override_settings +from site_analytics.models.salt import SiteAnalyticsSalt from site_analytics.services.bot_filter import is_bot -from site_analytics.services.hashing import compute_visitor_month_hash, get_client_ip +from site_analytics.services.hashing import _reset_salt_cache, compute_visitor_hash, get_client_ip class GetClientIpTests(TestCase): @@ -47,37 +48,58 @@ def test_missing_both_returns_empty_string(self): @override_settings(SITE_ANALYTICS_HASH_SALT="test-salt") -class ComputeVisitorMonthHashTests(TestCase): +class ComputeVisitorHashTests(TestCase): + def setUp(self): + # Ensure each test starts with a cold cache so DB state is respected. 
+ _reset_salt_cache() + def test_returns_64_char_hex(self): - result = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-03") + result = compute_visitor_hash("1.2.3.4", "Mozilla/5.0") self.assertEqual(len(result), 64) self.assertTrue(all(c in "0123456789abcdef" for c in result)) def test_deterministic(self): - a = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-03") - b = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-03") + a = compute_visitor_hash("1.2.3.4", "Mozilla/5.0") + b = compute_visitor_hash("1.2.3.4", "Mozilla/5.0") self.assertEqual(a, b) - def test_different_month_keys_produce_different_hashes(self): - a = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-03") - b = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-04") - self.assertNotEqual(a, b) - def test_different_ips_produce_different_hashes(self): - a = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-03") - b = compute_visitor_month_hash("1.2.3.5", "Mozilla/5.0", "2026-03") + a = compute_visitor_hash("1.2.3.4", "Mozilla/5.0") + b = compute_visitor_hash("1.2.3.5", "Mozilla/5.0") self.assertNotEqual(a, b) def test_ua_normalized_case_insensitive(self): - a = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-03") - b = compute_visitor_month_hash("1.2.3.4", "MOZILLA/5.0", "2026-03") + a = compute_visitor_hash("1.2.3.4", "Mozilla/5.0") + b = compute_visitor_hash("1.2.3.4", "MOZILLA/5.0") self.assertEqual(a, b) - @override_settings(SITE_ANALYTICS_HASH_SALT="other-salt") - def test_different_salts_produce_different_hashes(self): - a = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-03") - with self.settings(SITE_ANALYTICS_HASH_SALT="test-salt"): - b = compute_visitor_month_hash("1.2.3.4", "Mozilla/5.0", "2026-03") + def test_falls_back_to_settings_salt_when_no_db_row(self): + # No SiteAnalyticsSalt row — should use SITE_ANALYTICS_HASH_SALT. 
+ self.assertEqual(SiteAnalyticsSalt.objects.count(), 0) + a = compute_visitor_hash("1.2.3.4", "Mozilla/5.0") + self.assertEqual(len(a), 64) + + def test_db_salt_takes_precedence_over_settings_salt(self): + # Hash with settings salt only. + hash_settings = compute_visitor_hash("1.2.3.4", "Mozilla/5.0") + + # Insert a DB salt and reset cache so it's picked up. + SiteAnalyticsSalt.objects.create(salt="db-salt-value") + _reset_salt_cache() + + hash_db = compute_visitor_hash("1.2.3.4", "Mozilla/5.0") + self.assertNotEqual(hash_settings, hash_db) + + def test_different_db_salts_produce_different_hashes(self): + SiteAnalyticsSalt.objects.create(salt="salt-one") + _reset_salt_cache() + a = compute_visitor_hash("1.2.3.4", "Mozilla/5.0") + + SiteAnalyticsSalt.objects.all().delete() + SiteAnalyticsSalt.objects.create(salt="salt-two") + _reset_salt_cache() + b = compute_visitor_hash("1.2.3.4", "Mozilla/5.0") + self.assertNotEqual(a, b) diff --git a/scripts/backup_policy.py b/scripts/backup_policy.py index a3f60410..8386e3ea 100644 --- a/scripts/backup_policy.py +++ b/scripts/backup_policy.py @@ -8,6 +8,7 @@ "site_analytics_analyticspageview", "site_analytics_analyticsdailymetric", "site_analytics_analyticsmonthlymetric", + "site_analytics_siteanalyticssalt", "analyzer_analyzerconvergencesnapshot", "analyzer_areastatssnapshot", "analyzer_prdependency", @@ -93,6 +94,8 @@ "syncer_githubwebhookdelivery", # Raw analytics pageviews — contain visitor hashes; exclude from public backup "site_analytics_analyticspageview", + # Live salt is a secret; must never appear in sanitized backups + "site_analytics_siteanalyticssalt", ) # Tables retained in sanitized dump.