From 35cd62ccb76255ae6d7a445a1b27625350bb2070 Mon Sep 17 00:00:00 2001 From: Jesus Paz Date: Wed, 25 Mar 2026 22:21:15 -0500 Subject: [PATCH 1/2] fix: make daily digest reliable and always send all-clear --- .../postgres/00019_digest_sent_column.sql | 7 + .../sqlite/00019_digest_sent_column.sql | 6 + internal/db/store.go | 31 +- internal/db/store_settings.go | 68 +++- internal/db/store_settings_test.go | 229 +++++++++++++ internal/notifications/notifications.go | 121 ++++--- internal/notifications/notifications_test.go | 300 ++++++++++++++++++ internal/uptime/manager.go | 42 ++- 8 files changed, 720 insertions(+), 84 deletions(-) create mode 100644 internal/db/migrations/postgres/00019_digest_sent_column.sql create mode 100644 internal/db/migrations/sqlite/00019_digest_sent_column.sql diff --git a/internal/db/migrations/postgres/00019_digest_sent_column.sql b/internal/db/migrations/postgres/00019_digest_sent_column.sql new file mode 100644 index 0000000..7ca3103 --- /dev/null +++ b/internal/db/migrations/postgres/00019_digest_sent_column.sql @@ -0,0 +1,7 @@ +-- +goose Up +ALTER TABLE notification_digest_queue ADD COLUMN sent BOOLEAN NOT NULL DEFAULT FALSE; +ALTER TABLE notification_digest_queue ADD COLUMN sent_at TIMESTAMP; + +-- +goose Down +ALTER TABLE notification_digest_queue DROP COLUMN IF EXISTS sent_at; +ALTER TABLE notification_digest_queue DROP COLUMN IF EXISTS sent; diff --git a/internal/db/migrations/sqlite/00019_digest_sent_column.sql b/internal/db/migrations/sqlite/00019_digest_sent_column.sql new file mode 100644 index 0000000..7c84c72 --- /dev/null +++ b/internal/db/migrations/sqlite/00019_digest_sent_column.sql @@ -0,0 +1,6 @@ +-- +goose Up +ALTER TABLE notification_digest_queue ADD COLUMN sent INTEGER NOT NULL DEFAULT 0; +ALTER TABLE notification_digest_queue ADD COLUMN sent_at DATETIME; + +-- +goose Down +-- SQLite does not support DROP COLUMN in versions prior to 3.35; column additions are left in place on rollback. diff --git a/internal/db/store.go b/internal/db/store.go index 14965ca..e0d1063 100644 --- a/internal/db/store.go +++ b/internal/db/store.go @@ -198,20 +198,21 @@ func (s *Store) seed() error { // allowedResetTables is a whitelist of table names that can be dropped during reset. // SECURITY: This prevents potential SQL injection if table names were ever derived from user input. var allowedResetTables = map[string]bool{ - "user_status_pages": true, - "users": true, - "sessions": true, - "groups": true, - "monitors": true, - "monitor_checks": true, - "monitor_events": true, - "status_pages": true, - "api_keys": true, - "settings": true, - "monitor_outages": true, - "notification_channels": true, - "incidents": true, - "goose_db_version": true, + "user_status_pages": true, + "users": true, + "sessions": true, + "groups": true, + "monitors": true, + "monitor_checks": true, + "monitor_events": true, + "status_pages": true, + "api_keys": true, + "settings": true, + "monitor_outages": true, + "notification_channels": true, + "notification_digest_queue": true, + "incidents": true, + "goose_db_version": true, } // isValidTableName checks if a table name is in the allowed whitelist. @@ -232,7 +233,7 @@ func (s *Store) Reset() error { "user_status_pages", "users", "sessions", "groups", "monitors", "monitor_checks", "monitor_events", "status_pages", "api_keys", "settings", "monitor_outages", - "notification_channels", "incidents", + "notification_channels", "notification_digest_queue", "incidents", "goose_db_version", // Goose migration tracking table } diff --git a/internal/db/store_settings.go b/internal/db/store_settings.go index c5a6be9..7c3eab3 100644 --- a/internal/db/store_settings.go +++ b/internal/db/store_settings.go @@ -1,7 +1,9 @@ package db import ( + "fmt" "log" + "strings" "time" ) @@ -179,15 +181,10 @@ func (s *Store) InsertDigestEvent(monitorID, monitorName, monitorURL, eventType, return err } -// GetAndClearDigestEvents retrieves all queued digest events and deletes them atomically. -func (s *Store) GetAndClearDigestEvents() ([]DigestEvent, error) { - tx, err := s.db.Begin() - if err != nil { - return nil, err - } - defer func() { _ = tx.Rollback() }() - - rows, err := tx.Query("SELECT id, monitor_id, monitor_name, monitor_url, event_type, message, event_time FROM notification_digest_queue ORDER BY event_time ASC") +// GetUnsentDigestEvents retrieves all queued digest events that have not yet been sent. +// Events are NOT deleted; call MarkDigestEventsSent after a successful send. +func (s *Store) GetUnsentDigestEvents() ([]DigestEvent, error) { + rows, err := s.db.Query(s.rebind("SELECT id, monitor_id, monitor_name, monitor_url, event_type, message, event_time FROM notification_digest_queue WHERE sent = ? ORDER BY event_time ASC"), false) if err != nil { return nil, err } @@ -201,18 +198,59 @@ func (s *Store) GetAndClearDigestEvents() ([]DigestEvent, error) { } events = append(events, e) } + if err := rows.Err(); err != nil { + return nil, err + } + return events, nil +} - if len(events) > 0 { - if _, err = tx.Exec("DELETE FROM notification_digest_queue"); err != nil { - return nil, err +// MarkDigestEventsSent marks the given digest event IDs as sent so they are not +// re-dispatched on the next digest run. Old sent events are cleaned up by PruneDigestEvents. +func (s *Store) MarkDigestEventsSent(ids []int64) error { + if len(ids) == 0 { + return nil + } + + args := make([]interface{}, len(ids)) + for i, id := range ids { + args[i] = id + } + + var query string + if s.IsPostgres() { + placeholders := make([]string, len(ids)) + for i := range ids { + placeholders[i] = fmt.Sprintf("$%d", i+1) + } + query = fmt.Sprintf("UPDATE notification_digest_queue SET sent = TRUE, sent_at = NOW() WHERE id IN (%s)", + strings.Join(placeholders, ", ")) + } else { + placeholders := make([]string, len(ids)) + for i := range ids { + placeholders[i] = "?" } + query = fmt.Sprintf("UPDATE notification_digest_queue SET sent = 1, sent_at = datetime('now') WHERE id IN (%s)", + strings.Join(placeholders, ", ")) } - if err := tx.Commit(); err != nil { - return nil, err + _, err := s.db.Exec(query, args...) + return err +} + +// PruneDigestEvents deletes sent digest events older than the given number of days. +// This is called by the retention worker alongside PruneMonitorChecks. +func (s *Store) PruneDigestEvents(days int) error { + if days < 1 || days > 3650 { + return fmt.Errorf("invalid retention days: must be between 1 and 3650") } - return events, nil + var err error + if s.IsPostgres() { + _, err = s.db.Exec("DELETE FROM notification_digest_queue WHERE sent = TRUE AND sent_at < NOW() - MAKE_INTERVAL(days => $1)", days) + } else { + _, err = s.db.Exec("DELETE FROM notification_digest_queue WHERE sent = 1 AND sent_at < datetime('now', '-' || ? || ' days')", days) + } + return err } func (s *Store) GetDBSize() (int64, error) { diff --git a/internal/db/store_settings_test.go b/internal/db/store_settings_test.go index c2ef68a..281e5db 100644 --- a/internal/db/store_settings_test.go +++ b/internal/db/store_settings_test.go @@ -2,6 +2,7 @@ package db import ( "testing" + "time" ) func TestSettingsResult(t *testing.T) { @@ -45,3 +46,231 @@ func TestSystemStats(t *testing.T) { t.Logf("Total monitors: %d", stats.TotalMonitors) } } + +// --- Digest queue tests --- + +func insertTestDigestEvent(t *testing.T, s *Store, monitorID, eventType string, eventTime time.Time) { + t.Helper() + err := s.InsertDigestEvent(monitorID, "Monitor "+monitorID, "https://"+monitorID+".example.com", eventType, "test message", eventTime) + if err != nil { + t.Fatalf("InsertDigestEvent failed: %v", err) + } +} + +func TestDigestQueue_InsertAndGetUnsent(t *testing.T) { + RunTestWithBothDBs(t, "InsertAndGetUnsent", func(t *testing.T, s *Store) { + // Empty queue should return empty slice, not error. + events, err := s.GetUnsentDigestEvents() + if err != nil { + t.Fatalf("GetUnsentDigestEvents on empty queue: %v", err) + } + if len(events) != 0 { + t.Errorf("expected 0 events, got %d", len(events)) + } + + // Insert two events. + t0 := time.Date(2026, 3, 25, 8, 0, 0, 0, time.UTC) + insertTestDigestEvent(t, s, "m1", "down", t0) + insertTestDigestEvent(t, s, "m2", "degraded", t0.Add(time.Hour)) + + events, err = s.GetUnsentDigestEvents() + if err != nil { + t.Fatalf("GetUnsentDigestEvents: %v", err) + } + if len(events) != 2 { + t.Fatalf("expected 2 events, got %d", len(events)) + } + + // Events should be ordered by event_time ASC. + if events[0].MonitorID != "m1" { + t.Errorf("expected first event from m1, got %s", events[0].MonitorID) + } + if events[1].MonitorID != "m2" { + t.Errorf("expected second event from m2, got %s", events[1].MonitorID) + } + + // Verify fields are populated correctly. + if events[0].EventType != "down" { + t.Errorf("expected event_type 'down', got %s", events[0].EventType) + } + if events[0].MonitorName != "Monitor m1" { + t.Errorf("expected monitor name 'Monitor m1', got %s", events[0].MonitorName) + } + }) +} + +func TestDigestQueue_MarkSentRemovesFromUnsent(t *testing.T) { + RunTestWithBothDBs(t, "MarkSentRemovesFromUnsent", func(t *testing.T, s *Store) { + t0 := time.Date(2026, 3, 25, 8, 0, 0, 0, time.UTC) + insertTestDigestEvent(t, s, "m1", "down", t0) + insertTestDigestEvent(t, s, "m2", "degraded", t0.Add(time.Hour)) + + events, err := s.GetUnsentDigestEvents() + if err != nil { + t.Fatalf("GetUnsentDigestEvents: %v", err) + } + if len(events) != 2 { + t.Fatalf("expected 2 unsent events, got %d", len(events)) + } + + // Mark only the first event as sent. + if err := s.MarkDigestEventsSent([]int64{events[0].ID}); err != nil { + t.Fatalf("MarkDigestEventsSent: %v", err) + } + + // Only one event should remain unsent. + remaining, err := s.GetUnsentDigestEvents() + if err != nil { + t.Fatalf("GetUnsentDigestEvents after mark: %v", err) + } + if len(remaining) != 1 { + t.Fatalf("expected 1 unsent event after marking, got %d", len(remaining)) + } + if remaining[0].MonitorID != "m2" { + t.Errorf("expected remaining event from m2, got %s", remaining[0].MonitorID) + } + + // Mark the second event as sent. + if err := s.MarkDigestEventsSent([]int64{remaining[0].ID}); err != nil { + t.Fatalf("MarkDigestEventsSent second: %v", err) + } + + // Queue should now be empty. + final, err := s.GetUnsentDigestEvents() + if err != nil { + t.Fatalf("GetUnsentDigestEvents after all sent: %v", err) + } + if len(final) != 0 { + t.Errorf("expected 0 unsent events after all marked, got %d", len(final)) + } + }) +} + +func TestDigestQueue_MarkSentEmptySliceIsNoOp(t *testing.T) { + RunTestWithBothDBs(t, "MarkSentEmptySlice", func(t *testing.T, s *Store) { + // Should not error on empty ID slice. + if err := s.MarkDigestEventsSent([]int64{}); err != nil { + t.Errorf("MarkDigestEventsSent(empty) should be a no-op, got: %v", err) + } + }) +} + +func TestDigestQueue_MarkSentAllAtOnce(t *testing.T) { + RunTestWithBothDBs(t, "MarkSentAllAtOnce", func(t *testing.T, s *Store) { + t0 := time.Date(2026, 3, 25, 8, 0, 0, 0, time.UTC) + for i := 0; i < 5; i++ { + insertTestDigestEvent(t, s, "m1", "down", t0.Add(time.Duration(i)*time.Minute)) + } + + events, err := s.GetUnsentDigestEvents() + if err != nil { + t.Fatalf("GetUnsentDigestEvents: %v", err) + } + if len(events) != 5 { + t.Fatalf("expected 5 events, got %d", len(events)) + } + + ids := make([]int64, len(events)) + for i, e := range events { + ids[i] = e.ID + } + + if err := s.MarkDigestEventsSent(ids); err != nil { + t.Fatalf("MarkDigestEventsSent: %v", err) + } + + remaining, err := s.GetUnsentDigestEvents() + if err != nil { + t.Fatalf("GetUnsentDigestEvents after bulk mark: %v", err) + } + if len(remaining) != 0 { + t.Errorf("expected 0 remaining unsent, got %d", len(remaining)) + } + }) +} + +func TestDigestQueue_PruneDeletesOldSentEvents(t *testing.T) { + RunTestWithBothDBs(t, "PruneDeletesOldSent", func(t *testing.T, s *Store) { + t0 := time.Date(2026, 3, 25, 8, 0, 0, 0, time.UTC) + insertTestDigestEvent(t, s, "m1", "down", t0) + + events, err := s.GetUnsentDigestEvents() + if err != nil { + t.Fatalf("GetUnsentDigestEvents: %v", err) + } + if len(events) != 1 { + t.Fatalf("expected 1 event, got %d", len(events)) + } + + // Mark it as sent (sent_at = now in DB). + if err := s.MarkDigestEventsSent([]int64{events[0].ID}); err != nil { + t.Fatalf("MarkDigestEventsSent: %v", err) + } + + // Pruning with 365 days should NOT delete it (just marked sent moments ago). + if err := s.PruneDigestEvents(365); err != nil { + t.Fatalf("PruneDigestEvents(365): %v", err) + } + + // Manually set sent_at far in the past to simulate an old record. + if s.IsPostgres() { + _, _ = s.db.Exec("UPDATE notification_digest_queue SET sent_at = NOW() - INTERVAL '400 days'") + } else { + _, _ = s.db.Exec("UPDATE notification_digest_queue SET sent_at = datetime('now', '-400 days')") + } + + // Pruning with 365 days should now delete the old record. + if err := s.PruneDigestEvents(365); err != nil { + t.Fatalf("PruneDigestEvents(365) after backdating: %v", err) + } + + // Verify the record is gone. + var count int + _ = s.db.QueryRow("SELECT COUNT(*) FROM notification_digest_queue").Scan(&count) + if count != 0 { + t.Errorf("expected 0 rows after pruning old sent event, got %d", count) + } + }) +} + +func TestDigestQueue_PruneDoesNotDeleteUnsentEvents(t *testing.T) { + RunTestWithBothDBs(t, "PruneDoesNotDeleteUnsent", func(t *testing.T, s *Store) { + t0 := time.Date(2020, 1, 1, 8, 0, 0, 0, time.UTC) // very old event time + insertTestDigestEvent(t, s, "m1", "down", t0) + + // Pruning should NOT delete unsent events regardless of event_time age, + // because PruneDigestEvents only removes rows where sent = true. + if err := s.PruneDigestEvents(1); err != nil { + t.Fatalf("PruneDigestEvents: %v", err) + } + + events, err := s.GetUnsentDigestEvents() + if err != nil { + t.Fatalf("GetUnsentDigestEvents: %v", err) + } + if len(events) != 1 { + t.Errorf("expected unsent event to survive pruning, got %d events", len(events)) + } + }) +} + +func TestDigestQueue_PruneInvalidDays(t *testing.T) { + s := newTestStore(t) + + if err := s.PruneDigestEvents(0); err == nil { + t.Error("PruneDigestEvents(0) should return error") + } + if err := s.PruneDigestEvents(-1); err == nil { + t.Error("PruneDigestEvents(-1) should return error") + } + if err := s.PruneDigestEvents(3651); err == nil { + t.Error("PruneDigestEvents(3651) should return error") + } + // Boundary values should succeed. + if err := s.PruneDigestEvents(1); err != nil { + t.Errorf("PruneDigestEvents(1) should succeed, got: %v", err) + } + if err := s.PruneDigestEvents(3650); err != nil { + t.Errorf("PruneDigestEvents(3650) should succeed, got: %v", err) + } +} diff --git a/internal/notifications/notifications.go b/internal/notifications/notifications.go index bafc18e..a79fafc 100644 --- a/internal/notifications/notifications.go +++ b/internal/notifications/notifications.go @@ -319,18 +319,16 @@ func eventLabel(eventType string) string { } // SendDigest dispatches a daily digest summary to all enabled notification channels. +// When events is empty an "all systems operational" message is sent so that operators +// receive a daily confirmation even on incident-free days. func (s *Service) SendDigest(events []db.DigestEvent) { - if len(events) == 0 { - return - } - channels, err := s.store.GetNotificationChannels() if err != nil { log.Printf("Digest: failed to fetch channels: %v", err) return } - // Group events by monitor + // Group events by monitor (only when there are events to summarise). type monitorData struct { name string url string @@ -353,7 +351,7 @@ func (s *Service) SendDigest(events []db.DigestEvent) { } } - // Build digestMonitor list + // Build digestMonitor list. var monitors []digestMonitor for _, mid := range monitorOrder { md := byMonitor[mid] @@ -383,7 +381,7 @@ func (s *Service) SendDigest(events []db.DigestEvent) { }) } - // Sort monitors by severity (most critical first) + // Sort monitors by severity (most critical first). sort.SliceStable(monitors, func(i, j int) bool { return monitors[i].Severity < monitors[j].Severity }) @@ -422,8 +420,17 @@ func (n *SlackNotifier) sendDigest(summary digestSummary) error { } dateStr := summary.Date.Format("January 2, 2006") - subtitle := fmt.Sprintf(":clock3: %s · %d events across %d monitors", - dateStr, summary.TotalEvents, summary.MonitorCount) + + var subtitle, fallbackText string + if summary.TotalEvents == 0 { + subtitle = fmt.Sprintf(":white_check_mark: %s · All systems operational", dateStr) + fallbackText = ":bar_chart: Daily Monitoring Summary — All systems operational" + } else { + subtitle = fmt.Sprintf(":clock3: %s · %d events across %d monitors", + dateStr, summary.TotalEvents, summary.MonitorCount) + fallbackText = fmt.Sprintf(":bar_chart: Daily Monitoring Summary — %d events across %d monitors", + summary.TotalEvents, summary.MonitorCount) + } blocks := []map[string]interface{}{ { @@ -448,37 +455,44 @@ func (n *SlackNotifier) sendDigest(summary digestSummary) error { }, } - for _, m := range summary.Monitors { - var eventParts []string - for _, ec := range m.Events { - if ec.Type == "ssl_expiring" && m.SSLMessage != "" { - eventParts = append(eventParts, fmt.Sprintf("%s %s", - eventEmoji(ec.Type), m.SSLMessage)) - } else { - eventParts = append(eventParts, fmt.Sprintf("%s %s `%dx`", - eventEmoji(ec.Type), eventLabel(ec.Type), ec.Count)) - } - } - - monitorEmoji := ":white_check_mark:" - if len(m.Events) > 0 { - monitorEmoji = eventEmoji(m.Events[0].Type) - } - - text := fmt.Sprintf("%s *%s*\n%s", - monitorEmoji, m.Name, strings.Join(eventParts, " · ")) - + if summary.TotalEvents == 0 { blocks = append(blocks, map[string]interface{}{ "type": "section", "text": map[string]interface{}{ "type": "mrkdwn", - "text": text, + "text": ":white_check_mark: All systems operational — no incidents today.", }, }) - } + } else { + for _, m := range summary.Monitors { + var eventParts []string + for _, ec := range m.Events { + if ec.Type == "ssl_expiring" && m.SSLMessage != "" { + eventParts = append(eventParts, fmt.Sprintf("%s %s", + eventEmoji(ec.Type), m.SSLMessage)) + } else { + eventParts = append(eventParts, fmt.Sprintf("%s %s `%dx`", + eventEmoji(ec.Type), eventLabel(ec.Type), ec.Count)) + } + } + + monitorEmoji := ":white_check_mark:" + if len(m.Events) > 0 { + monitorEmoji = eventEmoji(m.Events[0].Type) + } - fallbackText := fmt.Sprintf(":bar_chart: Daily Monitoring Summary — %d events across %d monitors", - summary.TotalEvents, summary.MonitorCount) + text := fmt.Sprintf("%s *%s*\n%s", + monitorEmoji, m.Name, strings.Join(eventParts, " · ")) + + blocks = append(blocks, map[string]interface{}{ + "type": "section", + "text": map[string]interface{}{ + "type": "mrkdwn", + "text": text, + }, + }) + } + } payload := map[string]interface{}{ "text": fallbackText, @@ -494,7 +508,9 @@ func (n *WebhookNotifier) sendDigest(summary digestSummary, events []db.DigestEv return fmt.Errorf("webhookUrl missing or invalid") } - var monitorSummaries []map[string]interface{} + // Initialise as empty slice (not nil) so the JSON field is always an array, + // never null — even when there are no events (all-clear digest). + monitorSummaries := make([]map[string]interface{}, 0, len(summary.Monitors)) for _, m := range summary.Monitors { eventCounts := make(map[string]int) for _, ec := range m.Events { @@ -507,24 +523,37 @@ func (n *WebhookNotifier) sendDigest(summary digestSummary, events []db.DigestEv }) } - // Build plain-text summary for backwards compatibility - var lines []string - for _, m := range summary.Monitors { - var parts []string - for _, ec := range m.Events { - if ec.Type == "ssl_expiring" && m.SSLMessage != "" { - parts = append(parts, m.SSLMessage) - } else { - parts = append(parts, fmt.Sprintf("%s (%dx)", ec.Type, ec.Count)) + // Build plain-text summary line. + var summaryText string + if summary.TotalEvents == 0 { + summaryText = "All systems operational — no incidents today." + } else { + var lines []string + for _, m := range summary.Monitors { + var parts []string + for _, ec := range m.Events { + if ec.Type == "ssl_expiring" && m.SSLMessage != "" { + parts = append(parts, m.SSLMessage) + } else { + parts = append(parts, fmt.Sprintf("%s (%dx)", ec.Type, ec.Count)) + } } + lines = append(lines, fmt.Sprintf("- %s: %s", m.Name, strings.Join(parts, ", "))) } - lines = append(lines, fmt.Sprintf("- %s: %s", m.Name, strings.Join(parts, ", "))) + summaryText = strings.Join(lines, "\n") + } + + var title string + if summary.TotalEvents == 0 { + title = "Daily Monitoring Summary (all systems operational)" + } else { + title = fmt.Sprintf("Daily Monitoring Summary (%d events)", summary.TotalEvents) } payload := map[string]interface{}{ "type": "digest", - "title": fmt.Sprintf("Daily Monitoring Summary (%d events)", summary.TotalEvents), - "summary": strings.Join(lines, "\n"), + "title": title, + "summary": summaryText, "eventCount": summary.TotalEvents, "monitorCount": summary.MonitorCount, "monitors": monitorSummaries, diff --git a/internal/notifications/notifications_test.go b/internal/notifications/notifications_test.go index ccd8a40..b719aff 100644 --- a/internal/notifications/notifications_test.go +++ b/internal/notifications/notifications_test.go @@ -795,3 +795,303 @@ func TestSendDigest_SSLMessageFromEvents(t *testing.T) { t.Errorf("expected SSL message from event in digest, got: %s", text) } } + +// --- All-clear (zero-event) digest tests --- + +// newTestChannel creates a notification channel backed by an httptest server. +// The caller is responsible for closing srv. +func newTestChannelAndServer(t *testing.T, store *db.Store, id, chType string) (*httptest.Server, *map[string]interface{}) { + t.Helper() + received := &map[string]interface{}{} + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + body, _ := io.ReadAll(r.Body) + _ = json.Unmarshal(body, received) + w.WriteHeader(http.StatusOK) + })) + ch := db.NotificationChannel{ + ID: id, + Type: chType, + Name: "Test " + chType, + Config: `{"webhookUrl":"` + srv.URL + `"}`, + Enabled: true, + CreatedAt: time.Now(), + } + if err := store.CreateNotificationChannel(ch); err != nil { + t.Fatalf("Failed to create channel: %v", err) + } + return srv, received +} + +func TestSendDigest_AllClear_Slack(t *testing.T) { + store := newTestStore(t) + svc := NewService(store) + + srv, received := newTestChannelAndServer(t, store, "nc-slack-allclear", "slack") + defer srv.Close() + + // Call SendDigest with no events — should still send an all-clear message. + svc.SendDigest(nil) + + if *received == nil { + t.Fatal("expected Slack to receive a request, but got nothing") + } + + // Must have fallback text field. + fallback, ok := (*received)["text"].(string) + if !ok || fallback == "" { + t.Error("expected non-empty fallback 'text' field") + } + if !strings.Contains(fallback, "All systems operational") { + t.Errorf("fallback text should mention all systems operational, got: %s", fallback) + } + + // Must have blocks. + blocks, ok := (*received)["blocks"].([]interface{}) + if !ok || len(blocks) == 0 { + t.Fatal("expected non-empty 'blocks' array in Slack payload") + } + + // Expect: header + context + divider + all-clear section = 4 blocks. + if len(blocks) != 4 { + t.Errorf("expected 4 blocks for all-clear digest, got %d", len(blocks)) + } + + expectedTypes := []string{"header", "context", "divider", "section"} + for i, expectedType := range expectedTypes { + block, ok := blocks[i].(map[string]interface{}) + if !ok { + t.Fatalf("block[%d] is not a map", i) + } + if block["type"] != expectedType { + t.Errorf("block[%d]: expected type %q, got %v", i, expectedType, block["type"]) + } + } + + // The context block should say "All systems operational". + ctx := blocks[1].(map[string]interface{}) + elements := ctx["elements"].([]interface{}) + ctxText := elements[0].(map[string]interface{})["text"].(string) + if !strings.Contains(ctxText, "All systems operational") { + t.Errorf("context block should mention all systems operational, got: %s", ctxText) + } + + // The section block should have the all-clear body. + section := blocks[3].(map[string]interface{}) + sectionText := section["text"].(map[string]interface{})["text"].(string) + if !strings.Contains(sectionText, "All systems operational") { + t.Errorf("section block should contain all-clear message, got: %s", sectionText) + } +} + +func TestSendDigest_AllClear_Webhook(t *testing.T) { + store := newTestStore(t) + svc := NewService(store) + + srv, received := newTestChannelAndServer(t, store, "nc-webhook-allclear", "webhook") + defer srv.Close() + + svc.SendDigest(nil) + + if *received == nil { + t.Fatal("expected webhook to receive a request, but got nothing") + } + + // type field. + if (*received)["type"] != "digest" { + t.Errorf("expected type 'digest', got %v", (*received)["type"]) + } + + // title should mention all systems operational. + title, ok := (*received)["title"].(string) + if !ok { + t.Fatal("expected title string in payload") + } + if !strings.Contains(title, "all systems operational") { + t.Errorf("title should mention all systems operational, got: %s", title) + } + + // summary text. + summary, ok := (*received)["summary"].(string) + if !ok { + t.Fatal("expected summary string in payload") + } + if !strings.Contains(summary, "All systems operational") { + t.Errorf("summary should be all-clear message, got: %s", summary) + } + + // eventCount should be 0. + if count := (*received)["eventCount"].(float64); count != 0 { + t.Errorf("expected eventCount 0, got %v", count) + } + + // monitorCount should be 0. + if count := (*received)["monitorCount"].(float64); count != 0 { + t.Errorf("expected monitorCount 0, got %v", count) + } + + // monitors must be a JSON array (not null) even when empty. + monitors, ok := (*received)["monitors"].([]interface{}) + if !ok { + t.Errorf("expected 'monitors' to be a JSON array, got %T (%v)", (*received)["monitors"], (*received)["monitors"]) + } + if len(monitors) != 0 { + t.Errorf("expected empty monitors array, got %d entries", len(monitors)) + } + + // timestamp must be RFC3339. + ts, ok := (*received)["timestamp"].(string) + if !ok { + t.Fatal("expected timestamp string in payload") + } + if _, err := time.Parse(time.RFC3339, ts); err != nil { + t.Errorf("timestamp is not RFC3339: %s", ts) + } +} + +func TestSendDigest_AllClear_EmptySliceEquivalentToNil(t *testing.T) { + // Both nil and empty-slice events should produce the same all-clear payload. + store := newTestStore(t) + svc := NewService(store) + + var nilReceived, emptyReceived map[string]interface{} + + srv1 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + body, _ := io.ReadAll(r.Body) + _ = json.Unmarshal(body, &nilReceived) + w.WriteHeader(http.StatusOK) + })) + defer srv1.Close() + + if err := store.CreateNotificationChannel(db.NotificationChannel{ + ID: "nc-nil", Type: "slack", Name: "nil", Enabled: true, + Config: `{"webhookUrl":"` + srv1.URL + `"}`, CreatedAt: time.Now(), + }); err != nil { + t.Fatalf("create channel: %v", err) + } + svc.SendDigest(nil) + + // Swap to a fresh channel + server for the empty-slice call. + srv2 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + body, _ := io.ReadAll(r.Body) + _ = json.Unmarshal(body, &emptyReceived) + w.WriteHeader(http.StatusOK) + })) + defer srv2.Close() + + // Disable first channel, add second. + _ = store.UpdateNotificationChannel("nc-nil", "nil", "slack", `{"webhookUrl":"`+srv1.URL+`"}`, false) + if err := store.CreateNotificationChannel(db.NotificationChannel{ + ID: "nc-empty", Type: "slack", Name: "empty", Enabled: true, + Config: `{"webhookUrl":"` + srv2.URL + `"}`, CreatedAt: time.Now(), + }); err != nil { + t.Fatalf("create channel: %v", err) + } + svc.SendDigest([]db.DigestEvent{}) + + // Both payloads should have the same fallback text. + nilText, _ := nilReceived["text"].(string) + emptyText, _ := emptyReceived["text"].(string) + if nilText != emptyText { + t.Errorf("nil and empty-slice digest produced different fallback text:\n nil: %s\n empty: %s", nilText, emptyText) + } +} + +func TestSendDigest_NoChannels_DoesNotPanic(t *testing.T) { + // With no channels configured, SendDigest should be a clean no-op. + store := newTestStore(t) + svc := NewService(store) + + // Should not panic regardless of event count. + svc.SendDigest(nil) + svc.SendDigest([]db.DigestEvent{}) + svc.SendDigest([]db.DigestEvent{ + {MonitorID: "m1", MonitorName: "API", EventType: "down", EventTime: time.Now()}, + }) +} + +func TestSendDigest_DisabledChannelNotCalled(t *testing.T) { + store := newTestStore(t) + svc := NewService(store) + + called := false + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + called = true + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + // Create channel but mark it disabled. + if err := store.CreateNotificationChannel(db.NotificationChannel{ + ID: "nc-disabled", Type: "slack", Name: "disabled", Enabled: false, + Config: `{"webhookUrl":"` + srv.URL + `"}`, CreatedAt: time.Now(), + }); err != nil { + t.Fatalf("create channel: %v", err) + } + + svc.SendDigest(nil) // all-clear + + if called { + t.Error("disabled channel should not receive any request") + } +} + +func TestSlackDigest_AllClear_FallbackTextFormat(t *testing.T) { + // Directly test the Slack notifier's sendDigest with a zero-event summary. + var received map[string]interface{} + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + body, _ := io.ReadAll(r.Body) + _ = json.Unmarshal(body, &received) + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + notifier := NewSlackNotifier(`{"webhookUrl":"` + srv.URL + `"}`) + summary := digestSummary{ + TotalEvents: 0, + MonitorCount: 0, + Date: time.Date(2026, 3, 25, 9, 0, 0, 0, time.UTC), + } + + if err := notifier.sendDigest(summary); err != nil { + t.Fatalf("sendDigest failed: %v", err) + } + + text := received["text"].(string) + if strings.Contains(text, "0 events") { + t.Errorf("all-clear fallback text should not say '0 events', got: %s", text) + } + if !strings.Contains(text, "All systems operational") { + t.Errorf("all-clear fallback text should mention all systems operational, got: %s", text) + } +} + +func TestWebhookDigest_AllClear_MonitorsIsArray(t *testing.T) { + // Confirm monitors field is [] not null in zero-event webhook payload. + var received map[string]interface{} + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + body, _ := io.ReadAll(r.Body) + _ = json.Unmarshal(body, &received) + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + notifier := NewWebhookNotifier(`{"webhookUrl":"` + srv.URL + `"}`) + summary := digestSummary{ + TotalEvents: 0, + MonitorCount: 0, + Date: time.Now(), + } + + if err := notifier.sendDigest(summary, nil); err != nil { + t.Fatalf("sendDigest failed: %v", err) + } + + // monitors must parse as an array ([] not null) in the JSON. + monitors, ok := received["monitors"].([]interface{}) + if !ok { + t.Fatalf("expected monitors to be JSON array, got %T: %v", received["monitors"], received["monitors"]) + } + if len(monitors) != 0 { + t.Errorf("expected empty monitors array, got %d entries", len(monitors)) + } +} diff --git a/internal/uptime/manager.go b/internal/uptime/manager.go index 45bcb6b..3c7f6dc 100644 --- a/internal/uptime/manager.go +++ b/internal/uptime/manager.go @@ -766,6 +766,8 @@ func (m *Manager) Sync() { if user, err := m.store.GetUser(1); err == nil && user.Timezone != "" { if loc, err := time.LoadLocation(user.Timezone); err == nil { notifTZ = loc + } else { + log.Printf("Digest: invalid timezone %q for user 1, falling back to UTC: %v", user.Timezone, err) } } @@ -1139,18 +1141,37 @@ func (m *Manager) digestWorker() { currentTime := now.Format("15:04") currentDate := now.Format("2006-01-02") - if currentTime == digestTime && lastSentDate != currentDate { - lastSentDate = currentDate - events, err := m.store.GetAndClearDigestEvents() + // Bug 3 fix: use >= instead of == so that if the ticker fires a minute + // late (GC pause, system load) the digest is still sent rather than skipped. + // The lastSentDate guard ensures at most one send per calendar day. + if currentTime >= digestTime && lastSentDate != currentDate { + // Bug 1b fix: fetch events BEFORE updating lastSentDate so that a + // transient DB error causes a retry on the next tick rather than + // permanently skipping today. + events, err := m.store.GetUnsentDigestEvents() if err != nil { log.Printf("Digest: failed to get events: %v", err) - continue - } - if len(events) == 0 { - continue + continue // don't mark the day as sent; retry next tick } + + // Bug 1a fix: always send the digest, even when there are no events. + // SendDigest handles the empty-event case with an "all clear" message. m.notifier.SendDigest(events) - log.Printf("Digest: sent summary with %d events", len(events)) + + // Mark events as sent in the DB so they are not re-dispatched. + if len(events) > 0 { + ids := make([]int64, len(events)) + for i, e := range events { + ids[i] = e.ID + } + if err := m.store.MarkDigestEventsSent(ids); err != nil { + log.Printf("Digest: failed to mark events as sent: %v", err) + } + } + + // Update in-memory guard only after a successful dispatch attempt. + lastSentDate = currentDate + log.Printf("Digest: sent daily summary with %d events for %s", len(events), currentDate) } } } @@ -1170,6 +1191,11 @@ func (m *Manager) retentionWorker() { if err := m.store.PruneMonitorChecks(days); err != nil { log.Printf("Retention error: %v", err) } + // Bug 5 fix: prune sent digest events using the same retention window so + // the notification_digest_queue table doesn't grow unbounded. + if err := m.store.PruneDigestEvents(days); err != nil { + log.Printf("Retention: failed to prune digest events: %v", err) + } } // Run immediately From 75cd2b90bec9aa9a7416153a811a1c133596198e Mon Sep 17 00:00:00 2001 From: Jesus Paz Date: Wed, 25 Mar 2026 22:35:39 -0500 Subject: [PATCH 2/2] refactor: clean up digest worker comments --- internal/notifications/notifications.go | 5 +---- internal/uptime/manager.go | 23 ++++++++++------------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/internal/notifications/notifications.go b/internal/notifications/notifications.go index a79fafc..f2a4588 100644 --- a/internal/notifications/notifications.go +++ b/internal/notifications/notifications.go @@ -328,7 +328,6 @@ func (s *Service) SendDigest(events []db.DigestEvent) { return } - // Group events by monitor (only when there are events to summarise). type monitorData struct { name string url string @@ -508,8 +507,7 @@ func (n *WebhookNotifier) sendDigest(summary digestSummary, events []db.DigestEv return fmt.Errorf("webhookUrl missing or invalid") } - // Initialise as empty slice (not nil) so the JSON field is always an array, - // never null — even when there are no events (all-clear digest). + // Allocate so the field serialises as a JSON array rather than null. monitorSummaries := make([]map[string]interface{}, 0, len(summary.Monitors)) for _, m := range summary.Monitors { eventCounts := make(map[string]int) @@ -523,7 +521,6 @@ func (n *WebhookNotifier) sendDigest(summary digestSummary, events []db.DigestEv }) } - // Build plain-text summary line. var summaryText string if summary.TotalEvents == 0 { summaryText = "All systems operational — no incidents today." diff --git a/internal/uptime/manager.go b/internal/uptime/manager.go index 3c7f6dc..c2aad7d 100644 --- a/internal/uptime/manager.go +++ b/internal/uptime/manager.go @@ -1141,24 +1141,24 @@ func (m *Manager) digestWorker() { currentTime := now.Format("15:04") currentDate := now.Format("2006-01-02") - // Bug 3 fix: use >= instead of == so that if the ticker fires a minute - // late (GC pause, system load) the digest is still sent rather than skipped. - // The lastSentDate guard ensures at most one send per calendar day. + // >= rather than == ensures a tick that lands slightly past the target + // minute still triggers the digest rather than waiting a full day. + // lastSentDate guarantees exactly one send per calendar day. if currentTime >= digestTime && lastSentDate != currentDate { - // Bug 1b fix: fetch events BEFORE updating lastSentDate so that a - // transient DB error causes a retry on the next tick rather than - // permanently skipping today. + // Fetch events before recording the send so that a transient store + // error causes a retry on the next tick instead of silently skipping + // the day. events, err := m.store.GetUnsentDigestEvents() if err != nil { log.Printf("Digest: failed to get events: %v", err) - continue // don't mark the day as sent; retry next tick + continue } - // Bug 1a fix: always send the digest, even when there are no events. - // SendDigest handles the empty-event case with an "all clear" message. + // Always call SendDigest: it delivers an all-clear message when the + // event list is empty so operators receive a daily confirmation even + // on incident-free days. m.notifier.SendDigest(events) - // Mark events as sent in the DB so they are not re-dispatched. if len(events) > 0 { ids := make([]int64, len(events)) for i, e := range events { @@ -1169,7 +1169,6 @@ func (m *Manager) digestWorker() { } } - // Update in-memory guard only after a successful dispatch attempt. lastSentDate = currentDate log.Printf("Digest: sent daily summary with %d events for %s", len(events), currentDate) } @@ -1191,8 +1190,6 @@ func (m *Manager) retentionWorker() { if err := m.store.PruneMonitorChecks(days); err != nil { log.Printf("Retention error: %v", err) } - // Bug 5 fix: prune sent digest events using the same retention window so - // the notification_digest_queue table doesn't grow unbounded. if err := m.store.PruneDigestEvents(days); err != nil { log.Printf("Retention: failed to prune digest events: %v", err) }