From c62ec32d15642afde66e63a3e6afaa936d8e5aa6 Mon Sep 17 00:00:00 2001 From: Bob Date: Tue, 14 Apr 2026 17:26:08 +0000 Subject: [PATCH 1/4] feat(categorization): extract findCommonPhrases util with bigram support and tests Moves the common-phrase detection logic from CategoryBuilder.vue into a standalone src/util/categorization.ts module and adds 10 unit tests. Changes vs original PR #455: - Extracts function to util module for testability - Uses Map (consistent with existing CategoryBuilder code) - Filters bigram components against ignored_words and length <= 2 - Removes debug console.log statements - Full TypeScript types Closes #455 --- src/util/categorization.ts | 85 +++++++++++++++++++ src/views/settings/CategoryBuilder.vue | 22 +---- test/unit/categorization.test.node.ts | 108 +++++++++++++++++++++++++ 3 files changed, 195 insertions(+), 20 deletions(-) create mode 100644 src/util/categorization.ts create mode 100644 test/unit/categorization.test.node.ts diff --git a/src/util/categorization.ts b/src/util/categorization.ts new file mode 100644 index 00000000..13c4fec5 --- /dev/null +++ b/src/util/categorization.ts @@ -0,0 +1,85 @@ +import { IEvent } from '~/util/interfaces'; + +// Regex used to split event titles into words +const SPLIT_REGEX = /[\s\-,:()[\]/]/; + +export interface WordEntry { + word: string; + duration: number; + events: IEvent[]; +} + +/** + * Finds common words and bigrams (two-word phrases) in event titles, + * weighted by time duration rather than event count. + * + * For each bigram that accounts for >50% of the total duration of both + * constituent words, the bigram is promoted and the constituent words' + * durations are reduced accordingly. This means "Mozilla Firefox" appears + * instead of separate "Mozilla" and "Firefox" entries when they almost + * always co-occur. + * + * Words with length <= 2 or in `ignored_words` are skipped. + */ +export function findCommonPhrases( + events: IEvent[], + ignored_words: string[] +): Map { + const words = new Map(); + const bigrams = new Map(); + + // Step 1: Build word duration dictionary + for (const event of events) { + for (const word of event.data.title.split(SPLIT_REGEX)) { + if (word.length <= 2 || ignored_words.includes(word)) { + continue; + } + const entry = words.get(word); + if (entry) { + entry.duration += event.duration; + entry.events.push(event); + } else { + words.set(word, { word, duration: event.duration, events: [event] }); + } + } + } + + // Step 2: Build bigram duration dictionary (skip bigrams with filtered words) + for (const event of events) { + const parts = event.data.title.split(SPLIT_REGEX); + for (let i = 0; i < parts.length - 1; i++) { + const w1 = parts[i]; + const w2 = parts[i + 1]; + if (w1.length <= 2 || ignored_words.includes(w1)) continue; + if (w2.length <= 2 || ignored_words.includes(w2)) continue; + const bigram = `${w1} ${w2}`; + const entry = bigrams.get(bigram); + if (entry) { + entry.duration += event.duration; + entry.events.push(event); + } else { + bigrams.set(bigram, { bigram, duration: event.duration, events: [event] }); + } + } + } + + // Step 3: Promote bigrams that dominate both constituent words (>50% threshold) + for (const [bigram, bigramEntry] of bigrams) { + const spaceIdx = bigram.indexOf(' '); + const word1 = bigram.slice(0, spaceIdx); + const word2 = bigram.slice(spaceIdx + 1); + const w1Entry = words.get(word1); + const w2Entry = words.get(word2); + if (!w1Entry || !w2Entry) continue; + + const bigram_duration = bigramEntry.duration; + if (bigram_duration / w1Entry.duration > 0.5 && bigram_duration / w2Entry.duration > 0.5) { + // Promote bigram, reduce constituent word durations + words.set(bigram, { word: bigram, duration: bigram_duration, events: bigramEntry.events }); + w1Entry.duration -= bigram_duration; + w2Entry.duration -= bigram_duration; + } + } + + return words; +} diff --git a/src/views/settings/CategoryBuilder.vue b/src/views/settings/CategoryBuilder.vue index 9d751b3f..ca51346e 100644 --- a/src/views/settings/CategoryBuilder.vue +++ b/src/views/settings/CategoryBuilder.vue @@ -113,6 +113,7 @@ import { canonicalEvents } from '~/queries'; import { getClient } from '~/util/awclient'; import CategoryEditModal from '~/components/CategoryEditModal.vue'; import { isRegexBroad, validateRegex } from '~/util/validate'; +import { findCommonPhrases } from '~/util/categorization'; export default { name: 'aw-category-builder', @@ -224,26 +225,7 @@ export default { ); const events = data[0]; - const words = new Map(); - for (const event of events) { - const words_in_event = event.data.title.split(/[\s\-,:()[\]/]/); - for (const word of words_in_event) { - if (word.length <= 2 || this.ignored_words.includes(word)) { - continue; - } - if (words.has(word)) { - words.get(word).duration += event.duration; - words.get(word).events.push(event); - } else { - words.set(word, { - word: word, - duration: event.duration, - events: [event], - }); - } - } - } - this.words = words; + this.words = findCommonPhrases(events, this.ignored_words); this.loading = false; }, showEvents(word) { diff --git a/test/unit/categorization.test.node.ts b/test/unit/categorization.test.node.ts new file mode 100644 index 00000000..3bba8bbe --- /dev/null +++ b/test/unit/categorization.test.node.ts @@ -0,0 +1,108 @@ +import { findCommonPhrases } from '~/util/categorization'; +import { IEvent } from '~/util/interfaces'; + +function makeEvent(title: string, duration: number): IEvent { + return { + timestamp: new Date().toISOString(), + duration, + data: { title }, + }; +} + +describe('findCommonPhrases', () => { + test('returns empty map for empty events', () => { + expect(findCommonPhrases([], [])).toEqual(new Map()); + }); + + test('counts single words by duration', () => { + // Single-word titles produce no bigrams, so durations accumulate directly + const events = [makeEvent('hello', 100), makeEvent('hello', 50), makeEvent('world', 80)]; + const result = findCommonPhrases(events, []); + expect(result.get('hello')?.duration).toBeCloseTo(150); + expect(result.get('world')?.duration).toBeCloseTo(80); + }); + + test('promotes bigram when it dominates both constituent words', () => { + // "Mozilla Firefox" always appears together + const events = [makeEvent('Mozilla Firefox', 100), makeEvent('Mozilla Firefox', 100)]; + const result = findCommonPhrases(events, []); + // Bigram should be promoted + expect(result.get('Mozilla Firefox')).toBeDefined(); + expect(result.get('Mozilla Firefox')?.duration).toBe(200); + // Constituent word durations reduced to 0 + expect(result.get('Mozilla')?.duration).toBe(0); + expect(result.get('Firefox')?.duration).toBe(0); + }); + + test('does not promote bigram when one word appears independently too often', () => { + const events = [ + makeEvent('Mozilla Firefox', 60), + makeEvent('Mozilla Browser', 100), // "Mozilla" has independent time + ]; + // Mozilla total: 160, Firefox: 60, bigram "Mozilla Firefox": 60 + // 60/160 = 0.375 < 0.5 → bigram NOT promoted + const result = findCommonPhrases(events, []); + expect(result.get('Mozilla Firefox')).toBeUndefined(); + expect(result.get('Mozilla')).toBeDefined(); + expect(result.get('Firefox')).toBeDefined(); + }); + + test('filters out words with length <= 2', () => { + const events = [makeEvent('is at go home', 100)]; + const result = findCommonPhrases(events, []); + expect(result.get('is')).toBeUndefined(); + expect(result.get('at')).toBeUndefined(); + expect(result.get('go')).toBeUndefined(); + expect(result.get('home')).toBeDefined(); + }); + + test('filters out ignored words', () => { + const events = [makeEvent('GitHub Chrome Test', 100)]; + const result = findCommonPhrases(events, ['GitHub', 'Chrome']); + expect(result.get('GitHub')).toBeUndefined(); + expect(result.get('Chrome')).toBeUndefined(); + expect(result.get('Test')).toBeDefined(); + }); + + test('ignored words are not used as bigram components', () => { + const events = [makeEvent('GitHub Desktop', 100), makeEvent('GitHub Desktop', 100)]; + const result = findCommonPhrases(events, ['GitHub']); + // "GitHub" is ignored, so "GitHub Desktop" bigram should not be promoted + expect(result.get('GitHub Desktop')).toBeUndefined(); + expect(result.get('Desktop')).toBeDefined(); + }); + + test('handles titles split by various separator characters', () => { + const events = [makeEvent('foo-bar,baz:qux(quux)', 100)]; + const result = findCommonPhrases(events, []); + expect(result.get('foo')).toBeDefined(); + expect(result.get('bar')).toBeDefined(); + expect(result.get('baz')).toBeDefined(); + expect(result.get('qux')).toBeDefined(); + expect(result.get('quux')).toBeDefined(); + }); + + test('accumulated duration across multiple events', () => { + const events = [ + makeEvent('Python Programming', 30), + makeEvent('Python Programming', 30), + makeEvent('Python Programming', 30), + makeEvent('Python Programming', 30), + ]; + const result = findCommonPhrases(events, []); + // All events have same title → bigram fully dominates + expect(result.get('Python Programming')?.duration).toBe(120); + expect(result.get('Python')?.duration).toBe(0); + expect(result.get('Programming')?.duration).toBe(0); + }); + + test('returns Map with word entries containing events list', () => { + const events = [makeEvent('Hello World', 100)]; + const result = findCommonPhrases(events, []); + const entry = result.get('Hello'); + expect(entry).toBeDefined(); + expect(entry?.word).toBe('Hello'); + expect(entry?.events).toHaveLength(1); + expect(entry?.events[0]).toBe(events[0]); + }); +}); From 99b073f18feef4d9c7f1d0412d9de3d740a9e70b Mon Sep 17 00:00:00 2001 From: TimeToBuildBob Date: Tue, 14 Apr 2026 17:38:55 +0000 Subject: [PATCH 2/4] fix(categorization): snapshot word durations before bigram promotion loop Without this, promoting a bigram (e.g. 'Alpha Beta') reduces constituent word durations in-place. A later bigram that shares the middle word (e.g. 'Beta Gamma') then sees Beta.duration=0, so the check becomes 10/0 = Infinity > 0.5 and the weak bigram is incorrectly promoted. Fix: build an originalDurations snapshot before the Step 3 loop and use it for all threshold comparisons; mutations to entry.duration still happen (for accurate display) but no longer corrupt subsequent checks. Also adds a regression test that fails on the unfixed code. --- src/util/categorization.ts | 13 ++++++++++++- test/unit/categorization.test.node.ts | 16 ++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/src/util/categorization.ts b/src/util/categorization.ts index 13c4fec5..92af024c 100644 --- a/src/util/categorization.ts +++ b/src/util/categorization.ts @@ -64,6 +64,15 @@ export function findCommonPhrases( } // Step 3: Promote bigrams that dominate both constituent words (>50% threshold) + // Snapshot original durations before the loop so that promoting one bigram + // (which reduces its constituent words' durations) does not corrupt the + // threshold check for a later bigram that shares a common word. Without this, + // a trigram such as "Alpha Beta Gamma" causes Beta.duration to reach 0 after + // "Alpha Beta" is promoted, making 10/0 = Infinity pass the check for "Beta Gamma" + // even though only 10/110 ≈ 9% of Beta's original time was spent in that bigram. + const originalDurations = new Map( + Array.from(words.entries(), ([w, e]) => [w, e.duration]) + ); for (const [bigram, bigramEntry] of bigrams) { const spaceIdx = bigram.indexOf(' '); const word1 = bigram.slice(0, spaceIdx); @@ -73,7 +82,9 @@ export function findCommonPhrases( if (!w1Entry || !w2Entry) continue; const bigram_duration = bigramEntry.duration; - if (bigram_duration / w1Entry.duration > 0.5 && bigram_duration / w2Entry.duration > 0.5) { + const w1OrigDuration = originalDurations.get(word1)!; + const w2OrigDuration = originalDurations.get(word2)!; + if (bigram_duration / w1OrigDuration > 0.5 && bigram_duration / w2OrigDuration > 0.5) { // Promote bigram, reduce constituent word durations words.set(bigram, { word: bigram, duration: bigram_duration, events: bigramEntry.events }); w1Entry.duration -= bigram_duration; diff --git a/test/unit/categorization.test.node.ts b/test/unit/categorization.test.node.ts index 3bba8bbe..ceea6193 100644 --- a/test/unit/categorization.test.node.ts +++ b/test/unit/categorization.test.node.ts @@ -105,4 +105,20 @@ describe('findCommonPhrases', () => { expect(entry?.events).toHaveLength(1); expect(entry?.events[0]).toBe(events[0]); }); + + test('trigram: does not double-promote when middle word duration is consumed', () => { + // "Alpha Beta" dominates (110s); "Alpha Beta Gamma" appears rarely (10s). + // Word totals: Alpha=110, Beta=110, Gamma=10 + // Bigram totals: "Alpha Beta"=110, "Beta Gamma"=10 + // "Alpha Beta" correctly promotes (110/110 > 0.5 for both words). + // After promotion, Beta.duration drops to 0. Without snapshotting original + // durations, "Beta Gamma" sees 10/0 = Infinity and incorrectly promotes too. + // With the fix, the check uses the original Beta=110: 10/110 ≈ 0.09 < 0.5 → no promotion. + const events = [makeEvent('Alpha Beta', 100), makeEvent('Alpha Beta Gamma', 10)]; + const result = findCommonPhrases(events, []); + expect(result.get('Alpha Beta')).toBeDefined(); // correctly promoted + expect(result.get('Beta Gamma')).toBeUndefined(); // must NOT be promoted + const betaEntry = result.get('Beta'); + expect(betaEntry?.duration).toBeGreaterThanOrEqual(0); // no negative durations + }); }); From 4a853eb44fec1d5484e856b1e5cca853432b988d Mon Sep 17 00:00:00 2001 From: TimeToBuildBob Date: Tue, 14 Apr 2026 17:42:23 +0000 Subject: [PATCH 3/4] fix(categorization): replace non-null assertions with explicit undefined guards --- src/util/categorization.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/util/categorization.ts b/src/util/categorization.ts index 92af024c..c900cdcd 100644 --- a/src/util/categorization.ts +++ b/src/util/categorization.ts @@ -79,11 +79,11 @@ export function findCommonPhrases( const word2 = bigram.slice(spaceIdx + 1); const w1Entry = words.get(word1); const w2Entry = words.get(word2); - if (!w1Entry || !w2Entry) continue; + const w1OrigDuration = originalDurations.get(word1); + const w2OrigDuration = originalDurations.get(word2); + if (!w1Entry || !w2Entry || w1OrigDuration === undefined || w2OrigDuration === undefined) continue; const bigram_duration = bigramEntry.duration; - const w1OrigDuration = originalDurations.get(word1)!; - const w2OrigDuration = originalDurations.get(word2)!; if (bigram_duration / w1OrigDuration > 0.5 && bigram_duration / w2OrigDuration > 0.5) { // Promote bigram, reduce constituent word durations words.set(bigram, { word: bigram, duration: bigram_duration, events: bigramEntry.events }); From 224a5a1933efd1e89a6210665d0a6fe9c8226f44 Mon Sep 17 00:00:00 2001 From: TimeToBuildBob Date: Tue, 14 Apr 2026 17:54:24 +0000 Subject: [PATCH 4/4] style(categorization): fix prettier line-length warning in guard clause --- src/util/categorization.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/util/categorization.ts b/src/util/categorization.ts index c900cdcd..dee9b16d 100644 --- a/src/util/categorization.ts +++ b/src/util/categorization.ts @@ -81,7 +81,8 @@ export function findCommonPhrases( const w2Entry = words.get(word2); const w1OrigDuration = originalDurations.get(word1); const w2OrigDuration = originalDurations.get(word2); - if (!w1Entry || !w2Entry || w1OrigDuration === undefined || w2OrigDuration === undefined) continue; + if (!w1Entry || !w2Entry || w1OrigDuration === undefined || w2OrigDuration === undefined) + continue; const bigram_duration = bigramEntry.duration; if (bigram_duration / w1OrigDuration > 0.5 && bigram_duration / w2OrigDuration > 0.5) {