diff --git a/src/util/categorization.ts b/src/util/categorization.ts new file mode 100644 index 00000000..dee9b16d --- /dev/null +++ b/src/util/categorization.ts @@ -0,0 +1,97 @@ +import { IEvent } from '~/util/interfaces'; + +// Regex used to split event titles into words +const SPLIT_REGEX = /[\s\-,:()[\]/]/; + +export interface WordEntry { + word: string; + duration: number; + events: IEvent[]; +} + +/** + * Finds common words and bigrams (two-word phrases) in event titles, + * weighted by time duration rather than event count. + * + * For each bigram that accounts for >50% of the total duration of both + * constituent words, the bigram is promoted and the constituent words' + * durations are reduced accordingly. This means "Mozilla Firefox" appears + * instead of separate "Mozilla" and "Firefox" entries when they almost + * always co-occur. + * + * Words with length <= 2 or in `ignored_words` are skipped. + */ +export function findCommonPhrases( + events: IEvent[], + ignored_words: string[] +): Map { + const words = new Map(); + const bigrams = new Map(); + + // Step 1: Build word duration dictionary + for (const event of events) { + for (const word of event.data.title.split(SPLIT_REGEX)) { + if (word.length <= 2 || ignored_words.includes(word)) { + continue; + } + const entry = words.get(word); + if (entry) { + entry.duration += event.duration; + entry.events.push(event); + } else { + words.set(word, { word, duration: event.duration, events: [event] }); + } + } + } + + // Step 2: Build bigram duration dictionary (skip bigrams with filtered words) + for (const event of events) { + const parts = event.data.title.split(SPLIT_REGEX); + for (let i = 0; i < parts.length - 1; i++) { + const w1 = parts[i]; + const w2 = parts[i + 1]; + if (w1.length <= 2 || ignored_words.includes(w1)) continue; + if (w2.length <= 2 || ignored_words.includes(w2)) continue; + const bigram = `${w1} ${w2}`; + const entry = bigrams.get(bigram); + if (entry) { + entry.duration += event.duration; + entry.events.push(event); + } else { + bigrams.set(bigram, { bigram, duration: event.duration, events: [event] }); + } + } + } + + // Step 3: Promote bigrams that dominate both constituent words (>50% threshold) + // Snapshot original durations before the loop so that promoting one bigram + // (which reduces its constituent words' durations) does not corrupt the + // threshold check for a later bigram that shares a common word. Without this, + // a trigram such as "Alpha Beta Gamma" causes Beta.duration to reach 0 after + // "Alpha Beta" is promoted, making 10/0 = Infinity pass the check for "Beta Gamma" + // even though only 10/110 ≈ 9% of Beta's original time was spent in that bigram. + const originalDurations = new Map( + Array.from(words.entries(), ([w, e]) => [w, e.duration]) + ); + for (const [bigram, bigramEntry] of bigrams) { + const spaceIdx = bigram.indexOf(' '); + const word1 = bigram.slice(0, spaceIdx); + const word2 = bigram.slice(spaceIdx + 1); + const w1Entry = words.get(word1); + const w2Entry = words.get(word2); + const w1OrigDuration = originalDurations.get(word1); + const w2OrigDuration = originalDurations.get(word2); + if (!w1Entry || !w2Entry || w1OrigDuration === undefined || w2OrigDuration === undefined) + continue; + + const bigram_duration = bigramEntry.duration; + if (bigram_duration / w1OrigDuration > 0.5 && bigram_duration / w2OrigDuration > 0.5) { + // Promote bigram, reduce constituent word durations + words.set(bigram, { word: bigram, duration: bigram_duration, events: bigramEntry.events }); + w1Entry.duration -= bigram_duration; + w2Entry.duration -= bigram_duration; + } + } + + return words; +} diff --git a/src/views/settings/CategoryBuilder.vue b/src/views/settings/CategoryBuilder.vue index 9d751b3f..ca51346e 100644 --- a/src/views/settings/CategoryBuilder.vue +++ b/src/views/settings/CategoryBuilder.vue @@ -113,6 +113,7 @@ import { canonicalEvents } from '~/queries'; import { getClient } from '~/util/awclient'; import CategoryEditModal from '~/components/CategoryEditModal.vue'; import { isRegexBroad, validateRegex } from '~/util/validate'; +import { findCommonPhrases } from '~/util/categorization'; export default { name: 'aw-category-builder', @@ -224,26 +225,7 @@ export default { ); const events = data[0]; - const words = new Map(); - for (const event of events) { - const words_in_event = event.data.title.split(/[\s\-,:()[\]/]/); - for (const word of words_in_event) { - if (word.length <= 2 || this.ignored_words.includes(word)) { - continue; - } - if (words.has(word)) { - words.get(word).duration += event.duration; - words.get(word).events.push(event); - } else { - words.set(word, { - word: word, - duration: event.duration, - events: [event], - }); - } - } - } - this.words = words; + this.words = findCommonPhrases(events, this.ignored_words); this.loading = false; }, showEvents(word) { diff --git a/test/unit/categorization.test.node.ts b/test/unit/categorization.test.node.ts new file mode 100644 index 00000000..ceea6193 --- /dev/null +++ b/test/unit/categorization.test.node.ts @@ -0,0 +1,124 @@ +import { findCommonPhrases } from '~/util/categorization'; +import { IEvent } from '~/util/interfaces'; + +function makeEvent(title: string, duration: number): IEvent { + return { + timestamp: new Date().toISOString(), + duration, + data: { title }, + }; +} + +describe('findCommonPhrases', () => { + test('returns empty map for empty events', () => { + expect(findCommonPhrases([], [])).toEqual(new Map()); + }); + + test('counts single words by duration', () => { + // Single-word titles produce no bigrams, so durations accumulate directly + const events = [makeEvent('hello', 100), makeEvent('hello', 50), makeEvent('world', 80)]; + const result = findCommonPhrases(events, []); + expect(result.get('hello')?.duration).toBeCloseTo(150); + expect(result.get('world')?.duration).toBeCloseTo(80); + }); + + test('promotes bigram when it dominates both constituent words', () => { + // "Mozilla Firefox" always appears together + const events = [makeEvent('Mozilla Firefox', 100), makeEvent('Mozilla Firefox', 100)]; + const result = findCommonPhrases(events, []); + // Bigram should be promoted + expect(result.get('Mozilla Firefox')).toBeDefined(); + expect(result.get('Mozilla Firefox')?.duration).toBe(200); + // Constituent word durations reduced to 0 + expect(result.get('Mozilla')?.duration).toBe(0); + expect(result.get('Firefox')?.duration).toBe(0); + }); + + test('does not promote bigram when one word appears independently too often', () => { + const events = [ + makeEvent('Mozilla Firefox', 60), + makeEvent('Mozilla Browser', 100), // "Mozilla" has independent time + ]; + // Mozilla total: 160, Firefox: 60, bigram "Mozilla Firefox": 60 + // 60/160 = 0.375 < 0.5 → bigram NOT promoted + const result = findCommonPhrases(events, []); + expect(result.get('Mozilla Firefox')).toBeUndefined(); + expect(result.get('Mozilla')).toBeDefined(); + expect(result.get('Firefox')).toBeDefined(); + }); + + test('filters out words with length <= 2', () => { + const events = [makeEvent('is at go home', 100)]; + const result = findCommonPhrases(events, []); + expect(result.get('is')).toBeUndefined(); + expect(result.get('at')).toBeUndefined(); + expect(result.get('go')).toBeUndefined(); + expect(result.get('home')).toBeDefined(); + }); + + test('filters out ignored words', () => { + const events = [makeEvent('GitHub Chrome Test', 100)]; + const result = findCommonPhrases(events, ['GitHub', 'Chrome']); + expect(result.get('GitHub')).toBeUndefined(); + expect(result.get('Chrome')).toBeUndefined(); + expect(result.get('Test')).toBeDefined(); + }); + + test('ignored words are not used as bigram components', () => { + const events = [makeEvent('GitHub Desktop', 100), makeEvent('GitHub Desktop', 100)]; + const result = findCommonPhrases(events, ['GitHub']); + // "GitHub" is ignored, so "GitHub Desktop" bigram should not be promoted + expect(result.get('GitHub Desktop')).toBeUndefined(); + expect(result.get('Desktop')).toBeDefined(); + }); + + test('handles titles split by various separator characters', () => { + const events = [makeEvent('foo-bar,baz:qux(quux)', 100)]; + const result = findCommonPhrases(events, []); + expect(result.get('foo')).toBeDefined(); + expect(result.get('bar')).toBeDefined(); + expect(result.get('baz')).toBeDefined(); + expect(result.get('qux')).toBeDefined(); + expect(result.get('quux')).toBeDefined(); + }); + + test('accumulated duration across multiple events', () => { + const events = [ + makeEvent('Python Programming', 30), + makeEvent('Python Programming', 30), + makeEvent('Python Programming', 30), + makeEvent('Python Programming', 30), + ]; + const result = findCommonPhrases(events, []); + // All events have same title → bigram fully dominates + expect(result.get('Python Programming')?.duration).toBe(120); + expect(result.get('Python')?.duration).toBe(0); + expect(result.get('Programming')?.duration).toBe(0); + }); + + test('returns Map with word entries containing events list', () => { + const events = [makeEvent('Hello World', 100)]; + const result = findCommonPhrases(events, []); + const entry = result.get('Hello'); + expect(entry).toBeDefined(); + expect(entry?.word).toBe('Hello'); + expect(entry?.events).toHaveLength(1); + expect(entry?.events[0]).toBe(events[0]); + }); + + test('trigram: does not double-promote when middle word duration is consumed', () => { + // "Alpha Beta" dominates (110s); "Alpha Beta Gamma" appears rarely (10s). + // Word totals: Alpha=110, Beta=110, Gamma=10 + // Bigram totals: "Alpha Beta"=110, "Beta Gamma"=10 + // "Alpha Beta" correctly promotes (110/110 > 0.5 for both words). + // After promotion, Beta.duration drops to 0. Without snapshotting original + // durations, "Beta Gamma" sees 10/0 = Infinity and incorrectly promotes too. + // With the fix, the check uses the original Beta=110: 10/110 ≈ 0.09 < 0.5 → no promotion. + const events = [makeEvent('Alpha Beta', 100), makeEvent('Alpha Beta Gamma', 10)]; + const result = findCommonPhrases(events, []); + expect(result.get('Alpha Beta')).toBeDefined(); // correctly promoted + expect(result.get('Beta Gamma')).toBeUndefined(); // must NOT be promoted + const betaEntry = result.get('Beta'); + expect(betaEntry?.duration).toBeGreaterThanOrEqual(0); // no negative durations + }); +});