ActivityWatch · ErikBjare · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026 · Apr 14, 2026
diff --git a/src/util/categorization.ts b/src/util/categorization.ts
@@ -0,0 +1,97 @@
+import { IEvent } from '~/util/interfaces';
+
+// Splits event titles into tokens on whitespace and any of: - , : ( ) [ ] /
+const SPLIT_REGEX = /[\s\-,:()[\]/]/;
+
+export interface WordEntry {
+  word: string; // the word itself, or "word1 word2" for a promoted bigram
+  duration: number; // summed event durations, minus time moved to promoted bigrams
+  events: IEvent[]; // events whose title contained this word/bigram
+}
+
+/**
+ * Finds common words and bigrams (two-word phrases) in event titles,
+ * weighted by time duration rather than event count.
+ *
+ * A bigram is promoted to its own entry when its duration exceeds 50% of the
+ * pre-promotion duration of BOTH of its words. Each promotion subtracts the
+ * bigram's duration from both words, clamped at 0 so a word shared by several
+ * promoted bigrams never goes negative. The words stay in the map, so
+ * "Mozilla" and "Firefox" remain (with little or no duration) beside
+ * "Mozilla Firefox" when they almost always co-occur.
+ *
+ * An event is credited at most once per distinct word or bigram. Words with
+ * length <= 2 or in `ignored_words` (exact match) are skipped, as are events
+ * whose `data.title` is not a string.
+ *
+ * @returns Map from each word or promoted bigram to its duration and events.
+ */
+export function findCommonPhrases(
+  events: IEvent[],
+  ignored_words: string[]
+): Map<string, WordEntry> {
+  const words = new Map<string, WordEntry>();
+  const bigrams = new Map<string, WordEntry>();
+  const ignored = new Set(ignored_words);
+  const isKept = (w: string): boolean => w.length > 2 && !ignored.has(w);
+  // Adds `event` to the entry stored under `key`, creating it on first use.
+  const credit = (target: Map<string, WordEntry>, key: string, event: IEvent): void => {
+    const entry = target.get(key);
+    if (entry) {
+      entry.duration += event.duration;
+      entry.events.push(event);
+    } else {
+      target.set(key, { word: key, duration: event.duration, events: [event] });
+    }
+  };
+
+  // Steps 1 and 2: build the word and bigram duration dictionaries.
+  for (const event of events) {
+    const title: unknown = event.data.title;
+    if (typeof title !== 'string') continue;
+    const parts = title.split(SPLIT_REGEX);
+    // Keys already credited for this event, so repeats do not count twice.
+    const seen = new Set<string>();
+    for (let i = 0; i < parts.length; i++) {
+      const w1 = parts[i];
+      if (!isKept(w1)) continue;
+      if (!seen.has(w1)) {
+        seen.add(w1);
+        credit(words, w1, event);
+      }
+      // A bigram needs two directly adjacent kept tokens.
+      const w2 = parts[i + 1];
+      if (w2 === undefined || !isKept(w2)) continue;
+      const bigram = `${w1} ${w2}`;
+      if (!seen.has(bigram)) {
+        seen.add(bigram);
+        credit(bigrams, bigram, event);
+      }
+    }
+  }
+
+  // Step 3: promote bigrams that dominate both constituent words (>50%).
+  // Ratios use durations snapshotted before any promotion, so draining a shared
+  // word (e.g. "Beta" in "Alpha Beta Gamma") cannot inflate a later bigram's ratio.
+  const originalDurations = new Map<string, number>();
+  for (const [word, entry] of words) originalDurations.set(word, entry.duration);
+  for (const [bigram, bigramEntry] of bigrams) {
+    const spaceIdx = bigram.indexOf(' ');
+    const word1 = bigram.slice(0, spaceIdx);
+    const word2 = bigram.slice(spaceIdx + 1);
+    const w1Entry = words.get(word1);
+    const w2Entry = words.get(word2);
+    const w1Orig = originalDurations.get(word1);
+    const w2Orig = originalDurations.get(word2);
+    if (!w1Entry || !w2Entry || w1Orig === undefined || w2Orig === undefined) continue;
+    const share = bigramEntry.duration;
+    if (share / w1Orig > 0.5 && share / w2Orig > 0.5) {
+      words.set(bigram, { word: bigram, duration: share, events: bigramEntry.events });
+      w1Entry.duration = Math.max(0, w1Entry.duration - share);
+      // A doubled word ("foo foo") is a single entry; subtract from it only once.
+      if (w2Entry !== w1Entry) w2Entry.duration = Math.max(0, w2Entry.duration - share);
+    }
+  }
+
+  return words;
+}
diff --git a/src/views/settings/CategoryBuilder.vue b/src/views/settings/CategoryBuilder.vue
@@ -113,6 +113,7 @@ import { canonicalEvents } from '~/queries';
 import { getClient } from '~/util/awclient';
 import CategoryEditModal from '~/components/CategoryEditModal.vue';
 import { isRegexBroad, validateRegex } from '~/util/validate';
+import { findCommonPhrases } from '~/util/categorization';
 
 export default {
   name: 'aw-category-builder',
@@ -224,26 +225,7 @@ export default {
       );
 
       const events = data[0];
-      const words = new Map<string, { word: string; duration: number; events: any[] }>();
-      for (const event of events) {
-        const words_in_event = event.data.title.split(/[\s\-,:()[\]/]/);
-        for (const word of words_in_event) {
-          if (word.length <= 2 || this.ignored_words.includes(word)) {
-            continue;
-          }
-          if (words.has(word)) {
-            words.get(word).duration += event.duration;
-            words.get(word).events.push(event);
-          } else {
-            words.set(word, {
-              word: word,
-              duration: event.duration,
-              events: [event],
-            });
-          }
-        }
-      }
-      this.words = words;
+      this.words = findCommonPhrases(events, this.ignored_words);
       this.loading = false;
     },
     showEvents(word) {

diff --git a/test/unit/categorization.test.node.ts b/test/unit/categorization.test.node.ts
@@ -0,0 +1,124 @@
+import { findCommonPhrases } from '~/util/categorization';
+import { IEvent } from '~/util/interfaces';
+
+// Builds a minimal event stamped with the current time for the given title/duration.
+function makeEvent(title: string, duration: number): IEvent {
+  const timestamp = new Date().toISOString();
+  const data = { title };
+  const event: IEvent = { timestamp, duration, data };
+  return event;
+}
+
+describe('findCommonPhrases', () => {
+  test('returns empty map for empty events', () => {
+    expect(findCommonPhrases([], [])).toEqual(new Map());
+  });
+
+  test('counts single words by duration', () => {
+    // Single-word titles produce no bigrams, so durations accumulate directly
+    const events = [makeEvent('hello', 100), makeEvent('hello', 50), makeEvent('world', 80)];
+    const result = findCommonPhrases(events, []);
+    expect(result.get('hello')?.duration).toBeCloseTo(150);
+    expect(result.get('world')?.duration).toBeCloseTo(80);
+  });
+
+  test('promotes bigram when it dominates both constituent words', () => {
+    // "Mozilla" and "Firefox" only ever occur together, as adjacent words
+    const events = [makeEvent('Mozilla Firefox', 100), makeEvent('Mozilla Firefox', 100)];
+    const result = findCommonPhrases(events, []);
+    // Bigram should be promoted
+    expect(result.get('Mozilla Firefox')).toBeDefined();
+    expect(result.get('Mozilla Firefox')?.duration).toBe(200);
+    // Constituent words stay in the map with their durations reduced to 0
+    expect(result.get('Mozilla')?.duration).toBe(0);
+    expect(result.get('Firefox')?.duration).toBe(0);
+  });
+
+  test('does not promote bigram when one word appears independently too often', () => {
+    const events = [
+      makeEvent('Mozilla Firefox', 60),
+      makeEvent('Mozilla Browser', 100), // "Mozilla" has independent time
+    ];
+    // Mozilla total: 160, Firefox: 60, bigram "Mozilla Firefox": 60
+    // Firefox passes (60/60), but Mozilla does not: 60/160 = 0.375 < 0.5 → NOT promoted
+    const result = findCommonPhrases(events, []);
+    expect(result.get('Mozilla Firefox')).toBeUndefined();
+    expect(result.get('Mozilla')).toBeDefined();
+    expect(result.get('Firefox')).toBeDefined();
+  });
+
+  test('filters out words with length <= 2', () => {
+    const events = [makeEvent('is at go home', 100)];
+    const result = findCommonPhrases(events, []);
+    expect(result.get('is')).toBeUndefined();
+    expect(result.get('at')).toBeUndefined();
+    expect(result.get('go')).toBeUndefined();
+    expect(result.get('home')).toBeDefined();
+  });
+
+  test('filters out ignored words', () => {
+    const events = [makeEvent('GitHub Chrome Test', 100)];
+    const result = findCommonPhrases(events, ['GitHub', 'Chrome']);
+    expect(result.get('GitHub')).toBeUndefined();
+    expect(result.get('Chrome')).toBeUndefined();
+    expect(result.get('Test')).toBeDefined();
+  });
+
+  test('ignored words are not used as bigram components', () => {
+    const events = [makeEvent('GitHub Desktop', 100), makeEvent('GitHub Desktop', 100)];
+    const result = findCommonPhrases(events, ['GitHub']);
+    // "GitHub" is ignored, so "GitHub Desktop" bigram should not be promoted
+    expect(result.get('GitHub Desktop')).toBeUndefined();
+    expect(result.get('Desktop')).toBeDefined();
+  });
+
+  test('handles titles split by various separator characters', () => {
+    const events = [makeEvent('foo-bar,baz:qux(quux)', 100)];
+    const result = findCommonPhrases(events, []);
+    expect(result.get('foo')).toBeDefined();
+    expect(result.get('bar')).toBeDefined();
+    expect(result.get('baz')).toBeDefined();
+    expect(result.get('qux')).toBeDefined();
+    expect(result.get('quux')).toBeDefined();
+  });
+
+  test('accumulated duration across multiple events', () => {
+    const events = [
+      makeEvent('Python Programming', 30),
+      makeEvent('Python Programming', 30),
+      makeEvent('Python Programming', 30),
+      makeEvent('Python Programming', 30),
+    ];
+    const result = findCommonPhrases(events, []);
+    // All events have same title → bigram fully dominates
+    expect(result.get('Python Programming')?.duration).toBe(120);
+    expect(result.get('Python')?.duration).toBe(0);
+    expect(result.get('Programming')?.duration).toBe(0);
+  });
+
+  test('returns Map with word entries containing events list', () => {
+    const events = [makeEvent('Hello World', 100)];
+    const result = findCommonPhrases(events, []);
+    const entry = result.get('Hello');
+    expect(entry).toBeDefined();
+    expect(entry?.word).toBe('Hello');
+    expect(entry?.events).toHaveLength(1);
+    expect(entry?.events[0]).toBe(events[0]);
+  });
+
+  test('trigram: does not double-promote when middle word duration is consumed', () => {
+    // Titles: "Alpha Beta" (duration 100, common) and "Alpha Beta Gamma" (duration 10, rare).
+    // Word totals: Alpha=110, Beta=110, Gamma=10
+    // Bigram totals: "Alpha Beta"=110, "Beta Gamma"=10
+    // "Alpha Beta" correctly promotes (110/110 > 0.5 for both words).
+    // After promotion, Beta.duration drops to 0.  Without snapshotting original
+    // durations, "Beta Gamma" sees 10/0 = Infinity and incorrectly promotes too.
+    // With the fix, the check uses the original Beta=110: 10/110 ≈ 0.09 < 0.5 → no promotion.
+    const events = [makeEvent('Alpha Beta', 100), makeEvent('Alpha Beta Gamma', 10)];
+    const result = findCommonPhrases(events, []);
+    expect(result.get('Alpha Beta')).toBeDefined(); // correctly promoted
+    expect(result.get('Beta Gamma')).toBeUndefined(); // must NOT be promoted
+    const betaEntry = result.get('Beta');
+    expect(betaEntry?.duration).toBeGreaterThanOrEqual(0); // no negative durations
+  });
+});