-
-
Notifications
You must be signed in to change notification settings - Fork 145
feat(categorization): extract findCommonPhrases util with bigram support and tests #803
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c62ec32
99b073f
4a853eb
224a5a1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,97 @@ | ||
| import { IEvent } from '~/util/interfaces'; | ||
|
|
||
| // Regex used to split event titles into words | ||
| const SPLIT_REGEX = /[\s\-,:()[\]/]/; | ||
|
|
||
| export interface WordEntry { | ||
| word: string; | ||
| duration: number; | ||
| events: IEvent[]; | ||
| } | ||
|
|
||
| /** | ||
| * Finds common words and bigrams (two-word phrases) in event titles, | ||
| * weighted by time duration rather than event count. | ||
| * | ||
| * For each bigram that accounts for >50% of the total duration of both | ||
| * constituent words, the bigram is promoted and the constituent words' | ||
| * durations are reduced accordingly. This means "Mozilla Firefox" appears | ||
| * instead of separate "Mozilla" and "Firefox" entries when they almost | ||
| * always co-occur. | ||
| * | ||
| * Words with length <= 2 or in `ignored_words` are skipped. | ||
| */ | ||
| export function findCommonPhrases( | ||
| events: IEvent[], | ||
| ignored_words: string[] | ||
| ): Map<string, WordEntry> { | ||
| const words = new Map<string, WordEntry>(); | ||
| const bigrams = new Map<string, { bigram: string; duration: number; events: IEvent[] }>(); | ||
|
|
||
| // Step 1: Build word duration dictionary | ||
| for (const event of events) { | ||
| for (const word of event.data.title.split(SPLIT_REGEX)) { | ||
| if (word.length <= 2 || ignored_words.includes(word)) { | ||
| continue; | ||
| } | ||
| const entry = words.get(word); | ||
| if (entry) { | ||
| entry.duration += event.duration; | ||
| entry.events.push(event); | ||
| } else { | ||
| words.set(word, { word, duration: event.duration, events: [event] }); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Step 2: Build bigram duration dictionary (skip bigrams with filtered words) | ||
| for (const event of events) { | ||
| const parts = event.data.title.split(SPLIT_REGEX); | ||
| for (let i = 0; i < parts.length - 1; i++) { | ||
| const w1 = parts[i]; | ||
| const w2 = parts[i + 1]; | ||
| if (w1.length <= 2 || ignored_words.includes(w1)) continue; | ||
| if (w2.length <= 2 || ignored_words.includes(w2)) continue; | ||
| const bigram = `${w1} ${w2}`; | ||
| const entry = bigrams.get(bigram); | ||
| if (entry) { | ||
| entry.duration += event.duration; | ||
| entry.events.push(event); | ||
| } else { | ||
| bigrams.set(bigram, { bigram, duration: event.duration, events: [event] }); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Step 3: Promote bigrams that dominate both constituent words (>50% threshold) | ||
| // Snapshot original durations before the loop so that promoting one bigram | ||
| // (which reduces its constituent words' durations) does not corrupt the | ||
| // threshold check for a later bigram that shares a common word. Without this, | ||
| // a trigram such as "Alpha Beta Gamma" causes Beta.duration to reach 0 after | ||
| // "Alpha Beta" is promoted, making 10/0 = Infinity pass the check for "Beta Gamma" | ||
| // even though only 10/110 ≈ 9% of Beta's original time was spent in that bigram. | ||
| const originalDurations = new Map<string, number>( | ||
| Array.from(words.entries(), ([w, e]) => [w, e.duration]) | ||
| ); | ||
| for (const [bigram, bigramEntry] of bigrams) { | ||
| const spaceIdx = bigram.indexOf(' '); | ||
| const word1 = bigram.slice(0, spaceIdx); | ||
| const word2 = bigram.slice(spaceIdx + 1); | ||
| const w1Entry = words.get(word1); | ||
| const w2Entry = words.get(word2); | ||
| const w1OrigDuration = originalDurations.get(word1); | ||
| const w2OrigDuration = originalDurations.get(word2); | ||
| if (!w1Entry || !w2Entry || w1OrigDuration === undefined || w2OrigDuration === undefined) | ||
| continue; | ||
|
|
||
| const bigram_duration = bigramEntry.duration; | ||
| if (bigram_duration / w1OrigDuration > 0.5 && bigram_duration / w2OrigDuration > 0.5) { | ||
| // Promote bigram, reduce constituent word durations | ||
| words.set(bigram, { word: bigram, duration: bigram_duration, events: bigramEntry.events }); | ||
| w1Entry.duration -= bigram_duration; | ||
| w2Entry.duration -= bigram_duration; | ||
| } | ||
| } | ||
|
Comment on lines
+66
to
+94
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The promotion loop reads Snapshot the original durations before the loop so each promotion decision is independent: // Step 3: Promote bigrams that dominate both constituent words (>50% threshold)
const originalDurations = new Map<string, number>();
for (const [w, entry] of words) {
originalDurations.set(w, entry.duration);
}
for (const [bigram, bigramEntry] of bigrams) {
const spaceIdx = bigram.indexOf(' ');
const word1 = bigram.slice(0, spaceIdx);
const word2 = bigram.slice(spaceIdx + 1);
const w1Entry = words.get(word1);
const w2Entry = words.get(word2);
if (!w1Entry || !w2Entry) continue;
const bigram_duration = bigramEntry.duration;
const orig1 = originalDurations.get(word1) ?? 0;
const orig2 = originalDurations.get(word2) ?? 0;
if (bigram_duration / orig1 > 0.5 && bigram_duration / orig2 > 0.5) {
words.set(bigram, { word: bigram, duration: bigram_duration, events: bigramEntry.events });
w1Entry.duration -= bigram_duration;
w2Entry.duration -= bigram_duration;
}
} |
||
|
|
||
| return words; | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,124 @@ | ||
| import { findCommonPhrases } from '~/util/categorization'; | ||
| import { IEvent } from '~/util/interfaces'; | ||
|
|
||
| function makeEvent(title: string, duration: number): IEvent { | ||
| return { | ||
| timestamp: new Date().toISOString(), | ||
| duration, | ||
| data: { title }, | ||
| }; | ||
| } | ||
|
|
||
| describe('findCommonPhrases', () => { | ||
| test('returns empty map for empty events', () => { | ||
| expect(findCommonPhrases([], [])).toEqual(new Map()); | ||
| }); | ||
|
|
||
| test('counts single words by duration', () => { | ||
| // Single-word titles produce no bigrams, so durations accumulate directly | ||
| const events = [makeEvent('hello', 100), makeEvent('hello', 50), makeEvent('world', 80)]; | ||
| const result = findCommonPhrases(events, []); | ||
| expect(result.get('hello')?.duration).toBeCloseTo(150); | ||
| expect(result.get('world')?.duration).toBeCloseTo(80); | ||
| }); | ||
|
|
||
| test('promotes bigram when it dominates both constituent words', () => { | ||
| // "Mozilla Firefox" always appears together | ||
| const events = [makeEvent('Mozilla Firefox', 100), makeEvent('Mozilla Firefox', 100)]; | ||
| const result = findCommonPhrases(events, []); | ||
| // Bigram should be promoted | ||
| expect(result.get('Mozilla Firefox')).toBeDefined(); | ||
| expect(result.get('Mozilla Firefox')?.duration).toBe(200); | ||
| // Constituent word durations reduced to 0 | ||
| expect(result.get('Mozilla')?.duration).toBe(0); | ||
| expect(result.get('Firefox')?.duration).toBe(0); | ||
| }); | ||
|
|
||
| test('does not promote bigram when one word appears independently too often', () => { | ||
| const events = [ | ||
| makeEvent('Mozilla Firefox', 60), | ||
| makeEvent('Mozilla Browser', 100), // "Mozilla" has independent time | ||
| ]; | ||
| // Mozilla total: 160, Firefox: 60, bigram "Mozilla Firefox": 60 | ||
| // 60/160 = 0.375 < 0.5 → bigram NOT promoted | ||
| const result = findCommonPhrases(events, []); | ||
| expect(result.get('Mozilla Firefox')).toBeUndefined(); | ||
| expect(result.get('Mozilla')).toBeDefined(); | ||
| expect(result.get('Firefox')).toBeDefined(); | ||
| }); | ||
|
|
||
| test('filters out words with length <= 2', () => { | ||
| const events = [makeEvent('is at go home', 100)]; | ||
| const result = findCommonPhrases(events, []); | ||
| expect(result.get('is')).toBeUndefined(); | ||
| expect(result.get('at')).toBeUndefined(); | ||
| expect(result.get('go')).toBeUndefined(); | ||
| expect(result.get('home')).toBeDefined(); | ||
| }); | ||
|
|
||
| test('filters out ignored words', () => { | ||
| const events = [makeEvent('GitHub Chrome Test', 100)]; | ||
| const result = findCommonPhrases(events, ['GitHub', 'Chrome']); | ||
| expect(result.get('GitHub')).toBeUndefined(); | ||
| expect(result.get('Chrome')).toBeUndefined(); | ||
| expect(result.get('Test')).toBeDefined(); | ||
| }); | ||
|
|
||
| test('ignored words are not used as bigram components', () => { | ||
| const events = [makeEvent('GitHub Desktop', 100), makeEvent('GitHub Desktop', 100)]; | ||
| const result = findCommonPhrases(events, ['GitHub']); | ||
| // "GitHub" is ignored, so "GitHub Desktop" bigram should not be promoted | ||
| expect(result.get('GitHub Desktop')).toBeUndefined(); | ||
| expect(result.get('Desktop')).toBeDefined(); | ||
| }); | ||
|
|
||
| test('handles titles split by various separator characters', () => { | ||
| const events = [makeEvent('foo-bar,baz:qux(quux)', 100)]; | ||
| const result = findCommonPhrases(events, []); | ||
| expect(result.get('foo')).toBeDefined(); | ||
| expect(result.get('bar')).toBeDefined(); | ||
| expect(result.get('baz')).toBeDefined(); | ||
| expect(result.get('qux')).toBeDefined(); | ||
| expect(result.get('quux')).toBeDefined(); | ||
| }); | ||
|
|
||
| test('accumulated duration across multiple events', () => { | ||
| const events = [ | ||
| makeEvent('Python Programming', 30), | ||
| makeEvent('Python Programming', 30), | ||
| makeEvent('Python Programming', 30), | ||
| makeEvent('Python Programming', 30), | ||
| ]; | ||
| const result = findCommonPhrases(events, []); | ||
| // All events have same title → bigram fully dominates | ||
| expect(result.get('Python Programming')?.duration).toBe(120); | ||
| expect(result.get('Python')?.duration).toBe(0); | ||
| expect(result.get('Programming')?.duration).toBe(0); | ||
| }); | ||
|
Comment on lines
+85
to
+97
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The existing tests only verify single-bigram promotion. The mutation bug described above is exercisable with a three-word phrase where every event has the same title (e.g. test('does not produce negative duration when middle word appears in two bigrams', () => {
// "Studio" appears in both "Visual Studio" and "Studio Code"
const events = [makeEvent('Visual Studio Code', 100), makeEvent('Visual Studio Code', 100)];
const result = findCommonPhrases(events, []);
for (const [, entry] of result) {
expect(entry.duration).toBeGreaterThanOrEqual(0);
}
}); |
||
|
|
||
| test('returns Map with word entries containing events list', () => { | ||
| const events = [makeEvent('Hello World', 100)]; | ||
| const result = findCommonPhrases(events, []); | ||
| const entry = result.get('Hello'); | ||
| expect(entry).toBeDefined(); | ||
| expect(entry?.word).toBe('Hello'); | ||
| expect(entry?.events).toHaveLength(1); | ||
| expect(entry?.events[0]).toBe(events[0]); | ||
| }); | ||
|
|
||
| test('trigram: does not double-promote when middle word duration is consumed', () => { | ||
| // "Alpha Beta" dominates (110s); "Alpha Beta Gamma" appears rarely (10s). | ||
| // Word totals: Alpha=110, Beta=110, Gamma=10 | ||
| // Bigram totals: "Alpha Beta"=110, "Beta Gamma"=10 | ||
| // "Alpha Beta" correctly promotes (110/110 > 0.5 for both words). | ||
| // After promotion, Beta.duration drops to 0. Without snapshotting original | ||
| // durations, "Beta Gamma" sees 10/0 = Infinity and incorrectly promotes too. | ||
| // With the fix, the check uses the original Beta=110: 10/110 ≈ 0.09 < 0.5 → no promotion. | ||
| const events = [makeEvent('Alpha Beta', 100), makeEvent('Alpha Beta Gamma', 10)]; | ||
| const result = findCommonPhrases(events, []); | ||
| expect(result.get('Alpha Beta')).toBeDefined(); // correctly promoted | ||
| expect(result.get('Beta Gamma')).toBeUndefined(); // must NOT be promoted | ||
| const betaEntry = result.get('Beta'); | ||
| expect(betaEntry?.duration).toBeGreaterThanOrEqual(0); // no negative durations | ||
| }); | ||
| }); | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After splitting
"foo-bar"the array is["foo", "bar"]. The loop pairsparts[i]withparts[i+1], so "foo" and "bar" are treated as an adjacent bigram even though a hyphen separated them. Titles like"JIRA-1234 Fix bug"could unexpectedly promote"Fix bug"as a bigram even when"JIRA"and"1234"are filtered. This is a design decision worth calling out explicitly in the JSDoc — if the intent is "adjacent in the cleaned word stream" then the current approach is correct; if the intent is "strictly space-adjacent in the original title" then the regex should only split on whitespace for bigram indexing.No code change required if this is the intended semantics; just a documentation note.