From 90cbdfd77d72e34c6cecbbd4752c020f1bb0f420 Mon Sep 17 00:00:00 2001 From: Peter Hedenskog Date: Sun, 31 May 2026 16:19:24 +0200 Subject: [PATCH] Fix Wikipedia first-party regex and missing cookies on newline-joined Set-Cookie The auto-derived first-party regex for Wikipedia pages contained a || where a single | was meant. The empty alternative between the two pipes matched every string, so any third-party request (analytics, ads, CDNs) on a Wikipedia page was wrongly classified as first-party and the thirdParty bucket stayed empty. One-character fix in the regex, plus a small regression test built from an inline HAR. While in the area: getThirdPartyCookieNames was recently taught to stop the Domain= capture at \n because some HARs concatenate multiple Set-Cookie response headers into one value joined by newlines. Its sister getCookieNames still split only on the first = and therefore silently dropped every cookie after the first in such a joined block, under-reporting page.cookies and page.cookieNames. Apply the same newline split there so both functions agree on the format. Co-authored-by: Claude <noreply@anthropic.com> --- lib/headers.js | 9 ++++--- lib/index.js | 2 +- test/firstPartyTest.js | 56 ++++++++++++++++++++++++++++++++++++++++++ test/headersTest.js | 11 +++++++++ 4 files changed, 74 insertions(+), 4 deletions(-) create mode 100644 test/firstPartyTest.js diff --git a/lib/headers.js b/lib/headers.js index 8fa8d98..f02ca74 100644 --- a/lib/headers.js +++ b/lib/headers.js @@ -100,9 +100,12 @@ module.exports = { }, getCookieNames: headers => { const cookies = headers.filter(h => h.name.match(/^set-cookie$/i)); - const cookieNames = cookies.map(h => { - return h.value.split('=')[0]; - }); + // Some HARs concatenate multiple Set-Cookie response headers into + // one value joined by '\n'. Split on the newline so every cookie's + // name is captured, not just the first one. 
+ const cookieNames = cookies.flatMap(h => + h.value.split('\n').map(c => c.split('=')[0]) + ); return cookieNames; }, getThirdPartyCookieNames: (headers, regex) => { diff --git a/lib/index.js b/lib/index.js index 9f41fbf..5b631bd 100644 --- a/lib/index.js +++ b/lib/index.js @@ -108,7 +108,7 @@ module.exports = { const mainDomain = util.getMainDomain(baseDomain); // Hack for ... Wikipedia! if (mainDomain === 'wikipedia') { - firstParty = '(.*wikipedia.*||.*wikimedia.*)'; + firstParty = '(.*wikipedia.*|.*wikimedia.*)'; } else { firstParty = '.*' + mainDomain + '.*'; } diff --git a/test/firstPartyTest.js b/test/firstPartyTest.js new file mode 100644 index 0000000..e1e0036 --- /dev/null +++ b/test/firstPartyTest.js @@ -0,0 +1,56 @@ +'use strict'; + +const test = require('ava'); +const pagexray = require('../lib/index'); + +function entry(url, mimeType) { + return { + pageref: 'page_0', + startedDateTime: '2024-01-01T00:00:00.000Z', + time: 10, + request: { method: 'GET', url, headers: [] }, + response: { + status: 200, + httpVersion: 'http/1.1', + headersSize: 0, + bodySize: 0, + content: { mimeType, size: 0 }, + headers: [], + redirectURL: '' + }, + timings: {} + }; +} + +test('First party (wikipedia): non-wikipedia hosts must be classified as third party', t => { + // The auto-derived firstParty regex for wikipedia used to be + // '(.*wikipedia.*||.*wikimedia.*)' — the empty alternative made it + // match every URL, so a Google Analytics request on a Wikipedia page + // was wrongly counted as first-party. 
+ const har = { + log: { + creator: { name: 'Browsertime' }, + browser: { name: 'chrome', version: '1' }, + pages: [ + { + id: 'page_0', + startedDateTime: '2024-01-01T00:00:00.000Z', + title: 'wikipedia', + pageTimings: { onLoad: 1000, onContentLoad: 800 } + } + ], + entries: [ + entry('https://en.wikipedia.org/', 'text/html'), + entry('https://upload.wikimedia.org/logo.png', 'image/png'), + entry( + 'https://www.google-analytics.com/analytics.js', + 'application/javascript' + ) + ] + } + }; + + const page = pagexray.convert(har)[0]; + t.is(page.firstParty.requests, 2, 'wikipedia + wikimedia are first-party'); + t.is(page.thirdParty.requests, 1, 'google-analytics is third-party'); +}); diff --git a/test/headersTest.js b/test/headersTest.js index 00b5b6d..4cd3929 100644 --- a/test/headersTest.js +++ b/test/headersTest.js @@ -125,6 +125,17 @@ test('getThirdPartyCookieNames: skip cookies whose domain matches first-party', ); }); +test('getCookieNames: capture every cookie when Set-Cookie values are newline-joined', t => { + // Mirrors the getThirdPartyCookieNames newline fix: some HARs + // concatenate multiple Set-Cookie response headers into one value + // joined by '\n'. Splitting on '=' alone would only return the first + // cookie's name and under-count page.cookies. + const harHeaders = [ + {name: 'Set-Cookie', value: 'UID=abc; Domain=.example.com\nUIDR=1453756870'} + ]; + t.deepEqual(headers.getCookieNames(harHeaders), ['UID', 'UIDR']); +}); + test('getThirdPartyCookieNames: do not let a newline-joined cookie leak into the domain', t => { // Some HARs concatenate two Set-Cookie headers into a single value // joined by '\n'. The captured Domain= attribute must stop at the