From 9ba990115c35f435f5bd6dd3eed556ad7fce8a97 Mon Sep 17 00:00:00 2001
From: Peter Hedenskog <phedenskog@wikimedia.org>
Date: Thu, 14 May 2026 19:35:03 +0200
Subject: [PATCH] Fix missingCompression, cookie Domain= casing, multi-TLD
 domain split, favicon default
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  A handful of correctness fixes that surfaced in follow-up review.

  The most impactful: missingCompression has been over-counting since 4.0.0. headers.flatten wraps every value in an array, but the encoding
  check still compared the array to strings — so every large text asset, including fully gzipped ones, was reported as missing compression.
  The fix unwraps the array, accepts compound codings like "br, gzip", and recognises zstd. On the aftonbladet test HAR the count drops from
  26 to 2.

  Third-party cookies set with lowercase "domain=" used to slip past the first/third-party check because the parser split on the literal
  string "Domain=". Cookie attribute names are case-insensitive (RFC 6265 §5.2), so domain= is now matched with a case-insensitive regex.

  getMainDomain only special-cased ".co.uk", so country sites like bbc.com.br collapsed to "com" and the auto-generated firstParty regex
  pulled in unrelated domains. Added the common two-label public suffixes (.co.jp, .com.br, .com.au, ...). A full Public Suffix List would be
  more correct but invasive.

  Two smaller consistency tweaks: favicon is now part of the default contentTypes shape so every page has the same fields (matching the README
   example), and xml joins the list of types flagged for missingCompression. Also dedupes the renderBlocking init literal that was sitting in
  two files.

  Co-authored-by: Claude Opus 4.7 (1M context) noreply@anthropic.com
---
 CHANGELOG.md             |  7 +++++++
 lib/collect.js           | 33 +++++++++++++++++++++++++++++----
 lib/headers.js           |  9 ++++++---
 lib/index.js             |  6 +-----
 lib/sitespeed.js         |  8 +++-----
 lib/util.js              | 24 ++++++++++++++++++++----
 test/contentTypesTest.js | 24 ++++++++++++++++++++++++
 test/headersTest.js      | 21 +++++++++++++++++++++
 test/utilTest.js         | 13 +++++++++++++
 9 files changed, 124 insertions(+), 21 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fcbaa6d..7d079e3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,13 @@
 * Parse `Cache-Control` directives case-insensitively (RFC 7234 §5.2). `Max-Age=42` is now read as 42 instead of 0, and `No-Cache` / `No-Store` are honoured.
 * Replace the deprecated `url.parse` (Node DEP0169) with the WHATWG `URL` parser in `getHostname`.
 * Stop throwing in `getDocumentRequests` when a page has no matching entries — return an empty array instead.
+* `missingCompression` was flagging every large text asset, including gzipped ones, because `headers.flatten` wraps values in arrays and the encoding check compared the array to a string. Unwrap before comparing, accept compound codings (`br, gzip`), and recognise `zstd`.
+* Match the `Domain=` cookie attribute case-insensitively (RFC 6265 §5.2) so third-party cookies set with lowercase `domain=` are detected.
+* Recognise common two-label public suffixes (`.co.jp`, `.com.br`, `.com.au`, ...) in `getMainDomain` so the auto-generated first-party regex isn't wrong for non-`.co.uk` country domains.
+
+### Changed
+* `defaultContentTypes` now includes `favicon` so every page has a consistent shape (matches the example in the README).
+* `xml` is now considered for `missingCompression` reporting.
 
 ## 4.5.0 2026-05-12
 ### Added
diff --git a/lib/collect.js b/lib/collect.js
index e025e41..5f0ad36 100644
--- a/lib/collect.js
+++ b/lib/collect.js
@@ -34,6 +34,14 @@ function newContentDataWithTimings() {
   return content;
 }
 
+function newRenderBlocking() {
+  return {
+    blocking: 0,
+    potentiallyBlocking: 0,
+    in_body_parser_blocking: 0
+  };
+}
+
 /*
  * Collect information about a response (asset).
  * @private
@@ -146,7 +154,15 @@ module.exports = {
    * Check if the asset should be compressed but isn't.
    */
   missingCompression: (asset, myPage) => {
-    const encoding = asset.headers.response['content-encoding'];
+    // headers.flatten wraps every header value in an array, so the
+    // lookup returns e.g. ["gzip"] — comparing that to the string
+    // "gzip" always failed and every text asset > 2 KB was flagged
+    // as uncompressed. Unwrap before checking, and accept compound
+    // codings like "br, gzip".
+    const raw = asset.headers.response['content-encoding'] || [];
+    const encodings = (Array.isArray(raw) ? raw : [raw])
+      .flatMap(v => String(v).split(','))
+      .map(v => v.trim().toLowerCase());
 
     const couldBeCompressed = [
       'html',
@@ -154,9 +170,12 @@ module.exports = {
       'json',
       'javascript',
       'css',
-      'svg'
+      'svg',
+      'xml'
     ].includes(asset.type);
-    const isCompressed = ['gzip', 'br', 'deflate'].includes(encoding);
+    const isCompressed = encodings.some(e =>
+      ['gzip', 'br', 'deflate', 'zstd'].includes(e)
+    );
     const isLargeFile = asset.contentSize > 2000;
 
     if (couldBeCompressed && isLargeFile && !isCompressed) {
@@ -172,9 +191,15 @@ module.exports = {
       css: newContentData(),
       javascript: newContentData(),
       image: newContentData(),
-      font: newContentData()
+      font: newContentData(),
+      favicon: newContentData()
     };
   },
+  /**
+   * The empty renderBlocking object shape used before any per-asset
+   * or page-level render-blocking data has been attached.
+   */
+  defaultRenderBlocking: newRenderBlocking,
   /*
    * Collect content type from an asset.
    * @param {Object} asset The asset.
diff --git a/lib/headers.js b/lib/headers.js
index e130863..60df6f4 100644
--- a/lib/headers.js
+++ b/lib/headers.js
@@ -110,9 +110,12 @@ module.exports = {
     const cookieNames = cookies
       .map(h => {
         const name = h.value.split('=')[0];
-        const domain = h.value.split('Domain=')[1];
-        if (domain) {
-          const d = domain.split(';')[0];
+        // Cookie attribute names are case-insensitive (RFC 6265 §5.2),
+        // and the Domain= attribute always follows an attribute
+        // separator (`;`), not the cookie name/value `=`.
+        const domainMatch = h.value.match(/;\s*domain=([^;]+)/i);
+        if (domainMatch) {
+          const d = domainMatch[1].trim();
           if (!d.match(regex)) {
             return { name, domain: d };
           }
diff --git a/lib/index.js b/lib/index.js
index ad3ef23..9f41fbf 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -191,11 +191,7 @@ module.exports = {
 
       if (asset.renderBlocking) {
         if (!currentPage.renderBlocking) {
-          currentPage.renderBlocking = {
-            blocking: 0,
-            potentiallyBlocking: 0,
-            in_body_parser_blocking: 0
-          };
+          currentPage.renderBlocking = collect.defaultRenderBlocking();
         }
         if (asset.renderBlocking === 'potentially_blocking') {
           currentPage.renderBlocking.potentiallyBlocking += 1;
diff --git a/lib/sitespeed.js b/lib/sitespeed.js
index e13b9bc..e1556e6 100644
--- a/lib/sitespeed.js
+++ b/lib/sitespeed.js
@@ -1,5 +1,7 @@
 'use strict';
 
+const collect = require('./collect');
+
 module.exports = {
   addMetrics: (har, pages) => {
     // For each page in the HAR file, add the extra metrics in
@@ -36,11 +38,7 @@ module.exports = {
           harPage._renderBlocking.recalculateStyle
         ) {
           if (!pageXrayPage.renderBlocking) {
-            pageXrayPage.renderBlocking = {
-              blocking: 0,
-              potentiallyBlocking: 0,
-              in_body_parser_blocking: 0
-            };
+            pageXrayPage.renderBlocking = collect.defaultRenderBlocking();
           }
           pageXrayPage.renderBlocking.recalculateStyle =
             harPage._renderBlocking.recalculateStyle;
diff --git a/lib/util.js b/lib/util.js
index 1ba1e80..d7e78eb 100644
--- a/lib/util.js
+++ b/lib/util.js
@@ -31,6 +31,25 @@ const FILE_ENDINGS_MATCHERS = [
   [/.*/, 'other'] // Always
 ];
 
+// Common two-label public suffixes. Not exhaustive — a full fix would
+// consult the Public Suffix List, but this catches the cases that
+// actually break the auto-generated firstParty regex in the wild.
+const MULTI_PART_TLDS = new Set([
+  'co.uk',
+  'co.jp',
+  'co.kr',
+  'co.nz',
+  'co.za',
+  'co.in',
+  'com.au',
+  'com.br',
+  'com.mx',
+  'com.cn',
+  'com.tw',
+  'com.hk',
+  'com.sg'
+]);
+
 /**
  * Utilities for getting content from HAR:s.
  * @module util
@@ -100,10 +119,7 @@ module.exports = {
       const parts = hostname.split('.').reverse();
       if (parts != null && parts.length > 1) {
         domain = parts[1] + '.' + parts[0];
-        if (
-          hostname.toLowerCase().indexOf('.co.uk') != -1 &&
-          parts.length > 2
-        ) {
+        if (MULTI_PART_TLDS.has(domain.toLowerCase()) && parts.length > 2) {
           domain = parts[2] + '.' + domain;
         }
       }
diff --git a/test/contentTypesTest.js b/test/contentTypesTest.js
index bfce5a5..7f21883 100644
--- a/test/contentTypesTest.js
+++ b/test/contentTypesTest.js
@@ -41,4 +41,28 @@ describe('Check content types', function() {
         assert.strictEqual(types.font.requests, 8, 'We couldnt get the right number of fonts');
       });
   });
+
+  it('should always include favicon in the default content type shape', function() {
+    // Pages without a favicon used to omit the field entirely, which
+    // forced consumers to special-case it. The default shape now
+    // matches the README example.
+    return har.pagesFromTestHar('contentTypes/linkedin.har')
+      .then((result) => {
+        const types = result[0].contentTypes;
+        assert.ok('favicon' in types, 'favicon default missing');
+        assert.strictEqual(types.favicon.requests, 0);
+      });
+  });
+
+  it('should not over-report missingCompression for gzipped assets', function() {
+    // Regression: headers.flatten wraps values in arrays, so the
+    // string comparison in missingCompression never matched and every
+    // large text asset was flagged. The aftonbladet HAR is gzipped
+    // throughout — only a couple of assets should remain.
+    return har.pagesFromTestHar('contentTypes/aftonbladet.se.har')
+      .then((result) => {
+        assert.ok(result[0].missingCompression < 5,
+          'missingCompression should be near 0 on a gzipped page, got ' + result[0].missingCompression);
+      });
+  });
 });
diff --git a/test/headersTest.js b/test/headersTest.js
index a9e3f27..783115a 100644
--- a/test/headersTest.js
+++ b/test/headersTest.js
@@ -135,4 +135,25 @@ describe('headers', function() {
       assert.equal(expires, 0);
     });
   });
+
+  describe('#getThirdPartyCookieNames', () => {
+    it('should match the Domain= attribute case-insensitively', () => {
+      // RFC 6265 §5.2: attribute names are case-insensitive. The previous
+      // implementation split on the literal string "Domain=" and missed
+      // cookies that used lowercase `domain=`.
+      const harHeaders = [
+        {name: 'Set-Cookie', value: 'sid=abc; domain=.tracker.io; Path=/'}
+      ];
+      const result = headers.getThirdPartyCookieNames(harHeaders, '.*sitespeed.*');
+      assert.deepEqual(result, [{name: 'sid', domain: '.tracker.io'}]);
+    });
+
+    it('should not flag a cookie when the domain matches the first-party regex', () => {
+      const harHeaders = [
+        {name: 'Set-Cookie', value: 'sid=abc; Domain=.sitespeed.io; Path=/'}
+      ];
+      const result = headers.getThirdPartyCookieNames(harHeaders, '.*sitespeed.*');
+      assert.deepEqual(result, []);
+    });
+  });
 });
diff --git a/test/utilTest.js b/test/utilTest.js
index 426df42..5af01cf 100644
--- a/test/utilTest.js
+++ b/test/utilTest.js
@@ -101,4 +101,17 @@ describe('util', function() {
       assert.equal(util.getConnectionType('h3-29'), 'h3');
     });
   });
+
+  describe('#getMainDomain', () => {
+    it('should peel off two-label public suffixes beyond .co.uk', () => {
+      // Previously only `.co.uk` was recognised, so `bbc.com.br` collapsed
+      // to `com` and the auto-firstParty regex pulled in unrelated sites.
+      assert.equal(util.getMainDomain('www.bbc.co.uk'), 'bbc');
+      assert.equal(util.getMainDomain('www.bbc.com.br'), 'bbc');
+      assert.equal(util.getMainDomain('www.bbc.co.jp'), 'bbc');
+      assert.equal(util.getMainDomain('www.bbc.com.au'), 'bbc');
+      assert.equal(util.getMainDomain('www.bbc.com'), 'bbc');
+      assert.equal(util.getMainDomain('sitespeed.io'), 'sitespeed');
+    });
+  });
 });