From 52d928d6ec98eba70905c0c0d0e54fd49c87c929 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Tue, 26 May 2026 15:37:41 +0200 Subject: [PATCH 01/19] Add Accept-Language assertion --- src/archivist/fetcher/fullDomFetcher.test.js | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/archivist/fetcher/fullDomFetcher.test.js b/src/archivist/fetcher/fullDomFetcher.test.js index 3737eb4fa..8221f58d6 100644 --- a/src/archivist/fetcher/fullDomFetcher.test.js +++ b/src/archivist/fetcher/fullDomFetcher.test.js @@ -33,6 +33,11 @@ describe('Full DOM Fetcher', function () { if (request.url === '/delayed-content') { response.writeHead(200, { 'Content-Type': 'text/html' }).write(delayedContentHTML); } + if (request.url === '/lang-header') { + const acceptLanguage = request.headers['accept-language'] || ''; + + response.writeHead(200, { 'Content-Type': 'text/html' }).write(``); + } if (request.url === '/terms.pdf') { expectedPDFContent = fs.readFileSync(path.resolve(__dirname, '../../../test/fixtures/terms.pdf')); response.writeHead(200, { 'Content-Type': 'application/pdf' }).write(expectedPDFContent); @@ -81,6 +86,12 @@ describe('Full DOM Fetcher', function () { await expect(fetch(url, ['.non-existent'], { ...config, navigationTimeout: timeout })).to.be.rejectedWith(`Timed out after ${timeout / 1000} seconds when trying to fetch '${url}'`); }); + it('sends the configured language as Accept-Language header', async () => { + const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-header`, [], config); + + expect(result.content).to.match(/data-accept-language="en"/); + }); + context('when a DOM element exists but its content is loaded asynchronously', () => { it('waits for the element content to be fully loaded', async () => { const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/delayed-content`, ['.content'], config); From 5108ccdf20a11a0e749fcd5b0fd5961aff831c7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Tue, 26 May 2026 15:40:00 +0200 Subject: [PATCH 02/19] Pass fetcher language to headless browser launch --- src/archivist/fetcher/fullDomFetcher.js | 9 +++++---- src/archivist/fetcher/fullDomFetcher.test.js | 8 ++++---- src/archivist/index.js | 3 ++- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index bef0e6afc..34cadeef0 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -6,8 +6,6 @@ import { resolveProxyConfiguration, extractProxyCredentials } from './proxyUtils let browser; export default async function fetch(url, cssSelectors, config) { - puppeteer.use(stealthPlugin({ locale: config.language })); - if (!browser) { throw new Error('The headless browser should be controlled manually with "launchHeadlessBrowser" and "stopHeadlessBrowser".'); } @@ -97,14 +95,17 @@ export default async function fetch(url, cssSelectors, config) { /** * Launches a headless browser instance using Puppeteer if one is not already running. Returns the existing browser instance if one is already running, otherwise creates and returns a new instance. * @function launchHeadlessBrowser - * @returns {Promise} The Puppeteer browser instance. + * @param {string} language Accept-Language header value applied to the browser context + * @returns {Promise} The Puppeteer browser instance. * @async */ -export async function launchHeadlessBrowser() { +export async function launchHeadlessBrowser(language) { if (browser) { return browser; } + puppeteer.use(stealthPlugin({ locale: language })); + const options = { args: [], headless: !process.env.OTA_ENGINE_FETCHER_NO_HEADLESS, diff --git a/src/archivist/fetcher/fullDomFetcher.test.js b/src/archivist/fetcher/fullDomFetcher.test.js index 8221f58d6..4be516597 100644 --- a/src/archivist/fetcher/fullDomFetcher.test.js +++ b/src/archivist/fetcher/fullDomFetcher.test.js @@ -24,7 +24,7 @@ describe('Full DOM Fetcher', function () { let expectedPDFContent; before(async () => { - await launchHeadlessBrowser(); + await launchHeadlessBrowser('en'); temporaryServer = http.createServer((request, response) => { if (request.url === '/dynamic') { @@ -59,12 +59,12 @@ describe('Full DOM Fetcher', function () { await stopHeadlessBrowser(); await expect(fetch('http://example.com', [], { navigationTimeout: 5000, waitForElementsTimeout: 5000, language: 'en' })) .to.be.rejectedWith('The headless browser should be controlled manually'); - await launchHeadlessBrowser(); + await launchHeadlessBrowser('en'); }); it('reuses existing browser instance', async () => { - const browser1 = await launchHeadlessBrowser(); - const browser2 = await launchHeadlessBrowser(); + const browser1 = await launchHeadlessBrowser('en'); + const browser2 = await launchHeadlessBrowser('en'); expect(browser1).to.equal(browser2); }); diff --git a/src/archivist/index.js b/src/archivist/index.js index 4a81ed756..9e79ae700 100644 --- a/src/archivist/index.js +++ b/src/archivist/index.js @@ -45,6 +45,7 @@ export default class Archivist extends events.EventEmitter { constructor({ recorderConfig, fetcherConfig }) { super(); + this.fetcherConfig = fetcherConfig; this.recorder = new Recorder(recorderConfig); this.fetch = params => fetch({ ...params, config: fetcherConfig }); this.extract = extract; @@ -151,7 +152,7 @@ export default class Archivist extends events.EventEmitter { this.emit('trackingStarted', servicesIds.length, numberOfTerms, technicalUpgradeOnly); - await Promise.all([ launchHeadlessBrowser(), this.recorder.initialize() ]); + await Promise.all([ launchHeadlessBrowser(this.fetcherConfig.language), this.recorder.initialize() ]); this.trackingQueue.concurrency = concurrency; From 58596d0852d8acdc4815cc5d25c15863a551d779 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Tue, 26 May 2026 15:45:46 +0200 Subject: [PATCH 03/19] Apply fetcher language to Puppeteer JS context --- src/archivist/fetcher/fullDomFetcher.js | 18 ++++++- src/archivist/fetcher/fullDomFetcher.test.js | 55 ++++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index 34cadeef0..6bb198e2b 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -1,10 +1,19 @@ import puppeteer from 'puppeteer-extra'; import stealthPlugin from 'puppeteer-extra-plugin-stealth'; +import navigatorLanguages from 'puppeteer-extra-plugin-stealth/evasions/navigator.languages/index.js'; +import userAgentOverride from 'puppeteer-extra-plugin-stealth/evasions/user-agent-override/index.js'; import { resolveProxyConfiguration, extractProxyCredentials } from './proxyUtils.js'; let browser; +function parseLanguage(value) { + return { + locale: value, + languages: value.split(',').map(part => part.split(';')[0].trim()), + }; +} + export default async function fetch(url, cssSelectors, config) { if (!browser) { throw new Error('The headless browser should be controlled manually with "launchHeadlessBrowser" and "stopHeadlessBrowser".'); @@ -104,7 +113,14 @@ export async function launchHeadlessBrowser(language) { return browser; } - puppeteer.use(stealthPlugin({ locale: language })); + const { locale, languages } = parseLanguage(language); + const stealth = stealthPlugin(); + + stealth.enabledEvasions.delete('user-agent-override'); + stealth.enabledEvasions.delete('navigator.languages'); + puppeteer.use(stealth); + puppeteer.use(userAgentOverride({ locale })); + puppeteer.use(navigatorLanguages({ languages })); const options = { args: [], diff --git a/src/archivist/fetcher/fullDomFetcher.test.js b/src/archivist/fetcher/fullDomFetcher.test.js index 4be516597..9c791828f 100644 --- a/src/archivist/fetcher/fullDomFetcher.test.js +++ b/src/archivist/fetcher/fullDomFetcher.test.js @@ -16,6 +16,8 @@ use(chaiAsPromised); const dynamicHTML = 'Dynamic Page'; const delayedContentHTML = 'Delayed Content
'; +const langEchoHTML = ''; +const langDetectHTML = '
'; describe('Full DOM Fetcher', function () { this.timeout(60000); @@ -38,6 +40,12 @@ describe('Full DOM Fetcher', function () { response.writeHead(200, { 'Content-Type': 'text/html' }).write(``); } + if (request.url === '/lang-echo') { + response.writeHead(200, { 'Content-Type': 'text/html' }).write(langEchoHTML); + } + if (request.url === '/lang-detect') { + response.writeHead(200, { 'Content-Type': 'text/html' }).write(langDetectHTML); + } if (request.url === '/terms.pdf') { expectedPDFContent = fs.readFileSync(path.resolve(__dirname, '../../../test/fixtures/terms.pdf')); response.writeHead(200, { 'Content-Type': 'application/pdf' }).write(expectedPDFContent); @@ -129,4 +137,51 @@ describe('Full DOM Fetcher', function () { }); }); }); + + describe('Language configuration', () => { + context('with a regional locale such as fr-FR,fr', () => { + const language = 'fr-FR,fr'; + const config = { navigationTimeout: 5000, waitForElementsTimeout: 5000, language }; + + before(async () => { + await stopHeadlessBrowser(); + await launchHeadlessBrowser(language); + }); + + it('exposes the primary tag through navigator.language', async () => { + const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-echo`, [], config); + + expect(result.content).to.match(/data-language="fr-FR"/); + }); + + it('exposes every tag through navigator.languages', async () => { + const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-echo`, [], config); + + expect(result.content).to.match(/data-languages="fr-FR,fr"/); + }); + + it('drives client-side language detection on the fetched document', async () => { + const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-detect`, ['.lang-detected'], config); + + expect(result.content).to.match(/
Conditions<\/div>/); + expect(result.content).not.to.match(/>Terms { + const language = 'en-IE,en-GB;q=0.9,en;q=0.8'; + const config = { navigationTimeout: 5000, waitForElementsTimeout: 5000, language }; + + before(async () => { + await stopHeadlessBrowser(); + await launchHeadlessBrowser(language); + }); + + it('strips quality factors from navigator.languages', async () => { + const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-echo`, [], config); + + expect(result.content).to.match(/data-languages="en-IE,en-GB,en"/); + }); + }); + }); }); From 892ffd1a8d29bfe157539d781e9bca15955ce593 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Tue, 26 May 2026 15:50:50 +0200 Subject: [PATCH 04/19] Remove redundant Accept-Language override --- src/archivist/fetcher/fullDomFetcher.js | 18 +++++------------- src/archivist/fetcher/fullDomFetcher.test.js | 12 ++++++++++++ 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index 6bb198e2b..7b0bd3298 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -8,10 +8,9 @@ import { resolveProxyConfiguration, extractProxyCredentials } from './proxyUtils let browser; function parseLanguage(value) { - return { - locale: value, - languages: value.split(',').map(part => part.split(';')[0].trim()), - }; + const languages = value.split(',').map(part => part.split(';')[0].trim()); + + return { locale: languages.join(','), languages }; } export default async function fetch(url, cssSelectors, config) { @@ -28,7 +27,7 @@ export default async function fetch(url, cssSelectors, config) { page = await context.newPage(); client = await page.createCDPSession(); - await configurePage(page, client, config); + await configurePage(page, config); const selectors = [].concat(cssSelectors).filter(Boolean); @@ -173,16 +172,9 @@ function isValidHttpStatus(status) { return (status >= 200 && status < 300) || status === 304; } -async function configurePage(page, client, config) { +async function configurePage(page, config) { await page.setViewport({ width: 1920, height: 1080 }); // Realistic viewport to avoid detection based on default Puppeteer dimensions (800x600) await page.setDefaultNavigationTimeout(config.navigationTimeout); - await page.setExtraHTTPHeaders({ 'Accept-Language': config.language }); - - // Use CDP to ensure browser language is set correctly (see https://zirkelc.dev/posts/puppeteer-language-experiment) - await client.send('Network.setUserAgentOverride', { - userAgent: await browser.userAgent(), - acceptLanguage: config.language, - }); if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) { await page.authenticate(browser.proxyCredentials); diff --git a/src/archivist/fetcher/fullDomFetcher.test.js b/src/archivist/fetcher/fullDomFetcher.test.js index 9c791828f..b899434c9 100644 --- a/src/archivist/fetcher/fullDomFetcher.test.js +++ b/src/archivist/fetcher/fullDomFetcher.test.js @@ -166,6 +166,12 @@ describe('Full DOM Fetcher', function () { expect(result.content).to.match(/
Conditions<\/div>/); expect(result.content).not.to.match(/>Terms { + const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-header`, [], config); + + expect(result.content).to.match(/data-accept-language="fr-FR,fr[^"]*"/); + }); }); context('with quality factors such as en-IE,en-GB;q=0.9,en;q=0.8', () => { @@ -182,6 +188,12 @@ describe('Full DOM Fetcher', function () { expect(result.content).to.match(/data-languages="en-IE,en-GB,en"/); }); + + it('preserves the configured quality factors in the Accept-Language header', async () => { + const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-header`, [], config); + + expect(result.content).to.match(/data-accept-language="en-IE,en-GB;q=0\.9,en;q=0\.8"/); + }); }); }); }); From 30899fd3fa92132957beac8a50ebd0d41836aa2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Tue, 26 May 2026 15:52:31 +0200 Subject: [PATCH 05/19] Update JSDoc --- src/archivist/fetcher/index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/archivist/fetcher/index.js b/src/archivist/fetcher/index.js index 81cbc1515..2c36dca7c 100644 --- a/src/archivist/fetcher/index.js +++ b/src/archivist/fetcher/index.js @@ -21,7 +21,7 @@ export const FETCHER_TYPES = { * @param {string|Array} [params.cssSelectors] List of CSS selectors to await when loading the resource in a headless browser. Can be a CSS selector or an array of CSS selectors. Only relevant when `executeClientScripts` is enabled * @param {object} [params.config] Fetcher configuration * @param {number} [params.config.navigationTimeout] Maximum time (in milliseconds) to wait before considering the fetch failed - * @param {string} [params.config.language] Language (in [ISO 639-1 format](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)) to be passed in request headers + * @param {string} [params.config.language] Accept-Language value applied to the browser context. Accepts any [BCP 47](https://www.rfc-editor.org/rfc/rfc5646) language tag or comma-separated priority list (for example `fr`, `en-US`, or `en-IE,en-GB;q=0.9,en;q=0.8`). The header is sent on every fetch and the tag list also drives `navigator.language` and `navigator.languages` in the headless browser * @param {number} [params.config.waitForElementsTimeout] Maximum time (in milliseconds) to wait for selectors to exist on page before considering the fetch failed. Only relevant when `executeClientScripts` is enabled * @returns {Promise<{ mimeType: string, content: string | Buffer, fetcher: string }>} Promise containing the fetched resource's MIME type, content, and fetcher type * @throws {FetchDocumentError} When the fetch operation fails From 5af7077517d2277fb07e737cbd0fb15ba10365a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Tue, 26 May 2026 16:04:09 +0200 Subject: [PATCH 06/19] Pass language explicitly in fetcher index test hook --- src/archivist/fetcher/index.test.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/archivist/fetcher/index.test.js b/src/archivist/fetcher/index.test.js index 004df98ea..22145ddd8 100644 --- a/src/archivist/fetcher/index.test.js +++ b/src/archivist/fetcher/index.test.js @@ -21,7 +21,7 @@ const termsWithOtherCharsetHTML = ' Development of this release was supported by [User Rights](https://www.user-rights.org). From d4ba1ebeb25d52be09115555071dc95034bc0d1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Tue, 26 May 2026 17:04:31 +0200 Subject: [PATCH 08/19] Add stealth evasion regression tests --- src/archivist/fetcher/fullDomFetcher.test.js | 34 ++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/archivist/fetcher/fullDomFetcher.test.js b/src/archivist/fetcher/fullDomFetcher.test.js index b899434c9..361e59ea5 100644 --- a/src/archivist/fetcher/fullDomFetcher.test.js +++ b/src/archivist/fetcher/fullDomFetcher.test.js @@ -18,6 +18,7 @@ const dynamicHTML = 'Dynamic Page { + // These assertions guard against a class of regressions where the + // puppeteer-extra-plugin-stealth fails to be registered before + // puppeteer.launch(): if it is registered later, puppeteer-extra never + // binds its onPageCreated hooks and all evasions stay inactive, + // leaving navigator.webdriver === true and HeadlessChrome in the UA. + const config = { navigationTimeout: 5000, waitForElementsTimeout: 5000, language: 'en' }; + + it('hides navigator.webdriver', async () => { + const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/stealth-probe`, [], config); + + expect(result.content).to.match(/data-webdriver="false"/); + }); + + it('removes HeadlessChrome from the user agent', async () => { + const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/stealth-probe`, [], config); + + expect(result.content).not.to.match(/HeadlessChrome/); + }); + + it('exposes a non-empty navigator.plugins list', async () => { + const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/stealth-probe`, [], config); + + const match = result.content.match(/data-plugin-count="(\d+)"/); + + expect(match).to.not.be.null; + expect(Number(match[1])).to.be.greaterThan(0); + }); + }); }); From 297ca4b09f57c6ef24e23249f09b1b0a214dfa89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 27 May 2026 10:05:32 +0200 Subject: [PATCH 09/19] Pass language explicitly in declarations validation hook --- scripts/declarations/validate/index.mocha.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/declarations/validate/index.mocha.js b/scripts/declarations/validate/index.mocha.js index 3962fb41b..3b11e75e9 100644 --- a/scripts/declarations/validate/index.mocha.js +++ b/scripts/declarations/validate/index.mocha.js @@ -51,7 +51,7 @@ export default async options => { const filePath = path.join(declarationsPath, `${serviceId}.json`); const historyFilePath = path.join(declarationsPath, `${serviceId}.history.json`); - before(launchHeadlessBrowser); + before(() => launchHeadlessBrowser(config.get('@opentermsarchive/engine.fetcher.language'))); after(stopHeadlessBrowser); From 3dc2c1039d3cf780a0065023a675a8f0af98bc4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 27 May 2026 10:35:58 +0200 Subject: [PATCH 10/19] Default headless browser language to fetcher config --- src/archivist/fetcher/fullDomFetcher.js | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index 7b0bd3298..6279e77c5 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -1,3 +1,4 @@ +import config from 'config'; import puppeteer from 'puppeteer-extra'; import stealthPlugin from 'puppeteer-extra-plugin-stealth'; import navigatorLanguages from 'puppeteer-extra-plugin-stealth/evasions/navigator.languages/index.js'; @@ -103,8 +104,8 @@ export default async function fetch(url, cssSelectors, config) { /** * Launches a headless browser instance using Puppeteer if one is not already running. Returns the existing browser instance if one is already running, otherwise creates and returns a new instance. * @function launchHeadlessBrowser - * @param {string} language Accept-Language header value applied to the browser context - * @returns {Promise} The Puppeteer browser instance. + * @param {string} [language] Accept-Language header value applied to the browser context. Defaults to `@opentermsarchive/engine.fetcher.language` + * @returns {Promise} The Puppeteer browser instance. * @async */ export async function launchHeadlessBrowser(language) { @@ -112,7 +113,8 @@ export async function launchHeadlessBrowser(language) { return browser; } - const { locale, languages } = parseLanguage(language); + const acceptLanguage = language ?? config.get('@opentermsarchive/engine.fetcher.language'); + const { locale, languages } = parseLanguage(acceptLanguage); const stealth = stealthPlugin(); stealth.enabledEvasions.delete('user-agent-override'); From 70cc3ce606b90a89f36a870bf9ed539cedabe21a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 27 May 2026 11:44:46 +0200 Subject: [PATCH 11/19] Revert "Default headless browser language to fetcher config" This reverts commit 3dc2c1039d3cf780a0065023a675a8f0af98bc4c. --- src/archivist/fetcher/fullDomFetcher.js | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index 6279e77c5..7b0bd3298 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -1,4 +1,3 @@ -import config from 'config'; import puppeteer from 'puppeteer-extra'; import stealthPlugin from 'puppeteer-extra-plugin-stealth'; import navigatorLanguages from 'puppeteer-extra-plugin-stealth/evasions/navigator.languages/index.js'; @@ -104,8 +103,8 @@ export default async function fetch(url, cssSelectors, config) { /** * Launches a headless browser instance using Puppeteer if one is not already running. Returns the existing browser instance if one is already running, otherwise creates and returns a new instance. * @function launchHeadlessBrowser - * @param {string} [language] Accept-Language header value applied to the browser context. Defaults to `@opentermsarchive/engine.fetcher.language` - * @returns {Promise} The Puppeteer browser instance. + * @param {string} language Accept-Language header value applied to the browser context + * @returns {Promise} The Puppeteer browser instance. * @async */ export async function launchHeadlessBrowser(language) { @@ -113,8 +112,7 @@ export async function launchHeadlessBrowser(language) { return browser; } - const acceptLanguage = language ?? config.get('@opentermsarchive/engine.fetcher.language'); - const { locale, languages } = parseLanguage(acceptLanguage); + const { locale, languages } = parseLanguage(language); const stealth = stealthPlugin(); stealth.enabledEvasions.delete('user-agent-override'); From 8e384b05d8cfc7a5d5ba35e19e88cacf2d08db42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 27 May 2026 11:52:12 +0200 Subject: [PATCH 12/19] Drop quality factor support from fetcher language API --- src/archivist/fetcher/fullDomFetcher.js | 6 +++++- src/archivist/fetcher/fullDomFetcher.test.js | 22 +++++++++++++------- src/archivist/fetcher/index.js | 2 +- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index 7b0bd3298..6fe94c563 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -8,7 +8,11 @@ import { resolveProxyConfiguration, extractProxyCredentials } from './proxyUtils let browser; function parseLanguage(value) { - const languages = value.split(',').map(part => part.split(';')[0].trim()); + if (value.includes(';q=')) { + throw new Error(`Quality factors are not supported in fetcher language configuration; received "${value}". Provide a comma-separated list of BCP 47 tags in priority order, for example "en-IE,en-GB,en".`); + } + + const languages = value.split(',').map(tag => tag.trim()); return { locale: languages.join(','), languages }; } diff --git a/src/archivist/fetcher/fullDomFetcher.test.js b/src/archivist/fetcher/fullDomFetcher.test.js index 361e59ea5..b9b4d3070 100644 --- a/src/archivist/fetcher/fullDomFetcher.test.js +++ b/src/archivist/fetcher/fullDomFetcher.test.js @@ -178,8 +178,8 @@ describe('Full DOM Fetcher', function () { }); }); - context('with quality factors such as en-IE,en-GB;q=0.9,en;q=0.8', () => { - const language = 'en-IE,en-GB;q=0.9,en;q=0.8'; + context('with multiple comma-separated tags such as en-IE,en-GB,en', () => { + const language = 'en-IE,en-GB,en'; const config = { navigationTimeout: 5000, waitForElementsTimeout: 5000, language }; before(async () => { @@ -187,16 +187,22 @@ describe('Full DOM Fetcher', function () { await launchHeadlessBrowser(language); }); - it('strips quality factors from navigator.languages', async () => { - const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-echo`, [], config); + it('derives Accept-Language quality factors from tag order', async () => { + const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-header`, [], config); - expect(result.content).to.match(/data-languages="en-IE,en-GB,en"/); + expect(result.content).to.match(/data-accept-language="en-IE,en-GB;q=0\.9,en;q=0\.8"/); }); + }); - it('preserves the configured quality factors in the Accept-Language header', async () => { - const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-header`, [], config); + context('with quality factors in the configured value', () => { + after(async () => { + await launchHeadlessBrowser('en'); + }); - expect(result.content).to.match(/data-accept-language="en-IE,en-GB;q=0\.9,en;q=0\.8"/); + it('rejects language values containing quality factors', async () => { + await stopHeadlessBrowser(); + await expect(launchHeadlessBrowser('en-IE,en-GB;q=0.9,en;q=0.8')) + .to.be.rejectedWith('Quality factors are not supported'); }); }); }); diff --git a/src/archivist/fetcher/index.js b/src/archivist/fetcher/index.js index 2c36dca7c..f9a32436c 100644 --- a/src/archivist/fetcher/index.js +++ b/src/archivist/fetcher/index.js @@ -21,7 +21,7 @@ export const FETCHER_TYPES = { * @param {string|Array} [params.cssSelectors] List of CSS selectors to await when loading the resource in a headless browser. Can be a CSS selector or an array of CSS selectors. Only relevant when `executeClientScripts` is enabled * @param {object} [params.config] Fetcher configuration * @param {number} [params.config.navigationTimeout] Maximum time (in milliseconds) to wait before considering the fetch failed - * @param {string} [params.config.language] Accept-Language value applied to the browser context. Accepts any [BCP 47](https://www.rfc-editor.org/rfc/rfc5646) language tag or comma-separated priority list (for example `fr`, `en-US`, or `en-IE,en-GB;q=0.9,en;q=0.8`). The header is sent on every fetch and the tag list also drives `navigator.language` and `navigator.languages` in the headless browser + * @param {string} [params.config.language] Accept-Language value applied to the browser context. Accepts a comma-separated list of [BCP 47](https://www.rfc-editor.org/rfc/rfc5646) language tags in priority order, without quality factors (for example `fr`, `en-US`, or `en-IE,en-GB,en`). The tag list drives `navigator.language` and `navigator.languages` in the headless browser, and the browser derives the `Accept-Language` HTTP header from the tag order * @param {number} [params.config.waitForElementsTimeout] Maximum time (in milliseconds) to wait for selectors to exist on page before considering the fetch failed. Only relevant when `executeClientScripts` is enabled * @returns {Promise<{ mimeType: string, content: string | Buffer, fetcher: string }>} Promise containing the fetched resource's MIME type, content, and fetcher type * @throws {FetchDocumentError} When the fetch operation fails From fe38f4b1bc38097ace39bfdda72e67fa39e5181f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 27 May 2026 12:00:13 +0200 Subject: [PATCH 13/19] Test viewport matches a realistic resolution --- src/archivist/fetcher/fullDomFetcher.test.js | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/archivist/fetcher/fullDomFetcher.test.js b/src/archivist/fetcher/fullDomFetcher.test.js index b9b4d3070..095abe2a2 100644 --- a/src/archivist/fetcher/fullDomFetcher.test.js +++ b/src/archivist/fetcher/fullDomFetcher.test.js @@ -18,7 +18,7 @@ const dynamicHTML = 'Dynamic Page { + const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/stealth-probe`, [], config); + + expect(result.content).to.match(/data-viewport-width="1920"/); + expect(result.content).to.match(/data-viewport-height="1080"/); + }); + it('exposes a non-empty navigator.plugins list', async () => { const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/stealth-probe`, [], config); From 52cb28918d3ff183d1296edd47f4178a570a1ea8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 27 May 2026 12:04:22 +0200 Subject: [PATCH 14/19] Share stealth probe fetch across assertions --- src/archivist/fetcher/fullDomFetcher.test.js | 29 +++++++++----------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/archivist/fetcher/fullDomFetcher.test.js b/src/archivist/fetcher/fullDomFetcher.test.js index 095abe2a2..e6d9a0879 100644 --- a/src/archivist/fetcher/fullDomFetcher.test.js +++ b/src/archivist/fetcher/fullDomFetcher.test.js @@ -214,30 +214,27 @@ describe('Full DOM Fetcher', function () { // binds its onPageCreated hooks and all evasions stay inactive, // leaving navigator.webdriver === true and HeadlessChrome in the UA. const config = { navigationTimeout: 5000, waitForElementsTimeout: 5000, language: 'en' }; + let content; - it('hides navigator.webdriver', async () => { - const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/stealth-probe`, [], config); - - expect(result.content).to.match(/data-webdriver="false"/); + before(async () => { + ({ content } = await fetch(`http://127.0.0.1:${SERVER_PORT}/stealth-probe`, [], config)); }); - it('removes HeadlessChrome from the user agent', async () => { - const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/stealth-probe`, [], config); - - expect(result.content).not.to.match(/HeadlessChrome/); + it('hides navigator.webdriver', () => { + expect(content).to.match(/data-webdriver="false"/); }); - it('uses a realistic viewport instead of Puppeteer default', async () => { - const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/stealth-probe`, [], config); - - expect(result.content).to.match(/data-viewport-width="1920"/); - expect(result.content).to.match(/data-viewport-height="1080"/); + it('removes HeadlessChrome from the user agent', () => { + expect(content).not.to.match(/HeadlessChrome/); }); - it('exposes a non-empty navigator.plugins list', async () => { - const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/stealth-probe`, [], config); + it('uses a realistic viewport instead of Puppeteer default', () => { + expect(content).to.match(/data-viewport-width="1920"/); + expect(content).to.match(/data-viewport-height="1080"/); + }); - const match = result.content.match(/data-plugin-count="(\d+)"/); + it('exposes a non-empty navigator.plugins list', () => { + const match = content.match(/data-plugin-count="(\d+)"/); expect(match).to.not.be.null; expect(Number(match[1])).to.be.greaterThan(0); From 2c6ca2dd679e99fc3ce6f6fe64d57b7e015e0818 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 27 May 2026 14:52:38 +0200 Subject: [PATCH 15/19] Test WebGL renderer does not leak headless mode --- src/archivist/fetcher/fullDomFetcher.test.js | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/archivist/fetcher/fullDomFetcher.test.js b/src/archivist/fetcher/fullDomFetcher.test.js index e6d9a0879..22dd8bfdd 100644 --- a/src/archivist/fetcher/fullDomFetcher.test.js +++ b/src/archivist/fetcher/fullDomFetcher.test.js @@ -18,7 +18,7 @@ const dynamicHTML = 'Dynamic Page { + expect(content).to.not.match(/data-webgl-vendor="[^"]*Google[^"]*"/); + expect(content).to.not.match(/data-webgl-renderer="[^"]*(?:SwiftShader|ANGLE)[^"]*"/); + }); }); }); From 3d6d970a41e8a28867f56da7f10fa4a34c6df257 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 27 May 2026 15:01:29 +0200 Subject: [PATCH 16/19] Update changelog entry --- CHANGELOG.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 42e8ca839..dc6b72cab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,14 +2,21 @@ All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## Unreleased [minor] +## Unreleased [major] > Development of this release was supported by [User Rights](https://www.user-rights.org). +### Changed + +- **Breaking:** Apply `@opentermsarchive/engine.fetcher.language` to `navigator.language` and `navigator.languages` in addition to the `Accept-Language` header; `language: "en"` now exposes `navigator.languages` as `["en"]` instead of `["en-US", "en"]`, set `language: "en-US,en"` to restore the previous default + +### Removed + +- **Breaking:** Drop support for quality factors (`;q=…`) in `@opentermsarchive/engine.fetcher.language`; previously accepted values such as `en-IE,en-GB;q=0.9,en;q=0.8` now throw at launch. Replace with a plain comma-separated priority list, e.g. `en-IE,en-GB,en` + ### Fixed - Reactivate the `puppeteer-extra` stealth plugin in the full DOM fetcher, which had been silently inert since v10.3.1 because it was registered after `puppeteer.launch()`, restoring removal of `navigator.webdriver` and `HeadlessChrome` from the user agent -- Apply `@opentermsarchive/engine.fetcher.language` to `navigator.language` and `navigator.languages` in addition to the `Accept-Language` header; `language: "en"` now exposes `navigator.languages` as `["en"]` instead of `["en-US", "en"]`, set `language: "en-US,en"` to restore the previous default ## 12.0.2 - 2026-05-26 From d51c57d87d1dbd19dd12c8bad7994f5d91da2e45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Wed, 27 May 2026 15:12:58 +0200 Subject: [PATCH 17/19] Skip browser launch in schema-only validation --- scripts/declarations/validate/index.mocha.js | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/declarations/validate/index.mocha.js b/scripts/declarations/validate/index.mocha.js index 3b11e75e9..4d0b59e70 100644 --- a/scripts/declarations/validate/index.mocha.js +++ b/scripts/declarations/validate/index.mocha.js @@ -51,9 +51,11 @@ export default async options => { const filePath = path.join(declarationsPath, `${serviceId}.json`); const historyFilePath = path.join(declarationsPath, `${serviceId}.history.json`); - before(() => launchHeadlessBrowser(config.get('@opentermsarchive/engine.fetcher.language'))); + if (!schemaOnly) { + before(() => launchHeadlessBrowser(config.get('@opentermsarchive/engine.fetcher.language'))); - after(stopHeadlessBrowser); + after(stopHeadlessBrowser); + } context(serviceId, () => { before(function () { From b478b8fb06cfe6b698bada62734e8665ebdacdc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Mon, 8 Jun 2026 14:06:15 +0200 Subject: [PATCH 18/19] Remove duplicate client-side language detection test Co-authored-by: Nicolas Dupont --- src/archivist/fetcher/fullDomFetcher.test.js | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/archivist/fetcher/fullDomFetcher.test.js b/src/archivist/fetcher/fullDomFetcher.test.js index 22dd8bfdd..1e2da652f 100644 --- a/src/archivist/fetcher/fullDomFetcher.test.js +++ b/src/archivist/fetcher/fullDomFetcher.test.js @@ -164,13 +164,6 @@ describe('Full DOM Fetcher', function () { expect(result.content).to.match(/data-languages="fr-FR,fr"/); }); - it('drives client-side language detection on the fetched document', async () => { - const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-detect`, ['.lang-detected'], config); - - expect(result.content).to.match(/
Conditions<\/div>/); - expect(result.content).not.to.match(/>Terms { const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-header`, [], config); @@ -208,11 +201,7 @@ describe('Full DOM Fetcher', function () { }); describe('Stealth evasions', () => { - // These assertions guard against a class of regressions where the - // puppeteer-extra-plugin-stealth fails to be registered before - // puppeteer.launch(): if it is registered later, puppeteer-extra never - // binds its onPageCreated hooks and all evasions stay inactive, - // leaving navigator.webdriver === true and HeadlessChrome in the UA. + // These assertions guard against a class of regressions where the puppeteer-extra-plugin-stealth fails to be registered before puppeteer.launch(): if it is registered later, puppeteer-extra never binds its onPageCreated hooks and all evasions stay inactive, leaving navigator.webdriver === true and HeadlessChrome in the UA. const config = { navigationTimeout: 5000, waitForElementsTimeout: 5000, language: 'en' }; let content; From a58367736fe9d3914ee329ddb7dd299fa4e8a1f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Biron?= Date: Mon, 8 Jun 2026 14:07:36 +0200 Subject: [PATCH 19/19] Improve changelog Co-authored-by: Nicolas Dupont --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dc6b72cab..6ad934a87 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ All changes that impact users of this module are documented in this file, in the ### Changed -- **Breaking:** Apply `@opentermsarchive/engine.fetcher.language` to `navigator.language` and `navigator.languages` in addition to the `Accept-Language` header; `language: "en"` now exposes `navigator.languages` as `["en"]` instead of `["en-US", "en"]`, set `language: "en-US,en"` to restore the previous default +- **Breaking:** `@opentermsarchive/engine.fetcher.language` now exposes `navigator.languages` strictly as configured, without Chrome's automatic regional-variant expansion. `language: "en"` exposes `navigator.languages` as `["en"]` instead of the previous `["en-US", "en"]` derived by Chrome; set `language: "en-US,en"` to restore the previous value ### Removed