diff --git a/CHANGELOG.md b/CHANGELOG.md index cc6c0feb9..6ad934a87 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,22 @@ All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## Unreleased [major] + +> Development of this release was supported by [User Rights](https://www.user-rights.org). + +### Changed + +- **Breaking:** `@opentermsarchive/engine.fetcher.language` now exposes `navigator.languages` strictly as configured, without Chrome's automatic regional-variant expansion. `language: "en"` exposes `navigator.languages` as `["en"]` instead of the previous `["en-US", "en"]` derived by Chrome; set `language: "en-US,en"` to restore the previous value + +### Removed + +- **Breaking:** Drop support for quality factors (`;q=…`) in `@opentermsarchive/engine.fetcher.language`; previously accepted values such as `en-IE,en-GB;q=0.9,en;q=0.8` now throw at launch. Replace with a plain comma-separated priority list, e.g. `en-IE,en-GB,en` + +### Fixed + +- Reactivate the `puppeteer-extra` stealth plugin in the full DOM fetcher, which had been silently inert since v10.3.1 because it was registered after `puppeteer.launch()`, restoring removal of `navigator.webdriver` and `HeadlessChrome` from the user agent + ## 12.0.2 - 2026-05-26 > Development of this release was supported by [User Rights](https://www.user-rights.org). 
/**
 * Parses the fetcher language configuration into the tag list and the comma-joined locale string derived from it.
 * @function parseLanguage
 * @param {string} value Comma-separated list of BCP 47 language tags in priority order, without quality factors (for example `en-IE,en-GB,en`)
 * @returns {{ locale: string, languages: string[] }} `languages` holds the trimmed, non-empty tags in their configured order; `locale` is those same tags joined with commas
 * @throws {TypeError} When `value` is not a string
 * @throws {Error} When `value` contains quality factors, or contains no language tag at all
 */
function parseLanguage(value) {
  if (typeof value !== 'string') {
    throw new TypeError(`Fetcher language configuration must be a string of comma-separated BCP 47 tags; received ${typeof value}.`);
  }

  // A weight can be written with whitespace after the semicolon and with an uppercase "Q" (e.g. "en; q=0.9"), so a plain `;q=` substring check would let those through and they would end up inside a language tag
  if (/;\s*q=/i.test(value)) {
    throw new Error(`Quality factors are not supported in fetcher language configuration; received "${value}". Provide a comma-separated list of BCP 47 tags in priority order, for example "en-IE,en-GB,en".`);
  }

  // Drop empty entries left by trailing or doubled commas so that no empty tag is exposed
  const languages = value
    .split(',')
    .map(tag => tag.trim())
    .filter(Boolean);

  if (!languages.length) {
    throw new Error(`Fetcher language configuration must contain at least one language tag; received "${value}".`);
  }

  return { locale: languages.join(','), languages };
}
* @async */ -export async function launchHeadlessBrowser() { +export async function launchHeadlessBrowser(language) { if (browser) { return browser; } + const { locale, languages } = parseLanguage(language); + const stealth = stealthPlugin(); + + stealth.enabledEvasions.delete('user-agent-override'); + stealth.enabledEvasions.delete('navigator.languages'); + puppeteer.use(stealth); + puppeteer.use(userAgentOverride({ locale })); + puppeteer.use(navigatorLanguages({ languages })); + const options = { args: [], headless: !process.env.OTA_ENGINE_FETCHER_NO_HEADLESS, @@ -156,16 +176,9 @@ function isValidHttpStatus(status) { return (status >= 200 && status < 300) || status === 304; } -async function configurePage(page, client, config) { +async function configurePage(page, config) { await page.setViewport({ width: 1920, height: 1080 }); // Realistic viewport to avoid detection based on default Puppeteer dimensions (800x600) await page.setDefaultNavigationTimeout(config.navigationTimeout); - await page.setExtraHTTPHeaders({ 'Accept-Language': config.language }); - - // Use CDP to ensure browser language is set correctly (see https://zirkelc.dev/posts/puppeteer-language-experiment) - await client.send('Network.setUserAgentOverride', { - userAgent: await browser.userAgent(), - acceptLanguage: config.language, - }); if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) { await page.authenticate(browser.proxyCredentials); diff --git a/src/archivist/fetcher/fullDomFetcher.test.js b/src/archivist/fetcher/fullDomFetcher.test.js index 3737eb4fa..1e2da652f 100644 --- a/src/archivist/fetcher/fullDomFetcher.test.js +++ b/src/archivist/fetcher/fullDomFetcher.test.js @@ -16,6 +16,9 @@ use(chaiAsPromised); const dynamicHTML = 'Dynamic Page'; const delayedContentHTML = 'Delayed Content
'; +const langEchoHTML = ''; +const langDetectHTML = '
'; +const stealthProbeHTML = ''; describe('Full DOM Fetcher', function () { this.timeout(60000); @@ -24,7 +27,7 @@ describe('Full DOM Fetcher', function () { let expectedPDFContent; before(async () => { - await launchHeadlessBrowser(); + await launchHeadlessBrowser('en'); temporaryServer = http.createServer((request, response) => { if (request.url === '/dynamic') { @@ -33,6 +36,20 @@ describe('Full DOM Fetcher', function () { if (request.url === '/delayed-content') { response.writeHead(200, { 'Content-Type': 'text/html' }).write(delayedContentHTML); } + if (request.url === '/lang-header') { + const acceptLanguage = request.headers['accept-language'] || ''; + + response.writeHead(200, { 'Content-Type': 'text/html' }).write(``); + } + if (request.url === '/lang-echo') { + response.writeHead(200, { 'Content-Type': 'text/html' }).write(langEchoHTML); + } + if (request.url === '/lang-detect') { + response.writeHead(200, { 'Content-Type': 'text/html' }).write(langDetectHTML); + } + if (request.url === '/stealth-probe') { + response.writeHead(200, { 'Content-Type': 'text/html' }).write(stealthProbeHTML); + } if (request.url === '/terms.pdf') { expectedPDFContent = fs.readFileSync(path.resolve(__dirname, '../../../test/fixtures/terms.pdf')); response.writeHead(200, { 'Content-Type': 'application/pdf' }).write(expectedPDFContent); @@ -54,12 +71,12 @@ describe('Full DOM Fetcher', function () { await stopHeadlessBrowser(); await expect(fetch('http://example.com', [], { navigationTimeout: 5000, waitForElementsTimeout: 5000, language: 'en' })) .to.be.rejectedWith('The headless browser should be controlled manually'); - await launchHeadlessBrowser(); + await launchHeadlessBrowser('en'); }); it('reuses existing browser instance', async () => { - const browser1 = await launchHeadlessBrowser(); - const browser2 = await launchHeadlessBrowser(); + const browser1 = await launchHeadlessBrowser('en'); + const browser2 = await launchHeadlessBrowser('en'); 
expect(browser1).to.equal(browser2); }); @@ -81,6 +98,12 @@ describe('Full DOM Fetcher', function () { await expect(fetch(url, ['.non-existent'], { ...config, navigationTimeout: timeout })).to.be.rejectedWith(`Timed out after ${timeout / 1000} seconds when trying to fetch '${url}'`); }); + it('sends the configured language as Accept-Language header', async () => { + const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-header`, [], config); + + expect(result.content).to.match(/data-accept-language="en"/); + }); + context('when a DOM element exists but its content is loaded asynchronously', () => { it('waits for the element content to be fully loaded', async () => { const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/delayed-content`, ['.content'], config); @@ -118,4 +141,97 @@ describe('Full DOM Fetcher', function () { }); }); }); + + describe('Language configuration', () => { + context('with a regional locale such as fr-FR,fr', () => { + const language = 'fr-FR,fr'; + const config = { navigationTimeout: 5000, waitForElementsTimeout: 5000, language }; + + before(async () => { + await stopHeadlessBrowser(); + await launchHeadlessBrowser(language); + }); + + it('exposes the primary tag through navigator.language', async () => { + const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-echo`, [], config); + + expect(result.content).to.match(/data-language="fr-FR"/); + }); + + it('exposes every tag through navigator.languages', async () => { + const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-echo`, [], config); + + expect(result.content).to.match(/data-languages="fr-FR,fr"/); + }); + + it('sends every configured tag as Accept-Language header', async () => { + const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-header`, [], config); + + expect(result.content).to.match(/data-accept-language="fr-FR,fr[^"]*"/); + }); + }); + + context('with multiple comma-separated tags such as en-IE,en-GB,en', () => { + const 
language = 'en-IE,en-GB,en'; + const config = { navigationTimeout: 5000, waitForElementsTimeout: 5000, language }; + + before(async () => { + await stopHeadlessBrowser(); + await launchHeadlessBrowser(language); + }); + + it('derives Accept-Language quality factors from tag order', async () => { + const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-header`, [], config); + + expect(result.content).to.match(/data-accept-language="en-IE,en-GB;q=0\.9,en;q=0\.8"/); + }); + }); + + context('with quality factors in the configured value', () => { + after(async () => { + await launchHeadlessBrowser('en'); + }); + + it('rejects language values containing quality factors', async () => { + await stopHeadlessBrowser(); + await expect(launchHeadlessBrowser('en-IE,en-GB;q=0.9,en;q=0.8')) + .to.be.rejectedWith('Quality factors are not supported'); + }); + }); + }); + + describe('Stealth evasions', () => { + // These assertions guard against a class of regressions where the puppeteer-extra-plugin-stealth fails to be registered before puppeteer.launch(): if it is registered later, puppeteer-extra never binds its onPageCreated hooks and all evasions stay inactive, leaving navigator.webdriver === true and HeadlessChrome in the UA. 
+ const config = { navigationTimeout: 5000, waitForElementsTimeout: 5000, language: 'en' }; + let content; + + before(async () => { + ({ content } = await fetch(`http://127.0.0.1:${SERVER_PORT}/stealth-probe`, [], config)); + }); + + it('hides navigator.webdriver', () => { + expect(content).to.match(/data-webdriver="false"/); + }); + + it('removes HeadlessChrome from the user agent', () => { + expect(content).not.to.match(/HeadlessChrome/); + }); + + it('uses a realistic viewport instead of Puppeteer default', () => { + expect(content).to.match(/data-viewport-width="1920"/); + expect(content).to.match(/data-viewport-height="1080"/); + }); + + it('exposes a non-empty navigator.plugins list', () => { + const match = content.match(/data-plugin-count="(\d+)"/); + + expect(match).to.not.be.null; + expect(Number(match[1])).to.be.greaterThan(0); + }); + + it('hides headless WebGL vendor and renderer signature', () => { + expect(content).to.not.match(/data-webgl-vendor="[^"]*Google[^"]*"/); + expect(content).to.not.match(/data-webgl-renderer="[^"]*(?:SwiftShader|ANGLE)[^"]*"/); + }); + }); }); diff --git a/src/archivist/fetcher/index.js b/src/archivist/fetcher/index.js index 81cbc1515..f9a32436c 100644 --- a/src/archivist/fetcher/index.js +++ b/src/archivist/fetcher/index.js @@ -21,7 +21,7 @@ export const FETCHER_TYPES = { * @param {string|Array} [params.cssSelectors] List of CSS selectors to await when loading the resource in a headless browser. Can be a CSS selector or an array of CSS selectors. 
Only relevant when `executeClientScripts` is enabled * @param {object} [params.config] Fetcher configuration * @param {number} [params.config.navigationTimeout] Maximum time (in milliseconds) to wait before considering the fetch failed - * @param {string} [params.config.language] Language (in [ISO 639-1 format](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)) to be passed in request headers + * @param {string} [params.config.language] Accept-Language value applied to the browser context. Accepts a comma-separated list of [BCP 47](https://www.rfc-editor.org/rfc/rfc5646) language tags in priority order, without quality factors (for example `fr`, `en-US`, or `en-IE,en-GB,en`). The tag list drives `navigator.language` and `navigator.languages` in the headless browser, and the browser derives the `Accept-Language` HTTP header from the tag order * @param {number} [params.config.waitForElementsTimeout] Maximum time (in milliseconds) to wait for selectors to exist on page before considering the fetch failed. Only relevant when `executeClientScripts` is enabled * @returns {Promise<{ mimeType: string, content: string | Buffer, fetcher: string }>} Promise containing the fetched resource's MIME type, content, and fetcher type * @throws {FetchDocumentError} When the fetch operation fails diff --git a/src/archivist/fetcher/index.test.js b/src/archivist/fetcher/index.test.js index 004df98ea..22145ddd8 100644 --- a/src/archivist/fetcher/index.test.js +++ b/src/archivist/fetcher/index.test.js @@ -21,7 +21,7 @@ const termsWithOtherCharsetHTML = '