Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,22 @@

All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased [major]

> Development of this release was supported by [User Rights](https://www.user-rights.org).

### Changed

- **Breaking:** `@opentermsarchive/engine.fetcher.language` now exposes `navigator.languages` strictly as configured, without Chrome's automatic regional-variant expansion. `language: "en"` exposes `navigator.languages` as `["en"]` instead of the previous `["en-US", "en"]` derived by Chrome; set `language: "en-US,en"` to restore the previous value

### Removed

- **Breaking:** Drop support for quality factors (`;q=…`) in `@opentermsarchive/engine.fetcher.language`; previously accepted values such as `en-IE,en-GB;q=0.9,en;q=0.8` now throw at launch. Replace with a plain comma-separated priority list, e.g. `en-IE,en-GB,en`

### Fixed

- Reactivate the `puppeteer-extra` stealth plugin in the full DOM fetcher, which had been silently inert since v10.3.1 because it was registered after `puppeteer.launch()`, restoring removal of `navigator.webdriver` and `HeadlessChrome` from the user agent

## 12.0.2 - 2026-05-26

> Development of this release was supported by [User Rights](https://www.user-rights.org).
Expand Down
6 changes: 4 additions & 2 deletions scripts/declarations/validate/index.mocha.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,11 @@ export default async options => {
const filePath = path.join(declarationsPath, `${serviceId}.json`);
const historyFilePath = path.join(declarationsPath, `${serviceId}.history.json`);

before(launchHeadlessBrowser);
if (!schemaOnly) {
before(() => launchHeadlessBrowser(config.get('@opentermsarchive/engine.fetcher.language')));

after(stopHeadlessBrowser);
after(stopHeadlessBrowser);
}

context(serviceId, () => {
before(function () {
Expand Down
39 changes: 26 additions & 13 deletions src/archivist/fetcher/fullDomFetcher.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
import puppeteer from 'puppeteer-extra';
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
import navigatorLanguages from 'puppeteer-extra-plugin-stealth/evasions/navigator.languages/index.js';
import userAgentOverride from 'puppeteer-extra-plugin-stealth/evasions/user-agent-override/index.js';

import { resolveProxyConfiguration, extractProxyCredentials } from './proxyUtils.js';

let browser;

export default async function fetch(url, cssSelectors, config) {
puppeteer.use(stealthPlugin({ locale: config.language }));
/**
 * Parses the fetcher language configuration into the values handed to the stealth evasions.
 * Tags are trimmed and empty entries (for example produced by a trailing comma) are dropped.
 * @param   {string}                                  value Comma-separated list of BCP 47 language tags in priority order, without quality factors (for example `en-IE,en-GB,en`)
 * @returns {{ locale: string, languages: string[] }}       `languages` holds the individual tags in order; `locale` is the same tags joined with commas
 * @throws  {TypeError}                                     When `value` is not a string
 * @throws  {Error}                                         When `value` contains quality factors (`;q=…`)
 */
function parseLanguage(value) {
  if (typeof value !== 'string') {
    throw new TypeError(`Fetcher language configuration should be a string; received ${typeof value}.`);
  }

  // HTTP allows optional whitespace around the ";" that introduces a weight, and the "q" parameter name is case-insensitive, so a plain ";q=" substring check would let values such as "en-GB; q=0.9" through
  if (/;\s*q=/i.test(value)) {
    throw new Error(`Quality factors are not supported in fetcher language configuration; received "${value}". Provide a comma-separated list of BCP 47 tags in priority order, for example "en-IE,en-GB,en".`);
  }

  const languages = value
    .split(',')
    .map(tag => tag.trim())
    .filter(Boolean); // Discard empty tags left by stray or trailing commas

  return { locale: languages.join(','), languages };
}

export default async function fetch(url, cssSelectors, config) {
if (!browser) {
throw new Error('The headless browser should be controlled manually with "launchHeadlessBrowser" and "stopHeadlessBrowser".');
}
Expand All @@ -21,7 +31,7 @@ export default async function fetch(url, cssSelectors, config) {
page = await context.newPage();
client = await page.createCDPSession();

await configurePage(page, client, config);
await configurePage(page, config);

const selectors = [].concat(cssSelectors).filter(Boolean);

Expand Down Expand Up @@ -97,14 +107,24 @@ export default async function fetch(url, cssSelectors, config) {
/**
* Launches a headless browser instance using Puppeteer if one is not already running. Returns the existing browser instance if one is already running, otherwise creates and returns a new instance.
* @function launchHeadlessBrowser
* @returns {Promise<puppeteer.Browser>} The Puppeteer browser instance.
* @param {string} language Accept-Language header value applied to the browser context
* @returns {Promise<puppeteer.Browser>} The Puppeteer browser instance.
* @async
*/
export async function launchHeadlessBrowser() {
export async function launchHeadlessBrowser(language) {
if (browser) {
return browser;
}

const { locale, languages } = parseLanguage(language);
const stealth = stealthPlugin();

stealth.enabledEvasions.delete('user-agent-override');
stealth.enabledEvasions.delete('navigator.languages');
puppeteer.use(stealth);
puppeteer.use(userAgentOverride({ locale }));
puppeteer.use(navigatorLanguages({ languages }));

const options = {
args: [],
headless: !process.env.OTA_ENGINE_FETCHER_NO_HEADLESS,
Expand Down Expand Up @@ -156,16 +176,9 @@ function isValidHttpStatus(status) {
return (status >= 200 && status < 300) || status === 304;
}

async function configurePage(page, client, config) {
async function configurePage(page, config) {
await page.setViewport({ width: 1920, height: 1080 }); // Realistic viewport to avoid detection based on default Puppeteer dimensions (800x600)
await page.setDefaultNavigationTimeout(config.navigationTimeout);
await page.setExtraHTTPHeaders({ 'Accept-Language': config.language });

// Use CDP to ensure browser language is set correctly (see https://zirkelc.dev/posts/puppeteer-language-experiment)
await client.send('Network.setUserAgentOverride', {
userAgent: await browser.userAgent(),
acceptLanguage: config.language,
});

if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) {
await page.authenticate(browser.proxyCredentials);
Expand Down
124 changes: 120 additions & 4 deletions src/archivist/fetcher/fullDomFetcher.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ use(chaiAsPromised);

const dynamicHTML = '<!DOCTYPE html><html><head><title>Dynamic Page</title><script>setTimeout(() => { document.body.innerHTML += "<div class=\'dynamic\'>Loaded</div>"; }, 100);</script></head><body></body></html>';
const delayedContentHTML = '<!DOCTYPE html><html><head><title>Delayed Content</title><script>setTimeout(() => { document.querySelector(".content").textContent = "Final content"; }, 100);</script></head><body><div class="content"></div></body></html>';
const langEchoHTML = '<!DOCTYPE html><html><body><script>document.body.setAttribute("data-language", navigator.language); document.body.setAttribute("data-languages", navigator.languages.join(","));</script></body></html>';
const langDetectHTML = '<!DOCTYPE html><html><body><div class="lang-detected"></div><script>const lang = navigator.language.split("-")[0]; const labels = { fr: "Conditions", en: "Terms" }; document.querySelector(".lang-detected").textContent = labels[lang] || labels.en;</script></body></html>';
const stealthProbeHTML = '<!DOCTYPE html><html><body><script>document.body.setAttribute("data-webdriver", String(navigator.webdriver)); document.body.setAttribute("data-user-agent", navigator.userAgent); document.body.setAttribute("data-plugin-count", String(navigator.plugins.length)); document.body.setAttribute("data-viewport-width", String(window.innerWidth)); document.body.setAttribute("data-viewport-height", String(window.innerHeight)); (() => { const canvas = document.createElement("canvas"); const gl = canvas.getContext("webgl") || canvas.getContext("experimental-webgl"); if (!gl) { document.body.setAttribute("data-webgl-vendor", "none"); return; } const ext = gl.getExtension("WEBGL_debug_renderer_info"); document.body.setAttribute("data-webgl-vendor", ext ? gl.getParameter(ext.UNMASKED_VENDOR_WEBGL) : ""); document.body.setAttribute("data-webgl-renderer", ext ? gl.getParameter(ext.UNMASKED_RENDERER_WEBGL) : ""); })();</script></body></html>';

describe('Full DOM Fetcher', function () {
this.timeout(60000);
Expand All @@ -24,7 +27,7 @@ describe('Full DOM Fetcher', function () {
let expectedPDFContent;

before(async () => {
await launchHeadlessBrowser();
await launchHeadlessBrowser('en');

temporaryServer = http.createServer((request, response) => {
if (request.url === '/dynamic') {
Expand All @@ -33,6 +36,20 @@ describe('Full DOM Fetcher', function () {
if (request.url === '/delayed-content') {
response.writeHead(200, { 'Content-Type': 'text/html' }).write(delayedContentHTML);
}
if (request.url === '/lang-header') {
const acceptLanguage = request.headers['accept-language'] || '';

response.writeHead(200, { 'Content-Type': 'text/html' }).write(`<!DOCTYPE html><html><body data-accept-language="${acceptLanguage}"></body></html>`);
}
if (request.url === '/lang-echo') {
response.writeHead(200, { 'Content-Type': 'text/html' }).write(langEchoHTML);
}
if (request.url === '/lang-detect') {
response.writeHead(200, { 'Content-Type': 'text/html' }).write(langDetectHTML);
}
if (request.url === '/stealth-probe') {
response.writeHead(200, { 'Content-Type': 'text/html' }).write(stealthProbeHTML);
}
if (request.url === '/terms.pdf') {
expectedPDFContent = fs.readFileSync(path.resolve(__dirname, '../../../test/fixtures/terms.pdf'));
response.writeHead(200, { 'Content-Type': 'application/pdf' }).write(expectedPDFContent);
Expand All @@ -54,12 +71,12 @@ describe('Full DOM Fetcher', function () {
await stopHeadlessBrowser();
await expect(fetch('http://example.com', [], { navigationTimeout: 5000, waitForElementsTimeout: 5000, language: 'en' }))
.to.be.rejectedWith('The headless browser should be controlled manually');
await launchHeadlessBrowser();
await launchHeadlessBrowser('en');
});

it('reuses existing browser instance', async () => {
const browser1 = await launchHeadlessBrowser();
const browser2 = await launchHeadlessBrowser();
const browser1 = await launchHeadlessBrowser('en');
const browser2 = await launchHeadlessBrowser('en');

expect(browser1).to.equal(browser2);
});
Expand All @@ -81,6 +98,12 @@ describe('Full DOM Fetcher', function () {
await expect(fetch(url, ['.non-existent'], { ...config, navigationTimeout: timeout })).to.be.rejectedWith(`Timed out after ${timeout / 1000} seconds when trying to fetch '${url}'`);
});

it('sends the configured language as Accept-Language header', async () => {
const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-header`, [], config);

expect(result.content).to.match(/data-accept-language="en"/);
});

context('when a DOM element exists but its content is loaded asynchronously', () => {
it('waits for the element content to be fully loaded', async () => {
const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/delayed-content`, ['.content'], config);
Expand Down Expand Up @@ -118,4 +141,97 @@ describe('Full DOM Fetcher', function () {
});
});
});

// Checks how the language given to launchHeadlessBrowser() surfaces in navigator.language, navigator.languages and the Accept-Language request header.
// Each context stops the shared headless browser and relaunches it with the language under test.
describe('Language configuration', () => {
context('with a regional locale such as fr-FR,fr', () => {
const language = 'fr-FR,fr';
const config = { navigationTimeout: 5000, waitForElementsTimeout: 5000, language };

before(async () => {
await stopHeadlessBrowser();
await launchHeadlessBrowser(language);
});

it('exposes the primary tag through navigator.language', async () => {
const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-echo`, [], config);

expect(result.content).to.match(/data-language="fr-FR"/);
});

it('exposes every tag through navigator.languages', async () => {
const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-echo`, [], config);

expect(result.content).to.match(/data-languages="fr-FR,fr"/);
});

it('sends every configured tag as Accept-Language header', async () => {
const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-header`, [], config);

// The trailing [^"]* tolerates anything appended after the tag list (such as quality factors)
expect(result.content).to.match(/data-accept-language="fr-FR,fr[^"]*"/);
});
});

context('with multiple comma-separated tags such as en-IE,en-GB,en', () => {
const language = 'en-IE,en-GB,en';
const config = { navigationTimeout: 5000, waitForElementsTimeout: 5000, language };

before(async () => {
await stopHeadlessBrowser();
await launchHeadlessBrowser(language);
});

it('derives Accept-Language quality factors from tag order', async () => {
const result = await fetch(`http://127.0.0.1:${SERVER_PORT}/lang-header`, [], config);

// The configured value carries no quality factors; the expected header shows decreasing q-values following the tag order
expect(result.content).to.match(/data-accept-language="en-IE,en-GB;q=0\.9,en;q=0\.8"/);
});
});

context('with quality factors in the configured value', () => {
// Relaunch a browser for the suites that follow: the test below stops the running instance and the launch it attempts is expected to be rejected
after(async () => {
await launchHeadlessBrowser('en');
});

it('rejects language values containing quality factors', async () => {
await stopHeadlessBrowser();
await expect(launchHeadlessBrowser('en-IE,en-GB;q=0.9,en;q=0.8'))
.to.be.rejectedWith('Quality factors are not supported');
});
});
});

// Verifies, through the /stealth-probe page, that the browser fingerprint seen by page scripts does not give away headless automation.
describe('Stealth evasions', () => {
// These assertions guard against a class of regressions where puppeteer-extra-plugin-stealth is not registered before puppeteer.launch(): if it is registered later, puppeteer-extra never binds its onPageCreated hooks and all evasions stay inactive, leaving navigator.webdriver === true and HeadlessChrome in the UA.
const config = { navigationTimeout: 5000, waitForElementsTimeout: 5000, language: 'en' };
let content;

// Fetch the probe page once and share the returned content across all assertions below
before(async () => {
({ content } = await fetch(`http://127.0.0.1:${SERVER_PORT}/stealth-probe`, [], config));
});

it('hides navigator.webdriver', () => {
expect(content).to.match(/data-webdriver="false"/);
});

it('removes HeadlessChrome from the user agent', () => {
expect(content).not.to.match(/HeadlessChrome/);
});

it('uses a realistic viewport instead of Puppeteer default', () => {
expect(content).to.match(/data-viewport-width="1920"/);
expect(content).to.match(/data-viewport-height="1080"/);
});

it('exposes a non-empty navigator.plugins list', () => {
// Capture the count that the probe page recorded in the data-plugin-count attribute
const match = content.match(/data-plugin-count="(\d+)"/);

expect(match).to.not.be.null;
expect(Number(match[1])).to.be.greaterThan(0);
});

it('hides headless WebGL vendor and renderer signature', () => {
// The probe page writes "none" as vendor and no renderer attribute when no WebGL context is available, which also satisfies these negative assertions
expect(content).to.not.match(/data-webgl-vendor="[^"]*Google[^"]*"/);
expect(content).to.not.match(/data-webgl-renderer="[^"]*(?:SwiftShader|ANGLE)[^"]*"/);
});
});
});
2 changes: 1 addition & 1 deletion src/archivist/fetcher/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ export const FETCHER_TYPES = {
* @param {string|Array} [params.cssSelectors] List of CSS selectors to await when loading the resource in a headless browser. Can be a CSS selector or an array of CSS selectors. Only relevant when `executeClientScripts` is enabled
* @param {object} [params.config] Fetcher configuration
* @param {number} [params.config.navigationTimeout] Maximum time (in milliseconds) to wait before considering the fetch failed
* @param {string} [params.config.language] Language (in [ISO 639-1 format](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)) to be passed in request headers
* @param {string} [params.config.language] Accept-Language value applied to the browser context. Accepts a comma-separated list of [BCP 47](https://www.rfc-editor.org/rfc/rfc5646) language tags in priority order, without quality factors (for example `fr`, `en-US`, or `en-IE,en-GB,en`). The tag list drives `navigator.language` and `navigator.languages` in the headless browser, and the browser derives the `Accept-Language` HTTP header from the tag order
* @param {number} [params.config.waitForElementsTimeout] Maximum time (in milliseconds) to wait for selectors to exist on page before considering the fetch failed. Only relevant when `executeClientScripts` is enabled
* @returns {Promise<{ mimeType: string, content: string | Buffer, fetcher: string }>} Promise containing the fetched resource's MIME type, content, and fetcher type
* @throws {FetchDocumentError} When the fetch operation fails
Expand Down
2 changes: 1 addition & 1 deletion src/archivist/fetcher/index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ const termsWithOtherCharsetHTML = '<!DOCTYPE html><html><head><meta http-equiv="
describe('Fetcher', function () {
this.timeout(60000);

before(launchHeadlessBrowser);
before(() => launchHeadlessBrowser('en'));

after(stopHeadlessBrowser);

Expand Down
3 changes: 2 additions & 1 deletion src/archivist/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ export default class Archivist extends events.EventEmitter {

constructor({ recorderConfig, fetcherConfig }) {
super();
this.fetcherConfig = fetcherConfig;
this.recorder = new Recorder(recorderConfig);
this.fetch = params => fetch({ ...params, config: fetcherConfig });
this.extract = extract;
Expand Down Expand Up @@ -151,7 +152,7 @@ export default class Archivist extends events.EventEmitter {

this.emit('trackingStarted', servicesIds.length, numberOfTerms, technicalUpgradeOnly);

await Promise.all([ launchHeadlessBrowser(), this.recorder.initialize() ]);
await Promise.all([ launchHeadlessBrowser(this.fetcherConfig.language), this.recorder.initialize() ]);

this.trackingQueue.concurrency = concurrency;

Expand Down
Loading