From 9beb223a0d29278141c822fdee7ba43994ec4192 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 9 Mar 2026 11:12:57 +0000 Subject: [PATCH 1/2] Initial plan From 5393962c7d3a15c3c824b2da7e4f6673a8eb944e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 9 Mar 2026 11:15:03 +0000 Subject: [PATCH 2/2] Add OECD scraper (scrapers/oecd.org.js) Co-authored-by: susannaanas <4725416+susannaanas@users.noreply.github.com> --- scrapers/oecd.org.js | 56 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 scrapers/oecd.org.js diff --git a/scrapers/oecd.org.js b/scrapers/oecd.org.js new file mode 100644 index 000000000..63ba2aecd --- /dev/null +++ b/scrapers/oecd.org.js @@ -0,0 +1,56 @@ +module.exports = { + // JSON: SelectorElement "div.card---theme" (note triple dash in class name) + listSelector: 'div.card---theme', + + parse: ($, el) => { + const titleEl = $(el).find('a').first(); + const title = titleEl.text().trim(); + const rawLink = titleEl.attr('href'); + + if (!title || !rawLink) return null; + + const link = new URL(rawLink, 'https://www.oecd.org').href; + + // Lazy-load image fallback pattern + const imgEl = $(el).find('img').first(); + let img = imgEl.attr('src') + || imgEl.attr('data-src') + || imgEl.attr('data-lazy-src') + || imgEl.attr('data-original') + || null; + + // Discard base64 placeholder images + if (img && img.startsWith('data:')) img = null; + + const rawDate = $(el).find('.card__date').first().text().trim(); + const pubDate = parseOecdDate(rawDate); + + return { + title, + link, + enforcedImage: img, + content: '', + pubDate, + creator: 'OECD' + }; + } +}; + +function parseOecdDate(str) { + if (!str) return new Date().toISOString(); + + // Try DD/MM/YYYY (European slash format) + const slashMatch = str.match(/^(\d{1,2})\/(\d{1,2})\/(\d{4})$/); + if (slashMatch) { + const day = parseInt(slashMatch[1], 10); + const month = parseInt(slashMatch[2], 10) - 1; + const year = parseInt(slashMatch[3], 10); + return new Date(Date.UTC(year, month, day, 12, 0, 0)).toISOString(); + } + + // Fallback: JS Date parse ("9 March 2026", "2026-03-09", etc.) + const d = new Date(str); + if (!isNaN(d.getTime())) return d.toISOString(); + + return new Date().toISOString(); +}