From d7cdcd618e8be7e1c84c5ab1ce9a4aa44a301cf4 Mon Sep 17 00:00:00 2001 From: wvengen Date: Mon, 9 Mar 2026 16:56:01 +0100 Subject: [PATCH] add archive_regexp, add archive_blacklist_regexp, remove archive_disallow_regexp --- docs/advanced_usage.md | 15 +++++---- scrapy_webarchive/downloadermiddlewares.py | 6 ++-- scrapy_webarchive/spidermiddlewares.py | 36 +++++++++++++--------- tests/test_downloadermiddlewares.py | 10 +++++- tests/test_middleware.py | 15 ++++++--- 5 files changed, 54 insertions(+), 28 deletions(-) diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md index b156848..fb3ce1b 100644 --- a/docs/advanced_usage.md +++ b/docs/advanced_usage.md @@ -12,11 +12,11 @@ yield Request(url, callback=cb_func, flags=["wacz_crawl_skip"]) When this happens, the statistic `webarchive/crawl_skip` is increased. -### Disallowing archived URLs +### Filtering URLs -If the spider has the attribute `archive_disallow_regexp`, all requests returned from the spider that match this regular expression, are ignored. For example, when a product page was returned in `start_requests`, but the product page disappeared and redirected to its category page, the category page can be disallowed, so as to avoid crawling the whole category, which would take much more time and could lead to unknown URLs (e.g. the spider's requested pagination size could be different from the website default). +If the spider has the attribute `archive_blacklist_regexp`, all requests returned from the spider that match this regular expression, are ignored. For example, when a product page was returned in `start_requests`, but the product page disappeared and redirected to its category page, the category page can be disallowed, so as to avoid crawling the whole category, which would take much more time and could lead to unknown URLs (e.g. the spider's requested pagination size could be different from the website default). -When this happens, the statistic `wacz/crawl_skip/disallowed` is increased. 
+When this happens, the statistic `webarchive/crawl_skip/blacklisted` is increased. ### Iterating a WACZ archive index @@ -47,7 +47,7 @@ SW_WACZ_CRAWL = True #### Controlling the crawl -Not all URLs will be interesting for the crawl since your WACZ will most likely contain static files such as fonts, JavaScript (website and external), stylesheets, etc. In order to improve the performance of the spider by not reading all the irrelevant request/response entries, you can configure the following atrribute in your spider, `archive_regex`: +Not all URLs will be interesting for the crawl since your WACZ will most likely contain static files such as fonts, JavaScript (website and external), stylesheets, etc. In order to improve the performance of the spider by not reading all the irrelevant request/response entries, you can configure the following attribute in your spider, `archive_regexp`: ``` py title="my_wacz_spider.py" hl_lines="6" from scrapy.spiders import Spider @@ -55,10 +55,10 @@ from scrapy.spiders import Spider class MyWaczSpider(Spider): name = "myspider" - archive_regex = r"^/tag/[\w-]+/$" + archive_regexp = r"^/tag/[\w-]+/$" ``` -If the spider has an `archive_regexp` attribute, only response URLs matching this regexp are presented in `start_requests`. To visualise that, the spider above will only crawl the indented cdxj records below: +If the spider has an `archive_regexp` attribute, only response URLs matching this regexp are presented in `start_requests`. To visualise that, the spider above will only crawl the 10 indented cdxj records below: ``` com,toscrape,quotes)/favicon.ico 20241007081411465 {...} @@ -78,6 +78,9 @@ com,toscrape,quotes)/static/main.css 20241007081525074 {...} > com,toscrape,quotes)/tag/truth/ 20241007081523804 {...} ``` +For each index entry that is skipped in this way, the statistic `webarchive/crawl_skip/filtered` is increased. +Note that for consistency, URLs matching `archive_blacklist_regexp` are also excluded when crawling from the index. 
+ ## Requests and Responses ### Special Keys in Request.meta diff --git a/scrapy_webarchive/downloadermiddlewares.py b/scrapy_webarchive/downloadermiddlewares.py index d6c0650..10cb199 100644 --- a/scrapy_webarchive/downloadermiddlewares.py +++ b/scrapy_webarchive/downloadermiddlewares.py @@ -33,9 +33,9 @@ def _check_ignore_conditions(self, request: Request, spider: Spider) -> None: self.stats.inc_value("webarchive/crawl_skip/off_site", spider=spider) raise IgnoreRequest() - # Ignore disallowed pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones). - if self._is_disallowed_by_spider(request.url, spider): - self.stats.inc_value("webarchive/crawl_skip/disallowed", spider=spider) + # Ignore blacklisted pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones). + if self._is_blacklisted_by_spider(request.url, spider): + self.stats.inc_value("webarchive/crawl_skip/blacklisted", spider=spider) raise IgnoreRequest() def process_request(self, request: Request, spider: Spider): diff --git a/scrapy_webarchive/spidermiddlewares.py b/scrapy_webarchive/spidermiddlewares.py index 041974c..5fb12c3 100644 --- a/scrapy_webarchive/spidermiddlewares.py +++ b/scrapy_webarchive/spidermiddlewares.py @@ -115,10 +115,17 @@ def _is_off_site(self, url: str, spider: Spider) -> bool: return hasattr(spider, "allowed_domains") and urlparse(url).hostname not in spider.allowed_domains - def _is_disallowed_by_spider(self, url: str, spider: Spider) -> bool: - """Check if the URL is disallowed by the spider's archive rules.""" + def _is_filtered_by_spider(self, url: str, spider: Spider) -> bool: + """Check if the URL is filtered out by the spider's archive rules.""" - return hasattr(spider, "archive_disallow_regexp") and not re.search(spider.archive_disallow_regexp, url) + return hasattr(spider, "archive_regexp") and \ + re.search(spider.archive_regexp, url) is None + + def _is_blacklisted_by_spider(self, url: str, spider: Spider) -> bool: + """Check 
if the URL is blacklisted out by the spider's archive blacklist rules.""" + + return hasattr(spider, "archive_blacklist_regexp") and \ + re.search(spider.archive_blacklist_regexp, url) is not None @property def _uri_template(self) -> Optional[str]: @@ -180,18 +187,19 @@ def process_start_requests(self, start_requests: Iterable[Request], spider: Spid # Filter out off-site requests if self._is_off_site(url, spider): self.stats.inc_value("webarchive/crawl_skip/off_site", spider=spider) - flags = ["wacz_start_request", "wacz_crawl_skip"] - # Ignore disallowed pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones) - elif self._is_disallowed_by_spider(url, spider): - self.stats.inc_value("webarchive/crawl_skip/disallowed", spider=spider) - flags = ["wacz_start_request", "wacz_crawl_skip"] + # Ignore filtered pages (to avoid crawling all files in the WACZ) + elif self._is_filtered_by_spider(url, spider): + self.stats.inc_value("webarchive/crawl_skip/filtered", spider=spider) + # Also ignore blacklisted pages (as it would be illogical to return them here, though use-case is different) + elif self._is_blacklisted_by_spider(url, spider): + self.stats.inc_value("webarchive/crawl_skip/blacklisted", spider=spider) else: self.stats.inc_value("webarchive/start_request_count", spider=spider) flags = ["wacz_start_request"] - yield record_transformer.request_for_record( - entry, - flags=flags, - meta={"cdxj_record": entry}, - dont_filter=True, - ) + yield record_transformer.request_for_record( + entry, + flags=flags, + meta={"cdxj_record": entry}, + dont_filter=True, + ) diff --git a/tests/test_downloadermiddlewares.py b/tests/test_downloadermiddlewares.py index a338243..d5cd7ac 100644 --- a/tests/test_downloadermiddlewares.py +++ b/tests/test_downloadermiddlewares.py @@ -1,5 +1,7 @@ from contextlib import contextmanager +import pytest +from scrapy.exceptions import IgnoreRequest from scrapy.http.request import Request from scrapy.settings import Settings 
from scrapy.spiders import Spider @@ -47,6 +49,13 @@ def test_retrieve_from_wacz_record_not_found(self): assert response assert response.status == 404 + def test_retrieve_from_wacz_blacklisted(self): + setattr(self.spider, "archive_blacklist_regexp", r"/tag/love/") + request = Request("https://quotes.toscrape.com/tag/love/") + with self._middleware() as mw: + with pytest.raises(IgnoreRequest): + mw.process_request(request, self.spider) + def test_retrieve_from_wacz(self): request = Request("https://quotes.toscrape.com/tag/love/") with self._middleware() as mw: @@ -54,7 +63,6 @@ def test_retrieve_from_wacz(self): assert response assert response.status == 200 - class TestWaczMiddlewareMultiWacz(BaseTestWaczMiddleware): def _get_wacz_source_url(self): wacz_1 = get_test_data_path("warc_1_1", "quotes.wacz").as_uri() diff --git a/tests/test_middleware.py b/tests/test_middleware.py index 77cb83c..b95ad4f 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -45,11 +45,18 @@ def test_wacz_archive_filters_allowed_domains(self): with self._middleware(SW_WACZ_CRAWL=True) as mw: out = list(mw.process_start_requests([], self.spider)) - assert len([request for request in out if "wacz_crawl_skip" not in request.flags]) == 61 + assert len(out) == 61 - def test_wacz_archive_filters_archive_regex(self): - setattr(self.spider, "archive_disallow_regexp", r"https://quotes\.toscrape\.com/page/\d+/") + def test_wacz_archive_filters_archive_regexp(self): + setattr(self.spider, "archive_regexp", r"https://quotes\.toscrape\.com/page/\d+/") with self._middleware(SW_WACZ_CRAWL=True) as mw: out = list(mw.process_start_requests([], self.spider)) - assert len([request for request in out if "wacz_crawl_skip" not in request.flags]) == 9 + assert len(out) == 9 + + def test_wacz_archive_filters_archive_blacklist_regexp(self): + setattr(self.spider, "archive_blacklist_regexp", r"/css|\.woff2|\.css|\.ico") + + with self._middleware(SW_WACZ_CRAWL=True) as mw: + out = 
list(mw.process_start_requests([], self.spider)) + assert len(out) == 20