From d7cdcd618e8be7e1c84c5ab1ce9a4aa44a301cf4 Mon Sep 17 00:00:00 2001
From: wvengen <willem@thequestionmark.org>
Date: Mon, 9 Mar 2026 16:56:01 +0100
Subject: [PATCH] add archive_regexp, add archive_blacklist_regexp, remove
 archive_disallow_regexp

---
 docs/advanced_usage.md                     | 15 +++++----
 scrapy_webarchive/downloadermiddlewares.py |  6 ++--
 scrapy_webarchive/spidermiddlewares.py     | 36 +++++++++++++---------
 tests/test_downloadermiddlewares.py        | 10 +++++-
 tests/test_middleware.py                   | 15 ++++++---
 5 files changed, 54 insertions(+), 28 deletions(-)

diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md
index b156848..fb3ce1b 100644
--- a/docs/advanced_usage.md
+++ b/docs/advanced_usage.md
@@ -12,11 +12,11 @@ yield Request(url, callback=cb_func, flags=["wacz_crawl_skip"])
 
 When this happens, the statistic `webarchive/crawl_skip` is increased.
 
-### Disallowing archived URLs
+### Filtering URLs
 
-If the spider has the attribute `archive_disallow_regexp`, all requests returned from the spider that match this regular expression, are ignored. For example, when a product page was returned in `start_requests`, but the product page disappeared and redirected to its category page, the category page can be disallowed, so as to avoid crawling the whole category, which would take much more time and could lead to unknown URLs (e.g. the spider's requested pagination size could be different from the website default).
+If the spider has the attribute `archive_blacklist_regexp`, all requests returned from the spider that match this regular expression are ignored. For example, when a product page was returned in `start_requests`, but the product page disappeared and redirected to its category page, the category page can be blacklisted, so as to avoid crawling the whole category, which would take much more time and could lead to unknown URLs (e.g. the spider's requested pagination size could be different from the website default).
 
-When this happens, the statistic `wacz/crawl_skip/disallowed` is increased.
+When this happens, the statistic `webarchive/crawl_skip/blacklisted` is increased.
 
 ### Iterating a WACZ archive index
 
@@ -47,7 +47,7 @@ SW_WACZ_CRAWL = True
 
 #### Controlling the crawl
 
-Not all URLs will be interesting for the crawl since your WACZ will most likely contain static files such as fonts, JavaScript (website and external), stylesheets, etc. In order to improve the performance of the spider by not reading all the irrelevant request/response entries, you can configure the following atrribute in your spider, `archive_regex`:
+Not all URLs will be interesting for the crawl since your WACZ will most likely contain static files such as fonts, JavaScript (website and external), stylesheets, etc. In order to improve the performance of the spider by not reading all the irrelevant request/response entries, you can configure the following attribute in your spider, `archive_regexp`:
 
 ``` py title="my_wacz_spider.py" hl_lines="6"
 from scrapy.spiders import Spider
@@ -55,10 +55,10 @@ from scrapy.spiders import Spider
 
 class MyWaczSpider(Spider):
     name = "myspider"
-    archive_regex = r"^/tag/[\w-]+/$"
+    archive_regexp = r"/tag/[\w-]+/$"
 ```
 
-If the spider has an `archive_regexp` attribute, only response URLs matching this regexp are presented in `start_requests`. To visualise that, the spider above will only crawl the indented cdxj records below:
+If the spider has an `archive_regexp` attribute, only response URLs matching this regexp are presented in `start_requests`. To visualise that, the spider above will only crawl the 10 indented cdxj records below:
 
 ```
 com,toscrape,quotes)/favicon.ico 20241007081411465 {...}
@@ -78,6 +78,9 @@ com,toscrape,quotes)/static/main.css 20241007081525074 {...}
 > com,toscrape,quotes)/tag/truth/ 20241007081523804 {...}
 ```
 
+For each index entry that is skipped in this way, the statistic `webarchive/crawl_skip/filtered` is increased.
+Note that for consistency, URLs matching `archive_blacklist_regexp` are also excluded when crawling from the index.
+
 ## Requests and Responses
 
 ### Special Keys in Request.meta
diff --git a/scrapy_webarchive/downloadermiddlewares.py b/scrapy_webarchive/downloadermiddlewares.py
index d6c0650..10cb199 100644
--- a/scrapy_webarchive/downloadermiddlewares.py
+++ b/scrapy_webarchive/downloadermiddlewares.py
@@ -33,9 +33,9 @@ def _check_ignore_conditions(self, request: Request, spider: Spider) -> None:
             self.stats.inc_value("webarchive/crawl_skip/off_site", spider=spider)
             raise IgnoreRequest()
 
-        # Ignore disallowed pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones).
-        if self._is_disallowed_by_spider(request.url, spider):
-            self.stats.inc_value("webarchive/crawl_skip/disallowed", spider=spider)
+        # Ignore blacklisted pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones).
+        if self._is_blacklisted_by_spider(request.url, spider):
+            self.stats.inc_value("webarchive/crawl_skip/blacklisted", spider=spider)
             raise IgnoreRequest()
 
     def process_request(self, request: Request, spider: Spider):
diff --git a/scrapy_webarchive/spidermiddlewares.py b/scrapy_webarchive/spidermiddlewares.py
index 041974c..5fb12c3 100644
--- a/scrapy_webarchive/spidermiddlewares.py
+++ b/scrapy_webarchive/spidermiddlewares.py
@@ -115,10 +115,17 @@ def _is_off_site(self, url: str, spider: Spider) -> bool:
 
         return hasattr(spider, "allowed_domains") and urlparse(url).hostname not in spider.allowed_domains
 
-    def _is_disallowed_by_spider(self, url: str, spider: Spider) -> bool:
-        """Check if the URL is disallowed by the spider's archive rules."""
+    def _is_filtered_by_spider(self, url: str, spider: Spider) -> bool:
+        """Check if the URL is filtered out by the spider's archive rules."""
 
-        return hasattr(spider, "archive_disallow_regexp") and not re.search(spider.archive_disallow_regexp, url)
+        return hasattr(spider, "archive_regexp") and \
+               re.search(spider.archive_regexp, url) is None
+
+    def _is_blacklisted_by_spider(self, url: str, spider: Spider) -> bool:
+        """Check if the URL is blacklisted by the spider's archive blacklist rules."""
+
+        return hasattr(spider, "archive_blacklist_regexp") and \
+               re.search(spider.archive_blacklist_regexp, url) is not None
 
     @property
     def _uri_template(self) -> Optional[str]:
@@ -180,18 +187,19 @@ def process_start_requests(self, start_requests: Iterable[Request], spider: Spid
             # Filter out off-site requests
             if self._is_off_site(url, spider):
                 self.stats.inc_value("webarchive/crawl_skip/off_site", spider=spider)
-                flags = ["wacz_start_request", "wacz_crawl_skip"]
-            # Ignore disallowed pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones)
-            elif self._is_disallowed_by_spider(url, spider):
-                self.stats.inc_value("webarchive/crawl_skip/disallowed", spider=spider)
-                flags = ["wacz_start_request", "wacz_crawl_skip"]
+            # Ignore filtered pages (to avoid crawling all files in the WACZ)
+            elif self._is_filtered_by_spider(url, spider):
+                self.stats.inc_value("webarchive/crawl_skip/filtered", spider=spider)
+            # Also ignore blacklisted pages (as it would be illogical to return them here, though use-case is different)
+            elif self._is_blacklisted_by_spider(url, spider):
+                self.stats.inc_value("webarchive/crawl_skip/blacklisted", spider=spider)
             else:
                 self.stats.inc_value("webarchive/start_request_count", spider=spider)
                 flags = ["wacz_start_request"]
 
-            yield record_transformer.request_for_record(
-                entry,
-                flags=flags,
-                meta={"cdxj_record": entry},
-                dont_filter=True,
-            )
+                yield record_transformer.request_for_record(
+                    entry,
+                    flags=flags,
+                    meta={"cdxj_record": entry},
+                    dont_filter=True,
+                )
diff --git a/tests/test_downloadermiddlewares.py b/tests/test_downloadermiddlewares.py
index a338243..d5cd7ac 100644
--- a/tests/test_downloadermiddlewares.py
+++ b/tests/test_downloadermiddlewares.py
@@ -1,5 +1,7 @@
 from contextlib import contextmanager
 
+import pytest
+from scrapy.exceptions import IgnoreRequest
 from scrapy.http.request import Request
 from scrapy.settings import Settings
 from scrapy.spiders import Spider
@@ -47,6 +49,13 @@ def test_retrieve_from_wacz_record_not_found(self):
             assert response
             assert response.status == 404
 
+    def test_retrieve_from_wacz_blacklisted(self):
+        setattr(self.spider, "archive_blacklist_regexp", r"/tag/love/")
+        request = Request("https://quotes.toscrape.com/tag/love/")
+        with self._middleware() as mw:
+            with pytest.raises(IgnoreRequest):
+                mw.process_request(request, self.spider)
+
     def test_retrieve_from_wacz(self):
         request = Request("https://quotes.toscrape.com/tag/love/")
         with self._middleware() as mw:
@@ -54,7 +63,6 @@ def test_retrieve_from_wacz(self):
             assert response
             assert response.status == 200
 
-
 class TestWaczMiddlewareMultiWacz(BaseTestWaczMiddleware):
     def _get_wacz_source_url(self):
         wacz_1 = get_test_data_path("warc_1_1", "quotes.wacz").as_uri()
diff --git a/tests/test_middleware.py b/tests/test_middleware.py
index 77cb83c..b95ad4f 100644
--- a/tests/test_middleware.py
+++ b/tests/test_middleware.py
@@ -45,11 +45,18 @@ def test_wacz_archive_filters_allowed_domains(self):
 
         with self._middleware(SW_WACZ_CRAWL=True) as mw:
             out = list(mw.process_start_requests([], self.spider))
-            assert len([request for request in out if "wacz_crawl_skip" not in request.flags]) == 61
+            assert len(out) == 61
 
-    def test_wacz_archive_filters_archive_regex(self):
-        setattr(self.spider, "archive_disallow_regexp", r"https://quotes\.toscrape\.com/page/\d+/")
+    def test_wacz_archive_filters_archive_regexp(self):
+        setattr(self.spider, "archive_regexp", r"https://quotes\.toscrape\.com/page/\d+/")
 
         with self._middleware(SW_WACZ_CRAWL=True) as mw:
             out = list(mw.process_start_requests([], self.spider))
-            assert len([request for request in out if "wacz_crawl_skip" not in request.flags]) == 9
+            assert len(out) == 9
+
+    def test_wacz_archive_filters_archive_blacklist_regexp(self):
+        setattr(self.spider, "archive_blacklist_regexp", r"/css|\.woff2|\.css|\.ico")
+
+        with self._middleware(SW_WACZ_CRAWL=True) as mw:
+            out = list(mw.process_start_requests([], self.spider))
+            assert len(out) == 20