Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions docs/advanced_usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ yield Request(url, callback=cb_func, flags=["wacz_crawl_skip"])

When this happens, the statistic `webarchive/crawl_skip` is increased.

### Disallowing archived URLs
### Filtering URLs

If the spider has the attribute `archive_disallow_regexp`, all requests returned from the spider that match this regular expression, are ignored. For example, when a product page was returned in `start_requests`, but the product page disappeared and redirected to its category page, the category page can be disallowed, so as to avoid crawling the whole category, which would take much more time and could lead to unknown URLs (e.g. the spider's requested pagination size could be different from the website default).
If the spider has the attribute `archive_blacklist_regexp`, all requests returned from the spider that match this regular expression are ignored. For example, when a product page was returned in `start_requests`, but the product page disappeared and redirected to its category page, the category page can be blacklisted, so as to avoid crawling the whole category, which would take much more time and could lead to unknown URLs (e.g. the spider's requested pagination size could be different from the website default).

When this happens, the statistic `wacz/crawl_skip/disallowed` is increased.
When this happens, the statistic `webarchive/crawl_skip/blacklisted` is increased.

### Iterating a WACZ archive index

Expand Down Expand Up @@ -47,18 +47,18 @@ SW_WACZ_CRAWL = True

#### Controlling the crawl

Not all URLs will be interesting for the crawl since your WACZ will most likely contain static files such as fonts, JavaScript (website and external), stylesheets, etc. In order to improve the performance of the spider by not reading all the irrelevant request/response entries, you can configure the following attribute in your spider, `archive_regex`:
Not all URLs will be interesting for the crawl since your WACZ will most likely contain static files such as fonts, JavaScript (website and external), stylesheets, etc. In order to improve the performance of the spider by not reading all the irrelevant request/response entries, you can configure the following attribute in your spider, `archive_regexp`:

``` py title="my_wacz_spider.py" hl_lines="6"
from scrapy.spiders import Spider


class MyWaczSpider(Spider):
name = "myspider"
archive_regex = r"^/tag/[\w-]+/$"
archive_regexp = r"^/tag/[\w-]+/$"
```

If the spider has an `archive_regexp` attribute, only response URLs matching this regexp are presented in `start_requests`. To visualise that, the spider above will only crawl the indented cdxj records below:
If the spider has an `archive_regexp` attribute, only response URLs matching this regexp are presented in `start_requests`. To visualise that, the spider above will only crawl the 10 indented cdxj records below:

```
com,toscrape,quotes)/favicon.ico 20241007081411465 {...}
Expand All @@ -78,6 +78,9 @@ com,toscrape,quotes)/static/main.css 20241007081525074 {...}
> com,toscrape,quotes)/tag/truth/ 20241007081523804 {...}
```

For each index entry that is skipped in this way, the statistic `webarchive/crawl_skip/filtered` is increased.
Note that for consistency, URLs matching `archive_blacklist_regexp` are also excluded when crawling from the index.

## Requests and Responses

### Special Keys in Request.meta
Expand Down
6 changes: 3 additions & 3 deletions scrapy_webarchive/downloadermiddlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ def _check_ignore_conditions(self, request: Request, spider: Spider) -> None:
self.stats.inc_value("webarchive/crawl_skip/off_site", spider=spider)
raise IgnoreRequest()

# Ignore disallowed pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones).
if self._is_disallowed_by_spider(request.url, spider):
self.stats.inc_value("webarchive/crawl_skip/disallowed", spider=spider)
# Ignore blacklisted pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones).
if self._is_blacklisted_by_spider(request.url, spider):
self.stats.inc_value("webarchive/crawl_skip/blacklisted", spider=spider)
raise IgnoreRequest()

def process_request(self, request: Request, spider: Spider):
Expand Down
36 changes: 22 additions & 14 deletions scrapy_webarchive/spidermiddlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,17 @@ def _is_off_site(self, url: str, spider: Spider) -> bool:

return hasattr(spider, "allowed_domains") and urlparse(url).hostname not in spider.allowed_domains

def _is_disallowed_by_spider(self, url: str, spider: Spider) -> bool:
"""Check if the URL is disallowed by the spider's archive rules."""
def _is_filtered_by_spider(self, url: str, spider: Spider) -> bool:
"""Check if the URL is filtered out by the spider's archive rules."""

return hasattr(spider, "archive_disallow_regexp") and not re.search(spider.archive_disallow_regexp, url)
return hasattr(spider, "archive_regexp") and \
re.search(spider.archive_regexp, url) is None

def _is_blacklisted_by_spider(self, url: str, spider: Spider) -> bool:
"""Check if the URL is blacklisted by the spider's archive blacklist rules.

Returns True only when the spider defines ``archive_blacklist_regexp`` and
that pattern is found anywhere in ``url`` (``re.search``, not a full match).
"""

return hasattr(spider, "archive_blacklist_regexp") and \
re.search(spider.archive_blacklist_regexp, url) is not None

@property
def _uri_template(self) -> Optional[str]:
Expand Down Expand Up @@ -180,18 +187,19 @@ def process_start_requests(self, start_requests: Iterable[Request], spider: Spid
# Filter out off-site requests
if self._is_off_site(url, spider):
self.stats.inc_value("webarchive/crawl_skip/off_site", spider=spider)
flags = ["wacz_start_request", "wacz_crawl_skip"]
# Ignore disallowed pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones)
elif self._is_disallowed_by_spider(url, spider):
self.stats.inc_value("webarchive/crawl_skip/disallowed", spider=spider)
flags = ["wacz_start_request", "wacz_crawl_skip"]
# Ignore filtered pages (to avoid crawling all files in the WACZ)
elif self._is_filtered_by_spider(url, spider):
self.stats.inc_value("webarchive/crawl_skip/filtered", spider=spider)
# Also ignore blacklisted pages (as it would be illogical to return them here, though use-case is different)
elif self._is_blacklisted_by_spider(url, spider):
self.stats.inc_value("webarchive/crawl_skip/blacklisted", spider=spider)
else:
self.stats.inc_value("webarchive/start_request_count", spider=spider)
flags = ["wacz_start_request"]

yield record_transformer.request_for_record(
entry,
flags=flags,
meta={"cdxj_record": entry},
dont_filter=True,
)
yield record_transformer.request_for_record(
entry,
flags=flags,
meta={"cdxj_record": entry},
dont_filter=True,
)
10 changes: 9 additions & 1 deletion tests/test_downloadermiddlewares.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from contextlib import contextmanager

import pytest
from scrapy.exceptions import IgnoreRequest
from scrapy.http.request import Request
from scrapy.settings import Settings
from scrapy.spiders import Spider
Expand Down Expand Up @@ -47,14 +49,20 @@ def test_retrieve_from_wacz_record_not_found(self):
assert response
assert response.status == 404

def test_retrieve_from_wacz_blacklisted(self):
setattr(self.spider, "archive_blacklist_regexp", r"/tag/love/")
request = Request("https://quotes.toscrape.com/tag/love/")
with self._middleware() as mw:
with pytest.raises(IgnoreRequest):
mw.process_request(request, self.spider)

def test_retrieve_from_wacz(self):
request = Request("https://quotes.toscrape.com/tag/love/")
with self._middleware() as mw:
response = mw.process_request(request, self.spider)
assert response
assert response.status == 200


class TestWaczMiddlewareMultiWacz(BaseTestWaczMiddleware):
def _get_wacz_source_url(self):
wacz_1 = get_test_data_path("warc_1_1", "quotes.wacz").as_uri()
Expand Down
15 changes: 11 additions & 4 deletions tests/test_middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,18 @@ def test_wacz_archive_filters_allowed_domains(self):

with self._middleware(SW_WACZ_CRAWL=True) as mw:
out = list(mw.process_start_requests([], self.spider))
assert len([request for request in out if "wacz_crawl_skip" not in request.flags]) == 61
assert len(out) == 61

def test_wacz_archive_filters_archive_regex(self):
setattr(self.spider, "archive_disallow_regexp", r"https://quotes\.toscrape\.com/page/\d+/")
def test_wacz_archive_filters_archive_regexp(self):
setattr(self.spider, "archive_regexp", r"https://quotes\.toscrape\.com/page/\d+/")

with self._middleware(SW_WACZ_CRAWL=True) as mw:
out = list(mw.process_start_requests([], self.spider))
assert len([request for request in out if "wacz_crawl_skip" not in request.flags]) == 9
assert len(out) == 9

def test_wacz_archive_filters_archive_blacklist_regexp(self):
setattr(self.spider, "archive_blacklist_regexp", r"/css|\.woff2|\.css|\.ico")

with self._middleware(SW_WACZ_CRAWL=True) as mw:
out = list(mw.process_start_requests([], self.spider))
assert len(out) == 20
Loading