From 5518b35048108108afb8225b8333e73277cf99e2 Mon Sep 17 00:00:00 2001 From: wvengen Date: Mon, 9 Mar 2026 21:41:57 +0100 Subject: [PATCH] Ignore unrecognized index entries (#38) --- example/webarchive_example/strategies.py | 1 + scrapy_webarchive/cdxj/models.py | 6 ++--- scrapy_webarchive/wacz/wacz_file.py | 5 ++-- tests/test_cdxj.py | 30 ++++++++++++++---------- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/example/webarchive_example/strategies.py b/example/webarchive_example/strategies.py index 35a5cf3..7a5ede4 100644 --- a/example/webarchive_example/strategies.py +++ b/example/webarchive_example/strategies.py @@ -1,5 +1,6 @@ from datetime import datetime from typing import List, Optional + from scrapy_webarchive.models import FileInfo from scrapy_webarchive.strategies import StrategyRegistry diff --git a/scrapy_webarchive/cdxj/models.py b/scrapy_webarchive/cdxj/models.py index 9e329f7..f592fb4 100644 --- a/scrapy_webarchive/cdxj/models.py +++ b/scrapy_webarchive/cdxj/models.py @@ -4,7 +4,7 @@ import re from dataclasses import dataclass, field -from typing_extensions import TYPE_CHECKING +from typing_extensions import TYPE_CHECKING, Union if TYPE_CHECKING: from scrapy_webarchive.wacz.wacz_file import WaczFile @@ -41,13 +41,13 @@ def _parse(line: str): return CDXREC.match(line) @classmethod - def from_cdxline(cls, cdxline: str, wacz_file: "WaczFile"): + def from_cdxline(cls, cdxline: str, wacz_file: "WaczFile") -> Union[CdxjRecord, None]: """Creates a CdxjRecord instance from a CDX(J) line.""" m = cls._parse(cdxline.strip()) if not m: - raise ValueError(f"Invalid CDXJ line: '{cdxline.strip()}'") + return None parsed_data = m.groupdict(default="") parsed_data['data'] = json.loads(parsed_data['data']) diff --git a/scrapy_webarchive/wacz/wacz_file.py b/scrapy_webarchive/wacz/wacz_file.py index 08c9d40..3c8ca97 100644 --- a/scrapy_webarchive/wacz/wacz_file.py +++ b/scrapy_webarchive/wacz/wacz_file.py @@ -99,7 +99,8 @@ def _parse_index(self, 
index_file: Union[gzip.GzipFile, IO]) -> Dict[str, List[C for line in index_file: cdxj_record = CdxjRecord.from_cdxline(line.decode(), wacz_file=self) - cdxj_records[cdxj_record.data["url"]].append(cdxj_record) + if cdxj_record: + cdxj_records[cdxj_record.data["url"]].append(cdxj_record) return cdxj_records @@ -131,4 +132,4 @@ def iter_index(self) -> Generator[CdxjRecord, None, None]: record has its `wacz_file` attribute set to the corresponding WACZ file. """ - yield from (cdxj_record for wacz in self.waczs for cdxj_record in wacz.iter_index()) + yield from (cdxj_record for wacz in self.waczs for cdxj_record in wacz.iter_index() if cdxj_record) diff --git a/tests/test_cdxj.py b/tests/test_cdxj.py index fa52bad..f928800 100644 --- a/tests/test_cdxj.py +++ b/tests/test_cdxj.py @@ -1,4 +1,3 @@ -import pytest from scrapy_webarchive.cdxj.models import CdxjRecord @@ -26,30 +25,35 @@ def test_cdxj_record_invalid_format(): # Invalid CDXJ line (missing date) invalid_cdxj_line = "com,example)/index {\"url\": \"http://example.com/index\", \"status\": \"200\"}" - # Test that the invalid line raises a ValueError - with pytest.raises(ValueError, match=r"Invalid CDXJ line:"): - CdxjRecord.from_cdxline(invalid_cdxj_line, wacz_file=None) + # Test that the invalid line returns None + assert CdxjRecord.from_cdxline(invalid_cdxj_line, wacz_file=None) is None def test_cdxj_record_invalid_json_data(): # Invalid JSON in CDXJ line invalid_cdxj_line = "com,example)/index 20241003000000 {\"url\": \"http://example.com/index\", \"status\": \"200\"" - # Test that the invalid JSON raises a ValueError - with pytest.raises(ValueError): - CdxjRecord.from_cdxline(invalid_cdxj_line, wacz_file=None) + # Test that the invalid JSON returns None + assert CdxjRecord.from_cdxline(invalid_cdxj_line, wacz_file=None) is None def test_cdxj_record_empty_line(): - # Test that an empty line raises a ValueError - with pytest.raises(ValueError, match=r"Invalid CDXJ line:"): - CdxjRecord.from_cdxline('', 
wacz_file=None) + # Test that an empty line returns None + assert CdxjRecord.from_cdxline('', wacz_file=None) is None def test_cdxj_record_no_data_field(): # CDXJ line with no data field no_data_cdxj_line = "com,example)/index 20241003000000" - # Test that no data field raises a ValueError - with pytest.raises(ValueError, match=r"Invalid CDXJ line:"): - CdxjRecord.from_cdxline(no_data_cdxj_line, wacz_file=None) + # Test that no data field returns None + assert CdxjRecord.from_cdxline(no_data_cdxj_line, wacz_file=None) is None + + +def test_cdxj_record_urn_pageinfo(): + # CDXJ line starting with urn:pageinfo + url = "urn:pageinfo:https://example.com/index" + pageinfo_cdxj_line = url + " 20241003000000 {\"url\": \"" + url + "\" }" + + # Test that the urn:pageinfo line returns None + assert CdxjRecord.from_cdxline(pageinfo_cdxj_line, wacz_file=None) is None