From 5518b35048108108afb8225b8333e73277cf99e2 Mon Sep 17 00:00:00 2001
From: wvengen <willem@thequestionmark.org>
Date: Mon, 9 Mar 2026 21:41:57 +0100
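Subject: [PATCH] Ignore unrecognized index entries (#38)

CdxjRecord.from_cdxline now returns None instead of raising ValueError
when a line does not match the CDXJ record pattern (for example
urn:pageinfo entries). _parse_index and iter_index in
wacz/wacz_file.py skip such entries, and the tests are updated to
expect None.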
---
 example/webarchive_example/strategies.py |  1 +
 scrapy_webarchive/cdxj/models.py         |  6 ++---
 scrapy_webarchive/wacz/wacz_file.py      |  5 ++--
 tests/test_cdxj.py                       | 30 ++++++++++++++----------
 4 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/example/webarchive_example/strategies.py b/example/webarchive_example/strategies.py
index 35a5cf3..7a5ede4 100644
--- a/example/webarchive_example/strategies.py
+++ b/example/webarchive_example/strategies.py
@@ -1,5 +1,6 @@
 from datetime import datetime
 from typing import List, Optional
+
 from scrapy_webarchive.models import FileInfo
 from scrapy_webarchive.strategies import StrategyRegistry
 
diff --git a/scrapy_webarchive/cdxj/models.py b/scrapy_webarchive/cdxj/models.py
index 9e329f7..f592fb4 100644
--- a/scrapy_webarchive/cdxj/models.py
+++ b/scrapy_webarchive/cdxj/models.py
@@ -4,7 +4,7 @@
 import re
 from dataclasses import dataclass, field
 
-from typing_extensions import TYPE_CHECKING
+from typing_extensions import TYPE_CHECKING, Union
 
 if TYPE_CHECKING:
     from scrapy_webarchive.wacz.wacz_file import WaczFile
@@ -41,13 +41,13 @@ def _parse(line: str):
         return CDXREC.match(line)
 
     @classmethod
-    def from_cdxline(cls, cdxline: str, wacz_file: "WaczFile"):
+    def from_cdxline(cls, cdxline: str, wacz_file: "WaczFile") -> Union[CdxjRecord, None]:
         """Creates a CdxjRecord instance from a CDX(J) line."""
 
         m = cls._parse(cdxline.strip())
 
         if not m:
-            raise ValueError(f"Invalid CDXJ line: '{cdxline.strip()}'")
+            return None
 
         parsed_data = m.groupdict(default="")
         parsed_data['data'] = json.loads(parsed_data['data'])
diff --git a/scrapy_webarchive/wacz/wacz_file.py b/scrapy_webarchive/wacz/wacz_file.py
index 08c9d40..3c8ca97 100644
--- a/scrapy_webarchive/wacz/wacz_file.py
+++ b/scrapy_webarchive/wacz/wacz_file.py
@@ -99,7 +99,8 @@ def _parse_index(self, index_file: Union[gzip.GzipFile, IO]) -> Dict[str, List[C
 
         for line in index_file:
             cdxj_record = CdxjRecord.from_cdxline(line.decode(), wacz_file=self)
-            cdxj_records[cdxj_record.data["url"]].append(cdxj_record)
+            if cdxj_record:
+                cdxj_records[cdxj_record.data["url"]].append(cdxj_record)
 
         return cdxj_records
 
@@ -131,4 +132,4 @@ def iter_index(self) -> Generator[CdxjRecord, None, None]:
         record has its `wacz_file` attribute set to the corresponding WACZ file.
         """
 
-        yield from (cdxj_record for wacz in self.waczs for cdxj_record in wacz.iter_index())
+        yield from (cdxj_record for wacz in self.waczs for cdxj_record in wacz.iter_index() if cdxj_record)
diff --git a/tests/test_cdxj.py b/tests/test_cdxj.py
index fa52bad..f928800 100644
--- a/tests/test_cdxj.py
+++ b/tests/test_cdxj.py
@@ -1,4 +1,3 @@
-import pytest
 
 from scrapy_webarchive.cdxj.models import CdxjRecord
 
@@ -26,30 +25,35 @@ def test_cdxj_record_invalid_format():
     # Invalid CDXJ line (missing date)
     invalid_cdxj_line = "com,example)/index {\"url\": \"http://example.com/index\", \"status\": \"200\"}"
     
-    # Test that the invalid line raises a ValueError
-    with pytest.raises(ValueError, match=r"Invalid CDXJ line:"):
-        CdxjRecord.from_cdxline(invalid_cdxj_line, wacz_file=None)
+    # Test that the invalid line returns None
+    assert CdxjRecord.from_cdxline(invalid_cdxj_line, wacz_file=None) is None
 
 
 def test_cdxj_record_invalid_json_data():
     # Invalid JSON in CDXJ line
     invalid_cdxj_line = "com,example)/index 20241003000000 {\"url\": \"http://example.com/index\", \"status\": \"200\""
     
-    # Test that the invalid JSON raises a ValueError
-    with pytest.raises(ValueError):
-        CdxjRecord.from_cdxline(invalid_cdxj_line, wacz_file=None)
+    # Test that the invalid JSON returns None
+    assert CdxjRecord.from_cdxline(invalid_cdxj_line, wacz_file=None) is None
 
 
 def test_cdxj_record_empty_line():
-    # Test that an empty line raises a ValueError
-    with pytest.raises(ValueError, match=r"Invalid CDXJ line:"):
-        CdxjRecord.from_cdxline('', wacz_file=None)
+    # Test that an empty line returns None
+    assert CdxjRecord.from_cdxline('', wacz_file=None) is None
 
 
 def test_cdxj_record_no_data_field():
     # CDXJ line with no data field
     no_data_cdxj_line = "com,example)/index 20241003000000"
     
-    # Test that no data field raises a ValueError
-    with pytest.raises(ValueError, match=r"Invalid CDXJ line:"):
-        CdxjRecord.from_cdxline(no_data_cdxj_line, wacz_file=None)
+    # Test that no data field returns None
+    assert CdxjRecord.from_cdxline(no_data_cdxj_line, wacz_file=None) is None
+
+
+def test_cdxj_record_urn_pageinfo():
+    # CDXJ line starting with urn:pageinfo
+    url = "urn:pageinfo:https://example.com/index"
+    pageinfo_cdxj_line = url + " 20241003000000 {\"url\": \"" + url + "\" }"
+
+    # Test that the urn:pageinfo line returns None
+    assert CdxjRecord.from_cdxline(pageinfo_cdxj_line, wacz_file=None) is None