diff --git a/README.md b/README.md index 8e6c5df..7e6148f 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,15 @@ skillspector scan https://github.com/user/my-skill skillspector scan ./my-skill.zip ``` +#### Size limits + +SkillSpector enforces two independent caps on remote and archive inputs to bound the impact of oversized downloads and zip bombs: + +- **Per-ingest cap**: `INGEST_MAX_BYTES` (100 MiB) — applied to streamed URL downloads, total uncompressed size of zip archives, and post-clone disk usage of Git repos. +- **Zip member cap**: `INGEST_MAX_ZIP_MEMBERS` (10,000) — caps the number of entries in a single zip. + +Note that the per-file 1 MB analysis cap (`MAX_FILE_BYTES`) is a separate, downstream limit: it bounds what individual analyzers will read out of an already-ingested directory. The ingest caps above bound how much content can land on disk in the first place. A breach of either ingest cap fails closed with an `IngestLimitExceededError`. + ### Output Formats ```bash diff --git a/src/skillspector/input_handler.py b/src/skillspector/input_handler.py index e511281..7bfd4e5 100644 --- a/src/skillspector/input_handler.py +++ b/src/skillspector/input_handler.py @@ -23,7 +23,11 @@ - Single markdown files - Local directories -Ported from legacy implementation. +Each remote/archive ingest path is bounded by ``INGEST_MAX_BYTES`` and +``INGEST_MAX_ZIP_MEMBERS`` so that the per-file analysis caps downstream +of ``InputHandler.resolve()`` are not defeated by an oversized download, +a zip bomb, or a too-large git clone. This file fails closed on any +ingest budget breach (closes #21 / #131). """ from __future__ import annotations @@ -41,6 +45,30 @@ logger = get_logger(__name__) +# Hard ceiling on what any single ingest path can pull into the temp dir. +# Sized above the per-file analysis cap (``MAX_FILE_BYTES`` = 1 MB) so a +# legitimate multi-file skill is not blocked at ingest, but tight enough +# to bound memory / disk DoS from a malicious source. +INGEST_MAX_BYTES = 100 * 1024 * 1024 # 100 MiB + +# Hard ceiling on the number of members in a zip we are willing to +# extract. Catches the "many tiny files" zip-bomb variant where each +# entry is small but the entry count itself exhausts the filesystem. +INGEST_MAX_ZIP_MEMBERS = 10_000 + +# Chunk size for streaming HTTP downloads. Small enough that the +# byte-count breach check fires promptly; large enough to keep syscall +# overhead reasonable on legitimate inputs. +_DOWNLOAD_CHUNK_BYTES = 64 * 1024 + + +class IngestLimitExceededError(ValueError): + """Raised when an ingest path exceeds an ``INGEST_MAX_*`` budget. + + Subclass of ``ValueError`` so existing callers that catch + ``ValueError`` from ``InputHandler.resolve()`` continue to work. + """ + class InputHandler: """ @@ -64,8 +92,10 @@ def resolve(self, input_path: str) -> tuple[Path, str]: source_type is one of: "git", "url", "zip", "file", "directory" Raises: - ValueError: If input type cannot be determined - FileNotFoundError: If local path doesn't exist + ValueError: If input type cannot be determined, or if an + ingest path exceeds ``INGEST_MAX_BYTES`` / + ``INGEST_MAX_ZIP_MEMBERS`` (``IngestLimitExceededError``). + FileNotFoundError: If local path doesn't exist. """ input_path = input_path.strip() @@ -123,7 +153,7 @@ def _is_file_url(self, path: str) -> bool: return not self._is_git_url(path) def _clone_git(self, url: str) -> Path: - """Clone a Git repository to a temporary directory.""" + """Clone a Git repository to a temporary directory, bounded by ``INGEST_MAX_BYTES``.""" temp_dir = self._get_temp_dir() clone_dir = temp_dir / "repo" try: @@ -145,33 +175,106 @@ def _clone_git(self, url: str) -> Path: raise ValueError( "Git is not installed. Please install git to scan repositories." ) from None + + # Post-clone size check: a successful --depth 1 clone may still + # land an arbitrarily large tree on disk. Reject (and clean up) + # if it exceeds the ingest budget. + total = _directory_size_bytes(clone_dir) + if total > INGEST_MAX_BYTES: + shutil.rmtree(clone_dir, ignore_errors=True) + logger.warning( + "Git clone of %s exceeded ingest cap: %d > %d bytes", + url, + total, + INGEST_MAX_BYTES, + ) + raise IngestLimitExceededError( + f"Git clone exceeded ingest cap: {total} bytes > " + f"INGEST_MAX_BYTES ({INGEST_MAX_BYTES})" + ) return clone_dir def _download_file(self, url: str) -> Path: - """Download a file from URL to a temporary directory.""" + """Download a file from URL to a temporary directory. + + Streams the body to disk in chunks while running a byte counter. + The cap check fires before each chunk is written, so a breach + aborts immediately without accumulating the body in memory. A + partial file produced by a mid-stream breach is removed before + the exception propagates. + """ temp_dir = self._get_temp_dir() parsed = urlparse(url) filename = Path(parsed.path).name or "SKILL.md" + # Write to a stable target path inside the temp dir so we can + # rename / move it after the download succeeds without ever + # holding the body in memory. Use a sentinel name for the + # download itself; we rename / replace at the end. + download_path = temp_dir / "_download.partial" + content_type = "" try: with httpx.Client(follow_redirects=True, timeout=30) as client: - response = client.get(url) - response.raise_for_status() - content = response.content + with client.stream("GET", url) as response: + response.raise_for_status() + content_type = response.headers.get("content-type", "") + # Cheap up-front check: trust Content-Length when the + # server provides it, so we abort before reading any + # body bytes. Streaming check below covers the case + # where the header is missing or wrong. + declared = response.headers.get("content-length") + if declared is not None: + try: + declared_bytes = int(declared) + except ValueError: + # Malformed header — fall through to the + # streamed byte counter, which is authoritative. + declared_bytes = None + if declared_bytes is not None and declared_bytes > INGEST_MAX_BYTES: + raise IngestLimitExceededError( + f"Download exceeded ingest cap: " + f"Content-Length {declared} bytes > " + f"INGEST_MAX_BYTES ({INGEST_MAX_BYTES})" + ) + + received = 0 + with download_path.open("wb") as out: + for chunk in response.iter_bytes(_DOWNLOAD_CHUNK_BYTES): + received += len(chunk) + if received > INGEST_MAX_BYTES: + raise IngestLimitExceededError( + f"Download exceeded ingest cap: streamed " + f"{received} bytes > INGEST_MAX_BYTES " + f"({INGEST_MAX_BYTES})" + ) + out.write(chunk) except httpx.HTTPError as e: + # Best-effort cleanup of any partial download. + download_path.unlink(missing_ok=True) logger.warning("Download failed for %s: %s", url, e) raise ValueError(f"Failed to download file: {e}") from e - if filename.endswith(".zip") or ( - response.headers.get("content-type", "").startswith("application/zip") - ): + except IngestLimitExceededError: + # Don't leave the partial bomb on disk. + download_path.unlink(missing_ok=True) + raise + + is_zip = filename.endswith(".zip") or content_type.startswith("application/zip") + if is_zip: zip_path = temp_dir / "download.zip" - zip_path.write_bytes(content) + download_path.replace(zip_path) return self._extract_zip(zip_path) file_path = temp_dir / filename - file_path.write_bytes(content) + download_path.replace(file_path) return temp_dir def _extract_zip(self, zip_path: Path) -> Path: - """Extract a zip file to a temporary directory.""" + """Extract a zip file, bounded by ``INGEST_MAX_BYTES`` and ``INGEST_MAX_ZIP_MEMBERS``. + + Sums ``ZipInfo.file_size`` (uncompressed size) across all members + before extracting and refuses to extract if either the total or + the member count exceeds the cap. This rejects classic zip + bombs (small archive, huge declared uncompressed size) without + materialising any of the bomb on disk. + """ if not zip_path.exists(): raise FileNotFoundError(f"Zip file not found: {zip_path}") from None temp_dir = self._get_temp_dir() @@ -179,6 +282,19 @@ def _extract_zip(self, zip_path: Path) -> Path: extract_dir.mkdir(exist_ok=True) try: with zipfile.ZipFile(zip_path, "r") as zf: + infos = zf.infolist() + if len(infos) > INGEST_MAX_ZIP_MEMBERS: + raise IngestLimitExceededError( + f"Zip exceeded ingest cap: {len(infos)} members > " + f"INGEST_MAX_ZIP_MEMBERS ({INGEST_MAX_ZIP_MEMBERS})" + ) + total_uncompressed = sum(info.file_size for info in infos) + if total_uncompressed > INGEST_MAX_BYTES: + raise IngestLimitExceededError( + f"Zip exceeded ingest cap: uncompressed " + f"{total_uncompressed} bytes > INGEST_MAX_BYTES " + f"({INGEST_MAX_BYTES})" + ) zf.extractall(extract_dir) except zipfile.BadZipFile: logger.warning("Invalid zip or extract failed: %s", zip_path) @@ -196,3 +312,22 @@ def _wrap_single_file(self, file_path: Path) -> Path: dest = temp_dir / file_path.name shutil.copy2(file_path, dest) return temp_dir + + +def _directory_size_bytes(path: Path) -> int: + """Return the total size of all regular files under *path*, in bytes. + + Symlinks are not followed (``Path.is_file()`` returns ``True`` only + for regular files), so a malicious symlink to ``/dev/zero`` cannot + cause a runaway count. + """ + total = 0 + for p in path.rglob("*"): + if p.is_file() and not p.is_symlink(): + try: + total += p.stat().st_size + except OSError: + # File disappeared mid-walk (race with concurrent fs ops). + # Skip rather than fail the whole ingest. + continue + return total diff --git a/tests/unit/test_input_handler_bounds.py b/tests/unit/test_input_handler_bounds.py new file mode 100644 index 0000000..76eba5e --- /dev/null +++ b/tests/unit/test_input_handler_bounds.py @@ -0,0 +1,369 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for ingest-layer size bounds in ``InputHandler``. + +Covers the three ingest paths the bounded-reads work in PR #19 deferred +to a follow-up (issues #21 / #131): URL download, zip extraction, and +git clone. Each is bounded by ``INGEST_MAX_BYTES`` (and zip is also +bounded by ``INGEST_MAX_ZIP_MEMBERS``); each must fail closed with a +clear error message rather than letting the per-file analysis cap be +defeated upstream. +""" + +from __future__ import annotations + +import struct +import subprocess +import zipfile +from collections.abc import Callable +from pathlib import Path + +import httpx +import pytest + +from skillspector.input_handler import ( + INGEST_MAX_BYTES, + INGEST_MAX_ZIP_MEMBERS, + IngestLimitExceededError, + InputHandler, +) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _patch_httpx_client(monkeypatch: pytest.MonkeyPatch, handler: Callable) -> None: + """Patch ``httpx.Client`` so ``InputHandler._download_file`` uses a MockTransport.""" + import skillspector.input_handler as ih + + real_client = httpx.Client + + def factory(*args: object, **kwargs: object) -> httpx.Client: + kwargs["transport"] = httpx.MockTransport(handler) + return real_client(*args, **kwargs) + + monkeypatch.setattr(ih.httpx, "Client", factory) + + +def _make_zip(zip_path: Path, members: list[tuple[str, bytes]]) -> None: + """Write ``members`` as a real zip file to ``zip_path``.""" + with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf: + for name, data in members: + zf.writestr(name, data) + + +def _make_bomb_zip(zip_path: Path, declared_uncompressed: int) -> None: + """Forge a zip whose ``ZipInfo.file_size`` declares an oversized member. + + We can't easily construct a true compression bomb in-test, but the + extractor's check is against the declared uncompressed size from the + central directory. We write a one-member zip and then rewrite the + uncompressed-size field in the central directory record. + """ + name = "bomb.bin" + payload = b"a" # one real byte + with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf: + zf.writestr(name, payload) + + # Patch the central directory's "uncompressed size" field for the + # one member. Format from PKZIP APPNOTE 4.4.13: + # Central directory record: 4-byte sig (0x02014b50), then + # 2 version-made-by, 2 version-needed, 2 flags, 2 method, + # 2 mtime, 2 mdate, 4 crc32, + # 4 compressed size, 4 uncompressed size, ... + # So uncompressed-size offset within the record is 24 bytes from sig. + raw = zip_path.read_bytes() + sig = b"\x50\x4b\x01\x02" + idx = raw.find(sig) + assert idx >= 0, "central directory record not found" + uncomp_offset = idx + 24 + patched = ( + raw[:uncomp_offset] + struct.pack(" None: + body = b"# small markdown\n" + + def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response(200, content=body) + + _patch_httpx_client(monkeypatch, handler) + + h = InputHandler() + try: + resolved, source_type = h.resolve("https://example.com/skill.md") + assert source_type == "url" + assert (resolved / "skill.md").read_bytes() == body + finally: + h.cleanup() + + def test_content_length_header_rejected_before_body_read( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Server declares an oversized Content-Length → reject before reading body. + + httpx normalises the ``content`` arg's length into Content-Length, + so we ship a chunked stream and inject a forged header via a raw + ``httpx.Response`` constructed from a byte-stream + explicit headers. + """ + oversized = INGEST_MAX_BYTES + 1 + + def handler(request: httpx.Request) -> httpx.Response: + # Drop Transfer-Encoding to be sure Content-Length is the + # only size signal; ship a tiny body so iter_bytes() would + # complete almost instantly if we ever got there. + return httpx.Response( + 200, + stream=httpx.ByteStream(b"x"), + headers={"content-length": str(oversized)}, + ) + + _patch_httpx_client(monkeypatch, handler) + + h = InputHandler() + try: + with pytest.raises(IngestLimitExceededError, match="Content-Length"): + h.resolve("https://example.com/huge.md") + finally: + h.cleanup() + + def test_streamed_body_overflow_rejected_when_header_missing( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + """No Content-Length header → streamed byte-counter must catch overflow. + + Use a generator-backed stream so httpx cannot pre-compute and + attach a Content-Length header, then ship oversized bytes. + """ + + def body_iter(): + chunk = b"x" * (64 * 1024) + # Yield enough chunks to exceed the cap. + sent = 0 + while sent <= INGEST_MAX_BYTES + 1024: + yield chunk + sent += len(chunk) + + class _GenStream(httpx.SyncByteStream): + def __iter__(self): + return body_iter() + + def close(self): # noqa: D401 - protocol method + pass + + def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response(200, stream=_GenStream()) + + _patch_httpx_client(monkeypatch, handler) + + h = InputHandler() + try: + with pytest.raises(IngestLimitExceededError, match="streamed"): + h.resolve("https://example.com/huge.bin") + finally: + h.cleanup() + + def test_streamed_overflow_leaves_no_partial_file_on_disk( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + """A breach mid-stream must clean up the partial file. + + Closes the security-review finding: even when the cap fires, + the bytes written before the breach must not survive on disk. + Otherwise an attacker can still fill the temp dir up to + ~INGEST_MAX_BYTES by sending exactly one byte over the cap. + """ + + def body_iter(): + chunk = b"x" * (64 * 1024) + sent = 0 + while sent <= INGEST_MAX_BYTES + 1024: + yield chunk + sent += len(chunk) + + class _GenStream(httpx.SyncByteStream): + def __iter__(self): + return body_iter() + + def close(self): # noqa: D401 - protocol method + pass + + def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response(200, stream=_GenStream()) + + _patch_httpx_client(monkeypatch, handler) + + h = InputHandler() + try: + with pytest.raises(IngestLimitExceededError): + h.resolve("https://example.com/huge.bin") + temp = h.temp_dir_for_cleanup() + assert temp is not None + # The partial download file must not survive the breach. + assert not (temp / "_download.partial").exists() + assert not (temp / "huge.bin").exists() + assert not (temp / "download.zip").exists() + finally: + h.cleanup() + + def test_download_streams_to_disk_not_memory(self, monkeypatch: pytest.MonkeyPatch) -> None: + """A legitimate download must write incrementally to disk. + + Verifies the body is not buffered as a single ``bytes`` object + in memory — the streaming refactor uses ``file.write()`` per + chunk. We can't directly measure peak memory in a unit test, + but we can assert the on-disk file ends up at the same size as + the bytes the server shipped, with no intermediate concatenation. + """ + # 5 MiB body — well under the cap, large enough that a single + # ``b''.join(chunks)`` would be a visible allocation if it ever + # happened. + body = b"a" * (5 * 1024 * 1024) + + def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response(200, content=body) + + _patch_httpx_client(monkeypatch, handler) + + h = InputHandler() + try: + resolved, source_type = h.resolve("https://example.com/medium.bin") + assert source_type == "url" + assert (resolved / "medium.bin").stat().st_size == len(body) + # And the sentinel partial-download path must not survive. + assert not (resolved / "_download.partial").exists() + finally: + h.cleanup() + + +# --------------------------------------------------------------------------- +# Zip +# --------------------------------------------------------------------------- + + +class TestZipBound: + """``_extract_zip`` refuses zip bombs and member-count bombs.""" + + def test_under_cap_zip_succeeds(self, tmp_path: Path) -> None: + zip_path = tmp_path / "ok.zip" + _make_zip(zip_path, [("SKILL.md", b"# skill")]) + + h = InputHandler() + try: + resolved, source_type = h.resolve(str(zip_path)) + assert source_type == "zip" + assert resolved.is_dir() + assert (resolved / "SKILL.md").exists() + finally: + h.cleanup() + + def test_declared_uncompressed_oversize_rejected_before_extract(self, tmp_path: Path) -> None: + """Classic zip bomb: small archive, declared-uncompressed size > cap.""" + zip_path = tmp_path / "bomb.zip" + _make_bomb_zip(zip_path, declared_uncompressed=INGEST_MAX_BYTES + 1) + + h = InputHandler() + try: + with pytest.raises(IngestLimitExceededError, match="uncompressed"): + h.resolve(str(zip_path)) + # Crucially: nothing extracted. The extract dir may exist + # (we mkdir before pre-checking) but must be empty. + temp = h.temp_dir_for_cleanup() + assert temp is not None + extract_dir = temp / "extracted" + if extract_dir.exists(): + assert list(extract_dir.iterdir()) == [] + finally: + h.cleanup() + + def test_too_many_members_rejected(self, tmp_path: Path) -> None: + zip_path = tmp_path / "many.zip" + # One byte each, but more entries than the member cap. + members = [(f"file{i}.txt", b"x") for i in range(INGEST_MAX_ZIP_MEMBERS + 1)] + _make_zip(zip_path, members) + + h = InputHandler() + try: + with pytest.raises(IngestLimitExceededError, match="members"): + h.resolve(str(zip_path)) + finally: + h.cleanup() + + +# --------------------------------------------------------------------------- +# Git clone +# --------------------------------------------------------------------------- + + +class TestGitCloneBound: + """``_clone_git`` rejects clones whose on-disk size exceeds the cap.""" + + def test_under_cap_clone_succeeds( + self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path + ) -> None: + def fake_run(cmd, **kwargs): + # cmd is ["git", "clone", "--depth", "1", url, str(clone_dir)] + clone_dir = Path(cmd[-1]) + clone_dir.mkdir(parents=True, exist_ok=True) + (clone_dir / "SKILL.md").write_text("# small") + return subprocess.CompletedProcess(cmd, 0, b"", b"") + + monkeypatch.setattr(subprocess, "run", fake_run) + + h = InputHandler() + try: + resolved, source_type = h.resolve("https://github.com/foo/bar") + assert source_type == "git" + assert (resolved / "SKILL.md").exists() + finally: + h.cleanup() + + def test_oversize_clone_rejected_and_cleaned_up( + self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path + ) -> None: + big = b"x" * (INGEST_MAX_BYTES + 1) + + def fake_run(cmd, **kwargs): + clone_dir = Path(cmd[-1]) + clone_dir.mkdir(parents=True, exist_ok=True) + (clone_dir / "huge.bin").write_bytes(big) + return subprocess.CompletedProcess(cmd, 0, b"", b"") + + monkeypatch.setattr(subprocess, "run", fake_run) + + h = InputHandler() + try: + with pytest.raises(IngestLimitExceededError, match="Git clone"): + h.resolve("https://github.com/foo/huge-repo") + # Failed clone must be cleaned up. + temp = h.temp_dir_for_cleanup() + assert temp is not None + assert not (temp / "repo").exists() + finally: + h.cleanup()