diff --git a/submit_ce/api/submit.py b/submit_ce/api/submit.py index c5dfc31d..364fc11b 100644 --- a/submit_ce/api/submit.py +++ b/submit_ce/api/submit.py @@ -65,11 +65,20 @@ from submit_ce.api.compile_service import CompileService from submit_ce.domain import Submission, Event, License +from submit_ce.domain.size_limits import SizeLimits from submit_ce.api.file_store import SubmissionFileStore class SubmitApi(ABC): + def get_size_limits(self) -> SizeLimits: + """The size limits used to flag oversize submissions. + + Defaults to the built-in 50 MB limits. Implementations with access to + application config (e.g. the Flask implementation) override this to + honor the configured ``MAX_*_KB`` values.""" + return SizeLimits.defaults() + @abstractmethod def get(self, submission_id: str) -> Submission: """ diff --git a/submit_ce/domain/event/file.py b/submit_ce/domain/event/file.py index f59445e5..b6a1c10b 100644 --- a/submit_ce/domain/event/file.py +++ b/submit_ce/domain/event/file.py @@ -9,6 +9,7 @@ from .base import EventWithSideEffect from ..submission import Submission from ..uploads import SubmitFile +from .. import size_limits import logging logger = logging.getLogger(__name__) @@ -25,6 +26,26 @@ def _common_file_change_execute(api: SubmitApi, submission: Submission) -> None: file_store.delete_preview(str(submission.submission_id)) +def _evaluate_oversize(api: SubmitApi, submission: Submission) -> bool: + """Measure the current workspace against the configured size limits. + + Called from ``execute`` (which has file-store access). The boolean result + is stored on the event so ``project`` can apply it deterministically on + replay, when ``execute`` does not run. Reads the authoritative post-change + workspace so the flag reflects *all* current files, not just the ones this + event touched. + """ + workspace = api.get_file_store().get_workspace(str(submission.submission_id)) + if workspace is None: + return False + per_file = {file.path: file.bytes for file in workspace.files} + total = workspace.size or 0 + category = (submission.primary_classification.category + if submission.primary_classification else None) + return size_limits.is_oversize(total, per_file, primary_category=category, + limits=api.get_size_limits()) + + class UploadArchive(EventWithSideEffect): """Uploads a zip or tgz file to the workspace, unpacking all the files.""" @@ -37,6 +58,9 @@ class UploadArchive(EventWithSideEffect): bytes_added: int = 0 """Bytes added by uploading this archive.""" + oversize: bool = False + """Whether the submission is oversize after this change (set in execute).""" + def validate(self, submission: Submission) -> None: validators.submission_is_not_finalized(self, submission) @@ -45,9 +69,11 @@ def execute(self, api: SubmitApi, submission: Submission) -> None: files = api.get_file_store().store_source_package(str(submission.submission_id), self.file, 4098) self.bytes_added = sum([file.bytes for file in files]) _common_file_change_execute(api, submission) + self.oversize = _evaluate_oversize(api, submission) def project(self, submission: Submission) -> Submission: submission.uncompressed_size += self.bytes_added + submission.is_oversize = self.oversize _common_file_change_project(submission) return submission @@ -66,6 +92,9 @@ class UploadFiles(EventWithSideEffect): Field(default_factory=list, exclude=True) bytes_added: int = 0 + oversize: bool = False + """Whether the submission is oversize after this change (set in execute).""" + def validate(self, submission: Submission) -> None: validators.submission_is_not_finalized(self, submission) @@ -76,9 +105,11 @@ def execute(self, api: SubmitApi, submission: Submission) -> None: stat=file_store.store_source_file(str(submission.submission_id), f, chunk_size=4096) self.bytes_added += stat.bytes _common_file_change_execute(api, submission) + self.oversize = _evaluate_oversize(api, submission) def project(self, submission: Submission) -> Submission: submission.uncompressed_size += self.bytes_added + submission.is_oversize = self.oversize _common_file_change_project(submission) return submission @@ -96,6 +127,9 @@ class RemoveFiles(EventWithSideEffect): bytes_removed:int = 0 """Bytes removed by removing these files.""" + oversize: bool = False + """Whether the submission is oversize after this change (set in execute).""" + def validate(self, submission: Submission) -> None: validators.submission_is_not_finalized(self, submission) @@ -109,9 +143,11 @@ def execute(self, api: SubmitApi, submission: Submission) -> None: self.bytes_removed += file.bytes _common_file_change_execute(api, submission) + self.oversize = _evaluate_oversize(api, submission) def project(self, submission: Submission) -> Submission: submission.uncompressed_size -= self.bytes_removed + submission.is_oversize = self.oversize _common_file_change_project(submission) return submission @@ -134,5 +170,6 @@ def execute(self, api: SubmitApi, submission: Submission) -> None: def project(self, submission: Submission) -> Submission: submission.source_format = None submission.uncompressed_size = 0 + submission.is_oversize = False _common_file_change_project(submission) return submission diff --git a/submit_ce/domain/event/tests/test_oversize_detection.py b/submit_ce/domain/event/tests/test_oversize_detection.py new file mode 100644 index 00000000..e3a0e6a9 --- /dev/null +++ b/submit_ce/domain/event/tests/test_oversize_detection.py @@ -0,0 +1,157 @@ +"""Tests for oversize detection wired into the file-upload events. + +These exercise ``execute``/``project`` directly with a hand-rolled fake API so +no Flask app or real file store is needed. +""" + +from datetime import datetime +from types import SimpleNamespace + +from pytz import UTC + +from submit_ce.domain import submission as submod, agent +from submit_ce.domain.meta import Classification +from submit_ce.domain.event.file import ( + UploadFiles, + UploadArchive, + RemoveFiles, + RemoveAllFiles, +) +from submit_ce.domain.size_limits import SizeLimits + +MB = 1024 * 1024 + + +class _FakeStore: + def __init__(self, workspace): + self.workspace = workspace + + def store_source_package(self, sid, content, chunk_size): + return [] + + def store_source_file(self, sid, content, chunk_size): + return SimpleNamespace(bytes=0, path=getattr(content, "filename", "f")) + + def delete_source_file(self, sid, name): + return None + + def delete_all_source_files(self, sid): + pass + + def get_workspace(self, sid): + return self.workspace + + def delete_preflight(self, sid): + pass + + def delete_preview(self, sid): + pass + + +class _FakeApi: + def __init__(self, workspace, limits=None): + self._store = _FakeStore(workspace) + self._limits = limits or SizeLimits.defaults() + + def get_file_store(self): + return self._store + + def get_size_limits(self): + return self._limits + + +def _ws(total, per_file=None): + per_file = per_file or {} + files = [SimpleNamespace(path=p, bytes=b) for p, b in per_file.items()] + return SimpleNamespace(size=total, files=files) + + +def _user(uid="u1"): + return agent.PublicUser(name="Test User", user_id=uid, + email=f"{uid}@example.org", endorsements=[]) + + +def _submission(category="astro-ph.GA"): + u = _user() + return submod.Submission( + creator=u, owner=u, created=datetime.now(UTC), + primary_classification=Classification(category=category)) + + +def test_submission_defaults_not_oversize(): + assert _submission().is_oversize is False + + +def test_upload_files_flags_oversize(): + s = _submission() + api = _FakeApi(_ws(60 * MB, {"huge.pdf": 60 * MB})) + e = UploadFiles(creator=s.creator, files=[]) + e.execute(api, s) + assert e.oversize is True + s = e.project(s) + assert s.is_oversize is True + + +def test_upload_files_within_limit_not_oversize(): + s = _submission() + api = _FakeApi(_ws(10 * MB, {"ok.pdf": 10 * MB})) + e = UploadFiles(creator=s.creator, files=[]) + e.execute(api, s) + s = e.project(s) + assert s.is_oversize is False + + +def test_upload_archive_flags_oversize(): + s = _submission() + api = _FakeApi(_ws(80 * MB, {"a.tex": 80 * MB})) + e = UploadArchive(creator=s.creator, file=None) + e.execute(api, s) + s = e.project(s) + assert s.is_oversize is True + + +def test_remove_files_clears_oversize(): + s = _submission() + s.is_oversize = True + api = _FakeApi(_ws(5 * MB, {"small.tex": 5 * MB})) + e = RemoveFiles(creator=s.creator, files=[]) + e.execute(api, s) + s = e.project(s) + assert s.is_oversize is False + + +def test_remove_all_files_clears_oversize(): + s = _submission() + s.is_oversize = True + e = RemoveAllFiles(creator=s.creator) + s = e.project(s) + assert s.is_oversize is False + + +def test_project_uses_persisted_flag_on_replay(): + # On replay execute() does not run; the flag comes from the stored event. + s = _submission() + e = UploadFiles(creator=s.creator, files=[], oversize=True) + s = e.project(s) + assert s.is_oversize is True + + +def test_no_workspace_is_not_oversize(): + s = _submission() + api = _FakeApi(None) + e = UploadFiles(creator=s.creator, files=[]) + e.execute(api, s) + assert e.oversize is False + + +def test_per_archive_limit_used_in_event(): + s = _submission(category="astro-ph.GA") + limits = SizeLimits( + max_uncompressed_total={"default": 100 * MB, "astro-ph": 5 * MB}, + max_uncompressed_per_file={"default": 100 * MB}, + max_compressed={"default": 100 * MB}, + ) + api = _FakeApi(_ws(10 * MB, {"f": 10 * MB}), limits=limits) + e = UploadFiles(creator=s.creator, files=[]) + e.execute(api, s) + assert e.oversize is True # 10 MB exceeds the 5 MB astro-ph total limit diff --git a/submit_ce/domain/size_limits.py b/submit_ce/domain/size_limits.py new file mode 100644 index 00000000..d9207d15 --- /dev/null +++ b/submit_ce/domain/size_limits.py @@ -0,0 +1,207 @@ +"""Size-limit policy for submissions. + +A port of the legacy ``arXiv/Submit/Size_limits.pm``. It keeps *policy* +(the limit values) separate from *enforcement* (the size check), so the +numbers can be tuned without touching the checking logic. + +There are three independent limits, each a mapping keyed by archive with a +``'default'`` fallback (matching legacy, where only ``'default'`` is +populated but per-archive overrides are supported): + +* ``max_uncompressed_total`` -- sum of all extracted files +* ``max_uncompressed_per_file`` -- any single extracted file +* ``max_compressed`` -- size of the compressed upload + +All limits default to 50,000 KB (50 MB), the legacy arXiv size guideline. + +Only the total and per-file uncompressed limits are *enforced* by +:func:`check_sizes` (matching legacy ``check_sizes``); ``max_compressed`` is +defined for completeness and future tuning but is not currently checked. + +Values are handled in **bytes** internally (matching +:attr:`submit_ce.domain.submission.Submission.uncompressed_size` and +:attr:`submit_ce.domain.uploads.FileStatus.bytes`); the documented defaults +are expressed in KB to mirror the legacy module. + +This module is pure domain code and must not depend on Flask. The two +environment escape hatches below mirror the legacy ``OVERRIDE``/``MAXSIZE`` +knobs and are read at call time so tests can raise the limits without +reconfiguring the app. +""" + +import os +from dataclasses import dataclass +from typing import List, Mapping, Optional + +from arxiv.taxonomy.definitions import CATEGORIES + +ONE_KB = 1024 +DEFAULT_MAX_SIZE_KB = 50_000 +"""Legacy arXiv size guideline: 50,000 KB (50 MB).""" + +DEFAULT_MAX_SIZE_BYTES = DEFAULT_MAX_SIZE_KB * ONE_KB + +DEFAULT_ARCHIVE = "default" +"""Fallback key used when a category has no archive-specific override.""" + +OVERRIDE_ENV = "SUBMIT_OVERSIZE_OVERRIDE" +"""When truthy, doubles every limit (port of legacy ``OVERRIDE``).""" + +MAXSIZE_ENV = "SUBMIT_OVERSIZE_MAXSIZE_KB" +"""When set to a number of KB, raises every limit to at least that value +(port of legacy ``MAXSIZE``).""" + + +def _truthy(value: Optional[str]) -> bool: + return bool(value) and value.strip().lower() not in ("0", "false", "no", "") + + +def _apply_env_escapes(base_bytes: int) -> int: + """Apply the ``MAXSIZE``/``OVERRIDE`` escape hatches to a limit.""" + maxsize = os.environ.get(MAXSIZE_ENV) + if maxsize: + try: + base_bytes = max(base_bytes, int(maxsize) * ONE_KB) + except ValueError: + pass + if _truthy(os.environ.get(OVERRIDE_ENV)): + base_bytes *= 2 + return base_bytes + + +def archive_for_category(category: Optional[str]) -> str: + """Return the archive key for a category, or ``'default'`` if unknown.""" + if category and category in CATEGORIES: + return CATEGORIES[category].in_archive + return DEFAULT_ARCHIVE + + +@dataclass(frozen=True) +class SizeLimits: + """The three size limits, each keyed by archive (values in bytes). + + Each mapping must contain a ``'default'`` entry. Per-archive overrides are + optional; lookups fall back to ``'default'``. + """ + + max_uncompressed_total: Mapping[str, int] + max_uncompressed_per_file: Mapping[str, int] + max_compressed: Mapping[str, int] + + @classmethod + def defaults(cls) -> "SizeLimits": + """Limits at the legacy default (50 MB), with env escapes applied.""" + return cls.from_kb(DEFAULT_MAX_SIZE_KB, + DEFAULT_MAX_SIZE_KB, + DEFAULT_MAX_SIZE_KB) + + @classmethod + def from_kb(cls, total_kb: int, per_file_kb: int, + compressed_kb: int) -> "SizeLimits": + """Build limits from KB values (e.g. from config), applying escapes.""" + return cls( + max_uncompressed_total={ + DEFAULT_ARCHIVE: _apply_env_escapes(total_kb * ONE_KB)}, + max_uncompressed_per_file={ + DEFAULT_ARCHIVE: _apply_env_escapes(per_file_kb * ONE_KB)}, + max_compressed={ + DEFAULT_ARCHIVE: _apply_env_escapes(compressed_kb * ONE_KB)}, + ) + + @staticmethod + def _lookup(table: Mapping[str, int], category: Optional[str]) -> int: + return table.get(archive_for_category(category), table[DEFAULT_ARCHIVE]) + + def total_limit(self, category: Optional[str] = None) -> int: + return self._lookup(self.max_uncompressed_total, category) + + def per_file_limit(self, category: Optional[str] = None) -> int: + return self._lookup(self.max_uncompressed_per_file, category) + + def compressed_limit(self, category: Optional[str] = None) -> int: + return self._lookup(self.max_compressed, category) + + +@dataclass(frozen=True) +class OversizeReason: + """One reason a submission is oversize, with human-readable text.""" + + kind: str + """``'total'`` or ``'per_file'``.""" + + message: str + limit: int + """The limit that was exceeded, in bytes.""" + + actual: int + """The measured size, in bytes.""" + + path: Optional[str] = None + """The offending file, for ``'per_file'`` reasons.""" + + +def _mb(num_bytes: int) -> str: + return f"{num_bytes / (ONE_KB * ONE_KB):.1f} MB" + + +def check_sizes(total_uncompressed: int, + per_file_sizes: Optional[Mapping[str, int]] = None, + primary_category: Optional[str] = None, + limits: Optional[SizeLimits] = None) -> List[OversizeReason]: + """Return the reasons a submission is oversize, or ``[]`` if within limits. + + Enforces the total-uncompressed and per-file-uncompressed limits, matching + the legacy ``check_sizes``. + + Parameters + ---------- + total_uncompressed + Sum of all extracted files, in bytes. + per_file_sizes + Mapping of file path to size in bytes. Empty/omitted skips the + per-file check. + primary_category + Primary classification category, used to select per-archive limits. + limits + Limits to enforce. Defaults to :meth:`SizeLimits.defaults`. + """ + limits = limits or SizeLimits.defaults() + per_file_sizes = per_file_sizes or {} + reasons: List[OversizeReason] = [] + + total_limit = limits.total_limit(primary_category) + if total_uncompressed > total_limit: + reasons.append(OversizeReason( + kind="total", + message=(f"Total uncompressed size {_mb(total_uncompressed)} " + f"exceeds the {_mb(total_limit)} limit."), + limit=total_limit, + actual=total_uncompressed, + )) + + per_file_limit = limits.per_file_limit(primary_category) + for path, size in per_file_sizes.items(): + if size > per_file_limit: + reasons.append(OversizeReason( + kind="per_file", + message=(f"File '{path}' is {_mb(size)}, exceeding the " + f"{_mb(per_file_limit)} per-file limit."), + limit=per_file_limit, + actual=size, + path=path, + )) + return reasons + + +def is_oversize(total_uncompressed: int, + per_file_sizes: Optional[Mapping[str, int]] = None, + primary_category: Optional[str] = None, + limits: Optional[SizeLimits] = None) -> bool: + """Return ``True`` if the submission exceeds any enforced limit.""" + return bool(check_sizes(total_uncompressed, per_file_sizes, + primary_category, limits)) + + +def summarize(reasons: List[OversizeReason]) -> str: + """Join reason messages into a single human-readable warning string.""" + return " ".join(reason.message for reason in reasons) diff --git a/submit_ce/domain/submission.py b/submit_ce/domain/submission.py index 66c3f8df..143d67aa 100644 --- a/submit_ce/domain/submission.py +++ b/submit_ce/domain/submission.py @@ -289,6 +289,12 @@ class Submission: source_format: Optional[SourceFormat] = field(default=None) uncompressed_size: int = field(default=0) + is_oversize: bool = field(default=False) + """Canonical oversize flag, set from the size check when files change. + + This is the flag, set at upload time; the auto-hold is a separate effect + applied at finalize. + """ preview: Optional[Preview] = field(default=None) metadata: SubmissionMetadata = field(default_factory=SubmissionMetadata) diff --git a/submit_ce/domain/tests/test_size_limits.py b/submit_ce/domain/tests/test_size_limits.py new file mode 100644 index 00000000..3c454bbf --- /dev/null +++ b/submit_ce/domain/tests/test_size_limits.py @@ -0,0 +1,163 @@ +"""Tests for the size-limit policy module (:mod:`submit_ce.domain.size_limits`).""" + +from submit_ce.domain import size_limits as sl +from submit_ce.domain.size_limits import ( + ONE_KB, + DEFAULT_MAX_SIZE_KB, + DEFAULT_MAX_SIZE_BYTES, + SizeLimits, + archive_for_category, + check_sizes, + is_oversize, + summarize, +) + +MB = ONE_KB * ONE_KB + + +# --- limit values & lookups ------------------------------------------------- + +def test_default_limit_is_50mb(): + assert DEFAULT_MAX_SIZE_KB == 50_000 + assert DEFAULT_MAX_SIZE_BYTES == 50_000 * ONE_KB + + +def test_defaults_populate_all_three_limits(): + limits = SizeLimits.defaults() + assert limits.total_limit() == DEFAULT_MAX_SIZE_BYTES + assert limits.per_file_limit() == DEFAULT_MAX_SIZE_BYTES + assert limits.compressed_limit() == DEFAULT_MAX_SIZE_BYTES + + +def test_from_kb_converts_to_bytes_independently(): + limits = SizeLimits.from_kb(total_kb=10, per_file_kb=20, compressed_kb=30) + assert limits.total_limit() == 10 * ONE_KB + assert limits.per_file_limit() == 20 * ONE_KB + assert limits.compressed_limit() == 30 * ONE_KB + + +def test_archive_for_category(): + assert archive_for_category("math.GT") == "math" + assert archive_for_category("astro-ph.GA") == "astro-ph" + assert archive_for_category("hep-th") == "hep-th" + assert archive_for_category(None) == "default" + assert archive_for_category("not.a.category") == "default" + + +def test_per_archive_override_falls_back_to_default(): + limits = SizeLimits( + max_uncompressed_total={"default": 100, "astro-ph": 999}, + max_uncompressed_per_file={"default": 100}, + max_compressed={"default": 100}, + ) + # astro-ph.GA resolves to the astro-ph override... + assert limits.total_limit("astro-ph.GA") == 999 + # ...while other categories fall back to 'default'. + assert limits.total_limit("math.GT") == 100 + assert limits.total_limit(None) == 100 + + +# --- check_sizes ------------------------------------------------------------ + +def test_within_limits_returns_no_reasons(): + limits = SizeLimits.from_kb(50_000, 50_000, 50_000) + reasons = check_sizes(total_uncompressed=10 * MB, + per_file_sizes={"a.tex": 1 * MB, "b.png": 5 * MB}, + limits=limits) + assert reasons == [] + assert is_oversize(10 * MB, {"a.tex": 1 * MB}, limits=limits) is False + + +def test_total_over_limit_flags_total(): + limits = SizeLimits.from_kb(50_000, 50_000, 50_000) + reasons = check_sizes(total_uncompressed=60 * MB, limits=limits) + assert len(reasons) == 1 + assert reasons[0].kind == "total" + assert reasons[0].actual == 60 * MB + assert reasons[0].limit == DEFAULT_MAX_SIZE_BYTES + assert is_oversize(60 * MB, limits=limits) is True + + +def test_per_file_over_limit_flags_each_file(): + limits = SizeLimits.from_kb(50_000, 50_000, 50_000) + reasons = check_sizes( + total_uncompressed=10 * MB, # total is fine + per_file_sizes={"small.tex": 1 * MB, "huge.dat": 60 * MB}, + limits=limits, + ) + assert len(reasons) == 1 + assert reasons[0].kind == "per_file" + assert reasons[0].path == "huge.dat" + assert reasons[0].actual == 60 * MB + + +def test_both_dimensions_can_trip(): + limits = SizeLimits.from_kb(50_000, 50_000, 50_000) + reasons = check_sizes( + total_uncompressed=120 * MB, + per_file_sizes={"huge.dat": 60 * MB}, + limits=limits, + ) + kinds = sorted(r.kind for r in reasons) + assert kinds == ["per_file", "total"] + + +def test_exactly_at_limit_is_not_oversize(): + limits = SizeLimits.from_kb(50_000, 50_000, 50_000) + assert check_sizes(DEFAULT_MAX_SIZE_BYTES, + {"f": DEFAULT_MAX_SIZE_BYTES}, + limits=limits) == [] + + +def test_compressed_limit_is_not_enforced(): + # A tiny per-file/total but a defined compressed limit: check_sizes only + # looks at total + per-file, never compressed. + limits = SizeLimits.from_kb(50_000, 50_000, compressed_kb=1) + assert check_sizes(10 * MB, {"a": 1 * MB}, limits=limits) == [] + + +def test_per_archive_limit_used_in_check(): + limits = SizeLimits( + max_uncompressed_total={"default": 100 * MB, "astro-ph": 5 * MB}, + max_uncompressed_per_file={"default": 100 * MB}, + max_compressed={"default": 100 * MB}, + ) + # 10MB is over the 5MB astro-ph limit but under the 100MB default. + assert is_oversize(10 * MB, primary_category="astro-ph.GA", limits=limits) + assert not is_oversize(10 * MB, primary_category="math.GT", limits=limits) + + +def test_summarize_joins_messages(): + limits = SizeLimits.from_kb(50_000, 50_000, 50_000) + reasons = check_sizes(120 * MB, {"huge.dat": 60 * MB}, limits=limits) + text = summarize(reasons) + assert "Total uncompressed size" in text + assert "huge.dat" in text + + +# --- environment escape hatches --------------------------------------------- + +def test_override_env_doubles_limits(monkeypatch): + monkeypatch.setenv(sl.OVERRIDE_ENV, "1") + limits = SizeLimits.defaults() + assert limits.total_limit() == 2 * DEFAULT_MAX_SIZE_BYTES + + +def test_override_env_falsey_does_nothing(monkeypatch): + monkeypatch.setenv(sl.OVERRIDE_ENV, "0") + assert SizeLimits.defaults().total_limit() == DEFAULT_MAX_SIZE_BYTES + + +def test_maxsize_env_raises_limits(monkeypatch): + monkeypatch.setenv(sl.MAXSIZE_ENV, "200000") # 200,000 KB + assert SizeLimits.defaults().total_limit() == 200_000 * ONE_KB + + +def test_maxsize_env_only_raises_never_lowers(monkeypatch): + monkeypatch.setenv(sl.MAXSIZE_ENV, "1") # below the default + assert SizeLimits.defaults().total_limit() == DEFAULT_MAX_SIZE_BYTES + + +def test_maxsize_env_ignores_garbage(monkeypatch): + monkeypatch.setenv(sl.MAXSIZE_ENV, "not-a-number") + assert SizeLimits.defaults().total_limit() == DEFAULT_MAX_SIZE_BYTES diff --git a/submit_ce/implementations/legacy_implementation/db.py b/submit_ce/implementations/legacy_implementation/db.py index a6e3acbc..e253ea9d 100644 --- a/submit_ce/implementations/legacy_implementation/db.py +++ b/submit_ce/implementations/legacy_implementation/db.py @@ -680,6 +680,7 @@ def to_submission(row: models.Submission, updated=row.get_updated(), source_format=source_format, uncompressed_size=uncompressed_size, + is_oversize=bool(row.is_oversize), submitter_is_author=bool(row.is_author), submitter_accepts_policy=bool(row.agree_policy), submitter_contact_verified=bool(row.userinfo), diff --git a/submit_ce/implementations/legacy_implementation/flask_impl.py b/submit_ce/implementations/legacy_implementation/flask_impl.py index d3c6383d..83cb603a 100644 --- a/submit_ce/implementations/legacy_implementation/flask_impl.py +++ b/submit_ce/implementations/legacy_implementation/flask_impl.py @@ -1,8 +1,10 @@ from arxiv.db import Session +from flask import current_app, has_app_context from . import LegacySubmitImplementation from sqlalchemy.orm import Session as SqlalchemySession +from submit_ce.domain.size_limits import SizeLimits, DEFAULT_MAX_SIZE_KB from submit_ce.implementations.compile.compile_api_service import CompileApiService def flask_get_session() -> SqlalchemySession: @@ -21,3 +23,14 @@ def __init__(self, compiler = compiler or CompileApiService() super().__init__(store=store, compiler=compiler) self.get_session = flask_get_session + + def get_size_limits(self) -> SizeLimits: + """Build size limits from the configured ``MAX_*_KB`` values.""" + if not has_app_context(): + return SizeLimits.defaults() + cfg = current_app.config + return SizeLimits.from_kb( + cfg.get("MAX_UNCOMPRESSED_TOTAL_KB", DEFAULT_MAX_SIZE_KB), + cfg.get("MAX_UNCOMPRESSED_PER_FILE_KB", DEFAULT_MAX_SIZE_KB), + cfg.get("MAX_COMPRESSED_KB", DEFAULT_MAX_SIZE_KB), + ) diff --git a/submit_ce/implementations/legacy_implementation/models.py b/submit_ce/implementations/legacy_implementation/models.py index 380f4f34..fc5403ba 100644 --- a/submit_ce/implementations/legacy_implementation/models.py +++ b/submit_ce/implementations/legacy_implementation/models.py @@ -320,6 +320,8 @@ def update_from_submission(self, submission: domain.Submission) -> None: self.source_size = submission.uncompressed_size + self.is_oversize = 1 if submission.is_oversize else 0 + if submission.source_format: self.source_format = submission.source_format.value else: diff --git a/submit_ce/ui/config.py b/submit_ce/ui/config.py index 6e9e8662..c2aab247 100644 --- a/submit_ce/ui/config.py +++ b/submit_ce/ui/config.py @@ -99,6 +99,18 @@ def __init__(self, **kwargs): """If true, only admin users can use the system. Intended to allowe closed to the public dev or beta system.""" + MAX_UNCOMPRESSED_TOTAL_KB: int = 50_000 + """Max total uncompressed submission size, in KB, before the submission is + flagged oversize (default 50 MB; the legacy arXiv size guideline).""" + + MAX_UNCOMPRESSED_PER_FILE_KB: int = 50_000 + """Max uncompressed size of any single file, in KB, before the submission + is flagged oversize (default 50 MB).""" + + MAX_COMPRESSED_KB: int = 50_000 + """Max compressed upload size, in KB. Defined for completeness; not + currently enforced (matches legacy check_sizes).""" + COMPILE_API_URL: str = "https://tex2pdf-api-default-874717964009.us-central1.run.app" """The tex2pdf-api url. Do not end with a /. diff --git a/submit_ce/ui/conftest.py b/submit_ce/ui/conftest.py index d8335dbe..76554428 100644 --- a/submit_ce/ui/conftest.py +++ b/submit_ce/ui/conftest.py @@ -4,6 +4,7 @@ import tempfile import uuid import logging +from types import SimpleNamespace from unittest.mock import MagicMock import arxiv.db.models as classic @@ -284,6 +285,50 @@ class _FakePdf: mock_store = MagicMock() mock_store.store_source_file.return_value = fake_stat + # The upload events read the workspace to evaluate the size limits. + small_ws = MagicMock() + small_ws.size = 10_000 + small_ws.files = [] + mock_store.get_workspace.return_value = small_ws + + original_store = current_app.api.store + current_app.api.store = mock_store + try: + submission, _ = current_app.api.save( + UploadFiles(creator=user, client=ua, files=[_FakePdf()]), + submission_id=sub_cross.submission_id, + ) + finally: + current_app.api.store = original_store + + return submission + + +@pytest.fixture(scope="function") +def sub_files_oversize(app, authorized_user, sub_cross): + """A submission whose uploaded files exceed the size limit. + + The submission is flagged ``is_oversize`` but is NOT yet on hold; the + auto-hold is only applied at finalize time.""" + with app.app_context(): + user = authorized_user + ua = InternalClient(name=f"test_client_{__file__}") + big = 60 * 1024 * 1024 # 60 MB, over the 50 MB default limit + + class _FakePdf: + filename = "huge.pdf" + content_type = "application/pdf" + stream = io.BytesIO(b"%PDF-1.4\n%%EOF\n") + + fake_stat = MagicMock() + fake_stat.bytes = big + + mock_store = MagicMock() + mock_store.store_source_file.return_value = fake_stat + big_ws = MagicMock() + big_ws.size = big + big_ws.files = [SimpleNamespace(path="huge.pdf", bytes=big)] + mock_store.get_workspace.return_value = big_ws original_store = current_app.api.store current_app.api.store = mock_store diff --git a/submit_ce/ui/controllers/new/tests/test_upload_delete.py b/submit_ce/ui/controllers/new/tests/test_upload_delete.py index 5cd08611..8bfc315a 100644 --- a/submit_ce/ui/controllers/new/tests/test_upload_delete.py +++ b/submit_ce/ui/controllers/new/tests/test_upload_delete.py @@ -40,6 +40,11 @@ def test_delete_file_post_confirmed(app, authorized_client, sub_files, mocker): mock_store = mocker.patch.object(app.api, 'store') mock_store.delete_source_file.return_value.bytes = 1 + # RemoveFiles re-evaluates the size limits, which reads the workspace. + ws = mocker.MagicMock() + ws.size = 0 + ws.files = [] + mock_store.get_workspace.return_value = ws resp = authorized_client.post(url, data={ 'csrf_token': csrf, diff --git a/submit_ce/ui/controllers/new/upload.py b/submit_ce/ui/controllers/new/upload.py index 51a1c8f5..082e87d0 100644 --- a/submit_ce/ui/controllers/new/upload.py +++ b/submit_ce/ui/controllers/new/upload.py @@ -251,6 +251,23 @@ def _get_upload(params: MultiDict, session: Session, submission: Submission, return rdata, status.OK, {} +def _flash_oversize_warning(submission: Submission) -> None: + """Warn the submitter that an oversize submission will be held for review. + + The submission is not rejected: the size check is a soft gate. The flag is + persisted during event save; the auto-hold is applied when the submission is + finalized.""" + if not submission.is_oversize: + return + alerts.flash_warning( + Markup( + 'This submission exceeds the arXiv size guideline. You can still ' + 'submit, but it will be placed on hold for moderator review. See ' + 'arxiv.org/help/sizes for ways to reduce ' + 'the size, or to request a size exception.'), + title='Submission is oversize') + + def _upload_archive(form: AddfilesForm, file: FileStorage, submitter: User, client: Client, submission: Submission, rdata: Dict[str, Any], token: str) \ @@ -287,6 +304,7 @@ def _upload_archive(form: AddfilesForm, file: FileStorage, f' package size is {converted_size}. See below for errors.', title='Upload complete, with errors' ) + _flash_oversize_warning(submission) alerts.flash_hidden(workspace.model_dump(), '_status') rdata.update({'status': workspace}) @@ -330,6 +348,7 @@ def _upload_files(form: AddfilesForm, file: FileStorage, f' package size is {converted_size}. See below for errors.', title='Upload complete, with errors' ) + _flash_oversize_warning(submission) alerts.flash_hidden(workspace.model_dump(), '_status') rdata.update({'status': workspace}) diff --git a/submit_ce/ui/tests/test_oversize_persistence.py b/submit_ce/ui/tests/test_oversize_persistence.py new file mode 100644 index 00000000..4002c641 --- /dev/null +++ b/submit_ce/ui/tests/test_oversize_persistence.py @@ -0,0 +1,50 @@ +"""Phase 3: the oversize flag is projected to the classic DB columns. + +Backward-compatibility contract: an oversize upload writes +``arXiv_submissions.is_oversize = 1`` (and leaves ``auto_hold = 0`` until +finalize), and the flag survives a domain -> DB -> domain round-trip. +""" + +from flask import current_app +from arxiv.db import Session +from sqlalchemy import text + + +def _row(sid): + return Session.execute( + text(""" + SELECT is_oversize, auto_hold + FROM arXiv_submissions + WHERE submission_id = :sid + """), + {"sid": sid}, + ).fetchone() + + +def test_oversize_upload_writes_is_oversize_column(app, sub_files_oversize): + with app.app_context(): + row = _row(sub_files_oversize.submission_id) + assert row is not None + assert row.is_oversize == 1 + # The auto-hold effect is not applied at upload time (unset/0). + assert not row.auto_hold + + +def test_normal_upload_writes_is_oversize_zero(app, sub_files): + with app.app_context(): + row = _row(sub_files.submission_id) + assert row is not None + assert row.is_oversize == 0 + + +def test_is_oversize_round_trips(app, sub_files_oversize): + """Reloading the submission reads the flag back from the DB column.""" + with app.app_context(): + reloaded = current_app.api.get(str(sub_files_oversize.submission_id)) + assert reloaded.is_oversize is True + + +def test_normal_submission_round_trips_false(app, sub_files): + with app.app_context(): + reloaded = current_app.api.get(str(sub_files.submission_id)) + assert reloaded.is_oversize is False diff --git a/submit_ce/ui/tests/test_oversize_upload.py b/submit_ce/ui/tests/test_oversize_upload.py new file mode 100644 index 00000000..2e12f4f0 --- /dev/null +++ b/submit_ce/ui/tests/test_oversize_upload.py @@ -0,0 +1,29 @@ +"""Phase 2 oversize tests at the UI/persistence level. + +These use the composable submission fixtures from ``submit_ce/ui/conftest.py``. +""" + +from flask import current_app + +from submit_ce.domain.event.file import UploadFiles + + +def test_oversize_upload_sets_flag(sub_files_oversize): + """An oversize upload flags the submission but does not hold it yet.""" + assert sub_files_oversize.is_oversize is True + # The auto-hold is only applied at finalize; a working submission is not + # on hold yet. + assert sub_files_oversize.is_on_hold is False + + +def test_normal_upload_not_oversize(sub_files): + assert sub_files.is_oversize is False + + +def test_oversize_flag_persists_on_event(app, sub_files_oversize): + """The event's oversize flag round-trips through the event history.""" + with app.app_context(): + _, history = current_app.api.get_with_history( + str(sub_files_oversize.submission_id)) + uploads = [e for e in history if isinstance(e, UploadFiles)] + assert uploads and uploads[-1].oversize is True