openai · thesofakillers · Oct 31, 2025 · Oct 31, 2025
diff --git a/.github/workflows/paperbench_tests.yml b/.github/workflows/paperbench_tests.yml
@@ -31,6 +31,11 @@ jobs:
         lfs: true
         fetch-depth: 1
 
+    - name: Hydrate PaperBench data
+      run: |
+        git lfs fetch --include "project/paperbench/data/**" --exclude ""  # clear lfs.fetchexclude from .lfsconfig
+        git lfs checkout project/paperbench/data
+
     - name: Install uv
       uses: astral-sh/setup-uv@v5
 

diff --git a/.lfsconfig b/.lfsconfig
@@ -0,0 +1,3 @@
+[lfs]
+	fetchexclude = project/paperbench/data/**
+	skipdownloaderrors = true
diff --git a/project/paperbench/README.md b/project/paperbench/README.md
@@ -43,23 +43,34 @@ All commands in this README should be run from the [root](./) of the PaperBench
 
 ### Installation
 
-Install PaperBench with [uv](https://docs.astral.sh/uv/)[^1]
+Install PaperBench with [uv](https://docs.astral.sh/uv/)
 
 ```console
-UV_GIT_LFS=1 uv sync
+uv sync
 ```
 
-[^1]:
-    If you get an LFS error here, try cleaning your uv cache: `uv cache clean` and then re-running the command. See
-    [this issue comment](https://github.com/astral-sh/uv/issues/12938#issuecomment-2816186433)
-
 ### Get the data
 
-The dataset is stored using [Git-LFS](https://git-lfs.com/). Download and install LFS, then run:
+The dataset is stored using [Git-LFS](https://git-lfs.com/) and is intentionally not fetched during the install
+step above. Hydrate it manually and point PaperBench at the hydrated directory:
+
+```console
+git clone https://github.com/openai/frontier-evals.git --filter=blob:none
+cd frontier-evals
+git lfs fetch --include "project/paperbench/data/**" --exclude ""  # --exclude "" clears the .lfsconfig fetchexclude
+git lfs checkout project/paperbench/data
+export PAPERBENCH_DATA_DIR="$(pwd)/project/paperbench/data"  # add to your shell profile
+```
+
+If you are already working from a full `frontier-evals` clone, you can run the `git lfs fetch` / `git lfs checkout`
+commands from the repository root and skip setting `PAPERBENCH_DATA_DIR` (the default path resolves to
+`<repo>/project/paperbench/data`). When the data is stored elsewhere, set the environment variable to the location
+you hydrated.
+
+The JudgeEval tarballs that cannot be redistributed automatically can still be created with:
 
 ```console
-git lfs fetch --all
-git lfs pull
+PAPERBENCH_DATA_DIR=/path/to/data uv run python -m paperbench.judge.judge_eval.download_data
 ```
 
 ### Environment variables

diff --git a/project/paperbench/paperbench/judge/judge_eval/download_data.py b/project/paperbench/paperbench/judge/judge_eval/download_data.py
@@ -108,7 +108,7 @@ def process_benchmark(repo_url: str, commit_hash: str, output_dir: str) -> bool:
 
 
 def main() -> None:
-    data_dir = get_paperbench_data_dir()
+    data_dir = get_paperbench_data_dir(require_exists=False)
 
     for benchmark, config in REPOS.items():
         print(f"\nProcessing {benchmark}...")

diff --git a/project/paperbench/paperbench/utils.py b/project/paperbench/paperbench/utils.py
@@ -22,6 +22,25 @@
 P = ParamSpec("P")
 R = TypeVar("R")
 
+DATA_DIR_ENV_VAR = "PAPERBENCH_DATA_DIR"  # env var overriding the default data directory
+
+
+class PaperbenchDataNotFoundError(FileNotFoundError):
+    """Raised when the PaperBench data directory is missing; `candidate` is the path checked."""
+
+    def __init__(self, candidate: Path) -> None:
+        message = (
+            "Unable to locate the PaperBench data directory.\n"
+            f"Checked: {candidate}\n"
+            "If you installed PaperBench from a git dependency, the dataset is not pulled "
+            "automatically. Set the PAPERBENCH_DATA_DIR environment variable to point to a "
+            "clone of `frontier-evals` where you've run `git lfs fetch --include "
+            '"project/paperbench/data/**"` (and `git lfs checkout`), or hydrate the data in '
+            "another location and point PAPERBENCH_DATA_DIR at it."
+        )
+        super().__init__(message)
+        self.candidate = candidate
+
 
 def in_ci() -> bool:
     """Checks if the tests are running in CI."""
@@ -72,10 +91,31 @@ def get_root() -> Path:
     return path
 
 
-def get_paperbench_data_dir() -> Path:
-    """Returns an absolute path to the paperbench data directory."""
+def get_paperbench_data_dir(*, require_exists: bool = True) -> Path:
+    """Returns an absolute path to the PaperBench data directory.
+
+    Args:
+        require_exists: When True (default) we raise a PaperbenchDataNotFoundError if the
+            resolved directory is missing. When False we return the candidate directory and
+            create it if an explicit override was provided.
+    """
 
-    return get_root().parent / "data"
+    override = os.environ.get(DATA_DIR_ENV_VAR)
+    if override:
+        override_path = Path(override).expanduser()
+        if override_path.exists():
+            return override_path
+        if not require_exists:
+            override_path.mkdir(parents=True, exist_ok=True)
+            return override_path
+        raise PaperbenchDataNotFoundError(override_path)
+
+    default_path = get_root().parent / "data"
+    if default_path.exists():
+        return default_path
+    if require_exists:
+        raise PaperbenchDataNotFoundError(default_path)
+    return default_path
 
 
 def build_canonical_sub_path(run_dir: Path | str, timestamp: str) -> str: