pcc/tests/python/test_boc_threading_proof.py at master · jiamo/pcc · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
"""Proof that pcc's threading model gives real free-threaded parallelism.

Builds two binaries with ``PCC_WITH_THREADS=1``:

  * ``benchmarks/python/boc_bank_demo.py`` — 4 pthreads each running a CPU-bound mixer.
  * ``benchmarks/python/boc_bank_demo_serial.py`` — same total work on one thread.

Pass criteria:

  1. Both binaries exit 0 within timeout.
  2. The parallel binary's stdout contains ``DONE`` and one ``t<i>`` line
     per worker thread (so all threads completed).
  3. Wall-clock(serial) / Wall-clock(parallel) >= 2.5x (``MIN_SPEEDUP``) —
     proves the pthread substrate runs in parallel rather than serializing
     through a GIL or shared lock.

Side effect: rebuilds ``pcc/py_runtime/libpy_runtime.a`` with
``PCC_WITH_THREADS=1`` and removes it on teardown so subsequent tests
get their own (default) build.
"""
from __future__ import annotations

import os
import shutil
import subprocess
import time
from pathlib import Path

import pytest


# Filesystem anchors, all derived from this file's own location.
# ``parents[2]`` is the directory two levels above the one containing
# this file; the paths below are resolved relative to it.
REPO = Path(__file__).absolute().parents[2]
# Directory holding the runtime archive that this test wipes and rebuilds.
PY_RUNTIME = REPO / "pcc" / "py_runtime"
# Demo sources compiled by the test: the multi-threaded program and its
# single-threaded counterpart doing the same total work.
PARALLEL_SRC = REPO / "benchmarks" / "python" / "boc_bank_demo.py"
SERIAL_SRC = REPO / "benchmarks" / "python" / "boc_bank_demo_serial.py"

# Minimum parallel speedup we require to count the proof as PASS.
# Empirically measured ~3.57x on a 4-thread macOS arm64 host; we set the
# bar at 2.5x to leave headroom for noisy CI hardware while still
# rejecting any run where threads serialized.
MIN_SPEEDUP = 2.5

# Expected number of worker threads in the parallel demo. The test
# requires exactly this many per-worker lines in the demo's stdout.
N_WORKERS = 4


def _archive_paths() -> tuple[Path, Path]:
    """Return the runtime static archive path and its stamp-file path.

    The archive is ``libpy_runtime.a`` directly under ``PY_RUNTIME``; the
    stamp is the archive's full path with ``.target`` appended.
    """
    runtime_lib = PY_RUNTIME / "libpy_runtime.a"
    # Formatting a Path in an f-string yields the same text as str(path).
    target_stamp = Path(f"{runtime_lib}.target")
    return runtime_lib, target_stamp


def _wipe_archive() -> None:
    """Delete the runtime archive, its stamp file, and the ``build`` directory.

    Missing files are tolerated: each file is only unlinked if it exists,
    and errors while removing ``PY_RUNTIME / "build"`` are ignored.
    """
    # _archive_paths() yields (archive, stamp); remove them in that order.
    for stale_file in _archive_paths():
        if stale_file.exists():
            stale_file.unlink()
    build_dir = PY_RUNTIME / "build"
    shutil.rmtree(build_dir, ignore_errors=True)


@pytest.fixture
def threaded_runtime(monkeypatch):
    """Provide a clean, threads-enabled runtime build environment for one test.

    Skips the test when running inside a pytest-xdist worker
    (``PYTEST_XDIST_WORKER`` is set), because the fixture deletes the
    shared runtime archive. Otherwise it sets three ``PCC_*`` environment
    variables through ``monkeypatch`` and wipes the archive both before
    the test runs and again on teardown.
    """
    under_xdist = os.environ.get("PYTEST_XDIST_WORKER")
    if under_xdist:
        pytest.skip("BoC threading proof mutates the shared runtime archive; run with -n0")

    # NOTE(review): these variables are presumably read by the runtime
    # build that compile_python triggers — confirm against the build code.
    build_env = (
        ("PCC_RUNTIME_CC", "cc"),
        ("PCC_RUNTIME_HIGH", "c"),
        ("PCC_WITH_THREADS", "1"),
    )
    for var_name, var_value in build_env:
        monkeypatch.setenv(var_name, var_value)

    # Start from a clean slate so no previously built archive is reused.
    _wipe_archive()
    yield
    # Remove the archive built during the test so later tests rebuild theirs.
    _wipe_archive()


def _run_binary(
    exe: Path, timeout: float = 60.0, *, runs: int = 3,
) -> tuple[float, str]:
    """Run ``exe`` ``runs`` times, return (min wall-clock, stdout-of-min-run).

    CPU-bound benchmarks have one-sided noise — extra time can leak in
    from cold cache, OS scheduling, the first subprocess spawn — but
    nothing makes a fixed number of integer ops finish faster than the
    hardware allows. The minimum across runs is the cleanest estimate.

    Args:
        exe: Path of the executable to launch (run with no arguments).
        timeout: Per-run timeout in seconds, passed to ``subprocess.run``.
        runs: Number of repetitions; must be at least 1. Defaults to 3.

    Returns:
        ``(elapsed_seconds, stdout)`` of the fastest run.

    Raises:
        ValueError: If ``runs`` is less than 1.
        AssertionError: If any run exits with a non-zero status.
        subprocess.TimeoutExpired: If any run exceeds ``timeout``.
    """
    # With zero runs the loop would never execute and the function would
    # return (inf, ""), which silently poisons any speedup computed from it.
    if runs < 1:
        raise ValueError(f"runs must be >= 1, got {runs}")

    best_elapsed = float("inf")
    best_stdout = ""
    for _ in range(runs):
        start = time.perf_counter()
        result = subprocess.run(
            [str(exe)], capture_output=True, text=True, timeout=timeout,
        )
        elapsed = time.perf_counter() - start
        assert result.returncode == 0, (
            f"{exe.name} exited {result.returncode}\n"
            f"stdout:\n{result.stdout}\nstderr:\n{result.stderr}"
        )
        # Keep the stdout that belongs to the fastest run seen so far.
        if elapsed < best_elapsed:
            best_elapsed = elapsed
            best_stdout = result.stdout
    return best_elapsed, best_stdout


def test_pcc_threads_give_real_parallel_speedup(tmp_path, threaded_runtime):
    """Compile both demos, validate their output, and require a speedup.

    Steps:
      1. Copy the parallel and serial demo sources into ``tmp_path`` and
         compile each with ``compile_python``.
      2. Run both binaries (non-zero exit fails inside ``_run_binary``).
      3. Check the parallel output has a ``DONE`` line and exactly
         ``N_WORKERS`` per-worker lines.
      4. Require ``serial_time / parallel_time >= MIN_SPEEDUP``.

    Step 4 is skipped (after steps 1-3 have passed) on hosts where the
    number of usable CPUs makes the required speedup unattainable, so a
    hardware limit is not reported as a threading regression.
    """
    from pcc.py_frontend.pipeline import compile_python

    parallel_src = tmp_path / "boc_bank_demo.py"
    serial_src = tmp_path / "boc_bank_demo_serial.py"
    shutil.copyfile(PARALLEL_SRC, parallel_src)
    shutil.copyfile(SERIAL_SRC, serial_src)
    parallel_exe = tmp_path / "parallel.out"
    serial_exe = tmp_path / "serial.out"

    compile_python(
        str(parallel_src), str(parallel_exe),
        ir_scaffold_mode="on", libpython_mode="off",
    )
    compile_python(
        str(serial_src), str(serial_exe),
        ir_scaffold_mode="on", libpython_mode="off",
    )

    serial_time, serial_out = _run_binary(serial_exe, timeout=60.0)
    parallel_time, parallel_out = _run_binary(parallel_exe, timeout=60.0)

    parallel_lines = [ln.strip() for ln in parallel_out.strip().splitlines()]
    assert "DONE" in parallel_lines, (
        f"parallel demo missing DONE marker.\noutput:\n{parallel_out}"
    )
    # A worker line starts with "t" and carries an " r=" field.
    worker_lines = [ln for ln in parallel_lines if ln.startswith("t") and " r=" in ln]
    assert len(worker_lines) == N_WORKERS, (
        f"expected {N_WORKERS} worker output lines, got "
        f"{len(worker_lines)}: {worker_lines}"
    )

    speedup = serial_time / parallel_time
    print(
        f"\n[boc-proof] serial={serial_time:.2f}s "
        f"parallel={parallel_time:.2f}s "
        f"speedup={speedup:.2f}x "
        f"(threshold={MIN_SPEEDUP}x)"
    )

    # The best possible speedup is bounded by min(usable CPUs, workers).
    # Prefer the scheduler affinity mask, which reflects CPU pinning;
    # it is not available on every platform, so fall back to the raw
    # CPU count there.
    try:
        usable_cpus = len(os.sched_getaffinity(0))
    except AttributeError:
        usable_cpus = os.cpu_count() or 1
    if min(usable_cpus, N_WORKERS) < MIN_SPEEDUP:
        pytest.skip(
            f"host exposes {usable_cpus} usable CPU(s); a {MIN_SPEEDUP}x "
            f"speedup is unattainable (measured {speedup:.2f}x). "
            "Functional checks passed."
        )

    assert speedup >= MIN_SPEEDUP, (
        f"insufficient parallel speedup: {speedup:.2f}x "
        f"(need >= {MIN_SPEEDUP}x). "
        f"serial={serial_time:.2f}s parallel={parallel_time:.2f}s. "
        f"This host exposes {usable_cpus} usable CPUs, enough for the "
        "required speedup — so this asserts that pthreads truly "
        "parallelize pcc-compiled Python code."
    )