-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_boc_threading_proof.py
More file actions
146 lines (119 loc) · 5.07 KB
/
Copy pathtest_boc_threading_proof.py
File metadata and controls
146 lines (119 loc) · 5.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
"""Proof that pcc's threading model gives real free-threaded parallelism.
Builds two binaries with ``PCC_WITH_THREADS=1``:
* ``benchmarks/python/boc_bank_demo.py`` — 4 pthreads each running a CPU-bound mixer.
* ``benchmarks/python/boc_bank_demo_serial.py`` — same total work on one thread.
Pass criteria:
1. Both binaries exit 0 within timeout.
2. The parallel binary's stdout contains ``DONE`` and one ``t<i>`` line
per worker thread (so all threads completed).
3. Wall-clock(serial) / Wall-clock(parallel) >= 2.5x (``MIN_SPEEDUP``) — proves the
pthread substrate runs in parallel rather than serializing through
a GIL or shared lock.
Side effect: rebuilds ``pcc/py_runtime/libpy_runtime.a`` with
``PCC_WITH_THREADS=1`` and removes it on teardown so subsequent tests
get their own (default) build.
"""
from __future__ import annotations
import os
import shutil
import subprocess
import time
from pathlib import Path
import pytest
# Third ancestor directory of this file (``parents[2]``).
# NOTE(review): assumes this test lives two directories below the repo root — confirm.
REPO = Path(__file__).absolute().parents[2]
# Directory where ``libpy_runtime.a``, its ``.target`` stamp and the ``build`` tree live.
PY_RUNTIME = REPO / "pcc" / "py_runtime"
# Benchmark sources compiled by the test: the 4-thread demo and its single-thread twin.
PARALLEL_SRC = REPO / "benchmarks" / "python" / "boc_bank_demo.py"
SERIAL_SRC = REPO / "benchmarks" / "python" / "boc_bank_demo_serial.py"
# Minimum parallel speedup we require to count the proof as PASS.
# Empirically measured ~3.57x on a 4-thread macOS arm64 host; we set the
# bar at 2.5x to leave headroom for noisy CI hardware while still
# rejecting any run where threads serialized.
MIN_SPEEDUP = 2.5
# Expected number of worker threads in the parallel demo.
N_WORKERS = 4
def _archive_paths() -> tuple[Path, Path]:
    """Return the runtime archive path and the path of its ``.target`` stamp.

    The archive is ``libpy_runtime.a`` inside ``PY_RUNTIME``; the stamp is
    the same path with ``.target`` appended to it.
    """
    runtime_archive = PY_RUNTIME / "libpy_runtime.a"
    target_stamp = Path(f"{runtime_archive}.target")
    return runtime_archive, target_stamp
def _wipe_archive() -> None:
    """Delete the runtime archive, its stamp file and the ``build`` tree.

    Files that do not exist are skipped, and any error while removing the
    ``build`` directory is ignored.
    """
    # Archive first, then stamp — the order _archive_paths() returns them in.
    for leftover in _archive_paths():
        if leftover.exists():
            leftover.unlink()
    build_dir = PY_RUNTIME / "build"
    shutil.rmtree(build_dir, ignore_errors=True)
@pytest.fixture
def threaded_runtime(monkeypatch):
    """Set up the environment for a thread-enabled runtime build.

    Skips the test when ``PYTEST_XDIST_WORKER`` is set, because the fixture
    deletes the shared runtime archive. Otherwise it exports the
    ``PCC_RUNTIME_CC``, ``PCC_RUNTIME_HIGH`` and ``PCC_WITH_THREADS``
    variables, wipes the archive before the test, and wipes it again on
    teardown.
    """
    running_under_xdist = bool(os.environ.get("PYTEST_XDIST_WORKER"))
    if running_under_xdist:
        pytest.skip("BoC threading proof mutates the shared runtime archive; run with -n0")

    build_env = (
        ("PCC_RUNTIME_CC", "cc"),
        ("PCC_RUNTIME_HIGH", "c"),
        ("PCC_WITH_THREADS", "1"),
    )
    for var_name, var_value in build_env:
        monkeypatch.setenv(var_name, var_value)

    _wipe_archive()  # drop any archive left over from an earlier build
    yield
    _wipe_archive()  # teardown: remove the archive built during the test
def _run_binary(
    exe: Path, timeout: float = 60.0, *, runs: int = 3,
) -> tuple[float, str]:
    """Run ``exe`` ``runs`` times, return (min wall-clock, stdout-of-min-run).

    CPU-bound benchmarks have one-sided noise — extra time can leak in
    from cold cache, OS scheduling, the first subprocess spawn — but
    nothing makes a fixed number of integer ops finish faster than the
    hardware allows. The minimum across runs is the cleanest estimate.

    Args:
        exe: Path of the executable; it is invoked with no arguments.
        timeout: Per-run limit in seconds, forwarded to ``subprocess.run``.
        runs: Number of timed runs (default 3). Must be at least 1.

    Returns:
        ``(elapsed_seconds, stdout)`` of the fastest run.

    Raises:
        ValueError: If ``runs`` is smaller than 1.
        AssertionError: If any run exits with a non-zero status.
        subprocess.TimeoutExpired: If any run exceeds ``timeout``.
    """
    if runs < 1:
        # Without at least one run there is no measurement to report; the
        # loop below would otherwise hand back (inf, "").
        raise ValueError(f"runs must be >= 1, got {runs}")

    best_elapsed = float("inf")
    best_stdout = ""
    for _ in range(runs):
        # The timer brackets the whole subprocess call, so process spawn
        # and teardown are part of every sample.
        start = time.perf_counter()
        result = subprocess.run(
            [str(exe)], capture_output=True, text=True, timeout=timeout,
        )
        elapsed = time.perf_counter() - start
        assert result.returncode == 0, (
            f"{exe.name} exited {result.returncode}\n"
            f"stdout:\n{result.stdout}\nstderr:\n{result.stderr}"
        )
        if elapsed < best_elapsed:
            best_elapsed = elapsed
            best_stdout = result.stdout
    return best_elapsed, best_stdout
def test_pcc_threads_give_real_parallel_speedup(tmp_path, threaded_runtime):
    """Compile both demos and require the parallel one to be truly faster.

    Steps:
      1. Copy the parallel and serial benchmark sources into ``tmp_path``
         and compile each with ``compile_python``.
      2. Time each binary with ``_run_binary`` (serial first, then parallel).
      3. Check the parallel output contains ``DONE`` and exactly
         ``N_WORKERS`` worker lines (lines starting with ``t`` that
         contain `` r=``).
      4. Assert ``serial_time / parallel_time >= MIN_SPEEDUP``.

    The test is skipped when the host reports fewer CPUs than
    ``MIN_SPEEDUP``, because the required ratio cannot be reached there.
    """
    from pcc.py_frontend.pipeline import compile_python

    # A speedup of MIN_SPEEDUP needs at least that many cores running at
    # once; on smaller hosts the final assertion would fail for reasons
    # unrelated to the threading model. os.cpu_count() may return None
    # (unknown) — in that case run the test as before.
    cpu_count = os.cpu_count()
    if cpu_count is not None and cpu_count < MIN_SPEEDUP:
        pytest.skip(
            f"host reports {cpu_count} CPU(s); a {MIN_SPEEDUP}x parallel "
            "speedup is not reachable"
        )

    parallel_src = tmp_path / "boc_bank_demo.py"
    serial_src = tmp_path / "boc_bank_demo_serial.py"
    shutil.copyfile(PARALLEL_SRC, parallel_src)
    shutil.copyfile(SERIAL_SRC, serial_src)

    parallel_exe = tmp_path / "parallel.out"
    serial_exe = tmp_path / "serial.out"
    compile_python(
        str(parallel_src), str(parallel_exe),
        ir_scaffold_mode="on", libpython_mode="off",
    )
    compile_python(
        str(serial_src), str(serial_exe),
        ir_scaffold_mode="on", libpython_mode="off",
    )

    # Only the timing of the serial run is used; its stdout is not checked.
    serial_time, serial_out = _run_binary(serial_exe, timeout=60.0)
    parallel_time, parallel_out = _run_binary(parallel_exe, timeout=60.0)

    parallel_lines = [ln.strip() for ln in parallel_out.strip().splitlines()]
    assert "DONE" in parallel_lines, (
        f"parallel demo missing DONE marker.\noutput:\n{parallel_out}"
    )
    # One ``t<i> ... r=...`` line per worker shows every thread finished.
    worker_lines = [ln for ln in parallel_lines if ln.startswith("t") and " r=" in ln]
    assert len(worker_lines) == N_WORKERS, (
        f"expected {N_WORKERS} worker output lines, got "
        f"{len(worker_lines)}: {worker_lines}"
    )

    speedup = serial_time / parallel_time
    print(
        f"\n[boc-proof] serial={serial_time:.2f}s "
        f"parallel={parallel_time:.2f}s "
        f"speedup={speedup:.2f}x "
        f"(threshold={MIN_SPEEDUP}x)"
    )
    assert speedup >= MIN_SPEEDUP, (
        f"insufficient parallel speedup: {speedup:.2f}x "
        f"(need >= {MIN_SPEEDUP}x). "
        f"serial={serial_time:.2f}s parallel={parallel_time:.2f}s. "
        "If this fires on a single-core CI host, lower MIN_SPEEDUP — "
        "but on multicore hosts this asserts that pthreads truly "
        "parallelize pcc-compiled Python code."
    )