From 1d151d0503cee1b668312ecbb25acac7a69ff22a Mon Sep 17 00:00:00 2001 From: Zhifei Li Date: Tue, 23 Jun 2026 22:16:58 -0700 Subject: [PATCH 01/13] chore(eval): make the reproduction package self-contained and public-safe Reproduction: - Consolidate to a single entry doc: rename eval/REPRODUCE.md -> eval/README.md (auto-renders on the eval/ dir) and update all references. - Document the three ways to supply tile images to the reader: self-hosted serve with materialized tiles, the public search API, and a self-hosted serve that renders tiles on demand from a kiwix ZIM. Make TILES_DIR optional so the reader can use serve-returned base64 tiles instead of a local corpus. - Remove the reader-side "local-wiki" rendering path entirely so all tile rendering happens serve-side: drop LocalWikiTiledScreenshotRetriever, the lookup_reference_url machinery, and the --local-wiki / --local-wiki-screenshot-dir / --lookup-reference-url flags (they relied on an out-of-tree module and hardcoded placeholder paths). - Run benchmarks on their full (filtered) sets; keep the 1000-example subsample only for nq and sqa, which is what the paper reports. Hygiene: - Read the Jina API key from JINA_API_KEY instead of a hardcoded default. - Update stale notes now that the indexes and tile corpus are published on HF. - Drop internal working-notes docs from the repo and scrub leftover references to old internal repo/module names. - Tidy .gitignore to use generic patterns and add a scoped eval/.gitignore. 
--- .gitignore | 24 - .../internal/screenshot-optimization-notes.md | 239 -- docs/reproducing_paper.md | 592 ----- .../plans/2026-05-11-pixelrag-restructure.md | 1315 ---------- .../plans/2026-05-25-pixelrag-frontend.md | 2313 ----------------- .../2026-05-27-chromium-build-centralia.md | 438 ---- .../2026-05-11-pixelrag-restructure-design.md | 359 --- .../2026-05-25-pixelrag-frontend-design.md | 290 --- eval/.gitignore | 6 + eval/PAPER_EXPERIMENT_MAP.md | 129 - eval/{REPRODUCE.md => README.md} | 62 +- eval/REPRODUCE_PROGRESS.txt | 366 --- eval/lib/__init__.py | 2 - eval/lib/benchmarks.py | 8 +- eval/lib/grader.py | 6 +- eval/lib/retrieval.py | 425 +-- eval/lib/retrievers.py | 20 +- eval/lib/simpleqa_data.py | 2 +- eval/pyproject.toml | 8 +- eval/reproduce.sh | 26 +- eval/run_bench.py | 47 +- eval/run_livevqa.py | 2 +- eval/serve_up.sh | 2 +- 23 files changed, 78 insertions(+), 6603 deletions(-) delete mode 100644 docs/internal/screenshot-optimization-notes.md delete mode 100644 docs/reproducing_paper.md delete mode 100644 docs/superpowers/plans/2026-05-11-pixelrag-restructure.md delete mode 100644 docs/superpowers/plans/2026-05-25-pixelrag-frontend.md delete mode 100644 docs/superpowers/plans/2026-05-27-chromium-build-centralia.md delete mode 100644 docs/superpowers/specs/2026-05-11-pixelrag-restructure-design.md delete mode 100644 docs/superpowers/specs/2026-05-25-pixelrag-frontend-design.md create mode 100644 eval/.gitignore delete mode 100644 eval/PAPER_EXPERIMENT_MAP.md rename eval/{REPRODUCE.md => README.md} (65%) delete mode 100644 eval/REPRODUCE_PROGRESS.txt diff --git a/.gitignore b/.gitignore index a28f164..c82149b 100644 --- a/.gitignore +++ b/.gitignore @@ -37,33 +37,9 @@ logs/ *.log arxiv demos/e2e/output/ -eval/eval_output/ .superpowers/ .vercel -# Large local retrieval artifacts (not committed) -eval/tmp_news_state.db -eval/live_pixel_full.json -eval/live_reader_full.json -eval/frozen_reader_full.json -eval/mms_base_live.jsonl 
-eval/mms_lora_live.jsonl -eval/mms_naive_live.jsonl -eval/evqa_base_landmarks_live.jsonl -eval/evqa_base_inat_live.jsonl -eval/evqa_lora_landmarks_live.jsonl -eval/evqa_lora_inat_live.jsonl -eval/mms_traf_live.jsonl -eval/evqa_traf_landmarks.jsonl -eval/evqa_traf_inaturalist.jsonl -eval/evqa_naive_landmarks.jsonl -eval/evqa_naive_inaturalist.jsonl -eval/mms_naive_nothink.jsonl -eval/evqa_base_landmarks_nothink.jsonl -eval/evqa_base_inat_nothink.jsonl -eval/evqa_lora_landmarks_nothink.jsonl -eval/evqa_lora_inat_nothink.jsonl -eval/paper_grader_out/ node_modules/ .next/ diff --git a/docs/internal/screenshot-optimization-notes.md b/docs/internal/screenshot-optimization-notes.md deleted file mode 100644 index d16c5d9..0000000 --- a/docs/internal/screenshot-optimization-notes.md +++ /dev/null @@ -1,239 +0,0 @@ -# Screenshot Throughput Optimization — Working Progress - -## Target: 150 t/s @ 100% correct (8192px tiles, maxi Wikipedia) - -## Current Best - -| Config | t/s | Correct | Notes | -|--------|-----|---------|-------| -| multi-process 48w (frameStoppedLoading) | **91** | 100% ✓ | Stable, production-ready | -| multi-process 48w (frameNavigated) | **98** | 100% ✓ | Stable (igpu incompatible) | -| multi-process 48w (2000 art) | **113** | 99.8% ✓ | Steady-state | -| igpu 48w + frameStoppedLoading | **117-132** | 90-97% | Fast but 3-10% about:blank | -| igpu 48w + directClip | **128-148** | 48-90% | Fastest, worst correctness | - -## Production System Comparison - -The wiki-screenshot production system (`~/pixelrag-src/wiki-screenshot/`) uses: -```python -wait_fonts = False # for kiwix/ZIM datasource -wait_images = False # for kiwix/ZIM datasource -pre_screenshot_delay = 0.5 # fixed 500ms sleep, no fonts.ready -``` -- Playwright-based (not CDP websocket) -- GPU-accelerated (8× L40S per machine) -- Multi-machine: 4 machines × ~70-80 t/s = ~290 t/s total -- Full Wikipedia (8.28M articles) processed in ~1 day - -Our optimizations added `fonts.ready + eager images + 
double-rAF` for pixel-perfect -correctness. Production skips these waits entirely (`pre_screenshot_delay=0` in -coordinator). This is safe for Kiwix because all assets (including fonts) are served -from localhost — they load before `wait_until="load"` fires. - -Gemini Vision validation of 5000 production tiles: -- 0% BROKEN_RENDER, 0% ERROR_PAGE (rendering is correct without font wait) -- 12% BLANK/PARTIAL_BLANK (tile loop overshoots page height — separate bug) - -**Benchmark result**: Removing font/image wait gives only +4% throughput (99 vs 96 t/s) -because nav is not the bottleneck — capture IPC is. The 290 t/s production rate comes -from 4 machines × GPU acceleration, not from skipping font waits. - -## Pipeline Bottleneck Analysis - -``` -Stage Capacity Bottleneck? -Nav 430 pg/s No (3.4x headroom) -Capture 125 t/s YES (C/T_c = 48/321ms) - -Steady-state theoretical: 125-150 t/s -Actual (200 art): 98 t/s (75% utilization, 25% = nav serial) -Actual (2000 art): 113 t/s (85% utilization) -``` - -Per-capture breakdown at 48 concurrent: -- IPC roundtrip: 181ms (ForceRedraw browser→renderer→compositor, 8 async hops) -- DrawRenderPass: 62ms (composite 136 quads) -- CopyDrawnRenderPass: 46ms (memcpy 28MB) - -Throughput = `C / T_c(C)` converges at ~125-130 t/s (USL contention curve). -Nav latency (186ms) does not affect steady-state throughput (Little's Law). -Minimum workers to saturate capture: `C × (1 + T_nav/T_cap) = 72`. 
- -## Chromium Patches (in custom build) - -| Patch | File | Impact | -|-------|------|--------| -| rawFilePath | page_handler.cc + Page.pdl | Async write raw BGRA to /dev/shm (ThreadPool) | -| directClip | page_handler.cc + Page.pdl | CopyFromSurface(src_rect) without emulation change | -| skipRedraw | page_handler.cc + Page.pdl | ForceRedrawWithCallback → CopyFromSurface | -| ForceRedrawWithCallback | render_widget_host_impl.cc | Lightweight ForceRedraw with commit callback | -| directClip ForceRedraw fix | page_handler.cc | directClip also does ForceRedraw before copy | - -## Strategy Architecture - -Strategies separated from bench framework: -- `pixelrag_render.strategies/` — capture strategies (CDPPhased, CDPSequential, etc.) -- `pixelrag_render.bench/` — measurement harness with GT validation + experiment dump -- `Bench` class: `bench.run(strategy)` → GT cache + capture + verify + JSON dump - -### CDPPhasedStrategy (best strategy) -- Work-stealing queue (asyncio.Queue, not round-robin) -- Semaphore-limited concurrent captures -- `wait_for_event("Page.frameStoppedLoading")` filtered by main frameId -- Per-tile semaphore release (fine-grained pipelining) -- Configurable: tile_height, nav_timeout, use_direct_clip, extra_chrome_args - -### WebsocketConnection -- Background `_recv_loop` for multiplexed CDP -- `wait_for_event(method, timeout, filter_fn)` for async event listening -- Supports concurrent `cdp()` calls via pending futures dict - -## What Was Tried - -### Worked -- ✅ rawFilePath: async write bypasses PNG encoding (+15%) -- ✅ directClip: parallel tile capture within viewport -- ✅ Phased strategy: semaphore-limited captures reduce contention (+15%) -- ✅ Work-stealing queue: better load balancing -- ✅ frameNavigated/frameStoppedLoading wait: fixes igpu about:blank race -- ✅ Presentation feedback ForceRedraw: 100% correct (but slower) - -### Partially Worked -- ⚠️ --in-process-gpu: 120+ t/s but 5-10% about:blank captures -- ⚠️ SwapPromise ForceRedraw: 
shot_p50 325→303ms (7% gain) -- ⚠️ directClip for all tiles: fast but correctness depends on ForceRedraw - -### Did Not Work -- ❌ --single-process: 168 t/s but 74% correct -- ❌ peekPixels (SkiaRenderer): headless uses SoftwareRenderer -- ❌ Immediate BeginFrame feedback flush: breaks frame pipeline -- ❌ CDPScreenshotNewSurface: RequestRepaintOnNewSurface overhead -- ❌ 2-tab pipelining: Chrome UI thread serializes ForceRedraw -- ❌ Chrome flags (disable-lcd-text etc.): ±2% -- ❌ headless_shell: slower than chrome (no shared HTTP cache) -- ❌ One-shot strategy: launch overhead 1-2s/process -- ❌ Firefox Playwright: 2.6x slower than Chrome -- ❌ Servo (servoshell 0.1.0): stub package, not ready -- ❌ CEF (cefpython3): abandoned, no modern Python wheel -- ❌ WebKitGTK snapshot: needs GPU/display access -- ❌ RequestRepaintOnNewSurface in skipRedraw: didn't fix igpu race -- ❌ Bitmap dimension retry: about:blank renders at full viewport size -- ❌ Pixel content retry: can't distinguish white page from about:blank - -## igpu About:blank Root Cause - -Chrome `--in-process-gpu` has two bugs at 48 concurrent workers: -1. **frameNavigated event not fired**: Chrome sometimes silently drops - `Page.frameNavigated` CDP event under high concurrency. - Fix: use `Page.frameStoppedLoading` (always reliable). -2. **Compositor surface race**: ForceRedraw's presentation feedback arrives - before the new page's CompositorFrame is activated in viz. CopyFromSurface - reads the old surface (about:blank at 875×8192, indistinguishable from - real page by dimensions). No reliable Python-side detection possible. 
- -## Key Analysis Methods Used - -- **Pipeline bottleneck analysis** (closed queueing model) -- **Little's Law**: steady-state throughput = C/T_c when capture-bound -- **USL contention curve**: C/T_c(C) convergence at ~125-130 t/s -- **USE method**: Utilization (79%), Saturation (semaphore queue), Errors (0) -- **Per-capture breakdown**: DrawRenderPass (57ms) + CopyDrawnRenderPass (18ms) - + IPC overhead (95ms) measured via Chromium instrumentation - -## Scale Estimate - -30M tiles (18.7M articles × ~1.6 tiles/article): -- Single machine 98 t/s: 30M/98 = 85 hours = **3.5 days** -- Single machine 120 t/s (igpu, 95% correct): 30M/120 = 69 hours = **2.9 days** -- 4 machines × 98 t/s = 392 t/s: 30M/392 = 21 hours = **< 1 day** -- Production system (290 t/s, 4 machines): ~1 day (matches historical data) - -## Production Pipeline: fast_cdp backend - -``` -Chrome 48w (capture) → /dev/shm (raw BGRA) → ProcessPool 4w (JPEG) → disk - 98 t/s 28MB/tile ~100 t/s 100KB/tile -``` - -Architecture: -- `render_articles()` in `pixelrag_render.backends.fast_cdp` -- Capture: CDPPhasedStrategy logic (work-stealing, semaphore, frameStoppedLoading) -- Compression: `concurrent.futures.ProcessPoolExecutor(4)` — GIL-free, separate cores -- Raw files in /dev/shm/pixelrag_render/ — auto-deleted after compression -- Output: JPEG tiles + tiles.json manifest per article - -Key: compression never blocks capture. Chrome writes raw → returns immediately. -Compression reads raw file asynchronously on different CPU cores. - -128-core machine: 48 cores for Chrome, 4 cores for JPEG, 76 cores idle. -JPEG compression of 875×8192 takes ~10-20ms → 4 cores handle 200-400 t/s → -plenty of headroom over 98 t/s capture rate. 
- -Storage: 30M tiles × 100KB JPEG = ~3 TB - -## GPU Acceleration (Brewster H200 findings) - -Lab machines have 8× H200/B200 GPUs but: -- `/dev/dri/renderD*` needs `render` group membership (no sudo) -- Docker daemon not running; rootless docker lacks nvidia-container-toolkit -- SwiftShader (CPU Vulkan) doesn't improve throughput vs software rendering -- headless Chrome ignores `--use-gl` flags (GPU process crashes on init) -- When GPU DOES init (via Xvfb + ANGLE), missing NVIDIA userspace drivers in container - -To unlock GPU: `sudo usermod -aG render $USER` on lab machine. -Expected impact: 4x faster DrawRenderPass based on production system data. - -## Backend reconciliation & SPA-render fix (2026-06-11) - -### The three render code paths (who actually runs what) -- `backends/websocket.py` — the **shipped** general-purpose renderer. The `pixelshot` - CLI, the `pixelbrowse` skill, and the `pixelrag index` pipeline (`render_urls`, - `backend="cdp"`/`"websocket"`) all go through it. Simple: per-worker queue, inline - JPEG over CDP, no extra deps. -- `backends/fast_cdp.py` — high-throughput batch path (`render_articles`): phased-logic - capture + rawFilePath to /dev/shm + ProcessPool JPEG. **No in-repo caller** — invoked - only by an out-of-repo ops script. The 8.28M flagship Wikipedia index was built by a - *separate* system (Playwright/GPU/4-machine, see "Production System Comparison"), not - by either of these. -- `strategies/*` — the benchmarking menu; used only by `bench/`. Kept as research scaffolding. - -### Regression fixed: websocket backend rendered SPAs / tall pages wrong -`backends/websocket.py` had drifted from the established capture pattern — it had **no -nav-completion wait** (fired `document.fonts.ready` immediately after `Page.navigate`) -and **no per-tile scroll**, both of which `fast_cdp` and the production strategies have. 
-Consequences: -- JS/SPA pages were measured/captured mid-hydration at a transient (often much taller) - layout → tiled into mostly-empty space = blank tiles (this is the "tile loop overshoots - page height" blank bug noted under "Production System Comparison", here root-caused). -- At small `tile_height` (the skill uses 1568) every tile past the first was blank, - because content below the short device viewport is never rasterized without scrolling. - -Fix (verified in `bench/` against ground truth at 100% on the smoke set): -- Wait for the `load` event before measuring/capturing (`readyState==='complete'` - shortcut + 12s cap). SSR pages fire `load` ~as fast as `fonts.ready`, so ~0 cost - (measured: Wikipedia render time unchanged). -- Scroll each tile into view before capture (mirrors `fast_cdp`). -- Optional `--wait-network-idle` (JS PerformanceObserver) for pages that fetch content - after load; off by default (costs a quiet window/page), on by default in the skill. - -### Raw vs inline-JPEG is the dominant throughput lever (measured, 48w, N=600, this box) -| config | correct | t/s | note | -|---|---|---|---| -| phased **raw** (fast_cdp config) | 99.7% | **306** | capture-only in bench; JPEG is decoupled/parallel | -| phased jpeg (inline) | 98.2% | 182 | Chrome encodes JPEG on the capture critical path | -| sequential raw | 99.7% | 221 | | -| sequential jpeg (inline) | 98.2% | 142 | | - -Takeaways: (1) **inline JPEG encoding is the bottleneck** — bypassing it with rawFilePath -+ parallel compression is ~+56-68%. (2) phased's semaphore/work-stealing buys ~+38% over -sequential **in raw mode** (in jpeg mode the encoding bottleneck masks it to ~+8% — an -earlier jpeg-only comparison was misleading). So `fast_cdp` is ~2x the simple inline path -at batch scale and is **kept**. Absolute t/s here is optimistic (capture-only, short -window, 128-core box) vs the ~91-113 production figure; the *ratios* are the point. 
- -### Design direction -Ship **one simple backend** (`websocket.py`, inline JPEG) for the CLI/skill/`pixelrag index` -— that scale doesn't need the raw+decoupled machinery, and the flagship index uses the -separate system anyway. Keep `fast_cdp` + `strategies/` as batch/research code. The shared -capture-readiness logic (load wait, scroll) should eventually live in one place so the -shipped backend can't silently drift from the correct pattern again. diff --git a/docs/reproducing_paper.md b/docs/reproducing_paper.md deleted file mode 100644 index f27da6c..0000000 --- a/docs/reproducing_paper.md +++ /dev/null @@ -1,592 +0,0 @@ -# Reproducing Paper Results - -> **Paper**: *PixelRAG: Retrieval and Generation in Pixel Space over Millions of Web Screenshots* -> -> This document maps every table and figure in the paper to the exact commands needed to reproduce the numbers. - -## Prerequisites - -### Infrastructure - -| Component | Description | Where | -|-----------|-------------|-------| -| **Wikipedia tile index (base)** | 28M vectors, Qwen3-VL-Embedding-2B (pretrained) | `pixelrag-data/search_index/` (215 GB FAISS IVF, dim=2048) | -| **Wikipedia tile index (fine-tuned)** | 26M vectors, LoRA checkpoint-200 | `pixelrag-data/search_index_lora_vit_ckpt200_v2/` (202 GB) | -| **Wikipedia text index** | 15.7M text chunks (1024 tokens, Trafilatura) | `pixelrag-data/text_search_index_1024/` (121 GB) | -| **Article metadata** | URL↔tile mapping for 7.1M articles | `pixelrag-data/articles.json` (199 MB) | -| **Tile images** | ~30M PNG tiles (1024×1024) | Remote NFS or local SSD (~5.6 TB) | -| **News tile index** | 3.6M tiles (BBC/AP/CNN) for LiveVQA | S3: `s3://wiki-screenshot-tiles-backup/kiwix_tiles/news_image_search_index/` | -| **News text index** | 866K text chunks for news | S3: `s3://wiki-screenshot-tiles-backup/kiwix_tiles/news_text_search_index/` | -| **News tiles** | Raw PNG tiles for news articles | S3: `s3://wiki-screenshot-tiles-backup/kiwix_tiles/news_tiles/` | -| 
**LoRA adapter** | Fine-tuned embedding LoRA weights | S3: `s3://wiki-screenshot-tiles-backup/kiwix_tiles/adapters/lora_vit_ckpt200/` | -| **Kiwix ZIM** | Offline Wikipedia for HTML baselines | S3: `s3://wiki-screenshot-tiles-backup/kiwix_tiles/zim/` | - -All S3 paths use AWS profile `leann` (`aws s3 --profile leann ...`). - -### Services to Start - -```bash -# 1. Screenshot search API (port 30888) — serves the pixel tile index -pixelrag-serve \ - --index-dir pixelrag-data/search_index \ # or search_index_lora_vit_ckpt200_v2 - --tiles-dir /path/to/wikipedia_tiles \ - --articles-json pixelrag-data/articles.json \ - --model Qwen/Qwen3-VL-Embedding-2B \ - --device cuda --port 30888 - -# 2. Text search API (port 30889) — serves the text chunk index -pixelrag-serve \ - --index-dir pixelrag-data/text_search_index_1024 \ - --tiles-dir /path/to/text_chunks \ - --articles-json pixelrag-data/articles.json \ - --model Qwen/Qwen3-VL-Embedding-2B \ - --device cuda --port 30889 - -# 3. Reader model (port 8000) — vLLM serving Qwen3.5-4B (default reader) -vllm serve Qwen/Qwen3.5-4B-Instruct \ - --port 8000 --tensor-parallel-size 1 \ - --max-model-len 32768 -``` - -### Environment - -```bash -cd ~/pixelrag/eval - -# Install eval dependencies (one-time) -uv pip install pandas tqdm trafilatura openai aiohttp datasets huggingface-hub - -# For grading -export OPENAI_API_KEY=sk-... 
# GPT-4.1 judge -``` - ---- - -## Table 1: Main Results (6 Benchmarks × 4 Methods) - -**Reader**: Qwen3.5-4B, **k=3**, **Grader**: GPT-4.1 judge (except LiveVQA = exact match) - -### No Retrieval (baseline) - -```bash -# SimpleQA — no retrieval -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --num-examples 1000 --no-think - -# NQ — no retrieval -python run_bench.py \ - --task nq --model Qwen/Qwen3.5-4B-Instruct \ - --num-examples 1000 --no-think - -# NQ-Tables — no retrieval -python run_bench.py \ - --task nq_tables --model Qwen/Qwen3.5-4B-Instruct \ - --num-examples 1000 --no-think - -# MMSearch — no retrieval (300 examples) -python run_bench.py \ - --task mmsearch --model Qwen/Qwen3.5-4B-Instruct \ - --num-examples 300 --no-think - -# EVQA — no retrieval (landmarks, automatic only, n=749) -python run_bench.py \ - --task encyclopedic_vqa --model Qwen/Qwen3.5-4B-Instruct \ - --evqa-dataset-filter landmarks --evqa-question-type-filter automatic \ - --num-examples 749 --no-think - -# LiveVQA — see "LiveVQA Separate Pipeline" section below -``` - -### Text Retrieval — Trafilatura (Text → Text) - -Requires: text search API on port 30889 with Trafilatura-parsed text chunks. 
- -```bash -# SimpleQA — Trafilatura text retrieval -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --text-api --text-api-url http://localhost:30889/search \ - --retrieval-top-k 3 --num-examples 1000 --no-think - -# NQ — Trafilatura text retrieval -python run_bench.py \ - --task nq --model Qwen/Qwen3.5-4B-Instruct \ - --text-api --text-api-url http://localhost:30889/search \ - --retrieval-top-k 3 --num-examples 1000 --no-think - -# NQ-Tables -python run_bench.py \ - --task nq_tables --model Qwen/Qwen3.5-4B-Instruct \ - --text-api --text-api-url http://localhost:30889/search \ - --retrieval-top-k 3 --num-examples 1000 --no-think - -# MMSearch (multimodal query: text + image → text index) -python run_bench.py \ - --task mmsearch --model Qwen/Qwen3.5-4B-Instruct \ - --text-api --text-api-url http://localhost:30889/search \ - --retrieval-top-k 3 --num-examples 300 --no-think - -# EVQA -python run_bench.py \ - --task encyclopedic_vqa --model Qwen/Qwen3.5-4B-Instruct \ - --text-api --text-api-url http://localhost:30889/search \ - --evqa-dataset-filter landmarks --evqa-question-type-filter automatic \ - --retrieval-top-k 3 --num-examples 749 --no-think - -# LiveVQA — see "LiveVQA Separate Pipeline" section below -``` - -### Text Retrieval — mwparserfromhell - -Same as Trafilatura but requires a separate text index built with mwparserfromhell parser. -The text API must be started pointing to that index. - -```bash -# Same commands as Trafilatura above, but --text-api-url points to -# the mwparserfromhell text index API (different port or index-dir). -# The parser choice is baked into the index at build time, not a runtime flag. -``` - -### PixelRAG (base) — Screenshot → Screenshot - -Requires: screenshot search API on port 30888 with base (pretrained) embedding index. 
- -```bash -# SimpleQA — pixel retrieval (base) -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --local-api --local-api-url http://localhost:30888/search \ - --retrieval-top-k 3 --num-examples 1000 --no-think - -# NQ — pixel retrieval (base) -python run_bench.py \ - --task nq --model Qwen/Qwen3.5-4B-Instruct \ - --local-api --local-api-url http://localhost:30888/search \ - --retrieval-top-k 3 --num-examples 1000 --no-think - -# NQ-Tables -python run_bench.py \ - --task nq_tables --model Qwen/Qwen3.5-4B-Instruct \ - --local-api --local-api-url http://localhost:30888/search \ - --retrieval-top-k 3 --num-examples 1000 --no-think - -# MMSearch (multimodal: query image sent alongside text) -python run_bench.py \ - --task mmsearch --model Qwen/Qwen3.5-4B-Instruct \ - --local-api --local-api-url http://localhost:30888/search \ - --retrieval-top-k 3 --num-examples 300 --no-think - -# EVQA (multimodal: landmark photo + question text) -python run_bench.py \ - --task encyclopedic_vqa --model Qwen/Qwen3.5-4B-Instruct \ - --local-api --local-api-url http://localhost:30888/search \ - --evqa-dataset-filter landmarks --evqa-question-type-filter automatic \ - --retrieval-top-k 3 --num-examples 749 --no-think - -# LiveVQA — see "LiveVQA Separate Pipeline" section below -``` - -### PixelRAG (fine-tuned) — Screenshot → Screenshot with LoRA embedding - -Same commands as PixelRAG (base), but the search API must be started with the fine-tuned index: - -```bash -# Start search API with fine-tuned index -pixelrag-serve \ - --index-dir pixelrag-data/search_index_lora_vit_ckpt200_v2 \ - --tiles-dir /path/to/wikipedia_tiles \ - --articles-json pixelrag-data/articles.json \ - --model Qwen/Qwen3-VL-Embedding-2B \ - --peft-adapter /path/to/lora_checkpoint_200 \ - --device cuda --port 30888 -``` - -Then run the same `--local-api` commands above. 
- -### Grading - -```bash -cd ~/pixelrag/eval - -# Grade with GPT-4.1 judge (Wikipedia QA tasks) -python grade.py simpleqa eval_output/simpleqa_*.jsonl -python grade.py encyclopedic_vqa eval_output/encyclopedic_vqa_*.jsonl -python grade.py mmsearch eval_output/mmsearch_*.jsonl - -# For NQ/NQ-Tables (with LLM judge for paper numbers) -python grade.py nq eval_output/nq_*.jsonl --llm-judge -python grade.py nq_tables eval_output/nq_tables_*.jsonl --llm-judge - -# For LiveVQA (exact letter match — handled by the LiveVQA pipeline scripts) -``` - ---- - -## Table 3: Retrieval–Reader Modality Ablation - -**Task**: SimpleQA (1000) + LiveVQA (6632), **Reader**: Qwen3.5-4B, **k=3**, -**Embedding**: Qwen3-VL-Embedding-2B (base, no LoRA) - -| Row | Retrieval | Reader Input | Flags | -|-----|-----------|-------------|-------| -| Screenshot → Screenshot | Pixel index | Raw tile images | `--local-api` | -| Screenshot → OCR text | Pixel index | OCR'd text from tiles | `--local-api --read-as-text-ocr` | -| Text → Rendered image | Text index | Text chunks rendered as PNG | `--text-api --render-as-image` | -| Text → Text | Text index | Raw text chunks | `--text-api` | -| Text → HTML | Text index | Raw HTML from kiwix | `--text-api --html-dom-lookup` | - -```bash -# Screenshot → Screenshot (same as main results PixelRAG base) -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --local-api --local-api-url http://localhost:30888/search \ - --retrieval-top-k 3 --num-examples 1000 --no-think - -# Screenshot → OCR text -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --local-api --local-api-url http://localhost:30888/search \ - --read-as-text-ocr --ocr-url http://localhost:8202/v1 \ - --retrieval-top-k 3 --num-examples 1000 --no-think - -# Text → Rendered image -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --text-api --text-api-url http://localhost:30889/search \ - --render-as-image \ - --retrieval-top-k 
3 --num-examples 1000 --no-think - -# Text → Text (same as main results Trafilatura) -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --text-api --text-api-url http://localhost:30889/search \ - --retrieval-top-k 3 --num-examples 1000 --no-think - -# Text → HTML (DOM lookup) -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --text-api --text-api-url http://localhost:30889/search \ - --html-dom-lookup \ - --retrieval-top-k 3 --num-examples 1000 --no-think -``` - -For LiveVQA, use the separate pipeline (see "LiveVQA Separate Pipeline" section) with the corresponding ablation scripts. - ---- - -## Table 4: Embedding Training Recipe Ablation - -**Evaluated on mini-datastore** (400 queries, 7426 tiles). - -This ablation uses `--prebuilt-tiles-dir` pointing to the pre-built mini-datastore, with different embedding checkpoints. Each row corresponds to a different embedding training recipe: - -```bash -# Base model (no fine-tuning) -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --use-tiled-retrieval --use-qwen3vl-embedding \ - --qwen3vl-model Qwen/Qwen3-VL-Embedding-2B \ - --embedding-backend hf \ - --prebuilt-tiles-dir tiles-hard-mini/ \ - --retrieval-top-k 3 --num-examples 400 --no-think - -# With LoRA checkpoint (dynamic hard negatives + ViT unfrozen) -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --use-tiled-retrieval --use-qwen3vl-embedding \ - --qwen3vl-model Qwen/Qwen3-VL-Embedding-2B \ - --embedding-backend biqwen3 \ - --peft-adapter /path/to/checkpoint-200 \ - --prebuilt-tiles-dir tiles-hard-mini/ \ - --retrieval-top-k 3 --num-examples 400 --no-think -``` - -The intermediate checkpoints (in-batch negatives, naive hard negatives, dynamic hard negatives frozen) each have their own PEFT adapter path. 
- ---- - -## Figure 2: Token Efficiency (SimpleQA, k=1,2,3, 4 readers) - -**Task**: SimpleQA (1000), **Readers**: Qwen3.5-4B, Qwen3.5-9B, Qwen3.5-27B, Qwen3.6-35B-A3B - -For each reader × k × retrieval method, run: - -```bash -# Example: Qwen3.5-4B, k=1, PixelRAG (fine-tuned) -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --local-api --local-api-url http://localhost:30888/search \ - --retrieval-top-k 3 --reader-top-k 1 \ - --num-examples 1000 --no-think - -# Example: Qwen3.5-4B, k=2, PixelRAG (fine-tuned) -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --local-api --local-api-url http://localhost:30888/search \ - --retrieval-top-k 3 --reader-top-k 2 \ - --num-examples 1000 --no-think - -# Example: Qwen3.5-4B, k=3, PixelRAG (fine-tuned) -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --local-api --local-api-url http://localhost:30888/search \ - --retrieval-top-k 3 \ - --num-examples 1000 --no-think -``` - -> **Optimization**: Use `--retrieval-top-k 3 --reader-top-k N` to retrieve once at k=3 and evaluate at k=1,2,3 from the same JSONL (the full retrieved set is stored in `retrieved_images`). - -For each reader, change `--model` and start the appropriate vLLM server. -Repeat for text retrieval (Trafilatura: `--text-api`) and PixelRAG base (base index). - -The plot script is at `arxiv/figures/plot_token_efficiency.py`. - ---- - -## Figure 3: Agentic Multi-Hop QA (MoNaCo) - -**Task**: MoNaCo (1315 questions), **Agent**: GPT-5 ReAct, **k=5 per search** - -Uses `eval/run_monaco.py` — a ReAct agent that issues search tool calls. 
- -```bash -cd ~/pixelrag/eval - -# PixelRAG backend -python run_monaco.py \ - --reader gpt-5 \ - --retrieval pixel \ - --pixel-api http://localhost:30888/search \ - --default-top-k 5 - -# Text retrieval backend (Trafilatura) -python run_monaco.py \ - --reader gpt-5 \ - --retrieval text \ - --text-api http://localhost:30889/search \ - --default-top-k 5 - -# Grade (token F1 computed inline; add --judge for LLM judge F1) -python run_monaco.py \ - --reader gpt-5 \ - --retrieval pixel \ - --judge --judge-model gpt-4.1-2025-04-14 - -# Or grade existing predictions: -python grade.py monaco eval_output/monaco/ -``` - -The dataset (`monaco_version_1_release.jsonl`) should be placed at -`eval/data/monaco/` or passed via `--data-path`. - ---- - -## Figure 4: Image Compression Curve - -**Task**: SimpleQA (1000), **Reader**: Qwen3.5-4B (base + SFT), k=1..5, compression c=1×/2×/3× - -```bash -# No compression (c=1×), k=3 -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --local-api --local-api-url http://localhost:30888/search \ - --retrieval-top-k 5 --reader-top-k 3 \ - --num-examples 1000 --no-think - -# 2× compression, k=3 -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --local-api --local-api-url http://localhost:30888/search \ - --retrieval-top-k 5 --reader-top-k 3 \ - --pixel-compress-ratio 2.0 \ - --num-examples 1000 --no-think - -# 3× compression, k=3 -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --local-api --local-api-url http://localhost:30888/search \ - --retrieval-top-k 5 --reader-top-k 3 \ - --pixel-compress-ratio 3.0 \ - --num-examples 1000 --no-think -``` - -For the SFT reader, replace `--model` with the SFT checkpoint path and serve it via vLLM. - -The plot script is at `arxiv/figures/plot_sft_compression_curve.py`. 
- ---- - -## Table 8: Full Reader-Model Sweep (31 VLMs) - -**Task**: SimpleQA (1000), **k=3**, pixel retrieval (base) vs text retrieval (Trafilatura) - -For each of the 31 reader models, run two jobs: - -```bash -# Pixel retrieval -python run_bench.py \ - --task simpleqa --model \ - --local-api --local-api-url http://localhost:30888/search \ - --retrieval-top-k 3 --num-examples 1000 --no-think - -# Text retrieval -python run_bench.py \ - --task simpleqa --model \ - --text-api --text-api-url http://localhost:30889/search \ - --retrieval-top-k 3 --num-examples 1000 --no-think -``` - -where `` is one of: -- `liuhaotian/llava-v1.5-7b` -- `meta-llama/Llama-3.2-11B-Vision-Instruct` (k=1 for pixel due to architecture limit) -- `meta-llama/Llama-3.2-90B-Vision-Instruct` (k=1 for pixel) -- `meta-llama/Llama-4-Scout-17B-16E-Instruct` -- `meta-llama/Llama-4-Maverick-17B-128E-Instruct` -- `Qwen/Qwen2-VL-2B-Instruct` through `Qwen/Qwen2-VL-72B-Instruct` -- `Qwen/Qwen2.5-VL-3B-Instruct` through `Qwen/Qwen2.5-VL-72B-Instruct` -- `Qwen/Qwen3-VL-2B` through `Qwen/Qwen3-VL-235B-A22B` -- `Qwen/Qwen3.5-0.8B` through `Qwen/Qwen3.5-35B-A3B` -- `Qwen/Qwen3.6-27B`, `Qwen/Qwen3.6-35B-A3B` - -For reasoning-mode models, omit `--no-think`. - -Each model requires its own vLLM instance (or OpenRouter/Commonstack for API models). - ---- - -## LiveVQA (Table 1 + Table 3) - -LiveVQA uses `eval/run_livevqa.py` — a dedicated script for the news corpus. - -**Requires**: News pixel search API (port 30890), news text search API (port 30892), -LiveVQA v4 JSON dataset, vLLM reader. 
- -```bash -cd ~/pixelrag/eval - -# No retrieval -python run_livevqa.py --mode naive \ - --model Qwen/Qwen3.5-4B-Instruct \ - --output eval_output/livevqa_naive.jsonl - -# PixelRAG (screenshot → screenshot) -python run_livevqa.py --mode pixel \ - --pixel-api http://localhost:30890/search \ - --model Qwen/Qwen3.5-4B-Instruct \ - --output eval_output/livevqa_pixel.jsonl - -# Text retrieval (Trafilatura) -python run_livevqa.py --mode text \ - --text-api http://localhost:30892/search \ - --model Qwen/Qwen3.5-4B-Instruct \ - --output eval_output/livevqa_text.jsonl - -# Hybrid (pixel + text) -python run_livevqa.py --mode hybrid \ - --pixel-api http://localhost:30890/search \ - --text-api http://localhost:30892/search \ - --model Qwen/Qwen3.5-4B-Instruct \ - --output eval_output/livevqa_hybrid.jsonl -``` - -Grading is automatic (5-option MC exact letter match) — printed at the end of each run. - ---- - -## Known Issues (Blockers for Reproduction) - -### ~~0. Missing simpleqa modules~~ (FIXED) - -`screenshot.py` and `pixel_query.py` have been copied into `eval/lib/`. -Selenium import is deferred so it doesn't block `--local-api` users. - -### ~~1. `dr_agent` not importable~~ (FIXED) - -Dataset loaders extracted into `eval/lib/benchmarks.py`. The `run_bench.py` -import now reads from `simpleqa.datasets_loader` instead of `dr_agent`. - -### ~~2. Grading script not in this repo~~ (FIXED) - -`eval/grade.py` implements GPT-4.1 3-way grading (CORRECT/INCORRECT/NOT_ATTEMPTED) using -the same prompt template as the paper. No dependency on the old repo's evaluation framework. - -For the legacy full evaluation framework (per-example HTML reports, etc.), the original -is still at `~/pixelrag-src/Vis-RAG/agent/scripts/evaluate.py`. - -### 3. Hardcoded paths in retrieval.py - -`eval/lib/retrieval.py` lines 84–88 have placeholder paths (`/path/to/project`, `/path/to/data`) for the local kiwix tile store. 
These are only used by `LocalWikiTiledScreenshotRetriever` (ground-truth screenshot mode), not by the production `--local-api` mode. - -### ~~4. LiveVQA uses separate pipeline~~ (FIXED) - -`eval/run_livevqa.py` handles all LiveVQA modes (naive, pixel, text, hybrid). - -### ~~5. MoNaCo runs from old repo~~ (FIXED) - -`eval/run_monaco.py` implements the full ReAct agent loop with pixel/text retrieval backends. - -### 6. mwparserfromhell text index - -The paper's second text baseline uses mwparserfromhell parser. The text index must be built separately with this parser — the parser choice is embedded at index build time, not at query time. The build pipeline for this variant needs to be documented. - -### 7. News corpus indexes - -LiveVQA requires separate tile and text indexes built over the news corpus (BBC/AP/CNN). These indexes are on a different machine/path and need their own `pixelrag-serve` instances. - ---- - -## Grading Protocol Summary - -| Benchmark | Metric | Grader | -|-----------|--------|--------| -| SimpleQA | CORRECT/INCORRECT/NOT_ATTEMPTED → accuracy | GPT-4.1 (temp=0, seed=42) | -| NQ | Same 3-way judge | GPT-4.1 (temp=0, seed=42) | -| NQ-Tables | Same 3-way judge (up to 10 gold aliases joined with OR) | GPT-4.1 | -| MMSearch | Same 3-way judge | GPT-4.1 | -| EVQA | Same 3-way judge (reference_list → "Any of: ref1 \| ref2") | GPT-4.1 | -| LiveVQA | 5-option multiple-choice exact letter match | No LLM | -| MoNaCo | Token-level F1 (primary), LLM judge F1 (secondary) | GPT-4.1 | - ---- - -## Quick Smoke Test (Verify Pipeline Works) - -Run a single example end-to-end before committing to full runs: - -```bash -# 1. Verify search API is responding -curl -s http://localhost:30888/status | python -m json.tool - -# 2. Run 5 examples, no retrieval -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --num-examples 5 --no-think --force - -# 3. 
Run 5 examples, pixel retrieval -python run_bench.py \ - --task simpleqa --model Qwen/Qwen3.5-4B-Instruct \ - --local-api --local-api-url http://localhost:30888/search \ - --retrieval-top-k 3 --num-examples 5 --no-think --force - -# 4. Grade -cd ~/pixelrag-src/Vis-RAG/agent -python scripts/evaluate.py simpleqa ~/pixelrag/eval/eval_output/<output_file>.jsonl -``` - ---- - -## Output File Convention - -All outputs go to `eval_output/` with auto-generated filenames: - -``` -eval_output/{task}_{mode}_{model_safe}_{n}.jsonl -``` - -Examples: -- `eval_output/simpleqa_naive_qwen_qwen3.5_4b_instruct_1000.jsonl` -- `eval_output/simpleqa_local_api_qwen_qwen3.5_4b_instruct_1000.jsonl` -- `eval_output/nq_text_api_qwen_qwen3.5_4b_instruct_1000.jsonl` - -Grading results are saved alongside as `*_eval_results.json`. diff --git a/docs/superpowers/plans/2026-05-11-pixelrag-restructure.md b/docs/superpowers/plans/2026-05-11-pixelrag-restructure.md deleted file mode 100644 index e9d9766..0000000 --- a/docs/superpowers/plans/2026-05-11-pixelrag-restructure.md +++ /dev/null @@ -1,1315 +0,0 @@ -# PixelRAG 5-Package Restructure Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Restructure ~/pixelrag/ from the current messy first-pass merge into 5 clean packages: ingest, embed, index, serve, train. - -**Architecture:** Five independent uv workspace packages. ingest renders documents to tiles, embed provides orchestrator-free chunk/embed/build tools, index orchestrates full pipelines, serve provides the search API, train handles model fine-tuning. Source repos at ~/pixelrag-src/ are read-only. 
- -**Tech Stack:** Python 3.12+, uv workspaces, FastAPI, FAISS, torch, Playwright, Chromium CDP - ---- - -## File Structure - -``` -~/pixelrag/ -├── pyproject.toml # workspace root -├── uv.lock -├── LICENSE -├── README.md -├── .gitignore -├── packages/ -│ ├── ingest/ -│ │ ├── pyproject.toml -│ │ └── src/pixelrag_render/ -│ │ ├── __init__.py -│ │ ├── render.py # Public API dispatch -│ │ ├── backends/ -│ │ │ ├── __init__.py -│ │ │ ├── cdp.py # Lean CDP capture (default) -│ │ │ ├── playwright.py # Full Playwright (compat) -│ │ │ └── pdf.py # PDF rendering -│ │ └── bench/ -│ │ ├── benchmark.py -│ │ ├── benchmark_optimizations.py -│ │ ├── benchmark_fullpage.py -│ │ └── benchmark_longtail_matrix.py -│ │ -│ ├── embed/ -│ │ ├── pyproject.toml -│ │ └── src/pixelrag_embed/ -│ │ ├── __init__.py -│ │ ├── chunk.py # Tile → 1024px strips -│ │ ├── embed.py # Images → vectors -│ │ └── index.py # Vectors → FAISS -│ │ -│ ├── index/ -│ │ ├── pyproject.toml -│ │ └── src/pixelrag_index/ -│ │ ├── __init__.py -│ │ ├── config.py # pixelrag.yaml parser -│ │ ├── pipelines.py # End-to-end orchestration -│ │ ├── distributed.py # S3ShardCoordinator (optional) -│ │ ├── monitor.py # Progress dashboard -│ │ └── sources/ -│ │ ├── __init__.py -│ │ ├── base.py # Source ABC -│ │ ├── kiwix.py # Wikipedia ZIM -│ │ ├── web.py # URLs + download (generalized news) -│ │ ├── pdf.py # PDF directory -│ │ └── local.py # Auto-detect mixed files -│ │ -│ ├── serve/ -│ │ ├── pyproject.toml -│ │ └── src/pixelrag_serve/ -│ │ ├── __init__.py -│ │ └── api.py # Unified search API -│ │ -│ └── train/ -│ ├── pyproject.toml -│ └── src/pixelrag_train/ -│ ├── __init__.py -│ ├── models/ -│ │ ├── __init__.py -│ │ └── biqwen3.py -│ ├── contrastive.py -│ └── mine.py -│ -└── eval/ - ├── run_naive_simpleqa.py - └── simpleqa/ -``` - ---- - -### Task 1: Clean workspace and create scaffold - -**Files:** -- Modify: `~/pixelrag/pyproject.toml` -- Create: all package directories and `__init__.py` files - -- [ ] **Step 1: Remove old 
packages directory** - -```bash -cd ~/pixelrag -rm -rf packages/ -``` - -- [ ] **Step 2: Create new package skeleton** - -```bash -cd ~/pixelrag - -# ingest -mkdir -p packages/render/src/pixelrag_render/backends -mkdir -p packages/render/src/pixelrag_render/bench - -# embed -mkdir -p packages/embed/src/pixelrag_embed - -# index -mkdir -p packages/index/src/pixelrag_index/sources - -# serve -mkdir -p packages/serve/src/pixelrag_serve - -# train -mkdir -p packages/train/src/pixelrag_train/models - -# __init__.py for all packages -for pkg in \ - packages/render/src/pixelrag_render \ - packages/render/src/pixelrag_render/backends \ - packages/embed/src/pixelrag_embed \ - packages/index/src/pixelrag_index \ - packages/index/src/pixelrag_index/sources \ - packages/serve/src/pixelrag_serve \ - packages/train/src/pixelrag_train \ - packages/train/src/pixelrag_train/models; do - touch "$pkg/__init__.py" -done -``` - -- [ ] **Step 3: Update workspace root pyproject.toml** - -Write `~/pixelrag/pyproject.toml`: -```toml -[project] -name = "pixelrag" -version = "0.1.0" -description = "Visual Retrieval-Augmented Generation — render, embed, index, search, train" -requires-python = ">=3.12" - -[tool.uv.workspace] -members = ["packages/*"] - -[tool.uv] -override-dependencies = ["nvidia-cudnn-cu12==9.20.0.48"] -environments = ["sys_platform == 'linux'"] - -[[tool.uv.index]] -name = "pytorch-cu129" -url = "https://download.pytorch.org/whl/cu129" -explicit = true -``` - -- [ ] **Step 4: Commit scaffold** - -```bash -cd ~/pixelrag -git add -A -git commit -m "scaffold: 5-package workspace (ingest, embed, index, serve, train)" -``` - ---- - -### Task 2: pixelrag-render package - -**Files:** -- Create: `packages/render/pyproject.toml` -- Create: `packages/render/src/pixelrag_render/render.py` -- Create: `packages/render/src/pixelrag_render/backends/cdp.py` -- Copy+strip: `packages/render/src/pixelrag_render/backends/playwright.py` -- Create: 
`packages/render/src/pixelrag_render/backends/pdf.py` -- Copy: `packages/render/src/pixelrag_render/bench/*.py` - -- [ ] **Step 1: Create pyproject.toml** - -Write `~/pixelrag/packages/render/pyproject.toml`: -```toml -[project] -name = "pixelrag-render" -version = "0.1.0" -description = "Document → image tiles. Renders web pages, PDFs, and local files as tiled screenshots." -requires-python = ">=3.12" -dependencies = [ - "playwright>=1.40.0", - "pillow>=10.0.0", - "aiohttp>=3.9.0", -] - -[project.optional-dependencies] -pdf = ["pdf2image>=1.16.0"] -dev = ["pytest>=7.0.0", "pytest-asyncio>=0.21.0"] - -[project.scripts] -pixelrag-render = "pixelrag_render.render:main" - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.build.targets.wheel] -packages = ["src/pixelrag_render"] -``` - -- [ ] **Step 2: Create render.py — public API** - -Write `~/pixelrag/packages/render/src/pixelrag_render/render.py`: -```python -"""Public API for rendering documents to image tiles. - -Usage: - from pixelrag_render import render_url, render_pdf, render_file - - tiles = render_url("https://en.wikipedia.org/wiki/Python", "./output") - tiles = render_pdf("paper.pdf", "./output") - tiles = render_file("doc.html", "./output") # auto-detect type -""" - -import argparse -import os -from pathlib import Path - - -def render_url( - url: str, - output_dir: str, - backend: str = "cdp", - *, - tile_height: int = 8192, - quality: int = 85, - viewport_width: int = 875, - workers: int = 1, - **backend_kwargs, -) -> list[Path]: - """Render a URL to tiled screenshots. - - Args: - url: Web page URL to render. - output_dir: Directory for output tiles. - backend: "cdp" (default, fastest) or "playwright" (more options). - tile_height: Height of each tile in pixels. - quality: JPEG quality (1-100). - viewport_width: Browser viewport width. - workers: Number of browser workers for batch rendering. - - Returns: - List of tile file paths. 
- """ - if backend == "cdp": - from .backends.cdp import render_urls - return render_urls( - [url], output_dir, - tile_height=tile_height, quality=quality, - viewport_width=viewport_width, workers=workers, - **backend_kwargs, - ) - elif backend == "playwright": - from .backends.playwright import render_urls - return render_urls( - [url], output_dir, - tile_height=tile_height, quality=quality, - viewport_width=viewport_width, - **backend_kwargs, - ) - else: - raise ValueError(f"Unknown backend: {backend!r}. Use 'cdp' or 'playwright'.") - - -def render_pdf( - path: str, - output_dir: str, - *, - dpi: int = 200, - pages: str | None = None, -) -> list[Path]: - """Render a PDF to tiled page images. - - Args: - path: Path to PDF file. - output_dir: Directory for output tiles. - dpi: Rendering resolution. - pages: Page range (e.g. "1-10"). None = all pages. - - Returns: - List of tile file paths. - """ - from .backends.pdf import render_pdf as _render_pdf - return _render_pdf(path, output_dir, dpi=dpi, pages=pages) - - -def render_file( - path: str, - output_dir: str, - backend: str = "cdp", - **kwargs, -) -> list[Path]: - """Auto-detect file type and render to tiles. - - Supports: .pdf, .html, .png/.jpg (direct copy), URLs (if starts with http). 
- """ - p = str(path) - if p.startswith("http://") or p.startswith("https://"): - return render_url(p, output_dir, backend=backend, **kwargs) - ext = os.path.splitext(p)[1].lower() - if ext == ".pdf": - return render_pdf(p, output_dir, **kwargs) - elif ext in (".html", ".htm"): - file_url = f"file://{os.path.abspath(p)}" - return render_url(file_url, output_dir, backend=backend, **kwargs) - elif ext in (".png", ".jpg", ".jpeg", ".webp"): - # Image files: copy directly as a single tile - os.makedirs(output_dir, exist_ok=True) - import shutil - dest = Path(output_dir) / Path(p).name - shutil.copy2(p, dest) - return [dest] - else: - raise ValueError(f"Unsupported file type: {ext}") - - -def main(): - parser = argparse.ArgumentParser(description="Render documents to image tiles") - parser.add_argument("inputs", nargs="+", help="URLs, file paths, or directories") - parser.add_argument("--output", "-o", default="./tiles", help="Output directory") - parser.add_argument("--backend", default="cdp", choices=["cdp", "playwright"]) - parser.add_argument("--tile-height", type=int, default=8192) - parser.add_argument("--quality", type=int, default=85) - parser.add_argument("--viewport-width", type=int, default=875) - parser.add_argument("--workers", type=int, default=4) - parser.add_argument("--dpi", type=int, default=200, help="PDF rendering DPI") - args = parser.parse_args() - - all_tiles = [] - for inp in args.inputs: - tiles = render_file( - inp, args.output, - backend=args.backend, - tile_height=args.tile_height, - quality=args.quality, - viewport_width=args.viewport_width, - workers=args.workers, - dpi=args.dpi, - ) - all_tiles.extend(tiles) - print(f"{inp} → {len(tiles)} tiles") - - print(f"\nTotal: {len(all_tiles)} tiles in {args.output}") - - -if __name__ == "__main__": - main() -``` - -- [ ] **Step 3: Create cdp.py — lean CDP backend** - -Copy from source and generalize: -```bash -cp ~/pixelrag-src/wiki-screenshot/scripts/render_news_pages.py \ - 
~/pixelrag/packages/render/src/pixelrag_render/backends/cdp.py -``` - -Then apply these transformations to `cdp.py`: -1. Remove news-specific imports (`from wiki_screenshot.news.db import NewsDB`, `from wiki_screenshot.news.metrics import start_metrics_server`) -2. Remove `NewsDB` usage in `worker()` and `run_batch()` — replace with simple success/fail counters -3. Remove `check_nginx()` preflight — not general -4. Remove `main()` function (the CLI with `--db-path`, `--pages-dir` etc.) — replaced by `render.py` -5. Rename `capture_article()` to a general name -6. Export a `render_urls(urls, output_dir, ...)` function that `render.py` calls -7. Remove hardcoded paths (`/opt/dlami/nvme/`) -8. Keep: `_launch_browser()`, `BROWSER_ARGS`, the CDP capture logic, multi-browser worker architecture, JPEG tile output - -- [ ] **Step 4: Copy and strip playwright.py** - -```bash -cp ~/pixelrag-src/wiki-screenshot/src/wiki_screenshot/tools/playwright_tool.py \ - ~/pixelrag/packages/render/src/pixelrag_render/backends/playwright.py -``` - -Transformations to `playwright.py`: -1. Remove imports of `streaming_capture`, `raw_pixels`, `temp_dirs` -2. Remove the `use_streaming` code path and all streaming-related params -3. Remove unused experimental options (keep only: `use_cdp_screenshot`, `cdp_optimize_for_speed`, `segmented_save_tiles`, `segment_height`, `enable_gpu`, `device_scale_factor`, `image_format`, `quality`, `width`, `max_height`) -4. Remove `_cdp_sessions` management if not needed for the retained CDP path -5. Export a `render_urls(urls, output_dir, ...)` function matching cdp.py's interface -6. Keep: CDP screenshot mode, segmented tile capture, GPU rasterization flags, core `_capture_page()` logic -7. 
Target: strip from ~2388 lines to ~500-800 lines of production-relevant code - -- [ ] **Step 5: Create pdf.py — PDF backend** - -Write `~/pixelrag/packages/render/src/pixelrag_render/backends/pdf.py`: -```python -"""PDF rendering backend: PDF pages → tile images.""" - -import json -import os -from pathlib import Path - - -def render_pdf( - path: str, - output_dir: str, - *, - dpi: int = 200, - pages: str | None = None, -) -> list[Path]: - """Render PDF pages as tile images. - - Args: - path: Path to PDF file. - output_dir: Output directory for tiles. - dpi: Rendering resolution. - pages: Page range string (e.g. "1-10", "3,5,7"). None = all. - - Returns: - List of tile image paths. - """ - try: - from pdf2image import convert_from_path - except ImportError: - raise ImportError( - "pdf2image is required for PDF rendering. " - "Install with: pip install pixelrag-render[pdf]" - ) - - pdf_path = Path(path) - doc_id = pdf_path.stem - tile_dir = Path(output_dir) / f"{doc_id}.tiles" - tile_dir.mkdir(parents=True, exist_ok=True) - - # Parse page range - kwargs = {"dpi": dpi} - if pages: - if "-" in pages: - first, last = pages.split("-", 1) - kwargs["first_page"] = int(first) - kwargs["last_page"] = int(last) - else: - page_nums = [int(p.strip()) for p in pages.split(",")] - kwargs["first_page"] = min(page_nums) - kwargs["last_page"] = max(page_nums) - - images = convert_from_path(str(pdf_path), **kwargs) - - tile_paths = [] - for i, img in enumerate(images): - tile_path = tile_dir / f"tile_{i:04d}.jpg" - img.save(tile_path, "JPEG", quality=85) - tile_paths.append(tile_path) - - # Write manifest - manifest = { - "source": str(pdf_path), - "dpi": dpi, - "pages": len(images), - "tiles": [p.name for p in tile_paths], - "complete": True, - } - with open(tile_dir / "tiles.json", "w") as f: - json.dump(manifest, f, indent=2) - - return tile_paths -``` - -- [ ] **Step 6: Copy bench/** - -```bash -cp ~/pixelrag-src/wiki-screenshot/bench/benchmark.py \ - 
~/pixelrag/packages/render/src/pixelrag_render/bench/ -cp ~/pixelrag-src/wiki-screenshot/bench/benchmark_optimizations.py \ - ~/pixelrag/packages/render/src/pixelrag_render/bench/ -cp ~/pixelrag-src/wiki-screenshot/bench/benchmark_fullpage.py \ - ~/pixelrag/packages/render/src/pixelrag_render/bench/ -cp ~/pixelrag-src/wiki-screenshot/bench/benchmark_longtail_matrix.py \ - ~/pixelrag/packages/render/src/pixelrag_render/bench/ -``` - -Fix imports in bench files: replace `wiki_screenshot` → `pixelrag_render`: -```bash -find ~/pixelrag/packages/render/src/pixelrag_render/bench -name '*.py' -exec sed -i \ - -e 's/from wiki_screenshot/from pixelrag_render/g' \ - -e 's/import wiki_screenshot/import pixelrag_render/g' \ - {} + -``` - -- [ ] **Step 7: Verify ingest package imports** - -```bash -cd ~/pixelrag -uv sync --package pixelrag-render 2>&1 | tail -3 -uv run --package pixelrag-render python -c "from pixelrag_render.render import render_url, render_pdf, render_file; print('OK')" -``` - -- [ ] **Step 8: Commit** - -```bash -cd ~/pixelrag -git add packages/render/ -git commit -m "feat: add pixelrag-render package (CDP/Playwright/PDF backends)" -``` - ---- - -### Task 3: pixelrag-embed package - -**Files:** -- Create: `packages/embed/pyproject.toml` -- Copy: `packages/embed/src/pixelrag_embed/chunk.py` -- Copy: `packages/embed/src/pixelrag_embed/embed.py` -- Copy: `packages/embed/src/pixelrag_embed/index.py` - -- [ ] **Step 1: Create pyproject.toml** - -Write `~/pixelrag/packages/embed/pyproject.toml`: -```toml -[project] -name = "pixelrag-embed" -version = "0.1.0" -description = "Image tiles → vectors → FAISS index. Three independent CLI tools." 
-requires-python = ">=3.12" -dependencies = [ - "torch>=2.9.0", - "transformers>=4.57.0", - "faiss-cpu>=1.9.0", - "pillow>=10.0.0", - "numpy>=1.26.0", - "tqdm>=4.60.0", -] - -[project.optional-dependencies] -gpu = ["faiss-gpu-cu12>=1.13.2"] - -[project.scripts] -pixelrag-chunk = "pixelrag_embed.chunk:main" -pixelrag-embed = "pixelrag_embed.embed:main" -pixelrag-build-index = "pixelrag_embed.index:main" - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.build.targets.wheel] -packages = ["src/pixelrag_embed"] - -[tool.uv] -override-dependencies = ["nvidia-cudnn-cu12==9.20.0.48"] - -[tool.uv.sources] -torch = [{ index = "pytorch-cu129" }] -``` - -- [ ] **Step 2: Copy chunk.py** - -```bash -cp ~/pixelrag-src/wiki-screenshot/embedding/chunk_tiles.py \ - ~/pixelrag/packages/embed/src/pixelrag_embed/chunk.py -``` - -No import renames needed — `chunk_tiles.py` only uses stdlib + PIL. - -- [ ] **Step 3: Copy embed.py** - -```bash -cp ~/pixelrag-src/wiki-screenshot/embedding/embed_tiles.py \ - ~/pixelrag/packages/embed/src/pixelrag_embed/embed.py -``` - -No import renames needed — `embed_tiles.py` only uses stdlib + numpy/PIL/tqdm + subprocess for vLLM/sglang. - -- [ ] **Step 4: Copy index.py** - -```bash -cp ~/pixelrag-src/wiki-screenshot/indexing/build_index.py \ - ~/pixelrag/packages/embed/src/pixelrag_embed/index.py -``` - -No import renames needed — `build_index.py` only uses stdlib + numpy + faiss. 
- -- [ ] **Step 5: Clean hardcoded paths in all three files** - -```bash -find ~/pixelrag/packages/embed/src -name '*.py' -exec sed -i \ - -e 's|/opt/dlami/nvme/[^ "'"'"'\\)]*|./data|g' \ - -e 's|/home/user/[^ "'"'"'\\)]*|./|g' \ - -e 's|/home/ubuntu/[^ "'"'"'\\)]*|./|g' \ - -e 's|/home/andy/[^ "'"'"'\\)]*|./|g' \ - {} + -``` - -- [ ] **Step 6: Verify embed package imports** - -```bash -cd ~/pixelrag -uv sync --package pixelrag-embed 2>&1 | tail -3 -uv run --package pixelrag-embed python -c "from pixelrag_embed import chunk, embed, index; print('OK')" -``` - -- [ ] **Step 7: Commit** - -```bash -cd ~/pixelrag -git add packages/embed/ -git commit -m "feat: add pixelrag-embed package (chunk, embed, build-index)" -``` - ---- - -### Task 4: pixelrag-serve package - -**Files:** -- Create: `packages/serve/pyproject.toml` -- Create: `packages/serve/src/pixelrag_serve/api.py` (merged from 3 APIs) - -- [ ] **Step 1: Create pyproject.toml** - -Write `~/pixelrag/packages/serve/pyproject.toml`: -```toml -[project] -name = "pixelrag-serve" -version = "0.1.0" -description = "FAISS-based visual search API. Serves any pre-built index." 
-requires-python = ">=3.12" -dependencies = [ - "fastapi>=0.115.0", - "uvicorn>=0.30.0", - "numpy>=1.26.0", - "faiss-cpu>=1.9.0", - "transformers>=4.57.0", - "torch>=2.9.0", - "qwen-vl-utils", - "pillow>=10.0.0", - "pydantic>=2.0.0", -] - -[project.optional-dependencies] -gpu = ["faiss-gpu-cu12>=1.13.2"] - -[project.scripts] -pixelrag-serve = "pixelrag_serve.api:main" - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.build.targets.wheel] -packages = ["src/pixelrag_serve"] - -[tool.uv.sources] -torch = [{ index = "pytorch-cu129" }] -``` - -- [ ] **Step 2: Create unified api.py** - -Start from the existing adapted search_api.py (which already has CPU support): -```bash -cp ~/pixelrag-src/wiki-screenshot/serving/search_api.py \ - ~/pixelrag/packages/serve/src/pixelrag_serve/api.py -``` - -Apply transformations to `api.py`: -1. Remove vllm backend (keep only direct transformers inference) -2. Remove `torch.compile()` call -3. Add `--device cpu|cuda` arg (default cpu), use `torch.float32` on CPU -4. Replace hardcoded `/opt/dlami/nvme/` paths with env var defaults (`PIXELRAG_INDEX_DIR`, `PIXELRAG_ARTICLES_JSON`) -5. Replace `torch_dtype` with `dtype` in `from_pretrained()` to fix deprecation warning -6. This is the unified API — it serves any FAISS index (wiki, news, text, any). No separate news_search_api or text_search_api needed if the index format is consistent. - -The existing api.py from the first-pass merge (at `~/pixelrag/packages/serving/src/pixelrag_serving/search_api.py`) already has most of these changes. 
Use that as the starting point instead: -```bash -# Actually use the already-adapted version -cp ~/pixelrag/packages/serving/src/pixelrag_serving/search_api.py \ - ~/pixelrag/packages/serve/src/pixelrag_serve/api.py 2>/dev/null || \ -cp ~/pixelrag-src/wiki-screenshot/serving/search_api.py \ - ~/pixelrag/packages/serve/src/pixelrag_serve/api.py -``` - -If using the source version, apply the CPU adaptations from the spec (remove vllm, add --device, fix torch_dtype, replace paths). - -- [ ] **Step 3: Verify serve package** - -```bash -cd ~/pixelrag -uv sync --package pixelrag-serve 2>&1 | tail -3 -uv run --package pixelrag-serve python -c "from pixelrag_serve import api; print('OK')" -``` - -- [ ] **Step 4: Smoke test with existing downloaded index** - -```bash -PIXELRAG_INDEX_DIR=/home/yichuan/pixelrag-data/text_search_index_1024 \ -PIXELRAG_ARTICLES_JSON=/home/yichuan/pixelrag-data/articles.json \ -uv run --package pixelrag-serve python -m pixelrag_serve.api --port 31001 & -sleep 120 # wait for index + model loading -curl -s http://localhost:31001/health -curl -s -X POST http://localhost:31001/search \ - -H "Content-Type: application/json" \ - -d '{"queries": [{"text": "capital of France"}], "n_docs": 3}' | python3 -m json.tool | head -20 -kill %1 -``` - -- [ ] **Step 5: Commit** - -```bash -cd ~/pixelrag -git add packages/serve/ -git commit -m "feat: add pixelrag-serve package (unified FAISS search API)" -``` - ---- - -### Task 5: pixelrag-train package - -**Files:** -- Create: `packages/train/pyproject.toml` -- Copy: `packages/train/src/pixelrag_train/models/biqwen3.py` -- Copy+rename: `packages/train/src/pixelrag_train/contrastive.py` -- Create: `packages/train/src/pixelrag_train/mine.py` (merged from 2 scripts) -- Copy: tests - -- [ ] **Step 1: Create pyproject.toml** - -Write `~/pixelrag/packages/train/pyproject.toml`: -```toml -[project] -name = "pixelrag-train" -version = "0.1.0" -description = "LoRA/DoRA contrastive fine-tuning for visual document retrieval 
embeddings" -requires-python = ">=3.12" -dependencies = [ - "torch==2.9.1", - "torchvision", - "transformers==4.57.1", - "nvidia-cudnn-cu12==9.20.0.48", - "peft>=0.15.0", - "accelerate>=1.0.0", - "Pillow", - "tqdm", - "numpy", - "faiss-cpu", - "wandb", - "safetensors", - "huggingface-hub", - "qwen-vl-utils", - "datasets", -] - -[project.scripts] -pixelrag-train = "pixelrag_train.contrastive:main" -pixelrag-mine = "pixelrag_train.mine:main" - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.build.targets.wheel] -packages = ["src/pixelrag_train"] - -[tool.uv] -override-dependencies = ["nvidia-cudnn-cu12==9.20.0.48"] - -[tool.uv.sources] -torch = [{ index = "pytorch-cu129" }] -torchvision = [{ index = "pytorch-cu129" }] -``` - -- [ ] **Step 2: Copy model and training code** - -```bash -SRC=~/pixelrag-src/wiki-screenshot-training -DST=~/pixelrag/packages/train - -# Model -cp $SRC/models/__init__.py $DST/src/pixelrag_train/models/ -cp $SRC/models/biqwen3.py $DST/src/pixelrag_train/models/ - -# Training script -cp $SRC/train_contrastors.py $DST/src/pixelrag_train/contrastive.py -``` - -- [ ] **Step 3: Create merged mine.py** - -Copy the image mining script as base, then merge text mining functionality: -```bash -cp ~/pixelrag-src/wiki-screenshot-training/mine_hard_negatives.py \ - ~/pixelrag/packages/train/src/pixelrag_train/mine.py -``` - -Add to the `mine.py` argparse a `--mode image|text` flag. The image mode calls the image search API (original `mine_hard_negatives.py` behavior). The text mode calls the text search API (original `mine_text_hard_negatives.py` behavior). Read both source files to understand the differences and merge them. 
- -Key differences between the two scripts: -- `mine_hard_negatives.py` queries `:30888/search` with image results (returns `chunk_path`) -- `mine_text_hard_negatives.py` queries `:30889/search` with text results (returns `article_id`, `chunk_index`, `text`) -- Both share: query loading, JSONL I/O, concurrent API calls, dedup logic - -- [ ] **Step 4: Copy tests** - -```bash -mkdir -p ~/pixelrag/packages/train/tests -cp ~/pixelrag-src/wiki-screenshot-training/tests/test_grad_equivalence.py \ - ~/pixelrag/packages/train/tests/ -cp ~/pixelrag-src/wiki-screenshot-training/tests/test_grad_multi_gpu.py \ - ~/pixelrag/packages/train/tests/ -``` - -- [ ] **Step 5: Clean hardcoded paths** - -```bash -find ~/pixelrag/packages/train -name '*.py' -exec sed -i \ - -e 's|/opt/dlami/nvme/[^ "'"'"'\\)]*|./data|g' \ - -e 's|/home/user/[^ "'"'"'\\)]*|./|g' \ - -e 's|/home/ubuntu/[^ "'"'"'\\)]*|./|g' \ - {} + -``` - -- [ ] **Step 6: Commit** - -```bash -cd ~/pixelrag -git add packages/train/ -git commit -m "feat: add pixelrag-train package (contrastive training + mining)" -``` - ---- - -### Task 6: pixelrag-index package - -**Files:** -- Create: `packages/index/pyproject.toml` -- Create: `packages/index/src/pixelrag_index/config.py` -- Create: `packages/index/src/pixelrag_index/pipelines.py` -- Copy+refactor: `packages/index/src/pixelrag_index/distributed.py` -- Copy+refactor: `packages/index/src/pixelrag_index/monitor.py` -- Copy+refactor: `packages/index/src/pixelrag_index/sources/kiwix.py` -- Create: `packages/index/src/pixelrag_index/sources/web.py` -- Create: `packages/index/src/pixelrag_index/sources/pdf.py` -- Create: `packages/index/src/pixelrag_index/sources/local.py` -- Create: `packages/index/src/pixelrag_index/sources/base.py` - -- [ ] **Step 1: Create pyproject.toml** - -Write `~/pixelrag/packages/index/pyproject.toml`: -```toml -[project] -name = "pixelrag-index" -version = "0.1.0" -description = "Build searchable FAISS indexes from any document source" -requires-python 
= ">=3.12" -dependencies = [ - "pixelrag-render", - "pixelrag-embed", - "pyyaml>=6.0", - "tqdm>=4.60.0", -] - -[project.optional-dependencies] -distributed = ["boto3>=1.42.0"] - -[project.scripts] -pixelrag-index = "pixelrag_index.pipelines:main" -pixelrag-monitor = "pixelrag_index.monitor:main" - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.build.targets.wheel] -packages = ["src/pixelrag_index"] -``` - -- [ ] **Step 2: Create sources/base.py** - -Write `~/pixelrag/packages/index/src/pixelrag_index/sources/base.py`: -```python -"""Base class for document sources.""" - -from dataclasses import dataclass -from typing import Iterator - - -@dataclass -class Document: - """A document to be rendered and indexed.""" - id: str - url: str | None = None - path: str | None = None - metadata: dict | None = None - - -class Source: - """Base class for document sources. Subclasses yield Documents.""" - - def __iter__(self) -> Iterator[Document]: - raise NotImplementedError - - def __len__(self) -> int: - raise NotImplementedError -``` - -- [ ] **Step 3: Create config.py** - -Write `~/pixelrag/packages/index/src/pixelrag_index/config.py`: -```python -"""Parse pixelrag.yaml configuration with parameter forwarding.""" - -import os -from pathlib import Path - -import yaml - -from .sources import SOURCES - - -DEFAULT_CONFIG = { - "ingest": {"backend": "cdp", "quality": 85, "tile_height": 8192}, - "embed": {"model": "Qwen/Qwen3-VL-Embedding-2B", "device": "cuda"}, - "output": "./index", -} - - -def load_config(path: str | None = None) -> dict: - """Load config from pixelrag.yaml or defaults. 
- - Looks for pixelrag.yaml in: explicit path > cwd > ~/.config/pixelrag/ - """ - if path is None: - candidates = [ - Path("pixelrag.yaml"), - Path("pixelrag.yml"), - Path.home() / ".config" / "pixelrag" / "pixelrag.yaml", - ] - for c in candidates: - if c.exists(): - path = str(c) - break - - if path and os.path.exists(path): - with open(path) as f: - config = yaml.safe_load(f) - else: - config = {} - - # Merge with defaults - result = {**DEFAULT_CONFIG, **config} - return result - - -def make_source(config: dict): - """Create a Source instance from config["source"] with parameter forwarding.""" - source_config = dict(config.get("source", {})) - source_type = source_config.pop("type", "local") - - if source_type not in SOURCES: - raise ValueError( - f"Unknown source type: {source_type!r}. " - f"Available: {', '.join(SOURCES.keys())}" - ) - - return SOURCES[source_type](**source_config) -``` - -- [ ] **Step 4: Create sources/kiwix.py** - -Copy from source and refactor to use the new Source interface: -```bash -cp ~/pixelrag-src/wiki-screenshot/src/wiki_screenshot/datasources/kiwix.py \ - ~/pixelrag/packages/index/src/pixelrag_index/sources/kiwix.py -``` - -Refactor: make `KiwixSource` extend `Source`, yield `Document` objects instead of `Article` objects. Remove imports of `wiki_screenshot`. Keep the core article iteration logic (fetch from kiwix-serve, cache articles.json). - -- [ ] **Step 5: Create sources/web.py** - -Copy news-related code and generalize: -```bash -# Start from the news datasource as the iteration layer -cp ~/pixelrag-src/wiki-screenshot/src/wiki_screenshot/datasources/news.py \ - ~/pixelrag/packages/index/src/pixelrag_index/sources/web.py -``` - -Then integrate download logic from `news/download.py` and `news/db.py` as internal implementation. Rename news-specific classes/functions to general names. Add `preset` parameter with `"news"` preset containing BBC/CNN/AP domain limits and cookie banner CSS. - -Key transformations: -1. 
`NewsDataSource` → `WebSource(Source)` -2. Import and embed `NewsDownloader` logic from `news/download.py` (or import it as a submodule) -3. Import `NewsDB` from `news/db.py` as `WebDB` (SQLite state tracking) -4. Add `PRESETS` dict with `"news"` key containing domain limits, cookie CSS -5. Yield `Document` objects instead of `Article` - -- [ ] **Step 6: Create sources/pdf.py and sources/local.py** - -Write `~/pixelrag/packages/index/src/pixelrag_index/sources/pdf.py`: -```python -"""PDF directory source — iterates PDF files for rendering.""" - -import os -from pathlib import Path -from typing import Iterator - -from .base import Document, Source - - -class PDFSource(Source): - def __init__(self, path: str, **kwargs): - self.path = Path(path) - self.kwargs = kwargs - self._files = sorted(self.path.glob("**/*.pdf")) - - def __iter__(self) -> Iterator[Document]: - for pdf in self._files: - yield Document( - id=pdf.stem, - path=str(pdf), - metadata={"type": "pdf", **self.kwargs}, - ) - - def __len__(self) -> int: - return len(self._files) -``` - -Write `~/pixelrag/packages/index/src/pixelrag_index/sources/local.py`: -```python -"""Local directory source — auto-detects file types and routes.""" - -import os -from pathlib import Path -from typing import Iterator - -from .base import Document, Source - -SUPPORTED_EXTENSIONS = { - ".pdf": "pdf", - ".html": "web", - ".htm": "web", - ".png": "image", - ".jpg": "image", - ".jpeg": "image", - ".webp": "image", -} - - -class LocalSource(Source): - def __init__(self, path: str, **kwargs): - self.path = Path(path) - self.kwargs = kwargs - self._files = [] - for f in sorted(self.path.rglob("*")): - if f.is_file() and f.suffix.lower() in SUPPORTED_EXTENSIONS: - self._files.append(f) - - def __iter__(self) -> Iterator[Document]: - for f in self._files: - ext = f.suffix.lower() - file_type = SUPPORTED_EXTENSIONS.get(ext, "unknown") - if file_type == "web": - url = f"file://{f.resolve()}" - yield Document(id=f.stem, url=url, 
metadata={"type": file_type}) - else: - yield Document(id=f.stem, path=str(f), metadata={"type": file_type}) - - def __len__(self) -> int: - return len(self._files) -``` - -- [ ] **Step 7: Create sources/__init__.py registry** - -Write `~/pixelrag/packages/index/src/pixelrag_index/sources/__init__.py`: -```python -"""Document source registry.""" - -from .base import Document, Source -from .kiwix import KiwixSource -from .local import LocalSource -from .pdf import PDFSource -from .web import WebSource - -SOURCES = { - "kiwix": KiwixSource, - "web": WebSource, - "pdf": PDFSource, - "local": LocalSource, -} - -__all__ = ["Document", "Source", "SOURCES", "KiwixSource", "WebSource", "PDFSource", "LocalSource"] -``` - -- [ ] **Step 8: Copy and refactor distributed.py** - -```bash -cp ~/pixelrag-src/wiki-screenshot/src/wiki_screenshot/coordinator.py \ - ~/pixelrag/packages/index/src/pixelrag_index/distributed.py -``` - -Rename: `wiki_screenshot` imports → none needed (coordinator.py only uses stdlib + boto3). -Replace hardcoded paths. - -- [ ] **Step 9: Copy and refactor monitor.py** - -```bash -cp ~/pixelrag-src/wiki-screenshot/scripts/monitor_global.py \ - ~/pixelrag/packages/index/src/pixelrag_index/monitor.py -``` - -Replace `from pixelrag_capture.coordinator import S3ShardCoordinator` → `from .distributed import S3ShardCoordinator`. - -- [ ] **Step 10: Create pipelines.py — orchestration** - -Write `~/pixelrag/packages/index/src/pixelrag_index/pipelines.py`: -```python -"""End-to-end pipeline: source → ingest → chunk → embed → build index.""" - -import argparse -import logging -import os -import subprocess -import sys -from pathlib import Path - -from .config import load_config, make_source - -logger = logging.getLogger("pixelrag-index") - - -def build(config: dict) -> Path: - """Build a searchable index from a document source. 
- - Chains: source → pixelrag-render (render) → pixelrag-chunk → pixelrag-embed → pixelrag-build-index - """ - source = make_source(config) - output_dir = Path(config.get("output", "./index")) - tiles_dir = output_dir / "tiles" - chunks_dir = output_dir / "chunks" - embeddings_dir = output_dir / "embeddings" - index_dir = output_dir - - ingest_config = config.get("ingest", {}) - embed_config = config.get("embed", {}) - - os.makedirs(tiles_dir, exist_ok=True) - os.makedirs(chunks_dir, exist_ok=True) - os.makedirs(embeddings_dir, exist_ok=True) - - logger.info("Source: %s (%d documents)", type(source).__name__, len(source)) - - # Stage 1: Render documents to tiles - from pixelrag_render.render import render_url, render_pdf, render_file - - logger.info("Stage 1: Rendering %d documents...", len(source)) - for doc in source: - doc_tiles_dir = str(tiles_dir / f"{doc.id}.tiles") - if doc.url: - render_url(doc.url, doc_tiles_dir, **ingest_config) - elif doc.path: - render_file(doc.path, doc_tiles_dir, **ingest_config) - logger.info(" Rendered: %s", doc.id) - - # Stage 2: Chunk tiles - logger.info("Stage 2: Chunking tiles...") - subprocess.run([ - sys.executable, "-m", "pixelrag_embed.chunk", - "--tiles-dir", str(tiles_dir), - ], check=True) - - # Stage 3: Embed chunks - logger.info("Stage 3: Embedding chunks...") - embed_cmd = [ - sys.executable, "-m", "pixelrag_embed.embed", - "--shard-dir", str(tiles_dir), - "--output-dir", str(embeddings_dir), - ] - if "gpu_ids" in embed_config: - embed_cmd.extend(["--gpu-ids", ",".join(str(g) for g in embed_config["gpu_ids"])]) - if "model" in embed_config: - embed_cmd.extend(["--model", embed_config["model"]]) - if "backend" in embed_config: - embed_cmd.extend(["--backend", embed_config["backend"]]) - subprocess.run(embed_cmd, check=True) - - # Stage 4: Build FAISS index - logger.info("Stage 4: Building FAISS index...") - subprocess.run([ - sys.executable, "-m", "pixelrag_embed.index", - "build", - "--embeddings-dir", 
str(embeddings_dir), - "--output-dir", str(index_dir), - ], check=True) - - logger.info("Index built at: %s", index_dir) - return index_dir - - -def main(): - parser = argparse.ArgumentParser(description="Build a visual search index") - parser.add_argument("command", choices=["build"], help="Command to run") - parser.add_argument("--config", "-c", default=None, help="Path to pixelrag.yaml") - parser.add_argument("--source", "-s", default=None, help="Source path (overrides config)") - parser.add_argument("--output", "-o", default=None, help="Output directory") - args = parser.parse_args() - - logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)s %(message)s") - - config = load_config(args.config) - if args.source: - config.setdefault("source", {})["path"] = args.source - if args.output: - config["output"] = args.output - - if args.command == "build": - build(config) - - -if __name__ == "__main__": - main() -``` - -- [ ] **Step 11: Commit** - -```bash -cd ~/pixelrag -git add packages/index/ -git commit -m "feat: add pixelrag-index package (orchestration, sources, distributed)" -``` - ---- - -### Task 7: Update eval/, README, and cleanup - -**Files:** -- Modify: `eval/` (fix imports if needed) -- Modify: `README.md` -- Remove: old `packages/` remnants - -- [ ] **Step 1: Verify eval/ still works** - -eval/ should be unchanged. Check for broken imports: -```bash -grep -rn 'pixelrag_capture\|pixelrag_serving\|pixelrag_training\|wiki_screenshot' ~/pixelrag/eval/ --include='*.py' -``` - -If any found, fix them. eval/ scripts talk to the search API over HTTP, so they shouldn't import from other packages. - -- [ ] **Step 2: Update README.md** - -Rewrite to reflect the new 5-package architecture, user personas, and quick-start examples for each user type. - -- [ ] **Step 3: Clean up arxiv/ directory** - -The `arxiv/` directory appeared — add to `.gitignore` if it shouldn't be tracked, or remove from git. 
- -- [ ] **Step 4: Final sweep** - -```bash -cd ~/pixelrag -# No secrets -grep -rn 'hf_[A-Za-z0-9]\{20,\}' --include='*.py' --include='*.sh' . | grep -v .git/ -# No Tsinghua mirror -grep -rn 'tsinghua' --include='*.toml' . | grep -v .git/ -# No hardcoded machine paths -grep -rn '/opt/dlami\|/home/user/\|/home/ubuntu/\|/home/andy/' --include='*.py' . | grep -v .git/ | head -10 -# No large files -find . -size +1M -type f | grep -v '.git/' | grep -v '.venv/' -``` - -- [ ] **Step 5: Commit** - -```bash -cd ~/pixelrag -git add -A -git commit -m "cleanup: update README, eval, remove old package remnants" -``` - ---- - -### Task 8: Workspace verification - -- [ ] **Step 1: Resolve workspace dependencies** - -```bash -cd ~/pixelrag -rm -f uv.lock -uv sync 2>&1 | tail -5 -``` - -- [ ] **Step 2: Verify each package imports** - -```bash -uv run --package pixelrag-render python -c "from pixelrag_render.render import render_url; print('ingest OK')" -uv run --package pixelrag-embed python -c "from pixelrag_embed import chunk, embed, index; print('embed OK')" -uv run --package pixelrag-serve python -c "from pixelrag_serve import api; print('serve OK')" -uv run --package pixelrag-train python -c "from pixelrag_train.models.biqwen3 import BiQwen3; print('train OK')" -uv run --package pixelrag-index python -c "from pixelrag_index.config import load_config; print('index OK')" -``` - -- [ ] **Step 3: Verify serving still works** - -```bash -PIXELRAG_INDEX_DIR=/home/yichuan/pixelrag-data/text_search_index_1024 \ -PIXELRAG_ARTICLES_JSON=/home/yichuan/pixelrag-data/articles.json \ -uv run --package pixelrag-serve pixelrag-serve --port 31001 & -# Wait for loading, then test -sleep 120 -curl -s http://localhost:31001/health -curl -s -X POST http://localhost:31001/search \ - -H "Content-Type: application/json" \ - -d '{"queries": [{"text": "Apollo 11"}], "n_docs": 3}' -kill %1 -``` - -- [ ] **Step 4: Commit lock file** - -```bash -cd ~/pixelrag -git add uv.lock -git commit -m "chore: 
regenerate uv.lock for 5-package workspace" -``` diff --git a/docs/superpowers/plans/2026-05-25-pixelrag-frontend.md b/docs/superpowers/plans/2026-05-25-pixelrag-frontend.md deleted file mode 100644 index 1ade91f..0000000 --- a/docs/superpowers/plans/2026-05-25-pixelrag-frontend.md +++ /dev/null @@ -1,2313 +0,0 @@ -# PixelRAG Frontend Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Build a modern Next.js frontend for the PixelRAG visual retrieval engine — search page with image grid, API docs, and index dashboard. - -**Architecture:** Standalone Next.js 15 app in `web/` directory. Communicates with existing FastAPI backend via API proxy rewrites in dev, CORS in production. Dark theme, indigo accent, image-first design. - -**Tech Stack:** Next.js 15 (App Router), Tailwind CSS 4, shadcn/ui, Framer Motion, TypeScript - -**Spec:** `docs/superpowers/specs/2026-05-25-pixelrag-frontend-design.md` - ---- - -## File Map - -### New files (all under `web/`) - -| File | Responsibility | -|------|---------------| -| `web/src/lib/types.ts` | TypeScript types mirroring FastAPI Pydantic models | -| `web/src/lib/api.ts` | Typed fetch wrapper for all backend endpoints | -| `web/src/app/layout.tsx` | Root layout: nav bar, fonts, theme | -| `web/src/app/page.tsx` | Search home page: SearchBar + results | -| `web/src/app/status/page.tsx` | Index dashboard | -| `web/src/app/docs/page.tsx` | API documentation | -| `web/src/components/SearchBar.tsx` | Text + image input with drag-drop | -| `web/src/components/TileCard.tsx` | Single tile result card | -| `web/src/components/ResultGroup.tsx` | Article group with horizontal tile row | -| `web/src/components/Lightbox.tsx` | Fullscreen tile viewer with pan/zoom/nav | -| `web/src/components/ComparePanel.tsx` | Side-by-side tile 
comparison | -| `web/src/components/ApiPlayground.tsx` | Try-it-live widget for docs page | -| `web/src/components/StatusCard.tsx` | Metric display card | - -### Modified files - -| File | Change | -|------|--------| -| `serve/src/pixelrag_serve/api.py` | Add CORS middleware (3 lines) | - ---- - -## Phase 1: Core Search (MVP) - -### Task 1: Project Scaffolding - -**Files:** -- Create: `web/` (entire Next.js project via CLI) -- Modify: `web/src/app/globals.css` (custom theme tokens) -- Modify: `web/next.config.ts` (API proxy rewrites) -- Modify: `web/postcss.config.mjs` (verify Tailwind v4 plugin) - -- [ ] **Step 1: Scaffold Next.js project with shadcn/ui** - -```bash -cd /home/yichuan/pixelrag -npx shadcn@latest init -t next web -``` - -When prompted, accept defaults. This creates a Next.js 15 + Tailwind CSS 4 + shadcn/ui project in `web/`. - -- [ ] **Step 2: Verify the scaffold built correctly** - -```bash -cd /home/yichuan/pixelrag/web && npm run build -``` - -Expected: Build succeeds with no errors. - -- [ ] **Step 3: Install additional dependencies** - -```bash -cd /home/yichuan/pixelrag/web && npm install framer-motion -``` - -- [ ] **Step 4: Add shadcn/ui components we'll need** - -```bash -cd /home/yichuan/pixelrag/web -npx shadcn@latest add button input badge collapsible dialog slider -``` - -- [ ] **Step 5: Configure API proxy rewrites** - -Replace `web/next.config.ts` with: - -```ts -import type { NextConfig } from "next"; - -const nextConfig: NextConfig = { - async rewrites() { - return [ - { - source: "/api/:path*", - destination: "http://localhost:30001/:path*", - }, - ]; - }, -}; - -export default nextConfig; -``` - -- [ ] **Step 6: Set up custom theme in globals.css** - -Replace the `@theme` block in `web/src/app/globals.css` with the PixelRAG color palette. Keep the existing `@import "tailwindcss"` and shadcn layers. 
Add the custom theme tokens: - -```css -@import "tailwindcss"; - -@theme inline { - --color-background: #0c0c0c; - --color-surface: #1a1a1a; - --color-border: #222222; - --color-foreground: #ffffff; - --color-muted: #888888; - --color-muted-foreground: #555555; - --color-accent: #6366f1; - --color-accent-light: #8b5cf6; - --color-score: #6366f1; - --color-method-get: #3b82f6; - --color-method-post: #22c55e; - - --font-sans: "Inter", ui-sans-serif, system-ui, sans-serif; - --font-display: "Crimson Pro", ui-serif, Georgia, serif; - --font-mono: "JetBrains Mono", ui-monospace, monospace; -} - -/* shadcn overrides for dark theme */ -:root { - color-scheme: dark; -} - -body { - background: var(--color-background); - color: var(--color-foreground); - font-family: var(--font-sans); -} -``` - -Note: The exact format depends on what the shadcn init generated. Preserve any existing shadcn CSS variables and layer imports. The key additions are the custom color tokens and font families. - -- [ ] **Step 7: Add Google Fonts** - -In `web/src/app/layout.tsx`, add font imports. The shadcn scaffold creates a layout with a font already — modify it to use Inter + Crimson Pro + JetBrains Mono: - -```tsx -import { Inter, Crimson_Pro, JetBrains_Mono } from "next/font/google"; - -const inter = Inter({ subsets: ["latin"], variable: "--font-sans" }); -const crimsonPro = Crimson_Pro({ subsets: ["latin"], variable: "--font-display" }); -const jetbrainsMono = JetBrains_Mono({ subsets: ["latin"], variable: "--font-mono" }); - -// In the tag: - -``` - -- [ ] **Step 8: Verify dev server starts** - -```bash -cd /home/yichuan/pixelrag/web && npm run dev & -sleep 3 -curl -s http://localhost:3000 | head -20 -kill %1 -``` - -Expected: HTML response from Next.js dev server. 
- -- [ ] **Step 9: Commit** - -```bash -cd /home/yichuan/pixelrag -git add web/ -git commit -m "feat(web): scaffold Next.js 15 + Tailwind 4 + shadcn/ui project" -``` - ---- - -### Task 2: Backend CORS Middleware - -**Files:** -- Modify: `serve/src/pixelrag_serve/api.py` (lines 46-54) - -- [ ] **Step 1: Add CORS middleware to FastAPI** - -In `serve/src/pixelrag_serve/api.py`, add after the `app = FastAPI(...)` line (line 54): - -```python -from fastapi.middleware.cors import CORSMiddleware - -app.add_middleware( - CORSMiddleware, - allow_origins=["http://localhost:3000"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) -``` - -The import `CORSMiddleware` should be added to the existing import block at the top. The `from fastapi.middleware.cors import CORSMiddleware` line goes near line 47 with the other fastapi imports. - -- [ ] **Step 2: Commit** - -```bash -cd /home/yichuan/pixelrag -git add serve/src/pixelrag_serve/api.py -git commit -m "feat(serve): add CORS middleware for frontend dev server" -``` - ---- - -### Task 3: TypeScript Types + API Client - -**Files:** -- Create: `web/src/lib/types.ts` -- Create: `web/src/lib/api.ts` - -- [ ] **Step 1: Create TypeScript types mirroring the Pydantic models** - -Create `web/src/lib/types.ts`: - -```ts -export interface Query { - text?: string; - image?: string; // base64-encoded - embedding?: number[]; -} - -export interface SearchRequest { - queries: Query[]; - n_docs?: number; - nprobe?: number; - min_tile_height?: number; - instruction?: string; -} - -export interface Hit { - score: number; - vector_id: number; - article_id: number; - tile_index: number; - chunk_index: number; - y_offset: number; - tile_height: number; - path: string; - url: string; -} - -export interface QueryResult { - hits: Hit[]; -} - -export interface SearchResponse { - results: QueryResult[]; -} - -export interface StatusResponse { - total_vectors: number; - dimension: number; - nlist: number; - nprobe: number; - 
model: string; - index_dir: string; - tiles_dir: string; - index_built_at: string; - index_size_bytes: number; - metadata_size_bytes: number; -} - -export interface ArticleGroup { - article_id: number; - title: string; - url: string; - hits: (Hit & { rank: number })[]; -} -``` - -- [ ] **Step 2: Create API client** - -Create `web/src/lib/api.ts`: - -```ts -import type { SearchRequest, SearchResponse, StatusResponse } from "./types"; - -const API_BASE = "/api"; - -async function fetchApi(path: string, init?: RequestInit): Promise { - const res = await fetch(`${API_BASE}${path}`, init); - if (!res.ok) { - const body = await res.text(); - throw new Error(`API ${res.status}: ${body}`); - } - return res.json(); -} - -export async function search(req: SearchRequest): Promise { - return fetchApi("/search", { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify(req), - }); -} - -export async function getStatus(): Promise { - return fetchApi("/status"); -} - -export async function getHealth(): Promise<{ status: string }> { - return fetchApi<{ status: string }>("/health"); -} - -export function tileUrl(path: string): string { - return `${API_BASE}/tile?path=${encodeURIComponent(path)}`; -} - -export async function reconstruct( - vectorIds: number[] -): Promise<{ embeddings: number[][] }> { - return fetchApi<{ embeddings: number[][] }>("/reconstruct", { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ vector_ids: vectorIds }), - }); -} -``` - -- [ ] **Step 3: Verify types compile** - -```bash -cd /home/yichuan/pixelrag/web && npx tsc --noEmit -``` - -Expected: No errors. 
- -- [ ] **Step 4: Commit** - -```bash -cd /home/yichuan/pixelrag -git add web/src/lib/types.ts web/src/lib/api.ts -git commit -m "feat(web): add TypeScript types and API client" -``` - ---- - -### Task 4: Navigation Shell (Layout) - -**Files:** -- Modify: `web/src/app/layout.tsx` - -- [ ] **Step 1: Build the root layout with nav bar** - -Replace `web/src/app/layout.tsx` with the full layout. Keep the font setup from Task 1 Step 7. The nav bar has: logo (left), page links (right: Search, Docs, Status). - -```tsx -import type { Metadata } from "next"; -import { Inter, Crimson_Pro, JetBrains_Mono } from "next/font/google"; -import Link from "next/link"; -import "./globals.css"; - -const inter = Inter({ subsets: ["latin"], variable: "--font-sans" }); -const crimsonPro = Crimson_Pro({ - subsets: ["latin"], - variable: "--font-display", -}); -const jetbrainsMono = JetBrains_Mono({ - subsets: ["latin"], - variable: "--font-mono", -}); - -export const metadata: Metadata = { - title: "PixelRAG", - description: "Visual retrieval over Wikipedia screenshot tiles", -}; - -export default function RootLayout({ - children, -}: { - children: React.ReactNode; -}) { - return ( - - - -
{children}
- - - ); -} -``` - -- [ ] **Step 2: Verify layout renders** - -```bash -cd /home/yichuan/pixelrag/web && npm run build -``` - -Expected: Build succeeds. - -- [ ] **Step 3: Commit** - -```bash -cd /home/yichuan/pixelrag -git add web/src/app/layout.tsx -git commit -m "feat(web): add navigation shell with header" -``` - ---- - -### Task 5: TileCard Component - -**Files:** -- Create: `web/src/components/TileCard.tsx` - -- [ ] **Step 1: Build the TileCard component** - -Create `web/src/components/TileCard.tsx`: - -```tsx -"use client"; - -import Image from "next/image"; -import { useState } from "react"; -import type { Hit } from "@/lib/types"; -import { tileUrl } from "@/lib/api"; - -interface TileCardProps { - hit: Hit; - rank: number; - selected?: boolean; - onSelect?: (hit: Hit) => void; - onClick?: (hit: Hit) => void; -} - -export function TileCard({ - hit, - rank, - selected, - onSelect, - onClick, -}: TileCardProps) { - const [imgError, setImgError] = useState(false); - - return ( -
onClick?.(hit)} - > - {/* Tile image */} -
- {imgError ? ( -
- tile {hit.tile_index}:{hit.chunk_index} -
- ) : ( - {`tile setImgError(true)} - /> - )} -
- - {/* Rank badge */} -
- #{rank} -
- - {/* Select checkbox (visible on hover or when selected) */} - {onSelect && ( -
{ - e.stopPropagation(); - onSelect(hit); - }} - > - {selected && "✓"} -
- )} - - {/* Metadata footer */} -
- - {hit.score.toFixed(3)} - - - {hit.tile_height}px - -
-
- ); -} -``` - -- [ ] **Step 2: Verify it compiles** - -```bash -cd /home/yichuan/pixelrag/web && npx tsc --noEmit -``` - -Expected: No errors. - -- [ ] **Step 3: Commit** - -```bash -cd /home/yichuan/pixelrag -git add web/src/components/TileCard.tsx -git commit -m "feat(web): add TileCard component" -``` - ---- - -### Task 6: ResultGroup Component - -**Files:** -- Create: `web/src/components/ResultGroup.tsx` - -- [ ] **Step 1: Build the ResultGroup component** - -Create `web/src/components/ResultGroup.tsx`: - -```tsx -"use client"; - -import { ExternalLink } from "lucide-react"; -import type { Hit, ArticleGroup } from "@/lib/types"; -import { TileCard } from "./TileCard"; - -interface ResultGroupProps { - group: ArticleGroup; - selectedHits: Set; - onSelectHit: (hit: Hit) => void; - onClickHit: (hit: Hit) => void; -} - -export function ResultGroup({ - group, - selectedHits, - onSelectHit, - onClickHit, -}: ResultGroupProps) { - return ( -
- {/* Article header */} -
-

{group.title}

- {group.url && ( - - {new URL(group.url).hostname} - - - )} - - {group.hits.length} tile{group.hits.length !== 1 && "s"} - -
- - {/* Horizontal scrollable tile row */} -
- {group.hits.map((hit) => ( - - ))} -
-
- ); -} -``` - -- [ ] **Step 2: Add a utility to group hits by article** - -Add to the bottom of `web/src/lib/types.ts`: - -```ts -export function groupHitsByArticle(hits: Hit[]): ArticleGroup[] { - const map = new Map(); - hits.forEach((hit, index) => { - const ranked = { ...hit, rank: index + 1 }; - let group = map.get(hit.article_id); - if (!group) { - const slug = hit.url.split("/wiki/").pop() ?? ""; - const title = decodeURIComponent(slug).replace(/_/g, " ") || `Article #${hit.article_id}`; - group = { article_id: hit.article_id, title, url: hit.url, hits: [] }; - map.set(hit.article_id, group); - } - group.hits.push(ranked); - }); - return Array.from(map.values()); -} -``` - -- [ ] **Step 3: Verify compilation** - -```bash -cd /home/yichuan/pixelrag/web && npx tsc --noEmit -``` - -Expected: No errors. - -- [ ] **Step 4: Commit** - -```bash -cd /home/yichuan/pixelrag -git add web/src/components/ResultGroup.tsx web/src/lib/types.ts -git commit -m "feat(web): add ResultGroup component with article grouping" -``` - ---- - -### Task 7: SearchBar Component (Text Only) - -**Files:** -- Create: `web/src/components/SearchBar.tsx` - -- [ ] **Step 1: Build the SearchBar component** - -Create `web/src/components/SearchBar.tsx`: - -```tsx -"use client"; - -import { useState, useRef, useCallback } from "react"; -import { Search, X, ImagePlus } from "lucide-react"; -import { Button } from "@/components/ui/button"; -import { Input } from "@/components/ui/input"; - -interface SearchBarProps { - onSearch: (query: string, image?: string) => void; - isLoading: boolean; -} - -export function SearchBar({ onSearch, isLoading }: SearchBarProps) { - const [query, setQuery] = useState(""); - const [imagePreview, setImagePreview] = useState(null); - const [imageBase64, setImageBase64] = useState(null); - const fileInputRef = useRef(null); - - const handleSubmit = useCallback(() => { - if (!query.trim() && !imageBase64) return; - onSearch(query.trim(), imageBase64 ?? 
undefined); - }, [query, imageBase64, onSearch]); - - const handleKeyDown = useCallback( - (e: React.KeyboardEvent) => { - if (e.key === "Enter") handleSubmit(); - }, - [handleSubmit] - ); - - const handleImageUpload = useCallback((file: File) => { - if (!file.type.startsWith("image/")) return; - const reader = new FileReader(); - reader.onload = (e) => { - const dataUrl = e.target?.result as string; - setImagePreview(dataUrl); - setImageBase64(dataUrl.split(",")[1]); - }; - reader.readAsDataURL(file); - }, []); - - const handleDrop = useCallback( - (e: React.DragEvent) => { - e.preventDefault(); - const file = e.dataTransfer.files[0]; - if (file) handleImageUpload(file); - }, - [handleImageUpload] - ); - - const clearImage = useCallback(() => { - setImagePreview(null); - setImageBase64(null); - if (fileInputRef.current) fileInputRef.current.value = ""; - }, []); - - return ( -
-
e.preventDefault()} - > - {/* Image preview thumbnail */} - {imagePreview && ( -
- Query image - -
- )} - - {/* Text input */} -
- setQuery(e.target.value)} - onKeyDown={handleKeyDown} - placeholder="Search Wikipedia visually..." - className="bg-surface border-border text-foreground placeholder:text-muted-foreground h-11 pr-10" - /> - -
- - { - const file = e.target.files?.[0]; - if (file) handleImageUpload(file); - }} - /> - - {/* Search button */} - -
- - {/* Mode chips */} -
- {["Text query", "Image upload", "Drag & drop"].map((label) => ( - - {label} - - ))} -
-
- ); -} -``` - -- [ ] **Step 2: Verify compilation** - -```bash -cd /home/yichuan/pixelrag/web && npx tsc --noEmit -``` - -- [ ] **Step 3: Commit** - -```bash -cd /home/yichuan/pixelrag -git add web/src/components/SearchBar.tsx -git commit -m "feat(web): add SearchBar component with text and image input" -``` - ---- - -### Task 8: Search Page - -**Files:** -- Modify: `web/src/app/page.tsx` - -- [ ] **Step 1: Build the search home page** - -Replace `web/src/app/page.tsx`: - -```tsx -"use client"; - -import { useState, useCallback } from "react"; -import { SearchBar } from "@/components/SearchBar"; -import { ResultGroup } from "@/components/ResultGroup"; -import { Lightbox } from "@/components/Lightbox"; -import { search } from "@/lib/api"; -import type { Hit, ArticleGroup } from "@/lib/types"; -import { groupHitsByArticle } from "@/lib/types"; - -export default function SearchPage() { - const [groups, setGroups] = useState([]); - const [allHits, setAllHits] = useState([]); - const [isLoading, setIsLoading] = useState(false); - const [error, setError] = useState(null); - const [resultMeta, setResultMeta] = useState<{ - count: number; - timeMs: number; - } | null>(null); - const [selectedHits, setSelectedHits] = useState>(new Set()); - const [lightboxHit, setLightboxHit] = useState(null); - const [hasSearched, setHasSearched] = useState(false); - - const handleSearch = useCallback( - async (query: string, image?: string) => { - setIsLoading(true); - setError(null); - setSelectedHits(new Set()); - const t0 = performance.now(); - - try { - const queryObj: { text?: string; image?: string } = {}; - if (query) queryObj.text = query; - if (image) queryObj.image = image; - - const res = await search({ - queries: [queryObj], - n_docs: 20, - }); - const elapsed = performance.now() - t0; - const hits = res.results[0]?.hits ?? 
[]; - setAllHits(hits); - setGroups(groupHitsByArticle(hits)); - setResultMeta({ count: hits.length, timeMs: elapsed }); - setHasSearched(true); - } catch (err) { - setError(err instanceof Error ? err.message : "Search failed"); - setGroups([]); - setAllHits([]); - } finally { - setIsLoading(false); - } - }, - [] - ); - - const handleSelectHit = useCallback((hit: Hit) => { - setSelectedHits((prev) => { - const next = new Set(prev); - if (next.has(hit.vector_id)) { - next.delete(hit.vector_id); - } else { - next.add(hit.vector_id); - } - return next; - }); - }, []); - - const handleClickHit = useCallback((hit: Hit) => { - setLightboxHit(hit); - }, []); - - return ( -
- {/* Hero */} -
-

- VisRAG -

-

- Visual retrieval over 15.7M Wikipedia screenshot tiles -

-
- - {/* Search */} - - - {/* Status bar */} - {resultMeta && ( -
- {resultMeta.count} results in{" "} - - {(resultMeta.timeMs / 1000).toFixed(2)}s - -
- )} - - {/* Error */} - {error && ( -
- {error} -
- )} - - {/* Results */} - {groups.length > 0 && ( -
- {groups.map((group) => ( - - ))} -
- )} - - {/* Empty state */} - {hasSearched && groups.length === 0 && !error && !isLoading && ( -
- No results found -
- )} - - {/* Lightbox */} - {lightboxHit && ( - setLightboxHit(null)} - onNavigate={setLightboxHit} - /> - )} - - {/* Compare floating bar */} - {selectedHits.size >= 2 && ( -
- - {selectedHits.size} tiles selected - - - -
- )} -
- ); -} -``` - -- [ ] **Step 2: Verify build** - -```bash -cd /home/yichuan/pixelrag/web && npm run build -``` - -Expected: Build succeeds. (Lightbox component not yet created — create a stub first, see Task 9.) - -Note: Before building, create a minimal Lightbox stub so the import doesn't fail: - -```bash -mkdir -p /home/yichuan/pixelrag/web/src/components -``` - -Create `web/src/components/Lightbox.tsx` with a minimal stub: - -```tsx -"use client"; - -import type { Hit } from "@/lib/types"; - -interface LightboxProps { - hit: Hit; - allHits: Hit[]; - onClose: () => void; - onNavigate: (hit: Hit) => void; -} - -export function Lightbox({ onClose }: LightboxProps) { - return ( -
-

Lightbox placeholder

-
- ); -} -``` - -- [ ] **Step 3: Build and verify** - -```bash -cd /home/yichuan/pixelrag/web && npm run build -``` - -Expected: Build succeeds. - -- [ ] **Step 4: Commit** - -```bash -cd /home/yichuan/pixelrag -git add web/src/app/page.tsx web/src/components/Lightbox.tsx -git commit -m "feat(web): add search home page with result grouping" -``` - ---- - -### Task 9: Tile Lightbox - -**Files:** -- Modify: `web/src/components/Lightbox.tsx` (replace stub) - -- [ ] **Step 1: Implement the full Lightbox component** - -Replace `web/src/components/Lightbox.tsx`: - -```tsx -"use client"; - -import { useEffect, useCallback, useState, useRef } from "react"; -import { motion, AnimatePresence } from "framer-motion"; -import { X, ChevronLeft, ChevronRight, ExternalLink } from "lucide-react"; -import type { Hit } from "@/lib/types"; -import { tileUrl } from "@/lib/api"; - -interface LightboxProps { - hit: Hit; - allHits: Hit[]; - onClose: () => void; - onNavigate: (hit: Hit) => void; -} - -export function Lightbox({ hit, allHits, onClose, onNavigate }: LightboxProps) { - const [scale, setScale] = useState(1); - const [position, setPosition] = useState({ x: 0, y: 0 }); - const [dragging, setDragging] = useState(false); - const dragStart = useRef({ x: 0, y: 0 }); - const posStart = useRef({ x: 0, y: 0 }); - - const currentIndex = allHits.findIndex( - (h) => h.vector_id === hit.vector_id - ); - const hasPrev = currentIndex > 0; - const hasNext = currentIndex < allHits.length - 1; - - const resetView = useCallback(() => { - setScale(1); - setPosition({ x: 0, y: 0 }); - }, []); - - const goPrev = useCallback(() => { - if (hasPrev) { - resetView(); - onNavigate(allHits[currentIndex - 1]); - } - }, [hasPrev, currentIndex, allHits, onNavigate, resetView]); - - const goNext = useCallback(() => { - if (hasNext) { - resetView(); - onNavigate(allHits[currentIndex + 1]); - } - }, [hasNext, currentIndex, allHits, onNavigate, resetView]); - - useEffect(() => { - const handleKey = (e: 
KeyboardEvent) => { - if (e.key === "Escape") onClose(); - if (e.key === "ArrowLeft") goPrev(); - if (e.key === "ArrowRight") goNext(); - }; - window.addEventListener("keydown", handleKey); - return () => window.removeEventListener("keydown", handleKey); - }, [onClose, goPrev, goNext]); - - const handleWheel = useCallback((e: React.WheelEvent) => { - e.preventDefault(); - setScale((prev) => Math.max(0.5, Math.min(5, prev - e.deltaY * 0.002))); - }, []); - - const handleMouseDown = useCallback( - (e: React.MouseEvent) => { - if (scale <= 1) return; - setDragging(true); - dragStart.current = { x: e.clientX, y: e.clientY }; - posStart.current = { ...position }; - }, - [scale, position] - ); - - const handleMouseMove = useCallback( - (e: React.MouseEvent) => { - if (!dragging) return; - setPosition({ - x: posStart.current.x + (e.clientX - dragStart.current.x), - y: posStart.current.y + (e.clientY - dragStart.current.y), - }); - }, - [dragging] - ); - - const handleMouseUp = useCallback(() => { - setDragging(false); - }, []); - - const slug = hit.url.split("/wiki/").pop() ?? ""; - const title = - decodeURIComponent(slug).replace(/_/g, " ") || - `Article #${hit.article_id}`; - - return ( - - { - if (e.target === e.currentTarget) onClose(); - }} - > - {/* Image area */} -
- {`tile -
- - {/* Metadata sidebar */} - -

{title}

- {hit.url && ( - - Open article - - )} - -
-
-
Score
-
- {hit.score.toFixed(4)} -
-
-
-
Rank
-
#{currentIndex + 1} of {allHits.length}
-
-
-
Position
-
- tile {hit.tile_index} : chunk {hit.chunk_index} -
-
-
-
Tile Height
-
{hit.tile_height}px
-
-
-
Y Offset
-
{hit.y_offset}px
-
-
-
Vector ID
-
{hit.vector_id}
-
-
- -
- Scroll to zoom · Drag to pan · Arrow keys to navigate -
-
- - {/* Close button */} - - - {/* Navigation arrows */} - {hasPrev && ( - - )} - {hasNext && ( - - )} -
-
- ); -} -``` - -- [ ] **Step 2: Verify build** - -```bash -cd /home/yichuan/pixelrag/web && npm run build -``` - -Expected: Build succeeds. - -- [ ] **Step 3: Commit** - -```bash -cd /home/yichuan/pixelrag -git add web/src/components/Lightbox.tsx -git commit -m "feat(web): add Lightbox component with pan/zoom/navigation" -``` - ---- - -## Phase 2: Full Features - -### Task 10: Advanced Search Controls - -**Files:** -- Create: `web/src/components/SearchControls.tsx` -- Modify: `web/src/app/page.tsx` - -- [ ] **Step 1: Create SearchControls component** - -Create `web/src/components/SearchControls.tsx`: - -```tsx -"use client"; - -import { useState } from "react"; -import { ChevronDown, ChevronUp } from "lucide-react"; -import { Input } from "@/components/ui/input"; - -export interface SearchOptions { - n_docs: number; - nprobe?: number; - min_tile_height?: number; - instruction?: string; -} - -interface SearchControlsProps { - options: SearchOptions; - onChange: (options: SearchOptions) => void; -} - -export function SearchControls({ options, onChange }: SearchControlsProps) { - const [open, setOpen] = useState(false); - - return ( -
- - - {open && ( -
-
- - - onChange({ ...options, n_docs: parseInt(e.target.value) || 10 }) - } - min={1} - max={100} - className="mt-1 h-8 text-xs bg-background border-border" - /> -
-
- - - onChange({ - ...options, - nprobe: e.target.value ? parseInt(e.target.value) : undefined, - }) - } - placeholder="default" - className="mt-1 h-8 text-xs bg-background border-border" - /> -
-
- - - onChange({ - ...options, - min_tile_height: e.target.value - ? parseInt(e.target.value) - : undefined, - }) - } - placeholder="none" - className="mt-1 h-8 text-xs bg-background border-border" - /> -
-
- - - onChange({ - ...options, - instruction: e.target.value || undefined, - }) - } - placeholder="default" - className="mt-1 h-8 text-xs bg-background border-border" - /> -
-
- )} -
- ); -} -``` - -- [ ] **Step 2: Wire SearchControls into the search page** - -In `web/src/app/page.tsx`, add state and pass options to the search call: - -1. Add import: `import { SearchControls, type SearchOptions } from "@/components/SearchControls";` -2. Add state: `const [searchOptions, setSearchOptions] = useState({ n_docs: 20 });` -3. In `handleSearch`, change the `search()` call to use `searchOptions`: - ```ts - const res = await search({ - queries: [queryObj], - n_docs: searchOptions.n_docs, - nprobe: searchOptions.nprobe, - min_tile_height: searchOptions.min_tile_height, - instruction: searchOptions.instruction, - }); - ``` -4. Add `` right after ``. - -- [ ] **Step 3: Verify build** - -```bash -cd /home/yichuan/pixelrag/web && npm run build -``` - -- [ ] **Step 4: Commit** - -```bash -cd /home/yichuan/pixelrag -git add web/src/components/SearchControls.tsx web/src/app/page.tsx -git commit -m "feat(web): add advanced search controls (nprobe, min_tile_height, instruction)" -``` - ---- - -### Task 11: Side-by-Side Compare Panel - -**Files:** -- Create: `web/src/components/ComparePanel.tsx` -- Modify: `web/src/app/page.tsx` - -- [ ] **Step 1: Create ComparePanel component** - -Create `web/src/components/ComparePanel.tsx`: - -```tsx -"use client"; - -import { motion } from "framer-motion"; -import { X } from "lucide-react"; -import type { Hit } from "@/lib/types"; -import { tileUrl } from "@/lib/api"; - -interface ComparePanelProps { - hits: Hit[]; - allHits: Hit[]; - onClose: () => void; -} - -export function ComparePanel({ hits, allHits, onClose }: ComparePanelProps) { - return ( - -
-
-

- Comparing {hits.length} tiles -

- -
- -
- {hits.map((hit) => { - const rank = - allHits.findIndex((h) => h.vector_id === hit.vector_id) + 1; - const slug = hit.url.split("/wiki/").pop() ?? ""; - const title = - decodeURIComponent(slug).replace(/_/g, " ") || - `Article #${hit.article_id}`; - - return ( -
- {`tile -
-
- - {hit.score.toFixed(4)} - - - Rank #{rank} - -
-
{title}
-
- tile {hit.tile_index}:{hit.chunk_index} ·{" "} - {hit.tile_height}px -
-
-
- ); - })} -
-
-
- ); -} -``` - -- [ ] **Step 2: Wire ComparePanel into search page** - -In `web/src/app/page.tsx`: - -1. Add import: `import { ComparePanel } from "@/components/ComparePanel";` -2. Add state: `const [showCompare, setShowCompare] = useState(false);` -3. Replace the `Compare` button onClick with: `onClick={() => setShowCompare(true)}` -4. Add ComparePanel below the floating bar (inside AnimatePresence): - -```tsx -{showCompare && selectedHits.size >= 2 && ( - selectedHits.has(h.vector_id))} - allHits={allHits} - onClose={() => setShowCompare(false)} - /> -)} -``` - -- [ ] **Step 3: Verify build** - -```bash -cd /home/yichuan/pixelrag/web && npm run build -``` - -- [ ] **Step 4: Commit** - -```bash -cd /home/yichuan/pixelrag -git add web/src/components/ComparePanel.tsx web/src/app/page.tsx -git commit -m "feat(web): add side-by-side tile comparison panel" -``` - ---- - -### Task 12: Index Status Dashboard - -**Files:** -- Create: `web/src/components/StatusCard.tsx` -- Create: `web/src/app/status/page.tsx` - -- [ ] **Step 1: Create StatusCard component** - -Create `web/src/components/StatusCard.tsx`: - -```tsx -interface StatusCardProps { - label: string; - value: string; - sub?: string; -} - -export function StatusCard({ label, value, sub }: StatusCardProps) { - return ( -
-
- {label} -
-
{value}
- {sub && ( -
{sub}
- )} -
- ); -} -``` - -- [ ] **Step 2: Create status page** - -Create `web/src/app/status/page.tsx`: - -```tsx -"use client"; - -import { useEffect, useState } from "react"; -import { getStatus } from "@/lib/api"; -import type { StatusResponse } from "@/lib/types"; -import { StatusCard } from "@/components/StatusCard"; - -function formatBytes(bytes: number): string { - if (bytes < 1024) return `${bytes} B`; - if (bytes < 1024 ** 2) return `${(bytes / 1024).toFixed(1)} KB`; - if (bytes < 1024 ** 3) return `${(bytes / 1024 ** 2).toFixed(1)} MB`; - return `${(bytes / 1024 ** 3).toFixed(2)} GB`; -} - -function formatVectors(n: number): string { - if (n >= 1_000_000) return `${(n / 1_000_000).toFixed(1)}M`; - if (n >= 1_000) return `${(n / 1_000).toFixed(1)}K`; - return `${n}`; -} - -export default function StatusPage() { - const [status, setStatus] = useState(null); - const [error, setError] = useState(null); - - useEffect(() => { - getStatus() - .then(setStatus) - .catch((err) => - setError(err instanceof Error ? err.message : "Failed to load status") - ); - }, []); - - if (error) { - return ( -
-

- Index Status -

-
- {error} -
-
- ); - } - - if (!status) { - return ( -
-

- Index Status -

-
- {[1, 2, 3, 4].map((i) => ( -
- ))} -
-
- ); - } - - return ( -
-

- Index Status -

- -
- - - - -
- -

Configuration

-
- {[ - ["nlist", `${status.nlist}`], - ["nprobe", `${status.nprobe}`], - ["Built at", new Date(status.index_built_at).toLocaleString()], - ["Index dir", status.index_dir], - ["Tiles dir", status.tiles_dir], - ].map(([label, value]) => ( -
- - {label} - - {value} -
- ))} -
-
- ); -} -``` - -- [ ] **Step 3: Verify build** - -```bash -cd /home/yichuan/pixelrag/web && npm run build -``` - -- [ ] **Step 4: Commit** - -```bash -cd /home/yichuan/pixelrag -git add web/src/components/StatusCard.tsx web/src/app/status/page.tsx -git commit -m "feat(web): add index status dashboard page" -``` - ---- - -### Task 13: API Documentation Page - -**Files:** -- Create: `web/src/components/ApiPlayground.tsx` -- Create: `web/src/app/docs/page.tsx` - -- [ ] **Step 1: Create ApiPlayground component** - -Create `web/src/components/ApiPlayground.tsx`: - -```tsx -"use client"; - -import { useState } from "react"; -import { Button } from "@/components/ui/button"; - -interface ApiPlaygroundProps { - method: "GET" | "POST"; - path: string; - defaultBody?: string; -} - -export function ApiPlayground({ - method, - path, - defaultBody, -}: ApiPlaygroundProps) { - const [body, setBody] = useState(defaultBody ?? ""); - const [response, setResponse] = useState(null); - const [isLoading, setIsLoading] = useState(false); - const [error, setError] = useState(null); - - const handleSend = async () => { - setIsLoading(true); - setError(null); - setResponse(null); - - try { - const url = `/api${path}`; - const init: RequestInit = - method === "POST" - ? { - method: "POST", - headers: { "Content-Type": "application/json" }, - body, - } - : {}; - const res = await fetch(url, init); - const text = await res.text(); - try { - setResponse(JSON.stringify(JSON.parse(text), null, 2)); - } catch { - setResponse(text); - } - } catch (err) { - setError(err instanceof Error ? err.message : "Request failed"); - } finally { - setIsLoading(false); - } - }; - - return ( -
-
- Try it -
- {method === "POST" && ( -