diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 84f2882a9..a4ef95f56 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -14,7 +14,7 @@ jobs:
       - name: Install uv
         uses: astral-sh/setup-uv@v8.1.0
         with:
-          version: "0.11.13"
+          version: "0.11.16"
       - name: Setup Python
         uses: actions/setup-python@v6
         with:
@@ -35,6 +35,7 @@ jobs:
             radis_dev-web:latest
             radis_dev-default_worker:latest
             radis_dev-llm_worker:latest
+            radis_dev-embeddings_worker:latest
           cache-from: type=gha
           cache-to: type=gha,mode=max
       - name: Start Docker containers
diff --git a/Dockerfile b/Dockerfile
index 5f4982270..57215cc2a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -21,7 +21,7 @@ RUN apt-get update \
   postgresql-client-17 \
   && rm -rf /var/lib/apt/lists/*
 
-COPY --from=ghcr.io/astral-sh/uv:0.11.13 /uv /uvx /bin/
+COPY --from=ghcr.io/astral-sh/uv:0.11.16 /uv /uvx /bin/
 
 WORKDIR /app
 
diff --git a/docker-compose.base.yml b/docker-compose.base.yml
index 7fa3cc471..ece29b59a 100644
--- a/docker-compose.base.yml
+++ b/docker-compose.base.yml
@@ -17,6 +17,12 @@ x-app: &default-app
     DJANGO_SERVER_EMAIL: ${DJANGO_SERVER_EMAIL:?}
     EXTERNAL_LLM_PROVIDER_URL: ${EXTERNAL_LLM_PROVIDER_URL:-}
     EXTERNAL_LLM_PROVIDER_API_KEY: ${EXTERNAL_LLM_PROVIDER_API_KEY:-}
+    EMBEDDING_BACKEND: ${EMBEDDING_BACKEND:-openai}
+    EMBEDDING_PROVIDER_URL: ${EMBEDDING_PROVIDER_URL:-}
+    EMBEDDING_PROVIDER_PATH: ${EMBEDDING_PROVIDER_PATH:-}
+    EMBEDDING_PROVIDER_API_KEY: ${EMBEDDING_PROVIDER_API_KEY:-}
+    EMBEDDING_MODEL_NAME: ${EMBEDDING_MODEL_NAME:-Qwen/Qwen3-Embedding-4B}
+    EMBEDDING_DIM: ${EMBEDDING_DIM:-1024}
     IS_DOCKER_CONTAINER: 1
     HTTP_PROXY: ${HTTP_PROXY:-}
     HTTPS_PROXY: ${HTTPS_PROXY:-}
@@ -54,8 +60,12 @@ services:
     <<: *default-app
     hostname: llm_worker.local
 
+  embeddings_worker:
+    <<: *default-app
+    hostname: embeddings_worker.local
+
   postgres:
-    image: postgres:17
+    image: pgvector/pgvector:pg17
     hostname: postgres.local
     volumes:
       - postgres_data:/var/lib/postgresql/data
diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml
index fd6cd1d02..196e25463 100644
--- a/docker-compose.dev.yml
+++ b/docker-compose.dev.yml
@@ -82,6 +82,15 @@ services:
         ./manage.py bg_worker -l debug -q llm --autoreload
       "
 
+  embeddings_worker:
+    <<: *default-app
+    image: radis_dev-embeddings_worker:latest
+    command: >
+      bash -c "
+        wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} &&
+        ./manage.py bg_worker -l debug -q embeddings --autoreload --concurrency 4
+      "
+
   postgres:
     environment:
       POSTGRES_PASSWORD: postgres
diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index 15c386573..5824b6e23 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -77,6 +77,16 @@ services:
     deploy:
       <<: *deploy
 
+  embeddings_worker:
+    <<: *default-app
+    command: >
+      bash -c "
+        wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} &&
+        ./manage.py bg_worker -q embeddings --concurrency 4
+      "
+    deploy:
+      <<: *deploy
+
   postgres:
     environment:
       POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?}
diff --git a/docs/superpowers/plans/2026-05-28-hybrid-search.md b/docs/superpowers/plans/2026-05-28-hybrid-search.md
new file mode 100644
index 000000000..2b264e320
--- /dev/null
+++ b/docs/superpowers/plans/2026-05-28-hybrid-search.md
@@ -0,0 +1,1510 @@
+# Hybrid Search — Embedding Orchestrator + Negation-Aware Query Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Land the two remaining design pieces of the unified hybrid-search spec — (1) the periodic `EmbeddingJob`/`EmbeddingTask` orchestrator that replaces the `post_save`-driven embedding path, and (2) `QueryParser.unparse_for_embedding` which strips `NOT` branches before the vector half of `search()` calls the embedding service.
+
+**Architecture (orchestrator):** Mirror `ExtractionJob`/`ExtractionTask` (`radis/extractions/tasks.py:32`) and `subscription_launcher` (`radis/subscriptions/tasks.py:115`). A periodic `embedding_launcher` on the `default` queue creates one `EmbeddingJob` (system-owned) per drain; `process_embedding_job` (also on `default`) batches `ReportSearchVector` rows with `embedding IS NULL` into `EmbeddingTask` rows and dispatches them; `process_embedding_task` (on the `embeddings` queue) calls `EmbeddingClient`, `bulk_update`s the vectors, and rolls status up via `AnalysisJob.update_job_state`.
+
+**Architecture (negation):** A new `QueryParser.unparse_for_embedding(node)` static walker emits a string with `UnaryNode("NOT", X)` branches dropped and empty `BinaryNode` legs collapsed. The pgsearch provider calls it instead of `unparse()` before `embed_query`; if the result is empty, the vector side is skipped and the request degrades to FTS-only.
+
+**Tech Stack:** Django 5.1, Procrastinate (periodic tasks + `queueing_lock`), pgvector, pytest-django.
+
+**Spec:** `docs/superpowers/specs/2026-05-28-hybrid-search.md` (§6 orchestrator, §7.2/§7.8 negation).
+
+**Branch:** `feat/hybrid-search` (continue here; no worktree required).
+
+---
+
+## File Structure
+
+**Files to create:**
+
+| Path | Responsibility |
+|---|---|
+| `radis/pgsearch/migrations/0004_embedding_job_task.py` | Schema migration for `EmbeddingJob`, `EmbeddingTask`, `EmbeddingTask.reports` M2M |
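+| `radis/pgsearch/migrations/0005_system_user.py` (with `radis/pgsearch/migrations/_system_user_helper.py`) | Data migration that idempotently creates the system user; the helper module holds the reusable creation logic |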
+| `radis/pgsearch/tests/test_models_embedding.py` | Model-level tests: status defaults, owner FK, M2M |
+| `radis/pgsearch/tests/test_embedding_launcher.py` | Unit tests for `embedding_launcher` |
+| `radis/pgsearch/tests/test_process_embedding_job.py` | Unit tests for `process_embedding_job` |
+| `radis/pgsearch/tests/test_process_embedding_task.py` | Unit tests for `process_embedding_task` |
+| `radis/pgsearch/tests/test_migrations_system_user.py` | Test for the data migration |
+| `radis/search/tests/test_query_parser_unparse_for_embedding.py` | Unit tests for the new `QueryParser.unparse_for_embedding` |
+
+**Files to modify:**
+
+| Path | Change |
+|---|---|
+| `radis/settings/base.py:341-365` | Add `EMBEDDING_DRAIN_CRON`, `EMBEDDING_SYSTEM_USERNAME`; remove `EMBEDDING_BACKFILL_PRIORITY` (last) |
+| `radis/pgsearch/models.py` | Add `EmbeddingJob` and `EmbeddingTask` model classes |
+| `radis/pgsearch/tasks.py` | Replace contents: add `embedding_launcher`, `process_embedding_job`, `process_embedding_task`; remove `embed_reports` and `enqueue_embed_reports` |
+| `radis/pgsearch/signals.py` | Remove `enqueue_report_embedding` receiver (lines 19-23); keep the FTS receiver |
+| `radis/pgsearch/tests/test_signals.py` | Delete the two embedding-signal tests; the file becomes empty and is deleted |
+| `radis/search/utils/query_parser.py:293-314` | Add `QueryParser.unparse_for_embedding` next to existing `unparse` |
+| `radis/pgsearch/providers.py:103,213` | Replace `QueryParser.unparse(search.query)` with `QueryParser.unparse_for_embedding(search.query)`; skip embedding call when result is empty |
+| `radis/pgsearch/tests/test_provider_hybrid.py` | Add a hybrid test exercising the `NOT X` and `A AND NOT B` paths |
+| `docker-compose.dev.yml:85-92` | Add `--concurrency 4` to `embeddings_worker` command |
+| `docker-compose.prod.yml:80-88` | Add `--concurrency 4` to `embeddings_worker` command |
+
+**Files to delete:**
+
+| Path | Reason |
+|---|---|
+| `radis/pgsearch/management/commands/backfill_embeddings.py` | Replaced by `embedding_launcher.defer()` from a shell |
+| `radis/pgsearch/tests/test_backfill_command.py` | Tests for the deleted command |
+| `radis/pgsearch/tests/test_embed_reports_task.py` | Tests for the deleted `embed_reports` task |
+| `radis/pgsearch/tests/test_signals.py` | Whole file is deleted once the embedding tests are removed |
+
+---
+
+## Task 1: Add new settings (additive only)
+
+**Files:**
+- Modify: `radis/settings/base.py:341-365`
+- Modify: `example.env`
+
+`EMBEDDING_BACKFILL_PRIORITY` stays for now — it is removed in Task 10 after every caller is gone.
+
+- [ ] **Step 1: Add `EMBEDDING_DRAIN_CRON` and `EMBEDDING_SYSTEM_USERNAME` to settings**
+
+Edit `radis/settings/base.py`. Add after line 347 (after `EMBEDDING_DIM = env.int(...)`):
+
+```python
+EMBEDDING_DRAIN_CRON = env.str("EMBEDDING_DRAIN_CRON", default="0 2 * * *")
+```
+
+Add after line 360 (the `EMBEDDING_BACKFILL_PRIORITY` line):
+
+```python
+EMBEDDING_SYSTEM_USERNAME = "system"
+```
+
+- [ ] **Step 2: Document the env var in `example.env`**
+
+Append to the `EMBEDDING_*` block in `example.env`:
+
+```
+# Cron expression for the embedding orchestrator. Default nightly at 02:00.
+# Use "*/15 * * * *" for more aggressive dev draining.
+EMBEDDING_DRAIN_CRON=0 2 * * *
+```
+
+- [ ] **Step 3: Verify Django config loads**
+
+Run: `uv run python manage.py shell -c "from django.conf import settings; print(settings.EMBEDDING_DRAIN_CRON, settings.EMBEDDING_SYSTEM_USERNAME)"`
+Expected: prints `0 2 * * * system`
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add radis/settings/base.py example.env
+git commit -m "feat(pgsearch): add EMBEDDING_DRAIN_CRON and EMBEDDING_SYSTEM_USERNAME settings"
+```
+
+---
+
+## Task 2: Add `EmbeddingJob` and `EmbeddingTask` models
+
+**Files:**
+- Modify: `radis/pgsearch/models.py`
+- Create: `radis/pgsearch/migrations/0004_embedding_job_task.py`
+- Create: `radis/pgsearch/tests/test_models_embedding.py`
+
+- [ ] **Step 1: Write the failing model tests**
+
+Create `radis/pgsearch/tests/test_models_embedding.py`:
+
+```python
+import pytest
+from django.contrib.auth import get_user_model
+
+from radis.pgsearch.models import EmbeddingJob, EmbeddingTask
+from radis.reports.factories import ReportFactory
+
+User = get_user_model()
+pytestmark = pytest.mark.django_db
+
+
+def _system_user() -> "User":
+    return User.objects.create(username="system", is_active=False)
+
+
+def test_embedding_job_defaults():
+    job = EmbeddingJob.objects.create(owner=_system_user())
+    assert job.status == EmbeddingJob.Status.UNVERIFIED
+    assert job.urgent is False
+    assert job.send_finished_mail is False
+    assert job.queued_job_id is None
+
+
+def test_embedding_task_links_to_reports():
+    job = EmbeddingJob.objects.create(owner=_system_user())
+    reports = [ReportFactory.create() for _ in range(3)]
+    task = EmbeddingTask.objects.create(job=job)
+    task.reports.set(reports)
+    assert task.status == EmbeddingTask.Status.PENDING
+    assert set(task.reports.values_list("pk", flat=True)) == {r.pk for r in reports}
+    assert task.attempts == 0
+    assert task.queued_job_id is None
+```
+
+- [ ] **Step 2: Run tests — expect ImportError**
+
+Run: `uv run pytest radis/pgsearch/tests/test_models_embedding.py -v`
+Expected: FAIL — `ImportError: cannot import name 'EmbeddingJob'`
+
+- [ ] **Step 3: Add models to `radis/pgsearch/models.py`**
+
+Append to `radis/pgsearch/models.py`:
+
+```python
+from django.urls import reverse
+from procrastinate.contrib.django import app
+from procrastinate.contrib.django.models import ProcrastinateJob
+
+from radis.core.models import AnalysisJob, AnalysisTask
+
+
+class EmbeddingJob(AnalysisJob):
+    default_priority = settings.EMBEDDING_INDEX_PRIORITY
+    urgent_priority = settings.EMBEDDING_INDEX_PRIORITY
+
+    queued_job_id: int | None
+    queued_job = models.OneToOneField(
+        ProcrastinateJob, null=True, on_delete=models.SET_NULL, related_name="+"
+    )
+
+    tasks: models.QuerySet["EmbeddingTask"]
+
+    class Meta:
+        ordering = ["-created_at"]
+
+    def __str__(self) -> str:
+        return f"EmbeddingJob [{self.pk}]"
+
+    def delay(self) -> None:
+        queued_job_id = app.configure_task(
+            "radis.pgsearch.tasks.process_embedding_job",
+            allow_unknown=False,
+            priority=self.default_priority,
+        ).defer(job_id=self.pk)
+        self.queued_job_id = queued_job_id
+        self.save()
+
+
+class EmbeddingTask(AnalysisTask):
+    job = models.ForeignKey[EmbeddingJob](
+        EmbeddingJob, on_delete=models.CASCADE, related_name="tasks"
+    )
+    reports = models.ManyToManyField(Report, related_name="embedding_tasks")
+
+    def delay(self) -> None:
+        queued_job_id = app.configure_task(
+            "radis.pgsearch.tasks.process_embedding_task",
+            allow_unknown=False,
+            priority=settings.EMBEDDING_INDEX_PRIORITY,
+        ).defer(task_id=self.pk)
+        self.queued_job_id = queued_job_id
+        self.save()
+```
+
+- [ ] **Step 4: Generate the migration**
+
+Run: `uv run python manage.py makemigrations pgsearch --name embedding_job_task`
+Expected: creates `radis/pgsearch/migrations/0004_embedding_job_task.py` containing `CreateModel` operations for `EmbeddingJob`, `EmbeddingTask`, and the M2M through-table.
+
+- [ ] **Step 5: Apply the migration and re-run tests**
+
+Run: `uv run python manage.py migrate pgsearch`
+Then: `uv run pytest radis/pgsearch/tests/test_models_embedding.py -v`
+Expected: PASS
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add radis/pgsearch/models.py radis/pgsearch/migrations/0004_embedding_job_task.py radis/pgsearch/tests/test_models_embedding.py
+git commit -m "feat(pgsearch): add EmbeddingJob and EmbeddingTask models"
+```
+
+---
+
+## Task 3: Create the system user via data migration
+
+**Files:**
+- Create: `radis/pgsearch/migrations/0005_system_user.py` (plus its helper module `radis/pgsearch/migrations/_system_user_helper.py`)
+- Create: `radis/pgsearch/tests/test_migrations_system_user.py`
+
+- [ ] **Step 1: Write the failing migration test**
+
+Create `radis/pgsearch/tests/test_migrations_system_user.py`:
+
+```python
+import pytest
+from django.contrib.auth import get_user_model
+
+User = get_user_model()
+
+
+@pytest.mark.django_db
+def test_system_user_exists_after_migrations():
+    user = User.objects.get(username="system")
+    assert user.is_active is False
+    assert not user.has_usable_password()
+
+
+@pytest.mark.django_db
+def test_creating_system_user_twice_is_a_noop():
+    from radis.pgsearch.migrations import _system_user_helper
+
+    before = User.objects.filter(username="system").count()
+    _system_user_helper.create_system_user_idempotent(User)
+    after = User.objects.filter(username="system").count()
+    assert before == after == 1
+```
+
+- [ ] **Step 2: Run tests — expect failure**
+
+Run: `uv run pytest radis/pgsearch/tests/test_migrations_system_user.py -v`
+Expected: FAIL — system user does not exist yet OR ImportError on `_system_user_helper`.
+
+- [ ] **Step 3: Create the helper module**
+
+Create `radis/pgsearch/migrations/_system_user_helper.py`:
+
+```python
+from django.conf import settings
+
+
+def create_system_user_idempotent(user_model) -> None:
+    username = settings.EMBEDDING_SYSTEM_USERNAME
+    user, created = user_model.objects.get_or_create(
+        username=username,
+        defaults={"is_active": False},
+    )
+    if created:
+        user.set_unusable_password()
+        user.save()
+```
+
+- [ ] **Step 4: Create the data migration**
+
+Create `radis/pgsearch/migrations/0005_system_user.py`:
+
+```python
+from django.conf import settings
+from django.db import migrations
+
+from radis.pgsearch.migrations._system_user_helper import create_system_user_idempotent
+
+
+def forwards(apps, schema_editor):
+    User = apps.get_model(*settings.AUTH_USER_MODEL.split("."))
+    create_system_user_idempotent(User)
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("pgsearch", "0004_embedding_job_task"),
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+    operations = [migrations.RunPython(forwards, reverse_code=migrations.RunPython.noop)]
+```
+
+- [ ] **Step 5: Apply migration and run tests**
+
+Run: `uv run python manage.py migrate pgsearch`
+Then: `uv run pytest radis/pgsearch/tests/test_migrations_system_user.py -v`
+Expected: PASS
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add radis/pgsearch/migrations/0005_system_user.py radis/pgsearch/migrations/_system_user_helper.py radis/pgsearch/tests/test_migrations_system_user.py
+git commit -m "feat(pgsearch): add data migration for system user"
+```
+
+---
+
+## Task 4: Implement `process_embedding_task` (sub-task)
+
+**Files:**
+- Modify: `radis/pgsearch/tasks.py`
+- Create: `radis/pgsearch/tests/test_process_embedding_task.py`
+
+The existing `embed_reports` task and its helper stay in place for now — they are removed in Task 8. This task adds the new sub-task alongside.
+
+- [ ] **Step 1: Write the failing tests**
+
+Create `radis/pgsearch/tests/test_process_embedding_task.py`:
+
+```python
+from unittest.mock import MagicMock, patch
+
+import numpy as np
+import pytest
+from django.contrib.auth import get_user_model
+
+from radis.pgsearch.models import EmbeddingJob, EmbeddingTask, ReportSearchVector
+from radis.pgsearch.tasks import process_embedding_task as _wrapped
+from radis.pgsearch.utils.embedding_client import EmbeddingClientError
+from radis.reports.factories import ReportFactory
+
+User = get_user_model()
+process_embedding_task = _wrapped.__wrapped__  # type: ignore[attr-defined]
+pytestmark = pytest.mark.django_db
+
+
+def _make_task() -> EmbeddingTask:
+    owner = User.objects.get(username="system")
+    job = EmbeddingJob.objects.create(owner=owner)
+    task = EmbeddingTask.objects.create(job=job)
+    reports = [ReportFactory.create() for _ in range(2)]
+    task.reports.set(reports)
+    return task
+
+
+def _unit_vec(dim: int) -> list[float]:
+    v = np.ones(dim, dtype=np.float32)
+    return (v / np.linalg.norm(v)).tolist()
+
+
+def test_process_embedding_task_writes_vectors_and_marks_success(settings):
+    task = _make_task()
+    vec = _unit_vec(settings.EMBEDDING_DIM)
+    fake_client = MagicMock()
+    fake_client.embed_documents.return_value = [vec, vec]
+    with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake_client):
+        process_embedding_task(task.id)
+
+    task.refresh_from_db()
+    assert task.status == EmbeddingTask.Status.SUCCESS
+    assert task.queued_job_id is None
+    for report in task.reports.all():
+        rsv = ReportSearchVector.objects.get(report=report)
+        assert rsv.embedding is not None
+
+
+def test_process_embedding_task_failure_sets_status_and_raises():
+    task = _make_task()
+    fake_client = MagicMock()
+    fake_client.embed_documents.side_effect = EmbeddingClientError("boom")
+    with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake_client):
+        with pytest.raises(EmbeddingClientError):
+            process_embedding_task(task.id)
+
+    task.refresh_from_db()
+    assert task.status == EmbeddingTask.Status.FAILURE
+    assert task.queued_job_id is None
+    assert "boom" in task.message
+
+
+def test_process_embedding_task_calls_update_job_state(settings):
+    task = _make_task()
+    vec = _unit_vec(settings.EMBEDDING_DIM)
+    fake_client = MagicMock()
+    fake_client.embed_documents.return_value = [vec, vec]
+    with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake_client):
+        process_embedding_task(task.id)
+
+    task.job.refresh_from_db()
+    # All tasks succeeded; AnalysisJob.update_job_state rolls up to SUCCESS.
+    assert task.job.status == EmbeddingJob.Status.SUCCESS
+```
+
+- [ ] **Step 2: Run tests — expect ImportError**
+
+Run: `uv run pytest radis/pgsearch/tests/test_process_embedding_task.py -v`
+Expected: FAIL — `ImportError: cannot import name 'process_embedding_task'`
+
+- [ ] **Step 3: Add `process_embedding_task` to `radis/pgsearch/tasks.py`**
+
+Append to `radis/pgsearch/tasks.py` (existing imports already cover `logger`, `EmbeddingClient`, `ReportSearchVector`, `app`, `django_settings`):
+
+```python
+from django.utils import timezone
+
+from .models import EmbeddingTask
+from .utils.embedding_client import EmbeddingClientError
+
+
+@app.task(queue="embeddings")
+def process_embedding_task(task_id: int) -> None:
+    task = EmbeddingTask.objects.get(id=task_id)
+    task.status = EmbeddingTask.Status.IN_PROGRESS
+    task.started_at = timezone.now()
+    task.attempts = task.attempts + 1
+    task.save()
+
+    client = EmbeddingClient()
+    try:
+        report_ids = list(task.reports.values_list("pk", flat=True))
+        rsvs = list(
+            ReportSearchVector.objects
+            .filter(report_id__in=report_ids)
+            .select_related("report")
+            .only("id", "report_id", "report__body")
+        )
+        texts = [rsv.report.body for rsv in rsvs]
+        vectors = client.embed_documents(texts)
+        for rsv, vec in zip(rsvs, vectors, strict=True):
+            rsv.embedding = vec
+        ReportSearchVector.objects.bulk_update(rsvs, fields=["embedding"])
+
+        task.status = EmbeddingTask.Status.SUCCESS
+    except EmbeddingClientError as exc:
+        logger.exception("Embedding task %s failed: %s", task_id, exc)
+        task.status = EmbeddingTask.Status.FAILURE
+        task.message = str(exc)
+        raise
+    finally:
+        task.ended_at = timezone.now()
+        task.queued_job_id = None
+        task.save()
+        task.job.update_job_state()
+        client.close()
+```
+
+- [ ] **Step 4: Run tests and verify they pass**
+
+Run: `uv run pytest radis/pgsearch/tests/test_process_embedding_task.py -v`
+Expected: PASS (3 tests)
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add radis/pgsearch/tasks.py radis/pgsearch/tests/test_process_embedding_task.py
+git commit -m "feat(pgsearch): add process_embedding_task on embeddings queue"
+```
+
+---
+
+## Task 5: Implement `process_embedding_job` (orchestrator)
+
+**Files:**
+- Modify: `radis/pgsearch/tasks.py`
+- Create: `radis/pgsearch/tests/test_process_embedding_job.py`
+
+- [ ] **Step 1: Write the failing tests**
+
+Create `radis/pgsearch/tests/test_process_embedding_job.py`:
+
+```python
+from unittest.mock import patch
+
+import pytest
+from django.contrib.auth import get_user_model
+
+from radis.pgsearch.models import EmbeddingJob, EmbeddingTask, ReportSearchVector
+from radis.pgsearch.tasks import process_embedding_job as _wrapped
+from radis.reports.factories import ReportFactory
+
+User = get_user_model()
+process_embedding_job = _wrapped.__wrapped__  # type: ignore[attr-defined]
+pytestmark = pytest.mark.django_db
+
+
+def _new_job() -> EmbeddingJob:
+    owner = User.objects.get(username="system")
+    return EmbeddingJob.objects.create(owner=owner, status=EmbeddingJob.Status.PREPARING)
+
+
+def _make_pending_reports(n: int):
+    reports = [ReportFactory.create() for _ in range(n)]
+    # ReportFactory triggers the FTS post_save signal which creates ReportSearchVector
+    # rows with embedding=NULL; that's exactly the pending state we want.
+    return reports
+
+
+def test_process_embedding_job_batches_pending_reports(settings):
+    settings.EMBEDDING_BATCH_SIZE = 2
+    job = _new_job()
+    reports = _make_pending_reports(5)
+
+    with patch("radis.pgsearch.models.EmbeddingTask.delay") as delay_mock:
+        process_embedding_job(job.id)
+
+    job.refresh_from_db()
+    assert job.status == EmbeddingJob.Status.PENDING
+    # ceil(5 / 2) = 3 tasks
+    assert job.tasks.count() == 3
+    # All tasks are dispatched
+    assert delay_mock.call_count == 3
+    # Every pending report is in exactly one task
+    covered = set()
+    for task in job.tasks.all():
+        covered.update(task.reports.values_list("pk", flat=True))
+    assert covered == {r.pk for r in reports}
+
+
+def test_process_embedding_job_resume_path_only_redispatches_pending_tasks(settings):
+    settings.EMBEDDING_BATCH_SIZE = 2
+    job = _new_job()
+    reports = _make_pending_reports(2)
+    # Simulate a previous orchestrator run that created one task already.
+    existing = EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.PENDING)
+    existing.reports.set(reports)
+    succeeded = EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.SUCCESS)
+
+    with patch("radis.pgsearch.models.EmbeddingTask.delay") as delay_mock:
+        process_embedding_job(job.id)
+
+    job.refresh_from_db()
+    assert job.status == EmbeddingJob.Status.PENDING
+    # No new tasks created
+    assert job.tasks.count() == 2
+    # Only the pending one is dispatched
+    assert delay_mock.call_count == 1
+
+
+def test_process_embedding_job_with_no_pending_rows():
+    job = _new_job()
+    # No reports exist → no ReportSearchVector rows with embedding IS NULL.
+
+    with patch("radis.pgsearch.models.EmbeddingTask.delay") as delay_mock:
+        process_embedding_job(job.id)
+
+    job.refresh_from_db()
+    assert job.status == EmbeddingJob.Status.PENDING
+    assert job.tasks.count() == 0
+    assert delay_mock.call_count == 0
+```
+
+- [ ] **Step 2: Run tests — expect ImportError**
+
+Run: `uv run pytest radis/pgsearch/tests/test_process_embedding_job.py -v`
+Expected: FAIL — `ImportError: cannot import name 'process_embedding_job'`
+
+- [ ] **Step 3: Add `process_embedding_job` to `radis/pgsearch/tasks.py`**
+
+Append to `radis/pgsearch/tasks.py`:
+
+```python
+from .models import EmbeddingJob
+
+
+def _create_embedding_task(job: EmbeddingJob, report_ids: list[int]) -> EmbeddingTask:
+    from radis.reports.models import Report
+
+    task = EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.PENDING)
+    task.reports.set(Report.objects.filter(pk__in=report_ids))
+    return task
+
+
+@app.task
+def process_embedding_job(job_id: int) -> None:
+    job = EmbeddingJob.objects.get(id=job_id)
+    assert job.status == EmbeddingJob.Status.PREPARING
+
+    if job.tasks.exists():
+        tasks_to_enqueue = job.tasks.filter(status=EmbeddingTask.Status.PENDING)
+    else:
+        pending_ids_iter = (
+            ReportSearchVector.objects
+            .filter(embedding__isnull=True)
+            .values_list("report_id", flat=True)
+            .iterator(chunk_size=10_000)
+        )
+        batch: list[int] = []
+        for report_id in pending_ids_iter:
+            batch.append(int(report_id))
+            if len(batch) >= django_settings.EMBEDDING_BATCH_SIZE:
+                _create_embedding_task(job, batch)
+                batch = []
+        if batch:
+            _create_embedding_task(job, batch)
+
+        tasks_to_enqueue = job.tasks.filter(status=EmbeddingTask.Status.PENDING)
+
+    job.status = EmbeddingJob.Status.PENDING
+    job.queued_job_id = None
+    job.save()
+
+    for task in tasks_to_enqueue:
+        if not task.is_queued:
+            task.delay()
+```
+
+- [ ] **Step 4: Run tests and verify pass**
+
+Run: `uv run pytest radis/pgsearch/tests/test_process_embedding_job.py -v`
+Expected: PASS (3 tests)
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add radis/pgsearch/tasks.py radis/pgsearch/tests/test_process_embedding_job.py
+git commit -m "feat(pgsearch): add process_embedding_job orchestrator"
+```
+
+---
+
+## Task 6: Implement `embedding_launcher` (periodic)
+
+**Files:**
+- Modify: `radis/pgsearch/tasks.py`
+- Create: `radis/pgsearch/tests/test_embedding_launcher.py`
+
+- [ ] **Step 1: Write the failing tests**
+
+Create `radis/pgsearch/tests/test_embedding_launcher.py`:
+
+```python
+from unittest.mock import patch
+
+import pytest
+from django.contrib.auth import get_user_model
+
+from radis.pgsearch.models import EmbeddingJob
+from radis.pgsearch.tasks import embedding_launcher as _wrapped
+from radis.reports.factories import ReportFactory
+
+User = get_user_model()
+embedding_launcher = _wrapped.__wrapped__  # type: ignore[attr-defined]
+pytestmark = pytest.mark.django_db
+
+
+def test_embedding_launcher_noop_when_job_in_flight():
+    owner = User.objects.get(username="system")
+    EmbeddingJob.objects.create(owner=owner, status=EmbeddingJob.Status.PREPARING)
+    # Make a pending report so the second guard wouldn't short-circuit on its own.
+    ReportFactory.create()
+
+    with patch("radis.pgsearch.models.EmbeddingJob.delay") as delay_mock:
+        embedding_launcher(context=None, timestamp=0)
+
+    assert delay_mock.call_count == 0
+    # No new job created.
+    assert EmbeddingJob.objects.count() == 1
+
+
+def test_embedding_launcher_noop_when_no_pending_rows():
+    with patch("radis.pgsearch.models.EmbeddingJob.delay") as delay_mock:
+        embedding_launcher(context=None, timestamp=0)
+
+    assert delay_mock.call_count == 0
+    assert EmbeddingJob.objects.count() == 0
+
+
+def test_embedding_launcher_happy_path_creates_job_and_defers(
+    django_capture_on_commit_callbacks,
+):
+    ReportFactory.create()
+
+    with patch("radis.pgsearch.models.EmbeddingJob.delay") as delay_mock:
+        with django_capture_on_commit_callbacks(execute=True):
+            embedding_launcher(context=None, timestamp=0)
+
+    assert EmbeddingJob.objects.count() == 1
+    job = EmbeddingJob.objects.get()
+    assert job.status == EmbeddingJob.Status.PREPARING
+    assert job.owner.username == "system"
+    delay_mock.assert_called_once()
+```
+
+- [ ] **Step 2: Run tests — expect ImportError**
+
+Run: `uv run pytest radis/pgsearch/tests/test_embedding_launcher.py -v`
+Expected: FAIL — `ImportError: cannot import name 'embedding_launcher'`
+
+- [ ] **Step 3: Add `embedding_launcher` to `radis/pgsearch/tasks.py`**
+
+Append to `radis/pgsearch/tasks.py`:
+
+```python
+from django.contrib.auth import get_user_model
+from django.db import transaction
+
+
+@app.periodic(cron=django_settings.EMBEDDING_DRAIN_CRON)
+@app.task(
+    queue="default",
+    queueing_lock="embedding_launcher",
+    pass_context=True,
+)
+def embedding_launcher(context, timestamp: int) -> None:
+    in_flight = EmbeddingJob.objects.filter(
+        status__in=[
+            EmbeddingJob.Status.PREPARING,
+            EmbeddingJob.Status.PENDING,
+            EmbeddingJob.Status.IN_PROGRESS,
+        ]
+    ).exists()
+    if in_flight:
+        logger.info("EmbeddingJob already in flight; launcher tick is a no-op.")
+        return
+
+    has_pending = ReportSearchVector.objects.filter(embedding__isnull=True).exists()
+    if not has_pending:
+        logger.debug("No reports pending embedding; launcher tick is a no-op.")
+        return
+
+    User = get_user_model()
+    system_user = User.objects.get(username=django_settings.EMBEDDING_SYSTEM_USERNAME)
+    job = EmbeddingJob.objects.create(
+        owner=system_user,
+        status=EmbeddingJob.Status.PREPARING,
+    )
+    transaction.on_commit(job.delay)
+```
+
+- [ ] **Step 4: Run tests and verify pass**
+
+Run: `uv run pytest radis/pgsearch/tests/test_embedding_launcher.py -v`
+Expected: PASS (3 tests)
+
+- [ ] **Step 5: Verify the full pgsearch test suite still passes**
+
+Run: `uv run pytest radis/pgsearch/ -v`
+Expected: PASS for all new tests; old `test_embed_reports_task.py`, `test_backfill_command.py`, `test_signals.py` still pass since their targets aren't removed yet.
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add radis/pgsearch/tasks.py radis/pgsearch/tests/test_embedding_launcher.py
+git commit -m "feat(pgsearch): add embedding_launcher periodic task"
+```
+
+---
+
+## Task 7: Remove old `enqueue_report_embedding` signal
+
+**Files:**
+- Modify: `radis/pgsearch/signals.py:19-23`
+- Delete: `radis/pgsearch/tests/test_signals.py` (the file becomes empty)
+
+The FTS signal `create_or_update_report_search_vector` stays. The embedding signal is the only thing being removed.
+
+- [ ] **Step 1: Remove the embedding signal receiver**
+
+Replace `radis/pgsearch/signals.py` contents with:
+
+```python
+from django.db.models.signals import post_save
+from django.dispatch import receiver
+
+from radis.reports.models import Report
+
+from .models import ReportSearchVector
+
+
+@receiver(post_save, sender=Report)
+def create_or_update_report_search_vector(sender, instance, created, **kwargs):
+    if created:
+        ReportSearchVector.objects.create(report=instance)
+        return
+    instance.search_vector.save()
+```
+
+(Removes the `transaction` and `enqueue_embed_reports` imports along with the second receiver.)
+
+- [ ] **Step 2: Delete the signal test file**
+
+Run: `rm radis/pgsearch/tests/test_signals.py`
+
+- [ ] **Step 3: Run the full pgsearch test suite**
+
+Run: `uv run pytest radis/pgsearch/ -v`
+Expected: PASS for everything; `test_signals.py` no longer collected.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add radis/pgsearch/signals.py
+git rm radis/pgsearch/tests/test_signals.py
+git commit -m "refactor(pgsearch): remove post_save embedding signal (replaced by orchestrator)"
+```
+
+---
+
+## Task 8: Remove `embed_reports` task and `enqueue_embed_reports` helper
+
+**Files:**
+- Modify: `radis/pgsearch/tasks.py`
+- Delete: `radis/pgsearch/tests/test_embed_reports_task.py`
+
+After Task 7 removed the signal, the only remaining non-test importer of `enqueue_embed_reports` is the `backfill_embeddings` management command, which is deleted in Task 9. Removing the task here therefore breaks that command's import until Task 9 is done. Treat the two as one logical step: this task removes the task and its helper, Task 9 removes the command. Order matters — do Task 8 *and* Task 9 in immediate succession and commit them together, so that no commit ever contains a dangling import.
+
+Confirm with grep before deleting:
+
+- [ ] **Step 1: Confirm only the backfill command still imports the helper**
+
+Run: `grep -rn "enqueue_embed_reports\|embed_reports" radis/ --include="*.py" | grep -v __pycache__`
+Expected: only references in `radis/pgsearch/tasks.py`, `radis/pgsearch/management/commands/backfill_embeddings.py`, and `radis/pgsearch/tests/test_embed_reports_task.py`.
+
+- [ ] **Step 2: Remove `embed_reports` and `enqueue_embed_reports` from `radis/pgsearch/tasks.py`**
+
+In `radis/pgsearch/tasks.py`, delete the function definitions for `embed_reports` (the `@app.task(queue="embeddings")` block currently at lines ~37-68) and `enqueue_embed_reports` (currently at lines ~71-84). Keep `bulk_index_reports`, `enqueue_bulk_index_reports`, and all the new orchestrator code added in Tasks 4–6.
+
+- [ ] **Step 3: Delete the old test file**
+
+Run: `rm radis/pgsearch/tests/test_embed_reports_task.py`
+
+- [ ] **Step 4: Confirm the backfill command's import now fails (expected)**
+
+Run: `uv run python manage.py shell -c "from radis.pgsearch.management.commands import backfill_embeddings"`
+Expected: `ImportError: cannot import name 'enqueue_embed_reports'` — this confirms Task 9 (deleting the command) is the immediate next step.
+
+- [ ] **Step 5: Do NOT commit yet — proceed straight to Task 9**
+
+The tree is in a broken intermediate state. Move to Task 9 before committing.
+
+---
+
+## Task 9: Remove `backfill_embeddings` management command
+
+**Files:**
+- Delete: `radis/pgsearch/management/commands/backfill_embeddings.py`
+- Delete: `radis/pgsearch/tests/test_backfill_command.py`
+
+- [ ] **Step 1: Delete the command and its test**
+
+Run:
+```bash
+rm radis/pgsearch/management/commands/backfill_embeddings.py
+rm radis/pgsearch/tests/test_backfill_command.py
+```
+
+- [ ] **Step 2: Verify no remaining references**
+
+Run: `grep -rn "backfill_embeddings\|enqueue_embed_reports\|embed_reports" radis/ --include="*.py" | grep -v __pycache__`
+Expected: empty output.
+
+- [ ] **Step 3: Run the full pgsearch test suite**
+
+Run: `uv run pytest radis/pgsearch/ -v`
+Expected: PASS for everything; the removed test files are no longer collected.
+
+- [ ] **Step 4: Commit Tasks 8 + 9 together**
+
+```bash
+git add radis/pgsearch/tasks.py
+git rm radis/pgsearch/tests/test_embed_reports_task.py
+git rm radis/pgsearch/management/commands/backfill_embeddings.py
+git rm radis/pgsearch/tests/test_backfill_command.py
+git commit -m "refactor(pgsearch): remove embed_reports task and backfill_embeddings command"
+```
+
+---
+
+## Task 10: Remove `EMBEDDING_BACKFILL_PRIORITY` setting
+
+**Files:**
+- Modify: `radis/settings/base.py:360`
+
+- [ ] **Step 1: Confirm no remaining references**
+
+Run: `grep -rn "EMBEDDING_BACKFILL_PRIORITY" radis/ --include="*.py" | grep -v __pycache__`
+Expected: only `radis/settings/base.py:360`.
+
+- [ ] **Step 2: Remove the setting line**
+
+In `radis/settings/base.py`, delete the line:
+
+```python
+EMBEDDING_BACKFILL_PRIORITY = -1
+```
+
+- [ ] **Step 3: Verify Django still loads**
+
+Run: `uv run python manage.py shell -c "from django.conf import settings; print(settings.EMBEDDING_INDEX_PRIORITY)"`
+Expected: prints `0`.
+
+- [ ] **Step 4: Run full test suite to confirm nothing dangles**
+
+Run: `uv run pytest radis/pgsearch/ -v`
+Expected: PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add radis/settings/base.py
+git commit -m "refactor(pgsearch): remove EMBEDDING_BACKFILL_PRIORITY setting"
+```
+
+---
+
+## Task 11: Set `--concurrency 4` on the embeddings worker
+
+**Files:**
+- Modify: `docker-compose.dev.yml:85-92`
+- Modify: `docker-compose.prod.yml:80-88`
+
+The orchestrator is on `default`, so the `embeddings_worker` only runs `process_embedding_task`. Concurrency 4 saturates a typical embedding endpoint while leaving headroom; raise/lower per deployment.
+
+- [ ] **Step 1: Update `docker-compose.dev.yml`**
+
+Edit `docker-compose.dev.yml`. Change the `embeddings_worker` command from:
+
+```yaml
+    command: >
+      bash -c "
+        wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} &&
+        ./manage.py bg_worker -l debug -q embeddings --autoreload
+      "
+```
+
+to:
+
+```yaml
+    command: >
+      bash -c "
+        wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} &&
+        ./manage.py bg_worker -l debug -q embeddings --autoreload --concurrency 4
+      "
+```
+
+- [ ] **Step 2: Update `docker-compose.prod.yml`**
+
+Edit `docker-compose.prod.yml`. Change the `embeddings_worker` command from:
+
+```yaml
+    command: >
+      bash -c "
+        wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} &&
+        ./manage.py bg_worker -q embeddings
+      "
+```
+
+to:
+
+```yaml
+    command: >
+      bash -c "
+        wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} &&
+        ./manage.py bg_worker -q embeddings --concurrency 4
+      "
+```
+
+- [ ] **Step 3: Validate compose syntax**
+
+Run: `docker compose -f docker-compose.dev.yml config > /dev/null && docker compose -f docker-compose.prod.yml config > /dev/null`
+Expected: exit 0, no output. (If Docker is not running locally, skip — this just confirms YAML is well-formed.)
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add docker-compose.dev.yml docker-compose.prod.yml
+git commit -m "feat(infra): run embeddings_worker with --concurrency 4"
+```
+
+---
+
+## Task 12: Add `QueryParser.unparse_for_embedding`
+
+**Files:**
+- Modify: `radis/search/utils/query_parser.py:293-314` (append new static method after the existing `unparse`)
+- Create: `radis/search/tests/test_query_parser_unparse_for_embedding.py`
+
+The method walks the same AST as `unparse` (`TermNode | ParensNode | UnaryNode | BinaryNode` defined at `radis/search/utils/query_parser.py:55`) but drops `UnaryNode("NOT", X)` branches and collapses empty `BinaryNode` legs. The grammar's only unary operator is `NOT` (per `radis/search/utils/query_parser.py:214`), so the implementation can assume that. The empty string is a legitimate return value (e.g., for `NOT X` alone) and callers handle it.
+
+- [ ] **Step 1: Write the failing tests**
+
+Create `radis/search/tests/test_query_parser_unparse_for_embedding.py`:
+
+```python
+import pytest
+
+from radis.search.utils.query_parser import QueryParser
+
+
+@pytest.mark.parametrize(
+    "query,expected",
+    [
+        # Simple positive term — unchanged.
+        ("pneumothorax", "pneumothorax"),
+        # Phrase preserved with quotes.
+        ('"chest x-ray"', '"chest x-ray"'),
+        # Implicit AND (no operator) — both sides survive.
+        ("cardiac arrest", "cardiac arrest"),
+        # Explicit AND — both sides survive, operator preserved.
+        ("A AND B", "A AND B"),
+        # OR — both sides survive, operator preserved.
+        ("A OR B", "A OR B"),
+        # NOT alone — empty.
+        ("NOT pneumothorax", ""),
+        # AND NOT — left survives, NOT branch dropped, AND collapses.
+        ("A AND NOT B", "A"),
+        # NOT AND — right survives, NOT branch dropped, AND collapses.
+        ("NOT A AND B", "B"),
+        # NOT OR NOT — both branches dropped, empty.
+        ("NOT A OR NOT B", ""),
+        # Mixed: AND OR with a NOT branch — surviving structure retained.
+        ("(A AND NOT B) OR C", "(A) OR C"),
+        # Nested NOT inside parens — empty parens collapsed.
+        ("A AND (NOT B)", "A"),
+        # Double-nested OR with one NOT — only NOT branch dropped.
+        ("(A OR B) AND NOT C", "(A OR B)"),
+    ],
+)
+def test_unparse_for_embedding(query, expected):
+    node, _fixes = QueryParser().parse(query)
+    assert node is not None, f"parser produced empty node for {query!r}"
+    assert QueryParser.unparse_for_embedding(node) == expected
+```
+
+- [ ] **Step 2: Run tests — expect AttributeError**
+
+Run: `uv run pytest radis/search/tests/test_query_parser_unparse_for_embedding.py -v`
+Expected: FAIL — `AttributeError: type object 'QueryParser' has no attribute 'unparse_for_embedding'`
+
+- [ ] **Step 3: Add the method to `radis/search/utils/query_parser.py`**
+
+Append immediately after the existing `unparse` static method (after the closing of the `if/elif` chain that ends around line 314):
+
+```python
+    @staticmethod
+    def unparse_for_embedding(node: QueryNode) -> str:
+        """Like ``unparse``, but drops the operand of every ``UnaryNode("NOT", X)``
+        and collapses any ``BinaryNode`` with an emptied child down to its
+        surviving child (or to the empty string when both children are emptied).
+        Returns the empty string if the whole query reduces to NOT clauses.
+
+        Used by the hybrid-search vector half to avoid polarity-blind embedding
+        of negated terms (see spec 2026-05-28-hybrid-search §7.8).
+        """
+        if isinstance(node, TermNode):
+            return QueryParser.unparse(node)
+        if isinstance(node, ParensNode):
+            inner = QueryParser.unparse_for_embedding(node.expression)
+            return f"({inner})" if inner else ""
+        if isinstance(node, UnaryNode):
+            return ""
+        if isinstance(node, BinaryNode):
+            left = QueryParser.unparse_for_embedding(node.left)
+            right = QueryParser.unparse_for_embedding(node.right)
+            if not left and not right:
+                return ""
+            if not left:
+                return right
+            if not right:
+                return left
+            if node.implicit:
+                return f"{left} {right}"
+            return f"{left} {node.operator} {right}"
+        raise ValueError(f"Unknown node type: {type(node)}")
+```
+
+- [ ] **Step 4: Run tests and verify pass**
+
+Run: `uv run pytest radis/search/tests/test_query_parser_unparse_for_embedding.py -v`
+Expected: PASS (12 parameterized cases).
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add radis/search/utils/query_parser.py radis/search/tests/test_query_parser_unparse_for_embedding.py
+git commit -m "feat(search): add QueryParser.unparse_for_embedding that strips NOT branches"
+```
+
+---
+
+## Task 13: Wire `unparse_for_embedding` into the pgsearch provider
+
+**Files:**
+- Modify: `radis/pgsearch/providers.py:103` (in `search()`)
+- Modify: `radis/pgsearch/providers.py:213` (in `retrieve()`)
+- Modify: `radis/pgsearch/tests/test_provider_hybrid.py`
+
+Both `search()` and `retrieve()` currently call `QueryParser.unparse(search.query)` to build the text passed to `embed_query`. Replace with `unparse_for_embedding`. If the result is empty (e.g., the user query is `NOT X` alone), skip the embedding call and leave `query_vec = None` — the existing FTS-only fallback handles it.
+
+- [ ] **Step 1: Write the failing test**
+
+Append to `radis/pgsearch/tests/test_provider_hybrid.py` (use existing fixtures; structure mirrors current tests in that file):
+
+```python
+def test_search_skips_embedding_when_query_reduces_to_not(monkeypatch, ...):
+    """`NOT X` alone produces an empty embedding string; the provider must
+    not call the embedding service and must return FTS-only results."""
+    from radis.pgsearch import providers
+    from radis.search.site import Search, SearchFilters
+    from radis.search.utils.query_parser import QueryParser
+
+    embed_query_calls: list[str] = []
+
+    class FakeEC:
+        def __init__(self): pass
+        def __enter__(self): return self
+        def __exit__(self, *a): return False
+        def embed_query(self, text):
+            embed_query_calls.append(text)
+            raise AssertionError("embed_query should not be called for NOT-only query")
+
+    monkeypatch.setattr("radis.pgsearch.providers.EmbeddingClient", FakeEC)
+
+    node, _ = QueryParser().parse("NOT pneumothorax")
+    search = Search(query=node, offset=0, limit=10, filters=SearchFilters(group=...))
+    result = providers.search(search)
+
+    assert embed_query_calls == []
+    # FTS-only path still returns a SearchResult (possibly with zero hits).
+    assert result is not None
+
+
+def test_search_embeds_only_positive_branch_for_and_not(monkeypatch, ...):
+    """`A AND NOT B` embeds only `A`; FTS half still enforces the exclusion."""
+    from radis.pgsearch import providers
+    from radis.search.site import Search, SearchFilters
+    from radis.search.utils.query_parser import QueryParser
+
+    embed_query_calls: list[str] = []
+
+    class FakeEC:
+        def __init__(self): pass
+        def __enter__(self): return self
+        def __exit__(self, *a): return False
+        def embed_query(self, text):
+            embed_query_calls.append(text)
+            # Return a valid normalized unit vector of the right dim.
+            import numpy as np
+            from django.conf import settings as dj
+            v = np.ones(dj.EMBEDDING_DIM, dtype=np.float32)
+            return (v / np.linalg.norm(v)).tolist()
+
+    monkeypatch.setattr("radis.pgsearch.providers.EmbeddingClient", FakeEC)
+
+    node, _ = QueryParser().parse("pneumothorax AND NOT effusion")
+    search = Search(query=node, offset=0, limit=10, filters=SearchFilters(group=...))
+    providers.search(search)
+
+    assert embed_query_calls == ["pneumothorax"]
+```
+
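+Replace the `...` placeholder in each test signature with the fixtures the existing hybrid tests request, and replace `group=...` with the actual value used elsewhere in the file (it is whatever value the existing hybrid tests pass — read the file's other test bodies for the canonical filter setup).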
+
+- [ ] **Step 2: Run tests — expect failure**
+
+Run: `uv run pytest radis/pgsearch/tests/test_provider_hybrid.py -k "reduces_to_not or and_not" -v`
+Expected: FAIL — `embed_query` is still called with the unstripped text.
+
+- [ ] **Step 3: Modify `radis/pgsearch/providers.py:search()`**
+
+Locate the block currently at lines ~102-110 in `radis/pgsearch/providers.py`:
+
+```python
+    # Vector side: query embedding (sync HTTP); fall back gracefully on failure.
+    query_text = QueryParser.unparse(search.query)
+    query_vec: list[float] | None
+    try:
+        with EmbeddingClient() as ec:
+            query_vec = ec.embed_query(query_text)
+    except EmbeddingClientError as e:
+        logger.warning("Hybrid search falling back to FTS-only: %s", e)
+        query_vec = None
+```
+
+Replace with:
+
+```python
+    # Vector side: strip NOT branches (see spec §7.8). If nothing is left,
+    # skip the embedding call entirely and fall through to FTS-only.
+    query_text = QueryParser.unparse_for_embedding(search.query)
+    query_vec: list[float] | None = None
+    if query_text.strip():
+        try:
+            with EmbeddingClient() as ec:
+                query_vec = ec.embed_query(query_text)
+        except EmbeddingClientError as e:
+            logger.warning("Hybrid search falling back to FTS-only: %s", e)
+            query_vec = None
+```
+
+- [ ] **Step 4: Apply the same change to `retrieve()`**
+
+Locate the analogous block at lines ~212-220 in `radis/pgsearch/providers.py`. Apply the identical replacement.
+
+- [ ] **Step 5: Run tests and verify pass**
+
+Run: `uv run pytest radis/pgsearch/tests/test_provider_hybrid.py -v`
+Expected: PASS for all hybrid tests including the two new ones.
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add radis/pgsearch/providers.py radis/pgsearch/tests/test_provider_hybrid.py
+git commit -m "feat(pgsearch): use unparse_for_embedding to strip NOT branches before embed_query"
+```
+
+---
+
+## Task 14: Replace `EMBEDDING_DIM_MIGRATION_LITERAL` with `MigrationLoader`-based check
+
+**Files:**
+- Modify: `radis/pgsearch/apps.py` (delete constant, add helper, rewrite check)
+- Modify: `radis/pgsearch/tests/test_apps_checks.py` (replace constant import)
+
+The current check in `radis/pgsearch/apps.py` compares `settings.EMBEDDING_DIM` against a hand-maintained constant `EMBEDDING_DIM_MIGRATION_LITERAL = 1024` that must be kept in sync with the literal in migration 0003 by convention only. This task replaces the constant with a helper that reads the dim from Django's on-disk migration files via `MigrationLoader`, so there is one source of truth (the `dimensions=...` literal that `makemigrations` itself generates from `settings.EMBEDDING_DIM`). A new error id `pgsearch.E002` covers the case where the embedding field cannot be located in the migrations.
+
+See spec §4.6 for the design rationale and the alternatives-considered table.
+
+- [ ] **Step 1: Rewrite the test file to source the dim from the new helper**
+
+Replace the entire contents of `radis/pgsearch/tests/test_apps_checks.py` with:
+
+```python
+"""Tests for the Django system check that guards EMBEDDING_DIM/migration parity."""
+
+from unittest.mock import patch
+
+from django.test import override_settings
+
+from radis.pgsearch.apps import (
+    _migration_embedding_dim,
+    check_embedding_dim_matches_migration,
+)
+
+
+def test_migration_embedding_dim_returns_int_without_db():
+    dim = _migration_embedding_dim()
+    assert isinstance(dim, int)
+    assert dim == 1024
+
+
+def test_check_passes_when_dim_matches_migration():
+    dim = _migration_embedding_dim()
+    with override_settings(EMBEDDING_DIM=dim):
+        assert check_embedding_dim_matches_migration(app_configs=None) == []
+
+
+def test_check_fails_with_e001_when_dim_diverges_from_migration():
+    dim = _migration_embedding_dim()
+    with override_settings(EMBEDDING_DIM=dim + 1):
+        errors = check_embedding_dim_matches_migration(app_configs=None)
+    assert len(errors) == 1
+    err = errors[0]
+    assert err.id == "pgsearch.E001"
+    assert str(dim) in err.msg
+    assert str(dim + 1) in err.msg
+
+
+def test_check_fails_with_e002_when_migration_field_missing():
+    with patch(
+        "radis.pgsearch.apps._migration_embedding_dim", return_value=None
+    ):
+        errors = check_embedding_dim_matches_migration(app_configs=None)
+    assert len(errors) == 1
+    assert errors[0].id == "pgsearch.E002"
+```
+
+- [ ] **Step 2: Run tests — expect ImportError**
+
+Run: `uv run pytest radis/pgsearch/tests/test_apps_checks.py -v`
+Expected: FAIL — `ImportError: cannot import name '_migration_embedding_dim' from 'radis.pgsearch.apps'`
+
+- [ ] **Step 3: Rewrite `radis/pgsearch/apps.py`**
+
+Replace the entire file with:
+
+```python
+from django.apps import AppConfig
+from django.conf import settings
+from django.core.checks import Error, register
+
+
+class PgSearchConfig(AppConfig):
+    name = "radis.pgsearch"
+
+    def ready(self):
+        from . import signals as signals  # noqa: F401
+
+        register_app()
+
+
+def _migration_embedding_dim() -> int | None:
+    """Return the `dimensions` value of `ReportSearchVector.embedding` as
+    captured by the on-disk pgsearch migrations. Returns None if the field
+    cannot be located (migrations missing or model renamed)."""
+    from django.db.migrations.loader import MigrationLoader
+
+    loader = MigrationLoader(connection=None, ignore_no_migrations=True)
+    state = loader.project_state()
+    try:
+        model = state.apps.get_model("pgsearch", "ReportSearchVector")
+        return model._meta.get_field("embedding").dimensions
+    except (LookupError, AttributeError):
+        return None
+
+
+@register()
+def check_embedding_dim_matches_migration(app_configs, **kwargs):
+    """Fail loudly when settings.EMBEDDING_DIM diverges from the dim baked
+    into the pgsearch migrations. Mismatched values would otherwise surface as
+    opaque pgvector dimension errors on the first write or query."""
+    migration_dim = _migration_embedding_dim()
+
+    if migration_dim is None:
+        return [
+            Error(
+                "Could not determine the embedding column dimension from the "
+                "pgsearch migrations. Either the migrations are missing the "
+                "embedding field or the model has been renamed.",
+                id="pgsearch.E002",
+                hint=(
+                    "Verify that `radis/pgsearch/migrations/` contains a "
+                    "migration that adds the `embedding` field to "
+                    "`ReportSearchVector`, and that `makemigrations pgsearch` "
+                    "succeeds without changes."
+                ),
+            )
+        ]
+
+    if settings.EMBEDDING_DIM != migration_dim:
+        return [
+            Error(
+                f"EMBEDDING_DIM={settings.EMBEDDING_DIM} does not match the "
+                f"dim baked into the pgsearch migrations "
+                f"(vector({migration_dim})). Writes will fail with a pgvector "
+                f"dimension error. Either set "
+                f"EMBEDDING_DIM={migration_dim}, or run `makemigrations "
+                f"pgsearch` to capture the new dim and follow the §4.5 "
+                f"procedure to drop and recreate the embedding column.",
+                id="pgsearch.E001",
+                hint=(
+                    "Update EMBEDDING_DIM in your .env to match the existing "
+                    "migrations, or generate a new migration that matches the "
+                    "new dim."
+                ),
+            )
+        ]
+    return []
+
+
+def register_app():
+    from django.conf import settings
+
+    from radis.extractions.site import (
+        ExtractionRetrievalProvider,
+        register_extraction_retrieval_provider,
+    )
+    from radis.search.site import SearchProvider, register_search_provider
+    from radis.subscriptions.site import (
+        SubscriptionFilterProvider,
+        SubscriptionRetrievalProvider,
+        register_subscription_filter_provider,
+        register_subscription_retrieval_provider,
+    )
+
+    from .providers import count, filter, retrieve, search
+
+    register_search_provider(
+        SearchProvider(
+            name="PG Search",
+            search=search,
+            max_results=max(
+                settings.HYBRID_VECTOR_TOP_K, settings.HYBRID_FTS_MAX_RESULTS
+            ),
+        )
+    )
+
+    register_extraction_retrieval_provider(
+        ExtractionRetrievalProvider(
+            name="PG Search",
+            count=count,
+            retrieve=retrieve,
+            max_results=None,
+        )
+    )
+
+    register_subscription_retrieval_provider(
+        SubscriptionRetrievalProvider(
+            name="PG Search",
+            retrieve=retrieve,
+        )
+    )
+    register_subscription_filter_provider(
+        SubscriptionFilterProvider(
+            name="PG Search",
+            filter=filter,
+        )
+    )
+```
+
+The `EMBEDDING_DIM_MIGRATION_LITERAL` constant and its sync-keeping comment block are gone. `register_app()` is unchanged from the current implementation.
+
+- [ ] **Step 4: Run tests and verify pass**
+
+Run: `uv run pytest radis/pgsearch/tests/test_apps_checks.py -v`
+Expected: PASS (4 tests).
+
+- [ ] **Step 5: Run `manage.py check` to confirm the system check still works end-to-end**
+
+Run: `uv run python manage.py check`
+Expected: passes with no errors (current `.env` has `EMBEDDING_DIM=1024` matching the migration).
+
+Also verify the negative case manually:
+
+Run: `EMBEDDING_DIM=999 uv run python manage.py check`
+Expected: prints a single `pgsearch.E001` error mentioning both `999` and `1024`, and `manage.py check` exits non-zero.
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add radis/pgsearch/apps.py radis/pgsearch/tests/test_apps_checks.py
+git commit -m "refactor(pgsearch): derive embedding-dim check from MigrationLoader, drop hand-edited literal"
+```
+
+---
+
+## Final verification
+
+- [ ] **Step 1: Run lint**
+
+Run: `uv run cli lint`
+Expected: PASS (no new violations).
+
+- [ ] **Step 2: Run the full pgsearch test suite**
+
+Run: `uv run pytest radis/pgsearch/ -v`
+Expected: PASS for every test.
+
+- [ ] **Step 3: Run the broader app test suite**
+
+Run: `uv run cli test`
+Expected: PASS. (Pay attention to extractions/subscriptions/search since they share the AnalysisJob base.)
+
+- [ ] **Step 4: Smoke-test in dev containers (manual)**
+
+```bash
+uv run cli compose-up -- --watch
+# in another terminal:
+uv run cli shell
+>>> from radis.reports.factories import ReportFactory
+>>> ReportFactory.create_batch(5)
+>>> from radis.pgsearch.tasks import embedding_launcher
+>>> embedding_launcher.defer(timestamp=0)  # timestamp is required by the task signature but unused by the launcher
+# watch logs:
+docker compose logs -f default_worker embeddings_worker
+# verify EmbeddingJob and tasks are created and reach SUCCESS:
+>>> from radis.pgsearch.models import EmbeddingJob
+>>> EmbeddingJob.objects.latest("created_at").status
+```
+
+Expected: latest job's status is `SU` (SUCCESS).
+
+- [ ] **Step 5: Push branch**
+
+Only after the above pass.
+
+```bash
+git push -u origin feat/hybrid-search
+```
+
+---
+
+## Spec coverage cross-check
+
+| Spec requirement | Task |
+|---|---|
+| §6.2 `embeddings_worker --concurrency 4` | Task 11 |
+| §6.3 priority table (no `EMBEDDING_BACKFILL_PRIORITY`) | Task 10 |
+| §6.4 `EmbeddingJob`, `EmbeddingTask` models | Task 2 |
+| §6.4 owner = system user via data migration | Task 3 |
+| §6.5 `embedding_launcher` with `queueing_lock` + in-flight check | Task 6 |
+| §6.6 `process_embedding_job` PREPARING → PENDING flow | Task 5 |
+| §7.2 `unparse_for_embedding` used in search() + empty-string short-circuit | Task 13 |
+| §7.8 `QueryParser.unparse_for_embedding` AST walker | Task 12 |
+| §6.7 `process_embedding_task` on `embeddings` queue | Task 4 |
+| §6.8 No post_save signal | Task 7 |
+| §6.8 No `backfill_embeddings` command | Tasks 8 + 9 |
+| §8.1 `EMBEDDING_DRAIN_CRON` env var | Task 1 |
+| §8.2 `EMBEDDING_SYSTEM_USERNAME` constant | Task 1 |
+| §10.1 unit tests for launcher/job/task | Tasks 4, 5, 6 |
+| §4.6 `MigrationLoader`-based EMBEDDING_DIM check + `pgsearch.E002` | Task 14 |
diff --git a/docs/superpowers/specs/2026-05-15-hybrid-search-design.md b/docs/superpowers/specs/2026-05-15-hybrid-search-design.md
new file mode 100644
index 000000000..77a2d59e5
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-15-hybrid-search-design.md
@@ -0,0 +1,594 @@
+# Hybrid Search Design (FTS + Dense Vector via Qwen3-Embedding-4B)
+
+**Status:** Draft — design phase
+**Author:** RADIS team (Samuel Kwong)
+**Date:** 2026-05-15
+**Implementation skill (next step):** `writing-plans`
+
+---
+
+## 1. Overview
+
+RADIS today provides PostgreSQL full-text search (FTS) over radiology reports via the `radis.pgsearch` provider: each `Report` gets a 1:1 `ReportSearchVector` row holding a `tsvector`, kept in sync via `post_save` signal and a bulk re-index task. Queries are ranked by `ts_rank` and snippeted via `ts_headline`.
+
+This spec extends that infrastructure with a dense-vector retrieval side, fused with FTS via Reciprocal Rank Fusion (RRF), to deliver **hybrid search**. Embeddings are produced by a Qwen3-Embedding-4B inference endpoint and stored in the same `ReportSearchVector` table.
+
+The public `SearchProvider` API (`radis.search.site`) is unchanged. Callers — `SearchView`, `ExtractionJob`, `SubscriptionJob`, the REST API — see no signature differences. Only the body of `radis.pgsearch.providers.search()` and `retrieve()` changes.
+
+## 2. Goals & non-goals
+
+### Goals
+
+- Combine the existing FTS recall with semantic recall so queries like "no pneumothorax" surface reports that describe the absence without containing the exact word (modulo the dense-retrieval polarity limitation in §11).
+- Keep the existing `SearchProvider` contract intact.
+- Index embeddings asynchronously without blocking report ingest.
+- Keep embedding load isolated from chat/extraction/subscription LLM tasks.
+- Degrade gracefully when the embedding service is unavailable (search continues as FTS-only).
+- Make the embedding backend pluggable so Ollama can be used in dev and a Qwen3 endpoint in prod with the same code path.
+
+### Non-goals
+
+- No new search-provider plugin slot. The single `pgsearch` provider continues to be the only one registered.
+- No per-query UI toggle for semantic vs. lexical. Hybrid is the new default.
+- No Vespa, Elasticsearch, or OpenSearch adapter.
+- No solution for negation/polarity (§11 documents this as known future work).
+- No automated re-embedding when `EMBEDDING_DIM` changes. That is a manual operator procedure: drop column, re-migrate, run `backfill_embeddings`.
+- No on-disk vector quantization. Float32 storage from day one; revisit if RAM pressure appears.
+
+## 3. Architecture
+
+```
+┌──────────────────────────────────────────────────────────────────────┐
+│  SearchView, REST API, ExtractionJob, SubscriptionJob                │
+└──────────────┬───────────────────────────────────────────────────────┘
+               │ Search(query, filters, offset, limit)
+               ▼
+┌──────────────────────────────────────────────────────────────────────┐
+│  radis.pgsearch.providers.search()       (hybrid, replaces FTS-only) │
+│                                                                      │
+│  1. embed_query() ──► EmbeddingClient ──► Qwen3 endpoint             │
+│     on failure: query_vec = None                                     │
+│                                                                      │
+│  2. Vector top-K   ────► ReportSearchVector  (HNSW on .embedding)    │
+│                          filtered by structured filters              │
+│                                                                      │
+│  3. FTS hits       ────► ReportSearchVector  (GIN on .search_vector) │
+│                          filtered by structured filters              │
+│                                                                      │
+│  4. Python-side RRF fusion of (vec_top_K ∪ fts_hits)                 │
+│  5. Pagination on the fused order                                    │
+│  6. ts_headline() ────► ReportSearchVector  (page-slice only)        │
+└──────────────────────────────────────────────────────────────────────┘
+
+┌──────────────────────────────────────────────────────────────────────┐
+│  Async indexing path                                                 │
+│                                                                      │
+│  Report.save() ──post_save──► enqueue_embed_reports([id])            │
+│                                  │                                   │
+│                                  ▼                                   │
+│                       Procrastinate queue: "embeddings"              │
+│                                  │                                   │
+│                                  ▼                                   │
+│  embeddings_worker ──► embed_reports(ids)                            │
+│                          ├─ EmbeddingClient.embed_documents(...)     │
+│                          ├─ L2-normalize                             │
+│                          └─ ReportSearchVector.objects.update()      │
+│                                                                      │
+│  ./manage.py backfill_embeddings ──► batched enqueue on same queue   │
+└──────────────────────────────────────────────────────────────────────┘
+```
+
+**Components added inside `radis.pgsearch`:**
+
+| File | Purpose |
+|---|---|
+| `utils/embedding_client.py` | Sync + async HTTP clients with pluggable backends (`openai`, `ollama`) |
+| `migrations/0002_pgvector_extension.py` | `CREATE EXTENSION IF NOT EXISTS vector;` |
+| `migrations/0003_report_embedding.py` | Adds `embedding vector(N)` column + HNSW index |
+| `models.py` (modified) | Adds `embedding` field + `HnswIndex` |
+| `signals.py` (modified) | Adds second `post_save` receiver to enqueue embedding |
+| `tasks.py` (modified) | Adds `embed_reports` Procrastinate task on `embeddings` queue |
+| `providers.py` (modified) | Replaces `search()` and `retrieve()` bodies with hybrid logic |
+| `management/commands/backfill_embeddings.py` | Idempotent backfill command |
+| `tests/...` | Coverage per §10 |
+
+**Infrastructure additions:**
+
+| File | Change |
+|---|---|
+| `pyproject.toml` | Add `pgvector>=0.3` dependency |
+| `radis/settings/base.py` | New env-driven + constant settings (§8) |
+| `example.env` | Document `EMBEDDING_*` env vars for openai and ollama backends |
+| `docker-compose.base.yml` | Add `embeddings_worker` service + `EMBEDDING_*` env vars |
+| `docker-compose.dev.yml` / `.prod.yml` | `embeddings_worker.command` running `bg_worker -q embeddings` |
+
+## 4. Schema and migrations
+
+### 4.1 Dependency
+
+Add to `pyproject.toml`:
+
+```toml
+"pgvector>=0.3",
+```
+
+### 4.2 Postgres extension migration
+
+`radis/pgsearch/migrations/0002_pgvector_extension.py`:
+
+```python
+class Migration(migrations.Migration):
+    dependencies = [("pgsearch", "0001_initial")]
+    operations = [
+        migrations.RunSQL(
+            sql="CREATE EXTENSION IF NOT EXISTS vector;",
+            reverse_sql=migrations.RunSQL.noop,   # do not drop in prod
+        ),
+    ]
+```
+
+Reverse is a no-op because the extension may be shared with other Postgres usage and dropping it would damage unrelated state. Dev rollback is handled by recreating the database.
+
+### 4.3 Schema migration
+
+`radis/pgsearch/migrations/0003_report_embedding.py`: standard `AddField` with a `VectorField(dimensions=settings.EMBEDDING_DIM, null=True)` and `AddIndex` for an `HnswIndex` with `opclasses=["vector_cosine_ops"]`, `m=16`, `ef_construction=64`.
+
+### 4.4 Model update
+
+`radis/pgsearch/models.py`:
+
+```python
+from django.conf import settings
+from pgvector.django import HnswIndex, VectorField
+
+class ReportSearchVector(models.Model):
+    report = models.OneToOneField(Report, on_delete=models.CASCADE, related_name="search_vector")
+    search_vector = SearchVectorField(null=True)
+    embedding = VectorField(dimensions=settings.EMBEDDING_DIM, null=True)
+
+    class Meta:
+        indexes = [
+            GinIndex(fields=["search_vector"]),
+            HnswIndex(
+                name="pgsearch_embedding_hnsw",
+                fields=["embedding"],
+                m=16,
+                ef_construction=64,
+                opclasses=["vector_cosine_ops"],
+            ),
+        ]
+```
+
+`embedding` is nullable: the row exists from the moment a `Report` is created (FTS path), but its embedding is filled asynchronously by `embed_reports`. A NULL embedding is treated as "not embedded yet" at query time, and the row participates via the FTS half only.
+
+`save()` on `ReportSearchVector` retains its current behavior of recomputing `search_vector` from `report.body`. The embedding column is written **only** by the embedding task via `update()`, never by `save()`, to avoid triggering the FTS signal recursively and to keep the two indexing paths independent.
+
+### 4.5 Operational note on `EMBEDDING_DIM`
+
+pgvector columns and HNSW indexes are bound to a fixed dimension at create time. Changing `EMBEDDING_DIM` after deploy requires a manual operator procedure:
+
+1. Drop the HNSW index and the `embedding` column.
+2. Re-run `0003_report_embedding` with the new `EMBEDDING_DIM`.
+3. Run `./manage.py backfill_embeddings`.
+
+This is documented as a deployment-time decision and intentionally not automated.
+
+## 5. Embedding client
+
+### 5.1 Module layout
+
+`radis/pgsearch/utils/embedding_client.py` exposes:
+
+- `class EmbeddingBackend(Protocol)` with `path`, `build_payload`, `parse_response`.
+- `class OpenAIBackend(EmbeddingBackend)` — default path `/v1/embeddings`, body `{model, input: [...]}`, response `{data: [{embedding: [...]}]}`.
+- `class OllamaBackend(EmbeddingBackend)` — default path `/api/embed`, body `{model, input: [...]}`, response `{embeddings: [[...]]}`.
+- `BACKENDS: dict[str, EmbeddingBackend] = {"openai": OpenAIBackend(), "ollama": OllamaBackend()}`.
+- `class EmbeddingClientError(Exception)`.
+- `class EmbeddingClient` — sync client used by `embed_reports` task and the query path.
+- `class AsyncEmbeddingClient` — async variant, kept for parity with `chats/utils/chat_client.py` and so the query path can call it from ASGI views without `async_to_sync` later.
+
+### 5.2 Interface
+
+```python
+class EmbeddingClient:
+    def __init__(self):
+        self._backend = BACKENDS[settings.EMBEDDING_BACKEND]
+        self._path = settings.EMBEDDING_PROVIDER_PATH or self._backend.path
+        self._url = settings.EMBEDDING_PROVIDER_URL.rstrip("/") + self._path
+        self._model = settings.EMBEDDING_MODEL_NAME
+        self._timeout = settings.EMBEDDING_REQUEST_TIMEOUT
+        self._headers = {"Authorization": f"Bearer {settings.EMBEDDING_PROVIDER_API_KEY}"} \
+                        if settings.EMBEDDING_PROVIDER_API_KEY else {}
+
+    def embed_documents(self, texts: list[str]) -> list[list[float]]:
+        """Embed texts verbatim. Truncates each to EMBEDDING_MAX_INPUT_CHARS first.
+        Returns L2-normalized vectors of length EMBEDDING_DIM."""
+
+    def embed_query(self, text: str) -> list[float]:
+        """Prepend EMBEDDING_QUERY_INSTRUCTION, then embed_documents([text])[0]."""
+```
+
+### 5.3 Wire shapes
+
+| Backend | Path (default) | Request | Response |
+|---|---|---|---|
+| `openai` | `/v1/embeddings` | `{"model": M, "input": [t, ...]}` | `{"data": [{"embedding": [...]}, ...]}` |
+| `ollama` | `/api/embed` | `{"model": M, "input": [t, ...]}` | `{"embeddings": [[...], ...]}` |
+
+`EMBEDDING_PROVIDER_PATH` (env) overrides the backend default — this is how the production endpoint at `/api/embeddings` with an OpenAI-style payload is supported by the `openai` backend with a one-line config change, no new backend needed.
+
+### 5.4 Behavior details
+
+- **Query instruction:** the model card for Qwen3-Embedding recommends a task-specific instruction prefix on the query side only. `embed_query` prepends `EMBEDDING_QUERY_INSTRUCTION` (a Python constant in `base.py`); `embed_documents` does not.
+- **Truncation:** any text longer than `EMBEDDING_MAX_INPUT_CHARS` is truncated at the character limit before being sent. A WARNING is logged with the report id (when known) and char count. Qwen3-Embedding-4B supports up to 32k tokens, so truncation will be rare for radiology bodies but is bounded as a defense against pathological inputs.
+- **Normalization:** every returned vector is L2-normalized client-side, unconditionally. With unit vectors, cosine distance is a monotonically decreasing function of the dot product, so the HNSW index built with the `vector_cosine_ops` operator class is effectively a fast inner-product search. Whether the upstream server normalizes is irrelevant.
+- **Dimension validation:** every vector is checked to have length `EMBEDDING_DIM`. A mismatch raises `EmbeddingClientError`.
+- **Batching:** `embed_documents` sends a single HTTP call per invocation. Higher-level callers (`embed_reports` task) split into batches of `EMBEDDING_BATCH_SIZE` before calling.
+- **Errors:** non-2xx, timeout, malformed JSON, missing key, or wrong dim all raise `EmbeddingClientError`. The client never falls back internally — fallback policy is owned by the caller.
+- **Dev recipe (Ollama):**
+  ```bash
+  ollama pull dengcao/Qwen3-Embedding-4B:Q5_K_M
+  # in .env:
+  EMBEDDING_BACKEND=ollama
+  EMBEDDING_PROVIDER_URL=http://host.docker.internal:11434
+  EMBEDDING_MODEL_NAME=dengcao/Qwen3-Embedding-4B:Q5_K_M
+  EMBEDDING_DIM=2560
+  ```
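+  GGUF-quantized embedding models produce slightly different vectors than the bf16 reference, so dev embeddings are not interchangeable with prod embeddings. After swapping the model between dev/prod, set the existing `embedding` values to NULL and then run `backfill_embeddings`; the command only enqueues rows whose embedding is NULL (§6.5), so stale vectors would otherwise not be refreshed.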
+
+## 6. Async indexing
+
+### 6.1 Queue and worker
+
+A new Procrastinate queue named **`embeddings`** is added, served by a new container **`embeddings_worker`**. This isolates embedding load from the existing `default` and `llm` queues. The `embeddings` worker's command:
+
+```
+./manage.py bg_worker -l debug -q embeddings --autoreload   # dev
+./manage.py bg_worker -l info  -q embeddings                # prod
+```
+
+The worker inherits the same image and environment as `default_worker` / `llm_worker` via the existing `&default-app` anchor.
+
+### 6.2 Priorities
+
+Procrastinate priority is "higher = sooner". Embedding tasks always run at lower priority than the existing LLM tasks so a backfill never starves extraction/subscription work — though in practice this only matters *within* a queue, and `embeddings` is a separate queue from `llm`. The priorities are still set defensively in case workers are ever consolidated:
+
+| Task | Priority |
+|---|---|
+| `EXTRACTION_DEFAULT_PRIORITY` (existing) | 2 |
+| `EXTRACTION_URGENT_PRIORITY` (existing) | 3 |
+| `SUBSCRIPTION_DEFAULT_PRIORITY` (existing) | 3 |
+| `SUBSCRIPTION_URGENT_PRIORITY` (existing) | 4 |
+| `EMBEDDING_INDEX_PRIORITY` (new) | 0 |
+| `EMBEDDING_BACKFILL_PRIORITY` (new) | -1 |
+
+Keeping backfill below incremental priority ensures that fresh-report embedding tasks are picked up ahead of any backfill tasks still waiting in the queue.
+
+### 6.3 Task: `embed_reports`
+
+`radis/pgsearch/tasks.py`:
+
+```python
+@app.task(queue="embeddings")
+def embed_reports(report_ids: list[int]) -> None:
+    """Embed the given reports and write the vector to ReportSearchVector.embedding.
+    Idempotent. Always overwrites the embedding for the given ids, even if one already exists."""
+```
+
+Implementation outline:
+
+1. `target = ReportSearchVector.objects.filter(report_id__in=ids).select_related("report").only("report_id", "report__body")`. No `embedding__isnull` short-circuit at this layer — the task always re-embeds whatever it is given. Backfill controls the "only fill in nulls" policy by filtering at enqueue time (§6.5).
+2. Iterate in chunks of `EMBEDDING_BATCH_SIZE`; for each chunk, call `EmbeddingClient().embed_documents([rsv.report.body for rsv in chunk])`.
+3. `ReportSearchVector.objects.filter(pk=rsv.pk).update(embedding=vec)` per row. (Postgres `UPDATE … SET embedding = CASE pk WHEN … END` is a possible optimization if profiling shows the per-row update is a bottleneck; not done in v1.)
+4. Any `EmbeddingClientError` is re-raised so Procrastinate's default retry policy with exponential backoff handles transient failures.
+
+Helper `enqueue_embed_reports(report_ids, priority=settings.EMBEDDING_INDEX_PRIORITY)` mirrors the existing `enqueue_bulk_index_reports`.
+
+**V1 re-embedding policy:** the signal enqueues on every `Report.save()`, including metadata-only updates, so metadata edits trigger a wasted re-embed. Accepted simplicity for v1; §11.4 documents body-change detection as a future optimization.
+
+### 6.4 Signal
+
+`radis/pgsearch/signals.py` keeps the existing receiver for the FTS path and adds:
+
+```python
+@receiver(post_save, sender=Report)
+def enqueue_report_embedding(sender, instance, **kwargs):
+    enqueue_embed_reports([instance.pk], priority=settings.EMBEDDING_INDEX_PRIORITY)
+```
+
+Two separate receivers (not one combined) so an enqueue error in the embedding path cannot break the FTS-indexing path. The signal fires on both create and update; `embed_reports` always overwrites the embedding for the given ids, so metadata-only updates do trigger an unnecessary re-embed in v1. Body-change detection (a `pre_save` that suppresses enqueue when only metadata changed) is an optimization deferred to §11.4. `ReportSearchVector.save()` is *not* modified to null `embedding` — the task's unconditional overwrite makes that redundant.
+
+### 6.5 Backfill command
+
+`radis/pgsearch/management/commands/backfill_embeddings.py`:
+
+```
+./manage.py backfill_embeddings [--batch-size 500] [--limit N] [--dry-run]
+```
+
+Behavior:
+
+- Iterates `ReportSearchVector.objects.filter(embedding__isnull=True).values_list("report_id", flat=True)`.
+- Chunks ids by `--batch-size` (default 500).
+- For each chunk, calls `enqueue_embed_reports(chunk, priority=settings.EMBEDDING_BACKFILL_PRIORITY)`.
+- `--limit N` caps total reports enqueued.
+- `--dry-run` skips enqueue and prints the would-be count.
+- The "only fill in nulls" filter is applied at enqueue time (here), not inside the task. Re-running the command is safe because rows that got embedded since the last run no longer match the `embedding__isnull=True` filter and won't be re-enqueued.
+
+## 7. Hybrid search provider
+
+### 7.1 Universe and fusion
+
+The hybrid result universe is the **union** of two filter-bounded candidate sets:
+
+- **Vector top-K:** the `HYBRID_VECTOR_TOP_K` nearest rows by cosine distance to the query embedding, filtered by structured filters and `embedding IS NOT NULL`. *Not* constrained to the FTS hit set.
+- **FTS hits:** all rows matching the tsquery and the structured filters, capped at `HYBRID_FTS_MAX_RESULTS`.
+
+A report appears in results if it is in **either** set. This is the change from the earlier draft, made because radiology queries like "no pneumothorax" must be able to surface reports that lexically don't match (the text-search configuration drops "no" as a stop word when building the tsvector and tsquery) but are semantically related.
+
+Each report's score is plain Reciprocal Rank Fusion:
+
+```
+score(d) = (1 / (HYBRID_RRF_K + vec_rank[d])  if d ∈ vec_top_K  else 0)
+         + (1 / (HYBRID_RRF_K + fts_rank[d])  if d ∈ fts_hits   else 0)
+```
+
+Properties:
+
+- Reports in both sides outrank reports in only one side (sum of two terms vs. one).
+- Vector contribution drops to zero beyond rank `HYBRID_VECTOR_TOP_K` (no `vec_rank` entry), so the ordering naturally transitions from "hybrid head" to "FTS tail" with no explicit cutoff.
+- A query with zero FTS hits returns `vec_top_K` ranked by vector position only — pure semantic search.
+- A query with embedding failure returns FTS hits ranked by `ts_rank` only — the pre-hybrid behavior.
+
+### 7.2 `search()` flow
+
+```python
+def search(s: Search) -> SearchResult:
+    query_str = _build_query_string(s.query)
+    language  = _resolve_language(s.filters)
+    filter_q  = _build_filter_query(s.filters)
+    tsquery   = SearchQuery(query_str, search_type="raw", config=language)
+
+    # Vector side
+    query_text = QueryParser.unparse(s.query)  # same helper SearchView already uses
+    try:
+        query_vec = EmbeddingClient().embed_query(query_text)
+    except EmbeddingClientError as e:
+        logger.warning("Falling back to FTS-only: %s", e)
+        query_vec = None
+
+    vec_rank: dict[int, int] = {}
+    if query_vec is not None:
+        ids = list(
+            ReportSearchVector.objects
+                .filter(filter_q)
+                .exclude(embedding__isnull=True)
+                .annotate(distance=CosineDistance("embedding", query_vec))
+                .order_by("distance", "report_id")
+                .values_list("report_id", flat=True)[:settings.HYBRID_VECTOR_TOP_K]
+        )
+        vec_rank = {rid: i + 1 for i, rid in enumerate(ids)}
+
+    # FTS side
+    fts_rows = list(
+        ReportSearchVector.objects
+            .filter(filter_q)
+            .filter(search_vector=tsquery)
+            .annotate(rank=SearchRank(F("search_vector"), tsquery))
+            .order_by("-rank", "report_id")
+            .values("report_id", "rank")[:settings.HYBRID_FTS_MAX_RESULTS]
+    )
+    fts_rank = {row["report_id"]: i + 1 for i, row in enumerate(fts_rows)}
+
+    # Fusion (pure Python, factored out for unit testing)
+    ordered_ids = _rrf_fuse(vec_rank, fts_rank, k=settings.HYBRID_RRF_K)
+
+    total_count = len(ordered_ids)
+    total_relation = (
+        "at_least"
+        if len(fts_rows) >= settings.HYBRID_FTS_MAX_RESULTS
+           or len(vec_rank) >= settings.HYBRID_VECTOR_TOP_K
+        else "exact"
+    )
+    page_ids = ordered_ids[s.offset : s.offset + (s.limit or len(ordered_ids))]
+
+    # Headline + hydration for the page slice only
+    page_rows = (
+        ReportSearchVector.objects
+            .filter(report_id__in=page_ids)
+            .annotate(
+                summary=SearchHeadline("report__body", tsquery, config=language,
+                                       start_sel="<em>", stop_sel="</em>",
+                                       min_words=10, max_words=20, max_fragments=10),
+                rank=SearchRank(F("search_vector"), tsquery),
+            )
+            .select_related("report")
+    )
+    by_id = {r.report_id: r for r in page_rows}
+    documents = [
+        document_from_pgsearch_response(_with_fallback_summary(by_id[rid]))
+        for rid in page_ids if rid in by_id
+    ]
+    return SearchResult(total_count=total_count, total_relation=total_relation, documents=documents)
+```
+
+### 7.3 Empty-summary fallback
+
+`SearchHeadline` produces no highlighted fragment when the document body has no FTS match (the vector-only hit case); depending on the `ts_headline` options the result may be an unhighlighted lead-in or an empty string. `_with_fallback_summary` replaces an empty summary with the first 30 words of `report.body`. Trivial helper, ~5 lines.
+
+### 7.4 `retrieve()`
+
+Same fusion logic, returns an iterator of `report__document_id` in `ordered_ids` order. No headline. Used by `ExtractionJob` and `SubscriptionJob` to walk the matching id set.
+
+### 7.5 `count()` and `filter()`
+
+Unchanged. These operate on filters only and never call the embedding service.
+
+### 7.6 `ReportDocument.relevance`
+
+Kept as `ts_rank` for API backwards compatibility. RRF is an internal ordering signal and is not exposed on the public document type. RRF scores are logged at DEBUG for diagnostics.
+
+### 7.7 `search_provider.max_results`
+
+Updated to `max(HYBRID_VECTOR_TOP_K, HYBRID_FTS_MAX_RESULTS)`, which is what the `SearchView` page-bound check uses to reject impossibly-deep pagination.
+
+## 8. Configuration
+
+### 8.1 Env-driven (per-deployment, set in `.env`)
+
+```python
+# radis/settings/base.py
+EMBEDDING_BACKEND          = env.str("EMBEDDING_BACKEND", default="openai")
+EMBEDDING_PROVIDER_URL     = env.str("EMBEDDING_PROVIDER_URL", default="")
+EMBEDDING_PROVIDER_PATH    = env.str("EMBEDDING_PROVIDER_PATH", default="")   # "" = backend default
+EMBEDDING_PROVIDER_API_KEY = env.str("EMBEDDING_PROVIDER_API_KEY", default="")
+EMBEDDING_MODEL_NAME       = env.str("EMBEDDING_MODEL_NAME", default="Qwen/Qwen3-Embedding-4B")
+EMBEDDING_DIM              = env.int("EMBEDDING_DIM", default=1024)
+```
+
+These vary across dev/staging/prod and are operator-controlled. `EMBEDDING_DIM` is intentionally an env decision because it is schema-coupled (see §4.5).
+
+### 8.2 Code constants (tuning knobs, in `base.py`)
+
+```python
+EMBEDDING_REQUEST_TIMEOUT = 30  # seconds
+EMBEDDING_MAX_INPUT_CHARS = 60_000
+EMBEDDING_QUERY_INSTRUCTION = (
+    "Instruct: Given a radiology search query, retrieve relevant radiology reports.\n"
+    "Query: "
+)
+EMBEDDING_BATCH_SIZE = 32
+
+EMBEDDING_INDEX_PRIORITY = 0
+EMBEDDING_BACKFILL_PRIORITY = -1
+
+HYBRID_VECTOR_TOP_K    = 100
+HYBRID_FTS_MAX_RESULTS = 10_000
+HYBRID_RRF_K           = 60
+```
+
+These are tuning constants. Changing them is a code change with a PR diff. This matches the project's existing pattern (`EXTRACTION_LLM_CONCURRENCY_LIMIT = 6`, the `CHAT_*_SYSTEM_PROMPT` blocks).
+
+### 8.3 `example.env`
+
+Adds a documented Ollama block and a Qwen/OpenAI-compatible block side by side, keyed off `EMBEDDING_BACKEND`.
+
+### 8.4 Compose
+
+`docker-compose.base.yml`:
+
+- New service `embeddings_worker` inheriting `*default-app`.
+- The `EMBEDDING_BACKEND`, `EMBEDDING_PROVIDER_URL`, `EMBEDDING_PROVIDER_PATH`, `EMBEDDING_PROVIDER_API_KEY`, `EMBEDDING_MODEL_NAME`, `EMBEDDING_DIM` env keys added to the `&default-app` block so all services see them.
+
+`docker-compose.dev.yml`:
+
+- `embeddings_worker.command`: `bash -c "wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} && ./manage.py bg_worker -l debug -q embeddings --autoreload"`.
+
+`docker-compose.prod.yml`:
+
+- Same without `--autoreload`, log level `info`.
+
+## 9. Error handling and degradation
+
+| Failure | Behavior | Logging |
+|---|---|---|
+| Embedding service returns 5xx/timeout during query-time | `query_vec = None`; result list ordered by FTS-only; request succeeds | WARNING with request id |
+| Embedding service returns 4xx during query-time | Same FTS-only fallback (treats as misconfig at request layer) | ERROR |
+| Embedding service returns malformed body | `EmbeddingClientError` raised; query falls back to FTS-only | ERROR |
+| Embedding service down during indexing task | Task raises; Procrastinate retries with exponential backoff; `embedding` stays NULL | WARNING per attempt, ERROR after final retry |
+| Report body > `EMBEDDING_MAX_INPUT_CHARS` | Truncate, embed truncated text | WARNING with report_id and char count |
+| Report deleted between enqueue and task run | Task fetches no rows for that id; no error | DEBUG |
+| Vector dim mismatch on write | Postgres raises; task fails, retried | ERROR — escalate to admin |
+| `EMBEDDING_PROVIDER_URL` empty at startup | `EmbeddingClient` construction defers to call site; calls log + raise; query falls back to FTS-only | WARNING once on first request |
+
+**Deliberate non-policies:**
+
+- The product never fails a search request because the embedding service is down. It degrades to FTS-only.
+- Query embeddings are not cached. The complexity and freshness trade-off is not worth it at the corpora sizes RADIS targets.
+- `EmbeddingClient` does not retry internally. Procrastinate retries the whole task; the query path uses a single shot.
+
+**Observability:**
+
+- Provider logs at DEBUG: vec hit count, FTS hit count, intersection count, fusion ms, query-embed ms.
+- `embed_reports` logs at INFO: batch size, total chars, latency, success/skip/retry counts.
+- The existing OpenTelemetry overlay (commit `653e0c67`) tags telemetry per service; `embeddings_worker` shows up automatically.
+
+## 10. Testing strategy
+
+### 10.1 Unit tests (no DB)
+
+| File | Coverage |
+|---|---|
+| `tests/unit/test_embedding_client.py` | Backend payload/response round-trip, path override, instruction prefix, normalization, dim validation, all error modes, truncation |
+| `tests/unit/test_provider_fusion.py` | `_rrf_fuse(vec_rank, fts_rank, k)` pure-Python helper: disjoint, overlapping, FTS-only, vector-only, both-empty, tiebreak by report_id |
+| `tests/unit/test_signals.py` | `post_save` enqueues `embed_reports([id])` with `EMBEDDING_INDEX_PRIORITY` |
+| `tests/unit/test_tasks.py` (extends existing) | Always overwrites embedding when re-run (no internal short-circuit); batch splitting; missing ids are skipped without error; client errors propagate so Procrastinate retries |
+| `tests/unit/test_backfill_command.py` | Batching, `--limit`, `--dry-run`, only-null-embedding selection |
+
+### 10.2 Integration tests (real Postgres + pgvector)
+
+| File | Coverage |
+|---|---|
+| `tests/integration/test_migrations.py` (new, `django-test-migrations`) | Extension migration runs; column + HNSW index created with configured dim; reverse works |
+| `tests/integration/test_provider_hybrid.py` (new) | FTS-only hit, vector-only hit ("no pneumothorax" fixture), both-sides hit, filter honoring, stable pagination, embedding-service-down fallback, NULL-embedding rows still returned, `ts_headline` query-count bounded to page, empty-summary fallback |
+
+Factories: existing `ReportSearchVectorFactory` gains optional `embedding` kwarg (default `None`). New `ReportSearchVectorWithEmbeddingFactory` generates deterministic normalized vectors of the configured dim from a seed. Real Qwen3 embeddings are not used in tests.
+
+### 10.3 View-level smoke
+
+`radis/search/tests/test_views.py` (extend):
+
+- Search request with hybrid enabled returns 200 and renders documents.
+- Search request with `EMBEDDING_PROVIDER_URL=""` returns 200 (FTS-only path).
+
+### 10.4 Acceptance (`@pytest.mark.acceptance`)
+
+One end-to-end test against the dev containers, with the embedding service stubbed (either a small in-test FastAPI or a recorded fixture response), verifying the search page returns hybrid results. Marked acceptance so it's opt-in like the existing acceptance suite.
+
+### 10.5 Explicitly not tested
+
+- Live Qwen3 retrieval quality (offline eval, out of scope).
+- pgvector HNSW recall under specific data shapes (extension's responsibility).
+- Wire formats beyond the two supported backends.
+
+## 11. Known limitations and future work
+
+### 11.1 Negation / polarity (the "no pneumothorax" problem)
+
+Dense embedding models — including Qwen3-Embedding — embed semantically opposite phrases close together. "No pneumothorax" and "pneumothorax present" produce nearby vectors, so the vector half of the hybrid score is *polarity-blind*. The FTS half partly compensates by allowing the user to construct explicit AND-NOT queries, but Postgres' text-search configuration drops "no" as a stop word (the GIN index only stores the resulting lexemes), so a naive query like `no pneumothorax` is effectively `pneumothorax` on the FTS side.
+
+This is a real concern for radiology, where negated findings are pervasive ("no acute …", "no evidence of …", "no significant …"). **Hybrid search as designed here does not solve this.** It is documented as an accepted limitation of v1, and a v2 conversation should address it.
+
+Candidate solutions to evaluate in a future spec (none committed):
+
+- A cross-encoder re-ranker over the top-N hybrid results (e.g., a small instruction-tuned model that knows to score "no X" against "X present" as opposite).
+- Adding a sparse/late-interaction model (SPLADE, ColBERT) alongside the dense vector — sparse models preserve token-level polarity.
+- Negation-aware query preprocessing: detect negation, route to a different retrieval mode, or expand to phrasal `AND-NOT` clauses on the FTS side that bypass the stop-word filter (e.g., search the raw body, not the tsvector).
+- Structured-findings indexing: have the LLM extract presence/absence flags per finding category at ingest time, search those structured fields instead of (or in addition to) prose.
+
+### 11.2 Dimension changes are manual
+
+See §4.5.
+
+### 11.3 GGUF dev embeddings ≠ bf16 prod embeddings
+
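+Documented in §5.4. Mitigated by clearing the existing embeddings (setting `embedding` to NULL) and then running `backfill_embeddings` after a model swap, since the command only enqueues rows whose embedding is NULL.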
+
+### 11.4 No body-change detection in the signal
+
+V1 re-embeds on every `Report.save()`. If profiling shows wasted traffic from metadata-only updates, add a `pre_save` check that suppresses the embedding enqueue unless `body` changed.
+
+### 11.5 Per-row `UPDATE` in the embedding task
+
+V1 issues one `UPDATE` per row inside a batch. If this becomes a bottleneck, switch to a single `UPDATE … FROM (VALUES …)` or Django's `bulk_create` with `update_conflicts=True`.
+
+## 12. Rollout plan
+
+1. **Schema and dependency.** Land the `pgvector` Python dep, the extension migration, and the schema migration. No behavior change at this point — `embedding` is nullable, queries still see only FTS.
+2. **Embedding client and tests.** Land the client module and unit tests. No callers yet.
+3. **Async indexing.** Land the task, signal, backfill command, and `embeddings_worker` service. New reports start getting embedded; the column gradually populates.
+4. **Backfill.** Run `backfill_embeddings` against the existing corpus (manual op, can run for hours/days depending on size — that's fine, because backfill tasks are enqueued at `EMBEDDING_BACKFILL_PRIORITY` and yield to incremental indexing).
+5. **Provider switch.** Replace the body of `radis.pgsearch.providers.search()` and `retrieve()` with the hybrid implementation. At this point hybrid is the new default; rows still missing an embedding participate via the FTS half only.
+6. **Monitor.** Watch search latency p95, embedding queue depth, and the rate of "FTS-only fallback" warnings. Tune `HYBRID_VECTOR_TOP_K` / `HYBRID_FTS_MAX_RESULTS` if needed.
+
+Each step is independently mergeable; steps 1–4 ship as quiet infrastructure changes with no user-visible effect, step 5 is the moment hybrid goes live.
diff --git a/docs/superpowers/specs/2026-05-28-hybrid-search.md b/docs/superpowers/specs/2026-05-28-hybrid-search.md
new file mode 100644
index 000000000..e73dc29c3
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-28-hybrid-search.md
@@ -0,0 +1,991 @@
+# Hybrid Search Design (FTS + Dense Vector via Qwen3-Embedding-4B)
+
+**Status:** Draft — design phase
+**Author:** RADIS team (Samuel Kwong)
+**Date:** 2026-05-28
+**Implementation skill (next step):** `writing-plans`
+**Supersedes:** `2026-05-15-hybrid-search-design.md`
+
+---
+
+## 1. Overview
+
+RADIS today provides PostgreSQL full-text search (FTS) over radiology reports via the `radis.pgsearch` provider: each `Report` gets a 1:1 `ReportSearchIndex` row holding a `tsvector`, kept in sync via `post_save` signal and a bulk re-index task. Queries are ranked by `ts_rank` and snippeted via `ts_headline`.
+
+This spec extends that infrastructure with a dense-vector retrieval side, fused with FTS via Reciprocal Rank Fusion (RRF), to deliver **hybrid search**. Embeddings are produced by a Qwen3-Embedding-4B inference endpoint and stored in the same `ReportSearchIndex` table.
+
+The public `SearchProvider` API (`radis.search.site`) is unchanged. Callers — `SearchView`, `ExtractionJob`, `SubscriptionJob`, the REST API — see no signature differences. Only the body of `radis.pgsearch.providers.search()` and `retrieve()` changes.
+
+## 2. Goals & non-goals
+
+### Goals
+
+- Combine the existing FTS recall with semantic recall so queries like "no pneumothorax" surface reports that describe the absence without containing the exact word (modulo the dense-retrieval polarity limitation in §11).
+- Keep the existing `SearchProvider` contract intact.
+- Index embeddings asynchronously without blocking report ingest.
+- Keep embedding load isolated from chat/extraction/subscription LLM tasks.
+- Degrade gracefully when the embedding service is unavailable (search continues as FTS-only).
+- Make the embedding backend pluggable so Ollama can be used in dev and a Qwen3 endpoint in prod with the same code path.
+
+### Non-goals
+
+- No new search-provider plugin slot. The single `pgsearch` provider continues to be the only one registered.
+- No per-query UI toggle for semantic vs. lexical. Hybrid is the new default.
+- No Vespa, Elasticsearch, or OpenSearch adapter.
+- No solution for negation/polarity (§11 documents this as known future work).
+- No automated re-embedding when `EMBEDDING_DIM` changes. That is a manual operator procedure: drop the HNSW index and the `embedding` column, re-run the migration, then run `./manage.py embed_pending` to re-embed every row (see §4.5).
+- No on-disk vector quantization. Float32 storage from day one; revisit if RAM pressure appears.
+
+## 3. Architecture
+
+```
+┌──────────────────────────────────────────────────────────────────────┐
+│  SearchView, REST API, ExtractionJob, SubscriptionJob                │
+└──────────────┬───────────────────────────────────────────────────────┘
+               │ Search(query, filters, offset, limit)
+               ▼
+┌──────────────────────────────────────────────────────────────────────┐
+│  radis.pgsearch.providers.search()       (hybrid, replaces FTS-only) │
+│                                                                      │
+│  1. embed_query() ──► EmbeddingClient ──► Qwen3 endpoint             │
+│     on failure: query_vec = None                                     │
+│                                                                      │
+│  2. Vector top-K   ────► ReportSearchIndex  (HNSW on .embedding)    │
+│                          filtered by structured filters              │
+│                                                                      │
+│  3. FTS hits       ────► ReportSearchIndex  (GIN on .search_vector) │
+│                          filtered by structured filters              │
+│                                                                      │
+│  4. Python-side RRF fusion of (vec_top_K ∪ fts_hits)                 │
+│  5. Pagination on the fused order                                    │
+│  6. ts_headline() ────► ReportSearchIndex  (page-slice only)        │
+└──────────────────────────────────────────────────────────────────────┘
+
+┌──────────────────────────────────────────────────────────────────────┐
+│  Async indexing path  (handler-registry → deferred via Procrastinate)│
+│                                                                      │
+│  Report view  (single-create / PUT / bulk-upsert)                    │
+│        │                                                             │
+│        ▼  transaction.atomic() block                                 │
+│  ReportSerializer / bulk_upsert_reports                              │
+│    ├─ DB write (Report rows)                                         │
+│    └─ transaction.on_commit:                                         │
+│         dispatches reports_created_handlers / reports_updated_       │
+│         handlers (radis.reports.site registry) with the touched      │
+│         Report instances                                             │
+│        │                                                             │
+│        ▼  (one of the registered subscribers is pgsearch:)           │
+│  pgsearch._handle_reports_changed(reports)                           │
+│    ├─ PGSEARCH_SYNC_INDEXING=True:                                   │
+│    │     bulk_upsert_report_search_indexes(report_ids) inline,       │
+│    │     then embed_reports_task.defer(report_ids=...)               │
+│    └─ PGSEARCH_SYNC_INDEXING=False:                                  │
+│          enqueue_bulk_index_reports(report_ids); the embed task is   │
+│          chained at the tail of bulk_index_reports (see below)       │
+│        │                                                             │
+│        ▼  HTTP response returned (201 / 200) immediately             │
+│                                                                      │
+│  ──── elsewhere, on the default_worker process ────                  │
+│                                                                      │
+│  bulk_index_reports(report_ids)   (default queue)                    │
+│    ├─ bulk_upsert_report_search_indexes(report_ids)                  │
+│    └─ embed_reports_task.defer(report_ids=...)                       │
+│                                                                      │
+│  ──── elsewhere, on the embeddings_worker process ────               │
+│                                                                      │
+│  embed_reports_task(report_ids)   (embeddings queue)                 │
+│    ├─ load RSVs (select_related("report"))                           │
+│    ├─ EmbeddingClient.embed_documents([body, ...])  (batched)        │
+│    ├─ L2-normalize; ReportSearchIndex.objects.bulk_update           │
+│    └─ on EmbeddingClientError: raise                                 │
+│         → Procrastinate retry policy (exp backoff, N attempts)       │
+└──────────────────────────────────────────────────────────────────────┘
+```
+
+`radis.reports` already exposes a handler registry (`reports_created_handlers` / `reports_updated_handlers` in `radis.reports.site`) whose docstring is explicit about its purpose: *"The handler can be used to index those reports in an external search database."* Pgsearch registers `_handle_reports_changed` on both. The view layer never imports anything from `pgsearch`; it only dispatches the registry.
+
+Both ingest paths — single-create (`POST /api/reports/`, `PUT /api/reports/{id}/?upsert=true`) and bulk-upsert (`POST /api/reports/bulk-upsert/`) — flow through the same handler, which schedules a Procrastinate task on the dedicated `embeddings` queue (directly in sync FTS mode; chained at the end of `bulk_index_reports` in deferred FTS mode). The write path returns immediately after the transaction commits; the embedding service is touched only by the worker. This:
+
+- **Decouples write-path uptime from the embedding service.** API responses succeed even when the embedding endpoint is down or slow.
+- **Bounds concurrent load on the embedding service** via the worker's `--concurrency K` — explicit, configurable backpressure rather than implicit request-driven concurrency.
+- **Auto-recovers from transient outages** via Procrastinate's retry policy with exponential backoff.
+- **Inverts the dependency** so `radis.reports` stays unaware of search/indexing concerns; adding or swapping a search provider is a registration call, not a view edit.
+- **Symmetric across single-create and bulk-upsert** — one enqueue site, one task, one worker.
+
+**Components added inside `radis.pgsearch`:**
+
+| File | Purpose |
+|---|---|
+| `utils/embedding_client.py` | `EmbeddingClient` used by both the query path and `embed_reports_task` on the worker; pluggable backends (`openai`, `ollama`) |
+| `apps.py` (modified) | `register_app()` now also registers `_handle_reports_changed` on both `reports_created_handlers` and `reports_updated_handlers`. In sync FTS mode the handler upserts inline then defers `embed_reports_task`; in deferred FTS mode it enqueues `bulk_index_reports`, which chains the embed task at the end of its own run. This is the only place pgsearch wires itself into the reports app. |
+| `tasks.py` (embedding entries) | `embed_reports_task(report_ids)` Procrastinate task on the `embeddings` queue. Loads the `ReportSearchIndex` rows (abbreviated "RSVs" throughout this spec) by `report_id`, calls `EmbeddingClient.embed_documents`, then `bulk_update`. Raises on `EmbeddingClientError` so the Procrastinate retry policy applies. |
+| `admin.py` | Registers `ReportSearchIndex` with a `has_embedding` list display, an `embedding` `IsNull` filter, and an admin action `enqueue_pending_embeddings` that defers `embed_reports_task` for the selected rows whose embedding is NULL. Mirrors the `embed_pending` management command for operators who prefer the UI. |
+| `migrations/0002_hybrid_search.py` | Single schema migration: `CREATE EXTENSION vector`; adds `embedding vector(N)` column + HNSW index |
+| `models.py` (modified) | Adds `embedding` field + `HnswIndex` to `ReportSearchIndex`. No Job/Task models. |
+| `signals.py` (unchanged from FTS-only) | The FTS `create_or_update_report_search_vector` receiver stays; **no embedding signal** |
+| `tasks.py` (FTS bits) | FTS bulk-indexing helper `bulk_upsert_report_search_indexes` and the `bulk_index_reports` Procrastinate task. `bulk_index_reports` upserts the RSV rows and then chains `embed_reports_task.defer(...)` at the end of its run, so the embeddings worker only ever sees report ids whose RSV rows are already committed (see §6.6). |
+| `providers.py` (modified) | Replaces `search()` and `retrieve()` bodies with hybrid logic |
+| `tests/...` | Coverage per §10 |
+
+**Infrastructure additions:**
+
+| File | Change |
+|---|---|
+| `pyproject.toml` | Add `pgvector>=0.3` dependency |
+| `radis/settings/base.py` | New env-driven + constant settings (§8) |
+| `radis/settings/test.py` | Override `EMBEDDING_PROVIDER_URL=""` so any incidental construction of `EmbeddingClient` fast-fails into `EmbeddingClientError` in CI (no live embedding service). Tests that exercise embedding patch the client explicitly. |
+| `example.env` | Document `EMBEDDING_*` env vars for openai and ollama backends |
+| `radis/reports/api/viewsets.py` | **Unchanged from main** in shape. It already dispatches `reports_created_handlers` / `reports_updated_handlers` from `on_commit`; pgsearch hooks in via that registry. Nothing in `viewsets.py` imports from `radis.pgsearch`. |
+
+## 4. Schema and migrations
+
+### 4.1 Dependency
+
+Add to `pyproject.toml`:
+
+```toml
+"pgvector>=0.3",
+```
+
+### 4.2 Schema migration
+
+Schema lives in a single file `radis/pgsearch/migrations/0002_hybrid_search.py`,
+depending on `pgsearch.0001_initial` and `reports.0013_alter_report_options`.
+Three operations:
+
+1. `RunSQL("CREATE EXTENSION IF NOT EXISTS vector;", reverse_sql=RunSQL.noop)`.
+   Reverse is a no-op because the extension may be shared with other Postgres
+   usage and dropping it would damage unrelated state. Dev rollback is handled
+   by recreating the database.
+2. `AddField` `embedding` on `ReportSearchIndex`:
+   `pgvector.django.vector.VectorField(dimensions=settings.EMBEDDING_DIM, null=True)`.
+3. `AddIndex` HNSW on `embedding`: `m=16`, `ef_construction=64`,
+   `opclasses=["vector_cosine_ops"]`, `name="pgsearch_embedding_hnsw"`.
+
+The all-deferred embedding architecture (§6) has no orchestrator tables or
+system user, so this migration carries only schema. Reverse drops the index
+and column.
+
+### 4.4 Model update
+
+`radis/pgsearch/models.py`:
+
+```python
+from django.conf import settings
+from pgvector.django import HnswIndex, VectorField
+
+class ReportSearchIndex(models.Model):
+    report = models.OneToOneField(Report, on_delete=models.CASCADE, related_name="search_index")
+    search_vector = SearchVectorField(null=True)
+    embedding = VectorField(dimensions=settings.EMBEDDING_DIM, null=True)
+
+    class Meta:
+        indexes = [
+            GinIndex(fields=["search_vector"]),
+            HnswIndex(
+                name="pgsearch_embedding_hnsw",
+                fields=["embedding"],
+                m=16,
+                ef_construction=64,
+                opclasses=["vector_cosine_ops"],
+            ),
+        ]
+```
+
+`embedding` is nullable: the row exists from the moment a `Report` is created (FTS path), but its embedding is filled by the `embed_reports_task` Procrastinate worker, enqueued from `transaction.on_commit` (§6). A NULL embedding is treated as "not embedded yet" at query time, and the row participates via the FTS half only.
+
+`save()` on `ReportSearchIndex` retains its current behavior of recomputing `search_vector` from `report.body`. The embedding column is written **only** by `embed_reports_task` via `bulk_update()`, never by `save()`, to avoid triggering the FTS signal recursively and to keep the two indexing paths independent.
+
+### 4.5 Operational note on `EMBEDDING_DIM`
+
+pgvector columns and HNSW indexes are bound to a fixed dimension at create time, and HNSW has a 2000-dim ceiling (so `EMBEDDING_DIM ≤ 2000`; Qwen3-Embedding-4B's native 2560 is Matryoshka-truncated client-side). Changing `EMBEDDING_DIM` after deploy requires a manual operator procedure:
+
+1. Drop the HNSW index and the `embedding` column.
+2. Re-run `0002_hybrid_search` with the new `EMBEDDING_DIM`. This re-creates
+   the column at the new dim plus the HNSW index.
+3. Run `./manage.py embed_pending` to enqueue an `embed_reports_task` for
+   every row that's now NULL. The command is idempotent and resumable; the
+   embeddings worker drains the queue at its configured `--concurrency`.
+   See §6.5.
+4. From here on, new writes enqueue tasks against the new dim automatically.
+
+This is documented as a deployment-time decision and intentionally not automated.
+
+### 4.6 Startup safety check for env/migration drift
+
+A Django system check, reporting one of two error IDs, guards against the
+failure mode where `settings.EMBEDDING_DIM` no longer matches what the
+squashed `0002_hybrid_search` migration describes. Without it the divergence
+would surface later as an opaque pgvector dimension error on the first write
+or query.
+
+The migration-side dim is *not* stored in a hand-edited constant. Instead it
+is derived at check time from Django's `MigrationLoader` project state —
+built from the migration files on disk without a database connection — so
+there is exactly one source of truth (the `dimensions=...` literal that
+`makemigrations` itself generated from `settings.EMBEDDING_DIM` when
+`0002_hybrid_search` was first written).
+
+```python
+# radis/pgsearch/apps.py
+
+def _migration_embedding_dim() -> int | None:
+    """Return the `dimensions` value of `ReportSearchIndex.embedding` as
+    captured by the on-disk pgsearch migrations. Returns None if the field
+    cannot be located (e.g., migrations are missing or out of sync)."""
+    from django.db.migrations.loader import MigrationLoader
+
+    loader = MigrationLoader(connection=None, ignore_no_migrations=True)
+    state = loader.project_state()
+    try:
+        model = state.apps.get_model("pgsearch", "ReportSearchIndex")
+        return model._meta.get_field("embedding").dimensions
+    except (LookupError, AttributeError):
+        return None
+
+
+@register()
+def check_embedding_dim_matches_migration(app_configs, **kwargs):
+    migration_dim = _migration_embedding_dim()
+    if migration_dim is None:
+        return [Error(
+            "Could not determine the embedding column dimension from the "
+            "pgsearch migrations.",
+            id="pgsearch.E002",
+            hint="Verify that radis/pgsearch/migrations/ contains a migration "
+                 "that adds `embedding` to `ReportSearchIndex`.",
+        )]
+    if settings.EMBEDDING_DIM != migration_dim:
+        return [Error(
+            f"EMBEDDING_DIM={settings.EMBEDDING_DIM} does not match the dim "
+            f"baked into the pgsearch migrations (vector({migration_dim})). "
+            f"Either set EMBEDDING_DIM={migration_dim}, or run "
+            f"`makemigrations pgsearch` to capture the new dim and follow §4.5.",
+            id="pgsearch.E001",
+        )]
+    return []
+```
+
+Check IDs:
+
+| ID | When it fires |
+|---|---|
+| `pgsearch.E001` | `settings.EMBEDDING_DIM != migration_dim`. The familiar drift case. |
+| `pgsearch.E002` | `_migration_embedding_dim()` returns `None`. Indicates the migration tree is missing the `embedding` field — either it was deleted without replacement, or the model was renamed. Surfaces what would otherwise be a silent NoneType crash. |
+
+Alternatives considered and rejected:
+
+| Option | Authoritative for | DB connection | Verdict |
+|---|---|---|---|
+| Hand-edited constant (status quo before this change) | Nothing — must be manually transcribed | No | Drift-prone |
+| Parse `migrations/0002_hybrid_search.py` source | The literal in one specific file | No | Brittle; couples to filename |
+| `MigrationLoader` project state | The aggregated dim across all migrations | No | Chosen |
+| `information_schema.columns` on the live DB | The actually-deployed column dim | Yes | Loses the offline-check property |
+
+`MigrationLoader.project_state()` reflects the *post-all-migrations* state, so
+if a later migration drops and recreates the column at a different dim, the
+check stays correct without any code change to `apps.py`.
+
+## 5. Embedding client
+
+### 5.1 Module layout
+
+`radis/pgsearch/utils/embedding_client.py` exposes:
+
+- `class EmbeddingBackend(Protocol)` with `path`, `build_payload`, `parse_response`.
+- `class OpenAIBackend(EmbeddingBackend)` — default path `/v1/embeddings`, body `{model, input: [...]}`, response `{data: [{embedding: [...]}]}`.
+- `class OllamaBackend(EmbeddingBackend)` — default path `/api/embed`, body `{model, input: [...]}`, response `{embeddings: [[...]]}`.
+- `BACKENDS: dict[str, EmbeddingBackend] = {"openai": OpenAIBackend(), "ollama": OllamaBackend()}`.
+- `class EmbeddingClientError(Exception)`.
+- `class EmbeddingClient` — sync client used by both the query path (`providers.search` / `providers.retrieve`) and the `embed_reports_task` worker task (§6.2). A single client class keeps the configuration surface narrow; worker-side concurrency is provided by Procrastinate's `--concurrency K` flag spawning K sync task slots, not by intra-task asyncio.
+
+### 5.2 Interface
+
+```python
+class EmbeddingClient:
+    def __init__(self):
+        self._backend = BACKENDS[settings.EMBEDDING_BACKEND]
+        self._path = settings.EMBEDDING_PROVIDER_PATH or self._backend.path
+        self._url = settings.EMBEDDING_PROVIDER_URL.rstrip("/") + self._path
+        self._model = settings.EMBEDDING_MODEL_NAME
+        self._timeout = settings.EMBEDDING_REQUEST_TIMEOUT
+        self._headers = {"Authorization": f"Bearer {settings.EMBEDDING_PROVIDER_API_KEY}"} \
+                        if settings.EMBEDDING_PROVIDER_API_KEY else {}
+
+    def embed_documents(self, texts: list[str]) -> list[list[float]]:
+        """Embed texts verbatim. Returns L2-normalized vectors of length
+        EMBEDDING_DIM. Raises `EmbeddingPayloadTooLargeError` (subclass of
+        `EmbeddingClientError`) when the backend rejects the request because
+        one or more inputs exceed the model's context window."""
+
+    def embed_query(self, text: str) -> list[float]:
+        """Prepend EMBEDDING_QUERY_INSTRUCTION, then embed_documents([text])[0]."""
+```
+
+### 5.3 Wire shapes
+
+| Backend | Path (default) | Request | Response |
+|---|---|---|---|
+| `openai` | `/v1/embeddings` | `{"model": M, "input": [t, ...]}` | `{"data": [{"embedding": [...]}, ...]}` |
+| `ollama` | `/api/embed` | `{"model": M, "input": [t, ...]}` | `{"embeddings": [[...], ...]}` |
+
+`EMBEDDING_PROVIDER_PATH` (env) overrides the backend default — this is how the production endpoint at `/api/embeddings` with an OpenAI-style payload is supported by the `openai` backend with a one-line config change, no new backend needed.
+
+### 5.4 Behavior details
+
+- **Query instruction:** the model card for Qwen3-Embedding recommends a task-specific instruction prefix on the query side only. `embed_query` prepends `EMBEDDING_QUERY_INSTRUCTION` (a Python constant in `base.py`); `embed_documents` does not.
+- **Overlength inputs:** the client does *not* truncate. The model's context window is the authoritative limit, and the backend signals overlength via HTTP 413 or 400/422 with a context-length message in the body. The client detects that via a loose substring match on common keywords (`context length`, `max tokens`, `too long`, `exceeds`, …) and raises the typed `EmbeddingPayloadTooLargeError`. The `embed_reports_task` worker catches that subclass and bisects the chunk (§6.2); the query path lets it propagate (which the search view treats the same as any other `EmbeddingClientError` — fall back to FTS-only for that request).
+- **Normalization:** every returned vector is L2-normalized client-side, unconditionally. With unit vectors, cosine distance is monotonic in dot product, which makes the HNSW `vector_cosine_ops` operator effectively a fast inner-product search. Whether the upstream server normalizes is irrelevant.
+- **Dimension validation:** every vector is checked to have length `EMBEDDING_DIM`. A mismatch raises `EmbeddingClientError`.
+- **Batching:** `embed_documents` sends a single HTTP call per invocation. The write path and `embed_pending` both go through `enqueue_embed_reports(report_ids)` (defined in `tasks.py`), which chunks the input by `EMBEDDING_SUBJOB_SIZE` and defers one `embed_reports_task` per subjob. Inside each task, `EMBEDDING_BATCH_SIZE` controls the per-HTTP-call size. See §6.3 for the three-layer batching model.
+- **Errors:** non-2xx, timeout, malformed JSON, missing key, or wrong dim all raise `EmbeddingClientError`. The client never falls back internally — fallback policy is owned by the caller.
+- **Dev recipe (Ollama):**
+  ```bash
+  ollama pull dengcao/Qwen3-Embedding-4B:Q5_K_M
+  # in .env:
+  EMBEDDING_BACKEND=ollama
+  EMBEDDING_PROVIDER_URL=http://host.docker.internal:11434
+  EMBEDDING_MODEL_NAME=dengcao/Qwen3-Embedding-4B:Q5_K_M
+  # must be ≤ 2000 (HNSW ceiling, §4.5) and match the migration dim (§4.6)
+  EMBEDDING_DIM=1024
+  ```
+  GGUF-quantized embedding models produce slightly different vectors than the bf16 reference, so dev embeddings are not interchangeable with prod embeddings. After swapping the model between dev/prod, clear the column (`ReportSearchIndex.objects.update(embedding=None)`) and run `./manage.py embed_pending`.
+
+## 6. Async indexing (deferred via Procrastinate)
+
+Every successful report write enqueues an async Procrastinate task that embeds the report(s) on a dedicated worker queue. The write path is decoupled from the embedding service's uptime, transient outages auto-recover via Procrastinate's retry policy, and load on the embedding service is bounded by worker concurrency rather than request concurrency.
+
+### 6.1 The enqueue at write time
+
+`viewsets.py` is unchanged from main — it already dispatches `reports_created_handlers` / `reports_updated_handlers` inside `transaction.on_commit`. Pgsearch subscribes to those at app startup:
+
+```python
+# radis/pgsearch/apps.py — inside register_app()
+
+def _handle_reports_changed(reports):
+    if not reports:
+        return
+    report_ids = [r.pk for r in reports]
+    if settings.PGSEARCH_SYNC_INDEXING:
+        bulk_upsert_report_search_indexes(report_ids)
+        embed_reports_task.defer(report_ids=report_ids)
+    else:
+        # bulk_index_reports chains embed_reports_task at the end of its run,
+        # so the embeddings worker never sees a report id before its RSV row
+        # is committed.
+        enqueue_bulk_index_reports(report_ids)
+
+register_reports_created_handler(
+    ReportsCreatedHandler(name="PG Search", handle=_handle_reports_changed)
+)
+register_reports_updated_handler(
+    ReportsUpdatedHandler(name="PG Search", handle=_handle_reports_changed)
+)
+```
+
+The view contributes nothing pgsearch-specific. Whatever fires `reports_created_handlers` / `reports_updated_handlers` (the API viewsets, the Django admin's `save_model`, any future caller) automatically gets FTS + embedding for free.
+
+When the `transaction.atomic()` block commits:
+
+1. Report rows are durable.
+2. RSV rows exist (or will exist once `bulk_index_reports` runs, in the deferred FTS mode — see §6.6).
+3. A row is inserted into `procrastinate_jobs`: the embedding job itself in sync FTS mode, or the `bulk_index_reports` job that chains it in deferred FTS mode.
+
+The HTTP response returns at that point. The view handler does **not** await embedding.
+
+### 6.2 The task
+
+`radis/pgsearch/tasks.py`:
+
+```python
+@app.task(queue="embeddings")
+def embed_reports_task(report_ids: list[int]) -> None:
+    if not report_ids:
+        return
+
+    rsvs = list(
+        ReportSearchIndex.objects.filter(report_id__in=report_ids)
+        .select_related("report")
+        .only("id", "report_id", "report__body")
+    )
+    if not rsvs:
+        logger.warning("embed_reports_task: no RSVs for report ids %s", report_ids)
+        return
+
+    batch_size = settings.EMBEDDING_BATCH_SIZE
+    embedded: list[ReportSearchIndex] = []
+    skipped: list[ReportSearchIndex] = []
+    with EmbeddingClient() as client:
+        for start in range(0, len(rsvs), batch_size):
+            chunk = rsvs[start : start + batch_size]
+            _embed_with_bisect(client, chunk, embedded, skipped)
+
+    if embedded:
+        ReportSearchIndex.objects.bulk_update(embedded, fields=["embedding"])
+    if skipped:
+        logger.error("…skipped as too large; report_ids=%s", [r.report_id for r in skipped])
+
+
+def _embed_with_bisect(client, rsvs, embedded, skipped):
+    """Embed rsvs. On EmbeddingPayloadTooLargeError, bisect until we isolate
+    the single offender — then log report_id + body_chars and skip it.
+    Other EmbeddingClientError types propagate so Procrastinate retries."""
+    if not rsvs:
+        return
+    try:
+        vectors = _embed_chunk_with_retry(client, [rsv.report.body for rsv in rsvs])
+    except EmbeddingPayloadTooLargeError as exc:
+        if len(rsvs) == 1:
+            logger.error(
+                "embed_reports_task: report_id=%s body_chars=%d rejected as too "
+                "large; skipping. Backend: %s",
+                rsvs[0].report_id, len(rsvs[0].report.body), exc,
+            )
+            skipped.append(rsvs[0])
+            return
+        mid = len(rsvs) // 2
+        _embed_with_bisect(client, rsvs[:mid], embedded, skipped)
+        _embed_with_bisect(client, rsvs[mid:], embedded, skipped)
+        return
+    for rsv, vec in zip(rsvs, vectors, strict=True):
+        rsv.embedding = vec
+        embedded.append(rsv)
+```
+
+**Sync, not async**: each task issues batches sequentially (one HTTP round-trip at a time, waiting for the response before launching the next), so asyncio inside a single task wouldn't add concurrency. Worker concurrency comes from Procrastinate's `--concurrency K` flag, which gives K independent task slots regardless of whether the task body is `def` or `async def`. A sync task keeps the call graph readable — direct ORM, direct `httpx.Client`, no `database_sync_to_async` shims.
+
+**Internal batching**: `enqueue_embed_reports` caps each task at `EMBEDDING_SUBJOB_SIZE` report ids (e.g., a 1000-row bulk-upsert dispatches 10 tasks of 100; see §6.3), and the task chunks its `report_ids` list into HTTP calls of at most `EMBEDDING_BATCH_SIZE` reports each. This decouples the *enqueue size* (bounded by `EMBEDDING_SUBJOB_SIZE`) from the *embedding service call size* (always bounded by `EMBEDDING_BATCH_SIZE`, regardless of input). The vLLM endpoint sees a steady stream of similarly sized batches rather than occasional spike requests.
+
+**Bisect on payload-too-large**: the client signals overlength inputs via the typed `EmbeddingPayloadTooLargeError` subclass (§5.4). The task catches it in `_embed_with_bisect` and recursively halves the failing chunk; the recursion terminates either when a sub-chunk succeeds or when a single rsv is isolated. In the isolated case the task logs ERROR with the specific `report_id` + `body_chars`, appends to `skipped`, and continues — the rest of the batch still gets embedded. At task end, an ERROR-level summary lists all skipped ids so operators can find them with one log search. The skipped reports' RSVs stay NULL; re-running `embed_pending` will re-attempt and re-log them, which is the expected stop signal for the operator to fix the upstream report or raise the model's context window. Bisect cost: roughly `2·log₂ K` extra HTTP calls for a single offender in a chunk of K, and at most `2K − 2` extra calls when every input in the chunk is oversized — but only when an offender exists; the common case is one HTTP call per chunk.
+
+**Two layers of retry for transient errors**: the actual embed call is wrapped in `_embed_chunk_with_retry`, a [stamina](https://stamina.hynek.me/)-decorated function:
+
+```python
+def _is_retryable_embedding_error(exc: Exception) -> bool:
+    return isinstance(exc, EmbeddingClientError) and not isinstance(
+        exc, EmbeddingPayloadTooLargeError
+    )
+
+@stamina.retry(
+    on=_is_retryable_embedding_error,
+    attempts=3, timeout=30.0, wait_initial=0.5, wait_max=8.0,
+)
+def _embed_chunk_with_retry(client, texts):
+    return client.embed_documents(texts)
+```
+
+- **stamina (inline, per-call):** 3 attempts within ~30 s, exponential backoff with jitter. Handles brief blips — a single 5xx, a network jitter, a transient timeout. The predicate `_is_retryable_embedding_error` explicitly *excludes* `EmbeddingPayloadTooLargeError` so the bisect logic owns that case end-to-end without burning retry budget on a deterministic rejection.
+- **Procrastinate (task-level, per-task):** when stamina's budget is exhausted the exception escapes the task, and Procrastinate's exponential-backoff retry kicks in for the whole batch. Handles extended outages where the embedding service is down for minutes-to-hours. On retry the entire batch loop reruns (idempotent: `bulk_update` overwrites identical vectors with no change).
+- **Why two layers and not just one:** stamina inside the task absorbs the common case of "the service blipped once" without the operator-visible noise of a Procrastinate retry event, and without re-doing all the bookkeeping (`SELECT FOR UPDATE SKIP LOCKED`, lease, ack). Procrastinate above the task covers the long-tail case stamina is not budgeted for. Stamina alone would mean a single 30-s outage permanently fails the task; Procrastinate alone would mean every blip incurs a full task replay.
+
+For tests, the repo-wide `conftest.py` disables stamina globally via `stamina.set_active(False)`; specific tests that exercise retry behaviour opt back in with the `stamina_active` fixture.
+
+### 6.3 The worker and the concurrency model
+
+A dedicated `embeddings_worker` container is added to `docker-compose.*.yml` with an explicit concurrency flag:
+
+```yaml
+embeddings_worker:
+  <<: *default-app
+  command: |
+    bash -c "
+      wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} &&
+      ./manage.py bg_worker -q embeddings --concurrency 4
+    "
+```
+
+Three explicit choices:
+
+- **Dedicated queue (`embeddings`)**: isolated from `default` (extraction / subscription) and `llm`. A backfill or write burst can't starve unrelated tasks.
+- **`--concurrency 4`** (the concurrency knob): up to 4 `embed_reports_task` slots in flight on the worker at once. Each slot processes its batches sequentially, so `--concurrency K` translates directly to "up to K embedding HTTP requests in flight to the embedding service per worker process." Total system concurrency = `worker_count × --concurrency`. The default of 4 leaves capacity for the query path's `embed_query` to share the same embedding service. Tunable per deployment.
+- **Sync task body**: the task is `def`, not `async def`. Procrastinate gives concurrency through K independent task slots regardless of sync vs async, and the embedding batch loop is sequential by design — switching to async would not add any in-task concurrency, just a `database_sync_to_async` shim layer.
+
+**Three layers of "batching"** (plus, in the table's last row, the system-wide ceiling they add up to), easy to confuse, kept separate by design:
+
+| Layer | Knob | What it controls |
+|---|---|---|
+| Per-Procrastinate-task size | `EMBEDDING_SUBJOB_SIZE` (settings constant; default 100) | How many report ids one `embed_reports_task` instance carries. The single chunking point for *every* enqueue — write-path handler, FTS chain tail, `embed_pending`, admin action — via `enqueue_embed_reports(report_ids)`. |
+| Per-HTTP-call size | `EMBEDDING_BATCH_SIZE` (settings constant; default 32) | How many report bodies are sent in one `embed_documents` call *inside* one task. One subjob of 100 → 4 HTTP calls (3 × 32 + 1 × 4). |
+| Concurrent task slots per worker | `--concurrency K` (compose flag; default 4) | How many `embed_reports_task` instances run in parallel on a single worker. |
+| Concurrent HTTP calls across all workers | `worker_count × --concurrency K` | The system's actual load ceiling on the embedding service. |
+
+Why subjob granularity matters: a 1M-row `embed_pending` backfill becomes ~10k subjobs of 100, not one giant task. Multiple workers can drain in parallel; a stuck or failing subjob has bounded blast radius (retries reprocess only 100 ids, not 1M); Procrastinate's `--concurrency K` actually means something for backfill throughput. Write-path bulk-upserts get the same treatment: a 1000-row upload → 10 embed subjobs, not one.
+
+To scale up, prefer adding worker processes (crash isolation + connection-pool fan-out) over raising `--concurrency` past ~8 (the embedding service typically saturates around there anyway). Total embedding load on the service is `worker_count × --concurrency`.
+
+### 6.4 Failure semantics
+
+Procrastinate handles transient failures automatically; `embed_pending` (§6.5) handles extended outages.
+
+| Failure | What happens |
+|---|---|
+| **Brief blip** (single 5xx / timeout / network jitter ≲ seconds) | stamina inside the task retries the same HTTP call up to 3 times within ~30 s. Most cases recover before the task even completes its current batch loop iteration. No Procrastinate retry event. |
+| **Transient outage** (service degraded for minutes; outlasts stamina's 30 s budget) | Stamina exhausts → exception escapes the task → Procrastinate's task-level retry kicks in with exponential backoff. Most cases auto-recover; the embedding is written without operator action. |
+| **Extended outage** (service down longer than Procrastinate's retry window) | Task ends in `failed` state in `procrastinate_jobs`. RSV stays NULL. Operator runs `./manage.py embed_pending` (or the admin action) once the service recovers to re-enqueue the affected rows. |
+| **Wrong-dim vector returned by backend** | `EmbeddingClientError` raised → retries → all fail the same way → task ends `failed`. Operator inspects, fixes config (or the `pgsearch.E001` system check catches it at deploy time). |
+| **Worker offline / crashed** | Tasks pile up in `procrastinate_jobs.todo`. When a worker starts, it picks them up via `SELECT ... FOR UPDATE SKIP LOCKED`. No data loss. Write path unaffected. |
+| **Embedding written and report immediately deleted** | `bulk_update` updates zero rows for the deleted RSV; rest of the batch is unaffected. Benign. |
+| **`EMBEDDING_PROVIDER_URL` empty / misconfigured** | `EmbeddingClient.__init__` raises `EmbeddingClientError` at task start → retries fail → task ends `failed`. Operator fixes settings, runs `embed_pending`. |
+| **`settings.EMBEDDING_DIM` ≠ migration dim** | `pgsearch.E001` system check blocks startup; this is caught at deploy time, not runtime. |
+
+The **write path never fails because of embedding**. Reports are saved, FTS indexed sync, vector indexing best-effort with retries + recovery.
+
+### 6.5 `embed_pending` — operator-driven recovery
+
+The `./manage.py embed_pending` command is retained, with one change: it now **enqueues `embed_reports_task` instances** rather than running embedding work inline in the command process. This keeps the embedding service load bounded by the worker's configured concurrency rather than by however fast the operator's shell can iterate.
+
+```python
+async def _drain(self, ids, batch_size):
+    for i in range(0, len(ids), batch_size):
+        chunk = ids[i : i + batch_size]
+        embed_reports_task.defer(report_ids=chunk)
+        self.stdout.write(f"  enqueued {i + len(chunk)}/{len(ids)}")
+```
+
+The three scenarios still apply:
+
+1. **Backfill** of historical NULLs (rows loaded before the deferred-embedding architecture shipped).
+2. **Dim or model change** following §4.5 (or `ReportSearchIndex.objects.update(embedding=None)` for a same-dim model swap).
+3. **Outage recovery** for tasks that exhausted Procrastinate retries during an extended embedding-service outage.
+
+Properties:
+
+- **Idempotent.** Filter is `embedding IS NULL`; re-runs are no-ops on already-drained rows.
+- **Resumable.** No checkpoint state. Killed mid-run → re-run picks up remaining NULLs.
+- **Rate-limited.** The worker's `--concurrency K` caps concurrent embedding HTTP calls regardless of how many tasks the command enqueues. Operators cannot accidentally hammer the embedding service.
+- **Visible.** Enqueued tasks appear in the standard Procrastinate observability surface (admin, logs, telemetry). Failed retries surface there as well.
+
+### 6.6 `PGSEARCH_SYNC_INDEXING` retained; ordering enforced by chaining
+
+The pre-existing `PGSEARCH_SYNC_INDEXING` switch is **retained** with the same semantics it had before hybrid search: it controls whether FTS bulk-indexing runs inline on the request thread or is deferred to a `bulk_index_reports` Procrastinate task. Pgsearch's `_handle_reports_changed` reads the flag and dispatches accordingly:
+
+| Mode | `PGSEARCH_SYNC_INDEXING` | FTS step | Embedding step |
+|---|---|---|---|
+| Sync | `True` | `bulk_upsert_report_search_indexes(ids)` inline inside the handler | `embed_reports_task.defer(...)` immediately after, in the same handler call. RSV rows are already committed. |
+| Deferred (default) | `False` | `enqueue_bulk_index_reports(ids)` defers `bulk_index_reports` to the `default` queue | `bulk_index_reports` itself defers `embed_reports_task` at the end of its run. Handler does *not* defer embed directly. |
+
+`bulk_index_reports` now ends with `embed_reports_task.defer(report_ids=...)`. The defer happens inside the same task body, after `bulk_upsert_report_search_indexes` has committed the RSV rows, so the embeddings worker can only observe a `report_ids` payload whose RSV rows already exist. This replaces the earlier "defensive idempotent re-upsert at the top of the embed task" design — the chain is the ordering guarantee.
+
+Properties:
+
+- **No race.** The embeddings worker never picks up a report id before its RSV row is committed. The embed task can read `report.body` and write `embedding` without checking for RSV existence.
+- **Simple embed task.** No `bulk_upsert_report_search_indexes` shim at the top, no idempotent re-upsert cost on the embeddings worker, no extra commit hop.
+- **Operator choice preserved.** Deployments that prefer sync FTS keep that option; deployments that prefer the deferred FTS task for large bulks keep that option. Hybrid search is orthogonal to the FTS-mode decision.
+- **Two queues, two concerns.** FTS deferral runs on the `default` queue (where `bulk_index_reports` already lived); embedding runs on the dedicated `embeddings` queue. FTS-only worker capacity does not compete with embedding capacity.
+- **Operator-triggered re-embed.** The `embed_pending` management command and the `enqueue_pending_embeddings` admin action defer `embed_reports_task` directly. Both bypass `bulk_index_reports` but the invariant still holds: their queries are over existing `ReportSearchIndex` rows with `embedding IS NULL`, so the RSV rows exist by construction.
+
+The single-create / PUT path is unaffected by `PGSEARCH_SYNC_INDEXING`. Its FTS step is the `post_save` signal on `Report`, which is always sync inline by construction. The same handler still fires for it; the handler then takes the sync-mode branch's behaviour (immediate embed defer), which is correct since the RSV row was just written sync by the signal.
+
+### 6.7 Sync DRF; no async views required
+
+The enqueue (`embed_reports_task.defer(...)`) is a synchronous Procrastinate API call, so the report views remain plain sync DRF (`ReportViewSet`, unchanged in shape from main). No `await` lives inside any request handler. The async-view rewrite proposed in PR #230 is **not a dependency** of this design and is intentionally not pulled in — the entire embedding workload lives on the worker side, behind the `embeddings` queue.
+
+## 7. Hybrid search provider
+
+### 7.1 Universe and fusion
+
+The hybrid result universe is the **union** of two filter-bounded candidate sets:
+
+- **Vector top-K:** the `HYBRID_VECTOR_TOP_K` nearest rows by cosine distance to the query embedding, filtered by structured filters and `embedding IS NOT NULL`. *Not* constrained to the FTS hit set.
+- **FTS hits:** all rows matching the tsquery and the structured filters, capped at `HYBRID_FTS_MAX_RESULTS`.
+
+A report appears in results if it is in **either** set. This is the change from the earlier draft, made because radiology queries like "no pneumothorax" must be able to surface reports that lexically don't match (the text-search configuration drops "no" as a stop word when building the tsvector and tsquery) but are semantically related.
+
+Each report's score is plain Reciprocal Rank Fusion:
+
+```
+score(d) = (1 / (HYBRID_RRF_K + vec_rank[d])  if d ∈ vec_top_K  else 0)
+         + (1 / (HYBRID_RRF_K + fts_rank[d])  if d ∈ fts_hits   else 0)
+```
+
+Properties:
+
+- Reports in both sides outrank reports in only one side (sum of two terms vs. one).
+- Vector contribution drops to zero past rank K (no `vec_rank` entry), so the ordering naturally transitions from "hybrid head" to "FTS tail" with no explicit cutoff.
+- A query with zero FTS hits returns `vec_top_K` ranked by vector position only — pure semantic search.
+- A query with embedding failure returns FTS hits ranked by `ts_rank` only — the pre-hybrid behavior.
+
+### 7.2 `search()` flow
+
+```python
+def search(s: Search) -> SearchResult:
+    query_str = _build_query_string(s.query)
+    language  = _resolve_language(s.filters)
+    filter_q  = _build_filter_query(s.filters)
+    tsquery   = SearchQuery(query_str, search_type="raw", config=language)
+
+    # Vector side: strip NOT branches before embedding (see §7.8), then embed.
+    # If stripping leaves nothing (e.g., the user query was just `NOT X`),
+    # skip vector retrieval entirely and fall through to FTS-only.
+    query_text = QueryParser.unparse_for_embedding(s.query)
+    query_vec: list[float] | None = None
+    if query_text.strip():
+        try:
+            query_vec = EmbeddingClient().embed_query(query_text)
+        except EmbeddingClientError as e:
+            logger.warning("Falling back to FTS-only: %s", e)
+            query_vec = None
+
+    vec_rank: dict[int, int] = {}
+    if query_vec is not None:
+        ids = list(
+            ReportSearchIndex.objects
+                .filter(filter_q)
+                .exclude(embedding__isnull=True)
+                .annotate(distance=CosineDistance("embedding", query_vec))
+                .order_by("distance", "report_id")
+                .values_list("report_id", flat=True)[:settings.HYBRID_VECTOR_TOP_K]
+        )
+        vec_rank = {rid: i + 1 for i, rid in enumerate(ids)}
+
+    # FTS side
+    fts_rows = list(
+        ReportSearchIndex.objects
+            .filter(filter_q)
+            .filter(search_vector=tsquery)
+            .annotate(rank=SearchRank(F("search_vector"), tsquery))
+            .order_by("-rank", "report_id")
+            .values("report_id", "rank")[:settings.HYBRID_FTS_MAX_RESULTS]
+    )
+    fts_rank = {row["report_id"]: i + 1 for i, row in enumerate(fts_rows)}
+
+    # Fusion (pure Python, factored out for unit testing)
+    ordered_ids = _rrf_fuse(vec_rank, fts_rank, k=settings.HYBRID_RRF_K)
+
+    total_count = len(ordered_ids)
+    total_relation = (
+        "at_least"
+        if len(fts_rows) >= settings.HYBRID_FTS_MAX_RESULTS
+           or len(vec_rank) >= settings.HYBRID_VECTOR_TOP_K
+        else "exact"
+    )
+    page_ids = ordered_ids[s.offset : s.offset + (s.limit or len(ordered_ids))]
+
+    # Headline + hydration for the page slice only
+    page_rows = (
+        ReportSearchIndex.objects
+            .filter(report_id__in=page_ids)
+            .annotate(
+                summary=SearchHeadline("report__body", tsquery, config=language,
+                                       start_sel="<em>", stop_sel="</em>",
+                                       min_words=10, max_words=20, max_fragments=10),
+                rank=SearchRank(F("search_vector"), tsquery),
+            )
+            .select_related("report")
+    )
+    by_id = {r.report_id: r for r in page_rows}
+    documents = [
+        document_from_pgsearch_response(_with_fallback_summary(by_id[rid]))
+        for rid in page_ids if rid in by_id
+    ]
+    return SearchResult(total_count=total_count, total_relation=total_relation, documents=documents)
+```
+
+### 7.3 Empty-summary fallback
+
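+`SearchHeadline` is expected to return an empty string when the document body has no FTS match (the vector-only hit case); `_with_fallback_summary` replaces an empty summary with the first 30 words of `report.body`. Trivial helper, ~5 lines. (To verify during implementation: PostgreSQL documents that fragment-based `ts_headline` — `max_fragments > 0`, as used in §7.2 — falls back to the first `min_words` words of the document when no query term is found, in which case the summary is non-empty and this helper only fires for empty bodies.)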
+
+### 7.4 `retrieve()`
+
+Same fusion logic, returns an iterator of `report__document_id` in `ordered_ids` order. No headline. Used by `ExtractionJob` and `SubscriptionJob` to walk the matching id set.
+
+### 7.5 `count()` and `filter()`
+
+Unchanged. These operate on filters only and never call the embedding service.
+
+### 7.6 `ReportDocument` score fields
+
+`ReportDocument` (`radis/search/site.py`) carries three score fields. The
+existing `relevance` is preserved for API backwards compatibility; two new
+fields are added so callers (and the UI) can see *why* a result ranked where
+it did:
+
+```python
+class ReportDocument(NamedTuple):
+    relevance: float | None                  # FTS ts_rank — existing; 0.0 for vector-only hits
+    document_id: str
+    # ...
+    cosine_distance: float | None = None     # NEW — pgvector cosine distance; None for FTS-only hits
+    rrf_score: float = 0.0                   # NEW — the value the final ordering is based on
+```
+
+Semantics:
+
+- `relevance` — Postgres `ts_rank` of the row's `search_vector` against the
+  tsquery. Same field/shape pre- and post-hybrid; callers that read it
+  continue to work. Defaults to `0.0` for documents that came from the vector
+  half only.
+- `cosine_distance` — the `CosineDistance("embedding", query_vec)` annotation
+  for rows that made `vec_top_K`. `None` for FTS-only hits and whenever the
+  query path skipped vector retrieval (embedding service down, or the query
+  reduced to an empty string after §7.8 `NOT`-stripping).
+- `rrf_score` — the fused score from §7.1; this is what the result ordering
+  is based on. Exposed for transparency, debugging, and UI display
+  (operators can see at a glance which side contributed). Also useful when
+  the §11.6 re-ranker lands: it will read `rrf_score` to seed its top-N
+  candidate selection.
+
+All three fields are populated by `document_from_pgsearch_response` during
+the page-slice hydration step in §7.2. The hydration query annotates the page
+rows with `ts_rank`, looks up the corresponding entries in the `vec_rank` /
+`fts_rank` / `rrf` dicts, and assembles the document.
+
+### 7.7 `search_provider.max_results`
+
+Updated to `max(HYBRID_VECTOR_TOP_K, HYBRID_FTS_MAX_RESULTS)`, which is what the `SearchView` page-bound check uses to reject impossibly-deep pagination.
+
+### 7.8 Negation-aware query for embedding
+
+Dense embedding models are polarity-blind: the vector for `"NOT pneumothorax"`
+clusters near the vector for `"pneumothorax"`, so the top-K nearest neighbours
+to a `NOT X` query are documents *about* X — the polar opposite of what the
+user asked for. The FTS half handles `NOT X` correctly (it returns docs
+without X), so when both halves are fused naively the vector half pollutes
+the candidate pool with anti-matches.
+
+The fix is upstream of embedding: strip negated branches from the query string
+before sending it to the embedding model. The FTS side still receives the
+full structured query, so its negation semantics are preserved.
+
+A new static method on `QueryParser` walks the AST and emits a stripped
+string. The shape mirrors the existing `QueryParser.unparse` walker:
+
+```python
+@staticmethod
+def unparse_for_embedding(node: QueryNode) -> str:
+    """Like unparse(), but drops the operand of every UnaryNode("NOT", X)
+    (so the whole NOT clause disappears) and collapses any BinaryNode with an
+    empty child down to its surviving side ("" when both children are empty).
+    Returns the empty string if the whole query reduces to NOT clauses."""
+    if isinstance(node, TermNode):
+        # Same as unparse: emit the term verbatim (PHRASE keeps quotes).
+        return QueryParser.unparse(node)
+    if isinstance(node, ParensNode):
+        inner = QueryParser.unparse_for_embedding(node.expression)
+        return f"({inner})" if inner else ""
+    if isinstance(node, UnaryNode):
+        # The only unary operator in the grammar is NOT — drop the operand.
+        return ""
+    if isinstance(node, BinaryNode):
+        left = QueryParser.unparse_for_embedding(node.left)
+        right = QueryParser.unparse_for_embedding(node.right)
+        if not left and not right:
+            return ""
+        if not left:
+            return right
+        if not right:
+            return left
+        if node.implicit:
+            return f"{left} {right}"
+        return f"{left} {node.operator} {right}"
+    raise ValueError(f"Unknown node type: {type(node)}")
+```
+
+Outcomes:
+
+| User query | `unparse()` (FTS path) | `unparse_for_embedding()` (vector path) | Behavior |
+|---|---|---|---|
+| `pneumothorax` | `pneumothorax` | `pneumothorax` | Both halves agree; RRF amplifies. |
+| `A AND NOT B` | `A AND NOT B` | `A` | Vector embeds the positive concept; FTS enforces the exclusion. |
+| `NOT X` | `NOT X` | `""` | Vector path skipped (see §7.2); FTS-only ranking. |
+| `(A AND NOT B) OR C` | `(A AND NOT B) OR C` | `(A) OR C` | Empty NOT branch collapses; surviving structure retained for vector. |
+
+The method does not attempt to resolve OR-asymmetry or other operator
+mismatches documented in §11.5 — those remain open trade-offs in the design.
+This is a targeted fix for the `NOT` case, which is the most acute failure
+mode for radiology queries.
+
+## 8. Configuration
+
+### 8.1 Env-driven (per-deployment, set in `.env`)
+
+```python
+# radis/settings/base.py
+EMBEDDING_BACKEND          = env.str("EMBEDDING_BACKEND", default="openai")
+EMBEDDING_PROVIDER_URL     = env.str("EMBEDDING_PROVIDER_URL", default="")
+EMBEDDING_PROVIDER_PATH    = env.str("EMBEDDING_PROVIDER_PATH", default="")   # "" = backend default
+EMBEDDING_PROVIDER_API_KEY = env.str("EMBEDDING_PROVIDER_API_KEY", default="")
+EMBEDDING_MODEL_NAME       = env.str("EMBEDDING_MODEL_NAME", default="Qwen/Qwen3-Embedding-4B")
+EMBEDDING_DIM              = env.int("EMBEDDING_DIM", default=1024)
+```
+
+These vary across dev/staging/prod and are operator-controlled. `EMBEDDING_DIM` is intentionally an env decision because it is schema-coupled (see §4.5). Worker concurrency is set in the compose command line (`bg_worker -q embeddings --concurrency K`), not via env — it's a deployment-shape decision rather than a runtime tunable.
+
+### 8.2 Code constants (tuning knobs, in `base.py`)
+
+```python
+EMBEDDING_REQUEST_TIMEOUT = 30  # seconds
+EMBEDDING_QUERY_INSTRUCTION = (
+    "Instruct: Given a radiology search query, retrieve relevant radiology reports.\n"
+    "Query: "
+)
+EMBEDDING_BATCH_SIZE = 32
+EMBEDDING_SUBJOB_SIZE = 100
+
+HYBRID_VECTOR_TOP_K    = 100
+HYBRID_FTS_MAX_RESULTS = 10_000
+HYBRID_RRF_K           = 60
+```
+
+These are tuning constants. Changing them is a code change with a PR diff. This matches the project's existing pattern (`EXTRACTION_LLM_CONCURRENCY_LIMIT = 6`, the `CHAT_*_SYSTEM_PROMPT` blocks).
+
+### 8.3 `example.env`
+
+Adds a documented Ollama block and a Qwen/OpenAI-compatible block side by side, keyed off `EMBEDDING_BACKEND`.
+
+### 8.4 Compose
+
+`docker-compose.base.yml`:
+
+- The `EMBEDDING_BACKEND`, `EMBEDDING_PROVIDER_URL`, `EMBEDDING_PROVIDER_PATH`, `EMBEDDING_PROVIDER_API_KEY`, `EMBEDDING_MODEL_NAME`, `EMBEDDING_DIM` env keys are added to the `&default-app` block so all services see them.
+- New service `embeddings_worker` inheriting `*default-app` runs `./manage.py bg_worker -q embeddings --concurrency 4` (see §6.3).
+
+`docker-compose.dev.yml` and `docker-compose.prod.yml`:
+
+Both files add an `embeddings_worker.command` block. Dev uses `-l debug --autoreload`; prod uses `-l info`. Both pass `-q embeddings --concurrency 4` by default — tune per deployment.
+
+## 9. Error handling and degradation
+
+| Failure | Behavior | Logging |
+|---|---|---|
+| Embedding service returns 5xx/timeout during query-time | `query_vec = None`; result list ordered by FTS-only; request succeeds | WARNING with request id |
+| Embedding service returns 4xx during query-time | Same FTS-only fallback (treats as misconfig at request layer) | ERROR |
+| Embedding service returns malformed body | `EmbeddingClientError` raised; query falls back to FTS-only | ERROR |
+| Embedding service down during `embed_reports_task` execution | Task raises `EmbeddingClientError`; Procrastinate retries with exponential backoff. After retries exhaust, task ends `failed`; `embedding` stays NULL. **API request was never affected** (already returned at the on_commit point). | WARNING per retry; ERROR on final failure |
+| Orchestrator crashes during task creation (partial dispatch) | Job stays in `PREPARING`. Next launcher tick sees in-flight job and no-ops. Operator marks job `FAILURE` in admin to allow a fresh run | ERROR + operator action |
+| Sub-task fails after Procrastinate retries exhausted | Task ends as `FAILURE`. `update_job_state` rolls the job to `WARNING` (some tasks succeeded) or `FAILURE` (all failed). NULL rows remain; next launcher creates a new job to retry them | ERROR |
+| Report body exceeds embedding model's context window (backend returns 413, or 400/422 with a context-length message) | Client raises `EmbeddingPayloadTooLargeError`. Task bisects the chunk and retries; once the offender is isolated to one report, it is skipped and its RSV stays NULL. The rest of the chunk still gets embedded. | ERROR per offender (report_id + body_chars) and ERROR summary listing all skipped ids |
+| Report deleted between task creation and execution | Sub-task's `task.reports.values_list(...)` returns fewer rows; `embed_documents` called on smaller list; no error | DEBUG |
+| Vector dim mismatch on write | Postgres raises; sub-task fails, retried | ERROR — escalate to admin |
+| `EMBEDDING_PROVIDER_URL` empty at startup | `EmbeddingClient` construction defers to call site; calls log + raise; query falls back to FTS-only | WARNING once on first request |
+| System user missing (data migration didn't run) | Launcher raises `User.DoesNotExist`. Loud failure; deployment misconfiguration. Fix: run migrations | ERROR |
+
+**Deliberate non-policies:**
+
+- The product never fails a search request because the embedding service is down. It degrades to FTS-only.
+- Query embeddings are not cached. The complexity and freshness trade-off is not worth it at the corpora sizes RADIS targets.
+- `EmbeddingClient` does not retry internally. The worker path layers `stamina.retry` over the client call inside `_embed_chunk_with_retry` (3 attempts / 30 s budget) and lets Procrastinate's task-level retry handle anything stamina can't absorb. The query path uses a single shot and falls back to FTS-only on any `EmbeddingClientError`.
+
+**Observability:**
+
+- Provider logs at DEBUG: vec hit count, FTS hit count, intersection count, fusion ms, query-embed ms.
+- `embed_reports_task` logs at INFO: batch size, total chars, latency, attempt number.
+- The existing OpenTelemetry overlay (commit `653e0c67`) tags telemetry per service; embedding spans show up under the `embeddings_worker` service.
+
+## 10. Testing strategy
+
+### 10.1 Unit tests (no DB)
+
+| File | Coverage |
+|---|---|
+| `tests/unit/test_embedding_client.py` | Backend payload/response round-trip, path override, instruction prefix, normalization, dim validation, all error modes, truncation |
+| `tests/unit/test_provider_fusion.py` | `_rrf_fuse(vec_rank, fts_rank, k)` pure-Python helper: disjoint, overlapping, FTS-only, vector-only, both-empty, tiebreak by report_id |
+| `tests/unit/test_embed_reports_task.py` | Loads RSVs by report_id, calls `EmbeddingClient.embed_documents`, bulk-updates vectors. Asserts internal batching by `EMBEDDING_BATCH_SIZE`, that `EmbeddingClientError` propagates so Procrastinate's retry policy applies (the task does not swallow), and that `bulk_index_reports` chains `embed_reports_task.defer(...)` at the end of its run so the embeddings worker only sees report ids whose RSV rows are committed. |
+
+### 10.2 Integration tests (real Postgres + pgvector)
+
+| File | Coverage |
+|---|---|
+| `tests/integration/test_migrations.py` (new, `django-test-migrations`) | Extension migration runs; column + HNSW index created with configured dim; reverse works |
+| `tests/integration/test_provider_hybrid.py` (new) | FTS-only hit, vector-only hit ("no pneumothorax" fixture), both-sides hit, filter honoring, stable pagination, embedding-service-down fallback, NULL-embedding rows still returned, `ts_headline` query-count bounded to page, empty-summary fallback |
+
+Factories: existing `ReportSearchIndexFactory` gains optional `embedding` kwarg (default `None`). New `ReportSearchIndexWithEmbeddingFactory` generates deterministic normalized vectors of the configured dim from a seed. Real Qwen3 embeddings are not used in tests.
+
+### 10.3 View-level smoke
+
+`radis/search/tests/test_views.py` (extend):
+
+- Search request with hybrid enabled returns 200 and renders documents.
+- Search request with `EMBEDDING_PROVIDER_URL=""` returns 200 (FTS-only path).
+
+### 10.4 Acceptance (`@pytest.mark.acceptance`)
+
+One end-to-end test against the dev containers, with the embedding service stubbed (either a small in-test FastAPI or a recorded fixture response), verifying the search page returns hybrid results. Marked acceptance so it's opt-in like the existing acceptance suite.
+
+### 10.5 Explicitly not tested
+
+- Live Qwen3 retrieval quality (offline eval, out of scope).
+- pgvector HNSW recall under specific data shapes (extension's responsibility).
+- Wire formats beyond the two supported backends.
+
+## 11. Known limitations and future work
+
+### 11.1 Negation / polarity (the "no pneumothorax" problem)
+
+Dense embedding models — including Qwen3-Embedding — embed semantically opposite phrases close together. "No pneumothorax" and "pneumothorax present" produce nearby vectors, so the vector half of the hybrid score is *polarity-blind*. The FTS half partly compensates by allowing the user to construct explicit AND-NOT queries, but Postgres' text-search configuration drops "no" as a stop word (this happens in the dictionary step that builds the tsvector/tsquery, not in the GIN index), so a naive query like `no pneumothorax` is effectively `pneumothorax` on the FTS side.
+
+This is a real concern for radiology, where negated findings are pervasive ("no acute …", "no evidence of …", "no significant …"). **Hybrid search as designed here does not solve this.** It is documented as an accepted limitation of v1, and a v2 conversation should address it.
+
+Candidate solutions to evaluate in a future spec (none committed):
+
+- A cross-encoder re-ranker over the top-N hybrid results (e.g., a small instruction-tuned model that knows to score "no X" against "X present" as opposite).
+- Adding a sparse/late-interaction model (SPLADE, ColBERT) alongside the dense vector — sparse models preserve token-level polarity.
+- Negation-aware query preprocessing: detect negation, route to a different retrieval mode, or expand to phrasal `AND-NOT` clauses on the FTS side that bypass the stop-word filter (e.g., search the raw body, not the tsvector).
+- Structured-findings indexing: have the LLM extract presence/absence flags per finding category at ingest time, search those structured fields instead of (or in addition to) prose.
+
+### 11.2 Dimension changes are manual
+
+See §4.5.
+
+### 11.3 GGUF dev embeddings ≠ bf16 prod embeddings
+
+Documented in §5.4. Mitigated by following §4.5 after a model swap and then running `./manage.py embed_pending` (§6.5), which enqueues `embed_reports_task` for every NULL row; the embeddings worker drains the queue at its configured concurrency.
+
+### 11.4 No body-change detection for re-embedding
+
+V1 re-embeds anything where `embedding IS NULL`. A future optimization could
+track whether the body actually changed (e.g., a `body_hash` column on
+`ReportSearchIndex` updated only on body changes) so metadata-only updates
+don't have to null the embedding. Not in v1; profiling will tell us whether it
+matters.
+
+### 11.5 Operator-aware queries: residual FTS / vector asymmetry
+
+Both halves of hybrid search receive a derivation of the same parsed `QueryNode`, but interpret it through completely different machinery. The FTS side consumes a `tsquery` built by `_build_query_string` where `AND`, `OR`, `NOT`, quoted phrases, and parens are first-class boolean operators (`&`, `|`, `!`, `<->`, `()`). The vector side consumes a string derived from the AST by `QueryParser.unparse_for_embedding` (§7.8) and feeds it to the embedding model as natural language; the remaining operators become ordinary word tokens that the model has no operator-aware machinery to interpret.
+
+Practical consequences after the §7.8 NOT-stripping fix:
+
+- **Natural-phrase queries** (`pneumothorax`, `chest x-ray`, implicit-AND `cardiac arrest`) — both halves point the same direction. RRF amplifies the agreement. This is the workload hybrid search is best at.
+- **`A AND B`** — FTS strictly intersects; vector returns docs about a topic-mix of A and B. Docs matching both lexically *and* semantically rank highest, which is the desired outcome. Vector contributes useful expansion but not boolean precision.
+- **`A OR B`** — FTS unions; the vector half has no concept of disjunction and just produces a centroid-style embedding. Docs about either A or B that happen to be near the centroid still get retrieved, but a doc purely about A may not appear unless it's also close to the centroid. **Open trade-off.** Vector half degrades from "asset" to "noise" for OR-heavy queries; no fix in this spec.
+- **`NOT X` / `A AND NOT B`** — addressed by §7.8. Vector embeds only the positive branches; FTS enforces the negation; the halves are aligned.
+
+The asymmetry is real and remains a quality consideration for OR-heavy queries. The §11.6 cross-encoder re-ranker, when added, can sharpen the head of results but cannot fix a polluted candidate pool — see the analysis at the end of this section for why upstream stripping (the §7.8 approach for `NOT`) is the architecturally correct order of operations.
+
+**Why a re-ranker alone cannot fix recall problems.** A cross-encoder re-ranker improves precision *within the candidate pool it is given* — it cannot improve recall of that pool. If a polarity-blind vector half had poisoned a `NOT pneumothorax` pool with ~100 anti-matches, re-ranking the top-20 would sharpen the head but ~590 correct docs would still live below the re-ranker's cutoff at their original RRF positions. The architecturally correct order is to fix recall upstream (§7.8) and *then* layer a re-ranker for precision (§11.6). A re-ranker without the upstream fix is rearranging deck chairs on a polluted pool.
+
+### 11.6 Cross-encoder re-ranker (deferred)
+
+A planned follow-up adds a re-ranker stage between hybrid fusion and result hydration to lift precision (especially on operator-light natural-phrase queries, where the candidate pool is already correct but RRF ordering is mediocre) and to partially compensate for §11.1's polarity blindness. Two backend patterns are under consideration:
+
+- **Pointwise cross-encoder via vLLM.** Qwen3-Reranker-4B served with `vllm serve … --task score` exposes `/v1/rerank` (Cohere/TEI shape: `{model, query, documents}` → `[{index, relevance_score}]`). Logit-based scoring (yes/no token logits → softmax) gives graded relevance in [0,1]. Latency ~30–100 ms per pair on a single GPU; for top-20 candidates that's ~0.5–1.5 s added.
+- **Listwise LLM re-ranker** via the existing OpenAI-compatible chat-completions endpoint. The LLM is prompted with the query and the top-N candidates packed into a single message; structured output (`response_format=json_object`) returns a ranked list of indices. One HTTP call per query rather than N. Latency ~1–3 s for top-20 depending on model size. Quality trades off graded precision for the LLM's strong instruction-following — particularly the explicit "respect negation" cue, which the pointwise reranker has to learn implicitly.
+
+vLLM is the recommended production host for the pointwise path because Ollama (as of mid-2025) does not expose token logits cleanly, which collapses Qwen3-Reranker to a binary 1.0/0.0 signal and loses graded ordering. Ollama can still serve the LLM listwise backend without issue.
+
+### 11.7 Evaluation strategy for the layered hybrid stack
+
+Six profiles cover the additive layers:
+
+| Profile | Negation strip (§7.8) | Re-ranker (§11.6) |
+|---|---|---|
+| `baseline` | off | off |
+| `strip` | on | off |
+| `rerank-qwen` | off | Qwen3-Reranker via vLLM |
+| `rerank-llm` | off | listwise LLM |
+| `both-qwen` | on | Qwen3-Reranker via vLLM |
+| `both-llm` | on | listwise LLM |
+
+A `run_search_eval` management command loops a set of test queries through all six profiles (toggling settings via `override_settings`) and dumps comparable JSON output with top-N docs, per-layer scores (`ts_rank`, `cosine_distance`, `rrf_score`, `rerank_score`), and per-profile latencies.
+
+**Labeling.** Per-pair LLM relevance judgment ("is doc D relevant to query Q?") is unreliable for radiology because (a) it inherits the same polarity blind spot the system is trying to evaluate, and (b) it introduces circular bias when the labeling LLM and re-ranker LLM share a family. The preferred approach is *concept-based polarity-aware labeling*: label each report once per clinical concept with `PRESENT` / `ABSENT` / `NOT_MENTIONED`, then derive query relevance deterministically (`pneumothorax` → `PRESENT ∪ ABSENT`; `NOT pneumothorax` → `NOT_MENTIONED ∪ ABSENT` for strict exclusion, or `ABSENT` only for "rule-out" semantics). The concept labels are reusable across many queries and survive prompt/model changes. The upstream label-filter work in PR #196 produces structured labels with comparable semantics and is the intended source of ground truth for production-scale evaluation.
+
+## 12. Rollout plan
+
+1. **Schema + dep.** `pgvector` pip dep + `0002_hybrid_search` migration (extension + embedding column + HNSW). No behaviour change yet.
+2. **Embedding clients + tests.** Land `EmbeddingClient` (sync, query side) and `AsyncEmbeddingClient` (async, worker side). No callers wired up yet.
+3. **Worker + task + queue.** Add `embeddings_worker` container (compose), `embed_reports_task` async task on the `embeddings` queue, and the worker command at `--concurrency 4`. Without callers, the worker stays idle.
+4. **Write-path enqueue.** Modify the single-create `on_commit` and `bulk_upsert_reports`' `on_commit` to call `embed_reports_task.defer(report_ids=touched_pks)`. The bulk-upsert path keeps both `PGSEARCH_SYNC_INDEXING` modes (§6.6); the sync mode defers embedding immediately after FTS, the deferred mode chains embedding at the tail of `bulk_index_reports`. From this point on, **every write enqueues an embedding task**; the embeddings worker drains the queue.
+5. **Provider switch.** Replace the body of `radis.pgsearch.providers.search()` and `retrieve()` with the hybrid implementation. Rows still missing an embedding participate via the FTS half only.
+6. **(Optional) historical backfill.** Run `./manage.py embed_pending` to enqueue an `embed_reports_task` for every existing NULL row. Same command serves outage recovery and dim/model-change scenarios (§6.5).
+7. **Monitor.** Watch search latency p95, write latency p95 (unchanged — just the enqueue), embedding-queue depth, retry rate, and `procrastinate_jobs.failed` count.
+
+Each step is independently mergeable; steps 1–3 ship as quiet infrastructure with no user-visible effect, step 4 starts populating the column on every write, step 5 is the moment hybrid search goes live for users.
diff --git a/example.env b/example.env
index de23797db..bf2ec8e37 100644
--- a/example.env
+++ b/example.env
@@ -134,6 +134,40 @@ REPORT_LLM_PROVIDER_URL="http://host.docker.internal:11434/v1"
 # 'cli generate-example-reports'.
 REPORT_LLM_PROVIDER_API_KEY="ollama"
 
+# Embedding service configuration (used by radis.pgsearch for hybrid search).
+# The embedding service is independent of the LLM service above.
+#
+# Choose a backend. Two are built in:
+#   - openai: posts {"model": M, "input": [t,...]} to /v1/embeddings and reads
+#             {"data":[{"embedding":[...]}]} responses (OpenAI / vLLM / TEI).
+#   - ollama: posts {"model": M, "input": [t,...]} to /api/embed and reads
+#             {"embeddings":[[...]]} responses (Ollama 0.2.0+).
+EMBEDDING_BACKEND=openai
+
+# Base URL of the embedding service. Path is appended automatically.
+EMBEDDING_PROVIDER_URL=
+
+# Optional: override the backend's default path. For a custom endpoint at
+# /api/embeddings with an OpenAI-style payload, set EMBEDDING_BACKEND=openai
+# and EMBEDDING_PROVIDER_PATH=/api/embeddings.
+EMBEDDING_PROVIDER_PATH=
+
+# Optional bearer token. Sent as "Authorization: Bearer <key>" when non-empty.
+EMBEDDING_PROVIDER_API_KEY=
+
+# The model name to request from the embedding service.
+EMBEDDING_MODEL_NAME=Qwen/Qwen3-Embedding-4B
+
+# Vector dimension. Schema-coupled: changing this after deploy requires dropping
+# the embedding column, re-migrating, and running `./manage.py embed_pending`.
+EMBEDDING_DIM=1024
+
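+# Development with local Ollama. NOTE: EMBEDDING_DIM=2560 differs from the
+# vector(1024) column created by the shipped migration; see EMBEDDING_DIM above.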
+# EMBEDDING_BACKEND=ollama
+# EMBEDDING_PROVIDER_URL=http://host.docker.internal:11434
+# EMBEDDING_MODEL_NAME=dengcao/Qwen3-Embedding-4B:Q5_K_M
+# EMBEDDING_DIM=2560
+
 # OpenTelemetry Configuration
 # Set this to the OTLP HTTP endpoint of the centralized openradx-observability stack.
 # See https://github.com/openradx/openradx-observability for setup instructions.
diff --git a/pyproject.toml b/pyproject.toml
index cb901ddd5..ae4f0db32 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,9 @@ dependencies = [
     "openai>=1.64.0",
     "openpyxl>=3.1.5",
     "pandas>=2.2.3",
+    "pgvector>=0.3",
     "procrastinate[django]>=3.0.2",
+    "stamina>=24.3.0",
     "psycopg[binary]>=3.2.5",
     "pycountry>=24.6.1",
     "pyparsing>=3.2.1",
diff --git a/radis/conftest.py b/radis/conftest.py
index 600eece82..fa40db9b3 100644
--- a/radis/conftest.py
+++ b/radis/conftest.py
@@ -1,4 +1,5 @@
 import nest_asyncio
+import stamina
 
 pytest_plugins = ["adit_radis_shared.pytest_fixtures"]
 
@@ -11,3 +12,7 @@ def pytest_configure():
     # https://github.com/pytest-dev/pytest-asyncio/issues/543
     # https://github.com/microsoft/playwright-pytest/issues/167
     nest_asyncio.apply()
+
+    # Disable stamina retries for tests by default; transient-blip retry
+    # behaviour is exercised explicitly where needed via `stamina.set_active`.
+    stamina.set_active(False)
diff --git a/radis/pgsearch/admin.py b/radis/pgsearch/admin.py
new file mode 100644
index 000000000..bf3991eb5
--- /dev/null
+++ b/radis/pgsearch/admin.py
@@ -0,0 +1,102 @@
+from django.contrib import admin, messages
+from django.db.models import Count
+from django.db.models.query import QuerySet
+from django.http.request import HttpRequest
+from procrastinate.contrib.django.models import ProcrastinateJob
+
+from .models import ReportSearchIndex
+from .tasks import enqueue_embed_reports
+
+
+@admin.register(ReportSearchIndex)
+class ReportSearchIndexAdmin(admin.ModelAdmin):
+    """Admin for `ReportSearchIndex` rows, centred on the embedding column.
+
+    The changelist shows whether each row has an embedding, exposes a
+    snapshot of the `embeddings` queue to the template, and offers two bulk
+    actions: enqueue embedding for rows that have none, and clear existing
+    embeddings. Deleting rows through the admin is disabled.
+    """
+
+    list_display = ("id", "report_id", "has_embedding")
+    list_filter = (("embedding", admin.EmptyFieldListFilter),)
+    search_fields = ("report__document_id",)
+    actions = ("enqueue_pending_embeddings", "clear_embeddings_for_remodel")
+    change_list_template = "admin/pgsearch/reportsearchindex/change_list.html"
+
+    def has_delete_permission(self, request, obj=None):
+        """Always deny deletion of search-index rows from the admin."""
+        # RSI rows are managed by the post_save signal on Report — deleting
+        # one orphans the report from search until someone saves the report
+        # again. Block delete (this also hides the "delete selected" action).
+        return False
+
+    @admin.display(boolean=True, description="Embedded")
+    def has_embedding(self, obj: ReportSearchIndex) -> bool:
+        """List column: True when the row's `embedding` is not None."""
+        return obj.embedding is not None
+
+    def changelist_view(self, request, extra_context=None):
+        """Render the changelist with the pipeline stats added to the
+        template context under `embedding_pipeline_stats`."""
+        extra_context = extra_context or {}
+        extra_context["embedding_pipeline_stats"] = self._embedding_pipeline_stats()
+        return super().changelist_view(request, extra_context=extra_context)
+
+    @staticmethod
+    def _embedding_pipeline_stats() -> dict[str, int]:
+        """Snapshot of the embedding pipeline for the admin badge: how many
+        reports are still missing an embedding, and what Procrastinate is
+        doing about it right now."""
+        pending = ReportSearchIndex.objects.filter(embedding__isnull=True).count()
+        # One (status, count) pair per job status present on the
+        # "embeddings" queue, collected into a dict keyed by status.
+        queue_counts = dict(
+            ProcrastinateJob.objects.filter(queue_name="embeddings")
+            .values_list("status")
+            .annotate(n=Count("id"))
+        )
+        # A status with no jobs is simply absent from the grouped result,
+        # hence the 0 defaults below.
+        return {
+            "pending_reports": pending,
+            "todo": queue_counts.get("todo", 0),
+            "doing": queue_counts.get("doing", 0),
+            "failed": queue_counts.get("failed", 0),
+        }
+
+    @admin.action(description="Enqueue embedding for selected rows (NULL only)")
+    def enqueue_pending_embeddings(
+        self, request: HttpRequest, queryset: QuerySet[ReportSearchIndex]
+    ) -> None:
+        """Admin action: enqueue embedding for the selected rows whose
+        embedding is NULL.
+
+        Selected rows that already have an embedding are left out. Shows a
+        warning when no selected row qualifies; otherwise reports how many
+        reports were enqueued and the subjob count returned by
+        `enqueue_embed_reports`.
+        """
+        report_ids = list(
+            queryset.filter(embedding__isnull=True)
+            .order_by("report_id")
+            .values_list("report_id", flat=True)
+        )
+        if not report_ids:
+            self.message_user(
+                request,
+                "No selected rows are missing an embedding.",
+                level=messages.WARNING,
+            )
+            return
+
+        subjob_count = enqueue_embed_reports(report_ids)
+
+        self.message_user(
+            request,
+            f"Enqueued {len(report_ids)} report(s) across "
+            f"{subjob_count} subjob(s) for embedding.",
+            level=messages.SUCCESS,
+        )
+
+    @admin.action(description="Clear embeddings (NULL them) — for same-dim model swap")
+    def clear_embeddings_for_remodel(
+        self, request: HttpRequest, queryset: QuerySet[ReportSearchIndex]
+    ) -> None:
+        """Admin action: set `embedding` to NULL on the selected rows that
+        currently have one, and tell the operator how to backfill.
+
+        Nothing is enqueued here; a warning is shown when no selected row
+        had an embedding to clear.
+        """
+        # Same-dim model swap procedure: NULL the existing embeddings so
+        # the new model writes fresh ones via `embed_pending`. Uses
+        # queryset.update so post_save signals don't fire (we don't want
+        # auto-re-embedding here — that'd hit the embedding service
+        # immediately, possibly with the OLD model still configured).
+        # The operator drives the backfill explicitly afterward.
+        cleared = queryset.filter(embedding__isnull=False).update(embedding=None)
+        if not cleared:
+            self.message_user(
+                request,
+                "No selected rows had an embedding to clear.",
+                level=messages.WARNING,
+            )
+            return
+        self.message_user(
+            request,
+            f"Cleared embeddings on {cleared} row(s). Run "
+            f"`./manage.py embed_pending` (or the 'Enqueue embedding' "
+            f"action) to backfill against the new model.",
+            level=messages.SUCCESS,
+        )
diff --git a/radis/pgsearch/apps.py b/radis/pgsearch/apps.py
index 449bc8686..ad5ebd71a 100644
--- a/radis/pgsearch/apps.py
+++ b/radis/pgsearch/apps.py
@@ -1,20 +1,118 @@
 from django.apps import AppConfig
+from django.conf import settings
+from django.core.checks import Error, register
 
 
 class PgSearchConfig(AppConfig):
     name = "radis.pgsearch"
 
     def ready(self):
-        from . import signals as signals
+        from . import signals as signals  # noqa: F401
 
         register_app()
 
 
+def _migration_embedding_dim() -> int | None:
+    """Return the `dimensions` value of `ReportSearchIndex.embedding` as
+    captured by the on-disk pgsearch migrations. Returns None if the field
+    cannot be located (migrations missing or model renamed)."""
+    from django.db.migrations.loader import MigrationLoader
+
+    loader = MigrationLoader(connection=None, ignore_no_migrations=True)
+    state = loader.project_state()
+    try:
+        model = state.apps.get_model("pgsearch", "ReportSearchIndex")
+        return model._meta.get_field("embedding").dimensions
+    except (LookupError, AttributeError):
+        return None
+
+
+@register()
+def check_embedding_dim_matches_migration(app_configs, **kwargs):
+    """Fail loudly when settings.EMBEDDING_DIM diverges from the dim baked
+    into the pgsearch migrations. Mismatched values would otherwise surface as
+    opaque pgvector dimension errors on the first write or query."""
+    migration_dim = _migration_embedding_dim()
+
+    if migration_dim is None:
+        return [
+            Error(
+                "Could not determine the embedding column dimension from the "
+                "pgsearch migrations. Either the migrations are missing the "
+                "embedding field or the model has been renamed.",
+                id="pgsearch.E002",
+                hint=(
+                    "Verify that `radis/pgsearch/migrations/` contains a "
+                    "migration that adds the `embedding` field to "
+                    "`ReportSearchIndex`, and that `makemigrations pgsearch` "
+                    "succeeds without changes."
+                ),
+            )
+        ]
+
+    if settings.EMBEDDING_DIM != migration_dim:
+        return [
+            Error(
+                f"EMBEDDING_DIM={settings.EMBEDDING_DIM} does not match the "
+                f"dim baked into the pgsearch migrations "
+                f"(vector({migration_dim})). Writes will fail with a pgvector "
+                f"dimension error. Either set "
+                f"EMBEDDING_DIM={migration_dim}, or run `makemigrations "
+                f"pgsearch` to capture the new dim and follow the §4.5 "
+                f"procedure to drop and recreate the embedding column.",
+                id="pgsearch.E001",
+                hint=(
+                    "Update EMBEDDING_DIM in your .env to match the existing "
+                    "migrations, or generate a new migration that matches the "
+                    "new dim."
+                ),
+            )
+        ]
+    return []
+
+
+def _index_reports(reports):
+    """pgsearch's subscriber on reports_created_handlers / reports_updated_handlers.
+
+    Owns both FTS indexing and embedding for the touched reports. The mode
+    flag `PGSEARCH_SYNC_INDEXING` controls whether FTS runs inline on the
+    request thread or is deferred to a Procrastinate task on the `default`
+    queue. Embedding is always deferred to the `embeddings` queue.
+
+    Ordering between FTS and embedding is the same in both modes: RSI rows
+    exist (and `report.body` is reachable) before `embed_reports_task` runs.
+    In sync mode the handler upserts inline, then defers embed. In async
+    mode the handler only enqueues `bulk_index_reports`; that task chains
+    `embed_reports_task` at the end of its own run, so the embeddings worker
+    never picks up a report before its RSI row is committed.
+    """
+    if not reports:
+        return
+
+    from radis.pgsearch.tasks import enqueue_bulk_index_reports, enqueue_embed_reports
+    from radis.pgsearch.utils.indexing import bulk_upsert_report_search_indexes
+
+    report_ids = [report.pk for report in reports]
+    if settings.PGSEARCH_SYNC_INDEXING:
+        bulk_upsert_report_search_indexes(report_ids)
+        enqueue_embed_reports(report_ids)
+    else:
+        enqueue_bulk_index_reports(report_ids)
+
+
 def register_app():
+    from django.conf import settings
+
     from radis.extractions.site import (
         ExtractionRetrievalProvider,
         register_extraction_retrieval_provider,
     )
+    from radis.reports.site import (
+        ReportsCreatedHandler,
+        ReportsUpdatedHandler,
+        register_reports_created_handler,
+        register_reports_updated_handler,
+    )
     from radis.search.site import SearchProvider, register_search_provider
     from radis.subscriptions.site import (
         SubscriptionFilterProvider,
@@ -25,11 +123,20 @@ def register_app():
 
     from .providers import count, filter, retrieve, search
 
+    register_reports_created_handler(
+        ReportsCreatedHandler(name="PG Search", handle=_index_reports)
+    )
+    register_reports_updated_handler(
+        ReportsUpdatedHandler(name="PG Search", handle=_index_reports)
+    )
+
     register_search_provider(
         SearchProvider(
             name="PG Search",
             search=search,
-            max_results=1000,
+            max_results=max(
+                settings.HYBRID_VECTOR_TOP_K, settings.HYBRID_FTS_MAX_RESULTS
+            ),
         )
     )
 
diff --git a/radis/pgsearch/management/commands/embed_pending.py b/radis/pgsearch/management/commands/embed_pending.py
new file mode 100644
index 000000000..a9c712458
--- /dev/null
+++ b/radis/pgsearch/management/commands/embed_pending.py
@@ -0,0 +1,82 @@
+"""Enqueue `embed_reports_task` for every `ReportSearchIndex` whose embedding
+is still NULL.
+
+Operators run this for three scenarios:
+
+1. **Backfill.** Reports loaded before the deferred-embedding wiring shipped.
+2. **Dim or model change.** After §4.5: drop the column, re-migrate (or
+   `ReportSearchIndex.objects.update(embedding=None)` for a same-dim model
+   swap), then run this command to re-embed against the new model.
+3. **Outage recovery.** Tasks that exhausted Procrastinate retries during an
+   extended embedding-service outage — re-run after the service recovers.
+
+The command itself does no HTTP work; it defers Procrastinate tasks onto the
+`embeddings` queue. The embeddings worker drains them at its configured
+`--concurrency`, so operators cannot accidentally hammer the embedding service.
+
+Chunking goes through the shared `enqueue_embed_reports` helper, so the
+subjob size matches what the write-path handler and the admin action use
+(default `settings.EMBEDDING_SUBJOB_SIZE`).
+
+Properties:
+
+- **Idempotent.** The filter is `embedding IS NULL`; re-runs are no-ops on
+  rows the worker has already drained.
+- **Resumable.** No checkpoint state. Killed mid-enqueue → re-run picks up
+  the still-NULL rows.
+- **Rate-limited.** Worker concurrency caps load on the embedding service
+  regardless of how many tasks this command enqueues.
+"""
+from django.conf import settings
+from django.core.management.base import BaseCommand
+
+from radis.pgsearch.models import ReportSearchIndex
+from radis.pgsearch.tasks import enqueue_embed_reports
+
+
+class Command(BaseCommand):
+    help = (
+        "Enqueue embed_reports_task subjobs for every ReportSearchIndex "
+        "with embedding=NULL. The embeddings worker drains the queue at "
+        "its configured concurrency."
+    )
+
+    def add_arguments(self, parser) -> None:
+        parser.add_argument(
+            "--subjob-size",
+            type=int,
+            default=settings.EMBEDDING_SUBJOB_SIZE,
+            help=(
+                f"Reports per Procrastinate subjob (default "
+                f"{settings.EMBEDDING_SUBJOB_SIZE}). The worker further "
+                f"chunks each subjob into HTTP calls of "
+                f"EMBEDDING_BATCH_SIZE={settings.EMBEDDING_BATCH_SIZE}."
+            ),
+        )
+        parser.add_argument(
+            "--limit",
+            type=int,
+            default=None,
+            help="Stop after enqueuing N reports (default: enqueue all).",
+        )
+
+    def handle(self, *args, **opts) -> None:
+        ids = list(
+            ReportSearchIndex.objects.filter(embedding__isnull=True)
+            .order_by("report_id")
+            .values_list("report_id", flat=True)
+        )
+        if opts["limit"] is not None:
+            ids = ids[: opts["limit"]]
+        if not ids:
+            self.stdout.write("Nothing to embed.")
+            return
+
+        subjob_size = opts["subjob_size"]
+        self.stdout.write(
+            f"Enqueuing {len(ids)} report(s) in subjobs of {subjob_size}..."
+        )
+        subjob_count = enqueue_embed_reports(ids, subjob_size=subjob_size)
+        self.stdout.write(
+            self.style.SUCCESS(f"Done. Deferred {subjob_count} subjob(s).")
+        )
diff --git a/radis/pgsearch/migrations/0002_hybrid_search.py b/radis/pgsearch/migrations/0002_hybrid_search.py
new file mode 100644
index 000000000..98e7174b2
--- /dev/null
+++ b/radis/pgsearch/migrations/0002_hybrid_search.py
@@ -0,0 +1,62 @@
+"""Hybrid-search schema additions on top of pgsearch.0001_initial:
+
+- Rename the per-report search row from `ReportSearchVector` to
+  `ReportSearchIndex` (now holds the FTS tsvector *and* the dense
+  embedding; future trigram column would also live there).
+- Update the reverse accessor on Report (`search_vector` → `search_index`).
+- Install the pgvector extension.
+- Add the `embedding vector(1024)` column and its HNSW index for cosine
+  similarity search.
+
+Squashed from the previously-separate `0002_hybrid_search` (extension +
+embedding field + HNSW) and `0003_rename_search_index` (RenameModel +
+AlterField) so that hybrid search ships as a single coherent migration
+rather than three intermediate states no operator will ever see in
+isolation.
+"""
+import django.db.models.deletion
+import pgvector.django.indexes
+import pgvector.django.vector
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Builds on the initial pgsearch schema and on the `reports` app state
+    # named below.
+    dependencies = [
+        ("pgsearch", "0001_initial"),
+        ("reports", "0013_alter_report_options"),
+    ]
+
+    operations = [
+        # Install pgvector if it is not present yet. This has to come before
+        # the AddField below, since the `vector` column type is provided by
+        # the extension. Reversal is a no-op, so rolling this migration back
+        # leaves the extension installed.
+        migrations.RunSQL(
+            sql="CREATE EXTENSION IF NOT EXISTS vector;",
+            reverse_sql=migrations.RunSQL.noop,
+        ),
+        # ReportSearchVector -> ReportSearchIndex.
+        migrations.RenameModel(
+            old_name="ReportSearchVector",
+            new_name="ReportSearchIndex",
+        ),
+        # Re-declare the one-to-one link to the report with the reverse
+        # accessor named `search_index`.
+        migrations.AlterField(
+            model_name="reportsearchindex",
+            name="report",
+            field=models.OneToOneField(
+                on_delete=django.db.models.deletion.CASCADE,
+                related_name="search_index",
+                to="reports.report",
+            ),
+        ),
+        # Nullable 1024-dimensional vector column; rows that exist when the
+        # migration runs start out with no embedding.
+        migrations.AddField(
+            model_name="reportsearchindex",
+            name="embedding",
+            field=pgvector.django.vector.VectorField(dimensions=1024, null=True),
+        ),
+        # HNSW index on the embedding column using the cosine operator class.
+        migrations.AddIndex(
+            model_name="reportsearchindex",
+            index=pgvector.django.indexes.HnswIndex(
+                ef_construction=64,
+                fields=["embedding"],
+                m=16,
+                name="pgsearch_embedding_hnsw",
+                opclasses=["vector_cosine_ops"],
+            ),
+        ),
+    ]
diff --git a/radis/pgsearch/models.py b/radis/pgsearch/models.py
index 63550fca9..ab6135816 100644
--- a/radis/pgsearch/models.py
+++ b/radis/pgsearch/models.py
@@ -1,21 +1,41 @@
+from django.conf import settings
 from django.contrib.postgres.indexes import GinIndex
 from django.contrib.postgres.search import SearchVector, SearchVectorField
 from django.db import models
+from pgvector.django import HnswIndex, VectorField
 
 from radis.reports.models import Report
 
 from .utils.language_utils import code_to_language
 
 
-class ReportSearchVector(models.Model):
-    report = models.OneToOneField(Report, on_delete=models.CASCADE, related_name="search_vector")
+class ReportSearchIndex(models.Model):
+    """Per-report row that backs every search modality. Holds the FTS
+    `search_vector` (tsvector) and the dense `embedding` vector for
+    hybrid search; a future trigram column would also live here. Named
+    after its role, not after any single field — adding another search
+    representation shouldn't force another rename."""
+
+    report = models.OneToOneField(Report, on_delete=models.CASCADE, related_name="search_index")
     search_vector = SearchVectorField(null=True)
+    embedding = VectorField(dimensions=settings.EMBEDDING_DIM, null=True)
 
     class Meta:
-        indexes = [GinIndex(fields=["search_vector"])]
+        verbose_name = "Report search index"
+        verbose_name_plural = "Report search indexes"
+        indexes = [
+            GinIndex(fields=["search_vector"]),
+            HnswIndex(
+                name="pgsearch_embedding_hnsw",
+                fields=["embedding"],
+                m=16,
+                ef_construction=64,
+                opclasses=["vector_cosine_ops"],
+            ),
+        ]
 
     def __str__(self) -> str:
-        return f"Report {self.report.id} search vector"
+        return f"Report {self.report.id} search index"
 
     def save(self, *args, **kwargs):
         body = self.report.body if self.report else ""
diff --git a/radis/pgsearch/providers.py b/radis/pgsearch/providers.py
index 1078db805..5aa02c3e2 100644
--- a/radis/pgsearch/providers.py
+++ b/radis/pgsearch/providers.py
@@ -1,21 +1,27 @@
 import logging
-from typing import Iterator, cast
+from typing import Iterator, Literal, cast
 
+from django.conf import settings
 from django.contrib.postgres.search import SearchHeadline, SearchQuery, SearchRank
 from django.db.models import F, Q
+from pgvector.django import CosineDistance
 
-from radis.search.site import Search, SearchFilters, SearchResult
+from radis.reports.models import Report
+from radis.search.site import ReportDocument, Search, SearchFilters, SearchResult
 from radis.search.utils.query_parser import (
     BinaryNode,
     ParensNode,
     QueryNode,
+    QueryParser,
     TermNode,
     UnaryNode,
     is_search_token_char,
 )
 
-from .models import ReportSearchVector
-from .utils.document_utils import AnnotatedReportSearchVector, document_from_pgsearch_response
+from .models import ReportSearchIndex
+from .utils.document_utils import AnnotatedReportSearchIndex, document_from_pgsearch_response
+from .utils.embedding_client import EmbeddingClient, EmbeddingClientError
+from .utils.fusion import rrf_fuse, summary_with_fallback
 from .utils.language_utils import code_to_language
 
 logger = logging.getLogger(__name__)
@@ -90,44 +96,105 @@ def _build_filter_query(filters: SearchFilters) -> Q:
 def search(search: Search) -> SearchResult:
     query_str = _build_query_string(search.query)
     language = _resolve_language(search.filters)
-    query = SearchQuery(query_str, search_type="raw", config=language)
     filter_query = _build_filter_query(search.filters)
-    results = (
-        ReportSearchVector.objects.filter(filter_query)
-        .filter(search_vector=query)
-        .annotate(
-            rank=SearchRank(
-                F("search_vector"),
-                query,
-            )
+    tsquery = SearchQuery(query_str, search_type="raw", config=language)
+
+    # Vector side: strip NOT branches (see spec §7.8). If nothing is left,
+    # skip the embedding call entirely and fall through to FTS-only.
+    query_text = QueryParser.unparse_for_embedding(search.query)
+    query_vec: list[float] | None = None
+    if query_text.strip():
+        try:
+            with EmbeddingClient() as ec:
+                query_vec = ec.embed_query(query_text)
+        except EmbeddingClientError as e:
+            logger.warning("Hybrid search falling back to FTS-only: %s", e)
+            query_vec = None
+
+    vec_rank: dict[int, int] = {}
+    vec_distance: dict[int, float] = {}
+    if query_vec is not None:
+        vec_rows = list(
+            ReportSearchIndex.objects.filter(filter_query)
+            .distinct()
+            .exclude(embedding__isnull=True)
+            .annotate(distance=CosineDistance("embedding", query_vec))
+            .order_by("distance", "report_id")
+            .values_list("report_id", "distance")[: settings.HYBRID_VECTOR_TOP_K]
+        )
+        for i, (rid, dist) in enumerate(vec_rows):
+            vec_rank[rid] = i + 1
+            vec_distance[rid] = float(dist)
+
+    # FTS side: bounded set, ts_rank only (no headline at this stage).
+    fts_rows = list(
+        ReportSearchIndex.objects.filter(filter_query)
+        .distinct()
+        .filter(search_vector=tsquery)
+        .annotate(rank=SearchRank(F("search_vector"), tsquery))
+        .order_by("-rank", "report_id")
+        .values("report_id", "rank")[: settings.HYBRID_FTS_MAX_RESULTS]
+    )
+    fts_rank = {row["report_id"]: i + 1 for i, row in enumerate(fts_rows)}
+
+    # Fusion.
+    ordered_pairs = rrf_fuse(vec_rank, fts_rank, k=settings.HYBRID_RRF_K)
+    rrf_score_by_id = dict(ordered_pairs)
+    ordered_ids = list(rrf_score_by_id)
+    total_count = len(ordered_ids)
+    total_relation: Literal["exact", "at_least", "approximately"] = (
+        "at_least"
+        if (
+            len(fts_rows) >= settings.HYBRID_FTS_MAX_RESULTS
+            or len(vec_rank) >= settings.HYBRID_VECTOR_TOP_K
         )
+        else "exact"
+    )
+
+    if search.limit is None:
+        page_ids = ordered_ids[search.offset :]
+    else:
+        page_ids = ordered_ids[search.offset : search.offset + search.limit]
+
+    # Headline + hydration for the page slice only.
+    page_rows = (
+        ReportSearchIndex.objects.filter(report_id__in=page_ids)
         .annotate(
             summary=SearchHeadline(
                 "report__body",
-                query,
+                tsquery,
                 config=language,
                 start_sel="<em>",
                 stop_sel="</em>",
                 min_words=10,
                 max_words=20,
                 max_fragments=10,
-            )
+            ),
+            rank=SearchRank(F("search_vector"), tsquery),
         )
         .select_related("report")
-        .order_by("-rank")
     )
+    by_id = {r.report.pk: r for r in page_rows}
 
-    total_count = results.count()
-    if search.limit is None:
-        results = results[search.offset :]
-    else:
-        results = results[search.offset : search.offset + search.limit]
-    documents = [
-        document_from_pgsearch_response(cast(AnnotatedReportSearchVector, result))
-        for result in results
-    ]
+    documents: list[ReportDocument] = []
+    for rid in page_ids:
+        rsv = by_id.get(rid)
+        if rsv is None:
+            continue
+        rsv.summary = summary_with_fallback(  # type: ignore[attr-defined]
+            rsv.report.body, rsv.summary or "", max_words=30  # type: ignore[attr-defined]
+        )
+        documents.append(
+            document_from_pgsearch_response(
+                cast(AnnotatedReportSearchIndex, rsv),
+                cosine_distance=vec_distance.get(rid),
+                rrf_score=rrf_score_by_id.get(rid, 0.0),
+            )
+        )
 
-    return SearchResult(total_count=total_count, total_relation="exact", documents=documents)
+    return SearchResult(
+        total_count=total_count, total_relation=total_relation, documents=documents
+    )
 
 
 def count(search: Search) -> int:
@@ -135,35 +202,63 @@ def count(search: Search) -> int:
     language = _resolve_language(search.filters)
     query = SearchQuery(query_str, search_type="raw", config=language)
     filter_query = _build_filter_query(search.filters)
-    results = ReportSearchVector.objects.filter(filter_query).filter(search_vector=query)
+    results = ReportSearchIndex.objects.filter(filter_query).filter(search_vector=query)
     return results.count()
 
 
 def retrieve(search: Search) -> Iterator[str]:
     query_str = _build_query_string(search.query)
     language = _resolve_language(search.filters)
-    query = SearchQuery(query_str, search_type="raw", config=language)
     filter_query = _build_filter_query(search.filters)
-    results = (
-        ReportSearchVector.objects.filter(filter_query)
-        .filter(search_vector=query)
-        .annotate(
-            rank=SearchRank(
-                F("search_vector"),
-                query,
-            )
+    tsquery = SearchQuery(query_str, search_type="raw", config=language)
+
+    # Vector side: strip NOT branches (see spec §7.8). If nothing is left,
+    # skip the embedding call entirely and fall through to FTS-only.
+    query_text = QueryParser.unparse_for_embedding(search.query)
+    query_vec: list[float] | None = None
+    if query_text.strip():
+        try:
+            with EmbeddingClient() as ec:
+                query_vec = ec.embed_query(query_text)
+        except EmbeddingClientError as e:
+            logger.warning("Hybrid retrieve falling back to FTS-only: %s", e)
+            query_vec = None
+
+    vec_rank: dict[int, int] = {}
+    if query_vec is not None:
+        vec_ids = list(
+            ReportSearchIndex.objects.filter(filter_query)
+            .distinct()
+            .exclude(embedding__isnull=True)
+            .annotate(distance=CosineDistance("embedding", query_vec))
+            .order_by("distance", "report_id")
+            .values_list("report_id", flat=True)[: settings.HYBRID_VECTOR_TOP_K]
         )
-        .select_related("report")
-        .order_by("-rank")
-        .values_list("report__document_id", flat=True)
+        vec_rank = {rid: i + 1 for i, rid in enumerate(vec_ids)}
+
+    fts_rows = list(
+        ReportSearchIndex.objects.filter(filter_query)
+        .distinct()
+        .filter(search_vector=tsquery)
+        .annotate(rank=SearchRank(F("search_vector"), tsquery))
+        .order_by("-rank", "report_id")
+        .values("report_id", "rank")[: settings.HYBRID_FTS_MAX_RESULTS]
     )
+    fts_rank = {row["report_id"]: i + 1 for i, row in enumerate(fts_rows)}
 
-    return results.iterator()
+    ordered_ids = [rid for rid, _ in rrf_fuse(vec_rank, fts_rank, k=settings.HYBRID_RRF_K)]
+    if not ordered_ids:
+        return iter([])
+
+    id_to_doc = dict(
+        Report.objects.filter(pk__in=ordered_ids).values_list("pk", "document_id")
+    )
+    return (id_to_doc[rid] for rid in ordered_ids if rid in id_to_doc)
 
 
 def filter(filter: SearchFilters) -> Iterator[str]:
     filter_query = _build_filter_query(filter)
-    results = ReportSearchVector.objects.filter(filter_query).values_list(
+    results = ReportSearchIndex.objects.filter(filter_query).values_list(
         "report__document_id", flat=True
     )
     return results.iterator()
diff --git a/radis/pgsearch/signals.py b/radis/pgsearch/signals.py
index 6d7e6f021..dc351fab9 100644
--- a/radis/pgsearch/signals.py
+++ b/radis/pgsearch/signals.py
@@ -3,13 +3,12 @@
 
 from radis.reports.models import Report
 
-from .models import ReportSearchVector
+from .models import ReportSearchIndex
 
 
 @receiver(post_save, sender=Report)
-def create_or_update_report_search_vector(sender, instance, created, **kwargs):
+def create_or_update_report_search_index(sender, instance, created, **kwargs):
     if created:
-        ReportSearchVector.objects.create(report=instance)
+        ReportSearchIndex.objects.create(report=instance)
         return
-
-    instance.search_vector.save()
+    instance.search_index.save()
diff --git a/radis/pgsearch/tasks.py b/radis/pgsearch/tasks.py
index 2645a28d4..856b0ee88 100644
--- a/radis/pgsearch/tasks.py
+++ b/radis/pgsearch/tasks.py
@@ -1,19 +1,66 @@
 import logging
 
+import stamina
+from django.conf import settings
 from procrastinate.contrib.django import app
 from procrastinate.types import JSONValue
 
-from .utils.indexing import bulk_upsert_report_search_vectors
+from .models import ReportSearchIndex
+from .utils.embedding_client import (
+    EmbeddingClient,
+    EmbeddingClientError,
+    EmbeddingPayloadTooLargeError,
+)
+from .utils.indexing import bulk_upsert_report_search_indexes
 
 logger = logging.getLogger(__name__)
 
 
+def _is_retryable_embedding_error(exc: Exception) -> bool:
+    """Predicate passed to `stamina.retry(on=...)` for embedding calls.
+
+    True for any `EmbeddingClientError` except `EmbeddingPayloadTooLargeError`:
+    resending an oversized payload would be rejected again, so that error
+    is left for `_embed_with_bisect` to split and isolate instead.
+    """
+    if isinstance(exc, EmbeddingPayloadTooLargeError):
+        return False
+    return isinstance(exc, EmbeddingClientError)
+
+
+@stamina.retry(
+    on=_is_retryable_embedding_error,
+    attempts=3,
+    timeout=30.0,
+    wait_initial=0.5,
+    wait_max=8.0,
+)
+def _embed_chunk_with_retry(
+    client: EmbeddingClient, texts: list[str]
+) -> list[list[float]]:
+    """Single embed call wrapped in stamina-controlled transient retries.
+
+    Layered with Procrastinate's task-level retry: stamina handles brief
+    blips (3 attempts within ~30s); Procrastinate handles extended outages
+    (whole-task retry on backoff). `EmbeddingPayloadTooLargeError` is
+    excluded by the predicate so the bisect logic above this layer can
+    catch and resolve it without burning retry budget."""
+    return client.embed_documents(texts)
+
+
 @app.task
 def bulk_index_reports(report_ids: list[int]) -> None:
+    """Deferred FTS bulk-indexing for the bulk-upsert path
+    (when `PGSEARCH_SYNC_INDEXING=False`).
+
+    Chains into `embed_reports_task` subjobs once RSV rows exist, so the
+    embeddings worker never reads a missing `report.body` or a stale tsvector.
+    """
     if not report_ids:
         return
     logger.info("Indexing %s reports in bulk.", len(report_ids))
-    bulk_upsert_report_search_vectors(report_ids)
+    bulk_upsert_report_search_indexes(report_ids)
+    enqueue_embed_reports(report_ids)
 
 
 def enqueue_bulk_index_reports(report_ids: list[int]) -> int | None:
@@ -28,3 +75,132 @@ def enqueue_bulk_index_reports(report_ids: list[int]) -> int | None:
         "radis.pgsearch.tasks.bulk_index_reports",
         allow_unknown=False,
     ).defer(report_ids=payload)
+
+
+def enqueue_embed_reports(
+    report_ids: list[int], *, subjob_size: int | None = None
+) -> int:
+    """Chunk `report_ids` into subjobs and defer one `embed_reports_task`
+    per chunk. Returns the number of subjobs deferred.
+
+    Subjob size defaults to `settings.EMBEDDING_SUBJOB_SIZE` (task
+    granularity), distinct from `settings.EMBEDDING_BATCH_SIZE` (per-HTTP-call
+    size inside one task). A 1M-report backfill becomes ~10k subjobs of 100,
+    each making ~3 HTTP calls of 32 — many workers can drain in parallel,
+    retries have bounded blast radius, and a stuck task can't tie up the
+    worker on the whole queue's worth of work. Raises `ValueError` when the
+    effective size is < 1 (0 breaks `range()`; negatives would defer nothing).
+
+    Single call site for every place that enqueues embedding work: the
+    write-path handler, the FTS chain tail, `embed_pending`, and the
+    admin action. Operators read one knob, not several.
+    """
+    if not report_ids:
+        return 0
+    size = subjob_size if subjob_size is not None else settings.EMBEDDING_SUBJOB_SIZE
+    if size < 1:
+        raise ValueError(f"subjob size must be a positive integer, got {size!r}")
+    starts = range(0, len(report_ids), size)
+    for start in starts:
+        embed_reports_task.defer(report_ids=list(report_ids[start : start + size]))
+    return len(starts)
+
+
+def _embed_with_bisect(
+    client: EmbeddingClient,
+    rsvs: list[ReportSearchIndex],
+    embedded: list[ReportSearchIndex],
+    skipped: list[ReportSearchIndex],
+) -> None:
+    """Embed `rsvs`, set each row's `embedding` in memory (no DB write), and
+    append the rows to `embedded`. When the backend rejects the request as
+    too large, bisect and recurse. Once the offender is isolated to a single
+    rsv, log its `report_id` + body length and append it to `skipped`
+    instead of raising — the rest of the task's batch still gets embedded.
+
+    Transient errors are absorbed by `_embed_chunk_with_retry`'s stamina
+    wrapper. Anything that escapes after stamina's attempts/timeout budget
+    is exhausted propagates so Procrastinate's task-level retry applies.
+    """
+    if not rsvs:
+        return
+    try:
+        vectors = _embed_chunk_with_retry(client, [rsv.report.body for rsv in rsvs])
+    except EmbeddingPayloadTooLargeError as exc:
+        if len(rsvs) == 1:
+            offender = rsvs[0]
+            logger.error(
+                "embed_reports_task: report_id=%s body_chars=%d rejected by embedding "
+                "service as too large; skipping. Backend error: %s",
+                offender.report_id,
+                len(offender.report.body),
+                exc,
+            )
+            skipped.append(offender)
+            return
+        mid = len(rsvs) // 2
+        _embed_with_bisect(client, rsvs[:mid], embedded, skipped)
+        _embed_with_bisect(client, rsvs[mid:], embedded, skipped)
+        return
+
+    for rsv, vec in zip(rsvs, vectors, strict=True):
+        rsv.embedding = vec
+        embedded.append(rsv)
+
+
+@app.task(queue="embeddings")
+def embed_reports_task(report_ids: list[int]) -> None:
+    """Embed the named reports.
+
+    Two layers of failure handling sit between the embedding service and
+    this task:
+
+    * `_embed_chunk_with_retry` retries transient `EmbeddingClientError`
+      via stamina (3 attempts, ~30s budget) for brief blips.
+    * `_embed_with_bisect` catches the deterministic
+      `EmbeddingPayloadTooLargeError` and recurses until it isolates the
+      offending report, then logs ERROR with `report_id` + body length and
+      skips it (its `embedding` stays NULL). The rest of the batch still embeds.
+
+    Anything that escapes both — sustained `EmbeddingClientError` past
+    stamina's budget — propagates so Procrastinate's task-level retry
+    policy applies. Vectors are written in one `bulk_update` after all
+    chunks finish, so a propagated error persists nothing from this run.
+
+    Callers must ensure ReportSearchIndex rows exist before deferring this
+    task: `bulk_index_reports` chains the defer at the end of its run, and
+    `embed_pending` / the admin action filter on existing RSV rows.
+    """
+    if not report_ids:
+        return
+
+    rsvs = list(
+        ReportSearchIndex.objects.filter(report_id__in=report_ids)
+        .select_related("report")
+        .only("id", "report_id", "report__body")
+    )
+    if not rsvs:
+        logger.warning(
+            "embed_reports_task: no ReportSearchIndex rows for report ids %s",
+            report_ids,
+        )
+        return
+
+    batch_size = settings.EMBEDDING_BATCH_SIZE
+    embedded: list[ReportSearchIndex] = []
+    skipped: list[ReportSearchIndex] = []
+    with EmbeddingClient() as client:
+        for start in range(0, len(rsvs), batch_size):
+            chunk = rsvs[start : start + batch_size]
+            _embed_with_bisect(client, chunk, embedded, skipped)
+
+    if embedded:
+        ReportSearchIndex.objects.bulk_update(embedded, fields=["embedding"])
+    if skipped:
+        logger.error(
+            "embed_reports_task: %d report(s) skipped as too large for the embedding "
+            "model; report_ids=%s. Fix the upstream report or raise the model context "
+            "limit; their RSV rows stay NULL until embedded.",
+            len(skipped),
+            [rsv.report_id for rsv in skipped],
+        )
diff --git a/radis/pgsearch/templates/admin/pgsearch/reportsearchindex/change_list.html b/radis/pgsearch/templates/admin/pgsearch/reportsearchindex/change_list.html
new file mode 100644
index 000000000..58b5790fb
--- /dev/null
+++ b/radis/pgsearch/templates/admin/pgsearch/reportsearchindex/change_list.html
@@ -0,0 +1,21 @@
+{% extends "admin/change_list.html" %}
+
+{% block content %}
+{% if embedding_pipeline_stats %}
+<div class="module" style="margin-bottom: 15px; padding: 10px 14px; border-left: 4px solid #79aec8;">
+  <strong>Embedding pipeline</strong>
+  &nbsp;·&nbsp; <strong>{{ embedding_pipeline_stats.pending_reports }}</strong> report{{ embedding_pipeline_stats.pending_reports|pluralize }} awaiting embedding
+  &nbsp;·&nbsp; <strong>{{ embedding_pipeline_stats.todo }}</strong> queued
+  &nbsp;·&nbsp; <strong>{{ embedding_pipeline_stats.doing }}</strong> in-flight
+  &nbsp;·&nbsp;
+  {% if embedding_pipeline_stats.failed %}
+    <strong style="color: #ba2121;">{{ embedding_pipeline_stats.failed }}</strong>
+  {% else %}
+    <strong>0</strong>
+  {% endif %}
+  failed
+  <span style="color: #666; margin-left: 10px;">(<code>embeddings</code> queue)</span>
+</div>
+{% endif %}
+{{ block.super }}
+{% endblock %}
diff --git a/radis/pgsearch/tests/test_admin.py b/radis/pgsearch/tests/test_admin.py
new file mode 100644
index 000000000..eb2a701b7
--- /dev/null
+++ b/radis/pgsearch/tests/test_admin.py
@@ -0,0 +1,117 @@
+"""Tests for the ReportSearchIndex admin pipeline-stats badge."""
+from unittest.mock import MagicMock
+
+from django.contrib.admin.sites import AdminSite
+from django.db import connection
+
+import pytest
+
+from radis.pgsearch.admin import ReportSearchIndexAdmin
+from radis.pgsearch.models import ReportSearchIndex
+from radis.reports.factories import ReportFactory
+
+pytestmark = pytest.mark.django_db(transaction=True)
+
+
+@pytest.fixture(autouse=True)
+def _clear_procrastinate_jobs():
+    """ProcrastinateJob is read-only via the ORM, so pytest-django's
+    flush between transactional tests doesn't clear it. Truncate
+    explicitly so each test starts from an empty queue."""
+    with connection.cursor() as cur:
+        cur.execute("TRUNCATE procrastinate_jobs RESTART IDENTITY CASCADE")
+    yield
+    with connection.cursor() as cur:
+        cur.execute("TRUNCATE procrastinate_jobs RESTART IDENTITY CASCADE")
+
+
+def _insert_procrastinate_job(status: str, queue: str = "embeddings") -> None:
+    """Insert a row directly via SQL because ProcrastinateJob's Django ORM
+    surface is intentionally read-only — Procrastinate owns writes. We
+    only need (queue_name, status) for the stats helper to count."""
+    with connection.cursor() as cur:
+        cur.execute(
+            "INSERT INTO procrastinate_jobs "
+            "(queue_name, task_name, priority, lock, queueing_lock, args, status, attempts) "
+            "VALUES (%s, %s, %s, NULL, NULL, %s, %s::procrastinate_job_status, %s)",
+            [
+                queue,
+                "radis.pgsearch.tasks.embed_reports_task",
+                0,
+                '{"report_ids": []}',
+                status,
+                0,
+            ],
+        )
+
+
+def test_pipeline_stats_counts_pending_rsvs():
+    [ReportFactory.create() for _ in range(3)]
+    embedded = ReportFactory.create()
+    rsv = ReportSearchIndex.objects.get(report_id=embedded.pk)
+    rsv.embedding = [0.0] * 1024
+    rsv.save()
+
+    stats = ReportSearchIndexAdmin._embedding_pipeline_stats()
+    assert stats["pending_reports"] == 3
+
+
+def test_pipeline_stats_counts_procrastinate_jobs_by_status():
+    _insert_procrastinate_job("todo")
+    _insert_procrastinate_job("todo")
+    _insert_procrastinate_job("doing")
+    _insert_procrastinate_job("failed")
+    # Job on a different queue must not be counted.
+    _insert_procrastinate_job("todo", queue="default")
+
+    stats = ReportSearchIndexAdmin._embedding_pipeline_stats()
+    assert stats["todo"] == 2
+    assert stats["doing"] == 1
+    assert stats["failed"] == 1
+
+
+def test_pipeline_stats_zero_when_no_queue_activity():
+    stats = ReportSearchIndexAdmin._embedding_pipeline_stats()
+    assert stats == {
+        "pending_reports": 0,
+        "todo": 0,
+        "doing": 0,
+        "failed": 0,
+    }
+
+
+def test_delete_permission_denied():
+    """RSI rows are managed by the post_save signal on Report — admin must
+    not let operators delete them out from under the model."""
+    admin_instance = ReportSearchIndexAdmin(ReportSearchIndex, AdminSite())
+    assert admin_instance.has_delete_permission(MagicMock()) is False
+
+
+def test_clear_embeddings_for_remodel_nulls_only_selected_rows_with_embeddings():
+    """Same-dim model swap: NULL the existing embeddings on selected rows.
+    Rows already NULL are no-ops; rows outside the selection are untouched."""
+    targets = [ReportFactory.create() for _ in range(3)]
+    untouched = ReportFactory.create()
+    for r in targets + [untouched]:
+        rsi = ReportSearchIndex.objects.get(report_id=r.pk)
+        rsi.embedding = [0.1] * 1024
+        rsi.save()
+    # One target already NULL — should be skipped by the filter.
+    ReportSearchIndex.objects.filter(report_id=targets[0].pk).update(embedding=None)
+
+    selected = ReportSearchIndex.objects.filter(
+        report_id__in=[r.pk for r in targets]
+    )
+    admin_instance = ReportSearchIndexAdmin(ReportSearchIndex, AdminSite())
+    admin_instance.message_user = MagicMock()
+    admin_instance.clear_embeddings_for_remodel(MagicMock(), selected)
+
+    # All three targets end NULL: two cleared by the action, one already NULL.
+    assert ReportSearchIndex.objects.filter(
+        report_id__in=[r.pk for r in targets], embedding__isnull=True
+    ).count() == 3
+    # The non-selected row is untouched.
+    assert ReportSearchIndex.objects.get(report_id=untouched.pk).embedding is not None
+    # message_user reports the number cleared, not the number selected.
+    msg_args = admin_instance.message_user.call_args
+    assert "Cleared embeddings on 2 row(s)" in msg_args.args[1]
diff --git a/radis/pgsearch/tests/test_apps_checks.py b/radis/pgsearch/tests/test_apps_checks.py
new file mode 100644
index 000000000..7658fc6dc
--- /dev/null
+++ b/radis/pgsearch/tests/test_apps_checks.py
@@ -0,0 +1,43 @@
+"""Tests for the Django system check that guards EMBEDDING_DIM/migration parity."""
+
+from unittest.mock import patch
+
+from django.test import override_settings
+
+from radis.pgsearch.apps import (
+    _migration_embedding_dim,
+    check_embedding_dim_matches_migration,
+)
+
+
+def test_migration_embedding_dim_returns_int_without_db():
+    dim = _migration_embedding_dim()
+    assert isinstance(dim, int)
+    assert dim == 1024
+
+
+def test_check_passes_when_dim_matches_migration():
+    dim = _migration_embedding_dim()
+    with override_settings(EMBEDDING_DIM=dim):
+        assert check_embedding_dim_matches_migration(app_configs=None) == []
+
+
+def test_check_fails_with_e001_when_dim_diverges_from_migration():
+    dim = _migration_embedding_dim()
+    assert dim is not None
+    with override_settings(EMBEDDING_DIM=dim + 1):
+        errors = check_embedding_dim_matches_migration(app_configs=None)
+    assert len(errors) == 1
+    err = errors[0]
+    assert err.id == "pgsearch.E001"
+    assert str(dim) in err.msg
+    assert str(dim + 1) in err.msg
+
+
+def test_check_fails_with_e002_when_migration_field_missing():
+    with patch(
+        "radis.pgsearch.apps._migration_embedding_dim", return_value=None
+    ):
+        errors = check_embedding_dim_matches_migration(app_configs=None)
+    assert len(errors) == 1
+    assert errors[0].id == "pgsearch.E002"
diff --git a/radis/pgsearch/tests/test_embed_pending_command.py b/radis/pgsearch/tests/test_embed_pending_command.py
new file mode 100644
index 000000000..83eb5c8e0
--- /dev/null
+++ b/radis/pgsearch/tests/test_embed_pending_command.py
@@ -0,0 +1,60 @@
+"""Tests for the `embed_pending` management command."""
+from io import StringIO
+from unittest.mock import patch
+
+import pytest
+from django.core.management import call_command
+
+from radis.reports.factories import ReportFactory
+
+pytestmark = pytest.mark.django_db
+
+
+def test_nothing_to_embed():
+    out = StringIO()
+    with patch(
+        "radis.pgsearch.management.commands.embed_pending.enqueue_embed_reports"
+    ) as enqueue:
+        call_command("embed_pending", stdout=out)
+    assert "Nothing to embed." in out.getvalue()
+    enqueue.assert_not_called()
+
+
+def test_enqueues_via_helper_with_explicit_subjob_size():
+    # ReportFactory triggers the FTS post_save signal → RSV row with embedding=NULL.
+    reports = [ReportFactory.create() for _ in range(5)]
+    expected_ids = sorted(r.pk for r in reports)
+
+    out = StringIO()
+    with patch(
+        "radis.pgsearch.management.commands.embed_pending.enqueue_embed_reports",
+        return_value=3,
+    ) as enqueue:
+        call_command("embed_pending", "--subjob-size", "2", stdout=out)
+
+    # The command delegates chunking to the shared helper.
+    enqueue.assert_called_once()
+    args, kwargs = enqueue.call_args
+    assert sorted(args[0]) == expected_ids
+    assert kwargs["subjob_size"] == 2
+
+    output = out.getvalue()
+    assert "5 report(s) in subjobs of 2" in output
+    assert "Deferred 3 subjob(s)" in output
+
+
+def test_limit_caps_work():
+    [ReportFactory.create() for _ in range(5)]
+
+    out = StringIO()
+    with patch(
+        "radis.pgsearch.management.commands.embed_pending.enqueue_embed_reports",
+        return_value=1,
+    ) as enqueue:
+        call_command(
+            "embed_pending", "--limit", "3", "--subjob-size", "10", stdout=out
+        )
+
+    args, kwargs = enqueue.call_args
+    assert len(args[0]) == 3
+    assert kwargs["subjob_size"] == 10
diff --git a/radis/pgsearch/tests/test_embed_reports_task.py b/radis/pgsearch/tests/test_embed_reports_task.py
new file mode 100644
index 000000000..eefe5611f
--- /dev/null
+++ b/radis/pgsearch/tests/test_embed_reports_task.py
@@ -0,0 +1,307 @@
+"""Tests for `embed_reports_task` and its chaining from `bulk_index_reports`."""
+import logging
+from unittest.mock import MagicMock, patch
+
+import numpy as np
+import pytest
+import stamina
+
+from radis.pgsearch.models import ReportSearchIndex
+from radis.pgsearch.tasks import (
+    bulk_index_reports,
+    embed_reports_task,
+    enqueue_embed_reports,
+)
+from radis.pgsearch.utils.embedding_client import (
+    EmbeddingClientError,
+    EmbeddingPayloadTooLargeError,
+)
+from radis.reports.factories import ReportFactory
+
+
+@pytest.fixture
+def stamina_active():
+    """Enable stamina retries for one test, restoring the previous state after.
+    The repo-wide conftest disables them to keep the rest of the suite fast."""
+    was_active = stamina.is_active()
+    stamina.set_active(True)
+    yield
+    stamina.set_active(was_active)
+
+pytestmark = pytest.mark.django_db(transaction=True)
+
+
+def _unit_vec(dim: int) -> list[float]:
+    v = np.ones(dim, dtype=np.float32)
+    return (v / np.linalg.norm(v)).tolist()
+
+
+def _make_fake_client(vec: list[float]) -> MagicMock:
+    """MagicMock that mimics `with EmbeddingClient() as c` and
+    `c.embed_documents([...])`."""
+    instance = MagicMock()
+    instance.__enter__ = MagicMock(return_value=instance)
+    instance.__exit__ = MagicMock(return_value=None)
+    instance.embed_documents = MagicMock(side_effect=lambda texts: [vec] * len(texts))
+    return instance
+
+
+def test_empty_input_no_ops():
+    with patch("radis.pgsearch.tasks.EmbeddingClient") as client_cls:
+        embed_reports_task(report_ids=[])
+    client_cls.assert_not_called()
+
+
+def test_no_matching_rsvs_no_ops():
+    """Report ids that don't resolve to RSV rows are a no-op — the task does
+    not contact the embedding service."""
+    with patch("radis.pgsearch.tasks.EmbeddingClient") as client_cls:
+        embed_reports_task(report_ids=[999_999])
+    client_cls.assert_not_called()
+
+
+def test_embeds_in_internal_batches(settings):
+    settings.EMBEDDING_BATCH_SIZE = 2
+    reports = [ReportFactory.create() for _ in range(5)]
+    pks = [r.pk for r in reports]
+    vec = _unit_vec(settings.EMBEDDING_DIM)
+    fake = _make_fake_client(vec)
+
+    with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake):
+        embed_reports_task(report_ids=pks)
+
+    # 5 reports with batch_size=2 → 3 embed_documents calls of sizes 2, 2, 1.
+    assert fake.embed_documents.call_count == 3
+    sizes = [len(call.args[0]) for call in fake.embed_documents.call_args_list]
+    assert sorted(sizes) == [1, 2, 2]
+    assert ReportSearchIndex.objects.filter(embedding__isnull=True).count() == 0
+
+
+def test_embedding_error_propagates():
+    """Procrastinate retries depend on the exception escaping the task."""
+    reports = [ReportFactory.create() for _ in range(2)]
+    pks = [r.pk for r in reports]
+    fake = MagicMock()
+    fake.__enter__ = MagicMock(return_value=fake)
+    fake.__exit__ = MagicMock(return_value=None)
+    fake.embed_documents = MagicMock(side_effect=EmbeddingClientError("service down"))
+
+    with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake):
+        with pytest.raises(EmbeddingClientError):
+            embed_reports_task(report_ids=pks)
+
+    assert ReportSearchIndex.objects.filter(embedding__isnull=True).count() == 2
+
+
+def test_bulk_index_reports_chains_into_embed_reports_task(settings):
+    """`bulk_index_reports` upserts RSVs and then chunks the embed work via
+    `enqueue_embed_reports`. The chain is the ordering guarantee: the
+    embeddings worker only ever sees report ids whose RSV rows are already
+    committed."""
+    settings.EMBEDDING_SUBJOB_SIZE = 100
+    reports = [ReportFactory.create() for _ in range(3)]
+    pks = [r.pk for r in reports]
+    ReportSearchIndex.objects.filter(report_id__in=pks).delete()
+
+    with patch("radis.pgsearch.tasks.embed_reports_task.defer") as defer:
+        bulk_index_reports(report_ids=pks)
+
+    # RSVs were upserted, then one embed subjob covering all 3 ids was
+    # deferred (3 < SUBJOB_SIZE so the whole batch fits in one subjob).
+    assert ReportSearchIndex.objects.filter(report_id__in=pks).count() == 3
+    defer.assert_called_once_with(report_ids=pks)
+
+
+def test_bulk_index_reports_splits_into_subjobs_when_exceeding_subjob_size(settings):
+    """A bulk-upsert larger than `EMBEDDING_SUBJOB_SIZE` must defer multiple
+    embed subjobs so the embeddings worker can drain them in parallel and
+    retries/failures have bounded blast radius."""
+    settings.EMBEDDING_SUBJOB_SIZE = 4
+    reports = [ReportFactory.create() for _ in range(10)]
+    pks = [r.pk for r in reports]
+
+    with patch("radis.pgsearch.tasks.embed_reports_task.defer") as defer:
+        bulk_index_reports(report_ids=pks)
+
+    # 10 reports / subjob 4 → 3 defer calls of sizes 4, 4, 2.
+    assert defer.call_count == 3
+    enqueued_chunks = [call.kwargs["report_ids"] for call in defer.call_args_list]
+    assert [len(c) for c in enqueued_chunks] == [4, 4, 2]
+    # The union of all chunks covers exactly the input ids in order.
+    assert [pk for c in enqueued_chunks for pk in c] == pks
+
+
+def test_enqueue_embed_reports_helper_chunks_by_subjob_size(settings):
+    """The shared `enqueue_embed_reports` helper is the single chunking
+    point. A 1M-row backfill becomes ~10k subjobs (no single huge task);
+    a single create with one id becomes one subjob (no overhead)."""
+    settings.EMBEDDING_SUBJOB_SIZE = 3
+
+    with patch("radis.pgsearch.tasks.embed_reports_task.defer") as defer:
+        count = enqueue_embed_reports([1, 2, 3, 4, 5, 6, 7])
+
+    assert count == 3
+    assert defer.call_count == 3
+    assert [c.kwargs["report_ids"] for c in defer.call_args_list] == [
+        [1, 2, 3],
+        [4, 5, 6],
+        [7],
+    ]
+
+
+def test_enqueue_embed_reports_helper_empty_input_is_noop():
+    with patch("radis.pgsearch.tasks.embed_reports_task.defer") as defer:
+        count = enqueue_embed_reports([])
+    assert count == 0
+    defer.assert_not_called()
+
+
+def test_enqueue_embed_reports_helper_explicit_subjob_size_overrides_setting(settings):
+    """Operators (e.g., `embed_pending --subjob-size=…`) can pass a
+    one-off override without mutating the global setting."""
+    settings.EMBEDDING_SUBJOB_SIZE = 100
+
+    with patch("radis.pgsearch.tasks.embed_reports_task.defer") as defer:
+        count = enqueue_embed_reports([1, 2, 3, 4, 5], subjob_size=2)
+
+    assert count == 3
+    assert [c.kwargs["report_ids"] for c in defer.call_args_list] == [
+        [1, 2], [3, 4], [5]
+    ]
+
+
+def test_bisects_on_too_large_and_isolates_offender(settings, caplog):
+    """When the backend rejects a batch as too large, the task bisects until
+    it isolates the single offending report, logs ERROR with its id + body
+    length, skips it, and still embeds the rest of the batch."""
+    settings.EMBEDDING_BATCH_SIZE = 4
+    reports = [ReportFactory.create() for _ in range(4)]
+    pks = [r.pk for r in reports]
+    offender_pk = pks[2]  # the third report is the one we mark too large
+
+    vec = _unit_vec(settings.EMBEDDING_DIM)
+    # Fetch the offender's body once up front; the fake backend below
+    # rejects any payload that contains it.
+    offender_body = ReportSearchIndex.objects.select_related("report").get(
+        report_id=offender_pk
+    ).report.body
+
+    def fake_embed(texts):
+        if offender_body in texts:
+            raise EmbeddingPayloadTooLargeError("over context window")
+        return [vec] * len(texts)
+
+    fake = MagicMock()
+    fake.__enter__ = MagicMock(return_value=fake)
+    fake.__exit__ = MagicMock(return_value=None)
+    fake.embed_documents = MagicMock(side_effect=fake_embed)
+
+    # The project's `radis` logger has `propagate=False` in settings, so
+    # caplog's root handler doesn't see records emitted under it. Attach
+    # caplog's handler directly to the task logger for the duration of
+    # this test.
+    task_logger = logging.getLogger("radis.pgsearch.tasks")
+    task_logger.addHandler(caplog.handler)
+    caplog.set_level(logging.ERROR, logger="radis.pgsearch.tasks")
+    try:
+        with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake):
+            embed_reports_task(report_ids=pks)
+    finally:
+        task_logger.removeHandler(caplog.handler)
+
+    # The three good reports got embeddings; the offender stayed NULL.
+    rsvs_by_pk = {
+        rsv.report_id: rsv
+        for rsv in ReportSearchIndex.objects.filter(report_id__in=pks)
+    }
+    assert rsvs_by_pk[offender_pk].embedding is None
+    for pk in pks:
+        if pk == offender_pk:
+            continue
+        assert rsvs_by_pk[pk].embedding is not None
+
+    # The bisect logged the specific offender's id + body length, and the
+    # task-level summary listed it among skipped ids.
+    error_msgs = [r.getMessage() for r in caplog.records if r.levelname == "ERROR"]
+    assert any(
+        f"report_id={offender_pk}" in msg and "body_chars=" in msg
+        for msg in error_msgs
+    )
+    assert any(
+        "skipped as too large" in msg and str(offender_pk) in msg
+        for msg in error_msgs
+    )
+
+
+def test_non_too_large_error_propagates_without_bisecting():
+    """A generic EmbeddingClientError (5xx, network, etc.) must NOT bisect —
+    Procrastinate's retry policy should handle it, retrying the whole batch.
+    (Stamina retries are disabled in the conftest, so this is a single call.)"""
+    reports = [ReportFactory.create() for _ in range(4)]
+    pks = [r.pk for r in reports]
+    fake = MagicMock()
+    fake.__enter__ = MagicMock(return_value=fake)
+    fake.__exit__ = MagicMock(return_value=None)
+    fake.embed_documents = MagicMock(side_effect=EmbeddingClientError("service down"))
+
+    with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake):
+        with pytest.raises(EmbeddingClientError):
+            embed_reports_task(report_ids=pks)
+
+    # Only one call should have been made — no bisect on non-too-large errors.
+    assert fake.embed_documents.call_count == 1
+    assert ReportSearchIndex.objects.filter(embedding__isnull=True).count() == 4
+
+
+def test_stamina_retries_transient_then_succeeds(settings, stamina_active):
+    """stamina retries transient EmbeddingClientError: an embed call that
+    fails the first two attempts and succeeds on the third returns vectors
+    without the bisect logic ever firing, and without escalating to
+    Procrastinate's task-level retry."""
+    settings.EMBEDDING_BATCH_SIZE = 4
+    reports = [ReportFactory.create() for _ in range(3)]
+    pks = [r.pk for r in reports]
+    vec = _unit_vec(settings.EMBEDDING_DIM)
+
+    fake = MagicMock()
+    fake.__enter__ = MagicMock(return_value=fake)
+    fake.__exit__ = MagicMock(return_value=None)
+    fake.embed_documents = MagicMock(
+        side_effect=[
+            EmbeddingClientError("blip 1"),
+            EmbeddingClientError("blip 2"),
+            [vec, vec, vec],
+        ]
+    )
+
+    with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake):
+        embed_reports_task(report_ids=pks)
+
+    # The mock was called 3 times: two retries + one success.
+    assert fake.embed_documents.call_count == 3
+    # All three reports got embeddings; none stayed NULL.
+    assert ReportSearchIndex.objects.filter(embedding__isnull=True).count() == 0
+
+
+def test_stamina_does_not_retry_payload_too_large(settings, stamina_active):
+    """EmbeddingPayloadTooLargeError must skip the stamina retry layer and
+    go straight to the bisect logic. With one offender in a single-row
+    chunk, the embed_documents mock should be called exactly once (no
+    retries), and the offender is logged + skipped."""
+    settings.EMBEDDING_BATCH_SIZE = 1
+    reports = [ReportFactory.create() for _ in range(1)]
+    pks = [r.pk for r in reports]
+
+    fake = MagicMock()
+    fake.__enter__ = MagicMock(return_value=fake)
+    fake.__exit__ = MagicMock(return_value=None)
+    fake.embed_documents = MagicMock(
+        side_effect=EmbeddingPayloadTooLargeError("over context")
+    )
+
+    with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake):
+        embed_reports_task(report_ids=pks)
+
+    # Single call — no stamina retry for payload-too-large.
+    assert fake.embed_documents.call_count == 1
+    assert ReportSearchIndex.objects.filter(embedding__isnull=True).count() == 1
diff --git a/radis/pgsearch/tests/test_embedding_client.py b/radis/pgsearch/tests/test_embedding_client.py
new file mode 100644
index 000000000..6a7c32263
--- /dev/null
+++ b/radis/pgsearch/tests/test_embedding_client.py
@@ -0,0 +1,405 @@
+import json
+
+import httpx
+import pytest
+from django.test import override_settings
+
+from radis.pgsearch.utils.embedding_client import (
+    BACKENDS,
+    OllamaBackend,
+    OpenAIBackend,
+)
+
+
+def test_openai_backend_builds_payload():
+    # The request body carries nothing but the model name and the raw inputs.
+    expected = {"model": "m1", "input": ["a", "b"]}
+    assert OpenAIBackend().build_payload(model="m1", texts=["a", "b"]) == expected
+
+
+def test_openai_backend_default_path():
+    assert OpenAIBackend().path == "/v1/embeddings"  # class-level default endpoint path
+
+
+def test_openai_backend_parses_response():
+    # Each `data` item contributes its `embedding` list, in order.
+    items = [{"embedding": [0.1, 0.2]}, {"embedding": [0.3, 0.4]}]
+    assert OpenAIBackend().parse_response({"data": items}) == [[0.1, 0.2], [0.3, 0.4]]
+
+
+def test_openai_backend_parse_raises_on_missing_data_key():
+    """A body without the `data` key is reported as EmbeddingClientError."""
+    from radis.pgsearch.utils.embedding_client import EmbeddingClientError
+
+    with pytest.raises(EmbeddingClientError):
+        OpenAIBackend().parse_response({"oops": []})
+
+
+def test_ollama_backend_builds_payload():
+    # The request body carries nothing but the model name and the raw inputs.
+    expected = {"model": "m1", "input": ["a", "b"]}
+    assert OllamaBackend().build_payload(model="m1", texts=["a", "b"]) == expected
+
+
+def test_ollama_backend_default_path():
+    assert OllamaBackend().path == "/api/embed"  # class-level default endpoint path
+
+
+def test_ollama_backend_parses_response():
+    # The vectors sit directly under the top-level `embeddings` key.
+    vectors = [[0.1, 0.2], [0.3, 0.4]]
+    assert OllamaBackend().parse_response({"embeddings": vectors}) == [[0.1, 0.2], [0.3, 0.4]]
+
+
+def test_ollama_backend_parse_raises_on_missing_key():
+    """A body without the `embeddings` key is reported as EmbeddingClientError."""
+    from radis.pgsearch.utils.embedding_client import EmbeddingClientError
+
+    with pytest.raises(EmbeddingClientError):
+        OllamaBackend().parse_response({"data": []})
+
+
+def test_backends_registry_keys():
+    assert set(BACKENDS) == {"openai", "ollama"}
+
+
+
+@override_settings(
+    EMBEDDING_BACKEND="openai",
+    EMBEDDING_PROVIDER_URL="http://embed.example",
+    EMBEDDING_PROVIDER_PATH="",
+    EMBEDDING_PROVIDER_API_KEY="secret",
+    EMBEDDING_MODEL_NAME="qwen3",
+    EMBEDDING_DIM=4,
+    EMBEDDING_REQUEST_TIMEOUT=10,
+    EMBEDDING_QUERY_INSTRUCTION="INST: ",
+)
+def test_embed_documents_posts_payload_and_normalizes(monkeypatch):
+    from radis.pgsearch.utils import embedding_client as ec
+
+    captured = {}
+
+    def respond(request: httpx.Request) -> httpx.Response:
+        # Record what the client actually sent so it can be checked below.
+        captured["url"] = str(request.url)
+        captured["auth"] = request.headers.get("authorization")
+        captured["body"] = json.loads(request.content)
+        return httpx.Response(
+            200, json={"data": [{"embedding": [3.0, 0.0, 0.0, 4.0]}]}
+        )
+
+    transport = httpx.MockTransport(respond)
+    monkeypatch.setattr(ec, "_build_http_client", lambda: httpx.Client(transport=transport))
+
+    client = ec.EmbeddingClient()
+    vectors = client.embed_documents(["hello"])
+
+    assert captured["url"] == "http://embed.example/v1/embeddings"
+    assert captured["auth"] == "Bearer secret"
+    assert captured["body"] == {"model": "qwen3", "input": ["hello"]}
+    # The raw reply [3, 0, 0, 4] has norm 5, so L2 normalization gives [0.6, 0, 0, 0.8].
+    assert len(vectors) == 1
+    assert vectors[0] == pytest.approx([0.6, 0.0, 0.0, 0.8])
+
+
+@override_settings(
+    EMBEDDING_BACKEND="openai",
+    EMBEDDING_PROVIDER_URL="http://embed.example",
+    EMBEDDING_PROVIDER_PATH="/api/embeddings",
+    EMBEDDING_PROVIDER_API_KEY="",
+    EMBEDDING_MODEL_NAME="qwen3",
+    EMBEDDING_DIM=2,
+    EMBEDDING_REQUEST_TIMEOUT=10,
+    EMBEDDING_QUERY_INSTRUCTION="",
+)
+def test_provider_path_override(monkeypatch):
+    """A non-empty EMBEDDING_PROVIDER_PATH replaces the backend's default path."""
+    from radis.pgsearch.utils import embedding_client as ec
+
+    requested_urls = []
+
+    def respond(request: httpx.Request) -> httpx.Response:
+        requested_urls.append(str(request.url))
+        return httpx.Response(200, json={"data": [{"embedding": [1.0, 0.0]}]})
+
+    transport = httpx.MockTransport(respond)
+    monkeypatch.setattr(ec, "_build_http_client", lambda: httpx.Client(transport=transport))
+    ec.EmbeddingClient().embed_documents(["x"])
+    assert requested_urls[-1] == "http://embed.example/api/embeddings"
+
+
+@override_settings(
+    EMBEDDING_BACKEND="openai",
+    EMBEDDING_PROVIDER_URL="http://embed.example",
+    EMBEDDING_PROVIDER_PATH="",
+    EMBEDDING_PROVIDER_API_KEY="",
+    EMBEDDING_MODEL_NAME="qwen3",
+    EMBEDDING_DIM=2,
+    EMBEDDING_REQUEST_TIMEOUT=10,
+    EMBEDDING_QUERY_INSTRUCTION="INST: ",
+)
+def test_embed_query_prepends_instruction(monkeypatch):
+    """`embed_query` sends EMBEDDING_QUERY_INSTRUCTION + query as the single input."""
+    from radis.pgsearch.utils import embedding_client as ec
+
+    bodies = []
+
+    def respond(request: httpx.Request) -> httpx.Response:
+        bodies.append(json.loads(request.content))
+        return httpx.Response(200, json={"data": [{"embedding": [1.0, 0.0]}]})
+
+    transport = httpx.MockTransport(respond)
+    monkeypatch.setattr(ec, "_build_http_client", lambda: httpx.Client(transport=transport))
+    ec.EmbeddingClient().embed_query("hello")
+    assert bodies[-1]["input"] == ["INST: hello"]
+
+
+@override_settings(
+    EMBEDDING_BACKEND="openai",
+    EMBEDDING_PROVIDER_URL="http://embed.example",
+    EMBEDDING_PROVIDER_PATH="",
+    EMBEDDING_PROVIDER_API_KEY="",
+    EMBEDDING_MODEL_NAME="qwen3",
+    EMBEDDING_DIM=2,
+    EMBEDDING_REQUEST_TIMEOUT=10,
+    EMBEDDING_QUERY_INSTRUCTION="",
+)
+def test_dim_too_small_raises(monkeypatch):
+    """A vector shorter than EMBEDDING_DIM is rejected with EmbeddingClientError."""
+    from radis.pgsearch.utils import embedding_client as ec
+
+    def respond(request: httpx.Request) -> httpx.Response:
+        # One component is returned while EMBEDDING_DIM asks for two.
+        return httpx.Response(200, json={"data": [{"embedding": [1.0]}]})
+
+    transport = httpx.MockTransport(respond)
+    monkeypatch.setattr(ec, "_build_http_client", lambda: httpx.Client(transport=transport))
+    with pytest.raises(ec.EmbeddingClientError):
+        ec.EmbeddingClient().embed_documents(["x"])
+
+
+@override_settings(
+    EMBEDDING_BACKEND="openai",
+    EMBEDDING_PROVIDER_URL="http://embed.example",
+    EMBEDDING_PROVIDER_PATH="",
+    EMBEDDING_PROVIDER_API_KEY="",
+    EMBEDDING_MODEL_NAME="qwen3",
+    EMBEDDING_DIM=2,
+    EMBEDDING_REQUEST_TIMEOUT=10,
+    EMBEDDING_QUERY_INSTRUCTION="",
+)
+def test_oversized_embedding_truncates_and_renormalizes(monkeypatch):
+    """Extra trailing components are dropped before the vector is L2-normalized."""
+    from radis.pgsearch.utils import embedding_client as ec
+
+    def respond(request: httpx.Request) -> httpx.Response:
+        # Four components come back; with EMBEDDING_DIM=2 only [3, 4] survive (norm 5).
+        return httpx.Response(200, json={"data": [{"embedding": [3.0, 4.0, 99.0, 99.0]}]})
+
+    transport = httpx.MockTransport(respond)
+    monkeypatch.setattr(ec, "_build_http_client", lambda: httpx.Client(transport=transport))
+    vectors = ec.EmbeddingClient().embed_documents(["x"])
+    assert len(vectors) == 1
+    assert vectors[0] == pytest.approx([0.6, 0.8])
+
+
+@override_settings(
+    EMBEDDING_BACKEND="openai",
+    EMBEDDING_PROVIDER_URL="http://embed.example",
+    EMBEDDING_PROVIDER_PATH="",
+    EMBEDDING_PROVIDER_API_KEY="",
+    EMBEDDING_MODEL_NAME="qwen3",
+    EMBEDDING_DIM=2,
+    EMBEDDING_REQUEST_TIMEOUT=10,
+    EMBEDDING_QUERY_INSTRUCTION="",
+)
+def test_5xx_raises(monkeypatch):
+    """An HTTP 503 reply surfaces as EmbeddingClientError."""
+    from radis.pgsearch.utils import embedding_client as ec
+
+    def respond(request: httpx.Request) -> httpx.Response:
+        return httpx.Response(503, text="service unavailable")
+
+    transport = httpx.MockTransport(respond)
+    monkeypatch.setattr(ec, "_build_http_client", lambda: httpx.Client(transport=transport))
+    with pytest.raises(ec.EmbeddingClientError):
+        ec.EmbeddingClient().embed_documents(["x"])
+
+
+@override_settings(
+    EMBEDDING_BACKEND="openai",
+    EMBEDDING_PROVIDER_URL="http://embed.example",
+    EMBEDDING_PROVIDER_PATH="",
+    EMBEDDING_PROVIDER_API_KEY="",
+    EMBEDDING_MODEL_NAME="qwen3",
+    EMBEDDING_DIM=2,
+    EMBEDDING_REQUEST_TIMEOUT=10,
+    EMBEDDING_QUERY_INSTRUCTION="",
+)
+def test_close_releases_http_client(monkeypatch):
+    from radis.pgsearch.utils import embedding_client as ec
+
+    close_calls = []
+
+    class TrackingClient:
+        def post(self, *args, **kwargs):
+            raise AssertionError("not used in this test")
+
+        def close(self):
+            close_calls.append(True)
+
+    monkeypatch.setattr(ec, "_build_http_client", TrackingClient)
+    # Closing the embedding client must close the HTTP client it built.
+    ec.EmbeddingClient().close()
+    assert close_calls
+
+
+@override_settings(
+    EMBEDDING_BACKEND="openai",
+    EMBEDDING_PROVIDER_URL="http://embed.example",
+    EMBEDDING_PROVIDER_PATH="",
+    EMBEDDING_PROVIDER_API_KEY="",
+    EMBEDDING_MODEL_NAME="qwen3",
+    EMBEDDING_DIM=2,
+    EMBEDDING_REQUEST_TIMEOUT=10,
+    EMBEDDING_QUERY_INSTRUCTION="",
+)
+def test_context_manager_closes_http_client(monkeypatch):
+    from radis.pgsearch.utils import embedding_client as ec
+
+    close_calls = []
+
+    class TrackingClient:
+        def post(self, *args, **kwargs):
+            raise AssertionError("not used in this test")
+
+        def close(self):
+            close_calls.append(True)
+
+    monkeypatch.setattr(ec, "_build_http_client", TrackingClient)
+    with ec.EmbeddingClient():
+        pass  # nothing to do inside: only the exit path is under test
+    assert close_calls
+
+
+@override_settings(
+    EMBEDDING_BACKEND="openai",
+    EMBEDDING_PROVIDER_URL="http://embed.example",
+    EMBEDDING_PROVIDER_PATH="v1/embeddings",  # missing leading slash
+    EMBEDDING_PROVIDER_API_KEY="",
+    EMBEDDING_MODEL_NAME="qwen3",
+    EMBEDDING_DIM=2,
+    EMBEDDING_REQUEST_TIMEOUT=10,
+    EMBEDDING_QUERY_INSTRUCTION="",
+)
+def test_provider_path_without_leading_slash_raises():
+    from radis.pgsearch.utils import embedding_client as ec
+
+    with pytest.raises(ec.EmbeddingClientError, match="must start with '/'"):
+        ec.EmbeddingClient()  # construction alone must fail; no request is sent
+
+
+@override_settings(
+    EMBEDDING_BACKEND="openai",
+    EMBEDDING_PROVIDER_URL="http://embed.example",
+    EMBEDDING_PROVIDER_PATH="",
+    EMBEDDING_PROVIDER_API_KEY="",
+    EMBEDDING_MODEL_NAME="qwen3",
+    EMBEDDING_DIM=2,
+    EMBEDDING_REQUEST_TIMEOUT=10,
+    EMBEDDING_QUERY_INSTRUCTION="",
+)
+def test_response_count_mismatch_raises(monkeypatch):
+    """Fewer embeddings than inputs is reported as a "count mismatch" error."""
+    from radis.pgsearch.utils import embedding_client as ec
+
+    def respond(request: httpx.Request) -> httpx.Response:
+        # Two inputs are sent below, but only one embedding comes back.
+        return httpx.Response(200, json={"data": [{"embedding": [1.0, 0.0]}]})
+
+    transport = httpx.MockTransport(respond)
+    monkeypatch.setattr(ec, "_build_http_client", lambda: httpx.Client(transport=transport))
+    with pytest.raises(ec.EmbeddingClientError, match="count mismatch"):
+        ec.EmbeddingClient().embed_documents(["a", "b"])
+
+
+@pytest.mark.parametrize(
+    "status, body",
+    [
+        (413, "Payload too large"),
+        (400, "This model's maximum context length is 8192 tokens, however your "
+              "messages resulted in 9143 tokens"),
+        (400, '{"error": {"code": "context_length_exceeded"}}'),
+        (422, "input exceeds the model context"),
+        (400, "request too long"),
+    ],
+)
+def test_is_payload_too_large_detects_overlength_responses(status, body):
+    from radis.pgsearch.utils.embedding_client import _is_payload_too_large
+    response = httpx.Response(status, text=body)
+    assert _is_payload_too_large(response) is True
+
+
+@pytest.mark.parametrize(
+    "status, body",
+    [
+        (400, "missing required field 'model'"),
+        (401, "invalid api key"),
+        (500, "internal server error"),
+        (503, "service unavailable"),
+    ],
+)
+def test_is_payload_too_large_negatives(status, body):
+    from radis.pgsearch.utils.embedding_client import _is_payload_too_large
+    response = httpx.Response(status, text=body)
+    assert _is_payload_too_large(response) is False
+
+
+@override_settings(
+    EMBEDDING_BACKEND="openai",
+    EMBEDDING_PROVIDER_URL="http://embed.example",
+    EMBEDDING_PROVIDER_PATH="",
+    EMBEDDING_PROVIDER_API_KEY="",
+    EMBEDDING_MODEL_NAME="qwen3",
+    EMBEDDING_DIM=2,
+    EMBEDDING_REQUEST_TIMEOUT=10,
+    EMBEDDING_QUERY_INSTRUCTION="",
+)
+def test_overlength_response_raises_typed_subclass(monkeypatch):
+    """A 400 whose body reports a context-length limit raises the typed subclass."""
+    from radis.pgsearch.utils import embedding_client as ec
+
+    overlength_body = "This model's maximum context length is 8192 tokens."
+
+    def respond(request: httpx.Request) -> httpx.Response:
+        return httpx.Response(400, text=overlength_body)
+
+    transport = httpx.MockTransport(respond)
+    monkeypatch.setattr(ec, "_build_http_client", lambda: httpx.Client(transport=transport))
+
+    with pytest.raises(ec.EmbeddingPayloadTooLargeError):
+        ec.EmbeddingClient().embed_documents(["x"])
+
+
+@override_settings(
+    EMBEDDING_BACKEND="openai",
+    EMBEDDING_PROVIDER_URL="http://embed.example",
+    EMBEDDING_PROVIDER_PATH="",
+    EMBEDDING_PROVIDER_API_KEY="",
+    EMBEDDING_MODEL_NAME="qwen3",
+    EMBEDDING_DIM=2,
+    EMBEDDING_REQUEST_TIMEOUT=10,
+    EMBEDDING_QUERY_INSTRUCTION="",
+)
+def test_generic_4xx_still_raises_base_error(monkeypatch):
+    """A 401 raises an EmbeddingClientError that is not the payload-too-large subclass."""
+    from radis.pgsearch.utils import embedding_client as ec
+
+    def respond(request: httpx.Request) -> httpx.Response:
+        return httpx.Response(401, text="invalid api key")
+
+    transport = httpx.MockTransport(respond)
+    monkeypatch.setattr(ec, "_build_http_client", lambda: httpx.Client(transport=transport))
+    with pytest.raises(ec.EmbeddingClientError) as excinfo:
+        ec.EmbeddingClient().embed_documents(["x"])
+    assert not isinstance(excinfo.value, ec.EmbeddingPayloadTooLargeError)
diff --git a/radis/pgsearch/tests/test_fusion.py b/radis/pgsearch/tests/test_fusion.py
new file mode 100644
index 000000000..794750918
--- /dev/null
+++ b/radis/pgsearch/tests/test_fusion.py
@@ -0,0 +1,74 @@
+import pytest
+
+from radis.pgsearch.utils.fusion import rrf_fuse, summary_with_fallback
+
+
+def test_rrf_both_sides_have_hits_overlap():
+    vec_rank = {1: 1, 2: 2, 3: 3}
+    fts_rank = {2: 1, 3: 2, 4: 3}
+    # Expected scores (k=60), each term being 1/(k + rank):
+    #   1: 1/(61)           = 0.01639
+    #   2: 1/(62)+1/(61)    = 0.03252
+    #   3: 1/(63)+1/(62)    = 0.03200
+    #   4: 1/(63)           = 0.01587
+    assert [rid for rid, _ in rrf_fuse(vec_rank, fts_rank, k=60)] == [2, 3, 1, 4]
+
+
+def test_rrf_disjoint_universes():
+    # Ids 1 and 2 each appear on one side only, both at rank 1.
+    fused = rrf_fuse({1: 1}, {2: 1}, k=60)
+    assert [rid for rid, _score in fused] == [1, 2]
+
+
+def test_rrf_only_fts():
+    # With no vector hits the fused order is the FTS order.
+    fused = rrf_fuse({}, {10: 1, 20: 2, 30: 3}, k=60)
+    assert [rid for rid, _score in fused] == [10, 20, 30]
+
+
+def test_rrf_only_vec():
+    # With no FTS hits the fused order is the vector order.
+    fused = rrf_fuse({10: 1, 20: 2, 30: 3}, {}, k=60)
+    assert [rid for rid, _score in fused] == [10, 20, 30]
+
+
+def test_rrf_empty():
+    assert rrf_fuse({}, {}, k=60) == []  # no candidates on either side
+
+
+def test_rrf_tiebreak_by_id():
+    """Equal scores are ordered by ascending id."""
+    # Id 2 (vector side) and id 1 (FTS side) both score 1/61.
+    fused = rrf_fuse({2: 1}, {1: 1}, k=60)
+    fused_ids = [rid for rid, _score in fused]
+    assert fused_ids == [1, 2]
+
+
+def test_rrf_returns_scores_descending_with_tiebreak():
+    """Fused pairs carry the summed reciprocal-rank score, highest first."""
+    fused = rrf_fuse({1: 1}, {1: 1, 2: 2}, k=60)
+    (first_id, first_score), (second_id, second_score) = fused[0], fused[1]
+    # Id 1 is ranked first on both sides: 1/61 + 1/61 = 2/61.
+    assert first_id == 1
+    assert first_score == pytest.approx(2.0 / 61.0)
+    # Id 2 appears only on the FTS side at rank 2: 1/62.
+    assert second_id == 2
+    assert second_score == pytest.approx(1.0 / 62.0)
+
+
+def test_summary_with_fallback_keeps_nonempty():
+    # A non-empty headline is returned untouched.
+    headline = "an <em>existing</em> headline"
+    assert summary_with_fallback("any body", headline, 30) == headline
+
+
+def test_summary_with_fallback_uses_body_head_when_empty():
+    # An empty headline falls back to the first `max_words` words of the body.
+    body = " ".join([f"word{i}" for i in range(100)])
+    assert summary_with_fallback(body, "", max_words=5) == "word0 word1 word2 word3 word4"
+
+
+def test_summary_with_fallback_short_body():
+    # A body with fewer words than `max_words` comes back whole.
+    short_body = "only three words here"
+    assert summary_with_fallback(short_body, "", max_words=10) == short_body
diff --git a/radis/pgsearch/tests/test_indexing.py b/radis/pgsearch/tests/test_indexing.py
index 344018f5b..282aceb56 100644
--- a/radis/pgsearch/tests/test_indexing.py
+++ b/radis/pgsearch/tests/test_indexing.py
@@ -1,7 +1,7 @@
 import pytest
 
-from radis.pgsearch.models import ReportSearchVector
-from radis.pgsearch.utils.indexing import bulk_upsert_report_search_vectors
+from radis.pgsearch.models import ReportSearchIndex
+from radis.pgsearch.utils.indexing import bulk_upsert_report_search_indexes
 from radis.reports.models import Language, Report
 
 
@@ -24,10 +24,10 @@ def test_bulk_index_matches_signal_vector() -> None:
         language=language,
     )
 
-    signal_vector = ReportSearchVector.objects.get(report=report).search_vector
-    ReportSearchVector.objects.filter(report=report).delete()
+    signal_vector = ReportSearchIndex.objects.get(report=report).search_vector
+    ReportSearchIndex.objects.filter(report=report).delete()
 
-    bulk_upsert_report_search_vectors([report.pk])
-    bulk_vector = ReportSearchVector.objects.get(report=report).search_vector
+    bulk_upsert_report_search_indexes([report.pk])
+    bulk_vector = ReportSearchIndex.objects.get(report=report).search_vector
 
     assert signal_vector == bulk_vector
diff --git a/radis/pgsearch/tests/test_provider_hybrid.py b/radis/pgsearch/tests/test_provider_hybrid.py
new file mode 100644
index 000000000..2966272b8
--- /dev/null
+++ b/radis/pgsearch/tests/test_provider_hybrid.py
@@ -0,0 +1,280 @@
+from unittest.mock import patch
+
+import pytest
+from django.contrib.auth.models import Group
+
+from radis.pgsearch.models import ReportSearchIndex
+from radis.pgsearch.providers import retrieve, search
+from radis.pgsearch.utils.embedding_client import EmbeddingClientError
+from radis.reports.factories import ReportFactory
+from radis.search.site import Search, SearchFilters
+from radis.search.utils.query_parser import QueryParser
+
+pytestmark = pytest.mark.django_db
+
+
+def _unit_vec(idx: int, dim: int) -> list[float]:
+    """Return a `dim`-long one-hot vector; the hot position is `idx` modulo `dim`."""
+    vec = [0.0 for _ in range(dim)]
+    vec[idx % dim] = 1.0
+    return vec
+
+
+def _make_search(query_str: str, group_id: int) -> Search:
+    """Parse `query_str` and wrap it in a Search filtered by `group_id`.
+
+    The page is fixed at offset 0 with a limit of 25.
+    """
+    parsed, _ = QueryParser().parse(query_str)
+    assert parsed is not None
+    group_filter = SearchFilters(group=group_id)
+    return Search(query=parsed, filters=group_filter, offset=0, limit=25)
+
+
+@pytest.fixture
+def group(db):
+    return Group.objects.create(name="radiology")  # the group test reports are added to
+
+
+@pytest.fixture
+def reports_with_embeddings(group, settings):
+    dim = settings.EMBEDDING_DIM
+    # r0: body contains "pneumothorax" (FTS hit); one-hot embedding at index 99.
+    r0 = ReportFactory.create(body="Findings: pneumothorax on the left.")
+    r0.groups.add(group)
+    # r1: body lacks "pneumothorax" (no FTS hit); one-hot embedding at index 1.
+    r1 = ReportFactory.create(body="Lungs are clear bilaterally.")
+    r1.groups.add(group)
+    # r2: body contains "pneumothorax" twice; one-hot embedding at index 0.
+    r2 = ReportFactory.create(
+        body="No pneumothorax detected. Previous pneumothorax resolved. Lungs clear."
+    )
+    r2.groups.add(group)
+    ReportSearchIndex.objects.filter(report=r0).update(embedding=_unit_vec(99, dim))
+    ReportSearchIndex.objects.filter(report=r1).update(embedding=_unit_vec(1, dim))
+    ReportSearchIndex.objects.filter(report=r2).update(embedding=_unit_vec(0, dim))
+    return r0, r1, r2
+
+
+def test_hybrid_returns_fts_only_hit(group, reports_with_embeddings, settings):
+    fts_only_report = reports_with_embeddings[0]
+    # The query vector sits at index 50, which none of the fixture embeddings
+    # use, so the vector side cannot favour r0; FTS for "pneumothorax" is what
+    # matches it (along with r2).
+    query_vec = _unit_vec(50, settings.EMBEDDING_DIM)
+    with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient:
+        fake = MockClient.return_value
+        fake.__enter__.return_value = fake
+        fake.__exit__.return_value = None
+        fake.embed_query.return_value = query_vec
+        result = search(_make_search("pneumothorax", group.pk))
+
+    assert fts_only_report.document_id in [d.document_id for d in result.documents]
+
+
+def test_hybrid_returns_vector_only_hit(group, reports_with_embeddings, settings):
+    vector_only_report = reports_with_embeddings[1]
+    # r1 never mentions "pneumothorax", so FTS cannot return it; only the
+    # vector side of the hybrid search can surface it.
+    query_vec = _unit_vec(0, settings.EMBEDDING_DIM)
+    with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient:
+        fake = MockClient.return_value
+        fake.__enter__.return_value = fake
+        fake.__exit__.return_value = None
+        fake.embed_query.return_value = query_vec
+        result = search(_make_search("pneumothorax", group.pk))
+
+    assert vector_only_report.document_id in [d.document_id for d in result.documents]
+
+
+def test_hybrid_both_sides_match_ranks_first(group, reports_with_embeddings, settings):
+    both_sides_report = reports_with_embeddings[2]
+    query_vec = _unit_vec(0, settings.EMBEDDING_DIM)
+    with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient:
+        fake = MockClient.return_value
+        fake.__enter__.return_value = fake
+        fake.__exit__.return_value = None
+        fake.embed_query.return_value = query_vec
+        result = search(_make_search("pneumothorax", group.pk))
+
+    # r2 matches the query lexically and shares its embedding, so it must lead.
+    assert result.documents[0].document_id == both_sides_report.document_id
+
+
+def test_embedding_failure_falls_back_to_fts(group, reports_with_embeddings):
+    r0, _r1, r2 = reports_with_embeddings
+    with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient:
+        fake = MockClient.return_value
+        fake.__enter__.return_value = fake
+        fake.__exit__.return_value = None
+        fake.embed_query.side_effect = EmbeddingClientError("down")
+        result = search(_make_search("pneumothorax", group.pk))
+
+    # Only the two reports that match lexically are returned; r1 is absent.
+    assert {d.document_id for d in result.documents} == {r0.document_id, r2.document_id}
+
+
+def test_reports_with_null_embedding_still_returned_via_fts(group, settings):
+    # No embedding is written for this report; it is left NULL on purpose.
+    report = ReportFactory.create(body="pneumothorax findings")
+    report.groups.add(group)
+    query_vec = _unit_vec(0, settings.EMBEDDING_DIM)
+    with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient:
+        fake = MockClient.return_value
+        fake.__enter__.return_value = fake
+        fake.__exit__.return_value = None
+        fake.embed_query.return_value = query_vec
+        result = search(_make_search("pneumothorax", group.pk))
+
+    assert report.document_id in [d.document_id for d in result.documents]
+
+
+def test_empty_summary_falls_back_to_body_head(group, settings):
+    dim = settings.EMBEDDING_DIM
+    # The body never mentions the query word, so this is a vector-only hit.
+    body = "lung parenchyma demonstrates clear bilaterally with no abnormality"
+    report = ReportFactory.create(body=body)
+    report.groups.add(group)
+    ReportSearchIndex.objects.filter(report=report).update(embedding=_unit_vec(0, dim))
+
+    with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient:
+        fake = MockClient.return_value
+        fake.__enter__.return_value = fake
+        fake.__exit__.return_value = None
+        fake.embed_query.return_value = _unit_vec(0, dim)
+        result = search(_make_search("pneumothorax", group.pk))
+
+    hit = next(d for d in result.documents if d.document_id == report.document_id)
+    # The summary is non-empty and plain text: no <em> highlight markup.
+    assert hit.summary
+    assert "<em>" not in hit.summary
+
+
+def test_retrieve_returns_hybrid_ordered_document_ids(group, reports_with_embeddings, settings):
+    r0, r1, r2 = reports_with_embeddings
+    with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient:
+        fake = MockClient.return_value
+        fake.__enter__.return_value = fake
+        fake.__exit__.return_value = None
+        fake.embed_query.return_value = _unit_vec(0, settings.EMBEDDING_DIM)
+        doc_ids = list(retrieve(_make_search("pneumothorax", group.pk)))
+
+    # r2 matches on both sides and leads; r0 and r1 follow in any order.
+    assert doc_ids[0] == r2.document_id
+    assert {r0.document_id, r1.document_id, r2.document_id} <= set(doc_ids)
+
+
+def test_retrieve_falls_back_to_fts_on_embedding_error(group, reports_with_embeddings):
+    r0, _r1, r2 = reports_with_embeddings
+    with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient:
+        fake = MockClient.return_value
+        fake.__enter__.return_value, fake.__exit__.return_value = fake, None
+        fake.embed_query.side_effect = EmbeddingClientError("down")
+        retrieved = set(retrieve(_make_search("pneumothorax", group.pk)))
+    assert retrieved == {r0.document_id, r2.document_id}
+
+
+def test_documents_carry_cosine_distance_and_rrf_score(
+    group, reports_with_embeddings, settings
+):
+    """Verify cosine_distance is set for vector-side hits and rrf_score reflects fusion."""
+    both_sides_report = reports_with_embeddings[2]
+    with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient:
+        fake = MockClient.return_value
+        fake.__enter__.return_value = fake
+        fake.__exit__.return_value = None
+        fake.embed_query.return_value = _unit_vec(0, settings.EMBEDDING_DIM)
+        result = search(_make_search("pneumothorax", group.pk))
+
+    # r2 is hit by both the vector and the FTS side, so it leads the ranking.
+    docs = result.documents
+    assert docs[0].document_id == both_sides_report.document_id
+    assert docs[0].cosine_distance is not None
+    assert docs[0].cosine_distance >= 0.0
+    assert docs[0].rrf_score > 0.0
+    # Scores never increase from one document to the next.
+    for earlier, later in zip(docs, docs[1:], strict=False):
+        assert later.rrf_score <= earlier.rrf_score
+
+
+def test_m2m_filter_does_not_duplicate_results(group, settings):
+    """Reports with multiple modalities must appear exactly once when the modality
+    filter joins the M2M table. Without `.distinct()` on the queryset, joining on
+    report__modalities__code__in produces one row per matching modality, which
+    inflates rank position and corrupts top-K slicing."""
+    dim = settings.EMBEDDING_DIM
+    report = ReportFactory.create(body="pneumothorax findings", modalities=["CT", "MR", "DX"])
+    report.groups.add(group)
+    ReportSearchIndex.objects.filter(report=report).update(embedding=_unit_vec(0, dim))
+
+    # Filter on all three of the report's modalities so each one can match.
+    node, _ = QueryParser().parse("pneumothorax")
+    assert node is not None
+    filters = SearchFilters(group=group.pk, modalities=["CT", "MR", "DX"])
+    request = Search(query=node, filters=filters, offset=0, limit=10)
+
+    with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient:
+        fake = MockClient.return_value
+        fake.__enter__.return_value = fake
+        fake.__exit__.return_value = None
+        fake.embed_query.return_value = _unit_vec(0, dim)
+        result = search(request)
+
+    # Every entry for this report counts; a duplicated join row would give > 1.
+    matching = [d for d in result.documents if d.document_id == report.document_id]
+    assert len(matching) == 1, f"Expected 1 occurrence, got {len(matching)}"
+
+
+def test_search_skips_embedding_when_query_reduces_to_not(monkeypatch, group):
+    """`NOT X` alone produces an empty embedding string; the provider must
+    not call the embedding service and must return FTS-only results."""
+    embed_query_calls: list[str] = []
+
+    class FakeEC:
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *exc_info):
+            return False
+
+        def embed_query(self, text):
+            embed_query_calls.append(text)
+            raise AssertionError("embed_query should not be called for NOT-only query")
+
+    monkeypatch.setattr("radis.pgsearch.providers.EmbeddingClient", FakeEC)
+
+    node, _ = QueryParser().parse("NOT pneumothorax")
+    assert node is not None
+    result = search(Search(query=node, filters=SearchFilters(group=group.pk), offset=0, limit=10))
+
+    assert embed_query_calls == []
+    # FTS-only path still returns a SearchResult (possibly with zero hits).
+    assert result is not None
+
+
+def test_search_embeds_only_positive_branch_for_and_not(monkeypatch, group, settings):
+    """`A AND NOT B` must send only `A` to the embedding service.
+
+    Only the text passed to `embed_query` is asserted here; the FTS side of
+    the search is exercised but its results are not inspected.
+    """
+    embed_query_calls: list[str] = []
+    dim = settings.EMBEDDING_DIM
+
+    class FakeEC:
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *exc_info):
+            return False
+
+        def embed_query(self, text):
+            embed_query_calls.append(text)
+            # Any unit vector of the configured dimension is a valid reply.
+            return _unit_vec(0, dim)
+
+    monkeypatch.setattr("radis.pgsearch.providers.EmbeddingClient", FakeEC)
+    node, _ = QueryParser().parse("pneumothorax AND NOT effusion")
+    assert node is not None
+    search(Search(query=node, filters=SearchFilters(group=group.pk), offset=0, limit=10))
+    assert embed_query_calls == ["pneumothorax"]
diff --git a/radis/pgsearch/utils/document_utils.py b/radis/pgsearch/utils/document_utils.py
index 4a2ff3649..31a8623fb 100644
--- a/radis/pgsearch/utils/document_utils.py
+++ b/radis/pgsearch/utils/document_utils.py
@@ -1,8 +1,8 @@
-from radis.pgsearch.models import ReportSearchVector
+from radis.pgsearch.models import ReportSearchIndex
 from radis.search.site import ReportDocument
 
 
-class AnnotatedReportSearchVector(ReportSearchVector):
+class AnnotatedReportSearchIndex(ReportSearchIndex):
     rank: float
     summary: str
 
@@ -11,7 +11,9 @@ class Meta:
 
 
 def document_from_pgsearch_response(
-    record: AnnotatedReportSearchVector,
+    record: AnnotatedReportSearchIndex,
+    cosine_distance: float | None = None,
+    rrf_score: float = 0.0,
 ) -> ReportDocument:
     report = record.report
     return ReportDocument(
@@ -24,4 +26,6 @@ def document_from_pgsearch_response(
         study_description=report.study_description,
         modalities=report.modality_codes,
         summary=record.summary,
+        cosine_distance=cosine_distance,
+        rrf_score=rrf_score,
     )
diff --git a/radis/pgsearch/utils/embedding_client.py b/radis/pgsearch/utils/embedding_client.py
new file mode 100644
index 000000000..b16da16af
--- /dev/null
+++ b/radis/pgsearch/utils/embedding_client.py
@@ -0,0 +1,217 @@
+from __future__ import annotations
+
+import logging
+import math
+from dataclasses import dataclass
+from typing import Protocol
+
+import httpx
+from django.conf import settings
+
+
+class EmbeddingClientError(Exception):
+    """Base error for embedding failures: misconfiguration, transport errors, bad responses."""
+
+
+class EmbeddingPayloadTooLargeError(EmbeddingClientError):
+    """Raised when the service rejects a request (HTTP 413, or 400/422 with a
+    size-related message) because an input likely exceeds the model's context
+    window. Callers can bisect the batch and retry, as `embed_reports_task` does."""
+
+
+class EmbeddingBackend(Protocol):  # wire-format adapter: request payload + response parsing
+    path: str  # default endpoint path, used when EMBEDDING_PROVIDER_PATH is empty
+
+    def build_payload(self, model: str, texts: list[str]) -> dict: ...  # JSON request body
+
+    def parse_response(self, body: dict) -> list[list[float]]: ...  # raw vectors from JSON body
+
+
+class OpenAIBackend:
+    """Wire format of OpenAI-compatible `/v1/embeddings` endpoints."""
+    path: str = "/v1/embeddings"
+
+    def build_payload(self, model: str, texts: list[str]) -> dict:
+        return dict(model=model, input=texts)
+
+    def parse_response(self, body: dict) -> list[list[float]]:
+        try:
+            return [entry["embedding"] for entry in body["data"]]
+        except (KeyError, TypeError) as exc:
+            msg = f"OpenAI-style response missing 'data[*].embedding': {exc}"
+            raise EmbeddingClientError(msg) from exc
+
+
+class OllamaBackend:
+    """Wire format of Ollama's `/api/embed` endpoint."""
+    path: str = "/api/embed"
+
+    def build_payload(self, model: str, texts: list[str]) -> dict:
+        return dict(model=model, input=texts)
+
+    def parse_response(self, body: dict) -> list[list[float]]:
+        try:
+            return [*body["embeddings"]]
+        except (KeyError, TypeError) as exc:
+            msg = f"Ollama-style response missing 'embeddings': {exc}"
+            raise EmbeddingClientError(msg) from exc
+
+
+BACKENDS: dict[str, EmbeddingBackend] = {  # keyed by the EMBEDDING_BACKEND setting
+    "openai": OpenAIBackend(),
+    "ollama": OllamaBackend(),
+}
+
+logger = logging.getLogger(__name__)
+
+
+def _build_http_client() -> httpx.Client:
+    """Build the httpx client; a separate function so tests can swap in a MockTransport."""
+    return httpx.Client(timeout=settings.EMBEDDING_REQUEST_TIMEOUT)
+
+
+def _l2_normalize(vec: list[float]) -> list[float]:
+    """Scale `vec` to unit L2 length; an all-zero vector is returned unchanged."""
+    length = math.sqrt(sum(component * component for component in vec))
+    # Dividing by zero is avoided by handing the zero vector back as-is (same object).
+    return vec if length == 0.0 else [component / length for component in vec]
+
+
+# Lowercase substrings searched for in the lowercased body of a 400/422 response
+# to spot inputs that exceed the model's context window. Kept loose because the
+# exact phrasing varies across OpenAI / vLLM / Ollama and minor version bumps.
+_TOO_LARGE_MARKERS = (
+    "context length",
+    "context_length",
+    "maximum context",
+    "max_tokens",
+    "max tokens",
+    "max_position",
+    "too long",
+    "too large",
+    "too many tokens",
+    "exceeds",
+    "exceeded",
+)
+
+
+def _is_payload_too_large(response: httpx.Response) -> bool:
+    """Heuristic: does this error response mean an input overflowed the model's
+    context window, so that splitting the batch might make the request succeed?"""
+    if response.status_code == 413:
+        return True  # "Payload Too Large" needs no body inspection
+    if response.status_code in (400, 422):
+        haystack = response.text.lower()
+        return any(marker in haystack for marker in _TOO_LARGE_MARKERS)
+    return False
+
+
+@dataclass(frozen=True)
+class _ResolvedConfig:
+    backend: EmbeddingBackend  # wire-format adapter selected by EMBEDDING_BACKEND
+    url: str  # full endpoint: provider base URL + path
+    model: str  # EMBEDDING_MODEL_NAME
+    dim: int  # EMBEDDING_DIM, the vector length handed back to callers
+    instruction: str  # EMBEDDING_QUERY_INSTRUCTION, prepended to query text
+    headers: dict[str, str]  # extra request headers (Authorization when a key is set)
+
+
+def _resolve_config() -> _ResolvedConfig:
+    """Read and validate the embedding settings in one place. Raises
+    EmbeddingClientError for an unknown EMBEDDING_BACKEND, a path without a leading
+    '/', an empty EMBEDDING_PROVIDER_URL, or a non-positive EMBEDDING_DIM."""
+    try:
+        backend = BACKENDS[settings.EMBEDDING_BACKEND]
+    except KeyError as e:
+        raise EmbeddingClientError(
+            f"Unknown EMBEDDING_BACKEND={settings.EMBEDDING_BACKEND!r}; "
+            f"known: {sorted(BACKENDS)}"
+        ) from e
+    path = settings.EMBEDDING_PROVIDER_PATH or backend.path  # empty setting -> backend default
+    if not path.startswith("/"):
+        raise EmbeddingClientError(f"EMBEDDING_PROVIDER_PATH must start with '/'; got {path!r}")
+    base = settings.EMBEDDING_PROVIDER_URL.rstrip("/")
+    if not base:
+        raise EmbeddingClientError("EMBEDDING_PROVIDER_URL is not configured")
+    dim = settings.EMBEDDING_DIM
+    if dim <= 0:
+        # vec[:dim] with dim <= 0 would silently empty or clip every embedding.
+        raise EmbeddingClientError(f"EMBEDDING_DIM must be a positive integer; got {dim!r}")
+    key = settings.EMBEDDING_PROVIDER_API_KEY
+    headers = {"Authorization": f"Bearer {key}"} if key else {}
+    return _ResolvedConfig(
+        backend=backend, url=f"{base}{path}", dim=dim, headers=headers,
+        model=settings.EMBEDDING_MODEL_NAME,
+        instruction=settings.EMBEDDING_QUERY_INSTRUCTION,
+    )
+
+
+def _normalize_response(
+    raw: list[list[float]], expected_count: int, target_dim: int
+) -> list[list[float]]:
+    """Return unit vectors of exactly `target_dim` components. Longer vectors are
+    truncated first (Matryoshka; Qwen3-Embedding keeps quality at reduced dims) and
+    every vector is L2-normalized. Raises EmbeddingClientError on a count mismatch,
+    a too-short vector, or an entry that is not a list of numbers."""
+    if len(raw) != expected_count:
+        raise EmbeddingClientError(
+            f"Embedding count mismatch: requested {expected_count}, "
+            f"backend returned {len(raw)}"
+        )
+    normalized: list[list[float]] = []
+    for vec in raw:
+        try:
+            if len(vec) < target_dim:
+                raise EmbeddingClientError(
+                    f"Embedding dim too small: got {len(vec)}, expected at least {target_dim}"
+                )
+            normalized.append(_l2_normalize(list(vec[:target_dim])))
+        except TypeError as e:
+            raise EmbeddingClientError(f"Malformed embedding vector: {e}") from e
+    return normalized
+
+
+class EmbeddingClient:
+    """Blocking client for the configured embedding service; usable as a context manager."""
+    def __init__(self) -> None:
+        self._cfg = _resolve_config()  # raises EmbeddingClientError on misconfiguration
+        self._http = _build_http_client()
+
+    def embed_documents(self, texts: list[str]) -> list[list[float]]:
+        """Embed `texts`; returns one unit vector of the configured dim per input text."""
+        if not texts:
+            return []  # zero inputs -> zero vectors; skip a request the service may reject
+        payload = self._cfg.backend.build_payload(self._cfg.model, texts)
+        try:
+            response = self._http.post(self._cfg.url, json=payload, headers=self._cfg.headers)
+        except httpx.HTTPError as e:
+            raise EmbeddingClientError(f"HTTP error contacting {self._cfg.url}: {e}") from e
+        if response.status_code >= 400:
+            code, snippet = response.status_code, response.text[:200]
+            if _is_payload_too_large(response):
+                raise EmbeddingPayloadTooLargeError(
+                    f"Embedding service rejected payload as too large ({code}): {snippet}"
+                )
+            raise EmbeddingClientError(f"Embedding service returned {code}: {snippet}")
+        try:
+            body = response.json()
+        except ValueError as e:
+            raise EmbeddingClientError(f"Embedding response is not JSON: {e}") from e
+        raw = self._cfg.backend.parse_response(body)
+        return _normalize_response(raw, len(texts), self._cfg.dim)
+
+    def embed_query(self, text: str) -> list[float]:  # query text gets the instruction prefix
+        prefixed = f"{self._cfg.instruction}{text}" if self._cfg.instruction else text
+        vectors = self.embed_documents([prefixed])
+        if not vectors:
+            raise EmbeddingClientError("Embedding service returned no vectors for query")
+        return vectors[0]
+
+    def close(self) -> None:
+        self._http.close()
+
+    def __enter__(self) -> "EmbeddingClient":
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        self.close()
diff --git a/radis/pgsearch/utils/fusion.py b/radis/pgsearch/utils/fusion.py
new file mode 100644
index 000000000..a93f35b97
--- /dev/null
+++ b/radis/pgsearch/utils/fusion.py
@@ -0,0 +1,33 @@
+def rrf_fuse(
+    vec_rank: dict[int, int],
+    fts_rank: dict[int, int],
+    k: int,
+) -> list[tuple[int, float]]:
+    """Merge two rankings with Reciprocal Rank Fusion (RRF).
+
+    `vec_rank` and `fts_rank` each map report_id -> 1-based rank position in
+    one retriever. Every retriever that lists an id contributes
+    1 / (k + rank) to that id's fused score.
+
+    Returns (report_id, fused_score) tuples sorted by descending score; ties
+    are broken by ascending report_id.
+    """
+    fused: dict[int, float] = {}
+    # Vector ranks are added before FTS ranks for every id, which keeps the
+    # floating-point summation order fixed.
+    for ranking in (vec_rank, fts_rank):
+        for rid, position in ranking.items():
+            fused[rid] = fused.get(rid, 0.0) + 1.0 / (k + position)
+
+    # Negating the score sorts best-first while the id still sorts ascending.
+    ordered = sorted(fused.items(), key=lambda pair: (-pair[1], pair[0]))
+    return ordered
+
+
+def summary_with_fallback(body: str, summary: str, max_words: int) -> str:
+    """Prefer the SearchHeadline `summary`; it is '' when the document does not
+    match the tsquery (e.g., vector-only hits), in which case the first
+    `max_words` whitespace-separated words of `body` are returned instead."""
+    if not summary:
+        return " ".join(body.split()[:max_words])
+    return summary
diff --git a/radis/pgsearch/utils/indexing.py b/radis/pgsearch/utils/indexing.py
index 882ba4b3e..0d90b55fd 100644
--- a/radis/pgsearch/utils/indexing.py
+++ b/radis/pgsearch/utils/indexing.py
@@ -8,7 +8,7 @@
 
 from radis.reports.models import Report
 
-from ..models import ReportSearchVector
+from ..models import ReportSearchIndex
 from .language_utils import code_to_language
 
 logger = logging.getLogger(__name__)
@@ -19,7 +19,7 @@ def _chunked(items: list[int], size: int) -> Iterable[list[int]]:
         yield items[index : index + size]
 
 
-def bulk_upsert_report_search_vectors(
+def bulk_upsert_report_search_indexes(
     report_ids: Iterable[int],
     chunk_size: int | None = None,
 ) -> None:
@@ -56,8 +56,8 @@ def bulk_upsert_report_search_vectors(
             )
 
         for config, config_ids in config_to_ids.items():
-            ReportSearchVector.objects.bulk_create(
-                [ReportSearchVector(report_id=report_id) for report_id in config_ids],
+            ReportSearchIndex.objects.bulk_create(
+                [ReportSearchIndex(report_id=report_id) for report_id in config_ids],
                 ignore_conflicts=True,
                 batch_size=settings.PGSEARCH_BULK_INSERT_BATCH_SIZE,
             )
@@ -65,7 +65,7 @@ def bulk_upsert_report_search_vectors(
             with connection.cursor() as cursor:
                 cursor.execute(
                     """
-                    UPDATE pgsearch_reportsearchvector v
+                    UPDATE pgsearch_reportsearchindex v
                     SET search_vector = to_tsvector(%s::regconfig, r.body)
                     FROM reports_report r
                     WHERE v.report_id = r.id AND r.id = ANY(%s)
diff --git a/radis/reports/api/viewsets.py b/radis/reports/api/viewsets.py
index bb684b154..3567774f5 100644
--- a/radis/reports/api/viewsets.py
+++ b/radis/reports/api/viewsets.py
@@ -1,7 +1,6 @@
 import logging
 from typing import Any
 
-from django.conf import settings
 from django.db import transaction
 from django.http import Http404
 from django.utils import timezone
@@ -13,9 +12,6 @@
 from rest_framework.response import Response
 from rest_framework.serializers import BaseSerializer
 
-from radis.pgsearch.tasks import enqueue_bulk_index_reports
-from radis.pgsearch.utils.indexing import bulk_upsert_report_search_vectors
-
 from ..models import Language, Metadata, Modality, Report
 from ..site import (
     document_fetchers,
@@ -241,12 +237,6 @@ def _dedupe_groups(items: list[Any]) -> tuple[list[int], int]:
                     group_duplicate_count,
                 )
 
-        touched_report_ids = [
-            report_id_by_document_id[document_id]
-            for document_id in [*created_ids, *updated_ids]
-            if document_id in report_id_by_document_id
-        ]
-
         def on_commit():
             if created_ids:
                 created_reports = list(Report.objects.filter(document_id__in=created_ids))
@@ -256,11 +246,6 @@ def on_commit():
                 updated_reports = list(Report.objects.filter(document_id__in=updated_ids))
                 for handler in reports_updated_handlers:
                     handler.handle(updated_reports)
-            if touched_report_ids:
-                if settings.PGSEARCH_SYNC_INDEXING:
-                    bulk_upsert_report_search_vectors(touched_report_ids)
-                else:
-                    enqueue_bulk_index_reports(touched_report_ids)
 
         transaction.on_commit(on_commit)
 
diff --git a/radis/search/site.py b/radis/search/site.py
index 5e0ac4b65..170c2fc3d 100644
--- a/radis/search/site.py
+++ b/radis/search/site.py
@@ -16,6 +16,8 @@ class ReportDocument(NamedTuple):
     study_description: str
     modalities: list[str]
     summary: str
+    cosine_distance: float | None = None
+    rrf_score: float = 0.0
 
     @property
     def full_report(self) -> Report:
diff --git a/radis/search/templates/search/_result_header.html b/radis/search/templates/search/_result_header.html
index 49a2dcfab..c201e8220 100644
--- a/radis/search/templates/search/_result_header.html
+++ b/radis/search/templates/search/_result_header.html
@@ -12,7 +12,9 @@
     <div>
         <div class="d-flex flex-column">
             <small>Result: #{{ counter|add:offset }}</small>
-            <small>Relevance: {{ document.relevance|floatformat:3 }}</small>
+            <small title="PostgreSQL full-text ts_rank; 0 for vector-only hits">FTS rank: {{ document.relevance|floatformat:3 }}</small>
+            <small title="pgvector cosine distance from the query embedding (lower = more similar)">Cosine dist: {{ document.cosine_distance|floatformat:3|default:"—" }}</small>
+            <small title="Reciprocal Rank Fusion score; this determines the result ordering">RRF score: {{ document.rrf_score|floatformat:4 }}</small>
         </div>
     </div>
 </div>
diff --git a/radis/search/tests/test_query_parser.py b/radis/search/tests/test_query_parser.py
index 28d05dd29..3ee06138e 100644
--- a/radis/search/tests/test_query_parser.py
+++ b/radis/search/tests/test_query_parser.py
@@ -136,6 +136,23 @@ def test_fixed_queries():
     assert is_fixed_query("foo \\) bar", "foo bar", 2)
 
 
+def test_strips_field_filter_syntax():
+    # The query grammar has no `field:value` syntax; structured filtering
+    # lives in `SearchFilters`. The parser therefore drops the whole token,
+    # because removing only the colon would leave a corrupted single word
+    # (`bodypneumonia`).
+    assert is_fixed_query("pneumonia body:pneumonia", "pneumonia", 1)
+    # Stripping several filters is reported as a single fix, just like one
+    # `_replace_invalid_characters` pass is.
+    assert is_empty_query("body:pneumonia patient_sex:F", 1)
+    assert is_empty_query("body:pneumonia", 1)
+    # Inside a quoted phrase the colon is left alone (operator syntax doesn't
+    # apply inside quoted strings).
+    assert is_valid_query('"body:pneumonia"')
+    # Time-like tokens with embedded colons are also stripped.
+    assert is_fixed_query("time:14:30 finding", "finding", 1)
+
+
 def test_empty_queries():
     assert is_empty_query("", 0)
     assert is_empty_query("   ", 0)
diff --git a/radis/search/tests/test_query_parser_unparse_for_embedding.py b/radis/search/tests/test_query_parser_unparse_for_embedding.py
new file mode 100644
index 000000000..833db95cc
--- /dev/null
+++ b/radis/search/tests/test_query_parser_unparse_for_embedding.py
@@ -0,0 +1,41 @@
+import pytest
+
+from radis.search.utils.query_parser import QueryParser
+
+
+@pytest.mark.parametrize(
+    "query,expected",
+    [
+        # Simple positive term — unchanged.
+        ("pneumothorax", "pneumothorax"),
+        # Phrase: quotes dropped, value preserved (embedding tokenizers handle
+        # multi-word spans natively; the quote chars are noise).
+        ('"chest x-ray"', "chest x-ray"),
+        # Implicit AND (no operator) — both sides survive, joined by a space.
+        ("cardiac arrest", "cardiac arrest"),
+        # Explicit AND — operator token dropped; bag of terms.
+        ("A AND B", "A B"),
+        # Explicit OR — operator token dropped; bag of terms.
+        ("A OR B", "A B"),
+        # NOT alone — the only term is negated, so nothing is left.
+        ("NOT pneumothorax", ""),
+        # AND NOT — left survives, NOT branch dropped, AND collapses.
+        ("A AND NOT B", "A"),
+        # NOT AND — right survives, NOT branch dropped, AND collapses.
+        ("NOT A AND B", "B"),
+        # NOT OR NOT — both branches dropped, empty.
+        ("NOT A OR NOT B", ""),
+        # Mixed AND OR with a NOT branch — grouping parens dropped,
+        # operators dropped, surviving terms joined.
+        ("(A AND NOT B) OR C", "A C"),
+        # NOT wrapped in parens — the parenthesized side renders empty.
+        ("A AND (NOT B)", "A"),
+        # Parenthesized OR combined with AND NOT — parens + operators dropped,
+        # both disjunction terms survive, the negated term does not.
+        ("(A OR B) AND NOT C", "A B"),
+    ],
+)
+def test_unparse_for_embedding(query, expected):
+    node, _fixes = QueryParser().parse(query)
+    assert node is not None, f"parser produced empty node for {query!r}"
+    assert QueryParser.unparse_for_embedding(node) == expected
diff --git a/radis/search/tests/test_views.py b/radis/search/tests/test_views.py
index 3a0f672f6..8e0290da9 100644
--- a/radis/search/tests/test_views.py
+++ b/radis/search/tests/test_views.py
@@ -324,3 +324,18 @@ def test_search_view_form_validation_errors(client: Client):
         response = client.get("/search/", search_params)
         assert response.status_code == 200
         assert "form" in response.context
+
+
+@pytest.mark.django_db
+def test_search_view_returns_200_when_embedding_provider_unset(client: Client, settings):
+    """With EMBEDDING_PROVIDER_URL blank, the search page must still render (HTTP 200)
+    by falling back to FTS-only results."""
+    settings.EMBEDDING_PROVIDER_URL = ""
+    # Drop debug-toolbar middleware for this request; the pytest-django `settings`
+    # fixture reads through to the active settings and restores them afterwards.
+    settings.MIDDLEWARE = [
+        entry for entry in settings.MIDDLEWARE if "debug_toolbar" not in entry.lower()
+    ]
+    user = create_test_user_with_active_group()
+    client.force_login(user)
+    assert client.get("/search/?query=pneumothorax").status_code == 200
diff --git a/radis/search/utils/query_parser.py b/radis/search/utils/query_parser.py
index 4782a39a1..f26ac92d0 100644
--- a/radis/search/utils/query_parser.py
+++ b/radis/search/utils/query_parser.py
@@ -141,6 +141,25 @@ def _modify_unquoted_segments(
 
         return "".join(results)
 
+    def _strip_field_filters(self, input_string: str) -> str:
+        """Drop `field:value` tokens (e.g., ``body:pneumonia``,
+        ``patient_sex:F``, ``time:14:30``) from unquoted segments.
+
+        The parser grammar has no field-filter syntax — structured field
+        filtering lives on the provider side via ``SearchFilters``. Without
+        this step the colon would be silently stripped by
+        ``_replace_invalid_characters`` and ``body:pneumonia`` would collapse
+        to ``bodypneumonia``, a meaningless token. Drop the whole token instead.
+
+        The value part stops at parentheses: this step runs after parentheses
+        have been balanced, so ``(a body:x)`` must keep its closing ``)``.
+        Quoted phrases such as ``"body:pneumonia"`` are preserved verbatim.
+        """
+        pattern = re.compile(r"\b\w+:[^\s()]+")
+        return self._modify_unquoted_segments(
+            input_string, lambda s: pattern.sub("", s)
+        )
+
     def _replace_invalid_characters(self, input_string: str) -> str:
         def handle_segment(segment: str) -> str:
             return "".join(char for char in segment if is_search_query_char(char))
@@ -244,6 +263,11 @@ def parse(self, query: str) -> tuple[QueryNode | None, list[str]]:
         if query_before != query_after:
             fixes.append("Fixed unbalanced parentheses")
 
+        query_before = query_after
+        query_after = self._strip_field_filters(query_before)
+        if query_before != query_after:
+            fixes.append("Stripped field-filter syntax (use the filter widgets instead)")
+
         query_before = query_after
         query_after = self._replace_invalid_characters(query_before)
         if query_before != query_after:
@@ -312,3 +336,41 @@ def unparse(node: QueryNode) -> str:
             )
         else:
             raise ValueError(f"Unknown node type: {type(node)}")
+
+    @staticmethod
+    def unparse_for_embedding(node: QueryNode) -> str:
+        """Flatten a parsed query into plain, space-separated terms for a
+        dense embedding model.
+
+        Only content survives; query syntax is removed:
+
+        - every ``UnaryNode("NOT", X)`` subtree is dropped (embeddings are
+          polarity-blind for negation; see spec §7.8);
+        - ``AND``/``OR`` operator tokens are dropped, since they are syntax
+          rather than content;
+        - grouping parentheses are dropped for the same reason;
+        - phrases lose their surrounding quotes but keep their text.
+
+        Surviving terms keep their left-to-right order.
+        Returns ``""`` if the whole query reduces to NOT clauses. Raises
+        ``ValueError`` for an unrecognized node type. Used by the
+        hybrid-search vector half via ``providers.search``.
+        """
+        if isinstance(node, TermNode):
+            # WORD and PHRASE terms alike contribute their bare value.
+            return node.value
+        if isinstance(node, ParensNode):
+            # Parentheses are transparent: render whatever they wrap.
+            return QueryParser.unparse_for_embedding(node.expression)
+        if isinstance(node, UnaryNode):
+            # Negated subtrees contribute nothing.
+            return ""
+        if isinstance(node, BinaryNode):
+            # Render the left side before the right one.
+            rendered = [
+                QueryParser.unparse_for_embedding(child)
+                for child in (node.left, node.right)
+            ]
+            # Sides that rendered empty vanish; survivors are joined by one space.
+            return " ".join(text for text in rendered if text)
+        raise ValueError(f"Unknown node type: {type(node)}")
diff --git a/radis/settings/base.py b/radis/settings/base.py
index 319f24853..bd3a8565e 100644
--- a/radis/settings/base.py
+++ b/radis/settings/base.py
@@ -319,9 +319,7 @@
     },
     "dbbackup": {
         "BACKEND": "django.core.files.storage.FileSystemStorage",
-        "OPTIONS": {
-            "location": env.str("DBBACKUP_STORAGE_LOCATION", default="/tmp/backups-radis")
-        },
+        "OPTIONS": {"location": env.str("DBBACKUP_STORAGE_LOCATION", default="/tmp/backups-radis")},
     },
 }
 DBBACKUP_CLEANUP_KEEP = 30
@@ -338,6 +336,27 @@
 LLM_SERVICE_DEV_PORT = env.int("LLM_SERVICE_DEV_PORT", default=8080)
 LLM_SERVICE_URL = env.str("LLM_SERVICE_URL", default=f"http://localhost:{LLM_SERVICE_DEV_PORT}/v1")
 
+# Embedding service (per-deployment)
+EMBEDDING_BACKEND = env.str("EMBEDDING_BACKEND", default="openai")  # "openai" or "ollama"
+EMBEDDING_PROVIDER_URL = env.str("EMBEDDING_PROVIDER_URL", default="")  # client errors if ""
+EMBEDDING_PROVIDER_PATH = env.str("EMBEDDING_PROVIDER_PATH", default="")  # "" = backend default
+EMBEDDING_PROVIDER_API_KEY = env.str("EMBEDDING_PROVIDER_API_KEY", default="")  # Bearer token
+EMBEDDING_MODEL_NAME = env.str("EMBEDDING_MODEL_NAME", default="Qwen/Qwen3-Embedding-4B")
+EMBEDDING_DIM = env.int("EMBEDDING_DIM", default=1024)  # longer vectors are truncated to this
+
+# Embedding tuning constants
+EMBEDDING_REQUEST_TIMEOUT = 30  # passed to httpx.Client(timeout=...), i.e. seconds
+EMBEDDING_QUERY_INSTRUCTION = (
+    "Instruct: Given a radiology search query, retrieve relevant radiology reports.\nQuery: "
+)  # prepended to the query text by EmbeddingClient.embed_query
+EMBEDDING_BATCH_SIZE = 32
+EMBEDDING_SUBJOB_SIZE = 1000
+
+# Hybrid search tuning
+HYBRID_VECTOR_TOP_K = 100
+HYBRID_FTS_MAX_RESULTS = 10_000
+HYBRID_RRF_K = 60  # NOTE(review): presumably the `k` handed to rrf_fuse — confirm
+
 # Chat
 CHAT_GENERATE_TITLE_SYSTEM_PROMPT = """
 Summarize the following conversation in $num_words words or less and in the same language as
diff --git a/radis/settings/test.py b/radis/settings/test.py
index 1c084d467..697d21e5c 100644
--- a/radis/settings/test.py
+++ b/radis/settings/test.py
@@ -10,3 +10,10 @@
     DATABASES["default"]["TEST"] = {"NAME": test_database}  # noqa: F405
 
 DEBUG_TOOLBAR_CONFIG = {"SHOW_TOOLBAR_CALLBACK": lambda request: False}
+
+# Tests must not hit a live embedding service. Embedding work is deferred via
+# a Procrastinate task; tests do not run a worker by default. With the URL
+# blank, any incidental construction of EmbeddingClient/AsyncEmbeddingClient
+# fails fast with EmbeddingClientError instead of touching the network. Tests
+# that exercise the embedding path explicitly patch the client.
+EMBEDDING_PROVIDER_URL = ""
diff --git a/uv.lock b/uv.lock
index 53c0b5467..8eec83838 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2020,6 +2020,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" },
 ]
 
+[[package]]
+name = "pgvector"
+version = "0.4.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/25/6c/6d8b4b03b958c02fa8687ec6063c49d952a189f8c91ebbe51e877dfab8f7/pgvector-0.4.2.tar.gz", hash = "sha256:322cac0c1dc5d41c9ecf782bd9991b7966685dee3a00bc873631391ed949513a", size = 31354, upload-time = "2025-12-05T01:07:17.87Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5a/26/6cee8a1ce8c43625ec561aff19df07f9776b7525d9002c86bceb3e0ac970/pgvector-0.4.2-py3-none-any.whl", hash = "sha256:549d45f7a18593783d5eec609ea1684a724ba8405c4cb182a0b2b08aeff04e08", size = 27441, upload-time = "2025-12-05T01:07:16.536Z" },
+]
+
 [[package]]
 name = "platformdirs"
 version = "4.5.1"
@@ -2768,6 +2780,7 @@ dependencies = [
     { name = "openai" },
     { name = "openpyxl" },
     { name = "pandas" },
+    { name = "pgvector" },
     { name = "procrastinate", extra = ["django"] },
     { name = "psycopg", extra = ["binary"] },
     { name = "pycountry" },
@@ -2853,6 +2866,7 @@ requires-dist = [
     { name = "openai", specifier = ">=1.64.0" },
     { name = "openpyxl", specifier = ">=3.1.5" },
     { name = "pandas", specifier = ">=2.2.3" },
+    { name = "pgvector", specifier = ">=0.3" },
     { name = "procrastinate", extras = ["django"], specifier = ">=3.0.2" },
     { name = "psycopg", extras = ["binary"], specifier = ">=3.2.5" },
     { name = "pycountry", specifier = ">=24.6.1" },