From f82f2f95c33e7479c50dfdea1ed75cbb94eadbbd Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 17:20:09 +0000 Subject: [PATCH 01/68] feat(pgsearch): add pgvector dependency for hybrid search --- pyproject.toml | 1 + uv.lock | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index cb901ddd..0707351a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "openai>=1.64.0", "openpyxl>=3.1.5", "pandas>=2.2.3", + "pgvector>=0.3", "procrastinate[django]>=3.0.2", "psycopg[binary]>=3.2.5", "pycountry>=24.6.1", diff --git a/uv.lock b/uv.lock index 53c0b546..8eec8383 100644 --- a/uv.lock +++ b/uv.lock @@ -2020,6 +2020,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" }, ] +[[package]] +name = "pgvector" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/25/6c/6d8b4b03b958c02fa8687ec6063c49d952a189f8c91ebbe51e877dfab8f7/pgvector-0.4.2.tar.gz", hash = "sha256:322cac0c1dc5d41c9ecf782bd9991b7966685dee3a00bc873631391ed949513a", size = 31354, upload-time = "2025-12-05T01:07:17.87Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/26/6cee8a1ce8c43625ec561aff19df07f9776b7525d9002c86bceb3e0ac970/pgvector-0.4.2-py3-none-any.whl", hash = "sha256:549d45f7a18593783d5eec609ea1684a724ba8405c4cb182a0b2b08aeff04e08", size = 27441, upload-time = "2025-12-05T01:07:16.536Z" }, +] + [[package]] name = "platformdirs" version = "4.5.1" @@ -2768,6 +2780,7 @@ dependencies = [ { name = "openai" }, { name = "openpyxl" }, { name = "pandas" }, + { name = "pgvector" }, { name = "procrastinate", extra = ["django"] }, { name = 
"psycopg", extra = ["binary"] }, { name = "pycountry" }, @@ -2853,6 +2866,7 @@ requires-dist = [ { name = "openai", specifier = ">=1.64.0" }, { name = "openpyxl", specifier = ">=3.1.5" }, { name = "pandas", specifier = ">=2.2.3" }, + { name = "pgvector", specifier = ">=0.3" }, { name = "procrastinate", extras = ["django"], specifier = ">=3.0.2" }, { name = "psycopg", extras = ["binary"], specifier = ">=3.2.5" }, { name = "pycountry", specifier = ">=24.6.1" }, From ca893528f48b04753594de425874251bc965c5e6 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 17:21:47 +0000 Subject: [PATCH 02/68] feat(pgsearch): add embedding and hybrid search settings --- radis/settings/base.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/radis/settings/base.py b/radis/settings/base.py index 319f2485..d9436ef8 100644 --- a/radis/settings/base.py +++ b/radis/settings/base.py @@ -338,6 +338,32 @@ LLM_SERVICE_DEV_PORT = env.int("LLM_SERVICE_DEV_PORT", default=8080) LLM_SERVICE_URL = env.str("LLM_SERVICE_URL", default=f"http://localhost:{LLM_SERVICE_DEV_PORT}/v1") +# Embedding service (per-deployment, see hybrid-search spec §8.1) +EMBEDDING_BACKEND = env.str("EMBEDDING_BACKEND", default="openai") +EMBEDDING_PROVIDER_URL = env.str("EMBEDDING_PROVIDER_URL", default="") +EMBEDDING_PROVIDER_PATH = env.str("EMBEDDING_PROVIDER_PATH", default="") +EMBEDDING_PROVIDER_API_KEY = env.str("EMBEDDING_PROVIDER_API_KEY", default="") +EMBEDDING_MODEL_NAME = env.str("EMBEDDING_MODEL_NAME", default="Qwen/Qwen3-Embedding-4B") +EMBEDDING_DIM = env.int("EMBEDDING_DIM", default=1024) + +# Embedding tuning constants (see hybrid-search spec §8.2) +EMBEDDING_REQUEST_TIMEOUT = 30 +EMBEDDING_MAX_INPUT_CHARS = 60_000 +EMBEDDING_QUERY_INSTRUCTION = ( + "Instruct: Given a radiology search query, retrieve relevant radiology reports.\n" + "Query: " +) +EMBEDDING_BATCH_SIZE = 32 + +# Embedding queue priorities (procrastinate "higher = sooner") +EMBEDDING_INDEX_PRIORITY = 0 
+EMBEDDING_BACKFILL_PRIORITY = -1 + +# Hybrid search tuning +HYBRID_VECTOR_TOP_K = 100 +HYBRID_FTS_MAX_RESULTS = 10_000 +HYBRID_RRF_K = 60 + # Chat CHAT_GENERATE_TITLE_SYSTEM_PROMPT = """ Summarize the following conversation in $num_words words or less and in the same language as From 047749678494ce796401b93a93e6884227df9ca1 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 17:24:10 +0000 Subject: [PATCH 03/68] feat(pgsearch): add embedding backend protocol and built-in OpenAI/Ollama backends Co-Authored-By: Claude Sonnet 4.6 --- radis/pgsearch/tests/test_embedding_client.py | 59 +++++++++++++++++++ radis/pgsearch/utils/embedding_client.py | 51 ++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 radis/pgsearch/tests/test_embedding_client.py create mode 100644 radis/pgsearch/utils/embedding_client.py diff --git a/radis/pgsearch/tests/test_embedding_client.py b/radis/pgsearch/tests/test_embedding_client.py new file mode 100644 index 00000000..1bcc4c75 --- /dev/null +++ b/radis/pgsearch/tests/test_embedding_client.py @@ -0,0 +1,59 @@ +import pytest + +from radis.pgsearch.utils.embedding_client import ( + BACKENDS, + OllamaBackend, + OpenAIBackend, +) + + +def test_openai_backend_builds_payload(): + backend = OpenAIBackend() + payload = backend.build_payload(model="m1", texts=["a", "b"]) + assert payload == {"model": "m1", "input": ["a", "b"]} + + +def test_openai_backend_default_path(): + assert OpenAIBackend().path == "/v1/embeddings" + + +def test_openai_backend_parses_response(): + backend = OpenAIBackend() + body = {"data": [{"embedding": [0.1, 0.2]}, {"embedding": [0.3, 0.4]}]} + assert backend.parse_response(body) == [[0.1, 0.2], [0.3, 0.4]] + + +def test_openai_backend_parse_raises_on_missing_data_key(): + from radis.pgsearch.utils.embedding_client import EmbeddingClientError + + backend = OpenAIBackend() + with pytest.raises(EmbeddingClientError): + backend.parse_response({"oops": []}) + + +def 
test_ollama_backend_builds_payload(): + backend = OllamaBackend() + payload = backend.build_payload(model="m1", texts=["a", "b"]) + assert payload == {"model": "m1", "input": ["a", "b"]} + + +def test_ollama_backend_default_path(): + assert OllamaBackend().path == "/api/embed" + + +def test_ollama_backend_parses_response(): + backend = OllamaBackend() + body = {"embeddings": [[0.1, 0.2], [0.3, 0.4]]} + assert backend.parse_response(body) == [[0.1, 0.2], [0.3, 0.4]] + + +def test_ollama_backend_parse_raises_on_missing_key(): + from radis.pgsearch.utils.embedding_client import EmbeddingClientError + + backend = OllamaBackend() + with pytest.raises(EmbeddingClientError): + backend.parse_response({"data": []}) + + +def test_backends_registry_keys(): + assert set(BACKENDS.keys()) == {"openai", "ollama"} diff --git a/radis/pgsearch/utils/embedding_client.py b/radis/pgsearch/utils/embedding_client.py new file mode 100644 index 00000000..74fc6faf --- /dev/null +++ b/radis/pgsearch/utils/embedding_client.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from typing import Protocol + + +class EmbeddingClientError(Exception): + """Raised when the embedding service returns an error or a malformed response.""" + + +class EmbeddingBackend(Protocol): + path: str + + def build_payload(self, model: str, texts: list[str]) -> dict: ... + + def parse_response(self, body: dict) -> list[list[float]]: ... 
+ + +class OpenAIBackend: + path: str = "/v1/embeddings" + + def build_payload(self, model: str, texts: list[str]) -> dict: + return {"model": model, "input": texts} + + def parse_response(self, body: dict) -> list[list[float]]: + try: + return [item["embedding"] for item in body["data"]] + except (KeyError, TypeError) as e: + raise EmbeddingClientError( + f"OpenAI-style response missing 'data[*].embedding': {e}" + ) from e + + +class OllamaBackend: + path: str = "/api/embed" + + def build_payload(self, model: str, texts: list[str]) -> dict: + return {"model": model, "input": texts} + + def parse_response(self, body: dict) -> list[list[float]]: + try: + return list(body["embeddings"]) + except (KeyError, TypeError) as e: + raise EmbeddingClientError( + f"Ollama-style response missing 'embeddings': {e}" + ) from e + + +BACKENDS: dict[str, EmbeddingBackend] = { + "openai": OpenAIBackend(), + "ollama": OllamaBackend(), +} From 38262150d20a81c994274690bb7261ad2d56d1bb Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 17:28:23 +0000 Subject: [PATCH 04/68] feat(pgsearch): add sync EmbeddingClient with normalize, truncate, errors Co-Authored-By: Claude Sonnet 4.6 --- radis/pgsearch/tests/test_embedding_client.py | 177 ++++++++++++++++++ radis/pgsearch/utils/embedding_client.py | 87 ++++++++- 2 files changed, 263 insertions(+), 1 deletion(-) diff --git a/radis/pgsearch/tests/test_embedding_client.py b/radis/pgsearch/tests/test_embedding_client.py index 1bcc4c75..426e339d 100644 --- a/radis/pgsearch/tests/test_embedding_client.py +++ b/radis/pgsearch/tests/test_embedding_client.py @@ -1,4 +1,8 @@ +import json + +import httpx import pytest +from django.test import override_settings from radis.pgsearch.utils.embedding_client import ( BACKENDS, @@ -57,3 +61,176 @@ def test_ollama_backend_parse_raises_on_missing_key(): def test_backends_registry_keys(): assert set(BACKENDS.keys()) == {"openai", "ollama"} + + +def _mock_transport(handler): + """Build an httpx 
MockTransport that delegates to a handler(request).""" + return httpx.MockTransport(handler) + + +@override_settings( + EMBEDDING_BACKEND="openai", + EMBEDDING_PROVIDER_URL="http://embed.example", + EMBEDDING_PROVIDER_PATH="", + EMBEDDING_PROVIDER_API_KEY="secret", + EMBEDDING_MODEL_NAME="qwen3", + EMBEDDING_DIM=4, + EMBEDDING_REQUEST_TIMEOUT=10, + EMBEDDING_MAX_INPUT_CHARS=100, + EMBEDDING_QUERY_INSTRUCTION="INST: ", +) +def test_embed_documents_posts_payload_and_normalizes(monkeypatch): + from radis.pgsearch.utils import embedding_client as ec + + seen = {} + + def handler(request: httpx.Request) -> httpx.Response: + seen["url"] = str(request.url) + seen["auth"] = request.headers.get("authorization") + seen["body"] = json.loads(request.content) + return httpx.Response( + 200, json={"data": [{"embedding": [3.0, 0.0, 0.0, 4.0]}]} + ) + + monkeypatch.setattr( + ec, "_build_http_client", lambda: httpx.Client(transport=_mock_transport(handler)) + ) + + client = ec.EmbeddingClient() + vectors = client.embed_documents(["hello"]) + + assert seen["url"] == "http://embed.example/v1/embeddings" + assert seen["auth"] == "Bearer secret" + assert seen["body"] == {"model": "qwen3", "input": ["hello"]} + # L2-normalized: original norm = 5, normalized = [0.6, 0, 0, 0.8] + assert len(vectors) == 1 + assert vectors[0] == pytest.approx([0.6, 0.0, 0.0, 0.8]) + + +@override_settings( + EMBEDDING_BACKEND="openai", + EMBEDDING_PROVIDER_URL="http://embed.example", + EMBEDDING_PROVIDER_PATH="/api/embeddings", + EMBEDDING_PROVIDER_API_KEY="", + EMBEDDING_MODEL_NAME="qwen3", + EMBEDDING_DIM=2, + EMBEDDING_REQUEST_TIMEOUT=10, + EMBEDDING_MAX_INPUT_CHARS=100, + EMBEDDING_QUERY_INSTRUCTION="", +) +def test_provider_path_override(monkeypatch): + from radis.pgsearch.utils import embedding_client as ec + + seen = {} + + def handler(request: httpx.Request) -> httpx.Response: + seen["url"] = str(request.url) + return httpx.Response(200, json={"data": [{"embedding": [1.0, 0.0]}]}) + + 
monkeypatch.setattr( + ec, "_build_http_client", lambda: httpx.Client(transport=_mock_transport(handler)) + ) + ec.EmbeddingClient().embed_documents(["x"]) + assert seen["url"] == "http://embed.example/api/embeddings" + + +@override_settings( + EMBEDDING_BACKEND="openai", + EMBEDDING_PROVIDER_URL="http://embed.example", + EMBEDDING_PROVIDER_PATH="", + EMBEDDING_PROVIDER_API_KEY="", + EMBEDDING_MODEL_NAME="qwen3", + EMBEDDING_DIM=2, + EMBEDDING_REQUEST_TIMEOUT=10, + EMBEDDING_MAX_INPUT_CHARS=100, + EMBEDDING_QUERY_INSTRUCTION="INST: ", +) +def test_embed_query_prepends_instruction(monkeypatch): + from radis.pgsearch.utils import embedding_client as ec + + seen = {} + + def handler(request: httpx.Request) -> httpx.Response: + seen["body"] = json.loads(request.content) + return httpx.Response(200, json={"data": [{"embedding": [1.0, 0.0]}]}) + + monkeypatch.setattr( + ec, "_build_http_client", lambda: httpx.Client(transport=_mock_transport(handler)) + ) + ec.EmbeddingClient().embed_query("hello") + assert seen["body"]["input"] == ["INST: hello"] + + +@override_settings( + EMBEDDING_BACKEND="openai", + EMBEDDING_PROVIDER_URL="http://embed.example", + EMBEDDING_PROVIDER_PATH="", + EMBEDDING_PROVIDER_API_KEY="", + EMBEDDING_MODEL_NAME="qwen3", + EMBEDDING_DIM=2, + EMBEDDING_REQUEST_TIMEOUT=10, + EMBEDDING_MAX_INPUT_CHARS=5, + EMBEDDING_QUERY_INSTRUCTION="", +) +def test_truncates_long_input(monkeypatch): + from radis.pgsearch.utils import embedding_client as ec + + seen = {} + + def handler(request: httpx.Request) -> httpx.Response: + seen["body"] = json.loads(request.content) + return httpx.Response(200, json={"data": [{"embedding": [1.0, 0.0]}]}) + + monkeypatch.setattr( + ec, "_build_http_client", lambda: httpx.Client(transport=_mock_transport(handler)) + ) + ec.EmbeddingClient().embed_documents(["abcdefghij"]) + assert seen["body"]["input"] == ["abcde"] + + +@override_settings( + EMBEDDING_BACKEND="openai", + EMBEDDING_PROVIDER_URL="http://embed.example", + 
EMBEDDING_PROVIDER_PATH="", + EMBEDDING_PROVIDER_API_KEY="", + EMBEDDING_MODEL_NAME="qwen3", + EMBEDDING_DIM=2, + EMBEDDING_REQUEST_TIMEOUT=10, + EMBEDDING_MAX_INPUT_CHARS=100, + EMBEDDING_QUERY_INSTRUCTION="", +) +def test_dim_mismatch_raises(monkeypatch): + from radis.pgsearch.utils import embedding_client as ec + + def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response(200, json={"data": [{"embedding": [1.0, 0.0, 3.0]}]}) + + monkeypatch.setattr( + ec, "_build_http_client", lambda: httpx.Client(transport=_mock_transport(handler)) + ) + with pytest.raises(ec.EmbeddingClientError): + ec.EmbeddingClient().embed_documents(["x"]) + + +@override_settings( + EMBEDDING_BACKEND="openai", + EMBEDDING_PROVIDER_URL="http://embed.example", + EMBEDDING_PROVIDER_PATH="", + EMBEDDING_PROVIDER_API_KEY="", + EMBEDDING_MODEL_NAME="qwen3", + EMBEDDING_DIM=2, + EMBEDDING_REQUEST_TIMEOUT=10, + EMBEDDING_MAX_INPUT_CHARS=100, + EMBEDDING_QUERY_INSTRUCTION="", +) +def test_5xx_raises(monkeypatch): + from radis.pgsearch.utils import embedding_client as ec + + def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response(503, text="service unavailable") + + monkeypatch.setattr( + ec, "_build_http_client", lambda: httpx.Client(transport=_mock_transport(handler)) + ) + with pytest.raises(ec.EmbeddingClientError): + ec.EmbeddingClient().embed_documents(["x"]) diff --git a/radis/pgsearch/utils/embedding_client.py b/radis/pgsearch/utils/embedding_client.py index 74fc6faf..1b97402d 100644 --- a/radis/pgsearch/utils/embedding_client.py +++ b/radis/pgsearch/utils/embedding_client.py @@ -1,6 +1,11 @@ from __future__ import annotations -from typing import Protocol +import logging +import math +from typing import Iterable, Protocol + +import httpx +from django.conf import settings class EmbeddingClientError(Exception): @@ -49,3 +54,83 @@ def parse_response(self, body: dict) -> list[list[float]]: "openai": OpenAIBackend(), "ollama": OllamaBackend(), } + 
+logger = logging.getLogger(__name__) + + +def _build_http_client() -> httpx.Client: + """Indirection so tests can swap in a MockTransport.""" + return httpx.Client(timeout=settings.EMBEDDING_REQUEST_TIMEOUT) + + +def _l2_normalize(vec: list[float]) -> list[float]: + norm = math.sqrt(sum(x * x for x in vec)) + if norm == 0.0: + return vec + return [x / norm for x in vec] + + +def _truncate(texts: Iterable[str], max_chars: int) -> list[str]: + out: list[str] = [] + for t in texts: + if len(t) > max_chars: + logger.warning( + "Truncating embedding input from %d to %d chars", len(t), max_chars + ) + out.append(t[:max_chars]) + else: + out.append(t) + return out + + +class EmbeddingClient: + def __init__(self) -> None: + try: + self._backend = BACKENDS[settings.EMBEDDING_BACKEND] + except KeyError as e: + raise EmbeddingClientError( + f"Unknown EMBEDDING_BACKEND={settings.EMBEDDING_BACKEND!r}; " + f"known: {sorted(BACKENDS)}" + ) from e + path = settings.EMBEDDING_PROVIDER_PATH or self._backend.path + base = settings.EMBEDDING_PROVIDER_URL.rstrip("/") + if not base: + raise EmbeddingClientError("EMBEDDING_PROVIDER_URL is not configured") + self._url = f"{base}{path}" + self._model = settings.EMBEDDING_MODEL_NAME + self._dim = settings.EMBEDDING_DIM + self._max_chars = settings.EMBEDDING_MAX_INPUT_CHARS + self._instruction = settings.EMBEDDING_QUERY_INSTRUCTION + self._headers: dict[str, str] = {} + if settings.EMBEDDING_PROVIDER_API_KEY: + self._headers["Authorization"] = f"Bearer {settings.EMBEDDING_PROVIDER_API_KEY}" + + def embed_documents(self, texts: list[str]) -> list[list[float]]: + truncated = _truncate(texts, self._max_chars) + payload = self._backend.build_payload(self._model, truncated) + with _build_http_client() as http: + try: + response = http.post(self._url, json=payload, headers=self._headers) + except httpx.HTTPError as e: + raise EmbeddingClientError(f"HTTP error contacting {self._url}: {e}") from e + if response.status_code >= 400: + raise 
EmbeddingClientError( + f"Embedding service returned {response.status_code}: {response.text[:200]}" + ) + try: + body = response.json() + except ValueError as e: + raise EmbeddingClientError(f"Embedding response is not JSON: {e}") from e + raw = self._backend.parse_response(body) + normalized: list[list[float]] = [] + for vec in raw: + if len(vec) != self._dim: + raise EmbeddingClientError( + f"Embedding dim mismatch: got {len(vec)}, expected {self._dim}" + ) + normalized.append(_l2_normalize(list(vec))) + return normalized + + def embed_query(self, text: str) -> list[float]: + prefixed = f"{self._instruction}{text}" if self._instruction else text + return self.embed_documents([prefixed])[0] From 8d4fa001ef8eb77bf0a6b12dae03f4011f90b586 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 17:31:18 +0000 Subject: [PATCH 05/68] refactor(pgsearch): reuse httpx.Client across embedding batches; inline _mock_transport --- radis/pgsearch/tests/test_embedding_client.py | 16 ++++++---------- radis/pgsearch/utils/embedding_client.py | 10 +++++----- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/radis/pgsearch/tests/test_embedding_client.py b/radis/pgsearch/tests/test_embedding_client.py index 426e339d..a591ad3c 100644 --- a/radis/pgsearch/tests/test_embedding_client.py +++ b/radis/pgsearch/tests/test_embedding_client.py @@ -63,10 +63,6 @@ def test_backends_registry_keys(): assert set(BACKENDS.keys()) == {"openai", "ollama"} -def _mock_transport(handler): - """Build an httpx MockTransport that delegates to a handler(request).""" - return httpx.MockTransport(handler) - @override_settings( EMBEDDING_BACKEND="openai", @@ -93,7 +89,7 @@ def handler(request: httpx.Request) -> httpx.Response: ) monkeypatch.setattr( - ec, "_build_http_client", lambda: httpx.Client(transport=_mock_transport(handler)) + ec, "_build_http_client", lambda: httpx.Client(transport=httpx.MockTransport(handler)) ) client = ec.EmbeddingClient() @@ -128,7 +124,7 @@ def 
handler(request: httpx.Request) -> httpx.Response: return httpx.Response(200, json={"data": [{"embedding": [1.0, 0.0]}]}) monkeypatch.setattr( - ec, "_build_http_client", lambda: httpx.Client(transport=_mock_transport(handler)) + ec, "_build_http_client", lambda: httpx.Client(transport=httpx.MockTransport(handler)) ) ec.EmbeddingClient().embed_documents(["x"]) assert seen["url"] == "http://embed.example/api/embeddings" @@ -155,7 +151,7 @@ def handler(request: httpx.Request) -> httpx.Response: return httpx.Response(200, json={"data": [{"embedding": [1.0, 0.0]}]}) monkeypatch.setattr( - ec, "_build_http_client", lambda: httpx.Client(transport=_mock_transport(handler)) + ec, "_build_http_client", lambda: httpx.Client(transport=httpx.MockTransport(handler)) ) ec.EmbeddingClient().embed_query("hello") assert seen["body"]["input"] == ["INST: hello"] @@ -182,7 +178,7 @@ def handler(request: httpx.Request) -> httpx.Response: return httpx.Response(200, json={"data": [{"embedding": [1.0, 0.0]}]}) monkeypatch.setattr( - ec, "_build_http_client", lambda: httpx.Client(transport=_mock_transport(handler)) + ec, "_build_http_client", lambda: httpx.Client(transport=httpx.MockTransport(handler)) ) ec.EmbeddingClient().embed_documents(["abcdefghij"]) assert seen["body"]["input"] == ["abcde"] @@ -206,7 +202,7 @@ def handler(request: httpx.Request) -> httpx.Response: return httpx.Response(200, json={"data": [{"embedding": [1.0, 0.0, 3.0]}]}) monkeypatch.setattr( - ec, "_build_http_client", lambda: httpx.Client(transport=_mock_transport(handler)) + ec, "_build_http_client", lambda: httpx.Client(transport=httpx.MockTransport(handler)) ) with pytest.raises(ec.EmbeddingClientError): ec.EmbeddingClient().embed_documents(["x"]) @@ -230,7 +226,7 @@ def handler(request: httpx.Request) -> httpx.Response: return httpx.Response(503, text="service unavailable") monkeypatch.setattr( - ec, "_build_http_client", lambda: httpx.Client(transport=_mock_transport(handler)) + ec, "_build_http_client", 
lambda: httpx.Client(transport=httpx.MockTransport(handler)) ) with pytest.raises(ec.EmbeddingClientError): ec.EmbeddingClient().embed_documents(["x"]) diff --git a/radis/pgsearch/utils/embedding_client.py b/radis/pgsearch/utils/embedding_client.py index 1b97402d..707a9306 100644 --- a/radis/pgsearch/utils/embedding_client.py +++ b/radis/pgsearch/utils/embedding_client.py @@ -104,15 +104,15 @@ def __init__(self) -> None: self._headers: dict[str, str] = {} if settings.EMBEDDING_PROVIDER_API_KEY: self._headers["Authorization"] = f"Bearer {settings.EMBEDDING_PROVIDER_API_KEY}" + self._http = _build_http_client() def embed_documents(self, texts: list[str]) -> list[list[float]]: truncated = _truncate(texts, self._max_chars) payload = self._backend.build_payload(self._model, truncated) - with _build_http_client() as http: - try: - response = http.post(self._url, json=payload, headers=self._headers) - except httpx.HTTPError as e: - raise EmbeddingClientError(f"HTTP error contacting {self._url}: {e}") from e + try: + response = self._http.post(self._url, json=payload, headers=self._headers) + except httpx.HTTPError as e: + raise EmbeddingClientError(f"HTTP error contacting {self._url}: {e}") from e if response.status_code >= 400: raise EmbeddingClientError( f"Embedding service returned {response.status_code}: {response.text[:200]}" From 8376c63cf90b8e1554212989126ce20ccb1e1369 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 17:34:04 +0000 Subject: [PATCH 06/68] feat(pgsearch): add close()/__enter__/__exit__ to EmbeddingClient --- radis/pgsearch/tests/test_embedding_client.py | 58 +++++++++++++++++++ radis/pgsearch/utils/embedding_client.py | 9 +++ 2 files changed, 67 insertions(+) diff --git a/radis/pgsearch/tests/test_embedding_client.py b/radis/pgsearch/tests/test_embedding_client.py index a591ad3c..56f97cfd 100644 --- a/radis/pgsearch/tests/test_embedding_client.py +++ b/radis/pgsearch/tests/test_embedding_client.py @@ -230,3 +230,61 @@ def 
handler(request: httpx.Request) -> httpx.Response: ) with pytest.raises(ec.EmbeddingClientError): ec.EmbeddingClient().embed_documents(["x"]) + + +@override_settings( + EMBEDDING_BACKEND="openai", + EMBEDDING_PROVIDER_URL="http://embed.example", + EMBEDDING_PROVIDER_PATH="", + EMBEDDING_PROVIDER_API_KEY="", + EMBEDDING_MODEL_NAME="qwen3", + EMBEDDING_DIM=2, + EMBEDDING_REQUEST_TIMEOUT=10, + EMBEDDING_MAX_INPUT_CHARS=100, + EMBEDDING_QUERY_INSTRUCTION="", +) +def test_close_releases_http_client(monkeypatch): + from radis.pgsearch.utils import embedding_client as ec + + closed = {"value": False} + + class TrackingClient: + def post(self, *args, **kwargs): + raise AssertionError("not used in this test") + + def close(self): + closed["value"] = True + + monkeypatch.setattr(ec, "_build_http_client", lambda: TrackingClient()) + client = ec.EmbeddingClient() + client.close() + assert closed["value"] is True + + +@override_settings( + EMBEDDING_BACKEND="openai", + EMBEDDING_PROVIDER_URL="http://embed.example", + EMBEDDING_PROVIDER_PATH="", + EMBEDDING_PROVIDER_API_KEY="", + EMBEDDING_MODEL_NAME="qwen3", + EMBEDDING_DIM=2, + EMBEDDING_REQUEST_TIMEOUT=10, + EMBEDDING_MAX_INPUT_CHARS=100, + EMBEDDING_QUERY_INSTRUCTION="", +) +def test_context_manager_closes_http_client(monkeypatch): + from radis.pgsearch.utils import embedding_client as ec + + closed = {"value": False} + + class TrackingClient: + def post(self, *args, **kwargs): + raise AssertionError("not used in this test") + + def close(self): + closed["value"] = True + + monkeypatch.setattr(ec, "_build_http_client", lambda: TrackingClient()) + with ec.EmbeddingClient(): + pass + assert closed["value"] is True diff --git a/radis/pgsearch/utils/embedding_client.py b/radis/pgsearch/utils/embedding_client.py index 707a9306..73d76582 100644 --- a/radis/pgsearch/utils/embedding_client.py +++ b/radis/pgsearch/utils/embedding_client.py @@ -134,3 +134,12 @@ def embed_documents(self, texts: list[str]) -> list[list[float]]: def 
embed_query(self, text: str) -> list[float]: prefixed = f"{self._instruction}{text}" if self._instruction else text return self.embed_documents([prefixed])[0] + + def close(self) -> None: + self._http.close() + + def __enter__(self) -> "EmbeddingClient": + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + self.close() From afe5840e1143d5c8398903be5156d2a9802039f8 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 17:49:00 +0000 Subject: [PATCH 07/68] feat(pgsearch): add RRF fusion helper and empty-headline fallback --- radis/pgsearch/tests/test_fusion.py | 60 +++++++++++++++++++++++++++++ radis/pgsearch/utils/fusion.py | 30 +++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 radis/pgsearch/tests/test_fusion.py create mode 100644 radis/pgsearch/utils/fusion.py diff --git a/radis/pgsearch/tests/test_fusion.py b/radis/pgsearch/tests/test_fusion.py new file mode 100644 index 00000000..36fe638c --- /dev/null +++ b/radis/pgsearch/tests/test_fusion.py @@ -0,0 +1,60 @@ +from radis.pgsearch.utils.fusion import rrf_fuse, summary_with_fallback + + +def test_rrf_both_sides_have_hits_overlap(): + vec_rank = {1: 1, 2: 2, 3: 3} + fts_rank = {2: 1, 3: 2, 4: 3} + # Expected scores (k=60): + # 1: 1/(60+1) = 0.01639 + # 2: 1/(61)+1/(61) = 0.03279 + # 3: 1/(63)+1/(62) = 0.03200 + # 4: 1/(63) = 0.01587 + assert rrf_fuse(vec_rank, fts_rank, k=60) == [2, 3, 1, 4] + + +def test_rrf_disjoint_universes(): + vec_rank = {1: 1} + fts_rank = {2: 1} + assert rrf_fuse(vec_rank, fts_rank, k=60) == [1, 2] + + +def test_rrf_only_fts(): + vec_rank: dict[int, int] = {} + fts_rank = {10: 1, 20: 2, 30: 3} + assert rrf_fuse(vec_rank, fts_rank, k=60) == [10, 20, 30] + + +def test_rrf_only_vec(): + vec_rank = {10: 1, 20: 2, 30: 3} + fts_rank: dict[int, int] = {} + assert rrf_fuse(vec_rank, fts_rank, k=60) == [10, 20, 30] + + +def test_rrf_empty(): + assert rrf_fuse({}, {}, k=60) == [] + + +def test_rrf_tiebreak_by_id(): + # Two ids with identical 
contributions; smaller id wins. + vec_rank = {2: 1} + fts_rank = {1: 1} + # Both contribute 1/61. Tiebreak by id ascending. + assert rrf_fuse(vec_rank, fts_rank, k=60) == [1, 2] + + +def test_summary_with_fallback_keeps_nonempty(): + assert summary_with_fallback("any body", "an existing headline", 30) == ( + "an existing headline" + ) + + +def test_summary_with_fallback_uses_body_head_when_empty(): + body = " ".join(f"word{i}" for i in range(100)) + out = summary_with_fallback(body, "", max_words=5) + assert out == "word0 word1 word2 word3 word4" + + +def test_summary_with_fallback_short_body(): + assert summary_with_fallback("only three words here", "", max_words=10) == ( + "only three words here" + ) diff --git a/radis/pgsearch/utils/fusion.py b/radis/pgsearch/utils/fusion.py new file mode 100644 index 00000000..948731ef --- /dev/null +++ b/radis/pgsearch/utils/fusion.py @@ -0,0 +1,30 @@ +def rrf_fuse( + vec_rank: dict[int, int], + fts_rank: dict[int, int], + k: int, +) -> list[int]: + """Reciprocal Rank Fusion. + + vec_rank and fts_rank map report_id -> 1-based rank position in each retriever. + Returns report ids ordered by descending RRF score, with stable id tiebreak. + """ + all_ids = set(vec_rank) | set(fts_rank) + + def score(rid: int) -> float: + s = 0.0 + if rid in vec_rank: + s += 1.0 / (k + vec_rank[rid]) + if rid in fts_rank: + s += 1.0 / (k + fts_rank[rid]) + return s + + return sorted(all_ids, key=lambda rid: (-score(rid), rid)) + + +def summary_with_fallback(body: str, summary: str, max_words: int) -> str: + """SearchHeadline returns '' for documents that don't match the tsquery + (e.g., vector-only hits). 
Fall back to the first `max_words` words of the body.""" + if summary: + return summary + words = body.split() + return " ".join(words[:max_words]) From ba9fa24c861ca7652ae1e1b6218b6096d677d48e Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 18:09:09 +0000 Subject: [PATCH 08/68] feat(pgsearch): install pgvector extension and swap to pgvector/pgvector:pg17 image Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.base.yml | 2 +- radis/pgsearch/migrations/0002_pgvector_extension.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 radis/pgsearch/migrations/0002_pgvector_extension.py diff --git a/docker-compose.base.yml b/docker-compose.base.yml index 7fa3cc47..36b2e7d5 100644 --- a/docker-compose.base.yml +++ b/docker-compose.base.yml @@ -55,7 +55,7 @@ services: hostname: llm_worker.local postgres: - image: postgres:17 + image: pgvector/pgvector:pg17 hostname: postgres.local volumes: - postgres_data:/var/lib/postgresql/data diff --git a/radis/pgsearch/migrations/0002_pgvector_extension.py b/radis/pgsearch/migrations/0002_pgvector_extension.py new file mode 100644 index 00000000..c862dbce --- /dev/null +++ b/radis/pgsearch/migrations/0002_pgvector_extension.py @@ -0,0 +1,11 @@ +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [("pgsearch", "0001_initial")] + operations = [ + migrations.RunSQL( + sql="CREATE EXTENSION IF NOT EXISTS vector;", + reverse_sql=migrations.RunSQL.noop, + ), + ] From 3cee09e2c12800683b94663a369da9e2e4f9f384 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 18:23:20 +0000 Subject: [PATCH 09/68] feat(pgsearch): add nullable embedding column with HNSW cosine index Adds VectorField(dimensions=1024, null=True) to ReportSearchVector and creates an HNSW index (m=16, ef_construction=64, vector_cosine_ops) for efficient cosine similarity search. 
Co-Authored-By: Claude Sonnet 4.6 --- .../migrations/0003_report_embedding.py | 25 +++++++++++++++++++ radis/pgsearch/models.py | 14 ++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 radis/pgsearch/migrations/0003_report_embedding.py diff --git a/radis/pgsearch/migrations/0003_report_embedding.py b/radis/pgsearch/migrations/0003_report_embedding.py new file mode 100644 index 00000000..b014e102 --- /dev/null +++ b/radis/pgsearch/migrations/0003_report_embedding.py @@ -0,0 +1,25 @@ +# Generated by Django 6.0.1 on 2026-05-15 18:19 + +import pgvector.django.indexes +import pgvector.django.vector +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('pgsearch', '0002_pgvector_extension'), + ('reports', '0013_alter_report_options'), + ] + + operations = [ + migrations.AddField( + model_name='reportsearchvector', + name='embedding', + field=pgvector.django.vector.VectorField(dimensions=1024, null=True), + ), + migrations.AddIndex( + model_name='reportsearchvector', + index=pgvector.django.indexes.HnswIndex(ef_construction=64, fields=['embedding'], m=16, name='pgsearch_embedding_hnsw', opclasses=['vector_cosine_ops']), + ), + ] diff --git a/radis/pgsearch/models.py b/radis/pgsearch/models.py index 63550fca..5cd90e8b 100644 --- a/radis/pgsearch/models.py +++ b/radis/pgsearch/models.py @@ -1,6 +1,8 @@ +from django.conf import settings from django.contrib.postgres.indexes import GinIndex from django.contrib.postgres.search import SearchVector, SearchVectorField from django.db import models +from pgvector.django import HnswIndex, VectorField from radis.reports.models import Report @@ -10,9 +12,19 @@ class ReportSearchVector(models.Model): report = models.OneToOneField(Report, on_delete=models.CASCADE, related_name="search_vector") search_vector = SearchVectorField(null=True) + embedding = VectorField(dimensions=settings.EMBEDDING_DIM, null=True) class Meta: - indexes = 
[GinIndex(fields=["search_vector"])] + indexes = [ + GinIndex(fields=["search_vector"]), + HnswIndex( + name="pgsearch_embedding_hnsw", + fields=["embedding"], + m=16, + ef_construction=64, + opclasses=["vector_cosine_ops"], + ), + ] def __str__(self) -> str: return f"Report {self.report.id} search vector" From ea620f29897576c472f8aabfd0c011540d4210c9 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 18:39:22 +0000 Subject: [PATCH 10/68] feat(pgsearch): add embed_reports task on dedicated embeddings queue Co-Authored-By: Claude Sonnet 4.6 --- radis/pgsearch/tasks.py | 45 +++++++++++ .../pgsearch/tests/test_embed_reports_task.py | 77 +++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 radis/pgsearch/tests/test_embed_reports_task.py diff --git a/radis/pgsearch/tasks.py b/radis/pgsearch/tasks.py index 2645a28d..6b906023 100644 --- a/radis/pgsearch/tasks.py +++ b/radis/pgsearch/tasks.py @@ -1,8 +1,11 @@ import logging +from django.conf import settings as django_settings from procrastinate.contrib.django import app from procrastinate.types import JSONValue +from .models import ReportSearchVector +from .utils.embedding_client import EmbeddingClient from .utils.indexing import bulk_upsert_report_search_vectors logger = logging.getLogger(__name__) @@ -28,3 +31,45 @@ def enqueue_bulk_index_reports(report_ids: list[int]) -> int | None: "radis.pgsearch.tasks.bulk_index_reports", allow_unknown=False, ).defer(report_ids=payload) + + +@app.task(queue="embeddings") +def embed_reports(report_ids: list[int]) -> None: + """Compute and write embeddings for the given reports. Overwrites any existing + embedding. 
Idempotent across re-runs except for the cost of the API call.""" + if not report_ids: + return + + rsvs = list( + ReportSearchVector.objects.filter(report_id__in=report_ids) + .select_related("report") + .only("id", "report_id", "report__body") + ) + if not rsvs: + return + + client = EmbeddingClient() + batch_size = django_settings.EMBEDDING_BATCH_SIZE + + for start in range(0, len(rsvs), batch_size): + chunk = rsvs[start : start + batch_size] + texts = [rsv.report.body for rsv in chunk] + vectors = client.embed_documents(texts) + for rsv, vec in zip(chunk, vectors, strict=True): + ReportSearchVector.objects.filter(pk=rsv.pk).update(embedding=vec) + + +def enqueue_embed_reports( + report_ids: list[int], + priority: int | None = None, +) -> int | None: + if not report_ids: + return None + if priority is None: + priority = django_settings.EMBEDDING_INDEX_PRIORITY + payload: list[JSONValue] = [int(rid) for rid in report_ids] + return app.configure_task( + "radis.pgsearch.tasks.embed_reports", + allow_unknown=False, + priority=priority, + ).defer(report_ids=payload) diff --git a/radis/pgsearch/tests/test_embed_reports_task.py b/radis/pgsearch/tests/test_embed_reports_task.py new file mode 100644 index 00000000..5450f325 --- /dev/null +++ b/radis/pgsearch/tests/test_embed_reports_task.py @@ -0,0 +1,77 @@ +from unittest.mock import patch + +import pytest + +from radis.pgsearch.models import ReportSearchVector +from radis.pgsearch.tasks import embed_reports +from radis.reports.factories import ReportFactory + + +@pytest.mark.django_db +def test_embed_reports_writes_normalized_vector(): + report = ReportFactory.create(body="Findings: no acute abnormality.") + fake_vec = [1.0] + [0.0] * 1023 # already normalized + + with patch( + "radis.pgsearch.tasks.EmbeddingClient" + ) as MockClient: + MockClient.return_value.embed_documents.return_value = [fake_vec] + embed_reports.__wrapped__([report.pk]) + + rsv = ReportSearchVector.objects.get(report=report) + assert rsv.embedding 
is not None + assert len(rsv.embedding) == 1024 + assert pytest.approx(rsv.embedding[0]) == 1.0 + + +@pytest.mark.django_db +def test_embed_reports_overwrites_existing_embedding(): + report = ReportFactory.create() + rsv = ReportSearchVector.objects.get(report=report) + rsv.embedding = [0.5] * 1024 + rsv.save(update_fields=["embedding"]) + + new_vec = [1.0] + [0.0] * 1023 + with patch("radis.pgsearch.tasks.EmbeddingClient") as MockClient: + MockClient.return_value.embed_documents.return_value = [new_vec] + embed_reports.__wrapped__([report.pk]) + + rsv.refresh_from_db() + assert pytest.approx(rsv.embedding[0]) == 1.0 + assert pytest.approx(rsv.embedding[1]) == 0.0 + + +@pytest.mark.django_db +def test_embed_reports_skips_missing_ids_without_error(): + with patch("radis.pgsearch.tasks.EmbeddingClient") as MockClient: + # No reports created. Should not call the client at all. + embed_reports.__wrapped__([99999]) + MockClient.return_value.embed_documents.assert_not_called() + + +@pytest.mark.django_db +def test_embed_reports_splits_into_batches(settings): + settings.EMBEDDING_BATCH_SIZE = 2 + reports = [ReportFactory.create() for _ in range(5)] + fake_vec = [1.0] + [0.0] * 1023 + + with patch("radis.pgsearch.tasks.EmbeddingClient") as MockClient: + MockClient.return_value.embed_documents.side_effect = [ + [fake_vec, fake_vec], + [fake_vec, fake_vec], + [fake_vec], + ] + embed_reports.__wrapped__([r.pk for r in reports]) + + assert MockClient.return_value.embed_documents.call_count == 3 + + +@pytest.mark.django_db +def test_embed_reports_propagates_client_error(): + from radis.pgsearch.utils.embedding_client import EmbeddingClientError + + report = ReportFactory.create() + with patch("radis.pgsearch.tasks.EmbeddingClient") as MockClient: + MockClient.return_value.embed_documents.side_effect = EmbeddingClientError("boom") + with pytest.raises(EmbeddingClientError): + embed_reports.__wrapped__([report.pk]) From cbc798ccf47ebc6a5562e8913090a7ca6803c808 Mon Sep 17 00:00:00 
2001 From: Samuel Kwong Date: Fri, 15 May 2026 18:42:02 +0000 Subject: [PATCH 11/68] fix(pgsearch): close EmbeddingClient http pool in embed_reports finally block --- radis/pgsearch/tasks.py | 15 +++++++----- .../pgsearch/tests/test_embed_reports_task.py | 24 +++++++++++++++++++ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/radis/pgsearch/tasks.py b/radis/pgsearch/tasks.py index 6b906023..b62bf4b5 100644 --- a/radis/pgsearch/tasks.py +++ b/radis/pgsearch/tasks.py @@ -51,12 +51,15 @@ def embed_reports(report_ids: list[int]) -> None: client = EmbeddingClient() batch_size = django_settings.EMBEDDING_BATCH_SIZE - for start in range(0, len(rsvs), batch_size): - chunk = rsvs[start : start + batch_size] - texts = [rsv.report.body for rsv in chunk] - vectors = client.embed_documents(texts) - for rsv, vec in zip(chunk, vectors, strict=True): - ReportSearchVector.objects.filter(pk=rsv.pk).update(embedding=vec) + try: + for start in range(0, len(rsvs), batch_size): + chunk = rsvs[start : start + batch_size] + texts = [rsv.report.body for rsv in chunk] + vectors = client.embed_documents(texts) + for rsv, vec in zip(chunk, vectors, strict=True): + ReportSearchVector.objects.filter(pk=rsv.pk).update(embedding=vec) + finally: + client.close() def enqueue_embed_reports( diff --git a/radis/pgsearch/tests/test_embed_reports_task.py b/radis/pgsearch/tests/test_embed_reports_task.py index 5450f325..748c6dd0 100644 --- a/radis/pgsearch/tests/test_embed_reports_task.py +++ b/radis/pgsearch/tests/test_embed_reports_task.py @@ -75,3 +75,27 @@ def test_embed_reports_propagates_client_error(): MockClient.return_value.embed_documents.side_effect = EmbeddingClientError("boom") with pytest.raises(EmbeddingClientError): embed_reports.__wrapped__([report.pk]) + + +@pytest.mark.django_db +def test_embed_reports_closes_client_on_success(): + report = ReportFactory.create() + fake_vec = [1.0] + [0.0] * 1023 + + with patch("radis.pgsearch.tasks.EmbeddingClient") as MockClient: + 
MockClient.return_value.embed_documents.return_value = [fake_vec] + embed_reports.__wrapped__([report.pk]) + + MockClient.return_value.close.assert_called_once() + + +@pytest.mark.django_db +def test_embed_reports_closes_client_on_error(): + from radis.pgsearch.utils.embedding_client import EmbeddingClientError + + report = ReportFactory.create() + with patch("radis.pgsearch.tasks.EmbeddingClient") as MockClient: + MockClient.return_value.embed_documents.side_effect = EmbeddingClientError("boom") + with pytest.raises(EmbeddingClientError): + embed_reports.__wrapped__([report.pk]) + MockClient.return_value.close.assert_called_once() From 249d9ff75ec63b087d3a7e3d4ca08271e590508e Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 18:46:29 +0000 Subject: [PATCH 12/68] feat(pgsearch): enqueue embed_reports on Report post_save Co-Authored-By: Claude Sonnet 4.6 --- radis/pgsearch/signals.py | 7 ++++++- radis/pgsearch/tests/test_signals.py | 24 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 radis/pgsearch/tests/test_signals.py diff --git a/radis/pgsearch/signals.py b/radis/pgsearch/signals.py index 6d7e6f02..d60ae781 100644 --- a/radis/pgsearch/signals.py +++ b/radis/pgsearch/signals.py @@ -4,6 +4,7 @@ from radis.reports.models import Report from .models import ReportSearchVector +from .tasks import enqueue_embed_reports @receiver(post_save, sender=Report) @@ -11,5 +12,9 @@ def create_or_update_report_search_vector(sender, instance, created, **kwargs): if created: ReportSearchVector.objects.create(report=instance) return - instance.search_vector.save() + + +@receiver(post_save, sender=Report) +def enqueue_report_embedding(sender, instance, **kwargs): + enqueue_embed_reports([instance.pk]) diff --git a/radis/pgsearch/tests/test_signals.py b/radis/pgsearch/tests/test_signals.py new file mode 100644 index 00000000..894c03c6 --- /dev/null +++ b/radis/pgsearch/tests/test_signals.py @@ -0,0 +1,24 @@ +from 
unittest.mock import patch + +import pytest + +from radis.reports.factories import ReportFactory + + +@pytest.mark.django_db +def test_report_save_enqueues_embed_reports(): + with patch("radis.pgsearch.signals.enqueue_embed_reports") as enqueue: + report = ReportFactory.create() + # factory_boy calls instance.save() twice (initial create + after post-generation + # hooks), so enqueue_embed_reports is called at least once with the report PK. + assert enqueue.call_count >= 1 + enqueue.assert_called_with([report.pk]) + + +@pytest.mark.django_db +def test_report_update_also_enqueues_embed_reports(): + report = ReportFactory.create() + with patch("radis.pgsearch.signals.enqueue_embed_reports") as enqueue: + report.body = "Updated body" + report.save() + enqueue.assert_called_once_with([report.pk]) From 8855dff2451680fddee1dc6a5a317d325c3f756f Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 18:48:21 +0000 Subject: [PATCH 13/68] test(pgsearch): use Report.objects.create() to assert single post_save fire on create --- radis/pgsearch/tests/test_signals.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/radis/pgsearch/tests/test_signals.py b/radis/pgsearch/tests/test_signals.py index 894c03c6..78151acf 100644 --- a/radis/pgsearch/tests/test_signals.py +++ b/radis/pgsearch/tests/test_signals.py @@ -7,12 +7,26 @@ @pytest.mark.django_db def test_report_save_enqueues_embed_reports(): + from radis.reports.models import Language, Report + + language = Language.objects.create(code="en") with patch("radis.pgsearch.signals.enqueue_embed_reports") as enqueue: - report = ReportFactory.create() - # factory_boy calls instance.save() twice (initial create + after post-generation - # hooks), so enqueue_embed_reports is called at least once with the report PK. 
- assert enqueue.call_count >= 1 - enqueue.assert_called_with([report.pk]) + report = Report.objects.create( + document_id="DOC-SIGNAL-1", + pacs_aet="PACS", + pacs_name="PACS", + pacs_link="", + patient_id="P1", + patient_birth_date="1980-01-01", + patient_sex="M", + study_description="Study", + study_datetime="2024-01-01T00:00:00Z", + study_instance_uid="1.2.3.4", + accession_number="ACC1", + body="Body.", + language=language, + ) + enqueue.assert_called_once_with([report.pk]) @pytest.mark.django_db From 23ed743909916ef5140f254905a9f63198e6f7ee Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 18:50:19 +0000 Subject: [PATCH 14/68] feat(pgsearch): add backfill_embeddings management command Co-Authored-By: Claude Sonnet 4.6 --- .../commands/backfill_embeddings.py | 48 ++++++++++++ radis/pgsearch/tests/test_backfill_command.py | 78 +++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 radis/pgsearch/management/commands/backfill_embeddings.py create mode 100644 radis/pgsearch/tests/test_backfill_command.py diff --git a/radis/pgsearch/management/commands/backfill_embeddings.py b/radis/pgsearch/management/commands/backfill_embeddings.py new file mode 100644 index 00000000..1fca3488 --- /dev/null +++ b/radis/pgsearch/management/commands/backfill_embeddings.py @@ -0,0 +1,48 @@ +from django.conf import settings +from django.core.management.base import BaseCommand + +from radis.pgsearch.models import ReportSearchVector +from radis.pgsearch.tasks import enqueue_embed_reports + + +class Command(BaseCommand): + help = ( + "Enqueue embed_reports tasks for all reports that don't yet have an " + "embedding. Idempotent: rows that already have an embedding are skipped." 
+ ) + + def add_arguments(self, parser): + parser.add_argument("--batch-size", type=int, default=500) + parser.add_argument( + "--limit", + type=int, + default=None, + help="Maximum number of reports to enqueue (default: all).", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print the count of reports that would be enqueued, but enqueue nothing.", + ) + + def handle(self, *args, batch_size, limit, dry_run, **options): + qs = ( + ReportSearchVector.objects.filter(embedding__isnull=True) + .order_by("report_id") + .values_list("report_id", flat=True) + ) + if limit is not None: + qs = qs[:limit] + + ids = list(qs) + if dry_run: + self.stdout.write(f"Dry run: would enqueue {len(ids)} reports.") + return + + priority = settings.EMBEDDING_BACKFILL_PRIORITY + total = 0 + for start in range(0, len(ids), batch_size): + chunk = ids[start : start + batch_size] + enqueue_embed_reports(chunk, priority=priority) + total += len(chunk) + self.stdout.write(f"Enqueued {total} reports for embedding.") diff --git a/radis/pgsearch/tests/test_backfill_command.py b/radis/pgsearch/tests/test_backfill_command.py new file mode 100644 index 00000000..8dea351e --- /dev/null +++ b/radis/pgsearch/tests/test_backfill_command.py @@ -0,0 +1,78 @@ +from io import StringIO +from unittest.mock import patch + +import pytest +from django.conf import settings +from django.core.management import call_command + +from radis.pgsearch.models import ReportSearchVector +from radis.reports.factories import ReportFactory + + +@pytest.mark.django_db +def test_backfill_enqueues_only_null_embeddings(): + r_null = ReportFactory.create() + r_filled = ReportFactory.create() + ReportSearchVector.objects.filter(report=r_filled).update( + embedding=[1.0] + [0.0] * (settings.EMBEDDING_DIM - 1) + ) + + with patch( + "radis.pgsearch.management.commands.backfill_embeddings.enqueue_embed_reports" + ) as enqueue: + call_command("backfill_embeddings", batch_size=10, stdout=StringIO()) + + # Only the 
null-embedding report should be in any of the enqueue calls. + enqueued_ids = [rid for call in enqueue.call_args_list for rid in call.args[0]] + assert r_null.pk in enqueued_ids + assert r_filled.pk not in enqueued_ids + + +@pytest.mark.django_db +def test_backfill_chunks_by_batch_size(): + [ReportFactory.create() for _ in range(5)] + + with patch( + "radis.pgsearch.management.commands.backfill_embeddings.enqueue_embed_reports" + ) as enqueue: + call_command("backfill_embeddings", batch_size=2, stdout=StringIO()) + + sizes = [len(call.args[0]) for call in enqueue.call_args_list] + assert sizes == [2, 2, 1] + + +@pytest.mark.django_db +def test_backfill_limit_caps_total(): + [ReportFactory.create() for _ in range(5)] + + with patch( + "radis.pgsearch.management.commands.backfill_embeddings.enqueue_embed_reports" + ) as enqueue: + call_command("backfill_embeddings", batch_size=10, limit=3, stdout=StringIO()) + + enqueued_ids = [rid for call in enqueue.call_args_list for rid in call.args[0]] + assert len(enqueued_ids) == 3 + + +@pytest.mark.django_db +def test_backfill_dry_run_does_not_enqueue(): + [ReportFactory.create() for _ in range(3)] + out = StringIO() + + with patch( + "radis.pgsearch.management.commands.backfill_embeddings.enqueue_embed_reports" + ) as enqueue: + call_command("backfill_embeddings", dry_run=True, stdout=out) + + enqueue.assert_not_called() + assert "would enqueue 3" in out.getvalue().lower() + + +@pytest.mark.django_db +def test_backfill_uses_backfill_priority(): + ReportFactory.create() + with patch( + "radis.pgsearch.management.commands.backfill_embeddings.enqueue_embed_reports" + ) as enqueue: + call_command("backfill_embeddings", stdout=StringIO()) + assert enqueue.call_args.kwargs["priority"] == settings.EMBEDDING_BACKFILL_PRIORITY From 4cd63437578ae858e2a36ad39615bd46a043e7e6 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 18:57:57 +0000 Subject: [PATCH 15/68] feat(pgsearch): replace search() with hybrid FTS + vector 
RRF fusion Implements hybrid search provider combining PostgreSQL full-text search with pgvector cosine-distance retrieval, fused via Reciprocal Rank Fusion. Falls back gracefully to FTS-only on EmbeddingClient failure. Co-Authored-By: Claude Sonnet 4.6 --- radis/pgsearch/providers.py | 100 ++++++++++---- radis/pgsearch/tests/test_provider_hybrid.py | 137 +++++++++++++++++++ 2 files changed, 213 insertions(+), 24 deletions(-) create mode 100644 radis/pgsearch/tests/test_provider_hybrid.py diff --git a/radis/pgsearch/providers.py b/radis/pgsearch/providers.py index 1078db80..c092f0a9 100644 --- a/radis/pgsearch/providers.py +++ b/radis/pgsearch/providers.py @@ -1,14 +1,17 @@ import logging -from typing import Iterator, cast +from typing import Iterator, Literal, cast +from django.conf import settings from django.contrib.postgres.search import SearchHeadline, SearchQuery, SearchRank from django.db.models import F, Q +from pgvector.django import CosineDistance -from radis.search.site import Search, SearchFilters, SearchResult +from radis.search.site import ReportDocument, Search, SearchFilters, SearchResult from radis.search.utils.query_parser import ( BinaryNode, ParensNode, QueryNode, + QueryParser, TermNode, UnaryNode, is_search_token_char, @@ -16,6 +19,8 @@ from .models import ReportSearchVector from .utils.document_utils import AnnotatedReportSearchVector, document_from_pgsearch_response +from .utils.embedding_client import EmbeddingClient, EmbeddingClientError +from .utils.fusion import rrf_fuse, summary_with_fallback from .utils.language_utils import code_to_language logger = logging.getLogger(__name__) @@ -90,44 +95,91 @@ def _build_filter_query(filters: SearchFilters) -> Q: def search(search: Search) -> SearchResult: query_str = _build_query_string(search.query) language = _resolve_language(search.filters) - query = SearchQuery(query_str, search_type="raw", config=language) filter_query = _build_filter_query(search.filters) - results = ( + tsquery = 
SearchQuery(query_str, search_type="raw", config=language) + + # Vector side: query embedding (sync HTTP); fall back gracefully on failure. + query_text = QueryParser.unparse(search.query) + query_vec: list[float] | None + try: + query_vec = EmbeddingClient().embed_query(query_text) + except EmbeddingClientError as e: + logger.warning("Hybrid search falling back to FTS-only: %s", e) + query_vec = None + + vec_rank: dict[int, int] = {} + if query_vec is not None: + vec_ids = list( + ReportSearchVector.objects.filter(filter_query) + .exclude(embedding__isnull=True) + .annotate(distance=CosineDistance("embedding", query_vec)) + .order_by("distance", "report_id") + .values_list("report_id", flat=True)[: settings.HYBRID_VECTOR_TOP_K] + ) + vec_rank = {rid: i + 1 for i, rid in enumerate(vec_ids)} + + # FTS side: bounded set, ts_rank only (no headline at this stage). + fts_rows = list( ReportSearchVector.objects.filter(filter_query) - .filter(search_vector=query) - .annotate( - rank=SearchRank( - F("search_vector"), - query, - ) + .filter(search_vector=tsquery) + .annotate(rank=SearchRank(F("search_vector"), tsquery)) + .order_by("-rank", "report_id") + .values("report_id", "rank")[: settings.HYBRID_FTS_MAX_RESULTS] + ) + fts_rank = {row["report_id"]: i + 1 for i, row in enumerate(fts_rows)} + + # Fusion. + ordered_ids = rrf_fuse(vec_rank, fts_rank, k=settings.HYBRID_RRF_K) + total_count = len(ordered_ids) + total_relation: Literal["exact", "at_least", "approximately"] = ( + "at_least" + if ( + len(fts_rows) >= settings.HYBRID_FTS_MAX_RESULTS + or len(vec_rank) >= settings.HYBRID_VECTOR_TOP_K ) + else "exact" + ) + + if search.limit is None: + page_ids = ordered_ids[search.offset :] + else: + page_ids = ordered_ids[search.offset : search.offset + search.limit] + + # Headline + hydration for the page slice only. 
+ page_rows = ( + ReportSearchVector.objects.filter(report_id__in=page_ids) .annotate( summary=SearchHeadline( "report__body", - query, + tsquery, config=language, start_sel="", stop_sel="", min_words=10, max_words=20, max_fragments=10, - ) + ), + rank=SearchRank(F("search_vector"), tsquery), ) .select_related("report") - .order_by("-rank") ) + by_id = {r.report_id: r for r in page_rows} + + documents: list[ReportDocument] = [] + for rid in page_ids: + rsv = by_id.get(rid) + if rsv is None: + continue + rsv.summary = summary_with_fallback( # type: ignore[attr-defined] + rsv.report.body, rsv.summary or "", max_words=30 # type: ignore[attr-defined] + ) + documents.append( + document_from_pgsearch_response(cast(AnnotatedReportSearchVector, rsv)) + ) - total_count = results.count() - if search.limit is None: - results = results[search.offset :] - else: - results = results[search.offset : search.offset + search.limit] - documents = [ - document_from_pgsearch_response(cast(AnnotatedReportSearchVector, result)) - for result in results - ] - - return SearchResult(total_count=total_count, total_relation="exact", documents=documents) + return SearchResult( + total_count=total_count, total_relation=total_relation, documents=documents + ) def count(search: Search) -> int: diff --git a/radis/pgsearch/tests/test_provider_hybrid.py b/radis/pgsearch/tests/test_provider_hybrid.py new file mode 100644 index 00000000..b74da808 --- /dev/null +++ b/radis/pgsearch/tests/test_provider_hybrid.py @@ -0,0 +1,137 @@ +from unittest.mock import patch + +import pytest +from django.contrib.auth.models import Group + +from radis.pgsearch.models import ReportSearchVector +from radis.pgsearch.providers import search +from radis.pgsearch.utils.embedding_client import EmbeddingClientError +from radis.reports.factories import ReportFactory +from radis.search.site import Search, SearchFilters +from radis.search.utils.query_parser import QueryParser + + +def _unit_vec(idx: int, dim: int) -> list[float]: 
+ """Deterministic unit vector that points in dimension `idx`.""" + v = [0.0] * dim + v[idx % dim] = 1.0 + return v + + +def _make_search(query_str: str, group_id: int) -> Search: + node, _ = QueryParser().parse(query_str) + assert node is not None + return Search( + query=node, + filters=SearchFilters(group=group_id), + offset=0, + limit=25, + ) + + +@pytest.fixture +def group(db): + return Group.objects.create(name="radiology") + + +@pytest.fixture +def reports_with_embeddings(group, settings): + dim = settings.EMBEDDING_DIM + # r0: matches FTS for "pneumothorax", vector unrelated (dim 99) + r0 = ReportFactory.create(body="Findings: pneumothorax on the left.") + r0.groups.add(group) + # r1: doesn't lexically match "pneumothorax"; embedding at dim 1 (not identical to query dim 0) + r1 = ReportFactory.create(body="Lungs are clear bilaterally.") + r1.groups.add(group) + # r2: matches FTS (multiple times for stronger ts_rank) AND vector exactly at query dim 0 + r2 = ReportFactory.create( + body="No pneumothorax detected. Previous pneumothorax resolved. Lungs clear." + ) + r2.groups.add(group) + ReportSearchVector.objects.filter(report=r0).update(embedding=_unit_vec(99, dim)) + ReportSearchVector.objects.filter(report=r1).update(embedding=_unit_vec(1, dim)) + ReportSearchVector.objects.filter(report=r2).update(embedding=_unit_vec(0, dim)) + return r0, r1, r2 + + +def test_hybrid_returns_fts_only_hit(group, reports_with_embeddings, settings): + r0, _, _ = reports_with_embeddings + dim = settings.EMBEDDING_DIM + # Query vector points at dim 50 — far from all docs. So vec_top_K still + # returns docs, but their distances are large. FTS for "pneumothorax" + # picks up r0 and r2. 
+ with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.embed_query.return_value = _unit_vec(50, dim) + result = search(_make_search("pneumothorax", group.pk)) + + ids = [d.document_id for d in result.documents] + assert r0.document_id in ids + + +def test_hybrid_returns_vector_only_hit(group, reports_with_embeddings, settings): + _, r1, _ = reports_with_embeddings + dim = settings.EMBEDDING_DIM + # Query vector at dim 0 — closest to r1 and r2. FTS for "pneumothorax" + # excludes r1 lexically; vector side must surface it. + with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.embed_query.return_value = _unit_vec(0, dim) + result = search(_make_search("pneumothorax", group.pk)) + + ids = [d.document_id for d in result.documents] + assert r1.document_id in ids + + +def test_hybrid_both_sides_match_ranks_first(group, reports_with_embeddings, settings): + _, _, r2 = reports_with_embeddings + dim = settings.EMBEDDING_DIM + with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.embed_query.return_value = _unit_vec(0, dim) + result = search(_make_search("pneumothorax", group.pk)) + + ids = [d.document_id for d in result.documents] + # r2 is in both vec_top_K and FTS hits; should rank above pure-side matches. + assert ids[0] == r2.document_id + + +def test_embedding_failure_falls_back_to_fts(group, reports_with_embeddings): + r0, _, r2 = reports_with_embeddings + with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.embed_query.side_effect = EmbeddingClientError("down") + result = search(_make_search("pneumothorax", group.pk)) + + ids = [d.document_id for d in result.documents] + # Both FTS-matching reports come back, no vector-only ones. 
+ assert set(ids) == {r0.document_id, r2.document_id} + + +@pytest.mark.django_db +def test_reports_with_null_embedding_still_returned_via_fts(group, settings): + dim = settings.EMBEDDING_DIM + r = ReportFactory.create(body="pneumothorax findings") + r.groups.add(group) + # Leave embedding NULL. + with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.embed_query.return_value = _unit_vec(0, dim) + result = search(_make_search("pneumothorax", group.pk)) + + ids = [d.document_id for d in result.documents] + assert r.document_id in ids + + +def test_empty_summary_falls_back_to_body_head(group, settings): + dim = settings.EMBEDDING_DIM + # Doc whose body does not contain the query word — vector-only hit. + r = ReportFactory.create( + body="lung parenchyma demonstrates clear bilaterally with no abnormality", + ) + r.groups.add(group) + ReportSearchVector.objects.filter(report=r).update(embedding=_unit_vec(0, dim)) + + with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.embed_query.return_value = _unit_vec(0, dim) + result = search(_make_search("pneumothorax", group.pk)) + + doc = next(d for d in result.documents if d.document_id == r.document_id) + # Summary is non-empty (fell back to body head) and is plain text (no ). 
+ assert doc.summary + assert "" not in doc.summary From d8f2da84c5b325e50602dfd2f5743210c494687e Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 19:03:07 +0000 Subject: [PATCH 16/68] test(pgsearch): use module-level pytestmark for django_db in hybrid provider tests --- radis/pgsearch/tests/test_provider_hybrid.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/radis/pgsearch/tests/test_provider_hybrid.py b/radis/pgsearch/tests/test_provider_hybrid.py index b74da808..93bce619 100644 --- a/radis/pgsearch/tests/test_provider_hybrid.py +++ b/radis/pgsearch/tests/test_provider_hybrid.py @@ -10,6 +10,8 @@ from radis.search.site import Search, SearchFilters from radis.search.utils.query_parser import QueryParser +pytestmark = pytest.mark.django_db + def _unit_vec(idx: int, dim: int) -> list[float]: """Deterministic unit vector that points in dimension `idx`.""" @@ -104,7 +106,6 @@ def test_embedding_failure_falls_back_to_fts(group, reports_with_embeddings): assert set(ids) == {r0.document_id, r2.document_id} -@pytest.mark.django_db def test_reports_with_null_embedding_still_returned_via_fts(group, settings): dim = settings.EMBEDDING_DIM r = ReportFactory.create(body="pneumothorax findings") From 8d576ae50e7d00b684d4cf38cb16708e9170609c Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 19:04:50 +0000 Subject: [PATCH 17/68] feat(pgsearch): make retrieve() hybrid-aware to match search() order --- radis/pgsearch/providers.py | 48 ++++++++++++++------ radis/pgsearch/tests/test_provider_hybrid.py | 22 ++++++++- 2 files changed, 56 insertions(+), 14 deletions(-) diff --git a/radis/pgsearch/providers.py b/radis/pgsearch/providers.py index c092f0a9..c333516c 100644 --- a/radis/pgsearch/providers.py +++ b/radis/pgsearch/providers.py @@ -6,6 +6,7 @@ from django.db.models import F, Q from pgvector.django import CosineDistance +from radis.reports.models import Report from radis.search.site import ReportDocument, Search, 
SearchFilters, SearchResult from radis.search.utils.query_parser import ( BinaryNode, @@ -194,23 +195,44 @@ def count(search: Search) -> int: def retrieve(search: Search) -> Iterator[str]: query_str = _build_query_string(search.query) language = _resolve_language(search.filters) - query = SearchQuery(query_str, search_type="raw", config=language) filter_query = _build_filter_query(search.filters) - results = ( - ReportSearchVector.objects.filter(filter_query) - .filter(search_vector=query) - .annotate( - rank=SearchRank( - F("search_vector"), - query, - ) + tsquery = SearchQuery(query_str, search_type="raw", config=language) + + query_text = QueryParser.unparse(search.query) + try: + query_vec = EmbeddingClient().embed_query(query_text) + except EmbeddingClientError as e: + logger.warning("Hybrid retrieve falling back to FTS-only: %s", e) + query_vec = None + + vec_rank: dict[int, int] = {} + if query_vec is not None: + vec_ids = list( + ReportSearchVector.objects.filter(filter_query) + .exclude(embedding__isnull=True) + .annotate(distance=CosineDistance("embedding", query_vec)) + .order_by("distance", "report_id") + .values_list("report_id", flat=True)[: settings.HYBRID_VECTOR_TOP_K] ) - .select_related("report") - .order_by("-rank") - .values_list("report__document_id", flat=True) + vec_rank = {rid: i + 1 for i, rid in enumerate(vec_ids)} + + fts_rows = list( + ReportSearchVector.objects.filter(filter_query) + .filter(search_vector=tsquery) + .annotate(rank=SearchRank(F("search_vector"), tsquery)) + .order_by("-rank", "report_id") + .values("report_id", "rank")[: settings.HYBRID_FTS_MAX_RESULTS] ) + fts_rank = {row["report_id"]: i + 1 for i, row in enumerate(fts_rows)} - return results.iterator() + ordered_ids = rrf_fuse(vec_rank, fts_rank, k=settings.HYBRID_RRF_K) + if not ordered_ids: + return iter([]) + + id_to_doc = dict( + Report.objects.filter(pk__in=ordered_ids).values_list("pk", "document_id") + ) + return (id_to_doc[rid] for rid in ordered_ids if rid in 
id_to_doc) def filter(filter: SearchFilters) -> Iterator[str]: diff --git a/radis/pgsearch/tests/test_provider_hybrid.py b/radis/pgsearch/tests/test_provider_hybrid.py index 93bce619..f017190d 100644 --- a/radis/pgsearch/tests/test_provider_hybrid.py +++ b/radis/pgsearch/tests/test_provider_hybrid.py @@ -4,7 +4,7 @@ from django.contrib.auth.models import Group from radis.pgsearch.models import ReportSearchVector -from radis.pgsearch.providers import search +from radis.pgsearch.providers import retrieve, search from radis.pgsearch.utils.embedding_client import EmbeddingClientError from radis.reports.factories import ReportFactory from radis.search.site import Search, SearchFilters @@ -136,3 +136,23 @@ def test_empty_summary_falls_back_to_body_head(group, settings): # Summary is non-empty (fell back to body head) and is plain text (no ). assert doc.summary assert "" not in doc.summary + + +def test_retrieve_returns_hybrid_ordered_document_ids(group, reports_with_embeddings, settings): + r0, r1, r2 = reports_with_embeddings + dim = settings.EMBEDDING_DIM + with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.embed_query.return_value = _unit_vec(0, dim) + doc_ids = list(retrieve(_make_search("pneumothorax", group.pk))) + + # r2 (both sides) first, then any order containing r0 and r1. 
+ assert doc_ids[0] == r2.document_id + assert set(doc_ids) >= {r0.document_id, r1.document_id, r2.document_id} + + +def test_retrieve_falls_back_to_fts_on_embedding_error(group, reports_with_embeddings): + r0, _, r2 = reports_with_embeddings + with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.embed_query.side_effect = EmbeddingClientError("down") + doc_ids = list(retrieve(_make_search("pneumothorax", group.pk))) + assert set(doc_ids) == {r0.document_id, r2.document_id} From 6cb8d6f901016254ab621b4e920384fa1946780d Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 19:06:34 +0000 Subject: [PATCH 18/68] feat(pgsearch): bump SearchProvider.max_results to hybrid bound --- radis/pgsearch/apps.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/radis/pgsearch/apps.py b/radis/pgsearch/apps.py index 449bc868..2472547e 100644 --- a/radis/pgsearch/apps.py +++ b/radis/pgsearch/apps.py @@ -11,6 +11,8 @@ def ready(self): def register_app(): + from django.conf import settings + from radis.extractions.site import ( ExtractionRetrievalProvider, register_extraction_retrieval_provider, @@ -29,7 +31,9 @@ def register_app(): SearchProvider( name="PG Search", search=search, - max_results=1000, + max_results=max( + settings.HYBRID_VECTOR_TOP_K, settings.HYBRID_FTS_MAX_RESULTS + ), ) ) From d3c560c6c534ab76e648b42b665c42be8dcca5d5 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 19:08:05 +0000 Subject: [PATCH 19/68] feat(infra): add embeddings_worker container on dedicated queue Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.base.yml | 10 ++++++++++ docker-compose.dev.yml | 9 +++++++++ docker-compose.prod.yml | 10 ++++++++++ 3 files changed, 29 insertions(+) diff --git a/docker-compose.base.yml b/docker-compose.base.yml index 36b2e7d5..ece29b59 100644 --- a/docker-compose.base.yml +++ b/docker-compose.base.yml @@ -17,6 +17,12 @@ x-app: &default-app DJANGO_SERVER_EMAIL: 
${DJANGO_SERVER_EMAIL:?} EXTERNAL_LLM_PROVIDER_URL: ${EXTERNAL_LLM_PROVIDER_URL:-} EXTERNAL_LLM_PROVIDER_API_KEY: ${EXTERNAL_LLM_PROVIDER_API_KEY:-} + EMBEDDING_BACKEND: ${EMBEDDING_BACKEND:-openai} + EMBEDDING_PROVIDER_URL: ${EMBEDDING_PROVIDER_URL:-} + EMBEDDING_PROVIDER_PATH: ${EMBEDDING_PROVIDER_PATH:-} + EMBEDDING_PROVIDER_API_KEY: ${EMBEDDING_PROVIDER_API_KEY:-} + EMBEDDING_MODEL_NAME: ${EMBEDDING_MODEL_NAME:-Qwen/Qwen3-Embedding-4B} + EMBEDDING_DIM: ${EMBEDDING_DIM:-1024} IS_DOCKER_CONTAINER: 1 HTTP_PROXY: ${HTTP_PROXY:-} HTTPS_PROXY: ${HTTPS_PROXY:-} @@ -54,6 +60,10 @@ services: <<: *default-app hostname: llm_worker.local + embeddings_worker: + <<: *default-app + hostname: embeddings_worker.local + postgres: image: pgvector/pgvector:pg17 hostname: postgres.local diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index fd6cd1d0..dcc7f1d4 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -82,6 +82,15 @@ services: ./manage.py bg_worker -l debug -q llm --autoreload " + embeddings_worker: + <<: *default-app + image: radis_dev-embeddings_worker:latest + command: > + bash -c " + wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} && + ./manage.py bg_worker -l debug -q embeddings --autoreload + " + postgres: environment: POSTGRES_PASSWORD: postgres diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 15c38657..fb10378d 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -77,6 +77,16 @@ services: deploy: <<: *deploy + embeddings_worker: + <<: *default-app + command: > + bash -c " + wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} && + ./manage.py bg_worker -q embeddings + " + deploy: + <<: *deploy + postgres: environment: POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?} From 95b3ac19761c1c063ffa6ffc3c7b9bc5bfb45b06 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 19:09:09 +0000 Subject: [PATCH 20/68] docs(infra): document EMBEDDING_* env vars in example.env 
--- example.env | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/example.env b/example.env index de23797d..bf2ec8e3 100644 --- a/example.env +++ b/example.env @@ -134,6 +134,40 @@ REPORT_LLM_PROVIDER_URL="http://host.docker.internal:11434/v1" # 'cli generate-example-reports'. REPORT_LLM_PROVIDER_API_KEY="ollama" +# Embedding service configuration (used by radis.pgsearch for hybrid search). +# The embedding service is independent of the LLM service above. +# +# Choose a backend. Two are built in: +# - openai: posts {"model": M, "input": [t,...]} to /v1/embeddings and reads +# {"data":[{"embedding":[...]}]} responses (OpenAI / vLLM / TEI). +# - ollama: posts {"model": M, "input": [t,...]} to /api/embed and reads +# {"embeddings":[[...]]} responses (Ollama 0.2.0+). +EMBEDDING_BACKEND=openai + +# Base URL of the embedding service. Path is appended automatically. +EMBEDDING_PROVIDER_URL= + +# Optional: override the backend's default path. For a custom endpoint at +# /api/embeddings with an OpenAI-style payload, set EMBEDDING_BACKEND=openai +# and EMBEDDING_PROVIDER_PATH=/api/embeddings. +EMBEDDING_PROVIDER_PATH= + +# Optional bearer token. Sent as "Authorization: Bearer <token>" when non-empty. +EMBEDDING_PROVIDER_API_KEY= + +# The model name to request from the embedding service. +EMBEDDING_MODEL_NAME=Qwen/Qwen3-Embedding-4B + +# Vector dimension. Schema-coupled: changing this after deploy requires dropping +# the embedding column, re-migrating, and running `./manage.py backfill_embeddings`. +EMBEDDING_DIM=1024 + +# Development with local Ollama: +# EMBEDDING_BACKEND=ollama +# EMBEDDING_PROVIDER_URL=http://host.docker.internal:11434 +# EMBEDDING_MODEL_NAME=dengcao/Qwen3-Embedding-4B:Q5_K_M +# EMBEDDING_DIM=2560 + # OpenTelemetry Configuration # Set this to the OTLP HTTP endpoint of the centralized openradx-observability stack. # See https://github.com/openradx/openradx-observability for setup instructions. 
From 2ed951de31d107e6dd8999cf81da4643a8b6e94d Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 19:11:06 +0000 Subject: [PATCH 21/68] test(search): verify FTS-only fallback when embedding URL is unset Co-Authored-By: Claude Sonnet 4.6 --- radis/search/tests/test_views.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/radis/search/tests/test_views.py b/radis/search/tests/test_views.py index 3a0f672f..b345cfa6 100644 --- a/radis/search/tests/test_views.py +++ b/radis/search/tests/test_views.py @@ -324,3 +324,13 @@ def test_search_view_form_validation_errors(client: Client): response = client.get("/search/", search_params) assert response.status_code == 200 assert "form" in response.context + + +@pytest.mark.django_db +def test_search_view_returns_200_when_embedding_provider_unset(client: Client, settings): + """SearchView returns 200 via FTS-only fallback when EMBEDDING_PROVIDER_URL is unset.""" + settings.EMBEDDING_PROVIDER_URL = "" + user = create_test_user_with_active_group() + client.force_login(user) + response = client.get("/search/?query=pneumothorax") + assert response.status_code == 200 From c485bcde4e61c633f729c3155d738accfabafd63 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 19:12:38 +0000 Subject: [PATCH 22/68] fix(test): strip debug_toolbar middleware so FTS-only smoke can assert 200 Co-Authored-By: Claude Sonnet 4.6 --- radis/search/tests/test_views.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/radis/search/tests/test_views.py b/radis/search/tests/test_views.py index b345cfa6..8e0290da 100644 --- a/radis/search/tests/test_views.py +++ b/radis/search/tests/test_views.py @@ -329,7 +329,12 @@ def test_search_view_form_validation_errors(client: Client): @pytest.mark.django_db def test_search_view_returns_200_when_embedding_provider_unset(client: Client, settings): """SearchView returns 200 via FTS-only fallback when EMBEDDING_PROVIDER_URL is unset.""" + from django.conf import settings as 
django_settings + settings.EMBEDDING_PROVIDER_URL = "" + settings.MIDDLEWARE = [ + m for m in django_settings.MIDDLEWARE if "debug_toolbar" not in m.lower() + ] user = create_test_user_with_active_group() client.force_login(user) response = client.get("/search/?query=pneumothorax") From 218b0caa73e53fd637e510030b85baedcb047d75 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 15 May 2026 19:21:46 +0000 Subject: [PATCH 23/68] fix(pgsearch): close EmbeddingClient http pool in search() and retrieve() Co-Authored-By: Claude Sonnet 4.6 --- radis/pgsearch/providers.py | 6 ++++-- radis/pgsearch/tests/test_provider_hybrid.py | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/radis/pgsearch/providers.py b/radis/pgsearch/providers.py index c333516c..c9d5dafd 100644 --- a/radis/pgsearch/providers.py +++ b/radis/pgsearch/providers.py @@ -103,7 +103,8 @@ def search(search: Search) -> SearchResult: query_text = QueryParser.unparse(search.query) query_vec: list[float] | None try: - query_vec = EmbeddingClient().embed_query(query_text) + with EmbeddingClient() as ec: + query_vec = ec.embed_query(query_text) except EmbeddingClientError as e: logger.warning("Hybrid search falling back to FTS-only: %s", e) query_vec = None @@ -200,7 +201,8 @@ def retrieve(search: Search) -> Iterator[str]: query_text = QueryParser.unparse(search.query) try: - query_vec = EmbeddingClient().embed_query(query_text) + with EmbeddingClient() as ec: + query_vec = ec.embed_query(query_text) except EmbeddingClientError as e: logger.warning("Hybrid retrieve falling back to FTS-only: %s", e) query_vec = None diff --git a/radis/pgsearch/tests/test_provider_hybrid.py b/radis/pgsearch/tests/test_provider_hybrid.py index f017190d..c98782a7 100644 --- a/radis/pgsearch/tests/test_provider_hybrid.py +++ b/radis/pgsearch/tests/test_provider_hybrid.py @@ -63,6 +63,8 @@ def test_hybrid_returns_fts_only_hit(group, reports_with_embeddings, settings): # returns docs, but their 
distances are large. FTS for "pneumothorax" # picks up r0 and r2. with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.__enter__.return_value = MockClient.return_value + MockClient.return_value.__exit__.return_value = None MockClient.return_value.embed_query.return_value = _unit_vec(50, dim) result = search(_make_search("pneumothorax", group.pk)) @@ -76,6 +78,8 @@ def test_hybrid_returns_vector_only_hit(group, reports_with_embeddings, settings # Query vector at dim 0 — closest to r1 and r2. FTS for "pneumothorax" # excludes r1 lexically; vector side must surface it. with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.__enter__.return_value = MockClient.return_value + MockClient.return_value.__exit__.return_value = None MockClient.return_value.embed_query.return_value = _unit_vec(0, dim) result = search(_make_search("pneumothorax", group.pk)) @@ -87,6 +91,8 @@ def test_hybrid_both_sides_match_ranks_first(group, reports_with_embeddings, set _, _, r2 = reports_with_embeddings dim = settings.EMBEDDING_DIM with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.__enter__.return_value = MockClient.return_value + MockClient.return_value.__exit__.return_value = None MockClient.return_value.embed_query.return_value = _unit_vec(0, dim) result = search(_make_search("pneumothorax", group.pk)) @@ -98,6 +104,8 @@ def test_hybrid_both_sides_match_ranks_first(group, reports_with_embeddings, set def test_embedding_failure_falls_back_to_fts(group, reports_with_embeddings): r0, _, r2 = reports_with_embeddings with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.__enter__.return_value = MockClient.return_value + MockClient.return_value.__exit__.return_value = None MockClient.return_value.embed_query.side_effect = EmbeddingClientError("down") result = search(_make_search("pneumothorax", group.pk)) @@ -112,6 +120,8 @@ 
def test_reports_with_null_embedding_still_returned_via_fts(group, settings): r.groups.add(group) # Leave embedding NULL. with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.__enter__.return_value = MockClient.return_value + MockClient.return_value.__exit__.return_value = None MockClient.return_value.embed_query.return_value = _unit_vec(0, dim) result = search(_make_search("pneumothorax", group.pk)) @@ -129,6 +139,8 @@ def test_empty_summary_falls_back_to_body_head(group, settings): ReportSearchVector.objects.filter(report=r).update(embedding=_unit_vec(0, dim)) with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.__enter__.return_value = MockClient.return_value + MockClient.return_value.__exit__.return_value = None MockClient.return_value.embed_query.return_value = _unit_vec(0, dim) result = search(_make_search("pneumothorax", group.pk)) @@ -142,6 +154,8 @@ def test_retrieve_returns_hybrid_ordered_document_ids(group, reports_with_embedd r0, r1, r2 = reports_with_embeddings dim = settings.EMBEDDING_DIM with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.__enter__.return_value = MockClient.return_value + MockClient.return_value.__exit__.return_value = None MockClient.return_value.embed_query.return_value = _unit_vec(0, dim) doc_ids = list(retrieve(_make_search("pneumothorax", group.pk))) @@ -153,6 +167,8 @@ def test_retrieve_returns_hybrid_ordered_document_ids(group, reports_with_embedd def test_retrieve_falls_back_to_fts_on_embedding_error(group, reports_with_embeddings): r0, _, r2 = reports_with_embeddings with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.__enter__.return_value = MockClient.return_value + MockClient.return_value.__exit__.return_value = None MockClient.return_value.embed_query.side_effect = EmbeddingClientError("down") doc_ids = list(retrieve(_make_search("pneumothorax", 
group.pk))) assert set(doc_ids) == {r0.document_id, r2.document_id} From 16e3caacf4b0ad2e78818f6ff3441378972ea046 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Sun, 17 May 2026 19:20:03 +0000 Subject: [PATCH 24/68] feat(pgsearch): expose cosine_distance and rrf_score on ReportDocument Add two new optional fields to ReportDocument (cosine_distance, rrf_score=0.0) so callers can inspect both the raw vector similarity and the RRF-fused ranking signal that determined result order. Change rrf_fuse to return list[tuple[int, float]] instead of list[int]; update both search() and retrieve() callers, and thread the captured distances and scores through document_from_pgsearch_response. Co-Authored-By: Claude Sonnet 4.6 --- radis/pgsearch/providers.py | 21 ++++++++++++----- radis/pgsearch/tests/test_fusion.py | 24 ++++++++++++++++---- radis/pgsearch/tests/test_provider_hybrid.py | 23 +++++++++++++++++++ radis/pgsearch/utils/document_utils.py | 4 ++++ radis/pgsearch/utils/fusion.py | 9 +++++--- radis/search/site.py | 2 ++ 6 files changed, 69 insertions(+), 14 deletions(-) diff --git a/radis/pgsearch/providers.py b/radis/pgsearch/providers.py index c9d5dafd..f11119be 100644 --- a/radis/pgsearch/providers.py +++ b/radis/pgsearch/providers.py @@ -110,15 +110,18 @@ def search(search: Search) -> SearchResult: query_vec = None vec_rank: dict[int, int] = {} + vec_distance: dict[int, float] = {} if query_vec is not None: - vec_ids = list( + vec_rows = list( ReportSearchVector.objects.filter(filter_query) .exclude(embedding__isnull=True) .annotate(distance=CosineDistance("embedding", query_vec)) .order_by("distance", "report_id") - .values_list("report_id", flat=True)[: settings.HYBRID_VECTOR_TOP_K] + .values_list("report_id", "distance")[: settings.HYBRID_VECTOR_TOP_K] ) - vec_rank = {rid: i + 1 for i, rid in enumerate(vec_ids)} + for i, (rid, dist) in enumerate(vec_rows): + vec_rank[rid] = i + 1 + vec_distance[rid] = float(dist) # FTS side: bounded set, ts_rank only (no headline 
at this stage). fts_rows = list( @@ -131,7 +134,9 @@ def search(search: Search) -> SearchResult: fts_rank = {row["report_id"]: i + 1 for i, row in enumerate(fts_rows)} # Fusion. - ordered_ids = rrf_fuse(vec_rank, fts_rank, k=settings.HYBRID_RRF_K) + ordered_pairs = rrf_fuse(vec_rank, fts_rank, k=settings.HYBRID_RRF_K) + ordered_ids = [rid for rid, _ in ordered_pairs] + rrf_score_by_id = {rid: score for rid, score in ordered_pairs} total_count = len(ordered_ids) total_relation: Literal["exact", "at_least", "approximately"] = ( "at_least" @@ -176,7 +181,11 @@ def search(search: Search) -> SearchResult: rsv.report.body, rsv.summary or "", max_words=30 # type: ignore[attr-defined] ) documents.append( - document_from_pgsearch_response(cast(AnnotatedReportSearchVector, rsv)) + document_from_pgsearch_response( + cast(AnnotatedReportSearchVector, rsv), + cosine_distance=vec_distance.get(rid), + rrf_score=rrf_score_by_id.get(rid, 0.0), + ) ) return SearchResult( @@ -227,7 +236,7 @@ def retrieve(search: Search) -> Iterator[str]: ) fts_rank = {row["report_id"]: i + 1 for i, row in enumerate(fts_rows)} - ordered_ids = rrf_fuse(vec_rank, fts_rank, k=settings.HYBRID_RRF_K) + ordered_ids = [rid for rid, _ in rrf_fuse(vec_rank, fts_rank, k=settings.HYBRID_RRF_K)] if not ordered_ids: return iter([]) diff --git a/radis/pgsearch/tests/test_fusion.py b/radis/pgsearch/tests/test_fusion.py index 36fe638c..79475091 100644 --- a/radis/pgsearch/tests/test_fusion.py +++ b/radis/pgsearch/tests/test_fusion.py @@ -1,3 +1,5 @@ +import pytest + from radis.pgsearch.utils.fusion import rrf_fuse, summary_with_fallback @@ -9,25 +11,25 @@ def test_rrf_both_sides_have_hits_overlap(): # 2: 1/(61)+1/(61) = 0.03279 # 3: 1/(63)+1/(62) = 0.03200 # 4: 1/(63) = 0.01587 - assert rrf_fuse(vec_rank, fts_rank, k=60) == [2, 3, 1, 4] + assert [rid for rid, _ in rrf_fuse(vec_rank, fts_rank, k=60)] == [2, 3, 1, 4] def test_rrf_disjoint_universes(): vec_rank = {1: 1} fts_rank = {2: 1} - assert rrf_fuse(vec_rank, 
fts_rank, k=60) == [1, 2] + assert [rid for rid, _ in rrf_fuse(vec_rank, fts_rank, k=60)] == [1, 2] def test_rrf_only_fts(): vec_rank: dict[int, int] = {} fts_rank = {10: 1, 20: 2, 30: 3} - assert rrf_fuse(vec_rank, fts_rank, k=60) == [10, 20, 30] + assert [rid for rid, _ in rrf_fuse(vec_rank, fts_rank, k=60)] == [10, 20, 30] def test_rrf_only_vec(): vec_rank = {10: 1, 20: 2, 30: 3} fts_rank: dict[int, int] = {} - assert rrf_fuse(vec_rank, fts_rank, k=60) == [10, 20, 30] + assert [rid for rid, _ in rrf_fuse(vec_rank, fts_rank, k=60)] == [10, 20, 30] def test_rrf_empty(): @@ -39,7 +41,19 @@ def test_rrf_tiebreak_by_id(): vec_rank = {2: 1} fts_rank = {1: 1} # Both contribute 1/61. Tiebreak by id ascending. - assert rrf_fuse(vec_rank, fts_rank, k=60) == [1, 2] + assert [rid for rid, _ in rrf_fuse(vec_rank, fts_rank, k=60)] == [1, 2] + + +def test_rrf_returns_scores_descending_with_tiebreak(): + vec_rank = {1: 1} + fts_rank = {1: 1, 2: 2} + pairs = rrf_fuse(vec_rank, fts_rank, k=60) + # id 1: in both, score = 1/61 + 1/61 = 2/61 + # id 2: in fts only, score = 1/62 + assert pairs[0][0] == 1 + assert pairs[1][0] == 2 + assert pairs[0][1] == pytest.approx(2.0 / 61.0) + assert pairs[1][1] == pytest.approx(1.0 / 62.0) def test_summary_with_fallback_keeps_nonempty(): diff --git a/radis/pgsearch/tests/test_provider_hybrid.py b/radis/pgsearch/tests/test_provider_hybrid.py index c98782a7..4c2ad070 100644 --- a/radis/pgsearch/tests/test_provider_hybrid.py +++ b/radis/pgsearch/tests/test_provider_hybrid.py @@ -172,3 +172,26 @@ def test_retrieve_falls_back_to_fts_on_embedding_error(group, reports_with_embed MockClient.return_value.embed_query.side_effect = EmbeddingClientError("down") doc_ids = list(retrieve(_make_search("pneumothorax", group.pk))) assert set(doc_ids) == {r0.document_id, r2.document_id} + + +def test_documents_carry_cosine_distance_and_rrf_score( + group, reports_with_embeddings, settings +): + """Verify cosine_distance is set for vector-side hits and rrf_score 
reflects fusion.""" + _, _, r2 = reports_with_embeddings + dim = settings.EMBEDDING_DIM + with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.__enter__.return_value = MockClient.return_value + MockClient.return_value.__exit__.return_value = None + MockClient.return_value.embed_query.return_value = _unit_vec(0, dim) + result = search(_make_search("pneumothorax", group.pk)) + + # r2 is in both vector top-K and FTS hits, so its rrf_score should be the largest. + top = result.documents[0] + assert top.document_id == r2.document_id + assert top.cosine_distance is not None + assert top.cosine_distance >= 0.0 + assert top.rrf_score > 0.0 + # All later documents have a lower or equal rrf_score. + for prev, curr in zip(result.documents, result.documents[1:]): + assert curr.rrf_score <= prev.rrf_score diff --git a/radis/pgsearch/utils/document_utils.py b/radis/pgsearch/utils/document_utils.py index 4a2ff364..bed79439 100644 --- a/radis/pgsearch/utils/document_utils.py +++ b/radis/pgsearch/utils/document_utils.py @@ -12,6 +12,8 @@ class Meta: def document_from_pgsearch_response( record: AnnotatedReportSearchVector, + cosine_distance: float | None = None, + rrf_score: float = 0.0, ) -> ReportDocument: report = record.report return ReportDocument( @@ -24,4 +26,6 @@ def document_from_pgsearch_response( study_description=report.study_description, modalities=report.modality_codes, summary=record.summary, + cosine_distance=cosine_distance, + rrf_score=rrf_score, ) diff --git a/radis/pgsearch/utils/fusion.py b/radis/pgsearch/utils/fusion.py index 948731ef..a93f35b9 100644 --- a/radis/pgsearch/utils/fusion.py +++ b/radis/pgsearch/utils/fusion.py @@ -2,11 +2,12 @@ def rrf_fuse( vec_rank: dict[int, int], fts_rank: dict[int, int], k: int, -) -> list[int]: +) -> list[tuple[int, float]]: """Reciprocal Rank Fusion. vec_rank and fts_rank map report_id -> 1-based rank position in each retriever. 
- Returns report ids ordered by descending RRF score, with stable id tiebreak. + Returns (report_id, fused_score) tuples ordered by descending score, + with stable ascending-id tiebreak. """ all_ids = set(vec_rank) | set(fts_rank) @@ -18,7 +19,9 @@ def score(rid: int) -> float: s += 1.0 / (k + fts_rank[rid]) return s - return sorted(all_ids, key=lambda rid: (-score(rid), rid)) + scored = [(rid, score(rid)) for rid in all_ids] + scored.sort(key=lambda pair: (-pair[1], pair[0])) + return scored def summary_with_fallback(body: str, summary: str, max_words: int) -> str: diff --git a/radis/search/site.py b/radis/search/site.py index 5e0ac4b6..170c2fc3 100644 --- a/radis/search/site.py +++ b/radis/search/site.py @@ -16,6 +16,8 @@ class ReportDocument(NamedTuple): study_description: str modalities: list[str] summary: str + cosine_distance: float | None = None + rrf_score: float = 0.0 @property def full_report(self) -> Report: From 9763130feb26e5dd4d2847c76c1aeaf184d96c24 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Sun, 17 May 2026 19:26:37 +0000 Subject: [PATCH 25/68] feat(search): surface cosine distance and RRF score in result header --- radis/search/templates/search/_result_header.html | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/radis/search/templates/search/_result_header.html b/radis/search/templates/search/_result_header.html index 49a2dcfa..c201e822 100644 --- a/radis/search/templates/search/_result_header.html +++ b/radis/search/templates/search/_result_header.html @@ -12,7 +12,9 @@
Result: #{{ counter|add:offset }} - Relevance: {{ document.relevance|floatformat:3 }} + FTS rank: {{ document.relevance|floatformat:3 }} + Cosine dist: {{ document.cosine_distance|floatformat:3|default:"—" }} + RRF score: {{ document.rrf_score|floatformat:4 }}
From c4750af0bad90160e7026e1ff461f9069f499ba6 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Sun, 17 May 2026 19:27:23 +0000 Subject: [PATCH 26/68] feat(pgsearch): Matryoshka-truncate oversized embeddings to EMBEDDING_DIM Embedding providers like Ollama's Qwen3-Embedding-4B return native 2560-dim vectors and ignore the dimensions parameter. pgvector's HNSW index has a hard 2000-dim limit, so EMBEDDING_DIM must stay <=2000. Qwen3 is trained with Matryoshka representation learning, so the first N dimensions of a 2560-dim vector are a valid lower-dim embedding once renormalized. EmbeddingClient.embed_documents now: - raises EmbeddingClientError when the returned vector is shorter than EMBEDDING_DIM - truncates oversized vectors to the first EMBEDDING_DIM components - L2-renormalizes after truncation Co-Authored-By: Claude Opus 4.7 (1M context) --- radis/pgsearch/tests/test_embedding_client.py | 31 +++++++++++++++++-- radis/pgsearch/utils/embedding_client.py | 9 ++++-- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/radis/pgsearch/tests/test_embedding_client.py b/radis/pgsearch/tests/test_embedding_client.py index 56f97cfd..4174a2ed 100644 --- a/radis/pgsearch/tests/test_embedding_client.py +++ b/radis/pgsearch/tests/test_embedding_client.py @@ -195,11 +195,12 @@ def handler(request: httpx.Request) -> httpx.Response: EMBEDDING_MAX_INPUT_CHARS=100, EMBEDDING_QUERY_INSTRUCTION="", ) -def test_dim_mismatch_raises(monkeypatch): +def test_dim_too_small_raises(monkeypatch): from radis.pgsearch.utils import embedding_client as ec def handler(request: httpx.Request) -> httpx.Response: - return httpx.Response(200, json={"data": [{"embedding": [1.0, 0.0, 3.0]}]}) + # Returns dim=1, expected dim=2 -> too small, must raise. 
+ return httpx.Response(200, json={"data": [{"embedding": [1.0]}]}) monkeypatch.setattr( ec, "_build_http_client", lambda: httpx.Client(transport=httpx.MockTransport(handler)) @@ -208,6 +209,32 @@ def handler(request: httpx.Request) -> httpx.Response: ec.EmbeddingClient().embed_documents(["x"]) +@override_settings( + EMBEDDING_BACKEND="openai", + EMBEDDING_PROVIDER_URL="http://embed.example", + EMBEDDING_PROVIDER_PATH="", + EMBEDDING_PROVIDER_API_KEY="", + EMBEDDING_MODEL_NAME="qwen3", + EMBEDDING_DIM=2, + EMBEDDING_REQUEST_TIMEOUT=10, + EMBEDDING_MAX_INPUT_CHARS=100, + EMBEDDING_QUERY_INSTRUCTION="", +) +def test_oversized_embedding_truncates_and_renormalizes(monkeypatch): + from radis.pgsearch.utils import embedding_client as ec + + def handler(request: httpx.Request) -> httpx.Response: + # Returns dim=4 ([3,4,99,99]); EMBEDDING_DIM=2 keeps [3,4], norm 5 -> [0.6, 0.8]. + return httpx.Response(200, json={"data": [{"embedding": [3.0, 4.0, 99.0, 99.0]}]}) + + monkeypatch.setattr( + ec, "_build_http_client", lambda: httpx.Client(transport=httpx.MockTransport(handler)) + ) + vectors = ec.EmbeddingClient().embed_documents(["x"]) + assert len(vectors) == 1 + assert vectors[0] == pytest.approx([0.6, 0.8]) + + @override_settings( EMBEDDING_BACKEND="openai", EMBEDDING_PROVIDER_URL="http://embed.example", diff --git a/radis/pgsearch/utils/embedding_client.py b/radis/pgsearch/utils/embedding_client.py index 73d76582..d54d1649 100644 --- a/radis/pgsearch/utils/embedding_client.py +++ b/radis/pgsearch/utils/embedding_client.py @@ -124,11 +124,14 @@ def embed_documents(self, texts: list[str]) -> list[list[float]]: raw = self._backend.parse_response(body) normalized: list[list[float]] = [] for vec in raw: - if len(vec) != self._dim: + if len(vec) < self._dim: raise EmbeddingClientError( - f"Embedding dim mismatch: got {len(vec)}, expected {self._dim}" + f"Embedding dim too small: got {len(vec)}, expected at least {self._dim}" ) - normalized.append(_l2_normalize(list(vec))) + # 
Matryoshka truncation: keep first EMBEDDING_DIM components, then re-normalize. + # Qwen3-Embedding is trained to retain quality at truncated dimensions. + truncated = list(vec[: self._dim]) + normalized.append(_l2_normalize(truncated)) return normalized def embed_query(self, text: str) -> list[float]: From 15ded7ae049730b2e154f3d08eb6f3cecea264b1 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Sun, 17 May 2026 19:53:52 +0000 Subject: [PATCH 27/68] ci: tag built dev image for embeddings_worker too The new embeddings_worker service introduced for hybrid search uses `radis_dev-embeddings_worker:latest`. CI runs `compose-up --no-build` so the image must be pre-built; add the tag to the build step. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 84f2882a..9384abcf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,6 +35,7 @@ jobs: radis_dev-web:latest radis_dev-default_worker:latest radis_dev-llm_worker:latest + radis_dev-embeddings_worker:latest cache-from: type=gha cache-to: type=gha,mode=max - name: Start Docker containers From 4e8d4a80f7a23de2c65edec64279157fa6e8d5b4 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Sun, 17 May 2026 20:34:45 +0000 Subject: [PATCH 28/68] fix(pgsearch): satisfy pyright on report_id and Procrastinate __wrapped__ - providers.py: r.report.pk instead of r.report_id (report is already select_related, so this avoids an extra query while making pyright happy without django-stubs). - test_embed_reports_task.py: bind embed_reports to the unwrapped function once at module scope and call it directly throughout; one type-ignore on the indirection line replaces seven scattered ones. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- radis/pgsearch/providers.py | 2 +- .../pgsearch/tests/test_embed_reports_task.py | 20 +++++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/radis/pgsearch/providers.py b/radis/pgsearch/providers.py index f11119be..0d76ece2 100644 --- a/radis/pgsearch/providers.py +++ b/radis/pgsearch/providers.py @@ -170,7 +170,7 @@ def search(search: Search) -> SearchResult: ) .select_related("report") ) - by_id = {r.report_id: r for r in page_rows} + by_id = {r.report.pk: r for r in page_rows} documents: list[ReportDocument] = [] for rid in page_ids: diff --git a/radis/pgsearch/tests/test_embed_reports_task.py b/radis/pgsearch/tests/test_embed_reports_task.py index 748c6dd0..48aed57c 100644 --- a/radis/pgsearch/tests/test_embed_reports_task.py +++ b/radis/pgsearch/tests/test_embed_reports_task.py @@ -3,9 +3,13 @@ import pytest from radis.pgsearch.models import ReportSearchVector -from radis.pgsearch.tasks import embed_reports +from radis.pgsearch.tasks import embed_reports as _embed_reports_task from radis.reports.factories import ReportFactory +# Procrastinate's @app.task wraps the function; tests call the underlying +# function directly to skip the broker layer. 
+embed_reports = _embed_reports_task.__wrapped__ # type: ignore[attr-defined] + @pytest.mark.django_db def test_embed_reports_writes_normalized_vector(): @@ -16,7 +20,7 @@ def test_embed_reports_writes_normalized_vector(): "radis.pgsearch.tasks.EmbeddingClient" ) as MockClient: MockClient.return_value.embed_documents.return_value = [fake_vec] - embed_reports.__wrapped__([report.pk]) + embed_reports([report.pk]) rsv = ReportSearchVector.objects.get(report=report) assert rsv.embedding is not None @@ -34,7 +38,7 @@ def test_embed_reports_overwrites_existing_embedding(): new_vec = [1.0] + [0.0] * 1023 with patch("radis.pgsearch.tasks.EmbeddingClient") as MockClient: MockClient.return_value.embed_documents.return_value = [new_vec] - embed_reports.__wrapped__([report.pk]) + embed_reports([report.pk]) rsv.refresh_from_db() assert pytest.approx(rsv.embedding[0]) == 1.0 @@ -45,7 +49,7 @@ def test_embed_reports_overwrites_existing_embedding(): def test_embed_reports_skips_missing_ids_without_error(): with patch("radis.pgsearch.tasks.EmbeddingClient") as MockClient: # No reports created. Should not call the client at all. 
- embed_reports.__wrapped__([99999]) + embed_reports([99999]) MockClient.return_value.embed_documents.assert_not_called() @@ -61,7 +65,7 @@ def test_embed_reports_splits_into_batches(settings): [fake_vec, fake_vec], [fake_vec], ] - embed_reports.__wrapped__([r.pk for r in reports]) + embed_reports([r.pk for r in reports]) assert MockClient.return_value.embed_documents.call_count == 3 @@ -74,7 +78,7 @@ def test_embed_reports_propagates_client_error(): with patch("radis.pgsearch.tasks.EmbeddingClient") as MockClient: MockClient.return_value.embed_documents.side_effect = EmbeddingClientError("boom") with pytest.raises(EmbeddingClientError): - embed_reports.__wrapped__([report.pk]) + embed_reports([report.pk]) @pytest.mark.django_db @@ -84,7 +88,7 @@ def test_embed_reports_closes_client_on_success(): with patch("radis.pgsearch.tasks.EmbeddingClient") as MockClient: MockClient.return_value.embed_documents.return_value = [fake_vec] - embed_reports.__wrapped__([report.pk]) + embed_reports([report.pk]) MockClient.return_value.close.assert_called_once() @@ -97,5 +101,5 @@ def test_embed_reports_closes_client_on_error(): with patch("radis.pgsearch.tasks.EmbeddingClient") as MockClient: MockClient.return_value.embed_documents.side_effect = EmbeddingClientError("boom") with pytest.raises(EmbeddingClientError): - embed_reports.__wrapped__([report.pk]) + embed_reports([report.pk]) MockClient.return_value.close.assert_called_once() From c51b1e5ccae0ab1710f336b638ccdebed5141630 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Mon, 18 May 2026 08:38:45 +0000 Subject: [PATCH 29/68] feat(pgsearch): validate embedding-client config and response shape - Reject EMBEDDING_PROVIDER_PATH that doesn't start with '/' (would otherwise produce malformed URLs like 'http://hostv1/embeddings'). - Raise EmbeddingClientError when the backend returns a different number of vectors than requested, so embed_query()'s [0] indexing can't hit IndexError on a buggy backend. 
- Split the Matryoshka-truncation branch from the no-truncation path for clarity; both still re-normalize since providers don't guarantee unit vectors. - embed_query raises EmbeddingClientError on empty results instead of IndexError. Addresses PR #226 review comments from gemini-code-assist (1, 4) and coderabbitai (14, 15). Co-Authored-By: Claude Opus 4.7 (1M context) --- radis/pgsearch/tests/test_embedding_client.py | 43 +++++++++++++++++++ radis/pgsearch/utils/embedding_client.py | 30 ++++++++++--- 2 files changed, 66 insertions(+), 7 deletions(-) diff --git a/radis/pgsearch/tests/test_embedding_client.py b/radis/pgsearch/tests/test_embedding_client.py index 4174a2ed..82c231f6 100644 --- a/radis/pgsearch/tests/test_embedding_client.py +++ b/radis/pgsearch/tests/test_embedding_client.py @@ -315,3 +315,46 @@ def close(self): with ec.EmbeddingClient(): pass assert closed["value"] is True + + +@override_settings( + EMBEDDING_BACKEND="openai", + EMBEDDING_PROVIDER_URL="http://embed.example", + EMBEDDING_PROVIDER_PATH="v1/embeddings", # missing leading slash + EMBEDDING_PROVIDER_API_KEY="", + EMBEDDING_MODEL_NAME="qwen3", + EMBEDDING_DIM=2, + EMBEDDING_REQUEST_TIMEOUT=10, + EMBEDDING_MAX_INPUT_CHARS=100, + EMBEDDING_QUERY_INSTRUCTION="", +) +def test_provider_path_without_leading_slash_raises(): + from radis.pgsearch.utils import embedding_client as ec + + with pytest.raises(ec.EmbeddingClientError, match="must start with '/'"): + ec.EmbeddingClient() + + +@override_settings( + EMBEDDING_BACKEND="openai", + EMBEDDING_PROVIDER_URL="http://embed.example", + EMBEDDING_PROVIDER_PATH="", + EMBEDDING_PROVIDER_API_KEY="", + EMBEDDING_MODEL_NAME="qwen3", + EMBEDDING_DIM=2, + EMBEDDING_REQUEST_TIMEOUT=10, + EMBEDDING_MAX_INPUT_CHARS=100, + EMBEDDING_QUERY_INSTRUCTION="", +) +def test_response_count_mismatch_raises(monkeypatch): + from radis.pgsearch.utils import embedding_client as ec + + def handler(request: httpx.Request) -> httpx.Response: + # Requested 2 inputs, backend 
returns only 1. + return httpx.Response(200, json={"data": [{"embedding": [1.0, 0.0]}]}) + + monkeypatch.setattr( + ec, "_build_http_client", lambda: httpx.Client(transport=httpx.MockTransport(handler)) + ) + with pytest.raises(ec.EmbeddingClientError, match="count mismatch"): + ec.EmbeddingClient().embed_documents(["a", "b"]) diff --git a/radis/pgsearch/utils/embedding_client.py b/radis/pgsearch/utils/embedding_client.py index d54d1649..4ff39fd6 100644 --- a/radis/pgsearch/utils/embedding_client.py +++ b/radis/pgsearch/utils/embedding_client.py @@ -93,6 +93,10 @@ def __init__(self) -> None: f"known: {sorted(BACKENDS)}" ) from e path = settings.EMBEDDING_PROVIDER_PATH or self._backend.path + if not path.startswith("/"): + raise EmbeddingClientError( + f"EMBEDDING_PROVIDER_PATH must start with '/'; got {path!r}" + ) base = settings.EMBEDDING_PROVIDER_URL.rstrip("/") if not base: raise EmbeddingClientError("EMBEDDING_PROVIDER_URL is not configured") @@ -107,8 +111,8 @@ def __init__(self) -> None: self._http = _build_http_client() def embed_documents(self, texts: list[str]) -> list[list[float]]: - truncated = _truncate(texts, self._max_chars) - payload = self._backend.build_payload(self._model, truncated) + truncated_texts = _truncate(texts, self._max_chars) + payload = self._backend.build_payload(self._model, truncated_texts) try: response = self._http.post(self._url, json=payload, headers=self._headers) except httpx.HTTPError as e: @@ -122,21 +126,33 @@ def embed_documents(self, texts: list[str]) -> list[list[float]]: except ValueError as e: raise EmbeddingClientError(f"Embedding response is not JSON: {e}") from e raw = self._backend.parse_response(body) + if len(raw) != len(truncated_texts): + raise EmbeddingClientError( + f"Embedding count mismatch: requested {len(truncated_texts)}, " + f"backend returned {len(raw)}" + ) normalized: list[list[float]] = [] for vec in raw: if len(vec) < self._dim: raise EmbeddingClientError( f"Embedding dim too small: got 
{len(vec)}, expected at least {self._dim}" ) - # Matryoshka truncation: keep first EMBEDDING_DIM components, then re-normalize. - # Qwen3-Embedding is trained to retain quality at truncated dimensions. - truncated = list(vec[: self._dim]) - normalized.append(_l2_normalize(truncated)) + if len(vec) > self._dim: + # Matryoshka truncation: keep first EMBEDDING_DIM components, then re-normalize. + # Qwen3-Embedding is trained to retain quality at truncated dimensions. + normalized.append(_l2_normalize(list(vec[: self._dim]))) + else: + # Length already matches; still normalize since we can't assume + # all providers return unit vectors. + normalized.append(_l2_normalize(list(vec))) return normalized def embed_query(self, text: str) -> list[float]: prefixed = f"{self._instruction}{text}" if self._instruction else text - return self.embed_documents([prefixed])[0] + vectors = self.embed_documents([prefixed]) + if not vectors: + raise EmbeddingClientError("Embedding service returned no vectors for query") + return vectors[0] def close(self) -> None: self._http.close() From 0d2abdb9a9f641ca70c2d5af139e3704860c9f09 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Mon, 18 May 2026 08:41:11 +0000 Subject: [PATCH 30/68] fix(pgsearch): dedupe candidate rows in hybrid search with .distinct() The filter Q built by `_build_filter_query` can include joins through M2M tables (modalities, groups). Without `.distinct()`, a report with N matching modalities appears N times in the vector top-K or FTS hit set, consuming K slots and corrupting rank enumeration. Add `.distinct()` to both querysets in `search()` and `retrieve()`. Also simplify the ordered_pairs unpacking: `dict(ordered_pairs)` is clearer than the two-pass list comprehension + dict comprehension, and `list(dict_keys)` preserves insertion order. Addresses PR #226 review comments from gemini-code-assist (2) and coderabbitai (9). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- radis/pgsearch/providers.py | 8 ++++-- radis/pgsearch/tests/test_provider_hybrid.py | 28 ++++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/radis/pgsearch/providers.py b/radis/pgsearch/providers.py index 0d76ece2..9046cb82 100644 --- a/radis/pgsearch/providers.py +++ b/radis/pgsearch/providers.py @@ -114,6 +114,7 @@ def search(search: Search) -> SearchResult: if query_vec is not None: vec_rows = list( ReportSearchVector.objects.filter(filter_query) + .distinct() .exclude(embedding__isnull=True) .annotate(distance=CosineDistance("embedding", query_vec)) .order_by("distance", "report_id") @@ -126,6 +127,7 @@ def search(search: Search) -> SearchResult: # FTS side: bounded set, ts_rank only (no headline at this stage). fts_rows = list( ReportSearchVector.objects.filter(filter_query) + .distinct() .filter(search_vector=tsquery) .annotate(rank=SearchRank(F("search_vector"), tsquery)) .order_by("-rank", "report_id") @@ -135,8 +137,8 @@ def search(search: Search) -> SearchResult: # Fusion. 
ordered_pairs = rrf_fuse(vec_rank, fts_rank, k=settings.HYBRID_RRF_K) - ordered_ids = [rid for rid, _ in ordered_pairs] - rrf_score_by_id = {rid: score for rid, score in ordered_pairs} + rrf_score_by_id = dict(ordered_pairs) + ordered_ids = list(rrf_score_by_id) total_count = len(ordered_ids) total_relation: Literal["exact", "at_least", "approximately"] = ( "at_least" @@ -220,6 +222,7 @@ def retrieve(search: Search) -> Iterator[str]: if query_vec is not None: vec_ids = list( ReportSearchVector.objects.filter(filter_query) + .distinct() .exclude(embedding__isnull=True) .annotate(distance=CosineDistance("embedding", query_vec)) .order_by("distance", "report_id") @@ -229,6 +232,7 @@ def retrieve(search: Search) -> Iterator[str]: fts_rows = list( ReportSearchVector.objects.filter(filter_query) + .distinct() .filter(search_vector=tsquery) .annotate(rank=SearchRank(F("search_vector"), tsquery)) .order_by("-rank", "report_id") diff --git a/radis/pgsearch/tests/test_provider_hybrid.py b/radis/pgsearch/tests/test_provider_hybrid.py index 4c2ad070..f382e422 100644 --- a/radis/pgsearch/tests/test_provider_hybrid.py +++ b/radis/pgsearch/tests/test_provider_hybrid.py @@ -195,3 +195,31 @@ def test_documents_carry_cosine_distance_and_rrf_score( # All later documents have a strictly lower or equal rrf_score. for prev, curr in zip(result.documents, result.documents[1:]): assert curr.rrf_score <= prev.rrf_score + + +def test_m2m_filter_does_not_duplicate_results(group, settings): + """Reports with multiple modalities must appear exactly once when the modality + filter joins the M2M table. 
Without `.distinct()` on the queryset, joining on + report__modalities__code__in produces one row per matching modality, which + inflates rank position and corrupts top-K slicing.""" + dim = settings.EMBEDDING_DIM + r = ReportFactory.create(body="pneumothorax findings", modalities=["CT", "MR", "DX"]) + r.groups.add(group) + ReportSearchVector.objects.filter(report=r).update(embedding=_unit_vec(0, dim)) + + node, _ = QueryParser().parse("pneumothorax") + assert node is not None + s = Search( + query=node, + filters=SearchFilters(group=group.pk, modalities=["CT", "MR", "DX"]), + offset=0, + limit=10, + ) + with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: + MockClient.return_value.__enter__.return_value = MockClient.return_value + MockClient.return_value.__exit__.return_value = None + MockClient.return_value.embed_query.return_value = _unit_vec(0, dim) + result = search(s) + + matching = [d for d in result.documents if d.document_id == r.document_id] + assert len(matching) == 1, f"Expected 1 occurrence, got {len(matching)}" From 2ce60e8039081fbe79a1cc96c188bf5dd5a754d5 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Mon, 18 May 2026 08:43:29 +0000 Subject: [PATCH 31/68] fix(pgsearch): bulk_update embeddings, guard batch size, defer signal to on_commit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three related fixes to embed_reports and its signal: 1. Replace the per-row .update() loop inside embed_reports with a single bulk_update() call per batch. One DB round-trip per batch instead of one per report. Closes spec §11.5 / PR #226 comment 3. 2. Validate EMBEDDING_BATCH_SIZE > 0 before the loop and raise ImproperlyConfigured with a clear message instead of an inscrutable infinite loop / range(0, N, 0) ValueError. 3. 
Wrap enqueue_embed_reports in transaction.on_commit so the embeddings worker can't pick up a job before the surrounding Report.save() transaction commits and find no ReportSearchVector to update. Matches the pattern used elsewhere in the codebase. Tests adjusted: test_signals now uses django_capture_on_commit_callbacks and transaction=True to observe the deferred enqueue. New test for the batch-size guard. Addresses PR #226 review comments from gemini-code-assist (3) and coderabbitai (10, 11). Co-Authored-By: Claude Opus 4.7 (1M context) --- radis/pgsearch/signals.py | 5 ++- radis/pgsearch/tasks.py | 10 ++++- .../pgsearch/tests/test_embed_reports_task.py | 11 +++++ radis/pgsearch/tests/test_signals.py | 44 ++++++++++--------- 4 files changed, 46 insertions(+), 24 deletions(-) diff --git a/radis/pgsearch/signals.py b/radis/pgsearch/signals.py index d60ae781..492e143f 100644 --- a/radis/pgsearch/signals.py +++ b/radis/pgsearch/signals.py @@ -1,3 +1,4 @@ +from django.db import transaction from django.db.models.signals import post_save from django.dispatch import receiver @@ -17,4 +18,6 @@ def create_or_update_report_search_vector(sender, instance, created, **kwargs): @receiver(post_save, sender=Report) def enqueue_report_embedding(sender, instance, **kwargs): - enqueue_embed_reports([instance.pk]) + # Defer to on_commit so the embed_reports worker can't race the surrounding + # transaction and find no ReportSearchVector row to update. 
+ transaction.on_commit(lambda: enqueue_embed_reports([instance.pk])) diff --git a/radis/pgsearch/tasks.py b/radis/pgsearch/tasks.py index b62bf4b5..75d42d28 100644 --- a/radis/pgsearch/tasks.py +++ b/radis/pgsearch/tasks.py @@ -1,6 +1,7 @@ import logging from django.conf import settings as django_settings +from django.core.exceptions import ImproperlyConfigured from procrastinate.contrib.django import app from procrastinate.types import JSONValue @@ -48,16 +49,21 @@ def embed_reports(report_ids: list[int]) -> None: if not rsvs: return - client = EmbeddingClient() batch_size = django_settings.EMBEDDING_BATCH_SIZE + if batch_size <= 0: + raise ImproperlyConfigured( + f"EMBEDDING_BATCH_SIZE must be > 0, got {batch_size}" + ) + client = EmbeddingClient() try: for start in range(0, len(rsvs), batch_size): chunk = rsvs[start : start + batch_size] texts = [rsv.report.body for rsv in chunk] vectors = client.embed_documents(texts) for rsv, vec in zip(chunk, vectors, strict=True): - ReportSearchVector.objects.filter(pk=rsv.pk).update(embedding=vec) + rsv.embedding = vec + ReportSearchVector.objects.bulk_update(chunk, fields=["embedding"]) finally: client.close() diff --git a/radis/pgsearch/tests/test_embed_reports_task.py b/radis/pgsearch/tests/test_embed_reports_task.py index 48aed57c..345907b8 100644 --- a/radis/pgsearch/tests/test_embed_reports_task.py +++ b/radis/pgsearch/tests/test_embed_reports_task.py @@ -103,3 +103,14 @@ def test_embed_reports_closes_client_on_error(): with pytest.raises(EmbeddingClientError): embed_reports([report.pk]) MockClient.return_value.close.assert_called_once() + + +@pytest.mark.django_db +def test_embed_reports_raises_on_invalid_batch_size(settings): + from django.core.exceptions import ImproperlyConfigured + + settings.EMBEDDING_BATCH_SIZE = 0 + report = ReportFactory.create() + with patch("radis.pgsearch.tasks.EmbeddingClient"): + with pytest.raises(ImproperlyConfigured, match="EMBEDDING_BATCH_SIZE must be > 0"): + 
embed_reports([report.pk]) diff --git a/radis/pgsearch/tests/test_signals.py b/radis/pgsearch/tests/test_signals.py index 78151acf..de8f7652 100644 --- a/radis/pgsearch/tests/test_signals.py +++ b/radis/pgsearch/tests/test_signals.py @@ -5,34 +5,36 @@ from radis.reports.factories import ReportFactory -@pytest.mark.django_db -def test_report_save_enqueues_embed_reports(): +@pytest.mark.django_db(transaction=True) +def test_report_save_enqueues_embed_reports(django_capture_on_commit_callbacks): from radis.reports.models import Language, Report language = Language.objects.create(code="en") with patch("radis.pgsearch.signals.enqueue_embed_reports") as enqueue: - report = Report.objects.create( - document_id="DOC-SIGNAL-1", - pacs_aet="PACS", - pacs_name="PACS", - pacs_link="", - patient_id="P1", - patient_birth_date="1980-01-01", - patient_sex="M", - study_description="Study", - study_datetime="2024-01-01T00:00:00Z", - study_instance_uid="1.2.3.4", - accession_number="ACC1", - body="Body.", - language=language, - ) + with django_capture_on_commit_callbacks(execute=True): + report = Report.objects.create( + document_id="DOC-SIGNAL-1", + pacs_aet="PACS", + pacs_name="PACS", + pacs_link="", + patient_id="P1", + patient_birth_date="1980-01-01", + patient_sex="M", + study_description="Study", + study_datetime="2024-01-01T00:00:00Z", + study_instance_uid="1.2.3.4", + accession_number="ACC1", + body="Body.", + language=language, + ) enqueue.assert_called_once_with([report.pk]) -@pytest.mark.django_db -def test_report_update_also_enqueues_embed_reports(): +@pytest.mark.django_db(transaction=True) +def test_report_update_also_enqueues_embed_reports(django_capture_on_commit_callbacks): report = ReportFactory.create() with patch("radis.pgsearch.signals.enqueue_embed_reports") as enqueue: - report.body = "Updated body" - report.save() + with django_capture_on_commit_callbacks(execute=True): + report.body = "Updated body" + report.save() enqueue.assert_called_once_with([report.pk]) 
From 95d017242eb9aeacb840ccf9d9555e267f7b12ed Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Mon, 18 May 2026 08:44:33 +0000 Subject: [PATCH 32/68] feat(pgsearch): stream backfill ids and validate CLI bounds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Reject --batch-size <= 0 and --limit < 0 in handle() with CommandError. - Replace `list(qs)` materialization with `.iterator(chunk_size=...)` so a rollout against millions of reports doesn't exhaust worker memory. - Dry-run prints `qs.count()` instead of `len(list)` — equivalent semantics with a single SQL COUNT. Addresses PR #226 review comments from coderabbitai (6, 7). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../commands/backfill_embeddings.py | 21 ++++++++++++++----- radis/pgsearch/tests/test_backfill_command.py | 16 ++++++++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/radis/pgsearch/management/commands/backfill_embeddings.py b/radis/pgsearch/management/commands/backfill_embeddings.py index 1fca3488..00482bde 100644 --- a/radis/pgsearch/management/commands/backfill_embeddings.py +++ b/radis/pgsearch/management/commands/backfill_embeddings.py @@ -1,5 +1,5 @@ from django.conf import settings -from django.core.management.base import BaseCommand +from django.core.management.base import BaseCommand, CommandError from radis.pgsearch.models import ReportSearchVector from radis.pgsearch.tasks import enqueue_embed_reports @@ -26,6 +26,11 @@ def add_arguments(self, parser): ) def handle(self, *args, batch_size, limit, dry_run, **options): + if batch_size <= 0: + raise CommandError(f"--batch-size must be > 0, got {batch_size}") + if limit is not None and limit < 0: + raise CommandError(f"--limit must be >= 0, got {limit}") + qs = ( ReportSearchVector.objects.filter(embedding__isnull=True) .order_by("report_id") @@ -34,15 +39,21 @@ def handle(self, *args, batch_size, limit, dry_run, **options): if limit is not None: qs = qs[:limit] - ids = 
list(qs) if dry_run: - self.stdout.write(f"Dry run: would enqueue {len(ids)} reports.") + self.stdout.write(f"Dry run: would enqueue {qs.count()} reports.") return priority = settings.EMBEDDING_BACKFILL_PRIORITY total = 0 - for start in range(0, len(ids), batch_size): - chunk = ids[start : start + batch_size] + chunk: list[int] = [] + # Use a server-side cursor so we don't materialize the whole id set in memory. + for rid in qs.iterator(chunk_size=batch_size): + chunk.append(rid) + if len(chunk) >= batch_size: + enqueue_embed_reports(chunk, priority=priority) + total += len(chunk) + chunk = [] + if chunk: enqueue_embed_reports(chunk, priority=priority) total += len(chunk) self.stdout.write(f"Enqueued {total} reports for embedding.") diff --git a/radis/pgsearch/tests/test_backfill_command.py b/radis/pgsearch/tests/test_backfill_command.py index 8dea351e..5900757d 100644 --- a/radis/pgsearch/tests/test_backfill_command.py +++ b/radis/pgsearch/tests/test_backfill_command.py @@ -76,3 +76,19 @@ def test_backfill_uses_backfill_priority(): ) as enqueue: call_command("backfill_embeddings", stdout=StringIO()) assert enqueue.call_args.kwargs["priority"] == settings.EMBEDDING_BACKFILL_PRIORITY + + +@pytest.mark.django_db +def test_backfill_rejects_zero_batch_size(): + from django.core.management.base import CommandError + + with pytest.raises(CommandError, match="--batch-size must be > 0"): + call_command("backfill_embeddings", batch_size=0, stdout=StringIO()) + + +@pytest.mark.django_db +def test_backfill_rejects_negative_limit(): + from django.core.management.base import CommandError + + with pytest.raises(CommandError, match="--limit must be >= 0"): + call_command("backfill_embeddings", limit=-1, stdout=StringIO()) From 8409fbaada0e0c395cc243d80d20e74bc752a840 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Mon, 18 May 2026 08:45:47 +0000 Subject: [PATCH 33/68] feat(pgsearch): system check for EMBEDDING_DIM vs migration parity The embedding migration freezes vector(1024) 
at generation time, but ReportSearchVector.embedding reads settings.EMBEDDING_DIM at runtime. A divergence would surface as an opaque pgvector dimension error on the first INSERT or query. Add a Django system check (pgsearch.E001) that fires on every manage.py check, runserver boot, and CI run. The migration literal lives next to the check (EMBEDDING_DIM_MIGRATION_LITERAL = 1024) with a comment explaining that future migrations must update both in the same PR. Addresses PR #226 review comment from coderabbitai (8). Co-Authored-By: Claude Opus 4.7 (1M context) --- radis/pgsearch/apps.py | 32 ++++++++++++++++++++++++ radis/pgsearch/tests/test_apps_checks.py | 23 +++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 radis/pgsearch/tests/test_apps_checks.py diff --git a/radis/pgsearch/apps.py b/radis/pgsearch/apps.py index 2472547e..101fc309 100644 --- a/radis/pgsearch/apps.py +++ b/radis/pgsearch/apps.py @@ -1,4 +1,12 @@ from django.apps import AppConfig +from django.conf import settings +from django.core.checks import Error, register + +# Keep in sync with the dimensions= literal in +# radis/pgsearch/migrations/0003_report_embedding.py. The migration +# captures dim at generation time, so changing this requires a new +# migration that re-creates the embedding column. +EMBEDDING_DIM_MIGRATION_LITERAL = 1024 class PgSearchConfig(AppConfig): @@ -10,6 +18,30 @@ def ready(self): register_app() +@register() +def check_embedding_dim_matches_migration(app_configs, **kwargs): + """Fail loudly when settings.EMBEDDING_DIM diverges from the dim baked + into migration 0003. Mismatched values would otherwise surface as opaque + pgvector dimension errors on the first write/query.""" + if settings.EMBEDDING_DIM != EMBEDDING_DIM_MIGRATION_LITERAL: + return [ + Error( + f"EMBEDDING_DIM={settings.EMBEDDING_DIM} does not match the dim " + f"baked into migration 0003 (vector({EMBEDDING_DIM_MIGRATION_LITERAL})). " + f"Writes will fail with a pgvector dimension error. 
Either set " + f"EMBEDDING_DIM={EMBEDDING_DIM_MIGRATION_LITERAL} or write a new " + f"migration that drops and recreates the embedding column at the new dim.", + id="pgsearch.E001", + hint=( + "Update EMBEDDING_DIM in your .env, or write a migration that " + "matches the new dim and update EMBEDDING_DIM_MIGRATION_LITERAL " + "in radis/pgsearch/apps.py." + ), + ) + ] + return [] + + def register_app(): from django.conf import settings diff --git a/radis/pgsearch/tests/test_apps_checks.py b/radis/pgsearch/tests/test_apps_checks.py new file mode 100644 index 00000000..36bdd59c --- /dev/null +++ b/radis/pgsearch/tests/test_apps_checks.py @@ -0,0 +1,23 @@ +"""Tests for the Django system check that guards EMBEDDING_DIM/migration parity.""" + +from django.test import override_settings + +from radis.pgsearch.apps import ( + EMBEDDING_DIM_MIGRATION_LITERAL, + check_embedding_dim_matches_migration, +) + + +def test_check_passes_when_dim_matches_migration(): + with override_settings(EMBEDDING_DIM=EMBEDDING_DIM_MIGRATION_LITERAL): + assert check_embedding_dim_matches_migration(app_configs=None) == [] + + +def test_check_fails_when_dim_diverges_from_migration(): + with override_settings(EMBEDDING_DIM=EMBEDDING_DIM_MIGRATION_LITERAL + 1): + errors = check_embedding_dim_matches_migration(app_configs=None) + assert len(errors) == 1 + err = errors[0] + assert err.id == "pgsearch.E001" + assert str(EMBEDDING_DIM_MIGRATION_LITERAL) in err.msg + assert str(EMBEDDING_DIM_MIGRATION_LITERAL + 1) in err.msg From 0faf3b0cb19b10deb3d549feb5811dc650543f1b Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Mon, 18 May 2026 08:46:10 +0000 Subject: [PATCH 34/68] fix(test): pass strict=False to rrf_score ordering zip zip(documents, documents[1:]) intentionally walks N-1 adjacent pairs; the truncation is desired. Mark it explicit to satisfy Ruff B905. Addresses PR #226 review comment from coderabbitai (13). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- radis/pgsearch/tests/test_provider_hybrid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/radis/pgsearch/tests/test_provider_hybrid.py b/radis/pgsearch/tests/test_provider_hybrid.py index f382e422..a6ec7b77 100644 --- a/radis/pgsearch/tests/test_provider_hybrid.py +++ b/radis/pgsearch/tests/test_provider_hybrid.py @@ -193,7 +193,7 @@ def test_documents_carry_cosine_distance_and_rrf_score( assert top.cosine_distance >= 0.0 assert top.rrf_score > 0.0 # All later documents have a strictly lower or equal rrf_score. - for prev, curr in zip(result.documents, result.documents[1:]): + for prev, curr in zip(result.documents, result.documents[1:], strict=False): assert curr.rrf_score <= prev.rrf_score From 4f773b27dfa520a26e2ce9cf16f9d085a3eb8266 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 16:26:14 +0000 Subject: [PATCH 35/68] docs(pgsearch): add hybrid search design spec with embedding orchestrator Single unified spec covering hybrid FTS + dense-vector retrieval and the periodic EmbeddingJob/EmbeddingTask orchestrator that drains pending embeddings without per-API-call job amplification. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../specs/2026-05-15-hybrid-search-design.md | 904 ++++++++++++++++++ 1 file changed, 904 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-15-hybrid-search-design.md diff --git a/docs/superpowers/specs/2026-05-15-hybrid-search-design.md b/docs/superpowers/specs/2026-05-15-hybrid-search-design.md new file mode 100644 index 00000000..ea402aad --- /dev/null +++ b/docs/superpowers/specs/2026-05-15-hybrid-search-design.md @@ -0,0 +1,904 @@ +# Hybrid Search Design (FTS + Dense Vector via Qwen3-Embedding-4B) + +**Status:** Draft — design phase +**Author:** RADIS team (Samuel Kwong) +**Date:** 2026-05-15 (revised 2026-05-28) +**Implementation skill (next step):** `writing-plans` + +**Revision 2026-05-28:** §6 ("Async indexing") was redesigned around a periodic +`EmbeddingJob` / `EmbeddingTask` orchestrator instead of a `post_save`-driven +per-report task. Affected sections: §3 (architecture diagram & file table), §4.5 +(model-change procedure), §5.4 (dev recipe), §6 (full replacement), §8 +(settings), §9 (failure modes), §10 (testing), §12 (rollout). + +--- + +## 1. Overview + +RADIS today provides PostgreSQL full-text search (FTS) over radiology reports via the `radis.pgsearch` provider: each `Report` gets a 1:1 `ReportSearchVector` row holding a `tsvector`, kept in sync via `post_save` signal and a bulk re-index task. Queries are ranked by `ts_rank` and snippeted via `ts_headline`. + +This spec extends that infrastructure with a dense-vector retrieval side, fused with FTS via Reciprocal Rank Fusion (RRF), to deliver **hybrid search**. Embeddings are produced by a Qwen3-Embedding-4B inference endpoint and stored in the same `ReportSearchVector` table. + +The public `SearchProvider` API (`radis.search.site`) is unchanged. Callers — `SearchView`, `ExtractionJob`, `SubscriptionJob`, the REST API — see no signature differences. Only the body of `radis.pgsearch.providers.search()` and `retrieve()` changes. 
+ +## 2. Goals & non-goals + +### Goals + +- Combine the existing FTS recall with semantic recall so queries like "no pneumothorax" surface reports that describe the absence without containing the exact word (modulo the dense-retrieval polarity limitation in §11). +- Keep the existing `SearchProvider` contract intact. +- Index embeddings asynchronously without blocking report ingest. +- Keep embedding load isolated from chat/extraction/subscription LLM tasks. +- Degrade gracefully when the embedding service is unavailable (search continues as FTS-only). +- Make the embedding backend pluggable so Ollama can be used in dev and a Qwen3 endpoint in prod with the same code path. + +### Non-goals + +- No new search-provider plugin slot. The single `pgsearch` provider continues to be the only one registered. +- No per-query UI toggle for semantic vs. lexical. Hybrid is the new default. +- No Vespa, Elasticsearch, or OpenSearch adapter. +- No solution for negation/polarity (§11 documents this as known future work). +- No automated re-embedding when `EMBEDDING_DIM` changes. That is a manual operator procedure: drop column, re-migrate, defer the embedding orchestrator (see §4.5). +- No on-disk vector quantization. Float32 storage from day one; revisit if RAM pressure appears. + +## 3. Architecture + +``` +┌──────────────────────────────────────────────────────────────────────┐ +│ SearchView, REST API, ExtractionJob, SubscriptionJob │ +└──────────────┬───────────────────────────────────────────────────────┘ + │ Search(query, filters, offset, limit) + ▼ +┌──────────────────────────────────────────────────────────────────────┐ +│ radis.pgsearch.providers.search() (hybrid, replaces FTS-only) │ +│ │ +│ 1. embed_query() ──► EmbeddingClient ──► Qwen3 endpoint │ +│ on failure: query_vec = None │ +│ │ +│ 2. Vector top-K ────► ReportSearchVector (HNSW on .embedding) │ +│ filtered by structured filters │ +│ │ +│ 3. 
FTS hits ────► ReportSearchVector (GIN on .search_vector) │ +│ filtered by structured filters │ +│ │ +│ 4. Python-side RRF fusion of (vec_top_K ∪ fts_hits) │ +│ 5. Pagination on the fused order │ +│ 6. ts_headline() ────► ReportSearchVector (page-slice only) │ +└──────────────────────────────────────────────────────────────────────┘ + +┌──────────────────────────────────────────────────────────────────────┐ +│ Async indexing path (Job/Task orchestrator, periodic-driven) │ +│ │ +│ cron (settings.EMBEDDING_DRAIN_CRON, default nightly 02:00) │ +│ │ │ +│ ▼ │ +│ embedding_launcher() — `default` queue │ +│ ├─ queueing_lock="embedding_launcher" │ +│ ├─ skip if any EmbeddingJob in PREPARING/PENDING/IN_PROGRESS │ +│ ├─ skip if no rows with embedding IS NULL │ +│ └─ EmbeddingJob.objects.create(...) → job.delay() │ +│ │ +│ process_embedding_job(job_id) — `default` queue │ +│ ├─ iterate ReportSearchVector with embedding IS NULL │ +│ ├─ chunk by EMBEDDING_BATCH_SIZE → EmbeddingTask rows │ +│ ├─ task.reports.set(chunk); task.delay() (no HTTP work) │ +│ └─ job.status = PENDING; return │ +│ │ +│ process_embedding_task(task_id) — `embeddings` queue │ +│ ├─ EmbeddingClient.embed_documents([r.body for r in task.reports])│ +│ ├─ L2-normalize; bulk_update ReportSearchVector.embedding │ +│ ├─ task.status = SUCCESS/FAILURE; clear queued_job_id │ +│ └─ job.update_job_state() │ +│ │ +│ Operator-triggered drain: from a Django shell run │ +│ `embedding_launcher.defer()` — same code path as periodic. │ +└──────────────────────────────────────────────────────────────────────┘ +``` + +The bulk-upsert API path (`reports/api/viewsets.py:_bulk_upsert_reports`) +already creates `ReportSearchVector` rows with `embedding=NULL` via the FTS +indexing call in its `on_commit` block. The single-create API path goes through +the standard `Report.save()` and the FTS `post_save` signal, which likewise +creates the `ReportSearchVector` row with NULL embedding. 
Both ingest paths +deposit work into the same DB-resident pending pool; the orchestrator drains it +on the next periodic tick (or on an operator-triggered defer). There is no +per-API-call embedding job. + +**Components added inside `radis.pgsearch`:** + +| File | Purpose | +|---|---| +| `utils/embedding_client.py` | Sync + async HTTP clients with pluggable backends (`openai`, `ollama`) | +| `migrations/0002_pgvector_extension.py` | `CREATE EXTENSION IF NOT EXISTS vector;` | +| `migrations/0003_report_embedding.py` | Adds `embedding vector(N)` column + HNSW index | +| `migrations/0004_embedding_job_task.py` | Adds `EmbeddingJob` and `EmbeddingTask` tables + M2M to `Report` | +| `migrations/0005_system_user.py` | Data migration: creates the system user if missing | +| `models.py` (modified) | Adds `embedding` field + `HnswIndex`; defines `EmbeddingJob` and `EmbeddingTask` inheriting `AnalysisJob`/`AnalysisTask` | +| `signals.py` (unchanged from FTS-only) | The FTS `create_or_update_report_search_vector` receiver stays; **no embedding signal** | +| `tasks.py` (modified) | Adds `embedding_launcher` (periodic), `process_embedding_job` (`default` queue), `process_embedding_task` (`embeddings` queue) | +| `providers.py` (modified) | Replaces `search()` and `retrieve()` bodies with hybrid logic | +| `tests/...` | Coverage per §10 | + +**Infrastructure additions:** + +| File | Change | +|---|---| +| `pyproject.toml` | Add `pgvector>=0.3` dependency | +| `radis/settings/base.py` | New env-driven + constant settings (§8) | +| `example.env` | Document `EMBEDDING_*` env vars for openai and ollama backends | +| `docker-compose.base.yml` | Add `embeddings_worker` service + `EMBEDDING_*` env vars | +| `docker-compose.dev.yml` / `.prod.yml` | `embeddings_worker.command` running `bg_worker -q embeddings` | + +## 4. 
Schema and migrations + +### 4.1 Dependency + +Add to `pyproject.toml`: + +```toml +"pgvector>=0.3", +``` + +### 4.2 Postgres extension migration + +`radis/pgsearch/migrations/0002_pgvector_extension.py`: + +```python +class Migration(migrations.Migration): + dependencies = [("pgsearch", "0001_initial")] + operations = [ + migrations.RunSQL( + sql="CREATE EXTENSION IF NOT EXISTS vector;", + reverse_sql=migrations.RunSQL.noop, # do not drop in prod + ), + ] +``` + +Reverse is a no-op because the extension may be shared with other Postgres usage and dropping it would damage unrelated state. Dev rollback is handled by recreating the database. + +### 4.3 Schema migration + +`radis/pgsearch/migrations/0003_report_embedding.py`: standard `AddField` with a `VectorField(dimensions=settings.EMBEDDING_DIM, null=True)` and `AddIndex` for an `HnswIndex` with `opclasses=["vector_cosine_ops"]`, `m=16`, `ef_construction=64`. + +### 4.4 Model update + +`radis/pgsearch/models.py`: + +```python +from django.conf import settings +from pgvector.django import HnswIndex, VectorField + +class ReportSearchVector(models.Model): + report = models.OneToOneField(Report, on_delete=models.CASCADE, related_name="search_vector") + search_vector = SearchVectorField(null=True) + embedding = VectorField(dimensions=settings.EMBEDDING_DIM, null=True) + + class Meta: + indexes = [ + GinIndex(fields=["search_vector"]), + HnswIndex( + name="pgsearch_embedding_hnsw", + fields=["embedding"], + m=16, + ef_construction=64, + opclasses=["vector_cosine_ops"], + ), + ] +``` + +`embedding` is nullable: the row exists from the moment a `Report` is created (FTS path), but its embedding is filled asynchronously by `process_embedding_task` (§6.7). A NULL embedding is treated as "not embedded yet" at query time, and the row participates via the FTS half only. + +`save()` on `ReportSearchVector` retains its current behavior of recomputing `search_vector` from `report.body`. 
The embedding column is written **only** by `process_embedding_task` via `bulk_update()`, never by `save()`, to avoid triggering the FTS signal recursively and to keep the two indexing paths independent. + +### 4.5 Operational note on `EMBEDDING_DIM` + +pgvector columns and HNSW indexes are bound to a fixed dimension at create time, and HNSW has a 2000-dim ceiling (so `EMBEDDING_DIM ≤ 2000`; Qwen3-Embedding-4B's native 2560 is Matryoshka-truncated client-side). A Django system check (`pgsearch.E001`) compares `settings.EMBEDDING_DIM` against the literal in migration 0003 and fails `manage.py check` on mismatch. Changing `EMBEDDING_DIM` after deploy requires a manual operator procedure: + +1. Drop the HNSW index and the `embedding` column. +2. Re-run `0003_report_embedding` with the new `EMBEDDING_DIM`. +3. From a Django shell, defer the embedding orchestrator immediately so the + next nightly tick is not waited for: + + ```python + from radis.pgsearch.tasks import embedding_launcher + embedding_launcher.defer() + ``` + +This is documented as a deployment-time decision and intentionally not automated. + +## 5. Embedding client + +### 5.1 Module layout + +`radis/pgsearch/utils/embedding_client.py` exposes: + +- `class EmbeddingBackend(Protocol)` with `path`, `build_payload`, `parse_response`. +- `class OpenAIBackend(EmbeddingBackend)` — default path `/v1/embeddings`, body `{model, input: [...]}`, response `{data: [{embedding: [...]}]}`. +- `class OllamaBackend(EmbeddingBackend)` — default path `/api/embed`, body `{model, input: [...]}`, response `{embeddings: [[...]]}`. +- `BACKENDS: dict[str, EmbeddingBackend] = {"openai": OpenAIBackend(), "ollama": OllamaBackend()}`. +- `class EmbeddingClientError(Exception)`. +- `class EmbeddingClient` — sync client used by `process_embedding_task` and the query path. 
+- `class AsyncEmbeddingClient` — async variant, kept for parity with `chats/utils/chat_client.py` and so the query path can call it from ASGI views without `async_to_sync` later. + +### 5.2 Interface + +```python +class EmbeddingClient: + def __init__(self): + self._backend = BACKENDS[settings.EMBEDDING_BACKEND] + self._path = settings.EMBEDDING_PROVIDER_PATH or self._backend.path + self._url = settings.EMBEDDING_PROVIDER_URL.rstrip("/") + self._path + self._model = settings.EMBEDDING_MODEL_NAME + self._timeout = settings.EMBEDDING_REQUEST_TIMEOUT + self._headers = {"Authorization": f"Bearer {settings.EMBEDDING_PROVIDER_API_KEY}"} \ + if settings.EMBEDDING_PROVIDER_API_KEY else {} + + def embed_documents(self, texts: list[str]) -> list[list[float]]: + """Embed texts verbatim. Truncates each to EMBEDDING_MAX_INPUT_CHARS first. + Returns L2-normalized vectors of length EMBEDDING_DIM.""" + + def embed_query(self, text: str) -> list[float]: + """Prepend EMBEDDING_QUERY_INSTRUCTION, then embed_documents([text])[0].""" +``` + +### 5.3 Wire shapes + +| Backend | Path (default) | Request | Response | +|---|---|---|---| +| `openai` | `/v1/embeddings` | `{"model": M, "input": [t, ...]}` | `{"data": [{"embedding": [...]}, ...]}` | +| `ollama` | `/api/embed` | `{"model": M, "input": [t, ...]}` | `{"embeddings": [[...], ...]}` | + +`EMBEDDING_PROVIDER_PATH` (env) overrides the backend default — this is how the production endpoint at `/api/embeddings` with an OpenAI-style payload is supported by the `openai` backend with a one-line config change, no new backend needed. + +### 5.4 Behavior details + +- **Query instruction:** the model card for Qwen3-Embedding recommends a task-specific instruction prefix on the query side only. `embed_query` prepends `EMBEDDING_QUERY_INSTRUCTION` (a Python constant in `base.py`); `embed_documents` does not. +- **Truncation:** any text longer than `EMBEDDING_MAX_INPUT_CHARS` is truncated at the character limit before being sent. 
A WARNING is logged with the report id (when known) and char count. Qwen3-Embedding-4B supports up to 32k tokens, so truncation will be rare for radiology bodies but is bounded as a defense against pathological inputs. +- **Normalization:** every returned vector is L2-normalized client-side, unconditionally. With unit vectors, cosine distance is monotonic in dot product, which makes the HNSW `vector_cosine_ops` operator effectively a fast inner-product search. Whether the upstream server normalizes is irrelevant. +- **Dimension validation:** every vector is checked to have length `EMBEDDING_DIM`. A mismatch raises `EmbeddingClientError`. +- **Batching:** `embed_documents` sends a single HTTP call per invocation. The higher-level orchestrator (`process_embedding_job`) groups reports into `EmbeddingTask` batches of `EMBEDDING_BATCH_SIZE` before dispatching them to `process_embedding_task`. +- **Errors:** non-2xx, timeout, malformed JSON, missing key, or wrong dim all raise `EmbeddingClientError`. The client never falls back internally — fallback policy is owned by the caller. +- **Dev recipe (Ollama):** + ```bash + ollama pull dengcao/Qwen3-Embedding-4B:Q5_K_M + # in .env: + EMBEDDING_BACKEND=ollama + EMBEDDING_PROVIDER_URL=http://host.docker.internal:11434 + EMBEDDING_MODEL_NAME=dengcao/Qwen3-Embedding-4B:Q5_K_M + EMBEDDING_DIM=1024 + ``` + `EMBEDDING_DIM` must stay ≤ 2000 (the HNSW ceiling, see §4.5), so the model's native 2560-dim output is Matryoshka-truncated client-side to the configured value. GGUF-quantized embedding models produce slightly different vectors than the bf16 reference, so dev embeddings are not interchangeable with prod embeddings. After swapping the model between dev/prod, existing vectors must be cleared first, because the launcher only picks up rows where `embedding IS NULL`: follow the procedure in §4.5, which ends by deferring the launcher from a Django shell. + +## 6. Async indexing (Job/Task orchestrator) + +The embedding lifecycle uses the same Job/Task pattern as `ExtractionJob` / +`ExtractionTask` (`radis/extractions/tasks.py:32`) and `SubscriptionJob` / +`SubscriptionTask` (`radis/subscriptions/tasks.py:33`). 
A periodic launcher +creates one `EmbeddingJob` per drain run; the orchestrator splits pending +reports into `EmbeddingTask` batches; each task is processed on the +`embeddings` queue. + +### 6.1 Why a Job/Task orchestrator instead of a per-save signal + +Two ingest paths exercise RADIS: + +- **Single-create** (`POST /api/reports/`) routes through `Report.objects.create` + in the serializer (`radis/reports/api/serializers.py:87`). A `post_save` + signal here would fire once per API call. +- **Bulk-upsert** (`POST /api/reports/bulk-upsert`) routes through + `Report.objects.bulk_create` / `bulk_update` + (`radis/reports/api/viewsets.py:_bulk_upsert_reports`). `post_save` **does + not fire** on `bulk_create`. The FTS path rescues itself by explicitly + calling `enqueue_bulk_index_reports(touched_ids)` in `on_commit`; an + embedding signal would still be skipped on this path. + +A per-save signal would therefore either (a) skip the bulk path entirely, or +(b) produce one Procrastinate job per single-create API call — each opening an +HTTP connection to the embedding service with batch=1. With ETL pipelines that +may post one report at a time, this fragments the workload to one job per API +call and wastes the embedding endpoint's batch capacity. + +The orchestrator design accepts a longer freshness window (hours / next-cycle) +in exchange for batched, throughput-friendly embedding runs that serve all +three operational scenarios with one mechanism: + +| Scenario | What happens | +|---|---| +| **Initial bulk upload** (millions of reports via `/bulk-upsert`) | `ReportSearchVector` rows created with `embedding=NULL`. Operator defers the launcher immediately or waits for the next cron tick. One `EmbeddingJob` produces N `EmbeddingTask` batches. | +| **Daily ad-hoc upload** | Reports land NULL via either ingest path. Next periodic tick consolidates the day's pending pool into a single `EmbeddingJob`. 
| +| **Model-change backfill** | Operator follows §4.5 (drop column, re-migrate), then defers the launcher from a shell. Same code path as the periodic. | + +### 6.2 Queue and worker + +The `embeddings` Procrastinate queue is served by the `embeddings_worker` +container. The orchestrator (`process_embedding_job`) runs on the `default` +queue alongside `process_extraction_job` and `process_subscription_job`; the +sub-tasks (`process_embedding_task`) run on `embeddings`. + +``` +./manage.py bg_worker -l debug -q embeddings --autoreload --concurrency 4 # dev +./manage.py bg_worker -l info -q embeddings --concurrency 4 # prod +``` + +`embeddings_worker` concurrency tunes parallelism against the embedding +endpoint. Recommended 4; raise if the endpoint has spare throughput, lower if +it rate-limits. The orchestrator does not run on this queue, so there is no +self-deadlock condition tied to concurrency on the `embeddings` queue. + +### 6.3 Priorities + +Procrastinate priority is "higher = sooner". Embedding work runs at lower +priority than extraction and subscription so it never starves user-driven LLM +operations. The orchestrator (`default` queue) and sub-tasks (`embeddings` +queue) share `EMBEDDING_INDEX_PRIORITY`; there is no separate backfill +priority because the backfill path is the same orchestrator. 
+ +| Task | Priority | +|---|---| +| `EXTRACTION_DEFAULT_PRIORITY` (existing) | 2 | +| `EXTRACTION_URGENT_PRIORITY` (existing) | 3 | +| `SUBSCRIPTION_DEFAULT_PRIORITY` (existing) | 3 | +| `SUBSCRIPTION_URGENT_PRIORITY` (existing) | 4 | +| `EMBEDDING_INDEX_PRIORITY` (new) | 0 | + +### 6.4 Models + +`radis/pgsearch/models.py` defines two new models inheriting `AnalysisJob` and +`AnalysisTask` (`radis/core/models.py:17,220`): + +```python +from radis.core.models import AnalysisJob, AnalysisTask + + +class EmbeddingJob(AnalysisJob): + default_priority = settings.EMBEDDING_INDEX_PRIORITY + urgent_priority = settings.EMBEDDING_INDEX_PRIORITY # no urgent variant + + def delay(self) -> None: + queued_job_id = app.configure_task( + "radis.pgsearch.tasks.process_embedding_job", + allow_unknown=False, + priority=self.default_priority, + ).defer(job_id=self.pk) + self.queued_job_id = queued_job_id + self.save() + + +class EmbeddingTask(AnalysisTask): + job = models.ForeignKey(EmbeddingJob, on_delete=models.CASCADE, related_name="tasks") + reports = models.ManyToManyField(Report, related_name="embedding_tasks") + + def delay(self) -> None: + queued_job_id = app.configure_task( + "radis.pgsearch.tasks.process_embedding_task", + allow_unknown=False, + priority=settings.EMBEDDING_INDEX_PRIORITY, + ).defer(task_id=self.pk) + self.queued_job_id = queued_job_id + self.save() +``` + +**Owner field.** `AnalysisJob.owner` is non-nullable (`settings.AUTH_USER_MODEL`). +Embedding jobs are system-driven and have no human creator. A data migration +(`0005_system_user.py`) creates a `User(username=settings.EMBEDDING_SYSTEM_USERNAME, +is_active=False, password=unusable)` idempotently; the launcher assigns this +user as `owner` on every `EmbeddingJob`. This avoids subclass-level overrides +of `owner` and keeps the abstract contract clean. + +**No `get_absolute_url` in v1.** Existing `ExtractionJob` and `SubscriptionJob` +implement `get_absolute_url` because they have user-facing detail views. 
+`EmbeddingJob` has no user-facing UI in v1 — operators inspect it via Django +admin (default `ModelAdmin` registration is sufficient). The inherited abstract +`AnalysisJob.get_absolute_url` body is `...`, returning `None`; no call site in +radis treats an `EmbeddingJob` like a user-facing analysis job. A future spec +can add the view and override the method. + +`urgent`, `send_finished_mail`, and `finished_mail_template` stay at their +`AnalysisJob` defaults (`False`, `False`, `None`). + +### 6.5 Launcher (the periodic task) + +`radis/pgsearch/tasks.py` (`timestamp` is optional so that a manual `embedding_launcher.defer()` from a shell, which passes no arguments, runs the same task): + +```python +@app.periodic(cron=settings.EMBEDDING_DRAIN_CRON) +@app.task( + queue="default", + queueing_lock="embedding_launcher", + pass_context=True, +) +def embedding_launcher(context, timestamp: int | None = None) -> None: + in_flight = EmbeddingJob.objects.filter( + status__in=[ + EmbeddingJob.Status.PREPARING, + EmbeddingJob.Status.PENDING, + EmbeddingJob.Status.IN_PROGRESS, + ] + ).exists() + if in_flight: + logger.info("EmbeddingJob already in flight; launcher tick is a no-op.") + return + + has_pending = ReportSearchVector.objects.filter(embedding__isnull=True).exists() + if not has_pending: + logger.debug("No reports pending embedding; launcher tick is a no-op.") + return + + system_user = User.objects.get(username=settings.EMBEDDING_SYSTEM_USERNAME) + job = EmbeddingJob.objects.create( + owner=system_user, + status=EmbeddingJob.Status.PREPARING, + ) + transaction.on_commit(job.delay) +``` + +**Two reinforcing layers of duplicate-dispatch prevention:** + +- **Procrastinate `queueing_lock="embedding_launcher"`.** While a launcher job + is waiting in the queue (`todo`), the next cron tick's `defer` call is rejected + with `AlreadyEnqueued` and skipped. The lock does not cover a job that is already executing (`doing`), but the launcher itself is + fast (one existence check + maybe one INSERT), so that unguarded window normally + lasts only milliseconds. 
+- **In-flight EmbeddingJob check.** Even if the queueing lock leaks (worker + crash mid-flight, manual `defer` from a shell, dashboard re-trigger), the + launcher's first action is to look for any `EmbeddingJob` in a non-terminal + status. If one exists, the launcher returns without creating another. This + is the same dedup pattern used by `process_extraction_job` when re-entered + (`extractions/tasks.py:46`). + +### 6.6 Orchestrator (`process_embedding_job`) + +```python +@app.task +def process_embedding_job(job_id: int) -> None: + job = EmbeddingJob.objects.get(id=job_id) + assert job.status == EmbeddingJob.Status.PREPARING + + # Retry/resume path: tasks already exist, re-enqueue still-pending ones. + if job.tasks.exists(): + tasks_to_enqueue = job.tasks.filter(status=EmbeddingTask.Status.PENDING) + else: + pending_ids_iter = ( + ReportSearchVector.objects + .filter(embedding__isnull=True) + .values_list("report_id", flat=True) + .iterator(chunk_size=10_000) + ) + batch: list[int] = [] + for report_id in pending_ids_iter: + batch.append(int(report_id)) + if len(batch) >= settings.EMBEDDING_BATCH_SIZE: + _create_embedding_task(job, batch) + batch = [] + if batch: + _create_embedding_task(job, batch) + + tasks_to_enqueue = job.tasks.filter(status=EmbeddingTask.Status.PENDING) + + job.status = EmbeddingJob.Status.PENDING + job.queued_job_id = None + job.save() + + for task in tasks_to_enqueue: + if not task.is_queued: + task.delay() + + +def _create_embedding_task(job: EmbeddingJob, report_ids: list[int]) -> EmbeddingTask: + task = EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.PENDING) + task.reports.set(Report.objects.filter(pk__in=report_ids)) + return task +``` + +Mirrors `process_extraction_job` (`extractions/tasks.py:32`). State transitions +follow the standard pattern: + +- `PREPARING` while tasks are being created (sub-tasks must not be dispatched yet). +- `PENDING` after task creation completes; sub-tasks are then enqueued. 
+- `IN_PROGRESS` / `SUCCESS` / `WARNING` / `FAILURE` driven by `update_job_state` + (inherited from `AnalysisJob`) called from each sub-task on completion. + +The orchestrator does no HTTP work. For 1M pending reports at +`EMBEDDING_BATCH_SIZE=32`, it creates ~31,250 `EmbeddingTask` rows and defers +them — well under a minute on the `default` worker. Its slot is freed +immediately after; long-running embedding work happens on the `embeddings` +worker. + +### 6.7 Sub-task (`process_embedding_task`) + +```python +@app.task(queue="embeddings") +def process_embedding_task(task_id: int) -> None: + task = EmbeddingTask.objects.get(id=task_id) + task.status = EmbeddingTask.Status.IN_PROGRESS + task.started_at = timezone.now() + task.attempts = task.attempts + 1 + task.save() + + client = EmbeddingClient() + try: + report_ids = list(task.reports.values_list("pk", flat=True)) + rsvs = list( + ReportSearchVector.objects + .filter(report_id__in=report_ids) + .select_related("report") + .only("id", "report_id", "report__body") + ) + texts = [rsv.report.body for rsv in rsvs] + vectors = client.embed_documents(texts) + for rsv, vec in zip(rsvs, vectors, strict=True): + rsv.embedding = vec + ReportSearchVector.objects.bulk_update(rsvs, fields=["embedding"]) + + task.status = EmbeddingTask.Status.SUCCESS + except EmbeddingClientError as exc: + logger.exception("Embedding task %s failed: %s", task_id, exc) + task.status = EmbeddingTask.Status.FAILURE + task.message = str(exc) + raise # Procrastinate retry policy applies + finally: + task.ended_at = timezone.now() + task.queued_job_id = None + task.save() + task.job.update_job_state() + client.close() +``` + +Raising on `EmbeddingClientError` +lets Procrastinate's retry policy apply. After retries exhaust, the exception +propagates, the task ends as `FAILURE`, and `update_job_state` is still called +from the `finally` block. The job finishes with status `WARNING` (some tasks +failed, some succeeded) or `FAILURE` (all failed). 
The next launcher tick will +create a fresh job that picks up any rows still NULL. + +### 6.8 No `post_save` signal for embeddings, no `backfill_embeddings` command + +The FTS `post_save` receiver (`create_or_update_report_search_vector`) stays — +it is what creates the `ReportSearchVector` row in the first place. There is +**no** corresponding embedding receiver: every ingest path eventually deposits +`embedding=NULL` rows into the DB, and the orchestrator drains them. + +There is no `backfill_embeddings` management command. Operators trigger an +immediate drain from a Django shell: + +```python +from radis.pgsearch.tasks import embedding_launcher +embedding_launcher.defer() +``` + +This goes through the same launcher → orchestrator → sub-task path as the +periodic tick; the only difference is who fires it. One code path, one set of +tests, one observable lifecycle. + +## 7. Hybrid search provider + +### 7.1 Universe and fusion + +The hybrid result universe is the **union** of two filter-bounded candidate sets: + +- **Vector top-K:** the `HYBRID_VECTOR_TOP_K` nearest rows by cosine distance to the query embedding, filtered by structured filters and `embedding IS NOT NULL`. *Not* constrained to the FTS hit set. +- **FTS hits:** all rows matching the tsquery and the structured filters, capped at `HYBRID_FTS_MAX_RESULTS`. + +A report appears in results if it is in **either** set. This is the change from the earlier draft, made because radiology queries like "no pneumothorax" must be able to surface reports that lexically don't match (the text search configuration drops "no" as a stop word before it ever reaches the GIN index) but are semantically related. + +Each report's score is plain Reciprocal Rank Fusion: + +``` +score(d) = (1 / (HYBRID_RRF_K + vec_rank[d]) if d ∈ vec_top_K else 0) + + (1 / (HYBRID_RRF_K + fts_rank[d]) if d ∈ fts_hits else 0) +``` + +Properties: + +- Reports in both sides receive two terms instead of one, so at comparable ranks they outrank reports found on only one side (a top-ranked single-side hit can still beat a low-ranked two-side hit). 
+- Vector contribution decays after rank K (no `vec_rank` entry), so the ordering naturally transitions from "hybrid head" to "FTS tail" with no explicit cutoff. +- A query with zero FTS hits returns `vec_top_K` ranked by vector position only — pure semantic search. +- A query with embedding failure returns FTS hits ranked by `ts_rank` only — the pre-hybrid behavior. + +### 7.2 `search()` flow + +```python +def search(s: Search) -> SearchResult: + query_str = _build_query_string(s.query) + language = _resolve_language(s.filters) + filter_q = _build_filter_query(s.filters) + tsquery = SearchQuery(query_str, search_type="raw", config=language) + + # Vector side + query_text = QueryParser.unparse(s.query) # same helper SearchView already uses + try: + query_vec = EmbeddingClient().embed_query(query_text) + except EmbeddingClientError as e: + logger.warning("Falling back to FTS-only: %s", e) + query_vec = None + + vec_rank: dict[int, int] = {} + if query_vec is not None: + ids = list( + ReportSearchVector.objects + .filter(filter_q) + .exclude(embedding__isnull=True) + .annotate(distance=CosineDistance("embedding", query_vec)) + .order_by("distance", "report_id") + .values_list("report_id", flat=True)[:settings.HYBRID_VECTOR_TOP_K] + ) + vec_rank = {rid: i + 1 for i, rid in enumerate(ids)} + + # FTS side + fts_rows = list( + ReportSearchVector.objects + .filter(filter_q) + .filter(search_vector=tsquery) + .annotate(rank=SearchRank(F("search_vector"), tsquery)) + .order_by("-rank", "report_id") + .values("report_id", "rank")[:settings.HYBRID_FTS_MAX_RESULTS] + ) + fts_rank = {row["report_id"]: i + 1 for i, row in enumerate(fts_rows)} + + # Fusion (pure Python, factored out for unit testing) + ordered_ids = _rrf_fuse(vec_rank, fts_rank, k=settings.HYBRID_RRF_K) + + total_count = len(ordered_ids) + total_relation = ( + "at_least" + if len(fts_rows) >= settings.HYBRID_FTS_MAX_RESULTS + or len(vec_rank) >= settings.HYBRID_VECTOR_TOP_K + else "exact" + ) + page_ids = 
ordered_ids[s.offset : s.offset + (s.limit or len(ordered_ids))] + + # Headline + hydration for the page slice only + page_rows = ( + ReportSearchVector.objects + .filter(report_id__in=page_ids) + .annotate( + summary=SearchHeadline("report__body", tsquery, config=language, + start_sel="", stop_sel="", + min_words=10, max_words=20, max_fragments=10), + rank=SearchRank(F("search_vector"), tsquery), + ) + .select_related("report") + ) + by_id = {r.report_id: r for r in page_rows} + documents = [ + document_from_pgsearch_response(_with_fallback_summary(by_id[rid])) + for rid in page_ids if rid in by_id + ] + return SearchResult(total_count=total_count, total_relation=total_relation, documents=documents) +``` + +### 7.3 Empty-summary fallback + +`SearchHeadline` returns an empty string when the document body has no FTS match (the vector-only hit case). `_with_fallback_summary` replaces an empty summary with the first 30 words of `report.body`. Trivial helper, ~5 lines. + +### 7.4 `retrieve()` + +Same fusion logic, returns an iterator of `report__document_id` in `ordered_ids` order. No headline. Used by `ExtractionJob` and `SubscriptionJob` to walk the matching id set. + +### 7.5 `count()` and `filter()` + +Unchanged. These operate on filters only and never call the embedding service. + +### 7.6 `ReportDocument.relevance` + +Kept as `ts_rank` for API backwards compatibility. RRF is an internal ordering signal and is not exposed on the public document type. RRF scores are logged at DEBUG for diagnostics. + +### 7.7 `search_provider.max_results` + +Updated to `max(HYBRID_VECTOR_TOP_K, HYBRID_FTS_MAX_RESULTS)`, which is what the `SearchView` page-bound check uses to reject impossibly-deep pagination. + +## 8. 
Configuration + +### 8.1 Env-driven (per-deployment, set in `.env`) + +```python +# radis/settings/base.py +EMBEDDING_BACKEND = env.str("EMBEDDING_BACKEND", default="openai") +EMBEDDING_PROVIDER_URL = env.str("EMBEDDING_PROVIDER_URL", default="") +EMBEDDING_PROVIDER_PATH = env.str("EMBEDDING_PROVIDER_PATH", default="") # "" = backend default +EMBEDDING_PROVIDER_API_KEY = env.str("EMBEDDING_PROVIDER_API_KEY", default="") +EMBEDDING_MODEL_NAME = env.str("EMBEDDING_MODEL_NAME", default="Qwen/Qwen3-Embedding-4B") +EMBEDDING_DIM = env.int("EMBEDDING_DIM", default=1024) +EMBEDDING_DRAIN_CRON = env.str("EMBEDDING_DRAIN_CRON", default="0 2 * * *") +``` + +These vary across dev/staging/prod and are operator-controlled. `EMBEDDING_DIM` is intentionally an env decision because it is schema-coupled (see §4.5). `EMBEDDING_DRAIN_CRON` is env-tunable so dev environments can drain more frequently (e.g., `*/15 * * * *`) without a code change. + +### 8.2 Code constants (tuning knobs, in `base.py`) + +```python +EMBEDDING_REQUEST_TIMEOUT = 30 # seconds +EMBEDDING_MAX_INPUT_CHARS = 60_000 +EMBEDDING_QUERY_INSTRUCTION = ( + "Instruct: Given a radiology search query, retrieve relevant radiology reports.\n" + "Query: " +) +EMBEDDING_BATCH_SIZE = 32 + +EMBEDDING_INDEX_PRIORITY = 0 +EMBEDDING_SYSTEM_USERNAME = "system" + +HYBRID_VECTOR_TOP_K = 100 +HYBRID_FTS_MAX_RESULTS = 10_000 +HYBRID_RRF_K = 60 +``` + +These are tuning constants. Changing them is a code change with a PR diff. This matches the project's existing pattern (`EXTRACTION_LLM_CONCURRENCY_LIMIT = 6`, the `CHAT_*_SYSTEM_PROMPT` blocks). `EMBEDDING_SYSTEM_USERNAME` names the system user that owns every auto-generated `EmbeddingJob`; the data migration creates this user idempotently. + +### 8.3 `example.env` + +Adds a documented Ollama block and a Qwen/OpenAI-compatible block side by side, keyed off `EMBEDDING_BACKEND`. 
Documents `EMBEDDING_DRAIN_CRON` with the production default (`0 2 * * *`) and a dev-friendly alternative (`*/15 * * * *`). + +### 8.4 Compose + +`docker-compose.base.yml`: + +- New service `embeddings_worker` inheriting `*default-app`. +- The `EMBEDDING_BACKEND`, `EMBEDDING_PROVIDER_URL`, `EMBEDDING_PROVIDER_PATH`, `EMBEDDING_PROVIDER_API_KEY`, `EMBEDDING_MODEL_NAME`, `EMBEDDING_DIM`, and `EMBEDDING_DRAIN_CRON` env keys are added to the `&default-app` block so all services see them. + +`docker-compose.dev.yml`: + +- `embeddings_worker.command`: `bash -c "wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} && ./manage.py bg_worker -l debug -q embeddings --autoreload --concurrency 4"`. + +`docker-compose.prod.yml`: + +- Same without `--autoreload`, log level `info` (keeps `--concurrency 4`, see §6.2). + +## 9. Error handling and degradation + +| Failure | Behavior | Logging | +|---|---|---| +| Embedding service returns 5xx/timeout during query-time | `query_vec = None`; result list ordered by FTS-only; request succeeds | WARNING with request id | +| Embedding service returns 4xx during query-time | Same FTS-only fallback (treats as misconfig at request layer) | ERROR | +| Embedding service returns malformed body | `EmbeddingClientError` raised; query falls back to FTS-only | ERROR | +| Embedding service down during a sub-task | `process_embedding_task` raises; Procrastinate retries with exponential backoff; `embedding` stays NULL | WARNING per attempt, ERROR after final retry | +| Launcher fires while EmbeddingJob is `PREPARING`/`PENDING`/`IN_PROGRESS` | Status check returns immediately; tick is a no-op | INFO | +| Orchestrator crashes during task creation (partial dispatch) | Job stays in `PREPARING`. Next launcher tick sees in-flight job and no-ops. Operator marks job `FAILURE` in admin to allow a fresh run | ERROR + operator action | +| Sub-task fails after Procrastinate retries exhausted | Task ends as `FAILURE`. `update_job_state` rolls the job to `WARNING` (some tasks succeeded) or `FAILURE` (all failed). 
NULL rows remain; next launcher creates a new job to retry them | ERROR | +| `embeddings_worker` saturation | Sub-tasks queue up; orchestrator already returned. No deadlock; just slower drain | DEBUG | +| Report body > `EMBEDDING_MAX_INPUT_CHARS` | Truncate, embed truncated text | WARNING with report_id and char count | +| Report deleted between task creation and execution | Sub-task's `task.reports.values_list(...)` returns fewer rows; `embed_documents` called on smaller list; no error | DEBUG | +| Vector dim mismatch on write | Postgres raises; sub-task fails, retried | ERROR — escalate to admin | +| `EMBEDDING_PROVIDER_URL` empty at startup | `EmbeddingClient` construction defers to call site; calls log + raise; query falls back to FTS-only | WARNING once on first request | +| System user missing (data migration didn't run) | Launcher raises `User.DoesNotExist`. Loud failure; deployment misconfiguration. Fix: run migrations | ERROR | + +**Deliberate non-policies:** + +- The product never fails a search request because the embedding service is down. It degrades to FTS-only. +- Query embeddings are not cached. The complexity and freshness trade-off is not worth it at the corpora sizes RADIS targets. +- `EmbeddingClient` does not retry internally. Procrastinate retries the whole task; the query path uses a single shot. + +**Observability:** + +- Provider logs at DEBUG: vec hit count, FTS hit count, intersection count, fusion ms, query-embed ms. +- `process_embedding_task` logs at INFO: batch size, total chars, latency, success/retry counts. +- `embedding_launcher` and `process_embedding_job` log status transitions and dispatch counts at INFO. +- Operators inspect job/task state via Django admin (`EmbeddingJob`, `EmbeddingTask` use the default `ModelAdmin`). +- The existing OpenTelemetry overlay (commit `653e0c67`) tags telemetry per service; `embeddings_worker` shows up automatically. + +## 10. 
Testing strategy + +### 10.1 Unit tests (no DB) + +| File | Coverage | +|---|---| +| `tests/unit/test_embedding_client.py` | Backend payload/response round-trip, path override, instruction prefix, normalization, dim validation, all error modes, truncation | +| `tests/unit/test_provider_fusion.py` | `_rrf_fuse(vec_rank, fts_rank, k)` pure-Python helper: disjoint, overlapping, FTS-only, vector-only, both-empty, tiebreak by report_id | +| `tests/unit/test_embedding_launcher.py` | No-op when EmbeddingJob already in flight; no-op when no rows pending; happy path creates job and calls `delay`; raises if system user missing | +| `tests/unit/test_process_embedding_job.py` | Batches pending reports into `EmbeddingTask` rows of size `EMBEDDING_BATCH_SIZE`; status transitions `PREPARING` → `PENDING`; retry/resume path re-enqueues only `PENDING` tasks; empty pool exits cleanly | +| `tests/unit/test_process_embedding_task.py` | Embeds reports, writes vectors, sets status `SUCCESS`; status `FAILURE` and re-raise on `EmbeddingClientError`; calls `job.update_job_state` in both paths; clears `queued_job_id` | + +### 10.2 Integration tests (real Postgres + pgvector) + +| File | Coverage | +|---|---| +| `tests/integration/test_migrations.py` (new, `django-test-migrations`) | Extension migration runs; column + HNSW index created with configured dim; reverse works | +| `tests/integration/test_provider_hybrid.py` (new) | FTS-only hit, vector-only hit ("no pneumothorax" fixture), both-sides hit, filter honoring, stable pagination, embedding-service-down fallback, NULL-embedding rows still returned, `ts_headline` query-count bounded to page, empty-summary fallback | + +Factories: existing `ReportSearchVectorFactory` gains optional `embedding` kwarg (default `None`). New `ReportSearchVectorWithEmbeddingFactory` generates deterministic normalized vectors of the configured dim from a seed. Real Qwen3 embeddings are not used in tests. 
+ +### 10.3 View-level smoke + +`radis/search/tests/test_views.py` (extend): + +- Search request with hybrid enabled returns 200 and renders documents. +- Search request with `EMBEDDING_PROVIDER_URL=""` returns 200 (FTS-only path). + +### 10.4 Acceptance (`@pytest.mark.acceptance`) + +One end-to-end test against the dev containers, with the embedding service stubbed (either a small in-test FastAPI or a recorded fixture response), verifying the search page returns hybrid results. Marked acceptance so it's opt-in like the existing acceptance suite. + +### 10.5 Explicitly not tested + +- Live Qwen3 retrieval quality (offline eval, out of scope). +- pgvector HNSW recall under specific data shapes (extension's responsibility). +- Wire formats beyond the two supported backends. + +## 11. Known limitations and future work + +### 11.1 Negation / polarity (the "no pneumothorax" problem) + +Dense embedding models — including Qwen3-Embedding — embed semantically opposite phrases close together. "No pneumothorax" and "pneumothorax present" produce nearby vectors, so the vector half of the hybrid score is *polarity-blind*. The FTS half partly compensates by allowing the user to construct explicit AND-NOT queries, but Postgres' GIN index drops "no" as a stop word, so a naive query like `no pneumothorax` is effectively `pneumothorax` on the FTS side. + +This is a real concern for radiology, where negated findings are pervasive ("no acute …", "no evidence of …", "no significant …"). **Hybrid search as designed here does not solve this.** It is documented as an accepted limitation of v1, and a v2 conversation should address it. + +Candidate solutions to evaluate in a future spec (none committed): + +- A cross-encoder re-ranker over the top-N hybrid results (e.g., a small instruction-tuned model that knows to score "no X" against "X present" as opposite). 
+- Adding a sparse/late-interaction model (SPLADE, ColBERT) alongside the dense vector — sparse models preserve token-level polarity. +- Negation-aware query preprocessing: detect negation, route to a different retrieval mode, or expand to phrasal `AND-NOT` clauses on the FTS side that bypass the stop-word filter (e.g., search the raw body, not the tsvector). +- Structured-findings indexing: have the LLM extract presence/absence flags per finding category at ingest time, search those structured fields instead of (or in addition to) prose. + +### 11.2 Dimension changes are manual + +See §4.5. + +### 11.3 GGUF dev embeddings ≠ bf16 prod embeddings + +Documented in §5.4. Mitigated by deferring `embedding_launcher` after a model swap (see §4.5). The next drain re-embeds everything. + +### 11.4 No body-change detection for re-embedding + +V1 re-embeds anything where `embedding IS NULL`. A future optimization could +track whether the body actually changed (e.g., a `body_hash` column on +`ReportSearchVector` updated only on body changes) so metadata-only updates +don't have to null the embedding. Not in v1; profiling will tell us whether it +matters. + +### 11.5 Operator-aware queries: FTS / vector asymmetry + +Both halves of hybrid search receive a derivation of the same parsed `QueryNode`, but interpret it through completely different machinery. The FTS side consumes a `tsquery` built by `_build_query_string` where `AND`, `OR`, `NOT`, quoted phrases, and parens are first-class boolean operators (`&`, `|`, `!`, `<->`, `()`). The vector side consumes the canonical unparsed string and feeds it whole to the embedding model as natural language; the operators become ordinary word tokens that the model has no operator-aware machinery to interpret. + +Practical consequences: + +- **Natural-phrase queries** (`pneumothorax`, `chest x-ray`, implicit-AND `cardiac arrest`) — both halves point the same direction. RRF amplifies the agreement. 
This is the workload hybrid search is best at. +- **`A AND B`** — FTS strictly intersects; vector returns docs about a topic-mix of A and B (which usually includes some single-side hits). Docs matching both lexically *and* semantically rank highest, which is the desired outcome. Vector contributes useful expansion but not boolean precision. +- **`A OR B`** — FTS unions; the vector half has no concept of disjunction and just produces a centroid-style embedding. Docs about either A or B that happen to be near the centroid still get retrieved, but a doc purely about A may not appear unless it's also close to the centroid. Vector half degrades from "asset" to "noise". +- **`NOT X`** — sharpest conflict. FTS correctly returns docs without X. Dense embeddings are polarity-blind, so the vector for `"NOT X"` clusters next to the vector for `"X"` and the top-K nearest neighbours are docs *about* X — the polar opposite of what the user asked for. The two halves return nearly disjoint sets that RRF interleaves, producing actively misleading results rather than mere noise. (Distinct from §11.1, which is about natural-language negation like `no pneumothorax` where the FTS stop-word strip happens to align the halves accidentally.) + +**Candidate mitigation (not in v1, recommended follow-up):** strip negated branches from the query string before embedding. Walk the AST; when a `UnaryNode("NOT", X)` is encountered, drop `X` from the string passed to the embedding model. The FTS side still gets the full structure. Outcomes: + +- `NOT X` alone → vector receives an empty query and is skipped; provider falls back to FTS-only ranking. Correct. +- `A AND NOT B` → vector embeds just `A`; FTS enforces `A & !B`. Vector adds positive semantic signal for A, FTS enforces the exclusion. The halves are aligned again. + +This is ~15 lines of code in `providers.search()` / `providers.retrieve()` and a small extension to `QueryParser` for the AST walk. 
Other candidates (negation-aware re-ranker, embedding subtraction, sparse models like SPLADE-NEG) are heavier and listed in §11.1. + +**Why a re-ranker alone cannot fix this.** A cross-encoder re-ranker improves precision *within the candidate pool it is given* — it cannot improve recall of that pool. For `NOT pneumothorax` over a 1000-doc corpus where 600 docs don't mention the word, the hybrid candidate pool is poisoned: ~100 wrong docs (pneumothorax-discussing reports pulled in by the polarity-blind vector half) displace 100 of the 600 correct docs from the top-N positions. After re-ranking top-20, the head of results is sharper, but ~590 correct docs still live below the re-ranker's cutoff at their original RRF positions, interleaved with the remaining 90 wrong docs. The architecturally correct order is to fix recall upstream (strip negated branches before embedding, restoring a clean candidate pool) and *then* layer a re-ranker for precision. A re-ranker without the upstream fix is rearranging deck chairs on a polluted pool. + +### 11.6 Cross-encoder re-ranker (deferred) + +A planned follow-up adds a re-ranker stage between hybrid fusion and result hydration to lift precision (especially on operator-light natural-phrase queries, where the candidate pool is already correct but RRF ordering is mediocre) and to partially compensate for §11.1's polarity blindness. Two backend patterns are under consideration: + +- **Pointwise cross-encoder via vLLM.** Qwen3-Reranker-4B served with `vllm serve … --task score` exposes `/v1/rerank` (Cohere/TEI shape: `{model, query, documents}` → `[{index, relevance_score}]`). Logit-based scoring (yes/no token logits → softmax) gives graded relevance in [0,1]. Latency ~30–100 ms per pair on a single GPU; for top-20 candidates that's ~0.5–1.5 s added. +- **Listwise LLM re-ranker** via the existing OpenAI-compatible chat-completions endpoint. 
The LLM is prompted with the query and the top-N candidates packed into a single message; structured output (`response_format=json_object`) returns a ranked list of indices. One HTTP call per query rather than N. Latency ~1–3 s for top-20 depending on model size. Quality trades off graded precision for the LLM's strong instruction-following — particularly the explicit "respect negation" cue, which the pointwise reranker has to learn implicitly. + +vLLM is the recommended production host for the pointwise path because Ollama (as of mid-2025) does not expose token logits cleanly, which collapses Qwen3-Reranker to a binary 1.0/0.0 signal and loses graded ordering. Ollama can still serve the LLM listwise backend without issue. + +### 11.7 Evaluation strategy for the layered hybrid stack + +Six profiles cover the additive layers: + +| Profile | Negation strip (§11.5) | Re-ranker (§11.6) | +|---|---|---| +| `baseline` | off | off | +| `strip` | on | off | +| `rerank-qwen` | off | Qwen3-Reranker via vLLM | +| `rerank-llm` | off | listwise LLM | +| `both-qwen` | on | Qwen3-Reranker via vLLM | +| `both-llm` | on | listwise LLM | + +A `run_search_eval` management command loops a set of test queries through all six profiles (toggling settings via `override_settings`) and dumps comparable JSON output with top-N docs, per-layer scores (`ts_rank`, `cosine_distance`, `rrf_score`, `rerank_score`), and per-profile latencies. + +**Labeling.** Per-pair LLM relevance judgment ("is doc D relevant to query Q?") is unreliable for radiology because (a) it inherits the same polarity blind spot the system is trying to evaluate, and (b) it introduces circular bias when the labeling LLM and re-ranker LLM share a family. 
The preferred approach is *concept-based polarity-aware labeling*: label each report once per clinical concept with `PRESENT` / `ABSENT` / `NOT_MENTIONED`, then derive query relevance deterministically (`pneumothorax` → `PRESENT ∪ ABSENT`; `NOT pneumothorax` → `NOT_MENTIONED ∪ ABSENT` for strict exclusion, or `ABSENT` only for "rule-out" semantics). The concept labels are reusable across many queries and survive prompt/model changes. The upstream label-filter work in PR #196 produces structured labels with comparable semantics and is the intended source of ground truth for production-scale evaluation. + +## 12. Rollout plan + +1. **Schema and dependency.** Land the `pgvector` Python dep, the extension migration, and the embedding-column schema migration. No behavior change yet — `embedding` is nullable, queries still see only FTS. +2. **Embedding client and tests.** Land the client module and unit tests. No callers yet. +3. **Orchestrator models and migrations.** Add `EmbeddingJob`, `EmbeddingTask`, their migration, and the data migration that creates the system user. +4. **Orchestrator tasks and `embeddings_worker`.** Land `embedding_launcher`, `process_embedding_job`, `process_embedding_task`, the `embeddings_worker` container (with `--concurrency 4`), and the `EMBEDDING_DRAIN_CRON` setting. The launcher starts ticking; with no rows yet, all ticks no-op. +5. **Initial drain.** From a shell, run `embedding_launcher.defer()` so the orchestrator picks up the existing corpus. This is the only "operator action" in the rollout. It runs at `EMBEDDING_INDEX_PRIORITY` and lives behind whatever other work is on the queues; it can run for hours to days on a large corpus. +6. **Provider switch.** Replace the body of `radis.pgsearch.providers.search()` and `retrieve()` with the hybrid implementation. At this point hybrid is the new default; rows still missing an embedding participate via the FTS half only. +7. 
**Monitor.** Watch search latency p95, embedding-queue depth, `EmbeddingJob` admin state, and the rate of "FTS-only fallback" warnings. Tune `HYBRID_VECTOR_TOP_K` / `HYBRID_FTS_MAX_RESULTS` if needed. + +Each step is independently mergeable; steps 1–4 ship as quiet infrastructure changes with no user-visible effect, step 5 starts populating the column, step 6 is the moment hybrid goes live. From 0822336861e1ebab11759231a83b0cb10b9f2612 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 16:31:34 +0000 Subject: [PATCH 36/68] docs(pgsearch): add embedding orchestrator implementation plan Phased plan covering EmbeddingJob/EmbeddingTask models, system-user data migration, launcher/orchestrator/sub-task wiring, removal of the post_save embedding signal + embed_reports + backfill_embeddings command, and embeddings_worker --concurrency 4. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-28-embedding-orchestrator.md | 1047 +++++++++++++++++ 1 file changed, 1047 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-28-embedding-orchestrator.md diff --git a/docs/superpowers/plans/2026-05-28-embedding-orchestrator.md b/docs/superpowers/plans/2026-05-28-embedding-orchestrator.md new file mode 100644 index 00000000..6b5339a8 --- /dev/null +++ b/docs/superpowers/plans/2026-05-28-embedding-orchestrator.md @@ -0,0 +1,1047 @@ +# Embedding Orchestrator Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Replace the `post_save`-driven `embed_reports` task and `backfill_embeddings` command with a periodic `EmbeddingJob`/`EmbeddingTask` orchestrator that batches pending embeddings without per-API-call job amplification. 
+ +**Architecture:** Mirror `ExtractionJob`/`ExtractionTask` (`radis/extractions/tasks.py:32`) and `subscription_launcher` (`radis/subscriptions/tasks.py:115`). A periodic `embedding_launcher` on `default` queue creates one `EmbeddingJob` (system-owned) per drain; `process_embedding_job` (also `default`) batches `ReportSearchVector` rows with `embedding IS NULL` into `EmbeddingTask` rows and dispatches them; `process_embedding_task` (on `embeddings` queue) calls `EmbeddingClient`, `bulk_update`s the vectors, and rolls status up via `AnalysisJob.update_job_state`. + +**Tech Stack:** Django 5.1, Procrastinate (periodic tasks + `queueing_lock`), pgvector, pytest-django. + +**Spec:** `docs/superpowers/specs/2026-05-15-hybrid-search-design.md` §6. + +**Branch:** `feat/hybrid-search` (continue here; no worktree required). + +--- + +## File Structure + +**Files to create:** + +| Path | Responsibility | +|---|---| +| `radis/pgsearch/migrations/0004_embedding_job_task.py` | Schema migration for `EmbeddingJob`, `EmbeddingTask`, `EmbeddingTask.reports` M2M | +| `radis/pgsearch/migrations/0005_system_user.py` | Data migration that idempotently creates the system user | +| `radis/pgsearch/tests/test_models_embedding.py` | Model-level tests: status defaults, owner FK, M2M | +| `radis/pgsearch/tests/test_embedding_launcher.py` | Unit tests for `embedding_launcher` | +| `radis/pgsearch/tests/test_process_embedding_job.py` | Unit tests for `process_embedding_job` | +| `radis/pgsearch/tests/test_process_embedding_task.py` | Unit tests for `process_embedding_task` | +| `radis/pgsearch/tests/test_migrations_system_user.py` | Test for the data migration | + +**Files to modify:** + +| Path | Change | +|---|---| +| `radis/settings/base.py:341-365` | Add `EMBEDDING_DRAIN_CRON`, `EMBEDDING_SYSTEM_USERNAME`; remove `EMBEDDING_BACKFILL_PRIORITY` (last) | +| `radis/pgsearch/models.py` | Add `EmbeddingJob` and `EmbeddingTask` model classes | +| `radis/pgsearch/tasks.py` | Replace contents: add 
`embedding_launcher`, `process_embedding_job`, `process_embedding_task`; remove `embed_reports` and `enqueue_embed_reports` | +| `radis/pgsearch/signals.py` | Remove `enqueue_report_embedding` receiver (lines 19-23); keep the FTS receiver | +| `radis/pgsearch/tests/test_signals.py` | Delete the two embedding-signal tests; the file becomes empty and is deleted | +| `docker-compose.dev.yml:85-92` | Add `--concurrency 4` to `embeddings_worker` command | +| `docker-compose.prod.yml:80-88` | Add `--concurrency 4` to `embeddings_worker` command | + +**Files to delete:** + +| Path | Reason | +|---|---| +| `radis/pgsearch/management/commands/backfill_embeddings.py` | Replaced by `embedding_launcher.defer()` from a shell | +| `radis/pgsearch/tests/test_backfill_command.py` | Tests for the deleted command | +| `radis/pgsearch/tests/test_embed_reports_task.py` | Tests for the deleted `embed_reports` task | +| `radis/pgsearch/tests/test_signals.py` | Whole file is deleted once the embedding tests are removed | + +--- + +## Task 1: Add new settings (additive only) + +**Files:** +- Modify: `radis/settings/base.py:341-365` +- Modify: `example.env` + +`EMBEDDING_BACKFILL_PRIORITY` stays for now — it is removed in Task 10 after every caller is gone. + +- [ ] **Step 1: Add `EMBEDDING_DRAIN_CRON` and `EMBEDDING_SYSTEM_USERNAME` to settings** + +Edit `radis/settings/base.py`. Add after line 347 (after `EMBEDDING_DIM = env.int(...)`): + +```python +EMBEDDING_DRAIN_CRON = env.str("EMBEDDING_DRAIN_CRON", default="0 2 * * *") +``` + +Add after line 360 (the `EMBEDDING_BACKFILL_PRIORITY` line): + +```python +EMBEDDING_SYSTEM_USERNAME = "system" +``` + +- [ ] **Step 2: Document the env var in `example.env`** + +Append to the `EMBEDDING_*` block in `example.env`: + +``` +# Cron expression for the embedding orchestrator. Default nightly at 02:00. +# Use "*/15 * * * *" for more aggressive dev draining. 
+EMBEDDING_DRAIN_CRON=0 2 * * * +``` + +- [ ] **Step 3: Verify Django config loads** + +Run: `uv run cli shell -c "from django.conf import settings; print(settings.EMBEDDING_DRAIN_CRON, settings.EMBEDDING_SYSTEM_USERNAME)"` +Expected: prints `0 2 * * * system` + +- [ ] **Step 4: Commit** + +```bash +git add radis/settings/base.py example.env +git commit -m "feat(pgsearch): add EMBEDDING_DRAIN_CRON and EMBEDDING_SYSTEM_USERNAME settings" +``` + +--- + +## Task 2: Add `EmbeddingJob` and `EmbeddingTask` models + +**Files:** +- Modify: `radis/pgsearch/models.py` +- Create: `radis/pgsearch/migrations/0004_embedding_job_task.py` +- Create: `radis/pgsearch/tests/test_models_embedding.py` + +- [ ] **Step 1: Write the failing model tests** + +Create `radis/pgsearch/tests/test_models_embedding.py`: + +```python +import pytest +from django.contrib.auth import get_user_model + +from radis.pgsearch.models import EmbeddingJob, EmbeddingTask +from radis.reports.factories import ReportFactory + +User = get_user_model() +pytestmark = pytest.mark.django_db + + +def _system_user() -> "User": + return User.objects.get_or_create(username="system", defaults={"is_active": False})[0] + + +def test_embedding_job_defaults(): + job = EmbeddingJob.objects.create(owner=_system_user()) + assert job.status == EmbeddingJob.Status.UNVERIFIED + assert job.urgent is False + assert job.send_finished_mail is False + assert job.queued_job_id is None + + +def test_embedding_task_links_to_reports(): + job = EmbeddingJob.objects.create(owner=_system_user()) + reports = [ReportFactory.create() for _ in range(3)] + task = EmbeddingTask.objects.create(job=job) + task.reports.set(reports) + assert task.status == EmbeddingTask.Status.PENDING + assert set(task.reports.values_list("pk", flat=True)) == {r.pk for r in reports} + assert task.attempts == 0 + assert task.queued_job_id is None +``` + +- [ ] **Step 2: Run tests — expect ImportError** + +Run: `uv run pytest radis/pgsearch/tests/test_models_embedding.py -v` +Expected: FAIL — 
`ImportError: cannot import name 'EmbeddingJob'` + +- [ ] **Step 3: Add models to `radis/pgsearch/models.py`** + +Append to `radis/pgsearch/models.py`: + +```python +from django.urls import reverse +from procrastinate.contrib.django import app +from procrastinate.contrib.django.models import ProcrastinateJob + +from radis.core.models import AnalysisJob, AnalysisTask + + +class EmbeddingJob(AnalysisJob): + default_priority = settings.EMBEDDING_INDEX_PRIORITY + urgent_priority = settings.EMBEDDING_INDEX_PRIORITY + + queued_job_id: int | None + queued_job = models.OneToOneField( + ProcrastinateJob, null=True, on_delete=models.SET_NULL, related_name="+" + ) + + tasks: models.QuerySet["EmbeddingTask"] + + class Meta: + ordering = ["-created_at"] + + def __str__(self) -> str: + return f"EmbeddingJob [{self.pk}]" + + def delay(self) -> None: + queued_job_id = app.configure_task( + "radis.pgsearch.tasks.process_embedding_job", + allow_unknown=False, + priority=self.default_priority, + ).defer(job_id=self.pk) + self.queued_job_id = queued_job_id + self.save() + + +class EmbeddingTask(AnalysisTask): + job = models.ForeignKey[EmbeddingJob]( + EmbeddingJob, on_delete=models.CASCADE, related_name="tasks" + ) + reports = models.ManyToManyField(Report, related_name="embedding_tasks") + + def delay(self) -> None: + queued_job_id = app.configure_task( + "radis.pgsearch.tasks.process_embedding_task", + allow_unknown=False, + priority=settings.EMBEDDING_INDEX_PRIORITY, + ).defer(task_id=self.pk) + self.queued_job_id = queued_job_id + self.save() +``` + +- [ ] **Step 4: Generate the migration** + +Run: `uv run cli shell -c "from django.core.management import call_command; call_command('makemigrations', 'pgsearch', name='embedding_job_task')"` +Expected: creates `radis/pgsearch/migrations/0004_embedding_job_task.py` containing `CreateModel` operations for `EmbeddingJob`, `EmbeddingTask`, and the M2M through-table. 
+ +- [ ] **Step 5: Apply the migration and re-run tests** + +Run: `uv run cli shell -c "from django.core.management import call_command; call_command('migrate', 'pgsearch')"` +Then: `uv run pytest radis/pgsearch/tests/test_models_embedding.py -v` +Expected: PASS + +- [ ] **Step 6: Commit** + +```bash +git add radis/pgsearch/models.py radis/pgsearch/migrations/0004_embedding_job_task.py radis/pgsearch/tests/test_models_embedding.py +git commit -m "feat(pgsearch): add EmbeddingJob and EmbeddingTask models" +``` + +--- + +## Task 3: Create the system user via data migration + +**Files:** +- Create: `radis/pgsearch/migrations/0005_system_user.py` +- Create: `radis/pgsearch/tests/test_migrations_system_user.py` + +- [ ] **Step 1: Write the failing migration test** + +Create `radis/pgsearch/tests/test_migrations_system_user.py`: + +```python +import pytest +from django.contrib.auth import get_user_model + +User = get_user_model() + + +@pytest.mark.django_db +def test_system_user_exists_after_migrations(): + user = User.objects.get(username="system") + assert user.is_active is False + assert not user.has_usable_password() + + +@pytest.mark.django_db +def test_creating_system_user_twice_is_a_noop(): + from radis.pgsearch.migrations import _system_user_helper + + before = User.objects.filter(username="system").count() + _system_user_helper.create_system_user_idempotent(User) + after = User.objects.filter(username="system").count() + assert before == after == 1 +``` + +- [ ] **Step 2: Run tests — expect failure** + +Run: `uv run pytest radis/pgsearch/tests/test_migrations_system_user.py -v` +Expected: FAIL — system user does not exist yet OR ImportError on `_system_user_helper`. 
+ +- [ ] **Step 3: Create the helper module** + +Create `radis/pgsearch/migrations/_system_user_helper.py`: + +```python +from django.conf import settings +from django.contrib.auth.hashers import make_password + + +def create_system_user_idempotent(user_model) -> None: + # Historical models handed to RunPython lack AbstractBaseUser methods such + # as set_unusable_password(), so store an unusable password hash directly. + user_model.objects.get_or_create( + username=settings.EMBEDDING_SYSTEM_USERNAME, + defaults={"is_active": False, "password": make_password(None)}, + ) +``` + +- [ ] **Step 4: Create the data migration** + +Create `radis/pgsearch/migrations/0005_system_user.py`: + +```python +from django.conf import settings +from django.db import migrations + +from radis.pgsearch.migrations._system_user_helper import create_system_user_idempotent + + +def forwards(apps, schema_editor): + User = apps.get_model(*settings.AUTH_USER_MODEL.split(".")) + create_system_user_idempotent(User) + + +class Migration(migrations.Migration): + dependencies = [ + ("pgsearch", "0004_embedding_job_task"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + operations = [migrations.RunPython(forwards, reverse_code=migrations.RunPython.noop)] +``` + +- [ ] **Step 5: Apply migration and run tests** + +Run: `uv run cli shell -c "from django.core.management import call_command; call_command('migrate', 'pgsearch')"` +Then: `uv run pytest radis/pgsearch/tests/test_migrations_system_user.py -v` +Expected: PASS + +- [ ] **Step 6: Commit** + +```bash +git add radis/pgsearch/migrations/0005_system_user.py radis/pgsearch/migrations/_system_user_helper.py radis/pgsearch/tests/test_migrations_system_user.py +git commit -m "feat(pgsearch): add data migration for system user" +``` + +--- + +## Task 4: Implement `process_embedding_task` (sub-task) + +**Files:** +- Modify: `radis/pgsearch/tasks.py` +- Create: `radis/pgsearch/tests/test_process_embedding_task.py` + +The existing `embed_reports` task and its helper stay in place for now — they are removed in Task 8. This task adds the new sub-task alongside. 
+ +- [ ] **Step 1: Write the failing tests** + +Create `radis/pgsearch/tests/test_process_embedding_task.py`: + +```python +from unittest.mock import MagicMock, patch + +import numpy as np +import pytest +from django.contrib.auth import get_user_model + +from radis.pgsearch.models import EmbeddingJob, EmbeddingTask, ReportSearchVector +from radis.pgsearch.tasks import process_embedding_task as _wrapped +from radis.pgsearch.utils.embedding_client import EmbeddingClientError +from radis.reports.factories import ReportFactory + +User = get_user_model() +process_embedding_task = _wrapped.__wrapped__ # type: ignore[attr-defined] +pytestmark = pytest.mark.django_db + + +def _make_task() -> EmbeddingTask: + owner = User.objects.get(username="system") + job = EmbeddingJob.objects.create(owner=owner) + task = EmbeddingTask.objects.create(job=job) + reports = [ReportFactory.create() for _ in range(2)] + task.reports.set(reports) + return task + + +def _unit_vec(dim: int) -> list[float]: + v = np.ones(dim, dtype=np.float32) + return (v / np.linalg.norm(v)).tolist() + + +def test_process_embedding_task_writes_vectors_and_marks_success(settings): + task = _make_task() + vec = _unit_vec(settings.EMBEDDING_DIM) + fake_client = MagicMock() + fake_client.embed_documents.return_value = [vec, vec] + with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake_client): + process_embedding_task(task.id) + + task.refresh_from_db() + assert task.status == EmbeddingTask.Status.SUCCESS + assert task.queued_job_id is None + for report in task.reports.all(): + rsv = ReportSearchVector.objects.get(report=report) + assert rsv.embedding is not None + + +def test_process_embedding_task_failure_sets_status_and_raises(): + task = _make_task() + fake_client = MagicMock() + fake_client.embed_documents.side_effect = EmbeddingClientError("boom") + with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake_client): + with pytest.raises(EmbeddingClientError): + 
process_embedding_task(task.id) + + task.refresh_from_db() + assert task.status == EmbeddingTask.Status.FAILURE + assert task.queued_job_id is None + assert "boom" in task.message + + +def test_process_embedding_task_calls_update_job_state(settings): + task = _make_task() + vec = _unit_vec(settings.EMBEDDING_DIM) + fake_client = MagicMock() + fake_client.embed_documents.return_value = [vec, vec] + with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake_client): + process_embedding_task(task.id) + + task.job.refresh_from_db() + # All tasks succeeded; AnalysisJob.update_job_state rolls up to SUCCESS. + assert task.job.status == EmbeddingJob.Status.SUCCESS +``` + +- [ ] **Step 2: Run tests — expect ImportError** + +Run: `uv run pytest radis/pgsearch/tests/test_process_embedding_task.py -v` +Expected: FAIL — `ImportError: cannot import name 'process_embedding_task'` + +- [ ] **Step 3: Add `process_embedding_task` to `radis/pgsearch/tasks.py`** + +Append to `radis/pgsearch/tasks.py` (existing imports already cover `logger`, `EmbeddingClient`, `ReportSearchVector`, `app`, `django_settings`): + +```python +from django.utils import timezone + +from .models import EmbeddingTask +from .utils.embedding_client import EmbeddingClientError + + +@app.task(queue="embeddings") +def process_embedding_task(task_id: int) -> None: + task = EmbeddingTask.objects.get(id=task_id) + task.status = EmbeddingTask.Status.IN_PROGRESS + task.started_at = timezone.now() + task.attempts = task.attempts + 1 + task.save() + + client = EmbeddingClient() + try: + report_ids = list(task.reports.values_list("pk", flat=True)) + rsvs = list( + ReportSearchVector.objects + .filter(report_id__in=report_ids) + .select_related("report") + .only("id", "report_id", "report__body") + ) + texts = [rsv.report.body for rsv in rsvs] + vectors = client.embed_documents(texts) + for rsv, vec in zip(rsvs, vectors, strict=True): + rsv.embedding = vec + ReportSearchVector.objects.bulk_update(rsvs, 
fields=["embedding"]) + + task.status = EmbeddingTask.Status.SUCCESS + except EmbeddingClientError as exc: + logger.exception("Embedding task %s failed: %s", task_id, exc) + task.status = EmbeddingTask.Status.FAILURE + task.message = str(exc) + raise + finally: + task.ended_at = timezone.now() + task.queued_job_id = None + task.save() + task.job.update_job_state() + client.close() +``` + +- [ ] **Step 4: Run tests and verify they pass** + +Run: `uv run pytest radis/pgsearch/tests/test_process_embedding_task.py -v` +Expected: PASS (3 tests) + +- [ ] **Step 5: Commit** + +```bash +git add radis/pgsearch/tasks.py radis/pgsearch/tests/test_process_embedding_task.py +git commit -m "feat(pgsearch): add process_embedding_task on embeddings queue" +``` + +--- + +## Task 5: Implement `process_embedding_job` (orchestrator) + +**Files:** +- Modify: `radis/pgsearch/tasks.py` +- Create: `radis/pgsearch/tests/test_process_embedding_job.py` + +- [ ] **Step 1: Write the failing tests** + +Create `radis/pgsearch/tests/test_process_embedding_job.py`: + +```python +from unittest.mock import patch + +import pytest +from django.contrib.auth import get_user_model + +from radis.pgsearch.models import EmbeddingJob, EmbeddingTask, ReportSearchVector +from radis.pgsearch.tasks import process_embedding_job as _wrapped +from radis.reports.factories import ReportFactory + +User = get_user_model() +process_embedding_job = _wrapped.__wrapped__ # type: ignore[attr-defined] +pytestmark = pytest.mark.django_db + + +def _new_job() -> EmbeddingJob: + owner = User.objects.get(username="system") + return EmbeddingJob.objects.create(owner=owner, status=EmbeddingJob.Status.PREPARING) + + +def _make_pending_reports(n: int): + reports = [ReportFactory.create() for _ in range(n)] + # ReportFactory triggers the FTS post_save signal which creates ReportSearchVector + # rows with embedding=NULL; that's exactly the pending state we want. 
+ return reports + + +def test_process_embedding_job_batches_pending_reports(settings): + settings.EMBEDDING_BATCH_SIZE = 2 + job = _new_job() + reports = _make_pending_reports(5) + + with patch("radis.pgsearch.models.EmbeddingTask.delay") as delay_mock: + process_embedding_job(job.id) + + job.refresh_from_db() + assert job.status == EmbeddingJob.Status.PENDING + # ceil(5 / 2) = 3 tasks + assert job.tasks.count() == 3 + # All tasks are dispatched + assert delay_mock.call_count == 3 + # Every pending report is in exactly one task + covered = set() + for task in job.tasks.all(): + covered.update(task.reports.values_list("pk", flat=True)) + assert covered == {r.pk for r in reports} + + +def test_process_embedding_job_resume_path_only_redispatches_pending_tasks(settings): + settings.EMBEDDING_BATCH_SIZE = 2 + job = _new_job() + reports = _make_pending_reports(2) + # Simulate a previous orchestrator run that created one task already. + existing = EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.PENDING) + existing.reports.set(reports) + succeeded = EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.SUCCESS) + + with patch("radis.pgsearch.models.EmbeddingTask.delay") as delay_mock: + process_embedding_job(job.id) + + job.refresh_from_db() + assert job.status == EmbeddingJob.Status.PENDING + # No new tasks created + assert job.tasks.count() == 2 + # Only the pending one is dispatched + assert delay_mock.call_count == 1 + + +def test_process_embedding_job_with_no_pending_rows(): + job = _new_job() + # No reports exist → no ReportSearchVector rows with embedding IS NULL. 
+ + with patch("radis.pgsearch.models.EmbeddingTask.delay") as delay_mock: + process_embedding_job(job.id) + + job.refresh_from_db() + assert job.status == EmbeddingJob.Status.PENDING + assert job.tasks.count() == 0 + assert delay_mock.call_count == 0 +``` + +- [ ] **Step 2: Run tests — expect ImportError** + +Run: `uv run pytest radis/pgsearch/tests/test_process_embedding_job.py -v` +Expected: FAIL — `ImportError: cannot import name 'process_embedding_job'` + +- [ ] **Step 3: Add `process_embedding_job` to `radis/pgsearch/tasks.py`** + +Append to `radis/pgsearch/tasks.py`: + +```python +from .models import EmbeddingJob + + +def _create_embedding_task(job: EmbeddingJob, report_ids: list[int]) -> EmbeddingTask: + from radis.reports.models import Report + + task = EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.PENDING) + task.reports.set(Report.objects.filter(pk__in=report_ids)) + return task + + +@app.task +def process_embedding_job(job_id: int) -> None: + job = EmbeddingJob.objects.get(id=job_id) + assert job.status == EmbeddingJob.Status.PREPARING + + if job.tasks.exists(): + tasks_to_enqueue = job.tasks.filter(status=EmbeddingTask.Status.PENDING) + else: + pending_ids_iter = ( + ReportSearchVector.objects + .filter(embedding__isnull=True) + .values_list("report_id", flat=True) + .iterator(chunk_size=10_000) + ) + batch: list[int] = [] + for report_id in pending_ids_iter: + batch.append(int(report_id)) + if len(batch) >= django_settings.EMBEDDING_BATCH_SIZE: + _create_embedding_task(job, batch) + batch = [] + if batch: + _create_embedding_task(job, batch) + + tasks_to_enqueue = job.tasks.filter(status=EmbeddingTask.Status.PENDING) + + job.status = EmbeddingJob.Status.PENDING + job.queued_job_id = None + job.save() + + for task in tasks_to_enqueue: + if not task.is_queued: + task.delay() +``` + +- [ ] **Step 4: Run tests and verify pass** + +Run: `uv run pytest radis/pgsearch/tests/test_process_embedding_job.py -v` +Expected: PASS (3 tests) + +- 
[ ] **Step 5: Commit** + +```bash +git add radis/pgsearch/tasks.py radis/pgsearch/tests/test_process_embedding_job.py +git commit -m "feat(pgsearch): add process_embedding_job orchestrator" +``` + +--- + +## Task 6: Implement `embedding_launcher` (periodic) + +**Files:** +- Modify: `radis/pgsearch/tasks.py` +- Create: `radis/pgsearch/tests/test_embedding_launcher.py` + +- [ ] **Step 1: Write the failing tests** + +Create `radis/pgsearch/tests/test_embedding_launcher.py`: + +```python +from unittest.mock import patch + +import pytest +from django.contrib.auth import get_user_model + +from radis.pgsearch.models import EmbeddingJob +from radis.pgsearch.tasks import embedding_launcher as _wrapped +from radis.reports.factories import ReportFactory + +User = get_user_model() +embedding_launcher = _wrapped.__wrapped__ # type: ignore[attr-defined] +pytestmark = pytest.mark.django_db + + +def test_embedding_launcher_noop_when_job_in_flight(): + owner = User.objects.get(username="system") + EmbeddingJob.objects.create(owner=owner, status=EmbeddingJob.Status.PREPARING) + # Make a pending report so the second guard wouldn't short-circuit on its own. + ReportFactory.create() + + with patch("radis.pgsearch.models.EmbeddingJob.delay") as delay_mock: + embedding_launcher(context=None, timestamp=0) + + assert delay_mock.call_count == 0 + # No new job created. 
+ assert EmbeddingJob.objects.count() == 1 + + +def test_embedding_launcher_noop_when_no_pending_rows(): + with patch("radis.pgsearch.models.EmbeddingJob.delay") as delay_mock: + embedding_launcher(context=None, timestamp=0) + + assert delay_mock.call_count == 0 + assert EmbeddingJob.objects.count() == 0 + + +def test_embedding_launcher_happy_path_creates_job_and_defers( + django_capture_on_commit_callbacks, +): + ReportFactory.create() + + with patch("radis.pgsearch.models.EmbeddingJob.delay") as delay_mock: + with django_capture_on_commit_callbacks(execute=True): + embedding_launcher(context=None, timestamp=0) + + assert EmbeddingJob.objects.count() == 1 + job = EmbeddingJob.objects.get() + assert job.status == EmbeddingJob.Status.PREPARING + assert job.owner.username == "system" + delay_mock.assert_called_once() +``` + +- [ ] **Step 2: Run tests — expect ImportError** + +Run: `uv run pytest radis/pgsearch/tests/test_embedding_launcher.py -v` +Expected: FAIL — `ImportError: cannot import name 'embedding_launcher'` + +- [ ] **Step 3: Add `embedding_launcher` to `radis/pgsearch/tasks.py`** + +Append to `radis/pgsearch/tasks.py`: + +```python +from django.contrib.auth import get_user_model +from django.db import transaction + + +@app.periodic(cron=django_settings.EMBEDDING_DRAIN_CRON) +@app.task( + queue="default", + queueing_lock="embedding_launcher", + pass_context=True, +) +def embedding_launcher(context, timestamp: int) -> None: + in_flight = EmbeddingJob.objects.filter( + status__in=[ + EmbeddingJob.Status.PREPARING, + EmbeddingJob.Status.PENDING, + EmbeddingJob.Status.IN_PROGRESS, + ] + ).exists() + if in_flight: + logger.info("EmbeddingJob already in flight; launcher tick is a no-op.") + return + + has_pending = ReportSearchVector.objects.filter(embedding__isnull=True).exists() + if not has_pending: + logger.debug("No reports pending embedding; launcher tick is a no-op.") + return + + User = get_user_model() + system_user = 
User.objects.get(username=django_settings.EMBEDDING_SYSTEM_USERNAME) + job = EmbeddingJob.objects.create( + owner=system_user, + status=EmbeddingJob.Status.PREPARING, + ) + transaction.on_commit(job.delay) +``` + +- [ ] **Step 4: Run tests and verify pass** + +Run: `uv run pytest radis/pgsearch/tests/test_embedding_launcher.py -v` +Expected: PASS (3 tests) + +- [ ] **Step 5: Verify the full pgsearch test suite still passes** + +Run: `uv run pytest radis/pgsearch/ -v` +Expected: PASS for all new tests; old `test_embed_reports_task.py`, `test_backfill_command.py`, `test_signals.py` still pass since their targets aren't removed yet. + +- [ ] **Step 6: Commit** + +```bash +git add radis/pgsearch/tasks.py radis/pgsearch/tests/test_embedding_launcher.py +git commit -m "feat(pgsearch): add embedding_launcher periodic task" +``` + +--- + +## Task 7: Remove old `enqueue_report_embedding` signal + +**Files:** +- Modify: `radis/pgsearch/signals.py:19-23` +- Delete: `radis/pgsearch/tests/test_signals.py` (the file becomes empty) + +The FTS signal `create_or_update_report_search_vector` stays. The embedding signal is the only thing being removed. + +- [ ] **Step 1: Remove the embedding signal receiver** + +Replace `radis/pgsearch/signals.py` contents with: + +```python +from django.db.models.signals import post_save +from django.dispatch import receiver + +from radis.reports.models import Report + +from .models import ReportSearchVector + + +@receiver(post_save, sender=Report) +def create_or_update_report_search_vector(sender, instance, created, **kwargs): + if created: + ReportSearchVector.objects.create(report=instance) + return + instance.search_vector.save() +``` + +(Removes the `transaction` and `enqueue_embed_reports` imports along with the second receiver.) 
+ +- [ ] **Step 2: Delete the signal test file** + +Run: `rm radis/pgsearch/tests/test_signals.py` + +- [ ] **Step 3: Run the full pgsearch test suite** + +Run: `uv run pytest radis/pgsearch/ -v` +Expected: PASS for everything; `test_signals.py` no longer collected. + +- [ ] **Step 4: Commit** + +```bash +git add radis/pgsearch/signals.py +git rm radis/pgsearch/tests/test_signals.py +git commit -m "refactor(pgsearch): remove post_save embedding signal (replaced by orchestrator)" +``` + +--- + +## Task 8: Remove `embed_reports` task and `enqueue_embed_reports` helper + +**Files:** +- Modify: `radis/pgsearch/tasks.py` +- Delete: `radis/pgsearch/tests/test_embed_reports_task.py` + +At this point the only remaining importer of `embed_reports` / `enqueue_embed_reports` is the `backfill_embeddings` management command (the signal was removed in Task 7; the command itself is removed in Task 9). Removing the task here therefore breaks the command's import until Task 9 is done. Treat the two as one logical step: this task removes the task, Task 9 removes the command. Order matters — do Task 8 *and* Task 9 in immediate succession and commit them together so no commit ever contains a dangling import. + +Confirm with grep before deleting: + +- [ ] **Step 1: Confirm only the backfill command still imports the helper** + +Run: `grep -rn "enqueue_embed_reports\|embed_reports" radis/ --include="*.py" | grep -v __pycache__` +Expected: only references in `radis/pgsearch/tasks.py`, `radis/pgsearch/management/commands/backfill_embeddings.py`, and `radis/pgsearch/tests/test_embed_reports_task.py`. + +- [ ] **Step 2: Remove `embed_reports` and `enqueue_embed_reports` from `radis/pgsearch/tasks.py`** + +In `radis/pgsearch/tasks.py`, delete the function definitions for `embed_reports` (the `@app.task(queue="embeddings")` block currently at lines ~37-68) and `enqueue_embed_reports` (currently at lines ~71-84). Keep `bulk_index_reports`, `enqueue_bulk_index_reports`, and all the new orchestrator code added in Tasks 4–6.
+ +- [ ] **Step 3: Delete the old test file** + +Run: `rm radis/pgsearch/tests/test_embed_reports_task.py` + +- [ ] **Step 4: Confirm the backfill command no longer imports (expected failure)** + +Run: `uv run cli shell -c "from radis.pgsearch.management.commands import backfill_embeddings"` +Expected: `ImportError: cannot import name 'enqueue_embed_reports'` — this confirms Task 9 (deleting the command) is the immediate next step. + +- [ ] **Step 5: Do NOT commit yet — proceed straight to Task 9** + +The tree is in a broken intermediate state. Move to Task 9 before committing. + +--- + +## Task 9: Remove `backfill_embeddings` management command + +**Files:** +- Delete: `radis/pgsearch/management/commands/backfill_embeddings.py` +- Delete: `radis/pgsearch/tests/test_backfill_command.py` + +- [ ] **Step 1: Delete the command and its test** + +Run: +```bash +rm radis/pgsearch/management/commands/backfill_embeddings.py +rm radis/pgsearch/tests/test_backfill_command.py +``` + +- [ ] **Step 2: Verify no remaining references** + +Run: `grep -rn "backfill_embeddings\|enqueue_embed_reports\|embed_reports" radis/ --include="*.py" | grep -v __pycache__` +Expected: empty output. + +- [ ] **Step 3: Run the full pgsearch test suite** + +Run: `uv run pytest radis/pgsearch/ -v` +Expected: PASS for everything; the removed test files are no longer collected.
+ +- [ ] **Step 4: Commit Tasks 8 + 9 together** + +```bash +git add radis/pgsearch/tasks.py +git rm radis/pgsearch/tests/test_embed_reports_task.py +git rm radis/pgsearch/management/commands/backfill_embeddings.py +git rm radis/pgsearch/tests/test_backfill_command.py +git commit -m "refactor(pgsearch): remove embed_reports task and backfill_embeddings command" +``` + +--- + +## Task 10: Remove `EMBEDDING_BACKFILL_PRIORITY` setting + +**Files:** +- Modify: `radis/settings/base.py:360` + +- [ ] **Step 1: Confirm no remaining references** + +Run: `grep -rn "EMBEDDING_BACKFILL_PRIORITY" radis/ --include="*.py" | grep -v __pycache__` +Expected: only `radis/settings/base.py:360`. + +- [ ] **Step 2: Remove the setting line** + +In `radis/settings/base.py`, delete the line: + +```python +EMBEDDING_BACKFILL_PRIORITY = -1 +``` + +- [ ] **Step 3: Verify Django still loads** + +Run: `uv run cli shell -c "from django.conf import settings; print(settings.EMBEDDING_INDEX_PRIORITY)"` +Expected: prints `0`. + +- [ ] **Step 4: Run full test suite to confirm nothing dangles** + +Run: `uv run pytest radis/pgsearch/ -v` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add radis/settings/base.py +git commit -m "refactor(pgsearch): remove EMBEDDING_BACKFILL_PRIORITY setting" +``` + +--- + +## Task 11: Set `--concurrency 4` on the embeddings worker + +**Files:** +- Modify: `docker-compose.dev.yml:85-92` +- Modify: `docker-compose.prod.yml:80-88` + +The orchestrator is on `default`, so the `embeddings_worker` only runs `process_embedding_task`. Concurrency 4 saturates a typical embedding endpoint while leaving headroom; raise/lower per deployment. + +- [ ] **Step 1: Update `docker-compose.dev.yml`** + +Edit `docker-compose.dev.yml`. 
Change the `embeddings_worker` command from: + +```yaml + command: > + bash -c " + wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} && + ./manage.py bg_worker -l debug -q embeddings --autoreload + " +``` + +to: + +```yaml + command: > + bash -c " + wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} && + ./manage.py bg_worker -l debug -q embeddings --autoreload --concurrency 4 + " +``` + +- [ ] **Step 2: Update `docker-compose.prod.yml`** + +Edit `docker-compose.prod.yml`. Change the `embeddings_worker` command from: + +```yaml + command: > + bash -c " + wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} && + ./manage.py bg_worker -q embeddings + " +``` + +to: + +```yaml + command: > + bash -c " + wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} && + ./manage.py bg_worker -q embeddings --concurrency 4 + " +``` + +- [ ] **Step 3: Validate compose syntax** + +Run: `docker compose -f docker-compose.dev.yml config > /dev/null && docker compose -f docker-compose.prod.yml config > /dev/null` +Expected: exit 0, no output. (If Docker is not running locally, skip — this just confirms YAML is well-formed.) + +- [ ] **Step 4: Commit** + +```bash +git add docker-compose.dev.yml docker-compose.prod.yml +git commit -m "feat(infra): run embeddings_worker with --concurrency 4" +``` + +--- + +## Final verification + +- [ ] **Step 1: Run lint** + +Run: `uv run cli lint` +Expected: PASS (no new violations). + +- [ ] **Step 2: Run the full pgsearch test suite** + +Run: `uv run pytest radis/pgsearch/ -v` +Expected: PASS for every test. + +- [ ] **Step 3: Run the broader app test suite** + +Run: `uv run cli test` +Expected: PASS. (Pay attention to extractions/subscriptions/search since they share the AnalysisJob base.) 
+ +- [ ] **Step 4: Smoke-test in dev containers (manual)** + +```bash +uv run cli compose-up -- --watch +# in another terminal: +uv run cli shell +>>> from radis.reports.factories import ReportFactory +>>> ReportFactory.create_batch(5) +>>> from radis.pgsearch.tasks import embedding_launcher +>>> embedding_launcher.defer() +# watch logs: +docker compose logs -f default_worker embeddings_worker +# verify EmbeddingJob and tasks are created and reach SUCCESS: +>>> from radis.pgsearch.models import EmbeddingJob +>>> EmbeddingJob.objects.latest("created_at").status +``` + +Expected: latest job's status is `SU` (SUCCESS). + +- [ ] **Step 5: Push branch** + +Only after the above pass. + +```bash +git push -u origin feat/hybrid-search +``` + +--- + +## Spec coverage cross-check + +| Spec requirement | Task | +|---|---| +| §6.2 `embeddings_worker --concurrency 4` | Task 11 | +| §6.3 priority table (no `EMBEDDING_BACKFILL_PRIORITY`) | Task 10 | +| §6.4 `EmbeddingJob`, `EmbeddingTask` models | Task 2 | +| §6.4 owner = system user via data migration | Task 3 | +| §6.5 `embedding_launcher` with `queueing_lock` + in-flight check | Task 6 | +| §6.6 `process_embedding_job` PREPARING → PENDING flow | Task 5 | +| §6.7 `process_embedding_task` on `embeddings` queue | Task 4 | +| §6.8 No post_save signal | Task 7 | +| §6.8 No `backfill_embeddings` command | Tasks 8 + 9 | +| §8.1 `EMBEDDING_DRAIN_CRON` env var | Task 1 | +| §8.2 `EMBEDDING_SYSTEM_USERNAME` constant | Task 1 | +| §10.1 unit tests for launcher/job/task | Tasks 4, 5, 6 | From d0e4231947b7349058f256f17d46166af9f9b462 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 19:13:06 +0000 Subject: [PATCH 37/68] docs(pgsearch): split hybrid-search spec into dated original + unified rev Restores 2026-05-15-hybrid-search-design.md to its original 594-line content (post_save-driven embedding path). 
Adds 2026-05-28-hybrid-search.md as the unified, forward-looking spec that supersedes it with the EmbeddingJob/ EmbeddingTask orchestrator design. Plan now references the new spec. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../2026-05-28-embedding-orchestrator.md | 2 +- .../specs/2026-05-15-hybrid-search-design.md | 486 ++-------- .../specs/2026-05-28-hybrid-search.md | 894 ++++++++++++++++++ 3 files changed, 983 insertions(+), 399 deletions(-) create mode 100644 docs/superpowers/specs/2026-05-28-hybrid-search.md diff --git a/docs/superpowers/plans/2026-05-28-embedding-orchestrator.md b/docs/superpowers/plans/2026-05-28-embedding-orchestrator.md index 6b5339a8..0b820e3d 100644 --- a/docs/superpowers/plans/2026-05-28-embedding-orchestrator.md +++ b/docs/superpowers/plans/2026-05-28-embedding-orchestrator.md @@ -8,7 +8,7 @@ **Tech Stack:** Django 5.1, Procrastinate (periodic tasks + `queueing_lock`), pgvector, pytest-django. -**Spec:** `docs/superpowers/specs/2026-05-15-hybrid-search-design.md` §6. +**Spec:** `docs/superpowers/specs/2026-05-28-hybrid-search.md` §6. **Branch:** `feat/hybrid-search` (continue here; no worktree required). diff --git a/docs/superpowers/specs/2026-05-15-hybrid-search-design.md b/docs/superpowers/specs/2026-05-15-hybrid-search-design.md index ea402aad..77a2d59e 100644 --- a/docs/superpowers/specs/2026-05-15-hybrid-search-design.md +++ b/docs/superpowers/specs/2026-05-15-hybrid-search-design.md @@ -2,15 +2,9 @@ **Status:** Draft — design phase **Author:** RADIS team (Samuel Kwong) -**Date:** 2026-05-15 (revised 2026-05-28) +**Date:** 2026-05-15 **Implementation skill (next step):** `writing-plans` -**Revision 2026-05-28:** §6 ("Async indexing") was redesigned around a periodic -`EmbeddingJob` / `EmbeddingTask` orchestrator instead of a `post_save`-driven -per-report task. 
Affected sections: §3 (architecture diagram & file table), §4.5 -(model-change procedure), §5.4 (dev recipe), §6 (full replacement), §8 -(settings), §9 (failure modes), §10 (testing), §12 (rollout). - --- ## 1. Overview @@ -38,7 +32,7 @@ The public `SearchProvider` API (`radis.search.site`) is unchanged. Callers — - No per-query UI toggle for semantic vs. lexical. Hybrid is the new default. - No Vespa, Elasticsearch, or OpenSearch adapter. - No solution for negation/polarity (§11 documents this as known future work). -- No automated re-embedding when `EMBEDDING_DIM` changes. That is a manual operator procedure: drop column, re-migrate, defer the embedding orchestrator (see §4.5). +- No automated re-embedding when `EMBEDDING_DIM` changes. That is a manual operator procedure: drop column, re-migrate, run `backfill_embeddings`. - No on-disk vector quantization. Float32 storage from day one; revisit if RAM pressure appears. ## 3. Architecture @@ -67,43 +61,23 @@ The public `SearchProvider` API (`radis.search.site`) is unchanged. Callers — └──────────────────────────────────────────────────────────────────────┘ ┌──────────────────────────────────────────────────────────────────────┐ -│ Async indexing path (Job/Task orchestrator, periodic-driven) │ -│ │ -│ cron (settings.EMBEDDING_DRAIN_CRON, default nightly 02:00) │ -│ │ │ -│ ▼ │ -│ embedding_launcher() — `default` queue │ -│ ├─ queueing_lock="embedding_launcher" │ -│ ├─ skip if any EmbeddingJob in PREPARING/PENDING/IN_PROGRESS │ -│ ├─ skip if no rows with embedding IS NULL │ -│ └─ EmbeddingJob.objects.create(...) 
→ job.delay() │ +│ Async indexing path │ │ │ -│ process_embedding_job(job_id) — `default` queue │ -│ ├─ iterate ReportSearchVector with embedding IS NULL │ -│ ├─ chunk by EMBEDDING_BATCH_SIZE → EmbeddingTask rows │ -│ ├─ task.reports.set(chunk); task.delay() (no HTTP work) │ -│ └─ job.status = PENDING; return │ +│ Report.save() ──post_save──► enqueue_embed_reports([id]) │ +│ │ │ +│ ▼ │ +│ Procrastinate queue: "embeddings" │ +│ │ │ +│ ▼ │ +│ embeddings_worker ──► embed_reports(ids) │ +│ ├─ EmbeddingClient.embed_documents(...) │ +│ ├─ L2-normalize │ +│ └─ ReportSearchVector.objects.update() │ │ │ -│ process_embedding_task(task_id) — `embeddings` queue │ -│ ├─ EmbeddingClient.embed_documents([r.body for r in task.reports])│ -│ ├─ L2-normalize; bulk_update ReportSearchVector.embedding │ -│ ├─ task.status = SUCCESS/FAILURE; clear queued_job_id │ -│ └─ job.update_job_state() │ -│ │ -│ Operator-triggered drain: from a Django shell run │ -│ `embedding_launcher.defer()` — same code path as periodic. │ +│ ./manage.py backfill_embeddings ──► batched enqueue on same queue │ └──────────────────────────────────────────────────────────────────────┘ ``` -The bulk-upsert API path (`reports/api/viewsets.py:_bulk_upsert_reports`) -already creates `ReportSearchVector` rows with `embedding=NULL` via the FTS -indexing call in its `on_commit` block. The single-create API path goes through -the standard `Report.save()` and the FTS `post_save` signal, which likewise -creates the `ReportSearchVector` row with NULL embedding. Both ingest paths -deposit work into the same DB-resident pending pool; the orchestrator drains it -on the next periodic tick (or on an operator-triggered defer). There is no -per-API-call embedding job. - **Components added inside `radis.pgsearch`:** | File | Purpose | @@ -111,12 +85,11 @@ per-API-call embedding job. 
| `utils/embedding_client.py` | Sync + async HTTP clients with pluggable backends (`openai`, `ollama`) | | `migrations/0002_pgvector_extension.py` | `CREATE EXTENSION IF NOT EXISTS vector;` | | `migrations/0003_report_embedding.py` | Adds `embedding vector(N)` column + HNSW index | -| `migrations/0004_embedding_job_task.py` | Adds `EmbeddingJob` and `EmbeddingTask` tables + M2M to `Report` | -| `migrations/0005_system_user.py` | Data migration: creates the system user if missing | -| `models.py` (modified) | Adds `embedding` field + `HnswIndex`; defines `EmbeddingJob` and `EmbeddingTask` inheriting `AnalysisJob`/`AnalysisTask` | -| `signals.py` (unchanged from FTS-only) | The FTS `create_or_update_report_search_vector` receiver stays; **no embedding signal** | -| `tasks.py` (modified) | Adds `embedding_launcher` (periodic), `process_embedding_job` (`default` queue), `process_embedding_task` (`embeddings` queue) | +| `models.py` (modified) | Adds `embedding` field + `HnswIndex` | +| `signals.py` (modified) | Adds second `post_save` receiver to enqueue embedding | +| `tasks.py` (modified) | Adds `embed_reports` Procrastinate task on `embeddings` queue | | `providers.py` (modified) | Replaces `search()` and `retrieve()` bodies with hybrid logic | +| `management/commands/backfill_embeddings.py` | Idempotent backfill command | | `tests/...` | Coverage per §10 | **Infrastructure additions:** @@ -186,23 +159,17 @@ class ReportSearchVector(models.Model): ] ``` -`embedding` is nullable: the row exists from the moment a `Report` is created (FTS path), but its embedding is filled asynchronously by `process_embedding_task` (§6.7). A NULL embedding is treated as "not embedded yet" at query time, and the row participates via the FTS half only. +`embedding` is nullable: the row exists from the moment a `Report` is created (FTS path), but its embedding is filled asynchronously by `embed_reports`. 
A NULL embedding is treated as "not embedded yet" at query time, and the row participates via the FTS half only. -`save()` on `ReportSearchVector` retains its current behavior of recomputing `search_vector` from `report.body`. The embedding column is written **only** by `process_embedding_task` via `bulk_update()`, never by `save()`, to avoid triggering the FTS signal recursively and to keep the two indexing paths independent. +`save()` on `ReportSearchVector` retains its current behavior of recomputing `search_vector` from `report.body`. The embedding column is written **only** by the embedding task via `update()`, never by `save()`, to avoid triggering the FTS signal recursively and to keep the two indexing paths independent. ### 4.5 Operational note on `EMBEDDING_DIM` -pgvector columns and HNSW indexes are bound to a fixed dimension at create time, and HNSW has a 2000-dim ceiling (so `EMBEDDING_DIM ≤ 2000`; Qwen3-Embedding-4B's native 2560 is Matryoshka-truncated client-side). A Django system check (`pgsearch.E001`) compares `settings.EMBEDDING_DIM` against the literal in migration 0003 and fails `manage.py check` on mismatch. Changing `EMBEDDING_DIM` after deploy requires a manual operator procedure: +pgvector columns and HNSW indexes are bound to a fixed dimension at create time. Changing `EMBEDDING_DIM` after deploy requires a manual operator procedure: 1. Drop the HNSW index and the `embedding` column. 2. Re-run `0003_report_embedding` with the new `EMBEDDING_DIM`. -3. From a Django shell, defer the embedding orchestrator immediately so the - next nightly tick is not waited for: - - ```python - from radis.pgsearch.tasks import embedding_launcher - embedding_launcher.defer() - ``` +3. Run `./manage.py backfill_embeddings`. This is documented as a deployment-time decision and intentionally not automated. 
@@ -217,7 +184,7 @@ This is documented as a deployment-time decision and intentionally not automated - `class OllamaBackend(EmbeddingBackend)` — default path `/api/embed`, body `{model, input: [...]}`, response `{embeddings: [[...]]}`. - `BACKENDS: dict[str, EmbeddingBackend] = {"openai": OpenAIBackend(), "ollama": OllamaBackend()}`. - `class EmbeddingClientError(Exception)`. -- `class EmbeddingClient` — sync client used by `process_embedding_task` and the query path. +- `class EmbeddingClient` — sync client used by `embed_reports` task and the query path. - `class AsyncEmbeddingClient` — async variant, kept for parity with `chats/utils/chat_client.py` and so the query path can call it from ASGI views without `async_to_sync` later. ### 5.2 Interface @@ -256,7 +223,7 @@ class EmbeddingClient: - **Truncation:** any text longer than `EMBEDDING_MAX_INPUT_CHARS` is truncated at the character limit before being sent. A WARNING is logged with the report id (when known) and char count. Qwen3-Embedding-4B supports up to 32k tokens, so truncation will be rare for radiology bodies but is bounded as a defense against pathological inputs. - **Normalization:** every returned vector is L2-normalized client-side, unconditionally. With unit vectors, cosine distance is monotonic in dot product, which makes the HNSW `vector_cosine_ops` operator effectively a fast inner-product search. Whether the upstream server normalizes is irrelevant. - **Dimension validation:** every vector is checked to have length `EMBEDDING_DIM`. A mismatch raises `EmbeddingClientError`. -- **Batching:** `embed_documents` sends a single HTTP call per invocation. The higher-level orchestrator (`process_embedding_job`) groups reports into `EmbeddingTask` batches of `EMBEDDING_BATCH_SIZE` before dispatching them to `process_embedding_task`. +- **Batching:** `embed_documents` sends a single HTTP call per invocation. 
Higher-level callers (`embed_reports` task) split into batches of `EMBEDDING_BATCH_SIZE` before calling. - **Errors:** non-2xx, timeout, malformed JSON, missing key, or wrong dim all raise `EmbeddingClientError`. The client never falls back internally — fallback policy is owned by the caller. - **Dev recipe (Ollama):** ```bash @@ -267,71 +234,24 @@ class EmbeddingClient: EMBEDDING_MODEL_NAME=dengcao/Qwen3-Embedding-4B:Q5_K_M EMBEDDING_DIM=2560 ``` - GGUF-quantized embedding models produce slightly different vectors than the bf16 reference, so dev embeddings are not interchangeable with prod embeddings. After swapping the model between dev/prod, defer the embedding orchestrator from a Django shell (see §4.5). - -## 6. Async indexing (Job/Task orchestrator) - -The embedding lifecycle uses the same Job/Task pattern as `ExtractionJob` / -`ExtractionTask` (`radis/extractions/tasks.py:32`) and `SubscriptionJob` / -`SubscriptionTask` (`radis/subscriptions/tasks.py:33`). A periodic launcher -creates one `EmbeddingJob` per drain run; the orchestrator splits pending -reports into `EmbeddingTask` batches; each task is processed on the -`embeddings` queue. - -### 6.1 Why a Job/Task orchestrator instead of a per-save signal - -Two ingest paths exercise RADIS: - -- **Single-create** (`POST /api/reports/`) routes through `Report.objects.create` - in the serializer (`radis/reports/api/serializers.py:87`). A `post_save` - signal here would fire once per API call. -- **Bulk-upsert** (`POST /api/reports/bulk-upsert`) routes through - `Report.objects.bulk_create` / `bulk_update` - (`radis/reports/api/viewsets.py:_bulk_upsert_reports`). `post_save` **does - not fire** on `bulk_create`. The FTS path rescues itself by explicitly - calling `enqueue_bulk_index_reports(touched_ids)` in `on_commit`; an - embedding signal would still be skipped on this path. 
- -A per-save signal would therefore either (a) skip the bulk path entirely, or -(b) produce one Procrastinate job per single-create API call — each opening an -HTTP connection to the embedding service with batch=1. With ETL pipelines that -may post one report at a time, this fragments the workload to one job per API -call and wastes the embedding endpoint's batch capacity. - -The orchestrator design accepts a longer freshness window (hours / next-cycle) -in exchange for batched, throughput-friendly embedding runs that serve all -three operational scenarios with one mechanism: - -| Scenario | What happens | -|---|---| -| **Initial bulk upload** (millions of reports via `/bulk-upsert`) | `ReportSearchVector` rows created with `embedding=NULL`. Operator defers the launcher immediately or waits for the next cron tick. One `EmbeddingJob` produces N `EmbeddingTask` batches. | -| **Daily ad-hoc upload** | Reports land NULL via either ingest path. Next periodic tick consolidates the day's pending pool into a single `EmbeddingJob`. | -| **Model-change backfill** | Operator follows §4.5 (drop column, re-migrate), then defers the launcher from a shell. Same code path as the periodic. | + GGUF-quantized embedding models produce slightly different vectors than the bf16 reference, so dev embeddings are not interchangeable with prod embeddings. After swapping the model between dev/prod, run `backfill_embeddings`. + +## 6. Async indexing -### 6.2 Queue and worker +### 6.1 Queue and worker -The `embeddings` Procrastinate queue is served by the `embeddings_worker` -container. The orchestrator (`process_embedding_job`) runs on the `default` -queue alongside `process_extraction_job` and `process_subscription_job`; the -sub-tasks (`process_embedding_task`) run on `embeddings`. +A new Procrastinate queue named **`embeddings`** is added, served by a new container **`embeddings_worker`**. This isolates embedding load from the existing `default` and `llm` queues. 
The `embeddings` worker's command: ``` -./manage.py bg_worker -l debug -q embeddings --autoreload --concurrency 4 # dev -./manage.py bg_worker -l info -q embeddings --concurrency 4 # prod +./manage.py bg_worker -l debug -q embeddings --autoreload # dev +./manage.py bg_worker -l info -q embeddings # prod ``` -`embeddings_worker` concurrency tunes parallelism against the embedding -endpoint. Recommended 4; raise if the endpoint has spare throughput, lower if -it rate-limits. The orchestrator does not run on this queue, so there is no -self-deadlock condition tied to concurrency on the `embeddings` queue. +The worker inherits the same image and environment as `default_worker` / `llm_worker` via the existing `&default-app` anchor. -### 6.3 Priorities +### 6.2 Priorities -Procrastinate priority is "higher = sooner". Embedding work runs at lower -priority than extraction and subscription so it never starves user-driven LLM -operations. The orchestrator (`default` queue) and sub-tasks (`embeddings` -queue) share `EMBEDDING_INDEX_PRIORITY`; there is no separate backfill -priority because the backfill path is the same orchestrator. +Procrastinate priority is "higher = sooner". Embedding tasks always run at lower priority than the existing LLM tasks so a backfill never starves extraction/subscription work — though in practice this only matters *within* a queue, and `embeddings` is a separate queue from `llm`. The priorities are still set defensively in case workers are ever consolidated: | Task | Priority | |---|---| @@ -340,235 +260,60 @@ priority because the backfill path is the same orchestrator. 
| `SUBSCRIPTION_DEFAULT_PRIORITY` (existing) | 3 | | `SUBSCRIPTION_URGENT_PRIORITY` (existing) | 4 | | `EMBEDDING_INDEX_PRIORITY` (new) | 0 | +| `EMBEDDING_BACKFILL_PRIORITY` (new) | -1 | -### 6.4 Models - -`radis/pgsearch/models.py` defines two new models inheriting `AnalysisJob` and -`AnalysisTask` (`radis/core/models.py:17,220`): - -```python -from radis.core.models import AnalysisJob, AnalysisTask - - -class EmbeddingJob(AnalysisJob): - default_priority = settings.EMBEDDING_INDEX_PRIORITY - urgent_priority = settings.EMBEDDING_INDEX_PRIORITY # no urgent variant - - def delay(self) -> None: - queued_job_id = app.configure_task( - "radis.pgsearch.tasks.process_embedding_job", - allow_unknown=False, - priority=self.default_priority, - ).defer(job_id=self.pk) - self.queued_job_id = queued_job_id - self.save() - - -class EmbeddingTask(AnalysisTask): - job = models.ForeignKey(EmbeddingJob, on_delete=models.CASCADE, related_name="tasks") - reports = models.ManyToManyField(Report, related_name="embedding_tasks") - - def delay(self) -> None: - queued_job_id = app.configure_task( - "radis.pgsearch.tasks.process_embedding_task", - allow_unknown=False, - priority=settings.EMBEDDING_INDEX_PRIORITY, - ).defer(task_id=self.pk) - self.queued_job_id = queued_job_id - self.save() -``` - -**Owner field.** `AnalysisJob.owner` is non-nullable (`settings.AUTH_USER_MODEL`). -Embedding jobs are system-driven and have no human creator. A data migration -(`0005_system_user.py`) creates a `User(username=settings.EMBEDDING_SYSTEM_USERNAME, -is_active=False, password=unusable)` idempotently; the launcher assigns this -user as `owner` on every `EmbeddingJob`. This avoids subclass-level overrides -of `owner` and keeps the abstract contract clean. - -**No `get_absolute_url` in v1.** Existing `ExtractionJob` and `SubscriptionJob` -implement `get_absolute_url` because they have user-facing detail views. 
-`EmbeddingJob` has no user-facing UI in v1 — operators inspect it via Django -admin (default `ModelAdmin` registration is sufficient). The inherited abstract -`AnalysisJob.get_absolute_url` body is `...`, returning `None`; no call site in -radis treats an `EmbeddingJob` like a user-facing analysis job. A future spec -can add the view and override the method. +Backfill below incremental ensures fresh-report embeddings always overtake a backfill job in flight. -`urgent`, `send_finished_mail`, and `finished_mail_template` stay at their -`AnalysisJob` defaults (`False`, `False`, `None`). - -### 6.5 Launcher (the periodic task) +### 6.3 Task: `embed_reports` `radis/pgsearch/tasks.py`: ```python -@app.periodic(cron=settings.EMBEDDING_DRAIN_CRON) -@app.task( - queue="default", - queueing_lock="embedding_launcher", - pass_context=True, -) -def embedding_launcher(context, timestamp: int) -> None: - in_flight = EmbeddingJob.objects.filter( - status__in=[ - EmbeddingJob.Status.PREPARING, - EmbeddingJob.Status.PENDING, - EmbeddingJob.Status.IN_PROGRESS, - ] - ).exists() - if in_flight: - logger.info("EmbeddingJob already in flight; launcher tick is a no-op.") - return - - has_pending = ReportSearchVector.objects.filter(embedding__isnull=True).exists() - if not has_pending: - logger.debug("No reports pending embedding; launcher tick is a no-op.") - return - - system_user = User.objects.get(username=settings.EMBEDDING_SYSTEM_USERNAME) - job = EmbeddingJob.objects.create( - owner=system_user, - status=EmbeddingJob.Status.PREPARING, - ) - transaction.on_commit(job.delay) +@app.task(queue="embeddings") +def embed_reports(report_ids: list[int]) -> None: + """Embed the given reports and write the vector to ReportSearchVector.embedding. + Idempotent. 
Always overwrites the existing embedding for the given ids.""" ``` -**Two reinforcing layers of duplicate-dispatch prevention:** - -- **Procrastinate `queueing_lock="embedding_launcher"`.** While a launcher job - is in the queue (`todo`) or executing (`doing`), the next cron tick's - `defer` call silently fails with `AlreadyEnqueued`. The launcher itself is - fast (one existence check + maybe one INSERT), so the lock is normally - released within milliseconds. -- **In-flight EmbeddingJob check.** Even if the queueing lock leaks (worker - crash mid-flight, manual `defer` from a shell, dashboard re-trigger), the - launcher's first action is to look for any `EmbeddingJob` in a non-terminal - status. If one exists, the launcher returns without creating another. This - is the same dedup pattern used by `process_extraction_job` when re-entered - (`extractions/tasks.py:46`). +Implementation outline: -### 6.6 Orchestrator (`process_embedding_job`) +1. `target = ReportSearchVector.objects.filter(report_id__in=ids).select_related("report").only("report_id", "report__body")`. No `embedding__isnull` short-circuit at this layer — the task always re-embeds whatever it is given. Backfill controls the "only fill in nulls" policy by filtering at enqueue time (§6.5). +2. Iterate in chunks of `EMBEDDING_BATCH_SIZE`; for each chunk, call `EmbeddingClient().embed_documents([rsv.report.body for rsv in chunk])`. +3. `ReportSearchVector.objects.filter(pk=rsv.pk).update(embedding=vec)` per row. (Postgres `UPDATE … SET embedding = CASE pk WHEN … END` is a possible optimization if profiling shows the per-row update is a bottleneck; not done in v1.) +4. Any `EmbeddingClientError` is re-raised so Procrastinate's default retry policy with exponential backoff handles transient failures.
-```python -@app.task -def process_embedding_job(job_id: int) -> None: - job = EmbeddingJob.objects.get(id=job_id) - assert job.status == EmbeddingJob.Status.PREPARING - - # Retry/resume path: tasks already exist, re-enqueue still-pending ones. - if job.tasks.exists(): - tasks_to_enqueue = job.tasks.filter(status=EmbeddingTask.Status.PENDING) - else: - pending_ids_iter = ( - ReportSearchVector.objects - .filter(embedding__isnull=True) - .values_list("report_id", flat=True) - .iterator(chunk_size=10_000) - ) - batch: list[int] = [] - for report_id in pending_ids_iter: - batch.append(int(report_id)) - if len(batch) >= settings.EMBEDDING_BATCH_SIZE: - _create_embedding_task(job, batch) - batch = [] - if batch: - _create_embedding_task(job, batch) - - tasks_to_enqueue = job.tasks.filter(status=EmbeddingTask.Status.PENDING) - - job.status = EmbeddingJob.Status.PENDING - job.queued_job_id = None - job.save() - - for task in tasks_to_enqueue: - if not task.is_queued: - task.delay() - - -def _create_embedding_task(job: EmbeddingJob, report_ids: list[int]) -> EmbeddingTask: - task = EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.PENDING) - task.reports.set(Report.objects.filter(pk__in=report_ids)) - return task -``` - -Mirrors `process_extraction_job` (`extractions/tasks.py:32`). State transitions -follow the standard pattern: +Helper `enqueue_embed_reports(report_ids, priority=settings.EMBEDDING_INDEX_PRIORITY)` mirrors the existing `enqueue_bulk_index_reports`. -- `PREPARING` while tasks are being created (sub-tasks must not be dispatched yet). -- `PENDING` after task creation completes; sub-tasks are then enqueued. -- `IN_PROGRESS` / `SUCCESS` / `WARNING` / `FAILURE` driven by `update_job_state` - (inherited from `AnalysisJob`) called from each sub-task on completion. +**V1 re-embedding policy:** the signal enqueues on every `Report.save()`, including metadata-only updates, so metadata edits trigger a wasted re-embed. 
Accepted simplicity for v1; §11.4 documents body-change detection as a future optimization. -The orchestrator does no HTTP work. For 1M pending reports at -`EMBEDDING_BATCH_SIZE=32`, it creates ~31,250 `EmbeddingTask` rows and defers -them — well under a minute on the `default` worker. Its slot is freed -immediately after; long-running embedding work happens on the `embeddings` -worker. +### 6.4 Signal -### 6.7 Sub-task (`process_embedding_task`) +`radis/pgsearch/signals.py` keeps the existing receiver for the FTS path and adds: ```python -@app.task(queue="embeddings") -def process_embedding_task(task_id: int) -> None: - task = EmbeddingTask.objects.get(id=task_id) - task.status = EmbeddingTask.Status.IN_PROGRESS - task.started_at = timezone.now() - task.attempts = task.attempts + 1 - task.save() - - client = EmbeddingClient() - try: - report_ids = list(task.reports.values_list("pk", flat=True)) - rsvs = list( - ReportSearchVector.objects - .filter(report_id__in=report_ids) - .select_related("report") - .only("id", "report_id", "report__body") - ) - texts = [rsv.report.body for rsv in rsvs] - vectors = client.embed_documents(texts) - for rsv, vec in zip(rsvs, vectors, strict=True): - rsv.embedding = vec - ReportSearchVector.objects.bulk_update(rsvs, fields=["embedding"]) - - task.status = EmbeddingTask.Status.SUCCESS - except EmbeddingClientError as exc: - logger.exception("Embedding task %s failed: %s", task_id, exc) - task.status = EmbeddingTask.Status.FAILURE - task.message = str(exc) - raise # Procrastinate retry policy applies - finally: - task.ended_at = timezone.now() - task.queued_job_id = None - task.save() - task.job.update_job_state() - client.close() +@receiver(post_save, sender=Report) +def enqueue_report_embedding(sender, instance, **kwargs): + enqueue_embed_reports([instance.pk], priority=settings.EMBEDDING_INDEX_PRIORITY) ``` -Raising on `EmbeddingClientError` -lets Procrastinate's retry policy apply. 
After retries exhaust, the exception -propagates, the task ends as `FAILURE`, and `update_job_state` is still called -from the `finally` block. The job finishes with status `WARNING` (some tasks -failed, some succeeded) or `FAILURE` (all failed). The next launcher tick will -create a fresh job that picks up any rows still NULL. - -### 6.8 No `post_save` signal for embeddings, no `backfill_embeddings` command +Two separate receivers (not one combined) so an enqueue error in the embedding path cannot break the FTS-indexing path. The signal fires on both create and update; `embed_reports` always overwrites the embedding for the given ids, so metadata-only updates do trigger an unnecessary re-embed in v1. Body-change detection (a `pre_save` that suppresses enqueue when only metadata changed) is an optimization deferred to §11.4. `ReportSearchVector.save()` is *not* modified to null `embedding` — the task's unconditional overwrite makes that redundant. -The FTS `post_save` receiver (`create_or_update_report_search_vector`) stays — -it is what creates the `ReportSearchVector` row in the first place. There is -**no** corresponding embedding receiver: every ingest path eventually deposits -`embedding=NULL` rows into the DB, and the orchestrator drains them. +### 6.5 Backfill command -There is no `backfill_embeddings` management command. Operators trigger an -immediate drain from a Django shell: +`radis/pgsearch/management/commands/backfill_embeddings.py`: -```python -from radis.pgsearch.tasks import embedding_launcher -embedding_launcher.defer() ``` +./manage.py backfill_embeddings [--batch-size 500] [--limit N] [--dry-run] +``` + +Behavior: -This goes through the same launcher → orchestrator → sub-task path as the -periodic; the only difference is who fires it. One code path, one set of -tests, one observable lifecycle. +- Iterates `ReportSearchVector.objects.filter(embedding__isnull=True).values_list("report_id", flat=True)`. 
+- Chunks ids by `--batch-size` (default 500). +- For each chunk, calls `enqueue_embed_reports(chunk, priority=settings.EMBEDDING_BACKFILL_PRIORITY)`. +- `--limit N` caps total reports enqueued. +- `--dry-run` skips enqueue and prints the would-be count. +- The "only fill in nulls" filter is applied at enqueue time (here), not inside the task. Re-running the command is safe because rows that got embedded since the last run no longer match the `embedding__isnull=True` filter and won't be re-enqueued. ## 7. Hybrid search provider @@ -699,10 +444,9 @@ EMBEDDING_PROVIDER_PATH = env.str("EMBEDDING_PROVIDER_PATH", default="") # EMBEDDING_PROVIDER_API_KEY = env.str("EMBEDDING_PROVIDER_API_KEY", default="") EMBEDDING_MODEL_NAME = env.str("EMBEDDING_MODEL_NAME", default="Qwen/Qwen3-Embedding-4B") EMBEDDING_DIM = env.int("EMBEDDING_DIM", default=1024) -EMBEDDING_DRAIN_CRON = env.str("EMBEDDING_DRAIN_CRON", default="0 2 * * *") ``` -These vary across dev/staging/prod and are operator-controlled. `EMBEDDING_DIM` is intentionally an env decision because it is schema-coupled (see §4.5). `EMBEDDING_DRAIN_CRON` is env-tunable so dev environments can drain more frequently (e.g., `*/15 * * * *`) without a code change. +These vary across dev/staging/prod and are operator-controlled. `EMBEDDING_DIM` is intentionally an env decision because it is schema-coupled (see §4.5). ### 8.2 Code constants (tuning knobs, in `base.py`) @@ -716,18 +460,18 @@ EMBEDDING_QUERY_INSTRUCTION = ( EMBEDDING_BATCH_SIZE = 32 EMBEDDING_INDEX_PRIORITY = 0 -EMBEDDING_SYSTEM_USERNAME = "system" +EMBEDDING_BACKFILL_PRIORITY = -1 HYBRID_VECTOR_TOP_K = 100 HYBRID_FTS_MAX_RESULTS = 10_000 HYBRID_RRF_K = 60 ``` -These are tuning constants. Changing them is a code change with a PR diff. This matches the project's existing pattern (`EXTRACTION_LLM_CONCURRENCY_LIMIT = 6`, the `CHAT_*_SYSTEM_PROMPT` blocks). 
`EMBEDDING_SYSTEM_USERNAME` names the system user that owns every auto-generated `EmbeddingJob`; the data migration creates this user idempotently. +These are tuning constants. Changing them is a code change with a PR diff. This matches the project's existing pattern (`EXTRACTION_LLM_CONCURRENCY_LIMIT = 6`, the `CHAT_*_SYSTEM_PROMPT` blocks). ### 8.3 `example.env` -Adds a documented Ollama block and a Qwen/OpenAI-compatible block side by side, keyed off `EMBEDDING_BACKEND`. Documents `EMBEDDING_DRAIN_CRON` with the production default (`0 2 * * *`) and a dev-friendly alternative (`*/15 * * * *`). +Adds a documented Ollama block and a Qwen/OpenAI-compatible block side by side, keyed off `EMBEDDING_BACKEND`. ### 8.4 Compose @@ -751,16 +495,11 @@ Adds a documented Ollama block and a Qwen/OpenAI-compatible block side by side, | Embedding service returns 5xx/timeout during query-time | `query_vec = None`; result list ordered by FTS-only; request succeeds | WARNING with request id | | Embedding service returns 4xx during query-time | Same FTS-only fallback (treats as misconfig at request layer) | ERROR | | Embedding service returns malformed body | `EmbeddingClientError` raised; query falls back to FTS-only | ERROR | -| Embedding service down during a sub-task | `process_embedding_task` raises; Procrastinate retries with exponential backoff; `embedding` stays NULL | WARNING per attempt, ERROR after final retry | -| Launcher fires while EmbeddingJob is `PREPARING`/`PENDING`/`IN_PROGRESS` | Status check returns immediately; tick is a no-op | INFO | -| Orchestrator crashes during task creation (partial dispatch) | Job stays in `PREPARING`. Next launcher tick sees in-flight job and no-ops. Operator marks job `FAILURE` in admin to allow a fresh run | ERROR + operator action | -| Sub-task fails after Procrastinate retries exhausted | Task ends as `FAILURE`. `update_job_state` rolls the job to `WARNING` (some tasks succeeded) or `FAILURE` (all failed). 
NULL rows remain; next launcher creates a new job to retry them | ERROR | -| `embeddings_worker` saturation | Sub-tasks queue up; orchestrator already returned. No deadlock; just slower drain | DEBUG | +| Embedding service down during indexing task | Task raises; Procrastinate retries with exponential backoff; `embedding` stays NULL | WARNING per attempt, ERROR after final retry | | Report body > `EMBEDDING_MAX_INPUT_CHARS` | Truncate, embed truncated text | WARNING with report_id and char count | -| Report deleted between task creation and execution | Sub-task's `task.reports.values_list(...)` returns fewer rows; `embed_documents` called on smaller list; no error | DEBUG | -| Vector dim mismatch on write | Postgres raises; sub-task fails, retried | ERROR — escalate to admin | +| Report deleted between enqueue and task run | Task fetches no rows for that id; no error | DEBUG | +| Vector dim mismatch on write | Postgres raises; task fails, retried | ERROR — escalate to admin | | `EMBEDDING_PROVIDER_URL` empty at startup | `EmbeddingClient` construction defers to call site; calls log + raise; query falls back to FTS-only | WARNING once on first request | -| System user missing (data migration didn't run) | Launcher raises `User.DoesNotExist`. Loud failure; deployment misconfiguration. Fix: run migrations | ERROR | **Deliberate non-policies:** @@ -771,9 +510,7 @@ Adds a documented Ollama block and a Qwen/OpenAI-compatible block side by side, **Observability:** - Provider logs at DEBUG: vec hit count, FTS hit count, intersection count, fusion ms, query-embed ms. -- `process_embedding_task` logs at INFO: batch size, total chars, latency, success/retry counts. -- `embedding_launcher` and `process_embedding_job` log status transitions and dispatch counts at INFO. -- Operators inspect job/task state via Django admin (`EmbeddingJob`, `EmbeddingTask` use the default `ModelAdmin`). +- `embed_reports` logs at INFO: batch size, total chars, latency, success/skip/retry counts. 
- The existing OpenTelemetry overlay (commit `653e0c67`) tags telemetry per service; `embeddings_worker` shows up automatically. ## 10. Testing strategy @@ -784,9 +521,9 @@ Adds a documented Ollama block and a Qwen/OpenAI-compatible block side by side, |---|---| | `tests/unit/test_embedding_client.py` | Backend payload/response round-trip, path override, instruction prefix, normalization, dim validation, all error modes, truncation | | `tests/unit/test_provider_fusion.py` | `_rrf_fuse(vec_rank, fts_rank, k)` pure-Python helper: disjoint, overlapping, FTS-only, vector-only, both-empty, tiebreak by report_id | -| `tests/unit/test_embedding_launcher.py` | No-op when EmbeddingJob already in flight; no-op when no rows pending; happy path creates job and calls `delay`; raises if system user missing | -| `tests/unit/test_process_embedding_job.py` | Batches pending reports into `EmbeddingTask` rows of size `EMBEDDING_BATCH_SIZE`; status transitions `PREPARING` → `PENDING`; retry/resume path re-enqueues only `PENDING` tasks; empty pool exits cleanly | -| `tests/unit/test_process_embedding_task.py` | Embeds reports, writes vectors, sets status `SUCCESS`; status `FAILURE` and re-raise on `EmbeddingClientError`; calls `job.update_job_state` in both paths; clears `queued_job_id` | +| `tests/unit/test_signals.py` | `post_save` enqueues `embed_reports([id])` with `EMBEDDING_INDEX_PRIORITY` | +| `tests/unit/test_tasks.py` (extends existing) | Always overwrites embedding when re-run (no internal short-circuit); batch splitting; missing ids are skipped without error; client errors propagate so Procrastinate retries | +| `tests/unit/test_backfill_command.py` | Batching, `--limit`, `--dry-run`, only-null-embedding selection | ### 10.2 Integration tests (real Postgres + pgvector) @@ -835,70 +572,23 @@ See §4.5. ### 11.3 GGUF dev embeddings ≠ bf16 prod embeddings -Documented in §5.4. Mitigated by deferring `embedding_launcher` after a model swap (see §4.5). 
The next drain re-embeds everything. - -### 11.4 No body-change detection for re-embedding - -V1 re-embeds anything where `embedding IS NULL`. A future optimization could -track whether the body actually changed (e.g., a `body_hash` column on -`ReportSearchVector` updated only on body changes) so metadata-only updates -don't have to null the embedding. Not in v1; profiling will tell us whether it -matters. - -### 11.5 Operator-aware queries: FTS / vector asymmetry - -Both halves of hybrid search receive a derivation of the same parsed `QueryNode`, but interpret it through completely different machinery. The FTS side consumes a `tsquery` built by `_build_query_string` where `AND`, `OR`, `NOT`, quoted phrases, and parens are first-class boolean operators (`&`, `|`, `!`, `<->`, `()`). The vector side consumes the canonical unparsed string and feeds it whole to the embedding model as natural language; the operators become ordinary word tokens that the model has no operator-aware machinery to interpret. +Documented in §5.4. Mitigated by running `backfill_embeddings` after a model swap. -Practical consequences: +### 11.4 No body-change detection in the signal -- **Natural-phrase queries** (`pneumothorax`, `chest x-ray`, implicit-AND `cardiac arrest`) — both halves point the same direction. RRF amplifies the agreement. This is the workload hybrid search is best at. -- **`A AND B`** — FTS strictly intersects; vector returns docs about a topic-mix of A and B (which usually includes some single-side hits). Docs matching both lexically *and* semantically rank highest, which is the desired outcome. Vector contributes useful expansion but not boolean precision. -- **`A OR B`** — FTS unions; the vector half has no concept of disjunction and just produces a centroid-style embedding. Docs about either A or B that happen to be near the centroid still get retrieved, but a doc purely about A may not appear unless it's also close to the centroid. 
Vector half degrades from "asset" to "noise". -- **`NOT X`** — sharpest conflict. FTS correctly returns docs without X. Dense embeddings are polarity-blind, so the vector for `"NOT X"` clusters next to the vector for `"X"` and the top-K nearest neighbours are docs *about* X — the polar opposite of what the user asked for. The two halves return nearly disjoint sets that RRF interleaves, producing actively misleading results rather than mere noise. (Distinct from §11.1, which is about natural-language negation like `no pneumothorax` where the FTS stop-word strip happens to align the halves accidentally.) - -**Candidate mitigation (not in v1, recommended follow-up):** strip negated branches from the query string before embedding. Walk the AST; when a `UnaryNode("NOT", X)` is encountered, drop `X` from the string passed to the embedding model. The FTS side still gets the full structure. Outcomes: - -- `NOT X` alone → vector receives an empty query and is skipped; provider falls back to FTS-only ranking. Correct. -- `A AND NOT B` → vector embeds just `A`; FTS enforces `A & !B`. Vector adds positive semantic signal for A, FTS enforces the exclusion. The halves are aligned again. - -This is ~15 lines of code in `providers.search()` / `providers.retrieve()` and a small extension to `QueryParser` for the AST walk. Other candidates (negation-aware re-ranker, embedding subtraction, sparse models like SPLADE-NEG) are heavier and listed in §11.1. - -**Why a re-ranker alone cannot fix this.** A cross-encoder re-ranker improves precision *within the candidate pool it is given* — it cannot improve recall of that pool. For `NOT pneumothorax` over a 1000-doc corpus where 600 docs don't mention the word, the hybrid candidate pool is poisoned: ~100 wrong docs (pneumothorax-discussing reports pulled in by the polarity-blind vector half) displace 100 of the 600 correct docs from the top-N positions. 
After re-ranking top-20, the head of results is sharper, but ~590 correct docs still live below the re-ranker's cutoff at their original RRF positions, interleaved with the remaining 90 wrong docs. The architecturally correct order is to fix recall upstream (strip negated branches before embedding, restoring a clean candidate pool) and *then* layer a re-ranker for precision. A re-ranker without the upstream fix is rearranging deck chairs on a polluted pool. - -### 11.6 Cross-encoder re-ranker (deferred) - -A planned follow-up adds a re-ranker stage between hybrid fusion and result hydration to lift precision (especially on operator-light natural-phrase queries, where the candidate pool is already correct but RRF ordering is mediocre) and to partially compensate for §11.1's polarity blindness. Two backend patterns are under consideration: - -- **Pointwise cross-encoder via vLLM.** Qwen3-Reranker-4B served with `vllm serve … --task score` exposes `/v1/rerank` (Cohere/TEI shape: `{model, query, documents}` → `[{index, relevance_score}]`). Logit-based scoring (yes/no token logits → softmax) gives graded relevance in [0,1]. Latency ~30–100 ms per pair on a single GPU; for top-20 candidates that's ~0.5–1.5 s added. -- **Listwise LLM re-ranker** via the existing OpenAI-compatible chat-completions endpoint. The LLM is prompted with the query and the top-N candidates packed into a single message; structured output (`response_format=json_object`) returns a ranked list of indices. One HTTP call per query rather than N. Latency ~1–3 s for top-20 depending on model size. Quality trades off graded precision for the LLM's strong instruction-following — particularly the explicit "respect negation" cue, which the pointwise reranker has to learn implicitly. - -vLLM is the recommended production host for the pointwise path because Ollama (as of mid-2025) does not expose token logits cleanly, which collapses Qwen3-Reranker to a binary 1.0/0.0 signal and loses graded ordering. 
Ollama can still serve the LLM listwise backend without issue. - -### 11.7 Evaluation strategy for the layered hybrid stack - -Six profiles cover the additive layers: - -| Profile | Negation strip (§11.5) | Re-ranker (§11.6) | -|---|---|---| -| `baseline` | off | off | -| `strip` | on | off | -| `rerank-qwen` | off | Qwen3-Reranker via vLLM | -| `rerank-llm` | off | listwise LLM | -| `both-qwen` | on | Qwen3-Reranker via vLLM | -| `both-llm` | on | listwise LLM | +V1 re-embeds on every `Report.save()`. If profiling shows wasted traffic from metadata-only updates, add a `pre_save` that only nulls `embedding` when `body` changed. -A `run_search_eval` management command loops a set of test queries through all six profiles (toggling settings via `override_settings`) and dumps comparable JSON output with top-N docs, per-layer scores (`ts_rank`, `cosine_distance`, `rrf_score`, `rerank_score`), and per-profile latencies. +### 11.5 Per-row `UPDATE` in the embedding task -**Labeling.** Per-pair LLM relevance judgment ("is doc D relevant to query Q?") is unreliable for radiology because (a) it inherits the same polarity blind spot the system is trying to evaluate, and (b) it introduces circular bias when the labeling LLM and re-ranker LLM share a family. The preferred approach is *concept-based polarity-aware labeling*: label each report once per clinical concept with `PRESENT` / `ABSENT` / `NOT_MENTIONED`, then derive query relevance deterministically (`pneumothorax` → `PRESENT ∪ ABSENT`; `NOT pneumothorax` → `NOT_MENTIONED ∪ ABSENT` for strict exclusion, or `ABSENT` only for "rule-out" semantics). The concept labels are reusable across many queries and survive prompt/model changes. The upstream label-filter work in PR #196 produces structured labels with comparable semantics and is the intended source of ground truth for production-scale evaluation. +V1 issues one `UPDATE` per row inside a batch. 
If this becomes a bottleneck, switch to a single `UPDATE … FROM (VALUES …)` or Django's `bulk_create` with `update_conflicts=True`. ## 12. Rollout plan -1. **Schema and dependency.** Land the `pgvector` Python dep, the extension migration, and the embedding-column schema migration. No behavior change yet — `embedding` is nullable, queries still see only FTS. +1. **Schema and dependency.** Land the `pgvector` Python dep, the extension migration, and the schema migration. No behavior change at this point — `embedding` is nullable, queries still see only FTS. 2. **Embedding client and tests.** Land the client module and unit tests. No callers yet. -3. **Orchestrator models and migrations.** Add `EmbeddingJob`, `EmbeddingTask`, their migration, and the data migration that creates the system user. -4. **Orchestrator tasks and `embeddings_worker`.** Land `embedding_launcher`, `process_embedding_job`, `process_embedding_task`, the `embeddings_worker` container (with `--concurrency 4`), and the `EMBEDDING_DRAIN_CRON` setting. The launcher starts ticking; with no rows yet, all ticks no-op. -5. **Initial drain.** From a shell, run `embedding_launcher.defer()` so the orchestrator picks up the existing corpus. This is the only "operator action" in the rollout. It runs at `EMBEDDING_INDEX_PRIORITY` and lives behind whatever other work is on the queues; it can run for hours to days on a large corpus. -6. **Provider switch.** Replace the body of `radis.pgsearch.providers.search()` and `retrieve()` with the hybrid implementation. At this point hybrid is the new default; rows still missing an embedding participate via the FTS half only. -7. **Monitor.** Watch search latency p95, embedding-queue depth, `EmbeddingJob` admin state, and the rate of "FTS-only fallback" warnings. Tune `HYBRID_VECTOR_TOP_K` / `HYBRID_FTS_MAX_RESULTS` if needed. +3. **Async indexing.** Land the task, signal, backfill command, and `embeddings_worker` service.
New reports start getting embedded; the column gradually populates. +4. **Backfill.** Run `backfill_embeddings` against the existing corpus (manual op, can run for hours/days depending on size — that's fine, because it runs at `EMBEDDING_BACKFILL_PRIORITY`, so fresh-report embedding tasks always overtake it). +5. **Provider switch.** Replace the body of `radis.pgsearch.providers.search()` and `retrieve()` with the hybrid implementation. At this point hybrid is the new default; rows still missing an embedding participate via the FTS half only. +6. **Monitor.** Watch search latency p95, embedding queue depth, and the rate of "FTS-only fallback" warnings. Tune `HYBRID_VECTOR_TOP_K` / `HYBRID_FTS_MAX_RESULTS` if needed. -Each step is independently mergeable; steps 1–4 ship as quiet infrastructure changes with no user-visible effect, step 5 starts populating the column, step 6 is the moment hybrid goes live. +Each step is independently mergeable; steps 1–4 ship as quiet infrastructure changes with no user-visible effect, step 5 is the moment hybrid goes live. diff --git a/docs/superpowers/specs/2026-05-28-hybrid-search.md b/docs/superpowers/specs/2026-05-28-hybrid-search.md new file mode 100644 index 00000000..bf32ba4c --- /dev/null +++ b/docs/superpowers/specs/2026-05-28-hybrid-search.md @@ -0,0 +1,894 @@ +# Hybrid Search Design (FTS + Dense Vector via Qwen3-Embedding-4B) + +**Status:** Draft — design phase +**Author:** RADIS team (Samuel Kwong) +**Date:** 2026-05-28 +**Implementation skill (next step):** `writing-plans` +**Supersedes:** `2026-05-15-hybrid-search-design.md` + +--- + +## 1. Overview + +RADIS today provides PostgreSQL full-text search (FTS) over radiology reports via the `radis.pgsearch` provider: each `Report` gets a 1:1 `ReportSearchVector` row holding a `tsvector`, kept in sync via `post_save` signal and a bulk re-index task. Queries are ranked by `ts_rank` and snippeted via `ts_headline`.
+ +This spec extends that infrastructure with a dense-vector retrieval side, fused with FTS via Reciprocal Rank Fusion (RRF), to deliver **hybrid search**. Embeddings are produced by a Qwen3-Embedding-4B inference endpoint and stored in the same `ReportSearchVector` table. + +The public `SearchProvider` API (`radis.search.site`) is unchanged. Callers — `SearchView`, `ExtractionJob`, `SubscriptionJob`, the REST API — see no signature differences. Only the body of `radis.pgsearch.providers.search()` and `retrieve()` changes. + +## 2. Goals & non-goals + +### Goals + +- Combine the existing FTS recall with semantic recall so queries like "no pneumothorax" surface reports that describe the absence without containing the exact word (modulo the dense-retrieval polarity limitation in §11). +- Keep the existing `SearchProvider` contract intact. +- Index embeddings asynchronously without blocking report ingest. +- Keep embedding load isolated from chat/extraction/subscription LLM tasks. +- Degrade gracefully when the embedding service is unavailable (search continues as FTS-only). +- Make the embedding backend pluggable so Ollama can be used in dev and a Qwen3 endpoint in prod with the same code path. + +### Non-goals + +- No new search-provider plugin slot. The single `pgsearch` provider continues to be the only one registered. +- No per-query UI toggle for semantic vs. lexical. Hybrid is the new default. +- No Vespa, Elasticsearch, or OpenSearch adapter. +- No solution for negation/polarity (§11 documents this as known future work). +- No automated re-embedding when `EMBEDDING_DIM` changes. That is a manual operator procedure: drop column, re-migrate, defer the embedding orchestrator (see §4.5). +- No on-disk vector quantization. Float32 storage from day one; revisit if RAM pressure appears. + +## 3. 
Architecture + +``` +┌──────────────────────────────────────────────────────────────────────┐ +│ SearchView, REST API, ExtractionJob, SubscriptionJob │ +└──────────────┬───────────────────────────────────────────────────────┘ + │ Search(query, filters, offset, limit) + ▼ +┌──────────────────────────────────────────────────────────────────────┐ +│ radis.pgsearch.providers.search() (hybrid, replaces FTS-only) │ +│ │ +│ 1. embed_query() ──► EmbeddingClient ──► Qwen3 endpoint │ +│ on failure: query_vec = None │ +│ │ +│ 2. Vector top-K ────► ReportSearchVector (HNSW on .embedding) │ +│ filtered by structured filters │ +│ │ +│ 3. FTS hits ────► ReportSearchVector (GIN on .search_vector) │ +│ filtered by structured filters │ +│ │ +│ 4. Python-side RRF fusion of (vec_top_K ∪ fts_hits) │ +│ 5. Pagination on the fused order │ +│ 6. ts_headline() ────► ReportSearchVector (page-slice only) │ +└──────────────────────────────────────────────────────────────────────┘ + +┌──────────────────────────────────────────────────────────────────────┐ +│ Async indexing path (Job/Task orchestrator, periodic-driven) │ +│ │ +│ cron (settings.EMBEDDING_DRAIN_CRON, default nightly 02:00) │ +│ │ │ +│ ▼ │ +│ embedding_launcher() — `default` queue │ +│ ├─ queueing_lock="embedding_launcher" │ +│ ├─ skip if any EmbeddingJob in PREPARING/PENDING/IN_PROGRESS │ +│ ├─ skip if no rows with embedding IS NULL │ +│ └─ EmbeddingJob.objects.create(...) 
→ job.delay() │ +│ │ +│ process_embedding_job(job_id) — `default` queue │ +│ ├─ iterate ReportSearchVector with embedding IS NULL │ +│ ├─ chunk by EMBEDDING_BATCH_SIZE → EmbeddingTask rows │ +│ ├─ task.reports.set(chunk); task.delay() (no HTTP work) │ +│ └─ job.status = PENDING; return │ +│ │ +│ process_embedding_task(task_id) — `embeddings` queue │ +│ ├─ EmbeddingClient.embed_documents([r.body for r in task.reports])│ +│ ├─ L2-normalize; bulk_update ReportSearchVector.embedding │ +│ ├─ task.status = SUCCESS/FAILURE; clear queued_job_id │ +│ └─ job.update_job_state() │ +│ │ +│ Operator-triggered drain: from a Django shell run │ +│ `embedding_launcher.defer()` — same code path as periodic. │ +└──────────────────────────────────────────────────────────────────────┘ +``` + +The bulk-upsert API path (`reports/api/viewsets.py:_bulk_upsert_reports`) +already creates `ReportSearchVector` rows with `embedding=NULL` via the FTS +indexing call in its `on_commit` block. The single-create API path goes through +the standard `Report.save()` and the FTS `post_save` signal, which likewise +creates the `ReportSearchVector` row with NULL embedding. Both ingest paths +deposit work into the same DB-resident pending pool; the orchestrator drains it +on the next periodic tick (or on an operator-triggered defer). There is no +per-API-call embedding job. 
+ +**Components added inside `radis.pgsearch`:** + +| File | Purpose | +|---|---| +| `utils/embedding_client.py` | Sync + async HTTP clients with pluggable backends (`openai`, `ollama`) | +| `migrations/0002_pgvector_extension.py` | `CREATE EXTENSION IF NOT EXISTS vector;` | +| `migrations/0003_report_embedding.py` | Adds `embedding vector(N)` column + HNSW index | +| `migrations/0004_embedding_job_task.py` | Adds `EmbeddingJob` and `EmbeddingTask` tables + M2M to `Report` | +| `migrations/0005_system_user.py` | Data migration: creates the system user if missing | +| `models.py` (modified) | Adds `embedding` field + `HnswIndex`; defines `EmbeddingJob` and `EmbeddingTask` inheriting `AnalysisJob`/`AnalysisTask` | +| `signals.py` (unchanged from FTS-only) | The FTS `create_or_update_report_search_vector` receiver stays; **no embedding signal** | +| `tasks.py` (modified) | Adds `embedding_launcher` (periodic), `process_embedding_job` (`default` queue), `process_embedding_task` (`embeddings` queue) | +| `providers.py` (modified) | Replaces `search()` and `retrieve()` bodies with hybrid logic | +| `tests/...` | Coverage per §10 | + +**Infrastructure additions:** + +| File | Change | +|---|---| +| `pyproject.toml` | Add `pgvector>=0.3` dependency | +| `radis/settings/base.py` | New env-driven + constant settings (§8) | +| `example.env` | Document `EMBEDDING_*` env vars for openai and ollama backends | +| `docker-compose.base.yml` | Add `embeddings_worker` service + `EMBEDDING_*` env vars | +| `docker-compose.dev.yml` / `.prod.yml` | `embeddings_worker.command` running `bg_worker -q embeddings` | + +## 4. 
Schema and migrations + +### 4.1 Dependency + +Add to `pyproject.toml`: + +```toml +"pgvector>=0.3", +``` + +### 4.2 Postgres extension migration + +`radis/pgsearch/migrations/0002_pgvector_extension.py`: + +```python +class Migration(migrations.Migration): + dependencies = [("pgsearch", "0001_initial")] + operations = [ + migrations.RunSQL( + sql="CREATE EXTENSION IF NOT EXISTS vector;", + reverse_sql=migrations.RunSQL.noop, # do not drop in prod + ), + ] +``` + +Reverse is a no-op because the extension may be shared with other Postgres usage and dropping it would damage unrelated state. Dev rollback is handled by recreating the database. + +### 4.3 Schema migration + +`radis/pgsearch/migrations/0003_report_embedding.py`: standard `AddField` with a `VectorField(dimensions=settings.EMBEDDING_DIM, null=True)` and `AddIndex` for an `HnswIndex` with `opclasses=["vector_cosine_ops"]`, `m=16`, `ef_construction=64`. + +### 4.4 Model update + +`radis/pgsearch/models.py`: + +```python +from django.conf import settings +from pgvector.django import HnswIndex, VectorField + +class ReportSearchVector(models.Model): + report = models.OneToOneField(Report, on_delete=models.CASCADE, related_name="search_vector") + search_vector = SearchVectorField(null=True) + embedding = VectorField(dimensions=settings.EMBEDDING_DIM, null=True) + + class Meta: + indexes = [ + GinIndex(fields=["search_vector"]), + HnswIndex( + name="pgsearch_embedding_hnsw", + fields=["embedding"], + m=16, + ef_construction=64, + opclasses=["vector_cosine_ops"], + ), + ] +``` + +`embedding` is nullable: the row exists from the moment a `Report` is created (FTS path), but its embedding is filled asynchronously by `process_embedding_task` (§6.7). A NULL embedding is treated as "not embedded yet" at query time, and the row participates via the FTS half only. + +`save()` on `ReportSearchVector` retains its current behavior of recomputing `search_vector` from `report.body`. 
The embedding column is written **only** by `process_embedding_task` via `bulk_update()`, never by `save()`, to avoid triggering the FTS signal recursively and to keep the two indexing paths independent. + +### 4.5 Operational note on `EMBEDDING_DIM` + +pgvector columns and HNSW indexes are bound to a fixed dimension at create time, and HNSW has a 2000-dim ceiling (so `EMBEDDING_DIM ≤ 2000`; Qwen3-Embedding-4B's native 2560 is Matryoshka-truncated client-side). A Django system check (`pgsearch.E001`) compares `settings.EMBEDDING_DIM` against the literal in migration 0003 and fails `manage.py check` on mismatch. Changing `EMBEDDING_DIM` after deploy requires a manual operator procedure: + +1. Drop the HNSW index and the `embedding` column. +2. Re-run `0003_report_embedding` with the new `EMBEDDING_DIM`. +3. From a Django shell, defer the embedding launcher immediately rather than + waiting for the next nightly tick. A manual defer must pass the `timestamp` + argument that the periodic scheduler otherwise supplies: + + ```python + import time + + from radis.pgsearch.tasks import embedding_launcher + embedding_launcher.defer(timestamp=int(time.time())) + ``` + +This is documented as a deployment-time decision and intentionally not automated. + +## 5. Embedding client + +### 5.1 Module layout + +`radis/pgsearch/utils/embedding_client.py` exposes: + +- `class EmbeddingBackend(Protocol)` with `path`, `build_payload`, `parse_response`. +- `class OpenAIBackend(EmbeddingBackend)` — default path `/v1/embeddings`, body `{model, input: [...]}`, response `{data: [{embedding: [...]}]}`. +- `class OllamaBackend(EmbeddingBackend)` — default path `/api/embed`, body `{model, input: [...]}`, response `{embeddings: [[...]]}`. +- `BACKENDS: dict[str, EmbeddingBackend] = {"openai": OpenAIBackend(), "ollama": OllamaBackend()}`. +- `class EmbeddingClientError(Exception)`. +- `class EmbeddingClient` — sync client used by `process_embedding_task` and the query path.
+- `class AsyncEmbeddingClient` — async variant, kept for parity with `chats/utils/chat_client.py` and so the query path can call it from ASGI views without `async_to_sync` later. + +### 5.2 Interface + +```python +class EmbeddingClient: + def __init__(self): + self._backend = BACKENDS[settings.EMBEDDING_BACKEND] + self._path = settings.EMBEDDING_PROVIDER_PATH or self._backend.path + self._url = settings.EMBEDDING_PROVIDER_URL.rstrip("/") + self._path + self._model = settings.EMBEDDING_MODEL_NAME + self._timeout = settings.EMBEDDING_REQUEST_TIMEOUT + self._headers = {"Authorization": f"Bearer {settings.EMBEDDING_PROVIDER_API_KEY}"} \ + if settings.EMBEDDING_PROVIDER_API_KEY else {} + + def embed_documents(self, texts: list[str]) -> list[list[float]]: + """Embed texts verbatim. Truncates each to EMBEDDING_MAX_INPUT_CHARS first. + Returns L2-normalized vectors of length EMBEDDING_DIM.""" + + def embed_query(self, text: str) -> list[float]: + """Prepend EMBEDDING_QUERY_INSTRUCTION, then embed_documents([text])[0].""" +``` + +### 5.3 Wire shapes + +| Backend | Path (default) | Request | Response | +|---|---|---|---| +| `openai` | `/v1/embeddings` | `{"model": M, "input": [t, ...]}` | `{"data": [{"embedding": [...]}, ...]}` | +| `ollama` | `/api/embed` | `{"model": M, "input": [t, ...]}` | `{"embeddings": [[...], ...]}` | + +`EMBEDDING_PROVIDER_PATH` (env) overrides the backend default — this is how the production endpoint at `/api/embeddings` with an OpenAI-style payload is supported by the `openai` backend with a one-line config change, no new backend needed. + +### 5.4 Behavior details + +- **Query instruction:** the model card for Qwen3-Embedding recommends a task-specific instruction prefix on the query side only. `embed_query` prepends `EMBEDDING_QUERY_INSTRUCTION` (a Python constant in `base.py`); `embed_documents` does not. +- **Truncation:** any text longer than `EMBEDDING_MAX_INPUT_CHARS` is truncated at the character limit before being sent. 
A WARNING is logged with the report id (when known) and char count. Qwen3-Embedding-4B supports up to 32k tokens, so truncation will be rare for radiology bodies but is bounded as a defense against pathological inputs. +- **Normalization:** every returned vector is L2-normalized client-side, unconditionally. With unit vectors, cosine distance is monotonic in dot product, which makes the HNSW `vector_cosine_ops` operator effectively a fast inner-product search. Whether the upstream server normalizes is irrelevant. +- **Dimension validation:** every vector is checked to have length `EMBEDDING_DIM`. A mismatch raises `EmbeddingClientError`. +- **Batching:** `embed_documents` sends a single HTTP call per invocation. The higher-level orchestrator (`process_embedding_job`) groups reports into `EmbeddingTask` batches of `EMBEDDING_BATCH_SIZE` before dispatching them to `process_embedding_task`. +- **Errors:** non-2xx, timeout, malformed JSON, missing key, or wrong dim all raise `EmbeddingClientError`. The client never falls back internally — fallback policy is owned by the caller. +- **Dev recipe (Ollama):** + ```bash + ollama pull dengcao/Qwen3-Embedding-4B:Q5_K_M + # in .env: + EMBEDDING_BACKEND=ollama + EMBEDDING_PROVIDER_URL=http://host.docker.internal:11434 + EMBEDDING_MODEL_NAME=dengcao/Qwen3-Embedding-4B:Q5_K_M + EMBEDDING_DIM=1024 + ``` + `EMBEDDING_DIM` stays at the default 1024 because the HNSW index cannot exceed 2000 dimensions (§4.5); the model's native 2560-dim output is truncated client-side. GGUF-quantized embedding models produce slightly different vectors than the bf16 reference, so dev embeddings are not interchangeable with prod embeddings. After swapping the model between dev/prod, defer the embedding launcher from a Django shell (see §4.5). + +## 6. Async indexing (Job/Task orchestrator) + +The embedding lifecycle uses the same Job/Task pattern as `ExtractionJob` / +`ExtractionTask` (`radis/extractions/tasks.py:32`) and `SubscriptionJob` / +`SubscriptionTask` (`radis/subscriptions/tasks.py:33`).
A periodic launcher +creates one `EmbeddingJob` per drain run; the orchestrator splits pending +reports into `EmbeddingTask` batches; each task is processed on the +`embeddings` queue. + +### 6.1 Ingest paths and the pending pool + +RADIS has two ingest paths and the orchestrator is decoupled from both. Every +ingest path eventually deposits a `ReportSearchVector` row with +`embedding=NULL`; the launcher consumes that pool on its cron schedule. + +- **Single-create** (`POST /api/reports/`) routes through `Report.objects.create` + in the serializer (`radis/reports/api/serializers.py:87`). The FTS + `post_save` receiver creates the `ReportSearchVector` row with NULL embedding. +- **Bulk-upsert** (`POST /api/reports/bulk-upsert`) routes through + `Report.objects.bulk_create` / `bulk_update` + (`radis/reports/api/viewsets.py:_bulk_upsert_reports`). The bulk path calls + `enqueue_bulk_index_reports(touched_ids)` in its `on_commit` block, which + bulk-creates the `ReportSearchVector` rows with NULL embedding. + +Accepting a freshness window of hours / next-cycle is the price of batched, +throughput-friendly embedding runs. This design serves all three operational +scenarios with one mechanism: + +| Scenario | What happens | +|---|---| +| **Initial bulk upload** (millions of reports via `/bulk-upsert`) | `ReportSearchVector` rows created with `embedding=NULL`. Operator defers the launcher immediately or waits for the next cron tick. One `EmbeddingJob` produces N `EmbeddingTask` batches. | +| **Daily ad-hoc upload** | Reports land NULL via either ingest path. Next periodic tick consolidates the day's pending pool into a single `EmbeddingJob`. | +| **Model-change backfill** | Operator follows §4.5 (drop column, re-migrate), then defers the launcher from a shell. Same code path as the periodic. | + +### 6.2 Queue and worker + +The `embeddings` Procrastinate queue is served by the `embeddings_worker` +container. 
The orchestrator (`process_embedding_job`) runs on the `default` +queue alongside `process_extraction_job` and `process_subscription_job`; the +sub-tasks (`process_embedding_task`) run on `embeddings`. + +``` +./manage.py bg_worker -l debug -q embeddings --autoreload --concurrency 4 # dev +./manage.py bg_worker -l info -q embeddings --concurrency 4 # prod +``` + +`embeddings_worker` concurrency tunes parallelism against the embedding +endpoint. Recommended 4; raise if the endpoint has spare throughput, lower if +it rate-limits. The orchestrator does not run on this queue, so there is no +self-deadlock condition tied to concurrency on the `embeddings` queue. + +### 6.3 Priorities + +Procrastinate priority is "higher = sooner". Embedding work runs at lower +priority than extraction and subscription so it never starves user-driven LLM +operations. The orchestrator (`default` queue) and sub-tasks (`embeddings` +queue) share `EMBEDDING_INDEX_PRIORITY`; there is no separate backfill +priority because the backfill path is the same orchestrator. 
+ +| Task | Priority | +|---|---| +| `EXTRACTION_DEFAULT_PRIORITY` (existing) | 2 | +| `EXTRACTION_URGENT_PRIORITY` (existing) | 3 | +| `SUBSCRIPTION_DEFAULT_PRIORITY` (existing) | 3 | +| `SUBSCRIPTION_URGENT_PRIORITY` (existing) | 4 | +| `EMBEDDING_INDEX_PRIORITY` (new) | 0 | + +### 6.4 Models + +`radis/pgsearch/models.py` defines two new models inheriting `AnalysisJob` and +`AnalysisTask` (`radis/core/models.py:17,220`): + +```python +from radis.core.models import AnalysisJob, AnalysisTask + + +class EmbeddingJob(AnalysisJob): + default_priority = settings.EMBEDDING_INDEX_PRIORITY + urgent_priority = settings.EMBEDDING_INDEX_PRIORITY # no urgent variant + + def delay(self) -> None: + queued_job_id = app.configure_task( + "radis.pgsearch.tasks.process_embedding_job", + allow_unknown=False, + priority=self.default_priority, + ).defer(job_id=self.pk) + self.queued_job_id = queued_job_id + self.save() + + +class EmbeddingTask(AnalysisTask): + job = models.ForeignKey(EmbeddingJob, on_delete=models.CASCADE, related_name="tasks") + reports = models.ManyToManyField(Report, related_name="embedding_tasks") + + def delay(self) -> None: + queued_job_id = app.configure_task( + "radis.pgsearch.tasks.process_embedding_task", + allow_unknown=False, + priority=settings.EMBEDDING_INDEX_PRIORITY, + ).defer(task_id=self.pk) + self.queued_job_id = queued_job_id + self.save() +``` + +**Owner field.** `AnalysisJob.owner` is non-nullable (`settings.AUTH_USER_MODEL`). +Embedding jobs are system-driven and have no human creator. A data migration +(`0005_system_user.py`) creates a `User(username=settings.EMBEDDING_SYSTEM_USERNAME, +is_active=False, password=unusable)` idempotently; the launcher assigns this +user as `owner` on every `EmbeddingJob`. This avoids subclass-level overrides +of `owner` and keeps the abstract contract clean. + +**No `get_absolute_url` in v1.** Existing `ExtractionJob` and `SubscriptionJob` +implement `get_absolute_url` because they have user-facing detail views. 
+`EmbeddingJob` has no user-facing UI in v1 — operators inspect it via Django +admin (default `ModelAdmin` registration is sufficient). The inherited abstract +`AnalysisJob.get_absolute_url` body is `...`, returning `None`; no call site in +radis treats an `EmbeddingJob` like a user-facing analysis job. A future spec +can add the view and override the method. + +`urgent`, `send_finished_mail`, and `finished_mail_template` stay at their +`AnalysisJob` defaults (`False`, `False`, `None`). + +### 6.5 Launcher (the periodic task) + +`radis/pgsearch/tasks.py`: + +```python +@app.periodic(cron=settings.EMBEDDING_DRAIN_CRON) +@app.task( + queue="default", + queueing_lock="embedding_launcher", + pass_context=True, +) +def embedding_launcher(context, timestamp: int) -> None: + in_flight = EmbeddingJob.objects.filter( + status__in=[ + EmbeddingJob.Status.PREPARING, + EmbeddingJob.Status.PENDING, + EmbeddingJob.Status.IN_PROGRESS, + ] + ).exists() + if in_flight: + logger.info("EmbeddingJob already in flight; launcher tick is a no-op.") + return + + has_pending = ReportSearchVector.objects.filter(embedding__isnull=True).exists() + if not has_pending: + logger.debug("No reports pending embedding; launcher tick is a no-op.") + return + + system_user = User.objects.get(username=settings.EMBEDDING_SYSTEM_USERNAME) + job = EmbeddingJob.objects.create( + owner=system_user, + status=EmbeddingJob.Status.PREPARING, + ) + transaction.on_commit(job.delay) +``` + +**Two reinforcing layers of duplicate-dispatch prevention:** + +- **Procrastinate `queueing_lock="embedding_launcher"`.** While a launcher job + is waiting in the queue (`todo`), another `defer` call with the same lock is + rejected with `AlreadyEnqueued`, so launcher jobs never pile up. The lock + covers queued jobs only, not one that is already executing (`doing`). The + launcher itself is fast (one existence check + maybe one INSERT), so that + unguarded window lasts only milliseconds, and the in-flight check below + handles a second launcher that starts after the first has created its job.
+- **In-flight EmbeddingJob check.** Even if the queueing lock leaks (worker + crash mid-flight, manual `defer` from a shell, dashboard re-trigger), the + launcher's first action is to look for any `EmbeddingJob` in a non-terminal + status. If one exists, the launcher returns without creating another. This + is the same dedup pattern used by `process_extraction_job` when re-entered + (`extractions/tasks.py:46`). + +### 6.6 Orchestrator (`process_embedding_job`) + +```python +@app.task +def process_embedding_job(job_id: int) -> None: + job = EmbeddingJob.objects.get(id=job_id) + assert job.status == EmbeddingJob.Status.PREPARING + + # Retry/resume path: tasks already exist, re-enqueue still-pending ones. + if job.tasks.exists(): + tasks_to_enqueue = job.tasks.filter(status=EmbeddingTask.Status.PENDING) + else: + pending_ids_iter = ( + ReportSearchVector.objects + .filter(embedding__isnull=True) + .values_list("report_id", flat=True) + .iterator(chunk_size=10_000) + ) + batch: list[int] = [] + for report_id in pending_ids_iter: + batch.append(int(report_id)) + if len(batch) >= settings.EMBEDDING_BATCH_SIZE: + _create_embedding_task(job, batch) + batch = [] + if batch: + _create_embedding_task(job, batch) + + tasks_to_enqueue = job.tasks.filter(status=EmbeddingTask.Status.PENDING) + + job.status = EmbeddingJob.Status.PENDING + job.queued_job_id = None + job.save() + + for task in tasks_to_enqueue: + if not task.is_queued: + task.delay() + + +def _create_embedding_task(job: EmbeddingJob, report_ids: list[int]) -> EmbeddingTask: + task = EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.PENDING) + task.reports.set(Report.objects.filter(pk__in=report_ids)) + return task +``` + +Mirrors `process_extraction_job` (`extractions/tasks.py:32`). State transitions +follow the standard pattern: + +- `PREPARING` while tasks are being created (sub-tasks must not be dispatched yet). +- `PENDING` after task creation completes; sub-tasks are then enqueued. 
+- `IN_PROGRESS` / `SUCCESS` / `WARNING` / `FAILURE` driven by `update_job_state` + (inherited from `AnalysisJob`) called from each sub-task on completion. + +The orchestrator does no HTTP work. For 1M pending reports at +`EMBEDDING_BATCH_SIZE=32`, it creates ~31,250 `EmbeddingTask` rows and defers +them — well under a minute on the `default` worker. Its slot is freed +immediately after; long-running embedding work happens on the `embeddings` +worker. + +### 6.7 Sub-task (`process_embedding_task`) + +```python +@app.task(queue="embeddings") +def process_embedding_task(task_id: int) -> None: + task = EmbeddingTask.objects.get(id=task_id) + task.status = EmbeddingTask.Status.IN_PROGRESS + task.started_at = timezone.now() + task.attempts = task.attempts + 1 + task.save() + + client = EmbeddingClient() + try: + report_ids = list(task.reports.values_list("pk", flat=True)) + rsvs = list( + ReportSearchVector.objects + .filter(report_id__in=report_ids) + .select_related("report") + .only("id", "report_id", "report__body") + ) + texts = [rsv.report.body for rsv in rsvs] + vectors = client.embed_documents(texts) + for rsv, vec in zip(rsvs, vectors, strict=True): + rsv.embedding = vec + ReportSearchVector.objects.bulk_update(rsvs, fields=["embedding"]) + + task.status = EmbeddingTask.Status.SUCCESS + except EmbeddingClientError as exc: + logger.exception("Embedding task %s failed: %s", task_id, exc) + task.status = EmbeddingTask.Status.FAILURE + task.message = str(exc) + raise # Procrastinate retry policy applies + finally: + task.ended_at = timezone.now() + task.queued_job_id = None + task.save() + task.job.update_job_state() + client.close() +``` + +Raising on `EmbeddingClientError` +lets Procrastinate's retry policy apply. After retries exhaust, the exception +propagates, the task ends as `FAILURE`, and `update_job_state` is still called +from the `finally` block. The job finishes with status `WARNING` (some tasks +failed, some succeeded) or `FAILURE` (all failed). 
The next launcher tick will +create a fresh job that picks up any rows still NULL. + +### 6.8 Operator-triggered drain + +The only ingest-time signal is the FTS `post_save` receiver +(`create_or_update_report_search_vector`), which creates the +`ReportSearchVector` row with `embedding=NULL`. Embedding is driven entirely +by the orchestrator from then on. + +Operators trigger an immediate drain — typically after a model swap or initial +bulk import — by deferring the same launcher from a Django shell. A manual +defer must pass the `timestamp` argument that the periodic scheduler otherwise +supplies: + +```python +import time + +from radis.pgsearch.tasks import embedding_launcher +embedding_launcher.defer(timestamp=int(time.time())) +``` + +This goes through the same launcher → orchestrator → sub-task path as the +periodic; the only difference is who fires it. One code path, one set of +tests, one observable lifecycle. + +## 7. Hybrid search provider + +### 7.1 Universe and fusion + +The hybrid result universe is the **union** of two filter-bounded candidate sets: + +- **Vector top-K:** the `HYBRID_VECTOR_TOP_K` nearest rows by cosine distance to the query embedding, filtered by structured filters and `embedding IS NOT NULL`. *Not* constrained to the FTS hit set. +- **FTS hits:** all rows matching the tsquery and the structured filters, capped at `HYBRID_FTS_MAX_RESULTS`. + +A report appears in results if it is in **either** set. This is the change from the earlier draft, made because radiology queries like "no pneumothorax" must be able to surface reports that lexically don't match (the GIN index drops "no" as a stop word) but are semantically related. + +Each report's score is plain Reciprocal Rank Fusion: + +``` +score(d) = (1 / (HYBRID_RRF_K + vec_rank[d]) if d ∈ vec_top_K else 0) + + (1 / (HYBRID_RRF_K + fts_rank[d]) if d ∈ fts_hits else 0) +``` + +Properties: + +- Reports in both sides outrank reports in only one side (sum of two terms vs. one).
+- Vector contribution decays after rank K (no `vec_rank` entry), so the ordering naturally transitions from "hybrid head" to "FTS tail" with no explicit cutoff. +- A query with zero FTS hits returns `vec_top_K` ranked by vector position only — pure semantic search. +- A query with embedding failure returns FTS hits ranked by `ts_rank` only — the pre-hybrid behavior. + +### 7.2 `search()` flow + +```python +def search(s: Search) -> SearchResult: + query_str = _build_query_string(s.query) + language = _resolve_language(s.filters) + filter_q = _build_filter_query(s.filters) + tsquery = SearchQuery(query_str, search_type="raw", config=language) + + # Vector side + query_text = QueryParser.unparse(s.query) # same helper SearchView already uses + try: + query_vec = EmbeddingClient().embed_query(query_text) + except EmbeddingClientError as e: + logger.warning("Falling back to FTS-only: %s", e) + query_vec = None + + vec_rank: dict[int, int] = {} + if query_vec is not None: + ids = list( + ReportSearchVector.objects + .filter(filter_q) + .exclude(embedding__isnull=True) + .annotate(distance=CosineDistance("embedding", query_vec)) + .order_by("distance", "report_id") + .values_list("report_id", flat=True)[:settings.HYBRID_VECTOR_TOP_K] + ) + vec_rank = {rid: i + 1 for i, rid in enumerate(ids)} + + # FTS side + fts_rows = list( + ReportSearchVector.objects + .filter(filter_q) + .filter(search_vector=tsquery) + .annotate(rank=SearchRank(F("search_vector"), tsquery)) + .order_by("-rank", "report_id") + .values("report_id", "rank")[:settings.HYBRID_FTS_MAX_RESULTS] + ) + fts_rank = {row["report_id"]: i + 1 for i, row in enumerate(fts_rows)} + + # Fusion (pure Python, factored out for unit testing) + ordered_ids = _rrf_fuse(vec_rank, fts_rank, k=settings.HYBRID_RRF_K) + + total_count = len(ordered_ids) + total_relation = ( + "at_least" + if len(fts_rows) >= settings.HYBRID_FTS_MAX_RESULTS + or len(vec_rank) >= settings.HYBRID_VECTOR_TOP_K + else "exact" + ) + page_ids = 
ordered_ids[s.offset : s.offset + (s.limit or len(ordered_ids))] + + # Headline + hydration for the page slice only + page_rows = ( + ReportSearchVector.objects + .filter(report_id__in=page_ids) + .annotate( + summary=SearchHeadline("report__body", tsquery, config=language, + start_sel="", stop_sel="", + min_words=10, max_words=20, max_fragments=10), + rank=SearchRank(F("search_vector"), tsquery), + ) + .select_related("report") + ) + by_id = {r.report_id: r for r in page_rows} + documents = [ + document_from_pgsearch_response(_with_fallback_summary(by_id[rid])) + for rid in page_ids if rid in by_id + ] + return SearchResult(total_count=total_count, total_relation=total_relation, documents=documents) +``` + +### 7.3 Empty-summary fallback + +`SearchHeadline` returns an empty string when the document body has no FTS match (the vector-only hit case). `_with_fallback_summary` replaces an empty summary with the first 30 words of `report.body`. Trivial helper, ~5 lines. + +### 7.4 `retrieve()` + +Same fusion logic, returns an iterator of `report__document_id` in `ordered_ids` order. No headline. Used by `ExtractionJob` and `SubscriptionJob` to walk the matching id set. + +### 7.5 `count()` and `filter()` + +Unchanged. These operate on filters only and never call the embedding service. + +### 7.6 `ReportDocument.relevance` + +Kept as `ts_rank` for API backwards compatibility. RRF is an internal ordering signal and is not exposed on the public document type. RRF scores are logged at DEBUG for diagnostics. + +### 7.7 `search_provider.max_results` + +Updated to `max(HYBRID_VECTOR_TOP_K, HYBRID_FTS_MAX_RESULTS)`, which is what the `SearchView` page-bound check uses to reject impossibly-deep pagination. + +## 8. 
Configuration + +### 8.1 Env-driven (per-deployment, set in `.env`) + +```python +# radis/settings/base.py +EMBEDDING_BACKEND = env.str("EMBEDDING_BACKEND", default="openai") +EMBEDDING_PROVIDER_URL = env.str("EMBEDDING_PROVIDER_URL", default="") +EMBEDDING_PROVIDER_PATH = env.str("EMBEDDING_PROVIDER_PATH", default="") # "" = backend default +EMBEDDING_PROVIDER_API_KEY = env.str("EMBEDDING_PROVIDER_API_KEY", default="") +EMBEDDING_MODEL_NAME = env.str("EMBEDDING_MODEL_NAME", default="Qwen/Qwen3-Embedding-4B") +EMBEDDING_DIM = env.int("EMBEDDING_DIM", default=1024) +EMBEDDING_DRAIN_CRON = env.str("EMBEDDING_DRAIN_CRON", default="0 2 * * *") +``` + +These vary across dev/staging/prod and are operator-controlled. `EMBEDDING_DIM` is intentionally an env decision because it is schema-coupled (see §4.5). `EMBEDDING_DRAIN_CRON` is env-tunable so dev environments can drain more frequently (e.g., `*/15 * * * *`) without a code change. + +### 8.2 Code constants (tuning knobs, in `base.py`) + +```python +EMBEDDING_REQUEST_TIMEOUT = 30 # seconds +EMBEDDING_MAX_INPUT_CHARS = 60_000 +EMBEDDING_QUERY_INSTRUCTION = ( + "Instruct: Given a radiology search query, retrieve relevant radiology reports.\n" + "Query: " +) +EMBEDDING_BATCH_SIZE = 32 + +EMBEDDING_INDEX_PRIORITY = 0 +EMBEDDING_SYSTEM_USERNAME = "system" + +HYBRID_VECTOR_TOP_K = 100 +HYBRID_FTS_MAX_RESULTS = 10_000 +HYBRID_RRF_K = 60 +``` + +These are tuning constants. Changing them is a code change with a PR diff. This matches the project's existing pattern (`EXTRACTION_LLM_CONCURRENCY_LIMIT = 6`, the `CHAT_*_SYSTEM_PROMPT` blocks). `EMBEDDING_SYSTEM_USERNAME` names the system user that owns every auto-generated `EmbeddingJob`; the data migration creates this user idempotently. + +### 8.3 `example.env` + +Adds a documented Ollama block and a Qwen/OpenAI-compatible block side by side, keyed off `EMBEDDING_BACKEND`. 
Documents `EMBEDDING_DRAIN_CRON` with the production default (`0 2 * * *`) and a dev-friendly alternative (`*/15 * * * *`). + +### 8.4 Compose + +`docker-compose.base.yml`: + +- New service `embeddings_worker` inheriting `*default-app`. +- The `EMBEDDING_BACKEND`, `EMBEDDING_PROVIDER_URL`, `EMBEDDING_PROVIDER_PATH`, `EMBEDDING_PROVIDER_API_KEY`, `EMBEDDING_MODEL_NAME`, `EMBEDDING_DIM` env keys added to the `&default-app` block so all services see them. + +`docker-compose.dev.yml`: + +- `embeddings_worker.command`: `bash -c "wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} && ./manage.py bg_worker -l debug -q embeddings --autoreload"`. + +`docker-compose.prod.yml`: + +- Same without `--autoreload`, log level `info`. + +## 9. Error handling and degradation + +| Failure | Behavior | Logging | +|---|---|---| +| Embedding service returns 5xx/timeout during query-time | `query_vec = None`; result list ordered by FTS-only; request succeeds | WARNING with request id | +| Embedding service returns 4xx during query-time | Same FTS-only fallback (treats as misconfig at request layer) | ERROR | +| Embedding service returns malformed body | `EmbeddingClientError` raised; query falls back to FTS-only | ERROR | +| Embedding service down during a sub-task | `process_embedding_task` raises; Procrastinate retries with exponential backoff; `embedding` stays NULL | WARNING per attempt, ERROR after final retry | +| Launcher fires while EmbeddingJob is `PREPARING`/`PENDING`/`IN_PROGRESS` | Status check returns immediately; tick is a no-op | INFO | +| Orchestrator crashes during task creation (partial dispatch) | Job stays in `PREPARING`. Next launcher tick sees in-flight job and no-ops. Operator marks job `FAILURE` in admin to allow a fresh run | ERROR + operator action | +| Sub-task fails after Procrastinate retries exhausted | Task ends as `FAILURE`. `update_job_state` rolls the job to `WARNING` (some tasks succeeded) or `FAILURE` (all failed). 
NULL rows remain; next launcher creates a new job to retry them | ERROR | +| `embeddings_worker` saturation | Sub-tasks queue up; orchestrator already returned. No deadlock; just slower drain | DEBUG | +| Report body > `EMBEDDING_MAX_INPUT_CHARS` | Truncate, embed truncated text | WARNING with report_id and char count | +| Report deleted between task creation and execution | Sub-task's `task.reports.values_list(...)` returns fewer rows; `embed_documents` called on smaller list; no error | DEBUG | +| Vector dim mismatch on write | Postgres raises; sub-task fails, retried | ERROR — escalate to admin | +| `EMBEDDING_PROVIDER_URL` empty at startup | `EmbeddingClient` construction defers to call site; calls log + raise; query falls back to FTS-only | WARNING once on first request | +| System user missing (data migration didn't run) | Launcher raises `User.DoesNotExist`. Loud failure; deployment misconfiguration. Fix: run migrations | ERROR | + +**Deliberate non-policies:** + +- The product never fails a search request because the embedding service is down. It degrades to FTS-only. +- Query embeddings are not cached. The complexity and freshness trade-off is not worth it at the corpora sizes RADIS targets. +- `EmbeddingClient` does not retry internally. Procrastinate retries the whole task; the query path uses a single shot. + +**Observability:** + +- Provider logs at DEBUG: vec hit count, FTS hit count, intersection count, fusion ms, query-embed ms. +- `process_embedding_task` logs at INFO: batch size, total chars, latency, success/retry counts. +- `embedding_launcher` and `process_embedding_job` log status transitions and dispatch counts at INFO. +- Operators inspect job/task state via Django admin (`EmbeddingJob`, `EmbeddingTask` use the default `ModelAdmin`). +- The existing OpenTelemetry overlay (commit `653e0c67`) tags telemetry per service; `embeddings_worker` shows up automatically. + +## 10. 
Testing strategy + +### 10.1 Unit tests (no DB) + +| File | Coverage | +|---|---| +| `tests/unit/test_embedding_client.py` | Backend payload/response round-trip, path override, instruction prefix, normalization, dim validation, all error modes, truncation | +| `tests/unit/test_provider_fusion.py` | `_rrf_fuse(vec_rank, fts_rank, k)` pure-Python helper: disjoint, overlapping, FTS-only, vector-only, both-empty, tiebreak by report_id | +| `tests/unit/test_embedding_launcher.py` | No-op when EmbeddingJob already in flight; no-op when no rows pending; happy path creates job and calls `delay`; raises if system user missing | +| `tests/unit/test_process_embedding_job.py` | Batches pending reports into `EmbeddingTask` rows of size `EMBEDDING_BATCH_SIZE`; status transitions `PREPARING` → `PENDING`; retry/resume path re-enqueues only `PENDING` tasks; empty pool exits cleanly | +| `tests/unit/test_process_embedding_task.py` | Embeds reports, writes vectors, sets status `SUCCESS`; status `FAILURE` and re-raise on `EmbeddingClientError`; calls `job.update_job_state` in both paths; clears `queued_job_id` | + +### 10.2 Integration tests (real Postgres + pgvector) + +| File | Coverage | +|---|---| +| `tests/integration/test_migrations.py` (new, `django-test-migrations`) | Extension migration runs; column + HNSW index created with configured dim; reverse works | +| `tests/integration/test_provider_hybrid.py` (new) | FTS-only hit, vector-only hit ("no pneumothorax" fixture), both-sides hit, filter honoring, stable pagination, embedding-service-down fallback, NULL-embedding rows still returned, `ts_headline` query-count bounded to page, empty-summary fallback | + +Factories: existing `ReportSearchVectorFactory` gains optional `embedding` kwarg (default `None`). New `ReportSearchVectorWithEmbeddingFactory` generates deterministic normalized vectors of the configured dim from a seed. Real Qwen3 embeddings are not used in tests. 
+ +### 10.3 View-level smoke + +`radis/search/tests/test_views.py` (extend): + +- Search request with hybrid enabled returns 200 and renders documents. +- Search request with `EMBEDDING_PROVIDER_URL=""` returns 200 (FTS-only path). + +### 10.4 Acceptance (`@pytest.mark.acceptance`) + +One end-to-end test against the dev containers, with the embedding service stubbed (either a small in-test FastAPI or a recorded fixture response), verifying the search page returns hybrid results. Marked acceptance so it's opt-in like the existing acceptance suite. + +### 10.5 Explicitly not tested + +- Live Qwen3 retrieval quality (offline eval, out of scope). +- pgvector HNSW recall under specific data shapes (extension's responsibility). +- Wire formats beyond the two supported backends. + +## 11. Known limitations and future work + +### 11.1 Negation / polarity (the "no pneumothorax" problem) + +Dense embedding models — including Qwen3-Embedding — embed semantically opposite phrases close together. "No pneumothorax" and "pneumothorax present" produce nearby vectors, so the vector half of the hybrid score is *polarity-blind*. The FTS half partly compensates by allowing the user to construct explicit AND-NOT queries, but Postgres' text-search configuration drops "no" as a stop word when building the `tsvector`/`tsquery` (the GIN index never sees it), so a naive query like `no pneumothorax` is effectively `pneumothorax` on the FTS side. + +This is a real concern for radiology, where negated findings are pervasive ("no acute …", "no evidence of …", "no significant …"). **Hybrid search as designed here does not solve this.** It is documented as an accepted limitation of v1, and a v2 conversation should address it. + +Candidate solutions to evaluate in a future spec (none committed): + +- A cross-encoder re-ranker over the top-N hybrid results (e.g., a small instruction-tuned model that knows to score "no X" against "X present" as opposite). 
+- Adding a sparse/late-interaction model (SPLADE, ColBERT) alongside the dense vector — sparse models preserve token-level polarity. +- Negation-aware query preprocessing: detect negation, route to a different retrieval mode, or expand to phrasal `AND-NOT` clauses on the FTS side that bypass the stop-word filter (e.g., search the raw body, not the tsvector). +- Structured-findings indexing: have the LLM extract presence/absence flags per finding category at ingest time, search those structured fields instead of (or in addition to) prose. + +### 11.2 Dimension changes are manual + +See §4.5. + +### 11.3 GGUF dev embeddings ≠ bf16 prod embeddings + +Documented in §5.4. Mitigated by deferring `embedding_launcher` after a model swap (see §4.5). The next drain re-embeds everything. + +### 11.4 No body-change detection for re-embedding + +V1 re-embeds anything where `embedding IS NULL`. A future optimization could +track whether the body actually changed (e.g., a `body_hash` column on +`ReportSearchVector` updated only on body changes) so metadata-only updates +don't have to null the embedding. Not in v1; profiling will tell us whether it +matters. + +### 11.5 Operator-aware queries: FTS / vector asymmetry + +Both halves of hybrid search receive a derivation of the same parsed `QueryNode`, but interpret it through completely different machinery. The FTS side consumes a `tsquery` built by `_build_query_string` where `AND`, `OR`, `NOT`, quoted phrases, and parens are first-class boolean operators (`&`, `|`, `!`, `<->`, `()`). The vector side consumes the canonical unparsed string and feeds it whole to the embedding model as natural language; the operators become ordinary word tokens that the model has no operator-aware machinery to interpret. + +Practical consequences: + +- **Natural-phrase queries** (`pneumothorax`, `chest x-ray`, implicit-AND `cardiac arrest`) — both halves point the same direction. RRF amplifies the agreement. 
This is the workload hybrid search is best at. +- **`A AND B`** — FTS strictly intersects; vector returns docs about a topic-mix of A and B (which usually includes some single-side hits). Docs matching both lexically *and* semantically rank highest, which is the desired outcome. Vector contributes useful expansion but not boolean precision. +- **`A OR B`** — FTS unions; the vector half has no concept of disjunction and just produces a centroid-style embedding. Docs about either A or B that happen to be near the centroid still get retrieved, but a doc purely about A may not appear unless it's also close to the centroid. Vector half degrades from "asset" to "noise". +- **`NOT X`** — sharpest conflict. FTS correctly returns docs without X. Dense embeddings are polarity-blind, so the vector for `"NOT X"` clusters next to the vector for `"X"` and the top-K nearest neighbours are docs *about* X — the polar opposite of what the user asked for. The two halves return nearly disjoint sets that RRF interleaves, producing actively misleading results rather than mere noise. (Distinct from §11.1, which is about natural-language negation like `no pneumothorax` where the FTS stop-word strip happens to align the halves accidentally.) + +**Candidate mitigation (not in v1, recommended follow-up):** strip negated branches from the query string before embedding. Walk the AST; when a `UnaryNode("NOT", X)` is encountered, drop `X` from the string passed to the embedding model. The FTS side still gets the full structure. Outcomes: + +- `NOT X` alone → vector receives an empty query and is skipped; provider falls back to FTS-only ranking. Correct. +- `A AND NOT B` → vector embeds just `A`; FTS enforces `A & !B`. Vector adds positive semantic signal for A, FTS enforces the exclusion. The halves are aligned again. + +This is ~15 lines of code in `providers.search()` / `providers.retrieve()` and a small extension to `QueryParser` for the AST walk. 
Other candidates (negation-aware re-ranker, embedding subtraction, sparse models like SPLADE-NEG) are heavier and listed in §11.1. + +**Why a re-ranker alone cannot fix this.** A cross-encoder re-ranker improves precision *within the candidate pool it is given* — it cannot improve recall of that pool. For `NOT pneumothorax` over a 1000-doc corpus where 600 docs don't mention the word, the hybrid candidate pool is poisoned: ~100 wrong docs (pneumothorax-discussing reports pulled in by the polarity-blind vector half) displace 100 of the 600 correct docs from the top-N positions. After re-ranking top-20, the head of results is sharper, but ~590 correct docs still live below the re-ranker's cutoff at their original RRF positions, interleaved with the remaining 90 wrong docs. The architecturally correct order is to fix recall upstream (strip negated branches before embedding, restoring a clean candidate pool) and *then* layer a re-ranker for precision. A re-ranker without the upstream fix is rearranging deck chairs on a polluted pool. + +### 11.6 Cross-encoder re-ranker (deferred) + +A planned follow-up adds a re-ranker stage between hybrid fusion and result hydration to lift precision (especially on operator-light natural-phrase queries, where the candidate pool is already correct but RRF ordering is mediocre) and to partially compensate for §11.1's polarity blindness. Two backend patterns are under consideration: + +- **Pointwise cross-encoder via vLLM.** Qwen3-Reranker-4B served with `vllm serve … --task score` exposes `/v1/rerank` (Cohere/TEI shape: `{model, query, documents}` → `[{index, relevance_score}]`). Logit-based scoring (yes/no token logits → softmax) gives graded relevance in [0,1]. Latency ~30–100 ms per pair on a single GPU; for top-20 candidates that's ~0.6–2 s added if the pairs are scored sequentially. +- **Listwise LLM re-ranker** via the existing OpenAI-compatible chat-completions endpoint. 
The LLM is prompted with the query and the top-N candidates packed into a single message; structured output (`response_format=json_object`) returns a ranked list of indices. One HTTP call per query rather than N. Latency ~1–3 s for top-20 depending on model size. Quality trades off graded precision for the LLM's strong instruction-following — particularly the explicit "respect negation" cue, which the pointwise reranker has to learn implicitly. + +vLLM is the recommended production host for the pointwise path because Ollama (as of mid-2025) does not expose token logits cleanly, which collapses Qwen3-Reranker to a binary 1.0/0.0 signal and loses graded ordering. Ollama can still serve the LLM listwise backend without issue. + +### 11.7 Evaluation strategy for the layered hybrid stack + +Six profiles cover the additive layers: + +| Profile | Negation strip (§11.5) | Re-ranker (§11.6) | +|---|---|---| +| `baseline` | off | off | +| `strip` | on | off | +| `rerank-qwen` | off | Qwen3-Reranker via vLLM | +| `rerank-llm` | off | listwise LLM | +| `both-qwen` | on | Qwen3-Reranker via vLLM | +| `both-llm` | on | listwise LLM | + +A `run_search_eval` management command loops a set of test queries through all six profiles (toggling settings via `override_settings`) and dumps comparable JSON output with top-N docs, per-layer scores (`ts_rank`, `cosine_distance`, `rrf_score`, `rerank_score`), and per-profile latencies. + +**Labeling.** Per-pair LLM relevance judgment ("is doc D relevant to query Q?") is unreliable for radiology because (a) it inherits the same polarity blind spot the system is trying to evaluate, and (b) it introduces circular bias when the labeling LLM and re-ranker LLM share a family. 
The preferred approach is *concept-based polarity-aware labeling*: label each report once per clinical concept with `PRESENT` / `ABSENT` / `NOT_MENTIONED`, then derive query relevance deterministically (`pneumothorax` → `PRESENT ∪ ABSENT`; `NOT pneumothorax` → `NOT_MENTIONED ∪ ABSENT` for strict exclusion, or `ABSENT` only for "rule-out" semantics). The concept labels are reusable across many queries and survive prompt/model changes. The upstream label-filter work in PR #196 produces structured labels with comparable semantics and is the intended source of ground truth for production-scale evaluation. + +## 12. Rollout plan + +1. **Schema and dependency.** Land the `pgvector` Python dep, the extension migration, and the embedding-column schema migration. No behavior change yet — `embedding` is nullable, queries still see only FTS. +2. **Embedding client and tests.** Land the client module and unit tests. No callers yet. +3. **Orchestrator models and migrations.** Add `EmbeddingJob`, `EmbeddingTask`, their migration, and the data migration that creates the system user. +4. **Orchestrator tasks and `embeddings_worker`.** Land `embedding_launcher`, `process_embedding_job`, `process_embedding_task`, the `embeddings_worker` container (with `--concurrency 4`), and the `EMBEDDING_DRAIN_CRON` setting. The launcher starts ticking; with no rows yet, all ticks no-op. +5. **Initial drain.** From a shell, run `embedding_launcher.defer()` so the orchestrator picks up the existing corpus. This is the only "operator action" in the rollout. It runs at `EMBEDDING_INDEX_PRIORITY` and lives behind whatever other work is on the queues; it can run for hours to days on a large corpus. +6. **Provider switch.** Replace the body of `radis.pgsearch.providers.search()` and `retrieve()` with the hybrid implementation. At this point hybrid is the new default; rows still missing an embedding participate via the FTS half only. +7. 
**Monitor.** Watch search latency p95, embedding-queue depth, `EmbeddingJob` admin state, and the rate of "FTS-only fallback" warnings. Tune `HYBRID_VECTOR_TOP_K` / `HYBRID_FTS_MAX_RESULTS` if needed. + +Each step is independently mergeable; steps 1–4 ship as quiet infrastructure changes with no user-visible effect, step 5 starts populating the column, step 6 is the moment hybrid goes live. From f3585899ef05fd9494a142a87ef9d17ffc78c588 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 19:29:53 +0000 Subject: [PATCH 38/68] docs(pgsearch): promote NOT-stripping into the spec; regenerate plan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Moves the strip-NOT-before-embedding mitigation from §11.5's "candidate follow-up" into the design proper (new §7.8 + updated §7.2 flow) and trims §11.5 to reflect that NOT is solved while OR-asymmetry remains open. Plan is renamed to 2026-05-28-hybrid-search.md and grows two tasks covering QueryParser.unparse_for_embedding and its provider wiring. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- ...strator.md => 2026-05-28-hybrid-search.md} | 252 +++++++++++++++++- .../specs/2026-05-28-hybrid-search.md | 99 +++++-- 2 files changed, 327 insertions(+), 24 deletions(-) rename docs/superpowers/plans/{2026-05-28-embedding-orchestrator.md => 2026-05-28-hybrid-search.md} (74%) diff --git a/docs/superpowers/plans/2026-05-28-embedding-orchestrator.md b/docs/superpowers/plans/2026-05-28-hybrid-search.md similarity index 74% rename from docs/superpowers/plans/2026-05-28-embedding-orchestrator.md rename to docs/superpowers/plans/2026-05-28-hybrid-search.md index 0b820e3d..d3c98216 100644 --- a/docs/superpowers/plans/2026-05-28-embedding-orchestrator.md +++ b/docs/superpowers/plans/2026-05-28-hybrid-search.md @@ -1,14 +1,16 @@ -# Embedding Orchestrator Implementation Plan +# Hybrid Search — Embedding Orchestrator + Negation-Aware Query Implementation Plan > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. -**Goal:** Replace the `post_save`-driven `embed_reports` task and `backfill_embeddings` command with a periodic `EmbeddingJob`/`EmbeddingTask` orchestrator that batches pending embeddings without per-API-call job amplification. +**Goal:** Land the two remaining design pieces of the unified hybrid-search spec — (1) the periodic `EmbeddingJob`/`EmbeddingTask` orchestrator that replaces the `post_save`-driven embedding path, and (2) `QueryParser.unparse_for_embedding` which strips `NOT` branches before the vector half of `search()` calls the embedding service. -**Architecture:** Mirror `ExtractionJob`/`ExtractionTask` (`radis/extractions/tasks.py:32`) and `subscription_launcher` (`radis/subscriptions/tasks.py:115`). 
A periodic `embedding_launcher` on `default` queue creates one `EmbeddingJob` (system-owned) per drain; `process_embedding_job` (also `default`) batches `ReportSearchVector` rows with `embedding IS NULL` into `EmbeddingTask` rows and dispatches them; `process_embedding_task` (on `embeddings` queue) calls `EmbeddingClient`, `bulk_update`s the vectors, and rolls status up via `AnalysisJob.update_job_state`. +**Architecture (orchestrator):** Mirror `ExtractionJob`/`ExtractionTask` (`radis/extractions/tasks.py:32`) and `subscription_launcher` (`radis/subscriptions/tasks.py:115`). A periodic `embedding_launcher` on `default` queue creates one `EmbeddingJob` (system-owned) per drain; `process_embedding_job` (also `default`) batches `ReportSearchVector` rows with `embedding IS NULL` into `EmbeddingTask` rows and dispatches them; `process_embedding_task` (on `embeddings` queue) calls `EmbeddingClient`, `bulk_update`s the vectors, and rolls status up via `AnalysisJob.update_job_state`. + +**Architecture (negation):** A new `QueryParser.unparse_for_embedding(node)` static walker emits a string with `UnaryNode("NOT", X)` branches dropped and empty `BinaryNode` legs collapsed. The pgsearch provider calls it instead of `unparse()` before `embed_query`; if the result is empty, the vector side is skipped and the request degrades to FTS-only. **Tech Stack:** Django 5.1, Procrastinate (periodic tasks + `queueing_lock`), pgvector, pytest-django. -**Spec:** `docs/superpowers/specs/2026-05-28-hybrid-search.md` §6. +**Spec:** `docs/superpowers/specs/2026-05-28-hybrid-search.md` (§6 orchestrator, §7.2/§7.8 negation). **Branch:** `feat/hybrid-search` (continue here; no worktree required). 
@@ -27,6 +29,7 @@ | `radis/pgsearch/tests/test_process_embedding_job.py` | Unit tests for `process_embedding_job` | | `radis/pgsearch/tests/test_process_embedding_task.py` | Unit tests for `process_embedding_task` | | `radis/pgsearch/tests/test_migrations_system_user.py` | Test for the data migration | +| `radis/search/tests/test_query_parser_unparse_for_embedding.py` | Unit tests for the new `QueryParser.unparse_for_embedding` | **Files to modify:** @@ -37,6 +40,9 @@ | `radis/pgsearch/tasks.py` | Replace contents: add `embedding_launcher`, `process_embedding_job`, `process_embedding_task`; remove `embed_reports` and `enqueue_embed_reports` | | `radis/pgsearch/signals.py` | Remove `enqueue_report_embedding` receiver (lines 19-23); keep the FTS receiver | | `radis/pgsearch/tests/test_signals.py` | Delete the two embedding-signal tests; the file becomes empty and is deleted | +| `radis/search/utils/query_parser.py:293-314` | Add `QueryParser.unparse_for_embedding` next to existing `unparse` | +| `radis/pgsearch/providers.py:103,213` | Replace `QueryParser.unparse(search.query)` with `QueryParser.unparse_for_embedding(search.query)`; skip embedding call when result is empty | +| `radis/pgsearch/tests/test_provider_hybrid.py` | Add a hybrid test exercising the `NOT X` and `A AND NOT B` paths | | `docker-compose.dev.yml:85-92` | Add `--concurrency 4` to `embeddings_worker` command | | `docker-compose.prod.yml:80-88` | Add `--concurrency 4` to `embeddings_worker` command | @@ -983,6 +989,242 @@ git commit -m "feat(infra): run embeddings_worker with --concurrency 4" --- +## Task 12: Add `QueryParser.unparse_for_embedding` + +**Files:** +- Modify: `radis/search/utils/query_parser.py:293-314` (append new static method after the existing `unparse`) +- Create: `radis/search/tests/test_query_parser_unparse_for_embedding.py` + +The method walks the same AST as `unparse` (`TermNode | ParensNode | UnaryNode | BinaryNode` defined at `radis/search/utils/query_parser.py:55`) but 
drops `UnaryNode("NOT", X)` branches and collapses empty `BinaryNode` legs. The grammar's only unary operator is `NOT` (per `radis/search/utils/query_parser.py:214`), so the implementation can assume that. The empty string is a legitimate return value (e.g., for `NOT X` alone) and callers handle it. + +- [ ] **Step 1: Write the failing tests** + +Create `radis/search/tests/test_query_parser_unparse_for_embedding.py`: + +```python +import pytest + +from radis.search.utils.query_parser import QueryParser + + +@pytest.mark.parametrize( + "query,expected", + [ + # Simple positive term — unchanged. + ("pneumothorax", "pneumothorax"), + # Phrase preserved with quotes. + ('"chest x-ray"', '"chest x-ray"'), + # Implicit AND (no operator) — both sides survive. + ("cardiac arrest", "cardiac arrest"), + # Explicit AND — both sides survive, operator preserved. + ("A AND B", "A AND B"), + # OR — both sides survive, operator preserved. + ("A OR B", "A OR B"), + # NOT alone — empty. + ("NOT pneumothorax", ""), + # AND NOT — left survives, NOT branch dropped, AND collapses. + ("A AND NOT B", "A"), + # NOT AND — right survives, NOT branch dropped, AND collapses. + ("NOT A AND B", "B"), + # NOT OR NOT — both branches dropped, empty. + ("NOT A OR NOT B", ""), + # Mixed: AND OR with a NOT branch — surviving structure retained. + ("(A AND NOT B) OR C", "(A) OR C"), + # Nested NOT inside parens — empty parens collapsed. + ("A AND (NOT B)", "A"), + # Double-nested OR with one NOT — only NOT branch dropped. 
+ ("(A OR B) AND NOT C", "(A OR B)"), + ], +) +def test_unparse_for_embedding(query, expected): + node, _fixes = QueryParser().parse(query) + assert node is not None, f"parser produced empty node for {query!r}" + assert QueryParser.unparse_for_embedding(node) == expected +``` + +- [ ] **Step 2: Run tests — expect AttributeError** + +Run: `uv run pytest radis/search/tests/test_query_parser_unparse_for_embedding.py -v` +Expected: FAIL — `AttributeError: type object 'QueryParser' has no attribute 'unparse_for_embedding'` + +- [ ] **Step 3: Add the method to `radis/search/utils/query_parser.py`** + +Append immediately after the existing `unparse` static method (after the closing of the `if/elif` chain that ends around line 314): + +```python + @staticmethod + def unparse_for_embedding(node: QueryNode) -> str: + """Like ``unparse``, but drops the operand of every ``UnaryNode("NOT", X)`` + and collapses any ``BinaryNode`` whose children both become empty. + Returns the empty string if the whole query reduces to NOT clauses. + + Used by the hybrid-search vector half to avoid polarity-blind embedding + of negated terms (see spec 2026-05-28-hybrid-search §7.8). + """ + if isinstance(node, TermNode): + return QueryParser.unparse(node) + if isinstance(node, ParensNode): + inner = QueryParser.unparse_for_embedding(node.expression) + return f"({inner})" if inner else "" + if isinstance(node, UnaryNode): + return "" + if isinstance(node, BinaryNode): + left = QueryParser.unparse_for_embedding(node.left) + right = QueryParser.unparse_for_embedding(node.right) + if not left and not right: + return "" + if not left: + return right + if not right: + return left + if node.implicit: + return f"{left} {right}" + return f"{left} {node.operator} {right}" + raise ValueError(f"Unknown node type: {type(node)}") +``` + +- [ ] **Step 4: Run tests and verify pass** + +Run: `uv run pytest radis/search/tests/test_query_parser_unparse_for_embedding.py -v` +Expected: PASS (12 parameterized cases). 
+ +- [ ] **Step 5: Commit** + +```bash +git add radis/search/utils/query_parser.py radis/search/tests/test_query_parser_unparse_for_embedding.py +git commit -m "feat(search): add QueryParser.unparse_for_embedding that strips NOT branches" +``` + +--- + +## Task 13: Wire `unparse_for_embedding` into the pgsearch provider + +**Files:** +- Modify: `radis/pgsearch/providers.py:103` (in `search()`) +- Modify: `radis/pgsearch/providers.py:213` (in `retrieve()`) +- Modify: `radis/pgsearch/tests/test_provider_hybrid.py` + +Both `search()` and `retrieve()` currently call `QueryParser.unparse(search.query)` to build the text passed to `embed_query`. Replace with `unparse_for_embedding`. If the result is empty (e.g., the user query is `NOT X` alone), skip the embedding call and leave `query_vec = None` — the existing FTS-only fallback handles it. + +- [ ] **Step 1: Write the failing test** + +Append to `radis/pgsearch/tests/test_provider_hybrid.py` (use existing fixtures; structure mirrors current tests in that file): + +```python +def test_search_skips_embedding_when_query_reduces_to_not(monkeypatch, ...): + """`NOT X` alone produces an empty embedding string; the provider must + not call the embedding service and must return FTS-only results.""" + from radis.pgsearch import providers + from radis.search.site import Search, SearchFilters + from radis.search.utils.query_parser import QueryParser + + embed_query_calls: list[str] = [] + + class FakeEC: + def __init__(self): pass + def __enter__(self): return self + def __exit__(self, *a): return False + def embed_query(self, text): + embed_query_calls.append(text) + raise AssertionError("embed_query should not be called for NOT-only query") + + monkeypatch.setattr("radis.pgsearch.providers.EmbeddingClient", FakeEC) + + node, _ = QueryParser().parse("NOT pneumothorax") + search = Search(query=node, offset=0, limit=10, filters=SearchFilters(group=...)) + result = providers.search(search) + + assert embed_query_calls == [] + # 
FTS-only path still returns a SearchResult (possibly with zero hits). + assert result is not None + + +def test_search_embeds_only_positive_branch_for_and_not(monkeypatch, ...): + """`A AND NOT B` embeds only `A`; FTS half still enforces the exclusion.""" + from radis.pgsearch import providers + from radis.search.site import Search, SearchFilters + from radis.search.utils.query_parser import QueryParser + + embed_query_calls: list[str] = [] + + class FakeEC: + def __init__(self): pass + def __enter__(self): return self + def __exit__(self, *a): return False + def embed_query(self, text): + embed_query_calls.append(text) + # Return a valid normalized unit vector of the right dim. + import numpy as np + from django.conf import settings as dj + v = np.ones(dj.EMBEDDING_DIM, dtype=np.float32) + return (v / np.linalg.norm(v)).tolist() + + monkeypatch.setattr("radis.pgsearch.providers.EmbeddingClient", FakeEC) + + node, _ = QueryParser().parse("pneumothorax AND NOT effusion") + search = Search(query=node, offset=0, limit=10, filters=SearchFilters(group=...)) + providers.search(search) + + assert embed_query_calls == ["pneumothorax"] +``` + +Replace `group=...` with the actual fixture used elsewhere in the file (it is whatever value the existing hybrid tests pass — read the file's other test bodies for the canonical filter setup). + +- [ ] **Step 2: Run tests — expect failure** + +Run: `uv run pytest radis/pgsearch/tests/test_provider_hybrid.py -k "reduces_to_not or and_not" -v` +Expected: FAIL — `embed_query` is still called with the unstripped text. + +- [ ] **Step 3: Modify `radis/pgsearch/providers.py:search()`** + +Locate the block currently at lines ~102-110 in `radis/pgsearch/providers.py`: + +```python + # Vector side: query embedding (sync HTTP); fall back gracefully on failure. 
+ query_text = QueryParser.unparse(search.query) + query_vec: list[float] | None + try: + with EmbeddingClient() as ec: + query_vec = ec.embed_query(query_text) + except EmbeddingClientError as e: + logger.warning("Hybrid search falling back to FTS-only: %s", e) + query_vec = None +``` + +Replace with: + +```python + # Vector side: strip NOT branches (see spec §7.8). If nothing is left, + # skip the embedding call entirely and fall through to FTS-only. + query_text = QueryParser.unparse_for_embedding(search.query) + query_vec: list[float] | None = None + if query_text.strip(): + try: + with EmbeddingClient() as ec: + query_vec = ec.embed_query(query_text) + except EmbeddingClientError as e: + logger.warning("Hybrid search falling back to FTS-only: %s", e) + query_vec = None +``` + +- [ ] **Step 4: Apply the same change to `retrieve()`** + +Locate the analogous block at lines ~212-220 in `radis/pgsearch/providers.py`. Apply the identical replacement. + +- [ ] **Step 5: Run tests and verify pass** + +Run: `uv run pytest radis/pgsearch/tests/test_provider_hybrid.py -v` +Expected: PASS for all hybrid tests including the two new ones. 
+ +- [ ] **Step 6: Commit** + +```bash +git add radis/pgsearch/providers.py radis/pgsearch/tests/test_provider_hybrid.py +git commit -m "feat(pgsearch): use unparse_for_embedding to strip NOT branches before embed_query" +``` + +--- + ## Final verification - [ ] **Step 1: Run lint** @@ -1039,6 +1281,8 @@ git push -u origin feat/hybrid-search | §6.4 owner = system user via data migration | Task 3 | | §6.5 `embedding_launcher` with `queueing_lock` + in-flight check | Task 6 | | §6.6 `process_embedding_job` PREPARING → PENDING flow | Task 5 | +| §7.2 `unparse_for_embedding` used in search() + empty-string short-circuit | Task 13 | +| §7.8 `QueryParser.unparse_for_embedding` AST walker | Task 12 | | §6.7 `process_embedding_task` on `embeddings` queue | Task 4 | | §6.8 No post_save signal | Task 7 | | §6.8 No `backfill_embeddings` command | Tasks 8 + 9 | diff --git a/docs/superpowers/specs/2026-05-28-hybrid-search.md b/docs/superpowers/specs/2026-05-28-hybrid-search.md index bf32ba4c..34893c9e 100644 --- a/docs/superpowers/specs/2026-05-28-hybrid-search.md +++ b/docs/superpowers/specs/2026-05-28-hybrid-search.md @@ -594,13 +594,17 @@ def search(s: Search) -> SearchResult: filter_q = _build_filter_query(s.filters) tsquery = SearchQuery(query_str, search_type="raw", config=language) - # Vector side - query_text = QueryParser.unparse(s.query) # same helper SearchView already uses - try: - query_vec = EmbeddingClient().embed_query(query_text) - except EmbeddingClientError as e: - logger.warning("Falling back to FTS-only: %s", e) - query_vec = None + # Vector side: strip NOT branches before embedding (see §7.8), then embed. + # If stripping leaves nothing (e.g., the user query was just `NOT X`), + # skip vector retrieval entirely and fall through to FTS-only. 
+ query_text = QueryParser.unparse_for_embedding(s.query) + query_vec: list[float] | None = None + if query_text.strip(): + try: + query_vec = EmbeddingClient().embed_query(query_text) + except EmbeddingClientError as e: + logger.warning("Falling back to FTS-only: %s", e) + query_vec = None vec_rank: dict[int, int] = {} if query_vec is not None: @@ -677,6 +681,66 @@ Kept as `ts_rank` for API backwards compatibility. RRF is an internal ordering s Updated to `max(HYBRID_VECTOR_TOP_K, HYBRID_FTS_MAX_RESULTS)`, which is what the `SearchView` page-bound check uses to reject impossibly-deep pagination. +### 7.8 Negation-aware query for embedding + +Dense embedding models are polarity-blind: the vector for `"NOT pneumothorax"` +clusters near the vector for `"pneumothorax"`, so the top-K nearest neighbours +to a `NOT X` query are documents *about* X — the polar opposite of what the +user asked for. The FTS half handles `NOT X` correctly (it returns docs +without X), so when both halves are fused naively the vector half pollutes +the candidate pool with anti-matches. + +The fix is upstream of embedding: strip negated branches from the query string +before sending it to the embedding model. The FTS side still receives the +full structured query, so its negation semantics are preserved. + +A new static method on `QueryParser` walks the AST and emits a stripped +string. The shape mirrors the existing `QueryParser.unparse` walker: + +```python +@staticmethod +def unparse_for_embedding(node: QueryNode) -> str: + """Like unparse(), but drops the operand of every UnaryNode("NOT", X) + and collapses any BinaryNode whose children both become empty. + Returns the empty string if the whole query reduces to NOT clauses.""" + if isinstance(node, TermNode): + # Same as unparse: emit the term verbatim (PHRASE keeps quotes). 
+ return QueryParser.unparse(node) + if isinstance(node, ParensNode): + inner = QueryParser.unparse_for_embedding(node.expression) + return f"({inner})" if inner else "" + if isinstance(node, UnaryNode): + # The only unary operator in the grammar is NOT — drop the operand. + return "" + if isinstance(node, BinaryNode): + left = QueryParser.unparse_for_embedding(node.left) + right = QueryParser.unparse_for_embedding(node.right) + if not left and not right: + return "" + if not left: + return right + if not right: + return left + if node.implicit: + return f"{left} {right}" + return f"{left} {node.operator} {right}" + raise ValueError(f"Unknown node type: {type(node)}") +``` + +Outcomes: + +| User query | `unparse()` (FTS path) | `unparse_for_embedding()` (vector path) | Behavior | +|---|---|---|---| +| `pneumothorax` | `pneumothorax` | `pneumothorax` | Both halves agree; RRF amplifies. | +| `A AND NOT B` | `A AND NOT B` | `A` | Vector embeds the positive concept; FTS enforces the exclusion. | +| `NOT X` | `NOT X` | `""` | Vector path skipped (see §7.2); FTS-only ranking. | +| `(A AND NOT B) OR C` | `(A AND NOT B) OR C` | `(A) OR C` | Empty NOT branch collapses; surviving structure retained for vector. | + +The method does not attempt to resolve OR-asymmetry or other operator +mismatches documented in §11.5 — those remain open trade-offs in the design. +This is a targeted fix for the `NOT` case, which is the most acute failure +mode for radiology queries. + ## 8. Configuration ### 8.1 Env-driven (per-deployment, set in `.env`) @@ -835,25 +899,20 @@ track whether the body actually changed (e.g., a `body_hash` column on don't have to null the embedding. Not in v1; profiling will tell us whether it matters. -### 11.5 Operator-aware queries: FTS / vector asymmetry +### 11.5 Operator-aware queries: residual FTS / vector asymmetry -Both halves of hybrid search receive a derivation of the same parsed `QueryNode`, but interpret it through completely different machinery. 
The FTS side consumes a `tsquery` built by `_build_query_string` where `AND`, `OR`, `NOT`, quoted phrases, and parens are first-class boolean operators (`&`, `|`, `!`, `<->`, `()`). The vector side consumes the canonical unparsed string and feeds it whole to the embedding model as natural language; the operators become ordinary word tokens that the model has no operator-aware machinery to interpret. +Both halves of hybrid search receive a derivation of the same parsed `QueryNode`, but interpret it through completely different machinery. The FTS side consumes a `tsquery` built by `_build_query_string` where `AND`, `OR`, `NOT`, quoted phrases, and parens are first-class boolean operators (`&`, `|`, `!`, `<->`, `()`). The vector side consumes a string derived from the AST by `QueryParser.unparse_for_embedding` (§7.8) and feeds it to the embedding model as natural language; the remaining operators become ordinary word tokens that the model has no operator-aware machinery to interpret. -Practical consequences: +Practical consequences after the §7.8 NOT-stripping fix: - **Natural-phrase queries** (`pneumothorax`, `chest x-ray`, implicit-AND `cardiac arrest`) — both halves point the same direction. RRF amplifies the agreement. This is the workload hybrid search is best at. -- **`A AND B`** — FTS strictly intersects; vector returns docs about a topic-mix of A and B (which usually includes some single-side hits). Docs matching both lexically *and* semantically rank highest, which is the desired outcome. Vector contributes useful expansion but not boolean precision. -- **`A OR B`** — FTS unions; the vector half has no concept of disjunction and just produces a centroid-style embedding. Docs about either A or B that happen to be near the centroid still get retrieved, but a doc purely about A may not appear unless it's also close to the centroid. Vector half degrades from "asset" to "noise". -- **`NOT X`** — sharpest conflict. FTS correctly returns docs without X. 
Dense embeddings are polarity-blind, so the vector for `"NOT X"` clusters next to the vector for `"X"` and the top-K nearest neighbours are docs *about* X — the polar opposite of what the user asked for. The two halves return nearly disjoint sets that RRF interleaves, producing actively misleading results rather than mere noise. (Distinct from §11.1, which is about natural-language negation like `no pneumothorax` where the FTS stop-word strip happens to align the halves accidentally.) - -**Candidate mitigation (not in v1, recommended follow-up):** strip negated branches from the query string before embedding. Walk the AST; when a `UnaryNode("NOT", X)` is encountered, drop `X` from the string passed to the embedding model. The FTS side still gets the full structure. Outcomes: - -- `NOT X` alone → vector receives an empty query and is skipped; provider falls back to FTS-only ranking. Correct. -- `A AND NOT B` → vector embeds just `A`; FTS enforces `A & !B`. Vector adds positive semantic signal for A, FTS enforces the exclusion. The halves are aligned again. +- **`A AND B`** — FTS strictly intersects; vector returns docs about a topic-mix of A and B. Docs matching both lexically *and* semantically rank highest, which is the desired outcome. Vector contributes useful expansion but not boolean precision. +- **`A OR B`** — FTS unions; the vector half has no concept of disjunction and just produces a centroid-style embedding. Docs about either A or B that happen to be near the centroid still get retrieved, but a doc purely about A may not appear unless it's also close to the centroid. **Open trade-off.** Vector half degrades from "asset" to "noise" for OR-heavy queries; no fix in this spec. +- **`NOT X` / `A AND NOT B`** — addressed by §7.8. Vector embeds only the positive branches; FTS enforces the negation; the halves are aligned. -This is ~15 lines of code in `providers.search()` / `providers.retrieve()` and a small extension to `QueryParser` for the AST walk. 
Other candidates (negation-aware re-ranker, embedding subtraction, sparse models like SPLADE-NEG) are heavier and listed in §11.1. +The asymmetry is real and remains a quality consideration for OR-heavy queries. The §11.6 cross-encoder re-ranker, when added, can sharpen the head of results but cannot fix a polluted candidate pool — see the analysis at the end of this section for why upstream stripping (the §7.8 approach for `NOT`) is the architecturally correct order of operations. -**Why a re-ranker alone cannot fix this.** A cross-encoder re-ranker improves precision *within the candidate pool it is given* — it cannot improve recall of that pool. For `NOT pneumothorax` over a 1000-doc corpus where 600 docs don't mention the word, the hybrid candidate pool is poisoned: ~100 wrong docs (pneumothorax-discussing reports pulled in by the polarity-blind vector half) displace 100 of the 600 correct docs from the top-N positions. After re-ranking top-20, the head of results is sharper, but ~590 correct docs still live below the re-ranker's cutoff at their original RRF positions, interleaved with the remaining 90 wrong docs. The architecturally correct order is to fix recall upstream (strip negated branches before embedding, restoring a clean candidate pool) and *then* layer a re-ranker for precision. A re-ranker without the upstream fix is rearranging deck chairs on a polluted pool. +**Why a re-ranker alone cannot fix recall problems.** A cross-encoder re-ranker improves precision *within the candidate pool it is given* — it cannot improve recall of that pool. If a polarity-blind vector half had poisoned a `NOT pneumothorax` pool with ~100 anti-matches, re-ranking the top-20 would sharpen the head but ~590 correct docs would still live below the re-ranker's cutoff at their original RRF positions. The architecturally correct order is to fix recall upstream (§7.8) and *then* layer a re-ranker for precision (§11.6). 
A re-ranker without the upstream fix is rearranging deck chairs on a polluted pool. ### 11.6 Cross-encoder re-ranker (deferred) From 3737483072cb24bc96a95f54e601c50483493c17 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 19:33:35 +0000 Subject: [PATCH 39/68] fix(plan): replace 'uv run cli shell -c' with manage.py invocations --- docs/superpowers/plans/2026-05-28-hybrid-search.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/superpowers/plans/2026-05-28-hybrid-search.md b/docs/superpowers/plans/2026-05-28-hybrid-search.md index d3c98216..b5933d75 100644 --- a/docs/superpowers/plans/2026-05-28-hybrid-search.md +++ b/docs/superpowers/plans/2026-05-28-hybrid-search.md @@ -91,7 +91,7 @@ EMBEDDING_DRAIN_CRON=0 2 * * * - [ ] **Step 3: Verify Django config loads** -Run: `uv run cli shell -c "from django.conf import settings; print(settings.EMBEDDING_DRAIN_CRON, settings.EMBEDDING_SYSTEM_USERNAME)"` +Run: `uv run python manage.py shell -c "from django.conf import settings; print(settings.EMBEDDING_DRAIN_CRON, settings.EMBEDDING_SYSTEM_USERNAME)"` Expected: prints `0 2 * * * system` - [ ] **Step 4: Commit** @@ -210,12 +210,12 @@ class EmbeddingTask(AnalysisTask): - [ ] **Step 4: Generate the migration** -Run: `uv run cli shell -c "from django.core.management import call_command; call_command('makemigrations', 'pgsearch', name='embedding_job_task')"` +Run: `uv run python manage.py makemigrations pgsearch --name embedding_job_task` Expected: creates `radis/pgsearch/migrations/0004_embedding_job_task.py` containing `CreateModel` operations for `EmbeddingJob`, `EmbeddingTask`, and the M2M through-table. 
- [ ] **Step 5: Apply the migration and re-run tests** -Run: `uv run cli shell -c "from django.core.management import call_command; call_command('migrate', 'pgsearch')"` +Run: `uv run python manage.py migrate pgsearch` Then: `uv run pytest radis/pgsearch/tests/test_models_embedding.py -v` Expected: PASS @@ -312,7 +312,7 @@ class Migration(migrations.Migration): - [ ] **Step 5: Apply migration and run tests** -Run: `uv run cli shell -c "from django.core.management import call_command; call_command('migrate', 'pgsearch')"` +Run: `uv run python manage.py migrate pgsearch` Then: `uv run pytest radis/pgsearch/tests/test_migrations_system_user.py -v` Expected: PASS @@ -841,7 +841,7 @@ Run: `rm radis/pgsearch/tests/test_embed_reports_task.py` - [ ] **Step 4: Verify the backfill command still imports cleanly is now expected to fail** -Run: `uv run cli shell -c "from radis.pgsearch.management.commands import backfill_embeddings"` +Run: `uv run python manage.py shell -c "from radis.pgsearch.management.commands import backfill_embeddings"` Expected: `ImportError: cannot import name 'enqueue_embed_reports'` — this confirms Task 9 (deleting the command) is the immediate next step. - [ ] **Step 5: Do NOT commit yet — proceed straight to Task 9** @@ -906,7 +906,7 @@ EMBEDDING_BACKFILL_PRIORITY = -1 - [ ] **Step 3: Verify Django still loads** -Run: `uv run cli shell -c "from django.conf import settings; print(settings.EMBEDDING_INDEX_PRIORITY)"` +Run: `uv run python manage.py shell -c "from django.conf import settings; print(settings.EMBEDDING_INDEX_PRIORITY)"` Expected: prints `0`. 
- [ ] **Step 4: Run full test suite to confirm nothing dangles** From d29cb95caa768d9b82d81373d930e1bba5728c03 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 19:34:22 +0000 Subject: [PATCH 40/68] feat(pgsearch): add EMBEDDING_DRAIN_CRON and EMBEDDING_SYSTEM_USERNAME settings --- example.env | 4 ++++ radis/settings/base.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/example.env b/example.env index bf2ec8e3..685e550f 100644 --- a/example.env +++ b/example.env @@ -162,6 +162,10 @@ EMBEDDING_MODEL_NAME=Qwen/Qwen3-Embedding-4B # the embedding column, re-migrating, and running `./manage.py backfill_embeddings`. EMBEDDING_DIM=1024 +# Cron expression for the embedding orchestrator. Default nightly at 02:00. +# Use "*/15 * * * *" for more aggressive dev draining. +EMBEDDING_DRAIN_CRON=0 2 * * * + # Development with local Ollama: # EMBEDDING_BACKEND=ollama # EMBEDDING_PROVIDER_URL=http://host.docker.internal:11434 diff --git a/radis/settings/base.py b/radis/settings/base.py index d9436ef8..3ed94ded 100644 --- a/radis/settings/base.py +++ b/radis/settings/base.py @@ -345,6 +345,7 @@ EMBEDDING_PROVIDER_API_KEY = env.str("EMBEDDING_PROVIDER_API_KEY", default="") EMBEDDING_MODEL_NAME = env.str("EMBEDDING_MODEL_NAME", default="Qwen/Qwen3-Embedding-4B") EMBEDDING_DIM = env.int("EMBEDDING_DIM", default=1024) +EMBEDDING_DRAIN_CRON = env.str("EMBEDDING_DRAIN_CRON", default="0 2 * * *") # Embedding tuning constants (see hybrid-search spec §8.2) EMBEDDING_REQUEST_TIMEOUT = 30 @@ -358,6 +359,7 @@ # Embedding queue priorities (procrastinate "higher = sooner") EMBEDDING_INDEX_PRIORITY = 0 EMBEDDING_BACKFILL_PRIORITY = -1 +EMBEDDING_SYSTEM_USERNAME = "system" # Hybrid search tuning HYBRID_VECTOR_TOP_K = 100 From 09a488a53d250307022fa5c6a1b7ed8517eeaa0f Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 19:38:08 +0000 Subject: [PATCH 41/68] feat(pgsearch): add EmbeddingJob and EmbeddingTask models Co-Authored-By: Claude Sonnet 4.6 --- 
.../migrations/0004_embedding_job_task.py | 138 ++++++++++++++++++ radis/pgsearch/models.py | 47 ++++++ radis/pgsearch/tests/test_models_embedding.py | 31 ++++ 3 files changed, 216 insertions(+) create mode 100644 radis/pgsearch/migrations/0004_embedding_job_task.py create mode 100644 radis/pgsearch/tests/test_models_embedding.py diff --git a/radis/pgsearch/migrations/0004_embedding_job_task.py b/radis/pgsearch/migrations/0004_embedding_job_task.py new file mode 100644 index 00000000..798af1b7 --- /dev/null +++ b/radis/pgsearch/migrations/0004_embedding_job_task.py @@ -0,0 +1,138 @@ +# Generated by Django 6.0.1 on 2026-05-28 19:36 + +import django.db.models.deletion +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("pgsearch", "0003_report_embedding"), + ("procrastinate", "0041_post_retry_failed_job"), + ("reports", "0013_alter_report_options"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name="EmbeddingJob", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "status", + models.CharField( + choices=[ + ("UV", "Unverified"), + ("PR", "Preparing"), + ("PE", "Pending"), + ("IP", "In Progress"), + ("CI", "Canceling"), + ("CA", "Canceled"), + ("SU", "Success"), + ("WA", "Warning"), + ("FA", "Failure"), + ], + default="UV", + max_length=2, + ), + ), + ("urgent", models.BooleanField(default=False)), + ("send_finished_mail", models.BooleanField(default=False)), + ("message", models.TextField(blank=True, default="")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("started_at", models.DateTimeField(blank=True, null=True)), + ("ended_at", models.DateTimeField(blank=True, null=True)), + ( + "owner", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(app_label)s_jobs", + 
to=settings.AUTH_USER_MODEL, + ), + ), + ( + "queued_job", + models.OneToOneField( + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="+", + to="procrastinate.procrastinatejob", + ), + ), + ], + options={ + "ordering": ["-created_at"], + }, + ), + migrations.CreateModel( + name="EmbeddingTask", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "status", + models.CharField( + choices=[ + ("PE", "Pending"), + ("IP", "In Progress"), + ("CA", "Canceled"), + ("SU", "Success"), + ("WA", "Warning"), + ("FA", "Failure"), + ], + default="PE", + max_length=2, + ), + ), + ("attempts", models.PositiveSmallIntegerField(default=0)), + ("message", models.TextField(blank=True, default="")), + ("log", models.TextField(blank=True, default="")), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("started_at", models.DateTimeField(blank=True, null=True)), + ("ended_at", models.DateTimeField(blank=True, null=True)), + ( + "job", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="tasks", + to="pgsearch.embeddingjob", + ), + ), + ( + "queued_job", + models.OneToOneField( + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="+", + to="procrastinate.procrastinatejob", + ), + ), + ( + "reports", + models.ManyToManyField( + related_name="embedding_tasks", to="reports.report" + ), + ), + ], + options={ + "ordering": ("id",), + "abstract": False, + }, + ), + ] diff --git a/radis/pgsearch/models.py b/radis/pgsearch/models.py index 5cd90e8b..a55abaad 100644 --- a/radis/pgsearch/models.py +++ b/radis/pgsearch/models.py @@ -3,7 +3,10 @@ from django.contrib.postgres.search import SearchVector, SearchVectorField from django.db import models from pgvector.django import HnswIndex, VectorField +from procrastinate.contrib.django import app +from procrastinate.contrib.django.models import ProcrastinateJob +from 
radis.core.models import AnalysisJob, AnalysisTask from radis.reports.models import Report from .utils.language_utils import code_to_language @@ -34,3 +37,47 @@ def save(self, *args, **kwargs): language = code_to_language(self.report.language.code) self.search_vector = SearchVector(models.Value(body), config=language) super().save(*args, **kwargs) + + +class EmbeddingJob(AnalysisJob): + default_priority = settings.EMBEDDING_INDEX_PRIORITY + urgent_priority = settings.EMBEDDING_INDEX_PRIORITY + finished_mail_template = None + + queued_job_id: int | None + queued_job = models.OneToOneField( + ProcrastinateJob, null=True, on_delete=models.SET_NULL, related_name="+" + ) + + tasks: models.QuerySet["EmbeddingTask"] + + class Meta: + ordering = ["-created_at"] + + def __str__(self) -> str: + return f"EmbeddingJob [{self.pk}]" + + def delay(self) -> None: + queued_job_id = app.configure_task( + "radis.pgsearch.tasks.process_embedding_job", + allow_unknown=False, + priority=self.default_priority, + ).defer(job_id=self.pk) + self.queued_job_id = queued_job_id + self.save() + + +class EmbeddingTask(AnalysisTask): + job = models.ForeignKey( + EmbeddingJob, on_delete=models.CASCADE, related_name="tasks" + ) + reports = models.ManyToManyField(Report, related_name="embedding_tasks") + + def delay(self) -> None: + queued_job_id = app.configure_task( + "radis.pgsearch.tasks.process_embedding_task", + allow_unknown=False, + priority=settings.EMBEDDING_INDEX_PRIORITY, + ).defer(task_id=self.pk) + self.queued_job_id = queued_job_id + self.save() diff --git a/radis/pgsearch/tests/test_models_embedding.py b/radis/pgsearch/tests/test_models_embedding.py new file mode 100644 index 00000000..0cb58b9b --- /dev/null +++ b/radis/pgsearch/tests/test_models_embedding.py @@ -0,0 +1,31 @@ +import pytest +from django.contrib.auth import get_user_model + +from radis.pgsearch.models import EmbeddingJob, EmbeddingTask +from radis.reports.factories import ReportFactory + +User = get_user_model() 
+pytestmark = pytest.mark.django_db + + +def _system_user() -> "User": + return User.objects.create(username="system", is_active=False) + + +def test_embedding_job_defaults(): + job = EmbeddingJob.objects.create(owner=_system_user()) + assert job.status == EmbeddingJob.Status.UNVERIFIED + assert job.urgent is False + assert job.send_finished_mail is False + assert job.queued_job_id is None + + +def test_embedding_task_links_to_reports(): + job = EmbeddingJob.objects.create(owner=_system_user()) + reports = [ReportFactory.create() for _ in range(3)] + task = EmbeddingTask.objects.create(job=job) + task.reports.set(reports) + assert task.status == EmbeddingTask.Status.PENDING + assert set(task.reports.values_list("pk", flat=True)) == {r.pk for r in reports} + assert task.attempts == 0 + assert task.queued_job_id is None From d292fac037ff2e5a038e335c45b939b3274b2ca4 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 19:46:32 +0000 Subject: [PATCH 42/68] feat(pgsearch): add data migration for system user --- radis/pgsearch/migrations/0005_system_user.py | 17 +++++++++++++++ .../migrations/_system_user_helper.py | 9 ++++++++ .../tests/test_migrations_system_user.py | 21 +++++++++++++++++++ 3 files changed, 47 insertions(+) create mode 100644 radis/pgsearch/migrations/0005_system_user.py create mode 100644 radis/pgsearch/migrations/_system_user_helper.py create mode 100644 radis/pgsearch/tests/test_migrations_system_user.py diff --git a/radis/pgsearch/migrations/0005_system_user.py b/radis/pgsearch/migrations/0005_system_user.py new file mode 100644 index 00000000..ef61ee42 --- /dev/null +++ b/radis/pgsearch/migrations/0005_system_user.py @@ -0,0 +1,17 @@ +from django.conf import settings +from django.db import migrations + +from radis.pgsearch.migrations._system_user_helper import create_system_user_idempotent + + +def forwards(apps, schema_editor): + User = apps.get_model(*settings.AUTH_USER_MODEL.split(".")) + create_system_user_idempotent(User) + + 
+class Migration(migrations.Migration): + dependencies = [ + ("pgsearch", "0004_embedding_job_task"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + operations = [migrations.RunPython(forwards, reverse_code=migrations.RunPython.noop)] diff --git a/radis/pgsearch/migrations/_system_user_helper.py b/radis/pgsearch/migrations/_system_user_helper.py new file mode 100644 index 00000000..5e0be0d2 --- /dev/null +++ b/radis/pgsearch/migrations/_system_user_helper.py @@ -0,0 +1,9 @@ +from django.conf import settings + + +def create_system_user_idempotent(user_model) -> None: + username = settings.EMBEDDING_SYSTEM_USERNAME + user, created = user_model.objects.get_or_create( + username=username, + defaults={"is_active": False, "password": "!"}, + ) diff --git a/radis/pgsearch/tests/test_migrations_system_user.py b/radis/pgsearch/tests/test_migrations_system_user.py new file mode 100644 index 00000000..0a7c7202 --- /dev/null +++ b/radis/pgsearch/tests/test_migrations_system_user.py @@ -0,0 +1,21 @@ +import pytest +from django.contrib.auth import get_user_model + +User = get_user_model() + + +@pytest.mark.django_db +def test_system_user_exists_after_migrations(): + user = User.objects.get(username="system") + assert user.is_active is False + assert not user.has_usable_password() + + +@pytest.mark.django_db +def test_creating_system_user_twice_is_a_noop(): + from radis.pgsearch.migrations import _system_user_helper + + before = User.objects.filter(username="system").count() + _system_user_helper.create_system_user_idempotent(User) + after = User.objects.filter(username="system").count() + assert before == after == 1 From 26544ec67d7a2c03310c6de09ae1207f4bfb7ccb Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 19:48:34 +0000 Subject: [PATCH 43/68] feat(pgsearch): add process_embedding_task on embeddings queue Co-Authored-By: Claude Sonnet 4.6 --- radis/pgsearch/tasks.py | 43 +++++++++++ .../tests/test_process_embedding_task.py | 71 
+++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 radis/pgsearch/tests/test_process_embedding_task.py diff --git a/radis/pgsearch/tasks.py b/radis/pgsearch/tasks.py index 75d42d28..d715afb2 100644 --- a/radis/pgsearch/tasks.py +++ b/radis/pgsearch/tasks.py @@ -82,3 +82,46 @@ def enqueue_embed_reports( allow_unknown=False, priority=priority, ).defer(report_ids=payload) + + +from django.utils import timezone + +from .models import EmbeddingTask +from .utils.embedding_client import EmbeddingClientError + + +@app.task(queue="embeddings") +def process_embedding_task(task_id: int) -> None: + task = EmbeddingTask.objects.get(id=task_id) + task.status = EmbeddingTask.Status.IN_PROGRESS + task.started_at = timezone.now() + task.attempts = task.attempts + 1 + task.save() + + client = EmbeddingClient() + try: + report_ids = list(task.reports.values_list("pk", flat=True)) + rsvs = list( + ReportSearchVector.objects + .filter(report_id__in=report_ids) + .select_related("report") + .only("id", "report_id", "report__body") + ) + texts = [rsv.report.body for rsv in rsvs] + vectors = client.embed_documents(texts) + for rsv, vec in zip(rsvs, vectors, strict=True): + rsv.embedding = vec + ReportSearchVector.objects.bulk_update(rsvs, fields=["embedding"]) + + task.status = EmbeddingTask.Status.SUCCESS + except EmbeddingClientError as exc: + logger.exception("Embedding task %s failed: %s", task_id, exc) + task.status = EmbeddingTask.Status.FAILURE + task.message = str(exc) + raise + finally: + task.ended_at = timezone.now() + task.queued_job_id = None + task.save() + task.job.update_job_state() + client.close() diff --git a/radis/pgsearch/tests/test_process_embedding_task.py b/radis/pgsearch/tests/test_process_embedding_task.py new file mode 100644 index 00000000..3810b1c8 --- /dev/null +++ b/radis/pgsearch/tests/test_process_embedding_task.py @@ -0,0 +1,71 @@ +from unittest.mock import MagicMock, patch + +import numpy as np +import pytest +from 
django.contrib.auth import get_user_model + +from radis.pgsearch.models import EmbeddingJob, EmbeddingTask, ReportSearchVector +from radis.pgsearch.tasks import process_embedding_task as _wrapped +from radis.pgsearch.utils.embedding_client import EmbeddingClientError +from radis.reports.factories import ReportFactory + +User = get_user_model() +process_embedding_task = _wrapped.__wrapped__ # type: ignore[attr-defined] +pytestmark = pytest.mark.django_db + + +def _make_task() -> EmbeddingTask: + owner = User.objects.get(username="system") + job = EmbeddingJob.objects.create(owner=owner) + task = EmbeddingTask.objects.create(job=job) + reports = [ReportFactory.create() for _ in range(2)] + task.reports.set(reports) + return task + + +def _unit_vec(dim: int) -> list[float]: + v = np.ones(dim, dtype=np.float32) + return (v / np.linalg.norm(v)).tolist() + + +def test_process_embedding_task_writes_vectors_and_marks_success(settings): + task = _make_task() + vec = _unit_vec(settings.EMBEDDING_DIM) + fake_client = MagicMock() + fake_client.embed_documents.return_value = [vec, vec] + with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake_client): + process_embedding_task(task.id) + + task.refresh_from_db() + assert task.status == EmbeddingTask.Status.SUCCESS + assert task.queued_job_id is None + for report in task.reports.all(): + rsv = ReportSearchVector.objects.get(report=report) + assert rsv.embedding is not None + + +def test_process_embedding_task_failure_sets_status_and_raises(): + task = _make_task() + fake_client = MagicMock() + fake_client.embed_documents.side_effect = EmbeddingClientError("boom") + with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake_client): + with pytest.raises(EmbeddingClientError): + process_embedding_task(task.id) + + task.refresh_from_db() + assert task.status == EmbeddingTask.Status.FAILURE + assert task.queued_job_id is None + assert "boom" in task.message + + +def 
test_process_embedding_task_calls_update_job_state(settings): + task = _make_task() + vec = _unit_vec(settings.EMBEDDING_DIM) + fake_client = MagicMock() + fake_client.embed_documents.return_value = [vec, vec] + with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake_client): + process_embedding_task(task.id) + + task.job.refresh_from_db() + # All tasks succeeded; AnalysisJob.update_job_state rolls up to SUCCESS. + assert task.job.status == EmbeddingJob.Status.SUCCESS From 9a44436da49c8f30c1681249e6a38b306496184d Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 19:49:29 +0000 Subject: [PATCH 44/68] fix(test): fetch system user instead of creating it (collides with 0005 migration) --- radis/pgsearch/tests/test_models_embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/radis/pgsearch/tests/test_models_embedding.py b/radis/pgsearch/tests/test_models_embedding.py index 0cb58b9b..7cc4b9b9 100644 --- a/radis/pgsearch/tests/test_models_embedding.py +++ b/radis/pgsearch/tests/test_models_embedding.py @@ -9,7 +9,7 @@ def _system_user() -> "User": - return User.objects.create(username="system", is_active=False) + return User.objects.get(username="system") def test_embedding_job_defaults(): From ab06928583f8cd40e95a2d7b40c1b50c67d6554c Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 19:50:47 +0000 Subject: [PATCH 45/68] feat(pgsearch): add process_embedding_job orchestrator Co-Authored-By: Claude Sonnet 4.6 --- radis/pgsearch/tasks.py | 44 ++++++++++- .../tests/test_process_embedding_job.py | 78 +++++++++++++++++++ 2 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 radis/pgsearch/tests/test_process_embedding_job.py diff --git a/radis/pgsearch/tasks.py b/radis/pgsearch/tasks.py index d715afb2..412f9a0e 100644 --- a/radis/pgsearch/tasks.py +++ b/radis/pgsearch/tasks.py @@ -86,7 +86,7 @@ def enqueue_embed_reports( from django.utils import timezone -from .models import EmbeddingTask 
+from .models import EmbeddingJob, EmbeddingTask from .utils.embedding_client import EmbeddingClientError @@ -125,3 +125,45 @@ def process_embedding_task(task_id: int) -> None: task.save() task.job.update_job_state() client.close() + + +def _create_embedding_task(job: EmbeddingJob, report_ids: list[int]) -> EmbeddingTask: + from radis.reports.models import Report + + task = EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.PENDING) + task.reports.set(Report.objects.filter(pk__in=report_ids)) + return task + + +@app.task +def process_embedding_job(job_id: int) -> None: + job = EmbeddingJob.objects.get(id=job_id) + assert job.status == EmbeddingJob.Status.PREPARING + + if job.tasks.exists(): + tasks_to_enqueue = job.tasks.filter(status=EmbeddingTask.Status.PENDING) + else: + pending_ids_iter = ( + ReportSearchVector.objects + .filter(embedding__isnull=True) + .values_list("report_id", flat=True) + .iterator(chunk_size=10_000) + ) + batch: list[int] = [] + for report_id in pending_ids_iter: + batch.append(int(report_id)) + if len(batch) >= django_settings.EMBEDDING_BATCH_SIZE: + _create_embedding_task(job, batch) + batch = [] + if batch: + _create_embedding_task(job, batch) + + tasks_to_enqueue = job.tasks.filter(status=EmbeddingTask.Status.PENDING) + + job.status = EmbeddingJob.Status.PENDING + job.queued_job_id = None + job.save() + + for task in tasks_to_enqueue: + if not task.is_queued: + task.delay() diff --git a/radis/pgsearch/tests/test_process_embedding_job.py b/radis/pgsearch/tests/test_process_embedding_job.py new file mode 100644 index 00000000..72c87987 --- /dev/null +++ b/radis/pgsearch/tests/test_process_embedding_job.py @@ -0,0 +1,78 @@ +from unittest.mock import patch + +import pytest +from django.contrib.auth import get_user_model + +from radis.pgsearch.models import EmbeddingJob, EmbeddingTask, ReportSearchVector +from radis.pgsearch.tasks import process_embedding_job as _wrapped +from radis.reports.factories import ReportFactory + 
+User = get_user_model() +process_embedding_job = _wrapped.__wrapped__ # type: ignore[attr-defined] +pytestmark = pytest.mark.django_db + + +def _new_job() -> EmbeddingJob: + owner = User.objects.get(username="system") + return EmbeddingJob.objects.create(owner=owner, status=EmbeddingJob.Status.PREPARING) + + +def _make_pending_reports(n: int): + reports = [ReportFactory.create() for _ in range(n)] + # ReportFactory triggers the FTS post_save signal which creates ReportSearchVector + # rows with embedding=NULL; that's exactly the pending state we want. + return reports + + +def test_process_embedding_job_batches_pending_reports(settings): + settings.EMBEDDING_BATCH_SIZE = 2 + job = _new_job() + reports = _make_pending_reports(5) + + with patch("radis.pgsearch.models.EmbeddingTask.delay") as delay_mock: + process_embedding_job(job.id) + + job.refresh_from_db() + assert job.status == EmbeddingJob.Status.PENDING + # ceil(5 / 2) = 3 tasks + assert job.tasks.count() == 3 + # All tasks are dispatched + assert delay_mock.call_count == 3 + # Every pending report is in exactly one task + covered = set() + for task in job.tasks.all(): + covered.update(task.reports.values_list("pk", flat=True)) + assert covered == {r.pk for r in reports} + + +def test_process_embedding_job_resume_path_only_redispatches_pending_tasks(settings): + settings.EMBEDDING_BATCH_SIZE = 2 + job = _new_job() + reports = _make_pending_reports(2) + # Simulate a previous orchestrator run that created one task already. 
+ existing = EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.PENDING) + existing.reports.set(reports) + succeeded = EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.SUCCESS) + + with patch("radis.pgsearch.models.EmbeddingTask.delay") as delay_mock: + process_embedding_job(job.id) + + job.refresh_from_db() + assert job.status == EmbeddingJob.Status.PENDING + # No new tasks created + assert job.tasks.count() == 2 + # Only the pending one is dispatched + assert delay_mock.call_count == 1 + + +def test_process_embedding_job_with_no_pending_rows(): + job = _new_job() + # No reports exist → no ReportSearchVector rows with embedding IS NULL. + + with patch("radis.pgsearch.models.EmbeddingTask.delay") as delay_mock: + process_embedding_job(job.id) + + job.refresh_from_db() + assert job.status == EmbeddingJob.Status.PENDING + assert job.tasks.count() == 0 + assert delay_mock.call_count == 0 From ed5b6a7a0e781ad1cd27d6ce8b9671dc04f81233 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 19:53:37 +0000 Subject: [PATCH 46/68] feat(pgsearch): add embedding_launcher periodic task --- radis/pgsearch/tasks.py | 36 +++++++++++++ .../pgsearch/tests/test_embedding_launcher.py | 50 +++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 radis/pgsearch/tests/test_embedding_launcher.py diff --git a/radis/pgsearch/tasks.py b/radis/pgsearch/tasks.py index 412f9a0e..44d4af54 100644 --- a/radis/pgsearch/tasks.py +++ b/radis/pgsearch/tasks.py @@ -167,3 +167,39 @@ def process_embedding_job(job_id: int) -> None: for task in tasks_to_enqueue: if not task.is_queued: task.delay() + + +from django.contrib.auth import get_user_model +from django.db import transaction + + +@app.periodic(cron=django_settings.EMBEDDING_DRAIN_CRON) +@app.task( + queue="default", + queueing_lock="embedding_launcher", + pass_context=True, +) +def embedding_launcher(context, timestamp: int) -> None: + in_flight = EmbeddingJob.objects.filter( + 
status__in=[ + EmbeddingJob.Status.PREPARING, + EmbeddingJob.Status.PENDING, + EmbeddingJob.Status.IN_PROGRESS, + ] + ).exists() + if in_flight: + logger.info("EmbeddingJob already in flight; launcher tick is a no-op.") + return + + has_pending = ReportSearchVector.objects.filter(embedding__isnull=True).exists() + if not has_pending: + logger.debug("No reports pending embedding; launcher tick is a no-op.") + return + + User = get_user_model() + system_user = User.objects.get(username=django_settings.EMBEDDING_SYSTEM_USERNAME) + job = EmbeddingJob.objects.create( + owner=system_user, + status=EmbeddingJob.Status.PREPARING, + ) + transaction.on_commit(job.delay) diff --git a/radis/pgsearch/tests/test_embedding_launcher.py b/radis/pgsearch/tests/test_embedding_launcher.py new file mode 100644 index 00000000..e2725ece --- /dev/null +++ b/radis/pgsearch/tests/test_embedding_launcher.py @@ -0,0 +1,50 @@ +from unittest.mock import patch + +import pytest +from django.contrib.auth import get_user_model + +from radis.pgsearch.models import EmbeddingJob +from radis.pgsearch.tasks import embedding_launcher as _wrapped +from radis.reports.factories import ReportFactory + +User = get_user_model() +embedding_launcher = _wrapped.__wrapped__ # type: ignore[attr-defined] +pytestmark = pytest.mark.django_db + + +def test_embedding_launcher_noop_when_job_in_flight(): + owner = User.objects.get(username="system") + EmbeddingJob.objects.create(owner=owner, status=EmbeddingJob.Status.PREPARING) + # Make a pending report so the second guard wouldn't short-circuit on its own. + ReportFactory.create() + + with patch("radis.pgsearch.models.EmbeddingJob.delay") as delay_mock: + embedding_launcher(context=None, timestamp=0) + + assert delay_mock.call_count == 0 + # No new job created. 
+ assert EmbeddingJob.objects.count() == 1 + + +def test_embedding_launcher_noop_when_no_pending_rows(): + with patch("radis.pgsearch.models.EmbeddingJob.delay") as delay_mock: + embedding_launcher(context=None, timestamp=0) + + assert delay_mock.call_count == 0 + assert EmbeddingJob.objects.count() == 0 + + +def test_embedding_launcher_happy_path_creates_job_and_defers( + django_capture_on_commit_callbacks, +): + ReportFactory.create() + + with patch("radis.pgsearch.models.EmbeddingJob.delay") as delay_mock: + with django_capture_on_commit_callbacks(execute=True): + embedding_launcher(context=None, timestamp=0) + + assert EmbeddingJob.objects.count() == 1 + job = EmbeddingJob.objects.get() + assert job.status == EmbeddingJob.Status.PREPARING + assert job.owner.username == "system" + delay_mock.assert_called_once() From 111696020d2509de6cc13e23485400ac831d66bf Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 19:54:39 +0000 Subject: [PATCH 47/68] refactor(pgsearch): remove post_save embedding signal (replaced by orchestrator) --- radis/pgsearch/signals.py | 9 ------- radis/pgsearch/tests/test_signals.py | 40 ---------------------------- 2 files changed, 49 deletions(-) delete mode 100644 radis/pgsearch/tests/test_signals.py diff --git a/radis/pgsearch/signals.py b/radis/pgsearch/signals.py index 492e143f..37fc0373 100644 --- a/radis/pgsearch/signals.py +++ b/radis/pgsearch/signals.py @@ -1,11 +1,9 @@ -from django.db import transaction from django.db.models.signals import post_save from django.dispatch import receiver from radis.reports.models import Report from .models import ReportSearchVector -from .tasks import enqueue_embed_reports @receiver(post_save, sender=Report) @@ -14,10 +12,3 @@ def create_or_update_report_search_vector(sender, instance, created, **kwargs): ReportSearchVector.objects.create(report=instance) return instance.search_vector.save() - - -@receiver(post_save, sender=Report) -def enqueue_report_embedding(sender, instance, 
**kwargs): - # Defer to on_commit so the embed_reports worker can't race the surrounding - # transaction and find no ReportSearchVector row to update. - transaction.on_commit(lambda: enqueue_embed_reports([instance.pk])) diff --git a/radis/pgsearch/tests/test_signals.py b/radis/pgsearch/tests/test_signals.py deleted file mode 100644 index de8f7652..00000000 --- a/radis/pgsearch/tests/test_signals.py +++ /dev/null @@ -1,40 +0,0 @@ -from unittest.mock import patch - -import pytest - -from radis.reports.factories import ReportFactory - - -@pytest.mark.django_db(transaction=True) -def test_report_save_enqueues_embed_reports(django_capture_on_commit_callbacks): - from radis.reports.models import Language, Report - - language = Language.objects.create(code="en") - with patch("radis.pgsearch.signals.enqueue_embed_reports") as enqueue: - with django_capture_on_commit_callbacks(execute=True): - report = Report.objects.create( - document_id="DOC-SIGNAL-1", - pacs_aet="PACS", - pacs_name="PACS", - pacs_link="", - patient_id="P1", - patient_birth_date="1980-01-01", - patient_sex="M", - study_description="Study", - study_datetime="2024-01-01T00:00:00Z", - study_instance_uid="1.2.3.4", - accession_number="ACC1", - body="Body.", - language=language, - ) - enqueue.assert_called_once_with([report.pk]) - - -@pytest.mark.django_db(transaction=True) -def test_report_update_also_enqueues_embed_reports(django_capture_on_commit_callbacks): - report = ReportFactory.create() - with patch("radis.pgsearch.signals.enqueue_embed_reports") as enqueue: - with django_capture_on_commit_callbacks(execute=True): - report.body = "Updated body" - report.save() - enqueue.assert_called_once_with([report.pk]) From 6c138301243d3ddbb57568b94d4bebd377537e6b Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 19:55:29 +0000 Subject: [PATCH 48/68] refactor(pgsearch): remove embed_reports task and backfill_embeddings command --- .../commands/backfill_embeddings.py | 59 --------- 
radis/pgsearch/tasks.py | 68 +--------- radis/pgsearch/tests/test_backfill_command.py | 94 -------------- .../pgsearch/tests/test_embed_reports_task.py | 116 ------------------ 4 files changed, 5 insertions(+), 332 deletions(-) delete mode 100644 radis/pgsearch/management/commands/backfill_embeddings.py delete mode 100644 radis/pgsearch/tests/test_backfill_command.py delete mode 100644 radis/pgsearch/tests/test_embed_reports_task.py diff --git a/radis/pgsearch/management/commands/backfill_embeddings.py b/radis/pgsearch/management/commands/backfill_embeddings.py deleted file mode 100644 index 00482bde..00000000 --- a/radis/pgsearch/management/commands/backfill_embeddings.py +++ /dev/null @@ -1,59 +0,0 @@ -from django.conf import settings -from django.core.management.base import BaseCommand, CommandError - -from radis.pgsearch.models import ReportSearchVector -from radis.pgsearch.tasks import enqueue_embed_reports - - -class Command(BaseCommand): - help = ( - "Enqueue embed_reports tasks for all reports that don't yet have an " - "embedding. Idempotent: rows that already have an embedding are skipped." 
- ) - - def add_arguments(self, parser): - parser.add_argument("--batch-size", type=int, default=500) - parser.add_argument( - "--limit", - type=int, - default=None, - help="Maximum number of reports to enqueue (default: all).", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="Print the count of reports that would be enqueued, but enqueue nothing.", - ) - - def handle(self, *args, batch_size, limit, dry_run, **options): - if batch_size <= 0: - raise CommandError(f"--batch-size must be > 0, got {batch_size}") - if limit is not None and limit < 0: - raise CommandError(f"--limit must be >= 0, got {limit}") - - qs = ( - ReportSearchVector.objects.filter(embedding__isnull=True) - .order_by("report_id") - .values_list("report_id", flat=True) - ) - if limit is not None: - qs = qs[:limit] - - if dry_run: - self.stdout.write(f"Dry run: would enqueue {qs.count()} reports.") - return - - priority = settings.EMBEDDING_BACKFILL_PRIORITY - total = 0 - chunk: list[int] = [] - # Use a server-side cursor so we don't materialize the whole id set in memory. 
- for rid in qs.iterator(chunk_size=batch_size): - chunk.append(rid) - if len(chunk) >= batch_size: - enqueue_embed_reports(chunk, priority=priority) - total += len(chunk) - chunk = [] - if chunk: - enqueue_embed_reports(chunk, priority=priority) - total += len(chunk) - self.stdout.write(f"Enqueued {total} reports for embedding.") diff --git a/radis/pgsearch/tasks.py b/radis/pgsearch/tasks.py index 44d4af54..14ce946d 100644 --- a/radis/pgsearch/tasks.py +++ b/radis/pgsearch/tasks.py @@ -1,12 +1,14 @@ import logging from django.conf import settings as django_settings -from django.core.exceptions import ImproperlyConfigured +from django.contrib.auth import get_user_model +from django.db import transaction +from django.utils import timezone from procrastinate.contrib.django import app from procrastinate.types import JSONValue -from .models import ReportSearchVector -from .utils.embedding_client import EmbeddingClient +from .models import EmbeddingJob, EmbeddingTask, ReportSearchVector +from .utils.embedding_client import EmbeddingClient, EmbeddingClientError from .utils.indexing import bulk_upsert_report_search_vectors logger = logging.getLogger(__name__) @@ -34,62 +36,6 @@ def enqueue_bulk_index_reports(report_ids: list[int]) -> int | None: ).defer(report_ids=payload) -@app.task(queue="embeddings") -def embed_reports(report_ids: list[int]) -> None: - """Compute and write embeddings for the given reports. Overwrites any existing - embedding. 
Idempotent across re-runs except for the cost of the API call.""" - if not report_ids: - return - - rsvs = list( - ReportSearchVector.objects.filter(report_id__in=report_ids) - .select_related("report") - .only("id", "report_id", "report__body") - ) - if not rsvs: - return - - batch_size = django_settings.EMBEDDING_BATCH_SIZE - if batch_size <= 0: - raise ImproperlyConfigured( - f"EMBEDDING_BATCH_SIZE must be > 0, got {batch_size}" - ) - - client = EmbeddingClient() - try: - for start in range(0, len(rsvs), batch_size): - chunk = rsvs[start : start + batch_size] - texts = [rsv.report.body for rsv in chunk] - vectors = client.embed_documents(texts) - for rsv, vec in zip(chunk, vectors, strict=True): - rsv.embedding = vec - ReportSearchVector.objects.bulk_update(chunk, fields=["embedding"]) - finally: - client.close() - - -def enqueue_embed_reports( - report_ids: list[int], - priority: int | None = None, -) -> int | None: - if not report_ids: - return None - if priority is None: - priority = django_settings.EMBEDDING_INDEX_PRIORITY - payload: list[JSONValue] = [int(rid) for rid in report_ids] - return app.configure_task( - "radis.pgsearch.tasks.embed_reports", - allow_unknown=False, - priority=priority, - ).defer(report_ids=payload) - - -from django.utils import timezone - -from .models import EmbeddingJob, EmbeddingTask -from .utils.embedding_client import EmbeddingClientError - - @app.task(queue="embeddings") def process_embedding_task(task_id: int) -> None: task = EmbeddingTask.objects.get(id=task_id) @@ -169,10 +115,6 @@ def process_embedding_job(job_id: int) -> None: task.delay() -from django.contrib.auth import get_user_model -from django.db import transaction - - @app.periodic(cron=django_settings.EMBEDDING_DRAIN_CRON) @app.task( queue="default", diff --git a/radis/pgsearch/tests/test_backfill_command.py b/radis/pgsearch/tests/test_backfill_command.py deleted file mode 100644 index 5900757d..00000000 --- a/radis/pgsearch/tests/test_backfill_command.py +++ 
/dev/null @@ -1,94 +0,0 @@ -from io import StringIO -from unittest.mock import patch - -import pytest -from django.conf import settings -from django.core.management import call_command - -from radis.pgsearch.models import ReportSearchVector -from radis.reports.factories import ReportFactory - - -@pytest.mark.django_db -def test_backfill_enqueues_only_null_embeddings(): - r_null = ReportFactory.create() - r_filled = ReportFactory.create() - ReportSearchVector.objects.filter(report=r_filled).update( - embedding=[1.0] + [0.0] * (settings.EMBEDDING_DIM - 1) - ) - - with patch( - "radis.pgsearch.management.commands.backfill_embeddings.enqueue_embed_reports" - ) as enqueue: - call_command("backfill_embeddings", batch_size=10, stdout=StringIO()) - - # Only the null-embedding report should be in any of the enqueue calls. - enqueued_ids = [rid for call in enqueue.call_args_list for rid in call.args[0]] - assert r_null.pk in enqueued_ids - assert r_filled.pk not in enqueued_ids - - -@pytest.mark.django_db -def test_backfill_chunks_by_batch_size(): - [ReportFactory.create() for _ in range(5)] - - with patch( - "radis.pgsearch.management.commands.backfill_embeddings.enqueue_embed_reports" - ) as enqueue: - call_command("backfill_embeddings", batch_size=2, stdout=StringIO()) - - sizes = [len(call.args[0]) for call in enqueue.call_args_list] - assert sizes == [2, 2, 1] - - -@pytest.mark.django_db -def test_backfill_limit_caps_total(): - [ReportFactory.create() for _ in range(5)] - - with patch( - "radis.pgsearch.management.commands.backfill_embeddings.enqueue_embed_reports" - ) as enqueue: - call_command("backfill_embeddings", batch_size=10, limit=3, stdout=StringIO()) - - enqueued_ids = [rid for call in enqueue.call_args_list for rid in call.args[0]] - assert len(enqueued_ids) == 3 - - -@pytest.mark.django_db -def test_backfill_dry_run_does_not_enqueue(): - [ReportFactory.create() for _ in range(3)] - out = StringIO() - - with patch( - 
"radis.pgsearch.management.commands.backfill_embeddings.enqueue_embed_reports" - ) as enqueue: - call_command("backfill_embeddings", dry_run=True, stdout=out) - - enqueue.assert_not_called() - assert "would enqueue 3" in out.getvalue().lower() - - -@pytest.mark.django_db -def test_backfill_uses_backfill_priority(): - ReportFactory.create() - with patch( - "radis.pgsearch.management.commands.backfill_embeddings.enqueue_embed_reports" - ) as enqueue: - call_command("backfill_embeddings", stdout=StringIO()) - assert enqueue.call_args.kwargs["priority"] == settings.EMBEDDING_BACKFILL_PRIORITY - - -@pytest.mark.django_db -def test_backfill_rejects_zero_batch_size(): - from django.core.management.base import CommandError - - with pytest.raises(CommandError, match="--batch-size must be > 0"): - call_command("backfill_embeddings", batch_size=0, stdout=StringIO()) - - -@pytest.mark.django_db -def test_backfill_rejects_negative_limit(): - from django.core.management.base import CommandError - - with pytest.raises(CommandError, match="--limit must be >= 0"): - call_command("backfill_embeddings", limit=-1, stdout=StringIO()) diff --git a/radis/pgsearch/tests/test_embed_reports_task.py b/radis/pgsearch/tests/test_embed_reports_task.py deleted file mode 100644 index 345907b8..00000000 --- a/radis/pgsearch/tests/test_embed_reports_task.py +++ /dev/null @@ -1,116 +0,0 @@ -from unittest.mock import patch - -import pytest - -from radis.pgsearch.models import ReportSearchVector -from radis.pgsearch.tasks import embed_reports as _embed_reports_task -from radis.reports.factories import ReportFactory - -# Procrastinate's @app.task wraps the function; tests call the underlying -# function directly to skip the broker layer. 
-embed_reports = _embed_reports_task.__wrapped__ # type: ignore[attr-defined] - - -@pytest.mark.django_db -def test_embed_reports_writes_normalized_vector(): - report = ReportFactory.create(body="Findings: no acute abnormality.") - fake_vec = [1.0] + [0.0] * 1023 # already normalized - - with patch( - "radis.pgsearch.tasks.EmbeddingClient" - ) as MockClient: - MockClient.return_value.embed_documents.return_value = [fake_vec] - embed_reports([report.pk]) - - rsv = ReportSearchVector.objects.get(report=report) - assert rsv.embedding is not None - assert len(rsv.embedding) == 1024 - assert pytest.approx(rsv.embedding[0]) == 1.0 - - -@pytest.mark.django_db -def test_embed_reports_overwrites_existing_embedding(): - report = ReportFactory.create() - rsv = ReportSearchVector.objects.get(report=report) - rsv.embedding = [0.5] * 1024 - rsv.save(update_fields=["embedding"]) - - new_vec = [1.0] + [0.0] * 1023 - with patch("radis.pgsearch.tasks.EmbeddingClient") as MockClient: - MockClient.return_value.embed_documents.return_value = [new_vec] - embed_reports([report.pk]) - - rsv.refresh_from_db() - assert pytest.approx(rsv.embedding[0]) == 1.0 - assert pytest.approx(rsv.embedding[1]) == 0.0 - - -@pytest.mark.django_db -def test_embed_reports_skips_missing_ids_without_error(): - with patch("radis.pgsearch.tasks.EmbeddingClient") as MockClient: - # No reports created. Should not call the client at all. 
- embed_reports([99999]) - MockClient.return_value.embed_documents.assert_not_called() - - -@pytest.mark.django_db -def test_embed_reports_splits_into_batches(settings): - settings.EMBEDDING_BATCH_SIZE = 2 - reports = [ReportFactory.create() for _ in range(5)] - fake_vec = [1.0] + [0.0] * 1023 - - with patch("radis.pgsearch.tasks.EmbeddingClient") as MockClient: - MockClient.return_value.embed_documents.side_effect = [ - [fake_vec, fake_vec], - [fake_vec, fake_vec], - [fake_vec], - ] - embed_reports([r.pk for r in reports]) - - assert MockClient.return_value.embed_documents.call_count == 3 - - -@pytest.mark.django_db -def test_embed_reports_propagates_client_error(): - from radis.pgsearch.utils.embedding_client import EmbeddingClientError - - report = ReportFactory.create() - with patch("radis.pgsearch.tasks.EmbeddingClient") as MockClient: - MockClient.return_value.embed_documents.side_effect = EmbeddingClientError("boom") - with pytest.raises(EmbeddingClientError): - embed_reports([report.pk]) - - -@pytest.mark.django_db -def test_embed_reports_closes_client_on_success(): - report = ReportFactory.create() - fake_vec = [1.0] + [0.0] * 1023 - - with patch("radis.pgsearch.tasks.EmbeddingClient") as MockClient: - MockClient.return_value.embed_documents.return_value = [fake_vec] - embed_reports([report.pk]) - - MockClient.return_value.close.assert_called_once() - - -@pytest.mark.django_db -def test_embed_reports_closes_client_on_error(): - from radis.pgsearch.utils.embedding_client import EmbeddingClientError - - report = ReportFactory.create() - with patch("radis.pgsearch.tasks.EmbeddingClient") as MockClient: - MockClient.return_value.embed_documents.side_effect = EmbeddingClientError("boom") - with pytest.raises(EmbeddingClientError): - embed_reports([report.pk]) - MockClient.return_value.close.assert_called_once() - - -@pytest.mark.django_db -def test_embed_reports_raises_on_invalid_batch_size(settings): - from django.core.exceptions import ImproperlyConfigured - 
- settings.EMBEDDING_BATCH_SIZE = 0 - report = ReportFactory.create() - with patch("radis.pgsearch.tasks.EmbeddingClient"): - with pytest.raises(ImproperlyConfigured, match="EMBEDDING_BATCH_SIZE must be > 0"): - embed_reports([report.pk]) From a251c002fd9ee5dfa632ce343885f52721bac3be Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 19:55:45 +0000 Subject: [PATCH 49/68] refactor(pgsearch): remove EMBEDDING_BACKFILL_PRIORITY setting --- radis/settings/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/radis/settings/base.py b/radis/settings/base.py index 3ed94ded..68234dcd 100644 --- a/radis/settings/base.py +++ b/radis/settings/base.py @@ -358,7 +358,6 @@ # Embedding queue priorities (procrastinate "higher = sooner") EMBEDDING_INDEX_PRIORITY = 0 -EMBEDDING_BACKFILL_PRIORITY = -1 EMBEDDING_SYSTEM_USERNAME = "system" # Hybrid search tuning From 795f460ffd5b1f768c52127962867324f2bb610c Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 19:56:30 +0000 Subject: [PATCH 50/68] feat(infra): run embeddings_worker with --concurrency 4 --- docker-compose.dev.yml | 2 +- docker-compose.prod.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index dcc7f1d4..196e2546 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -88,7 +88,7 @@ services: command: > bash -c " wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} && - ./manage.py bg_worker -l debug -q embeddings --autoreload + ./manage.py bg_worker -l debug -q embeddings --autoreload --concurrency 4 " postgres: diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index fb10378d..5824b6e2 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -82,7 +82,7 @@ services: command: > bash -c " wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} && - ./manage.py bg_worker -q embeddings + ./manage.py bg_worker -q embeddings --concurrency 4 " deploy: <<: 
*deploy From 88fdcc4d70d0a0f9e5646770c6ec45c25ef81295 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 19:58:21 +0000 Subject: [PATCH 51/68] feat(search): add QueryParser.unparse_for_embedding that strips NOT branches --- ...test_query_parser_unparse_for_embedding.py | 38 +++++++++++++++++++ radis/search/utils/query_parser.py | 30 +++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 radis/search/tests/test_query_parser_unparse_for_embedding.py diff --git a/radis/search/tests/test_query_parser_unparse_for_embedding.py b/radis/search/tests/test_query_parser_unparse_for_embedding.py new file mode 100644 index 00000000..c138976a --- /dev/null +++ b/radis/search/tests/test_query_parser_unparse_for_embedding.py @@ -0,0 +1,38 @@ +import pytest + +from radis.search.utils.query_parser import QueryParser + + +@pytest.mark.parametrize( + "query,expected", + [ + # Simple positive term — unchanged. + ("pneumothorax", "pneumothorax"), + # Phrase preserved with quotes. + ('"chest x-ray"', '"chest x-ray"'), + # Implicit AND (no operator) — both sides survive. + ("cardiac arrest", "cardiac arrest"), + # Explicit AND — both sides survive, operator preserved. + ("A AND B", "A AND B"), + # OR — both sides survive, operator preserved. + ("A OR B", "A OR B"), + # NOT alone — empty. + ("NOT pneumothorax", ""), + # AND NOT — left survives, NOT branch dropped, AND collapses. + ("A AND NOT B", "A"), + # NOT AND — right survives, NOT branch dropped, AND collapses. + ("NOT A AND B", "B"), + # NOT OR NOT — both branches dropped, empty. + ("NOT A OR NOT B", ""), + # Mixed: AND OR with a NOT branch — surviving structure retained. + ("(A AND NOT B) OR C", "(A) OR C"), + # Nested NOT inside parens — empty parens collapsed. + ("A AND (NOT B)", "A"), + # Double-nested OR with one NOT — only NOT branch dropped. 
+ ("(A OR B) AND NOT C", "(A OR B)"), + ], +) +def test_unparse_for_embedding(query, expected): + node, _fixes = QueryParser().parse(query) + assert node is not None, f"parser produced empty node for {query!r}" + assert QueryParser.unparse_for_embedding(node) == expected diff --git a/radis/search/utils/query_parser.py b/radis/search/utils/query_parser.py index 4782a39a..51e1125a 100644 --- a/radis/search/utils/query_parser.py +++ b/radis/search/utils/query_parser.py @@ -312,3 +312,33 @@ def unparse(node: QueryNode) -> str: ) else: raise ValueError(f"Unknown node type: {type(node)}") + + @staticmethod + def unparse_for_embedding(node: QueryNode) -> str: + """Like ``unparse``, but drops the operand of every ``UnaryNode("NOT", X)`` + and collapses any ``BinaryNode`` whose children both become empty. + Returns the empty string if the whole query reduces to NOT clauses. + + Used by the hybrid-search vector half to avoid polarity-blind embedding + of negated terms (see spec 2026-05-28-hybrid-search §7.8). 
+ """ + if isinstance(node, TermNode): + return QueryParser.unparse(node) + if isinstance(node, ParensNode): + inner = QueryParser.unparse_for_embedding(node.expression) + return f"({inner})" if inner else "" + if isinstance(node, UnaryNode): + return "" + if isinstance(node, BinaryNode): + left = QueryParser.unparse_for_embedding(node.left) + right = QueryParser.unparse_for_embedding(node.right) + if not left and not right: + return "" + if not left: + return right + if not right: + return left + if node.implicit: + return f"{left} {right}" + return f"{left} {node.operator} {right}" + raise ValueError(f"Unknown node type: {type(node)}") From 33ffb18b599a50ee941a28ff286fa7f8ae9edf13 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 20:00:35 +0000 Subject: [PATCH 52/68] feat(pgsearch): use unparse_for_embedding to strip NOT branches before embed_query Co-Authored-By: Claude Sonnet 4.6 --- radis/pgsearch/providers.py | 38 ++++++++------ radis/pgsearch/tests/test_provider_hybrid.py | 53 ++++++++++++++++++++ 2 files changed, 75 insertions(+), 16 deletions(-) diff --git a/radis/pgsearch/providers.py b/radis/pgsearch/providers.py index 9046cb82..271e66ce 100644 --- a/radis/pgsearch/providers.py +++ b/radis/pgsearch/providers.py @@ -99,15 +99,17 @@ def search(search: Search) -> SearchResult: filter_query = _build_filter_query(search.filters) tsquery = SearchQuery(query_str, search_type="raw", config=language) - # Vector side: query embedding (sync HTTP); fall back gracefully on failure. - query_text = QueryParser.unparse(search.query) - query_vec: list[float] | None - try: - with EmbeddingClient() as ec: - query_vec = ec.embed_query(query_text) - except EmbeddingClientError as e: - logger.warning("Hybrid search falling back to FTS-only: %s", e) - query_vec = None + # Vector side: strip NOT branches (see spec §7.8). If nothing is left, + # skip the embedding call entirely and fall through to FTS-only. 
+ query_text = QueryParser.unparse_for_embedding(search.query) + query_vec: list[float] | None = None + if query_text.strip(): + try: + with EmbeddingClient() as ec: + query_vec = ec.embed_query(query_text) + except EmbeddingClientError as e: + logger.warning("Hybrid search falling back to FTS-only: %s", e) + query_vec = None vec_rank: dict[int, int] = {} vec_distance: dict[int, float] = {} @@ -210,13 +212,17 @@ def retrieve(search: Search) -> Iterator[str]: filter_query = _build_filter_query(search.filters) tsquery = SearchQuery(query_str, search_type="raw", config=language) - query_text = QueryParser.unparse(search.query) - try: - with EmbeddingClient() as ec: - query_vec = ec.embed_query(query_text) - except EmbeddingClientError as e: - logger.warning("Hybrid retrieve falling back to FTS-only: %s", e) - query_vec = None + # Vector side: strip NOT branches (see spec §7.8). If nothing is left, + # skip the embedding call entirely and fall through to FTS-only. + query_text = QueryParser.unparse_for_embedding(search.query) + query_vec: list[float] | None = None + if query_text.strip(): + try: + with EmbeddingClient() as ec: + query_vec = ec.embed_query(query_text) + except EmbeddingClientError as e: + logger.warning("Hybrid retrieve falling back to FTS-only: %s", e) + query_vec = None vec_rank: dict[int, int] = {} if query_vec is not None: diff --git a/radis/pgsearch/tests/test_provider_hybrid.py b/radis/pgsearch/tests/test_provider_hybrid.py index a6ec7b77..0a03993a 100644 --- a/radis/pgsearch/tests/test_provider_hybrid.py +++ b/radis/pgsearch/tests/test_provider_hybrid.py @@ -223,3 +223,56 @@ def test_m2m_filter_does_not_duplicate_results(group, settings): matching = [d for d in result.documents if d.document_id == r.document_id] assert len(matching) == 1, f"Expected 1 occurrence, got {len(matching)}" + + +def test_search_skips_embedding_when_query_reduces_to_not(monkeypatch, group): + """`NOT X` alone produces an empty embedding string; the provider must + not 
call the embedding service and must return FTS-only results.""" + from radis.pgsearch import providers + + embed_query_calls: list[str] = [] + + class FakeEC: + def __init__(self): pass + def __enter__(self): return self + def __exit__(self, *a): return False + def embed_query(self, text): + embed_query_calls.append(text) + raise AssertionError("embed_query should not be called for NOT-only query") + + monkeypatch.setattr("radis.pgsearch.providers.EmbeddingClient", FakeEC) + + node, _ = QueryParser().parse("NOT pneumothorax") + search = Search(query=node, filters=SearchFilters(group=group.pk), offset=0, limit=10) + result = providers.search(search) + + assert embed_query_calls == [] + # FTS-only path still returns a SearchResult (possibly with zero hits). + assert result is not None + + +def test_search_embeds_only_positive_branch_for_and_not(monkeypatch, group, settings): + """`A AND NOT B` embeds only `A`; FTS half still enforces the exclusion.""" + embed_query_calls: list[str] = [] + dim = settings.EMBEDDING_DIM + + class FakeEC: + def __init__(self): pass + def __enter__(self): return self + def __exit__(self, *a): return False + def embed_query(self, text): + embed_query_calls.append(text) + # Return a valid normalized unit vector of the right dim. 
+ import numpy as np + v = np.ones(dim, dtype=np.float32) + return (v / np.linalg.norm(v)).tolist() + + monkeypatch.setattr("radis.pgsearch.providers.EmbeddingClient", FakeEC) + + from radis.pgsearch import providers + + node, _ = QueryParser().parse("pneumothorax AND NOT effusion") + search = Search(query=node, filters=SearchFilters(group=group.pk), offset=0, limit=10) + providers.search(search) + + assert embed_query_calls == ["pneumothorax"] From b42663817a1d962e15f47e3c1f9962607eaa5df7 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 20:03:00 +0000 Subject: [PATCH 53/68] lint(pgsearch): drop unused import and unused 'succeeded' var in orchestrator test --- radis/pgsearch/tests/test_process_embedding_job.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/radis/pgsearch/tests/test_process_embedding_job.py b/radis/pgsearch/tests/test_process_embedding_job.py index 72c87987..9a9eac7d 100644 --- a/radis/pgsearch/tests/test_process_embedding_job.py +++ b/radis/pgsearch/tests/test_process_embedding_job.py @@ -3,7 +3,7 @@ import pytest from django.contrib.auth import get_user_model -from radis.pgsearch.models import EmbeddingJob, EmbeddingTask, ReportSearchVector +from radis.pgsearch.models import EmbeddingJob, EmbeddingTask from radis.pgsearch.tasks import process_embedding_job as _wrapped from radis.reports.factories import ReportFactory @@ -52,7 +52,7 @@ def test_process_embedding_job_resume_path_only_redispatches_pending_tasks(setti # Simulate a previous orchestrator run that created one task already. 
existing = EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.PENDING) existing.reports.set(reports) - succeeded = EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.SUCCESS) + EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.SUCCESS) with patch("radis.pgsearch.models.EmbeddingTask.delay") as delay_mock: process_embedding_job(job.id) From 2e924e9096feeda750cb565b4306e1a4b573748f Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 21:53:45 +0000 Subject: [PATCH 54/68] docs(pgsearch): spec for EMBEDDING_DIM check via MigrationLoader Eliminates the hand-edited EMBEDDING_DIM_MIGRATION_LITERAL constant in apps.py by deriving the migration-side dim from Django's MigrationLoader project state. Single source of truth, no DB connection required, manual operator procedure unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../specs/2026-05-28-embedding-dim-check.md | 207 ++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-28-embedding-dim-check.md diff --git a/docs/superpowers/specs/2026-05-28-embedding-dim-check.md b/docs/superpowers/specs/2026-05-28-embedding-dim-check.md new file mode 100644 index 00000000..c6db76d0 --- /dev/null +++ b/docs/superpowers/specs/2026-05-28-embedding-dim-check.md @@ -0,0 +1,207 @@ +# EMBEDDING_DIM System Check — Single Source of Truth via MigrationLoader + +**Status:** Draft — design phase +**Author:** RADIS team (Samuel Kwong) +**Date:** 2026-05-28 +**Implementation skill (next step):** `writing-plans` +**Related:** `docs/superpowers/specs/2026-05-28-hybrid-search.md` §4.5 (operational procedure for `EMBEDDING_DIM` changes) + +--- + +## 1. Problem + +`radis/pgsearch/apps.py` currently maintains a hand-edited constant: + +```python +# Keep in sync with the dimensions= literal in +# radis/pgsearch/migrations/0003_report_embedding.py. 
+EMBEDDING_DIM_MIGRATION_LITERAL = 1024 +``` + +The Django system check `pgsearch.E001` compares this against `settings.EMBEDDING_DIM` +to catch the case where an operator changes the env var without running +`makemigrations`. Without the check, the divergence would surface later as an +opaque pgvector dimension error on the first write or query. + +The constant has three failure modes: + +- **Drift on migration changes.** If a new migration drops/recreates the + embedding column at a different dim, `EMBEDDING_DIM_MIGRATION_LITERAL` must + also be edited. Easy to forget. +- **Triple duplication.** The dim now lives in three places: + `settings.EMBEDDING_DIM` (env), the migration literal, and the constant. +- **Wrong-by-construction risk.** The constant is the only one of the three + that is *not* auto-derived from anything; the migration literal is generated + by `makemigrations` from `settings.EMBEDDING_DIM` at generation time. The + constant has to be transcribed by hand. + +## 2. Goals & non-goals + +### Goals + +- Eliminate the hand-edited `EMBEDDING_DIM_MIGRATION_LITERAL` constant. +- Preserve the existing safety net: `manage.py check` must still fail + loudly when `settings.EMBEDDING_DIM` diverges from what the migrations + describe. +- Keep the check offline (no database connection required at startup). + +### Non-goals + +- Eliminating the §4.5 manual operator procedure (drop column, re-migrate, + re-embed). That decoupling — non-disruptive dim changes via side-by-side + columns or similar — is explicitly out of scope and is a future spec. +- Changing the on-disk migration format or the way `makemigrations` captures + the literal `dimensions=1024`. +- Changing `settings.EMBEDDING_DIM` (still an env var). + +## 3. Design + +Use Django's `MigrationLoader` to compute the project state from the on-disk +migration files at check time, then read the embedding field's `dimensions` +from that state. The state is built without a database connection. 
+ +`radis/pgsearch/apps.py` becomes: + +```python +from django.apps import AppConfig +from django.conf import settings +from django.core.checks import Error, register + + +class PgSearchConfig(AppConfig): + name = "radis.pgsearch" + + def ready(self): + from . import signals as signals # noqa: F401 + + register_app() + + +def _migration_embedding_dim() -> int | None: + """Return the `dimensions` value of `ReportSearchVector.embedding` as + captured by the on-disk pgsearch migrations. Returns None if the field + cannot be located (e.g., migrations are missing or out of sync).""" + from django.db.migrations.loader import MigrationLoader + + loader = MigrationLoader(connection=None, ignore_no_migrations=True) + state = loader.project_state() + try: + model = state.apps.get_model("pgsearch", "ReportSearchVector") + return model._meta.get_field("embedding").dimensions + except (LookupError, AttributeError): + return None + + +@register() +def check_embedding_dim_matches_migration(app_configs, **kwargs): + """Fail loudly when settings.EMBEDDING_DIM diverges from the dim baked + into the pgsearch migrations. Mismatched values would otherwise surface as + opaque pgvector dimension errors on the first write/query.""" + migration_dim = _migration_embedding_dim() + + if migration_dim is None: + return [ + Error( + "Could not determine the embedding column dimension from the " + "pgsearch migrations. Either the migrations are missing the " + "embedding field or the model has been renamed.", + id="pgsearch.E002", + hint=( + "Verify that `radis/pgsearch/migrations/` contains a " + "migration that adds the `embedding` field to " + "`ReportSearchVector`, and that `makemigrations pgsearch` " + "succeeds without changes." + ), + ) + ] + + if settings.EMBEDDING_DIM != migration_dim: + return [ + Error( + f"EMBEDDING_DIM={settings.EMBEDDING_DIM} does not match the " + f"dim baked into the pgsearch migrations " + f"(vector({migration_dim})). 
Writes will fail with a pgvector " + f"dimension error. Either set EMBEDDING_DIM={migration_dim}, or " + f"run `makemigrations pgsearch` to capture the new dim and " + f"follow the §4.5 procedure to drop and recreate the embedding " + f"column.", + id="pgsearch.E001", + hint=( + "Update EMBEDDING_DIM in your .env to match the existing " + "migrations, or generate a new migration that matches the " + "new dim." + ), + ) + ] + return [] +``` + +`register_app()` is unchanged. + +### 3.1 Why `MigrationLoader` and not other options + +| Option | Authoritative for | DB connection | Verdict | +|---|---|---|---| +| Hand-edited constant (status quo) | Nothing — must be manually transcribed | No | Drift-prone | +| Parse `migrations/0003_*.py` source | The literal in one specific file | No | Brittle; couples to filename | +| `MigrationLoader` project state | The *aggregated* dim across all migrations | No | Chosen | +| `information_schema.columns` (live DB) | The actually-deployed column dim | Yes | Loses offline-check property | + +`MigrationLoader` answers "what dim do the migrations on disk currently describe?" +which is exactly what the check needs to catch env/migration drift before any +DB writes happen. If a later migration drops and recreates the column at a +different dim, `project_state()` reflects the *post-all-migrations* state, so +the check stays correct without code changes. + +### 3.2 Failure-mode coverage + +| Scenario | Behavior | +|---|---| +| Env says 2560, migrations describe 1024 | `pgsearch.E001` fires with both numbers and the suggested fix. | +| Env says 1024, migrations also describe 1024 | Check passes. | +| `embedding` field deleted by a migration (no replacement) | `_migration_embedding_dim()` returns `None`; `pgsearch.E002` fires telling the operator to re-add it. | +| Fresh checkout, no migrations applied yet | `project_state()` still resolves from disk; check works without DB. 
| +| Migrations dir present but missing the embedding field somehow | `pgsearch.E002` (same path as deletion). | + +## 4. What this does not change + +- `settings.EMBEDDING_DIM` env var — still the runtime/code-facing value. +- The migration file `0003_report_embedding.py` — `dimensions=1024` literal + stays as generated by `makemigrations`. +- The §4.5 manual operator procedure for changing the dim — still: + edit env → makemigrations → drop column → re-migrate → defer launcher. +- The check's id (`pgsearch.E001`) — preserved so existing operator playbooks + and any test that asserts on the id keep working. A new id `pgsearch.E002` + is added for the "missing field" case. + +## 5. Migration plan (code change, not Django migration) + +One PR / small commit series: + +1. **Delete `EMBEDDING_DIM_MIGRATION_LITERAL`** from `radis/pgsearch/apps.py`. +2. **Add `_migration_embedding_dim()` helper** to the same file. +3. **Rewrite `check_embedding_dim_matches_migration`** to use the helper and + emit `pgsearch.E001` (dim mismatch) or `pgsearch.E002` (field missing). +4. **Tests** in `radis/pgsearch/tests/test_apps_checks.py`: + - The existing two tests currently import `EMBEDDING_DIM_MIGRATION_LITERAL` + from `apps.py`. That import goes away. Rewrite both tests to source the + migration dim from the new `_migration_embedding_dim()` helper instead: + - `test_check_passes_when_dim_matches_migration`: override + `EMBEDDING_DIM` to `_migration_embedding_dim()` and assert no errors. + - `test_check_fails_when_dim_diverges_from_migration`: override to + `_migration_embedding_dim() + 1`; assert one `pgsearch.E001` error and + both numbers appear in the message. + - Add a test that monkey-patches `_migration_embedding_dim` to return + `None` and asserts a single `pgsearch.E002` error. + - Add a test that calls `_migration_embedding_dim()` directly and asserts + it returns the integer `1024` (current value), proving the loader path + works without a DB connection. 
This test should not be in a + `@pytest.mark.django_db` block. + +## 6. Open questions deferred to writing-plans + +- Whether to also delete the `# Keep in sync with ...` comment block that + documented the old constant — yes, it goes with the constant. +- Whether `pgsearch.E002` deserves a unit test that *actually* removes the + embedding migration vs. just mocks the helper return value. Mock is fine for + v1; deleting on-disk migrations in a test is risky and brittle. From 98a67c271287e3f7dcb70a54dfc29de1a610edde Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 28 May 2026 22:05:37 +0000 Subject: [PATCH 55/68] docs(pgsearch): fold EMBEDDING_DIM check redesign into unified hybrid-search spec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds §4.6 covering the MigrationLoader-based system check that eliminates EMBEDDING_DIM_MIGRATION_LITERAL, including the new pgsearch.E002 case for a missing embedding field. Deletes the standalone amendment doc — the unified spec is the single source of truth. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../specs/2026-05-28-embedding-dim-check.md | 207 ------------------ .../specs/2026-05-28-hybrid-search.md | 75 ++++++- 2 files changed, 74 insertions(+), 208 deletions(-) delete mode 100644 docs/superpowers/specs/2026-05-28-embedding-dim-check.md diff --git a/docs/superpowers/specs/2026-05-28-embedding-dim-check.md b/docs/superpowers/specs/2026-05-28-embedding-dim-check.md deleted file mode 100644 index c6db76d0..00000000 --- a/docs/superpowers/specs/2026-05-28-embedding-dim-check.md +++ /dev/null @@ -1,207 +0,0 @@ -# EMBEDDING_DIM System Check — Single Source of Truth via MigrationLoader - -**Status:** Draft — design phase -**Author:** RADIS team (Samuel Kwong) -**Date:** 2026-05-28 -**Implementation skill (next step):** `writing-plans` -**Related:** `docs/superpowers/specs/2026-05-28-hybrid-search.md` §4.5 (operational procedure for `EMBEDDING_DIM` changes) - ---- - -## 1. 
Problem - -`radis/pgsearch/apps.py` currently maintains a hand-edited constant: - -```python -# Keep in sync with the dimensions= literal in -# radis/pgsearch/migrations/0003_report_embedding.py. -EMBEDDING_DIM_MIGRATION_LITERAL = 1024 -``` - -The Django system check `pgsearch.E001` compares this against `settings.EMBEDDING_DIM` -to catch the case where an operator changes the env var without running -`makemigrations`. Without the check, the divergence would surface later as an -opaque pgvector dimension error on the first write or query. - -The constant has three failure modes: - -- **Drift on migration changes.** If a new migration drops/recreates the - embedding column at a different dim, `EMBEDDING_DIM_MIGRATION_LITERAL` must - also be edited. Easy to forget. -- **Triple duplication.** The dim now lives in three places: - `settings.EMBEDDING_DIM` (env), the migration literal, and the constant. -- **Wrong-by-construction risk.** The constant is the only one of the three - that is *not* auto-derived from anything; the migration literal is generated - by `makemigrations` from `settings.EMBEDDING_DIM` at generation time. The - constant has to be transcribed by hand. - -## 2. Goals & non-goals - -### Goals - -- Eliminate the hand-edited `EMBEDDING_DIM_MIGRATION_LITERAL` constant. -- Preserve the existing safety net: `manage.py check` must still fail - loudly when `settings.EMBEDDING_DIM` diverges from what the migrations - describe. -- Keep the check offline (no database connection required at startup). - -### Non-goals - -- Eliminating the §4.5 manual operator procedure (drop column, re-migrate, - re-embed). That decoupling — non-disruptive dim changes via side-by-side - columns or similar — is explicitly out of scope and is a future spec. -- Changing the on-disk migration format or the way `makemigrations` captures - the literal `dimensions=1024`. -- Changing `settings.EMBEDDING_DIM` (still an env var). - -## 3. 
Design - -Use Django's `MigrationLoader` to compute the project state from the on-disk -migration files at check time, then read the embedding field's `dimensions` -from that state. The state is built without a database connection. - -`radis/pgsearch/apps.py` becomes: - -```python -from django.apps import AppConfig -from django.conf import settings -from django.core.checks import Error, register - - -class PgSearchConfig(AppConfig): - name = "radis.pgsearch" - - def ready(self): - from . import signals as signals # noqa: F401 - - register_app() - - -def _migration_embedding_dim() -> int | None: - """Return the `dimensions` value of `ReportSearchVector.embedding` as - captured by the on-disk pgsearch migrations. Returns None if the field - cannot be located (e.g., migrations are missing or out of sync).""" - from django.db.migrations.loader import MigrationLoader - - loader = MigrationLoader(connection=None, ignore_no_migrations=True) - state = loader.project_state() - try: - model = state.apps.get_model("pgsearch", "ReportSearchVector") - return model._meta.get_field("embedding").dimensions - except (LookupError, AttributeError): - return None - - -@register() -def check_embedding_dim_matches_migration(app_configs, **kwargs): - """Fail loudly when settings.EMBEDDING_DIM diverges from the dim baked - into the pgsearch migrations. Mismatched values would otherwise surface as - opaque pgvector dimension errors on the first write/query.""" - migration_dim = _migration_embedding_dim() - - if migration_dim is None: - return [ - Error( - "Could not determine the embedding column dimension from the " - "pgsearch migrations. Either the migrations are missing the " - "embedding field or the model has been renamed.", - id="pgsearch.E002", - hint=( - "Verify that `radis/pgsearch/migrations/` contains a " - "migration that adds the `embedding` field to " - "`ReportSearchVector`, and that `makemigrations pgsearch` " - "succeeds without changes." 
- ), - ) - ] - - if settings.EMBEDDING_DIM != migration_dim: - return [ - Error( - f"EMBEDDING_DIM={settings.EMBEDDING_DIM} does not match the " - f"dim baked into the pgsearch migrations " - f"(vector({migration_dim})). Writes will fail with a pgvector " - f"dimension error. Either set EMBEDDING_DIM={migration_dim}, or " - f"run `makemigrations pgsearch` to capture the new dim and " - f"follow the §4.5 procedure to drop and recreate the embedding " - f"column.", - id="pgsearch.E001", - hint=( - "Update EMBEDDING_DIM in your .env to match the existing " - "migrations, or generate a new migration that matches the " - "new dim." - ), - ) - ] - return [] -``` - -`register_app()` is unchanged. - -### 3.1 Why `MigrationLoader` and not other options - -| Option | Authoritative for | DB connection | Verdict | -|---|---|---|---| -| Hand-edited constant (status quo) | Nothing — must be manually transcribed | No | Drift-prone | -| Parse `migrations/0003_*.py` source | The literal in one specific file | No | Brittle; couples to filename | -| `MigrationLoader` project state | The *aggregated* dim across all migrations | No | Chosen | -| `information_schema.columns` (live DB) | The actually-deployed column dim | Yes | Loses offline-check property | - -`MigrationLoader` answers "what dim do the migrations on disk currently describe?" -which is exactly what the check needs to catch env/migration drift before any -DB writes happen. If a later migration drops and recreates the column at a -different dim, `project_state()` reflects the *post-all-migrations* state, so -the check stays correct without code changes. - -### 3.2 Failure-mode coverage - -| Scenario | Behavior | -|---|---| -| Env says 2560, migrations describe 1024 | `pgsearch.E001` fires with both numbers and the suggested fix. | -| Env says 1024, migrations also describe 1024 | Check passes. 
| -| `embedding` field deleted by a migration (no replacement) | `_migration_embedding_dim()` returns `None`; `pgsearch.E002` fires telling the operator to re-add it. | -| Fresh checkout, no migrations applied yet | `project_state()` still resolves from disk; check works without DB. | -| Migrations dir present but missing the embedding field somehow | `pgsearch.E002` (same path as deletion). | - -## 4. What this does not change - -- `settings.EMBEDDING_DIM` env var — still the runtime/code-facing value. -- The migration file `0003_report_embedding.py` — `dimensions=1024` literal - stays as generated by `makemigrations`. -- The §4.5 manual operator procedure for changing the dim — still: - edit env → makemigrations → drop column → re-migrate → defer launcher. -- The check's id (`pgsearch.E001`) — preserved so existing operator playbooks - and any test that asserts on the id keep working. A new id `pgsearch.E002` - is added for the "missing field" case. - -## 5. Migration plan (code change, not Django migration) - -One PR / small commit series: - -1. **Delete `EMBEDDING_DIM_MIGRATION_LITERAL`** from `radis/pgsearch/apps.py`. -2. **Add `_migration_embedding_dim()` helper** to the same file. -3. **Rewrite `check_embedding_dim_matches_migration`** to use the helper and - emit `pgsearch.E001` (dim mismatch) or `pgsearch.E002` (field missing). -4. **Tests** in `radis/pgsearch/tests/test_apps_checks.py`: - - The existing two tests currently import `EMBEDDING_DIM_MIGRATION_LITERAL` - from `apps.py`. That import goes away. Rewrite both tests to source the - migration dim from the new `_migration_embedding_dim()` helper instead: - - `test_check_passes_when_dim_matches_migration`: override - `EMBEDDING_DIM` to `_migration_embedding_dim()` and assert no errors. - - `test_check_fails_when_dim_diverges_from_migration`: override to - `_migration_embedding_dim() + 1`; assert one `pgsearch.E001` error and - both numbers appear in the message. 
- - Add a test that monkey-patches `_migration_embedding_dim` to return - `None` and asserts a single `pgsearch.E002` error. - - Add a test that calls `_migration_embedding_dim()` directly and asserts - it returns the integer `1024` (current value), proving the loader path - works without a DB connection. This test should not be in a - `@pytest.mark.django_db` block. - -## 6. Open questions deferred to writing-plans - -- Whether to also delete the `# Keep in sync with ...` comment block that - documented the old constant — yes, it goes with the constant. -- Whether `pgsearch.E002` deserves a unit test that *actually* removes the - embedding migration vs. just mocks the helper return value. Mock is fine for - v1; deleting on-disk migrations in a test is risky and brittle. diff --git a/docs/superpowers/specs/2026-05-28-hybrid-search.md b/docs/superpowers/specs/2026-05-28-hybrid-search.md index 34893c9e..6ee5bdf4 100644 --- a/docs/superpowers/specs/2026-05-28-hybrid-search.md +++ b/docs/superpowers/specs/2026-05-28-hybrid-search.md @@ -187,7 +187,7 @@ class ReportSearchVector(models.Model): ### 4.5 Operational note on `EMBEDDING_DIM` -pgvector columns and HNSW indexes are bound to a fixed dimension at create time, and HNSW has a 2000-dim ceiling (so `EMBEDDING_DIM ≤ 2000`; Qwen3-Embedding-4B's native 2560 is Matryoshka-truncated client-side). A Django system check (`pgsearch.E001`) compares `settings.EMBEDDING_DIM` against the literal in migration 0003 and fails `manage.py check` on mismatch. Changing `EMBEDDING_DIM` after deploy requires a manual operator procedure: +pgvector columns and HNSW indexes are bound to a fixed dimension at create time, and HNSW has a 2000-dim ceiling (so `EMBEDDING_DIM ≤ 2000`; Qwen3-Embedding-4B's native 2560 is Matryoshka-truncated client-side). Changing `EMBEDDING_DIM` after deploy requires a manual operator procedure: 1. Drop the HNSW index and the `embedding` column. 2. Re-run `0003_report_embedding` with the new `EMBEDDING_DIM`. 
@@ -201,6 +201,79 @@ pgvector columns and HNSW indexes are bound to a fixed dimension at create time, This is documented as a deployment-time decision and intentionally not automated. +### 4.6 Startup safety check for env/migration drift + +Two Django system checks guard against the failure mode where +`settings.EMBEDDING_DIM` no longer matches what the on-disk migrations describe. +Without these the divergence would surface later as an opaque pgvector +dimension error on the first write or query. + +The migration-side dim is *not* stored in a hand-edited constant. Instead it is +derived at check time from Django's `MigrationLoader` project state — which is +built from the migration files on disk without a database connection — so +there is exactly one source of truth (the `dimensions=...` literal that +`makemigrations` itself generates from `settings.EMBEDDING_DIM`). + +```python +# radis/pgsearch/apps.py + +def _migration_embedding_dim() -> int | None: + """Return the `dimensions` value of `ReportSearchVector.embedding` as + captured by the on-disk pgsearch migrations. 
Returns None if the field + cannot be located (e.g., migrations are missing or out of sync).""" + from django.db.migrations.loader import MigrationLoader + + loader = MigrationLoader(connection=None, ignore_no_migrations=True) + state = loader.project_state() + try: + model = state.apps.get_model("pgsearch", "ReportSearchVector") + return model._meta.get_field("embedding").dimensions + except (LookupError, AttributeError): + return None + + +@register() +def check_embedding_dim_matches_migration(app_configs, **kwargs): + migration_dim = _migration_embedding_dim() + if migration_dim is None: + return [Error( + "Could not determine the embedding column dimension from the " + "pgsearch migrations.", + id="pgsearch.E002", + hint="Verify that radis/pgsearch/migrations/ contains a migration " + "that adds `embedding` to `ReportSearchVector`.", + )] + if settings.EMBEDDING_DIM != migration_dim: + return [Error( + f"EMBEDDING_DIM={settings.EMBEDDING_DIM} does not match the dim " + f"baked into the pgsearch migrations (vector({migration_dim})). " + f"Either set EMBEDDING_DIM={migration_dim}, or run " + f"`makemigrations pgsearch` to capture the new dim and follow §4.5.", + id="pgsearch.E001", + )] + return [] +``` + +Check IDs: + +| ID | When it fires | +|---|---| +| `pgsearch.E001` | `settings.EMBEDDING_DIM != migration_dim`. The familiar drift case. | +| `pgsearch.E002` | `_migration_embedding_dim()` returns `None`. Indicates the migration tree is missing the `embedding` field — either it was deleted without replacement, or the model was renamed. Surfaces what would otherwise be a silent NoneType crash. 
| + +Alternatives considered and rejected: + +| Option | Authoritative for | DB connection | Verdict | +|---|---|---|---| +| Hand-edited constant (status quo before this change) | Nothing — must be manually transcribed | No | Drift-prone | +| Parse `migrations/0003_*.py` source | The literal in one specific file | No | Brittle; couples to filename | +| `MigrationLoader` project state | The aggregated dim across all migrations | No | Chosen | +| `information_schema.columns` on the live DB | The actually-deployed column dim | Yes | Loses the offline-check property | + +`MigrationLoader.project_state()` reflects the *post-all-migrations* state, so +if a later migration drops and recreates the column at a different dim, the +check stays correct without any code change to `apps.py`. + ## 5. Embedding client ### 5.1 Module layout From 5a5ac327d1f2015e04cdb8ff99829db3bab03923 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 29 May 2026 07:16:50 +0000 Subject: [PATCH 56/68] docs(plan): append Task 14 (MigrationLoader-based EMBEDDING_DIM check) --- .../plans/2026-05-28-hybrid-search.md | 219 ++++++++++++++++++ 1 file changed, 219 insertions(+) diff --git a/docs/superpowers/plans/2026-05-28-hybrid-search.md b/docs/superpowers/plans/2026-05-28-hybrid-search.md index b5933d75..2b264e32 100644 --- a/docs/superpowers/plans/2026-05-28-hybrid-search.md +++ b/docs/superpowers/plans/2026-05-28-hybrid-search.md @@ -1225,6 +1225,224 @@ git commit -m "feat(pgsearch): use unparse_for_embedding to strip NOT branches b --- +## Task 14: Replace `EMBEDDING_DIM_MIGRATION_LITERAL` with `MigrationLoader`-based check + +**Files:** +- Modify: `radis/pgsearch/apps.py` (delete constant, add helper, rewrite check) +- Modify: `radis/pgsearch/tests/test_apps_checks.py` (replace constant import) + +The current check in `radis/pgsearch/apps.py` compares `settings.EMBEDDING_DIM` against a hand-maintained constant `EMBEDDING_DIM_MIGRATION_LITERAL = 1024` that must be kept in sync with the literal 
in migration 0003 by convention only. This task replaces the constant with a helper that reads the dim from Django's on-disk migration files via `MigrationLoader`, so there is one source of truth (the `dimensions=...` literal that `makemigrations` itself generates from `settings.EMBEDDING_DIM`). A new error id `pgsearch.E002` covers the case where the embedding field cannot be located in the migrations. + +See spec §4.6 for the design rationale and the alternatives-considered table. + +- [ ] **Step 1: Rewrite the test file to source the dim from the new helper** + +Replace the entire contents of `radis/pgsearch/tests/test_apps_checks.py` with: + +```python +"""Tests for the Django system check that guards EMBEDDING_DIM/migration parity.""" + +from unittest.mock import patch + +from django.test import override_settings + +from radis.pgsearch.apps import ( + _migration_embedding_dim, + check_embedding_dim_matches_migration, +) + + +def test_migration_embedding_dim_returns_int_without_db(): + dim = _migration_embedding_dim() + assert isinstance(dim, int) + assert dim == 1024 + + +def test_check_passes_when_dim_matches_migration(): + dim = _migration_embedding_dim() + with override_settings(EMBEDDING_DIM=dim): + assert check_embedding_dim_matches_migration(app_configs=None) == [] + + +def test_check_fails_with_e001_when_dim_diverges_from_migration(): + dim = _migration_embedding_dim() + with override_settings(EMBEDDING_DIM=dim + 1): + errors = check_embedding_dim_matches_migration(app_configs=None) + assert len(errors) == 1 + err = errors[0] + assert err.id == "pgsearch.E001" + assert str(dim) in err.msg + assert str(dim + 1) in err.msg + + +def test_check_fails_with_e002_when_migration_field_missing(): + with patch( + "radis.pgsearch.apps._migration_embedding_dim", return_value=None + ): + errors = check_embedding_dim_matches_migration(app_configs=None) + assert len(errors) == 1 + assert errors[0].id == "pgsearch.E002" +``` + +- [ ] **Step 2: Run tests — expect 
ImportError** + +Run: `uv run pytest radis/pgsearch/tests/test_apps_checks.py -v` +Expected: FAIL — `ImportError: cannot import name '_migration_embedding_dim' from 'radis.pgsearch.apps'` + +- [ ] **Step 3: Rewrite `radis/pgsearch/apps.py`** + +Replace the entire file with: + +```python +from django.apps import AppConfig +from django.conf import settings +from django.core.checks import Error, register + + +class PgSearchConfig(AppConfig): + name = "radis.pgsearch" + + def ready(self): + from . import signals as signals # noqa: F401 + + register_app() + + +def _migration_embedding_dim() -> int | None: + """Return the `dimensions` value of `ReportSearchVector.embedding` as + captured by the on-disk pgsearch migrations. Returns None if the field + cannot be located (migrations missing or model renamed).""" + from django.db.migrations.loader import MigrationLoader + + loader = MigrationLoader(connection=None, ignore_no_migrations=True) + state = loader.project_state() + try: + model = state.apps.get_model("pgsearch", "ReportSearchVector") + return model._meta.get_field("embedding").dimensions + except (LookupError, AttributeError): + return None + + +@register() +def check_embedding_dim_matches_migration(app_configs, **kwargs): + """Fail loudly when settings.EMBEDDING_DIM diverges from the dim baked + into the pgsearch migrations. Mismatched values would otherwise surface as + opaque pgvector dimension errors on the first write or query.""" + migration_dim = _migration_embedding_dim() + + if migration_dim is None: + return [ + Error( + "Could not determine the embedding column dimension from the " + "pgsearch migrations. Either the migrations are missing the " + "embedding field or the model has been renamed.", + id="pgsearch.E002", + hint=( + "Verify that `radis/pgsearch/migrations/` contains a " + "migration that adds the `embedding` field to " + "`ReportSearchVector`, and that `makemigrations pgsearch` " + "succeeds without changes." 
+ ), + ) + ] + + if settings.EMBEDDING_DIM != migration_dim: + return [ + Error( + f"EMBEDDING_DIM={settings.EMBEDDING_DIM} does not match the " + f"dim baked into the pgsearch migrations " + f"(vector({migration_dim})). Writes will fail with a pgvector " + f"dimension error. Either set " + f"EMBEDDING_DIM={migration_dim}, or run `makemigrations " + f"pgsearch` to capture the new dim and follow the §4.5 " + f"procedure to drop and recreate the embedding column.", + id="pgsearch.E001", + hint=( + "Update EMBEDDING_DIM in your .env to match the existing " + "migrations, or generate a new migration that matches the " + "new dim." + ), + ) + ] + return [] + + +def register_app(): + from django.conf import settings + + from radis.extractions.site import ( + ExtractionRetrievalProvider, + register_extraction_retrieval_provider, + ) + from radis.search.site import SearchProvider, register_search_provider + from radis.subscriptions.site import ( + SubscriptionFilterProvider, + SubscriptionRetrievalProvider, + register_subscription_filter_provider, + register_subscription_retrieval_provider, + ) + + from .providers import count, filter, retrieve, search + + register_search_provider( + SearchProvider( + name="PG Search", + search=search, + max_results=max( + settings.HYBRID_VECTOR_TOP_K, settings.HYBRID_FTS_MAX_RESULTS + ), + ) + ) + + register_extraction_retrieval_provider( + ExtractionRetrievalProvider( + name="PG Search", + count=count, + retrieve=retrieve, + max_results=None, + ) + ) + + register_subscription_retrieval_provider( + SubscriptionRetrievalProvider( + name="PG Search", + retrieve=retrieve, + ) + ) + register_subscription_filter_provider( + SubscriptionFilterProvider( + name="PG Search", + filter=filter, + ) + ) +``` + +The `EMBEDDING_DIM_MIGRATION_LITERAL` constant and its sync-keeping comment block are gone. `register_app()` is unchanged from the current implementation. 
+ +- [ ] **Step 4: Run tests and verify pass** + +Run: `uv run pytest radis/pgsearch/tests/test_apps_checks.py -v` +Expected: PASS (4 tests). + +- [ ] **Step 5: Run `manage.py check` to confirm the system check still works end-to-end** + +Run: `uv run python manage.py check` +Expected: passes with no errors (current `.env` has `EMBEDDING_DIM=1024` matching the migration). + +Also verify the negative case manually: + +Run: `EMBEDDING_DIM=999 uv run python manage.py check` +Expected: prints a single `pgsearch.E001` error mentioning both `999` and `1024`, and `manage.py check` exits non-zero. + +- [ ] **Step 6: Commit** + +```bash +git add radis/pgsearch/apps.py radis/pgsearch/tests/test_apps_checks.py +git commit -m "refactor(pgsearch): derive embedding-dim check from MigrationLoader, drop hand-edited literal" +``` + +--- + ## Final verification - [ ] **Step 1: Run lint** @@ -1289,3 +1507,4 @@ git push -u origin feat/hybrid-search | §8.1 `EMBEDDING_DRAIN_CRON` env var | Task 1 | | §8.2 `EMBEDDING_SYSTEM_USERNAME` constant | Task 1 | | §10.1 unit tests for launcher/job/task | Tasks 4, 5, 6 | +| §4.6 `MigrationLoader`-based EMBEDDING_DIM check + `pgsearch.E002` | Task 14 | From 17555acb074a2d34186242daf1e22b1d1470e4e8 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 29 May 2026 07:18:33 +0000 Subject: [PATCH 57/68] refactor(pgsearch): derive embedding-dim check from MigrationLoader, drop hand-edited literal --- radis/pgsearch/apps.py | 65 +++++++++++++++++------- radis/pgsearch/tests/test_apps_checks.py | 31 ++++++++--- 2 files changed, 72 insertions(+), 24 deletions(-) diff --git a/radis/pgsearch/apps.py b/radis/pgsearch/apps.py index 101fc309..debd7c92 100644 --- a/radis/pgsearch/apps.py +++ b/radis/pgsearch/apps.py @@ -2,40 +2,69 @@ from django.conf import settings from django.core.checks import Error, register -# Keep in sync with the dimensions= literal in -# radis/pgsearch/migrations/0003_report_embedding.py. 
The migration -# captures dim at generation time, so changing this requires a new -# migration that re-creates the embedding column. -EMBEDDING_DIM_MIGRATION_LITERAL = 1024 - class PgSearchConfig(AppConfig): name = "radis.pgsearch" def ready(self): - from . import signals as signals + from . import signals as signals # noqa: F401 register_app() +def _migration_embedding_dim() -> int | None: + """Return the `dimensions` value of `ReportSearchVector.embedding` as + captured by the on-disk pgsearch migrations. Returns None if the field + cannot be located (migrations missing or model renamed).""" + from django.db.migrations.loader import MigrationLoader + + loader = MigrationLoader(connection=None, ignore_no_migrations=True) + state = loader.project_state() + try: + model = state.apps.get_model("pgsearch", "ReportSearchVector") + return model._meta.get_field("embedding").dimensions + except (LookupError, AttributeError): + return None + + @register() def check_embedding_dim_matches_migration(app_configs, **kwargs): """Fail loudly when settings.EMBEDDING_DIM diverges from the dim baked - into migration 0003. Mismatched values would otherwise surface as opaque - pgvector dimension errors on the first write/query.""" - if settings.EMBEDDING_DIM != EMBEDDING_DIM_MIGRATION_LITERAL: + into the pgsearch migrations. Mismatched values would otherwise surface as + opaque pgvector dimension errors on the first write or query.""" + migration_dim = _migration_embedding_dim() + + if migration_dim is None: + return [ + Error( + "Could not determine the embedding column dimension from the " + "pgsearch migrations. Either the migrations are missing the " + "embedding field or the model has been renamed.", + id="pgsearch.E002", + hint=( + "Verify that `radis/pgsearch/migrations/` contains a " + "migration that adds the `embedding` field to " + "`ReportSearchVector`, and that `makemigrations pgsearch` " + "succeeds without changes." 
+ ), + ) + ] + + if settings.EMBEDDING_DIM != migration_dim: return [ Error( - f"EMBEDDING_DIM={settings.EMBEDDING_DIM} does not match the dim " - f"baked into migration 0003 (vector({EMBEDDING_DIM_MIGRATION_LITERAL})). " - f"Writes will fail with a pgvector dimension error. Either set " - f"EMBEDDING_DIM={EMBEDDING_DIM_MIGRATION_LITERAL} or write a new " - f"migration that drops and recreates the embedding column at the new dim.", + f"EMBEDDING_DIM={settings.EMBEDDING_DIM} does not match the " + f"dim baked into the pgsearch migrations " + f"(vector({migration_dim})). Writes will fail with a pgvector " + f"dimension error. Either set " + f"EMBEDDING_DIM={migration_dim}, or run `makemigrations " + f"pgsearch` to capture the new dim and follow the §4.5 " + f"procedure to drop and recreate the embedding column.", id="pgsearch.E001", hint=( - "Update EMBEDDING_DIM in your .env, or write a migration that " - "matches the new dim and update EMBEDDING_DIM_MIGRATION_LITERAL " - "in radis/pgsearch/apps.py." + "Update EMBEDDING_DIM in your .env to match the existing " + "migrations, or generate a new migration that matches the " + "new dim." 
), ) ] diff --git a/radis/pgsearch/tests/test_apps_checks.py b/radis/pgsearch/tests/test_apps_checks.py index 36bdd59c..a2d03fc3 100644 --- a/radis/pgsearch/tests/test_apps_checks.py +++ b/radis/pgsearch/tests/test_apps_checks.py @@ -1,23 +1,42 @@ """Tests for the Django system check that guards EMBEDDING_DIM/migration parity.""" +from unittest.mock import patch + from django.test import override_settings from radis.pgsearch.apps import ( - EMBEDDING_DIM_MIGRATION_LITERAL, + _migration_embedding_dim, check_embedding_dim_matches_migration, ) +def test_migration_embedding_dim_returns_int_without_db(): + dim = _migration_embedding_dim() + assert isinstance(dim, int) + assert dim == 1024 + + def test_check_passes_when_dim_matches_migration(): - with override_settings(EMBEDDING_DIM=EMBEDDING_DIM_MIGRATION_LITERAL): + dim = _migration_embedding_dim() + with override_settings(EMBEDDING_DIM=dim): assert check_embedding_dim_matches_migration(app_configs=None) == [] -def test_check_fails_when_dim_diverges_from_migration(): - with override_settings(EMBEDDING_DIM=EMBEDDING_DIM_MIGRATION_LITERAL + 1): +def test_check_fails_with_e001_when_dim_diverges_from_migration(): + dim = _migration_embedding_dim() + with override_settings(EMBEDDING_DIM=dim + 1): errors = check_embedding_dim_matches_migration(app_configs=None) assert len(errors) == 1 err = errors[0] assert err.id == "pgsearch.E001" - assert str(EMBEDDING_DIM_MIGRATION_LITERAL) in err.msg - assert str(EMBEDDING_DIM_MIGRATION_LITERAL + 1) in err.msg + assert str(dim) in err.msg + assert str(dim + 1) in err.msg + + +def test_check_fails_with_e002_when_migration_field_missing(): + with patch( + "radis.pgsearch.apps._migration_embedding_dim", return_value=None + ): + errors = check_embedding_dim_matches_migration(app_configs=None) + assert len(errors) == 1 + assert errors[0].id == "pgsearch.E002" From 615f39f7a291760810806c48b7592e214bf2a3b0 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Tue, 2 Jun 2026 18:59:03 +0000 
Subject: [PATCH 58/68] lint(pgsearch): satisfy pyright in hybrid-search test files Narrow Optional[int] from _migration_embedding_dim() with an assert, switch job.id/task.id to .pk so pyright sees the AnalysisJob/AnalysisTask PK, and assert QueryParser().parse() returned a non-None node before constructing Search. Co-Authored-By: Claude Opus 4.7 (1M context) --- radis/pgsearch/tests/test_apps_checks.py | 1 + radis/pgsearch/tests/test_process_embedding_job.py | 6 +++--- radis/pgsearch/tests/test_process_embedding_task.py | 6 +++--- radis/pgsearch/tests/test_provider_hybrid.py | 2 ++ 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/radis/pgsearch/tests/test_apps_checks.py b/radis/pgsearch/tests/test_apps_checks.py index a2d03fc3..7658fc6d 100644 --- a/radis/pgsearch/tests/test_apps_checks.py +++ b/radis/pgsearch/tests/test_apps_checks.py @@ -24,6 +24,7 @@ def test_check_passes_when_dim_matches_migration(): def test_check_fails_with_e001_when_dim_diverges_from_migration(): dim = _migration_embedding_dim() + assert dim is not None with override_settings(EMBEDDING_DIM=dim + 1): errors = check_embedding_dim_matches_migration(app_configs=None) assert len(errors) == 1 diff --git a/radis/pgsearch/tests/test_process_embedding_job.py b/radis/pgsearch/tests/test_process_embedding_job.py index 9a9eac7d..463abf09 100644 --- a/radis/pgsearch/tests/test_process_embedding_job.py +++ b/radis/pgsearch/tests/test_process_embedding_job.py @@ -30,7 +30,7 @@ def test_process_embedding_job_batches_pending_reports(settings): reports = _make_pending_reports(5) with patch("radis.pgsearch.models.EmbeddingTask.delay") as delay_mock: - process_embedding_job(job.id) + process_embedding_job(job.pk) job.refresh_from_db() assert job.status == EmbeddingJob.Status.PENDING @@ -55,7 +55,7 @@ def test_process_embedding_job_resume_path_only_redispatches_pending_tasks(setti EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.SUCCESS) with 
patch("radis.pgsearch.models.EmbeddingTask.delay") as delay_mock: - process_embedding_job(job.id) + process_embedding_job(job.pk) job.refresh_from_db() assert job.status == EmbeddingJob.Status.PENDING @@ -70,7 +70,7 @@ def test_process_embedding_job_with_no_pending_rows(): # No reports exist → no ReportSearchVector rows with embedding IS NULL. with patch("radis.pgsearch.models.EmbeddingTask.delay") as delay_mock: - process_embedding_job(job.id) + process_embedding_job(job.pk) job.refresh_from_db() assert job.status == EmbeddingJob.Status.PENDING diff --git a/radis/pgsearch/tests/test_process_embedding_task.py b/radis/pgsearch/tests/test_process_embedding_task.py index 3810b1c8..58e38664 100644 --- a/radis/pgsearch/tests/test_process_embedding_task.py +++ b/radis/pgsearch/tests/test_process_embedding_task.py @@ -34,7 +34,7 @@ def test_process_embedding_task_writes_vectors_and_marks_success(settings): fake_client = MagicMock() fake_client.embed_documents.return_value = [vec, vec] with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake_client): - process_embedding_task(task.id) + process_embedding_task(task.pk) task.refresh_from_db() assert task.status == EmbeddingTask.Status.SUCCESS @@ -50,7 +50,7 @@ def test_process_embedding_task_failure_sets_status_and_raises(): fake_client.embed_documents.side_effect = EmbeddingClientError("boom") with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake_client): with pytest.raises(EmbeddingClientError): - process_embedding_task(task.id) + process_embedding_task(task.pk) task.refresh_from_db() assert task.status == EmbeddingTask.Status.FAILURE @@ -64,7 +64,7 @@ def test_process_embedding_task_calls_update_job_state(settings): fake_client = MagicMock() fake_client.embed_documents.return_value = [vec, vec] with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake_client): - process_embedding_task(task.id) + process_embedding_task(task.pk) task.job.refresh_from_db() # All tasks succeeded; 
AnalysisJob.update_job_state rolls up to SUCCESS. diff --git a/radis/pgsearch/tests/test_provider_hybrid.py b/radis/pgsearch/tests/test_provider_hybrid.py index 0a03993a..6e6758ef 100644 --- a/radis/pgsearch/tests/test_provider_hybrid.py +++ b/radis/pgsearch/tests/test_provider_hybrid.py @@ -243,6 +243,7 @@ def embed_query(self, text): monkeypatch.setattr("radis.pgsearch.providers.EmbeddingClient", FakeEC) node, _ = QueryParser().parse("NOT pneumothorax") + assert node is not None search = Search(query=node, filters=SearchFilters(group=group.pk), offset=0, limit=10) result = providers.search(search) @@ -272,6 +273,7 @@ def embed_query(self, text): from radis.pgsearch import providers node, _ = QueryParser().parse("pneumothorax AND NOT effusion") + assert node is not None search = Search(query=node, filters=SearchFilters(group=group.pk), offset=0, limit=10) providers.search(search) From e73a7f5ee8abe367eabcafeb00a6699cc8e0509e Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Wed, 3 Jun 2026 06:26:34 +0000 Subject: [PATCH 59/68] docs(pgsearch): document cosine_distance and rrf_score on ReportDocument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit §7.6 previously claimed RRF was internal and not exposed on the document type, but the shipped ReportDocument carries both cosine_distance and rrf_score (see radis/search/site.py). Rewrite the section to describe all three score fields, when each is populated, and how the planned §11.6 re-ranker will consume rrf_score. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../specs/2026-05-28-hybrid-search.md | 37 ++++++++++++++++++- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/docs/superpowers/specs/2026-05-28-hybrid-search.md b/docs/superpowers/specs/2026-05-28-hybrid-search.md index 6ee5bdf4..ea8e5922 100644 --- a/docs/superpowers/specs/2026-05-28-hybrid-search.md +++ b/docs/superpowers/specs/2026-05-28-hybrid-search.md @@ -746,9 +746,42 @@ Same fusion logic, returns an iterator of `report__document_id` in `ordered_ids` Unchanged. These operate on filters only and never call the embedding service. -### 7.6 `ReportDocument.relevance` +### 7.6 `ReportDocument` score fields -Kept as `ts_rank` for API backwards compatibility. RRF is an internal ordering signal and is not exposed on the public document type. RRF scores are logged at DEBUG for diagnostics. +`ReportDocument` (`radis/search/site.py`) carries three score fields. The +existing `relevance` is preserved for API backwards compatibility; two new +fields are added so callers (and the UI) can see *why* a result ranked where +it did: + +```python +class ReportDocument(NamedTuple): + relevance: float | None # FTS ts_rank — existing; 0.0 for vector-only hits + document_id: str + # ... + cosine_distance: float | None = None # NEW — pgvector cosine distance; None for FTS-only hits + rrf_score: float = 0.0 # NEW — the value the final ordering is based on +``` + +Semantics: + +- `relevance` — Postgres `ts_rank` of the row's `search_vector` against the + tsquery. Same field/shape pre- and post-hybrid; callers that read it + continue to work. Defaults to `0.0` for documents that came from the vector + half only. +- `cosine_distance` — the `CosineDistance("embedding", query_vec)` annotation + for rows that made `vec_top_K`. `None` for FTS-only hits and whenever the + query path skipped vector retrieval (embedding service down, or the query + reduced to `NOT` after §7.8 stripping). 
+- `rrf_score` — the fused score from §7.1; this is what the result ordering + is based on. Exposed for transparency, debugging, and UI display + (operators can see at a glance which side contributed). Also useful when + the §11.6 re-ranker lands: it will read `rrf_score` to seed its top-N + candidate selection. + +All three fields are populated by `document_from_pgsearch_response` during +the page-slice hydration step in §7.2. The hydration query annotates the page +rows with `ts_rank`, looks up the corresponding entries in the `vec_rank` / +`fts_rank` / `rrf` dicts, and assembles the document. ### 7.7 `search_provider.max_results` From 997805f0756c99c78301f7a1721ecf0adb784fa6 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Wed, 3 Jun 2026 08:49:13 +0000 Subject: [PATCH 60/68] refactor(pgsearch): squash hybrid-search migrations into 0002_hybrid_search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Collapse 0002_pgvector_extension, 0003_report_embedding, 0004_embedding_job_task, and 0005_system_user into a single 0002_hybrid_search that runs the extension SQL, adds the embedding column + HNSW index, creates the EmbeddingJob/EmbeddingTask tables, and inlines the system-user RunPython callable. Drop the _system_user_helper module (its sole caller is now the migration itself) and the obsolete idempotency test that imported it. Spec §3/§4.2/§4.5/§4.6/§6.4/§8.2/§12 updated to reference the squashed migration. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../specs/2026-05-28-hybrid-search.md | 106 ++++++++++-------- ...ding_job_task.py => 0002_hybrid_search.py} | 36 +++++- .../migrations/0002_pgvector_extension.py | 11 -- .../migrations/0003_report_embedding.py | 25 ----- radis/pgsearch/migrations/0005_system_user.py | 17 --- .../migrations/_system_user_helper.py | 9 -- .../tests/test_migrations_system_user.py | 10 -- 7 files changed, 93 insertions(+), 121 deletions(-) rename radis/pgsearch/migrations/{0004_embedding_job_task.py => 0002_hybrid_search.py} (81%) delete mode 100644 radis/pgsearch/migrations/0002_pgvector_extension.py delete mode 100644 radis/pgsearch/migrations/0003_report_embedding.py delete mode 100644 radis/pgsearch/migrations/0005_system_user.py delete mode 100644 radis/pgsearch/migrations/_system_user_helper.py diff --git a/docs/superpowers/specs/2026-05-28-hybrid-search.md b/docs/superpowers/specs/2026-05-28-hybrid-search.md index ea8e5922..9a2b06db 100644 --- a/docs/superpowers/specs/2026-05-28-hybrid-search.md +++ b/docs/superpowers/specs/2026-05-28-hybrid-search.md @@ -104,10 +104,7 @@ per-API-call embedding job. 
| File | Purpose | |---|---| | `utils/embedding_client.py` | Sync + async HTTP clients with pluggable backends (`openai`, `ollama`) | -| `migrations/0002_pgvector_extension.py` | `CREATE EXTENSION IF NOT EXISTS vector;` | -| `migrations/0003_report_embedding.py` | Adds `embedding vector(N)` column + HNSW index | -| `migrations/0004_embedding_job_task.py` | Adds `EmbeddingJob` and `EmbeddingTask` tables + M2M to `Report` | -| `migrations/0005_system_user.py` | Data migration: creates the system user if missing | +| `migrations/0002_hybrid_search.py` | Single squashed migration: `CREATE EXTENSION vector`; adds `embedding vector(N)` column + HNSW index; creates `EmbeddingJob`/`EmbeddingTask` tables + M2M to `Report`; idempotent `RunPython` for the system user | | `models.py` (modified) | Adds `embedding` field + `HnswIndex`; defines `EmbeddingJob` and `EmbeddingTask` inheriting `AnalysisJob`/`AnalysisTask` | | `signals.py` (unchanged from FTS-only) | The FTS `create_or_update_report_search_vector` receiver stays; **no embedding signal** | | `tasks.py` (modified) | Adds `embedding_launcher` (periodic), `process_embedding_job` (`default` queue), `process_embedding_task` (`embeddings` queue) | @@ -134,26 +131,41 @@ Add to `pyproject.toml`: "pgvector>=0.3", ``` -### 4.2 Postgres extension migration - -`radis/pgsearch/migrations/0002_pgvector_extension.py`: - -```python -class Migration(migrations.Migration): - dependencies = [("pgsearch", "0001_initial")] - operations = [ - migrations.RunSQL( - sql="CREATE EXTENSION IF NOT EXISTS vector;", - reverse_sql=migrations.RunSQL.noop, # do not drop in prod - ), - ] -``` - -Reverse is a no-op because the extension may be shared with other Postgres usage and dropping it would damage unrelated state. Dev rollback is handled by recreating the database. 
- -### 4.3 Schema migration - -`radis/pgsearch/migrations/0003_report_embedding.py`: standard `AddField` with a `VectorField(dimensions=settings.EMBEDDING_DIM, null=True)` and `AddIndex` for an `HnswIndex` with `opclasses=["vector_cosine_ops"]`, `m=16`, `ef_construction=64`. +### 4.2 Squashed migration + +The entire hybrid-search schema and the system-user data migration land in a +single file, `radis/pgsearch/migrations/0002_hybrid_search.py`, depending on +`pgsearch.0001_initial`, `reports.0013_alter_report_options`, +`procrastinate.0041_post_retry_failed_job`, and `AUTH_USER_MODEL`. Operations +in order: + +1. `RunSQL("CREATE EXTENSION IF NOT EXISTS vector;", reverse_sql=RunSQL.noop)`. + Reverse is a no-op because the extension may be shared with other Postgres + usage and dropping it would damage unrelated state. Dev rollback is handled + by recreating the database. +2. `AddField` `embedding` on `ReportSearchVector`: + `pgvector.django.vector.VectorField(dimensions=settings.EMBEDDING_DIM, null=True)`. +3. `AddIndex` HNSW on `embedding`: `m=16`, `ef_construction=64`, + `opclasses=["vector_cosine_ops"]`, `name="pgsearch_embedding_hnsw"`. +4. `CreateModel` `EmbeddingJob` (subclass of `AnalysisJob`). +5. `CreateModel` `EmbeddingTask` (subclass of `AnalysisTask`, FK to + `EmbeddingJob`, M2M to `Report`). +6. `RunPython(create_system_user, reverse_code=RunPython.noop)`: idempotent + `User.objects.get_or_create(username=settings.EMBEDDING_SYSTEM_USERNAME, + defaults={"is_active": False, "password": "!"})`. The function is inlined + at the top of the migration file — no separate helper module — because it + is only ever called from this one place. + +Operation order matters: the `AddField` step references the `vector` type +installed by step 1, and the `CreateModel` steps reference both the `Report` +table (via M2M) and the `AUTH_USER_MODEL` (via owner FK). 
Step 6 runs last +because it needs the user table to exist (which it does at `0001_initial` of +the auth app, swappable-dependency-ordered above). + +Reverse semantics: the auto-generated reverse of steps 2–5 drops the index, +the column, and the two tables. Steps 1 and 6 use `noop` reverse — extension +stays installed; system user stays in the DB. Matches the originally-chained +behaviour exactly. ### 4.4 Model update @@ -190,7 +202,9 @@ class ReportSearchVector(models.Model): pgvector columns and HNSW indexes are bound to a fixed dimension at create time, and HNSW has a 2000-dim ceiling (so `EMBEDDING_DIM ≤ 2000`; Qwen3-Embedding-4B's native 2560 is Matryoshka-truncated client-side). Changing `EMBEDDING_DIM` after deploy requires a manual operator procedure: 1. Drop the HNSW index and the `embedding` column. -2. Re-run `0003_report_embedding` with the new `EMBEDDING_DIM`. +2. Re-run `0002_hybrid_search` with the new `EMBEDDING_DIM`. (Because the + migration is squashed, this single apply re-creates the column at the new + dim and the HNSW index, and is idempotent for the rest of the operations.) 3. From a Django shell, defer the embedding orchestrator immediately so the next nightly tick is not waited for: @@ -204,15 +218,17 @@ This is documented as a deployment-time decision and intentionally not automated ### 4.6 Startup safety check for env/migration drift Two Django system checks guard against the failure mode where -`settings.EMBEDDING_DIM` no longer matches what the on-disk migrations describe. -Without these the divergence would surface later as an opaque pgvector -dimension error on the first write or query. +`settings.EMBEDDING_DIM` no longer matches what the squashed +`0002_hybrid_search` migration describes. Without these the divergence would +surface later as an opaque pgvector dimension error on the first write or +query. -The migration-side dim is *not* stored in a hand-edited constant.
Instead it is -derived at check time from Django's `MigrationLoader` project state — which is +The migration-side dim is *not* stored in a hand-edited constant. Instead it +is derived at check time from Django's `MigrationLoader` project state — built from the migration files on disk without a database connection — so there is exactly one source of truth (the `dimensions=...` literal that -`makemigrations` itself generates from `settings.EMBEDDING_DIM`). +`makemigrations` itself generated from `settings.EMBEDDING_DIM` when +`0002_hybrid_search` was first written). ```python # radis/pgsearch/apps.py @@ -266,7 +282,7 @@ Alternatives considered and rejected: | Option | Authoritative for | DB connection | Verdict | |---|---|---|---| | Hand-edited constant (status quo before this change) | Nothing — must be manually transcribed | No | Drift-prone | -| Parse `migrations/0003_*.py` source | The literal in one specific file | No | Brittle; couples to filename | +| Parse `migrations/0002_hybrid_search.py` source | The literal in one specific file | No | Brittle; couples to filename | | `MigrationLoader` project state | The aggregated dim across all migrations | No | Chosen | | `information_schema.columns` on the live DB | The actually-deployed column dim | Yes | Loses the offline-check property | @@ -442,11 +458,12 @@ class EmbeddingTask(AnalysisTask): ``` **Owner field.** `AnalysisJob.owner` is non-nullable (`settings.AUTH_USER_MODEL`). -Embedding jobs are system-driven and have no human creator. A data migration -(`0005_system_user.py`) creates a `User(username=settings.EMBEDDING_SYSTEM_USERNAME, -is_active=False, password=unusable)` idempotently; the launcher assigns this -user as `owner` on every `EmbeddingJob`. This avoids subclass-level overrides -of `owner` and keeps the abstract contract clean. +Embedding jobs are system-driven and have no human creator. 
The squashed +`0002_hybrid_search` migration's final `RunPython` step creates +`User(username=settings.EMBEDDING_SYSTEM_USERNAME, is_active=False, +password=unusable)` idempotently; the launcher assigns this user as `owner` +on every `EmbeddingJob`. This avoids subclass-level overrides of `owner` and +keeps the abstract contract clean. **No `get_absolute_url` in v1.** Existing `ExtractionJob` and `SubscriptionJob` implement `get_absolute_url` because they have user-facing detail views. @@ -883,7 +900,7 @@ HYBRID_FTS_MAX_RESULTS = 10_000 HYBRID_RRF_K = 60 ``` -These are tuning constants. Changing them is a code change with a PR diff. This matches the project's existing pattern (`EXTRACTION_LLM_CONCURRENCY_LIMIT = 6`, the `CHAT_*_SYSTEM_PROMPT` blocks). `EMBEDDING_SYSTEM_USERNAME` names the system user that owns every auto-generated `EmbeddingJob`; the data migration creates this user idempotently. +These are tuning constants. Changing them is a code change with a PR diff. This matches the project's existing pattern (`EXTRACTION_LLM_CONCURRENCY_LIMIT = 6`, the `CHAT_*_SYSTEM_PROMPT` blocks). `EMBEDDING_SYSTEM_USERNAME` names the system user that owns every auto-generated `EmbeddingJob`; the squashed migration's `RunPython` step creates this user idempotently. ### 8.3 `example.env` @@ -1048,12 +1065,11 @@ A `run_search_eval` management command loops a set of test queries through all s ## 12. Rollout plan -1. **Schema and dependency.** Land the `pgvector` Python dep, the extension migration, and the embedding-column schema migration. No behavior change yet — `embedding` is nullable, queries still see only FTS. +1. **Schema, dependency, models, data migration.** Land the `pgvector` Python dep and the squashed `0002_hybrid_search` migration (extension + embedding column + HNSW + `EmbeddingJob`/`EmbeddingTask` tables + system user). No behaviour change yet — `embedding` is nullable, queries still see only FTS. 2. 
**Embedding client and tests.** Land the client module and unit tests. No callers yet. -3. **Orchestrator models and migrations.** Add `EmbeddingJob`, `EmbeddingTask`, their migration, and the data migration that creates the system user. -4. **Orchestrator tasks and `embeddings_worker`.** Land `embedding_launcher`, `process_embedding_job`, `process_embedding_task`, the `embeddings_worker` container (with `--concurrency 4`), and the `EMBEDDING_DRAIN_CRON` setting. The launcher starts ticking; with no rows yet, all ticks no-op. -5. **Initial drain.** From a shell, run `embedding_launcher.defer()` so the orchestrator picks up the existing corpus. This is the only "operator action" in the rollout. It runs at `EMBEDDING_INDEX_PRIORITY` and lives behind whatever other work is on the queues; it can run for hours to days on a large corpus. -6. **Provider switch.** Replace the body of `radis.pgsearch.providers.search()` and `retrieve()` with the hybrid implementation. At this point hybrid is the new default; rows still missing an embedding participate via the FTS half only. -7. **Monitor.** Watch search latency p95, embedding-queue depth, `EmbeddingJob` admin state, and the rate of "FTS-only fallback" warnings. Tune `HYBRID_VECTOR_TOP_K` / `HYBRID_FTS_MAX_RESULTS` if needed. +3. **Orchestrator tasks and `embeddings_worker`.** Land `embedding_launcher`, `process_embedding_job`, `process_embedding_task`, the `embeddings_worker` container (with `--concurrency 4`), and the `EMBEDDING_DRAIN_CRON` setting. The launcher starts ticking; with no rows yet, all ticks no-op. +4. **Initial drain.** From a shell, run `embedding_launcher.defer()` so the orchestrator picks up the existing corpus. This is the only "operator action" in the rollout. It runs at `EMBEDDING_INDEX_PRIORITY` and lives behind whatever other work is on the queues; it can run for hours to days on a large corpus. +5. 
**Provider switch.** Replace the body of `radis.pgsearch.providers.search()` and `retrieve()` with the hybrid implementation. At this point hybrid is the new default; rows still missing an embedding participate via the FTS half only. +6. **Monitor.** Watch search latency p95, embedding-queue depth, `EmbeddingJob` admin state, and the rate of "FTS-only fallback" warnings. Tune `HYBRID_VECTOR_TOP_K` / `HYBRID_FTS_MAX_RESULTS` if needed. -Each step is independently mergeable; steps 1–4 ship as quiet infrastructure changes with no user-visible effect, step 5 starts populating the column, step 6 is the moment hybrid goes live. +Each step is independently mergeable; steps 1–3 ship as quiet infrastructure changes with no user-visible effect, step 4 starts populating the column, step 5 is the moment hybrid goes live. diff --git a/radis/pgsearch/migrations/0004_embedding_job_task.py b/radis/pgsearch/migrations/0002_hybrid_search.py similarity index 81% rename from radis/pgsearch/migrations/0004_embedding_job_task.py rename to radis/pgsearch/migrations/0002_hybrid_search.py index 798af1b7..7a118d37 100644 --- a/radis/pgsearch/migrations/0004_embedding_job_task.py +++ b/radis/pgsearch/migrations/0002_hybrid_search.py @@ -1,20 +1,47 @@ -# Generated by Django 6.0.1 on 2026-05-28 19:36 - import django.db.models.deletion +import pgvector.django.indexes +import pgvector.django.vector from django.conf import settings from django.db import migrations, models +def create_system_user(apps, schema_editor): + User = apps.get_model(*settings.AUTH_USER_MODEL.split(".")) + User.objects.get_or_create( + username=settings.EMBEDDING_SYSTEM_USERNAME, + defaults={"is_active": False, "password": "!"}, + ) + + class Migration(migrations.Migration): dependencies = [ - ("pgsearch", "0003_report_embedding"), - ("procrastinate", "0041_post_retry_failed_job"), + ("pgsearch", "0001_initial"), ("reports", "0013_alter_report_options"), + ("procrastinate", "0041_post_retry_failed_job"), 
migrations.swappable_dependency(settings.AUTH_USER_MODEL), ] operations = [ + migrations.RunSQL( + sql="CREATE EXTENSION IF NOT EXISTS vector;", + reverse_sql=migrations.RunSQL.noop, + ), + migrations.AddField( + model_name="reportsearchvector", + name="embedding", + field=pgvector.django.vector.VectorField(dimensions=1024, null=True), + ), + migrations.AddIndex( + model_name="reportsearchvector", + index=pgvector.django.indexes.HnswIndex( + ef_construction=64, + fields=["embedding"], + m=16, + name="pgsearch_embedding_hnsw", + opclasses=["vector_cosine_ops"], + ), + ), migrations.CreateModel( name="EmbeddingJob", fields=[ @@ -135,4 +162,5 @@ class Migration(migrations.Migration): "abstract": False, }, ), + migrations.RunPython(create_system_user, reverse_code=migrations.RunPython.noop), ] diff --git a/radis/pgsearch/migrations/0002_pgvector_extension.py b/radis/pgsearch/migrations/0002_pgvector_extension.py deleted file mode 100644 index c862dbce..00000000 --- a/radis/pgsearch/migrations/0002_pgvector_extension.py +++ /dev/null @@ -1,11 +0,0 @@ -from django.db import migrations - - -class Migration(migrations.Migration): - dependencies = [("pgsearch", "0001_initial")] - operations = [ - migrations.RunSQL( - sql="CREATE EXTENSION IF NOT EXISTS vector;", - reverse_sql=migrations.RunSQL.noop, - ), - ] diff --git a/radis/pgsearch/migrations/0003_report_embedding.py b/radis/pgsearch/migrations/0003_report_embedding.py deleted file mode 100644 index b014e102..00000000 --- a/radis/pgsearch/migrations/0003_report_embedding.py +++ /dev/null @@ -1,25 +0,0 @@ -# Generated by Django 6.0.1 on 2026-05-15 18:19 - -import pgvector.django.indexes -import pgvector.django.vector -from django.db import migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ('pgsearch', '0002_pgvector_extension'), - ('reports', '0013_alter_report_options'), - ] - - operations = [ - migrations.AddField( - model_name='reportsearchvector', - name='embedding', - 
field=pgvector.django.vector.VectorField(dimensions=1024, null=True), - ), - migrations.AddIndex( - model_name='reportsearchvector', - index=pgvector.django.indexes.HnswIndex(ef_construction=64, fields=['embedding'], m=16, name='pgsearch_embedding_hnsw', opclasses=['vector_cosine_ops']), - ), - ] diff --git a/radis/pgsearch/migrations/0005_system_user.py b/radis/pgsearch/migrations/0005_system_user.py deleted file mode 100644 index ef61ee42..00000000 --- a/radis/pgsearch/migrations/0005_system_user.py +++ /dev/null @@ -1,17 +0,0 @@ -from django.conf import settings -from django.db import migrations - -from radis.pgsearch.migrations._system_user_helper import create_system_user_idempotent - - -def forwards(apps, schema_editor): - User = apps.get_model(*settings.AUTH_USER_MODEL.split(".")) - create_system_user_idempotent(User) - - -class Migration(migrations.Migration): - dependencies = [ - ("pgsearch", "0004_embedding_job_task"), - migrations.swappable_dependency(settings.AUTH_USER_MODEL), - ] - operations = [migrations.RunPython(forwards, reverse_code=migrations.RunPython.noop)] diff --git a/radis/pgsearch/migrations/_system_user_helper.py b/radis/pgsearch/migrations/_system_user_helper.py deleted file mode 100644 index 5e0be0d2..00000000 --- a/radis/pgsearch/migrations/_system_user_helper.py +++ /dev/null @@ -1,9 +0,0 @@ -from django.conf import settings - - -def create_system_user_idempotent(user_model) -> None: - username = settings.EMBEDDING_SYSTEM_USERNAME - user, created = user_model.objects.get_or_create( - username=username, - defaults={"is_active": False, "password": "!"}, - ) diff --git a/radis/pgsearch/tests/test_migrations_system_user.py b/radis/pgsearch/tests/test_migrations_system_user.py index 0a7c7202..ca277361 100644 --- a/radis/pgsearch/tests/test_migrations_system_user.py +++ b/radis/pgsearch/tests/test_migrations_system_user.py @@ -9,13 +9,3 @@ def test_system_user_exists_after_migrations(): user = User.objects.get(username="system") assert 
user.is_active is False assert not user.has_usable_password() - - -@pytest.mark.django_db -def test_creating_system_user_twice_is_a_noop(): - from radis.pgsearch.migrations import _system_user_helper - - before = User.objects.filter(username="system").count() - _system_user_helper.create_system_user_idempotent(User) - after = User.objects.filter(username="system").count() - assert before == after == 1 From 8fa1b5ad85fb68f79f0b887bd2b16115c7c946f6 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Wed, 3 Jun 2026 08:56:45 +0000 Subject: [PATCH 61/68] refactor(pgsearch): demote EMBEDDING_DRAIN_CRON from env to code constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The drain schedule is a code-review-worthy tuning knob, not a per-deployment operator setting. Move it from env.str() to a plain assignment in the §8.2 tuning-constants block, drop it from example.env, and update spec §8.1/§8.2/ §8.3/§12 plus the presentation Configuration slide. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/superpowers/specs/2026-05-28-hybrid-search.md | 8 ++++---- example.env | 4 ---- radis/settings/base.py | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/docs/superpowers/specs/2026-05-28-hybrid-search.md b/docs/superpowers/specs/2026-05-28-hybrid-search.md index 9a2b06db..92282e9d 100644 --- a/docs/superpowers/specs/2026-05-28-hybrid-search.md +++ b/docs/superpowers/specs/2026-05-28-hybrid-search.md @@ -876,14 +876,14 @@ EMBEDDING_PROVIDER_PATH = env.str("EMBEDDING_PROVIDER_PATH", default="") # EMBEDDING_PROVIDER_API_KEY = env.str("EMBEDDING_PROVIDER_API_KEY", default="") EMBEDDING_MODEL_NAME = env.str("EMBEDDING_MODEL_NAME", default="Qwen/Qwen3-Embedding-4B") EMBEDDING_DIM = env.int("EMBEDDING_DIM", default=1024) -EMBEDDING_DRAIN_CRON = env.str("EMBEDDING_DRAIN_CRON", default="0 2 * * *") ``` -These vary across dev/staging/prod and are operator-controlled. 
`EMBEDDING_DIM` is intentionally an env decision because it is schema-coupled (see §4.5). `EMBEDDING_DRAIN_CRON` is env-tunable so dev environments can drain more frequently (e.g., `*/15 * * * *`) without a code change. +These vary across dev/staging/prod and are operator-controlled. `EMBEDDING_DIM` is intentionally an env decision because it is schema-coupled (see §4.5). ### 8.2 Code constants (tuning knobs, in `base.py`) ```python +EMBEDDING_DRAIN_CRON = "0 2 * * *" # nightly; bump in code for faster dev drain EMBEDDING_REQUEST_TIMEOUT = 30 # seconds EMBEDDING_MAX_INPUT_CHARS = 60_000 EMBEDDING_QUERY_INSTRUCTION = ( @@ -904,7 +904,7 @@ These are tuning constants. Changing them is a code change with a PR diff. This ### 8.3 `example.env` -Adds a documented Ollama block and a Qwen/OpenAI-compatible block side by side, keyed off `EMBEDDING_BACKEND`. Documents `EMBEDDING_DRAIN_CRON` with the production default (`0 2 * * *`) and a dev-friendly alternative (`*/15 * * * *`). +Adds a documented Ollama block and a Qwen/OpenAI-compatible block side by side, keyed off `EMBEDDING_BACKEND`. ### 8.4 Compose @@ -1067,7 +1067,7 @@ A `run_search_eval` management command loops a set of test queries through all s 1. **Schema, dependency, models, data migration.** Land the `pgvector` Python dep and the squashed `0002_hybrid_search` migration (extension + embedding column + HNSW + `EmbeddingJob`/`EmbeddingTask` tables + system user). No behaviour change yet — `embedding` is nullable, queries still see only FTS. 2. **Embedding client and tests.** Land the client module and unit tests. No callers yet. -3. **Orchestrator tasks and `embeddings_worker`.** Land `embedding_launcher`, `process_embedding_job`, `process_embedding_task`, the `embeddings_worker` container (with `--concurrency 4`), and the `EMBEDDING_DRAIN_CRON` setting. The launcher starts ticking; with no rows yet, all ticks no-op. +3. 
**Orchestrator tasks and `embeddings_worker`.** Land `embedding_launcher`, `process_embedding_job`, `process_embedding_task`, and the `embeddings_worker` container (with `--concurrency 4`). The launcher starts ticking on its compile-time `EMBEDDING_DRAIN_CRON` schedule; with no rows yet, all ticks no-op. 4. **Initial drain.** From a shell, run `embedding_launcher.defer()` so the orchestrator picks up the existing corpus. This is the only "operator action" in the rollout. It runs at `EMBEDDING_INDEX_PRIORITY` and lives behind whatever other work is on the queues; it can run for hours to days on a large corpus. 5. **Provider switch.** Replace the body of `radis.pgsearch.providers.search()` and `retrieve()` with the hybrid implementation. At this point hybrid is the new default; rows still missing an embedding participate via the FTS half only. 6. **Monitor.** Watch search latency p95, embedding-queue depth, `EmbeddingJob` admin state, and the rate of "FTS-only fallback" warnings. Tune `HYBRID_VECTOR_TOP_K` / `HYBRID_FTS_MAX_RESULTS` if needed. diff --git a/example.env b/example.env index 685e550f..bf2ec8e3 100644 --- a/example.env +++ b/example.env @@ -162,10 +162,6 @@ EMBEDDING_MODEL_NAME=Qwen/Qwen3-Embedding-4B # the embedding column, re-migrating, and running `./manage.py backfill_embeddings`. EMBEDDING_DIM=1024 -# Cron expression for the embedding orchestrator. Default nightly at 02:00. -# Use "*/15 * * * *" for more aggressive dev draining. 
-EMBEDDING_DRAIN_CRON=0 2 * * * - # Development with local Ollama: # EMBEDDING_BACKEND=ollama # EMBEDDING_PROVIDER_URL=http://host.docker.internal:11434 diff --git a/radis/settings/base.py b/radis/settings/base.py index 68234dcd..54333076 100644 --- a/radis/settings/base.py +++ b/radis/settings/base.py @@ -345,9 +345,9 @@ EMBEDDING_PROVIDER_API_KEY = env.str("EMBEDDING_PROVIDER_API_KEY", default="") EMBEDDING_MODEL_NAME = env.str("EMBEDDING_MODEL_NAME", default="Qwen/Qwen3-Embedding-4B") EMBEDDING_DIM = env.int("EMBEDDING_DIM", default=1024) -EMBEDDING_DRAIN_CRON = env.str("EMBEDDING_DRAIN_CRON", default="0 2 * * *") # Embedding tuning constants (see hybrid-search spec §8.2) +EMBEDDING_DRAIN_CRON = "0 2 * * *" EMBEDDING_REQUEST_TIMEOUT = 30 EMBEDDING_MAX_INPUT_CHARS = 60_000 EMBEDDING_QUERY_INSTRUCTION = ( From b0e24efe5889b170d6a988b95fde0dc7e35f49a6 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Fri, 19 Jun 2026 07:45:46 +0000 Subject: [PATCH 62/68] refactor(pgsearch): all-deferred embedding via embed_reports_task; drop orchestrator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the periodic Job/Task orchestrator with a single async `embed_reports_task` Procrastinate task on the dedicated `embeddings` queue. Both single-create / PUT and bulk-upsert paths enqueue the task inside `transaction.on_commit`; the write path no longer touches the embedding service. `PGSEARCH_SYNC_INDEXING` is retained — bulk-upsert keeps its two FTS modes (sync inline vs deferred via `bulk_index_reports`) and both modes chain into `embed_reports_task` so embedding always follows FTS. This intentionally does not pull in the feat/adrf-views rewrite: the enqueue is a synchronous Procrastinate API call so the views remain plain sync DRF (`ReportViewSet` unchanged in shape from main). 
- tasks.py: add `embed_reports_task` (async, queue="embeddings") with internal batching by EMBEDDING_BATCH_SIZE; let EmbeddingClientError propagate so Procrastinate retries apply. `bulk_index_reports` chains `embed_reports_task.defer` at the end of its run. Drop the orchestrator (`process_embedding_job` / `process_embedding_task` / `embedding_launcher`). - models.py: drop EmbeddingJob and EmbeddingTask (only ReportSearchVector remains). - migrations/0002_hybrid_search.py: slim to schema-only (extension + embedding column + HNSW index); drop the Job/Task tables and the system-user RunPython data migration. - utils/embedding_client.py: add `AsyncEmbeddingClient` sibling for the async worker task; share `_resolve_config` and `_normalize_response` helpers with the sync client used by the query path. - viewsets.py: ReportViewSet.perform_create / perform_update wire embed_reports_task.defer into on_commit; the bulk_upsert action's on_commit defers it in the PGSEARCH_SYNC_INDEXING=True path (the deferred FTS path chains it inside bulk_index_reports). - settings/base.py: drop EMBEDDING_DRAIN_CRON, EMBEDDING_INDEX_PRIORITY, EMBEDDING_SYSTEM_USERNAME (orchestrator-only constants). - settings/test.py: keep EMBEDDING_PROVIDER_URL="" override; comment refreshed. - embed_pending management command: enqueue embed_reports_task per batch. - tests: replace orchestrator-era tests (test_embedding_launcher / test_process_embedding_* / test_migrations_system_user / test_models_embedding) with test_embed_reports_task (empty input, no-RSV no-op, internal batching, error propagation, bulk_index_reports → embed_reports_task chain) and test_embed_pending_command (asserts defer() calls). - spec: rewrite §3 / §6 for the all-deferred + sync DRF architecture; retain PGSEARCH_SYNC_INDEXING documentation; remove inline_embedding / ADRF references. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../specs/2026-05-28-hybrid-search.md | 585 +++++++----------- .../management/commands/embed_pending.py | 78 +++ .../pgsearch/migrations/0002_hybrid_search.py | 135 +--- radis/pgsearch/models.py | 47 -- radis/pgsearch/tasks.py | 150 ++--- .../tests/test_embed_pending_command.py | 48 ++ .../pgsearch/tests/test_embed_reports_task.py | 86 +++ .../pgsearch/tests/test_embedding_launcher.py | 50 -- .../tests/test_migrations_system_user.py | 11 - radis/pgsearch/tests/test_models_embedding.py | 31 - .../tests/test_process_embedding_job.py | 78 --- .../tests/test_process_embedding_task.py | 71 --- radis/pgsearch/utils/embedding_client.py | 181 ++++-- radis/reports/api/viewsets.py | 8 +- radis/settings/base.py | 5 - radis/settings/test.py | 7 + 16 files changed, 628 insertions(+), 943 deletions(-) create mode 100644 radis/pgsearch/management/commands/embed_pending.py create mode 100644 radis/pgsearch/tests/test_embed_pending_command.py create mode 100644 radis/pgsearch/tests/test_embed_reports_task.py delete mode 100644 radis/pgsearch/tests/test_embedding_launcher.py delete mode 100644 radis/pgsearch/tests/test_migrations_system_user.py delete mode 100644 radis/pgsearch/tests/test_models_embedding.py delete mode 100644 radis/pgsearch/tests/test_process_embedding_job.py delete mode 100644 radis/pgsearch/tests/test_process_embedding_task.py diff --git a/docs/superpowers/specs/2026-05-28-hybrid-search.md b/docs/superpowers/specs/2026-05-28-hybrid-search.md index 92282e9d..490ee8d0 100644 --- a/docs/superpowers/specs/2026-05-28-hybrid-search.md +++ b/docs/superpowers/specs/2026-05-28-hybrid-search.md @@ -33,7 +33,7 @@ The public `SearchProvider` API (`radis.search.site`) is unchanged. Callers — - No per-query UI toggle for semantic vs. lexical. Hybrid is the new default. - No Vespa, Elasticsearch, or OpenSearch adapter. - No solution for negation/polarity (§11 documents this as known future work). 
-- No automated re-embedding when `EMBEDDING_DIM` changes. That is a manual operator procedure: drop column, re-migrate, defer the embedding orchestrator (see §4.5). +- No automated re-embedding when `EMBEDDING_DIM` changes. That is a manual operator procedure: drop column, re-migrate, run `./manage.py embed_pending` (see §4.5). - No on-disk vector quantization. Float32 storage from day one; revisit if RAM pressure appears. ## 3. Architecture @@ -62,52 +62,54 @@ The public `SearchProvider` API (`radis.search.site`) is unchanged. Callers — └──────────────────────────────────────────────────────────────────────┘ ┌──────────────────────────────────────────────────────────────────────┐ -│ Async indexing path (Job/Task orchestrator, periodic-driven) │ +│ Async indexing path (deferred via Procrastinate) │ │ │ -│ cron (settings.EMBEDDING_DRAIN_CRON, default nightly 02:00) │ +│ Report view (single-create / PUT / bulk-upsert) │ │ │ │ -│ ▼ │ -│ embedding_launcher() — `default` queue │ -│ ├─ queueing_lock="embedding_launcher" │ -│ ├─ skip if any EmbeddingJob in PREPARING/PENDING/IN_PROGRESS │ -│ ├─ skip if no rows with embedding IS NULL │ -│ └─ EmbeddingJob.objects.create(...) 
→ job.delay() │ -│ │ -│ process_embedding_job(job_id) — `default` queue │ -│ ├─ iterate ReportSearchVector with embedding IS NULL │ -│ ├─ chunk by EMBEDDING_BATCH_SIZE → EmbeddingTask rows │ -│ ├─ task.reports.set(chunk); task.delay() (no HTTP work) │ -│ └─ job.status = PENDING; return │ +│ ▼ transaction.atomic() block │ +│ ReportSerializer / bulk_upsert_reports │ +│ ├─ DB write (Report rows) │ +│ ├─ FTS path creates ReportSearchVector(embedding=NULL): │ +│ │ post_save signal (single) or │ +│ │ sync `bulk_upsert_report_search_vectors` (bulk, when │ +│ │ PGSEARCH_SYNC_INDEXING=True) or │ +│ │ deferred `bulk_index_reports` (bulk, default; chains into │ +│ │ embed_reports_task at its end — see §6.6) │ +│ └─ transaction.on_commit registers: │ +│ embed_reports_task.defer(report_ids=touched_pks) │ +│ (sync FTS paths only — the deferred FTS task chains the │ +│ embed enqueue itself; see §6.6) │ +│ │ │ +│ ▼ HTTP response returned (201 / 200) immediately │ │ │ -│ process_embedding_task(task_id) — `embeddings` queue │ -│ ├─ EmbeddingClient.embed_documents([r.body for r in task.reports])│ -│ ├─ L2-normalize; bulk_update ReportSearchVector.embedding │ -│ ├─ task.status = SUCCESS/FAILURE; clear queued_job_id │ -│ └─ job.update_job_state() │ +│ ──── elsewhere, on the embeddings_worker process ──── │ │ │ -│ Operator-triggered drain: from a Django shell run │ -│ `embedding_launcher.defer()` — same code path as periodic. 
│ +│ embed_reports_task(report_ids) (async task, embeddings queue) │ +│ ├─ load RSVs (database_sync_to_async) │ +│ ├─ await AsyncEmbeddingClient.embed_documents([body, ...]) │ +│ ├─ L2-normalize; ReportSearchVector.objects.bulk_update │ +│ └─ on EmbeddingClientError: raise │ +│ → Procrastinate retry policy (exp backoff, N attempts) │ └──────────────────────────────────────────────────────────────────────┘ ``` -The bulk-upsert API path (`reports/api/viewsets.py:_bulk_upsert_reports`) -already creates `ReportSearchVector` rows with `embedding=NULL` via the FTS -indexing call in its `on_commit` block. The single-create API path goes through -the standard `Report.save()` and the FTS `post_save` signal, which likewise -creates the `ReportSearchVector` row with NULL embedding. Both ingest paths -deposit work into the same DB-resident pending pool; the orchestrator drains it -on the next periodic tick (or on an operator-triggered defer). There is no -per-API-call embedding job. +Both ingest paths — single-create (`POST /api/reports/`, `PUT /api/reports/{id}/?upsert=true`) and bulk-upsert (`POST /api/reports/bulk-upsert/`) — enqueue an async Procrastinate task on the dedicated `embeddings` queue. The write path returns immediately after the transaction commits; the embedding service is touched only by the worker. This: + +- **Decouples write-path uptime from the embedding service.** API responses succeed even when the embedding endpoint is down or slow. +- **Bounds concurrent load on the embedding service** via the worker's `--concurrency K` — explicit, configurable backpressure rather than implicit request-driven concurrency. +- **Auto-recovers from transient outages** via Procrastinate's retry policy with exponential backoff. +- **Symmetric across single-create and bulk-upsert** — one enqueue site, one task, one worker. 
**Components added inside `radis.pgsearch`:** | File | Purpose | |---|---| -| `utils/embedding_client.py` | Sync + async HTTP clients with pluggable backends (`openai`, `ollama`) | -| `migrations/0002_hybrid_search.py` | Single squashed migration: `CREATE EXTENSION vector`; adds `embedding vector(N)` column + HNSW index; creates `EmbeddingJob`/`EmbeddingTask` tables + M2M to `Report`; idempotent `RunPython` for the system user | -| `models.py` (modified) | Adds `embedding` field + `HnswIndex`; defines `EmbeddingJob` and `EmbeddingTask` inheriting `AnalysisJob`/`AnalysisTask` | +| `utils/embedding_client.py` | `EmbeddingClient` (sync, used by the query path) + `AsyncEmbeddingClient` (async, used by `embed_reports_task` on the worker); pluggable backends (`openai`, `ollama`) | +| `tasks.py` (embedding entries) | `embed_reports_task(report_ids)` async Procrastinate task on the `embeddings` queue. Looks up RSVs via `database_sync_to_async`, calls `AsyncEmbeddingClient.embed_documents`, bulk-updates the column. Raises on `EmbeddingClientError` so the Procrastinate retry policy applies. | +| `migrations/0002_hybrid_search.py` | Single schema migration: `CREATE EXTENSION vector`; adds `embedding vector(N)` column + HNSW index | +| `models.py` (modified) | Adds `embedding` field + `HnswIndex` to `ReportSearchVector`. No Job/Task models. | | `signals.py` (unchanged from FTS-only) | The FTS `create_or_update_report_search_vector` receiver stays; **no embedding signal** | -| `tasks.py` (modified) | Adds `embedding_launcher` (periodic), `process_embedding_job` (`default` queue), `process_embedding_task` (`embeddings` queue) | +| `tasks.py` (FTS bits) | FTS bulk-indexing helper `bulk_upsert_report_search_vectors`. The existing `bulk_index_reports` Procrastinate task and `enqueue_bulk_index_reports` helper are retained; `bulk_index_reports` is extended to defer `embed_reports_task` at the end of its run so embedding always follows FTS in either mode (see §6.6). 
| | `providers.py` (modified) | Replaces `search()` and `retrieve()` bodies with hybrid logic | | `tests/...` | Coverage per §10 | @@ -117,9 +119,9 @@ per-API-call embedding job. |---|---| | `pyproject.toml` | Add `pgvector>=0.3` dependency | | `radis/settings/base.py` | New env-driven + constant settings (§8) | +| `radis/settings/test.py` | Override `EMBEDDING_PROVIDER_URL=""` so any incidental construction of `EmbeddingClient` / `AsyncEmbeddingClient` fast-fails into `EmbeddingClientError` in CI (no live embedding service). Tests that exercise embedding patch the client explicitly. | | `example.env` | Document `EMBEDDING_*` env vars for openai and ollama backends | -| `docker-compose.base.yml` | Add `embeddings_worker` service + `EMBEDDING_*` env vars | -| `docker-compose.dev.yml` / `.prod.yml` | `embeddings_worker.command` running `bg_worker -q embeddings` | +| `radis/reports/api/viewsets.py` | `ReportViewSet.perform_create` / `perform_update` / `bulk_upsert` register `embed_reports_task.defer(report_ids=...)` inside their `transaction.on_commit` callbacks. Sync DRF; no async machinery needed because the enqueue is a synchronous Procrastinate API call. | ## 4. Schema and migrations @@ -131,13 +133,11 @@ Add to `pyproject.toml`: "pgvector>=0.3", ``` -### 4.2 Squashed migration +### 4.2 Schema migration -The entire hybrid-search schema and the system-user data migration land in a -single file, `radis/pgsearch/migrations/0002_hybrid_search.py`, depending on -`pgsearch.0001_initial`, `reports.0013_alter_report_options`, -`procrastinate.0041_post_retry_failed_job`, and `AUTH_USER_MODEL`. Operations -in order: +Schema lives in a single file `radis/pgsearch/migrations/0002_hybrid_search.py`, +depending on `pgsearch.0001_initial` and `reports.0013_alter_report_options`. +Three operations: 1. `RunSQL("CREATE EXTENSION IF NOT EXISTS vector;", reverse_sql=RunSQL.noop)`. 
Reverse is a no-op because the extension may be shared with other Postgres @@ -147,25 +147,10 @@ in order: `pgvector.django.vector.VectorField(dimensions=settings.EMBEDDING_DIM, null=True)`. 3. `AddIndex` HNSW on `embedding`: `m=16`, `ef_construction=64`, `opclasses=["vector_cosine_ops"]`, `name="pgsearch_embedding_hnsw"`. -4. `CreateModel` `EmbeddingJob` (subclass of `AnalysisJob`). -5. `CreateModel` `EmbeddingTask` (subclass of `AnalysisTask`, FK to - `EmbeddingJob`, M2M to `Report`). -6. `RunPython(create_system_user, reverse_code=RunPython.noop)`: idempotent - `User.objects.get_or_create(username=settings.EMBEDDING_SYSTEM_USERNAME, - defaults={"is_active": False, "password": "!"})`. The function is inlined - at the top of the migration file — no separate helper module — because it - is only ever called from this one place. - -Operation order matters: the `AddField` step references the `vector` type -installed by step 1, and the `CreateModel` steps reference both the `Report` -table (via M2M) and the `AUTH_USER_MODEL` (via owner FK). Step 6 runs last -because it needs the user table to exist (which it does at `0001_initial` of -the auth app, swappable-dependency-ordered above). - -Reverse semantics: the auto-generated reverse of steps 2–5 drops the index, -the column, and the two tables. Steps 1 and 6 use `noop` reverse — extension -stays installed; system user stays in the DB. Matches the originally-chained -behaviour exactly. + +The all-deferred embedding architecture (§6) has no orchestrator tables or +system user, so this migration carries only schema. Reverse drops the index +and column. ### 4.4 Model update @@ -193,25 +178,22 @@ class ReportSearchVector(models.Model): ] ``` -`embedding` is nullable: the row exists from the moment a `Report` is created (FTS path), but its embedding is filled asynchronously by `process_embedding_task` (§6.7). A NULL embedding is treated as "not embedded yet" at query time, and the row participates via the FTS half only. 
+`embedding` is nullable: the row exists from the moment a `Report` is created (FTS path), but its embedding is filled by the `embed_reports_task` Procrastinate worker, enqueued from `transaction.on_commit` (§6). A NULL embedding is treated as "not embedded yet" at query time, and the row participates via the FTS half only. -`save()` on `ReportSearchVector` retains its current behavior of recomputing `search_vector` from `report.body`. The embedding column is written **only** by `process_embedding_task` via `bulk_update()`, never by `save()`, to avoid triggering the FTS signal recursively and to keep the two indexing paths independent. +`save()` on `ReportSearchVector` retains its current behavior of recomputing `search_vector` from `report.body`. The embedding column is written **only** by `embed_reports_task` via `bulk_update()`, never by `save()`, to avoid triggering the FTS signal recursively and to keep the two indexing paths independent. ### 4.5 Operational note on `EMBEDDING_DIM` pgvector columns and HNSW indexes are bound to a fixed dimension at create time, and HNSW has a 2000-dim ceiling (so `EMBEDDING_DIM ≤ 2000`; Qwen3-Embedding-4B's native 2560 is Matryoshka-truncated client-side). Changing `EMBEDDING_DIM` after deploy requires a manual operator procedure: 1. Drop the HNSW index and the `embedding` column. -2. Re-run `0002_hybrid_search` with the new `EMBEDDING_DIM`. (Because the - migration is squashed, this single apply re-creates the column at the new - dim, the HNSW index, and is idempotent for the rest of the operations.) -3. From a Django shell, defer the embedding orchestrator immediately so the - next nightly tick is not waited for: - - ```python - from radis.pgsearch.tasks import embedding_launcher - embedding_launcher.defer() - ``` +2. Re-run `0002_hybrid_search` with the new `EMBEDDING_DIM`. This re-creates + the column at the new dim plus the HNSW index. +3. 
Run `./manage.py embed_pending` to enqueue an `embed_reports_task` for + every row that's now NULL. The command is idempotent and resumable; the + embeddings worker drains the queue at its configured `--concurrency`. + See §6.5. +4. From here on, new writes enqueue tasks against the new dim automatically. This is documented as a deployment-time decision and intentionally not automated. @@ -301,8 +283,8 @@ check stays correct without any code change to `apps.py`. - `class OllamaBackend(EmbeddingBackend)` — default path `/api/embed`, body `{model, input: [...]}`, response `{embeddings: [[...]]}`. - `BACKENDS: dict[str, EmbeddingBackend] = {"openai": OpenAIBackend(), "ollama": OllamaBackend()}`. - `class EmbeddingClientError(Exception)`. -- `class EmbeddingClient` — sync client used by `process_embedding_task` and the query path. -- `class AsyncEmbeddingClient` — async variant, kept for parity with `chats/utils/chat_client.py` and so the query path can call it from ASGI views without `async_to_sync` later. +- `class EmbeddingClient` — sync client used by the query path (`providers.search` / `providers.retrieve`). +- `class AsyncEmbeddingClient` — async sibling of `EmbeddingClient`, used by the `embed_reports_task` worker task (§6.2). Same backend protocol; differs only in using `httpx.AsyncClient` + an `async with` lifecycle. The async surface lets a single embeddings worker run K embedding HTTP calls concurrently via asyncio at low memory overhead. ### 5.2 Interface @@ -340,7 +322,7 @@ class EmbeddingClient: - **Truncation:** any text longer than `EMBEDDING_MAX_INPUT_CHARS` is truncated at the character limit before being sent. A WARNING is logged with the report id (when known) and char count. Qwen3-Embedding-4B supports up to 32k tokens, so truncation will be rare for radiology bodies but is bounded as a defense against pathological inputs. - **Normalization:** every returned vector is L2-normalized client-side, unconditionally. 
With unit vectors, cosine distance is monotonic in dot product, which makes the HNSW `vector_cosine_ops` operator effectively a fast inner-product search. Whether the upstream server normalizes is irrelevant. - **Dimension validation:** every vector is checked to have length `EMBEDDING_DIM`. A mismatch raises `EmbeddingClientError`. -- **Batching:** `embed_documents` sends a single HTTP call per invocation. The higher-level orchestrator (`process_embedding_job`) groups reports into `EmbeddingTask` batches of `EMBEDDING_BATCH_SIZE` before dispatching them to `process_embedding_task`. +- **Batching:** `embed_documents` sends a single HTTP call per invocation. The write path enqueues an `embed_reports_task` per ingest event (one task per single-create, one task per bulk-upsert); each task in turn splits the reports it owns into chunks of `EMBEDDING_BATCH_SIZE` and issues one batched embedding HTTP call per chunk. The same `EMBEDDING_BATCH_SIZE` constant is used by `embed_pending` to chunk large drains into tasks of reasonable size. - **Errors:** non-2xx, timeout, malformed JSON, missing key, or wrong dim all raise `EmbeddingClientError`. The client never falls back internally — fallback policy is owned by the caller. - **Dev recipe (Ollama):** ```bash @@ -351,304 +333,200 @@ class EmbeddingClient: EMBEDDING_MODEL_NAME=dengcao/Qwen3-Embedding-4B:Q5_K_M EMBEDDING_DIM=2560 ``` - GGUF-quantized embedding models produce slightly different vectors than the bf16 reference, so dev embeddings are not interchangeable with prod embeddings. After swapping the model between dev/prod, defer the embedding orchestrator from a Django shell (see §4.5). + GGUF-quantized embedding models produce slightly different vectors than the bf16 reference, so dev embeddings are not interchangeable with prod embeddings. After swapping the model between dev/prod, clear the column (`ReportSearchVector.objects.update(embedding=None)`) and run `./manage.py embed_pending`. -## 6. Async indexing (Job/Task orchestrator) +## 6. 
Async indexing (deferred via Procrastinate) -The embedding lifecycle uses the same Job/Task pattern as `ExtractionJob` / -`ExtractionTask` (`radis/extractions/tasks.py:32`) and `SubscriptionJob` / -`SubscriptionTask` (`radis/subscriptions/tasks.py:33`). A periodic launcher -creates one `EmbeddingJob` per drain run; the orchestrator splits pending -reports into `EmbeddingTask` batches; each task is processed on the -`embeddings` queue. +Every successful report write enqueues an async Procrastinate task that embeds the report(s) on a dedicated worker queue. The write path is decoupled from the embedding service's uptime, transient outages auto-recover via Procrastinate's retry policy, and load on the embedding service is bounded by worker concurrency rather than request concurrency. -### 6.1 Ingest paths and the pending pool +### 6.1 The enqueue at write time -RADIS has two ingest paths and the orchestrator is decoupled from both. Every -ingest path eventually deposits a `ReportSearchVector` row with -`embedding=NULL`; the launcher consumes that pool on its cron schedule. +Both ingest paths register a `transaction.on_commit` callback that defers an `embed_reports_task` once the FTS rows exist: -- **Single-create** (`POST /api/reports/`) routes through `Report.objects.create` - in the serializer (`radis/reports/api/serializers.py:87`). The FTS - `post_save` receiver creates the `ReportSearchVector` row with NULL embedding. -- **Bulk-upsert** (`POST /api/reports/bulk-upsert`) routes through - `Report.objects.bulk_create` / `bulk_update` - (`radis/reports/api/viewsets.py:_bulk_upsert_reports`). The bulk path calls - `enqueue_bulk_index_reports(touched_ids)` in its `on_commit` block, which - bulk-creates the `ReportSearchVector` rows with NULL embedding. +```python +# single-create (POST) / PUT — inside the view's on_commit +def on_commit(): + # ... existing reports_created_handlers / reports_updated_handlers calls ... 
+ embed_reports_task.defer(report_ids=[report.pk]) +``` -Accepting a freshness window of hours / next-cycle is the price of batched, -throughput-friendly embedding runs. This design serves all three operational -scenarios with one mechanism: +The single-create / PUT path always has FTS done by the time `on_commit` fires because the FTS `post_save` signal on `Report` runs sync inline during `serializer.save()`. The bulk-upsert path keeps its existing two FTS modes governed by `PGSEARCH_SYNC_INDEXING`; both modes chain into `embed_reports_task` (see §6.6): -| Scenario | What happens | -|---|---| -| **Initial bulk upload** (millions of reports via `/bulk-upsert`) | `ReportSearchVector` rows created with `embedding=NULL`. Operator defers the launcher immediately or waits for the next cron tick. One `EmbeddingJob` produces N `EmbeddingTask` batches. | -| **Daily ad-hoc upload** | Reports land NULL via either ingest path. Next periodic tick consolidates the day's pending pool into a single `EmbeddingJob`. | -| **Model-change backfill** | Operator follows §4.5 (drop column, re-migrate), then defers the launcher from a shell. Same code path as the periodic. | +```python +# bulk-upsert — inside bulk_upsert_reports' on_commit +def on_commit(): + # ... existing reports_created_handlers / reports_updated_handlers calls ... + if touched_report_ids: + if settings.PGSEARCH_SYNC_INDEXING: + bulk_upsert_report_search_vectors(touched_report_ids) # FTS sync + embed_reports_task.defer(report_ids=touched_report_ids) + else: + # bulk_index_reports chains embed_reports_task at its end (see §6.6) + enqueue_bulk_index_reports(touched_report_ids) +``` -### 6.2 Queue and worker +When the `transaction.atomic()` block commits: -The `embeddings` Procrastinate queue is served by the `embeddings_worker` -container. 
The orchestrator (`process_embedding_job`) runs on the `default` -queue alongside `process_extraction_job` and `process_subscription_job`; the -sub-tasks (`process_embedding_task`) run on `embeddings`. +1. Report rows are durable. +2. RSV rows exist (or will exist once `bulk_index_reports` runs, in the deferred FTS mode). +3. A row is inserted into `procrastinate_jobs` describing the embedding work (immediately in the sync FTS mode, or at the tail of `bulk_index_reports` in the deferred mode). -``` -./manage.py bg_worker -l debug -q embeddings --autoreload --concurrency 4 # dev -./manage.py bg_worker -l info -q embeddings --concurrency 4 # prod -``` +The HTTP response returns at that point. The view handler does **not** await embedding. -`embeddings_worker` concurrency tunes parallelism against the embedding -endpoint. Recommended 4; raise if the endpoint has spare throughput, lower if -it rate-limits. The orchestrator does not run on this queue, so there is no -self-deadlock condition tied to concurrency on the `embeddings` queue. +### 6.2 The task -### 6.3 Priorities +`radis/pgsearch/tasks.py`: -Procrastinate priority is "higher = sooner". Embedding work runs at lower -priority than extraction and subscription so it never starves user-driven LLM -operations. The orchestrator (`default` queue) and sub-tasks (`embeddings` -queue) share `EMBEDDING_INDEX_PRIORITY`; there is no separate backfill -priority because the backfill path is the same orchestrator. +```python +@app.task(queue="embeddings") +async def embed_reports_task(report_ids: list[int]) -> None: + """Embed the named reports. Raises on EmbeddingClientError so + Procrastinate's retry policy applies. + + Reports are sent to the embedding service in batches of + `EMBEDDING_BATCH_SIZE` to bound per-call payload size and per-call + GPU-side latency regardless of how many report_ids the caller passed. 
+ """ + if not report_ids: + return -| Task | Priority | -|---|---| -| `EXTRACTION_DEFAULT_PRIORITY` (existing) | 2 | -| `EXTRACTION_URGENT_PRIORITY` (existing) | 3 | -| `SUBSCRIPTION_DEFAULT_PRIORITY` (existing) | 3 | -| `SUBSCRIPTION_URGENT_PRIORITY` (existing) | 4 | -| `EMBEDDING_INDEX_PRIORITY` (new) | 0 | + @database_sync_to_async + def _load_rsvs() -> list[ReportSearchVector]: + return list( + ReportSearchVector.objects.filter(report_id__in=report_ids) + .select_related("report") + .only("id", "report_id", "report__body") + ) -### 6.4 Models + rsvs = await _load_rsvs() + if not rsvs: + logger.warning("embed_reports_task: no RSVs for report ids %s", report_ids) + return -`radis/pgsearch/models.py` defines two new models inheriting `AnalysisJob` and -`AnalysisTask` (`radis/core/models.py:17,220`): + batch_size = settings.EMBEDDING_BATCH_SIZE + async with AsyncEmbeddingClient() as client: + for start in range(0, len(rsvs), batch_size): + chunk = rsvs[start : start + batch_size] + vectors = await client.embed_documents( + [rsv.report.body for rsv in chunk] + ) + for rsv, vec in zip(chunk, vectors, strict=True): + rsv.embedding = vec -```python -from radis.core.models import AnalysisJob, AnalysisTask - - -class EmbeddingJob(AnalysisJob): - default_priority = settings.EMBEDDING_INDEX_PRIORITY - urgent_priority = settings.EMBEDDING_INDEX_PRIORITY # no urgent variant - - def delay(self) -> None: - queued_job_id = app.configure_task( - "radis.pgsearch.tasks.process_embedding_job", - allow_unknown=False, - priority=self.default_priority, - ).defer(job_id=self.pk) - self.queued_job_id = queued_job_id - self.save() - - -class EmbeddingTask(AnalysisTask): - job = models.ForeignKey(EmbeddingJob, on_delete=models.CASCADE, related_name="tasks") - reports = models.ManyToManyField(Report, related_name="embedding_tasks") - - def delay(self) -> None: - queued_job_id = app.configure_task( - "radis.pgsearch.tasks.process_embedding_task", - allow_unknown=False, - 
priority=settings.EMBEDDING_INDEX_PRIORITY, - ).defer(task_id=self.pk) - self.queued_job_id = queued_job_id - self.save() + @database_sync_to_async + def _save(): + ReportSearchVector.objects.bulk_update(rsvs, fields=["embedding"]) + await _save() ``` -**Owner field.** `AnalysisJob.owner` is non-nullable (`settings.AUTH_USER_MODEL`). -Embedding jobs are system-driven and have no human creator. The squashed -`0002_hybrid_search` migration's final `RunPython` step creates -`User(username=settings.EMBEDDING_SYSTEM_USERNAME, is_active=False, -password=unusable)` idempotently; the launcher assigns this user as `owner` -on every `EmbeddingJob`. This avoids subclass-level overrides of `owner` and -keeps the abstract contract clean. +**Why async**: the work is dominated by HTTP wait. Procrastinate's worker is asyncio-based; an async task lets `--concurrency K` mean "K embedding HTTP calls in flight on a single event loop" without spinning OS threads. DB parts wrap `database_sync_to_async` so sync ORM doesn't block the loop. -**No `get_absolute_url` in v1.** Existing `ExtractionJob` and `SubscriptionJob` -implement `get_absolute_url` because they have user-facing detail views. -`EmbeddingJob` has no user-facing UI in v1 — operators inspect it via Django -admin (default `ModelAdmin` registration is sufficient). The inherited abstract -`AnalysisJob.get_absolute_url` body is `...`, returning `None`; no call site in -radis treats an `EmbeddingJob` like a user-facing analysis job. A future spec -can add the view and override the method. +**Internal batching**: a single task accepts an arbitrarily-sized `report_ids` list (e.g., a 1000-row bulk-upsert dispatches one task) and chunks it into HTTP calls of `EMBEDDING_BATCH_SIZE` reports each. This decouples the *enqueue size* (one task per ingest event, naturally sized to the workload) from the *embedding service call size* (always bounded by `EMBEDDING_BATCH_SIZE`, regardless of input). 
The vLLM endpoint sees a steady stream of equally-sized batches rather than occasional spike requests. -`urgent`, `send_finished_mail`, and `finished_mail_template` stay at their -`AnalysisJob` defaults (`False`, `False`, `None`). +**No internal catch**: the task lets `EmbeddingClientError` propagate. Procrastinate handles retry — see §6.4. On retry, the entire batch loop reruns (idempotent: `bulk_update` overwrites identical vectors with no change). -### 6.5 Launcher (the periodic task) +### 6.3 The worker and the concurrency model -`radis/pgsearch/tasks.py`: +A dedicated `embeddings_worker` container is added to `docker-compose.*.yml` with an explicit concurrency flag: -```python -@app.periodic(cron=settings.EMBEDDING_DRAIN_CRON) -@app.task( - queue="default", - queueing_lock="embedding_launcher", - pass_context=True, -) -def embedding_launcher(context, timestamp: int) -> None: - in_flight = EmbeddingJob.objects.filter( - status__in=[ - EmbeddingJob.Status.PREPARING, - EmbeddingJob.Status.PENDING, - EmbeddingJob.Status.IN_PROGRESS, - ] - ).exists() - if in_flight: - logger.info("EmbeddingJob already in flight; launcher tick is a no-op.") - return +```yaml +embeddings_worker: + <<: *default-app + command: | + bash -c " + wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} && + ./manage.py bg_worker -q embeddings --concurrency 4 + " +``` - has_pending = ReportSearchVector.objects.filter(embedding__isnull=True).exists() - if not has_pending: - logger.debug("No reports pending embedding; launcher tick is a no-op.") - return +Three explicit choices: - system_user = User.objects.get(username=settings.EMBEDDING_SYSTEM_USERNAME) - job = EmbeddingJob.objects.create( - owner=system_user, - status=EmbeddingJob.Status.PREPARING, - ) - transaction.on_commit(job.delay) -``` +- **Dedicated queue (`embeddings`)**: isolated from `default` (extraction / subscription) and `llm`. A backfill or write burst can't starve unrelated tasks. 
+- **`--concurrency 4`** (the concurrency knob): up to 4 `embed_reports_task` coroutines in flight on the worker's event loop at once. Each coroutine has at most one embedding HTTP call outstanding at a time (the task's internal batch loop is sequential), so `--concurrency K` translates directly to "up to K embedding HTTP requests in flight to the embedding service per worker process." Total system concurrency = `worker_count × --concurrency`. The default of 4 leaves capacity for the query path's `embed_query` to share the same embedding service. Tunable per deployment. +- **Async-native**: the worker runs a single asyncio event loop; async tasks slot in directly. One `httpx.AsyncClient` connection pool per worker process; one async Postgres pool. Low overhead compared to threaded workers. -**Two reinforcing layers of duplicate-dispatch prevention:** +**Two layers of "batching"**, easy to confuse, kept separate by design: -- **Procrastinate `queueing_lock="embedding_launcher"`.** While a launcher job - is in the queue (`todo`) or executing (`doing`), the next cron tick's - `defer` call silently fails with `AlreadyEnqueued`. The launcher itself is - fast (one existence check + maybe one INSERT), so the lock is normally - released within milliseconds. -- **In-flight EmbeddingJob check.** Even if the queueing lock leaks (worker - crash mid-flight, manual `defer` from a shell, dashboard re-trigger), the - launcher's first action is to look for any `EmbeddingJob` in a non-terminal - status. If one exists, the launcher returns without creating another. This - is the same dedup pattern used by `process_extraction_job` when re-entered - (`extractions/tasks.py:46`). +| Layer | Knob | What it controls | +|---|---|---| +| Per-HTTP-call size | `EMBEDDING_BATCH_SIZE` (settings constant; default 32) | How many report bodies are sent in one `embed_documents` call inside the task. 
| +| Concurrent HTTP calls per worker | `--concurrency K` (compose flag; default 4) | How many `embed_documents` calls can be in flight at the same time. | +| Concurrent HTTP calls across all workers | `worker_count × --concurrency K` | The system's actual load ceiling on the embedding service. | -### 6.6 Orchestrator (`process_embedding_job`) +To scale up, prefer adding worker processes (crash isolation + connection-pool fan-out) over raising `--concurrency` past ~8 (the embedding service typically saturates around there anyway). Total embedding load on the service is `worker_count × --concurrency`. -```python -@app.task -def process_embedding_job(job_id: int) -> None: - job = EmbeddingJob.objects.get(id=job_id) - assert job.status == EmbeddingJob.Status.PREPARING - - # Retry/resume path: tasks already exist, re-enqueue still-pending ones. - if job.tasks.exists(): - tasks_to_enqueue = job.tasks.filter(status=EmbeddingTask.Status.PENDING) - else: - pending_ids_iter = ( - ReportSearchVector.objects - .filter(embedding__isnull=True) - .values_list("report_id", flat=True) - .iterator(chunk_size=10_000) - ) - batch: list[int] = [] - for report_id in pending_ids_iter: - batch.append(int(report_id)) - if len(batch) >= settings.EMBEDDING_BATCH_SIZE: - _create_embedding_task(job, batch) - batch = [] - if batch: - _create_embedding_task(job, batch) - - tasks_to_enqueue = job.tasks.filter(status=EmbeddingTask.Status.PENDING) - - job.status = EmbeddingJob.Status.PENDING - job.queued_job_id = None - job.save() - - for task in tasks_to_enqueue: - if not task.is_queued: - task.delay() - - -def _create_embedding_task(job: EmbeddingJob, report_ids: list[int]) -> EmbeddingTask: - task = EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.PENDING) - task.reports.set(Report.objects.filter(pk__in=report_ids)) - return task -``` +### 6.4 Failure semantics -Mirrors `process_extraction_job` (`extractions/tasks.py:32`). 
State transitions -follow the standard pattern: +Procrastinate handles transient failures automatically; `embed_pending` (§6.5) handles extended outages. -- `PREPARING` while tasks are being created (sub-tasks must not be dispatched yet). -- `PENDING` after task creation completes; sub-tasks are then enqueued. -- `IN_PROGRESS` / `SUCCESS` / `WARNING` / `FAILURE` driven by `update_job_state` - (inherited from `AnalysisJob`) called from each sub-task on completion. +| Failure | What happens | +|---|---| +| **Transient outage** (5xx / timeout / network blip ≲ minutes) | Task raises → Procrastinate retries with exponential backoff. Most cases auto-recover; the embedding is written without operator action. | +| **Extended outage** (service down longer than retry window) | Task ends in `failed` state in `procrastinate_jobs`. RSV stays NULL. Operator runs `./manage.py embed_pending` once the service recovers to re-enqueue the affected rows. | +| **Wrong-dim vector returned by backend** | `EmbeddingClientError` raised → retries → all fail the same way → task ends `failed`. Operator inspects, fixes config (or the `pgsearch.E001` system check catches it at deploy time). | +| **Worker offline / crashed** | Tasks pile up in `procrastinate_jobs.todo`. When a worker starts, it picks them up via `SELECT ... FOR UPDATE SKIP LOCKED`. No data loss. Write path unaffected. | +| **Embedding written and report immediately deleted** | `bulk_update` updates zero rows for the deleted RSV; rest of the batch is unaffected. Benign. | +| **`EMBEDDING_PROVIDER_URL` empty / misconfigured** | `EmbeddingClient.__init__` raises `EmbeddingClientError` at task start → retries fail → task ends `failed`. Operator fixes settings, runs `embed_pending`. | +| **`settings.EMBEDDING_DIM` ≠ migration dim** | `pgsearch.E001` system check blocks startup; this is caught at deploy time, not runtime. | -The orchestrator does no HTTP work. 
For 1M pending reports at -`EMBEDDING_BATCH_SIZE=32`, it creates ~31,250 `EmbeddingTask` rows and defers -them — well under a minute on the `default` worker. Its slot is freed -immediately after; long-running embedding work happens on the `embeddings` -worker. +The **write path never fails because of embedding**. Reports are saved, FTS is indexed (inline, or via the deferred `bulk_index_reports` task when `PGSEARCH_SYNC_INDEXING` is off — see §6.6), and vector indexing is best-effort with retries + recovery. -### 6.7 Sub-task (`process_embedding_task`) +### 6.5 `embed_pending` — operator-driven recovery -```python -@app.task(queue="embeddings") -def process_embedding_task(task_id: int) -> None: - task = EmbeddingTask.objects.get(id=task_id) - task.status = EmbeddingTask.Status.IN_PROGRESS - task.started_at = timezone.now() - task.attempts = task.attempts + 1 - task.save() - - client = EmbeddingClient() - try: - report_ids = list(task.reports.values_list("pk", flat=True)) - rsvs = list( - ReportSearchVector.objects - .filter(report_id__in=report_ids) - .select_related("report") - .only("id", "report_id", "report__body") - ) - texts = [rsv.report.body for rsv in rsvs] - vectors = client.embed_documents(texts) - for rsv, vec in zip(rsvs, vectors, strict=True): - rsv.embedding = vec - ReportSearchVector.objects.bulk_update(rsvs, fields=["embedding"]) +The `./manage.py embed_pending` command is retained, with one change: it now **enqueues `embed_reports_task` instances** rather than running embedding work inline in the command process. This keeps the embedding service load bounded by the worker's configured concurrency rather than by however fast the operator's shell can iterate. 
- task.status = EmbeddingTask.Status.SUCCESS - except EmbeddingClientError as exc: - logger.exception("Embedding task %s failed: %s", task_id, exc) - task.status = EmbeddingTask.Status.FAILURE - task.message = str(exc) - raise # Procrastinate retry policy applies - finally: - task.ended_at = timezone.now() - task.queued_job_id = None - task.save() - task.job.update_job_state() - client.close() +```python +def _drain(self, ids, batch_size): + for i in range(0, len(ids), batch_size): + chunk = ids[i : i + batch_size] + embed_reports_task.defer(report_ids=chunk) + self.stdout.write(f" enqueued {i + len(chunk)}/{len(ids)}") ``` -Raising on `EmbeddingClientError` -lets Procrastinate's retry policy apply. After retries exhaust, the exception -propagates, the task ends as `FAILURE`, and `update_job_state` is still called -from the `finally` block. The job finishes with status `WARNING` (some tasks -failed, some succeeded) or `FAILURE` (all failed). The next launcher tick will -create a fresh job that picks up any rows still NULL. +The three scenarios still apply: -### 6.8 Operator-triggered drain +1. **Backfill** of historical NULLs (rows loaded before the deferred-embedding architecture shipped). +2. **Dim or model change** following §4.5 (or `ReportSearchVector.objects.update(embedding=None)` for a same-dim model swap). +3. **Outage recovery** for tasks that exhausted Procrastinate retries during an extended embedding-service outage. -The only ingest-time signal is the FTS `post_save` receiver -(`create_or_update_report_search_vector`), which creates the -`ReportSearchVector` row with `embedding=NULL`. Embedding is driven entirely -by the orchestrator from then on. +Properties: + +- **Idempotent.** Filter is `embedding IS NULL`; re-runs are no-ops on already-drained rows. +- **Resumable.** No checkpoint state. Killed mid-run → re-run picks up remaining NULLs. 
+- **Rate-limited.** The worker's `--concurrency K` caps concurrent embedding HTTP calls regardless of how many tasks the command enqueues. Operators cannot accidentally hammer the embedding service. +- **Visible.** Enqueued tasks appear in the standard Procrastinate observability surface (admin, logs, telemetry). Failed retries surface there as well. + +### 6.6 `PGSEARCH_SYNC_INDEXING` retained; FTS chains to embedding + +The pre-existing `PGSEARCH_SYNC_INDEXING` switch is **retained**, unchanged in semantics: it controls *how* FTS bulk-indexing happens on the bulk-upsert path. The hybrid-search work adds one new property — both FTS modes chain into `embed_reports_task` so embedding always follows FTS by construction. -Operators trigger an immediate drain — typically after a model swap or initial -bulk import — by deferring the same launcher from a Django shell: +| Mode | `PGSEARCH_SYNC_INDEXING` | FTS step | Embedding step | +|---|---|---|---| +| Sync | `True` | `bulk_upsert_report_search_vectors(ids)` runs inline inside `on_commit` | `embed_reports_task.defer(report_ids=ids)` immediately follows in the same `on_commit` | +| Deferred (default) | `False` | `enqueue_bulk_index_reports(ids)` defers the `bulk_index_reports` Procrastinate task | `bulk_index_reports` calls `embed_reports_task.defer(report_ids=ids)` at the end of its run | + +The chain is enforced inside `bulk_index_reports`: ```python -from radis.pgsearch.tasks import embedding_launcher -embedding_launcher.defer() +@app.task +def bulk_index_reports(report_ids: list[int]) -> None: + if not report_ids: + return + bulk_upsert_report_search_vectors(report_ids) + embed_reports_task.defer(report_ids=list(report_ids)) ``` -This goes through the same launcher → orchestrator → sub-task path as the -periodic; the only difference is who fires it. One code path, one set of -tests, one observable lifecycle. 
+Properties: + +- **No race between embedding and FTS.** Embedding is only enqueued after the RSV rows exist (either inline in the sync path or at the tail of `bulk_index_reports` in the deferred path). +- **Operator choice preserved.** Deployments that prefer sync FTS (small bulks, deterministic end-to-end ordering with subscription handlers) keep that option. Deployments that prefer deferred FTS (large bulks, fast HTTP response) keep that option. The hybrid-search work is orthogonal. +- **One queue per concern.** FTS deferral runs on the `default` queue (where `bulk_index_reports` already lived); embedding runs on the dedicated `embeddings` queue. FTS-only worker capacity does not compete with embedding capacity. + +The single-create / PUT path is unaffected by `PGSEARCH_SYNC_INDEXING`: its FTS step is the `post_save` signal on `Report`, which is always sync inline by construction (not under the switch's control). + +### 6.7 Sync DRF; no async views required + +The enqueue (`embed_reports_task.defer(...)`) is a synchronous Procrastinate API call, so the report views remain plain sync DRF (`ReportViewSet`, unchanged in shape from main). No `await` lives inside any request handler. The async-view rewrite proposed in PR #230 is **not a dependency** of this design and is intentionally not pulled in — the entire embedding workload lives on the worker side, behind the `embeddings` queue. ## 7. Hybrid search provider @@ -878,12 +756,11 @@ EMBEDDING_MODEL_NAME = env.str("EMBEDDING_MODEL_NAME", default="Qwen/Qwen3 EMBEDDING_DIM = env.int("EMBEDDING_DIM", default=1024) ``` -These vary across dev/staging/prod and are operator-controlled. `EMBEDDING_DIM` is intentionally an env decision because it is schema-coupled (see §4.5). +These vary across dev/staging/prod and are operator-controlled. `EMBEDDING_DIM` is intentionally an env decision because it is schema-coupled (see §4.5). 
Worker concurrency is set in the compose command line (`bg_worker -q embeddings --concurrency K`), not via env — it's a deployment-shape decision rather than a runtime tunable. ### 8.2 Code constants (tuning knobs, in `base.py`) ```python -EMBEDDING_DRAIN_CRON = "0 2 * * *" # nightly; bump in code for faster dev drain EMBEDDING_REQUEST_TIMEOUT = 30 # seconds EMBEDDING_MAX_INPUT_CHARS = 60_000 EMBEDDING_QUERY_INSTRUCTION = ( @@ -892,15 +769,12 @@ EMBEDDING_QUERY_INSTRUCTION = ( ) EMBEDDING_BATCH_SIZE = 32 -EMBEDDING_INDEX_PRIORITY = 0 -EMBEDDING_SYSTEM_USERNAME = "system" - HYBRID_VECTOR_TOP_K = 100 HYBRID_FTS_MAX_RESULTS = 10_000 HYBRID_RRF_K = 60 ``` -These are tuning constants. Changing them is a code change with a PR diff. This matches the project's existing pattern (`EXTRACTION_LLM_CONCURRENCY_LIMIT = 6`, the `CHAT_*_SYSTEM_PROMPT` blocks). `EMBEDDING_SYSTEM_USERNAME` names the system user that owns every auto-generated `EmbeddingJob`; the squashed migration's `RunPython` step creates this user idempotently. +These are tuning constants. Changing them is a code change with a PR diff. This matches the project's existing pattern (`EXTRACTION_LLM_CONCURRENCY_LIMIT = 6`, the `CHAT_*_SYSTEM_PROMPT` blocks). ### 8.3 `example.env` @@ -910,16 +784,12 @@ Adds a documented Ollama block and a Qwen/OpenAI-compatible block side by side, `docker-compose.base.yml`: -- New service `embeddings_worker` inheriting `*default-app`. -- The `EMBEDDING_BACKEND`, `EMBEDDING_PROVIDER_URL`, `EMBEDDING_PROVIDER_PATH`, `EMBEDDING_PROVIDER_API_KEY`, `EMBEDDING_MODEL_NAME`, `EMBEDDING_DIM` env keys added to the `&default-app` block so all services see them. +- The `EMBEDDING_BACKEND`, `EMBEDDING_PROVIDER_URL`, `EMBEDDING_PROVIDER_PATH`, `EMBEDDING_PROVIDER_API_KEY`, `EMBEDDING_MODEL_NAME`, `EMBEDDING_DIM` env keys are added to the `&default-app` block so all services see them. 
+- New service `embeddings_worker` inheriting `*default-app` runs `./manage.py bg_worker -q embeddings --concurrency 4` (see §6.3). `docker-compose.dev.yml`: -- `embeddings_worker.command`: `bash -c "wait-for-it -s postgres.local:5432 -t ${WAIT_POSTGRES_TIMEOUT:-180} && ./manage.py bg_worker -l debug -q embeddings --autoreload"`. - -`docker-compose.prod.yml`: - -- Same without `--autoreload`, log level `info`. +Both files add an `embeddings_worker.command` block. Dev uses `-l debug --autoreload`; prod uses `-l info`. Both pass `-q embeddings --concurrency 4` by default — tune per deployment. ## 9. Error handling and degradation @@ -928,11 +798,9 @@ Adds a documented Ollama block and a Qwen/OpenAI-compatible block side by side, | Embedding service returns 5xx/timeout during query-time | `query_vec = None`; result list ordered by FTS-only; request succeeds | WARNING with request id | | Embedding service returns 4xx during query-time | Same FTS-only fallback (treats as misconfig at request layer) | ERROR | | Embedding service returns malformed body | `EmbeddingClientError` raised; query falls back to FTS-only | ERROR | -| Embedding service down during a sub-task | `process_embedding_task` raises; Procrastinate retries with exponential backoff; `embedding` stays NULL | WARNING per attempt, ERROR after final retry | -| Launcher fires while EmbeddingJob is `PREPARING`/`PENDING`/`IN_PROGRESS` | Status check returns immediately; tick is a no-op | INFO | +| Embedding service down during `embed_reports_task` execution | Task raises `EmbeddingClientError`; Procrastinate retries with exponential backoff. After retries exhaust, task ends `failed`; `embedding` stays NULL. **API request was never affected** (already returned at the on_commit point). | WARNING per retry; ERROR on final failure | | Orchestrator crashes during task creation (partial dispatch) | Job stays in `PREPARING`. Next launcher tick sees in-flight job and no-ops. 
Operator marks job `FAILURE` in admin to allow a fresh run | ERROR + operator action | | Sub-task fails after Procrastinate retries exhausted | Task ends as `FAILURE`. `update_job_state` rolls the job to `WARNING` (some tasks succeeded) or `FAILURE` (all failed). NULL rows remain; next launcher creates a new job to retry them | ERROR | -| `embeddings_worker` saturation | Sub-tasks queue up; orchestrator already returned. No deadlock; just slower drain | DEBUG | | Report body > `EMBEDDING_MAX_INPUT_CHARS` | Truncate, embed truncated text | WARNING with report_id and char count | | Report deleted between task creation and execution | Sub-task's `task.reports.values_list(...)` returns fewer rows; `embed_documents` called on smaller list; no error | DEBUG | | Vector dim mismatch on write | Postgres raises; sub-task fails, retried | ERROR — escalate to admin | @@ -948,10 +816,8 @@ Adds a documented Ollama block and a Qwen/OpenAI-compatible block side by side, **Observability:** - Provider logs at DEBUG: vec hit count, FTS hit count, intersection count, fusion ms, query-embed ms. -- `process_embedding_task` logs at INFO: batch size, total chars, latency, success/retry counts. -- `embedding_launcher` and `process_embedding_job` log status transitions and dispatch counts at INFO. -- Operators inspect job/task state via Django admin (`EmbeddingJob`, `EmbeddingTask` use the default `ModelAdmin`). -- The existing OpenTelemetry overlay (commit `653e0c67`) tags telemetry per service; `embeddings_worker` shows up automatically. +- `embed_reports_task` logs at INFO: batch size, total chars, latency, attempt number. +- The existing OpenTelemetry overlay (commit `653e0c67`) tags telemetry per service; embedding spans show up under the `embeddings_worker` service. ## 10. 
Testing strategy @@ -961,9 +827,7 @@ Adds a documented Ollama block and a Qwen/OpenAI-compatible block side by side, |---|---| | `tests/unit/test_embedding_client.py` | Backend payload/response round-trip, path override, instruction prefix, normalization, dim validation, all error modes, truncation | | `tests/unit/test_provider_fusion.py` | `_rrf_fuse(vec_rank, fts_rank, k)` pure-Python helper: disjoint, overlapping, FTS-only, vector-only, both-empty, tiebreak by report_id | -| `tests/unit/test_embedding_launcher.py` | No-op when EmbeddingJob already in flight; no-op when no rows pending; happy path creates job and calls `delay`; raises if system user missing | -| `tests/unit/test_process_embedding_job.py` | Batches pending reports into `EmbeddingTask` rows of size `EMBEDDING_BATCH_SIZE`; status transitions `PREPARING` → `PENDING`; retry/resume path re-enqueues only `PENDING` tasks; empty pool exits cleanly | -| `tests/unit/test_process_embedding_task.py` | Embeds reports, writes vectors, sets status `SUCCESS`; status `FAILURE` and re-raise on `EmbeddingClientError`; calls `job.update_job_state` in both paths; clears `queued_job_id` | +| `tests/unit/test_embed_reports_task.py` | Loads RSVs by report_id, calls `AsyncEmbeddingClient.embed_documents`, bulk-updates vectors. Asserts that `EmbeddingClientError` propagates so Procrastinate's retry policy applies (the task does not swallow). | ### 10.2 Integration tests (real Postgres + pgvector) @@ -1012,7 +876,7 @@ See §4.5. ### 11.3 GGUF dev embeddings ≠ bf16 prod embeddings -Documented in §5.4. Mitigated by deferring `embedding_launcher` after a model swap (see §4.5). The next drain re-embeds everything. +Documented in §5.4. Mitigated by following §4.5 after a model swap and then running `./manage.py embed_pending` (§6.5), which enqueues `embed_reports_task` for every NULL row; the embeddings worker drains the queue at its configured concurrency. 
### 11.4 No body-change detection for re-embedding @@ -1065,11 +929,12 @@ A `run_search_eval` management command loops a set of test queries through all s ## 12. Rollout plan -1. **Schema, dependency, models, data migration.** Land the `pgvector` Python dep and the squashed `0002_hybrid_search` migration (extension + embedding column + HNSW + `EmbeddingJob`/`EmbeddingTask` tables + system user). No behaviour change yet — `embedding` is nullable, queries still see only FTS. -2. **Embedding client and tests.** Land the client module and unit tests. No callers yet. -3. **Orchestrator tasks and `embeddings_worker`.** Land `embedding_launcher`, `process_embedding_job`, `process_embedding_task`, and the `embeddings_worker` container (with `--concurrency 4`). The launcher starts ticking on its compile-time `EMBEDDING_DRAIN_CRON` schedule; with no rows yet, all ticks no-op. -4. **Initial drain.** From a shell, run `embedding_launcher.defer()` so the orchestrator picks up the existing corpus. This is the only "operator action" in the rollout. It runs at `EMBEDDING_INDEX_PRIORITY` and lives behind whatever other work is on the queues; it can run for hours to days on a large corpus. -5. **Provider switch.** Replace the body of `radis.pgsearch.providers.search()` and `retrieve()` with the hybrid implementation. At this point hybrid is the new default; rows still missing an embedding participate via the FTS half only. -6. **Monitor.** Watch search latency p95, embedding-queue depth, `EmbeddingJob` admin state, and the rate of "FTS-only fallback" warnings. Tune `HYBRID_VECTOR_TOP_K` / `HYBRID_FTS_MAX_RESULTS` if needed. +1. **Schema + dep.** `pgvector` pip dep + `0002_hybrid_search` migration (extension + embedding column + HNSW). No behaviour change yet. +2. **Embedding clients + tests.** Land `EmbeddingClient` (sync, query side) and `AsyncEmbeddingClient` (async, worker side). No callers wired up yet. +3. 
**Worker + task + queue.** Add `embeddings_worker` container (compose), `embed_reports_task` async task on the `embeddings` queue, and the worker command at `--concurrency 4`. Without callers, the worker stays idle. +4. **Write-path enqueue.** Modify the single-create `on_commit` and `bulk_upsert_reports`' `on_commit` to call `embed_reports_task.defer(report_ids=touched_pks)`. The bulk-upsert path keeps both `PGSEARCH_SYNC_INDEXING` modes (§6.6); the sync mode defers embedding immediately after FTS, the deferred mode chains embedding at the tail of `bulk_index_reports`. From this point on, **every write enqueues an embedding task**; the embeddings worker drains the queue. +5. **Provider switch.** Replace the body of `radis.pgsearch.providers.search()` and `retrieve()` with the hybrid implementation. Rows still missing an embedding participate via the FTS half only. +6. **(Optional) historical backfill.** Run `./manage.py embed_pending` to enqueue an `embed_reports_task` for every existing NULL row. Same command serves outage recovery and dim/model-change scenarios (§6.5). +7. **Monitor.** Watch search latency p95, write latency p95 (unchanged — just the enqueue), embedding-queue depth, retry rate, and `procrastinate_jobs.failed` count. -Each step is independently mergeable; steps 1–3 ship as quiet infrastructure changes with no user-visible effect, step 4 starts populating the column, step 5 is the moment hybrid goes live. +Each step is independently mergeable; steps 1–3 ship as quiet infrastructure with no user-visible effect, step 4 starts populating the column on every write, step 5 is the moment hybrid search goes live for users. 
diff --git a/radis/pgsearch/management/commands/embed_pending.py b/radis/pgsearch/management/commands/embed_pending.py new file mode 100644 index 00000000..67fb4bd2 --- /dev/null +++ b/radis/pgsearch/management/commands/embed_pending.py @@ -0,0 +1,78 @@ +"""Enqueue `embed_reports_task` for every `ReportSearchVector` whose embedding +is still NULL. + +Operators run this for three scenarios: + +1. **Backfill.** Reports loaded before the deferred-embedding wiring shipped. +2. **Dim or model change.** After §4.5: drop the column, re-migrate (or + `ReportSearchVector.objects.update(embedding=None)` for a same-dim model + swap), then run this command to re-embed against the new model. +3. **Outage recovery.** Tasks that exhausted Procrastinate retries during an + extended embedding-service outage — re-run after the service recovers. + +The command itself does no HTTP work; it enqueues tasks onto the `embeddings` +queue. The embeddings worker drains them at its configured `--concurrency`, +so operators cannot accidentally hammer the embedding service. + +Properties: + +- **Idempotent.** The filter is `embedding IS NULL`; re-runs are no-ops on + rows the worker has already drained. +- **Resumable.** No checkpoint state. Killed mid-enqueue → re-run picks up + the still-NULL rows. +- **Rate-limited.** Worker concurrency caps load on the embedding service + regardless of how many tasks this command enqueues. +""" +from django.conf import settings +from django.core.management.base import BaseCommand + +from radis.pgsearch.models import ReportSearchVector +from radis.pgsearch.tasks import embed_reports_task + + +class Command(BaseCommand): + help = ( + "Enqueue embed_reports_task for every ReportSearchVector with " + "embedding=NULL. The embeddings worker drains the queue at its " + "configured concurrency." 
+ ) + + def add_arguments(self, parser) -> None: + parser.add_argument( + "--batch-size", + type=int, + default=settings.EMBEDDING_BATCH_SIZE, + help=( + f"Reports per enqueued task (default " + f"{settings.EMBEDDING_BATCH_SIZE}). The worker further chunks " + f"each task by EMBEDDING_BATCH_SIZE internally." + ), + ) + parser.add_argument( + "--limit", + type=int, + default=None, + help="Stop after enqueuing N reports (default: enqueue all).", + ) + + def handle(self, *args, **opts) -> None: + ids = list( + ReportSearchVector.objects.filter(embedding__isnull=True) + .order_by("report_id") + .values_list("report_id", flat=True) + ) + if opts["limit"] is not None: + ids = ids[: opts["limit"]] + if not ids: + self.stdout.write("Nothing to embed.") + return + + batch_size = opts["batch_size"] + self.stdout.write( + f"Enqueuing {len(ids)} report(s) in tasks of {batch_size}..." + ) + for i in range(0, len(ids), batch_size): + chunk = ids[i : i + batch_size] + embed_reports_task.defer(report_ids=list(chunk)) + self.stdout.write(f" enqueued {i + len(chunk)}/{len(ids)}") + self.stdout.write(self.style.SUCCESS("Done.")) diff --git a/radis/pgsearch/migrations/0002_hybrid_search.py b/radis/pgsearch/migrations/0002_hybrid_search.py index 7a118d37..0a891d2e 100644 --- a/radis/pgsearch/migrations/0002_hybrid_search.py +++ b/radis/pgsearch/migrations/0002_hybrid_search.py @@ -1,16 +1,6 @@ -import django.db.models.deletion import pgvector.django.indexes import pgvector.django.vector -from django.conf import settings -from django.db import migrations, models - - -def create_system_user(apps, schema_editor): - User = apps.get_model(*settings.AUTH_USER_MODEL.split(".")) - User.objects.get_or_create( - username=settings.EMBEDDING_SYSTEM_USERNAME, - defaults={"is_active": False, "password": "!"}, - ) +from django.db import migrations class Migration(migrations.Migration): @@ -18,8 +8,6 @@ class Migration(migrations.Migration): dependencies = [ ("pgsearch", "0001_initial"), ("reports", 
"0013_alter_report_options"), - ("procrastinate", "0041_post_retry_failed_job"), - migrations.swappable_dependency(settings.AUTH_USER_MODEL), ] operations = [ @@ -42,125 +30,4 @@ class Migration(migrations.Migration): opclasses=["vector_cosine_ops"], ), ), - migrations.CreateModel( - name="EmbeddingJob", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "status", - models.CharField( - choices=[ - ("UV", "Unverified"), - ("PR", "Preparing"), - ("PE", "Pending"), - ("IP", "In Progress"), - ("CI", "Canceling"), - ("CA", "Canceled"), - ("SU", "Success"), - ("WA", "Warning"), - ("FA", "Failure"), - ], - default="UV", - max_length=2, - ), - ), - ("urgent", models.BooleanField(default=False)), - ("send_finished_mail", models.BooleanField(default=False)), - ("message", models.TextField(blank=True, default="")), - ("created_at", models.DateTimeField(auto_now_add=True)), - ("started_at", models.DateTimeField(blank=True, null=True)), - ("ended_at", models.DateTimeField(blank=True, null=True)), - ( - "owner", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - related_name="%(app_label)s_jobs", - to=settings.AUTH_USER_MODEL, - ), - ), - ( - "queued_job", - models.OneToOneField( - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="+", - to="procrastinate.procrastinatejob", - ), - ), - ], - options={ - "ordering": ["-created_at"], - }, - ), - migrations.CreateModel( - name="EmbeddingTask", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "status", - models.CharField( - choices=[ - ("PE", "Pending"), - ("IP", "In Progress"), - ("CA", "Canceled"), - ("SU", "Success"), - ("WA", "Warning"), - ("FA", "Failure"), - ], - default="PE", - max_length=2, - ), - ), - ("attempts", models.PositiveSmallIntegerField(default=0)), - ("message", 
models.TextField(blank=True, default="")), - ("log", models.TextField(blank=True, default="")), - ("created_at", models.DateTimeField(auto_now_add=True)), - ("started_at", models.DateTimeField(blank=True, null=True)), - ("ended_at", models.DateTimeField(blank=True, null=True)), - ( - "job", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - related_name="tasks", - to="pgsearch.embeddingjob", - ), - ), - ( - "queued_job", - models.OneToOneField( - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="+", - to="procrastinate.procrastinatejob", - ), - ), - ( - "reports", - models.ManyToManyField( - related_name="embedding_tasks", to="reports.report" - ), - ), - ], - options={ - "ordering": ("id",), - "abstract": False, - }, - ), - migrations.RunPython(create_system_user, reverse_code=migrations.RunPython.noop), ] diff --git a/radis/pgsearch/models.py b/radis/pgsearch/models.py index a55abaad..5cd90e8b 100644 --- a/radis/pgsearch/models.py +++ b/radis/pgsearch/models.py @@ -3,10 +3,7 @@ from django.contrib.postgres.search import SearchVector, SearchVectorField from django.db import models from pgvector.django import HnswIndex, VectorField -from procrastinate.contrib.django import app -from procrastinate.contrib.django.models import ProcrastinateJob -from radis.core.models import AnalysisJob, AnalysisTask from radis.reports.models import Report from .utils.language_utils import code_to_language @@ -37,47 +34,3 @@ def save(self, *args, **kwargs): language = code_to_language(self.report.language.code) self.search_vector = SearchVector(models.Value(body), config=language) super().save(*args, **kwargs) - - -class EmbeddingJob(AnalysisJob): - default_priority = settings.EMBEDDING_INDEX_PRIORITY - urgent_priority = settings.EMBEDDING_INDEX_PRIORITY - finished_mail_template = None - - queued_job_id: int | None - queued_job = models.OneToOneField( - ProcrastinateJob, null=True, on_delete=models.SET_NULL, related_name="+" - ) - - tasks: 
models.QuerySet["EmbeddingTask"] - - class Meta: - ordering = ["-created_at"] - - def __str__(self) -> str: - return f"EmbeddingJob [{self.pk}]" - - def delay(self) -> None: - queued_job_id = app.configure_task( - "radis.pgsearch.tasks.process_embedding_job", - allow_unknown=False, - priority=self.default_priority, - ).defer(job_id=self.pk) - self.queued_job_id = queued_job_id - self.save() - - -class EmbeddingTask(AnalysisTask): - job = models.ForeignKey( - EmbeddingJob, on_delete=models.CASCADE, related_name="tasks" - ) - reports = models.ManyToManyField(Report, related_name="embedding_tasks") - - def delay(self) -> None: - queued_job_id = app.configure_task( - "radis.pgsearch.tasks.process_embedding_task", - allow_unknown=False, - priority=settings.EMBEDDING_INDEX_PRIORITY, - ).defer(task_id=self.pk) - self.queued_job_id = queued_job_id - self.save() diff --git a/radis/pgsearch/tasks.py b/radis/pgsearch/tasks.py index 14ce946d..e66bd54a 100644 --- a/radis/pgsearch/tasks.py +++ b/radis/pgsearch/tasks.py @@ -1,14 +1,12 @@ import logging -from django.conf import settings as django_settings -from django.contrib.auth import get_user_model -from django.db import transaction -from django.utils import timezone +from channels.db import database_sync_to_async +from django.conf import settings from procrastinate.contrib.django import app from procrastinate.types import JSONValue -from .models import EmbeddingJob, EmbeddingTask, ReportSearchVector -from .utils.embedding_client import EmbeddingClient, EmbeddingClientError +from .models import ReportSearchVector +from .utils.embedding_client import AsyncEmbeddingClient from .utils.indexing import bulk_upsert_report_search_vectors logger = logging.getLogger(__name__) @@ -16,10 +14,18 @@ @app.task def bulk_index_reports(report_ids: list[int]) -> None: + """Deferred FTS bulk-indexing for the bulk-upsert path + (when `PGSEARCH_SYNC_INDEXING=False`). 
+ + Chains into `embed_reports_task` so the embedding step is enqueued + immediately after FTS rows exist, regardless of whether FTS ran sync inline + or via this deferred task. + """ if not report_ids: return logger.info("Indexing %s reports in bulk.", len(report_ids)) bulk_upsert_report_search_vectors(report_ids) + embed_reports_task.defer(report_ids=list(report_ids)) def enqueue_bulk_index_reports(report_ids: list[int]) -> int | None: @@ -37,111 +43,43 @@ def enqueue_bulk_index_reports(report_ids: list[int]) -> int | None: @app.task(queue="embeddings") -def process_embedding_task(task_id: int) -> None: - task = EmbeddingTask.objects.get(id=task_id) - task.status = EmbeddingTask.Status.IN_PROGRESS - task.started_at = timezone.now() - task.attempts = task.attempts + 1 - task.save() - - client = EmbeddingClient() - try: - report_ids = list(task.reports.values_list("pk", flat=True)) - rsvs = list( - ReportSearchVector.objects - .filter(report_id__in=report_ids) +async def embed_reports_task(report_ids: list[int]) -> None: + """Embed the named reports. + + Raises on `EmbeddingClientError` so Procrastinate's retry policy applies. + Reports are sent to the embedding service in batches of + `EMBEDDING_BATCH_SIZE` to bound per-call payload size regardless of how + many `report_ids` the caller passed. 
+ """ + if not report_ids: + return + + @database_sync_to_async + def _load_rsvs() -> list[ReportSearchVector]: + return list( + ReportSearchVector.objects.filter(report_id__in=report_ids) .select_related("report") .only("id", "report_id", "report__body") ) - texts = [rsv.report.body for rsv in rsvs] - vectors = client.embed_documents(texts) - for rsv, vec in zip(rsvs, vectors, strict=True): - rsv.embedding = vec - ReportSearchVector.objects.bulk_update(rsvs, fields=["embedding"]) - - task.status = EmbeddingTask.Status.SUCCESS - except EmbeddingClientError as exc: - logger.exception("Embedding task %s failed: %s", task_id, exc) - task.status = EmbeddingTask.Status.FAILURE - task.message = str(exc) - raise - finally: - task.ended_at = timezone.now() - task.queued_job_id = None - task.save() - task.job.update_job_state() - client.close() - -def _create_embedding_task(job: EmbeddingJob, report_ids: list[int]) -> EmbeddingTask: - from radis.reports.models import Report - - task = EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.PENDING) - task.reports.set(Report.objects.filter(pk__in=report_ids)) - return task - - -@app.task -def process_embedding_job(job_id: int) -> None: - job = EmbeddingJob.objects.get(id=job_id) - assert job.status == EmbeddingJob.Status.PREPARING - - if job.tasks.exists(): - tasks_to_enqueue = job.tasks.filter(status=EmbeddingTask.Status.PENDING) - else: - pending_ids_iter = ( - ReportSearchVector.objects - .filter(embedding__isnull=True) - .values_list("report_id", flat=True) - .iterator(chunk_size=10_000) + rsvs = await _load_rsvs() + if not rsvs: + logger.warning( + "embed_reports_task: no ReportSearchVector rows for report ids %s", + report_ids, ) - batch: list[int] = [] - for report_id in pending_ids_iter: - batch.append(int(report_id)) - if len(batch) >= django_settings.EMBEDDING_BATCH_SIZE: - _create_embedding_task(job, batch) - batch = [] - if batch: - _create_embedding_task(job, batch) - - tasks_to_enqueue = 
job.tasks.filter(status=EmbeddingTask.Status.PENDING) - - job.status = EmbeddingJob.Status.PENDING - job.queued_job_id = None - job.save() - - for task in tasks_to_enqueue: - if not task.is_queued: - task.delay() - - -@app.periodic(cron=django_settings.EMBEDDING_DRAIN_CRON) -@app.task( - queue="default", - queueing_lock="embedding_launcher", - pass_context=True, -) -def embedding_launcher(context, timestamp: int) -> None: - in_flight = EmbeddingJob.objects.filter( - status__in=[ - EmbeddingJob.Status.PREPARING, - EmbeddingJob.Status.PENDING, - EmbeddingJob.Status.IN_PROGRESS, - ] - ).exists() - if in_flight: - logger.info("EmbeddingJob already in flight; launcher tick is a no-op.") return - has_pending = ReportSearchVector.objects.filter(embedding__isnull=True).exists() - if not has_pending: - logger.debug("No reports pending embedding; launcher tick is a no-op.") - return + batch_size = settings.EMBEDDING_BATCH_SIZE + async with AsyncEmbeddingClient() as client: + for start in range(0, len(rsvs), batch_size): + chunk = rsvs[start : start + batch_size] + vectors = await client.embed_documents([rsv.report.body for rsv in chunk]) + for rsv, vec in zip(chunk, vectors, strict=True): + rsv.embedding = vec + + @database_sync_to_async + def _save() -> None: + ReportSearchVector.objects.bulk_update(rsvs, fields=["embedding"]) - User = get_user_model() - system_user = User.objects.get(username=django_settings.EMBEDDING_SYSTEM_USERNAME) - job = EmbeddingJob.objects.create( - owner=system_user, - status=EmbeddingJob.Status.PREPARING, - ) - transaction.on_commit(job.delay) + await _save() diff --git a/radis/pgsearch/tests/test_embed_pending_command.py b/radis/pgsearch/tests/test_embed_pending_command.py new file mode 100644 index 00000000..b125cb2a --- /dev/null +++ b/radis/pgsearch/tests/test_embed_pending_command.py @@ -0,0 +1,48 @@ +"""Tests for the `embed_pending` management command.""" +from io import StringIO +from unittest.mock import patch + +import pytest +from 
django.core.management import call_command + +from radis.reports.factories import ReportFactory + +pytestmark = pytest.mark.django_db + + +def test_nothing_to_embed(): + out = StringIO() + with patch("radis.pgsearch.management.commands.embed_pending.embed_reports_task") as task: + call_command("embed_pending", stdout=out) + assert "Nothing to embed." in out.getvalue() + task.defer.assert_not_called() + + +def test_enqueues_all_pending_in_batches(): + # ReportFactory triggers the FTS post_save signal → RSV row with embedding=NULL. + reports = [ReportFactory.create() for _ in range(5)] + expected_ids = sorted(r.pk for r in reports) + + out = StringIO() + with patch("radis.pgsearch.management.commands.embed_pending.embed_reports_task") as task: + call_command("embed_pending", "--batch-size", "2", stdout=out) + + # 5 reports / batch 2 → three defer calls of sizes 2, 2, 1. + assert task.defer.call_count == 3 + enqueued_ids = [pk for call in task.defer.call_args_list for pk in call.kwargs["report_ids"]] + assert sorted(enqueued_ids) == expected_ids + output = out.getvalue() + assert "2/5" in output + assert "5/5" in output + assert "Done." 
in output + + +def test_limit_caps_work(): + [ReportFactory.create() for _ in range(5)] + + out = StringIO() + with patch("radis.pgsearch.management.commands.embed_pending.embed_reports_task") as task: + call_command("embed_pending", "--limit", "3", "--batch-size", "10", stdout=out) + + enqueued_ids = [pk for call in task.defer.call_args_list for pk in call.kwargs["report_ids"]] + assert len(enqueued_ids) == 3 diff --git a/radis/pgsearch/tests/test_embed_reports_task.py b/radis/pgsearch/tests/test_embed_reports_task.py new file mode 100644 index 00000000..51cae851 --- /dev/null +++ b/radis/pgsearch/tests/test_embed_reports_task.py @@ -0,0 +1,86 @@ +"""Tests for `embed_reports_task` and the `bulk_index_reports` → embedding chain.""" +import asyncio +from unittest.mock import AsyncMock, MagicMock, patch + +import numpy as np +import pytest + +from radis.pgsearch.models import ReportSearchVector +from radis.pgsearch.tasks import bulk_index_reports, embed_reports_task +from radis.pgsearch.utils.embedding_client import EmbeddingClientError +from radis.reports.factories import ReportFactory + +pytestmark = pytest.mark.django_db(transaction=True) + + +def _unit_vec(dim: int) -> list[float]: + v = np.ones(dim, dtype=np.float32) + return (v / np.linalg.norm(v)).tolist() + + +def _make_fake_async_client(vec: list[float]) -> MagicMock: + """Build a MagicMock that mimics `async with AsyncEmbeddingClient() as c` + and supports `await c.embed_documents([...])`.""" + instance = MagicMock() + instance.__aenter__ = AsyncMock(return_value=instance) + instance.__aexit__ = AsyncMock(return_value=None) + instance.embed_documents = AsyncMock(side_effect=lambda texts: [vec] * len(texts)) + return instance + + +def test_empty_input_no_ops(): + with patch("radis.pgsearch.tasks.AsyncEmbeddingClient") as client_cls: + asyncio.run(embed_reports_task(report_ids=[])) + client_cls.assert_not_called() + + +def test_no_matching_rsvs_no_ops(): + with 
patch("radis.pgsearch.tasks.AsyncEmbeddingClient") as client_cls: + asyncio.run(embed_reports_task(report_ids=[999_999])) + client_cls.assert_not_called() + + +def test_embeds_in_internal_batches(settings): + settings.EMBEDDING_BATCH_SIZE = 2 + reports = [ReportFactory.create() for _ in range(5)] + pks = [r.pk for r in reports] + vec = _unit_vec(settings.EMBEDDING_DIM) + fake = _make_fake_async_client(vec) + + with patch("radis.pgsearch.tasks.AsyncEmbeddingClient", return_value=fake): + asyncio.run(embed_reports_task(report_ids=pks)) + + # 5 reports with batch_size=2 → 3 embed_documents calls of sizes 2, 2, 1. + assert fake.embed_documents.await_count == 3 + sizes = [len(call.args[0]) for call in fake.embed_documents.await_args_list] + assert sorted(sizes) == [1, 2, 2] + assert ReportSearchVector.objects.filter(embedding__isnull=True).count() == 0 + + +def test_embedding_error_propagates(): + """Procrastinate retries depend on the exception escaping the task.""" + reports = [ReportFactory.create() for _ in range(2)] + pks = [r.pk for r in reports] + fake = MagicMock() + fake.__aenter__ = AsyncMock(return_value=fake) + fake.__aexit__ = AsyncMock(return_value=None) + fake.embed_documents = AsyncMock(side_effect=EmbeddingClientError("service down")) + + with patch("radis.pgsearch.tasks.AsyncEmbeddingClient", return_value=fake): + with pytest.raises(EmbeddingClientError): + asyncio.run(embed_reports_task(report_ids=pks)) + + assert ReportSearchVector.objects.filter(embedding__isnull=True).count() == 2 + + +def test_bulk_index_reports_chains_into_embed_reports_task(): + """When PGSEARCH_SYNC_INDEXING=False, the deferred FTS task must enqueue + the embedding task at the end so embedding always follows FTS.""" + reports = [ReportFactory.create() for _ in range(3)] + pks = [r.pk for r in reports] + + with patch("radis.pgsearch.tasks.embed_reports_task") as task: + bulk_index_reports(report_ids=pks) + + task.defer.assert_called_once() + assert 
sorted(task.defer.call_args.kwargs["report_ids"]) == sorted(pks) diff --git a/radis/pgsearch/tests/test_embedding_launcher.py b/radis/pgsearch/tests/test_embedding_launcher.py deleted file mode 100644 index e2725ece..00000000 --- a/radis/pgsearch/tests/test_embedding_launcher.py +++ /dev/null @@ -1,50 +0,0 @@ -from unittest.mock import patch - -import pytest -from django.contrib.auth import get_user_model - -from radis.pgsearch.models import EmbeddingJob -from radis.pgsearch.tasks import embedding_launcher as _wrapped -from radis.reports.factories import ReportFactory - -User = get_user_model() -embedding_launcher = _wrapped.__wrapped__ # type: ignore[attr-defined] -pytestmark = pytest.mark.django_db - - -def test_embedding_launcher_noop_when_job_in_flight(): - owner = User.objects.get(username="system") - EmbeddingJob.objects.create(owner=owner, status=EmbeddingJob.Status.PREPARING) - # Make a pending report so the second guard wouldn't short-circuit on its own. - ReportFactory.create() - - with patch("radis.pgsearch.models.EmbeddingJob.delay") as delay_mock: - embedding_launcher(context=None, timestamp=0) - - assert delay_mock.call_count == 0 - # No new job created. 
- assert EmbeddingJob.objects.count() == 1 - - -def test_embedding_launcher_noop_when_no_pending_rows(): - with patch("radis.pgsearch.models.EmbeddingJob.delay") as delay_mock: - embedding_launcher(context=None, timestamp=0) - - assert delay_mock.call_count == 0 - assert EmbeddingJob.objects.count() == 0 - - -def test_embedding_launcher_happy_path_creates_job_and_defers( - django_capture_on_commit_callbacks, -): - ReportFactory.create() - - with patch("radis.pgsearch.models.EmbeddingJob.delay") as delay_mock: - with django_capture_on_commit_callbacks(execute=True): - embedding_launcher(context=None, timestamp=0) - - assert EmbeddingJob.objects.count() == 1 - job = EmbeddingJob.objects.get() - assert job.status == EmbeddingJob.Status.PREPARING - assert job.owner.username == "system" - delay_mock.assert_called_once() diff --git a/radis/pgsearch/tests/test_migrations_system_user.py b/radis/pgsearch/tests/test_migrations_system_user.py deleted file mode 100644 index ca277361..00000000 --- a/radis/pgsearch/tests/test_migrations_system_user.py +++ /dev/null @@ -1,11 +0,0 @@ -import pytest -from django.contrib.auth import get_user_model - -User = get_user_model() - - -@pytest.mark.django_db -def test_system_user_exists_after_migrations(): - user = User.objects.get(username="system") - assert user.is_active is False - assert not user.has_usable_password() diff --git a/radis/pgsearch/tests/test_models_embedding.py b/radis/pgsearch/tests/test_models_embedding.py deleted file mode 100644 index 7cc4b9b9..00000000 --- a/radis/pgsearch/tests/test_models_embedding.py +++ /dev/null @@ -1,31 +0,0 @@ -import pytest -from django.contrib.auth import get_user_model - -from radis.pgsearch.models import EmbeddingJob, EmbeddingTask -from radis.reports.factories import ReportFactory - -User = get_user_model() -pytestmark = pytest.mark.django_db - - -def _system_user() -> "User": - return User.objects.get(username="system") - - -def test_embedding_job_defaults(): - job = 
EmbeddingJob.objects.create(owner=_system_user()) - assert job.status == EmbeddingJob.Status.UNVERIFIED - assert job.urgent is False - assert job.send_finished_mail is False - assert job.queued_job_id is None - - -def test_embedding_task_links_to_reports(): - job = EmbeddingJob.objects.create(owner=_system_user()) - reports = [ReportFactory.create() for _ in range(3)] - task = EmbeddingTask.objects.create(job=job) - task.reports.set(reports) - assert task.status == EmbeddingTask.Status.PENDING - assert set(task.reports.values_list("pk", flat=True)) == {r.pk for r in reports} - assert task.attempts == 0 - assert task.queued_job_id is None diff --git a/radis/pgsearch/tests/test_process_embedding_job.py b/radis/pgsearch/tests/test_process_embedding_job.py deleted file mode 100644 index 463abf09..00000000 --- a/radis/pgsearch/tests/test_process_embedding_job.py +++ /dev/null @@ -1,78 +0,0 @@ -from unittest.mock import patch - -import pytest -from django.contrib.auth import get_user_model - -from radis.pgsearch.models import EmbeddingJob, EmbeddingTask -from radis.pgsearch.tasks import process_embedding_job as _wrapped -from radis.reports.factories import ReportFactory - -User = get_user_model() -process_embedding_job = _wrapped.__wrapped__ # type: ignore[attr-defined] -pytestmark = pytest.mark.django_db - - -def _new_job() -> EmbeddingJob: - owner = User.objects.get(username="system") - return EmbeddingJob.objects.create(owner=owner, status=EmbeddingJob.Status.PREPARING) - - -def _make_pending_reports(n: int): - reports = [ReportFactory.create() for _ in range(n)] - # ReportFactory triggers the FTS post_save signal which creates ReportSearchVector - # rows with embedding=NULL; that's exactly the pending state we want. 
- return reports - - -def test_process_embedding_job_batches_pending_reports(settings): - settings.EMBEDDING_BATCH_SIZE = 2 - job = _new_job() - reports = _make_pending_reports(5) - - with patch("radis.pgsearch.models.EmbeddingTask.delay") as delay_mock: - process_embedding_job(job.pk) - - job.refresh_from_db() - assert job.status == EmbeddingJob.Status.PENDING - # ceil(5 / 2) = 3 tasks - assert job.tasks.count() == 3 - # All tasks are dispatched - assert delay_mock.call_count == 3 - # Every pending report is in exactly one task - covered = set() - for task in job.tasks.all(): - covered.update(task.reports.values_list("pk", flat=True)) - assert covered == {r.pk for r in reports} - - -def test_process_embedding_job_resume_path_only_redispatches_pending_tasks(settings): - settings.EMBEDDING_BATCH_SIZE = 2 - job = _new_job() - reports = _make_pending_reports(2) - # Simulate a previous orchestrator run that created one task already. - existing = EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.PENDING) - existing.reports.set(reports) - EmbeddingTask.objects.create(job=job, status=EmbeddingTask.Status.SUCCESS) - - with patch("radis.pgsearch.models.EmbeddingTask.delay") as delay_mock: - process_embedding_job(job.pk) - - job.refresh_from_db() - assert job.status == EmbeddingJob.Status.PENDING - # No new tasks created - assert job.tasks.count() == 2 - # Only the pending one is dispatched - assert delay_mock.call_count == 1 - - -def test_process_embedding_job_with_no_pending_rows(): - job = _new_job() - # No reports exist → no ReportSearchVector rows with embedding IS NULL. 
- - with patch("radis.pgsearch.models.EmbeddingTask.delay") as delay_mock: - process_embedding_job(job.pk) - - job.refresh_from_db() - assert job.status == EmbeddingJob.Status.PENDING - assert job.tasks.count() == 0 - assert delay_mock.call_count == 0 diff --git a/radis/pgsearch/tests/test_process_embedding_task.py b/radis/pgsearch/tests/test_process_embedding_task.py deleted file mode 100644 index 58e38664..00000000 --- a/radis/pgsearch/tests/test_process_embedding_task.py +++ /dev/null @@ -1,71 +0,0 @@ -from unittest.mock import MagicMock, patch - -import numpy as np -import pytest -from django.contrib.auth import get_user_model - -from radis.pgsearch.models import EmbeddingJob, EmbeddingTask, ReportSearchVector -from radis.pgsearch.tasks import process_embedding_task as _wrapped -from radis.pgsearch.utils.embedding_client import EmbeddingClientError -from radis.reports.factories import ReportFactory - -User = get_user_model() -process_embedding_task = _wrapped.__wrapped__ # type: ignore[attr-defined] -pytestmark = pytest.mark.django_db - - -def _make_task() -> EmbeddingTask: - owner = User.objects.get(username="system") - job = EmbeddingJob.objects.create(owner=owner) - task = EmbeddingTask.objects.create(job=job) - reports = [ReportFactory.create() for _ in range(2)] - task.reports.set(reports) - return task - - -def _unit_vec(dim: int) -> list[float]: - v = np.ones(dim, dtype=np.float32) - return (v / np.linalg.norm(v)).tolist() - - -def test_process_embedding_task_writes_vectors_and_marks_success(settings): - task = _make_task() - vec = _unit_vec(settings.EMBEDDING_DIM) - fake_client = MagicMock() - fake_client.embed_documents.return_value = [vec, vec] - with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake_client): - process_embedding_task(task.pk) - - task.refresh_from_db() - assert task.status == EmbeddingTask.Status.SUCCESS - assert task.queued_job_id is None - for report in task.reports.all(): - rsv = 
ReportSearchVector.objects.get(report=report) - assert rsv.embedding is not None - - -def test_process_embedding_task_failure_sets_status_and_raises(): - task = _make_task() - fake_client = MagicMock() - fake_client.embed_documents.side_effect = EmbeddingClientError("boom") - with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake_client): - with pytest.raises(EmbeddingClientError): - process_embedding_task(task.pk) - - task.refresh_from_db() - assert task.status == EmbeddingTask.Status.FAILURE - assert task.queued_job_id is None - assert "boom" in task.message - - -def test_process_embedding_task_calls_update_job_state(settings): - task = _make_task() - vec = _unit_vec(settings.EMBEDDING_DIM) - fake_client = MagicMock() - fake_client.embed_documents.return_value = [vec, vec] - with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake_client): - process_embedding_task(task.pk) - - task.job.refresh_from_db() - # All tasks succeeded; AnalysisJob.update_job_state rolls up to SUCCESS. 
- assert task.job.status == EmbeddingJob.Status.SUCCESS diff --git a/radis/pgsearch/utils/embedding_client.py b/radis/pgsearch/utils/embedding_client.py index 4ff39fd6..dbccabf5 100644 --- a/radis/pgsearch/utils/embedding_client.py +++ b/radis/pgsearch/utils/embedding_client.py @@ -2,6 +2,7 @@ import logging import math +from dataclasses import dataclass from typing import Iterable, Protocol import httpx @@ -63,6 +64,11 @@ def _build_http_client() -> httpx.Client: return httpx.Client(timeout=settings.EMBEDDING_REQUEST_TIMEOUT) +def _build_async_http_client() -> httpx.AsyncClient: + """Indirection so tests can swap in a MockTransport.""" + return httpx.AsyncClient(timeout=settings.EMBEDDING_REQUEST_TIMEOUT) + + def _l2_normalize(vec: list[float]) -> list[float]: norm = math.sqrt(sum(x * x for x in vec)) if norm == 0.0: @@ -83,40 +89,86 @@ def _truncate(texts: Iterable[str], max_chars: int) -> list[str]: return out -class EmbeddingClient: - def __init__(self) -> None: - try: - self._backend = BACKENDS[settings.EMBEDDING_BACKEND] - except KeyError as e: - raise EmbeddingClientError( - f"Unknown EMBEDDING_BACKEND={settings.EMBEDDING_BACKEND!r}; " - f"known: {sorted(BACKENDS)}" - ) from e - path = settings.EMBEDDING_PROVIDER_PATH or self._backend.path - if not path.startswith("/"): +@dataclass(frozen=True) +class _ResolvedConfig: + backend: EmbeddingBackend + url: str + model: str + dim: int + max_chars: int + instruction: str + headers: dict[str, str] + + +def _resolve_config() -> _ResolvedConfig: + """Read+validate Django settings once; raise EmbeddingClientError on misconfig.""" + try: + backend = BACKENDS[settings.EMBEDDING_BACKEND] + except KeyError as e: + raise EmbeddingClientError( + f"Unknown EMBEDDING_BACKEND={settings.EMBEDDING_BACKEND!r}; " + f"known: {sorted(BACKENDS)}" + ) from e + path = settings.EMBEDDING_PROVIDER_PATH or backend.path + if not path.startswith("/"): + raise EmbeddingClientError( + f"EMBEDDING_PROVIDER_PATH must start with '/'; got 
{path!r}" + ) + base = settings.EMBEDDING_PROVIDER_URL.rstrip("/") + if not base: + raise EmbeddingClientError("EMBEDDING_PROVIDER_URL is not configured") + headers: dict[str, str] = {} + if settings.EMBEDDING_PROVIDER_API_KEY: + headers["Authorization"] = f"Bearer {settings.EMBEDDING_PROVIDER_API_KEY}" + return _ResolvedConfig( + backend=backend, + url=f"{base}{path}", + model=settings.EMBEDDING_MODEL_NAME, + dim=settings.EMBEDDING_DIM, + max_chars=settings.EMBEDDING_MAX_INPUT_CHARS, + instruction=settings.EMBEDDING_QUERY_INSTRUCTION, + headers=headers, + ) + + +def _normalize_response( + raw: list[list[float]], expected_count: int, target_dim: int +) -> list[list[float]]: + if len(raw) != expected_count: + raise EmbeddingClientError( + f"Embedding count mismatch: requested {expected_count}, " + f"backend returned {len(raw)}" + ) + normalized: list[list[float]] = [] + for vec in raw: + if len(vec) < target_dim: raise EmbeddingClientError( - f"EMBEDDING_PROVIDER_PATH must start with '/'; got {path!r}" + f"Embedding dim too small: got {len(vec)}, expected at least {target_dim}" ) - base = settings.EMBEDDING_PROVIDER_URL.rstrip("/") - if not base: - raise EmbeddingClientError("EMBEDDING_PROVIDER_URL is not configured") - self._url = f"{base}{path}" - self._model = settings.EMBEDDING_MODEL_NAME - self._dim = settings.EMBEDDING_DIM - self._max_chars = settings.EMBEDDING_MAX_INPUT_CHARS - self._instruction = settings.EMBEDDING_QUERY_INSTRUCTION - self._headers: dict[str, str] = {} - if settings.EMBEDDING_PROVIDER_API_KEY: - self._headers["Authorization"] = f"Bearer {settings.EMBEDDING_PROVIDER_API_KEY}" + if len(vec) > target_dim: + # Matryoshka truncation: keep first EMBEDDING_DIM components, then re-normalize. + # Qwen3-Embedding is trained to retain quality at truncated dimensions. + normalized.append(_l2_normalize(list(vec[:target_dim]))) + else: + # Length already matches; still normalize since we can't assume + # all providers return unit vectors. 
+ normalized.append(_l2_normalize(list(vec))) + return normalized + + +class EmbeddingClient: + def __init__(self) -> None: + cfg = _resolve_config() + self._cfg = cfg self._http = _build_http_client() def embed_documents(self, texts: list[str]) -> list[list[float]]: - truncated_texts = _truncate(texts, self._max_chars) - payload = self._backend.build_payload(self._model, truncated_texts) + truncated_texts = _truncate(texts, self._cfg.max_chars) + payload = self._cfg.backend.build_payload(self._cfg.model, truncated_texts) try: - response = self._http.post(self._url, json=payload, headers=self._headers) + response = self._http.post(self._cfg.url, json=payload, headers=self._cfg.headers) except httpx.HTTPError as e: - raise EmbeddingClientError(f"HTTP error contacting {self._url}: {e}") from e + raise EmbeddingClientError(f"HTTP error contacting {self._cfg.url}: {e}") from e if response.status_code >= 400: raise EmbeddingClientError( f"Embedding service returned {response.status_code}: {response.text[:200]}" @@ -125,30 +177,11 @@ def embed_documents(self, texts: list[str]) -> list[list[float]]: body = response.json() except ValueError as e: raise EmbeddingClientError(f"Embedding response is not JSON: {e}") from e - raw = self._backend.parse_response(body) - if len(raw) != len(truncated_texts): - raise EmbeddingClientError( - f"Embedding count mismatch: requested {len(truncated_texts)}, " - f"backend returned {len(raw)}" - ) - normalized: list[list[float]] = [] - for vec in raw: - if len(vec) < self._dim: - raise EmbeddingClientError( - f"Embedding dim too small: got {len(vec)}, expected at least {self._dim}" - ) - if len(vec) > self._dim: - # Matryoshka truncation: keep first EMBEDDING_DIM components, then re-normalize. - # Qwen3-Embedding is trained to retain quality at truncated dimensions. - normalized.append(_l2_normalize(list(vec[: self._dim]))) - else: - # Length already matches; still normalize since we can't assume - # all providers return unit vectors. 
- normalized.append(_l2_normalize(list(vec))) - return normalized + raw = self._cfg.backend.parse_response(body) + return _normalize_response(raw, len(truncated_texts), self._cfg.dim) def embed_query(self, text: str) -> list[float]: - prefixed = f"{self._instruction}{text}" if self._instruction else text + prefixed = f"{self._cfg.instruction}{text}" if self._cfg.instruction else text vectors = self.embed_documents([prefixed]) if not vectors: raise EmbeddingClientError("Embedding service returned no vectors for query") @@ -162,3 +195,53 @@ def __enter__(self) -> "EmbeddingClient": def __exit__(self, exc_type, exc_val, exc_tb) -> None: self.close() + + +class AsyncEmbeddingClient: + """Async sibling of `EmbeddingClient` for ADRF view paths. + + Same backend protocol, same config, same response handling. Differs only + in using `httpx.AsyncClient` and exposing `await`-able methods + an async + context-manager lifecycle (`async with AsyncEmbeddingClient() as c:`). + """ + + def __init__(self) -> None: + cfg = _resolve_config() + self._cfg = cfg + self._http = _build_async_http_client() + + async def embed_documents(self, texts: list[str]) -> list[list[float]]: + truncated_texts = _truncate(texts, self._cfg.max_chars) + payload = self._cfg.backend.build_payload(self._cfg.model, truncated_texts) + try: + response = await self._http.post( + self._cfg.url, json=payload, headers=self._cfg.headers + ) + except httpx.HTTPError as e: + raise EmbeddingClientError(f"HTTP error contacting {self._cfg.url}: {e}") from e + if response.status_code >= 400: + raise EmbeddingClientError( + f"Embedding service returned {response.status_code}: {response.text[:200]}" + ) + try: + body = response.json() + except ValueError as e: + raise EmbeddingClientError(f"Embedding response is not JSON: {e}") from e + raw = self._cfg.backend.parse_response(body) + return _normalize_response(raw, len(truncated_texts), self._cfg.dim) + + async def embed_query(self, text: str) -> list[float]: + prefixed = 
f"{self._cfg.instruction}{text}" if self._cfg.instruction else text + vectors = await self.embed_documents([prefixed]) + if not vectors: + raise EmbeddingClientError("Embedding service returned no vectors for query") + return vectors[0] + + async def aclose(self) -> None: + await self._http.aclose() + + async def __aenter__(self) -> "AsyncEmbeddingClient": + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: + await self.aclose() diff --git a/radis/reports/api/viewsets.py b/radis/reports/api/viewsets.py index bb684b15..0101f48a 100644 --- a/radis/reports/api/viewsets.py +++ b/radis/reports/api/viewsets.py @@ -13,7 +13,7 @@ from rest_framework.response import Response from rest_framework.serializers import BaseSerializer -from radis.pgsearch.tasks import enqueue_bulk_index_reports +from radis.pgsearch.tasks import embed_reports_task, enqueue_bulk_index_reports from radis.pgsearch.utils.indexing import bulk_upsert_report_search_vectors from ..models import Language, Metadata, Modality, Report @@ -259,7 +259,11 @@ def on_commit(): if touched_report_ids: if settings.PGSEARCH_SYNC_INDEXING: bulk_upsert_report_search_vectors(touched_report_ids) + embed_reports_task.defer(report_ids=touched_report_ids) else: + # bulk_index_reports chains into embed_reports_task at the + # end of its run, so embedding always follows FTS regardless + # of which mode is active. 
enqueue_bulk_index_reports(touched_report_ids) transaction.on_commit(on_commit) @@ -322,6 +326,7 @@ def on_commit(): document_ids = [report.document_id for report in reports] logger.debug(f"{handler.name} - handle newly created reports: {document_ids}") handler.handle(reports) + embed_reports_task.defer(report_ids=[report.pk for report in reports]) transaction.on_commit(on_commit) @@ -429,6 +434,7 @@ def on_commit(): document_ids = [report.document_id for report in reports] logger.debug(f"{handler.name} - handle updated reports: {document_ids}") handler.handle(reports) + embed_reports_task.defer(report_ids=[report.pk for report in reports]) transaction.on_commit(on_commit) diff --git a/radis/settings/base.py b/radis/settings/base.py index 54333076..db4e076c 100644 --- a/radis/settings/base.py +++ b/radis/settings/base.py @@ -347,7 +347,6 @@ EMBEDDING_DIM = env.int("EMBEDDING_DIM", default=1024) # Embedding tuning constants (see hybrid-search spec §8.2) -EMBEDDING_DRAIN_CRON = "0 2 * * *" EMBEDDING_REQUEST_TIMEOUT = 30 EMBEDDING_MAX_INPUT_CHARS = 60_000 EMBEDDING_QUERY_INSTRUCTION = ( @@ -356,10 +355,6 @@ ) EMBEDDING_BATCH_SIZE = 32 -# Embedding queue priorities (procrastinate "higher = sooner") -EMBEDDING_INDEX_PRIORITY = 0 -EMBEDDING_SYSTEM_USERNAME = "system" - # Hybrid search tuning HYBRID_VECTOR_TOP_K = 100 HYBRID_FTS_MAX_RESULTS = 10_000 diff --git a/radis/settings/test.py b/radis/settings/test.py index 1c084d46..697d21e5 100644 --- a/radis/settings/test.py +++ b/radis/settings/test.py @@ -10,3 +10,10 @@ DATABASES["default"]["TEST"] = {"NAME": test_database} # noqa: F405 DEBUG_TOOLBAR_CONFIG = {"SHOW_TOOLBAR_CALLBACK": lambda request: False} + +# Tests must not hit a live embedding service. Embedding work is deferred via +# a Procrastinate task; tests do not run a worker by default. Blanking the URL +# means any incidental construction of EmbeddingClient/AsyncEmbeddingClient +# fast-fails into EmbeddingClientError rather than touching the network. 
Tests +# that exercise the embedding path explicitly patch the client. +EMBEDDING_PROVIDER_URL = "" From fa72b59b82af56927dc013f337ebc9ee4efe9f85 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Sun, 21 Jun 2026 00:48:49 +0000 Subject: [PATCH 63/68] refactor(pgsearch): subscribe to reports_*_handlers; viewsets.py loses pgsearch knowledge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pgsearch now hooks into report changes via the reports_created_handlers / reports_updated_handlers registry (its docstring explicitly invites this use). The view layer drops every pgsearch import and the entire touched_report_ids block — viewsets.py is back to its main shape, just dispatching the registry. - apps.py: register `_handle_reports_changed` on both registries during register_app(). The handler runs FTS (sync inline or via enqueue_bulk_index_reports per PGSEARCH_SYNC_INDEXING) and defers embed_reports_task. - tasks.py: embed_reports_task now defensively calls bulk_upsert_report_search_vectors at the top so it can win the race against deferred bulk_index_reports and still find usable RSV rows. The chain inside bulk_index_reports is gone — that task is purely FTS again. - viewsets.py: drop imports of embed_reports_task / enqueue_bulk_index_reports / bulk_upsert_report_search_vectors / settings.PGSEARCH_SYNC_INDEXING. Remove embed_reports_task.defer calls from perform_create / perform_update / _bulk_upsert_reports. Delete touched_report_ids and its conditional block. - tests: replace the bulk_index_reports → embed chain test with test_ensures_rsv_rows_exist_before_embedding which exercises the new safety net. - spec §3 / §6.1 / §6.6: rewritten to describe the handler-based wiring and the defensive-FTS approach to FTS↔embed ordering. The architecture diagram now shows the registry dispatch step and the safety net inside embed_reports_task. 
Side effects: the admin's save_model already fires the handlers, so admin-driven edits now also produce embeddings. Shell-only Report.save() edits still bypass the handler — operators can drain those via embed_pending, same as before. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../specs/2026-05-28-hybrid-search.md | 121 ++++++++++-------- radis/pgsearch/apps.py | 38 ++++++ radis/pgsearch/tasks.py | 17 ++- .../pgsearch/tests/test_embed_reports_task.py | 32 +++-- radis/reports/api/viewsets.py | 21 --- 5 files changed, 138 insertions(+), 91 deletions(-) diff --git a/docs/superpowers/specs/2026-05-28-hybrid-search.md b/docs/superpowers/specs/2026-05-28-hybrid-search.md index 490ee8d0..2d2b25f9 100644 --- a/docs/superpowers/specs/2026-05-28-hybrid-search.md +++ b/docs/superpowers/specs/2026-05-28-hybrid-search.md @@ -62,29 +62,35 @@ The public `SearchProvider` API (`radis.search.site`) is unchanged. Callers — └──────────────────────────────────────────────────────────────────────┘ ┌──────────────────────────────────────────────────────────────────────┐ -│ Async indexing path (deferred via Procrastinate) │ +│ Async indexing path (handler-registry → deferred via Procrastinate)│ │ │ │ Report view (single-create / PUT / bulk-upsert) │ │ │ │ │ ▼ transaction.atomic() block │ │ ReportSerializer / bulk_upsert_reports │ │ ├─ DB write (Report rows) │ -│ ├─ FTS path creates ReportSearchVector(embedding=NULL): │ -│ │ post_save signal (single) or │ -│ │ sync `bulk_upsert_report_search_vectors` (bulk, when │ -│ │ PGSEARCH_SYNC_INDEXING=True) or │ -│ │ deferred `bulk_index_reports` (bulk, default; chains into │ -│ │ embed_reports_task at its end — see §6.6) │ -│ └─ transaction.on_commit registers: │ -│ embed_reports_task.defer(report_ids=touched_pks) │ -│ (sync FTS paths only — the deferred FTS task chains the │ -│ embed enqueue itself; see §6.6) │ +│ └─ transaction.on_commit: │ +│ dispatches reports_created_handlers / reports_updated_ │ +│ handlers (radis.reports.site registry) 
with the touched │ +│ Report instances │ +│ │ │ +│ ▼ (one of the registered subscribers is pgsearch:) │ +│ pgsearch._handle_reports_changed(reports) │ +│ ├─ FTS step (mode-dependent): │ +│ │ PGSEARCH_SYNC_INDEXING=True → bulk_upsert_report_search_ │ +│ │ vectors(report_ids) inline │ +│ │ PGSEARCH_SYNC_INDEXING=False → enqueue_bulk_index_reports │ +│ │ (deferred to the `default` queue) │ +│ └─ embed_reports_task.defer(report_ids=...) │ │ │ │ │ ▼ HTTP response returned (201 / 200) immediately │ │ │ │ ──── elsewhere, on the embeddings_worker process ──── │ │ │ │ embed_reports_task(report_ids) (async task, embeddings queue) │ +│ ├─ defensive: bulk_upsert_report_search_vectors(report_ids) │ +│ │ (idempotent; ensures RSV rows + tsvector exist when this │ +│ │ task wins the race against deferred bulk_index_reports) │ │ ├─ load RSVs (database_sync_to_async) │ │ ├─ await AsyncEmbeddingClient.embed_documents([body, ...]) │ │ ├─ L2-normalize; ReportSearchVector.objects.bulk_update │ @@ -93,11 +99,14 @@ The public `SearchProvider` API (`radis.search.site`) is unchanged. Callers — └──────────────────────────────────────────────────────────────────────┘ ``` -Both ingest paths — single-create (`POST /api/reports/`, `PUT /api/reports/{id}/?upsert=true`) and bulk-upsert (`POST /api/reports/bulk-upsert/`) — enqueue an async Procrastinate task on the dedicated `embeddings` queue. The write path returns immediately after the transaction commits; the embedding service is touched only by the worker. This: +`radis.reports` already exposes a handler registry (`reports_created_handlers` / `reports_updated_handlers` in `radis.reports.site`) whose docstring is explicit about its purpose: *"The handler can be used to index those reports in an external search database."* Pgsearch registers `_handle_reports_changed` on both. The view layer never imports anything from `pgsearch`; it only dispatches the registry. 
+ +Both ingest paths — single-create (`POST /api/reports/`, `PUT /api/reports/{id}/?upsert=true`) and bulk-upsert (`POST /api/reports/bulk-upsert/`) — flow through the same handler, which defers an async Procrastinate task on the dedicated `embeddings` queue. The write path returns immediately after the transaction commits; the embedding service is touched only by the worker. This: - **Decouples write-path uptime from the embedding service.** API responses succeed even when the embedding endpoint is down or slow. - **Bounds concurrent load on the embedding service** via the worker's `--concurrency K` — explicit, configurable backpressure rather than implicit request-driven concurrency. - **Auto-recovers from transient outages** via Procrastinate's retry policy with exponential backoff. +- **Inverts the dependency** so `radis.reports` stays unaware of search/indexing concerns; adding or swapping a search provider is a registration call, not a view edit. - **Symmetric across single-create and bulk-upsert** — one enqueue site, one task, one worker. **Components added inside `radis.pgsearch`:** @@ -105,11 +114,12 @@ Both ingest paths — single-create (`POST /api/reports/`, `PUT /api/reports/{id | File | Purpose | |---|---| | `utils/embedding_client.py` | `EmbeddingClient` (sync, used by the query path) + `AsyncEmbeddingClient` (async, used by `embed_reports_task` on the worker); pluggable backends (`openai`, `ollama`) | -| `tasks.py` (embedding entries) | `embed_reports_task(report_ids)` async Procrastinate task on the `embeddings` queue. Looks up RSVs via `database_sync_to_async`, calls `AsyncEmbeddingClient.embed_documents`, bulk-updates the column. Raises on `EmbeddingClientError` so the Procrastinate retry policy applies. | +| `apps.py` (modified) | `register_app()` now also registers `_handle_reports_changed` on both `reports_created_handlers` and `reports_updated_handlers`. 
The handler runs FTS (sync or deferred per `PGSEARCH_SYNC_INDEXING`) + defers `embed_reports_task` for the touched reports. This is the only place pgsearch wires itself into the reports app. | +| `tasks.py` (embedding entries) | `embed_reports_task(report_ids)` async Procrastinate task on the `embeddings` queue. Defensively calls `bulk_upsert_report_search_vectors` first to ensure RSV rows exist (covers the race with deferred FTS, plus shell/admin edits), then `AsyncEmbeddingClient.embed_documents`, then `bulk_update`. Raises on `EmbeddingClientError` so the Procrastinate retry policy applies. | | `migrations/0002_hybrid_search.py` | Single schema migration: `CREATE EXTENSION vector`; adds `embedding vector(N)` column + HNSW index | | `models.py` (modified) | Adds `embedding` field + `HnswIndex` to `ReportSearchVector`. No Job/Task models. | | `signals.py` (unchanged from FTS-only) | The FTS `create_or_update_report_search_vector` receiver stays; **no embedding signal** | -| `tasks.py` (FTS bits) | FTS bulk-indexing helper `bulk_upsert_report_search_vectors`. The existing `bulk_index_reports` Procrastinate task and `enqueue_bulk_index_reports` helper are retained; `bulk_index_reports` is extended to defer `embed_reports_task` at the end of its run so embedding always follows FTS in either mode (see §6.6). | +| `tasks.py` (FTS bits) | FTS bulk-indexing helper `bulk_upsert_report_search_vectors`. The existing `bulk_index_reports` Procrastinate task and `enqueue_bulk_index_reports` helper are retained unchanged from pre-hybrid-search — they remain pure FTS. The handler defers `embed_reports_task` independently; ordering between FTS and embedding is guaranteed by the defensive `bulk_upsert_report_search_vectors` call at the top of `embed_reports_task`, not by chaining (see §6.6). 
| | `providers.py` (modified) | Replaces `search()` and `retrieve()` bodies with hybrid logic | | `tests/...` | Coverage per §10 | @@ -121,7 +131,7 @@ Both ingest paths — single-create (`POST /api/reports/`, `PUT /api/reports/{id | `radis/settings/base.py` | New env-driven + constant settings (§8) | | `radis/settings/test.py` | Override `EMBEDDING_PROVIDER_URL=""` so any incidental construction of `EmbeddingClient` / `AsyncEmbeddingClient` fast-fails into `EmbeddingClientError` in CI (no live embedding service). Tests that exercise embedding patch the client explicitly. | | `example.env` | Document `EMBEDDING_*` env vars for openai and ollama backends | -| `radis/reports/api/viewsets.py` | `ReportViewSet.perform_create` / `perform_update` / `bulk_upsert` register `embed_reports_task.defer(report_ids=...)` inside their `transaction.on_commit` callbacks. Sync DRF; no async machinery needed because the enqueue is a synchronous Procrastinate API call. | +| `radis/reports/api/viewsets.py` | **Unchanged from main** in shape. It already dispatches `reports_created_handlers` / `reports_updated_handlers` from `on_commit`; pgsearch hooks in via that registry. Nothing in `viewsets.py` imports from `radis.pgsearch`. | ## 4. Schema and migrations @@ -341,35 +351,36 @@ Every successful report write enqueues an async Procrastinate task that embeds t ### 6.1 The enqueue at write time -Both ingest paths register a `transaction.on_commit` callback that defers an `embed_reports_task` once the FTS rows exist: +`viewsets.py` is unchanged from main — it already dispatches `reports_created_handlers` / `reports_updated_handlers` inside `transaction.on_commit`. Pgsearch subscribes to those at app startup: ```python -# single-create (POST) / PUT — inside the view's on_commit -def on_commit(): - # ... existing reports_created_handlers / reports_updated_handlers calls ... 
- embed_reports_task.defer(report_ids=[report.pk]) -``` - -The single-create / PUT path always has FTS done by the time `on_commit` fires because the FTS `post_save` signal on `Report` runs sync inline during `serializer.save()`. The bulk-upsert path keeps its existing two FTS modes governed by `PGSEARCH_SYNC_INDEXING`; both modes chain into `embed_reports_task` (see §6.6): +# radis/pgsearch/apps.py — inside register_app() -```python -# bulk-upsert — inside bulk_upsert_reports' on_commit -def on_commit(): - # ... existing reports_created_handlers / reports_updated_handlers calls ... - if touched_report_ids: - if settings.PGSEARCH_SYNC_INDEXING: - bulk_upsert_report_search_vectors(touched_report_ids) # FTS sync - embed_reports_task.defer(report_ids=touched_report_ids) - else: - # bulk_index_reports chains embed_reports_task at its end (see §6.6) - enqueue_bulk_index_reports(touched_report_ids) +def _handle_reports_changed(reports): + if not reports: + return + report_ids = [r.pk for r in reports] + if settings.PGSEARCH_SYNC_INDEXING: + bulk_upsert_report_search_vectors(report_ids) + else: + enqueue_bulk_index_reports(report_ids) + embed_reports_task.defer(report_ids=report_ids) + +register_reports_created_handler( + ReportsCreatedHandler(name="PG Search", handle=_handle_reports_changed) +) +register_reports_updated_handler( + ReportsUpdatedHandler(name="PG Search", handle=_handle_reports_changed) +) ``` +The view contributes nothing pgsearch-specific. Whatever fires `reports_created_handlers` / `reports_updated_handlers` (the API viewsets, the Django admin's `save_model`, any future caller) automatically gets FTS + embedding for free. + When the `transaction.atomic()` block commits: 1. Report rows are durable. -2. RSV rows exist (or will exist once `bulk_index_reports` runs, in the deferred FTS mode). -3. 
A row is inserted into `procrastinate_jobs` describing the embedding work (immediately in the sync FTS mode, or at the tail of `bulk_index_reports` in the deferred mode). +2. RSV rows exist (or will exist once `bulk_index_reports` runs, in the deferred FTS mode — see §6.6). +3. A row is inserted into `procrastinate_jobs` describing the embedding work. The HTTP response returns at that point. The view handler does **not** await embedding. @@ -390,6 +401,13 @@ async def embed_reports_task(report_ids: list[int]) -> None: if not report_ids: return + # Defensive: ensure RSV rows exist with up-to-date tsvectors. Covers + # the race against deferred `bulk_index_reports` (PGSEARCH_SYNC_INDEXING + # =False) and the shell/admin path that may have bypassed the bulk + # indexer entirely. Idempotent — no-op when the row + tsvector already + # match. + await database_sync_to_async(bulk_upsert_report_search_vectors)(report_ids) + @database_sync_to_async def _load_rsvs() -> list[ReportSearchVector]: return list( @@ -496,33 +514,30 @@ Properties: - **Rate-limited.** The worker's `--concurrency K` caps concurrent embedding HTTP calls regardless of how many tasks the command enqueues. Operators cannot accidentally hammer the embedding service. - **Visible.** Enqueued tasks appear in the standard Procrastinate observability surface (admin, logs, telemetry). Failed retries surface there as well. -### 6.6 `PGSEARCH_SYNC_INDEXING` retained; FTS chains to embedding +### 6.6 `PGSEARCH_SYNC_INDEXING` retained; ordering enforced by defensive FTS in the embed task -The pre-existing `PGSEARCH_SYNC_INDEXING` switch is **retained**, unchanged in semantics: it controls *how* FTS bulk-indexing happens on the bulk-upsert path. The hybrid-search work adds one new property — both FTS modes chain into `embed_reports_task` so embedding always follows FTS by construction. 
+The pre-existing `PGSEARCH_SYNC_INDEXING` switch is **retained** with the same semantics it had before hybrid search: it controls whether FTS bulk-indexing runs inline on the request thread or is deferred to a `bulk_index_reports` Procrastinate task. Pgsearch's `_handle_reports_changed` reads the flag and dispatches accordingly: | Mode | `PGSEARCH_SYNC_INDEXING` | FTS step | Embedding step | |---|---|---|---| -| Sync | `True` | `bulk_upsert_report_search_vectors(ids)` runs inline inside `on_commit` | `embed_reports_task.defer(report_ids=ids)` immediately follows in the same `on_commit` | -| Deferred (default) | `False` | `enqueue_bulk_index_reports(ids)` defers the `bulk_index_reports` Procrastinate task | `bulk_index_reports` calls `embed_reports_task.defer(report_ids=ids)` at the end of its run | +| Sync | `True` | `bulk_upsert_report_search_vectors(ids)` inline inside the handler | `embed_reports_task.defer(...)` immediately after, in the same handler call | +| Deferred (default) | `False` | `enqueue_bulk_index_reports(ids)` defers `bulk_index_reports` to the `default` queue | `embed_reports_task.defer(...)` immediately after; ordering vs the deferred FTS task is unspecified (see below) | -The chain is enforced inside `bulk_index_reports`: +`bulk_index_reports` is **unchanged from pre-hybrid-search**: it's purely an FTS task. It does *not* chain into `embed_reports_task`. -```python -@app.task -def bulk_index_reports(report_ids: list[int]) -> None: - if not report_ids: - return - bulk_upsert_report_search_vectors(report_ids) - embed_reports_task.defer(report_ids=list(report_ids)) -``` +In the deferred FTS mode both Procrastinate jobs (the bulk-index task on `default` and the embed task on `embeddings`) are inserted in the same DB transaction. The two workers pick them up independently; either can win the race. 
The defensive `bulk_upsert_report_search_vectors` call at the top of `embed_reports_task` covers the case where the embed task wins — it idempotently produces the RSV rows it needs before reading `report.body`, costing one extra (no-op in the common case) tsvector recompute. This is the trade I picked over chaining the two tasks together: + +- **No coupling between tasks.** `bulk_index_reports` stays pure FTS; `embed_reports_task` stays self-sufficient. Either can be reused, tested, or replaced in isolation. +- **Same safety net protects shell/admin edits.** A Python-shell `report.body = x; report.save()` fires the FTS signal but no handler. If an operator manually `embed_reports_task.defer([pk])` after such an edit, the defensive call still ensures the RSV is current. With chaining the safety net only existed for the bulk path. +- **Cheap idempotent cost.** `bulk_upsert_report_search_vectors([pk])` for an already-indexed row is one INSERT ON CONFLICT DO NOTHING + one UPDATE that rewrites `search_vector = to_tsvector(...)` to the same value. ~1 ms per chunk. Properties: -- **No race between embedding and FTS.** Embedding is only enqueued after the RSV rows exist (either inline in the sync path or at the tail of `bulk_index_reports` in the deferred path). -- **Operator choice preserved.** Deployments that prefer sync FTS (small bulks, deterministic end-to-end ordering with subscription handlers) keep that option. Deployments that prefer deferred FTS (large bulks, fast HTTP response) keep that option. The hybrid-search work is orthogonal. -- **One queue per concern.** FTS deferral runs on the `default` queue (where `bulk_index_reports` already lived); embedding runs on the dedicated `embeddings` queue. FTS-only worker capacity does not compete with embedding capacity. +- **No correctness race.** Embedding either finds the RSV already indexed (common case) or creates+indexes it on the fly (defensive case). It never reads a NULL `body` or skips a missing RSV. 
+- **Operator choice preserved.** Deployments that prefer sync FTS keep that option; deployments that prefer the deferred FTS task for large bulks keep that option. Hybrid search is orthogonal to the FTS-mode decision. +- **Two queues, two concerns.** FTS deferral runs on the `default` queue (where `bulk_index_reports` already lived); embedding runs on the dedicated `embeddings` queue. FTS-only worker capacity does not compete with embedding capacity. -The single-create / PUT path is unaffected by `PGSEARCH_SYNC_INDEXING`: its FTS step is the `post_save` signal on `Report`, which is always sync inline by construction (not under the switch's control). +The single-create / PUT path is unaffected by `PGSEARCH_SYNC_INDEXING`. Its FTS step is the `post_save` signal on `Report`, which is always sync inline by construction. The same handler still fires for it; the handler's FTS call in sync mode is a redundant (idempotent, ~1 ms) recompute, and in async mode adds one Procrastinate job per single create. The redundancy is the cost of the clean abstraction — the handler doesn't know whether it was triggered by a single-create or a bulk write. ### 6.7 Sync DRF; no async views required diff --git a/radis/pgsearch/apps.py b/radis/pgsearch/apps.py index debd7c92..de056f01 100644 --- a/radis/pgsearch/apps.py +++ b/radis/pgsearch/apps.py @@ -71,6 +71,31 @@ def check_embedding_dim_matches_migration(app_configs, **kwargs): return [] +def _handle_reports_changed(reports): + """pgsearch's subscriber on reports_created_handlers / reports_updated_handlers. + + Owns both FTS indexing and embedding for the touched reports. The mode + flag `PGSEARCH_SYNC_INDEXING` controls whether FTS runs inline on the + request thread or is deferred to a Procrastinate task on the `default` + queue. 
Embedding is always deferred to the `embeddings` queue; the + embed task is itself defensive about RSV rows being absent (see + `embed_reports_task`), so it doesn't need to wait for the deferred FTS + task to finish. + """ + if not reports: + return + + from radis.pgsearch.tasks import embed_reports_task, enqueue_bulk_index_reports + from radis.pgsearch.utils.indexing import bulk_upsert_report_search_vectors + + report_ids = [report.pk for report in reports] + if settings.PGSEARCH_SYNC_INDEXING: + bulk_upsert_report_search_vectors(report_ids) + else: + enqueue_bulk_index_reports(report_ids) + embed_reports_task.defer(report_ids=report_ids) + + def register_app(): from django.conf import settings @@ -78,6 +103,12 @@ def register_app(): ExtractionRetrievalProvider, register_extraction_retrieval_provider, ) + from radis.reports.site import ( + ReportsCreatedHandler, + ReportsUpdatedHandler, + register_reports_created_handler, + register_reports_updated_handler, + ) from radis.search.site import SearchProvider, register_search_provider from radis.subscriptions.site import ( SubscriptionFilterProvider, @@ -88,6 +119,13 @@ def register_app(): from .providers import count, filter, retrieve, search + register_reports_created_handler( + ReportsCreatedHandler(name="PG Search", handle=_handle_reports_changed) + ) + register_reports_updated_handler( + ReportsUpdatedHandler(name="PG Search", handle=_handle_reports_changed) + ) + register_search_provider( SearchProvider( name="PG Search", diff --git a/radis/pgsearch/tasks.py b/radis/pgsearch/tasks.py index e66bd54a..f810ed76 100644 --- a/radis/pgsearch/tasks.py +++ b/radis/pgsearch/tasks.py @@ -15,17 +15,11 @@ @app.task def bulk_index_reports(report_ids: list[int]) -> None: """Deferred FTS bulk-indexing for the bulk-upsert path - (when `PGSEARCH_SYNC_INDEXING=False`). 
- - Chains into `embed_reports_task` so the embedding step is enqueued - immediately after FTS rows exist, regardless of whether FTS ran sync inline - or via this deferred task. - """ + (when `PGSEARCH_SYNC_INDEXING=False`).""" if not report_ids: return logger.info("Indexing %s reports in bulk.", len(report_ids)) bulk_upsert_report_search_vectors(report_ids) - embed_reports_task.defer(report_ids=list(report_ids)) def enqueue_bulk_index_reports(report_ids: list[int]) -> int | None: @@ -50,10 +44,19 @@ async def embed_reports_task(report_ids: list[int]) -> None: Reports are sent to the embedding service in batches of `EMBEDDING_BATCH_SIZE` to bound per-call payload size regardless of how many `report_ids` the caller passed. + + Defensive about missing RSV rows: when `PGSEARCH_SYNC_INDEXING=False`, + the handler enqueues this task alongside `bulk_index_reports` and the + embeddings worker may pick this task up first. Calling + `bulk_upsert_report_search_vectors` at the top ensures RSV rows exist + with up-to-date `search_vector` before we read `report.body`. The same + safety net covers shell/admin edits that bypass the bulk path. 
""" if not report_ids: return + await database_sync_to_async(bulk_upsert_report_search_vectors)(report_ids) + @database_sync_to_async def _load_rsvs() -> list[ReportSearchVector]: return list( diff --git a/radis/pgsearch/tests/test_embed_reports_task.py b/radis/pgsearch/tests/test_embed_reports_task.py index 51cae851..b45630e3 100644 --- a/radis/pgsearch/tests/test_embed_reports_task.py +++ b/radis/pgsearch/tests/test_embed_reports_task.py @@ -1,4 +1,4 @@ -"""Tests for `embed_reports_task` and the `bulk_index_reports` → embedding chain.""" +"""Tests for `embed_reports_task`.""" import asyncio from unittest.mock import AsyncMock, MagicMock, patch @@ -6,7 +6,7 @@ import pytest from radis.pgsearch.models import ReportSearchVector -from radis.pgsearch.tasks import bulk_index_reports, embed_reports_task +from radis.pgsearch.tasks import embed_reports_task from radis.pgsearch.utils.embedding_client import EmbeddingClientError from radis.reports.factories import ReportFactory @@ -35,6 +35,9 @@ def test_empty_input_no_ops(): def test_no_matching_rsvs_no_ops(): + """Report ids that don't resolve to actual reports must not blow up; + bulk_upsert_report_search_vectors logs+skips missing rows and the task + returns without calling the embedding service.""" with patch("radis.pgsearch.tasks.AsyncEmbeddingClient") as client_cls: asyncio.run(embed_reports_task(report_ids=[999_999])) client_cls.assert_not_called() @@ -73,14 +76,23 @@ def test_embedding_error_propagates(): assert ReportSearchVector.objects.filter(embedding__isnull=True).count() == 2 -def test_bulk_index_reports_chains_into_embed_reports_task(): - """When PGSEARCH_SYNC_INDEXING=False, the deferred FTS task must enqueue - the embedding task at the end so embedding always follows FTS.""" - reports = [ReportFactory.create() for _ in range(3)] +def test_ensures_rsv_rows_exist_before_embedding(settings): + """If a report has no ReportSearchVector yet (e.g., bulk_index_reports + hasn't run, or an admin/shell edit bypassed 
the signal), the embed task + must create the row + tsvector before reading body. This is the safety + net that lets the handler enqueue embed without waiting for the + deferred FTS task to land first.""" + reports = [ReportFactory.create() for _ in range(2)] pks = [r.pk for r in reports] + ReportSearchVector.objects.filter(report_id__in=pks).delete() + assert ReportSearchVector.objects.filter(report_id__in=pks).count() == 0 - with patch("radis.pgsearch.tasks.embed_reports_task") as task: - bulk_index_reports(report_ids=pks) + vec = _unit_vec(settings.EMBEDDING_DIM) + fake = _make_fake_async_client(vec) + with patch("radis.pgsearch.tasks.AsyncEmbeddingClient", return_value=fake): + asyncio.run(embed_reports_task(report_ids=pks)) - task.defer.assert_called_once() - assert sorted(task.defer.call_args.kwargs["report_ids"]) == sorted(pks) + rsvs = ReportSearchVector.objects.filter(report_id__in=pks) + assert rsvs.count() == 2 + assert rsvs.filter(search_vector__isnull=True).count() == 0 + assert rsvs.filter(embedding__isnull=True).count() == 0 diff --git a/radis/reports/api/viewsets.py b/radis/reports/api/viewsets.py index 0101f48a..3567774f 100644 --- a/radis/reports/api/viewsets.py +++ b/radis/reports/api/viewsets.py @@ -1,7 +1,6 @@ import logging from typing import Any -from django.conf import settings from django.db import transaction from django.http import Http404 from django.utils import timezone @@ -13,9 +12,6 @@ from rest_framework.response import Response from rest_framework.serializers import BaseSerializer -from radis.pgsearch.tasks import embed_reports_task, enqueue_bulk_index_reports -from radis.pgsearch.utils.indexing import bulk_upsert_report_search_vectors - from ..models import Language, Metadata, Modality, Report from ..site import ( document_fetchers, @@ -241,12 +237,6 @@ def _dedupe_groups(items: list[Any]) -> tuple[list[int], int]: group_duplicate_count, ) - touched_report_ids = [ - report_id_by_document_id[document_id] - for document_id in 
[*created_ids, *updated_ids] - if document_id in report_id_by_document_id - ] - def on_commit(): if created_ids: created_reports = list(Report.objects.filter(document_id__in=created_ids)) @@ -256,15 +246,6 @@ def on_commit(): updated_reports = list(Report.objects.filter(document_id__in=updated_ids)) for handler in reports_updated_handlers: handler.handle(updated_reports) - if touched_report_ids: - if settings.PGSEARCH_SYNC_INDEXING: - bulk_upsert_report_search_vectors(touched_report_ids) - embed_reports_task.defer(report_ids=touched_report_ids) - else: - # bulk_index_reports chains into embed_reports_task at the - # end of its run, so embedding always follows FTS regardless - # of which mode is active. - enqueue_bulk_index_reports(touched_report_ids) transaction.on_commit(on_commit) @@ -326,7 +307,6 @@ def on_commit(): document_ids = [report.document_id for report in reports] logger.debug(f"{handler.name} - handle newly created reports: {document_ids}") handler.handle(reports) - embed_reports_task.defer(report_ids=[report.pk for report in reports]) transaction.on_commit(on_commit) @@ -434,7 +414,6 @@ def on_commit(): document_ids = [report.document_id for report in reports] logger.debug(f"{handler.name} - handle updated reports: {document_ids}") handler.handle(reports) - embed_reports_task.defer(report_ids=[report.pk for report in reports]) transaction.on_commit(on_commit) From 5acdb17916b67537eae6d64543f850d726ed3942 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Tue, 23 Jun 2026 10:31:53 +0000 Subject: [PATCH 64/68] refactor(pgsearch): sync embed task, chained ordering, bisect + stamina retry Drops the async embed_reports_task (and AsyncEmbeddingClient) since Procrastinate's --concurrency 4 gives K independent slots regardless of sync vs async, and the batch loop is sequential anyway. 
Chains embed_reports_task at the end of bulk_index_reports so the embeddings worker never reads a report before its RSV row is committed, replacing the defensive bulk_upsert at the top of the embed task. Removes the client-side EMBEDDING_MAX_INPUT_CHARS truncation; instead the client detects payload-too-large responses (413, or 400/422 with telltale strings) and raises a typed EmbeddingPayloadTooLargeError that the task bisects until it isolates the offending report and logs report_id + body_chars. Wraps the embed call in stamina-controlled retries (3 attempts, 30s budget) with a predicate that excludes the bisectable error, so transient blips don't escalate to a Procrastinate task retry. Adds a Django admin entry for ReportSearchVector with an enqueue_pending_embeddings action that mirrors the embed_pending management command. Reverts an unrelated uv 0.11.13 downgrade in Dockerfile + CI. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 2 +- Dockerfile | 2 +- .../specs/2026-05-28-hybrid-search.md | 192 +++++++++------ pyproject.toml | 1 + radis/conftest.py | 5 + radis/pgsearch/admin.py | 47 ++++ radis/pgsearch/apps.py | 14 +- radis/pgsearch/tasks.py | 161 +++++++++--- .../pgsearch/tests/test_embed_reports_task.py | 232 ++++++++++++++---- radis/pgsearch/tests/test_embedding_client.py | 119 ++++++--- radis/pgsearch/utils/embedding_client.py | 116 ++++----- radis/settings/base.py | 12 +- 12 files changed, 618 insertions(+), 285 deletions(-) create mode 100644 radis/pgsearch/admin.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9384abcf..a4ef95f5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: - name: Install uv uses: astral-sh/setup-uv@v8.1.0 with: - version: "0.11.13" + version: "0.11.16" - name: Setup Python uses: actions/setup-python@v6 with: diff --git a/Dockerfile b/Dockerfile index 5f498227..57215cc2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,7 +21,7 @@ RUN 
apt-get update \ postgresql-client-17 \ && rm -rf /var/lib/apt/lists/* -COPY --from=ghcr.io/astral-sh/uv:0.11.13 /uv /uvx /bin/ +COPY --from=ghcr.io/astral-sh/uv:0.11.16 /uv /uvx /bin/ WORKDIR /app diff --git a/docs/superpowers/specs/2026-05-28-hybrid-search.md b/docs/superpowers/specs/2026-05-28-hybrid-search.md index 2d2b25f9..449526e0 100644 --- a/docs/superpowers/specs/2026-05-28-hybrid-search.md +++ b/docs/superpowers/specs/2026-05-28-hybrid-search.md @@ -76,23 +76,26 @@ The public `SearchProvider` API (`radis.search.site`) is unchanged. Callers — │ │ │ │ ▼ (one of the registered subscribers is pgsearch:) │ │ pgsearch._handle_reports_changed(reports) │ -│ ├─ FTS step (mode-dependent): │ -│ │ PGSEARCH_SYNC_INDEXING=True → bulk_upsert_report_search_ │ -│ │ vectors(report_ids) inline │ -│ │ PGSEARCH_SYNC_INDEXING=False → enqueue_bulk_index_reports │ -│ │ (deferred to the `default` queue) │ -│ └─ embed_reports_task.defer(report_ids=...) │ +│ ├─ PGSEARCH_SYNC_INDEXING=True: │ +│ │ bulk_upsert_report_search_vectors(report_ids) inline, │ +│ │ then embed_reports_task.defer(report_ids=...) │ +│ └─ PGSEARCH_SYNC_INDEXING=False: │ +│ enqueue_bulk_index_reports(report_ids); the embed task is │ +│ chained at the tail of bulk_index_reports (see below) │ │ │ │ │ ▼ HTTP response returned (201 / 200) immediately │ │ │ +│ ──── elsewhere, on the default_worker process ──── │ +│ │ +│ bulk_index_reports(report_ids) (default queue) │ +│ ├─ bulk_upsert_report_search_vectors(report_ids) │ +│ └─ embed_reports_task.defer(report_ids=...) 
│ +│ │ │ ──── elsewhere, on the embeddings_worker process ──── │ │ │ -│ embed_reports_task(report_ids) (async task, embeddings queue) │ -│ ├─ defensive: bulk_upsert_report_search_vectors(report_ids) │ -│ │ (idempotent; ensures RSV rows + tsvector exist when this │ -│ │ task wins the race against deferred bulk_index_reports) │ -│ ├─ load RSVs (database_sync_to_async) │ -│ ├─ await AsyncEmbeddingClient.embed_documents([body, ...]) │ +│ embed_reports_task(report_ids) (embeddings queue) │ +│ ├─ load RSVs (select_related("report")) │ +│ ├─ EmbeddingClient.embed_documents([body, ...]) (batched) │ │ ├─ L2-normalize; ReportSearchVector.objects.bulk_update │ │ └─ on EmbeddingClientError: raise │ │ → Procrastinate retry policy (exp backoff, N attempts) │ @@ -101,7 +104,7 @@ The public `SearchProvider` API (`radis.search.site`) is unchanged. Callers — `radis.reports` already exposes a handler registry (`reports_created_handlers` / `reports_updated_handlers` in `radis.reports.site`) whose docstring is explicit about its purpose: *"The handler can be used to index those reports in an external search database."* Pgsearch registers `_handle_reports_changed` on both. The view layer never imports anything from `pgsearch`; it only dispatches the registry. -Both ingest paths — single-create (`POST /api/reports/`, `PUT /api/reports/{id}/?upsert=true`) and bulk-upsert (`POST /api/reports/bulk-upsert/`) — flow through the same handler, which defers an async Procrastinate task on the dedicated `embeddings` queue. The write path returns immediately after the transaction commits; the embedding service is touched only by the worker. This: +Both ingest paths — single-create (`POST /api/reports/`, `PUT /api/reports/{id}/?upsert=true`) and bulk-upsert (`POST /api/reports/bulk-upsert/`) — flow through the same handler, which schedules a Procrastinate task on the dedicated `embeddings` queue (directly in sync FTS mode; chained at the end of `bulk_index_reports` in deferred FTS mode). 
The write path returns immediately after the transaction commits; the embedding service is touched only by the worker. This: - **Decouples write-path uptime from the embedding service.** API responses succeed even when the embedding endpoint is down or slow. - **Bounds concurrent load on the embedding service** via the worker's `--concurrency K` — explicit, configurable backpressure rather than implicit request-driven concurrency. @@ -113,13 +116,14 @@ Both ingest paths — single-create (`POST /api/reports/`, `PUT /api/reports/{id | File | Purpose | |---|---| -| `utils/embedding_client.py` | `EmbeddingClient` (sync, used by the query path) + `AsyncEmbeddingClient` (async, used by `embed_reports_task` on the worker); pluggable backends (`openai`, `ollama`) | -| `apps.py` (modified) | `register_app()` now also registers `_handle_reports_changed` on both `reports_created_handlers` and `reports_updated_handlers`. The handler runs FTS (sync or deferred per `PGSEARCH_SYNC_INDEXING`) + defers `embed_reports_task` for the touched reports. This is the only place pgsearch wires itself into the reports app. | -| `tasks.py` (embedding entries) | `embed_reports_task(report_ids)` async Procrastinate task on the `embeddings` queue. Defensively calls `bulk_upsert_report_search_vectors` first to ensure RSV rows exist (covers the race with deferred FTS, plus shell/admin edits), then `AsyncEmbeddingClient.embed_documents`, then `bulk_update`. Raises on `EmbeddingClientError` so the Procrastinate retry policy applies. | +| `utils/embedding_client.py` | `EmbeddingClient` used by both the query path and `embed_reports_task` on the worker; pluggable backends (`openai`, `ollama`) | +| `apps.py` (modified) | `register_app()` now also registers `_handle_reports_changed` on both `reports_created_handlers` and `reports_updated_handlers`. 
In sync FTS mode the handler upserts inline then defers `embed_reports_task`; in deferred FTS mode it enqueues `bulk_index_reports`, which chains the embed task at the end of its own run. This is the only place pgsearch wires itself into the reports app. | +| `tasks.py` (embedding entries) | `embed_reports_task(report_ids)` Procrastinate task on the `embeddings` queue. Loads RSVs by `report_id`, calls `EmbeddingClient.embed_documents`, then `bulk_update`. Raises on `EmbeddingClientError` so the Procrastinate retry policy applies. | +| `admin.py` | Registers `ReportSearchVector` with a `has_embedding` list display, an `embedding` `IsNull` filter, and an admin action `enqueue_pending_embeddings` that defers `embed_reports_task` for the selected rows whose embedding is NULL. Mirrors the `embed_pending` management command for operators who prefer the UI. | | `migrations/0002_hybrid_search.py` | Single schema migration: `CREATE EXTENSION vector`; adds `embedding vector(N)` column + HNSW index | | `models.py` (modified) | Adds `embedding` field + `HnswIndex` to `ReportSearchVector`. No Job/Task models. | | `signals.py` (unchanged from FTS-only) | The FTS `create_or_update_report_search_vector` receiver stays; **no embedding signal** | -| `tasks.py` (FTS bits) | FTS bulk-indexing helper `bulk_upsert_report_search_vectors`. The existing `bulk_index_reports` Procrastinate task and `enqueue_bulk_index_reports` helper are retained unchanged from pre-hybrid-search — they remain pure FTS. The handler defers `embed_reports_task` independently; ordering between FTS and embedding is guaranteed by the defensive `bulk_upsert_report_search_vectors` call at the top of `embed_reports_task`, not by chaining (see §6.6). | +| `tasks.py` (FTS bits) | FTS bulk-indexing helper `bulk_upsert_report_search_vectors` and the `bulk_index_reports` Procrastinate task. 
`bulk_index_reports` upserts the RSV rows and then chains `embed_reports_task.defer(...)` at the end of its run, so the embeddings worker only ever sees report ids whose RSV rows are already committed (see §6.6). | | `providers.py` (modified) | Replaces `search()` and `retrieve()` bodies with hybrid logic | | `tests/...` | Coverage per §10 | @@ -129,7 +133,7 @@ Both ingest paths — single-create (`POST /api/reports/`, `PUT /api/reports/{id |---|---| | `pyproject.toml` | Add `pgvector>=0.3` dependency | | `radis/settings/base.py` | New env-driven + constant settings (§8) | -| `radis/settings/test.py` | Override `EMBEDDING_PROVIDER_URL=""` so any incidental construction of `EmbeddingClient` / `AsyncEmbeddingClient` fast-fails into `EmbeddingClientError` in CI (no live embedding service). Tests that exercise embedding patch the client explicitly. | +| `radis/settings/test.py` | Override `EMBEDDING_PROVIDER_URL=""` so any incidental construction of `EmbeddingClient` fast-fails into `EmbeddingClientError` in CI (no live embedding service). Tests that exercise embedding patch the client explicitly. | | `example.env` | Document `EMBEDDING_*` env vars for openai and ollama backends | | `radis/reports/api/viewsets.py` | **Unchanged from main** in shape. It already dispatches `reports_created_handlers` / `reports_updated_handlers` from `on_commit`; pgsearch hooks in via that registry. Nothing in `viewsets.py` imports from `radis.pgsearch`. | @@ -293,8 +297,7 @@ check stays correct without any code change to `apps.py`. - `class OllamaBackend(EmbeddingBackend)` — default path `/api/embed`, body `{model, input: [...]}`, response `{embeddings: [[...]]}`. - `BACKENDS: dict[str, EmbeddingBackend] = {"openai": OpenAIBackend(), "ollama": OllamaBackend()}`. - `class EmbeddingClientError(Exception)`. -- `class EmbeddingClient` — sync client used by the query path (`providers.search` / `providers.retrieve`). 
-- `class AsyncEmbeddingClient` — async sibling of `EmbeddingClient`, used by the `embed_reports_task` worker task (§6.2). Same backend protocol; differs only in using `httpx.AsyncClient` + an `async with` lifecycle. The async surface lets a single embeddings worker run K embedding HTTP calls concurrently via asyncio at low memory overhead. +- `class EmbeddingClient` — sync client used by both the query path (`providers.search` / `providers.retrieve`) and the `embed_reports_task` worker task (§6.2). A single client class keeps the configuration surface narrow; worker-side concurrency is provided by Procrastinate's `--concurrency K` flag spawning K sync task slots, not by intra-task asyncio. ### 5.2 Interface @@ -310,8 +313,10 @@ class EmbeddingClient: if settings.EMBEDDING_PROVIDER_API_KEY else {} def embed_documents(self, texts: list[str]) -> list[list[float]]: - """Embed texts verbatim. Truncates each to EMBEDDING_MAX_INPUT_CHARS first. - Returns L2-normalized vectors of length EMBEDDING_DIM.""" + """Embed texts verbatim. Returns L2-normalized vectors of length + EMBEDDING_DIM. Raises `EmbeddingPayloadTooLargeError` (subclass of + `EmbeddingClientError`) when the backend rejects the request because + one or more inputs exceed the model's context window.""" def embed_query(self, text: str) -> list[float]: """Prepend EMBEDDING_QUERY_INSTRUCTION, then embed_documents([text])[0].""" @@ -329,7 +334,7 @@ class EmbeddingClient: ### 5.4 Behavior details - **Query instruction:** the model card for Qwen3-Embedding recommends a task-specific instruction prefix on the query side only. `embed_query` prepends `EMBEDDING_QUERY_INSTRUCTION` (a Python constant in `base.py`); `embed_documents` does not. -- **Truncation:** any text longer than `EMBEDDING_MAX_INPUT_CHARS` is truncated at the character limit before being sent. A WARNING is logged with the report id (when known) and char count. 
Qwen3-Embedding-4B supports up to 32k tokens, so truncation will be rare for radiology bodies but is bounded as a defense against pathological inputs. +- **Overlength inputs:** the client does *not* truncate. The model's context window is the authoritative limit, and the backend signals overlength via HTTP 413 or 400/422 with a context-length message in the body. The client detects that via a loose substring match on common keywords (`context length`, `max tokens`, `too long`, `exceeds`, …) and raises the typed `EmbeddingPayloadTooLargeError`. The `embed_reports_task` worker catches that subclass and bisects the chunk (§6.2); the query path lets it propagate (which the search view treats the same as any other `EmbeddingClientError` — fall back to FTS-only for that request). - **Normalization:** every returned vector is L2-normalized client-side, unconditionally. With unit vectors, cosine distance is monotonic in dot product, which makes the HNSW `vector_cosine_ops` operator effectively a fast inner-product search. Whether the upstream server normalizes is irrelevant. - **Dimension validation:** every vector is checked to have length `EMBEDDING_DIM`. A mismatch raises `EmbeddingClientError`. - **Batching:** `embed_documents` sends a single HTTP call per invocation. The write path enqueues an `embed_reports_task` per ingest event (one task per single-create, one task per bulk-upsert); each task in turn issues one batched embedding HTTP call covering all the report bodies it owns. The `EMBEDDING_BATCH_SIZE` constant is used by `embed_pending` to chunk large drains into tasks of reasonable size. 
@@ -362,9 +367,12 @@ def _handle_reports_changed(reports): report_ids = [r.pk for r in reports] if settings.PGSEARCH_SYNC_INDEXING: bulk_upsert_report_search_vectors(report_ids) + embed_reports_task.defer(report_ids=report_ids) else: + # bulk_index_reports chains embed_reports_task at the end of its run, + # so the embeddings worker never sees a report id before its RSV row + # is committed. enqueue_bulk_index_reports(report_ids) - embed_reports_task.defer(report_ids=report_ids) register_reports_created_handler( ReportsCreatedHandler(name="PG Search", handle=_handle_reports_changed) @@ -390,58 +398,86 @@ The HTTP response returns at that point. The view handler does **not** await emb ```python @app.task(queue="embeddings") -async def embed_reports_task(report_ids: list[int]) -> None: - """Embed the named reports. Raises on EmbeddingClientError so - Procrastinate's retry policy applies. - - Reports are sent to the embedding service in batches of - `EMBEDDING_BATCH_SIZE` to bound per-call payload size and per-call - GPU-side latency regardless of how many report_ids the caller passed. - """ +def embed_reports_task(report_ids: list[int]) -> None: if not report_ids: return - # Defensive: ensure RSV rows exist with up-to-date tsvectors. Covers - # the race against deferred `bulk_index_reports` (PGSEARCH_SYNC_INDEXING - # =False) and the shell/admin path that may have bypassed the bulk - # indexer entirely. Idempotent — no-op when the row + tsvector already - # match. 
- await database_sync_to_async(bulk_upsert_report_search_vectors)(report_ids) - - @database_sync_to_async - def _load_rsvs() -> list[ReportSearchVector]: - return list( - ReportSearchVector.objects.filter(report_id__in=report_ids) - .select_related("report") - .only("id", "report_id", "report__body") - ) - - rsvs = await _load_rsvs() + rsvs = list( + ReportSearchVector.objects.filter(report_id__in=report_ids) + .select_related("report") + .only("id", "report_id", "report__body") + ) if not rsvs: logger.warning("embed_reports_task: no RSVs for report ids %s", report_ids) return batch_size = settings.EMBEDDING_BATCH_SIZE - async with AsyncEmbeddingClient() as client: + embedded: list[ReportSearchVector] = [] + skipped: list[ReportSearchVector] = [] + with EmbeddingClient() as client: for start in range(0, len(rsvs), batch_size): chunk = rsvs[start : start + batch_size] - vectors = await client.embed_documents( - [rsv.report.body for rsv in chunk] - ) - for rsv, vec in zip(chunk, vectors, strict=True): - rsv.embedding = vec + _embed_with_bisect(client, chunk, embedded, skipped) + + if embedded: + ReportSearchVector.objects.bulk_update(embedded, fields=["embedding"]) + if skipped: + logger.error("…skipped as too large; report_ids=%s", [r.report_id for r in skipped]) + - @database_sync_to_async - def _save(): - ReportSearchVector.objects.bulk_update(rsvs, fields=["embedding"]) - await _save() +def _embed_with_bisect(client, rsvs, embedded, skipped): + """Embed rsvs. On EmbeddingPayloadTooLargeError, bisect until we isolate + the single offender — then log report_id + body_chars and skip it. + Other EmbeddingClientError types propagate so Procrastinate retries.""" + if not rsvs: + return + try: + vectors = client.embed_documents([rsv.report.body for rsv in rsvs]) + except EmbeddingPayloadTooLargeError as exc: + if len(rsvs) == 1: + logger.error( + "embed_reports_task: report_id=%s body_chars=%d rejected as too " + "large; skipping. 
Backend: %s", + rsvs[0].report_id, len(rsvs[0].report.body), exc, + ) + skipped.append(rsvs[0]) + return + mid = len(rsvs) // 2 + _embed_with_bisect(client, rsvs[:mid], embedded, skipped) + _embed_with_bisect(client, rsvs[mid:], embedded, skipped) + return + for rsv, vec in zip(rsvs, vectors, strict=True): + rsv.embedding = vec + embedded.append(rsv) ``` -**Why async**: the work is dominated by HTTP wait. Procrastinate's worker is asyncio-based; an async task lets `--concurrency K` mean "K embedding HTTP calls in flight on a single event loop" without spinning OS threads. DB parts wrap `database_sync_to_async` so sync ORM doesn't block the loop. +**Sync, not async**: each task issues batches sequentially (one HTTP round-trip at a time, waiting for the response before launching the next), so asyncio inside a single task wouldn't add concurrency. Worker concurrency comes from Procrastinate's `--concurrency K` flag, which gives K independent task slots regardless of whether the task body is `def` or `async def`. A sync task keeps the call graph readable — direct ORM, direct `httpx.Client`, no `database_sync_to_async` shims. **Internal batching**: a single task accepts an arbitrarily-sized `report_ids` list (e.g., a 1000-row bulk-upsert dispatches one task) and chunks it into HTTP calls of `EMBEDDING_BATCH_SIZE` reports each. This decouples the *enqueue size* (one task per ingest event, naturally sized to the workload) from the *embedding service call size* (always bounded by `EMBEDDING_BATCH_SIZE`, regardless of input). The vLLM endpoint sees a steady stream of equally-sized batches rather than occasional spike requests. -**No internal catch**: the task lets `EmbeddingClientError` propagate. Procrastinate handles retry — see §6.4. On retry, the entire batch loop reruns (idempotent: `bulk_update` overwrites identical vectors with no change). 
+**Bisect on payload-too-large**: the client signals overlength inputs via the typed `EmbeddingPayloadTooLargeError` subclass (§5.4). The task catches it in `_embed_with_bisect` and recursively halves the failing chunk; the recursion terminates either when a sub-chunk succeeds or when a single rsv is isolated. In the isolated case the task logs ERROR with the specific `report_id` + `body_chars`, appends to `skipped`, and continues — the rest of the batch still gets embedded. At task end, ERROR-level summary lists all skipped ids so operators can find them with one log search. The skipped reports' RSVs stay NULL; re-running `embed_pending` will re-attempt and re-log them, which is the expected stop signal for the operator to fix the upstream report or raise the model's context window. Bisect cost: about `2·log2(K)` extra HTTP calls per offender in a K-report chunk, and never more than `2K − 1` calls in total even when every report in the chunk is rejected; the cost is paid only when an offender exists — the common case is one HTTP call per chunk. + +**Two layers of retry for transient errors**: the actual embed call is wrapped in `_embed_chunk_with_retry`, a [stamina](https://stamina.hynek.me/)-decorated function: + +```python +def _is_retryable_embedding_error(exc: Exception) -> bool: + return isinstance(exc, EmbeddingClientError) and not isinstance( + exc, EmbeddingPayloadTooLargeError + ) + +@stamina.retry( + on=_is_retryable_embedding_error, + attempts=3, timeout=30.0, wait_initial=0.5, wait_max=8.0, +) +def _embed_chunk_with_retry(client, texts): + return client.embed_documents(texts) +``` + +- **stamina (inline, per-call):** 3 attempts within ~30 s, exponential backoff with jitter. Handles brief blips — a single 5xx, a network jitter, a transient timeout. The predicate `_is_retryable_embedding_error` explicitly *excludes* `EmbeddingPayloadTooLargeError` so the bisect logic owns that case end-to-end without burning retry budget on a deterministic rejection. 
+- **Procrastinate (task-level, per-task):** when stamina's budget is exhausted the exception escapes the task, and Procrastinate's exponential-backoff retry kicks in for the whole batch. Handles extended outages where the embedding service is down for minutes-to-hours. On retry the entire batch loop reruns (idempotent: `bulk_update` overwrites identical vectors with no change). +- **Why two layers and not just one:** stamina inside the task absorbs the common case of "the service blipped once" without the operator-visible noise of a Procrastinate retry event, and without re-doing all the bookkeeping (`SELECT FOR UPDATE SKIP LOCKED`, lease, ack). Procrastinate above the task covers the long-tail case stamina is not budgeted for. Stamina alone would mean a single 30-s outage permanently fails the task; Procrastinate alone would mean every blip incurs a full task replay. + +For tests, the repo-wide `conftest.py` disables stamina globally via `stamina.set_active(False)`; specific tests that exercise retry behaviour opt back in with the `stamina_active` fixture. ### 6.3 The worker and the concurrency model @@ -460,8 +496,8 @@ embeddings_worker: Three explicit choices: - **Dedicated queue (`embeddings`)**: isolated from `default` (extraction / subscription) and `llm`. A backfill or write burst can't starve unrelated tasks. -- **`--concurrency 4`** (the concurrency knob): up to 4 `embed_reports_task` coroutines in flight on the worker's event loop at once. Each coroutine has at most one embedding HTTP call outstanding at a time (the task's internal batch loop is sequential), so `--concurrency K` translates directly to "up to K embedding HTTP requests in flight to the embedding service per worker process." Total system concurrency = `worker_count × --concurrency`. The default of 4 leaves capacity for the query path's `embed_query` to share the same embedding service. Tunable per deployment. 
-- **Async-native**: the worker runs a single asyncio event loop; async tasks slot in directly. One `httpx.AsyncClient` connection pool per worker process; one async Postgres pool. Low overhead compared to threaded workers. +- **`--concurrency 4`** (the concurrency knob): up to 4 `embed_reports_task` slots in flight on the worker at once. Each slot processes its batches sequentially, so `--concurrency K` translates directly to "up to K embedding HTTP requests in flight to the embedding service per worker process." Total system concurrency = `worker_count × --concurrency`. The default of 4 leaves capacity for the query path's `embed_query` to share the same embedding service. Tunable per deployment. +- **Sync task body**: the task is `def`, not `async def`. Procrastinate gives concurrency through K independent task slots regardless of sync vs async, and the embedding batch loop is sequential by design — switching to async would not add any in-task concurrency, just a `database_sync_to_async` shim layer. **Two layers of "batching"**, easy to confuse, kept separate by design: @@ -479,8 +515,9 @@ Procrastinate handles transient failures automatically; `embed_pending` (§6.5) | Failure | What happens | |---|---| -| **Transient outage** (5xx / timeout / network blip ≲ minutes) | Task raises → Procrastinate retries with exponential backoff. Most cases auto-recover; the embedding is written without operator action. | -| **Extended outage** (service down longer than retry window) | Task ends in `failed` state in `procrastinate_jobs`. RSV stays NULL. Operator runs `./manage.py embed_pending` once the service recovers to re-enqueue the affected rows. | +| **Brief blip** (single 5xx / timeout / network jitter ≲ seconds) | stamina inside the task retries the same HTTP call up to 3 times within ~30 s. Most cases recover before the task even completes its current batch loop iteration. No Procrastinate retry event. 
| +| **Transient outage** (service degraded for minutes; outlasts stamina's 30 s budget) | Stamina exhausts → exception escapes the task → Procrastinate's task-level retry kicks in with exponential backoff. Most cases auto-recover; the embedding is written without operator action. | +| **Extended outage** (service down longer than Procrastinate's retry window) | Task ends in `failed` state in `procrastinate_jobs`. RSV stays NULL. Operator runs `./manage.py embed_pending` (or the admin action) once the service recovers to re-enqueue the affected rows. | | **Wrong-dim vector returned by backend** | `EmbeddingClientError` raised → retries → all fail the same way → task ends `failed`. Operator inspects, fixes config (or the `pgsearch.E001` system check catches it at deploy time). | | **Worker offline / crashed** | Tasks pile up in `procrastinate_jobs.todo`. When a worker starts, it picks them up via `SELECT ... FOR UPDATE SKIP LOCKED`. No data loss. Write path unaffected. | | **Embedding written and report immediately deleted** | `bulk_update` updates zero rows for the deleted RSV; rest of the batch is unaffected. Benign. | @@ -514,30 +551,26 @@ Properties: - **Rate-limited.** The worker's `--concurrency K` caps concurrent embedding HTTP calls regardless of how many tasks the command enqueues. Operators cannot accidentally hammer the embedding service. - **Visible.** Enqueued tasks appear in the standard Procrastinate observability surface (admin, logs, telemetry). Failed retries surface there as well. -### 6.6 `PGSEARCH_SYNC_INDEXING` retained; ordering enforced by defensive FTS in the embed task +### 6.6 `PGSEARCH_SYNC_INDEXING` retained; ordering enforced by chaining The pre-existing `PGSEARCH_SYNC_INDEXING` switch is **retained** with the same semantics it had before hybrid search: it controls whether FTS bulk-indexing runs inline on the request thread or is deferred to a `bulk_index_reports` Procrastinate task. 
Pgsearch's `_handle_reports_changed` reads the flag and dispatches accordingly: | Mode | `PGSEARCH_SYNC_INDEXING` | FTS step | Embedding step | |---|---|---|---| -| Sync | `True` | `bulk_upsert_report_search_vectors(ids)` inline inside the handler | `embed_reports_task.defer(...)` immediately after, in the same handler call | -| Deferred (default) | `False` | `enqueue_bulk_index_reports(ids)` defers `bulk_index_reports` to the `default` queue | `embed_reports_task.defer(...)` immediately after; ordering vs the deferred FTS task is unspecified (see below) | - -`bulk_index_reports` is **unchanged from pre-hybrid-search**: it's purely an FTS task. It does *not* chain into `embed_reports_task`. - -In the deferred FTS mode both Procrastinate jobs (the bulk-index task on `default` and the embed task on `embeddings`) are inserted in the same DB transaction. The two workers pick them up independently; either can win the race. The defensive `bulk_upsert_report_search_vectors` call at the top of `embed_reports_task` covers the case where the embed task wins — it idempotently produces the RSV rows it needs before reading `report.body`, costing one extra (no-op in the common case) tsvector recompute. This is the trade I picked over chaining the two tasks together: +| Sync | `True` | `bulk_upsert_report_search_vectors(ids)` inline inside the handler | `embed_reports_task.defer(...)` immediately after, in the same handler call. RSV rows are already committed. | +| Deferred (default) | `False` | `enqueue_bulk_index_reports(ids)` defers `bulk_index_reports` to the `default` queue | `bulk_index_reports` itself defers `embed_reports_task` at the end of its run. Handler does *not* defer embed directly. | -- **No coupling between tasks.** `bulk_index_reports` stays pure FTS; `embed_reports_task` stays self-sufficient. Either can be reused, tested, or replaced in isolation. 
-- **Same safety net protects shell/admin edits.** A Python-shell `report.body = x; report.save()` fires the FTS signal but no handler. If an operator manually `embed_reports_task.defer([pk])` after such an edit, the defensive call still ensures the RSV is current. With chaining the safety net only existed for the bulk path. -- **Cheap idempotent cost.** `bulk_upsert_report_search_vectors([pk])` for an already-indexed row is one INSERT ON CONFLICT DO NOTHING + one UPDATE that rewrites `search_vector = to_tsvector(...)` to the same value. ~1 ms per chunk. +`bulk_index_reports` now ends with `embed_reports_task.defer(report_ids=...)`. The defer happens inside the same task body, after `bulk_upsert_report_search_vectors` has committed the RSV rows, so the embeddings worker can only observe a `report_ids` payload whose RSV rows already exist. This replaces the earlier "defensive idempotent re-upsert at the top of the embed task" design — the chain is the ordering guarantee. Properties: -- **No correctness race.** Embedding either finds the RSV already indexed (common case) or creates+indexes it on the fly (defensive case). It never reads a NULL `body` or skips a missing RSV. +- **No race.** The embeddings worker never picks up a report id before its RSV row is committed. The embed task can read `report.body` and write `embedding` without checking for RSV existence. +- **Simple embed task.** No `bulk_upsert_report_search_vectors` shim at the top, no idempotent re-upsert cost on the embeddings worker, no extra commit hop. - **Operator choice preserved.** Deployments that prefer sync FTS keep that option; deployments that prefer the deferred FTS task for large bulks keep that option. Hybrid search is orthogonal to the FTS-mode decision. - **Two queues, two concerns.** FTS deferral runs on the `default` queue (where `bulk_index_reports` already lived); embedding runs on the dedicated `embeddings` queue. FTS-only worker capacity does not compete with embedding capacity. 
+- **Operator-triggered re-embed.** The `embed_pending` management command and the `enqueue_pending_embeddings` admin action defer `embed_reports_task` directly. Both bypass `bulk_index_reports` but the invariant still holds: their queries are over existing `ReportSearchVector` rows with `embedding IS NULL`, so the RSV rows exist by construction. -The single-create / PUT path is unaffected by `PGSEARCH_SYNC_INDEXING`. Its FTS step is the `post_save` signal on `Report`, which is always sync inline by construction. The same handler still fires for it; the handler's FTS call in sync mode is a redundant (idempotent, ~1 ms) recompute, and in async mode adds one Procrastinate job per single create. The redundancy is the cost of the clean abstraction — the handler doesn't know whether it was triggered by a single-create or a bulk write. +The single-create / PUT path is unaffected by `PGSEARCH_SYNC_INDEXING`. Its FTS step is the `post_save` signal on `Report`, which is always sync inline by construction. The same handler still fires for it; the handler then takes the sync-mode branch's behaviour (immediate embed defer), which is correct since the RSV row was just written sync by the signal. ### 6.7 Sync DRF; no async views required @@ -777,7 +810,6 @@ These vary across dev/staging/prod and are operator-controlled. `EMBEDDING_DIM` ```python EMBEDDING_REQUEST_TIMEOUT = 30 # seconds -EMBEDDING_MAX_INPUT_CHARS = 60_000 EMBEDDING_QUERY_INSTRUCTION = ( "Instruct: Given a radiology search query, retrieve relevant radiology reports.\n" "Query: " @@ -816,7 +848,7 @@ Both files add an `embeddings_worker.command` block. Dev uses `-l debug --autore | Embedding service down during `embed_reports_task` execution | Task raises `EmbeddingClientError`; Procrastinate retries with exponential backoff. After retries exhaust, task ends `failed`; `embedding` stays NULL. **API request was never affected** (already returned at the on_commit point). 
| WARNING per retry; ERROR on final failure | | Orchestrator crashes during task creation (partial dispatch) | Job stays in `PREPARING`. Next launcher tick sees in-flight job and no-ops. Operator marks job `FAILURE` in admin to allow a fresh run | ERROR + operator action | | Sub-task fails after Procrastinate retries exhausted | Task ends as `FAILURE`. `update_job_state` rolls the job to `WARNING` (some tasks succeeded) or `FAILURE` (all failed). NULL rows remain; next launcher creates a new job to retry them | ERROR | -| Report body > `EMBEDDING_MAX_INPUT_CHARS` | Truncate, embed truncated text | WARNING with report_id and char count | +| Report body exceeds embedding model's context window (backend returns 413, or 400/422 with a context-length message) | Client raises `EmbeddingPayloadTooLargeError`. Task bisects the chunk and retries; once the offender is isolated to one report, it is skipped and its RSV stays NULL. The rest of the chunk still gets embedded. | ERROR per offender (report_id + body_chars) and ERROR summary listing all skipped ids | | Report deleted between task creation and execution | Sub-task's `task.reports.values_list(...)` returns fewer rows; `embed_documents` called on smaller list; no error | DEBUG | | Vector dim mismatch on write | Postgres raises; sub-task fails, retried | ERROR — escalate to admin | | `EMBEDDING_PROVIDER_URL` empty at startup | `EmbeddingClient` construction defers to call site; calls log + raise; query falls back to FTS-only | WARNING once on first request | @@ -826,7 +858,7 @@ Both files add an `embeddings_worker.command` block. Dev uses `-l debug --autore - The product never fails a search request because the embedding service is down. It degrades to FTS-only. - Query embeddings are not cached. The complexity and freshness trade-off is not worth it at the corpora sizes RADIS targets. -- `EmbeddingClient` does not retry internally. Procrastinate retries the whole task; the query path uses a single shot. 
+- `EmbeddingClient` does not retry internally. The worker path layers `stamina.retry` over the client call inside `_embed_chunk_with_retry` (3 attempts / 30 s budget) and lets Procrastinate's task-level retry handle anything stamina can't absorb. The query path uses a single shot and falls back to FTS-only on any `EmbeddingClientError`. **Observability:** @@ -842,7 +874,7 @@ Both files add an `embeddings_worker.command` block. Dev uses `-l debug --autore |---|---| | `tests/unit/test_embedding_client.py` | Backend payload/response round-trip, path override, instruction prefix, normalization, dim validation, all error modes, truncation | | `tests/unit/test_provider_fusion.py` | `_rrf_fuse(vec_rank, fts_rank, k)` pure-Python helper: disjoint, overlapping, FTS-only, vector-only, both-empty, tiebreak by report_id | -| `tests/unit/test_embed_reports_task.py` | Loads RSVs by report_id, calls `AsyncEmbeddingClient.embed_documents`, bulk-updates vectors. Asserts that `EmbeddingClientError` propagates so Procrastinate's retry policy applies (the task does not swallow). | +| `tests/unit/test_embed_reports_task.py` | Loads RSVs by report_id, calls `EmbeddingClient.embed_documents`, bulk-updates vectors. Asserts internal batching by `EMBEDDING_BATCH_SIZE`, that `EmbeddingClientError` propagates so Procrastinate's retry policy applies (the task does not swallow), and that `bulk_index_reports` chains `embed_reports_task.defer(...)` at the end of its run so the embeddings worker only sees report ids whose RSV rows are committed. 
| ### 10.2 Integration tests (real Postgres + pgvector) diff --git a/pyproject.toml b/pyproject.toml index 0707351a..ae4f0db3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "pandas>=2.2.3", "pgvector>=0.3", "procrastinate[django]>=3.0.2", + "stamina>=24.3.0", "psycopg[binary]>=3.2.5", "pycountry>=24.6.1", "pyparsing>=3.2.1", diff --git a/radis/conftest.py b/radis/conftest.py index 600eece8..fa40db9b 100644 --- a/radis/conftest.py +++ b/radis/conftest.py @@ -1,4 +1,5 @@ import nest_asyncio +import stamina pytest_plugins = ["adit_radis_shared.pytest_fixtures"] @@ -11,3 +12,7 @@ def pytest_configure(): # https://github.com/pytest-dev/pytest-asyncio/issues/543 # https://github.com/microsoft/playwright-pytest/issues/167 nest_asyncio.apply() + + # Disable stamina retries for tests by default; transient-blip retry + # behaviour is exercised explicitly where needed via `stamina.set_active`. + stamina.set_active(False) diff --git a/radis/pgsearch/admin.py b/radis/pgsearch/admin.py new file mode 100644 index 00000000..8ab9f09c --- /dev/null +++ b/radis/pgsearch/admin.py @@ -0,0 +1,47 @@ +from django.conf import settings +from django.contrib import admin, messages +from django.db.models.query import QuerySet +from django.http.request import HttpRequest + +from .models import ReportSearchVector +from .tasks import embed_reports_task + + +@admin.register(ReportSearchVector) +class ReportSearchVectorAdmin(admin.ModelAdmin): + list_display = ("id", "report_id", "has_embedding") + list_filter = ("embedding",) + search_fields = ("report__document_id",) + actions = ("enqueue_pending_embeddings",) + + @admin.display(boolean=True, description="Embedded") + def has_embedding(self, obj: ReportSearchVector) -> bool: + return obj.embedding is not None + + @admin.action(description="Enqueue embedding for selected rows (NULL only)") + def enqueue_pending_embeddings( + self, request: HttpRequest, queryset: QuerySet[ReportSearchVector] + ) -> None: + 
report_ids = list( + queryset.filter(embedding__isnull=True) + .order_by("report_id") + .values_list("report_id", flat=True) + ) + if not report_ids: + self.message_user( + request, + "No selected rows are missing an embedding.", + level=messages.WARNING, + ) + return + + batch_size = settings.EMBEDDING_BATCH_SIZE + for i in range(0, len(report_ids), batch_size): + chunk = report_ids[i : i + batch_size] + embed_reports_task.defer(report_ids=list(chunk)) + + self.message_user( + request, + f"Enqueued {len(report_ids)} report(s) for embedding.", + level=messages.SUCCESS, + ) diff --git a/radis/pgsearch/apps.py b/radis/pgsearch/apps.py index de056f01..69bc248b 100644 --- a/radis/pgsearch/apps.py +++ b/radis/pgsearch/apps.py @@ -77,10 +77,14 @@ def _handle_reports_changed(reports): Owns both FTS indexing and embedding for the touched reports. The mode flag `PGSEARCH_SYNC_INDEXING` controls whether FTS runs inline on the request thread or is deferred to a Procrastinate task on the `default` - queue. Embedding is always deferred to the `embeddings` queue; the - embed task is itself defensive about RSV rows being absent (see - `embed_reports_task`), so it doesn't need to wait for the deferred FTS - task to finish. + queue. Embedding is always deferred to the `embeddings` queue. + + Ordering between FTS and embedding is the same in both modes: RSV rows + exist (and `report.body` is reachable) before `embed_reports_task` runs. + In sync mode the handler upserts inline, then defers embed. In async + mode the handler only enqueues `bulk_index_reports`; that task chains + `embed_reports_task` at the end of its own run, so the embeddings worker + never picks up a report before its RSV row is committed. 
""" if not reports: return @@ -91,9 +95,9 @@ def _handle_reports_changed(reports): report_ids = [report.pk for report in reports] if settings.PGSEARCH_SYNC_INDEXING: bulk_upsert_report_search_vectors(report_ids) + embed_reports_task.defer(report_ids=report_ids) else: enqueue_bulk_index_reports(report_ids) - embed_reports_task.defer(report_ids=report_ids) def register_app(): diff --git a/radis/pgsearch/tasks.py b/radis/pgsearch/tasks.py index f810ed76..3ac66b18 100644 --- a/radis/pgsearch/tasks.py +++ b/radis/pgsearch/tasks.py @@ -1,25 +1,66 @@ import logging -from channels.db import database_sync_to_async +import stamina from django.conf import settings from procrastinate.contrib.django import app from procrastinate.types import JSONValue from .models import ReportSearchVector -from .utils.embedding_client import AsyncEmbeddingClient +from .utils.embedding_client import ( + EmbeddingClient, + EmbeddingClientError, + EmbeddingPayloadTooLargeError, +) from .utils.indexing import bulk_upsert_report_search_vectors logger = logging.getLogger(__name__) +def _is_retryable_embedding_error(exc: Exception) -> bool: + """stamina retry predicate. Retry transient embedding-service failures + (5xx, network, timeouts — all surfaced as `EmbeddingClientError`) but + NOT `EmbeddingPayloadTooLargeError`, which is a deterministic rejection + of an input that exceeds the model's context window. Retrying that + one would just hit the same wall — the bisect logic in + `_embed_with_bisect` handles it instead.""" + return isinstance(exc, EmbeddingClientError) and not isinstance( + exc, EmbeddingPayloadTooLargeError + ) + + +@stamina.retry( + on=_is_retryable_embedding_error, + attempts=3, + timeout=30.0, + wait_initial=0.5, + wait_max=8.0, +) +def _embed_chunk_with_retry( + client: EmbeddingClient, texts: list[str] +) -> list[list[float]]: + """Single embed call wrapped in stamina-controlled transient retries. 
+ + Layered with Procrastinate's task-level retry: stamina handles brief + blips (3 attempts within ~30s); Procrastinate handles extended outages + (whole-task retry on backoff). `EmbeddingPayloadTooLargeError` is + excluded by the predicate so the bisect logic above this layer can + catch and resolve it without burning retry budget.""" + return client.embed_documents(texts) + + @app.task def bulk_index_reports(report_ids: list[int]) -> None: """Deferred FTS bulk-indexing for the bulk-upsert path - (when `PGSEARCH_SYNC_INDEXING=False`).""" + (when `PGSEARCH_SYNC_INDEXING=False`). + + Chains into `embed_reports_task` once RSV rows exist, so the embeddings + worker never reads a missing `report.body` or a stale tsvector. + """ if not report_ids: return logger.info("Indexing %s reports in bulk.", len(report_ids)) bulk_upsert_report_search_vectors(report_ids) + embed_reports_task.defer(report_ids=report_ids) def enqueue_bulk_index_reports(report_ids: list[int]) -> int | None: @@ -36,36 +77,79 @@ def enqueue_bulk_index_reports(report_ids: list[int]) -> int | None: ).defer(report_ids=payload) +def _embed_with_bisect( + client: EmbeddingClient, + rsvs: list[ReportSearchVector], + embedded: list[ReportSearchVector], + skipped: list[ReportSearchVector], +) -> None: + """Embed `rsvs` and append `(rsv, vec)` pairs to `embedded`. When the + backend rejects the request as too large, bisect and recurse. Once the + offender is isolated to a single rsv, log its `report_id` + body length + and append it to `skipped` instead of raising — that way the rest of + the task's batch still gets embedded. + + Transient errors are absorbed by `_embed_chunk_with_retry`'s stamina + wrapper. Anything that escapes after stamina's attempts/timeout budget + is exhausted propagates so Procrastinate's task-level retry applies. 
+ """ + if not rsvs: + return + try: + vectors = _embed_chunk_with_retry(client, [rsv.report.body for rsv in rsvs]) + except EmbeddingPayloadTooLargeError as exc: + if len(rsvs) == 1: + offender = rsvs[0] + logger.error( + "embed_reports_task: report_id=%s body_chars=%d rejected by embedding " + "service as too large; skipping. Backend error: %s", + offender.report_id, + len(offender.report.body), + exc, + ) + skipped.append(offender) + return + mid = len(rsvs) // 2 + _embed_with_bisect(client, rsvs[:mid], embedded, skipped) + _embed_with_bisect(client, rsvs[mid:], embedded, skipped) + return + + for rsv, vec in zip(rsvs, vectors, strict=True): + rsv.embedding = vec + embedded.append(rsv) + + @app.task(queue="embeddings") -async def embed_reports_task(report_ids: list[int]) -> None: +def embed_reports_task(report_ids: list[int]) -> None: """Embed the named reports. - Raises on `EmbeddingClientError` so Procrastinate's retry policy applies. - Reports are sent to the embedding service in batches of - `EMBEDDING_BATCH_SIZE` to bound per-call payload size regardless of how - many `report_ids` the caller passed. - - Defensive about missing RSV rows: when `PGSEARCH_SYNC_INDEXING=False`, - the handler enqueues this task alongside `bulk_index_reports` and the - embeddings worker may pick this task up first. Calling - `bulk_upsert_report_search_vectors` at the top ensures RSV rows exist - with up-to-date `search_vector` before we read `report.body`. The same - safety net covers shell/admin edits that bypass the bulk path. + Two layers of failure handling sit between the embedding service and + this task: + + * `_embed_chunk_with_retry` retries transient `EmbeddingClientError` + via stamina (3 attempts, ~30s budget) for brief blips. + * `_embed_with_bisect` catches the deterministic + `EmbeddingPayloadTooLargeError` and recurses until it isolates the + offending report, then logs ERROR with `report_id` + body length and + skips it (its RSV stays NULL). 
The rest of the batch still embeds. + + Anything that escapes both — sustained `EmbeddingClientError` past + stamina's budget — propagates so Procrastinate's task-level retry + policy applies. + + Callers must ensure ReportSearchVector rows exist before deferring this + task. `bulk_index_reports` chains the defer at the end of its run, and + `embed_pending` / the admin action filter on existing RSV rows by + construction. """ if not report_ids: return - await database_sync_to_async(bulk_upsert_report_search_vectors)(report_ids) - - @database_sync_to_async - def _load_rsvs() -> list[ReportSearchVector]: - return list( - ReportSearchVector.objects.filter(report_id__in=report_ids) - .select_related("report") - .only("id", "report_id", "report__body") - ) - - rsvs = await _load_rsvs() + rsvs = list( + ReportSearchVector.objects.filter(report_id__in=report_ids) + .select_related("report") + .only("id", "report_id", "report__body") + ) if not rsvs: logger.warning( "embed_reports_task: no ReportSearchVector rows for report ids %s", @@ -74,15 +158,20 @@ def _load_rsvs() -> list[ReportSearchVector]: return batch_size = settings.EMBEDDING_BATCH_SIZE - async with AsyncEmbeddingClient() as client: + embedded: list[ReportSearchVector] = [] + skipped: list[ReportSearchVector] = [] + with EmbeddingClient() as client: for start in range(0, len(rsvs), batch_size): chunk = rsvs[start : start + batch_size] - vectors = await client.embed_documents([rsv.report.body for rsv in chunk]) - for rsv, vec in zip(chunk, vectors, strict=True): - rsv.embedding = vec - - @database_sync_to_async - def _save() -> None: - ReportSearchVector.objects.bulk_update(rsvs, fields=["embedding"]) - - await _save() + _embed_with_bisect(client, chunk, embedded, skipped) + + if embedded: + ReportSearchVector.objects.bulk_update(embedded, fields=["embedding"]) + if skipped: + logger.error( + "embed_reports_task: %d report(s) skipped as too large for the embedding " + "model; report_ids=%s. 
Fix the upstream report or raise the model context " + "limit; their RSV rows stay NULL until embedded.", + len(skipped), + [rsv.report_id for rsv in skipped], + ) diff --git a/radis/pgsearch/tests/test_embed_reports_task.py b/radis/pgsearch/tests/test_embed_reports_task.py index b45630e3..0a6c7e5d 100644 --- a/radis/pgsearch/tests/test_embed_reports_task.py +++ b/radis/pgsearch/tests/test_embed_reports_task.py @@ -1,15 +1,29 @@ -"""Tests for `embed_reports_task`.""" -import asyncio -from unittest.mock import AsyncMock, MagicMock, patch +"""Tests for `embed_reports_task` and its chaining from `bulk_index_reports`.""" +import logging +from unittest.mock import MagicMock, patch import numpy as np import pytest +import stamina from radis.pgsearch.models import ReportSearchVector -from radis.pgsearch.tasks import embed_reports_task -from radis.pgsearch.utils.embedding_client import EmbeddingClientError +from radis.pgsearch.tasks import bulk_index_reports, embed_reports_task +from radis.pgsearch.utils.embedding_client import ( + EmbeddingClientError, + EmbeddingPayloadTooLargeError, +) from radis.reports.factories import ReportFactory + +@pytest.fixture +def stamina_active(): + """Enable stamina retries for the duration of one test. 
The repo-wide + conftest disables them so the rest of the suite isn't slowed by retry + backoffs.""" + stamina.set_active(True) + yield + stamina.set_active(False) + pytestmark = pytest.mark.django_db(transaction=True) @@ -18,28 +32,27 @@ def _unit_vec(dim: int) -> list[float]: return (v / np.linalg.norm(v)).tolist() -def _make_fake_async_client(vec: list[float]) -> MagicMock: - """Build a MagicMock that mimics `async with AsyncEmbeddingClient() as c` - and supports `await c.embed_documents([...])`.""" +def _make_fake_client(vec: list[float]) -> MagicMock: + """MagicMock that mimics `with EmbeddingClient() as c` and + `c.embed_documents([...])`.""" instance = MagicMock() - instance.__aenter__ = AsyncMock(return_value=instance) - instance.__aexit__ = AsyncMock(return_value=None) - instance.embed_documents = AsyncMock(side_effect=lambda texts: [vec] * len(texts)) + instance.__enter__ = MagicMock(return_value=instance) + instance.__exit__ = MagicMock(return_value=None) + instance.embed_documents = MagicMock(side_effect=lambda texts: [vec] * len(texts)) return instance def test_empty_input_no_ops(): - with patch("radis.pgsearch.tasks.AsyncEmbeddingClient") as client_cls: - asyncio.run(embed_reports_task(report_ids=[])) + with patch("radis.pgsearch.tasks.EmbeddingClient") as client_cls: + embed_reports_task(report_ids=[]) client_cls.assert_not_called() def test_no_matching_rsvs_no_ops(): - """Report ids that don't resolve to actual reports must not blow up; - bulk_upsert_report_search_vectors logs+skips missing rows and the task - returns without calling the embedding service.""" - with patch("radis.pgsearch.tasks.AsyncEmbeddingClient") as client_cls: - asyncio.run(embed_reports_task(report_ids=[999_999])) + """Report ids that don't resolve to RSV rows are a no-op — the task does + not contact the embedding service.""" + with patch("radis.pgsearch.tasks.EmbeddingClient") as client_cls: + embed_reports_task(report_ids=[999_999]) client_cls.assert_not_called() @@ -48,14 
+61,14 @@ def test_embeds_in_internal_batches(settings): reports = [ReportFactory.create() for _ in range(5)] pks = [r.pk for r in reports] vec = _unit_vec(settings.EMBEDDING_DIM) - fake = _make_fake_async_client(vec) + fake = _make_fake_client(vec) - with patch("radis.pgsearch.tasks.AsyncEmbeddingClient", return_value=fake): - asyncio.run(embed_reports_task(report_ids=pks)) + with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake): + embed_reports_task(report_ids=pks) # 5 reports with batch_size=2 → 3 embed_documents calls of sizes 2, 2, 1. - assert fake.embed_documents.await_count == 3 - sizes = [len(call.args[0]) for call in fake.embed_documents.await_args_list] + assert fake.embed_documents.call_count == 3 + sizes = [len(call.args[0]) for call in fake.embed_documents.call_args_list] assert sorted(sizes) == [1, 2, 2] assert ReportSearchVector.objects.filter(embedding__isnull=True).count() == 0 @@ -65,34 +78,165 @@ def test_embedding_error_propagates(): reports = [ReportFactory.create() for _ in range(2)] pks = [r.pk for r in reports] fake = MagicMock() - fake.__aenter__ = AsyncMock(return_value=fake) - fake.__aexit__ = AsyncMock(return_value=None) - fake.embed_documents = AsyncMock(side_effect=EmbeddingClientError("service down")) + fake.__enter__ = MagicMock(return_value=fake) + fake.__exit__ = MagicMock(return_value=None) + fake.embed_documents = MagicMock(side_effect=EmbeddingClientError("service down")) - with patch("radis.pgsearch.tasks.AsyncEmbeddingClient", return_value=fake): + with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake): with pytest.raises(EmbeddingClientError): - asyncio.run(embed_reports_task(report_ids=pks)) + embed_reports_task(report_ids=pks) assert ReportSearchVector.objects.filter(embedding__isnull=True).count() == 2 -def test_ensures_rsv_rows_exist_before_embedding(settings): - """If a report has no ReportSearchVector yet (e.g., bulk_index_reports - hasn't run, or an admin/shell edit bypassed the signal), 
the embed task - must create the row + tsvector before reading body. This is the safety - net that lets the handler enqueue embed without waiting for the - deferred FTS task to land first.""" - reports = [ReportFactory.create() for _ in range(2)] +def test_bulk_index_reports_chains_into_embed_reports_task(): + """`bulk_index_reports` upserts RSVs and then defers `embed_reports_task`. + The chain is the ordering guarantee: the embeddings worker only ever sees + report ids whose RSV rows are already committed.""" + reports = [ReportFactory.create() for _ in range(3)] pks = [r.pk for r in reports] ReportSearchVector.objects.filter(report_id__in=pks).delete() - assert ReportSearchVector.objects.filter(report_id__in=pks).count() == 0 + with patch("radis.pgsearch.tasks.embed_reports_task.defer") as defer: + bulk_index_reports(report_ids=pks) + + # RSVs were upserted, then the embed task was deferred with the same ids. + assert ReportSearchVector.objects.filter(report_id__in=pks).count() == 3 + defer.assert_called_once_with(report_ids=pks) + + +def test_bisects_on_too_large_and_isolates_offender(settings, caplog, monkeypatch): + """When the backend rejects a batch as too large, the task bisects until + it isolates the single offending report, logs ERROR with its id + body + length, skips it, and still embeds the rest of the batch.""" + settings.EMBEDDING_BATCH_SIZE = 4 + reports = [ReportFactory.create() for _ in range(4)] + pks = [r.pk for r in reports] + offender_pk = pks[2] # the third report is the one we mark too large + + vec = _unit_vec(settings.EMBEDDING_DIM) + + def fake_embed(texts): + # Simulate the backend rejecting any payload that contains the + # offending report's body. The body is fetched by report_id. 
+ offender_body = ReportSearchVector.objects.select_related("report").get( + report_id=offender_pk + ).report.body + if offender_body in texts: + raise EmbeddingPayloadTooLargeError("over context window") + return [vec] * len(texts) + + fake = MagicMock() + fake.__enter__ = MagicMock(return_value=fake) + fake.__exit__ = MagicMock(return_value=None) + fake.embed_documents = MagicMock(side_effect=fake_embed) + + # The project's `radis` logger has `propagate=False` in settings, so + # caplog's root handler doesn't see records emitted under it. Attach + # caplog's handler directly to the task logger for the duration of + # this test. + task_logger = logging.getLogger("radis.pgsearch.tasks") + task_logger.addHandler(caplog.handler) + caplog.set_level(logging.ERROR, logger="radis.pgsearch.tasks") + try: + with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake): + embed_reports_task(report_ids=pks) + finally: + task_logger.removeHandler(caplog.handler) + + # The three good reports got embeddings; the offender stayed NULL. + rsvs_by_pk = { + rsv.report_id: rsv + for rsv in ReportSearchVector.objects.filter(report_id__in=pks) + } + assert rsvs_by_pk[offender_pk].embedding is None + for pk in pks: + if pk == offender_pk: + continue + assert rsvs_by_pk[pk].embedding is not None + + # The bisect logged the specific offender's id + body length, and the + # task-level summary listed it among skipped ids. + error_msgs = [r.getMessage() for r in caplog.records if r.levelname == "ERROR"] + assert any( + f"report_id={offender_pk}" in msg and "body_chars=" in msg + for msg in error_msgs + ) + assert any( + "skipped as too large" in msg and str(offender_pk) in msg + for msg in error_msgs + ) + + +def test_non_too_large_error_propagates_without_bisecting(): + """A generic EmbeddingClientError (5xx, network, etc.) must NOT bisect — + Procrastinate's retry policy should handle it, retrying the whole batch. 
+ (Stamina retries are disabled in the conftest, so this is a single call.)""" + reports = [ReportFactory.create() for _ in range(4)] + pks = [r.pk for r in reports] + fake = MagicMock() + fake.__enter__ = MagicMock(return_value=fake) + fake.__exit__ = MagicMock(return_value=None) + fake.embed_documents = MagicMock(side_effect=EmbeddingClientError("service down")) + + with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake): + with pytest.raises(EmbeddingClientError): + embed_reports_task(report_ids=pks) + + # Only one call should have been made — no bisect on non-too-large errors. + assert fake.embed_documents.call_count == 1 + assert ReportSearchVector.objects.filter(embedding__isnull=True).count() == 4 + + +def test_stamina_retries_transient_then_succeeds(settings, stamina_active): + """stamina retries transient EmbeddingClientError: an embed call that + fails the first two attempts and succeeds on the third returns vectors + without the bisect logic ever firing, and without escalating to + Procrastinate's task-level retry.""" + settings.EMBEDDING_BATCH_SIZE = 4 + reports = [ReportFactory.create() for _ in range(3)] + pks = [r.pk for r in reports] vec = _unit_vec(settings.EMBEDDING_DIM) - fake = _make_fake_async_client(vec) - with patch("radis.pgsearch.tasks.AsyncEmbeddingClient", return_value=fake): - asyncio.run(embed_reports_task(report_ids=pks)) - - rsvs = ReportSearchVector.objects.filter(report_id__in=pks) - assert rsvs.count() == 2 - assert rsvs.filter(search_vector__isnull=True).count() == 0 - assert rsvs.filter(embedding__isnull=True).count() == 0 + + fake = MagicMock() + fake.__enter__ = MagicMock(return_value=fake) + fake.__exit__ = MagicMock(return_value=None) + fake.embed_documents = MagicMock( + side_effect=[ + EmbeddingClientError("blip 1"), + EmbeddingClientError("blip 2"), + [vec, vec, vec], + ] + ) + + with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake): + embed_reports_task(report_ids=pks) + + # The mock was 
called 3 times: two retries + one success. + assert fake.embed_documents.call_count == 3 + # All three reports got embeddings; none stayed NULL. + assert ReportSearchVector.objects.filter(embedding__isnull=True).count() == 0 + + +def test_stamina_does_not_retry_payload_too_large(settings, stamina_active): + """EmbeddingPayloadTooLargeError must skip the stamina retry layer and + go straight to the bisect logic. With one offender in a single-row + chunk, the embed_documents mock should be called exactly once (no + retries), and the offender is logged + skipped.""" + settings.EMBEDDING_BATCH_SIZE = 1 + reports = [ReportFactory.create() for _ in range(1)] + pks = [r.pk for r in reports] + + fake = MagicMock() + fake.__enter__ = MagicMock(return_value=fake) + fake.__exit__ = MagicMock(return_value=None) + fake.embed_documents = MagicMock( + side_effect=EmbeddingPayloadTooLargeError("over context") + ) + + with patch("radis.pgsearch.tasks.EmbeddingClient", return_value=fake): + embed_reports_task(report_ids=pks) + + # Single call — no stamina retry for payload-too-large. 
+ assert fake.embed_documents.call_count == 1 + assert ReportSearchVector.objects.filter(embedding__isnull=True).count() == 1 diff --git a/radis/pgsearch/tests/test_embedding_client.py b/radis/pgsearch/tests/test_embedding_client.py index 82c231f6..6a7c3226 100644 --- a/radis/pgsearch/tests/test_embedding_client.py +++ b/radis/pgsearch/tests/test_embedding_client.py @@ -72,7 +72,6 @@ def test_backends_registry_keys(): EMBEDDING_MODEL_NAME="qwen3", EMBEDDING_DIM=4, EMBEDDING_REQUEST_TIMEOUT=10, - EMBEDDING_MAX_INPUT_CHARS=100, EMBEDDING_QUERY_INSTRUCTION="INST: ", ) def test_embed_documents_posts_payload_and_normalizes(monkeypatch): @@ -111,7 +110,6 @@ def handler(request: httpx.Request) -> httpx.Response: EMBEDDING_MODEL_NAME="qwen3", EMBEDDING_DIM=2, EMBEDDING_REQUEST_TIMEOUT=10, - EMBEDDING_MAX_INPUT_CHARS=100, EMBEDDING_QUERY_INSTRUCTION="", ) def test_provider_path_override(monkeypatch): @@ -138,7 +136,6 @@ def handler(request: httpx.Request) -> httpx.Response: EMBEDDING_MODEL_NAME="qwen3", EMBEDDING_DIM=2, EMBEDDING_REQUEST_TIMEOUT=10, - EMBEDDING_MAX_INPUT_CHARS=100, EMBEDDING_QUERY_INSTRUCTION="INST: ", ) def test_embed_query_prepends_instruction(monkeypatch): @@ -165,34 +162,6 @@ def handler(request: httpx.Request) -> httpx.Response: EMBEDDING_MODEL_NAME="qwen3", EMBEDDING_DIM=2, EMBEDDING_REQUEST_TIMEOUT=10, - EMBEDDING_MAX_INPUT_CHARS=5, - EMBEDDING_QUERY_INSTRUCTION="", -) -def test_truncates_long_input(monkeypatch): - from radis.pgsearch.utils import embedding_client as ec - - seen = {} - - def handler(request: httpx.Request) -> httpx.Response: - seen["body"] = json.loads(request.content) - return httpx.Response(200, json={"data": [{"embedding": [1.0, 0.0]}]}) - - monkeypatch.setattr( - ec, "_build_http_client", lambda: httpx.Client(transport=httpx.MockTransport(handler)) - ) - ec.EmbeddingClient().embed_documents(["abcdefghij"]) - assert seen["body"]["input"] == ["abcde"] - - -@override_settings( - EMBEDDING_BACKEND="openai", - 
EMBEDDING_PROVIDER_URL="http://embed.example", - EMBEDDING_PROVIDER_PATH="", - EMBEDDING_PROVIDER_API_KEY="", - EMBEDDING_MODEL_NAME="qwen3", - EMBEDDING_DIM=2, - EMBEDDING_REQUEST_TIMEOUT=10, - EMBEDDING_MAX_INPUT_CHARS=100, EMBEDDING_QUERY_INSTRUCTION="", ) def test_dim_too_small_raises(monkeypatch): @@ -217,7 +186,6 @@ def handler(request: httpx.Request) -> httpx.Response: EMBEDDING_MODEL_NAME="qwen3", EMBEDDING_DIM=2, EMBEDDING_REQUEST_TIMEOUT=10, - EMBEDDING_MAX_INPUT_CHARS=100, EMBEDDING_QUERY_INSTRUCTION="", ) def test_oversized_embedding_truncates_and_renormalizes(monkeypatch): @@ -243,7 +211,6 @@ def handler(request: httpx.Request) -> httpx.Response: EMBEDDING_MODEL_NAME="qwen3", EMBEDDING_DIM=2, EMBEDDING_REQUEST_TIMEOUT=10, - EMBEDDING_MAX_INPUT_CHARS=100, EMBEDDING_QUERY_INSTRUCTION="", ) def test_5xx_raises(monkeypatch): @@ -267,7 +234,6 @@ def handler(request: httpx.Request) -> httpx.Response: EMBEDDING_MODEL_NAME="qwen3", EMBEDDING_DIM=2, EMBEDDING_REQUEST_TIMEOUT=10, - EMBEDDING_MAX_INPUT_CHARS=100, EMBEDDING_QUERY_INSTRUCTION="", ) def test_close_releases_http_client(monkeypatch): @@ -296,7 +262,6 @@ def close(self): EMBEDDING_MODEL_NAME="qwen3", EMBEDDING_DIM=2, EMBEDDING_REQUEST_TIMEOUT=10, - EMBEDDING_MAX_INPUT_CHARS=100, EMBEDDING_QUERY_INSTRUCTION="", ) def test_context_manager_closes_http_client(monkeypatch): @@ -325,7 +290,6 @@ def close(self): EMBEDDING_MODEL_NAME="qwen3", EMBEDDING_DIM=2, EMBEDDING_REQUEST_TIMEOUT=10, - EMBEDDING_MAX_INPUT_CHARS=100, EMBEDDING_QUERY_INSTRUCTION="", ) def test_provider_path_without_leading_slash_raises(): @@ -343,7 +307,6 @@ def test_provider_path_without_leading_slash_raises(): EMBEDDING_MODEL_NAME="qwen3", EMBEDDING_DIM=2, EMBEDDING_REQUEST_TIMEOUT=10, - EMBEDDING_MAX_INPUT_CHARS=100, EMBEDDING_QUERY_INSTRUCTION="", ) def test_response_count_mismatch_raises(monkeypatch): @@ -358,3 +321,85 @@ def handler(request: httpx.Request) -> httpx.Response: ) with pytest.raises(ec.EmbeddingClientError, match="count 
mismatch"): ec.EmbeddingClient().embed_documents(["a", "b"]) + + +@pytest.mark.parametrize( + "status, body", + [ + (413, "Payload too large"), + (400, "This model's maximum context length is 8192 tokens, however your " + "messages resulted in 9143 tokens"), + (400, '{"error": {"code": "context_length_exceeded"}}'), + (422, "input exceeds the model context"), + (400, "request too long"), + ], +) +def test_is_payload_too_large_detects_overlength_responses(status, body): + from radis.pgsearch.utils.embedding_client import _is_payload_too_large + + assert _is_payload_too_large(httpx.Response(status, text=body)) is True + + +@pytest.mark.parametrize( + "status, body", + [ + (400, "missing required field 'model'"), + (401, "invalid api key"), + (500, "internal server error"), + (503, "service unavailable"), + ], +) +def test_is_payload_too_large_negatives(status, body): + from radis.pgsearch.utils.embedding_client import _is_payload_too_large + + assert _is_payload_too_large(httpx.Response(status, text=body)) is False + + +@override_settings( + EMBEDDING_BACKEND="openai", + EMBEDDING_PROVIDER_URL="http://embed.example", + EMBEDDING_PROVIDER_PATH="", + EMBEDDING_PROVIDER_API_KEY="", + EMBEDDING_MODEL_NAME="qwen3", + EMBEDDING_DIM=2, + EMBEDDING_REQUEST_TIMEOUT=10, + EMBEDDING_QUERY_INSTRUCTION="", +) +def test_overlength_response_raises_typed_subclass(monkeypatch): + from radis.pgsearch.utils import embedding_client as ec + + def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response( + 400, + text="This model's maximum context length is 8192 tokens.", + ) + + monkeypatch.setattr( + ec, "_build_http_client", lambda: httpx.Client(transport=httpx.MockTransport(handler)) + ) + with pytest.raises(ec.EmbeddingPayloadTooLargeError): + ec.EmbeddingClient().embed_documents(["x"]) + + +@override_settings( + EMBEDDING_BACKEND="openai", + EMBEDDING_PROVIDER_URL="http://embed.example", + EMBEDDING_PROVIDER_PATH="", + EMBEDDING_PROVIDER_API_KEY="", + 
EMBEDDING_MODEL_NAME="qwen3", + EMBEDDING_DIM=2, + EMBEDDING_REQUEST_TIMEOUT=10, + EMBEDDING_QUERY_INSTRUCTION="", +) +def test_generic_4xx_still_raises_base_error(monkeypatch): + from radis.pgsearch.utils import embedding_client as ec + + def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response(401, text="invalid api key") + + monkeypatch.setattr( + ec, "_build_http_client", lambda: httpx.Client(transport=httpx.MockTransport(handler)) + ) + with pytest.raises(ec.EmbeddingClientError) as excinfo: + ec.EmbeddingClient().embed_documents(["x"]) + assert not isinstance(excinfo.value, ec.EmbeddingPayloadTooLargeError) diff --git a/radis/pgsearch/utils/embedding_client.py b/radis/pgsearch/utils/embedding_client.py index dbccabf5..b16da16a 100644 --- a/radis/pgsearch/utils/embedding_client.py +++ b/radis/pgsearch/utils/embedding_client.py @@ -3,7 +3,7 @@ import logging import math from dataclasses import dataclass -from typing import Iterable, Protocol +from typing import Protocol import httpx from django.conf import settings @@ -13,6 +13,12 @@ class EmbeddingClientError(Exception): """Raised when the embedding service returns an error or a malformed response.""" +class EmbeddingPayloadTooLargeError(EmbeddingClientError): + """Raised when the backend rejects a request because one or more inputs + exceed the model's context window. 
Callers can bisect the batch and + retry — `embed_reports_task` does exactly that.""" + + class EmbeddingBackend(Protocol): path: str @@ -64,11 +70,6 @@ def _build_http_client() -> httpx.Client: return httpx.Client(timeout=settings.EMBEDDING_REQUEST_TIMEOUT) -def _build_async_http_client() -> httpx.AsyncClient: - """Indirection so tests can swap in a MockTransport.""" - return httpx.AsyncClient(timeout=settings.EMBEDDING_REQUEST_TIMEOUT) - - def _l2_normalize(vec: list[float]) -> list[float]: norm = math.sqrt(sum(x * x for x in vec)) if norm == 0.0: @@ -76,17 +77,33 @@ def _l2_normalize(vec: list[float]) -> list[float]: return [x / norm for x in vec] -def _truncate(texts: Iterable[str], max_chars: int) -> list[str]: - out: list[str] = [] - for t in texts: - if len(t) > max_chars: - logger.warning( - "Truncating embedding input from %d to %d chars", len(t), max_chars - ) - out.append(t[:max_chars]) - else: - out.append(t) - return out +# Substrings (case-insensitive) seen in embedding-service responses when one +# or more inputs exceed the model's context window. Kept loose because the +# exact phrasing varies across OpenAI / vLLM / Ollama and minor version bumps. 
+_TOO_LARGE_MARKERS = ( + "context length", + "context_length", + "maximum context", + "max_tokens", + "max tokens", + "max_position", + "too long", + "too large", + "too many tokens", + "exceeds", + "exceeded", +) + + +def _is_payload_too_large(response: httpx.Response) -> bool: + """Best-effort detection: is this 4xx caused by an input exceeding the + model's context window (i.e., bisecting the batch could resolve it)?""" + if response.status_code == 413: + return True + if response.status_code not in (400, 422): + return False + body_lower = response.text.lower() + return any(marker in body_lower for marker in _TOO_LARGE_MARKERS) @dataclass(frozen=True) @@ -95,7 +112,6 @@ class _ResolvedConfig: url: str model: str dim: int - max_chars: int instruction: str headers: dict[str, str] @@ -125,7 +141,6 @@ def _resolve_config() -> _ResolvedConfig: url=f"{base}{path}", model=settings.EMBEDDING_MODEL_NAME, dim=settings.EMBEDDING_DIM, - max_chars=settings.EMBEDDING_MAX_INPUT_CHARS, instruction=settings.EMBEDDING_QUERY_INSTRUCTION, headers=headers, ) @@ -163,22 +178,27 @@ def __init__(self) -> None: self._http = _build_http_client() def embed_documents(self, texts: list[str]) -> list[list[float]]: - truncated_texts = _truncate(texts, self._cfg.max_chars) - payload = self._cfg.backend.build_payload(self._cfg.model, truncated_texts) + payload = self._cfg.backend.build_payload(self._cfg.model, texts) try: response = self._http.post(self._cfg.url, json=payload, headers=self._cfg.headers) except httpx.HTTPError as e: raise EmbeddingClientError(f"HTTP error contacting {self._cfg.url}: {e}") from e if response.status_code >= 400: + snippet = response.text[:200] + if _is_payload_too_large(response): + raise EmbeddingPayloadTooLargeError( + f"Embedding service rejected payload as too large " + f"({response.status_code}): {snippet}" + ) raise EmbeddingClientError( - f"Embedding service returned {response.status_code}: {response.text[:200]}" + f"Embedding service returned 
{response.status_code}: {snippet}" ) try: body = response.json() except ValueError as e: raise EmbeddingClientError(f"Embedding response is not JSON: {e}") from e raw = self._cfg.backend.parse_response(body) - return _normalize_response(raw, len(truncated_texts), self._cfg.dim) + return _normalize_response(raw, len(texts), self._cfg.dim) def embed_query(self, text: str) -> list[float]: prefixed = f"{self._cfg.instruction}{text}" if self._cfg.instruction else text @@ -195,53 +215,3 @@ def __enter__(self) -> "EmbeddingClient": def __exit__(self, exc_type, exc_val, exc_tb) -> None: self.close() - - -class AsyncEmbeddingClient: - """Async sibling of `EmbeddingClient` for ADRF view paths. - - Same backend protocol, same config, same response handling. Differs only - in using `httpx.AsyncClient` and exposing `await`-able methods + an async - context-manager lifecycle (`async with AsyncEmbeddingClient() as c:`). - """ - - def __init__(self) -> None: - cfg = _resolve_config() - self._cfg = cfg - self._http = _build_async_http_client() - - async def embed_documents(self, texts: list[str]) -> list[list[float]]: - truncated_texts = _truncate(texts, self._cfg.max_chars) - payload = self._cfg.backend.build_payload(self._cfg.model, truncated_texts) - try: - response = await self._http.post( - self._cfg.url, json=payload, headers=self._cfg.headers - ) - except httpx.HTTPError as e: - raise EmbeddingClientError(f"HTTP error contacting {self._cfg.url}: {e}") from e - if response.status_code >= 400: - raise EmbeddingClientError( - f"Embedding service returned {response.status_code}: {response.text[:200]}" - ) - try: - body = response.json() - except ValueError as e: - raise EmbeddingClientError(f"Embedding response is not JSON: {e}") from e - raw = self._cfg.backend.parse_response(body) - return _normalize_response(raw, len(truncated_texts), self._cfg.dim) - - async def embed_query(self, text: str) -> list[float]: - prefixed = f"{self._cfg.instruction}{text}" if 
self._cfg.instruction else text - vectors = await self.embed_documents([prefixed]) - if not vectors: - raise EmbeddingClientError("Embedding service returned no vectors for query") - return vectors[0] - - async def aclose(self) -> None: - await self._http.aclose() - - async def __aenter__(self) -> "AsyncEmbeddingClient": - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: - await self.aclose() diff --git a/radis/settings/base.py b/radis/settings/base.py index db4e076c..89ff04c3 100644 --- a/radis/settings/base.py +++ b/radis/settings/base.py @@ -319,9 +319,7 @@ }, "dbbackup": { "BACKEND": "django.core.files.storage.FileSystemStorage", - "OPTIONS": { - "location": env.str("DBBACKUP_STORAGE_LOCATION", default="/tmp/backups-radis") - }, + "OPTIONS": {"location": env.str("DBBACKUP_STORAGE_LOCATION", default="/tmp/backups-radis")}, }, } DBBACKUP_CLEANUP_KEEP = 30 @@ -338,7 +336,7 @@ LLM_SERVICE_DEV_PORT = env.int("LLM_SERVICE_DEV_PORT", default=8080) LLM_SERVICE_URL = env.str("LLM_SERVICE_URL", default=f"http://localhost:{LLM_SERVICE_DEV_PORT}/v1") -# Embedding service (per-deployment, see hybrid-search spec §8.1) +# Embedding service (per-deployment) EMBEDDING_BACKEND = env.str("EMBEDDING_BACKEND", default="openai") EMBEDDING_PROVIDER_URL = env.str("EMBEDDING_PROVIDER_URL", default="") EMBEDDING_PROVIDER_PATH = env.str("EMBEDDING_PROVIDER_PATH", default="") @@ -346,12 +344,10 @@ EMBEDDING_MODEL_NAME = env.str("EMBEDDING_MODEL_NAME", default="Qwen/Qwen3-Embedding-4B") EMBEDDING_DIM = env.int("EMBEDDING_DIM", default=1024) -# Embedding tuning constants (see hybrid-search spec §8.2) +# Embedding tuning constants EMBEDDING_REQUEST_TIMEOUT = 30 -EMBEDDING_MAX_INPUT_CHARS = 60_000 EMBEDDING_QUERY_INSTRUCTION = ( - "Instruct: Given a radiology search query, retrieve relevant radiology reports.\n" - "Query: " + "Instruct: Given a radiology search query, retrieve relevant radiology reports.\nQuery: " ) EMBEDDING_BATCH_SIZE = 32 From 
c6318a8453f9f641fa70d71a2239c7c1119e4513 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Tue, 23 Jun 2026 11:06:29 +0000 Subject: [PATCH 65/68] feat(pgsearch): subjob chunking + admin pipeline-status badge Split the overloaded EMBEDDING_BATCH_SIZE into two knobs: EMBEDDING_SUBJOB_SIZE controls how many report ids each embed_reports_task carries (the Procrastinate-task granularity); EMBEDDING_BATCH_SIZE keeps its meaning as the per-HTTP-call size inside one task. A new enqueue_embed_reports helper chunks by subjob size and is the single defer site for the write-path handler, the FTS-chain tail of bulk_index_reports, embed_pending (renamed --batch-size to --subjob-size), and the admin action. A 1M-row backfill now becomes ~thousands of bounded subjobs that workers can drain in parallel, with retries scoped to one subjob rather than the whole batch. Add a changelist badge on ReportSearchVectorAdmin showing pending RSV count plus todo/doing/failed counts on the embeddings queue, backed by a ProcrastinateJob query grouped by status. Backfills are now observable end-to-end from Django admin without leaving the page for the Procrastinate admin. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../specs/2026-05-28-hybrid-search.md | 12 ++- radis/pgsearch/admin.py | 39 ++++++++-- radis/pgsearch/apps.py | 4 +- .../management/commands/embed_pending.py | 42 +++++----- radis/pgsearch/tasks.py | 35 ++++++++- .../reportsearchvector/change_list.html | 21 +++++ radis/pgsearch/tests/test_admin.py | 77 +++++++++++++++++++ .../tests/test_embed_pending_command.py | 46 +++++++---- .../pgsearch/tests/test_embed_reports_task.py | 77 +++++++++++++++++-- radis/settings/base.py | 1 + 10 files changed, 296 insertions(+), 58 deletions(-) create mode 100644 radis/pgsearch/templates/admin/pgsearch/reportsearchvector/change_list.html create mode 100644 radis/pgsearch/tests/test_admin.py diff --git a/docs/superpowers/specs/2026-05-28-hybrid-search.md b/docs/superpowers/specs/2026-05-28-hybrid-search.md index 449526e0..33d26a70 100644 --- a/docs/superpowers/specs/2026-05-28-hybrid-search.md +++ b/docs/superpowers/specs/2026-05-28-hybrid-search.md @@ -337,7 +337,7 @@ class EmbeddingClient: - **Overlength inputs:** the client does *not* truncate. The model's context window is the authoritative limit, and the backend signals overlength via HTTP 413 or 400/422 with a context-length message in the body. The client detects that via a loose substring match on common keywords (`context length`, `max tokens`, `too long`, `exceeds`, …) and raises the typed `EmbeddingPayloadTooLargeError`. The `embed_reports_task` worker catches that subclass and bisects the chunk (§6.2); the query path lets it propagate (which the search view treats the same as any other `EmbeddingClientError` — fall back to FTS-only for that request). - **Normalization:** every returned vector is L2-normalized client-side, unconditionally. With unit vectors, cosine distance is monotonic in dot product, which makes the HNSW `vector_cosine_ops` operator effectively a fast inner-product search. Whether the upstream server normalizes is irrelevant. 
- **Dimension validation:** every vector is checked to have length `EMBEDDING_DIM`. A mismatch raises `EmbeddingClientError`. -- **Batching:** `embed_documents` sends a single HTTP call per invocation. The write path enqueues an `embed_reports_task` per ingest event (one task per single-create, one task per bulk-upsert); each task in turn issues one batched embedding HTTP call covering all the report bodies it owns. The `EMBEDDING_BATCH_SIZE` constant is used by `embed_pending` to chunk large drains into tasks of reasonable size. +- **Batching:** `embed_documents` sends a single HTTP call per invocation. The write path and `embed_pending` both go through `enqueue_embed_reports(report_ids)` (defined in `tasks.py`), which chunks the input by `EMBEDDING_SUBJOB_SIZE` and defers one `embed_reports_task` per subjob. Inside each task, `EMBEDDING_BATCH_SIZE` controls the per-HTTP-call size. See §6.3 for the three-layer batching model. - **Errors:** non-2xx, timeout, malformed JSON, missing key, or wrong dim all raise `EmbeddingClientError`. The client never falls back internally — fallback policy is owned by the caller. - **Dev recipe (Ollama):** ```bash @@ -499,14 +499,17 @@ Three explicit choices: - **`--concurrency 4`** (the concurrency knob): up to 4 `embed_reports_task` slots in flight on the worker at once. Each slot processes its batches sequentially, so `--concurrency K` translates directly to "up to K embedding HTTP requests in flight to the embedding service per worker process." Total system concurrency = `worker_count × --concurrency`. The default of 4 leaves capacity for the query path's `embed_query` to share the same embedding service. Tunable per deployment. - **Sync task body**: the task is `def`, not `async def`. 
Procrastinate gives concurrency through K independent task slots regardless of sync vs async, and the embedding batch loop is sequential by design — switching to async would not add any in-task concurrency, just a `database_sync_to_async` shim layer. -**Two layers of "batching"**, easy to confuse, kept separate by design: +**Three layers of "batching"**, easy to confuse, kept separate by design: | Layer | Knob | What it controls | |---|---|---| -| Per-HTTP-call size | `EMBEDDING_BATCH_SIZE` (settings constant; default 32) | How many report bodies are sent in one `embed_documents` call inside the task. | -| Concurrent HTTP calls per worker | `--concurrency K` (compose flag; default 4) | How many `embed_documents` calls can be in flight at the same time. | +| Per-Procrastinate-task size | `EMBEDDING_SUBJOB_SIZE` (settings constant; default 100) | How many report ids one `embed_reports_task` instance carries. The single chunking point for *every* enqueue — write-path handler, FTS chain tail, `embed_pending`, admin action — via `enqueue_embed_reports(report_ids)`. | +| Per-HTTP-call size | `EMBEDDING_BATCH_SIZE` (settings constant; default 32) | How many report bodies are sent in one `embed_documents` call *inside* one task. One subjob of 100 → ~3 HTTP calls of 32. | +| Concurrent task slots per worker | `--concurrency K` (compose flag; default 4) | How many `embed_reports_task` instances run in parallel on a single worker. | | Concurrent HTTP calls across all workers | `worker_count × --concurrency K` | The system's actual load ceiling on the embedding service. | +Why subjob granularity matters: a 1M-row `embed_pending` backfill becomes ~10k subjobs of 100, not one giant task. Multiple workers can drain in parallel; a stuck or failing subjob has bounded blast radius (retries reprocess only 100 ids, not 1M); Procrastinate's `--concurrency K` actually means something for backfill throughput. 
Write-path bulk-upserts get the same treatment: a 1000-row upload → 10 embed subjobs, not one. + To scale up, prefer adding worker processes (crash isolation + connection-pool fan-out) over raising `--concurrency` past ~8 (the embedding service typically saturates around there anyway). Total embedding load on the service is `worker_count × --concurrency`. ### 6.4 Failure semantics @@ -815,6 +818,7 @@ EMBEDDING_QUERY_INSTRUCTION = ( "Query: " ) EMBEDDING_BATCH_SIZE = 32 +EMBEDDING_SUBJOB_SIZE = 100 HYBRID_VECTOR_TOP_K = 100 HYBRID_FTS_MAX_RESULTS = 10_000 diff --git a/radis/pgsearch/admin.py b/radis/pgsearch/admin.py index 8ab9f09c..e66a0cac 100644 --- a/radis/pgsearch/admin.py +++ b/radis/pgsearch/admin.py @@ -1,10 +1,13 @@ -from django.conf import settings from django.contrib import admin, messages +from django.db.models import Count from django.db.models.query import QuerySet from django.http.request import HttpRequest +from procrastinate.contrib.django.models import ProcrastinateJob from .models import ReportSearchVector -from .tasks import embed_reports_task +from .tasks import enqueue_embed_reports + +EMBEDDINGS_QUEUE = "embeddings" @admin.register(ReportSearchVector) @@ -13,11 +16,35 @@ class ReportSearchVectorAdmin(admin.ModelAdmin): list_filter = ("embedding",) search_fields = ("report__document_id",) actions = ("enqueue_pending_embeddings",) + change_list_template = "admin/pgsearch/reportsearchvector/change_list.html" @admin.display(boolean=True, description="Embedded") def has_embedding(self, obj: ReportSearchVector) -> bool: return obj.embedding is not None + def changelist_view(self, request, extra_context=None): + extra_context = extra_context or {} + extra_context["embedding_pipeline_stats"] = self._embedding_pipeline_stats() + return super().changelist_view(request, extra_context=extra_context) + + @staticmethod + def _embedding_pipeline_stats() -> dict[str, int]: + """Snapshot of the embedding pipeline for the admin badge: how many + reports are 
still missing an embedding, and what Procrastinate is + doing about it right now.""" + pending = ReportSearchVector.objects.filter(embedding__isnull=True).count() + queue_counts = dict( + ProcrastinateJob.objects.filter(queue_name=EMBEDDINGS_QUEUE) + .values_list("status") + .annotate(n=Count("id")) + ) + return { + "pending_reports": pending, + "todo": queue_counts.get("todo", 0), + "doing": queue_counts.get("doing", 0), + "failed": queue_counts.get("failed", 0), + } + @admin.action(description="Enqueue embedding for selected rows (NULL only)") def enqueue_pending_embeddings( self, request: HttpRequest, queryset: QuerySet[ReportSearchVector] @@ -35,13 +62,11 @@ def enqueue_pending_embeddings( ) return - batch_size = settings.EMBEDDING_BATCH_SIZE - for i in range(0, len(report_ids), batch_size): - chunk = report_ids[i : i + batch_size] - embed_reports_task.defer(report_ids=list(chunk)) + subjob_count = enqueue_embed_reports(report_ids) self.message_user( request, - f"Enqueued {len(report_ids)} report(s) for embedding.", + f"Enqueued {len(report_ids)} report(s) across " + f"{subjob_count} subjob(s) for embedding.", level=messages.SUCCESS, ) diff --git a/radis/pgsearch/apps.py b/radis/pgsearch/apps.py index 69bc248b..bf13b7fd 100644 --- a/radis/pgsearch/apps.py +++ b/radis/pgsearch/apps.py @@ -89,13 +89,13 @@ def _handle_reports_changed(reports): if not reports: return - from radis.pgsearch.tasks import embed_reports_task, enqueue_bulk_index_reports + from radis.pgsearch.tasks import enqueue_bulk_index_reports, enqueue_embed_reports from radis.pgsearch.utils.indexing import bulk_upsert_report_search_vectors report_ids = [report.pk for report in reports] if settings.PGSEARCH_SYNC_INDEXING: bulk_upsert_report_search_vectors(report_ids) - embed_reports_task.defer(report_ids=report_ids) + enqueue_embed_reports(report_ids) else: enqueue_bulk_index_reports(report_ids) diff --git a/radis/pgsearch/management/commands/embed_pending.py 
b/radis/pgsearch/management/commands/embed_pending.py index 67fb4bd2..7dadd8b8 100644 --- a/radis/pgsearch/management/commands/embed_pending.py +++ b/radis/pgsearch/management/commands/embed_pending.py @@ -10,9 +10,13 @@ 3. **Outage recovery.** Tasks that exhausted Procrastinate retries during an extended embedding-service outage — re-run after the service recovers. -The command itself does no HTTP work; it enqueues tasks onto the `embeddings` -queue. The embeddings worker drains them at its configured `--concurrency`, -so operators cannot accidentally hammer the embedding service. +The command itself does no HTTP work; it defers Procrastinate tasks onto the +`embeddings` queue. The embeddings worker drains them at its configured +`--concurrency`, so operators cannot accidentally hammer the embedding service. + +Chunking goes through the shared `enqueue_embed_reports` helper, so the +subjob size matches what the write-path handler and the admin action use +(default `settings.EMBEDDING_SUBJOB_SIZE`). Properties: @@ -27,25 +31,26 @@ from django.core.management.base import BaseCommand from radis.pgsearch.models import ReportSearchVector -from radis.pgsearch.tasks import embed_reports_task +from radis.pgsearch.tasks import enqueue_embed_reports class Command(BaseCommand): help = ( - "Enqueue embed_reports_task for every ReportSearchVector with " - "embedding=NULL. The embeddings worker drains the queue at its " - "configured concurrency." + "Enqueue embed_reports_task subjobs for every ReportSearchVector " + "with embedding=NULL. The embeddings worker drains the queue at " + "its configured concurrency." ) def add_arguments(self, parser) -> None: parser.add_argument( - "--batch-size", + "--subjob-size", type=int, - default=settings.EMBEDDING_BATCH_SIZE, + default=settings.EMBEDDING_SUBJOB_SIZE, help=( - f"Reports per enqueued task (default " - f"{settings.EMBEDDING_BATCH_SIZE}). The worker further chunks " - f"each task by EMBEDDING_BATCH_SIZE internally." 
+ f"Reports per Procrastinate subjob (default " + f"{settings.EMBEDDING_SUBJOB_SIZE}). The worker further " + f"chunks each subjob into HTTP calls of " + f"EMBEDDING_BATCH_SIZE={settings.EMBEDDING_BATCH_SIZE}." ), ) parser.add_argument( @@ -67,12 +72,11 @@ def handle(self, *args, **opts) -> None: self.stdout.write("Nothing to embed.") return - batch_size = opts["batch_size"] + subjob_size = opts["subjob_size"] + self.stdout.write( + f"Enqueuing {len(ids)} report(s) in subjobs of {subjob_size}..." + ) + subjob_count = enqueue_embed_reports(ids, subjob_size=subjob_size) self.stdout.write( - f"Enqueuing {len(ids)} report(s) in tasks of {batch_size}..." + self.style.SUCCESS(f"Done. Deferred {subjob_count} subjob(s).") ) - for i in range(0, len(ids), batch_size): - chunk = ids[i : i + batch_size] - embed_reports_task.defer(report_ids=list(chunk)) - self.stdout.write(f" enqueued {i + len(chunk)}/{len(ids)}") - self.stdout.write(self.style.SUCCESS("Done.")) diff --git a/radis/pgsearch/tasks.py b/radis/pgsearch/tasks.py index 3ac66b18..18f0ae48 100644 --- a/radis/pgsearch/tasks.py +++ b/radis/pgsearch/tasks.py @@ -53,14 +53,14 @@ def bulk_index_reports(report_ids: list[int]) -> None: """Deferred FTS bulk-indexing for the bulk-upsert path (when `PGSEARCH_SYNC_INDEXING=False`). - Chains into `embed_reports_task` once RSV rows exist, so the embeddings - worker never reads a missing `report.body` or a stale tsvector. + Chains into `embed_reports_task` subjobs once RSV rows exist, so the + embeddings worker never reads a missing `report.body` or a stale tsvector. 
""" if not report_ids: return logger.info("Indexing %s reports in bulk.", len(report_ids)) bulk_upsert_report_search_vectors(report_ids) - embed_reports_task.defer(report_ids=report_ids) + enqueue_embed_reports(report_ids) def enqueue_bulk_index_reports(report_ids: list[int]) -> int | None: @@ -77,6 +77,35 @@ def enqueue_bulk_index_reports(report_ids: list[int]) -> int | None: ).defer(report_ids=payload) +def enqueue_embed_reports( + report_ids: list[int], *, subjob_size: int | None = None +) -> int: + """Chunk `report_ids` into subjobs and defer one `embed_reports_task` + per chunk. Returns the number of subjobs deferred. + + Subjob size defaults to `settings.EMBEDDING_SUBJOB_SIZE` (the + Procrastinate-task granularity). It's distinct from + `settings.EMBEDDING_BATCH_SIZE` (the per-HTTP-call size inside one + task). A 1M-report backfill becomes ~10k subjobs of 100, each making + ~3 HTTP calls of 32 — many workers can drain in parallel, retries + have bounded blast radius, and a stuck task can't tie up the worker + on the whole queue's worth of work. + + Single call site for every place that enqueues embedding work: the + write-path handler, the FTS chain tail, `embed_pending`, and the + admin action. Operators read one knob, not several. 
+ """ + if not report_ids: + return 0 + size = subjob_size if subjob_size is not None else settings.EMBEDDING_SUBJOB_SIZE + count = 0 + for start in range(0, len(report_ids), size): + chunk = report_ids[start : start + size] + embed_reports_task.defer(report_ids=list(chunk)) + count += 1 + return count + + def _embed_with_bisect( client: EmbeddingClient, rsvs: list[ReportSearchVector], diff --git a/radis/pgsearch/templates/admin/pgsearch/reportsearchvector/change_list.html b/radis/pgsearch/templates/admin/pgsearch/reportsearchvector/change_list.html new file mode 100644 index 00000000..d0504d83 --- /dev/null +++ b/radis/pgsearch/templates/admin/pgsearch/reportsearchvector/change_list.html @@ -0,0 +1,21 @@ +{% extends "admin/change_list.html" %} + +{% block content %} +{% if embedding_pipeline_stats %} +
+ Embedding pipeline +  ·  {{ embedding_pipeline_stats.pending_reports }} report{{ embedding_pipeline_stats.pending_reports|pluralize }} awaiting embedding +  ·  {{ embedding_pipeline_stats.todo }} queued +  ·  {{ embedding_pipeline_stats.doing }} in-flight +  ·  + {% if embedding_pipeline_stats.failed %} + {{ embedding_pipeline_stats.failed }} + {% else %} + 0 + {% endif %} + failed + ({{ EMBEDDINGS_QUEUE|default:"embeddings" }} queue) +
+{% endif %} +{{ block.super }} +{% endblock %} diff --git a/radis/pgsearch/tests/test_admin.py b/radis/pgsearch/tests/test_admin.py new file mode 100644 index 00000000..3bf096e9 --- /dev/null +++ b/radis/pgsearch/tests/test_admin.py @@ -0,0 +1,77 @@ +"""Tests for the ReportSearchVector admin pipeline-stats badge.""" +from django.db import connection + +import pytest + +from radis.pgsearch.admin import ReportSearchVectorAdmin +from radis.pgsearch.models import ReportSearchVector +from radis.reports.factories import ReportFactory + +pytestmark = pytest.mark.django_db(transaction=True) + + +@pytest.fixture(autouse=True) +def _clear_procrastinate_jobs(): + """ProcrastinateJob is read-only via the ORM, so pytest-django's + flush between transactional tests doesn't clear it. Truncate + explicitly so each test starts from an empty queue.""" + with connection.cursor() as cur: + cur.execute("TRUNCATE procrastinate_jobs RESTART IDENTITY CASCADE") + yield + with connection.cursor() as cur: + cur.execute("TRUNCATE procrastinate_jobs RESTART IDENTITY CASCADE") + + +def _insert_procrastinate_job(status: str, queue: str = "embeddings") -> None: + """Insert a row directly via SQL because ProcrastinateJob's Django ORM + surface is intentionally read-only — Procrastinate owns writes. 
We + only need (queue_name, status) for the stats helper to count.""" + with connection.cursor() as cur: + cur.execute( + "INSERT INTO procrastinate_jobs " + "(queue_name, task_name, priority, lock, queueing_lock, args, status, attempts) " + "VALUES (%s, %s, %s, NULL, NULL, %s, %s::procrastinate_job_status, %s)", + [ + queue, + "radis.pgsearch.tasks.embed_reports_task", + 0, + '{"report_ids": []}', + status, + 0, + ], + ) + + +def test_pipeline_stats_counts_pending_rsvs(): + [ReportFactory.create() for _ in range(3)] + embedded = ReportFactory.create() + rsv = ReportSearchVector.objects.get(report_id=embedded.pk) + rsv.embedding = [0.0] * 1024 + rsv.save() + + stats = ReportSearchVectorAdmin._embedding_pipeline_stats() + assert stats["pending_reports"] == 3 + + +def test_pipeline_stats_counts_procrastinate_jobs_by_status(): + _insert_procrastinate_job("todo") + _insert_procrastinate_job("todo") + _insert_procrastinate_job("doing") + _insert_procrastinate_job("failed") + # Job on a different queue must not be counted. 
+ _insert_procrastinate_job("todo", queue="default") + + stats = ReportSearchVectorAdmin._embedding_pipeline_stats() + assert stats["todo"] == 2 + assert stats["doing"] == 1 + assert stats["failed"] == 1 + + +def test_pipeline_stats_zero_when_no_queue_activity(): + stats = ReportSearchVectorAdmin._embedding_pipeline_stats() + assert stats == { + "pending_reports": 0, + "todo": 0, + "doing": 0, + "failed": 0, + } diff --git a/radis/pgsearch/tests/test_embed_pending_command.py b/radis/pgsearch/tests/test_embed_pending_command.py index b125cb2a..83eb5c8e 100644 --- a/radis/pgsearch/tests/test_embed_pending_command.py +++ b/radis/pgsearch/tests/test_embed_pending_command.py @@ -12,37 +12,49 @@ def test_nothing_to_embed(): out = StringIO() - with patch("radis.pgsearch.management.commands.embed_pending.embed_reports_task") as task: + with patch( + "radis.pgsearch.management.commands.embed_pending.enqueue_embed_reports" + ) as enqueue: call_command("embed_pending", stdout=out) assert "Nothing to embed." in out.getvalue() - task.defer.assert_not_called() + enqueue.assert_not_called() -def test_enqueues_all_pending_in_batches(): +def test_enqueues_via_helper_with_explicit_subjob_size(): # ReportFactory triggers the FTS post_save signal → RSV row with embedding=NULL. reports = [ReportFactory.create() for _ in range(5)] expected_ids = sorted(r.pk for r in reports) out = StringIO() - with patch("radis.pgsearch.management.commands.embed_pending.embed_reports_task") as task: - call_command("embed_pending", "--batch-size", "2", stdout=out) + with patch( + "radis.pgsearch.management.commands.embed_pending.enqueue_embed_reports", + return_value=3, + ) as enqueue: + call_command("embed_pending", "--subjob-size", "2", stdout=out) + + # The command delegates chunking to the shared helper. 
+ enqueue.assert_called_once() + args, kwargs = enqueue.call_args + assert sorted(args[0]) == expected_ids + assert kwargs["subjob_size"] == 2 - # 5 reports / batch 2 → three defer calls of sizes 2, 2, 1. - assert task.defer.call_count == 3 - enqueued_ids = [pk for call in task.defer.call_args_list for pk in call.kwargs["report_ids"]] - assert sorted(enqueued_ids) == expected_ids output = out.getvalue() - assert "2/5" in output - assert "5/5" in output - assert "Done." in output + assert "5 report(s) in subjobs of 2" in output + assert "Deferred 3 subjob(s)" in output def test_limit_caps_work(): [ReportFactory.create() for _ in range(5)] out = StringIO() - with patch("radis.pgsearch.management.commands.embed_pending.embed_reports_task") as task: - call_command("embed_pending", "--limit", "3", "--batch-size", "10", stdout=out) - - enqueued_ids = [pk for call in task.defer.call_args_list for pk in call.kwargs["report_ids"]] - assert len(enqueued_ids) == 3 + with patch( + "radis.pgsearch.management.commands.embed_pending.enqueue_embed_reports", + return_value=1, + ) as enqueue: + call_command( + "embed_pending", "--limit", "3", "--subjob-size", "10", stdout=out + ) + + args, kwargs = enqueue.call_args + assert len(args[0]) == 3 + assert kwargs["subjob_size"] == 10 diff --git a/radis/pgsearch/tests/test_embed_reports_task.py b/radis/pgsearch/tests/test_embed_reports_task.py index 0a6c7e5d..dd1b1589 100644 --- a/radis/pgsearch/tests/test_embed_reports_task.py +++ b/radis/pgsearch/tests/test_embed_reports_task.py @@ -7,7 +7,11 @@ import stamina from radis.pgsearch.models import ReportSearchVector -from radis.pgsearch.tasks import bulk_index_reports, embed_reports_task +from radis.pgsearch.tasks import ( + bulk_index_reports, + embed_reports_task, + enqueue_embed_reports, +) from radis.pgsearch.utils.embedding_client import ( EmbeddingClientError, EmbeddingPayloadTooLargeError, @@ -89,10 +93,12 @@ def test_embedding_error_propagates(): assert 
ReportSearchVector.objects.filter(embedding__isnull=True).count() == 2 -def test_bulk_index_reports_chains_into_embed_reports_task(): - """`bulk_index_reports` upserts RSVs and then defers `embed_reports_task`. - The chain is the ordering guarantee: the embeddings worker only ever sees - report ids whose RSV rows are already committed.""" +def test_bulk_index_reports_chains_into_embed_reports_task(settings): + """`bulk_index_reports` upserts RSVs and then chunks the embed work via + `enqueue_embed_reports`. The chain is the ordering guarantee: the + embeddings worker only ever sees report ids whose RSV rows are already + committed.""" + settings.EMBEDDING_SUBJOB_SIZE = 100 reports = [ReportFactory.create() for _ in range(3)] pks = [r.pk for r in reports] ReportSearchVector.objects.filter(report_id__in=pks).delete() @@ -100,11 +106,70 @@ def test_bulk_index_reports_chains_into_embed_reports_task(): with patch("radis.pgsearch.tasks.embed_reports_task.defer") as defer: bulk_index_reports(report_ids=pks) - # RSVs were upserted, then the embed task was deferred with the same ids. + # RSVs were upserted, then one embed subjob covering all 3 ids was + # deferred (3 < SUBJOB_SIZE so the whole batch fits in one subjob). assert ReportSearchVector.objects.filter(report_id__in=pks).count() == 3 defer.assert_called_once_with(report_ids=pks) +def test_bulk_index_reports_splits_into_subjobs_when_exceeding_subjob_size(settings): + """A bulk-upsert larger than `EMBEDDING_SUBJOB_SIZE` must defer multiple + embed subjobs so the embeddings worker can drain them in parallel and + retries/failures have bounded blast radius.""" + settings.EMBEDDING_SUBJOB_SIZE = 4 + reports = [ReportFactory.create() for _ in range(10)] + pks = [r.pk for r in reports] + + with patch("radis.pgsearch.tasks.embed_reports_task.defer") as defer: + bulk_index_reports(report_ids=pks) + + # 10 reports / subjob 4 → 3 defer calls of sizes 4, 4, 2. 
+ assert defer.call_count == 3 + enqueued_chunks = [call.kwargs["report_ids"] for call in defer.call_args_list] + assert [len(c) for c in enqueued_chunks] == [4, 4, 2] + # The union of all chunks covers exactly the input ids in order. + assert [pk for c in enqueued_chunks for pk in c] == pks + + +def test_enqueue_embed_reports_helper_chunks_by_subjob_size(settings): + """The shared `enqueue_embed_reports` helper is the single chunking + point. A 1M-row backfill becomes ~10k subjobs (no single huge task); + a single create with one id becomes one subjob (no overhead).""" + settings.EMBEDDING_SUBJOB_SIZE = 3 + + with patch("radis.pgsearch.tasks.embed_reports_task.defer") as defer: + count = enqueue_embed_reports([1, 2, 3, 4, 5, 6, 7]) + + assert count == 3 + assert defer.call_count == 3 + assert [c.kwargs["report_ids"] for c in defer.call_args_list] == [ + [1, 2, 3], + [4, 5, 6], + [7], + ] + + +def test_enqueue_embed_reports_helper_empty_input_is_noop(): + with patch("radis.pgsearch.tasks.embed_reports_task.defer") as defer: + count = enqueue_embed_reports([]) + assert count == 0 + defer.assert_not_called() + + +def test_enqueue_embed_reports_helper_explicit_subjob_size_overrides_setting(settings): + """Operators (e.g., `embed_pending --subjob-size=…`) can pass a + one-off override without mutating the global setting.""" + settings.EMBEDDING_SUBJOB_SIZE = 100 + + with patch("radis.pgsearch.tasks.embed_reports_task.defer") as defer: + count = enqueue_embed_reports([1, 2, 3, 4, 5], subjob_size=2) + + assert count == 3 + assert [c.kwargs["report_ids"] for c in defer.call_args_list] == [ + [1, 2], [3, 4], [5] + ] + + def test_bisects_on_too_large_and_isolates_offender(settings, caplog, monkeypatch): """When the backend rejects a batch as too large, the task bisects until it isolates the single offending report, logs ERROR with its id + body diff --git a/radis/settings/base.py b/radis/settings/base.py index 89ff04c3..bd3a8565 100644 --- a/radis/settings/base.py +++ 
b/radis/settings/base.py @@ -350,6 +350,7 @@ "Instruct: Given a radiology search query, retrieve relevant radiology reports.\nQuery: " ) EMBEDDING_BATCH_SIZE = 32 +EMBEDDING_SUBJOB_SIZE = 1000 # Hybrid search tuning HYBRID_VECTOR_TOP_K = 100 From 5a333862bfca8b49ba1adda518059653b71df771 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Tue, 23 Jun 2026 11:32:05 +0000 Subject: [PATCH 66/68] =?UTF-8?q?refactor(pgsearch):=20rename=20ReportSear?= =?UTF-8?q?chVector=20=E2=86=92=20ReportSearchIndex?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The model holds the FTS tsvector AND the dense embedding (and likely a trigram column in the future). Its name should describe its role ("per-report search-backing row") rather than one specific field. Field names stay descriptive (search_vector for the tsvector, embedding for the dense vector, etc.). Sweeps the class, the related_name (Report.search_vector → Report.search_index), the bulk-upsert helper (bulk_upsert_report_search_indexes), the document_utils type alias, all imports, signals, admin, tasks, providers, indexing utils, embed_pending command, tests, and the spec. Adds a 0003 migration that performs a RenameModel + AlterField for the related_name — Django renames the table to pgsearch_reportsearchindex, the raw SQL in indexing.py is updated to match. Moves the admin template to admin/pgsearch/reportsearchindex/change_list.html. Also swaps the meaningless list_filter = ("embedding",) (which tried to enumerate distinct vector values) for admin.EmptyFieldListFilter, giving operators a "Has embedding: Yes / No" toggle. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../specs/2026-05-28-hybrid-search.md | 68 +++++++++---------- radis/pgsearch/admin.py | 16 ++--- radis/pgsearch/apps.py | 10 +-- .../management/commands/embed_pending.py | 10 +-- .../migrations/0003_rename_search_index.py | 31 +++++++++ radis/pgsearch/models.py | 14 +++- radis/pgsearch/providers.py | 20 +++--- radis/pgsearch/signals.py | 8 +-- radis/pgsearch/tasks.py | 24 +++---- .../change_list.html | 0 radis/pgsearch/tests/test_admin.py | 14 ++-- .../pgsearch/tests/test_embed_reports_task.py | 20 +++--- radis/pgsearch/tests/test_indexing.py | 12 ++-- radis/pgsearch/tests/test_provider_hybrid.py | 12 ++-- radis/pgsearch/utils/document_utils.py | 6 +- radis/pgsearch/utils/indexing.py | 10 +-- 16 files changed, 157 insertions(+), 118 deletions(-) create mode 100644 radis/pgsearch/migrations/0003_rename_search_index.py rename radis/pgsearch/templates/admin/pgsearch/{reportsearchvector => reportsearchindex}/change_list.html (100%) diff --git a/docs/superpowers/specs/2026-05-28-hybrid-search.md b/docs/superpowers/specs/2026-05-28-hybrid-search.md index 33d26a70..e73dc29c 100644 --- a/docs/superpowers/specs/2026-05-28-hybrid-search.md +++ b/docs/superpowers/specs/2026-05-28-hybrid-search.md @@ -10,9 +10,9 @@ ## 1. Overview -RADIS today provides PostgreSQL full-text search (FTS) over radiology reports via the `radis.pgsearch` provider: each `Report` gets a 1:1 `ReportSearchVector` row holding a `tsvector`, kept in sync via `post_save` signal and a bulk re-index task. Queries are ranked by `ts_rank` and snippeted via `ts_headline`. +RADIS today provides PostgreSQL full-text search (FTS) over radiology reports via the `radis.pgsearch` provider: each `Report` gets a 1:1 `ReportSearchIndex` row holding a `tsvector`, kept in sync via `post_save` signal and a bulk re-index task. Queries are ranked by `ts_rank` and snippeted via `ts_headline`. 
-This spec extends that infrastructure with a dense-vector retrieval side, fused with FTS via Reciprocal Rank Fusion (RRF), to deliver **hybrid search**. Embeddings are produced by a Qwen3-Embedding-4B inference endpoint and stored in the same `ReportSearchVector` table. +This spec extends that infrastructure with a dense-vector retrieval side, fused with FTS via Reciprocal Rank Fusion (RRF), to deliver **hybrid search**. Embeddings are produced by a Qwen3-Embedding-4B inference endpoint and stored in the same `ReportSearchIndex` table. The public `SearchProvider` API (`radis.search.site`) is unchanged. Callers — `SearchView`, `ExtractionJob`, `SubscriptionJob`, the REST API — see no signature differences. Only the body of `radis.pgsearch.providers.search()` and `retrieve()` changes. @@ -50,15 +50,15 @@ The public `SearchProvider` API (`radis.search.site`) is unchanged. Callers — │ 1. embed_query() ──► EmbeddingClient ──► Qwen3 endpoint │ │ on failure: query_vec = None │ │ │ -│ 2. Vector top-K ────► ReportSearchVector (HNSW on .embedding) │ +│ 2. Vector top-K ────► ReportSearchIndex (HNSW on .embedding) │ │ filtered by structured filters │ │ │ -│ 3. FTS hits ────► ReportSearchVector (GIN on .search_vector) │ +│ 3. FTS hits ────► ReportSearchIndex (GIN on .search_vector) │ │ filtered by structured filters │ │ │ │ 4. Python-side RRF fusion of (vec_top_K ∪ fts_hits) │ │ 5. Pagination on the fused order │ -│ 6. ts_headline() ────► ReportSearchVector (page-slice only) │ +│ 6. ts_headline() ────► ReportSearchIndex (page-slice only) │ └──────────────────────────────────────────────────────────────────────┘ ┌──────────────────────────────────────────────────────────────────────┐ @@ -77,7 +77,7 @@ The public `SearchProvider` API (`radis.search.site`) is unchanged. 
Callers — │ ▼ (one of the registered subscribers is pgsearch:) │ │ pgsearch._handle_reports_changed(reports) │ │ ├─ PGSEARCH_SYNC_INDEXING=True: │ -│ │ bulk_upsert_report_search_vectors(report_ids) inline, │ +│ │ bulk_upsert_report_search_indexes(report_ids) inline, │ │ │ then embed_reports_task.defer(report_ids=...) │ │ └─ PGSEARCH_SYNC_INDEXING=False: │ │ enqueue_bulk_index_reports(report_ids); the embed task is │ @@ -88,7 +88,7 @@ The public `SearchProvider` API (`radis.search.site`) is unchanged. Callers — │ ──── elsewhere, on the default_worker process ──── │ │ │ │ bulk_index_reports(report_ids) (default queue) │ -│ ├─ bulk_upsert_report_search_vectors(report_ids) │ +│ ├─ bulk_upsert_report_search_indexes(report_ids) │ │ └─ embed_reports_task.defer(report_ids=...) │ │ │ │ ──── elsewhere, on the embeddings_worker process ──── │ @@ -96,7 +96,7 @@ The public `SearchProvider` API (`radis.search.site`) is unchanged. Callers — │ embed_reports_task(report_ids) (embeddings queue) │ │ ├─ load RSVs (select_related("report")) │ │ ├─ EmbeddingClient.embed_documents([body, ...]) (batched) │ -│ ├─ L2-normalize; ReportSearchVector.objects.bulk_update │ +│ ├─ L2-normalize; ReportSearchIndex.objects.bulk_update │ │ └─ on EmbeddingClientError: raise │ │ → Procrastinate retry policy (exp backoff, N attempts) │ └──────────────────────────────────────────────────────────────────────┘ @@ -119,11 +119,11 @@ Both ingest paths — single-create (`POST /api/reports/`, `PUT /api/reports/{id | `utils/embedding_client.py` | `EmbeddingClient` used by both the query path and `embed_reports_task` on the worker; pluggable backends (`openai`, `ollama`) | | `apps.py` (modified) | `register_app()` now also registers `_handle_reports_changed` on both `reports_created_handlers` and `reports_updated_handlers`. In sync FTS mode the handler upserts inline then defers `embed_reports_task`; in deferred FTS mode it enqueues `bulk_index_reports`, which chains the embed task at the end of its own run. 
This is the only place pgsearch wires itself into the reports app. | | `tasks.py` (embedding entries) | `embed_reports_task(report_ids)` Procrastinate task on the `embeddings` queue. Loads RSVs by `report_id`, calls `EmbeddingClient.embed_documents`, then `bulk_update`. Raises on `EmbeddingClientError` so the Procrastinate retry policy applies. | -| `admin.py` | Registers `ReportSearchVector` with a `has_embedding` list display, an `embedding` `IsNull` filter, and an admin action `enqueue_pending_embeddings` that defers `embed_reports_task` for the selected rows whose embedding is NULL. Mirrors the `embed_pending` management command for operators who prefer the UI. | +| `admin.py` | Registers `ReportSearchIndex` with a `has_embedding` list display, an `embedding` `IsNull` filter, and an admin action `enqueue_pending_embeddings` that defers `embed_reports_task` for the selected rows whose embedding is NULL. Mirrors the `embed_pending` management command for operators who prefer the UI. | | `migrations/0002_hybrid_search.py` | Single schema migration: `CREATE EXTENSION vector`; adds `embedding vector(N)` column + HNSW index | -| `models.py` (modified) | Adds `embedding` field + `HnswIndex` to `ReportSearchVector`. No Job/Task models. | +| `models.py` (modified) | Adds `embedding` field + `HnswIndex` to `ReportSearchIndex`. No Job/Task models. | | `signals.py` (unchanged from FTS-only) | The FTS `create_or_update_report_search_vector` receiver stays; **no embedding signal** | -| `tasks.py` (FTS bits) | FTS bulk-indexing helper `bulk_upsert_report_search_vectors` and the `bulk_index_reports` Procrastinate task. `bulk_index_reports` upserts the RSV rows and then chains `embed_reports_task.defer(...)` at the end of its run, so the embeddings worker only ever sees report ids whose RSV rows are already committed (see §6.6). | +| `tasks.py` (FTS bits) | FTS bulk-indexing helper `bulk_upsert_report_search_indexes` and the `bulk_index_reports` Procrastinate task. 
`bulk_index_reports` upserts the RSV rows and then chains `embed_reports_task.defer(...)` at the end of its run, so the embeddings worker only ever sees report ids whose RSV rows are already committed (see §6.6). | | `providers.py` (modified) | Replaces `search()` and `retrieve()` bodies with hybrid logic | | `tests/...` | Coverage per §10 | @@ -157,7 +157,7 @@ Three operations: Reverse is a no-op because the extension may be shared with other Postgres usage and dropping it would damage unrelated state. Dev rollback is handled by recreating the database. -2. `AddField` `embedding` on `ReportSearchVector`: +2. `AddField` `embedding` on `ReportSearchIndex`: `pgvector.django.vector.VectorField(dimensions=settings.EMBEDDING_DIM, null=True)`. 3. `AddIndex` HNSW on `embedding`: `m=16`, `ef_construction=64`, `opclasses=["vector_cosine_ops"]`, `name="pgsearch_embedding_hnsw"`. @@ -174,8 +174,8 @@ and column. from django.conf import settings from pgvector.django import HnswIndex, VectorField -class ReportSearchVector(models.Model): - report = models.OneToOneField(Report, on_delete=models.CASCADE, related_name="search_vector") +class ReportSearchIndex(models.Model): + report = models.OneToOneField(Report, on_delete=models.CASCADE, related_name="search_index") search_vector = SearchVectorField(null=True) embedding = VectorField(dimensions=settings.EMBEDDING_DIM, null=True) @@ -194,7 +194,7 @@ class ReportSearchVector(models.Model): `embedding` is nullable: the row exists from the moment a `Report` is created (FTS path), but its embedding is filled by the `embed_reports_task` Procrastinate worker, enqueued from `transaction.on_commit` (§6). A NULL embedding is treated as "not embedded yet" at query time, and the row participates via the FTS half only. -`save()` on `ReportSearchVector` retains its current behavior of recomputing `search_vector` from `report.body`. 
The embedding column is written **only** by `embed_reports_task` via `bulk_update()`, never by `save()`, to avoid triggering the FTS signal recursively and to keep the two indexing paths independent. +`save()` on `ReportSearchIndex` retains its current behavior of recomputing `search_vector` from `report.body`. The embedding column is written **only** by `embed_reports_task` via `bulk_update()`, never by `save()`, to avoid triggering the FTS signal recursively and to keep the two indexing paths independent. ### 4.5 Operational note on `EMBEDDING_DIM` @@ -230,7 +230,7 @@ there is exactly one source of truth (the `dimensions=...` literal that # radis/pgsearch/apps.py def _migration_embedding_dim() -> int | None: - """Return the `dimensions` value of `ReportSearchVector.embedding` as + """Return the `dimensions` value of `ReportSearchIndex.embedding` as captured by the on-disk pgsearch migrations. Returns None if the field cannot be located (e.g., migrations are missing or out of sync).""" from django.db.migrations.loader import MigrationLoader @@ -238,7 +238,7 @@ def _migration_embedding_dim() -> int | None: loader = MigrationLoader(connection=None, ignore_no_migrations=True) state = loader.project_state() try: - model = state.apps.get_model("pgsearch", "ReportSearchVector") + model = state.apps.get_model("pgsearch", "ReportSearchIndex") return model._meta.get_field("embedding").dimensions except (LookupError, AttributeError): return None @@ -253,7 +253,7 @@ def check_embedding_dim_matches_migration(app_configs, **kwargs): "pgsearch migrations.", id="pgsearch.E002", hint="Verify that radis/pgsearch/migrations/ contains a migration " - "that adds `embedding` to `ReportSearchVector`.", + "that adds `embedding` to `ReportSearchIndex`.", )] if settings.EMBEDDING_DIM != migration_dim: return [Error( @@ -348,7 +348,7 @@ class EmbeddingClient: EMBEDDING_MODEL_NAME=dengcao/Qwen3-Embedding-4B:Q5_K_M EMBEDDING_DIM=2560 ``` - GGUF-quantized embedding models produce slightly 
different vectors than the bf16 reference, so dev embeddings are not interchangeable with prod embeddings. After swapping the model between dev/prod, clear the column (`ReportSearchVector.objects.update(embedding=None)`) and run `./manage.py embed_pending`. + GGUF-quantized embedding models produce slightly different vectors than the bf16 reference, so dev embeddings are not interchangeable with prod embeddings. After swapping the model between dev/prod, clear the column (`ReportSearchIndex.objects.update(embedding=None)`) and run `./manage.py embed_pending`. ## 6. Async indexing (deferred via Procrastinate) @@ -366,7 +366,7 @@ def _handle_reports_changed(reports): return report_ids = [r.pk for r in reports] if settings.PGSEARCH_SYNC_INDEXING: - bulk_upsert_report_search_vectors(report_ids) + bulk_upsert_report_search_indexes(report_ids) embed_reports_task.defer(report_ids=report_ids) else: # bulk_index_reports chains embed_reports_task at the end of its run, @@ -403,7 +403,7 @@ def embed_reports_task(report_ids: list[int]) -> None: return rsvs = list( - ReportSearchVector.objects.filter(report_id__in=report_ids) + ReportSearchIndex.objects.filter(report_id__in=report_ids) .select_related("report") .only("id", "report_id", "report__body") ) @@ -412,15 +412,15 @@ def embed_reports_task(report_ids: list[int]) -> None: return batch_size = settings.EMBEDDING_BATCH_SIZE - embedded: list[ReportSearchVector] = [] - skipped: list[ReportSearchVector] = [] + embedded: list[ReportSearchIndex] = [] + skipped: list[ReportSearchIndex] = [] with EmbeddingClient() as client: for start in range(0, len(rsvs), batch_size): chunk = rsvs[start : start + batch_size] _embed_with_bisect(client, chunk, embedded, skipped) if embedded: - ReportSearchVector.objects.bulk_update(embedded, fields=["embedding"]) + ReportSearchIndex.objects.bulk_update(embedded, fields=["embedding"]) if skipped: logger.error("…skipped as too large; report_ids=%s", [r.report_id for r in skipped]) @@ -544,7 +544,7 
@@ async def _drain(self, ids, batch_size): The three scenarios still apply: 1. **Backfill** of historical NULLs (rows loaded before the deferred-embedding architecture shipped). -2. **Dim or model change** following §4.5 (or `ReportSearchVector.objects.update(embedding=None)` for a same-dim model swap). +2. **Dim or model change** following §4.5 (or `ReportSearchIndex.objects.update(embedding=None)` for a same-dim model swap). 3. **Outage recovery** for tasks that exhausted Procrastinate retries during an extended embedding-service outage. Properties: @@ -560,18 +560,18 @@ The pre-existing `PGSEARCH_SYNC_INDEXING` switch is **retained** with the same s | Mode | `PGSEARCH_SYNC_INDEXING` | FTS step | Embedding step | |---|---|---|---| -| Sync | `True` | `bulk_upsert_report_search_vectors(ids)` inline inside the handler | `embed_reports_task.defer(...)` immediately after, in the same handler call. RSV rows are already committed. | +| Sync | `True` | `bulk_upsert_report_search_indexes(ids)` inline inside the handler | `embed_reports_task.defer(...)` immediately after, in the same handler call. RSV rows are already committed. | | Deferred (default) | `False` | `enqueue_bulk_index_reports(ids)` defers `bulk_index_reports` to the `default` queue | `bulk_index_reports` itself defers `embed_reports_task` at the end of its run. Handler does *not* defer embed directly. | -`bulk_index_reports` now ends with `embed_reports_task.defer(report_ids=...)`. The defer happens inside the same task body, after `bulk_upsert_report_search_vectors` has committed the RSV rows, so the embeddings worker can only observe a `report_ids` payload whose RSV rows already exist. This replaces the earlier "defensive idempotent re-upsert at the top of the embed task" design — the chain is the ordering guarantee. +`bulk_index_reports` now ends with `embed_reports_task.defer(report_ids=...)`. 
The defer happens inside the same task body, after `bulk_upsert_report_search_indexes` has committed the RSV rows, so the embeddings worker can only observe a `report_ids` payload whose RSV rows already exist. This replaces the earlier "defensive idempotent re-upsert at the top of the embed task" design — the chain is the ordering guarantee. Properties: - **No race.** The embeddings worker never picks up a report id before its RSV row is committed. The embed task can read `report.body` and write `embedding` without checking for RSV existence. -- **Simple embed task.** No `bulk_upsert_report_search_vectors` shim at the top, no idempotent re-upsert cost on the embeddings worker, no extra commit hop. +- **Simple embed task.** No `bulk_upsert_report_search_indexes` shim at the top, no idempotent re-upsert cost on the embeddings worker, no extra commit hop. - **Operator choice preserved.** Deployments that prefer sync FTS keep that option; deployments that prefer the deferred FTS task for large bulks keep that option. Hybrid search is orthogonal to the FTS-mode decision. - **Two queues, two concerns.** FTS deferral runs on the `default` queue (where `bulk_index_reports` already lived); embedding runs on the dedicated `embeddings` queue. FTS-only worker capacity does not compete with embedding capacity. -- **Operator-triggered re-embed.** The `embed_pending` management command and the `enqueue_pending_embeddings` admin action defer `embed_reports_task` directly. Both bypass `bulk_index_reports` but the invariant still holds: their queries are over existing `ReportSearchVector` rows with `embedding IS NULL`, so the RSV rows exist by construction. +- **Operator-triggered re-embed.** The `embed_pending` management command and the `enqueue_pending_embeddings` admin action defer `embed_reports_task` directly. 
Both bypass `bulk_index_reports` but the invariant still holds: their queries are over existing `ReportSearchIndex` rows with `embedding IS NULL`, so the RSV rows exist by construction. The single-create / PUT path is unaffected by `PGSEARCH_SYNC_INDEXING`. Its FTS step is the `post_save` signal on `Report`, which is always sync inline by construction. The same handler still fires for it; the handler then takes the sync-mode branch's behaviour (immediate embed defer), which is correct since the RSV row was just written sync by the signal. @@ -628,7 +628,7 @@ def search(s: Search) -> SearchResult: vec_rank: dict[int, int] = {} if query_vec is not None: ids = list( - ReportSearchVector.objects + ReportSearchIndex.objects .filter(filter_q) .exclude(embedding__isnull=True) .annotate(distance=CosineDistance("embedding", query_vec)) @@ -639,7 +639,7 @@ def search(s: Search) -> SearchResult: # FTS side fts_rows = list( - ReportSearchVector.objects + ReportSearchIndex.objects .filter(filter_q) .filter(search_vector=tsquery) .annotate(rank=SearchRank(F("search_vector"), tsquery)) @@ -662,7 +662,7 @@ def search(s: Search) -> SearchResult: # Headline + hydration for the page slice only page_rows = ( - ReportSearchVector.objects + ReportSearchIndex.objects .filter(report_id__in=page_ids) .annotate( summary=SearchHeadline("report__body", tsquery, config=language, @@ -887,7 +887,7 @@ Both files add an `embeddings_worker.command` block. 
Dev uses `-l debug --autore | `tests/integration/test_migrations.py` (new, `django-test-migrations`) | Extension migration runs; column + HNSW index created with configured dim; reverse works | | `tests/integration/test_provider_hybrid.py` (new) | FTS-only hit, vector-only hit ("no pneumothorax" fixture), both-sides hit, filter honoring, stable pagination, embedding-service-down fallback, NULL-embedding rows still returned, `ts_headline` query-count bounded to page, empty-summary fallback | -Factories: existing `ReportSearchVectorFactory` gains optional `embedding` kwarg (default `None`). New `ReportSearchVectorWithEmbeddingFactory` generates deterministic normalized vectors of the configured dim from a seed. Real Qwen3 embeddings are not used in tests. +Factories: existing `ReportSearchIndexFactory` gains optional `embedding` kwarg (default `None`). New `ReportSearchIndexWithEmbeddingFactory` generates deterministic normalized vectors of the configured dim from a seed. Real Qwen3 embeddings are not used in tests. ### 10.3 View-level smoke @@ -933,7 +933,7 @@ Documented in §5.4. Mitigated by following §4.5 after a model swap and then ru V1 re-embeds anything where `embedding IS NULL`. A future optimization could track whether the body actually changed (e.g., a `body_hash` column on -`ReportSearchVector` updated only on body changes) so metadata-only updates +`ReportSearchIndex` updated only on body changes) so metadata-only updates don't have to null the embedding. Not in v1; profiling will tell us whether it matters. 
diff --git a/radis/pgsearch/admin.py b/radis/pgsearch/admin.py index e66a0cac..6257e086 100644 --- a/radis/pgsearch/admin.py +++ b/radis/pgsearch/admin.py @@ -4,22 +4,22 @@ from django.http.request import HttpRequest from procrastinate.contrib.django.models import ProcrastinateJob -from .models import ReportSearchVector +from .models import ReportSearchIndex from .tasks import enqueue_embed_reports EMBEDDINGS_QUEUE = "embeddings" -@admin.register(ReportSearchVector) -class ReportSearchVectorAdmin(admin.ModelAdmin): +@admin.register(ReportSearchIndex) +class ReportSearchIndexAdmin(admin.ModelAdmin): list_display = ("id", "report_id", "has_embedding") - list_filter = ("embedding",) + list_filter = (("embedding", admin.EmptyFieldListFilter),) search_fields = ("report__document_id",) actions = ("enqueue_pending_embeddings",) - change_list_template = "admin/pgsearch/reportsearchvector/change_list.html" + change_list_template = "admin/pgsearch/reportsearchindex/change_list.html" @admin.display(boolean=True, description="Embedded") - def has_embedding(self, obj: ReportSearchVector) -> bool: + def has_embedding(self, obj: ReportSearchIndex) -> bool: return obj.embedding is not None def changelist_view(self, request, extra_context=None): @@ -32,7 +32,7 @@ def _embedding_pipeline_stats() -> dict[str, int]: """Snapshot of the embedding pipeline for the admin badge: how many reports are still missing an embedding, and what Procrastinate is doing about it right now.""" - pending = ReportSearchVector.objects.filter(embedding__isnull=True).count() + pending = ReportSearchIndex.objects.filter(embedding__isnull=True).count() queue_counts = dict( ProcrastinateJob.objects.filter(queue_name=EMBEDDINGS_QUEUE) .values_list("status") @@ -47,7 +47,7 @@ def _embedding_pipeline_stats() -> dict[str, int]: @admin.action(description="Enqueue embedding for selected rows (NULL only)") def enqueue_pending_embeddings( - self, request: HttpRequest, queryset: QuerySet[ReportSearchVector] + self, 
request: HttpRequest, queryset: QuerySet[ReportSearchIndex] ) -> None: report_ids = list( queryset.filter(embedding__isnull=True) diff --git a/radis/pgsearch/apps.py b/radis/pgsearch/apps.py index bf13b7fd..c004f121 100644 --- a/radis/pgsearch/apps.py +++ b/radis/pgsearch/apps.py @@ -13,7 +13,7 @@ def ready(self): def _migration_embedding_dim() -> int | None: - """Return the `dimensions` value of `ReportSearchVector.embedding` as + """Return the `dimensions` value of `ReportSearchIndex.embedding` as captured by the on-disk pgsearch migrations. Returns None if the field cannot be located (migrations missing or model renamed).""" from django.db.migrations.loader import MigrationLoader @@ -21,7 +21,7 @@ def _migration_embedding_dim() -> int | None: loader = MigrationLoader(connection=None, ignore_no_migrations=True) state = loader.project_state() try: - model = state.apps.get_model("pgsearch", "ReportSearchVector") + model = state.apps.get_model("pgsearch", "ReportSearchIndex") return model._meta.get_field("embedding").dimensions except (LookupError, AttributeError): return None @@ -44,7 +44,7 @@ def check_embedding_dim_matches_migration(app_configs, **kwargs): hint=( "Verify that `radis/pgsearch/migrations/` contains a " "migration that adds the `embedding` field to " - "`ReportSearchVector`, and that `makemigrations pgsearch` " + "`ReportSearchIndex`, and that `makemigrations pgsearch` " "succeeds without changes." 
), ) @@ -90,11 +90,11 @@ def _handle_reports_changed(reports): return from radis.pgsearch.tasks import enqueue_bulk_index_reports, enqueue_embed_reports - from radis.pgsearch.utils.indexing import bulk_upsert_report_search_vectors + from radis.pgsearch.utils.indexing import bulk_upsert_report_search_indexes report_ids = [report.pk for report in reports] if settings.PGSEARCH_SYNC_INDEXING: - bulk_upsert_report_search_vectors(report_ids) + bulk_upsert_report_search_indexes(report_ids) enqueue_embed_reports(report_ids) else: enqueue_bulk_index_reports(report_ids) diff --git a/radis/pgsearch/management/commands/embed_pending.py b/radis/pgsearch/management/commands/embed_pending.py index 7dadd8b8..a9c71245 100644 --- a/radis/pgsearch/management/commands/embed_pending.py +++ b/radis/pgsearch/management/commands/embed_pending.py @@ -1,11 +1,11 @@ -"""Enqueue `embed_reports_task` for every `ReportSearchVector` whose embedding +"""Enqueue `embed_reports_task` for every `ReportSearchIndex` whose embedding is still NULL. Operators run this for three scenarios: 1. **Backfill.** Reports loaded before the deferred-embedding wiring shipped. 2. **Dim or model change.** After §4.5: drop the column, re-migrate (or - `ReportSearchVector.objects.update(embedding=None)` for a same-dim model + `ReportSearchIndex.objects.update(embedding=None)` for a same-dim model swap), then run this command to re-embed against the new model. 3. **Outage recovery.** Tasks that exhausted Procrastinate retries during an extended embedding-service outage — re-run after the service recovers. 
@@ -30,13 +30,13 @@ from django.conf import settings from django.core.management.base import BaseCommand -from radis.pgsearch.models import ReportSearchVector +from radis.pgsearch.models import ReportSearchIndex from radis.pgsearch.tasks import enqueue_embed_reports class Command(BaseCommand): help = ( - "Enqueue embed_reports_task subjobs for every ReportSearchVector " + "Enqueue embed_reports_task subjobs for every ReportSearchIndex " "with embedding=NULL. The embeddings worker drains the queue at " "its configured concurrency." ) @@ -62,7 +62,7 @@ def add_arguments(self, parser) -> None: def handle(self, *args, **opts) -> None: ids = list( - ReportSearchVector.objects.filter(embedding__isnull=True) + ReportSearchIndex.objects.filter(embedding__isnull=True) .order_by("report_id") .values_list("report_id", flat=True) ) diff --git a/radis/pgsearch/migrations/0003_rename_search_index.py b/radis/pgsearch/migrations/0003_rename_search_index.py new file mode 100644 index 00000000..bbaf6bae --- /dev/null +++ b/radis/pgsearch/migrations/0003_rename_search_index.py @@ -0,0 +1,31 @@ +"""Rename `ReportSearchVector` → `ReportSearchIndex` and the reverse +accessor `Report.search_vector` → `Report.search_index`. + +The model now holds the FTS tsvector AND the dense embedding (and likely +a trigram column in the future). 
Its name should reflect its role +("the per-report search-backing row") rather than one specific field.""" +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("pgsearch", "0002_hybrid_search"), + ("reports", "0013_alter_report_options"), + ] + + operations = [ + migrations.RenameModel( + old_name="ReportSearchVector", + new_name="ReportSearchIndex", + ), + migrations.AlterField( + model_name="reportsearchindex", + name="report", + field=models.OneToOneField( + on_delete=django.db.models.deletion.CASCADE, + related_name="search_index", + to="reports.report", + ), + ), + ] diff --git a/radis/pgsearch/models.py b/radis/pgsearch/models.py index 5cd90e8b..ab613581 100644 --- a/radis/pgsearch/models.py +++ b/radis/pgsearch/models.py @@ -9,12 +9,20 @@ from .utils.language_utils import code_to_language -class ReportSearchVector(models.Model): - report = models.OneToOneField(Report, on_delete=models.CASCADE, related_name="search_vector") +class ReportSearchIndex(models.Model): + """Per-report row that backs every search modality. Holds the FTS + `search_vector` (tsvector) and the dense `embedding` vector for + hybrid search; a future trigram column would also live here. 
Named + after its role, not after any single field — adding another search + representation shouldn't force another rename.""" + + report = models.OneToOneField(Report, on_delete=models.CASCADE, related_name="search_index") search_vector = SearchVectorField(null=True) embedding = VectorField(dimensions=settings.EMBEDDING_DIM, null=True) class Meta: + verbose_name = "Report search index" + verbose_name_plural = "Report search indexes" indexes = [ GinIndex(fields=["search_vector"]), HnswIndex( @@ -27,7 +35,7 @@ class Meta: ] def __str__(self) -> str: - return f"Report {self.report.id} search vector" + return f"Report {self.report.id} search index" def save(self, *args, **kwargs): body = self.report.body if self.report else "" diff --git a/radis/pgsearch/providers.py b/radis/pgsearch/providers.py index 271e66ce..5aa02c3e 100644 --- a/radis/pgsearch/providers.py +++ b/radis/pgsearch/providers.py @@ -18,8 +18,8 @@ is_search_token_char, ) -from .models import ReportSearchVector -from .utils.document_utils import AnnotatedReportSearchVector, document_from_pgsearch_response +from .models import ReportSearchIndex +from .utils.document_utils import AnnotatedReportSearchIndex, document_from_pgsearch_response from .utils.embedding_client import EmbeddingClient, EmbeddingClientError from .utils.fusion import rrf_fuse, summary_with_fallback from .utils.language_utils import code_to_language @@ -115,7 +115,7 @@ def search(search: Search) -> SearchResult: vec_distance: dict[int, float] = {} if query_vec is not None: vec_rows = list( - ReportSearchVector.objects.filter(filter_query) + ReportSearchIndex.objects.filter(filter_query) .distinct() .exclude(embedding__isnull=True) .annotate(distance=CosineDistance("embedding", query_vec)) @@ -128,7 +128,7 @@ def search(search: Search) -> SearchResult: # FTS side: bounded set, ts_rank only (no headline at this stage). 
fts_rows = list( - ReportSearchVector.objects.filter(filter_query) + ReportSearchIndex.objects.filter(filter_query) .distinct() .filter(search_vector=tsquery) .annotate(rank=SearchRank(F("search_vector"), tsquery)) @@ -158,7 +158,7 @@ def search(search: Search) -> SearchResult: # Headline + hydration for the page slice only. page_rows = ( - ReportSearchVector.objects.filter(report_id__in=page_ids) + ReportSearchIndex.objects.filter(report_id__in=page_ids) .annotate( summary=SearchHeadline( "report__body", @@ -186,7 +186,7 @@ def search(search: Search) -> SearchResult: ) documents.append( document_from_pgsearch_response( - cast(AnnotatedReportSearchVector, rsv), + cast(AnnotatedReportSearchIndex, rsv), cosine_distance=vec_distance.get(rid), rrf_score=rrf_score_by_id.get(rid, 0.0), ) @@ -202,7 +202,7 @@ def count(search: Search) -> int: language = _resolve_language(search.filters) query = SearchQuery(query_str, search_type="raw", config=language) filter_query = _build_filter_query(search.filters) - results = ReportSearchVector.objects.filter(filter_query).filter(search_vector=query) + results = ReportSearchIndex.objects.filter(filter_query).filter(search_vector=query) return results.count() @@ -227,7 +227,7 @@ def retrieve(search: Search) -> Iterator[str]: vec_rank: dict[int, int] = {} if query_vec is not None: vec_ids = list( - ReportSearchVector.objects.filter(filter_query) + ReportSearchIndex.objects.filter(filter_query) .distinct() .exclude(embedding__isnull=True) .annotate(distance=CosineDistance("embedding", query_vec)) @@ -237,7 +237,7 @@ def retrieve(search: Search) -> Iterator[str]: vec_rank = {rid: i + 1 for i, rid in enumerate(vec_ids)} fts_rows = list( - ReportSearchVector.objects.filter(filter_query) + ReportSearchIndex.objects.filter(filter_query) .distinct() .filter(search_vector=tsquery) .annotate(rank=SearchRank(F("search_vector"), tsquery)) @@ -258,7 +258,7 @@ def retrieve(search: Search) -> Iterator[str]: def filter(filter: SearchFilters) -> 
Iterator[str]: filter_query = _build_filter_query(filter) - results = ReportSearchVector.objects.filter(filter_query).values_list( + results = ReportSearchIndex.objects.filter(filter_query).values_list( "report__document_id", flat=True ) return results.iterator() diff --git a/radis/pgsearch/signals.py b/radis/pgsearch/signals.py index 37fc0373..dc351fab 100644 --- a/radis/pgsearch/signals.py +++ b/radis/pgsearch/signals.py @@ -3,12 +3,12 @@ from radis.reports.models import Report -from .models import ReportSearchVector +from .models import ReportSearchIndex @receiver(post_save, sender=Report) -def create_or_update_report_search_vector(sender, instance, created, **kwargs): +def create_or_update_report_search_index(sender, instance, created, **kwargs): if created: - ReportSearchVector.objects.create(report=instance) + ReportSearchIndex.objects.create(report=instance) return - instance.search_vector.save() + instance.search_index.save() diff --git a/radis/pgsearch/tasks.py b/radis/pgsearch/tasks.py index 18f0ae48..856b0ee8 100644 --- a/radis/pgsearch/tasks.py +++ b/radis/pgsearch/tasks.py @@ -5,13 +5,13 @@ from procrastinate.contrib.django import app from procrastinate.types import JSONValue -from .models import ReportSearchVector +from .models import ReportSearchIndex from .utils.embedding_client import ( EmbeddingClient, EmbeddingClientError, EmbeddingPayloadTooLargeError, ) -from .utils.indexing import bulk_upsert_report_search_vectors +from .utils.indexing import bulk_upsert_report_search_indexes logger = logging.getLogger(__name__) @@ -59,7 +59,7 @@ def bulk_index_reports(report_ids: list[int]) -> None: if not report_ids: return logger.info("Indexing %s reports in bulk.", len(report_ids)) - bulk_upsert_report_search_vectors(report_ids) + bulk_upsert_report_search_indexes(report_ids) enqueue_embed_reports(report_ids) @@ -108,9 +108,9 @@ def enqueue_embed_reports( def _embed_with_bisect( client: EmbeddingClient, - rsvs: list[ReportSearchVector], - embedded: 
list[ReportSearchVector], - skipped: list[ReportSearchVector], + rsvs: list[ReportSearchIndex], + embedded: list[ReportSearchIndex], + skipped: list[ReportSearchIndex], ) -> None: """Embed `rsvs` and append `(rsv, vec)` pairs to `embedded`. When the backend rejects the request as too large, bisect and recurse. Once the @@ -166,7 +166,7 @@ def embed_reports_task(report_ids: list[int]) -> None: stamina's budget — propagates so Procrastinate's task-level retry policy applies. - Callers must ensure ReportSearchVector rows exist before deferring this + Callers must ensure ReportSearchIndex rows exist before deferring this task. `bulk_index_reports` chains the defer at the end of its run, and `embed_pending` / the admin action filter on existing RSV rows by construction. @@ -175,27 +175,27 @@ def embed_reports_task(report_ids: list[int]) -> None: return rsvs = list( - ReportSearchVector.objects.filter(report_id__in=report_ids) + ReportSearchIndex.objects.filter(report_id__in=report_ids) .select_related("report") .only("id", "report_id", "report__body") ) if not rsvs: logger.warning( - "embed_reports_task: no ReportSearchVector rows for report ids %s", + "embed_reports_task: no ReportSearchIndex rows for report ids %s", report_ids, ) return batch_size = settings.EMBEDDING_BATCH_SIZE - embedded: list[ReportSearchVector] = [] - skipped: list[ReportSearchVector] = [] + embedded: list[ReportSearchIndex] = [] + skipped: list[ReportSearchIndex] = [] with EmbeddingClient() as client: for start in range(0, len(rsvs), batch_size): chunk = rsvs[start : start + batch_size] _embed_with_bisect(client, chunk, embedded, skipped) if embedded: - ReportSearchVector.objects.bulk_update(embedded, fields=["embedding"]) + ReportSearchIndex.objects.bulk_update(embedded, fields=["embedding"]) if skipped: logger.error( "embed_reports_task: %d report(s) skipped as too large for the embedding " diff --git a/radis/pgsearch/templates/admin/pgsearch/reportsearchvector/change_list.html 
b/radis/pgsearch/templates/admin/pgsearch/reportsearchindex/change_list.html similarity index 100% rename from radis/pgsearch/templates/admin/pgsearch/reportsearchvector/change_list.html rename to radis/pgsearch/templates/admin/pgsearch/reportsearchindex/change_list.html diff --git a/radis/pgsearch/tests/test_admin.py b/radis/pgsearch/tests/test_admin.py index 3bf096e9..0aeefd0b 100644 --- a/radis/pgsearch/tests/test_admin.py +++ b/radis/pgsearch/tests/test_admin.py @@ -1,10 +1,10 @@ -"""Tests for the ReportSearchVector admin pipeline-stats badge.""" +"""Tests for the ReportSearchIndex admin pipeline-stats badge.""" from django.db import connection import pytest -from radis.pgsearch.admin import ReportSearchVectorAdmin -from radis.pgsearch.models import ReportSearchVector +from radis.pgsearch.admin import ReportSearchIndexAdmin +from radis.pgsearch.models import ReportSearchIndex from radis.reports.factories import ReportFactory pytestmark = pytest.mark.django_db(transaction=True) @@ -45,11 +45,11 @@ def _insert_procrastinate_job(status: str, queue: str = "embeddings") -> None: def test_pipeline_stats_counts_pending_rsvs(): [ReportFactory.create() for _ in range(3)] embedded = ReportFactory.create() - rsv = ReportSearchVector.objects.get(report_id=embedded.pk) + rsv = ReportSearchIndex.objects.get(report_id=embedded.pk) rsv.embedding = [0.0] * 1024 rsv.save() - stats = ReportSearchVectorAdmin._embedding_pipeline_stats() + stats = ReportSearchIndexAdmin._embedding_pipeline_stats() assert stats["pending_reports"] == 3 @@ -61,14 +61,14 @@ def test_pipeline_stats_counts_procrastinate_jobs_by_status(): # Job on a different queue must not be counted. 
_insert_procrastinate_job("todo", queue="default") - stats = ReportSearchVectorAdmin._embedding_pipeline_stats() + stats = ReportSearchIndexAdmin._embedding_pipeline_stats() assert stats["todo"] == 2 assert stats["doing"] == 1 assert stats["failed"] == 1 def test_pipeline_stats_zero_when_no_queue_activity(): - stats = ReportSearchVectorAdmin._embedding_pipeline_stats() + stats = ReportSearchIndexAdmin._embedding_pipeline_stats() assert stats == { "pending_reports": 0, "todo": 0, diff --git a/radis/pgsearch/tests/test_embed_reports_task.py b/radis/pgsearch/tests/test_embed_reports_task.py index dd1b1589..eefe5611 100644 --- a/radis/pgsearch/tests/test_embed_reports_task.py +++ b/radis/pgsearch/tests/test_embed_reports_task.py @@ -6,7 +6,7 @@ import pytest import stamina -from radis.pgsearch.models import ReportSearchVector +from radis.pgsearch.models import ReportSearchIndex from radis.pgsearch.tasks import ( bulk_index_reports, embed_reports_task, @@ -74,7 +74,7 @@ def test_embeds_in_internal_batches(settings): assert fake.embed_documents.call_count == 3 sizes = [len(call.args[0]) for call in fake.embed_documents.call_args_list] assert sorted(sizes) == [1, 2, 2] - assert ReportSearchVector.objects.filter(embedding__isnull=True).count() == 0 + assert ReportSearchIndex.objects.filter(embedding__isnull=True).count() == 0 def test_embedding_error_propagates(): @@ -90,7 +90,7 @@ def test_embedding_error_propagates(): with pytest.raises(EmbeddingClientError): embed_reports_task(report_ids=pks) - assert ReportSearchVector.objects.filter(embedding__isnull=True).count() == 2 + assert ReportSearchIndex.objects.filter(embedding__isnull=True).count() == 2 def test_bulk_index_reports_chains_into_embed_reports_task(settings): @@ -101,14 +101,14 @@ def test_bulk_index_reports_chains_into_embed_reports_task(settings): settings.EMBEDDING_SUBJOB_SIZE = 100 reports = [ReportFactory.create() for _ in range(3)] pks = [r.pk for r in reports] - 
ReportSearchVector.objects.filter(report_id__in=pks).delete() + ReportSearchIndex.objects.filter(report_id__in=pks).delete() with patch("radis.pgsearch.tasks.embed_reports_task.defer") as defer: bulk_index_reports(report_ids=pks) # RSVs were upserted, then one embed subjob covering all 3 ids was # deferred (3 < SUBJOB_SIZE so the whole batch fits in one subjob). - assert ReportSearchVector.objects.filter(report_id__in=pks).count() == 3 + assert ReportSearchIndex.objects.filter(report_id__in=pks).count() == 3 defer.assert_called_once_with(report_ids=pks) @@ -184,7 +184,7 @@ def test_bisects_on_too_large_and_isolates_offender(settings, caplog, monkeypatc def fake_embed(texts): # Simulate the backend rejecting any payload that contains the # offending report's body. The body is fetched by report_id. - offender_body = ReportSearchVector.objects.select_related("report").get( + offender_body = ReportSearchIndex.objects.select_related("report").get( report_id=offender_pk ).report.body if offender_body in texts: @@ -212,7 +212,7 @@ def fake_embed(texts): # The three good reports got embeddings; the offender stayed NULL. rsvs_by_pk = { rsv.report_id: rsv - for rsv in ReportSearchVector.objects.filter(report_id__in=pks) + for rsv in ReportSearchIndex.objects.filter(report_id__in=pks) } assert rsvs_by_pk[offender_pk].embedding is None for pk in pks: @@ -250,7 +250,7 @@ def test_non_too_large_error_propagates_without_bisecting(): # Only one call should have been made — no bisect on non-too-large errors. assert fake.embed_documents.call_count == 1 - assert ReportSearchVector.objects.filter(embedding__isnull=True).count() == 4 + assert ReportSearchIndex.objects.filter(embedding__isnull=True).count() == 4 def test_stamina_retries_transient_then_succeeds(settings, stamina_active): @@ -280,7 +280,7 @@ def test_stamina_retries_transient_then_succeeds(settings, stamina_active): # The mock was called 3 times: two retries + one success. 
assert fake.embed_documents.call_count == 3 # All three reports got embeddings; none stayed NULL. - assert ReportSearchVector.objects.filter(embedding__isnull=True).count() == 0 + assert ReportSearchIndex.objects.filter(embedding__isnull=True).count() == 0 def test_stamina_does_not_retry_payload_too_large(settings, stamina_active): @@ -304,4 +304,4 @@ def test_stamina_does_not_retry_payload_too_large(settings, stamina_active): # Single call — no stamina retry for payload-too-large. assert fake.embed_documents.call_count == 1 - assert ReportSearchVector.objects.filter(embedding__isnull=True).count() == 1 + assert ReportSearchIndex.objects.filter(embedding__isnull=True).count() == 1 diff --git a/radis/pgsearch/tests/test_indexing.py b/radis/pgsearch/tests/test_indexing.py index 344018f5..282aceb5 100644 --- a/radis/pgsearch/tests/test_indexing.py +++ b/radis/pgsearch/tests/test_indexing.py @@ -1,7 +1,7 @@ import pytest -from radis.pgsearch.models import ReportSearchVector -from radis.pgsearch.utils.indexing import bulk_upsert_report_search_vectors +from radis.pgsearch.models import ReportSearchIndex +from radis.pgsearch.utils.indexing import bulk_upsert_report_search_indexes from radis.reports.models import Language, Report @@ -24,10 +24,10 @@ def test_bulk_index_matches_signal_vector() -> None: language=language, ) - signal_vector = ReportSearchVector.objects.get(report=report).search_vector - ReportSearchVector.objects.filter(report=report).delete() + signal_vector = ReportSearchIndex.objects.get(report=report).search_vector + ReportSearchIndex.objects.filter(report=report).delete() - bulk_upsert_report_search_vectors([report.pk]) - bulk_vector = ReportSearchVector.objects.get(report=report).search_vector + bulk_upsert_report_search_indexes([report.pk]) + bulk_vector = ReportSearchIndex.objects.get(report=report).search_vector assert signal_vector == bulk_vector diff --git a/radis/pgsearch/tests/test_provider_hybrid.py b/radis/pgsearch/tests/test_provider_hybrid.py 
index 6e6758ef..2966272b 100644 --- a/radis/pgsearch/tests/test_provider_hybrid.py +++ b/radis/pgsearch/tests/test_provider_hybrid.py @@ -3,7 +3,7 @@ import pytest from django.contrib.auth.models import Group -from radis.pgsearch.models import ReportSearchVector +from radis.pgsearch.models import ReportSearchIndex from radis.pgsearch.providers import retrieve, search from radis.pgsearch.utils.embedding_client import EmbeddingClientError from radis.reports.factories import ReportFactory @@ -50,9 +50,9 @@ def reports_with_embeddings(group, settings): body="No pneumothorax detected. Previous pneumothorax resolved. Lungs clear." ) r2.groups.add(group) - ReportSearchVector.objects.filter(report=r0).update(embedding=_unit_vec(99, dim)) - ReportSearchVector.objects.filter(report=r1).update(embedding=_unit_vec(1, dim)) - ReportSearchVector.objects.filter(report=r2).update(embedding=_unit_vec(0, dim)) + ReportSearchIndex.objects.filter(report=r0).update(embedding=_unit_vec(99, dim)) + ReportSearchIndex.objects.filter(report=r1).update(embedding=_unit_vec(1, dim)) + ReportSearchIndex.objects.filter(report=r2).update(embedding=_unit_vec(0, dim)) return r0, r1, r2 @@ -136,7 +136,7 @@ def test_empty_summary_falls_back_to_body_head(group, settings): body="lung parenchyma demonstrates clear bilaterally with no abnormality", ) r.groups.add(group) - ReportSearchVector.objects.filter(report=r).update(embedding=_unit_vec(0, dim)) + ReportSearchIndex.objects.filter(report=r).update(embedding=_unit_vec(0, dim)) with patch("radis.pgsearch.providers.EmbeddingClient") as MockClient: MockClient.return_value.__enter__.return_value = MockClient.return_value @@ -205,7 +205,7 @@ def test_m2m_filter_does_not_duplicate_results(group, settings): dim = settings.EMBEDDING_DIM r = ReportFactory.create(body="pneumothorax findings", modalities=["CT", "MR", "DX"]) r.groups.add(group) - ReportSearchVector.objects.filter(report=r).update(embedding=_unit_vec(0, dim)) + 
ReportSearchIndex.objects.filter(report=r).update(embedding=_unit_vec(0, dim)) node, _ = QueryParser().parse("pneumothorax") assert node is not None diff --git a/radis/pgsearch/utils/document_utils.py b/radis/pgsearch/utils/document_utils.py index bed79439..31a8623f 100644 --- a/radis/pgsearch/utils/document_utils.py +++ b/radis/pgsearch/utils/document_utils.py @@ -1,8 +1,8 @@ -from radis.pgsearch.models import ReportSearchVector +from radis.pgsearch.models import ReportSearchIndex from radis.search.site import ReportDocument -class AnnotatedReportSearchVector(ReportSearchVector): +class AnnotatedReportSearchIndex(ReportSearchIndex): rank: float summary: str @@ -11,7 +11,7 @@ class Meta: def document_from_pgsearch_response( - record: AnnotatedReportSearchVector, + record: AnnotatedReportSearchIndex, cosine_distance: float | None = None, rrf_score: float = 0.0, ) -> ReportDocument: diff --git a/radis/pgsearch/utils/indexing.py b/radis/pgsearch/utils/indexing.py index 882ba4b3..0d90b55f 100644 --- a/radis/pgsearch/utils/indexing.py +++ b/radis/pgsearch/utils/indexing.py @@ -8,7 +8,7 @@ from radis.reports.models import Report -from ..models import ReportSearchVector +from ..models import ReportSearchIndex from .language_utils import code_to_language logger = logging.getLogger(__name__) @@ -19,7 +19,7 @@ def _chunked(items: list[int], size: int) -> Iterable[list[int]]: yield items[index : index + size] -def bulk_upsert_report_search_vectors( +def bulk_upsert_report_search_indexes( report_ids: Iterable[int], chunk_size: int | None = None, ) -> None: @@ -56,8 +56,8 @@ def bulk_upsert_report_search_vectors( ) for config, config_ids in config_to_ids.items(): - ReportSearchVector.objects.bulk_create( - [ReportSearchVector(report_id=report_id) for report_id in config_ids], + ReportSearchIndex.objects.bulk_create( + [ReportSearchIndex(report_id=report_id) for report_id in config_ids], ignore_conflicts=True, batch_size=settings.PGSEARCH_BULK_INSERT_BATCH_SIZE, ) @@ -65,7 +65,7 
@@ def bulk_upsert_report_search_vectors( with connection.cursor() as cursor: cursor.execute( """ - UPDATE pgsearch_reportsearchvector v + UPDATE pgsearch_reportsearchindex v SET search_vector = to_tsvector(%s::regconfig, r.body) FROM reports_report r WHERE v.report_id = r.id AND r.id = ANY(%s) From 49df71cc84c12a7fe1107a2f8e2bd554d8e1e5aa Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Tue, 23 Jun 2026 13:41:00 +0000 Subject: [PATCH 67/68] feat(pgsearch): admin clear-embeddings action; block delete; inline queue name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a clear_embeddings_for_remodel admin action that NULLs the embedding column on the selected rows via queryset.update, leaving search_vector untouched and explicitly bypassing post_save signals — operators then run embed_pending to backfill against the swapped model. Suitable only for same-dim model swaps; full dim changes still require the §4.5 stop-the-world DDL procedure. Blocks delete on the ReportSearchIndex admin since rows are managed by the post_save signal on Report — deleting one orphans the report from search until the next save. Drops the unused EMBEDDINGS_QUEUE module constant and the equivalent template variable. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- radis/pgsearch/admin.py | 38 ++++++++++++++++-- .../reportsearchindex/change_list.html | 2 +- radis/pgsearch/tests/test_admin.py | 40 +++++++++++++++++++ 3 files changed, 75 insertions(+), 5 deletions(-) diff --git a/radis/pgsearch/admin.py b/radis/pgsearch/admin.py index 6257e086..bf3991eb 100644 --- a/radis/pgsearch/admin.py +++ b/radis/pgsearch/admin.py @@ -7,17 +7,21 @@ from .models import ReportSearchIndex from .tasks import enqueue_embed_reports -EMBEDDINGS_QUEUE = "embeddings" - @admin.register(ReportSearchIndex) class ReportSearchIndexAdmin(admin.ModelAdmin): list_display = ("id", "report_id", "has_embedding") list_filter = (("embedding", admin.EmptyFieldListFilter),) search_fields = ("report__document_id",) - actions = ("enqueue_pending_embeddings",) + actions = ("enqueue_pending_embeddings", "clear_embeddings_for_remodel") change_list_template = "admin/pgsearch/reportsearchindex/change_list.html" + def has_delete_permission(self, request, obj=None): + # RSI rows are managed by the post_save signal on Report — deleting + # one orphans the report from search until someone saves the report + # again. Block delete (this also hides the "delete selected" action). 
+ return False + @admin.display(boolean=True, description="Embedded") def has_embedding(self, obj: ReportSearchIndex) -> bool: return obj.embedding is not None @@ -34,7 +38,7 @@ def _embedding_pipeline_stats() -> dict[str, int]: doing about it right now.""" pending = ReportSearchIndex.objects.filter(embedding__isnull=True).count() queue_counts = dict( - ProcrastinateJob.objects.filter(queue_name=EMBEDDINGS_QUEUE) + ProcrastinateJob.objects.filter(queue_name="embeddings") .values_list("status") .annotate(n=Count("id")) ) @@ -70,3 +74,29 @@ def enqueue_pending_embeddings( f"{subjob_count} subjob(s) for embedding.", level=messages.SUCCESS, ) + + @admin.action(description="Clear embeddings (NULL them) — for same-dim model swap") + def clear_embeddings_for_remodel( + self, request: HttpRequest, queryset: QuerySet[ReportSearchIndex] + ) -> None: + # Same-dim model swap procedure: NULL the existing embeddings so + # the new model writes fresh ones via `embed_pending`. Uses + # queryset.update so post_save signals don't fire (we don't want + # auto-re-embedding here — that'd hit the embedding service + # immediately, possibly with the OLD model still configured). + # The operator drives the backfill explicitly afterward. + cleared = queryset.filter(embedding__isnull=False).update(embedding=None) + if not cleared: + self.message_user( + request, + "No selected rows had an embedding to clear.", + level=messages.WARNING, + ) + return + self.message_user( + request, + f"Cleared embeddings on {cleared} row(s). 
Run " + f"`./manage.py embed_pending` (or the 'Enqueue embedding' " + f"action) to backfill against the new model.", + level=messages.SUCCESS, + ) diff --git a/radis/pgsearch/templates/admin/pgsearch/reportsearchindex/change_list.html b/radis/pgsearch/templates/admin/pgsearch/reportsearchindex/change_list.html index d0504d83..58b5790f 100644 --- a/radis/pgsearch/templates/admin/pgsearch/reportsearchindex/change_list.html +++ b/radis/pgsearch/templates/admin/pgsearch/reportsearchindex/change_list.html @@ -14,7 +14,7 @@ 0 {% endif %} failed - ({{ EMBEDDINGS_QUEUE|default:"embeddings" }} queue) + (embeddings queue) {% endif %} {{ block.super }} diff --git a/radis/pgsearch/tests/test_admin.py b/radis/pgsearch/tests/test_admin.py index 0aeefd0b..eb2a701b 100644 --- a/radis/pgsearch/tests/test_admin.py +++ b/radis/pgsearch/tests/test_admin.py @@ -1,4 +1,7 @@ """Tests for the ReportSearchIndex admin pipeline-stats badge.""" +from unittest.mock import MagicMock + +from django.contrib.admin.sites import AdminSite from django.db import connection import pytest @@ -75,3 +78,40 @@ def test_pipeline_stats_zero_when_no_queue_activity(): "doing": 0, "failed": 0, } + + +def test_delete_permission_denied(): + """RSI rows are managed by the post_save signal on Report — admin must + not let operators delete them out from under the model.""" + admin_instance = ReportSearchIndexAdmin(ReportSearchIndex, AdminSite()) + assert admin_instance.has_delete_permission(MagicMock()) is False + + +def test_clear_embeddings_for_remodel_nulls_only_selected_rows_with_embeddings(): + """Same-dim model swap: NULL the existing embeddings on selected rows. 
+ Rows already NULL are no-ops; rows outside the selection are untouched.""" + targets = [ReportFactory.create() for _ in range(3)] + untouched = ReportFactory.create() + for r in targets + [untouched]: + rsi = ReportSearchIndex.objects.get(report_id=r.pk) + rsi.embedding = [0.1] * 1024 + rsi.save() + # One target already NULL — should be skipped by the filter. + ReportSearchIndex.objects.filter(report_id=targets[0].pk).update(embedding=None) + + selected = ReportSearchIndex.objects.filter( + report_id__in=[r.pk for r in targets] + ) + admin_instance = ReportSearchIndexAdmin(ReportSearchIndex, AdminSite()) + admin_instance.message_user = MagicMock() + admin_instance.clear_embeddings_for_remodel(MagicMock(), selected) + + # Two of three targets had embeddings and got cleared. + assert ReportSearchIndex.objects.filter( + report_id__in=[r.pk for r in targets], embedding__isnull=True + ).count() == 3 + # The non-selected row is untouched. + assert ReportSearchIndex.objects.get(report_id=untouched.pk).embedding is not None + # message_user reports the number cleared, not the number selected. + msg_args = admin_instance.message_user.call_args + assert "Cleared embeddings on 2 row(s)" in msg_args.args[1] From 1c6d69043f5be799cc56e3d776c84cc2b40e8974 Mon Sep 17 00:00:00 2001 From: Samuel Kwong Date: Thu, 25 Jun 2026 09:42:39 +0000 Subject: [PATCH 68/68] refactor(pgsearch, search): rename indexing handler; squash migration; bag-of-terms embed text; strip field filters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename pgsearch's reports-handler subscriber from _handle_reports_changed to _index_reports — the function's job is "index these reports for search", which the new name says directly. 
- Squash 0002_hybrid_search + 0003_rename_search_index into a single 0002_hybrid_search migration whose ops are: CREATE EXTENSION vector → RenameModel(ReportSearchVector → ReportSearchIndex) → AlterField (related_name="search_index") → AddField(embedding) → AddIndex(HNSW). Fresh installs now see one coherent migration rather than three intermediate states no operator will ever observe in isolation. - Rewrite unparse_for_embedding to emit a clean bag of terms: drop AND/OR operator tokens, grouping parens, and phrase quotes. The dense embedding model isn't a query parser — it shouldn't see syntax cluttering the input. NOT-stripping behavior unchanged. - New _strip_field_filters cleanup step removes field:value tokens at parse time before the colon would be silently absorbed into a corrupted single word (bodypneumonia). Field filtering belongs in SearchFilters at the provider level; the query string is body-text only. Preserves colons inside phrases. Co-Authored-By: Claude Opus 4.7 (1M context) --- radis/pgsearch/apps.py | 10 ++-- .../pgsearch/migrations/0002_hybrid_search.py | 37 +++++++++++-- .../migrations/0003_rename_search_index.py | 31 ----------- radis/search/tests/test_query_parser.py | 17 ++++++ ...test_query_parser_unparse_for_embedding.py | 27 +++++----- radis/search/utils/query_parser.py | 54 +++++++++++++++---- 6 files changed, 113 insertions(+), 63 deletions(-) delete mode 100644 radis/pgsearch/migrations/0003_rename_search_index.py diff --git a/radis/pgsearch/apps.py b/radis/pgsearch/apps.py index c004f121..ad5ebd71 100644 --- a/radis/pgsearch/apps.py +++ b/radis/pgsearch/apps.py @@ -71,7 +71,7 @@ def check_embedding_dim_matches_migration(app_configs, **kwargs): return [] -def _handle_reports_changed(reports): +def _index_reports(reports): """pgsearch's subscriber on reports_created_handlers / reports_updated_handlers. Owns both FTS indexing and embedding for the touched reports. 
The mode @@ -79,12 +79,12 @@ def _handle_reports_changed(reports): request thread or is deferred to a Procrastinate task on the `default` queue. Embedding is always deferred to the `embeddings` queue. - Ordering between FTS and embedding is the same in both modes: RSV rows + Ordering between FTS and embedding is the same in both modes: RSI rows exist (and `report.body` is reachable) before `embed_reports_task` runs. In sync mode the handler upserts inline, then defers embed. In async mode the handler only enqueues `bulk_index_reports`; that task chains `embed_reports_task` at the end of its own run, so the embeddings worker - never picks up a report before its RSV row is committed. + never picks up a report before its RSI row is committed. """ if not reports: return @@ -124,10 +124,10 @@ def register_app(): from .providers import count, filter, retrieve, search register_reports_created_handler( - ReportsCreatedHandler(name="PG Search", handle=_handle_reports_changed) + ReportsCreatedHandler(name="PG Search", handle=_index_reports) ) register_reports_updated_handler( - ReportsUpdatedHandler(name="PG Search", handle=_handle_reports_changed) + ReportsUpdatedHandler(name="PG Search", handle=_index_reports) ) register_search_provider( diff --git a/radis/pgsearch/migrations/0002_hybrid_search.py b/radis/pgsearch/migrations/0002_hybrid_search.py index 0a891d2e..98e7174b 100644 --- a/radis/pgsearch/migrations/0002_hybrid_search.py +++ b/radis/pgsearch/migrations/0002_hybrid_search.py @@ -1,10 +1,26 @@ +"""Hybrid-search schema additions on top of pgsearch.0001_initial: + +- Rename the per-report search row from `ReportSearchVector` to + `ReportSearchIndex` (now holds the FTS tsvector *and* the dense + embedding; future trigram column would also live there). +- Update the reverse accessor on Report (`search_vector` → `search_index`). +- Install the pgvector extension. +- Add the `embedding vector(1024)` column and its HNSW index for cosine + similarity search. 
+ +Squashed from the previously-separate `0002_hybrid_search` (extension + +embedding field + HNSW) and `0003_rename_search_index` (RenameModel + +AlterField) so that hybrid search ships as a single coherent migration +rather than three intermediate states no operator will ever see in +isolation. +""" +import django.db.models.deletion import pgvector.django.indexes import pgvector.django.vector -from django.db import migrations +from django.db import migrations, models class Migration(migrations.Migration): - dependencies = [ ("pgsearch", "0001_initial"), ("reports", "0013_alter_report_options"), @@ -15,13 +31,26 @@ class Migration(migrations.Migration): sql="CREATE EXTENSION IF NOT EXISTS vector;", reverse_sql=migrations.RunSQL.noop, ), + migrations.RenameModel( + old_name="ReportSearchVector", + new_name="ReportSearchIndex", + ), + migrations.AlterField( + model_name="reportsearchindex", + name="report", + field=models.OneToOneField( + on_delete=django.db.models.deletion.CASCADE, + related_name="search_index", + to="reports.report", + ), + ), migrations.AddField( - model_name="reportsearchvector", + model_name="reportsearchindex", name="embedding", field=pgvector.django.vector.VectorField(dimensions=1024, null=True), ), migrations.AddIndex( - model_name="reportsearchvector", + model_name="reportsearchindex", index=pgvector.django.indexes.HnswIndex( ef_construction=64, fields=["embedding"], diff --git a/radis/pgsearch/migrations/0003_rename_search_index.py b/radis/pgsearch/migrations/0003_rename_search_index.py deleted file mode 100644 index bbaf6bae..00000000 --- a/radis/pgsearch/migrations/0003_rename_search_index.py +++ /dev/null @@ -1,31 +0,0 @@ -"""Rename `ReportSearchVector` → `ReportSearchIndex` and the reverse -accessor `Report.search_vector` → `Report.search_index`. - -The model now holds the FTS tsvector AND the dense embedding (and likely -a trigram column in the future). 
Its name should reflect its role -("the per-report search-backing row") rather than one specific field.""" -import django.db.models.deletion -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("pgsearch", "0002_hybrid_search"), - ("reports", "0013_alter_report_options"), - ] - - operations = [ - migrations.RenameModel( - old_name="ReportSearchVector", - new_name="ReportSearchIndex", - ), - migrations.AlterField( - model_name="reportsearchindex", - name="report", - field=models.OneToOneField( - on_delete=django.db.models.deletion.CASCADE, - related_name="search_index", - to="reports.report", - ), - ), - ] diff --git a/radis/search/tests/test_query_parser.py b/radis/search/tests/test_query_parser.py index 28d05dd2..3ee06138 100644 --- a/radis/search/tests/test_query_parser.py +++ b/radis/search/tests/test_query_parser.py @@ -136,6 +136,23 @@ def test_fixed_queries(): assert is_fixed_query("foo \\) bar", "foo bar", 2) +def test_strips_field_filter_syntax(): + # Field-filter syntax (`field:value`) has no place in the query grammar — + # structured field filtering lives in `SearchFilters`. Stripping the + # whole token keeps the colon from being silently dropped into a + # corrupted single word (`bodypneumonia`). + assert is_fixed_query("pneumonia body:pneumonia", "pneumonia", 1) + # Two stripped filters still count as one "step ran", matching how + # `_replace_invalid_characters` reports its own pass. + assert is_empty_query("body:pneumonia patient_sex:F", 1) + assert is_empty_query("body:pneumonia", 1) + # Colons inside phrases are preserved verbatim (operator syntax doesn't + # apply inside quoted strings). + assert is_valid_query('"body:pneumonia"') + # Time-like tokens with embedded colons are also stripped. 
+ assert is_fixed_query("time:14:30 finding", "finding", 1) + + def test_empty_queries(): assert is_empty_query("", 0) assert is_empty_query(" ", 0) diff --git a/radis/search/tests/test_query_parser_unparse_for_embedding.py b/radis/search/tests/test_query_parser_unparse_for_embedding.py index c138976a..833db95c 100644 --- a/radis/search/tests/test_query_parser_unparse_for_embedding.py +++ b/radis/search/tests/test_query_parser_unparse_for_embedding.py @@ -8,15 +8,16 @@ [ # Simple positive term — unchanged. ("pneumothorax", "pneumothorax"), - # Phrase preserved with quotes. - ('"chest x-ray"', '"chest x-ray"'), - # Implicit AND (no operator) — both sides survive. + # Phrase: quotes dropped, value preserved (embedding tokenizers handle + # multi-word spans natively; the quote chars are noise). + ('"chest x-ray"', "chest x-ray"), + # Implicit AND (no operator) — both sides survive, joined by a space. ("cardiac arrest", "cardiac arrest"), - # Explicit AND — both sides survive, operator preserved. - ("A AND B", "A AND B"), - # OR — both sides survive, operator preserved. - ("A OR B", "A OR B"), - # NOT alone — empty. + # Explicit AND — operator token dropped; bag of terms. + ("A AND B", "A B"), + # Explicit OR — operator token dropped; bag of terms. + ("A OR B", "A B"), + # NOT alone — empty (polarity-blind for negation). ("NOT pneumothorax", ""), # AND NOT — left survives, NOT branch dropped, AND collapses. ("A AND NOT B", "A"), @@ -24,12 +25,14 @@ ("NOT A AND B", "B"), # NOT OR NOT — both branches dropped, empty. ("NOT A OR NOT B", ""), - # Mixed: AND OR with a NOT branch — surviving structure retained. - ("(A AND NOT B) OR C", "(A) OR C"), + # Mixed AND OR with a NOT branch — grouping parens dropped, + # operators dropped, surviving terms joined. + ("(A AND NOT B) OR C", "A C"), # Nested NOT inside parens — empty parens collapsed. ("A AND (NOT B)", "A"), - # Double-nested OR with one NOT — only NOT branch dropped. 
- ("(A OR B) AND NOT C", "(A OR B)"), + # Double-nested OR with one NOT — parens + operators dropped, + # surviving disjunction terms joined. + ("(A OR B) AND NOT C", "A B"), ], ) def test_unparse_for_embedding(query, expected): diff --git a/radis/search/utils/query_parser.py b/radis/search/utils/query_parser.py index 51e1125a..f26ac92d 100644 --- a/radis/search/utils/query_parser.py +++ b/radis/search/utils/query_parser.py @@ -141,6 +141,25 @@ def _modify_unquoted_segments( return "".join(results) + def _strip_field_filters(self, input_string: str) -> str: + """Drop `field:value` tokens (e.g., ``body:pneumonia``, + ``patient_sex:F``, ``time:14:30``). + + The parser grammar has no field-filter syntax — structured field + filtering lives on the provider side via ``SearchFilters``. Without + this step the colon would be silently stripped by + ``_replace_invalid_characters`` and ``body:pneumonia`` would collapse + to ``bodypneumonia``, a meaningless token that pollutes both the FTS + tsquery and the dense-embedding text. Drop the whole token instead. + + Operates only on unquoted segments so ``"body:pneumonia"`` inside a + phrase is preserved verbatim. 
+ """ + pattern = re.compile(r"\b\w+:\S+") + return self._modify_unquoted_segments( + input_string, lambda s: pattern.sub("", s) + ) + def _replace_invalid_characters(self, input_string: str) -> str: def handle_segment(segment: str) -> str: return "".join(char for char in segment if is_search_query_char(char)) @@ -244,6 +263,11 @@ def parse(self, query: str) -> tuple[QueryNode | None, list[str]]: if query_before != query_after: fixes.append("Fixed unbalanced parentheses") + query_before = query_after + query_after = self._strip_field_filters(query_before) + if query_before != query_after: + fixes.append("Stripped field-filter syntax (use the filter widgets instead)") + query_before = query_after query_after = self._replace_invalid_characters(query_before) if query_before != query_after: @@ -315,18 +339,27 @@ def unparse(node: QueryNode) -> str: @staticmethod def unparse_for_embedding(node: QueryNode) -> str: - """Like ``unparse``, but drops the operand of every ``UnaryNode("NOT", X)`` - and collapses any ``BinaryNode`` whose children both become empty. - Returns the empty string if the whole query reduces to NOT clauses. + """Render the query as a plain bag of terms suitable for a dense + embedding model. + + - Drops every ``UnaryNode("NOT", X)`` (embeddings are polarity-blind + for negation; see spec §7.8). + - Drops boolean operator tokens (``AND``/``OR``): they're query syntax, + not content. The embedding model would otherwise see them as + stopword-ish tokens cluttering the input. + - Drops grouping parentheses for the same reason. + - Drops quotes around phrases — embedding tokenizers handle multi-word + spans natively; the literal quote chars only add noise. - Used by the hybrid-search vector half to avoid polarity-blind embedding - of negated terms (see spec 2026-05-28-hybrid-search §7.8). + Returns the empty string if the whole query reduces to NOT clauses. + Used by the hybrid-search vector half via ``providers.search``. 
""" if isinstance(node, TermNode): - return QueryParser.unparse(node) + # Emit the raw value for both WORD and PHRASE — no surrounding + # quotes, since the embedding model doesn't care about them. + return node.value if isinstance(node, ParensNode): - inner = QueryParser.unparse_for_embedding(node.expression) - return f"({inner})" if inner else "" + return QueryParser.unparse_for_embedding(node.expression) if isinstance(node, UnaryNode): return "" if isinstance(node, BinaryNode): @@ -338,7 +371,6 @@ def unparse_for_embedding(node: QueryNode) -> str: return right if not right: return left - if node.implicit: - return f"{left} {right}" - return f"{left} {node.operator} {right}" + # Always join with a single space — operator tokens are dropped. + return f"{left} {right}" raise ValueError(f"Unknown node type: {type(node)}")