sabinbobu · sabinbobu · May 12, 2026 · May 12, 2026 · May 12, 2026 · May 12, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -22,7 +22,7 @@ jobs:
         run: uv python install 3.12
 
       - name: Install dependencies
-        run: uv sync
+        run: uv sync --group dev
 
       - name: Lint with ruff
         run: uv run ruff check .

diff --git a/README.md b/README.md
@@ -0,0 +1,105 @@
+# RAGLab
+
+RAG evaluation platform that compares LLMs, retrieval strategies, and prompt versions side-by-side — with faithfulness, cost, and latency scorecards.
+
+**RAGLab** answers one question: which combination of LLM, retriever, and prompt actually performs best?
+
+Here's what the data showed on a technical AI corpus:
+
+- gpt-4o-mini is **10x cheaper** than claude-haiku per query
+- claude-haiku is **2x faster**, but faithfulness drops from 0.97 to 0.85 when switching to a conversational prompt style
+- gpt-4o-mini **stays consistent** across both prompt versions
+
+The platform runs a full experiment matrix — models × prompts × questions, against a configurable retriever — scores each run with Ragas faithfulness, and returns a cost/latency scorecard you can actually defend to stakeholders.
+
+**Built with:** FastAPI · ChromaDB · OpenAI · Anthropic · Ragas · Docker
+
+No LangChain. Every abstraction built from scratch.
+
+#genai #rag #llm #python #mlops
+
+
+## Architecture
+
+```
+React UI
+      │ REST
+FastAPI backend
+  ├── LLM Gateway        OpenAI + Anthropic (LLMProvider Protocol)
+  ├── Ingestion Pipeline PDF → chunks → embeddings → ChromaDB
+  ├── Retrieval          Dense (ChromaDB) · BM25 · Hybrid RRF
+  ├── Prompt Registry    Versioned YAML templates
+  ├── Experiment Runner  Matrix: models × prompts × questions
+  └── Eval Harness       Ragas faithfulness · cost · latency
+```
+
+## Eval results
+
+Corpus: AI orchestration technical document ·
+3 questions · 2 prompt versions · 2 providers
+
+| Model | Prompt | Faithfulness | Avg cost | Avg latency |
+|---|---|---|---|---|
+| gpt-4o-mini | v1 | 0.974 | $0.000161 | 3.32s |
+| gpt-4o-mini | v2 | 0.978 | $0.000197 | 4.22s |
+| claude-haiku-4-5 | v1 | 0.967 | $0.001569 | 1.80s |
+| claude-haiku-4-5 | v2 | 0.852 | $0.001941 | 2.71s |
+
+**Key findings:**
+- gpt-4o-mini is 10x cheaper than claude-haiku for this corpus
+- claude-haiku is 2x faster but faithfulness drops from 0.97 to 0.85 on conversational prompts
+- Hybrid RRF retrieval produces the most complete answers
+
+## Quickstart
+
+```bash
+git clone https://github.com/sabinbobu/RAGLab
+cd RAGLab
+uv sync  # creates .venv and installs the project plus its dependencies
+cp .env.example .env  # add your API keys
+
+# ingest a corpus
+uv run raglab ingest data/sample/
+
+# start the API
+uv run uvicorn raglab.main:app --reload
+
+# or with Docker
+docker compose up
+```
+
+## Run an experiment
+
+```bash
+curl -X POST http://localhost:8000/experiments/run \
+  -H "Content-Type: application/json" \
+  -d '{
+    "models": ["gpt-4o-mini", "claude-haiku-4-5-20251001"],
+    "prompt_versions": ["v1", "v2"],
+    "questions": ["What is AI orchestration?"],
+    "provider": "openai"
+  }'
+```
+
+## Evaluate an experiment
+
+```bash
+curl -X POST "http://localhost:8000/experiments/evaluate?experiment_id=<id>"
+```
+
+## API endpoints
+
+| Method | Endpoint | Description |
+|---|---|---|
+| POST | `/generate` | Single LLM completion |
+| POST | `/query` | RAG query with citations |
+| POST | `/experiments/run` | Run model × prompt × question matrix |
+| POST | `/experiments/evaluate` | Score runs with Ragas |
+
+## Tech stack
+
+- **Backend:** FastAPI · Pydantic v2 · SQLModel · ChromaDB
+- **LLMs:** OpenAI + Anthropic via native SDKs — no LangChain
+- **Retrieval:** Dense vectors · BM25 · Hybrid RRF
+- **Eval:** Ragas faithfulness metric
+- **Infra:** Docker · GitHub Actions CI
+- **Dev:** uv · ruff · mypy · pytest · pre-commit
diff --git a/pyproject.toml b/pyproject.toml
@@ -36,6 +36,9 @@ where = ["src"]
 line-length = 88
 src = ["src"]
 
+[tool.logfire]
+ignore_no_config = true
+
 [tool.ruff.lint]
 select = ["E", "F", "I"]
 
@@ -58,4 +61,7 @@ dev = [
     "pre-commit>=4.6.0",
     "pytest>=9.0.3",
     "pytest-mock>=3.15.1",
+    "ruff>=0.9.0",
+    "mypy>=1.14.0",
+    "types-pyyaml>=6.0.0",
 ]
diff --git a/src/raglab/cli.py b/src/raglab/cli.py
@@ -72,8 +72,8 @@ def ingest(
             collection.add(
                 ids=ids,
                 documents=chunks,
-                embeddings=vectors,
-                metadatas=metadatas,
+                embeddings=vectors,  # type: ignore[arg-type]
+                metadatas=metadatas,  # type: ignore[arg-type]
             )
 
             total_chunks += len(chunks)

diff --git a/src/raglab/config.py b/src/raglab/config.py
@@ -24,4 +24,4 @@ class Settings(BaseSettings):
 
 
 # Singleton — import this everywhere, don't create new instances
-settings = Settings()
+settings = Settings()  # type: ignore[call-arg]
diff --git a/src/raglab/experiments/eval.py b/src/raglab/experiments/eval.py
@@ -1,10 +1,11 @@
 import json
 import os
 from itertools import groupby
+from typing import Union
 
 from pydantic import BaseModel
 from ragas import EvaluationDataset, SingleTurnSample, evaluate
-from ragas.metrics import _Faithfulness
+from ragas.metrics import Faithfulness
 from sqlmodel import Session, select
 
 from raglab.config import settings
@@ -41,7 +42,7 @@ def evaluate_experiment(experiment_id: str) -> list[Scorecard]:
         raise ValueError(f"No runs found for experiment_id: {experiment_id}")
 
     # group runs by (model, prompt_version) - one scorecard per combination
-    def group_key(r: RunResult) -> tuple:
+    def group_key(r: RunResult) -> tuple[str, str]:
         return (r.model, r.prompt_version)
 
     sorted_runs = sorted(runs, key=group_key)
@@ -51,7 +52,7 @@ def group_key(r: RunResult) -> tuple:
         group_runs = list(group)
 
         # build Ragas dataset from this group
-        samples = []
+        samples: list[Union[SingleTurnSample, object]] = []
         for run in group_runs:
             samples.append(
                 SingleTurnSample(
@@ -62,15 +63,16 @@ def group_key(r: RunResult) -> tuple:
                 )
             )
 
-        dataset = EvaluationDataset(samples=samples)
+        dataset = EvaluationDataset(samples=samples)  # type: ignore[arg-type]
 
         # run Ragas evaluation — this makes LLM calls internally
         results = evaluate(
             dataset=dataset,
-            metrics=[_Faithfulness()],
+            metrics=[Faithfulness()],
         )
 
         # results.to_pandas() gives a DataFrame — extract mean per metric
+        assert hasattr(results, "to_pandas"), "evaluate() returned unexpected type"
         df = results.to_pandas()
 
         scorecards.append(

diff --git a/src/raglab/experiments/runner.py b/src/raglab/experiments/runner.py
@@ -3,6 +3,7 @@
 from datetime import datetime, timezone
 from uuid import uuid4
 
+import chromadb
 from pydantic import BaseModel
 from sqlmodel import Session, SQLModel, create_engine
 
@@ -46,7 +47,9 @@ def run_experiment(config: ExperimentConfig) -> list[RunResult]:
     # so you can query "show me all runs from experiment X"
     experiment_id = str(uuid4())
 
-    retriever = get_retriever(config.retriever, collection="raglab")
+    _client = chromadb.PersistentClient(path=".chroma")
+    _collection = _client.get_collection(name="raglab")
+    retriever = get_retriever(config.retriever, _collection)
     results = []
 
     # itertools.product expands the matrix

diff --git a/src/raglab/gateway/anthropic.py b/src/raglab/gateway/anthropic.py
@@ -1,7 +1,9 @@
 import time
+from typing import Any
 
 import logfire
 from anthropic import Anthropic
+from anthropic.types import TextBlock
 
 from raglab.config import MODEL_PRICING
 from raglab.gateway.base import LLMResponse
@@ -11,7 +13,7 @@ class AnthropicProvider:
     def __init__(self, api_key: str) -> None:
         self.client = Anthropic(api_key=api_key)
 
-    def generate(self, messages: list[dict], model: str) -> LLMResponse:
+    def generate(self, messages: list[dict[str, Any]], model: str) -> LLMResponse:
         start = time.perf_counter()
 
         # Anthropic requires system prompt as a separate parameter
@@ -30,7 +32,7 @@ def generate(self, messages: list[dict], model: str) -> LLMResponse:
                 model=model,
                 max_tokens=1024,
                 system=system,
-                messages=user_messages,
+                messages=user_messages,  # type: ignore[arg-type]
             )
 
         latency_ms = (time.perf_counter() - start) * 1000
@@ -43,8 +45,14 @@ def generate(self, messages: list[dict], model: str) -> LLMResponse:
             input_tokens * pricing["input"] + output_tokens * pricing["output"]
         ) / 1_000_000
 
+        # response.content can contain non-text blocks (tool use, thinking, etc.)
+        # find the first TextBlock to extract the answer
+        text_block = next(
+            (b for b in response.content if isinstance(b, TextBlock)), None
+        )
+
         return LLMResponse(
-            text=response.content[0].text or "",
+            text=text_block.text if text_block else "",
             input_tokens=input_tokens,
             output_tokens=output_tokens,
             cost_usd=cost_usd,

diff --git a/src/raglab/gateway/base.py b/src/raglab/gateway/base.py
@@ -1,4 +1,4 @@
-from typing import Protocol
+from typing import Any, Protocol
 
 from pydantic import BaseModel, ConfigDict
 
@@ -16,4 +16,4 @@ class LLMResponse(BaseModel):
 
 
 class LLMProvider(Protocol):
-    def generate(self, messages: list[dict], model: str) -> LLMResponse: ...
+    def generate(self, messages: list[dict[str, Any]], model: str) -> LLMResponse: ...
diff --git a/src/raglab/gateway/openai.py b/src/raglab/gateway/openai.py
@@ -1,4 +1,5 @@
 import time
+from typing import Any
 
 import logfire
 from openai import OpenAI
@@ -11,18 +12,19 @@ class OpenAIProvider:
     def __init__(self, api_key: str) -> None:
         self.client = OpenAI(api_key=api_key)
 
-    def generate(self, messages: list[dict], model: str) -> LLMResponse:
+    def generate(self, messages: list[dict[str, Any]], model: str) -> LLMResponse:
         start = time.perf_counter()
 
         with logfire.span("openai.chat.completions", model=model):
             response = self.client.chat.completions.create(
                 model=model,
                 # OpenAI accepts messages list natively — system + user roles work as-is
-                messages=messages,
+                messages=messages,  # type: ignore[arg-type]
             )
 
         latency_ms = (time.perf_counter() - start) * 1000
 
+        assert response.usage is not None
         input_tokens = response.usage.prompt_tokens
         output_tokens = response.usage.completion_tokens
 

diff --git a/src/raglab/ingestion/loaders.py b/src/raglab/ingestion/loaders.py
@@ -1,9 +1,10 @@
 from pathlib import Path
+from typing import Any
 
 import pypdf
 
 
-def load_pdf(path: str) -> list[dict]:
+def load_pdf(path: str) -> list[dict[str, Any]]:
     """
     Extract text from a PDF file page by page.
 
@@ -32,7 +33,7 @@ def load_pdf(path: str) -> list[dict]:
     return pages
 
 
-def load_markdown(path: str) -> list[dict]:
+def load_markdown(path: str) -> list[dict[str, Any]]:
     """
     Load a Markdown file as a single document.
 

diff --git a/src/raglab/main.py b/src/raglab/main.py
@@ -1,4 +1,5 @@
 import time
+from typing import Any
 
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
@@ -42,7 +43,9 @@ class QueryResponse(BaseModel):
 @app.post("/generate", response_model=LLMResponse)
 def generate(request: GenerateRequest) -> LLMResponse:
     provider = get_provider(request.provider)
-    return provider.generate(request.prompt, request.model)
+    return provider.generate(
+        [{"role": "user", "content": request.prompt}], request.model
+    )
 
 
 @app.post("/query", response_model=QueryResponse)
@@ -92,7 +95,7 @@ def query(request: QueryRequest) -> QueryResponse:
 
 
 @app.post("/experiments/run")
-def run_experiments(config: ExperimentConfig) -> dict:
+def run_experiments(config: ExperimentConfig) -> dict[str, Any]:
     results = run_experiment(config)
     return {
         "total_runs": len(results),
@@ -107,5 +110,5 @@ def evaluate(experiment_id: str) -> list[Scorecard]:
 
 
 @app.get("/")
-def health() -> dict:
+def health() -> dict[str, str]:
     return {"status": "ok", "service": "raglab"}
diff --git a/src/raglab/prompts/__init__.py b/src/raglab/prompts/__init__.py
@@ -1,9 +1,10 @@
 from pathlib import Path
+from typing import Any
 
 import yaml
 
 
-def load_prompt(version: str) -> dict:
+def load_prompt(version: str) -> dict[str, Any]:
     """
     Load and validate a prompt template by version string.
 
@@ -22,14 +23,14 @@ def load_prompt(version: str) -> dict:
         )
 
     with open(prompt_file, encoding="utf-8") as f:
-        prompt = yaml.safe_load(f)
+        prompt: dict[str, Any] = yaml.safe_load(f)
 
     _validate_prompt(prompt, version)
 
     return prompt
 
 
-def _validate_prompt(prompt: dict, version: str) -> None:
+def _validate_prompt(prompt: dict[str, Any], version: str) -> None:
     """
     Validate that a prompt template has all required keys.
Original file line number	Diff line number	Diff line change
Expand Up		@@ -24,4 +24,4 @@ class Settings(BaseSettings):


		# Singleton — import this everywhere, don't create new instances
		settings = Settings()
		settings = Settings() # type: ignore[call-arg]