# fortemi/docker-compose.minimal.yml at main · Fortemi/fortemi · GitHub
#
# docker-compose.minimal.yml — minimal-footprint overlay for the bundle
#
# Stacks on top of docker-compose.bundle.yml to reduce idle resource use for
# operators with little headroom (running their own LLM, low-RAM hosts, etc.).
#
# What this overlay does:
#   - Disables automatic support-archive seeding on first boot (no CPU spike)
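#   - Defaults the fast-extraction model to qwen2.5:3b instead of qwen3.5:9b,
#     and defaults OLLAMA_GEN_MODEL / OLLAMA_VISION_MODEL to qwen2.5:3b too
#   - Defaults job and GPU concurrency to 1 (serial extraction, lower peak
#     RAM/VRAM)
#   - Defaults MAX_MEMORIES to 2 instead of 10 (less per-archive cache)
#   - Every default above can be overridden from .env or the shell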
#
# What it does NOT do (operator-controlled):
#   - Disable Whisper / GLiNER / pyannote sidecars — use COMPOSE_PROFILES to
#     pick the hardware tier. The default `edge` profile is already CPU-only
#     for the heavy sidecars; opting out further is a separate compose
#     concern documented in README.
#
# Usage:
#   docker compose -f docker-compose.bundle.yml -f docker-compose.minimal.yml up -d
#
# Target footprint: ~2 GB RAM idle (Postgres + Redis + small embed model).
# Caveat: chat quality with qwen2.5:3b is materially lower than qwen3.5:9b.
# This profile is for "I just want it to run" usability, not production.

services:
  fortemi:
    # Written in mapping form (KEY: "value"); every value is quoted so YAML
    # keeps it a string instead of inferring a boolean or an integer.
    environment:
      # Support archive: hard-coded, with no ${...} fallback, so the seed is
      # skipped even when the operator's .env has LOAD_SUPPORT_MEMORY=true.
      # The archive is already opt-in everywhere (default off as of
      # v2026.5.5); this legacy override is kept because it guarantees the
      # skip regardless of upstream config on "make it run on my laptop"
      # hosts. Seeding by hand is still possible:
      #   docker compose ... exec fortemi /app/seed-support-archive.sh
      DISABLE_SUPPORT_MEMORY: "true"

      # Models: each variable falls back to qwen2.5:3b (~2 GB VRAM/RAM vs
      # ~8 GB for qwen3.5:9b) unless it already has a non-empty value in
      # .env or the shell, e.g. MATRIC_FAST_GEN_MODEL for the fast model.
      # NOTE(review): qwen2.5:3b looks like a text-only tag — confirm it is
      # a workable default for OLLAMA_VISION_MODEL.
      MATRIC_FAST_GEN_MODEL: "${MATRIC_FAST_GEN_MODEL:-qwen2.5:3b}"
      OLLAMA_GEN_MODEL: "${OLLAMA_GEN_MODEL:-qwen2.5:3b}"
      OLLAMA_VISION_MODEL: "${OLLAMA_VISION_MODEL:-qwen2.5:3b}"

      # Concurrency: one job and one GPU task at a time by default, which
      # makes extraction serial and lowers peak RAM/VRAM under load.
      JOB_MAX_CONCURRENT: "${JOB_MAX_CONCURRENT:-1}"
      GPU_MAX_CONCURRENT: "${GPU_MAX_CONCURRENT:-1}"

      # Live archives: fewer concurrent archives means smaller in-memory
      # caches; defaults to 2 here.
      MAX_MEMORIES: "${MAX_MEMORIES:-2}"