# docker-compose.yml — varad-more/selfhosted-chat-api (branch: main)
# Multi-backend self-hosted chat API stack.
#
# The `api` gateway always runs. Bring up exactly ONE inference backend via a
# compose profile. Defaults assume BACKEND_KIND=vllm — change .env to match
# whichever backend you start.
#
# Usage:
#   docker compose --profile demo      up -d --build   # CPU-only, laptop-friendly (Ollama + tiny model)
#   docker compose --profile vllm      up -d --build
#   docker compose --profile ollama    up -d --build
#   docker compose --profile llamacpp  up -d --build
#   docker compose --profile tgi       up -d --build
#   docker compose --profile sglang    up -d --build
#   docker compose --profile localai   up -d --build
#   docker compose --profile none      up -d --build   # gateway only, point BACKEND_BASE_URL at an external runtime

# Shared fragment for GPU-backed services. A service pulls these keys in
# with `<<: *gpu-service`; keys set on the service itself take precedence.
x-gpu-service: &gpu-service
  deploy:
    resources:
      reservations:
        devices:
          # Reserve all NVIDIA GPUs for the container.
          - capabilities:
              - gpu
            count: all
            driver: nvidia
  # Run in the host's IPC namespace.
  ipc: host
  restart: unless-stopped

services:
  # Gateway service. It declares no `profiles`, so it is started by every
  # `docker compose up`, whichever backend profile is selected.
  api:
    image: selfhosted-chat-api:latest
    build:
      context: ./api
    container_name: selfhosted-chat-api
    restart: unless-stopped
    env_file:
      - .env
    ports:
      # Host address/port come from API_HOST / API_PORT (defaults:
      # 127.0.0.1 and 8000); the container side is fixed at 8000.
      - '${API_HOST:-127.0.0.1}:${API_PORT:-8000}:8000'
    # Hardening: read-only root filesystem with an in-memory /tmp,
    # no privilege escalation, and every Linux capability dropped.
    read_only: true
    tmpfs:
      - /tmp
    cap_drop:
      - ALL
    security_opt:
      - 'no-new-privileges:true'
    healthcheck:
      # Probes the /livez endpoint from inside the container.
      test:
        - CMD-SHELL
        - 'curl -fsS http://127.0.0.1:8000/livez || exit 1'
      start_period: 10s
      interval: 30s
      timeout: 5s
      retries: 3

  # vLLM OpenAI-compatible server (profile: vllm). Listens on 8001 inside the
  # container; the host side is published on loopback only.
  vllm:
    <<: *gpu-service
    profiles: ["vllm"]
    image: vllm/vllm-openai:latest
    container_name: vllm
    env_file:
      - .env
    environment:
      # The same token (HUGGINGFACE_HUB_TOKEN from .env / the shell, empty when
      # unset) is forwarded under every name in use. HF_TOKEN is the name
      # current huggingface_hub releases document; HUGGING_FACE_HUB_TOKEN is
      # its deprecated predecessor and is kept for older images.
      HF_TOKEN: ${HUGGINGFACE_HUB_TOKEN:-}
      HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACE_HUB_TOKEN:-}
      HUGGINGFACE_HUB_TOKEN: ${HUGGINGFACE_HUB_TOKEN:-}
    expose:
      - "8001"
    ports:
      - "127.0.0.1:8001:8001"
    volumes:
      # Host directories mounted over the container's Hugging Face and vLLM
      # cache paths so their contents survive container re-creation.
      - ./data/hf-cache:/root/.cache/huggingface
      - ./data/vllm-cache:/root/.cache/vllm
    command:
      - --model
      - ${MODEL_NAME:-Qwen/Qwen2.5-7B-Instruct}
      - --host
      - 0.0.0.0
      - --port
      - "8001"
      - --dtype
      - ${VLLM_DTYPE:-half}
      - --max-model-len
      - ${VLLM_MAX_MODEL_LEN:-16384}
      - --gpu-memory-utilization
      - ${VLLM_GPU_MEMORY_UTILIZATION:-0.92}

  # Ollama with GPU access (profile: ollama).
  ollama:
    <<: *gpu-service
    profiles:
      - ollama
    container_name: ollama
    image: ollama/ollama:latest
    environment:
      OLLAMA_KEEP_ALIVE: '24h'
    volumes:
      # Host directory mounted at /root/.ollama inside the container.
      - ./data/ollama:/root/.ollama
    expose:
      - '11434'
    ports:
      # Published on the host's loopback interface only.
      - '127.0.0.1:11434:11434'

  # CPU-only demo: the same Ollama image with no GPU reservation, so it runs
  # on any machine that has Docker.
  #   docker compose --profile demo up -d --build
  # Afterwards pull a small model into it:
  #   docker exec -it ollama-demo ollama pull qwen2.5:0.5b-instruct
  ollama-demo:
    profiles:
      - demo
    container_name: ollama-demo
    image: ollama/ollama:latest
    restart: unless-stopped
    environment:
      OLLAMA_KEEP_ALIVE: '24h'
    volumes:
      # Same host directory and host port as the `ollama` service.
      - ./data/ollama:/root/.ollama
    expose:
      - '11434'
    ports:
      - '127.0.0.1:11434:11434'

  # llama.cpp HTTP server, CUDA build (profile: llamacpp). Serves a GGUF file
  # from ./data/models on port 8001.
  llamacpp:
    <<: *gpu-service
    profiles: ["llamacpp"]
    # The llama.cpp project documents its container images under the ggml-org
    # namespace; this block previously used the former ggerganov namespace.
    image: ghcr.io/ggml-org/llama.cpp:server-cuda
    container_name: llamacpp
    expose:
      - "8001"
    ports:
      - "127.0.0.1:8001:8001"
    volumes:
      - ./data/models:/models
    command:
      # Model file name is relative to the /models mount above.
      - -m
      - /models/${LLAMACPP_MODEL_FILE:-model.gguf}
      - --host
      - 0.0.0.0
      - --port
      - "8001"
      - --n-gpu-layers
      - ${LLAMACPP_NGL:-999}
      - --ctx-size
      - ${LLAMACPP_CTX:-8192}
      - --parallel
      - ${LLAMACPP_PARALLEL:-2}
      # Falls back to the literal key "unused" when BACKEND_API_KEY is unset.
      - --api-key
      - ${BACKEND_API_KEY:-unused}

  # Hugging Face Text Generation Inference (profile: tgi), listening on 8001.
  tgi:
    <<: *gpu-service
    profiles: ["tgi"]
    image: ghcr.io/huggingface/text-generation-inference:latest
    container_name: tgi
    environment:
      # Forward the token under both the current name (HF_TOKEN) and the
      # deprecated one, so the setting works across image versions. Both
      # resolve to HUGGINGFACE_HUB_TOKEN, or to an empty string when unset.
      HF_TOKEN: ${HUGGINGFACE_HUB_TOKEN:-}
      HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACE_HUB_TOKEN:-}
    expose:
      - "8001"
    ports:
      - "127.0.0.1:8001:8001"
    volumes:
      # Host directory mounted at /data inside the container.
      - ./data/hf-cache:/data
    command:
      - --model-id
      - ${MODEL_NAME:-Qwen/Qwen2.5-7B-Instruct}
      - --port
      - "8001"
      - --hostname
      - 0.0.0.0
      - --max-input-length
      - ${TGI_MAX_INPUT:-8192}
      - --max-total-tokens
      - ${TGI_MAX_TOTAL:-16384}

  # SGLang server (profile: sglang), started through the
  # `sglang.launch_server` Python module on port 8001.
  sglang:
    <<: *gpu-service
    profiles:
      - sglang
    container_name: sglang
    image: lmsysorg/sglang:latest
    environment:
      HF_TOKEN: '${HUGGINGFACE_HUB_TOKEN:-}'
    volumes:
      - ./data/hf-cache:/root/.cache/huggingface
    expose:
      - '8001'
    ports:
      # Published on the host's loopback interface only.
      - '127.0.0.1:8001:8001'
    command:
      # Entry point.
      - python3
      - -m
      - sglang.launch_server
      # Model selection.
      - --model-path
      - '${MODEL_NAME:-Qwen/Qwen2.5-7B-Instruct}'
      # Listen address.
      - --host
      - 0.0.0.0
      - --port
      - '8001'
      # Context window.
      - --context-length
      - '${SGLANG_CTX:-16384}'

  # LocalAI all-in-one NVIDIA/CUDA 12 image (profile: localai).
  localai:
    # Same restart / ipc / GPU reservation settings as the other GPU
    # backends, merged from the shared anchor instead of repeated inline.
    <<: *gpu-service
    profiles: ["localai"]
    image: localai/localai:latest-aio-gpu-nvidia-cuda-12
    container_name: localai
    expose:
      # The container port is 8080 (see the `ports` mapping below), so that is
      # the port to advertise to other containers on the compose network.
      # NOTE(review): a BACKEND_BASE_URL that targets this service by
      # container name would need port 8080, not 8001 — confirm against .env.
      - "8080"
    ports:
      # Host 127.0.0.1:8001 -> container 8080, keeping the host-side port the
      # same as the other backends.
      - "127.0.0.1:8001:8080"
    volumes:
      - ./data/localai-models:/build/models

# Give the project's default network a fixed name rather than one derived
# from the Compose project name.
networks:
  default:
    name: 'selfhosted-chat-api'