-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocker-compose.yml
More file actions
215 lines (205 loc) · 5.2 KB
/
docker-compose.yml
File metadata and controls
215 lines (205 loc) · 5.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# Multi-backend self-hosted chat API stack.
#
# The `api` gateway always runs. Bring up exactly ONE inference backend via a
# compose profile. Defaults assume BACKEND_KIND=vllm — change .env to match
# whichever backend you start.
#
# Usage:
# docker compose --profile demo up -d --build # CPU-only, laptop-friendly (Ollama + tiny model)
# docker compose --profile vllm up -d --build
# docker compose --profile ollama up -d --build
# docker compose --profile llamacpp up -d --build
# docker compose --profile tgi up -d --build
# docker compose --profile sglang up -d --build
# docker compose --profile localai up -d --build
# docker compose --profile none up -d --build # gateway only, point BACKEND_BASE_URL at an external runtime
# Reusable fragment for GPU backends; a service pulls it in with
# `<<: *gpu-service` and may add or override keys of its own.
x-gpu-service: &gpu-service
  # Share the host IPC namespace with the container.
  ipc: host
  restart: unless-stopped
  deploy:
    resources:
      reservations:
        devices:
          # Reserve every NVIDIA GPU the Docker host exposes.
          - driver: nvidia
            count: all
            capabilities:
              - gpu
services:
# Gateway service. It declares no profile, so it starts with every invocation.
api:
  image: selfhosted-chat-api:latest
  container_name: selfhosted-chat-api
  # Short form: the string is the build context directory.
  build: ./api
  env_file:
    - .env
  restart: unless-stopped
  # Published on loopback port 8000 unless API_HOST / API_PORT are set.
  ports:
    - '${API_HOST:-127.0.0.1}:${API_PORT:-8000}:8000'
  # Hardening: read-only root filesystem with a writable /tmp, no privilege
  # escalation, and every Linux capability dropped.
  read_only: true
  tmpfs:
    - /tmp
  cap_drop:
    - ALL
  security_opt:
    - 'no-new-privileges:true'
  # Liveness probe run inside the container against /livez.
  healthcheck:
    test:
      - CMD-SHELL
      - 'curl -fsS http://127.0.0.1:8000/livez || exit 1'
    start_period: 10s
    interval: 30s
    timeout: 5s
    retries: 3
# Backend from the `vllm/vllm-openai` image; started only with `--profile vllm`.
vllm:
  <<: *gpu-service
  image: vllm/vllm-openai:latest
  container_name: vllm
  profiles:
    - vllm
  env_file:
    - .env
  environment:
    # The same token is exported under both variable spellings.
    HUGGING_FACE_HUB_TOKEN: '${HUGGINGFACE_HUB_TOKEN:-}'
    HUGGINGFACE_HUB_TOKEN: '${HUGGINGFACE_HUB_TOKEN:-}'
  # Download and compilation caches persisted on the host.
  volumes:
    - ./data/hf-cache:/root/.cache/huggingface
    - ./data/vllm-cache:/root/.cache/vllm
  expose:
    - '8001'
  # Published on the host loopback interface only.
  ports:
    - '127.0.0.1:8001:8001'
  # One list item per argv token; each flag is followed by its value.
  command:
    - --model
    - '${MODEL_NAME:-Qwen/Qwen2.5-7B-Instruct}'
    - --host
    - '0.0.0.0'
    - --port
    - '8001'
    - --dtype
    - '${VLLM_DTYPE:-half}'
    - --max-model-len
    - '${VLLM_MAX_MODEL_LEN:-16384}'
    - --gpu-memory-utilization
    - '${VLLM_GPU_MEMORY_UTILIZATION:-0.92}'
# GPU-backed Ollama; started only with `--profile ollama`.
ollama:
  <<: *gpu-service
  image: ollama/ollama:latest
  container_name: ollama
  profiles:
    - ollama
  environment:
    OLLAMA_KEEP_ALIVE: '24h'
  # Ollama's data directory is persisted on the host.
  volumes:
    - ./data/ollama:/root/.ollama
  expose:
    - '11434'
  # Published on the host loopback interface only.
  ports:
    - '127.0.0.1:11434:11434'
# CPU-only demo: runs on any laptop with Docker, no GPU required (this service
# does not merge the GPU fragment).
#   docker compose --profile demo up -d --build
# Then pull a small model:
#   docker exec -it ollama-demo ollama pull qwen2.5:0.5b-instruct
ollama-demo:
  image: ollama/ollama:latest
  container_name: ollama-demo
  profiles:
    - demo
  restart: unless-stopped
  environment:
    OLLAMA_KEEP_ALIVE: '24h'
  # Same host directory and host port as the `ollama` service.
  volumes:
    - ./data/ollama:/root/.ollama
  expose:
    - '11434'
  ports:
    - '127.0.0.1:11434:11434'
# llama.cpp HTTP server (CUDA build); started only with `--profile llamacpp`.
llamacpp:
  <<: *gpu-service
  profiles: ["llamacpp"]
  # The llama.cpp project publishes its images under the `ggml-org` namespace;
  # the former `ggerganov` path is the pre-migration location.
  image: ghcr.io/ggml-org/llama.cpp:server-cuda
  container_name: llamacpp
  expose:
    - "8001"
  # Published on the host loopback interface only.
  ports:
    - "127.0.0.1:8001:8001"
  # GGUF model files are read from ./data/models on the host.
  volumes:
    - ./data/models:/models
  # One list item per argv token; each flag is followed by its value.
  command:
    - -m
    - "/models/${LLAMACPP_MODEL_FILE:-model.gguf}"
    - --host
    - "0.0.0.0"
    - --port
    - "8001"
    - --n-gpu-layers
    - "${LLAMACPP_NGL:-999}"
    - --ctx-size
    - "${LLAMACPP_CTX:-8192}"
    - --parallel
    - "${LLAMACPP_PARALLEL:-2}"
    # When BACKEND_API_KEY is unset the literal string `unused` is passed as
    # the key. NOTE(review): the value is visible in the container's process
    # arguments; confirm that is acceptable for this deployment.
    - --api-key
    - "${BACKEND_API_KEY:-unused}"
# Hugging Face Text Generation Inference; started only with `--profile tgi`.
tgi:
  <<: *gpu-service
  profiles: ["tgi"]
  image: ghcr.io/huggingface/text-generation-inference:latest
  container_name: tgi
  environment:
    # Current TGI documentation reads the Hub token from HF_TOKEN; the older
    # HUGGING_FACE_HUB_TOKEN spelling is kept so older image tags still
    # authenticate. Both carry the same value from .env.
    HF_TOKEN: "${HUGGINGFACE_HUB_TOKEN:-}"
    HUGGING_FACE_HUB_TOKEN: "${HUGGINGFACE_HUB_TOKEN:-}"
  expose:
    - "8001"
  # Published on the host loopback interface only.
  ports:
    - "127.0.0.1:8001:8001"
  # The host Hugging Face cache directory is mounted at /data.
  volumes:
    - ./data/hf-cache:/data
  # One list item per argv token; each flag is followed by its value.
  command:
    - --model-id
    - "${MODEL_NAME:-Qwen/Qwen2.5-7B-Instruct}"
    - --port
    - "8001"
    - --hostname
    - "0.0.0.0"
    - --max-input-length
    - "${TGI_MAX_INPUT:-8192}"
    - --max-total-tokens
    - "${TGI_MAX_TOTAL:-16384}"
# SGLang server; started only with `--profile sglang`.
sglang:
  <<: *gpu-service
  image: lmsysorg/sglang:latest
  container_name: sglang
  profiles:
    - sglang
  environment:
    # Hub token taken from HUGGINGFACE_HUB_TOKEN; empty when unset.
    HF_TOKEN: '${HUGGINGFACE_HUB_TOKEN:-}'
  # Hugging Face download cache persisted on the host.
  volumes:
    - ./data/hf-cache:/root/.cache/huggingface
  expose:
    - '8001'
  # Published on the host loopback interface only.
  ports:
    - '127.0.0.1:8001:8001'
  # One list item per argv token: the module launcher, then flag/value pairs.
  command:
    - python3
    - -m
    - sglang.launch_server
    - --model-path
    - '${MODEL_NAME:-Qwen/Qwen2.5-7B-Instruct}'
    - --host
    - '0.0.0.0'
    - --port
    - '8001'
    - --context-length
    - '${SGLANG_CTX:-16384}'
# LocalAI all-in-one CUDA 12 image; started only with `--profile localai`.
localai:
  # Same restart / ipc / GPU reservation settings as the other GPU backends,
  # merged from the shared fragment instead of being repeated inline.
  <<: *gpu-service
  profiles: ["localai"]
  image: localai/localai:latest-aio-gpu-nvidia-cuda-12
  container_name: localai
  # The published mapping below targets container port 8080, so that is the
  # port to declare here and the one other containers on the compose network
  # must use (localai:8080). Only the host side is remapped to 8001.
  expose:
    - "8080"
  ports:
    - "127.0.0.1:8001:8080"
  # NOTE(review): newer LocalAI images may read models from /models rather than
  # /build/models; confirm against the image tag in use.
  volumes:
    - ./data/localai-models:/build/models
# Give the project's default network a fixed name instead of a generated one.
networks:
  default:
    name: selfhosted-chat-api