# selfhosted-chat-api/.env.example (varad-more/selfhosted-chat-api, main branch)
# -----------------------------------------------------------------------------
# Self-hosted chat API configuration.
# Copy to .env and adjust. BACKEND_KIND must match the compose profile you run.
# -----------------------------------------------------------------------------

# --- Gateway ---
API_HOST=127.0.0.1
API_PORT=8000

# Comma-separated API keys. At least one key is required for production;
# replace the placeholder value below with your own secret key(s).
# Leave blank only for isolated private testing on an unreachable network.
API_KEYS=change-me-now

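# CORS. The wildcard `*` allows any origin; restrict this to your real
# origins in production, and do not enable credentials with a wildcard origin.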
CORS_ORIGINS=*
CORS_ALLOW_CREDENTIALS=false

# Rate limiting (simple in-process token bucket; put a real limiter in front
# for multi-replica deployments).
RATE_LIMIT_ENABLED=false
RATE_LIMIT_RPM=120
RATE_LIMIT_BURST=30

# Timeouts, in seconds (per the _S suffix).
REQUEST_TIMEOUT_S=600
CONNECT_TIMEOUT_S=10

# Logging.
LOG_LEVEL=INFO
LOG_JSON=true
LOG_PROMPTS=false

# Metrics endpoint on /metrics (Prometheus text format).
METRICS_ENABLED=true

# --- Backend selection ---
# Supported: vllm | ollama | llamacpp | tgi | sglang | localai | lmstudio | openai
BACKEND_KIND=vllm
BACKEND_BASE_URL=http://vllm:8001/v1
# Set only if the backend requires its own API key (llama.cpp --api-key, etc.)
BACKEND_API_KEY=

# Default model displayed in docs and health output.
MODEL_NAME=Qwen/Qwen2.5-7B-Instruct

# --- vLLM-specific ---
VLLM_GPU_MEMORY_UTILIZATION=0.92
VLLM_MAX_MODEL_LEN=16384
VLLM_DTYPE=half

# --- llama.cpp-specific ---
LLAMACPP_MODEL_FILE=model.gguf
LLAMACPP_NGL=999
LLAMACPP_CTX=8192
LLAMACPP_PARALLEL=2

# --- TGI-specific ---
TGI_MAX_INPUT=8192
TGI_MAX_TOTAL=16384

# --- SGLang-specific ---
SGLANG_CTX=16384

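# --- Hugging Face gated model downloads ---
# NOTE(review): huggingface_hub itself reads HF_TOKEN (or the legacy
# HUGGING_FACE_HUB_TOKEN); presumably the compose file maps this variable
# onto one of those names — confirm before relying on it.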
HUGGINGFACE_HUB_TOKEN=