# Source: varad-more/inference-engine-benchmark-system — .env.example (main branch)
# =============================================================================
# .env.example — Copy to .env and fill in your values
#
#   cp .env.example .env
#   # Edit .env — it is gitignored
# =============================================================================

# ---------------------------------------------------------------------------
# HuggingFace Hub token
# Required for gated models (Llama 3.x, Gemma 3, Mistral).
# Qwen3-8B is public and does NOT require a token, so this can stay empty
# when you only use public models.
# Get yours at: https://huggingface.co/settings/tokens
# Set the real value in your .env copy only (that file is gitignored); keep
# this example blank.
# ---------------------------------------------------------------------------
HUGGING_FACE_HUB_TOKEN=

# ---------------------------------------------------------------------------
# Pinned engine images
# Each value is an image reference in repository:tag form, one per engine.
# Review / update these intentionally when you want to upgrade engine versions.
# ---------------------------------------------------------------------------
# Image used for the vLLM engine.
VLLM_IMAGE=vllm/vllm-openai:v0.18.0-cu130
# Image used for the SGLang engine.
# NOTE(review): this is a dated nightly-dev tag rather than a release tag —
# confirm it is still published before relying on it.
SGLANG_IMAGE=lmsysorg/sglang:nightly-dev-cu13-20260321-94194537

# ---------------------------------------------------------------------------
# Engine host overrides (for distributed / AWS multi-instance deployments)
# Leave as defaults for local Docker Compose.
# Change the host values when an engine runs on a different machine.
# ---------------------------------------------------------------------------
# Host name and port of the vLLM engine.
VLLM_HOST=localhost
VLLM_PORT=8000

# Host name and port of the SGLang engine (a different port from vLLM, so
# both defaults can coexist on one host).
SGLANG_HOST=localhost
SGLANG_PORT=8001

# ---------------------------------------------------------------------------
# Dashboard
# ---------------------------------------------------------------------------
# Directory holding benchmark results.
# NOTE(review): the value is a relative path — presumably resolved against the
# working directory of the process that reads it; confirm.
RESULTS_DIR=results
# Comma-separated list of allowed CORS origins (default: http://localhost:3000).
# Example with two origins:
#   ALLOWED_ORIGINS=http://localhost:3000,https://dashboard.example.com
ALLOWED_ORIGINS=http://localhost:3000

# ---------------------------------------------------------------------------
# Speculative decoding draft models
# Used by the vllm-eagle3 and sglang-eagle3 docker-compose profiles.
# Only needed when running Eagle3 speculative decoding benchmarks; the values
# can be left untouched otherwise.
# Both values look like HuggingFace Hub repo ids (owner/name) — TODO confirm.
# ---------------------------------------------------------------------------
# Eagle3 draft model for vLLM (RedHatAI speculators are the recommended source).
# Presumably read by the vllm-eagle3 profile — verify against docker-compose.
EAGLE3_VLLM_DRAFT=RedHatAI/Llama-3.1-8B-Instruct-speculator.eagle3
# Eagle3 draft model for SGLang.
# Presumably read by the sglang-eagle3 profile — verify against docker-compose.
EAGLE3_SGLANG_DRAFT=jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# LOG_FORMAT selects the log output style:
#   "console" — colored output (default)
#   "json"    — structured output, intended for production
LOG_FORMAT=console
# LOG_LEVEL sets the log level; one of DEBUG, INFO (default), WARNING, ERROR.
LOG_LEVEL=INFO