---
# ATLAS docker-compose override for the macOS hybrid path (#32).
#
# Bring up the stack with both files:
#   docker compose -f docker-compose.yml -f docker-compose.macos.yml up -d
#
# Why hybrid: on Apple Silicon, llama.cpp under MoltenVK + Docker
# Desktop is 5-10x slower than native Metal. We run llama-server
# NATIVELY on macOS (via scripts/atlas-llama-macos.sh) for the inference
# perf, but keep proxy/v3/lens/sandbox in Docker so the rest of the
# stack is unchanged from the linux + cuda/rocm path.
#
# The trick: instead of removing the `llama-server` service from the
# base file (which would require profiles or rewrites that ripple
# everywhere), we replace its IMAGE with a tiny socat container that
# forwards every connection on `llama-server:8080` ->
# `host.docker.internal:8080` (the native llama-server on the host).
# Other services can keep their existing URLs
# (`http://llama-server:8080`) and don't need to know they're talking
# to a host process.
#
# Net effect:
#   * llama-server service slot: socat proxy, 4MB container
#   * proxy / v3 / lens / sandbox: unchanged from base compose
#   * depends_on chain: still works (socat is healthy when listening)
#
# Prereq: run ./scripts/atlas-llama-macos.sh in a separate terminal
# BEFORE `docker compose up`. The socat container starts fine without
# the host server running, but socat's upstream connect is refused, so
# every forwarded connection is dropped until the native llama-server
# is up.
#
# Verify with:
#   curl http://localhost:8080/health    # should hit native llama-server
#   atlas doctor                         # check_metal_native confirms wiring

services:
  llama-server:
    # alpine/socat is ~4MB. We override the upstream image entirely
    # rather than building anything Mac-specific — keeps the macos
    # overlay self-contained, no Mac-only image needed in GHCR.
    image: alpine/socat:latest
    build: !reset null
    # Drop the GPU resource request from the base file. socat doesn't
    # need a GPU and Docker Desktop on Mac doesn't have one to give
    # anyway — leaving the request in would make compose warn.
    deploy:
      resources:
        reservations:
          devices: !reset []
    # No /models mount, no llama-server env vars, no slot path — all
    # irrelevant for a socat proxy. Reset them so the merge result is
    # a clean tiny container.
    volumes: !reset []
    environment: !reset []
    # Don't publish 8080 to the host. The native llama-server is
    # already bound to host :8080 — publishing socat there too would
    # collide (and conceptually wouldn't make sense, since callers on
    # the host should hit the native server directly, not loop back
    # through socat -> host.docker.internal -> host). The 4 docker
    # services reach socat via the docker network as `llama-server:8080`
    # without any host publish needed.
    ports: !reset []
    # The actual forwarding: listen on every interface port 8080 inside
    # the container, forward TCP to host.docker.internal:8080 (Docker
    # Desktop for Mac auto-resolves this to the host's loopback).
    # fork = handle multiple concurrent clients (proxy + v3 + lens all
    # connect simultaneously). reuseaddr = recover from restarts
    # cleanly without TIME_WAIT delays.
    # List form: each item is passed to socat as one argument, with no
    # word-splitting and no trailing newline from a folded scalar.
    command:
      - "TCP-LISTEN:8080,fork,reuseaddr"
      - "TCP:host.docker.internal:8080"
    # Health check: healthy means "the listen socket on :8080 is bound".
    # Checking that PID 1 is socat cannot fail while the container is
    # running (PID 1 is the container's main process), so it says
    # nothing about whether socat is actually listening.
    # Instead we read the container's socket tables from /proc, which
    # needs only the shell, cat and grep — no nc/wget/ss/netstat:
    #   * `:1F90`  = local port 8080 in hex
    #   * `:0000`  = remote port 0 (no peer, i.e. a listening socket)
    #   * `0A`     = TCP state LISTEN
    # Both tcp and tcp6 are read because socat may bind either family;
    # a missing tcp6 table is ignored via 2>/dev/null.
    # Reading /proc also avoids opening a real connection, which socat
    # would fork and forward to the host on every probe.
    healthcheck:
      test:
        - CMD-SHELL
        - >-
          cat /proc/net/tcp /proc/net/tcp6 2>/dev/null
          | grep -q ':1F90 [0-9A-F]*:0000 0A'
      interval: 10s
      timeout: 3s
      retries: 3
      start_period: 5s

  # No overrides needed for the other 4 services. They connect to
  # http://llama-server:8080 from the base compose; that name now
  # resolves to the socat container which forwards to the host. Same
  # depends_on chain works — socat is healthy as soon as the listen
  # socket is bound (a few seconds), so geometric-lens / v3-service /
  # atlas-proxy start without changes.
  #
  # If the native llama-server isn't running yet, the services come up
  # but requests through them return 502 until the user starts it.
  # Matches the existing "wait for llama-server to load the model"
  # ux from CUDA/ROCm where the first ~90 seconds also return 502.