-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocker-compose.yml
More file actions
87 lines (85 loc) · 2.38 KB
/
docker-compose.yml
File metadata and controls
87 lines (85 loc) · 2.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Compose stack with two GPU services that share GPU 0 and one named
# HuggingFace cache volume:
#   - vllm:    vLLM server (image vllm/vllm-openai) running Qwen/Qwen3-32B
#   - gesture: service built locally from gesture/Dockerfile
services:
  vllm:
    # NOTE(review): `latest` is a floating tag. The note above `command`
    # records that an image upgrade already removed CLI flags once, so
    # consider pinning a specific version tag.
    image: vllm/vllm-openai:latest
    container_name: vllm
    restart: unless-stopped
    ports:
      # Host port 8800 -> container port 8000 (matches `--port` below).
      - "8800:8000"
    volumes:
      # Named volume, also mounted by the `gesture` service.
      - huggingface_cache:/root/.cache/huggingface
    environment:
      # Pin to GPU 0 (RTX Pro 6000 Blackwell, 96 GB).
      - CUDA_VISIBLE_DEVICES=0
      # HuggingFace token for gated / private models. Interpolated from the
      # host environment; falls back to an empty string when unset.
      - HF_TOKEN=${HF_TOKEN:-}
    deploy:
      resources:
        reservations:
          devices:
            # Reserve the same device that CUDA_VISIBLE_DEVICES selects.
            - driver: nvidia
              device_ids: ["0"]
              capabilities: [gpu]
    # NOTE: --num-scheduler-steps / --speculative-model /
    # --num-speculative-tokens were removed in newer vllm images (v0.8+).
    # Speculative decoding can be reintroduced via
    # --speculative-config '{"method":"...","model":"..."}'
    # when we need the throughput back.
    command:
      - "--model"
      - "Qwen/Qwen3-32B"
      - "--quantization"
      - "fp8"
      # Qwen3-32B native max is 40960 (max_position_embeddings). Going above
      # that requires --rope-scaling yarn, which costs quality at short ctx.
      # 40960 = 25% over the old 32768 with zero quality cost.
      - "--max-model-len"
      - "40960"
      # NOTE(review): presumably 0.70 leaves headroom for the `gesture`
      # service, which is pinned to the same GPU - confirm.
      - "--gpu-memory-utilization"
      - "0.70"
      - "--dtype"
      - "half"
      - "--enable-prefix-caching"
      - "--max-num-seqs"
      - "4"
      - "--enable-chunked-prefill"
      - "--max-num-batched-tokens"
      - "4096"
      - "--enable-auto-tool-choice"
      - "--tool-call-parser"
      - "hermes"
      # Must stay in sync with the container side of `ports` and with the
      # healthcheck URL.
      - "--port"
      - "8000"
    healthcheck:
      # Runs inside the container, so it targets the container port (8000).
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 15s
      timeout: 10s
      retries: 10
      # Model download + load can take several minutes on first run.
      start_period: 300s

  gesture:
    # Built locally: `.` is the build context, Dockerfile under gesture/.
    build:
      context: .
      dockerfile: gesture/Dockerfile
    container_name: gesture
    restart: unless-stopped
    ports:
      - "8005:8005"
    volumes:
      # Same named volume as `vllm`, mounted at a different path here.
      - huggingface_cache:/app/.cache/huggingface
    environment:
      # Same GPU as `vllm`; both services share device 0.
      - CUDA_VISIBLE_DEVICES=0
      # Interpolated from the host environment; empty string when unset.
      - HF_TOKEN=${HF_TOKEN:-}
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["0"]
              capabilities: [gpu]
    healthcheck:
      # NOTE(review): requires curl inside the image - confirm that
      # gesture/Dockerfile installs it.
      test: ["CMD", "curl", "-f", "http://localhost:8005/health"]
      interval: 15s
      timeout: 10s
      retries: 5
      start_period: 120s

volumes:
  # Named volume with default settings, mounted by both services above.
  huggingface_cache: