# ProjectParrot/docker-compose.yml (branch: main, repository: LofiChptr6/ProjectParrot)
services:
  # vLLM inference server (vllm/vllm-openai image) serving Qwen/Qwen3-32B with
  # FP8 quantization on GPU 0. The server listens on container port 8000, which
  # is published on the host as 8800.
  vllm:
    # The tag is overridable (set VLLM_IMAGE_TAG in the environment or .env) so
    # a deployment can pin a known-good release. It still defaults to "latest";
    # note that a floating tag has already dropped CLI flags this service used
    # (see the NOTE above `command`).
    image: "vllm/vllm-openai:${VLLM_IMAGE_TAG:-latest}"
    container_name: vllm
    restart: unless-stopped
    ports:
      # host:container
      - "8800:8000"
    volumes:
      # Named volume so downloaded model files persist across container
      # re-creation.
      - huggingface_cache:/root/.cache/huggingface
    environment:
      # Pin to GPU 0 (RTX Pro 6000 Blackwell, 96 GB).
      CUDA_VISIBLE_DEVICES: "0"
      # HuggingFace token for gated / private models; resolves to an empty
      # string when HF_TOKEN is not set on the host.
      HF_TOKEN: "${HF_TOKEN:-}"
    deploy:
      resources:
        reservations:
          devices:
            # Reserve NVIDIA GPU 0 for this container.
            - driver: nvidia
              device_ids:
                - "0"
              capabilities:
                - gpu
    # NOTE: --num-scheduler-steps / --speculative-model / --num-speculative-tokens
    # were removed in newer vllm images (v0.8+). Speculative decoding can be
    # reintroduced via --speculative-config '{"method":"...","model":"..."}'
    # when we need the throughput back.
    command:
      - "--model"
      - "Qwen/Qwen3-32B"
      - "--quantization"
      - "fp8"
      # Qwen3-32B native max is 40960 (max_position_embeddings). Going above
      # that requires --rope-scaling yarn, which costs quality at short ctx.
      # 40960 = 25% over the old 32768 with zero quality cost.
      - "--max-model-len"
      - "40960"
      # The gesture service is also scheduled on GPU 0, so vLLM is capped at
      # 70% of GPU memory (presumably to leave headroom for it; confirm).
      - "--gpu-memory-utilization"
      - "0.70"
      # NOTE(review): "half" forces float16; confirm this is intentional rather
      # than leaving the dtype on "auto".
      - "--dtype"
      - "half"
      - "--enable-prefix-caching"
      - "--max-num-seqs"
      - "4"
      - "--enable-chunked-prefill"
      - "--max-num-batched-tokens"
      - "4096"
      - "--enable-auto-tool-choice"
      - "--tool-call-parser"
      - "hermes"
      # Must match the container side of the `ports` mapping and the
      # healthcheck URL below.
      - "--port"
      - "8000"
    healthcheck:
      # Probes the server's /health endpoint from inside the container.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:8000/health"
      interval: 15s
      timeout: 10s
      retries: 10
      # Model download + load can take several minutes on first run
      start_period: 300s

  # Locally built "gesture" service, published on port 8005 and scheduled on
  # the same GPU (device 0) as the vllm service.
  gesture:
    container_name: gesture
    build:
      # Build context is the directory containing this compose file.
      context: .
      dockerfile: gesture/Dockerfile
    restart: unless-stopped
    ports:
      # host:container
      - "8005:8005"
    volumes:
      # Same named volume the vllm service mounts, at this image's cache path.
      - huggingface_cache:/app/.cache/huggingface
    environment:
      CUDA_VISIBLE_DEVICES: "0"
      # Resolves to an empty string when HF_TOKEN is not set on the host.
      HF_TOKEN: "${HF_TOKEN:-}"
    deploy:
      resources:
        reservations:
          devices:
            # Reserve NVIDIA GPU 0 for this container.
            - driver: nvidia
              device_ids:
                - "0"
              capabilities:
                - gpu
    healthcheck:
      # Probes the service's /health endpoint from inside the container.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:8005/health"
      interval: 15s
      timeout: 10s
      retries: 5
      start_period: 120s

# Named volumes. An empty mapping selects the default driver and options.
volumes:
  # HuggingFace cache mounted by both the vllm and gesture services.
  huggingface_cache: {}