tlg da35e94b16 fix: add triton kernels for MXFP4, fix GGUF KV cache quantization
- Add 'kernels' package to Dockerfile for native MXFP4 execution
  (fixes gpt-oss-20b OOM: 15.2GB→13.5GB)
- Reduce GGUF n_ctx from 8192 to 4096 and quantize the KV cache to Q8_0,
  cutting VRAM usage
- Use the GGML_TYPE_Q8_0 constant instead of a string for type_k/type_v
  (see the sketch below)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 22:49:16 +02:00
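
The n_ctx and KV-cache changes land in the llmux application code rather than
in the Dockerfile below. A minimal sketch of what the commit describes,
assuming llama-cpp-python's Llama constructor and a hypothetical model path
(llama.cpp requires flash attention to quantize the V cache):

    from llama_cpp import Llama, GGML_TYPE_Q8_0

    llm = Llama(
        model_path="/models/model.gguf",  # hypothetical path
        n_ctx=4096,                       # was 8192
        flash_attn=True,                  # required for a quantized V cache
        type_k=GGML_TYPE_Q8_0,            # int constant, not the string "q8_0"
        type_v=GGML_TYPE_Q8_0,
    )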

Dockerfile

# --- Build stage: compile llama-cpp-python with CUDA ---
FROM docker.io/pytorch/pytorch:2.11.0-cuda12.8-cudnn9-devel AS builder
RUN pip install --no-cache-dir --break-system-packages --upgrade pip setuptools wheel
RUN CMAKE_ARGS="-DGGML_CUDA=on" pip install --no-cache-dir --break-system-packages \
    "llama-cpp-python>=0.3.0"
# --- Runtime stage ---
FROM docker.io/pytorch/pytorch:2.11.0-cuda12.8-cudnn9-runtime
# System dependencies for audio processing
RUN apt-get update && apt-get install -y --no-install-recommends \
        libsndfile1 \
        ffmpeg \
    && rm -rf /var/lib/apt/lists/*
# Upgrade pip/setuptools for Python 3.12 compatibility
RUN pip install --no-cache-dir --break-system-packages --upgrade \
    pip setuptools wheel
# Install deps that don't conflict with pre-installed torch stack
RUN pip install --no-cache-dir --break-system-packages \
    "fastapi>=0.115.0" \
    "uvicorn[standard]>=0.34.0" \
    "python-multipart>=0.0.18" \
    "soundfile>=0.12.0" \
    "sentencepiece>=0.2.0" \
    "protobuf>=5.0.0"
# Install transformers + accelerate + kernels (Triton kernels for MXFP4/FP8)
RUN pip install --no-cache-dir --break-system-packages --no-build-isolation \
    "transformers>=5.4.0" \
    "accelerate>=1.0.0" \
    "kernels"
# Install chatterbox-tts WITHOUT its dependencies: letting pip resolve them
# would downgrade torch from 2.11 to 2.6 and pull in extras such as gradio.
# Instead, install only the runtime deps chatterbox actually needs, at
# versions compatible with this image.
RUN pip install --no-cache-dir --break-system-packages --no-deps \
    "chatterbox-tts>=0.1.0"
RUN pip install --no-cache-dir --break-system-packages --no-build-isolation \
    "conformer>=0.3.2" \
    "einops>=0.8.0" \
    "omegaconf>=2.3.0" \
    "scipy>=1.17.0" \
    "diffusers>=0.29.0" \
    "resemble-perth>=1.0.0" \
    "s3tokenizer>=0.3.0" \
    "librosa>=0.10.0" \
    "diskcache>=5.6.0"
# Copy llama-cpp-python from builder
COPY --from=builder /usr/local/lib/python3.12/dist-packages/llama_cpp /usr/local/lib/python3.12/dist-packages/llama_cpp
COPY --from=builder /usr/local/lib/python3.12/dist-packages/llama_cpp_python* /usr/local/lib/python3.12/dist-packages/
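# Recent llama-cpp-python wheels bundle the compiled libllama/ggml shared
# libraries inside the llama_cpp package directory (assumption about the build
# layout), so these two copies are sufficient; no separate .so install step.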
# Copy application code
COPY llmux/ /app/llmux/
WORKDIR /app
# Avoid CUDA memory fragmentation when swapping models
ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
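# expandable_segments lets the allocator grow and shrink its CUDA mappings
# instead of caching fixed-size blocks, so VRAM freed by an unloaded model can
# be reused by the next one. A swap presumably looks like (hypothetical llmux
# internals):
#   del model; gc.collect(); torch.cuda.empty_cache()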
# Run the server
EXPOSE 8081
CMD ["uvicorn", "llmux.main:app", "--host", "0.0.0.0", "--port", "8081"]