Compare commits
34 Commits
bd0ed74d32
...
main
| Author | SHA256 | Date | |
|---|---|---|---|
| 3edc055299 | |||
| 06923d51b4 | |||
| 61308703dc | |||
| 7c4bbe0b29 | |||
| 7a0ff55eb5 | |||
| da35e94b16 | |||
| a88f0afb8a | |||
| d615bb4553 | |||
| f24a225baf | |||
| 38e1523d7e | |||
| aa7a160118 | |||
| d3285bad8a | |||
| f2f73d204c | |||
| d6a3fe5427 | |||
| 8816a06369 | |||
| 8a6f6a5097 | |||
| d5a98879c9 | |||
| 2f4d242f55 | |||
| 1a26d34ea5 | |||
| 17818a3860 | |||
| d55c80ae35 | |||
| ef44bc09b9 | |||
| c6677dcab3 | |||
| de25b5e2a7 | |||
| 449e37d318 | |||
| 813bbe0ad0 | |||
| d7a091df8c | |||
| 969bcb3292 | |||
| c4eaf5088b | |||
| 690ad46d88 | |||
| a64f32b590 | |||
| cf7c77b3b5 | |||
| 45947e80a4 | |||
| 7187c58c5e |
6 kischdle/llmux/.gitignore vendored Normal file
@@ -0,0 +1,6 @@
+.venv/
+__pycache__/
+*.pyc
+.pytest_cache/
+.superpowers/
+.claude/
66 kischdle/llmux/Dockerfile Normal file
@@ -0,0 +1,66 @@
+# --- Build stage: compile llama-cpp-python with CUDA ---
+FROM docker.io/pytorch/pytorch:2.11.0-cuda12.8-cudnn9-devel AS builder
+
+RUN pip install --no-cache-dir --break-system-packages --upgrade pip setuptools wheel
+RUN CMAKE_ARGS="-DGGML_CUDA=on" pip install --no-cache-dir --break-system-packages \
+    "llama-cpp-python>=0.3.0"
+
+# --- Runtime stage ---
+FROM docker.io/pytorch/pytorch:2.11.0-cuda12.8-cudnn9-runtime
+
+# System dependencies for audio processing
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libsndfile1 \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
+# Upgrade pip/setuptools for Python 3.12 compatibility
+RUN pip install --no-cache-dir --break-system-packages --upgrade \
+    pip setuptools wheel
+
+# Install deps that don't conflict with pre-installed torch stack
+RUN pip install --no-cache-dir --break-system-packages \
+    "fastapi>=0.115.0" \
+    "uvicorn[standard]>=0.34.0" \
+    "python-multipart>=0.0.18" \
+    "soundfile>=0.12.0" \
+    "sentencepiece>=0.2.0" \
+    "protobuf>=5.0.0"
+
+# Install transformers + accelerate + kernels (MXFP4/FP8 triton kernels)
+RUN pip install --no-cache-dir --break-system-packages --no-build-isolation \
+    "transformers>=5.4.0" \
+    "accelerate>=1.0.0" \
+    "kernels"
+
+# Install chatterbox-tts WITHOUT its dependencies (it would downgrade
+# torch from 2.11 to 2.6 and pull gradio, librosa, etc.)
+# Then install only the runtime deps chatterbox actually needs.
+RUN pip install --no-cache-dir --break-system-packages --no-deps \
+    "chatterbox-tts>=0.1.0"
+
+RUN pip install --no-cache-dir --break-system-packages --no-build-isolation \
+    "conformer>=0.3.2" \
+    "einops>=0.8.0" \
+    "omegaconf>=2.3.0" \
+    "scipy>=1.17.0" \
+    "diffusers>=0.29.0" \
+    "resemble-perth>=1.0.0" \
+    "s3tokenizer>=0.3.0" \
+    "librosa>=0.10.0" \
+    "diskcache>=5.6.0"
+
+# Copy llama-cpp-python from builder
+COPY --from=builder /usr/local/lib/python3.12/dist-packages/llama_cpp /usr/local/lib/python3.12/dist-packages/llama_cpp
+COPY --from=builder /usr/local/lib/python3.12/dist-packages/llama_cpp_python* /usr/local/lib/python3.12/dist-packages/
+
+# Copy application code
+COPY llmux/ /app/llmux/
+WORKDIR /app
+
+# Avoid CUDA memory fragmentation when swapping models
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+
+# Run the server
+EXPOSE 8081
+CMD ["uvicorn", "llmux.main:app", "--host", "0.0.0.0", "--port", "8081"]
7 kischdle/llmux/config/api_keys.yaml Normal file
@@ -0,0 +1,7 @@
+api_keys:
+  - key: "sk-llmux-openwebui-hMD6pAka1czM53MtTkmmlFP8tF5zuiiDRgt-PCBnj-c"
+    name: "Open WebUI"
+  - key: "sk-llmux-whisper-ReHko1u-VpVHFbMANyhYLY2Oseswu2gSyKQR32gSyMY"
+    name: "Remote Whisper clients"
+  - key: "sk-llmux-opencode-PUqKAAtevYfUsKtjawqb3tKaLvT-DHZZBKJHwGZIvmo"
+    name: "OpenCode"
112 kischdle/llmux/config/models.yaml Normal file
@@ -0,0 +1,112 @@
+physical_models:
+  qwen3.5-9b-fp8:
+    type: llm
+    backend: llamacpp
+    model_id: "unsloth/Qwen3.5-9B-GGUF"
+    model_file: "Qwen3.5-9B-Q8_0.gguf"
+    estimated_vram_gb: 10
+    supports_vision: false
+    supports_tools: true
+
+  qwen3.5-9b-fp8-uncensored:
+    type: llm
+    backend: llamacpp
+    model_id: "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"
+    model_file: "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf"
+    mmproj_file: "mmproj-Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-BF16.gguf"
+    estimated_vram_gb: 9
+    supports_vision: true
+    supports_tools: true
+
+  qwen3.5-4b:
+    type: llm
+    backend: transformers
+    model_id: "Qwen/Qwen3.5-4B"
+    estimated_vram_gb: 9
+    supports_vision: true
+    supports_tools: true
+
+  gpt-oss-20b:
+    type: llm
+    backend: transformers
+    model_id: "openai/gpt-oss-20b"
+    estimated_vram_gb: 13
+    supports_vision: false
+    supports_tools: true
+
+  gpt-oss-20b-uncensored:
+    type: llm
+    backend: llamacpp
+    model_id: "HauhauCS/GPT-OSS-20B-Uncensored-HauhauCS-Aggressive"
+    model_file: "GPT-OSS-20B-Uncensored-HauhauCS-MXFP4-Aggressive.gguf"
+    estimated_vram_gb: 13
+    supports_vision: false
+    supports_tools: true
+
+  cohere-transcribe:
+    type: asr
+    backend: transformers
+    model_id: "CohereLabs/cohere-transcribe-03-2026"
+    estimated_vram_gb: 4
+    default_language: "en"
+
+  chatterbox-multilingual:
+    type: tts
+    backend: chatterbox
+    variant: "multilingual"
+    estimated_vram_gb: 2
+
+  chatterbox:
+    type: tts
+    backend: chatterbox
+    variant: "default"
+    estimated_vram_gb: 2
+
+virtual_models:
+  Qwen3.5-9B-FP8-Thinking:
+    physical: qwen3.5-9b-fp8
+    params: { enable_thinking: true }
+  Qwen3.5-9B-FP8-Instruct:
+    physical: qwen3.5-9b-fp8
+    params: { enable_thinking: false }
+
+  Qwen3.5-9B-FP8-Uncensored-Thinking:
+    physical: qwen3.5-9b-fp8-uncensored
+    params: { enable_thinking: true }
+  Qwen3.5-9B-FP8-Uncensored-Instruct:
+    physical: qwen3.5-9b-fp8-uncensored
+    params: { enable_thinking: false }
+
+  Qwen3.5-4B-Thinking:
+    physical: qwen3.5-4b
+    params: { enable_thinking: true }
+  Qwen3.5-4B-Instruct:
+    physical: qwen3.5-4b
+    params: { enable_thinking: false }
+
+  GPT-OSS-20B-Low:
+    physical: gpt-oss-20b
+    params: { system_prompt_prefix: "Reasoning: low" }
+  GPT-OSS-20B-Medium:
+    physical: gpt-oss-20b
+    params: { system_prompt_prefix: "Reasoning: medium" }
+  GPT-OSS-20B-High:
+    physical: gpt-oss-20b
+    params: { system_prompt_prefix: "Reasoning: high" }
+
+  GPT-OSS-20B-Uncensored-Low:
+    physical: gpt-oss-20b-uncensored
+    params: { system_prompt_prefix: "Reasoning: low" }
+  GPT-OSS-20B-Uncensored-Medium:
+    physical: gpt-oss-20b-uncensored
+    params: { system_prompt_prefix: "Reasoning: medium" }
+  GPT-OSS-20B-Uncensored-High:
+    physical: gpt-oss-20b-uncensored
+    params: { system_prompt_prefix: "Reasoning: high" }
+
+  cohere-transcribe:
+    physical: cohere-transcribe
+  Chatterbox-Multilingual:
+    physical: chatterbox-multilingual
+  Chatterbox:
+    physical: chatterbox
File diff suppressed because it is too large
Load Diff
@@ -476,7 +476,12 @@ Keys generated at deployment time.
 
 These require human action and cannot be automated:
 
-- DNS setup for kidirekt.kischdle.com (during implementation)
 - HuggingFace terms for cohere-transcribe: accepted 2026-04-03
 - HuggingFace token configured at ~/.cache/huggingface/token (done for user tlg, needs setup for user llm during deployment)
-- Open WebUI admin configuration (connections, audio settings)
+- DNS setup for kidirekt.kischdle.com: done 2026-04-03 (https://kidirekt.kischdle.com → 10.8.0.6, not yet tested)
+
+## Automated Setup Steps
+
+These are handled by llmux during implementation/testing:
+
+- Open WebUI admin configuration (connections, audio settings) — configured via Open WebUI API using admin credentials (Thomas.Langer@destengs.com)
@@ -0,0 +1,630 @@
[DOCUMENT]
TITLE: llmux Product Requirements
VERSION: 1.0
DATE: 2026-04-03

[TEXT]
STATEMENT: >>>
llmux is a single-process FastAPI application that manages multiple AI models on a single GPU (NVIDIA RTX 5070 Ti, 16GB VRAM). It provides an OpenAI-compatible API for chat completions, speech-to-text, and text-to-speech, serving as the unified AI backend for Open WebUI and external clients on the Kischdle on-premise system.
<<<

[[SECTION]]
TITLE: System Architecture

[REQUIREMENT]
UID: LLMUX-ARCH-001
TITLE: Single process design
STATEMENT: >>>
llmux shall be a monolithic FastAPI application where one Python process handles all model loading/unloading, VRAM management, and inference routing.
<<<
RATIONALE: >>>
Keeps the system simple, easy to debug, and gives full control over GPU memory management. The 16GB VRAM constraint means concurrent model usage is limited anyway.
<<<

[REQUIREMENT]
UID: LLMUX-ARCH-002
TITLE: Containerized deployment
STATEMENT: >>>
llmux shall run as a rootless Podman pod (pod name: llmux_pod, container name: llmux_ctr) under the dedicated Linux user llm, managed via systemd user services.
<<<
RATIONALE: >>>
Consistent with the Kischdle microservice architecture where each service runs as a rootless Podman pod under a dedicated user.
<<<

[REQUIREMENT]
UID: LLMUX-ARCH-003
TITLE: Base container image
STATEMENT: >>>
llmux shall use pytorch/pytorch:2.11.0-cuda12.8-cudnn9-runtime as the base container image.
<<<
RATIONALE: >>>
PyTorch 2.7+ with CUDA 12.8+ supports SM12.0 (Blackwell/RTX 5070 Ti). Host driver 590.48 (CUDA 13.1) is backwards compatible. Verified available on Docker Hub.
<<<

[REQUIREMENT]
UID: LLMUX-ARCH-004
TITLE: GPU passthrough
STATEMENT: >>>
The container shall have access to the NVIDIA RTX 5070 Ti GPU via NVIDIA CDI (--device nvidia.com/gpu=all).
<<<

[REQUIREMENT]
UID: LLMUX-ARCH-005
TITLE: Pod creation script
STATEMENT: >>>
A shell script create_pod_llmux.sh shall be provided that creates the Podman pod and enables it as a systemd service, following the Kischdle shell script pattern (create pod, create container, generate systemd units, enable service). The script shall be installed at /home/llm/bin/create_pod_llmux.sh.
<<<
RELATIONS:
- TYPE: Parent
  VALUE: LLMUX-ARCH-002

[[/SECTION]]

[[SECTION]]
TITLE: Inference Runtimes

[REQUIREMENT]
UID: LLMUX-RT-001
TITLE: HuggingFace transformers runtime
STATEMENT: >>>
llmux shall use the HuggingFace transformers library (version >= 5.4.0) as the primary runtime for loading and running inference on HuggingFace safetensors models.
<<<
RATIONALE: >>>
vLLM lacks stable support for SM12.0 (RTX Blackwell consumer GPUs). Specifically, NVFP4 MoE kernels fail on SM12.0 (vllm-project/vllm#33416). vLLM can be reconsidered once SM12.0 support matures.
<<<

[REQUIREMENT]
UID: LLMUX-RT-002
TITLE: llama-cpp-python runtime
STATEMENT: >>>
llmux shall use the llama-cpp-python library (built with CUDA support) for loading and running inference on GGUF format models.
<<<
RATIONALE: >>>
The Qwen3.5-9B-Uncensored model is distributed in GGUF format and requires a llama.cpp compatible runtime.
<<<

[REQUIREMENT]
UID: LLMUX-RT-003
TITLE: Chatterbox runtime
STATEMENT: >>>
llmux shall use the resemble-ai/chatterbox library for text-to-speech inference.
<<<

[[/SECTION]]

[[SECTION]]
TITLE: AI Models

[[SECTION]]
TITLE: Physical Models

[REQUIREMENT]
UID: LLMUX-MDL-001
TITLE: Qwen3.5-9B-FP8
STATEMENT: >>>
llmux shall support the lovedheart/Qwen3.5-9B-FP8 model via the transformers runtime. The model supports vision (image input) and tool/function calling. Estimated VRAM: ~9GB.
<<<

[REQUIREMENT]
UID: LLMUX-MDL-002
TITLE: Qwen3.5-9B-FP8-Uncensored
STATEMENT: >>>
llmux shall support the HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive model via the llama-cpp-python runtime, using the files Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf (main model) and mmproj-Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-BF16.gguf (vision encoder). The model supports vision and tool/function calling. Estimated VRAM: ~9GB.
<<<

[REQUIREMENT]
UID: LLMUX-MDL-003
TITLE: Qwen3.5-4B
STATEMENT: >>>
llmux shall support the Qwen/Qwen3.5-4B model via the transformers runtime. The model supports vision and tool/function calling. Estimated VRAM: ~4GB.
<<<

[REQUIREMENT]
UID: LLMUX-MDL-004
TITLE: gpt-oss-20B
STATEMENT: >>>
llmux shall support the openai/gpt-oss-20b model via the transformers runtime. The model uses MXFP4 quantization on MoE weights and is designed for 16GB VRAM. The model supports tool/function calling but not vision. Estimated VRAM: ~13GB.
<<<

[REQUIREMENT]
UID: LLMUX-MDL-005
TITLE: gpt-oss-20B-uncensored
STATEMENT: >>>
llmux shall support the aoxo/gpt-oss-20b-uncensored model via the transformers runtime. The model supports tool/function calling but not vision. Estimated VRAM: ~13GB.
<<<

[REQUIREMENT]
UID: LLMUX-MDL-006
TITLE: cohere-transcribe ASR
STATEMENT: >>>
llmux shall support the CohereLabs/cohere-transcribe-03-2026 model via the transformers runtime for automatic speech recognition. The model supports English and German. Estimated VRAM: ~4GB.
<<<

[REQUIREMENT]
UID: LLMUX-MDL-007
TITLE: Chatterbox TTS variants
STATEMENT: >>>
llmux shall support three Chatterbox TTS model variants: Chatterbox-Turbo, Chatterbox-Multilingual, and Chatterbox (default). Only one Chatterbox variant shall be loaded in VRAM at a time. Estimated VRAM per variant: ~2GB.
<<<

[[/SECTION]]

[[SECTION]]
TITLE: Virtual Models

[REQUIREMENT]
UID: LLMUX-VMDL-001
TITLE: Virtual model concept
STATEMENT: >>>
llmux shall expose virtual models to API clients. Multiple virtual models may map to the same physical model with different behavior parameters. Switching between virtual models that share a physical model shall have zero VRAM cost.
<<<

[REQUIREMENT]
UID: LLMUX-VMDL-002
TITLE: Qwen3.5 Thinking and Instruct variants
STATEMENT: >>>
For each Qwen3.5 physical model (qwen3.5-9b-fp8, qwen3.5-9b-fp8-uncensored, qwen3.5-4b), llmux shall expose two virtual models: one with Thinking enabled (default Qwen3.5 behavior) and one with Instruct mode (enable_thinking=False for direct response).
<<<
RELATIONS:
- TYPE: Parent
  VALUE: LLMUX-VMDL-001

[REQUIREMENT]
UID: LLMUX-VMDL-003
TITLE: gpt-oss-20B reasoning level variants
STATEMENT: >>>
For each gpt-oss-20b physical model (gpt-oss-20b, gpt-oss-20b-uncensored), llmux shall expose three virtual models corresponding to reasoning levels Low, Medium, and High, implemented by prepending "Reasoning: low/medium/high" to the system prompt.
<<<
RELATIONS:
- TYPE: Parent
  VALUE: LLMUX-VMDL-001

[REQUIREMENT]
UID: LLMUX-VMDL-004
TITLE: Total virtual model count
STATEMENT: >>>
llmux shall expose exactly 16 virtual models: 6 Qwen3.5 variants (3 physical x 2 modes), 6 gpt-oss-20b variants (2 physical x 3 levels), 1 ASR model, and 3 TTS models.
<<<
RELATIONS:
- TYPE: Parent
  VALUE: LLMUX-VMDL-001

[[/SECTION]]

[[/SECTION]]

[[SECTION]]
TITLE: VRAM Management

[REQUIREMENT]
UID: LLMUX-VRAM-001
TITLE: No idle timeout
STATEMENT: >>>
Models shall remain loaded in VRAM indefinitely until eviction is required to load another model. There shall be no idle timeout for unloading models.
<<<

[REQUIREMENT]
UID: LLMUX-VRAM-002
TITLE: Eviction priority order
STATEMENT: >>>
When VRAM is insufficient to load a requested model, llmux shall evict loaded models in the following order (lowest priority evicted first):
1. LLM models (lowest priority, evicted first)
2. TTS models
3. ASR model (highest priority, evicted only as last resort)

llmux shall never evict a higher-priority model to load a lower-priority one (e.g., never evict ASR to make room for TTS; in that case, evict the LLM instead).
<<<

[REQUIREMENT]
UID: LLMUX-VRAM-003
TITLE: Load alongside if VRAM permits
STATEMENT: >>>
If sufficient VRAM is available, llmux shall load a requested model alongside already-loaded models without evicting any model.
<<<

[REQUIREMENT]
UID: LLMUX-VRAM-004
TITLE: One LLM at a time
STATEMENT: >>>
At most one LLM physical model shall be loaded in VRAM at any time.
<<<

[REQUIREMENT]
UID: LLMUX-VRAM-005
TITLE: One TTS variant at a time
STATEMENT: >>>
At most one Chatterbox TTS variant shall be loaded in VRAM at any time. Loading a different TTS variant shall unload the current one.
<<<

[REQUIREMENT]
UID: LLMUX-VRAM-006
TITLE: Concurrency during model swap
STATEMENT: >>>
An asyncio Lock shall ensure only one load/unload operation at a time. Requests arriving during a model swap shall await the lock. Inference requests shall hold a read-lock on their model to prevent eviction mid-inference.
<<<

[[/SECTION]]

[[SECTION]]
TITLE: API

[[SECTION]]
TITLE: Endpoints

[REQUIREMENT]
UID: LLMUX-API-001
TITLE: Listen address
STATEMENT: >>>
llmux shall listen on 127.0.0.1:8081 for all API traffic.
<<<

[REQUIREMENT]
UID: LLMUX-API-002
TITLE: Model listing endpoint
STATEMENT: >>>
llmux shall provide a GET /v1/models endpoint that returns all virtual models in OpenAI format, regardless of which models are currently loaded in VRAM.
<<<

[REQUIREMENT]
UID: LLMUX-API-003
TITLE: Chat completions endpoint
STATEMENT: >>>
llmux shall provide a POST /v1/chat/completions endpoint compatible with the OpenAI chat completions API. It shall accept a model parameter matching a virtual model name, support stream: true for SSE streaming, and pass through tool/function calling for models that support it. The virtual-to-physical mapping and behavior modification (thinking toggle, reasoning system prompt) shall be applied transparently.
<<<

[REQUIREMENT]
UID: LLMUX-API-004
TITLE: Audio transcription endpoint
STATEMENT: >>>
llmux shall provide a POST /v1/audio/transcriptions endpoint compatible with the OpenAI Whisper API. It shall accept multipart form data with an audio file and model parameter. It shall support the language parameter (default "en", also "de"). Supported audio formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, webm.
<<<

[REQUIREMENT]
UID: LLMUX-API-005
TITLE: Text-to-speech endpoint
STATEMENT: >>>
llmux shall provide a POST /v1/audio/speech endpoint compatible with the OpenAI TTS API. It shall accept JSON with model, input (text), and voice parameters. It shall return audio bytes.
<<<

[REQUIREMENT]
UID: LLMUX-API-006
TITLE: Health endpoint
STATEMENT: >>>
llmux shall provide a GET /health endpoint that returns service status and currently loaded models. This endpoint shall not require authentication.
<<<

[[/SECTION]]

[[SECTION]]
TITLE: Authentication

[REQUIREMENT]
UID: LLMUX-AUTH-001
TITLE: API key authentication
STATEMENT: >>>
All /v1/* endpoints shall require a Bearer token in the Authorization header (Authorization: Bearer <api-key>). Requests without a valid API key shall receive HTTP 401.
<<<

[REQUIREMENT]
UID: LLMUX-AUTH-002
TITLE: API key storage
STATEMENT: >>>
API keys shall be stored in a config/api_keys.yaml file mounted read-only into the container. Multiple keys shall be supported (one per client: Open WebUI, remote Whisper clients, OpenCode, etc.). Keys shall be generated at deployment time.
<<<

[REQUIREMENT]
UID: LLMUX-AUTH-003
TITLE: No Traefik authentication
STATEMENT: >>>
Traefik shall act purely as a router. Authentication shall be handled entirely by llmux via API keys.
<<<

[[/SECTION]]

[[/SECTION]]

[[SECTION]]
TITLE: Configuration

[REQUIREMENT]
UID: LLMUX-CFG-001
TITLE: Model registry configuration
STATEMENT: >>>
All physical and virtual model definitions shall be stored in a config/models.yaml file. Physical model entries shall define: type (llm/asr/tts), backend (transformers/llamacpp/chatterbox), model identifier, estimated VRAM in GB, and capability flags (vision, tools). Virtual model entries shall reference a physical model and define behavior parameters.
<<<

[REQUIREMENT]
UID: LLMUX-CFG-002
TITLE: Configuration bind mounts
STATEMENT: >>>
Model weights shall be bind-mounted from /home/llm/.local/share/llmux_pod/models/ to /models (read-only). Configuration files shall be bind-mounted from /home/llm/.local/share/llmux_pod/config/ to /config (read-only).
<<<

[[/SECTION]]

[[SECTION]]
TITLE: Model Downloads

[REQUIREMENT]
UID: LLMUX-DL-001
TITLE: Pre-download all models
STATEMENT: >>>
All model weights shall be pre-downloaded before the pod is created. A scripts/download_models.sh script shall download all models to /home/llm/.local/share/llmux_pod/models/. The script shall be idempotent (skip existing models).
<<<

[REQUIREMENT]
UID: LLMUX-DL-002
TITLE: HuggingFace token requirement
STATEMENT: >>>
The download script shall use a HuggingFace access token (stored at ~/.cache/huggingface/token) for downloading gated models (cohere-transcribe). The token must be configured for user llm during deployment.
<<<

[REQUIREMENT]
UID: LLMUX-DL-003
TITLE: Estimated storage
STATEMENT: >>>
Total estimated model storage is ~60GB. The host has ~1.3TB free on /home, which is sufficient.
<<<

[[/SECTION]]

[[SECTION]]
TITLE: System Integration

[[SECTION]]
TITLE: Open WebUI

[REQUIREMENT]
UID: LLMUX-INT-001
TITLE: Open WebUI connection
STATEMENT: >>>
Open WebUI (user wbg, port 8080) shall be configured with OpenAI API base URL http://127.0.0.1:8081/v1 and the designated API key from api_keys.yaml.
<<<

[REQUIREMENT]
UID: LLMUX-INT-002
TITLE: Open WebUI audio configuration
STATEMENT: >>>
Open WebUI shall be configured with STT engine set to "openai" with base URL http://127.0.0.1:8081/v1 and model "cohere-transcribe", and TTS engine set to "openai" with base URL http://127.0.0.1:8081/v1 and model "Chatterbox-Multilingual".
<<<

[REQUIREMENT]
UID: LLMUX-INT-003
TITLE: Model visibility in Open WebUI
STATEMENT: >>>
All 16 virtual models shall be visible in the Open WebUI model dropdown for user selection. Users shall be able to select any model; llmux handles loading/swapping transparently.
<<<

[[/SECTION]]

[[SECTION]]
TITLE: Traefik

[REQUIREMENT]
UID: LLMUX-INT-004
TITLE: Traefik route for remote access
STATEMENT: >>>
A Traefik dynamic configuration file shall be added at /home/trf/.local/share/traefik_pod/dynamic/llmux.yml, routing the hostname kidirekt.kischdle.com through the WireGuard VPN entry point to http://10.0.2.2:8081.
<<<

[REQUIREMENT]
UID: LLMUX-INT-005
TITLE: DNS setup
STATEMENT: >>>
DNS for kidirekt.kischdle.com shall be configured as a manual step during implementation.
<<<

[[/SECTION]]

[[SECTION]]
TITLE: Systemd

[REQUIREMENT]
UID: LLMUX-INT-006
TITLE: Systemd service lifecycle
STATEMENT: >>>
The llmux pod shall be managed as a systemd user service under user llm. The service shall support start, stop, and restart operations via systemctl --user, and shall survive system reboots.
<<<

[[/SECTION]]

[[/SECTION]]

[[SECTION]]
TITLE: Testing and Verification

[[SECTION]]
TITLE: Phase 1 - System Integration Tests

[TEXT]
STATEMENT: >>>
System integration tests are iterative: issues are fixed before proceeding to the next phase.
<<<

[REQUIREMENT]
UID: LLMUX-TST-001
TITLE: Container build
STATEMENT: >>>
The Dockerfile shall build successfully and the resulting image shall contain all required dependencies (FastAPI, uvicorn, transformers, llama-cpp-python, chatterbox, and supporting libraries).
<<<

[REQUIREMENT]
UID: LLMUX-TST-002
TITLE: GPU passthrough verification
STATEMENT: >>>
nvidia-smi shall execute successfully inside the container and report the RTX 5070 Ti GPU.
<<<

[REQUIREMENT]
UID: LLMUX-TST-003
TITLE: Model mount verification
STATEMENT: >>>
The container shall be able to read model weight files from the /models bind mount.
<<<

[REQUIREMENT]
UID: LLMUX-TST-004
TITLE: Service startup verification
STATEMENT: >>>
llmux shall start inside the pod and port 8081 shall be reachable from the host.
<<<

[REQUIREMENT]
UID: LLMUX-TST-005
TITLE: Open WebUI connection verification
STATEMENT: >>>
Open WebUI shall connect to llmux and the model list shall populate with all 16 virtual models.
<<<

[REQUIREMENT]
UID: LLMUX-TST-006
TITLE: Traefik routing verification
STATEMENT: >>>
When DNS is configured, kidirekt.kischdle.com shall route to llmux through the WireGuard VPN.
<<<

[REQUIREMENT]
UID: LLMUX-TST-007
TITLE: Systemd lifecycle verification
STATEMENT: >>>
systemctl --user start/stop/restart pod-llmux_pod.service shall work cleanly, and the service shall survive reboot.
<<<

[[/SECTION]]

[[SECTION]]
TITLE: Phase 2 - Functional Tests

[REQUIREMENT]
UID: LLMUX-TST-008
TITLE: Authentication test
STATEMENT: >>>
Requests to /v1/* endpoints without a valid API key shall receive HTTP 401 Unauthorized.
<<<

[REQUIREMENT]
UID: LLMUX-TST-009
TITLE: Model listing test
STATEMENT: >>>
GET /v1/models shall return all 16 virtual models in OpenAI format.
<<<

[REQUIREMENT]
UID: LLMUX-TST-010
TITLE: Chat inference test
STATEMENT: >>>
For each physical LLM model, a chat request via Open WebUI as user "try" shall produce a reasonable response. This shall be tested for all virtual model variants: Qwen3.5-9B-FP8 (Thinking + Instruct), Qwen3.5-9B-FP8-Uncensored (Thinking + Instruct), Qwen3.5-4B (Thinking + Instruct), GPT-OSS-20B (Low, Medium, High), GPT-OSS-20B-Uncensored (Low, Medium, High).
<<<

[REQUIREMENT]
UID: LLMUX-TST-011
TITLE: Streaming test
STATEMENT: >>>
Chat responses shall stream token-by-token in Open WebUI, not be delivered as a single block.
<<<

[REQUIREMENT]
UID: LLMUX-TST-012
TITLE: ASR test
STATEMENT: >>>
Open WebUI dictation shall transcribe speech correctly in English and German using cohere-transcribe.
<<<

[REQUIREMENT]
UID: LLMUX-TST-013
TITLE: TTS test
STATEMENT: >>>
Open WebUI audio playback shall produce spoken audio from text using Chatterbox.
<<<

[REQUIREMENT]
UID: LLMUX-TST-014
TITLE: Vision test
STATEMENT: >>>
An image + text prompt shall produce a correct response for each vision-capable model: Qwen3.5-4B, Qwen3.5-9B-FP8, and Qwen3.5-9B-FP8-Uncensored.
<<<

[REQUIREMENT]
UID: LLMUX-TST-015
TITLE: Tool usage test
STATEMENT: >>>
Tool/function calling shall work for each runtime and all tool-capable models: Qwen3.5-9B-FP8 (transformers), Qwen3.5-9B-FP8-Uncensored (llama-cpp-python), GPT-OSS-20B (transformers), GPT-OSS-20B-Uncensored (transformers).
<<<

[[/SECTION]]

[[SECTION]]
TITLE: Phase 3 - VRAM Management Tests
|
||||||
|
|
||||||
|
[REQUIREMENT]
|
||||||
|
UID: LLMUX-TST-016
|
||||||
|
TITLE: Small LLM coexistence test
|
||||||
|
STATEMENT: >>>
|
||||||
|
Loading Qwen3.5-4B (~4GB) shall leave ASR and TTS models loaded in VRAM (~10GB total).
|
||||||
|
<<<
|
||||||
|
|
||||||
|
[REQUIREMENT]
|
||||||
|
UID: LLMUX-TST-017
|
||||||
|
TITLE: Medium LLM coexistence test
|
||||||
|
STATEMENT: >>>
|
||||||
|
Loading Qwen3.5-9B-FP8 (~9GB) shall leave ASR and TTS models loaded in VRAM (~15GB total).
|
||||||
|
<<<
|
||||||
|
|
||||||
|
[REQUIREMENT]
|
||||||
|
UID: LLMUX-TST-018
|
||||||
|
TITLE: Large LLM eviction test
|
||||||
|
STATEMENT: >>>
|
||||||
|
Loading GPT-OSS-20B (~13GB) shall evict ASR and TTS from VRAM. A subsequent ASR request shall evict the LLM first (not attempt to fit alongside it).
|
||||||
|
<<<
|
||||||
|
|
||||||
|
[REQUIREMENT]
|
||||||
|
UID: LLMUX-TST-019
|
||||||
|
TITLE: Model swapping test
|
||||||
|
STATEMENT: >>>
|
||||||
|
Switching between two LLMs in Open WebUI shall result in the second model loading and the first being evicted.
|
||||||
|
<<<
|
||||||
|
|
||||||
|
[[/SECTION]]
|
||||||
|
|
||||||
|
[[SECTION]]
|
||||||
|
TITLE: Phase 4 - Performance Tests
|
||||||
|
|
||||||
|
[REQUIREMENT]
|
||||||
|
UID: LLMUX-TST-020
|
||||||
|
TITLE: Transformers GPU vs CPU performance test
|
||||||
|
STATEMENT: >>>
|
||||||
|
For each transformers-backed physical model (Qwen3.5-9B-FP8, Qwen3.5-4B, GPT-OSS-20B, GPT-OSS-20B-Uncensored, cohere-transcribe), running the same inference on GPU shall be at least 5x faster than on CPU. An admin test endpoint or CLI tool shall be provided to force CPU execution for this test.
|
||||||
|
<<<
|
||||||
|
|
||||||
|
[REQUIREMENT]
|
||||||
|
UID: LLMUX-TST-021
|
||||||
|
TITLE: llama-cpp-python GPU vs CPU performance test
|
||||||
|
STATEMENT: >>>
|
||||||
|
For Qwen3.5-9B-FP8-Uncensored, running inference with n_gpu_layers=-1 (GPU) shall be at least 5x faster than with n_gpu_layers=0 (CPU). The same admin test endpoint shall support this.
|
||||||
|
<<<
|
||||||
|
|
||||||
|
[REQUIREMENT]
|
||||||
|
UID: LLMUX-TST-022
|
||||||
|
TITLE: Chatterbox performance test
|
||||||
|
STATEMENT: >>>
|
||||||
|
TTS synthesis duration shall be reasonable relative to the duration of the generated audio output.
|
||||||
|
<<<
|
||||||
|
|
||||||
|
[[/SECTION]]
|
||||||
|
|
||||||
|
[[/SECTION]]
|
||||||
|
|
||||||
|
[[SECTION]]
|
||||||
|
TITLE: Manual Steps
|
||||||
|
|
||||||
|
[TEXT]
|
||||||
|
STATEMENT: >>>
|
||||||
|
The following steps require human action and cannot be automated:
|
||||||
|
|
||||||
|
- DNS setup for kidirekt.kischdle.com (during implementation)
|
||||||
|
- HuggingFace terms for cohere-transcribe: accepted 2026-04-03
|
||||||
|
- HuggingFace token configuration for user llm during deployment
|
||||||
|
- Open WebUI admin configuration (connections, audio settings)
|
||||||
|
<<<
|
||||||
|
|
||||||
|
[[/SECTION]]
|
||||||
0
kischdle/llmux/llmux/__init__.py
Normal file
0
kischdle/llmux/llmux/__init__.py
Normal file
19
kischdle/llmux/llmux/auth.py
Normal file
19
kischdle/llmux/llmux/auth.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
from fastapi import HTTPException, Request
|
||||||
|
|
||||||
|
from llmux.config import ApiKey
|
||||||
|
|
||||||
|
|
||||||
|
def create_api_key_dependency(api_keys: list[ApiKey]):
    """Build a FastAPI dependency that validates Bearer API keys.

    The returned async callable reads the Authorization header, checks the
    bearer token against the configured keys, and returns the matching
    key's owner name. Any missing, malformed, or unknown credential is
    rejected with HTTP 401.
    """
    lookup = {entry.key: entry.name for entry in api_keys}

    async def require_api_key(request: Request) -> str:
        header = request.headers.get("Authorization", "")
        if not header.startswith("Bearer "):
            raise HTTPException(status_code=401, detail="Missing or malformed Authorization header")
        owner = lookup.get(header[len("Bearer "):])
        if owner is None:
            raise HTTPException(status_code=401, detail="Invalid API key")
        return owner

    return require_api_key
|
||||||
0
kischdle/llmux/llmux/backends/__init__.py
Normal file
0
kischdle/llmux/llmux/backends/__init__.py
Normal file
48
kischdle/llmux/llmux/backends/base.py
Normal file
48
kischdle/llmux/llmux/backends/base.py
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import AsyncIterator
|
||||||
|
|
||||||
|
|
||||||
|
class BaseBackend(ABC):
    """Abstract base for all model backends.

    A backend owns the load/unload lifecycle of one family of models and
    exposes the inference entry points. Chat generation is mandatory for
    every backend; transcribe/synthesize have default implementations that
    raise NotImplementedError and are overridden only by ASR/TTS backends.
    """

    @abstractmethod
    async def load(self, model_id: str, **kwargs) -> None:
        """Load model weights into GPU VRAM.

        Backends accept optional kwargs:
        - device: "cuda" or "cpu" (transformers backends, chatterbox)
        - n_gpu_layers: int (llamacpp backend, -1=all GPU, 0=CPU only)
        """

    @abstractmethod
    async def unload(self, model_id: str) -> None:
        """Unload model weights from GPU VRAM."""

    @abstractmethod
    async def generate(
        self,
        model_id: str,
        messages: list[dict],
        params: dict,
        stream: bool = False,
        tools: list[dict] | None = None,
    ) -> AsyncIterator[str] | dict:
        """Run chat inference.

        Returns a full chat-completion response dict when stream=False, or
        an async iterator of SSE "data: ..." chunk strings when stream=True.
        """

    async def transcribe(
        self,
        model_id: str,
        audio_data: bytes,
        language: str = "en",
    ) -> dict:
        """Transcribe audio. Only implemented by ASR backends."""
        raise NotImplementedError(f"{self.__class__.__name__} does not support transcription")

    async def synthesize(
        self,
        model_id: str,
        text: str,
        voice: str = "default",
    ) -> bytes:
        """Synthesize speech. Only implemented by TTS backends."""
        raise NotImplementedError(f"{self.__class__.__name__} does not support speech synthesis")
|
||||||
81
kischdle/llmux/llmux/backends/chatterbox_tts.py
Normal file
81
kischdle/llmux/llmux/backends/chatterbox_tts.py
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
import asyncio
|
||||||
|
import gc
|
||||||
|
import io
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import soundfile as sf
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from llmux.backends.base import BaseBackend
|
||||||
|
from llmux.config import PhysicalModel
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ChatterboxTTSBackend(BaseBackend):
    """Text-to-speech backend wrapping the Chatterbox TTS models.

    Two variants are supported, selected via PhysicalModel.variant:
    "multilingual" (ChatterboxMultilingualTTS) and the default
    single-language ChatterboxTTS.
    """

    def __init__(self, models_dir: str = "/models"):
        self._models_dir = models_dir
        # model_id -> {"model", "variant", "device"}
        self._loaded: dict[str, dict] = {}

    async def load(self, model_id: str, device: str = "cuda") -> None:
        """Load the Chatterbox model for *model_id* onto *device* (no-op if already loaded)."""
        if model_id in self._loaded:
            return
        physical = _get_physical_config(model_id)
        variant = physical.variant
        logger.info(f"Loading Chatterbox {variant} to {device}")

        def _load():
            # Imports are deferred so this module stays importable without chatterbox.
            if variant == "multilingual":
                from chatterbox import ChatterboxMultilingualTTS
                return ChatterboxMultilingualTTS.from_pretrained(device=device)
            else:
                from chatterbox.tts import ChatterboxTTS
                return ChatterboxTTS.from_pretrained(device=device)

        # Weight loading blocks; run it in the default executor.
        # get_running_loop() is the supported API inside a coroutine
        # (get_event_loop() is deprecated outside a running loop).
        loop = asyncio.get_running_loop()
        model = await loop.run_in_executor(None, _load)
        self._loaded[model_id] = {"model": model, "variant": variant, "device": device}

    async def unload(self, model_id: str) -> None:
        """Drop the model and release its CUDA cache (no-op if not loaded)."""
        if model_id not in self._loaded:
            return
        entry = self._loaded.pop(model_id)
        del entry["model"]
        del entry
        # Collect before empty_cache so dropped tensors are actually freed.
        gc.collect()
        torch.cuda.empty_cache()
        logger.info(f"Unloaded Chatterbox {model_id}")

    async def generate(self, model_id, messages, params, stream=False, tools=None):
        raise NotImplementedError("TTS backend does not support chat generation")

    async def synthesize(self, model_id: str, text: str, voice: str = "default") -> bytes:
        """Synthesize *text* and return an in-memory WAV byte string.

        For the multilingual variant, *voice* doubles as the language id
        ("default" maps to "en"). Raises KeyError if the model is not loaded.
        """
        entry = self._loaded[model_id]
        model = entry["model"]
        variant = entry["variant"]

        def _synthesize():
            if variant == "multilingual":
                # Default to English; voice param could encode language
                lang = "en" if voice == "default" else voice
                wav = model.generate(text, language_id=lang)
            else:
                wav = model.generate(text)
            buf = io.BytesIO()
            # Encode the tensor as a 24 kHz WAV entirely in memory.
            sf.write(buf, wav.cpu().numpy().squeeze(), samplerate=24000, format="WAV")
            buf.seek(0)
            return buf.read()

        loop = asyncio.get_running_loop()
        audio_bytes = await loop.run_in_executor(None, _synthesize)
        return audio_bytes
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level registry of physical model configs, injected via
# set_physical_models() before the backend is used (presumably at app
# startup — the caller is not visible here).
_physical_models: dict[str, PhysicalModel] = {}


def set_physical_models(models: dict[str, PhysicalModel]) -> None:
    """Replace this module's physical-model registry with *models*."""
    global _physical_models
    _physical_models = models


def _get_physical_config(model_id: str) -> PhysicalModel:
    """Look up the PhysicalModel for *model_id*; raises KeyError if unknown."""
    return _physical_models[model_id]
|
||||||
254
kischdle/llmux/llmux/backends/llamacpp.py
Normal file
254
kischdle/llmux/llmux/backends/llamacpp.py
Normal file
@@ -0,0 +1,254 @@
|
|||||||
|
import asyncio
|
||||||
|
import gc
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import AsyncIterator
|
||||||
|
|
||||||
|
from llama_cpp import Llama
|
||||||
|
|
||||||
|
from llmux.backends.base import BaseBackend
|
||||||
|
from llmux.config import PhysicalModel
|
||||||
|
from llmux.harmony import HarmonyStreamFilter, extract_final_text
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class LlamaCppBackend(BaseBackend):
    """LLM backend for GGUF models served through llama-cpp-python.

    A Llama instance must not be used concurrently (the underlying C++
    context is stateful), so every loaded model gets its own asyncio.Lock
    which is held for the whole duration of a generation — including the
    full lifetime of a streaming response.
    """

    def __init__(self, models_dir: str = "/models"):
        self._models_dir = Path(models_dir)
        # model_id -> {"llm", "n_gpu_layers", "think_handler", "no_think_handler"}
        self._loaded: dict[str, dict] = {}
        self._locks: dict[str, asyncio.Lock] = {}  # per-model lock to prevent concurrent C++ access

    def _resolve_gguf_path(self, physical: PhysicalModel, filename: str) -> str:
        """Resolve a GGUF filename — check flat gguf/ dir first, then HF cache."""
        # Check flat gguf/ directory
        flat_path = self._models_dir / "gguf" / filename
        if flat_path.exists():
            return str(flat_path)
        # Fall back to HF cache resolution
        from huggingface_hub import hf_hub_download
        return hf_hub_download(
            repo_id=physical.model_id,
            filename=filename,
            cache_dir=str(self._models_dir),
            local_files_only=True,
        )

    async def load(self, model_id: str, n_gpu_layers: int = -1) -> None:
        """Load the GGUF model (no-op if already loaded).

        n_gpu_layers: -1 offloads all layers to GPU, 0 keeps everything on CPU.
        """
        if model_id in self._loaded:
            return
        physical = _get_physical_config(model_id)
        model_path = self._resolve_gguf_path(physical, physical.model_file)
        logger.info(f"Loading GGUF model {model_path} with n_gpu_layers={n_gpu_layers}")

        def _load():
            kwargs = {
                "model_path": model_path,
                "n_gpu_layers": n_gpu_layers,
                "n_ctx": 4096,
                "flash_attn": True,
                "verbose": False,
            }
            # Vision models ship a separate multimodal projector (mmproj) file.
            if physical.mmproj_file:
                mmproj_path = self._resolve_gguf_path(physical, physical.mmproj_file)
                kwargs["chat_handler"] = _create_vision_handler(mmproj_path)
            llm = Llama(**kwargs)
            return llm

        # NOTE(review): asyncio.get_event_loop() inside a coroutine works but
        # get_running_loop() is the preferred modern API — consider migrating.
        loop = asyncio.get_event_loop()
        llm = await loop.run_in_executor(None, _load)

        # Create thinking-enabled and thinking-disabled chat handlers from Jinja template
        think_handler = _create_think_handler(llm, enable_thinking=True)
        no_think_handler = _create_think_handler(llm, enable_thinking=False)

        self._loaded[model_id] = {
            "llm": llm,
            "n_gpu_layers": n_gpu_layers,
            "think_handler": think_handler,
            "no_think_handler": no_think_handler,
        }
        self._locks[model_id] = asyncio.Lock()

    async def unload(self, model_id: str) -> None:
        """Release the Llama instance and its GPU memory (no-op if not loaded)."""
        if model_id not in self._loaded:
            return
        entry = self._loaded.pop(model_id)
        self._locks.pop(model_id, None)
        # Delete chat handlers first (they hold references to Llama internals)
        entry.pop("think_handler", None)
        entry.pop("no_think_handler", None)
        llm = entry.pop("llm")
        # Close the Llama model to release GGML CUDA memory
        if hasattr(llm, "close"):
            llm.close()
        del llm
        del entry
        gc.collect()
        # Also clear PyTorch cache in case of mixed allocations
        import torch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        logger.info(f"Unloaded GGUF model {model_id}")

    def _select_handler(self, entry, params):
        """Select the correct chat handler based on params.

        Returns the thinking/no-thinking handler when params carries an
        explicit "enable_thinking" flag, else None (keep the model's
        default handler).
        """
        if "enable_thinking" in params:
            if params["enable_thinking"]:
                return entry.get("think_handler")
            else:
                return entry.get("no_think_handler")
        return None

    async def generate(self, model_id, messages, params, stream=False, tools=None):
        """Run chat inference; returns a dict (stream=False) or async SSE generator (stream=True).

        Raises KeyError if the model is not loaded. A "system_prompt_prefix"
        in params is prepended to the existing system message (or inserted
        as a new one) without mutating the caller's message dicts.
        """
        entry = self._loaded[model_id]
        handler = self._select_handler(entry, params)

        effective_messages = list(messages)
        if "system_prompt_prefix" in params:
            prefix = params["system_prompt_prefix"]
            if effective_messages and effective_messages[0].get("role") == "system":
                # Copy the message first so the caller's dict is untouched.
                effective_messages[0] = dict(effective_messages[0])
                effective_messages[0]["content"] = prefix + "\n\n" + effective_messages[0]["content"]
            else:
                effective_messages.insert(0, {"role": "system", "content": prefix})

        if stream:
            return self._stream_generate(entry, effective_messages, model_id, tools, handler)
        else:
            return await self._full_generate(entry, effective_messages, model_id, tools, handler)

    async def _full_generate(self, entry, messages, model_id, tools, handler):
        """Blocking (non-streaming) completion, serialized by the per-model lock."""
        llm = entry["llm"]
        lock = self._locks[model_id]

        def _run():
            kwargs = {"messages": messages, "max_tokens": 4096}
            if tools:
                kwargs["tools"] = tools
            return llm.create_chat_completion(**kwargs)

        async with lock:
            # Temporarily swap in the thinking-mode handler; always restore.
            original = llm.chat_handler
            if handler:
                llm.chat_handler = handler
            try:
                loop = asyncio.get_event_loop()
                result = await loop.run_in_executor(None, _run)
            finally:
                llm.chat_handler = original

        result["model"] = model_id
        # Strip Harmony channel markup so callers see only the final text.
        for choice in result.get("choices", []):
            msg = choice.get("message", {})
            if msg.get("content"):
                msg["content"] = extract_final_text(msg["content"])
        return result

    async def _stream_generate(self, entry, messages, model_id, tools, handler):
        """Async generator yielding SSE chunks; holds the model lock until done."""
        llm = entry["llm"]
        lock = self._locks[model_id]

        # Acquire lock for the entire duration of streaming.
        # This prevents concurrent C++ access which causes segfaults.
        await lock.acquire()

        original = llm.chat_handler
        if handler:
            llm.chat_handler = handler

        try:
            def _run():
                kwargs = {"messages": messages, "max_tokens": 4096, "stream": True}
                if tools:
                    kwargs["tools"] = tools
                return llm.create_chat_completion(**kwargs)

            loop = asyncio.get_event_loop()
            stream = await loop.run_in_executor(None, _run)

            # Filters Harmony channel markup out of the streamed deltas.
            harmony_filter = HarmonyStreamFilter()
            error_msg = None
            try:
                for chunk in stream:
                    chunk["model"] = model_id
                    skip = False
                    for choice in chunk.get("choices", []):
                        delta = choice.get("delta", {})
                        content = delta.get("content")
                        if content is not None:
                            filtered = harmony_filter.feed(content)
                            if not filtered:
                                # Fully swallowed by the filter — drop this chunk.
                                skip = True
                            else:
                                delta["content"] = filtered
                    if skip:
                        continue
                    yield f"data: {json.dumps(chunk)}\n\n"
            except Exception as e:
                # Surface the error to the client below instead of aborting the SSE stream.
                logger.error(f"Stream error for {model_id}: {e}")
                error_msg = str(e)

            # Emit any content the filter was still buffering.
            flushed = harmony_filter.flush()
            if flushed:
                flush_chunk = {
                    "id": f"chatcmpl-{uuid.uuid4().hex[:12]}",
                    "model": model_id,
                    "object": "chat.completion.chunk",
                    "choices": [{"index": 0, "delta": {"content": flushed}, "finish_reason": None}],
                }
                yield f"data: {json.dumps(flush_chunk)}\n\n"

            if error_msg:
                err_chunk = {
                    "id": f"chatcmpl-{uuid.uuid4().hex[:12]}",
                    "model": model_id,
                    "object": "chat.completion.chunk",
                    "choices": [{"index": 0, "delta": {"content": f"\n\n[Error: {error_msg}]"}, "finish_reason": None}],
                }
                yield f"data: {json.dumps(err_chunk)}\n\n"

            yield "data: [DONE]\n\n"
        finally:
            # Restore the model's default handler and release the lock even if
            # the consumer abandons the generator mid-stream.
            llm.chat_handler = original
            lock.release()
|
||||||
|
|
||||||
|
|
||||||
|
def _create_think_handler(llm, enable_thinking: bool):
    """Build a chat handler that forces thinking mode on or off.

    The model's embedded Jinja chat template is prefixed with a
    `{%- set enable_thinking = ... %}` line so it renders in the requested
    mode. Returns None when the model ships no template or the formatter
    cannot be constructed.
    """
    mode = "enabled" if enable_thinking else "disabled"
    try:
        from llama_cpp.llama_chat_format import Jinja2ChatFormatter

        base_template = llm.metadata.get("tokenizer.chat_template", "")
        if not base_template:
            logger.warning("Model has no embedded chat template")
            return None

        flag = "true" if enable_thinking else "false"
        patched_template = "{%- set enable_thinking = " + flag + " %}\n" + base_template

        internal = llm._model
        eos_text = internal.token_get_text(internal.token_eos())
        bos_text = internal.token_get_text(internal.token_bos())
        formatter = Jinja2ChatFormatter(
            template=patched_template,
            eos_token=eos_text,
            bos_token=bos_text,
        )
        chat_handler = formatter.to_chat_handler()
        logger.info(f"Created chat handler with thinking {mode}")
        return chat_handler
    except Exception as e:
        logger.error(f"Failed to create thinking-{mode} handler: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def _create_vision_handler(mmproj_path: str):
    """Return a LLaVA-1.6 chat handler backed by the given CLIP projector file."""
    from llama_cpp.llama_chat_format import Llava16ChatHandler

    handler = Llava16ChatHandler(clip_model_path=mmproj_path)
    return handler
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level registry of physical model configs, injected via
# set_physical_models() before the backend is used (presumably at app
# startup — the caller is not visible here).
_physical_models: dict[str, PhysicalModel] = {}


def set_physical_models(models: dict[str, PhysicalModel]) -> None:
    """Replace this module's physical-model registry with *models*."""
    global _physical_models
    _physical_models = models


def _get_physical_config(model_id: str) -> PhysicalModel:
    """Look up the PhysicalModel for *model_id*; raises KeyError if unknown."""
    return _physical_models[model_id]
|
||||||
73
kischdle/llmux/llmux/backends/transformers_asr.py
Normal file
73
kischdle/llmux/llmux/backends/transformers_asr.py
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
|
||||||
|
|
||||||
|
from llmux.backends.base import BaseBackend
|
||||||
|
from llmux.config import PhysicalModel
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class TransformersASRBackend(BaseBackend):
    """Speech-to-text backend using HuggingFace seq2seq ASR models."""

    def __init__(self, models_dir: str = "/models"):
        self._models_dir = models_dir
        # model_id -> {"model", "processor", "device"}
        self._loaded: dict[str, dict] = {}

    async def load(self, model_id: str, device: str = "cuda") -> None:
        """Load the ASR model and its processor onto *device* (no-op if already loaded)."""
        if model_id in self._loaded:
            return
        physical = _get_physical_config(model_id)
        hf_id = physical.model_id
        logger.info(f"Loading ASR model {hf_id} to {device}")

        def _load():
            processor = AutoProcessor.from_pretrained(hf_id, cache_dir=self._models_dir, trust_remote_code=True)
            model = AutoModelForSpeechSeq2Seq.from_pretrained(hf_id, cache_dir=self._models_dir, torch_dtype="auto", device_map=device, trust_remote_code=True)
            return model, processor

        # Blocking weight load runs in the default executor; get_running_loop()
        # is the supported API inside a coroutine.
        loop = asyncio.get_running_loop()
        model, processor = await loop.run_in_executor(None, _load)
        self._loaded[model_id] = {"model": model, "processor": processor, "device": device}

    async def unload(self, model_id: str) -> None:
        """Drop the model/processor and release CUDA memory (no-op if not loaded)."""
        if model_id not in self._loaded:
            return
        import gc
        entry = self._loaded.pop(model_id)
        del entry["model"]
        del entry["processor"]
        # gc.collect() before empty_cache so the dropped tensors are actually
        # freed — matches the unload behavior of the other backends.
        gc.collect()
        torch.cuda.empty_cache()
        logger.info(f"Unloaded ASR model {model_id}")

    async def generate(self, model_id, messages, params, stream=False, tools=None):
        raise NotImplementedError("ASR backend does not support chat generation")

    async def transcribe(self, model_id: str, audio_data: bytes, language: str = "en") -> dict:
        """Transcribe *audio_data* (an encoded audio file) and return {"text": ...}.

        Raises KeyError if the model is not loaded.
        """
        import io
        import soundfile as sf

        entry = self._loaded[model_id]
        model = entry["model"]
        processor = entry["processor"]

        def _transcribe():
            audio_array, sample_rate = sf.read(io.BytesIO(audio_data))
            inputs = processor(audio_array, sampling_rate=sample_rate, return_tensors="pt", language=language).to(model.device)
            with torch.no_grad():
                predicted_ids = model.generate(**inputs)
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            return transcription

        loop = asyncio.get_running_loop()
        text = await loop.run_in_executor(None, _transcribe)
        return {"text": text}
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level registry of physical model configs, injected via
# set_physical_models() before the backend is used (presumably at app
# startup — the caller is not visible here).
_physical_models: dict[str, PhysicalModel] = {}


def set_physical_models(models: dict[str, PhysicalModel]) -> None:
    """Replace this module's physical-model registry with *models*."""
    global _physical_models
    _physical_models = models


def _get_physical_config(model_id: str) -> PhysicalModel:
    """Look up the PhysicalModel for *model_id*; raises KeyError if unknown."""
    return _physical_models[model_id]
|
||||||
172
kischdle/llmux/llmux/backends/transformers_llm.py
Normal file
172
kischdle/llmux/llmux/backends/transformers_llm.py
Normal file
@@ -0,0 +1,172 @@
|
|||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from typing import AsyncIterator
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor, TextIteratorStreamer
|
||||||
|
from threading import Thread
|
||||||
|
|
||||||
|
from llmux.backends.base import BaseBackend
|
||||||
|
from llmux.config import PhysicalModel
|
||||||
|
from llmux.harmony import HarmonyStreamFilter, extract_final_text
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class TransformersLLMBackend(BaseBackend):
    """Chat LLM backend built on HuggingFace transformers.

    Streaming runs model.generate() on a worker Thread and consumes tokens
    through a TextIteratorStreamer, relaying them as SSE chunks after
    Harmony-channel filtering.
    """

    def __init__(self, models_dir: str = "/models"):
        self._models_dir = models_dir
        self._loaded: dict[str, dict] = {}  # model_id -> {"model", "tokenizer", "processor"}

    async def load(self, model_id: str, device: str = "cuda") -> None:
        """Load model + tokenizer (and processor for vision models); no-op if loaded."""
        if model_id in self._loaded:
            return
        physical = _get_physical_config(model_id)
        hf_id = physical.model_id
        logger.info(f"Loading transformers model {hf_id} to {device}")

        def _load():
            tokenizer = AutoTokenizer.from_pretrained(hf_id, cache_dir=self._models_dir, trust_remote_code=True)
            model = AutoModelForCausalLM.from_pretrained(hf_id, cache_dir=self._models_dir, torch_dtype="auto", device_map=device, trust_remote_code=True)
            processor = None
            if physical.supports_vision:
                # Best effort: vision simply stays disabled if no processor exists.
                try:
                    processor = AutoProcessor.from_pretrained(hf_id, cache_dir=self._models_dir, trust_remote_code=True)
                except Exception:
                    logger.warning(f"No processor found for {hf_id}, vision disabled")
            return model, tokenizer, processor

        # NOTE(review): asyncio.get_event_loop() inside a coroutine works but
        # get_running_loop() is the preferred modern API — consider migrating.
        loop = asyncio.get_event_loop()
        model, tokenizer, processor = await loop.run_in_executor(None, _load)
        self._loaded[model_id] = {"model": model, "tokenizer": tokenizer, "processor": processor, "device": device}

    async def unload(self, model_id: str) -> None:
        """Drop all references and clear the CUDA cache (no-op if not loaded)."""
        if model_id not in self._loaded:
            return
        import gc
        entry = self._loaded.pop(model_id)
        model = entry.pop("model")
        tokenizer = entry.pop("tokenizer")
        processor = entry.pop("processor", None)
        # Drop every reference before collecting so the weights can be freed.
        del model
        del tokenizer
        del processor
        del entry
        gc.collect()
        torch.cuda.empty_cache()
        logger.info(f"Unloaded {model_id}, VRAM freed")

    async def generate(self, model_id, messages, params, stream=False, tools=None):
        """Run chat inference; returns a dict (stream=False) or async SSE generator.

        Raises KeyError if the model is not loaded. params may carry
        "enable_thinking" (forwarded to the chat template) and
        "system_prompt_prefix" (prepended to / inserted as the system message
        without mutating the caller's dicts).
        """
        entry = self._loaded[model_id]
        model = entry["model"]
        tokenizer = entry["tokenizer"]

        # Apply virtual model params
        chat_params = {}
        if "enable_thinking" in params:
            chat_params["enable_thinking"] = params["enable_thinking"]

        # Inject system prompt prefix for gpt-oss reasoning levels
        effective_messages = list(messages)
        if "system_prompt_prefix" in params:
            prefix = params["system_prompt_prefix"]
            if effective_messages and effective_messages[0].get("role") == "system":
                effective_messages[0] = dict(effective_messages[0])
                effective_messages[0]["content"] = prefix + "\n\n" + effective_messages[0]["content"]
            else:
                effective_messages.insert(0, {"role": "system", "content": prefix})

        text = tokenizer.apply_chat_template(effective_messages, tokenize=False, add_generation_prompt=True, tools=tools, **chat_params)
        inputs = tokenizer(text, return_tensors="pt").to(model.device)

        if stream:
            return self._stream_generate(model, tokenizer, inputs, model_id)
        else:
            return await self._full_generate(model, tokenizer, inputs, model_id)

    async def _full_generate(self, model, tokenizer, inputs, model_id):
        """Non-streaming completion returning an OpenAI-style response dict."""
        def _run():
            with torch.no_grad():
                output_ids = model.generate(**inputs, max_new_tokens=4096)
            # Decode only the newly generated tokens, not the echoed prompt.
            new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
            return tokenizer.decode(new_tokens, skip_special_tokens=True)

        loop = asyncio.get_event_loop()
        text = await loop.run_in_executor(None, _run)
        # Strip Harmony channel markup so callers see only the final text.
        text = extract_final_text(text)
        return {
            "id": f"chatcmpl-{uuid.uuid4().hex[:12]}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": model_id,
            "choices": [{"index": 0, "message": {"role": "assistant", "content": text}, "finish_reason": "stop"}],
            "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
        }

    async def _stream_generate(self, model, tokenizer, inputs, model_id):
        """Async generator yielding SSE chunks from a threaded generate() run."""
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        gen_kwargs = {**inputs, "max_new_tokens": 4096, "streamer": streamer}
        # One-slot mutable cell so the worker thread can report its exception.
        gen_error = [None]

        def _run():
            try:
                model.generate(**gen_kwargs)
            except Exception as e:
                gen_error[0] = e
                logger.error(f"Generation error for {model_id}: {e}")

        thread = Thread(target=_run)
        thread.start()

        chat_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
        created = int(time.time())
        loop = asyncio.get_event_loop()
        harmony_filter = HarmonyStreamFilter()

        try:
            while True:
                # next() on the streamer blocks, so pull each token via the executor.
                token = await loop.run_in_executor(None, lambda: next(streamer, None))
                if token is None:
                    break
                filtered = harmony_filter.feed(token)
                if not filtered:
                    continue
                chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {"content": filtered}, "finish_reason": None}]}
                yield f"data: {json.dumps(chunk)}\n\n"
        except Exception as e:
            logger.error(f"Stream iteration error for {model_id}: {e}")
            torch.cuda.empty_cache()

        # Wait for the worker so gen_error is final before we report it.
        thread.join()

        if gen_error[0]:
            torch.cuda.empty_cache()
            error_msg = str(gen_error[0])
            if "out of memory" in error_msg.lower():
                error_msg = "GPU out of memory. Try a shorter message or clear VRAM."
            chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {"content": f"\n\n[Error: {error_msg}]"}, "finish_reason": None}]}
            yield f"data: {json.dumps(chunk)}\n\n"

        # Flush any remaining buffered content
        flushed = harmony_filter.flush()
        if flushed:
            chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {"content": flushed}, "finish_reason": None}]}
            yield f"data: {json.dumps(chunk)}\n\n"

        chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]}
        yield f"data: {json.dumps(chunk)}\n\n"
        yield "data: [DONE]\n\n"
|
||||||
|
|
||||||
|
|
||||||
|
# Physical model config injection
# Module-level table of physical model definitions. It is populated exactly
# once at application startup via set_physical_models() and read afterwards
# by _get_physical_config().
_physical_models: dict[str, PhysicalModel] = {}


def set_physical_models(models: dict[str, PhysicalModel]) -> None:
    """Inject the physical-model table (called from app startup wiring)."""
    global _physical_models
    _physical_models = models
|
||||||
|
|
||||||
|
def _get_physical_config(model_id: str) -> PhysicalModel:
    """Look up an injected PhysicalModel; raises KeyError for unknown ids."""
    return _physical_models[model_id]
|
||||||
79
kischdle/llmux/llmux/config.py
Normal file
79
kischdle/llmux/llmux/config.py
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
import os
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
|
||||||
|
def _config_dir() -> Path:
|
||||||
|
return Path(os.environ.get("LLMUX_CONFIG_DIR", "/config"))
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PhysicalModel:
    """Static description of one deployable model, as declared in models.yaml."""

    type: str  # "llm", "asr", "tts"
    backend: str  # "transformers", "llamacpp", "chatterbox"
    estimated_vram_gb: float  # VRAM budget figure used by VRAMManager accounting
    model_id: str = ""  # presumably a HF-style repo id for transformers backends — verify
    model_file: str = ""  # presumably a GGUF filename for the llamacpp backend — verify
    mmproj_file: str = ""  # presumably a multimodal projector file for vision models — verify
    supports_vision: bool = False  # model accepts image inputs
    supports_tools: bool = False  # model supports tool/function calling
    default_language: str = ""  # default language hint (relevant for ASR models)
    variant: str = ""  # backend-specific variant selector, if any
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class VirtualModel:
    """A user-facing model name: a pointer to a physical model plus parameters."""

    physical: str  # key into the physical-model table
    params: dict = field(default_factory=dict)  # passed to the backend per request
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ApiKey:
    """One API credential entry from api_keys.yaml."""

    key: str  # secret token presented by clients
    name: str  # human-readable label for the key
|
||||||
|
|
||||||
|
|
||||||
|
def load_models_config(
    config_path: Path | None = None,
) -> tuple[dict[str, PhysicalModel], dict[str, VirtualModel]]:
    """Parse models.yaml into (physical, virtual) model tables.

    Falls back to <config dir>/models.yaml when no explicit path is given.
    Raises KeyError if required keys are missing from the YAML document.
    """
    path = config_path if config_path is not None else _config_dir() / "models.yaml"

    with open(path) as fh:
        raw = yaml.safe_load(fh)

    physical = {
        model_id: PhysicalModel(
            type=attrs["type"],
            backend=attrs["backend"],
            estimated_vram_gb=attrs["estimated_vram_gb"],
            model_id=attrs.get("model_id", ""),
            model_file=attrs.get("model_file", ""),
            mmproj_file=attrs.get("mmproj_file", ""),
            supports_vision=attrs.get("supports_vision", False),
            supports_tools=attrs.get("supports_tools", False),
            default_language=attrs.get("default_language", ""),
            variant=attrs.get("variant", ""),
        )
        for model_id, attrs in raw["physical_models"].items()
    }

    virtual = {
        name: VirtualModel(
            physical=attrs["physical"],
            params=attrs.get("params", {}),
        )
        for name, attrs in raw["virtual_models"].items()
    }

    return physical, virtual
|
||||||
|
|
||||||
|
|
||||||
|
def load_api_keys(config_path: Path | None = None) -> list[ApiKey]:
    """Read api_keys.yaml and return the configured API keys.

    Falls back to <config dir>/api_keys.yaml when no explicit path is given.
    """
    path = _config_dir() / "api_keys.yaml" if config_path is None else config_path

    with open(path) as fh:
        raw = yaml.safe_load(fh)

    return [
        ApiKey(key=entry["key"], name=entry["name"])
        for entry in raw["api_keys"]
    ]
|
||||||
90
kischdle/llmux/llmux/harmony.py
Normal file
90
kischdle/llmux/llmux/harmony.py
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
"""Post-processing for GPT-OSS Harmony format responses.
|
||||||
|
|
||||||
|
GPT-OSS models output multi-channel responses with analysis (thinking) and
|
||||||
|
final (user-facing) channels. This module extracts only the final channel.
|
||||||
|
|
||||||
|
Formats seen:
|
||||||
|
llamacpp: <|channel|>analysis<|message|>...<|end|><|start|>assistant<|channel|>final<|message|>Hello!
|
||||||
|
transformers: analysisUser greeting...assistantfinalHello! (special tokens stripped)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Pattern for llamacpp output (special tokens preserved)
_LLAMACPP_FINAL_RE = re.compile(
    r"<\|channel\|>final<\|message\|>(.*?)(?:<\|end\|>|$)",
    re.DOTALL,
)

# Pattern for transformers output (special tokens stripped, leaving text markers)
_TRANSFORMERS_FINAL_RE = re.compile(
    r"assistantfinal(.*?)$",
    re.DOTALL,
)


def extract_final_text(text: str) -> str:
    """Return the final-channel content of *text*.

    Tries the llamacpp marker form first, then the transformers form;
    text without Harmony markers is returned unchanged.
    """
    for pattern in (_LLAMACPP_FINAL_RE, _TRANSFORMERS_FINAL_RE):
        match = pattern.search(text)
        if match is not None:
            return match.group(1).strip()
    return text
|
||||||
|
|
||||||
|
|
||||||
|
class HarmonyStreamFilter:
    """Buffers streaming chunks and emits only the final channel content.

    For streaming, we accumulate text until we detect the final channel marker,
    then start emitting from that point forward. Any content before the marker
    (analysis channel) is silently dropped. Buffering handles markers that are
    split across chunk boundaries.
    """

    # Markers that indicate the start of the final channel; checked in this
    # order, mirroring extract_final_text().
    _LLAMACPP_MARKER = "<|channel|>final<|message|>"
    _TRANSFORMERS_MARKER = "assistantfinal"

    def __init__(self) -> None:
        self._buffer = ""        # text accumulated before a marker is seen
        self._emitting = False   # True once a final-channel marker was found

    def feed(self, chunk: str) -> str:
        """Feed a chunk of streamed text. Returns text to emit (may be empty)."""
        if self._emitting:
            return chunk

        self._buffer += chunk

        # One scan loop replaces the previous copy-pasted per-marker blocks;
        # the write-only self._marker_found attribute has also been dropped.
        for marker in (self._LLAMACPP_MARKER, self._TRANSFORMERS_MARKER):
            idx = self._buffer.find(marker)
            if idx >= 0:
                self._emitting = True
                remainder = self._buffer[idx + len(marker):]
                self._buffer = ""
                return remainder

        # Not found yet — keep buffering, emit nothing.
        return ""

    def flush(self) -> str:
        """Call at end of stream. If no marker was found, return full buffer."""
        if not self._emitting and self._buffer:
            # No Harmony markers found — return unmodified content.
            return self._buffer
        return ""
|
||||||
90
kischdle/llmux/llmux/main.py
Normal file
90
kischdle/llmux/llmux/main.py
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
from fastapi import FastAPI
|
||||||
|
|
||||||
|
from llmux.config import load_models_config, load_api_keys
|
||||||
|
from llmux.auth import create_api_key_dependency
|
||||||
|
from llmux.model_registry import ModelRegistry
|
||||||
|
from llmux.vram_manager import VRAMManager
|
||||||
|
from llmux.backends.transformers_llm import TransformersLLMBackend
|
||||||
|
from llmux.backends.transformers_llm import set_physical_models as set_transformers_llm_models
|
||||||
|
from llmux.backends.transformers_asr import TransformersASRBackend
|
||||||
|
from llmux.backends.transformers_asr import set_physical_models as set_transformers_asr_models
|
||||||
|
from llmux.backends.llamacpp import LlamaCppBackend
|
||||||
|
from llmux.backends.llamacpp import set_physical_models as set_llamacpp_models
|
||||||
|
from llmux.backends.chatterbox_tts import ChatterboxTTSBackend
|
||||||
|
from llmux.backends.chatterbox_tts import set_physical_models as set_chatterbox_models
|
||||||
|
from llmux.routes.models import create_models_router
|
||||||
|
from llmux.routes.chat import create_chat_router
|
||||||
|
from llmux.routes.transcription import create_transcription_router
|
||||||
|
from llmux.routes.speech import create_speech_router
|
||||||
|
from llmux.routes.admin import create_admin_router
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
logger = logging.getLogger(__name__)

# Root directory where model weights live inside the container.
MODELS_DIR = os.environ.get("LLMUX_MODELS_DIR", "/models")

# Load HF token from file if HF_TOKEN_PATH is set and HF_TOKEN is not already set.
# This lets deployments mount the token as a secret file instead of an env var;
# an explicit HF_TOKEN always wins.
_hf_token_path = os.environ.get("HF_TOKEN_PATH")
if _hf_token_path and not os.environ.get("HF_TOKEN"):
    try:
        with open(_hf_token_path) as f:
            os.environ["HF_TOKEN"] = f.read().strip()
        logger.info(f"Loaded HF token from {_hf_token_path}")
    except FileNotFoundError:
        # Best-effort: a missing token file is logged but not fatal.
        logger.warning(f"HF_TOKEN_PATH set but file not found: {_hf_token_path}")

app = FastAPI(title="llmux", version="0.1.0")
|
||||||
|
|
||||||
|
|
||||||
|
# NOTE(review): @app.on_event is deprecated in recent FastAPI releases in
# favor of lifespan context managers — consider migrating when convenient.
@app.on_event("startup")
async def startup():
    """Wire configuration, backends, VRAM management and routers at boot."""
    logger.info("Starting llmux...")

    physical, virtual = load_models_config()
    api_keys = load_api_keys()

    # Inject the physical-model table into each backend module's registry.
    set_transformers_llm_models(physical)
    set_transformers_asr_models(physical)
    set_llamacpp_models(physical)
    set_chatterbox_models(physical)

    registry = ModelRegistry(physical, virtual)
    # assumes a single 16GB GPU — TODO confirm / consider making configurable
    vram_manager = VRAMManager(total_vram_gb=16.0)
    require_api_key = create_api_key_dependency(api_keys)

    transformers_llm = TransformersLLMBackend(models_dir=MODELS_DIR)
    transformers_asr = TransformersASRBackend(models_dir=MODELS_DIR)
    llamacpp = LlamaCppBackend(models_dir=MODELS_DIR)
    chatterbox = ChatterboxTTSBackend(models_dir=MODELS_DIR)

    # Keyed by PhysicalModel.backend, with "transformers_asr" as a synthetic
    # key for ASR models that share the "transformers" backend name (see the
    # per-route backend_key remapping in the routers).
    backends = {
        "transformers": transformers_llm,
        "transformers_asr": transformers_asr,
        "llamacpp": llamacpp,
        "chatterbox": chatterbox,
    }

    # Expose shared services for the /health endpoint.
    app.state.vram_manager = vram_manager
    app.state.registry = registry

    app.include_router(create_models_router(registry, require_api_key))
    app.include_router(create_chat_router(registry, vram_manager, backends, require_api_key))
    app.include_router(create_transcription_router(registry, vram_manager, backends, require_api_key))
    app.include_router(create_speech_router(registry, vram_manager, backends, require_api_key))
    app.include_router(create_admin_router(registry, vram_manager, backends, require_api_key))

    logger.info("llmux started successfully")
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
async def health():
    """Liveness/status endpoint: loaded models and remaining VRAM budget."""
    manager = app.state.vram_manager
    models = {
        model_id: {"type": slot.model_type, "vram_gb": slot.vram_gb}
        for model_id, slot in manager.get_loaded_models().items()
    }
    return {
        "status": "ok",
        "loaded_models": models,
        "available_vram_gb": round(manager.available_vram_gb, 1),
    }
|
||||||
37
kischdle/llmux/llmux/model_registry.py
Normal file
37
kischdle/llmux/llmux/model_registry.py
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
from llmux.config import PhysicalModel, VirtualModel, load_models_config
|
||||||
|
|
||||||
|
|
||||||
|
class ModelRegistry:
    """Maps user-facing virtual model names onto physical model definitions."""

    def __init__(
        self,
        physical: dict[str, PhysicalModel],
        virtual: dict[str, VirtualModel],
    ):
        self._physical = physical
        self._virtual = virtual

    @classmethod
    def from_config(cls) -> "ModelRegistry":
        """Alternate constructor that reads the tables from models.yaml."""
        physical, virtual = load_models_config()
        return cls(physical, virtual)

    def list_virtual_models(self) -> list[dict]:
        """OpenAI-style model entries, restricted to LLM-type models."""
        entries = []
        for name, vm in self._virtual.items():
            if self._physical[vm.physical].type != "llm":
                continue
            entries.append(
                {
                    "id": name,
                    "object": "model",
                    "created": 0,
                    "owned_by": "llmux",
                }
            )
        return entries

    def resolve(self, virtual_name: str) -> tuple[str, PhysicalModel, dict]:
        """Resolve a virtual model name to (physical_id, PhysicalModel, params)."""
        vm = self._virtual[virtual_name]  # KeyError for unknown names
        return vm.physical, self._physical[vm.physical], dict(vm.params)

    def get_physical(self, physical_id: str) -> PhysicalModel:
        """Look up a physical model definition; KeyError for unknown ids."""
        return self._physical[physical_id]
|
||||||
0
kischdle/llmux/llmux/routes/__init__.py
Normal file
0
kischdle/llmux/llmux/routes/__init__.py
Normal file
121
kischdle/llmux/llmux/routes/admin.py
Normal file
121
kischdle/llmux/llmux/routes/admin.py
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, Request
|
||||||
|
from llmux.model_registry import ModelRegistry
|
||||||
|
from llmux.vram_manager import VRAMManager
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Single short prompt used by every LLM performance test below.
TEST_PROMPT = [{"role": "user", "content": "Say hello in one sentence."}]


def create_admin_router(registry, vram_manager, backends, require_api_key):
    """Build the /admin router: VRAM maintenance and performance smoke tests."""
    router = APIRouter()

    @router.post("/admin/clear-vram")
    async def clear_vram(api_key: str = Depends(require_api_key)):
        """Unload all models and clear GPU VRAM."""
        result = await vram_manager.clear_all()
        import torch
        gpu_info = {}
        if torch.cuda.is_available():
            gpu_info = {
                "gpu_memory_used_mb": round(torch.cuda.memory_allocated() / 1024**2, 1),
                "gpu_memory_reserved_mb": round(torch.cuda.memory_reserved() / 1024**2, 1),
            }
        return {**result, **gpu_info}

    @router.post("/admin/test/performance")
    async def test_performance(request: Request, api_key: str = Depends(require_api_key)):
        """Run a GPU-vs-CPU performance check for one physical model."""
        body = await request.json()
        physical_id = body.get("physical_model_id")
        if not physical_id:
            raise HTTPException(status_code=400, detail="Missing 'physical_model_id'")

        # Fix: an unknown model id previously escaped as a raw KeyError
        # (HTTP 500); report 404 like the other routers do.
        try:
            physical = registry.get_physical(physical_id)
        except KeyError:
            raise HTTPException(status_code=404, detail=f"Model '{physical_id}' not found")
        backend_name = physical.backend

        # The "transformers" backend name is shared by LLM and ASR models.
        if backend_name == "transformers" and physical.type == "llm":
            return await _test_transformers_llm(physical_id, backends)
        elif backend_name == "transformers" and physical.type == "asr":
            return await _test_transformers_asr(physical_id, backends)
        elif backend_name == "llamacpp":
            return await _test_llamacpp(physical_id, backends)
        elif backend_name == "chatterbox":
            return await _test_chatterbox(physical_id, backends)
        else:
            raise HTTPException(status_code=400, detail=f"Unknown backend: {backend_name}")

    return router
|
||||||
|
|
||||||
|
|
||||||
|
async def _test_transformers_llm(physical_id, backends):
    """Time one generation on GPU vs CPU; pass when GPU is at least 5x faster."""
    from llmux.backends.transformers_llm import TransformersLLMBackend

    timings = {}
    for label, device in (("gpu", "cuda"), ("cpu", "cpu")):
        # Fresh backend per device so each run loads onto the requested device.
        bench = TransformersLLMBackend(models_dir=backends["transformers"]._models_dir)
        await bench.load(physical_id, device=device)
        t0 = time.monotonic()
        await bench.generate(physical_id, TEST_PROMPT, params={}, stream=False)
        timings[label] = round(time.monotonic() - t0, 2)
        await bench.unload(physical_id)

    gpu_s, cpu_s = timings["gpu"], timings["cpu"]
    speedup = cpu_s / gpu_s if gpu_s > 0 else 0
    return {"model": physical_id, "gpu_seconds": gpu_s, "cpu_seconds": cpu_s, "speedup": round(speedup, 1), "pass": speedup >= 5.0}
|
||||||
|
|
||||||
|
|
||||||
|
async def _test_transformers_asr(physical_id, backends):
    """Transcribe a silent clip on GPU vs CPU; pass when GPU is at least 5x faster."""
    from llmux.backends.transformers_asr import TransformersASRBackend

    silent_wav = _make_silent_wav(duration_seconds=2)
    timings = {}
    for label, device in (("gpu", "cuda"), ("cpu", "cpu")):
        # Fresh backend per device so each run loads onto the requested device.
        bench = TransformersASRBackend(models_dir=backends["transformers_asr"]._models_dir)
        await bench.load(physical_id, device=device)
        t0 = time.monotonic()
        await bench.transcribe(physical_id, silent_wav, language="en")
        timings[label] = round(time.monotonic() - t0, 2)
        await bench.unload(physical_id)

    gpu_s, cpu_s = timings["gpu"], timings["cpu"]
    speedup = cpu_s / gpu_s if gpu_s > 0 else 0
    return {"model": physical_id, "gpu_seconds": gpu_s, "cpu_seconds": cpu_s, "speedup": round(speedup, 1), "pass": speedup >= 5.0}
|
||||||
|
|
||||||
|
|
||||||
|
async def _test_llamacpp(physical_id, backends):
    """Time one generation with all layers on GPU vs none; pass at >=5x speedup."""
    from llmux.backends.llamacpp import LlamaCppBackend

    timings = {}
    # n_gpu_layers=-1 offloads everything; 0 keeps the model fully on CPU.
    for label, n_gpu_layers in (("gpu", -1), ("cpu", 0)):
        bench = LlamaCppBackend(models_dir=backends["llamacpp"]._models_dir)
        await bench.load(physical_id, n_gpu_layers=n_gpu_layers)
        t0 = time.monotonic()
        await bench.generate(physical_id, TEST_PROMPT, params={}, stream=False)
        timings[label] = round(time.monotonic() - t0, 2)
        await bench.unload(physical_id)

    gpu_s, cpu_s = timings["gpu"], timings["cpu"]
    speedup = cpu_s / gpu_s if gpu_s > 0 else 0
    return {"model": physical_id, "gpu_seconds": gpu_s, "cpu_seconds": cpu_s, "speedup": round(speedup, 1), "pass": speedup >= 5.0}
|
||||||
|
|
||||||
|
|
||||||
|
async def _test_chatterbox(physical_id, backends):
    """Synthesize a short utterance and report speed relative to audio length."""
    from llmux.backends.chatterbox_tts import ChatterboxTTSBackend

    bench = ChatterboxTTSBackend(models_dir=backends["chatterbox"]._models_dir)
    await bench.load(physical_id, device="cuda")
    t0 = time.monotonic()
    audio_bytes = await bench.synthesize(physical_id, "Hello, this is a performance test.")
    elapsed = time.monotonic() - t0
    await bench.unload(physical_id)

    # assumes 16-bit mono PCM at 24 kHz after a 44-byte WAV header — TODO
    # confirm against the Chatterbox backend's output format.
    audio_duration = ((len(audio_bytes) - 44) / 2) / 24000
    realtime = round(audio_duration / elapsed, 1) if elapsed > 0 else 0
    return {
        "model": physical_id,
        "synthesis_seconds": round(elapsed, 2),
        "audio_duration_seconds": round(audio_duration, 2),
        "realtime_factor": realtime,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _make_silent_wav(duration_seconds=2, sample_rate=16000):
|
||||||
|
import struct
|
||||||
|
num_samples = int(sample_rate * duration_seconds)
|
||||||
|
data = b"\x00\x00" * num_samples
|
||||||
|
header = struct.pack("<4sI4s4sIHHIIHH4sI", b"RIFF", 36 + len(data), b"WAVE", b"fmt ", 16, 1, 1, sample_rate, sample_rate * 2, 2, 16, b"data", len(data))
|
||||||
|
return header + data
|
||||||
54
kischdle/llmux/llmux/routes/chat.py
Normal file
54
kischdle/llmux/llmux/routes/chat.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
import logging
|
||||||
|
import torch
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, Request
|
||||||
|
from fastapi.responses import StreamingResponse
|
||||||
|
from llmux.model_registry import ModelRegistry
|
||||||
|
from llmux.vram_manager import VRAMManager
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def create_chat_router(registry, vram_manager, backends, require_api_key):
    """Build the OpenAI-compatible /v1/chat/completions router."""
    router = APIRouter()

    @router.post("/v1/chat/completions")
    async def chat_completions(request: Request, api_key: str = Depends(require_api_key)):
        """Chat completions endpoint: resolves the model, ensures it is loaded,
        then dispatches to streaming or non-streaming generation."""
        body = await request.json()
        virtual_name = body.get("model")
        if not virtual_name:
            raise HTTPException(status_code=400, detail="Missing 'model' field")

        try:
            physical_id, physical, params = registry.resolve(virtual_name)
        except KeyError:
            raise HTTPException(status_code=404, detail=f"Model '{virtual_name}' not found")

        # The "transformers" backend name is shared by LLM and ASR models;
        # ASR models are served by the separate "transformers_asr" backend.
        backend_key = physical.backend
        if backend_key == "transformers" and physical.type == "asr":
            backend_key = "transformers_asr"
        backend = backends.get(backend_key)
        if backend is None:
            raise HTTPException(status_code=500, detail=f"No backend for '{physical.backend}'")

        # Ensure the model is resident; may evict other models to free VRAM.
        await vram_manager.load_model(
            model_id=physical_id, model_type=physical.type,
            vram_gb=physical.estimated_vram_gb, backend=backend,
        )

        messages = body.get("messages", [])
        stream = body.get("stream", False)
        tools = body.get("tools")

        try:
            if stream:
                # NOTE(review): OOM raised while the client consumes the
                # stream happens outside this try block — only errors raised
                # before the response starts are converted to 503 here.
                stream_iter = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=True, tools=tools)
                return StreamingResponse(stream_iter, media_type="text/event-stream")

            result = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=False, tools=tools)
            return result
        except torch.cuda.OutOfMemoryError:
            logger.error(f"CUDA OOM during generation with {virtual_name}")
            # Release cached allocations so the next request has a chance.
            torch.cuda.empty_cache()
            raise HTTPException(status_code=503, detail="GPU out of memory. Try a shorter message or switch to a smaller model.")

    return router
|
||||||
12
kischdle/llmux/llmux/routes/models.py
Normal file
12
kischdle/llmux/llmux/routes/models.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
from fastapi import APIRouter, Depends
|
||||||
|
from llmux.model_registry import ModelRegistry
|
||||||
|
|
||||||
|
|
||||||
|
def create_models_router(registry: ModelRegistry, require_api_key) -> APIRouter:
    """Build the /v1/models router (OpenAI-compatible model listing)."""
    router = APIRouter()

    @router.get("/v1/models")
    async def list_models(api_key: str = Depends(require_api_key)):
        """List the virtual LLM models in an OpenAI `list` envelope."""
        data = registry.list_virtual_models()
        return {"object": "list", "data": data}

    return router
|
||||||
57
kischdle/llmux/llmux/routes/speech.py
Normal file
57
kischdle/llmux/llmux/routes/speech.py
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
import logging
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, Request
|
||||||
|
from fastapi.responses import Response
|
||||||
|
from llmux.model_registry import ModelRegistry
|
||||||
|
from llmux.vram_manager import VRAMManager
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def create_speech_router(registry, vram_manager, backends, require_api_key):
    """Build the TTS router: discovery endpoints plus /v1/audio/speech."""
    router = APIRouter()

    @router.get("/v1/audio/models")
    async def list_audio_models():
        """Discovery endpoint for Open WebUI — lists available TTS models."""
        # NOTE(review): reaches into registry._virtual / registry._physical
        # private attributes — consider a public list_tts_models() on
        # ModelRegistry instead.
        tts_models = [
            {"id": name}
            for name, vm in registry._virtual.items()
            if registry._physical[vm.physical].type == "tts"
        ]
        return {"models": tts_models}

    @router.get("/v1/audio/voices")
    async def list_audio_voices():
        """Discovery endpoint for Open WebUI — lists available voices."""
        # Only one built-in voice is exposed.
        return {"voices": [{"id": "default", "name": "Default"}]}

    @router.post("/v1/audio/speech")
    async def create_speech(request: Request, api_key: str = Depends(require_api_key)):
        """OpenAI-compatible speech synthesis endpoint; returns WAV audio."""
        body = await request.json()
        model_name = body.get("model")
        if not model_name:
            raise HTTPException(status_code=400, detail="Missing 'model' field")

        try:
            physical_id, physical, params = registry.resolve(model_name)
        except KeyError:
            raise HTTPException(status_code=404, detail=f"Model '{model_name}' not found")

        if physical.type != "tts":
            raise HTTPException(status_code=400, detail=f"Model '{model_name}' is not a TTS model")

        backend = backends.get(physical.backend)
        if backend is None:
            raise HTTPException(status_code=500, detail=f"No backend for '{physical.backend}'")

        # Ensure the model is resident; may evict other models to free VRAM.
        await vram_manager.load_model(
            model_id=physical_id, model_type=physical.type,
            vram_gb=physical.estimated_vram_gb, backend=backend,
        )

        text = body.get("input", "")
        voice = body.get("voice", "default")
        audio_bytes = await backend.synthesize(model_id=physical_id, text=text, voice=voice)
        return Response(content=audio_bytes, media_type="audio/wav")

    return router
|
||||||
43
kischdle/llmux/llmux/routes/transcription.py
Normal file
43
kischdle/llmux/llmux/routes/transcription.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
import logging
|
||||||
|
from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile
|
||||||
|
from llmux.model_registry import ModelRegistry
|
||||||
|
from llmux.vram_manager import VRAMManager
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def create_transcription_router(registry, vram_manager, backends, require_api_key):
    """Build the OpenAI-compatible /v1/audio/transcriptions router."""
    router = APIRouter()

    @router.post("/v1/audio/transcriptions")
    async def create_transcription(
        file: UploadFile = File(...),
        model: str = Form(...),
        language: str = Form("en"),
        api_key: str = Depends(require_api_key),
    ):
        """Transcribe an uploaded audio file with the requested ASR model."""
        try:
            physical_id, physical, params = registry.resolve(model)
        except KeyError:
            raise HTTPException(status_code=404, detail=f"Model '{model}' not found")

        if physical.type != "asr":
            raise HTTPException(status_code=400, detail=f"Model '{model}' is not an ASR model")

        # physical.type is guaranteed to be "asr" past the guard above, so the
        # previous `and physical.type == "asr"` clause was redundant — the
        # backend name alone selects the ASR variant of the transformers backend.
        backend_key = physical.backend
        if backend_key == "transformers":
            backend_key = "transformers_asr"
        backend = backends.get(backend_key)
        if backend is None:
            raise HTTPException(status_code=500, detail=f"No backend for '{physical.backend}'")

        # Ensure the model is resident; may evict other models to free VRAM.
        await vram_manager.load_model(
            model_id=physical_id, model_type=physical.type,
            vram_gb=physical.estimated_vram_gb, backend=backend,
        )

        audio_data = await file.read()
        result = await backend.transcribe(model_id=physical_id, audio_data=audio_data, language=language)
        return result

    return router
|
||||||
175
kischdle/llmux/llmux/vram_manager.py
Normal file
175
kischdle/llmux/llmux/vram_manager.py
Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
import asyncio
|
||||||
|
import gc
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Eviction priority by model type: lower value is evicted first
# (LLMs are evicted before TTS, TTS before ASR).
_PRIORITY = {"llm": 0, "tts": 1, "asr": 2}


@dataclass
class ModelSlot:
    """Bookkeeping record for one model currently counted against the VRAM budget."""

    model_id: str  # physical model id
    model_type: str  # "llm", "tts" or "asr" — must be a _PRIORITY key
    vram_gb: float  # budgeted VRAM estimate for this model
    backend: object  # backend instance that owns load/unload for this model

    @staticmethod
    def priority_rank(model_type: str) -> int:
        """Eviction priority for a model type, without needing a slot instance."""
        return _PRIORITY[model_type]

    @property
    def priority(self) -> int:
        """Eviction priority of this slot (see _PRIORITY)."""
        return _PRIORITY[self.model_type]
|
||||||
|
|
||||||
|
|
||||||
|
class VRAMManager:
|
||||||
|
def __init__(self, total_vram_gb: float = 16.0, verify_gpu: bool = True):
|
||||||
|
self._total_vram_gb = total_vram_gb
|
||||||
|
self._loaded: dict[str, ModelSlot] = {}
|
||||||
|
self._lock = asyncio.Lock()
|
||||||
|
self._verify_gpu = verify_gpu
|
||||||
|
|
||||||
|
@property
|
||||||
|
def available_vram_gb(self) -> float:
|
||||||
|
used = sum(slot.vram_gb for slot in self._loaded.values())
|
||||||
|
return self._total_vram_gb - used
|
||||||
|
|
||||||
|
def is_loaded(self, model_id: str) -> bool:
|
||||||
|
return model_id in self._loaded
|
||||||
|
|
||||||
|
def get_loaded_models(self) -> dict[str, ModelSlot]:
|
||||||
|
return dict(self._loaded)
|
||||||
|
|
||||||
|
async def clear_all(self) -> dict:
|
||||||
|
"""Unload all models and clear CUDA cache. Returns what was unloaded."""
|
||||||
|
async with self._lock:
|
||||||
|
unloaded = []
|
||||||
|
for slot in list(self._loaded.values()):
|
||||||
|
logger.info(f"Clearing {slot.model_id} ({slot.model_type}, {slot.vram_gb}GB)")
|
||||||
|
await slot.backend.unload(slot.model_id)
|
||||||
|
unloaded.append(slot.model_id)
|
||||||
|
self._loaded.clear()
|
||||||
|
self._force_gpu_cleanup()
|
||||||
|
return {
|
||||||
|
"unloaded": unloaded,
|
||||||
|
"available_vram_gb": round(self.available_vram_gb, 1),
|
||||||
|
}
|
||||||
|
|
||||||
|
async def load_model(self, model_id, model_type, vram_gb, backend):
|
||||||
|
async with self._lock:
|
||||||
|
await self._load_model_locked(model_id, model_type, vram_gb, backend)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _gpu_free_gb() -> float:
|
||||||
|
"""Get actual free GPU memory in GB."""
|
||||||
|
if not torch.cuda.is_available():
|
||||||
|
return 16.0
|
||||||
|
free, _ = torch.cuda.mem_get_info()
|
||||||
|
return free / (1024 ** 3)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _force_gpu_cleanup():
|
||||||
|
"""Force garbage collection and GPU memory release."""
|
||||||
|
gc.collect()
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
async def _load_model_locked(self, model_id, model_type, vram_gb, backend):
|
||||||
|
if model_id in self._loaded:
|
||||||
|
return
|
||||||
|
|
||||||
|
evicted = False
|
||||||
|
if self.available_vram_gb < vram_gb:
|
||||||
|
await self._evict_for(vram_gb, model_type)
|
||||||
|
evicted = True
|
||||||
|
|
||||||
|
if self.available_vram_gb < vram_gb:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Cannot free enough VRAM for {model_id} "
|
||||||
|
f"(need {vram_gb}GB, available {self.available_vram_gb}GB)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# After eviction, verify GPU memory is actually freed.
|
||||||
|
# GGML (llama-cpp) CUDA allocations may take time to release.
|
||||||
|
# Only check when we evicted AND have real GPU AND the model needs >4GB
|
||||||
|
# (small models fit even with overhead; large models are the OOM risk).
|
||||||
|
if evicted and self._verify_gpu and torch.cuda.is_available():
|
||||||
|
self._force_gpu_cleanup()
|
||||||
|
actual_free = self._gpu_free_gb()
|
||||||
|
if actual_free < vram_gb:
|
||||||
|
logger.warning(
|
||||||
|
f"GPU has only {actual_free:.1f}GB free after eviction "
|
||||||
|
f"(need {vram_gb}GB). Waiting for memory release..."
|
||||||
|
)
|
||||||
|
for _ in range(10):
|
||||||
|
await asyncio.sleep(1)
|
||||||
|
self._force_gpu_cleanup()
|
||||||
|
actual_free = self._gpu_free_gb()
|
||||||
|
if actual_free >= vram_gb:
|
||||||
|
break
|
||||||
|
if actual_free < vram_gb:
|
||||||
|
logger.error(
|
||||||
|
f"GPU memory not freed: {actual_free:.1f}GB free, "
|
||||||
|
f"need {vram_gb}GB for {model_id}"
|
||||||
|
)
|
||||||
|
raise RuntimeError(
|
||||||
|
f"GPU memory not freed after eviction: "
|
||||||
|
f"{actual_free:.1f}GB free, need {vram_gb}GB"
|
||||||
|
)
|
||||||
|
logger.info(f"GPU verified: {actual_free:.1f}GB free after eviction")
|
||||||
|
|
||||||
|
logger.info(f"Loading {model_id} ({vram_gb}GB VRAM)")
|
||||||
|
await backend.load(model_id)
|
||||||
|
self._loaded[model_id] = ModelSlot(
|
||||||
|
model_id=model_id,
|
||||||
|
model_type=model_type,
|
||||||
|
vram_gb=vram_gb,
|
||||||
|
backend=backend,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def _evict_for(self, needed_gb, requesting_type):
    """Unload models until at least ``needed_gb`` of VRAM is available.

    Eviction policy (lowest rank goes first: LLM=0, TTS=1, ASR=2): a
    requester never displaces a higher-priority model while a lower- or
    same-priority victim remains.  Only after pass 1 is exhausted and VRAM
    is still short do we cascade upward through higher-priority models as
    a last resort — e.g. an LLM request may ultimately evict TTS and ASR,
    because nothing ranks below it.  Same-rank replacement is allowed.
    """
    requesting_priority = _PRIORITY[requesting_type]

    async def _unload(slot, suffix=""):
        # Same log line as before; suffix marks the last-resort pass.
        logger.info(
            f"Evicting {slot.model_id} ({slot.model_type}, {slot.vram_gb}GB){suffix}"
        )
        await slot.backend.unload(slot.model_id)
        del self._loaded[slot.model_id]

    # Pass 1: only lower- or same-priority models may be displaced.
    for slot in sorted(self._loaded.values(), key=lambda s: s.priority):
        if self.available_vram_gb >= needed_gb:
            return
        if slot.priority <= requesting_priority:
            await _unload(slot)

    # Pass 2: still short — evict remaining models in ascending priority
    # order, regardless of the requester's rank.
    for slot in sorted(self._loaded.values(), key=lambda s: s.priority):
        if self.available_vram_gb >= needed_gb:
            return
        await _unload(slot, " [last resort]")
|
||||||
20
kischdle/llmux/requirements.txt
Normal file
20
kischdle/llmux/requirements.txt
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
# Web framework
|
||||||
|
fastapi>=0.115.0
|
||||||
|
uvicorn[standard]>=0.34.0
|
||||||
|
python-multipart>=0.0.18
|
||||||
|
|
||||||
|
# AI runtimes (torch pre-installed in base image)
|
||||||
|
transformers>=5.4.0
|
||||||
|
chatterbox-tts>=0.1.0
|
||||||
|
|
||||||
|
# Audio processing
|
||||||
|
soundfile>=0.12.0
|
||||||
|
|
||||||
|
# Config & utilities (pyyaml pre-installed in base image)
|
||||||
|
sentencepiece>=0.2.0
|
||||||
|
protobuf>=5.0.0
|
||||||
|
|
||||||
|
# Testing (only needed for development)
|
||||||
|
pytest>=8.0.0
|
||||||
|
pytest-asyncio>=0.24.0
|
||||||
|
httpx>=0.28.0
|
||||||
87
kischdle/llmux/scripts/create_pod_llmux.sh
Executable file
87
kischdle/llmux/scripts/create_pod_llmux.sh
Executable file
@@ -0,0 +1,87 @@
|
|||||||
|
#!/bin/bash
# Create the llmux Podman pod and systemd service.
# Run as user llm: bash scripts/create_pod_llmux.sh

set -euo pipefail

POD_NAME="llmux_pod"
CTR_NAME="llmux_ctr"
IMAGE="localhost/llmux:latest"
# Bind to loopback only — the service is not meant to be reachable externally.
PORT="127.0.0.1:8081:8081"
BIND_DIR="$HOME/.local/share/${POD_NAME}"
USER_SYSTEMD_DIR="$HOME/.config/systemd/user"

MODELS_DIR="${BIND_DIR}/models"
CONFIG_DIR="${BIND_DIR}/config"

# Preconditions: model weights and both config files must already exist.
if [ ! -d "$MODELS_DIR" ]; then
    echo "ERROR: Models directory not found: $MODELS_DIR"
    echo "Run download_models.sh first."
    exit 1
fi

if [ ! -f "$CONFIG_DIR/models.yaml" ]; then
    echo "ERROR: Config not found: $CONFIG_DIR/models.yaml"
    exit 1
fi

if [ ! -f "$CONFIG_DIR/api_keys.yaml" ]; then
    echo "ERROR: Config not found: $CONFIG_DIR/api_keys.yaml"
    exit 1
fi

mkdir -p "$USER_SYSTEMD_DIR"

# Source tree used to build the image when it is missing locally.
LLMUX_SRC="${LLMUX_SRC:-/home/tlg/swd/kischdle/llmux}"

if ! podman image exists "$IMAGE"; then
    echo "Building container image from $LLMUX_SRC ..."
    if [ ! -f "$LLMUX_SRC/Dockerfile" ]; then
        echo "ERROR: Dockerfile not found at $LLMUX_SRC/Dockerfile"
        echo "Set LLMUX_SRC to the llmux source directory."
        exit 1
    fi
    podman build -t llmux:latest -f "$LLMUX_SRC/Dockerfile" "$LLMUX_SRC"
fi

# Tear down any previous instance; "|| true" keeps set -e from aborting
# when the pod does not exist yet.
podman pod exists "$POD_NAME" && podman pod stop "$POD_NAME" 2>/dev/null || true
podman pod exists "$POD_NAME" && podman pod rm -f "$POD_NAME" 2>/dev/null || true

echo "Creating pod $POD_NAME..."
podman pod create --name "$POD_NAME" -p "$PORT"

echo "Creating container $CTR_NAME..."
# Read-only mounts: the service must never mutate models or config.
podman run -d \
    --name "$CTR_NAME" \
    --pod "$POD_NAME" \
    --device nvidia.com/gpu=all \
    -v "${MODELS_DIR}:/models:ro" \
    -v "${CONFIG_DIR}:/config:ro" \
    -e LLMUX_CONFIG_DIR=/config \
    -e LLMUX_MODELS_DIR=/models \
    "$IMAGE"

echo "Waiting for llmux to start..."
# Poll /health for up to ~60s.  NOTE(review): the script continues even if
# the service never becomes healthy — systemd takes over either way.
for i in $(seq 1 30); do
    if curl -sf http://127.0.0.1:8081/health > /dev/null 2>&1; then
        echo "llmux is healthy!"
        break
    fi
    sleep 2
done

echo "Generating systemd units..."
# NOTE(review): `podman generate systemd` is deprecated in newer Podman in
# favour of Quadlet — confirm the installed version still supports it.
cd "$USER_SYSTEMD_DIR"
podman generate systemd --files --new --name "$POD_NAME"

# The generated --new units recreate the pod themselves, so the manually
# created instance is removed before systemd takes ownership.
podman pod stop "$POD_NAME"
podman pod rm -f "$POD_NAME"

systemctl --user daemon-reload
systemctl --user enable --now "pod-${POD_NAME}.service"

echo ""
echo "=== llmux pod created and enabled ==="
echo "Service: systemctl --user status pod-${POD_NAME}.service"
echo "Health:  curl http://127.0.0.1:8081/health"
echo "Logs:    journalctl --user -u pod-${POD_NAME}.service -f"
|
||||||
83
kischdle/llmux/scripts/download_models.sh
Executable file
83
kischdle/llmux/scripts/download_models.sh
Executable file
@@ -0,0 +1,83 @@
|
|||||||
|
#!/bin/bash
# Download all model weights for llmux.
# Run as user llm: bash scripts/download_models.sh
# Requires: HuggingFace token at ~/.cache/huggingface/token for gated models

set -euo pipefail

# Use llm user's venv for huggingface-cli and python
VENV="${LLMUX_VENV:-$HOME/.venv-pytorch}"
HF_CLI="${VENV}/bin/huggingface-cli"
PYTHON="${VENV}/bin/python"

if [ ! -x "$HF_CLI" ]; then
    echo "ERROR: huggingface-cli not found at $HF_CLI"
    echo "Install with: ${VENV}/bin/pip install huggingface_hub"
    exit 1
fi

MODELS_DIR="${LLMUX_MODELS_DIR:-$HOME/.local/share/llmux_pod/models}"
mkdir -p "$MODELS_DIR"

echo "=== Downloading models to $MODELS_DIR ==="
echo "Using: $HF_CLI"

# Download a full HF repo unless its hub-cache directory already exists.
# The skip check mirrors huggingface_hub's "models--org-name" layout.
download_hf() {
    local repo="$1"
    local target="$MODELS_DIR/models--${repo//\//-}"
    if [ -d "$target" ]; then
        echo "SKIP: $repo (already downloaded)"
        return
    fi
    echo "Downloading: $repo"
    "$HF_CLI" download "$repo" --cache-dir "$MODELS_DIR"
}

# Download only the named files from a repo (no already-downloaded check —
# the hub CLI itself skips files that are present).
download_hf_files() {
    local repo="$1"
    shift
    echo "Downloading specific files from: $repo"
    "$HF_CLI" download "$repo" "$@" --cache-dir "$MODELS_DIR"
}

# 1. Qwen3.5-9B-FP8
download_hf "lovedheart/Qwen3.5-9B-FP8"

# 2. Qwen3.5-9B-FP8-Uncensored (GGUF files only)
download_hf_files "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive" \
    "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf" \
    "mmproj-Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-BF16.gguf"

# 3. Qwen3.5-4B
download_hf "Qwen/Qwen3.5-4B"

# 4. gpt-oss-20b
download_hf "openai/gpt-oss-20b"

# 5. gpt-oss-20b-uncensored
download_hf "aoxo/gpt-oss-20b-uncensored"

# 6. cohere-transcribe (gated — requires accepted terms)
echo "Downloading: CohereLabs/cohere-transcribe-03-2026 (gated)"
download_hf "CohereLabs/cohere-transcribe-03-2026" || \
    echo "WARNING: cohere-transcribe download failed. Have you accepted the terms at https://huggingface.co/CohereLabs/cohere-transcribe-03-2026 ?"

# 7. Chatterbox TTS
# Chatterbox fetches its own weights on first use; CUDA_VISIBLE_DEVICES is
# blanked so the download runs without touching the GPU.
echo "Downloading: Chatterbox TTS weights (auto-downloaded by library)"
"$PYTHON" -c "
from chatterbox.tts import ChatterboxTTS
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''
print('Downloading Chatterbox default...')
ChatterboxTTS.from_pretrained(device='cpu')
print('Downloading Chatterbox turbo...')
ChatterboxTTS.from_pretrained(device='cpu', variant='turbo')
print('Downloading Chatterbox multilingual...')
ChatterboxTTS.from_pretrained(device='cpu', variant='multilingual')
print('Chatterbox downloads complete.')
" || echo "WARNING: Chatterbox download failed. Check chatterbox-tts installation."

echo ""
echo "=== Download complete ==="
echo "Models directory: $MODELS_DIR"
du -sh "$MODELS_DIR"
|
||||||
224
kischdle/llmux/scripts/perf_test.py
Normal file
224
kischdle/llmux/scripts/perf_test.py
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Performance test for llmux — measures TTFT, tok/s, and total latency for each LLM model."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
BASE_URL = "http://127.0.0.1:8081"
# NOTE(review): live bearer token committed to source — acceptable only for a
# loopback-only dev box; rotate it if this repo is ever shared or published.
API_KEY = "sk-llmux-openwebui-hMD6pAka1czM53MtTkmmlFP8tF5zuiiDRgt-PCBnj-c"
HEADERS = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}

# Test prompts — short and long to measure different characteristics
PROMPTS = {
    "short": "What is 2+2? Answer in one sentence.",
    "medium": "Explain how a CPU works in 3-4 paragraphs.",
}

# Models to test — one virtual model per physical model (avoid duplicating physical loads)
# Each tuple is (virtual model name, backend label, rough VRAM estimate);
# the last two fields are informational only and are just printed in headers.
TEST_MODELS = [
    # llama-cpp backend (GGUF)
    ("Qwen3.5-9B-FP8-Instruct", "llamacpp", "~10GB"),
    ("GPT-OSS-20B-Uncensored-Low", "llamacpp", "~13GB"),
    # transformers backend
    ("Qwen3.5-4B-Instruct", "transformers", "~4GB"),
    # GPT-OSS-20B-Low disabled: needs libc6-dev sys/ headers for triton MXFP4 kernels
]
|
||||||
|
|
||||||
|
|
||||||
|
def clear_vram():
    """Ask the server to unload every model so the next run starts cold."""
    response = httpx.post(f"{BASE_URL}/admin/clear-vram", headers=HEADERS, timeout=60)
    cleared = response.status_code == 200
    message = "  VRAM cleared" if cleared else f"  WARN: clear-vram returned {response.status_code}"
    print(message)
|
||||||
|
|
||||||
|
|
||||||
|
def test_streaming(model: str, prompt: str, prompt_label: str) -> dict:
    """Test a model with streaming, measuring TTFT and tok/s.

    Returns a result dict with timing fields, or {"error": ...} on failure.
    NOTE(review): token_count increments once per non-empty SSE content
    chunk, which may bundle several model tokens — treat tok_per_s as an
    approximation (the summary table labels this column "Chunks").
    """
    body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
    }

    start = time.perf_counter()
    first_token_time = None  # wall-clock instant of the first content chunk
    token_count = 0
    full_text = []

    try:
        with httpx.stream("POST", f"{BASE_URL}/v1/chat/completions",
                          json=body, headers=HEADERS, timeout=300) as resp:
            if resp.status_code != 200:
                return {"model": model, "prompt": prompt_label, "error": f"HTTP {resp.status_code}"}

            # SSE frames look like "data: {json}"; the stream is terminated
            # by a literal "data: [DONE]" sentinel.
            for line in resp.iter_lines():
                if not line.startswith("data: "):
                    continue
                data = line[6:]
                if data == "[DONE]":
                    break
                try:
                    chunk = json.loads(data)
                    delta = chunk.get("choices", [{}])[0].get("delta", {})
                    content = delta.get("content", "")
                    if content:
                        if first_token_time is None:
                            first_token_time = time.perf_counter()
                        token_count += 1
                        full_text.append(content)
                except json.JSONDecodeError:
                    # Skip malformed frames rather than aborting the run.
                    continue

    except Exception as e:
        return {"model": model, "prompt": prompt_label, "error": str(e)}

    end = time.perf_counter()
    total_time = end - start
    # If no content ever arrived, fall back to total time as TTFT.
    ttft = (first_token_time - start) if first_token_time else total_time

    # Token generation time (after first token)
    gen_time = (end - first_token_time) if first_token_time and token_count > 1 else 0
    tok_per_sec = (token_count - 1) / gen_time if gen_time > 0 else 0

    output_text = "".join(full_text)
    output_chars = len(output_text)

    return {
        "model": model,
        "prompt": prompt_label,
        "ttft_s": round(ttft, 2),
        "total_s": round(total_time, 2),
        "tokens": token_count,
        "tok_per_s": round(tok_per_sec, 1),
        "output_chars": output_chars,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def test_non_streaming(model: str, prompt: str, prompt_label: str) -> dict:
    """Measure end-to-end latency of a single non-streaming completion."""
    base = {"model": model, "prompt": prompt_label, "mode": "non-stream"}
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
    }

    started = time.perf_counter()
    try:
        resp = httpx.post(f"{BASE_URL}/v1/chat/completions",
                          json=payload, headers=HEADERS, timeout=300)
        if resp.status_code != 200:
            return {**base, "error": f"HTTP {resp.status_code}"}
        parsed = resp.json()
    except Exception as exc:
        return {**base, "error": str(exc)}

    elapsed = time.perf_counter() - started
    # Defensive extraction: missing keys yield an empty completion.
    text = parsed.get("choices", [{}])[0].get("message", {}).get("content", "")

    return {**base, "total_s": round(elapsed, 2), "output_chars": len(text)}
|
||||||
|
|
||||||
|
|
||||||
|
def run_tests():
    """Drive the full perf suite and print two summary tables.

    For every entry in TEST_MODELS: clear VRAM, run each prompt streaming
    twice (cold = includes model load, then warm), then run both prompts
    non-streaming (warm), and clear VRAM again before the next model.
    Returns the list of per-run result dicts.
    """
    print("=" * 80)
    print("llmux Performance Test")
    print("=" * 80)

    # Check health
    try:
        r = httpx.get(f"{BASE_URL}/health", timeout=5)
        health = r.json()
        print(f"Server healthy — available VRAM: {health['available_vram_gb']} GB")
    except Exception as e:
        print(f"ERROR: Server not reachable: {e}")
        sys.exit(1)

    results = []

    for model, backend, vram_est in TEST_MODELS:
        print(f"\n{'─' * 60}")
        print(f"Model: {model} ({backend}, {vram_est})")
        print(f"{'─' * 60}")

        # Clear VRAM before each model to measure cold-start load time
        clear_vram()

        for prompt_label, prompt_text in PROMPTS.items():
            # First run = cold start (includes model loading)
            print(f"  [{prompt_label}] streaming (cold)...", end=" ", flush=True)
            r = test_streaming(model, prompt_text, prompt_label)
            r["cold_start"] = True
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                print(f"TTFT={r['ttft_s']}s total={r['total_s']}s {r['tok_per_s']} tok/s ({r['tokens']} tokens)")

            # Second run = warm (model already loaded)
            print(f"  [{prompt_label}] streaming (warm)...", end=" ", flush=True)
            r = test_streaming(model, prompt_text, prompt_label)
            r["cold_start"] = False
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                print(f"TTFT={r['ttft_s']}s total={r['total_s']}s {r['tok_per_s']} tok/s ({r['tokens']} tokens)")

        # Non-streaming tests (warm)
        for plabel in ["short", "medium"]:
            print(f"  [{plabel}] non-streaming (warm)...", end=" ", flush=True)
            r = test_non_streaming(model, PROMPTS[plabel], plabel)
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
                print(f"total={r['total_s']}s ({r['output_chars']} chars, {chars_per_s} chars/s)")

        # Clear to free VRAM for next model
        clear_vram()

    # Summary table — streaming runs only (non-stream results are skipped).
    print(f"\n{'=' * 90}")
    print("Summary — Streaming")
    print(f"{'=' * 90}")
    print(f"{'Model':<40} {'Prompt':<8} {'Cold':>5} {'TTFT':>7} {'Total':>7} {'Chunks':>7} {'Char/s':>7}")
    print(f"{'-' * 40} {'-' * 8} {'-' * 5} {'-' * 7} {'-' * 7} {'-' * 7} {'-' * 7}")
    for r in results:
        if r.get("mode") == "non-stream":
            continue
        if "error" in r:
            print(f"{r['model']:<40} {r['prompt']:<8} {'':>5} {'ERROR':>7}")
            continue
        cold = "yes" if r.get("cold_start") else "no"
        chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
        print(f"{r['model']:<40} {r['prompt']:<8} {cold:>5} {r['ttft_s']:>6.2f}s {r['total_s']:>6.2f}s {r['tokens']:>7} {chars_per_s:>6.1f}")

    # Second table: non-streaming runs.
    print(f"\n{'=' * 90}")
    print("Summary — Non-streaming")
    print(f"{'=' * 90}")
    print(f"{'Model':<40} {'Prompt':<8} {'Total':>7} {'Chars':>7} {'Char/s':>7}")
    print(f"{'-' * 40} {'-' * 8} {'-' * 7} {'-' * 7} {'-' * 7}")
    for r in results:
        if r.get("mode") != "non-stream":
            continue
        if "error" in r:
            print(f"{r['model']:<40} {r['prompt']:<8} {'ERROR':>7}")
            continue
        chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
        print(f"{r['model']:<40} {r['prompt']:<8} {r['total_s']:>6.2f}s {r['output_chars']:>7} {chars_per_s:>6.1f}")

    return results
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
run_tests()
|
||||||
0
kischdle/llmux/tests/__init__.py
Normal file
0
kischdle/llmux/tests/__init__.py
Normal file
11
kischdle/llmux/tests/conftest.py
Normal file
11
kischdle/llmux/tests/conftest.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Point every test at the project's real config directory (not a tmp one).
@pytest.fixture(autouse=True)
def set_config_dir(monkeypatch):
    """Set LLMUX_CONFIG_DIR to the repo's config directory for every test.

    Autouse, so individual tests never have to request it.  Returns the
    directory so tests that do request the fixture can inspect the path.
    (The previously requested ``tmp_path`` fixture was unused — it only
    created a throwaway temp directory per test, so it has been dropped.)
    """
    config_dir = Path(__file__).parent.parent / "config"
    monkeypatch.setenv("LLMUX_CONFIG_DIR", str(config_dir))
    return config_dir
|
||||||
55
kischdle/llmux/tests/test_auth.py
Normal file
55
kischdle/llmux/tests/test_auth.py
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
import pytest
|
||||||
|
from fastapi import FastAPI, Depends
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
from llmux.auth import create_api_key_dependency
|
||||||
|
from llmux.config import ApiKey
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def app_with_auth():
    """Minimal FastAPI app with one route guarded by the API-key dependency."""
    keys = [
        ApiKey(key="sk-test-valid-key", name="Test"),
        ApiKey(key="sk-test-another-key", name="Another"),
    ]
    require_api_key = create_api_key_dependency(keys)

    app = FastAPI()

    @app.get("/protected")
    def protected(api_key: str = Depends(require_api_key)):
        # The dependency resolves to the matched key's configured name.
        return {"key_name": api_key}

    return app
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def client(app_with_auth):
    """Synchronous test client bound to the auth-protected app."""
    return TestClient(app_with_auth)
|
||||||
|
|
||||||
|
|
||||||
|
def test_valid_key_returns_200(client):
    """A configured key passes auth and resolves to its configured name."""
    resp = client.get("/protected", headers={"Authorization": "Bearer sk-test-valid-key"})
    assert resp.status_code == 200
    assert resp.json()["key_name"] == "Test"
|
||||||
|
|
||||||
|
|
||||||
|
def test_another_valid_key(client):
    """Each configured key maps to its own name, not just the first one."""
    resp = client.get("/protected", headers={"Authorization": "Bearer sk-test-another-key"})
    assert resp.status_code == 200
    assert resp.json()["key_name"] == "Another"
|
||||||
|
|
||||||
|
|
||||||
|
def test_missing_auth_header_returns_401(client):
    """Requests without an Authorization header are rejected."""
    resp = client.get("/protected")
    assert resp.status_code == 401
|
||||||
|
|
||||||
|
|
||||||
|
def test_invalid_key_returns_401(client):
    """An unknown bearer token is rejected."""
    resp = client.get("/protected", headers={"Authorization": "Bearer sk-wrong"})
    assert resp.status_code == 401
|
||||||
|
|
||||||
|
|
||||||
|
def test_malformed_header_returns_401(client):
    """A valid key without the 'Bearer ' scheme prefix is rejected."""
    resp = client.get("/protected", headers={"Authorization": "sk-test-valid-key"})
    assert resp.status_code == 401
|
||||||
56
kischdle/llmux/tests/test_config.py
Normal file
56
kischdle/llmux/tests/test_config.py
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
from llmux.config import load_models_config, load_api_keys, PhysicalModel, VirtualModel
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_models_config_returns_physical_and_virtual():
    """Config loader yields the expected counts of physical/virtual models."""
    physical, virtual = load_models_config()
    assert isinstance(physical, dict)
    assert isinstance(virtual, dict)
    # Counts pinned to the checked-in config/models.yaml.
    assert len(physical) == 8
    assert len(virtual) == 15
|
||||||
|
|
||||||
|
|
||||||
|
def test_physical_model_has_required_fields():
    """A representative physical model exposes all expected fields."""
    physical, _ = load_models_config()
    qwen = physical["qwen3.5-9b-fp8"]
    assert qwen.type == "llm"
    assert qwen.backend == "llamacpp"
    assert qwen.model_id == "unsloth/Qwen3.5-9B-GGUF"
    assert qwen.estimated_vram_gb == 10
    assert qwen.supports_vision is False
    assert qwen.supports_tools is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_physical_model_llamacpp_has_gguf_fields():
    """llama.cpp models carry the GGUF weight and mmproj filenames."""
    physical, _ = load_models_config()
    uncensored = physical["qwen3.5-9b-fp8-uncensored"]
    assert uncensored.backend == "llamacpp"
    assert uncensored.model_file == "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf"
    assert uncensored.mmproj_file == "mmproj-Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-BF16.gguf"
|
||||||
|
|
||||||
|
|
||||||
|
def test_virtual_model_maps_to_physical():
    """A virtual model references its physical model and carries params."""
    _, virtual = load_models_config()
    thinking = virtual["Qwen3.5-9B-FP8-Thinking"]
    assert thinking.physical == "qwen3.5-9b-fp8"
    assert thinking.params == {"enable_thinking": True}
|
||||||
|
|
||||||
|
|
||||||
|
def test_virtual_model_gpt_oss_has_system_prompt():
    """GPT-OSS variants set their reasoning level via a system-prompt prefix."""
    _, virtual = load_models_config()
    low = virtual["GPT-OSS-20B-Low"]
    assert low.physical == "gpt-oss-20b"
    assert low.params == {"system_prompt_prefix": "Reasoning: low"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_virtual_model_without_params():
    """A virtual model with no overrides gets an empty params dict."""
    _, virtual = load_models_config()
    ct = virtual["cohere-transcribe"]
    assert ct.physical == "cohere-transcribe"
    assert ct.params == {}
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_api_keys():
    """API-key loader returns the three configured keys with expected names."""
    keys = load_api_keys()
    assert len(keys) == 3
    assert all(k.key.startswith("sk-llmux-") for k in keys)
    assert {k.name for k in keys} == {"Open WebUI", "Remote Whisper clients", "OpenCode"}
|
||||||
55
kischdle/llmux/tests/test_harmony.py
Normal file
55
kischdle/llmux/tests/test_harmony.py
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
from llmux.harmony import extract_final_text, HarmonyStreamFilter
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_llamacpp_format():
    """Harmony extraction keeps only the final channel (llama.cpp tag form)."""
    text = '<|channel|>analysis<|message|>User greeting. Simple.<|end|><|start|>assistant<|channel|>final<|message|>Hello! How can I help you today?'
    assert extract_final_text(text) == "Hello! How can I help you today?"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_llamacpp_with_end_tag():
    """A trailing <|end|> after the final channel is stripped as well."""
    text = '<|channel|>analysis<|message|>thinking...<|end|><|start|>assistant<|channel|>final<|message|>The answer is 42.<|end|>'
    assert extract_final_text(text) == "The answer is 42."
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_transformers_format():
    """Extraction also handles the bare-word transformers Harmony form."""
    text = 'analysisUser greeting. Just respond friendly.assistantfinalHello! I am doing great.'
    assert extract_final_text(text) == "Hello! I am doing great."
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_non_harmony_passthrough():
    """Text with no Harmony markers is returned unchanged."""
    text = "Hello! I'm doing well, thanks for asking."
    assert extract_final_text(text) == text
|
||||||
|
|
||||||
|
|
||||||
|
def test_stream_filter_llamacpp():
    """Streaming filter drops the analysis channel, keeps final (tag form)."""
    filt = HarmonyStreamFilter()
    chunks = [
        "<|channel|>", "analysis", "<|message|>", "User ", "greeting.",
        "<|end|>", "<|start|>", "assistant", "<|channel|>", "final",
        "<|message|>", "Hello!", " How ", "are you?"
    ]
    emitted = [filt.feed(piece) for piece in chunks]
    emitted.append(filt.flush())
    assert "".join(emitted) == "Hello! How are you?"
|
||||||
|
|
||||||
|
|
||||||
|
def test_stream_filter_transformers():
    """Streaming filter handles the bare-word transformers Harmony form."""
    filt = HarmonyStreamFilter()
    chunks = ["analysis", "User ", "greeting.", "assistant", "final", "Hello!", " Great day!"]
    emitted = [filt.feed(piece) for piece in chunks]
    emitted.append(filt.flush())
    assert "".join(emitted) == "Hello! Great day!"
|
||||||
|
|
||||||
|
|
||||||
|
def test_stream_filter_non_harmony():
    """Plain (non-Harmony) chunks pass through the filter untouched."""
    filt = HarmonyStreamFilter()
    emitted = [filt.feed(piece) for piece in ("Hello", " world", "!")]
    emitted.append(filt.flush())
    assert "".join(emitted) == "Hello world!"
|
||||||
66
kischdle/llmux/tests/test_model_registry.py
Normal file
66
kischdle/llmux/tests/test_model_registry.py
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
import pytest
|
||||||
|
|
||||||
|
from llmux.model_registry import ModelRegistry
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def registry():
    """Registry built from the checked-in config files (see conftest)."""
    return ModelRegistry.from_config()
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_virtual_models(registry):
    """Listing exposes only LLM virtual models, never ASR/TTS entries."""
    models = registry.list_virtual_models()
    assert len(models) == 12  # only LLM models, not ASR/TTS
    names = [m["id"] for m in models]
    assert "Qwen3.5-9B-FP8-Thinking" in names
    assert "GPT-OSS-20B-High" in names
    assert "cohere-transcribe" not in names
    assert "Chatterbox-Multilingual" not in names
|
||||||
|
|
||||||
|
|
||||||
|
def test_virtual_model_openai_format(registry):
    """Listed entries follow the OpenAI /v1/models object shape."""
    models = registry.list_virtual_models()
    m = next(m for m in models if m["id"] == "Qwen3.5-9B-FP8-Thinking")
    assert m["object"] == "model"
    assert m["owned_by"] == "llmux"
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_virtual_to_physical(registry):
    """resolve() returns (physical_id, physical model, virtual params)."""
    physical_id, physical, params = registry.resolve("Qwen3.5-9B-FP8-Thinking")
    assert physical_id == "qwen3.5-9b-fp8"
    assert physical.backend == "llamacpp"
    assert params == {"enable_thinking": True}
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_instruct_variant(registry):
    """The Instruct variant shares the physical model but disables thinking."""
    physical_id, physical, params = registry.resolve("Qwen3.5-9B-FP8-Instruct")
    assert physical_id == "qwen3.5-9b-fp8"
    assert params == {"enable_thinking": False}
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_gpt_oss_reasoning(registry):
    """GPT-OSS reasoning levels resolve to a system-prompt prefix param."""
    physical_id, physical, params = registry.resolve("GPT-OSS-20B-Medium")
    assert physical_id == "gpt-oss-20b"
    assert params == {"system_prompt_prefix": "Reasoning: medium"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_same_physical_for_variants(registry):
    """Variants of one model resolve to the same physical id (one load)."""
    pid1, _, _ = registry.resolve("Qwen3.5-9B-FP8-Thinking")
    pid2, _, _ = registry.resolve("Qwen3.5-9B-FP8-Instruct")
    assert pid1 == pid2
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_unknown_model_raises(registry):
    """Resolving an unconfigured virtual model raises KeyError."""
    with pytest.raises(KeyError):
        registry.resolve("nonexistent-model")
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_physical(registry):
    """Direct physical lookup returns the configured model record."""
    physical = registry.get_physical("qwen3.5-9b-fp8")
    assert physical.type == "llm"
    assert physical.estimated_vram_gb == 10
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_physical_unknown_raises(registry):
    """Unknown physical ids raise KeyError."""
    with pytest.raises(KeyError):
        registry.get_physical("nonexistent")
|
||||||
62
kischdle/llmux/tests/test_routes.py
Normal file
62
kischdle/llmux/tests/test_routes.py
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
import pytest
|
||||||
|
from fastapi import FastAPI
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
from llmux.config import ApiKey
|
||||||
|
from llmux.auth import create_api_key_dependency
|
||||||
|
from llmux.model_registry import ModelRegistry
|
||||||
|
from llmux.vram_manager import VRAMManager
|
||||||
|
from llmux.routes.models import create_models_router
|
||||||
|
|
||||||
|
API_KEY = "sk-test-key"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def registry():
    """Registry built from the checked-in config files (see conftest)."""
    return ModelRegistry.from_config()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def vram_manager():
    """Standalone 16 GB VRAM manager for route wiring."""
    return VRAMManager(total_vram_gb=16.0)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def app(registry, vram_manager):
    """FastAPI app wired with the models router and one test API key.

    NOTE(review): the vram_manager fixture is requested but not used by
    this router — presumably kept for symmetry with other route tests;
    confirm whether it can be dropped.
    """
    keys = [ApiKey(key=API_KEY, name="Test")]
    require_api_key = create_api_key_dependency(keys)
    app = FastAPI()
    app.include_router(create_models_router(registry, require_api_key))
    return app
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def client(app):
    """Synchronous test client bound to the wired app."""
    return TestClient(app)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def auth_headers():
    """Authorization header carrying the test API key."""
    return {"Authorization": f"Bearer {API_KEY}"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_models_returns_only_llm(client, auth_headers):
    """/v1/models returns an OpenAI-style list containing only LLM entries."""
    resp = client.get("/v1/models", headers=auth_headers)
    assert resp.status_code == 200
    body = resp.json()
    assert body["object"] == "list"
    assert len(body["data"]) == 12  # only LLM models
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_models_contains_expected_names(client, auth_headers):
    """Known LLM names appear; ASR/TTS models are excluded from the list."""
    resp = client.get("/v1/models", headers=auth_headers)
    names = [m["id"] for m in resp.json()["data"]]
    assert "Qwen3.5-9B-FP8-Thinking" in names
    assert "GPT-OSS-20B-High" in names
    assert "cohere-transcribe" not in names
    assert "Chatterbox-Multilingual" not in names
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_models_requires_auth(client):
    """/v1/models rejects unauthenticated requests."""
    resp = client.get("/v1/models")
    assert resp.status_code == 401
|
||||||
154
kischdle/llmux/tests/test_vram_manager.py
Normal file
154
kischdle/llmux/tests/test_vram_manager.py
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
import asyncio
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from llmux.vram_manager import VRAMManager, ModelSlot
|
||||||
|
|
||||||
|
|
||||||
|
class FakeBackend:
    """Test double for a model backend; records every load/unload call
    so tests can assert on invocation counts and resident models."""

    def __init__(self):
        # model_id -> True for every model currently "loaded"
        self.loaded = {}
        # running totals of load() / unload() invocations
        self.load_count = 0
        self.unload_count = 0

    async def load(self, model_id: str):
        self.load_count += 1
        self.loaded[model_id] = True

    async def unload(self, model_id: str):
        self.unload_count += 1
        self.loaded.pop(model_id, None)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def manager():
    """VRAMManager for a 16 GB card, with GPU probing disabled for tests."""
    return VRAMManager(total_vram_gb=16.0, verify_gpu=False)
|
||||||
|
|
||||||
|
|
||||||
|
def test_priority_ordering():
    """LLM is cheapest to evict (rank 0); ASR is the most protected (rank 2)."""
    ranks = {kind: ModelSlot.priority_rank(kind) for kind in ("llm", "tts", "asr")}
    assert ranks == {"llm": 0, "tts": 1, "asr": 2}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_load_into_empty_vram(manager):
    """Loading into empty VRAM consumes exactly the model's footprint."""
    fake = FakeBackend()
    await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=fake)
    assert manager.is_loaded("qwen3.5-4b")
    assert manager.available_vram_gb == pytest.approx(12.0)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_load_alongside_when_fits(manager):
    """Two models that fit together stay co-resident; usage is additive."""
    fake = FakeBackend()
    for name, kind in (("cohere-transcribe", "asr"), ("qwen3.5-4b", "llm")):
        await manager.load_model(name, model_type=kind, vram_gb=4.0, backend=fake)
    assert manager.is_loaded("cohere-transcribe")
    assert manager.is_loaded("qwen3.5-4b")
    assert manager.available_vram_gb == pytest.approx(8.0)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_evict_llm_first(manager):
    """Swapping LLMs evicts only the old LLM, never ASR/TTS."""
    fake = FakeBackend()
    for name, kind, size in [
        ("cohere-transcribe", "asr", 4.0),
        ("chatterbox-multilingual", "tts", 2.0),
        ("qwen3.5-4b", "llm", 4.0),
    ]:
        await manager.load_model(name, model_type=kind, vram_gb=size, backend=fake)
    # 10 GB used. Loading the 9B (9 GB) evicts the 4B LLM: ASR+TTS+9B = 15 GB, fits.
    await manager.load_model("qwen3.5-9b-fp8", model_type="llm", vram_gb=9.0, backend=fake)
    assert not manager.is_loaded("qwen3.5-4b")
    for survivor in ("cohere-transcribe", "chatterbox-multilingual", "qwen3.5-9b-fp8"):
        assert manager.is_loaded(survivor)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_evict_cascade_asr_survives(manager):
    """When LLM fits alongside ASR after evicting LLM+TTS, ASR survives."""
    fake = FakeBackend()
    for name, kind, size in [
        ("cohere-transcribe", "asr", 4.0),
        ("chatterbox-multilingual", "tts", 2.0),
        ("qwen3.5-4b", "llm", 4.0),
    ]:
        await manager.load_model(name, model_type=kind, vram_gb=size, backend=fake)
    # 10 GB used; need 12. Evict LLM (free=10) then TTS (free=12); ASR + 12 = 16, fits.
    await manager.load_model("large-llm", model_type="llm", vram_gb=12.0, backend=fake)
    assert not manager.is_loaded("qwen3.5-4b")
    assert not manager.is_loaded("chatterbox-multilingual")
    assert manager.is_loaded("cohere-transcribe")  # ASR survives
    assert manager.is_loaded("large-llm")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_evict_cascade_full_for_huge_llm(manager):
    """When LLM is too large to fit alongside ASR, everything gets evicted."""
    fake = FakeBackend()
    for name, kind, size in [
        ("cohere-transcribe", "asr", 4.0),
        ("chatterbox-multilingual", "tts", 2.0),
        ("qwen3.5-4b", "llm", 4.0),
    ]:
        await manager.load_model(name, model_type=kind, vram_gb=size, backend=fake)
    # 10 GB used; 13 GB needed. Cascade: evict LLM (free=10), TTS (free=12),
    # finally ASR (free=16) so the 20B loads alone.
    await manager.load_model("gpt-oss-20b", model_type="llm", vram_gb=13.0, backend=fake)
    for evicted in ("qwen3.5-4b", "chatterbox-multilingual", "cohere-transcribe"):
        assert not manager.is_loaded(evicted)
    assert manager.is_loaded("gpt-oss-20b")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_tts_cannot_evict_asr(manager):
    """TTS request must not evict ASR — it evicts LLM instead."""
    fake = FakeBackend()
    await manager.load_model("cohere-transcribe", model_type="asr", vram_gb=4.0, backend=fake)
    await manager.load_model("qwen3.5-9b-fp8", model_type="llm", vram_gb=9.0, backend=fake)
    # 13 GB used, 3 GB free; a 2 GB TTS simply fits alongside.
    await manager.load_model("chatterbox", model_type="tts", vram_gb=2.0, backend=fake)
    for resident in ("cohere-transcribe", "qwen3.5-9b-fp8", "chatterbox"):
        assert manager.is_loaded(resident)
    # Replace the TTS with another that needs eviction: 15 GB used, 1 GB free,
    # so the old 2 GB TTS is evicted (free=3) and the new one loads.
    await manager.load_model("chatterbox-ml", model_type="tts", vram_gb=2.0, backend=fake)
    assert manager.is_loaded("cohere-transcribe")  # ASR must survive
    assert manager.is_loaded("chatterbox-ml")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_asr_evicts_llm_not_reversed(manager):
    """When ASR request arrives and LLM is loaded, evict LLM (lower priority)."""
    fake = FakeBackend()
    await manager.load_model("gpt-oss-20b", model_type="llm", vram_gb=13.0, backend=fake)
    # 13 GB used, 3 GB free; ASR needs 4 GB, so the LLM must go.
    await manager.load_model("cohere-transcribe", model_type="asr", vram_gb=4.0, backend=fake)
    assert manager.is_loaded("cohere-transcribe")
    assert not manager.is_loaded("gpt-oss-20b")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_already_loaded_is_noop(manager):
    """Re-requesting an already-loaded model does not hit the backend again."""
    fake = FakeBackend()
    for _ in range(2):
        await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=fake)
    assert fake.load_count == 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_spec_scenario_switch_to_9b(manager):
    """Spec scenario: switching 4B -> 9B keeps ASR/TTS and leaves 1 GB free."""
    fake = FakeBackend()
    for name, kind, size in [
        ("cohere-transcribe", "asr", 4.0),
        ("chatterbox-multilingual", "tts", 2.0),
        ("qwen3.5-4b", "llm", 4.0),
        ("qwen3.5-9b-fp8", "llm", 9.0),
    ]:
        await manager.load_model(name, model_type=kind, vram_gb=size, backend=fake)
    assert manager.is_loaded("cohere-transcribe")
    assert manager.is_loaded("chatterbox-multilingual")
    assert manager.is_loaded("qwen3.5-9b-fp8")
    assert not manager.is_loaded("qwen3.5-4b")
    assert manager.available_vram_gb == pytest.approx(1.0)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_get_loaded_models(manager):
    """get_loaded_models() reports exactly the currently resident models."""
    fake = FakeBackend()
    for name, kind in (("cohere-transcribe", "asr"), ("qwen3.5-4b", "llm")):
        await manager.load_model(name, model_type=kind, vram_gb=4.0, backend=fake)
    resident = manager.get_loaded_models()
    assert set(resident) == {"cohere-transcribe", "qwen3.5-4b"}
|
||||||
Reference in New Issue
Block a user