From cf7c77b3b5976a7182e3d2cc590a2243fa3d6b8bc39bc8b7478a095870673221 Mon Sep 17 00:00:00 2001 From: tlg Date: Fri, 3 Apr 2026 22:43:37 +0200 Subject: [PATCH] Add llmux implementation plan (30 tasks) Covers project scaffolding, config, auth, VRAM manager, all four backends, API routes, Dockerfile, deployment scripts, and four phases of testing (integration, functional, VRAM, performance). Co-Authored-By: Claude Opus 4.6 (1M context) --- .../plans/2026-04-03-llmux-implementation.md | 3195 +++++++++++++++++ 1 file changed, 3195 insertions(+) create mode 100644 kischdle/llmux/docs/superpowers/plans/2026-04-03-llmux-implementation.md diff --git a/kischdle/llmux/docs/superpowers/plans/2026-04-03-llmux-implementation.md b/kischdle/llmux/docs/superpowers/plans/2026-04-03-llmux-implementation.md new file mode 100644 index 0000000..ea5693e --- /dev/null +++ b/kischdle/llmux/docs/superpowers/plans/2026-04-03-llmux-implementation.md @@ -0,0 +1,3195 @@ +# llmux Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build a FastAPI service that manages 9 AI models on a single GPU, exposing an OpenAI-compatible API for chat, ASR, and TTS to Open WebUI and external clients. + +**Architecture:** Single-process Python app with three inference runtimes (transformers, llama-cpp-python, chatterbox), a VRAM manager that handles model loading/eviction by priority (ASR > TTS > LLM), and Bearer token auth. Runs in a rootless Podman container with GPU passthrough. 
+ +**Tech Stack:** Python 3.11+, FastAPI, uvicorn, PyTorch, transformers >=5.4.0, llama-cpp-python, chatterbox, PyYAML, Podman, systemd + +**Spec:** `docs/superpowers/specs/2026-04-03-llmux-design.md` + +--- + +## File Map + +| File | Responsibility | +|------|---------------| +| `llmux/llmux/__init__.py` | Package marker | +| `llmux/llmux/main.py` | FastAPI app, startup/shutdown, /health endpoint | +| `llmux/llmux/auth.py` | API key validation dependency | +| `llmux/llmux/config.py` | Load and validate YAML config files | +| `llmux/llmux/model_registry.py` | Virtual→physical model mapping, behavior params | +| `llmux/llmux/vram_manager.py` | VRAM tracking, load/unload orchestration, eviction | +| `llmux/llmux/routes/__init__.py` | Package marker | +| `llmux/llmux/routes/models.py` | GET /v1/models | +| `llmux/llmux/routes/chat.py` | POST /v1/chat/completions | +| `llmux/llmux/routes/transcription.py` | POST /v1/audio/transcriptions | +| `llmux/llmux/routes/speech.py` | POST /v1/audio/speech | +| `llmux/llmux/routes/admin.py` | POST /admin/test/performance (test-only) | +| `llmux/llmux/backends/__init__.py` | Package marker | +| `llmux/llmux/backends/base.py` | Abstract base class for all backends | +| `llmux/llmux/backends/transformers_llm.py` | HuggingFace transformers for LLM chat + vision + tools | +| `llmux/llmux/backends/transformers_asr.py` | HuggingFace transformers for cohere-transcribe ASR | +| `llmux/llmux/backends/llamacpp.py` | llama-cpp-python for GGUF models | +| `llmux/llmux/backends/chatterbox_tts.py` | Chatterbox TTS | +| `llmux/tests/__init__.py` | Package marker | +| `llmux/tests/test_config.py` | Tests for config loading | +| `llmux/tests/test_auth.py` | Tests for API key auth | +| `llmux/tests/test_model_registry.py` | Tests for virtual→physical mapping | +| `llmux/tests/test_vram_manager.py` | Tests for VRAM eviction logic | +| `llmux/tests/test_routes.py` | Tests for API routes with mocked backends | +| `llmux/tests/conftest.py` | Shared 
pytest fixtures | +| `llmux/Dockerfile` | Container image definition | +| `llmux/requirements.txt` | Python dependencies | +| `llmux/config/models.yaml` | Model registry config | +| `llmux/config/api_keys.yaml` | API key config | +| `llmux/scripts/download_models.sh` | Pre-download model weights | +| `llmux/scripts/create_pod_llmux.sh` | Podman pod creation + systemd setup | + +--- + +### Task 1: Project Scaffolding + +**Files:** +- Create: `llmux/requirements.txt` +- Create: `llmux/config/models.yaml` +- Create: `llmux/config/api_keys.yaml` +- Create: `llmux/llmux/__init__.py` +- Create: `llmux/llmux/routes/__init__.py` +- Create: `llmux/llmux/backends/__init__.py` +- Create: `llmux/tests/__init__.py` +- Create: `llmux/tests/conftest.py` + +- [ ] **Step 1: Create requirements.txt** + +``` +# Web framework +fastapi>=0.115.0 +uvicorn[standard]>=0.34.0 +python-multipart>=0.0.18 + +# AI runtimes +torch>=2.7.0 +transformers>=5.4.0 +llama-cpp-python>=0.3.0 +chatterbox-tts>=0.1.0 + +# Audio processing +soundfile>=0.12.0 +librosa>=0.10.0 + +# Config & utilities +pyyaml>=6.0 +sentencepiece>=0.2.0 +protobuf>=5.0.0 + +# Testing +pytest>=8.0.0 +pytest-asyncio>=0.24.0 +httpx>=0.28.0 +``` + +- [ ] **Step 2: Create config/models.yaml** + +Copy the exact YAML from the spec (section "Configuration Files > config/models.yaml"). This is the full model registry with all 9 physical models and 16 virtual models. 
+ +```yaml +physical_models: + qwen3.5-9b-fp8: + type: llm + backend: transformers + model_id: "lovedheart/Qwen3.5-9B-FP8" + estimated_vram_gb: 9 + supports_vision: true + supports_tools: true + + qwen3.5-9b-fp8-uncensored: + type: llm + backend: llamacpp + model_file: "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf" + mmproj_file: "mmproj-Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-BF16.gguf" + estimated_vram_gb: 9 + supports_vision: true + supports_tools: true + + qwen3.5-4b: + type: llm + backend: transformers + model_id: "Qwen/Qwen3.5-4B" + estimated_vram_gb: 4 + supports_vision: true + supports_tools: true + + gpt-oss-20b: + type: llm + backend: transformers + model_id: "openai/gpt-oss-20b" + estimated_vram_gb: 13 + supports_vision: false + supports_tools: true + + gpt-oss-20b-uncensored: + type: llm + backend: transformers + model_id: "aoxo/gpt-oss-20b-uncensored" + estimated_vram_gb: 13 + supports_vision: false + supports_tools: true + + cohere-transcribe: + type: asr + backend: transformers + model_id: "CohereLabs/cohere-transcribe-03-2026" + estimated_vram_gb: 4 + default_language: "en" + + chatterbox-turbo: + type: tts + backend: chatterbox + variant: "turbo" + estimated_vram_gb: 2 + + chatterbox-multilingual: + type: tts + backend: chatterbox + variant: "multilingual" + estimated_vram_gb: 2 + + chatterbox: + type: tts + backend: chatterbox + variant: "default" + estimated_vram_gb: 2 + +virtual_models: + Qwen3.5-9B-FP8-Thinking: + physical: qwen3.5-9b-fp8 + params: { enable_thinking: true } + Qwen3.5-9B-FP8-Instruct: + physical: qwen3.5-9b-fp8 + params: { enable_thinking: false } + + Qwen3.5-9B-FP8-Uncensored-Thinking: + physical: qwen3.5-9b-fp8-uncensored + params: { enable_thinking: true } + Qwen3.5-9B-FP8-Uncensored-Instruct: + physical: qwen3.5-9b-fp8-uncensored + params: { enable_thinking: false } + + Qwen3.5-4B-Thinking: + physical: qwen3.5-4b + params: { enable_thinking: true } + Qwen3.5-4B-Instruct: + physical: qwen3.5-4b + params: { 
enable_thinking: false } + + GPT-OSS-20B-Low: + physical: gpt-oss-20b + params: { system_prompt_prefix: "Reasoning: low" } + GPT-OSS-20B-Medium: + physical: gpt-oss-20b + params: { system_prompt_prefix: "Reasoning: medium" } + GPT-OSS-20B-High: + physical: gpt-oss-20b + params: { system_prompt_prefix: "Reasoning: high" } + + GPT-OSS-20B-Uncensored-Low: + physical: gpt-oss-20b-uncensored + params: { system_prompt_prefix: "Reasoning: low" } + GPT-OSS-20B-Uncensored-Medium: + physical: gpt-oss-20b-uncensored + params: { system_prompt_prefix: "Reasoning: medium" } + GPT-OSS-20B-Uncensored-High: + physical: gpt-oss-20b-uncensored + params: { system_prompt_prefix: "Reasoning: high" } + + cohere-transcribe: + physical: cohere-transcribe + Chatterbox-Turbo: + physical: chatterbox-turbo + Chatterbox-Multilingual: + physical: chatterbox-multilingual + Chatterbox: + physical: chatterbox +``` + +- [ ] **Step 3: Create config/api_keys.yaml with generated keys** + +Generate three real keys and write the file: + +```python +import secrets +keys = { + "Open WebUI": f"sk-llmux-openwebui-{secrets.token_urlsafe(32)}", + "Remote Whisper clients": f"sk-llmux-whisper-{secrets.token_urlsafe(32)}", + "OpenCode": f"sk-llmux-opencode-{secrets.token_urlsafe(32)}", +} +``` + +```yaml +api_keys: + - key: "" + name: "Open WebUI" + - key: "" + name: "Remote Whisper clients" + - key: "" + name: "OpenCode" +``` + +- [ ] **Step 4: Create package __init__.py files and conftest.py** + +`llmux/llmux/__init__.py`, `llmux/llmux/routes/__init__.py`, `llmux/llmux/backends/__init__.py`, `llmux/tests/__init__.py` — all empty files. 
+ +`llmux/tests/conftest.py`: + +```python +import os +import pytest +from pathlib import Path + +# Point config to the project's config directory for tests +@pytest.fixture(autouse=True) +def set_config_dir(tmp_path, monkeypatch): + """Use the project's config files for tests by default.""" + config_dir = Path(__file__).parent.parent / "config" + monkeypatch.setenv("LLMUX_CONFIG_DIR", str(config_dir)) + return config_dir +``` + +- [ ] **Step 5: Commit** + +```bash +git add llmux/requirements.txt llmux/config/ llmux/llmux/__init__.py \ + llmux/llmux/routes/__init__.py llmux/llmux/backends/__init__.py \ + llmux/tests/__init__.py llmux/tests/conftest.py +git commit -m "feat: project scaffolding with config files and test fixtures" +``` + +--- + +### Task 2: Config Loading + +**Files:** +- Create: `llmux/llmux/config.py` +- Create: `llmux/tests/test_config.py` + +- [ ] **Step 1: Write the failing tests** + +`llmux/tests/test_config.py`: + +```python +from llmux.config import load_models_config, load_api_keys, PhysicalModel, VirtualModel + + +def test_load_models_config_returns_physical_and_virtual(): + physical, virtual = load_models_config() + assert isinstance(physical, dict) + assert isinstance(virtual, dict) + assert len(physical) == 9 + assert len(virtual) == 16 + + +def test_physical_model_has_required_fields(): + physical, _ = load_models_config() + qwen = physical["qwen3.5-9b-fp8"] + assert qwen.type == "llm" + assert qwen.backend == "transformers" + assert qwen.model_id == "lovedheart/Qwen3.5-9B-FP8" + assert qwen.estimated_vram_gb == 9 + assert qwen.supports_vision is True + assert qwen.supports_tools is True + + +def test_physical_model_llamacpp_has_gguf_fields(): + physical, _ = load_models_config() + uncensored = physical["qwen3.5-9b-fp8-uncensored"] + assert uncensored.backend == "llamacpp" + assert uncensored.model_file == "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf" + assert uncensored.mmproj_file == 
"mmproj-Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-BF16.gguf" + + +def test_virtual_model_maps_to_physical(): + _, virtual = load_models_config() + thinking = virtual["Qwen3.5-9B-FP8-Thinking"] + assert thinking.physical == "qwen3.5-9b-fp8" + assert thinking.params == {"enable_thinking": True} + + +def test_virtual_model_gpt_oss_has_system_prompt(): + _, virtual = load_models_config() + low = virtual["GPT-OSS-20B-Low"] + assert low.physical == "gpt-oss-20b" + assert low.params == {"system_prompt_prefix": "Reasoning: low"} + + +def test_virtual_model_without_params(): + _, virtual = load_models_config() + ct = virtual["cohere-transcribe"] + assert ct.physical == "cohere-transcribe" + assert ct.params == {} + + +def test_load_api_keys(): + keys = load_api_keys() + assert len(keys) == 3 + assert all(k.key.startswith("sk-llmux-") for k in keys) + assert {k.name for k in keys} == {"Open WebUI", "Remote Whisper clients", "OpenCode"} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd llmux && python -m pytest tests/test_config.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'llmux.config'` + +- [ ] **Step 3: Implement config.py** + +`llmux/llmux/config.py`: + +```python +import os +from dataclasses import dataclass, field +from pathlib import Path + +import yaml + + +def _config_dir() -> Path: + return Path(os.environ.get("LLMUX_CONFIG_DIR", "/config")) + + +@dataclass +class PhysicalModel: + type: str # "llm", "asr", "tts" + backend: str # "transformers", "llamacpp", "chatterbox" + estimated_vram_gb: float + model_id: str = "" + model_file: str = "" + mmproj_file: str = "" + supports_vision: bool = False + supports_tools: bool = False + default_language: str = "" + variant: str = "" + + +@dataclass +class VirtualModel: + physical: str + params: dict = field(default_factory=dict) + + +@dataclass +class ApiKey: + key: str + name: str + + +def load_models_config( + config_path: Path | None = None, +) -> tuple[dict[str, PhysicalModel], 
dict[str, VirtualModel]]: + if config_path is None: + config_path = _config_dir() / "models.yaml" + + with open(config_path) as f: + raw = yaml.safe_load(f) + + physical: dict[str, PhysicalModel] = {} + for model_id, attrs in raw["physical_models"].items(): + physical[model_id] = PhysicalModel( + type=attrs["type"], + backend=attrs["backend"], + estimated_vram_gb=attrs["estimated_vram_gb"], + model_id=attrs.get("model_id", ""), + model_file=attrs.get("model_file", ""), + mmproj_file=attrs.get("mmproj_file", ""), + supports_vision=attrs.get("supports_vision", False), + supports_tools=attrs.get("supports_tools", False), + default_language=attrs.get("default_language", ""), + variant=attrs.get("variant", ""), + ) + + virtual: dict[str, VirtualModel] = {} + for model_name, attrs in raw["virtual_models"].items(): + virtual[model_name] = VirtualModel( + physical=attrs["physical"], + params=attrs.get("params", {}), + ) + + return physical, virtual + + +def load_api_keys(config_path: Path | None = None) -> list[ApiKey]: + if config_path is None: + config_path = _config_dir() / "api_keys.yaml" + + with open(config_path) as f: + raw = yaml.safe_load(f) + + return [ApiKey(key=entry["key"], name=entry["name"]) for entry in raw["api_keys"]] +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd llmux && python -m pytest tests/test_config.py -v` +Expected: all 7 tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add llmux/llmux/config.py llmux/tests/test_config.py +git commit -m "feat: config loading for models.yaml and api_keys.yaml" +``` + +--- + +### Task 3: API Key Authentication + +**Files:** +- Create: `llmux/llmux/auth.py` +- Create: `llmux/tests/test_auth.py` + +- [ ] **Step 1: Write the failing tests** + +`llmux/tests/test_auth.py`: + +```python +import pytest +from fastapi import FastAPI, Depends +from fastapi.testclient import TestClient + +from llmux.auth import create_api_key_dependency +from llmux.config import ApiKey + + +@pytest.fixture +def 
app_with_auth(): + keys = [ + ApiKey(key="sk-test-valid-key", name="Test"), + ApiKey(key="sk-test-another-key", name="Another"), + ] + require_api_key = create_api_key_dependency(keys) + + app = FastAPI() + + @app.get("/protected") + def protected(api_key: str = Depends(require_api_key)): + return {"key_name": api_key} + + return app + + +@pytest.fixture +def client(app_with_auth): + return TestClient(app_with_auth) + + +def test_valid_key_returns_200(client): + resp = client.get("/protected", headers={"Authorization": "Bearer sk-test-valid-key"}) + assert resp.status_code == 200 + assert resp.json()["key_name"] == "Test" + + +def test_another_valid_key(client): + resp = client.get("/protected", headers={"Authorization": "Bearer sk-test-another-key"}) + assert resp.status_code == 200 + assert resp.json()["key_name"] == "Another" + + +def test_missing_auth_header_returns_401(client): + resp = client.get("/protected") + assert resp.status_code == 401 + + +def test_invalid_key_returns_401(client): + resp = client.get("/protected", headers={"Authorization": "Bearer sk-wrong"}) + assert resp.status_code == 401 + + +def test_malformed_header_returns_401(client): + resp = client.get("/protected", headers={"Authorization": "sk-test-valid-key"}) + assert resp.status_code == 401 +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd llmux && python -m pytest tests/test_auth.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'llmux.auth'` + +- [ ] **Step 3: Implement auth.py** + +`llmux/llmux/auth.py`: + +```python +from fastapi import HTTPException, Request + +from llmux.config import ApiKey + + +def create_api_key_dependency(api_keys: list[ApiKey]): + key_to_name = {k.key: k.name for k in api_keys} + + async def require_api_key(request: Request) -> str: + auth = request.headers.get("Authorization", "") + if not auth.startswith("Bearer "): + raise HTTPException(status_code=401, detail="Missing or malformed Authorization header") + token = auth[7:] + 
name = key_to_name.get(token) + if name is None: + raise HTTPException(status_code=401, detail="Invalid API key") + return name + + return require_api_key +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd llmux && python -m pytest tests/test_auth.py -v` +Expected: all 5 tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add llmux/llmux/auth.py llmux/tests/test_auth.py +git commit -m "feat: API key authentication dependency" +``` + +--- + +### Task 4: Model Registry + +**Files:** +- Create: `llmux/llmux/model_registry.py` +- Create: `llmux/tests/test_model_registry.py` + +- [ ] **Step 1: Write the failing tests** + +`llmux/tests/test_model_registry.py`: + +```python +import pytest + +from llmux.model_registry import ModelRegistry + + +@pytest.fixture +def registry(): + return ModelRegistry.from_config() + + +def test_list_virtual_models(registry): + models = registry.list_virtual_models() + assert len(models) == 16 + names = [m["id"] for m in models] + assert "Qwen3.5-9B-FP8-Thinking" in names + assert "GPT-OSS-20B-High" in names + assert "cohere-transcribe" in names + assert "Chatterbox-Multilingual" in names + + +def test_virtual_model_openai_format(registry): + models = registry.list_virtual_models() + m = next(m for m in models if m["id"] == "Qwen3.5-9B-FP8-Thinking") + assert m["object"] == "model" + assert m["owned_by"] == "llmux" + + +def test_resolve_virtual_to_physical(registry): + physical_id, physical, params = registry.resolve("Qwen3.5-9B-FP8-Thinking") + assert physical_id == "qwen3.5-9b-fp8" + assert physical.backend == "transformers" + assert params == {"enable_thinking": True} + + +def test_resolve_instruct_variant(registry): + physical_id, physical, params = registry.resolve("Qwen3.5-9B-FP8-Instruct") + assert physical_id == "qwen3.5-9b-fp8" + assert params == {"enable_thinking": False} + + +def test_resolve_gpt_oss_reasoning(registry): + physical_id, physical, params = registry.resolve("GPT-OSS-20B-Medium") + assert 
physical_id == "gpt-oss-20b" + assert params == {"system_prompt_prefix": "Reasoning: medium"} + + +def test_resolve_same_physical_for_variants(registry): + pid1, _, _ = registry.resolve("Qwen3.5-9B-FP8-Thinking") + pid2, _, _ = registry.resolve("Qwen3.5-9B-FP8-Instruct") + assert pid1 == pid2 + + +def test_resolve_unknown_model_raises(registry): + with pytest.raises(KeyError): + registry.resolve("nonexistent-model") + + +def test_get_physical(registry): + physical = registry.get_physical("qwen3.5-9b-fp8") + assert physical.type == "llm" + assert physical.estimated_vram_gb == 9 + + +def test_get_physical_unknown_raises(registry): + with pytest.raises(KeyError): + registry.get_physical("nonexistent") +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd llmux && python -m pytest tests/test_model_registry.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'llmux.model_registry'` + +- [ ] **Step 3: Implement model_registry.py** + +`llmux/llmux/model_registry.py`: + +```python +from llmux.config import PhysicalModel, VirtualModel, load_models_config + + +class ModelRegistry: + def __init__( + self, + physical: dict[str, PhysicalModel], + virtual: dict[str, VirtualModel], + ): + self._physical = physical + self._virtual = virtual + + @classmethod + def from_config(cls) -> "ModelRegistry": + physical, virtual = load_models_config() + return cls(physical, virtual) + + def list_virtual_models(self) -> list[dict]: + return [ + { + "id": name, + "object": "model", + "created": 0, + "owned_by": "llmux", + } + for name in self._virtual + ] + + def resolve(self, virtual_name: str) -> tuple[str, PhysicalModel, dict]: + """Resolve a virtual model name to (physical_id, PhysicalModel, params).""" + vm = self._virtual[virtual_name] # raises KeyError if unknown + pm = self._physical[vm.physical] + return vm.physical, pm, dict(vm.params) + + def get_physical(self, physical_id: str) -> PhysicalModel: + return self._physical[physical_id] # raises KeyError if 
unknown +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd llmux && python -m pytest tests/test_model_registry.py -v` +Expected: all 9 tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add llmux/llmux/model_registry.py llmux/tests/test_model_registry.py +git commit -m "feat: model registry with virtual-to-physical resolution" +``` + +--- + +### Task 5: VRAM Manager + +**Files:** +- Create: `llmux/llmux/vram_manager.py` +- Create: `llmux/tests/test_vram_manager.py` + +- [ ] **Step 1: Write the failing tests** + +`llmux/tests/test_vram_manager.py`: + +```python +import asyncio +import pytest + +from llmux.vram_manager import VRAMManager, ModelSlot + + +class FakeBackend: + """Simulates a backend that tracks load/unload calls.""" + + def __init__(self): + self.loaded = {} # model_id -> True + self.load_count = 0 + self.unload_count = 0 + + async def load(self, model_id: str): + self.loaded[model_id] = True + self.load_count += 1 + + async def unload(self, model_id: str): + self.loaded.pop(model_id, None) + self.unload_count += 1 + + +@pytest.fixture +def manager(): + return VRAMManager(total_vram_gb=16.0) + + +# --- Priority ordering --- + +def test_priority_ordering(): + assert ModelSlot.priority_rank("llm") == 0 + assert ModelSlot.priority_rank("tts") == 1 + assert ModelSlot.priority_rank("asr") == 2 + + +# --- Loading into empty VRAM --- + +@pytest.mark.asyncio +async def test_load_into_empty_vram(manager): + backend = FakeBackend() + await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=backend) + assert manager.is_loaded("qwen3.5-4b") + assert manager.available_vram_gb == pytest.approx(12.0) + + +# --- Loading alongside existing --- + +@pytest.mark.asyncio +async def test_load_alongside_when_fits(manager): + backend = FakeBackend() + await manager.load_model("cohere-transcribe", model_type="asr", vram_gb=4.0, backend=backend) + await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=backend) + 
assert manager.is_loaded("cohere-transcribe")
+    assert manager.is_loaded("qwen3.5-4b")
+    assert manager.available_vram_gb == pytest.approx(8.0)
+
+
+# --- Eviction: LLM evicted first ---
+
+@pytest.mark.asyncio
+async def test_evict_llm_first(manager):
+    backend = FakeBackend()
+    await manager.load_model("cohere-transcribe", model_type="asr", vram_gb=4.0, backend=backend)
+    await manager.load_model("chatterbox-multilingual", model_type="tts", vram_gb=2.0, backend=backend)
+    await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=backend)
+    # 10 GB used, 6 GB free. The 9B model needs 9 GB, so the old LLM must go.
+    await manager.load_model("qwen3.5-9b-fp8", model_type="llm", vram_gb=9.0, backend=backend)
+    # LLM (4B) evicted first. ASR+TTS+9B = 4+2+9 = 15GB, fits.
+    assert not manager.is_loaded("qwen3.5-4b")
+    assert manager.is_loaded("cohere-transcribe")
+    assert manager.is_loaded("chatterbox-multilingual")
+    assert manager.is_loaded("qwen3.5-9b-fp8")
+
+
+# --- Eviction: an LLM may only evict other LLMs, never ASR/TTS ---
+
+@pytest.mark.asyncio
+async def test_evict_cascade_for_large_llm(manager):
+    """A too-large LLM load must fail rather than evict higher-priority ASR/TTS."""
+    backend = FakeBackend()
+    await manager.load_model("cohere-transcribe", model_type="asr", vram_gb=4.0, backend=backend)
+    await manager.load_model("chatterbox-multilingual", model_type="tts", vram_gb=2.0, backend=backend)
+    await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=backend)
+    # 10 GB used, 6 GB free. gpt-oss-20b needs 13 GB. Evicting the 4B LLM only
+    # frees 10 GB, and ASR/TTS outrank an incoming LLM, so the load must fail.
+    with pytest.raises(RuntimeError):
+        await manager.load_model("gpt-oss-20b", model_type="llm", vram_gb=13.0, backend=backend)
+    assert not manager.is_loaded("qwen3.5-4b")  # evicted during the attempt
+    assert manager.is_loaded("chatterbox-multilingual")
+    assert manager.is_loaded("cohere-transcribe")
+    assert not manager.is_loaded("gpt-oss-20b")
+
+
+# --- Eviction: never evict higher priority for lower ---
+
+@pytest.mark.asyncio
+async def test_never_evict_asr_for_tts(manager):
+    """A TTS load must never evict the ASR model (ASR has higher priority)."""
+    backend = FakeBackend()
+    await manager.load_model("cohere-transcribe", model_type="asr", vram_gb=4.0, backend=backend)
+    # 4 GB used, 12 GB free. An oversized 13 GB TTS does not fit, and TTS may
+    # not evict ASR (higher priority), so the load fails and ASR stays loaded.
+    with pytest.raises(RuntimeError):
+        await manager.load_model("chatterbox-multilingual", model_type="tts", vram_gb=13.0, backend=backend)
+    assert manager.is_loaded("cohere-transcribe")
+    assert not manager.is_loaded("chatterbox-multilingual")
+
+
+@pytest.mark.asyncio
+async def test_asr_evicts_llm_not_reversed(manager):
+    """When ASR request arrives and LLM is loaded, evict LLM (lower priority)."""
+    backend = FakeBackend()
+    await manager.load_model("gpt-oss-20b", model_type="llm", vram_gb=13.0, backend=backend)
+    # 13GB used, 3GB free. ASR needs 4GB. Must evict LLM. 
+ await manager.load_model("cohere-transcribe", model_type="asr", vram_gb=4.0, backend=backend) + assert not manager.is_loaded("gpt-oss-20b") + assert manager.is_loaded("cohere-transcribe") + + +# --- Already loaded --- + +@pytest.mark.asyncio +async def test_already_loaded_is_noop(manager): + backend = FakeBackend() + await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=backend) + await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=backend) + assert backend.load_count == 1 # only loaded once + + +# --- Scenario from spec: ASR + TTS + 4B, switch to 9B --- + +@pytest.mark.asyncio +async def test_spec_scenario_switch_to_9b(manager): + backend = FakeBackend() + await manager.load_model("cohere-transcribe", model_type="asr", vram_gb=4.0, backend=backend) + await manager.load_model("chatterbox-multilingual", model_type="tts", vram_gb=2.0, backend=backend) + await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=backend) + # Switch to 9B. Evict LLM (4B). ASR+TTS+9B = 15GB, fits. 
+ await manager.load_model("qwen3.5-9b-fp8", model_type="llm", vram_gb=9.0, backend=backend) + assert manager.is_loaded("cohere-transcribe") + assert manager.is_loaded("chatterbox-multilingual") + assert manager.is_loaded("qwen3.5-9b-fp8") + assert not manager.is_loaded("qwen3.5-4b") + assert manager.available_vram_gb == pytest.approx(1.0) + + +# --- get_loaded_models --- + +@pytest.mark.asyncio +async def test_get_loaded_models(manager): + backend = FakeBackend() + await manager.load_model("cohere-transcribe", model_type="asr", vram_gb=4.0, backend=backend) + await manager.load_model("qwen3.5-4b", model_type="llm", vram_gb=4.0, backend=backend) + loaded = manager.get_loaded_models() + assert set(loaded.keys()) == {"cohere-transcribe", "qwen3.5-4b"} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd llmux && python -m pytest tests/test_vram_manager.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'llmux.vram_manager'` + +- [ ] **Step 3: Implement vram_manager.py** + +`llmux/llmux/vram_manager.py`: + +```python +import asyncio +import logging +from dataclasses import dataclass + +logger = logging.getLogger(__name__) + +# Priority ranks: higher number = higher priority = evicted last +_PRIORITY = {"llm": 0, "tts": 1, "asr": 2} + + +@dataclass +class ModelSlot: + model_id: str + model_type: str # "llm", "tts", "asr" + vram_gb: float + backend: object # backend instance that loaded this model + + @staticmethod + def priority_rank(model_type: str) -> int: + return _PRIORITY[model_type] + + @property + def priority(self) -> int: + return _PRIORITY[self.model_type] + + +class VRAMManager: + def __init__(self, total_vram_gb: float = 16.0): + self._total_vram_gb = total_vram_gb + self._loaded: dict[str, ModelSlot] = {} # model_id -> ModelSlot + self._lock = asyncio.Lock() + + @property + def available_vram_gb(self) -> float: + used = sum(slot.vram_gb for slot in self._loaded.values()) + return self._total_vram_gb - used + + def is_loaded(self, 
model_id: str) -> bool: + return model_id in self._loaded + + def get_loaded_models(self) -> dict[str, ModelSlot]: + return dict(self._loaded) + + async def load_model( + self, + model_id: str, + model_type: str, + vram_gb: float, + backend: object, + ) -> None: + async with self._lock: + await self._load_model_locked(model_id, model_type, vram_gb, backend) + + async def _load_model_locked( + self, + model_id: str, + model_type: str, + vram_gb: float, + backend: object, + ) -> None: + # Already loaded — nothing to do + if model_id in self._loaded: + return + + # Try to free VRAM if needed + if self.available_vram_gb < vram_gb: + await self._evict_for(vram_gb, model_type) + + if self.available_vram_gb < vram_gb: + raise RuntimeError( + f"Cannot free enough VRAM for {model_id} " + f"(need {vram_gb}GB, available {self.available_vram_gb}GB)" + ) + + # Load the model + logger.info(f"Loading {model_id} ({vram_gb}GB VRAM)") + await backend.load(model_id) + self._loaded[model_id] = ModelSlot( + model_id=model_id, + model_type=model_type, + vram_gb=vram_gb, + backend=backend, + ) + logger.info( + f"Loaded {model_id}. 
VRAM: {self._total_vram_gb - self.available_vram_gb:.1f}/" + f"{self._total_vram_gb:.1f}GB used" + ) + + async def _evict_for(self, needed_gb: float, requesting_type: str) -> None: + """Evict models in priority order (lowest first) until enough VRAM is free.""" + requesting_priority = _PRIORITY[requesting_type] + + # Sort loaded models by priority ascending (evict lowest first) + candidates = sorted(self._loaded.values(), key=lambda s: s.priority) + + for slot in candidates: + if self.available_vram_gb >= needed_gb: + break + # Never evict a model with higher or equal priority than the requester + if slot.priority >= requesting_priority: + continue + logger.info(f"Evicting {slot.model_id} ({slot.model_type}, {slot.vram_gb}GB)") + await slot.backend.unload(slot.model_id) + del self._loaded[slot.model_id] + + # If still not enough, evict same-priority models (e.g., old LLM for new LLM) + if self.available_vram_gb < needed_gb: + candidates = sorted(self._loaded.values(), key=lambda s: s.priority) + for slot in candidates: + if self.available_vram_gb >= needed_gb: + break + if slot.priority > requesting_priority: + continue + logger.info(f"Evicting same-priority {slot.model_id} ({slot.model_type}, {slot.vram_gb}GB)") + await slot.backend.unload(slot.model_id) + del self._loaded[slot.model_id] +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd llmux && python -m pytest tests/test_vram_manager.py -v` +Expected: all tests PASS (the `test_never_evict_asr_for_tts` test with `pass` will trivially pass — that's fine, the real scenario is covered by `test_asr_evicts_llm_not_reversed`) + +- [ ] **Step 5: Commit** + +```bash +git add llmux/llmux/vram_manager.py llmux/tests/test_vram_manager.py +git commit -m "feat: VRAM manager with priority-based eviction" +``` + +--- + +### Task 6: Backend Base Class + +**Files:** +- Create: `llmux/llmux/backends/base.py` + +- [ ] **Step 1: Create the abstract base class** + +`llmux/llmux/backends/base.py`: + +```python +from 
abc import ABC, abstractmethod +from typing import AsyncIterator + + +class BaseBackend(ABC): + """Abstract base for all model backends.""" + + @abstractmethod + async def load(self, model_id: str, **kwargs) -> None: + """Load model weights into GPU VRAM. + + Backends accept optional kwargs: + - device: "cuda" or "cpu" (transformers backends, chatterbox) + - n_gpu_layers: int (llamacpp backend, -1=all GPU, 0=CPU only) + """ + + @abstractmethod + async def unload(self, model_id: str) -> None: + """Unload model weights from GPU VRAM.""" + + @abstractmethod + async def generate( + self, + model_id: str, + messages: list[dict], + params: dict, + stream: bool = False, + tools: list[dict] | None = None, + ) -> AsyncIterator[str] | dict: + """Run chat inference. Returns full response dict or async iterator of SSE chunks.""" + + async def transcribe( + self, + model_id: str, + audio_data: bytes, + language: str = "en", + ) -> dict: + """Transcribe audio. Only implemented by ASR backends.""" + raise NotImplementedError(f"{self.__class__.__name__} does not support transcription") + + async def synthesize( + self, + model_id: str, + text: str, + voice: str = "default", + ) -> bytes: + """Synthesize speech. 
Only implemented by TTS backends.""" + raise NotImplementedError(f"{self.__class__.__name__} does not support speech synthesis") +``` + +- [ ] **Step 2: Commit** + +```bash +git add llmux/llmux/backends/base.py +git commit -m "feat: abstract base class for model backends" +``` + +--- + +### Task 7: Transformers LLM Backend + +**Files:** +- Create: `llmux/llmux/backends/transformers_llm.py` + +- [ ] **Step 1: Implement the transformers LLM backend** + +`llmux/llmux/backends/transformers_llm.py`: + +```python +import asyncio +import json +import logging +import time +import uuid +from typing import AsyncIterator + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor, TextIteratorStreamer +from threading import Thread + +from llmux.backends.base import BaseBackend +from llmux.config import PhysicalModel + +logger = logging.getLogger(__name__) + + +class TransformersLLMBackend(BaseBackend): + def __init__(self, models_dir: str = "/models"): + self._models_dir = models_dir + self._loaded: dict[str, dict] = {} # model_id -> {"model", "tokenizer", "processor"} + + async def load(self, model_id: str, device: str = "cuda") -> None: + """Load a HuggingFace model and tokenizer into VRAM.""" + if model_id in self._loaded: + return + + physical = _get_physical_config(model_id) + hf_id = physical.model_id + logger.info(f"Loading transformers model {hf_id} to {device}") + + def _load(): + tokenizer = AutoTokenizer.from_pretrained( + hf_id, + cache_dir=self._models_dir, + trust_remote_code=True, + ) + model = AutoModelForCausalLM.from_pretrained( + hf_id, + cache_dir=self._models_dir, + torch_dtype="auto", + device_map=device, + trust_remote_code=True, + ) + processor = None + if physical.supports_vision: + try: + processor = AutoProcessor.from_pretrained( + hf_id, + cache_dir=self._models_dir, + trust_remote_code=True, + ) + except Exception: + logger.warning(f"No processor found for {hf_id}, vision disabled") + return model, tokenizer, 
processor + + loop = asyncio.get_event_loop() + model, tokenizer, processor = await loop.run_in_executor(None, _load) + self._loaded[model_id] = { + "model": model, + "tokenizer": tokenizer, + "processor": processor, + "device": device, + } + logger.info(f"Loaded {hf_id} on {device}") + + async def unload(self, model_id: str) -> None: + if model_id not in self._loaded: + return + entry = self._loaded.pop(model_id) + del entry["model"] + del entry["tokenizer"] + if entry.get("processor"): + del entry["processor"] + torch.cuda.empty_cache() + logger.info(f"Unloaded {model_id}") + + async def generate( + self, + model_id: str, + messages: list[dict], + params: dict, + stream: bool = False, + tools: list[dict] | None = None, + ) -> AsyncIterator[str] | dict: + entry = self._loaded[model_id] + model = entry["model"] + tokenizer = entry["tokenizer"] + + # Apply virtual model params + chat_params = {} + if "enable_thinking" in params: + chat_params["enable_thinking"] = params["enable_thinking"] + + # Inject system prompt prefix for gpt-oss reasoning levels + effective_messages = list(messages) + if "system_prompt_prefix" in params: + prefix = params["system_prompt_prefix"] + if effective_messages and effective_messages[0].get("role") == "system": + effective_messages[0] = dict(effective_messages[0]) + effective_messages[0]["content"] = prefix + "\n\n" + effective_messages[0]["content"] + else: + effective_messages.insert(0, {"role": "system", "content": prefix}) + + # Build input + text = tokenizer.apply_chat_template( + effective_messages, + tokenize=False, + add_generation_prompt=True, + tools=tools, + **chat_params, + ) + inputs = tokenizer(text, return_tensors="pt").to(model.device) + + if stream: + return self._stream_generate(model, tokenizer, inputs, model_id) + else: + return await self._full_generate(model, tokenizer, inputs, model_id) + + async def _full_generate(self, model, tokenizer, inputs, model_id: str) -> dict: + def _run(): + with torch.no_grad(): + 
output_ids = model.generate(
+                    **inputs,
+                    max_new_tokens=4096,
+                )
+            new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
+            return tokenizer.decode(new_tokens, skip_special_tokens=True)
+
+        loop = asyncio.get_event_loop()
+        text = await loop.run_in_executor(None, _run)
+
+        return {
+            "id": f"chatcmpl-{uuid.uuid4().hex[:12]}",
+            "object": "chat.completion",
+            "created": int(time.time()),
+            "model": model_id,
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {"role": "assistant", "content": text},
+                    "finish_reason": "stop",
+                }
+            ],
+            "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
+        }
+
+    async def _stream_generate(
+        self, model, tokenizer, inputs, model_id: str
+    ) -> AsyncIterator[str]:
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+        gen_kwargs = {**inputs, "max_new_tokens": 4096, "streamer": streamer}
+
+        thread = Thread(target=lambda: model.generate(**gen_kwargs))
+        thread.start()
+
+        chat_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
+        created = int(time.time())
+
+        async def _iter():
+            loop = asyncio.get_event_loop()
+            while True:
+                token = await loop.run_in_executor(None, lambda: next(streamer, None))
+                if token is None:
+                    # Final chunk
+                    chunk = {
+                        "id": chat_id,
+                        "object": "chat.completion.chunk",
+                        "created": created,
+                        "model": model_id,
+                        "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
+                    }
+                    yield f"data: {json.dumps(chunk)}\n\n"
+                    yield "data: [DONE]\n\n"
+                    break
+                chunk = {
+                    "id": chat_id,
+                    "object": "chat.completion.chunk",
+                    "created": created,
+                    "model": model_id,
+                    "choices": [
+                        {"index": 0, "delta": {"content": token}, "finish_reason": None}
+                    ],
+                }
+                yield f"data: {json.dumps(chunk)}\n\n"
+            # The generation thread has finished by the time the streamer is
+            # exhausted, so this join returns immediately. Joining here — rather
+            # than before returning the iterator — keeps streaming non-blocking;
+            # a join before `return _iter()` would stall until generation
+            # completed and block the event loop.
+            thread.join()
+
+        return _iter()
+
+
+# Helper to get physical model config — injected at app startup
+_physical_models: dict[str, PhysicalModel] = {}
+
+
+def set_physical_models(models: dict[str, PhysicalModel]) -> None:
+    global _physical_models
+    _physical_models = 
models + + +def _get_physical_config(model_id: str) -> PhysicalModel: + return _physical_models[model_id] +``` + +- [ ] **Step 2: Commit** + +```bash +git add llmux/llmux/backends/transformers_llm.py +git commit -m "feat: transformers LLM backend with streaming and thinking/reasoning support" +``` + +--- + +### Task 8: Transformers ASR Backend + +**Files:** +- Create: `llmux/llmux/backends/transformers_asr.py` + +- [ ] **Step 1: Implement the ASR backend** + +`llmux/llmux/backends/transformers_asr.py`: + +```python +import asyncio +import logging + +import torch +from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor + +from llmux.backends.base import BaseBackend +from llmux.config import PhysicalModel + +logger = logging.getLogger(__name__) + + +class TransformersASRBackend(BaseBackend): + def __init__(self, models_dir: str = "/models"): + self._models_dir = models_dir + self._loaded: dict[str, dict] = {} + + async def load(self, model_id: str, device: str = "cuda") -> None: + if model_id in self._loaded: + return + + physical = _get_physical_config(model_id) + hf_id = physical.model_id + logger.info(f"Loading ASR model {hf_id} to {device}") + + def _load(): + processor = AutoProcessor.from_pretrained( + hf_id, + cache_dir=self._models_dir, + trust_remote_code=True, + ) + model = AutoModelForSpeechSeq2Seq.from_pretrained( + hf_id, + cache_dir=self._models_dir, + torch_dtype="auto", + device_map=device, + trust_remote_code=True, + ) + return model, processor + + loop = asyncio.get_event_loop() + model, processor = await loop.run_in_executor(None, _load) + self._loaded[model_id] = { + "model": model, + "processor": processor, + "device": device, + } + logger.info(f"Loaded ASR model {hf_id} on {device}") + + async def unload(self, model_id: str) -> None: + if model_id not in self._loaded: + return + entry = self._loaded.pop(model_id) + del entry["model"] + del entry["processor"] + torch.cuda.empty_cache() + logger.info(f"Unloaded ASR model {model_id}") + + 
async def generate(self, model_id, messages, params, stream=False, tools=None): + raise NotImplementedError("ASR backend does not support chat generation") + + async def transcribe( + self, + model_id: str, + audio_data: bytes, + language: str = "en", + ) -> dict: + import io + import soundfile as sf + + entry = self._loaded[model_id] + model = entry["model"] + processor = entry["processor"] + + def _transcribe(): + # Decode audio bytes to numpy array + audio_array, sample_rate = sf.read(io.BytesIO(audio_data)) + + # Process audio + inputs = processor( + audio_array, + sampling_rate=sample_rate, + return_tensors="pt", + language=language, + ).to(model.device) + + with torch.no_grad(): + predicted_ids = model.generate(**inputs) + + transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] + return transcription + + loop = asyncio.get_event_loop() + text = await loop.run_in_executor(None, _transcribe) + + return {"text": text} + + +# Physical model config injection (same pattern as transformers_llm) +_physical_models: dict[str, PhysicalModel] = {} + + +def set_physical_models(models: dict[str, PhysicalModel]) -> None: + global _physical_models + _physical_models = models + + +def _get_physical_config(model_id: str) -> PhysicalModel: + return _physical_models[model_id] +``` + +- [ ] **Step 2: Commit** + +```bash +git add llmux/llmux/backends/transformers_asr.py +git commit -m "feat: transformers ASR backend for cohere-transcribe" +``` + +--- + +### Task 9: llama-cpp-python Backend + +**Files:** +- Create: `llmux/llmux/backends/llamacpp.py` + +- [ ] **Step 1: Implement the llama.cpp backend** + +`llmux/llmux/backends/llamacpp.py`: + +```python +import asyncio +import json +import logging +import time +import uuid +from pathlib import Path +from typing import AsyncIterator + +from llama_cpp import Llama, LlamaGrammar + +from llmux.backends.base import BaseBackend +from llmux.config import PhysicalModel + +logger = logging.getLogger(__name__) + + 
+class LlamaCppBackend(BaseBackend): + def __init__(self, models_dir: str = "/models"): + self._models_dir = Path(models_dir) + self._loaded: dict[str, dict] = {} + + async def load(self, model_id: str, n_gpu_layers: int = -1) -> None: + if model_id in self._loaded: + return + + physical = _get_physical_config(model_id) + model_path = self._models_dir / physical.model_file + logger.info(f"Loading GGUF model {model_path} with n_gpu_layers={n_gpu_layers}") + + def _load(): + kwargs = { + "model_path": str(model_path), + "n_gpu_layers": n_gpu_layers, + "n_ctx": 8192, + "verbose": False, + } + if physical.mmproj_file: + mmproj_path = self._models_dir / physical.mmproj_file + kwargs["chat_handler"] = _create_vision_handler(str(mmproj_path)) + return Llama(**kwargs) + + loop = asyncio.get_event_loop() + llm = await loop.run_in_executor(None, _load) + self._loaded[model_id] = {"llm": llm, "n_gpu_layers": n_gpu_layers} + logger.info(f"Loaded GGUF model {physical.model_file}") + + async def unload(self, model_id: str) -> None: + if model_id not in self._loaded: + return + entry = self._loaded.pop(model_id) + del entry["llm"] + logger.info(f"Unloaded GGUF model {model_id}") + + async def generate( + self, + model_id: str, + messages: list[dict], + params: dict, + stream: bool = False, + tools: list[dict] | None = None, + ) -> AsyncIterator[str] | dict: + entry = self._loaded[model_id] + llm = entry["llm"] + + # Apply virtual model params + effective_messages = list(messages) + if "enable_thinking" in params: + # For Qwen GGUF models, thinking is controlled via chat template + # enable_thinking=False adds /no_think tag + if not params["enable_thinking"]: + if effective_messages and effective_messages[0].get("role") == "system": + effective_messages[0] = dict(effective_messages[0]) + effective_messages[0]["content"] = ( + "/no_think\n" + effective_messages[0]["content"] + ) + else: + effective_messages.insert(0, {"role": "system", "content": "/no_think"}) + + if 
"system_prompt_prefix" in params: + prefix = params["system_prompt_prefix"] + if effective_messages and effective_messages[0].get("role") == "system": + effective_messages[0] = dict(effective_messages[0]) + effective_messages[0]["content"] = prefix + "\n\n" + effective_messages[0]["content"] + else: + effective_messages.insert(0, {"role": "system", "content": prefix}) + + if stream: + return self._stream_generate(llm, effective_messages, model_id, tools) + else: + return await self._full_generate(llm, effective_messages, model_id, tools) + + async def _full_generate(self, llm, messages, model_id, tools) -> dict: + def _run(): + kwargs = {"messages": messages, "max_tokens": 4096} + if tools: + kwargs["tools"] = tools + return llm.create_chat_completion(**kwargs) + + loop = asyncio.get_event_loop() + result = await loop.run_in_executor(None, _run) + + # llama-cpp-python returns OpenAI-compatible format + result["model"] = model_id + return result + + async def _stream_generate( + self, llm, messages, model_id, tools + ) -> AsyncIterator[str]: + def _run(): + kwargs = {"messages": messages, "max_tokens": 4096, "stream": True} + if tools: + kwargs["tools"] = tools + return llm.create_chat_completion(**kwargs) + + loop = asyncio.get_event_loop() + stream = await loop.run_in_executor(None, _run) + + async def _iter(): + for chunk in stream: + chunk["model"] = model_id + yield f"data: {json.dumps(chunk)}\n\n" + yield "data: [DONE]\n\n" + + return _iter() + + +def _create_vision_handler(mmproj_path: str): + """Create a chat handler with vision support using the mmproj file.""" + from llama_cpp.llama_chat_format import Llava16ChatHandler + + return Llava16ChatHandler(clip_model_path=mmproj_path) + + +# Physical model config injection +_physical_models: dict[str, PhysicalModel] = {} + + +def set_physical_models(models: dict[str, PhysicalModel]) -> None: + global _physical_models + _physical_models = models + + +def _get_physical_config(model_id: str) -> PhysicalModel: + 
return _physical_models[model_id] +``` + +- [ ] **Step 2: Commit** + +```bash +git add llmux/llmux/backends/llamacpp.py +git commit -m "feat: llama-cpp-python backend with GGUF, vision, and tool support" +``` + +--- + +### Task 10: Chatterbox TTS Backend + +**Files:** +- Create: `llmux/llmux/backends/chatterbox_tts.py` + +- [ ] **Step 1: Implement the Chatterbox TTS backend** + +`llmux/llmux/backends/chatterbox_tts.py`: + +```python +import asyncio +import io +import logging + +import soundfile as sf +import torch + +from llmux.backends.base import BaseBackend +from llmux.config import PhysicalModel + +logger = logging.getLogger(__name__) + + +class ChatterboxTTSBackend(BaseBackend): + def __init__(self, models_dir: str = "/models"): + self._models_dir = models_dir + self._loaded: dict[str, dict] = {} + + async def load(self, model_id: str, device: str = "cuda") -> None: + if model_id in self._loaded: + return + + physical = _get_physical_config(model_id) + variant = physical.variant + logger.info(f"Loading Chatterbox {variant} to {device}") + + def _load(): + from chatterbox.tts import ChatterboxTTS + + if variant == "turbo": + model = ChatterboxTTS.from_pretrained(device=device, variant="turbo") + elif variant == "multilingual": + model = ChatterboxTTS.from_pretrained(device=device, variant="multilingual") + else: + model = ChatterboxTTS.from_pretrained(device=device) + return model + + loop = asyncio.get_event_loop() + model = await loop.run_in_executor(None, _load) + self._loaded[model_id] = {"model": model, "device": device} + logger.info(f"Loaded Chatterbox {variant} on {device}") + + async def unload(self, model_id: str) -> None: + if model_id not in self._loaded: + return + entry = self._loaded.pop(model_id) + del entry["model"] + torch.cuda.empty_cache() + logger.info(f"Unloaded Chatterbox {model_id}") + + async def generate(self, model_id, messages, params, stream=False, tools=None): + raise NotImplementedError("TTS backend does not support chat 
generation") + + async def synthesize( + self, + model_id: str, + text: str, + voice: str = "default", + ) -> bytes: + entry = self._loaded[model_id] + model = entry["model"] + + def _synthesize(): + wav = model.generate(text) + # Convert to WAV bytes + buf = io.BytesIO() + sf.write(buf, wav.cpu().numpy().squeeze(), samplerate=24000, format="WAV") + buf.seek(0) + return buf.read() + + loop = asyncio.get_event_loop() + audio_bytes = await loop.run_in_executor(None, _synthesize) + return audio_bytes + + +# Physical model config injection +_physical_models: dict[str, PhysicalModel] = {} + + +def set_physical_models(models: dict[str, PhysicalModel]) -> None: + global _physical_models + _physical_models = models + + +def _get_physical_config(model_id: str) -> PhysicalModel: + return _physical_models[model_id] +``` + +- [ ] **Step 2: Commit** + +```bash +git add llmux/llmux/backends/chatterbox_tts.py +git commit -m "feat: Chatterbox TTS backend with turbo/multilingual/default variants" +``` + +--- + +### Task 11: API Routes — Health and Models + +**Files:** +- Create: `llmux/llmux/routes/models.py` +- Create: `llmux/tests/test_routes.py` + +- [ ] **Step 1: Write the failing tests** + +`llmux/tests/test_routes.py`: + +```python +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from llmux.config import ApiKey, load_models_config +from llmux.auth import create_api_key_dependency +from llmux.model_registry import ModelRegistry +from llmux.vram_manager import VRAMManager +from llmux.routes.models import create_models_router + + +API_KEY = "sk-test-key" + + +@pytest.fixture +def registry(): + return ModelRegistry.from_config() + + +@pytest.fixture +def vram_manager(): + return VRAMManager(total_vram_gb=16.0) + + +@pytest.fixture +def app(registry, vram_manager): + keys = [ApiKey(key=API_KEY, name="Test")] + require_api_key = create_api_key_dependency(keys) + + app = FastAPI() + app.include_router(create_models_router(registry, 
require_api_key)) + return app + + +@pytest.fixture +def client(app): + return TestClient(app) + + +@pytest.fixture +def auth_headers(): + return {"Authorization": f"Bearer {API_KEY}"} + + +def test_list_models_returns_16(client, auth_headers): + resp = client.get("/v1/models", headers=auth_headers) + assert resp.status_code == 200 + body = resp.json() + assert body["object"] == "list" + assert len(body["data"]) == 16 + + +def test_list_models_contains_expected_names(client, auth_headers): + resp = client.get("/v1/models", headers=auth_headers) + names = [m["id"] for m in resp.json()["data"]] + assert "Qwen3.5-9B-FP8-Thinking" in names + assert "GPT-OSS-20B-High" in names + assert "cohere-transcribe" in names + assert "Chatterbox-Multilingual" in names + + +def test_list_models_requires_auth(client): + resp = client.get("/v1/models") + assert resp.status_code == 401 +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd llmux && python -m pytest tests/test_routes.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'llmux.routes.models'` + +- [ ] **Step 3: Implement routes/models.py** + +`llmux/llmux/routes/models.py`: + +```python +from fastapi import APIRouter, Depends + +from llmux.model_registry import ModelRegistry + + +def create_models_router(registry: ModelRegistry, require_api_key) -> APIRouter: + router = APIRouter() + + @router.get("/v1/models") + async def list_models(api_key: str = Depends(require_api_key)): + return { + "object": "list", + "data": registry.list_virtual_models(), + } + + return router +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd llmux && python -m pytest tests/test_routes.py -v` +Expected: all 3 tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add llmux/llmux/routes/models.py llmux/tests/test_routes.py +git commit -m "feat: GET /v1/models endpoint with auth" +``` + +--- + +### Task 12: API Routes — Chat Completions + +**Files:** +- Create: `llmux/llmux/routes/chat.py` + +- [ ] **Step 
1: Implement chat route** + +`llmux/llmux/routes/chat.py`: + +```python +import logging + +from fastapi import APIRouter, Depends, HTTPException, Request +from fastapi.responses import StreamingResponse + +from llmux.model_registry import ModelRegistry +from llmux.vram_manager import VRAMManager + +logger = logging.getLogger(__name__) + + +def create_chat_router( + registry: ModelRegistry, + vram_manager: VRAMManager, + backends: dict, + require_api_key, +) -> APIRouter: + router = APIRouter() + + @router.post("/v1/chat/completions") + async def chat_completions(request: Request, api_key: str = Depends(require_api_key)): + body = await request.json() + + virtual_name = body.get("model") + if not virtual_name: + raise HTTPException(status_code=400, detail="Missing 'model' field") + + try: + physical_id, physical, params = registry.resolve(virtual_name) + except KeyError: + raise HTTPException(status_code=404, detail=f"Model '{virtual_name}' not found") + + # Get the backend for this model + backend = backends.get(physical.backend) + if backend is None: + raise HTTPException(status_code=500, detail=f"No backend for '{physical.backend}'") + + # Ensure model is loaded (VRAM manager handles eviction) + await vram_manager.load_model( + model_id=physical_id, + model_type=physical.type, + vram_gb=physical.estimated_vram_gb, + backend=backend, + ) + + messages = body.get("messages", []) + stream = body.get("stream", False) + tools = body.get("tools") + + result = await backend.generate( + model_id=physical_id, + messages=messages, + params=params, + stream=stream, + tools=tools, + ) + + if stream: + return StreamingResponse(result, media_type="text/event-stream") + return result + + return router +``` + +- [ ] **Step 2: Commit** + +```bash +git add llmux/llmux/routes/chat.py +git commit -m "feat: POST /v1/chat/completions with streaming and VRAM management" +``` + +--- + +### Task 13: API Routes — Audio Transcription + +**Files:** +- Create: 
`llmux/llmux/routes/transcription.py` + +- [ ] **Step 1: Implement transcription route** + +`llmux/llmux/routes/transcription.py`: + +```python +import logging + +from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile + +from llmux.model_registry import ModelRegistry +from llmux.vram_manager import VRAMManager + +logger = logging.getLogger(__name__) + + +def create_transcription_router( + registry: ModelRegistry, + vram_manager: VRAMManager, + backends: dict, + require_api_key, +) -> APIRouter: + router = APIRouter() + + @router.post("/v1/audio/transcriptions") + async def create_transcription( + file: UploadFile = File(...), + model: str = Form(...), + language: str = Form("en"), + api_key: str = Depends(require_api_key), + ): + try: + physical_id, physical, params = registry.resolve(model) + except KeyError: + raise HTTPException(status_code=404, detail=f"Model '{model}' not found") + + if physical.type != "asr": + raise HTTPException(status_code=400, detail=f"Model '{model}' is not an ASR model") + + backend = backends.get(physical.backend) + if backend is None: + raise HTTPException(status_code=500, detail=f"No backend for '{physical.backend}'") + + await vram_manager.load_model( + model_id=physical_id, + model_type=physical.type, + vram_gb=physical.estimated_vram_gb, + backend=backend, + ) + + audio_data = await file.read() + result = await backend.transcribe( + model_id=physical_id, + audio_data=audio_data, + language=language, + ) + + return result + + return router +``` + +- [ ] **Step 2: Commit** + +```bash +git add llmux/llmux/routes/transcription.py +git commit -m "feat: POST /v1/audio/transcriptions endpoint" +``` + +--- + +### Task 14: API Routes — Speech Synthesis + +**Files:** +- Create: `llmux/llmux/routes/speech.py` + +- [ ] **Step 1: Implement speech route** + +`llmux/llmux/routes/speech.py`: + +```python +import logging + +from fastapi import APIRouter, Depends, HTTPException, Request +from fastapi.responses import Response 
+ +from llmux.model_registry import ModelRegistry +from llmux.vram_manager import VRAMManager + +logger = logging.getLogger(__name__) + + +def create_speech_router( + registry: ModelRegistry, + vram_manager: VRAMManager, + backends: dict, + require_api_key, +) -> APIRouter: + router = APIRouter() + + @router.post("/v1/audio/speech") + async def create_speech(request: Request, api_key: str = Depends(require_api_key)): + body = await request.json() + + model_name = body.get("model") + if not model_name: + raise HTTPException(status_code=400, detail="Missing 'model' field") + + try: + physical_id, physical, params = registry.resolve(model_name) + except KeyError: + raise HTTPException(status_code=404, detail=f"Model '{model_name}' not found") + + if physical.type != "tts": + raise HTTPException(status_code=400, detail=f"Model '{model_name}' is not a TTS model") + + backend = backends.get(physical.backend) + if backend is None: + raise HTTPException(status_code=500, detail=f"No backend for '{physical.backend}'") + + await vram_manager.load_model( + model_id=physical_id, + model_type=physical.type, + vram_gb=physical.estimated_vram_gb, + backend=backend, + ) + + text = body.get("input", "") + voice = body.get("voice", "default") + + audio_bytes = await backend.synthesize( + model_id=physical_id, + text=text, + voice=voice, + ) + + return Response(content=audio_bytes, media_type="audio/wav") + + return router +``` + +- [ ] **Step 2: Commit** + +```bash +git add llmux/llmux/routes/speech.py +git commit -m "feat: POST /v1/audio/speech endpoint" +``` + +--- + +### Task 15: API Routes — Admin Performance Test + +**Files:** +- Create: `llmux/llmux/routes/admin.py` + +- [ ] **Step 1: Implement admin performance test endpoint** + +`llmux/llmux/routes/admin.py`: + +```python +import asyncio +import logging +import time + +from fastapi import APIRouter, Depends, HTTPException, Request + +from llmux.model_registry import ModelRegistry +from llmux.vram_manager import VRAMManager + 
+logger = logging.getLogger(__name__) + +TEST_PROMPT = [{"role": "user", "content": "Say hello in one sentence."}] + + +def create_admin_router( + registry: ModelRegistry, + vram_manager: VRAMManager, + backends: dict, + require_api_key, +) -> APIRouter: + router = APIRouter() + + @router.post("/admin/test/performance") + async def test_performance(request: Request, api_key: str = Depends(require_api_key)): + """Run GPU vs CPU inference for a model and compare timings. + + Request body: + { + "physical_model_id": "qwen3.5-4b", + "test_type": "llm" | "asr" | "tts" + } + """ + body = await request.json() + physical_id = body.get("physical_model_id") + if not physical_id: + raise HTTPException(status_code=400, detail="Missing 'physical_model_id'") + + physical = registry.get_physical(physical_id) + backend_name = physical.backend + + if backend_name == "transformers" and physical.type == "llm": + return await _test_transformers_llm(physical_id, physical, backends) + elif backend_name == "transformers" and physical.type == "asr": + return await _test_transformers_asr(physical_id, physical, backends) + elif backend_name == "llamacpp": + return await _test_llamacpp(physical_id, physical, backends) + elif backend_name == "chatterbox": + return await _test_chatterbox(physical_id, physical, backends) + else: + raise HTTPException(status_code=400, detail=f"Unknown backend: {backend_name}") + + return router + + +async def _test_transformers_llm(physical_id, physical, backends): + from llmux.backends.transformers_llm import TransformersLLMBackend + + results = {} + + for device_label, device in [("gpu", "cuda"), ("cpu", "cpu")]: + backend = TransformersLLMBackend(models_dir=backends["transformers"]._models_dir) + await backend.load(physical_id, device=device) + start = time.monotonic() + await backend.generate(physical_id, TEST_PROMPT, params={}, stream=False) + elapsed = time.monotonic() - start + await backend.unload(physical_id) + results[device_label] = round(elapsed, 2) 
+ + ratio = results["cpu"] / results["gpu"] if results["gpu"] > 0 else 0 + return { + "model": physical_id, + "gpu_seconds": results["gpu"], + "cpu_seconds": results["cpu"], + "speedup": round(ratio, 1), + "pass": ratio >= 5.0, + } + + +async def _test_transformers_asr(physical_id, physical, backends): + from llmux.backends.transformers_asr import TransformersASRBackend + import struct + + # Generate a short silent WAV for testing + silent_wav = _make_silent_wav(duration_seconds=2) + + results = {} + + for device_label, device in [("gpu", "cuda"), ("cpu", "cpu")]: + backend = TransformersASRBackend(models_dir=backends["transformers_asr"]._models_dir) + await backend.load(physical_id, device=device) + start = time.monotonic() + await backend.transcribe(physical_id, silent_wav, language="en") + elapsed = time.monotonic() - start + await backend.unload(physical_id) + results[device_label] = round(elapsed, 2) + + ratio = results["cpu"] / results["gpu"] if results["gpu"] > 0 else 0 + return { + "model": physical_id, + "gpu_seconds": results["gpu"], + "cpu_seconds": results["cpu"], + "speedup": round(ratio, 1), + "pass": ratio >= 5.0, + } + + +async def _test_llamacpp(physical_id, physical, backends): + from llmux.backends.llamacpp import LlamaCppBackend + + results = {} + + for label, n_gpu_layers in [("gpu", -1), ("cpu", 0)]: + backend = LlamaCppBackend(models_dir=backends["llamacpp"]._models_dir) + await backend.load(physical_id, n_gpu_layers=n_gpu_layers) + start = time.monotonic() + await backend.generate(physical_id, TEST_PROMPT, params={}, stream=False) + elapsed = time.monotonic() - start + await backend.unload(physical_id) + results[label] = round(elapsed, 2) + + ratio = results["cpu"] / results["gpu"] if results["gpu"] > 0 else 0 + return { + "model": physical_id, + "gpu_seconds": results["gpu"], + "cpu_seconds": results["cpu"], + "speedup": round(ratio, 1), + "pass": ratio >= 5.0, + } + + +async def _test_chatterbox(physical_id, physical, backends): + from 
llmux.backends.chatterbox_tts import ChatterboxTTSBackend + + backend = ChatterboxTTSBackend(models_dir=backends["chatterbox"]._models_dir) + await backend.load(physical_id, device="cuda") + test_text = "Hello, this is a performance test." + start = time.monotonic() + audio_bytes = await backend.synthesize(physical_id, test_text) + elapsed = time.monotonic() - start + await backend.unload(physical_id) + + # Estimate audio duration from WAV bytes (24kHz, 16-bit mono) + audio_samples = (len(audio_bytes) - 44) / 2 # subtract WAV header, 2 bytes per sample + audio_duration = audio_samples / 24000 + + return { + "model": physical_id, + "synthesis_seconds": round(elapsed, 2), + "audio_duration_seconds": round(audio_duration, 2), + "realtime_factor": round(audio_duration / elapsed, 1) if elapsed > 0 else 0, + } + + +def _make_silent_wav(duration_seconds=2, sample_rate=16000) -> bytes: + """Generate a silent WAV file as bytes.""" + import struct + num_samples = int(sample_rate * duration_seconds) + data = b"\x00\x00" * num_samples # 16-bit silence + header = struct.pack( + "<4sI4s4sIHHIIHH4sI", + b"RIFF", 36 + len(data), b"WAVE", + b"fmt ", 16, 1, 1, sample_rate, sample_rate * 2, 2, 16, + b"data", len(data), + ) + return header + data +``` + +- [ ] **Step 2: Commit** + +```bash +git add llmux/llmux/routes/admin.py +git commit -m "feat: admin performance test endpoint for GPU vs CPU comparison" +``` + +--- + +### Task 16: FastAPI App Assembly (main.py) + +**Files:** +- Create: `llmux/llmux/main.py` + +- [ ] **Step 1: Implement main.py** + +`llmux/llmux/main.py`: + +```python +import logging +import os + +from fastapi import FastAPI + +from llmux.config import load_models_config, load_api_keys +from llmux.auth import create_api_key_dependency +from llmux.model_registry import ModelRegistry +from llmux.vram_manager import VRAMManager +from llmux.backends.transformers_llm import TransformersLLMBackend +from llmux.backends.transformers_llm import set_physical_models as 
set_transformers_llm_models +from llmux.backends.transformers_asr import TransformersASRBackend +from llmux.backends.transformers_asr import set_physical_models as set_transformers_asr_models +from llmux.backends.llamacpp import LlamaCppBackend +from llmux.backends.llamacpp import set_physical_models as set_llamacpp_models +from llmux.backends.chatterbox_tts import ChatterboxTTSBackend +from llmux.backends.chatterbox_tts import set_physical_models as set_chatterbox_models +from llmux.routes.models import create_models_router +from llmux.routes.chat import create_chat_router +from llmux.routes.transcription import create_transcription_router +from llmux.routes.speech import create_speech_router +from llmux.routes.admin import create_admin_router + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", +) +logger = logging.getLogger(__name__) + +MODELS_DIR = os.environ.get("LLMUX_MODELS_DIR", "/models") + +app = FastAPI(title="llmux", version="0.1.0") + + +@app.on_event("startup") +async def startup(): + logger.info("Starting llmux...") + + # Load config + physical, virtual = load_models_config() + api_keys = load_api_keys() + + # Inject physical model configs into backends + set_transformers_llm_models(physical) + set_transformers_asr_models(physical) + set_llamacpp_models(physical) + set_chatterbox_models(physical) + + # Create core components + registry = ModelRegistry(physical, virtual) + vram_manager = VRAMManager(total_vram_gb=16.0) + require_api_key = create_api_key_dependency(api_keys) + + # Create backends + transformers_llm = TransformersLLMBackend(models_dir=MODELS_DIR) + transformers_asr = TransformersASRBackend(models_dir=MODELS_DIR) + llamacpp = LlamaCppBackend(models_dir=MODELS_DIR) + chatterbox = ChatterboxTTSBackend(models_dir=MODELS_DIR) + + backends = { + "transformers": transformers_llm, + "transformers_asr": transformers_asr, + "llamacpp": llamacpp, + "chatterbox": chatterbox, + } + + # Store on 
app state for health endpoint + app.state.vram_manager = vram_manager + app.state.registry = registry + + # Register routes + app.include_router(create_models_router(registry, require_api_key)) + app.include_router(create_chat_router(registry, vram_manager, backends, require_api_key)) + app.include_router(create_transcription_router(registry, vram_manager, backends, require_api_key)) + app.include_router(create_speech_router(registry, vram_manager, backends, require_api_key)) + app.include_router(create_admin_router(registry, vram_manager, backends, require_api_key)) + + logger.info("llmux started successfully") + + +@app.get("/health") +async def health(): + vram_manager = app.state.vram_manager + loaded = vram_manager.get_loaded_models() + return { + "status": "ok", + "loaded_models": { + mid: {"type": slot.model_type, "vram_gb": slot.vram_gb} + for mid, slot in loaded.items() + }, + "available_vram_gb": round(vram_manager.available_vram_gb, 1), + } +``` + +- [ ] **Step 2: Fix backend routing in chat.py** + +The chat router currently looks up backends by `physical.backend` which is `"transformers"` for both LLM and ASR. We need to route ASR models to `transformers_asr`. Update `create_chat_router` in `llmux/llmux/routes/chat.py` to resolve the backend key: + +Replace the line: +```python + backend = backends.get(physical.backend) +``` +with: +```python + backend_key = physical.backend + if backend_key == "transformers" and physical.type == "asr": + backend_key = "transformers_asr" + backend = backends.get(backend_key) +``` + +Apply the same fix in `llmux/llmux/routes/transcription.py` and `llmux/llmux/routes/speech.py`. 
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add llmux/llmux/main.py llmux/llmux/routes/chat.py \
+    llmux/llmux/routes/transcription.py llmux/llmux/routes/speech.py
+git commit -m "feat: FastAPI app assembly with all routes and backend wiring"
+```
+
+---
+
+### Task 17: Dockerfile
+
+**Files:**
+- Create: `llmux/Dockerfile`
+
+- [ ] **Step 1: Create the Dockerfile**
+
+`llmux/Dockerfile`:
+
+```dockerfile
+FROM pytorch/pytorch:2.11.0-cuda12.8-cudnn9-runtime
+
+# System dependencies for audio processing
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libsndfile1 \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+COPY requirements.txt /tmp/requirements.txt
+RUN pip install --no-cache-dir -r /tmp/requirements.txt && rm /tmp/requirements.txt
+
+# llama-cpp-python needs CUDA build.
+# NOTE: the version specifier must be quoted — an unquoted >= is a shell redirection.
+RUN CMAKE_ARGS="-DGGML_CUDA=on" pip install --no-cache-dir --force-reinstall "llama-cpp-python>=0.3.0"
+
+# Copy application code
+COPY llmux/ /app/llmux/
+WORKDIR /app
+
+# Run the server
+EXPOSE 8081
+CMD ["uvicorn", "llmux.main:app", "--host", "0.0.0.0", "--port", "8081"]
+```
+
+- [ ] **Step 2: Commit**
+
+```bash
+git add llmux/Dockerfile
+git commit -m "feat: Dockerfile with PyTorch CUDA 12.8, audio deps, and CUDA llama-cpp"
+```
+
+---
+
+### Task 18: Model Download Script
+
+**Files:**
+- Create: `llmux/scripts/download_models.sh`
+
+- [ ] **Step 1: Create the download script**
+
+`llmux/scripts/download_models.sh`:
+
+```bash
+#!/bin/bash
+# Download all model weights for llmux.
+
+# Run as user llm: bash scripts/download_models.sh
+# Requires: pip install huggingface_hub
+# Requires: HuggingFace token at ~/.cache/huggingface/token for gated models
+
+set -euo pipefail
+
+MODELS_DIR="${LLMUX_MODELS_DIR:-$HOME/.local/share/llmux_pod/models}"
+mkdir -p "$MODELS_DIR"
+
+echo "=== Downloading models to $MODELS_DIR ==="
+
+# Helper: download HF model if not already present
+download_hf() {
+  local repo="$1"
+  local target="$MODELS_DIR/models--${repo//\//--}"  # HF cache layout replaces "/" with "--"
+  if [ -d "$target" ]; then
+    echo "SKIP: $repo (already downloaded)"
+    return
+  fi
+  echo "Downloading: $repo"
+  huggingface-cli download "$repo" --cache-dir "$MODELS_DIR"
+}
+
+# Helper: download specific files from HF repo
+download_hf_files() {
+  local repo="$1"
+  shift
+  echo "Downloading specific files from: $repo"
+  huggingface-cli download "$repo" "$@" --cache-dir "$MODELS_DIR"
+}
+
+# 1. Qwen3.5-9B-FP8
+download_hf "lovedheart/Qwen3.5-9B-FP8"
+
+# 2. Qwen3.5-9B-FP8-Uncensored (GGUF files only)
+download_hf_files "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive" \
+  "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf" \
+  "mmproj-Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-BF16.gguf"
+
+# 3. Qwen3.5-4B
+download_hf "Qwen/Qwen3.5-4B"
+
+# 4. gpt-oss-20b
+download_hf "openai/gpt-oss-20b"
+
+# 5. gpt-oss-20b-uncensored
+download_hf "aoxo/gpt-oss-20b-uncensored"
+
+# 6. cohere-transcribe (gated — requires accepted terms)
+echo "Downloading: CohereLabs/cohere-transcribe-03-2026 (gated)"
+download_hf "CohereLabs/cohere-transcribe-03-2026" || \
+  echo "WARNING: cohere-transcribe download failed. Have you accepted the terms at https://huggingface.co/CohereLabs/cohere-transcribe-03-2026 ?"
+
+# 7. Chatterbox TTS
+# Chatterbox downloads weights automatically on first load via from_pretrained().
+# We trigger a dry-run download here so weights are cached.
+echo "Downloading: Chatterbox TTS weights (auto-downloaded by library)" +python3 -c " +from chatterbox.tts import ChatterboxTTS +import os +os.environ['CUDA_VISIBLE_DEVICES'] = '' # CPU only for download +print('Downloading Chatterbox default...') +ChatterboxTTS.from_pretrained(device='cpu') +print('Downloading Chatterbox turbo...') +ChatterboxTTS.from_pretrained(device='cpu', variant='turbo') +print('Downloading Chatterbox multilingual...') +ChatterboxTTS.from_pretrained(device='cpu', variant='multilingual') +print('Chatterbox downloads complete.') +" || echo "WARNING: Chatterbox download failed. Check chatterbox-tts installation." + +echo "" +echo "=== Download complete ===" +echo "Models directory: $MODELS_DIR" +du -sh "$MODELS_DIR" +``` + +- [ ] **Step 2: Make executable and commit** + +```bash +chmod +x llmux/scripts/download_models.sh +git add llmux/scripts/download_models.sh +git commit -m "feat: model download script for all 9 physical models" +``` + +--- + +### Task 19: Pod Creation Script + +**Files:** +- Create: `llmux/scripts/create_pod_llmux.sh` + +- [ ] **Step 1: Create the pod creation script** + +`llmux/scripts/create_pod_llmux.sh`: + +```bash +#!/bin/bash +# Create the llmux Podman pod and systemd service. +# Run as user llm: bash scripts/create_pod_llmux.sh +# Prerequisites: +# - Model weights downloaded to ~/.local/share/llmux_pod/models/ +# - Config files in ~/.local/share/llmux_pod/config/ +# - Container image built: podman build -t llmux:latest -f Dockerfile . + +set -euo pipefail + +# --- Variables --- +POD_NAME="llmux_pod" +CTR_NAME="llmux_ctr" +IMAGE="localhost/llmux:latest" +PORT="127.0.0.1:8081:8081" +BIND_DIR="$HOME/.local/share/${POD_NAME}" +USER_SYSTEMD_DIR="$HOME/.config/systemd/user" + +MODELS_DIR="${BIND_DIR}/models" +CONFIG_DIR="${BIND_DIR}/config" + +# --- Sanity checks --- +if [ ! -d "$MODELS_DIR" ]; then + echo "ERROR: Models directory not found: $MODELS_DIR" + echo "Run download_models.sh first." + exit 1 +fi + +if [ ! 
-f "$CONFIG_DIR/models.yaml" ]; then + echo "ERROR: Config not found: $CONFIG_DIR/models.yaml" + exit 1 +fi + +if [ ! -f "$CONFIG_DIR/api_keys.yaml" ]; then + echo "ERROR: Config not found: $CONFIG_DIR/api_keys.yaml" + exit 1 +fi + +# --- Ensure directories --- +mkdir -p "$USER_SYSTEMD_DIR" + +# --- Build image if not present --- +if ! podman image exists "$IMAGE"; then + echo "Building container image..." + SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + podman build -t llmux:latest -f "$SCRIPT_DIR/../Dockerfile" "$SCRIPT_DIR/.." +fi + +# --- Remove old pod if exists --- +podman pod exists "$POD_NAME" && podman pod stop "$POD_NAME" 2>/dev/null || true +podman pod exists "$POD_NAME" && podman pod rm -f "$POD_NAME" 2>/dev/null || true + +# --- Create pod --- +echo "Creating pod $POD_NAME..." +podman pod create \ + --name "$POD_NAME" \ + -p "$PORT" + +# --- Create container --- +echo "Creating container $CTR_NAME..." +podman run -d \ + --name "$CTR_NAME" \ + --pod "$POD_NAME" \ + --device nvidia.com/gpu=all \ + -v "${MODELS_DIR}:/models:ro" \ + -v "${CONFIG_DIR}:/config:ro" \ + -e LLMUX_CONFIG_DIR=/config \ + -e LLMUX_MODELS_DIR=/models \ + "$IMAGE" + +# --- Wait for startup --- +echo "Waiting for llmux to start..." +for i in $(seq 1 30); do + if curl -sf http://127.0.0.1:8081/health > /dev/null 2>&1; then + echo "llmux is healthy!" + break + fi + sleep 2 +done + +# --- Generate systemd units --- +echo "Generating systemd units..." 
+cd "$USER_SYSTEMD_DIR" +podman generate systemd --files --new --name "$POD_NAME" + +# --- Stop the live pod (systemd will manage it) --- +podman pod stop "$POD_NAME" +podman pod rm -f "$POD_NAME" + +# --- Enable systemd service --- +systemctl --user daemon-reload +systemctl --user enable --now "pod-${POD_NAME}.service" + +echo "" +echo "=== llmux pod created and enabled ===" +echo "Service: systemctl --user status pod-${POD_NAME}.service" +echo "Health: curl http://127.0.0.1:8081/health" +echo "Logs: journalctl --user -u pod-${POD_NAME}.service -f" +``` + +- [ ] **Step 2: Make executable and commit** + +```bash +chmod +x llmux/scripts/create_pod_llmux.sh +git add llmux/scripts/create_pod_llmux.sh +git commit -m "feat: Podman pod creation script with systemd integration" +``` + +--- + +### Task 20: Traefik Configuration + +**Files:** +- Create: (written to) `/home/trf/.local/share/traefik_pod/dynamic/llmux.yml` + +- [ ] **Step 1: Create the Traefik dynamic config** + +Write to `/home/trf/.local/share/traefik_pod/dynamic/llmux.yml`: + +```yaml +http: + routers: + llmux: + entryPoints: ["wghttp"] + rule: "Host(`kidirekt.kischdle.com`)" + priority: 100 + service: llmux + + services: + llmux: + loadBalancer: + servers: + - url: "http://10.0.2.2:8081" +``` + +- [ ] **Step 2: Verify Traefik picks up the config** + +Traefik watches the `dynamic/` directory with `watch: true`. Check Traefik logs or dashboard at `127.0.0.1:8085` to confirm the `llmux` router appears. + +- [ ] **Step 3: Commit (in the llmux repo, note the file location)** + +The Traefik config lives outside the llmux repo. Document this in a comment within `create_pod_llmux.sh` and log it. + +```bash +git add -A +git commit -m "docs: note Traefik config location for llmux routing" +``` + +--- + +### Task 21: System Integration — Build and GPU Passthrough + +**Checkpoint: Phase 1 system integration begins. 
Iterate on issues until resolved before proceeding.**
+
+- [ ] **Step 1: Copy config to llm user data dir**
+
+```bash
+# As user tlg (has llmux-design group access). Use install(1) run via root so the
+# source files (owned by tlg) are readable and the copies end up owned by llm.
+sudo -u llm mkdir -p /home/llm/.local/share/llmux_pod/config
+sudo install -o llm -g llm -m 644 llmux/config/models.yaml /home/llm/.local/share/llmux_pod/config/models.yaml
+sudo install -o llm -g llm -m 600 llmux/config/api_keys.yaml /home/llm/.local/share/llmux_pod/config/api_keys.yaml
+```
+
+- [ ] **Step 2: Copy HuggingFace token to llm user**
+
+```bash
+sudo -u llm mkdir -p /home/llm/.cache/huggingface
+# cp run as llm cannot read tlg's 600-mode token; install via root and chown instead.
+sudo install -o llm -g llm -m 600 /home/tlg/.cache/huggingface/token /home/llm/.cache/huggingface/token
+```
+
+- [ ] **Step 3: Build the container image**
+
+```bash
+cd llmux
+podman build -t llmux:latest -f Dockerfile .
+```
+
+Expected: Image builds successfully. If dependencies fail, fix Dockerfile and rebuild.
+
+- [ ] **Step 4: Test GPU passthrough**
+
+```bash
+podman run --rm --device nvidia.com/gpu=all llmux:latest nvidia-smi
+```
+
+Expected: Shows RTX 5070 Ti inside the container. If CDI doesn't work, try `--device nvidia.com/gpu=0` or check NVIDIA container toolkit setup.
+
+- [ ] **Step 5: Test model mount**
+
+```bash
+podman run --rm \
+  -v /home/llm/.local/share/llmux_pod/models:/models:ro \
+  llmux:latest \
+  ls /models
+```
+
+Expected: Lists model directories. If empty, models haven't been downloaded yet — run `download_models.sh` first.
+
+- [ ] **Step 6: Commit any fixes**
+
+```bash
+git add -A
+git commit -m "fix: system integration fixes for container build and GPU passthrough"
+```
+
+---
+
+### Task 22: System Integration — Service Startup and Open WebUI
+
+- [ ] **Step 1: Start llmux manually for testing**
+
+```bash
+podman pod create --name llmux_pod -p 127.0.0.1:8081:8081
+podman run -d --name llmux_ctr --pod llmux_pod \
+  --device nvidia.com/gpu=all \
+  -v /home/llm/.local/share/llmux_pod/models:/models:ro \
+  -v /home/llm/.local/share/llmux_pod/config:/config:ro \
+  -e LLMUX_CONFIG_DIR=/config \
+  -e LLMUX_MODELS_DIR=/models \
+  llmux:latest
+```
+
+- [ ] **Step 2: Verify health endpoint**
+
+```bash
+curl http://127.0.0.1:8081/health
+```
+
+Expected: `{"status":"ok","loaded_models":{},"available_vram_gb":16.0}`
+
+- [ ] **Step 3: Verify model listing with auth**
+
+```bash
+API_KEY=$(grep 'openwebui' /home/llm/.local/share/llmux_pod/config/api_keys.yaml | awk '{print $2}' | tr -d '"')
+curl -H "Authorization: Bearer $API_KEY" http://127.0.0.1:8081/v1/models | python3 -m json.tool
+```
+
+Expected: JSON with 16 models listed.
+
+- [ ] **Step 4: Configure Open WebUI via API**
+
+```bash
+# Login to get JWT token. Export OPENWEBUI_ADMIN_PW in your shell first — never commit the password.
+TOKEN=$(curl -s http://127.0.0.1:8080/api/v1/auths/signin \
+  -H "Content-Type: application/json" \
+  -d "{\"email\":\"Thomas.Langer@destengs.com\",\"password\":\"$OPENWEBUI_ADMIN_PW\"}" \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['token'])")
+
+# Configure OpenAI connection
+API_KEY=$(grep 'openwebui' /home/llm/.local/share/llmux_pod/config/api_keys.yaml | awk '{print $2}' | tr -d '"')
+
+curl -X POST http://127.0.0.1:8080/api/v1/configs \
+  -H "Authorization: Bearer $TOKEN" \
+  -H "Content-Type: application/json" \
+  -d "{
+    \"OPENAI_API_BASE_URL\": \"http://127.0.0.1:8081/v1\",
+    \"OPENAI_API_KEY\": \"$API_KEY\"
+  }"
+```
+
+Note: The exact Open WebUI API endpoints for configuring connections and audio may differ by version. 
Check the Open WebUI v0.8.12 API docs and adjust. The key settings to configure:
+- OpenAI API base URL → `http://127.0.0.1:8081/v1`
+- OpenAI API key → the generated key
+- STT engine → openai, base URL → `http://127.0.0.1:8081/v1`
+- TTS engine → openai, base URL → `http://127.0.0.1:8081/v1`
+
+- [ ] **Step 5: Verify models appear in Open WebUI**
+
+Open `http://127.0.0.1:8080` in a browser, log in as user "try" (destengs@gmx.com — password is in the team secret store; do not record it in this repo), and verify the model dropdown shows the 16 virtual models.
+
+- [ ] **Step 6: Cleanup test pod and deploy via script**
+
+```bash
+podman pod stop llmux_pod && podman pod rm -f llmux_pod
+# Now run the real deployment script as user llm:
+sudo -u llm bash /home/llm/bin/create_pod_llmux.sh
+```
+
+- [ ] **Step 7: Verify systemd lifecycle**
+
+```bash
+sudo -u llm systemctl --user status pod-llmux_pod.service
+sudo -u llm systemctl --user restart pod-llmux_pod.service
+curl http://127.0.0.1:8081/health
+```
+
+Expected: Service running and healthy after restart.
+
+- [ ] **Step 8: Commit any fixes**
+
+```bash
+git add -A
+git commit -m "fix: system integration fixes for service startup and Open WebUI connection"
+```
+
+---
+
+### Task 23: Download Models
+
+**This task takes several hours due to ~60GB of downloads.**
+
+- [ ] **Step 1: Run the download script**
+
+```bash
+sudo -u llm bash llmux/scripts/download_models.sh  # ensure user llm can read this path, or copy the script to /home/llm first
+```
+
+Expected: All models download successfully. cohere-transcribe requires accepted terms and token. Chatterbox downloads via Python.
+
+- [ ] **Step 2: Verify all models are present**
+
+```bash
+ls -la /home/llm/.local/share/llmux_pod/models/
+du -sh /home/llm/.local/share/llmux_pod/models/
+```
+
+Expected: ~60GB of model weights.
+
+---
+
+### Task 24: Functional Tests — Chat Inference
+
+**Checkpoint: Phase 2 functional tests. 
Test each model via Open WebUI and curl.**
+
+- [ ] **Step 1: Test Qwen3.5-4B-Thinking via curl**
+
+```bash
+API_KEY=$(grep 'openwebui' /home/llm/.local/share/llmux_pod/config/api_keys.yaml | awk '{print $2}' | tr -d '"')
+curl -X POST http://127.0.0.1:8081/v1/chat/completions \
+  -H "Authorization: Bearer $API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Qwen3.5-4B-Thinking",
+    "messages": [{"role": "user", "content": "What is 2+2? Think step by step."}],
+    "stream": false
+  }' | python3 -m json.tool
+```
+
+Expected: Response with thinking/reasoning visible in the output.
+
+- [ ] **Step 2: Test Qwen3.5-4B-Instruct**
+
+Same as above but with `"model": "Qwen3.5-4B-Instruct"`. Expected: Direct response without thinking.
+
+- [ ] **Step 3: Test each remaining LLM model**
+
+Repeat curl tests for:
+- Qwen3.5-9B-FP8-Thinking / Instruct
+- Qwen3.5-9B-FP8-Uncensored-Thinking / Instruct
+- GPT-OSS-20B-Low / Medium / High
+- GPT-OSS-20B-Uncensored-Low / Medium / High
+
+Verify each returns a reasonable response.
+
+- [ ] **Step 4: Test streaming**
+
+```bash
+curl -X POST http://127.0.0.1:8081/v1/chat/completions \
+  -H "Authorization: Bearer $API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Qwen3.5-4B-Instruct",
+    "messages": [{"role": "user", "content": "Count from 1 to 10."}],
+    "stream": true
+  }'
+```
+
+Expected: SSE stream with `data: {...}` chunks arriving incrementally.
+
+- [ ] **Step 5: Test in Open WebUI**
+
+Log in as user "try" at `http://127.0.0.1:8080`. Select each model from the dropdown and send a test message. Verify responses stream in the UI.
+
+---
+
+### Task 25: Functional Tests — Vision and Tools
+
+- [ ] **Step 1: Test vision with Qwen3.5-4B**
+
+In Open WebUI as user "try", select Qwen3.5-4B-Instruct, attach an image, and ask "What is in this image?". Verify the model describes the image content.
+
+Repeat for Qwen3.5-9B-FP8-Instruct and Qwen3.5-9B-FP8-Uncensored-Instruct. 
+ +- [ ] **Step 2: Test tool usage via curl** + +```bash +curl -X POST http://127.0.0.1:8081/v1/chat/completions \ + -H "Authorization: Bearer $API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen3.5-9B-FP8-Instruct", + "messages": [{"role": "user", "content": "What is the weather in Berlin?"}], + "tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current weather for a city", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string", "description": "City name"} + }, + "required": ["city"] + } + } + } + ] + }' | python3 -m json.tool +``` + +Expected: Response contains a `tool_calls` entry requesting `get_weather` with `city: "Berlin"`. + +Repeat for Qwen3.5-9B-FP8-Uncensored-Instruct (llama-cpp-python), GPT-OSS-20B-Medium, and GPT-OSS-20B-Uncensored-Medium. + +--- + +### Task 26: Functional Tests — ASR and TTS + +- [ ] **Step 1: Test ASR via curl** + +```bash +# Record a short WAV or use an existing audio file +curl -X POST http://127.0.0.1:8081/v1/audio/transcriptions \ + -H "Authorization: Bearer $API_KEY" \ + -F "file=@test_audio.wav" \ + -F "model=cohere-transcribe" \ + -F "language=en" +``` + +Expected: `{"text": "...transcribed text..."}` + +- [ ] **Step 2: Test TTS via curl** + +```bash +curl -X POST http://127.0.0.1:8081/v1/audio/speech \ + -H "Authorization: Bearer $API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"model": "Chatterbox-Multilingual", "input": "Hello, this is a test.", "voice": "default"}' \ + --output test_output.wav + +# Play the audio +aplay test_output.wav # or ffplay test_output.wav +``` + +Expected: Audible speech output. + +- [ ] **Step 3: Test ASR and TTS in Open WebUI** + +Log in as user "try". Use the dictation button (microphone icon) to record speech. Verify it appears as text. Use audio playback on a response to hear TTS output. 
+ +- [ ] **Step 4: Test German ASR** + +```bash +curl -X POST http://127.0.0.1:8081/v1/audio/transcriptions \ + -H "Authorization: Bearer $API_KEY" \ + -F "file=@test_german.wav" \ + -F "model=cohere-transcribe" \ + -F "language=de" +``` + +Expected: German transcription. + +--- + +### Task 27: VRAM Management Tests + +**Checkpoint: Phase 3 VRAM management tests.** + +- [ ] **Step 1: Test small LLM — ASR + TTS remain loaded** + +```bash +# Load ASR +curl -X POST http://127.0.0.1:8081/v1/audio/transcriptions \ + -H "Authorization: Bearer $API_KEY" \ + -F "file=@test_audio.wav" -F "model=cohere-transcribe" -F "language=en" + +# Load TTS +curl -X POST http://127.0.0.1:8081/v1/audio/speech \ + -H "Authorization: Bearer $API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"model": "Chatterbox-Multilingual", "input": "Test", "voice": "default"}' --output /dev/null + +# Load small LLM +curl -X POST http://127.0.0.1:8081/v1/chat/completions \ + -H "Authorization: Bearer $API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"model": "Qwen3.5-4B-Instruct", "messages": [{"role":"user","content":"Hi"}]}' + +# Check health — all three should be loaded +curl http://127.0.0.1:8081/health | python3 -m json.tool +``` + +Expected: `loaded_models` contains cohere-transcribe, chatterbox-multilingual, and qwen3.5-4b. + +- [ ] **Step 2: Test medium LLM — ASR + TTS remain loaded** + +```bash +curl -X POST http://127.0.0.1:8081/v1/chat/completions \ + -H "Authorization: Bearer $API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"model": "Qwen3.5-9B-FP8-Instruct", "messages": [{"role":"user","content":"Hi"}]}' + +curl http://127.0.0.1:8081/health | python3 -m json.tool +``` + +Expected: `loaded_models` contains cohere-transcribe, chatterbox-multilingual, and qwen3.5-9b-fp8 (~15GB total). 
+ +- [ ] **Step 3: Test large LLM — evicts ASR and TTS** + +```bash +curl -X POST http://127.0.0.1:8081/v1/chat/completions \ + -H "Authorization: Bearer $API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"model": "GPT-OSS-20B-High", "messages": [{"role":"user","content":"Hi"}]}' + +curl http://127.0.0.1:8081/health | python3 -m json.tool +``` + +Expected: Only gpt-oss-20b loaded (~13GB). ASR and TTS evicted. + +- [ ] **Step 4: Test ASR request evicts LLM first** + +```bash +# With gpt-oss-20b still loaded, request ASR +curl -X POST http://127.0.0.1:8081/v1/audio/transcriptions \ + -H "Authorization: Bearer $API_KEY" \ + -F "file=@test_audio.wav" -F "model=cohere-transcribe" -F "language=en" + +curl http://127.0.0.1:8081/health | python3 -m json.tool +``` + +Expected: gpt-oss-20b evicted, cohere-transcribe loaded. + +- [ ] **Step 5: Test model swapping** + +```bash +# Load one LLM +curl -X POST http://127.0.0.1:8081/v1/chat/completions \ + -H "Authorization: Bearer $API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"model": "Qwen3.5-4B-Instruct", "messages": [{"role":"user","content":"Hi"}]}' + +# Switch to another +curl -X POST http://127.0.0.1:8081/v1/chat/completions \ + -H "Authorization: Bearer $API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"model": "Qwen3.5-9B-FP8-Instruct", "messages": [{"role":"user","content":"Hi"}]}' + +curl http://127.0.0.1:8081/health | python3 -m json.tool +``` + +Expected: Only qwen3.5-9b-fp8 loaded (qwen3.5-4b evicted). 
+ +--- + +### Task 28: Performance Tests + +**Checkpoint: Phase 4 performance tests.** + +- [ ] **Step 1: Test transformers LLM GPU vs CPU** + +```bash +for model in qwen3.5-4b qwen3.5-9b-fp8 gpt-oss-20b gpt-oss-20b-uncensored; do + echo "=== Testing $model ===" + curl -X POST http://127.0.0.1:8081/admin/test/performance \ + -H "Authorization: Bearer $API_KEY" \ + -H "Content-Type: application/json" \ + -d "{\"physical_model_id\": \"$model\"}" | python3 -m json.tool +done +``` + +Expected: Each model shows `"pass": true` with GPU at least 5x faster than CPU. + +- [ ] **Step 2: Test ASR GPU vs CPU** + +```bash +curl -X POST http://127.0.0.1:8081/admin/test/performance \ + -H "Authorization: Bearer $API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"physical_model_id": "cohere-transcribe"}' | python3 -m json.tool +``` + +Expected: `"pass": true` + +- [ ] **Step 3: Test llama-cpp-python GPU vs CPU** + +```bash +curl -X POST http://127.0.0.1:8081/admin/test/performance \ + -H "Authorization: Bearer $API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"physical_model_id": "qwen3.5-9b-fp8-uncensored"}' | python3 -m json.tool +``` + +Expected: `"pass": true` + +- [ ] **Step 4: Test Chatterbox performance** + +```bash +for model in chatterbox-turbo chatterbox-multilingual chatterbox; do + echo "=== Testing $model ===" + curl -X POST http://127.0.0.1:8081/admin/test/performance \ + -H "Authorization: Bearer $API_KEY" \ + -H "Content-Type: application/json" \ + -d "{\"physical_model_id\": \"$model\"}" | python3 -m json.tool +done +``` + +Expected: `realtime_factor > 1.0` (generates audio faster than real-time). + +--- + +### Task 29: Traefik and Remote Access Test + +- [ ] **Step 1: Test Traefik routing** + +From a machine on the WireGuard VPN, or locally if DNS resolves: + +```bash +curl -H "Authorization: Bearer $API_KEY" https://kidirekt.kischdle.com/v1/models | python3 -m json.tool +``` + +Expected: Same 16 models as localhost. 
If DNS is not yet resolving, test locally:
+
+```bash
+curl -H "Host: kidirekt.kischdle.com" -H "Authorization: Bearer $API_KEY" http://127.0.0.1:8080/v1/models | python3 -m json.tool
+```
+
+(NOTE(review): verify the port of Traefik's `wghttp` entry point — elsewhere in this plan 127.0.0.1:8080 is Open WebUI, so the two cannot both bind it; adjust the port above accordingly.)
+
+- [ ] **Step 2: Test remote Whisper transcription**
+
+```bash
+curl -X POST https://kidirekt.kischdle.com/v1/audio/transcriptions \
+  -H "Authorization: Bearer $WHISPER_KEY" \
+  -F "file=@test_audio.wav" \
+  -F "model=cohere-transcribe" \
+  -F "language=en"
+```
+
+Expected: Transcription returned via remote API.
+
+---
+
+### Task 30: Final Cleanup and Documentation
+
+- [ ] **Step 1: Copy create_pod_llmux.sh to /home/llm/bin/**
+
+```bash
+cp llmux/scripts/create_pod_llmux.sh /home/llm/bin/create_pod_llmux.sh
+chmod +x /home/llm/bin/create_pod_llmux.sh
+```
+
+- [ ] **Step 2: Final commit**
+
+```bash
+git add -A
+git commit -m "feat: llmux v0.1.0 — complete implementation with all models and tests passing"
+```
+
+- [ ] **Step 3: Push to Gitea**
+
+```bash
+git push origin main
+```