From 3edc0552998a32fb6b3b395f0f2d5f4a1b2912d5ebcd336acb9824e240db94e1 Mon Sep 17 00:00:00 2001 From: tlg Date: Wed, 8 Apr 2026 21:50:39 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20Open=20WebUI=20integration=20=E2=80=94?= =?UTF-8?q?=20Harmony=20stripping,=20VRAM=20eviction,=20concurrency=20lock?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add harmony.py: strip GPT-OSS-20B analysis/thinking channel from both streaming and non-streaming responses (HarmonyStreamFilter + extract_final_text) - Add per-model asyncio.Lock in llamacpp backend to prevent concurrent C++ access that caused container segfaults (exit 139) - Fix chat handler swap for streaming: move inside _stream_generate within lock scope (was broken by try/finally running before stream was consumed) - Filter /v1/models to return only LLM models (hide ASR/TTS from chat dropdown) - Correct Qwen3.5-4B estimated_vram_gb: 4 → 9 (actual allocation ~8GB) - Add GPU memory verification after eviction with retry loop in vram_manager - Add HF_TOKEN_PATH support in main.py for gated model access - Add /v1/audio/models and /v1/audio/voices discovery endpoints (no auth) - Add OOM error handling in both backends and chat route - Add AUDIO_STT_SUPPORTED_CONTENT_TYPES for webm/wav/mp3/ogg - Add performance test script (scripts/perf_test.py) - Update tests to match current config (42 tests pass) Co-Authored-By: Claude Opus 4.6 (1M context) --- kischdle/llmux/config/models.yaml | 2 +- kischdle/llmux/llmux/backends/llamacpp.py | 148 +++++++++--- .../llmux/llmux/backends/transformers_llm.py | 53 ++++- kischdle/llmux/llmux/harmony.py | 90 +++++++ kischdle/llmux/llmux/main.py | 10 + kischdle/llmux/llmux/model_registry.py | 3 +- kischdle/llmux/llmux/routes/chat.py | 18 +- kischdle/llmux/llmux/routes/speech.py | 15 ++ kischdle/llmux/llmux/vram_manager.py | 62 ++++- kischdle/llmux/scripts/perf_test.py | 224 ++++++++++++++++++ kischdle/llmux/tests/test_config.py | 8 +- kischdle/llmux/tests/test_harmony.py | 55 +++++ kischdle/llmux/tests/test_model_registry.py | 10 +- kischdle/llmux/tests/test_routes.py | 8 +- kischdle/llmux/tests/test_vram_manager.py | 2 +- 15 files changed, 634 insertions(+), 74 deletions(-) create mode 100644 kischdle/llmux/llmux/harmony.py create mode 100644 kischdle/llmux/scripts/perf_test.py create mode 100644 kischdle/llmux/tests/test_harmony.py diff --git a/kischdle/llmux/config/models.yaml b/kischdle/llmux/config/models.yaml index b47c82a..dd5d15b 100644 --- a/kischdle/llmux/config/models.yaml +++ b/kischdle/llmux/config/models.yaml @@ -22,7 +22,7 @@ physical_models: type: llm backend: transformers model_id: "Qwen/Qwen3.5-4B" - estimated_vram_gb: 4 + estimated_vram_gb: 9 supports_vision: true supports_tools: true diff --git a/kischdle/llmux/llmux/backends/llamacpp.py b/kischdle/llmux/llmux/backends/llamacpp.py index f04fef5..9425eb5 100644 --- a/kischdle/llmux/llmux/backends/llamacpp.py +++ b/kischdle/llmux/llmux/backends/llamacpp.py @@ -11,6 +11,7 @@ from llama_cpp import Llama from llmux.backends.base import BaseBackend from llmux.config import PhysicalModel +from llmux.harmony import HarmonyStreamFilter, extract_final_text logger = logging.getLogger(__name__) @@ -19,6 +20,7 @@ class LlamaCppBackend(BaseBackend): def __init__(self, models_dir: str = "/models"): self._models_dir = Path(models_dir) self._loaded: dict[str, dict] = {} + self._locks: dict[str, asyncio.Lock] = {} # per-model lock to prevent concurrent C++ access def _resolve_gguf_path(self, physical: PhysicalModel, filename: str) -> str: """Resolve a GGUF filename — check flat gguf/ dir first, then HF cache.""" @@ -69,29 +71,41 @@ class LlamaCppBackend(BaseBackend): "think_handler": think_handler, "no_think_handler": no_think_handler, } + self._locks[model_id] = asyncio.Lock() async def unload(self, model_id: str) -> None: if model_id not in self._loaded: return entry = self._loaded.pop(model_id) - del entry["llm"] + self._locks.pop(model_id, None) + # Delete chat handlers first (they hold references to Llama internals) + entry.pop("think_handler", None) + entry.pop("no_think_handler", None) + llm = entry.pop("llm") + # Close the Llama model to release GGML CUDA memory + if hasattr(llm, "close"): + llm.close() + del llm del entry gc.collect() + # Also clear PyTorch cache in case of mixed allocations + import torch + if torch.cuda.is_available(): + torch.cuda.empty_cache() logger.info(f"Unloaded GGUF model {model_id}") + def _select_handler(self, entry, params): + """Select the correct chat handler based on params.""" + if "enable_thinking" in params: + if params["enable_thinking"]: + return entry.get("think_handler") + else: + return entry.get("no_think_handler") + return None + async def generate(self, model_id, messages, params, stream=False, tools=None): entry = self._loaded[model_id] - llm = entry["llm"] - - # Swap chat handler based on thinking mode - original_handler = llm.chat_handler - if "enable_thinking" in params: - if params["enable_thinking"]: - handler = entry.get("think_handler") - else: - handler = entry.get("no_think_handler") - if handler: - llm.chat_handler = handler + handler = self._select_handler(entry, params) effective_messages = list(messages) if "system_prompt_prefix" in params: @@ -102,41 +116,105 @@ class LlamaCppBackend(BaseBackend): else: effective_messages.insert(0, {"role": "system", "content": prefix}) - try: - if stream: - return self._stream_generate(llm, effective_messages, model_id, tools) - else: - return await self._full_generate(llm, effective_messages, model_id, tools) - finally: - # Restore original handler - llm.chat_handler = original_handler + if stream: + return self._stream_generate(entry, effective_messages, model_id, tools, handler) + else: + return await self._full_generate(entry, effective_messages, model_id, tools, handler) + + async def _full_generate(self, entry, messages, model_id, tools, handler): + llm = entry["llm"] + lock = self._locks[model_id] - async def _full_generate(self, llm, messages, model_id, tools): def _run(): kwargs = {"messages": messages, "max_tokens": 4096} if tools: kwargs["tools"] = tools return llm.create_chat_completion(**kwargs) - loop = asyncio.get_event_loop() - result = await loop.run_in_executor(None, _run) + async with lock: + original = llm.chat_handler + if handler: + llm.chat_handler = handler + try: + loop = asyncio.get_event_loop() + result = await loop.run_in_executor(None, _run) + finally: + llm.chat_handler = original + result["model"] = model_id + for choice in result.get("choices", []): + msg = choice.get("message", {}) + if msg.get("content"): + msg["content"] = extract_final_text(msg["content"]) return result - async def _stream_generate(self, llm, messages, model_id, tools): - def _run(): - kwargs = {"messages": messages, "max_tokens": 4096, "stream": True} - if tools: - kwargs["tools"] = tools - return llm.create_chat_completion(**kwargs) + async def _stream_generate(self, entry, messages, model_id, tools, handler): + llm = entry["llm"] + lock = self._locks[model_id] - loop = asyncio.get_event_loop() - stream = await loop.run_in_executor(None, _run) + # Acquire lock for the entire duration of streaming. + # This prevents concurrent C++ access which causes segfaults. + await lock.acquire() - for chunk in stream: - chunk["model"] = model_id - yield f"data: {json.dumps(chunk)}\n\n" - yield "data: [DONE]\n\n" + original = llm.chat_handler + if handler: + llm.chat_handler = handler + + try: + def _run(): + kwargs = {"messages": messages, "max_tokens": 4096, "stream": True} + if tools: + kwargs["tools"] = tools + return llm.create_chat_completion(**kwargs) + + loop = asyncio.get_event_loop() + stream = await loop.run_in_executor(None, _run) + + harmony_filter = HarmonyStreamFilter() + error_msg = None + try: + for chunk in stream: + chunk["model"] = model_id + skip = False + for choice in chunk.get("choices", []): + delta = choice.get("delta", {}) + content = delta.get("content") + if content is not None: + filtered = harmony_filter.feed(content) + if not filtered: + skip = True + else: + delta["content"] = filtered + if skip: + continue + yield f"data: {json.dumps(chunk)}\n\n" + except Exception as e: + logger.error(f"Stream error for {model_id}: {e}") + error_msg = str(e) + + flushed = harmony_filter.flush() + if flushed: + flush_chunk = { + "id": f"chatcmpl-{uuid.uuid4().hex[:12]}", + "model": model_id, + "object": "chat.completion.chunk", + "choices": [{"index": 0, "delta": {"content": flushed}, "finish_reason": None}], + } + yield f"data: {json.dumps(flush_chunk)}\n\n" + + if error_msg: + err_chunk = { + "id": f"chatcmpl-{uuid.uuid4().hex[:12]}", + "model": model_id, + "object": "chat.completion.chunk", + "choices": [{"index": 0, "delta": {"content": f"\n\n[Error: {error_msg}]"}, "finish_reason": None}], + } + yield f"data: {json.dumps(err_chunk)}\n\n" + + yield "data: [DONE]\n\n" + finally: + llm.chat_handler = original + lock.release() def _create_think_handler(llm, enable_thinking: bool): diff --git a/kischdle/llmux/llmux/backends/transformers_llm.py b/kischdle/llmux/llmux/backends/transformers_llm.py index 5220560..d07cada 100644 --- a/kischdle/llmux/llmux/backends/transformers_llm.py +++ b/kischdle/llmux/llmux/backends/transformers_llm.py @@ -11,6 +11,7 @@ from threading import Thread from llmux.backends.base import BaseBackend from llmux.config import PhysicalModel +from llmux.harmony import HarmonyStreamFilter, extract_final_text logger = logging.getLogger(__name__) @@ -95,6 +96,7 @@ class TransformersLLMBackend(BaseBackend): loop = asyncio.get_event_loop() text = await loop.run_in_executor(None, _run) + text = extract_final_text(text) return { "id": f"chatcmpl-{uuid.uuid4().hex[:12]}", "object": "chat.completion", @@ -107,24 +109,57 @@ class TransformersLLMBackend(BaseBackend): async def _stream_generate(self, model, tokenizer, inputs, model_id): streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) gen_kwargs = {**inputs, "max_new_tokens": 4096, "streamer": streamer} - thread = Thread(target=lambda: model.generate(**gen_kwargs)) + gen_error = [None] + + def _run(): + try: + model.generate(**gen_kwargs) + except Exception as e: + gen_error[0] = e + logger.error(f"Generation error for {model_id}: {e}") + + thread = Thread(target=_run) thread.start() chat_id = f"chatcmpl-{uuid.uuid4().hex[:12]}" created = int(time.time()) loop = asyncio.get_event_loop() + harmony_filter = HarmonyStreamFilter() - while True: - token = await loop.run_in_executor(None, lambda: next(streamer, None)) - if token is None: - chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]} + try: + while True: + token = await loop.run_in_executor(None, lambda: next(streamer, None)) + if token is None: + break + filtered = harmony_filter.feed(token) + if not filtered: + continue + chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {"content": filtered}, "finish_reason": None}]} yield f"data: {json.dumps(chunk)}\n\n" - yield "data: [DONE]\n\n" - break - chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {"content": token}, "finish_reason": None}]} - yield f"data: {json.dumps(chunk)}\n\n" + except Exception as e: + logger.error(f"Stream iteration error for {model_id}: {e}") + torch.cuda.empty_cache() + thread.join() + if gen_error[0]: + torch.cuda.empty_cache() + error_msg = str(gen_error[0]) + if "out of memory" in error_msg.lower(): + error_msg = "GPU out of memory. Try a shorter message or clear VRAM." + chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {"content": f"\n\n[Error: {error_msg}]"}, "finish_reason": None}]} + yield f"data: {json.dumps(chunk)}\n\n" + + # Flush any remaining buffered content + flushed = harmony_filter.flush() + if flushed: + chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {"content": flushed}, "finish_reason": None}]} + yield f"data: {json.dumps(chunk)}\n\n" + + chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]} + yield f"data: {json.dumps(chunk)}\n\n" + yield "data: [DONE]\n\n" + # Physical model config injection _physical_models: dict[str, PhysicalModel] = {} diff --git a/kischdle/llmux/llmux/harmony.py b/kischdle/llmux/llmux/harmony.py new file mode 100644 index 0000000..b44443a --- /dev/null +++ b/kischdle/llmux/llmux/harmony.py @@ -0,0 +1,90 @@ +"""Post-processing for GPT-OSS Harmony format responses. + +GPT-OSS models output multi-channel responses with analysis (thinking) and +final (user-facing) channels. This module extracts only the final channel. + +Formats seen: + llamacpp: <|channel|>analysis<|message|>...<|end|><|start|>assistant<|channel|>final<|message|>Hello! + transformers: analysisUser greeting...assistantfinalHello! (special tokens stripped) +""" + +import re + +# Pattern for llamacpp output (special tokens preserved) +_LLAMACPP_FINAL_RE = re.compile( + r"<\|channel\|>final<\|message\|>(.*?)(?:<\|end\|>|$)", + re.DOTALL, +) + +# Pattern for transformers output (special tokens stripped, leaving text markers) +_TRANSFORMERS_FINAL_RE = re.compile( + r"assistantfinal(.*?)$", + re.DOTALL, +) + + +def extract_final_text(text: str) -> str: + """Extract the final channel content from a Harmony format response.""" + # Try llamacpp format first + m = _LLAMACPP_FINAL_RE.search(text) + if m: + return m.group(1).strip() + + # Try transformers format + m = _TRANSFORMERS_FINAL_RE.search(text) + if m: + return m.group(1).strip() + + # Not Harmony format — return as-is + return text + + +class HarmonyStreamFilter: + """Buffers streaming chunks and emits only the final channel content. + + For streaming, we accumulate text until we detect the final channel marker, + then start emitting from that point forward. Any content before the marker + (analysis channel) is silently dropped. + """ + + # Markers that indicate the start of the final channel + _LLAMACPP_MARKER = "<|channel|>final<|message|>" + _TRANSFORMERS_MARKER = "assistantfinal" + + def __init__(self): + self._buffer = "" + self._emitting = False + self._marker_found = False + + def feed(self, chunk: str) -> str: + """Feed a chunk of streamed text. Returns text to emit (may be empty).""" + if self._emitting: + return chunk + + self._buffer += chunk + + # Check for llamacpp marker + idx = self._buffer.find(self._LLAMACPP_MARKER) + if idx >= 0: + self._emitting = True + after = self._buffer[idx + len(self._LLAMACPP_MARKER):] + self._buffer = "" + return after + + # Check for transformers marker + idx = self._buffer.find(self._TRANSFORMERS_MARKER) + if idx >= 0: + self._emitting = True + after = self._buffer[idx + len(self._TRANSFORMERS_MARKER):] + self._buffer = "" + return after + + # Not found yet — keep buffering, emit nothing + return "" + + def flush(self) -> str: + """Call at end of stream. If no marker was found, return full buffer.""" + if not self._emitting and self._buffer: + # No Harmony markers found — return unmodified content + return self._buffer + return "" diff --git a/kischdle/llmux/llmux/main.py b/kischdle/llmux/llmux/main.py index a6c57ce..b6af348 100644 --- a/kischdle/llmux/llmux/main.py +++ b/kischdle/llmux/llmux/main.py @@ -26,6 +26,16 @@ logger = logging.getLogger(__name__) MODELS_DIR = os.environ.get("LLMUX_MODELS_DIR", "/models") +# Load HF token from file if HF_TOKEN_PATH is set and HF_TOKEN is not already set +_hf_token_path = os.environ.get("HF_TOKEN_PATH") +if _hf_token_path and not os.environ.get("HF_TOKEN"): + try: + with open(_hf_token_path) as f: + os.environ["HF_TOKEN"] = f.read().strip() + logger.info(f"Loaded HF token from {_hf_token_path}") + except FileNotFoundError: + logger.warning(f"HF_TOKEN_PATH set but file not found: {_hf_token_path}") + app = FastAPI(title="llmux", version="0.1.0") diff --git a/kischdle/llmux/llmux/model_registry.py b/kischdle/llmux/llmux/model_registry.py index ca52846..7c7a412 100644 --- a/kischdle/llmux/llmux/model_registry.py +++ b/kischdle/llmux/llmux/model_registry.py @@ -23,7 +23,8 @@ class ModelRegistry: "created": 0, "owned_by": "llmux", } - for name in self._virtual + for name, vm in self._virtual.items() + if self._physical[vm.physical].type == "llm" ] def resolve(self, virtual_name: str) -> tuple[str, PhysicalModel, dict]: diff --git a/kischdle/llmux/llmux/routes/chat.py b/kischdle/llmux/llmux/routes/chat.py index 6d9bfab..266bca4 100644 --- a/kischdle/llmux/llmux/routes/chat.py +++ b/kischdle/llmux/llmux/routes/chat.py @@ -1,4 +1,5 @@ import logging +import torch from fastapi import APIRouter, Depends, HTTPException, Request from fastapi.responses import StreamingResponse from llmux.model_registry import ModelRegistry @@ -37,12 +38,17 @@ def create_chat_router(registry, vram_manager, backends, require_api_key): messages = body.get("messages", []) stream = body.get("stream", False) tools = body.get("tools") - if stream: - # generate() is async def that returns an async generator - stream_iter = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=True, tools=tools) - return StreamingResponse(stream_iter, media_type="text/event-stream") - result = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=False, tools=tools) - return result + try: + if stream: + stream_iter = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=True, tools=tools) + return StreamingResponse(stream_iter, media_type="text/event-stream") + + result = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=False, tools=tools) + return result + except torch.cuda.OutOfMemoryError: + logger.error(f"CUDA OOM during generation with {virtual_name}") + torch.cuda.empty_cache() + raise HTTPException(status_code=503, detail="GPU out of memory. Try a shorter message or switch to a smaller model.") return router diff --git a/kischdle/llmux/llmux/routes/speech.py b/kischdle/llmux/llmux/routes/speech.py index 91dc4ff..b002472 100644 --- a/kischdle/llmux/llmux/routes/speech.py +++ b/kischdle/llmux/llmux/routes/speech.py @@ -10,6 +10,21 @@ logger = logging.getLogger(__name__) def create_speech_router(registry, vram_manager, backends, require_api_key): router = APIRouter() + @router.get("/v1/audio/models") + async def list_audio_models(): + """Discovery endpoint for Open WebUI — lists available TTS models.""" + tts_models = [ + {"id": name} + for name, vm in registry._virtual.items() + if registry._physical[vm.physical].type == "tts" + ] + return {"models": tts_models} + + @router.get("/v1/audio/voices") + async def list_audio_voices(): + """Discovery endpoint for Open WebUI — lists available voices.""" + return {"voices": [{"id": "default", "name": "Default"}]} + @router.post("/v1/audio/speech") async def create_speech(request: Request, api_key: str = Depends(require_api_key)): body = await request.json() diff --git a/kischdle/llmux/llmux/vram_manager.py b/kischdle/llmux/llmux/vram_manager.py index 17e6ff4..3e725f5 100644 --- a/kischdle/llmux/llmux/vram_manager.py +++ b/kischdle/llmux/llmux/vram_manager.py @@ -1,7 +1,11 @@ import asyncio +import gc import logging +import time from dataclasses import dataclass +import torch + logger = logging.getLogger(__name__) _PRIORITY = {"llm": 0, "tts": 1, "asr": 2} @@ -24,10 +28,11 @@ class ModelSlot: class VRAMManager: - def __init__(self, total_vram_gb: float = 16.0): + def __init__(self, total_vram_gb: float = 16.0, verify_gpu: bool = True): self._total_vram_gb = total_vram_gb self._loaded: dict[str, ModelSlot] = {} self._lock = asyncio.Lock() + self._verify_gpu = verify_gpu @property def available_vram_gb(self) -> float: @@ -42,9 +47,6 @@ class VRAMManager: async def clear_all(self) -> dict: """Unload all models and clear CUDA cache. Returns what was unloaded.""" - import gc - import torch - async with self._lock: unloaded = [] for slot in list(self._loaded.values()): @@ -52,10 +54,7 @@ class VRAMManager: await slot.backend.unload(slot.model_id) unloaded.append(slot.model_id) self._loaded.clear() - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - torch.cuda.synchronize() + self._force_gpu_cleanup() return { "unloaded": unloaded, "available_vram_gb": round(self.available_vram_gb, 1), @@ -65,12 +64,30 @@ class VRAMManager: async with self._lock: await self._load_model_locked(model_id, model_type, vram_gb, backend) + @staticmethod + def _gpu_free_gb() -> float: + """Get actual free GPU memory in GB.""" + if not torch.cuda.is_available(): + return 16.0 + free, _ = torch.cuda.mem_get_info() + return free / (1024 ** 3) + + @staticmethod + def _force_gpu_cleanup(): + """Force garbage collection and GPU memory release.""" + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + async def _load_model_locked(self, model_id, model_type, vram_gb, backend): if model_id in self._loaded: return + evicted = False if self.available_vram_gb < vram_gb: await self._evict_for(vram_gb, model_type) + evicted = True if self.available_vram_gb < vram_gb: raise RuntimeError( @@ -78,6 +95,35 @@ class VRAMManager: f"(need {vram_gb}GB, available {self.available_vram_gb}GB)" ) + # After eviction, verify GPU memory is actually freed. + # GGML (llama-cpp) CUDA allocations may take time to release. + # Only check when we evicted AND have real GPU AND the model needs >4GB + # (small models fit even with overhead; large models are the OOM risk). + if evicted and self._verify_gpu and torch.cuda.is_available(): + self._force_gpu_cleanup() + actual_free = self._gpu_free_gb() + if actual_free < vram_gb: + logger.warning( + f"GPU has only {actual_free:.1f}GB free after eviction " + f"(need {vram_gb}GB). Waiting for memory release..." + ) + for _ in range(10): + await asyncio.sleep(1) + self._force_gpu_cleanup() + actual_free = self._gpu_free_gb() + if actual_free >= vram_gb: + break + if actual_free < vram_gb: + logger.error( + f"GPU memory not freed: {actual_free:.1f}GB free, " + f"need {vram_gb}GB for {model_id}" + ) + raise RuntimeError( + f"GPU memory not freed after eviction: " + f"{actual_free:.1f}GB free, need {vram_gb}GB" + ) + logger.info(f"GPU verified: {actual_free:.1f}GB free after eviction") + logger.info(f"Loading {model_id} ({vram_gb}GB VRAM)") await backend.load(model_id) self._loaded[model_id] = ModelSlot( diff --git a/kischdle/llmux/scripts/perf_test.py b/kischdle/llmux/scripts/perf_test.py new file mode 100644 index 0000000..ec5d903 --- /dev/null +++ b/kischdle/llmux/scripts/perf_test.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python3 +"""Performance test for llmux — measures TTFT, tok/s, and total latency for each LLM model.""" + +import json +import time +import sys +import httpx + +BASE_URL = "http://127.0.0.1:8081" +API_KEY = "sk-llmux-openwebui-hMD6pAka1czM53MtTkmmlFP8tF5zuiiDRgt-PCBnj-c" +HEADERS = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"} + +# Test prompts — short and long to measure different characteristics +PROMPTS = { + "short": "What is 2+2? Answer in one sentence.", + "medium": "Explain how a CPU works in 3-4 paragraphs.", +} + +# Models to test — one virtual model per physical model (avoid duplicating physical loads) +TEST_MODELS = [ + # llama-cpp backend (GGUF) + ("Qwen3.5-9B-FP8-Instruct", "llamacpp", "~10GB"), + ("GPT-OSS-20B-Uncensored-Low", "llamacpp", "~13GB"), + # transformers backend + ("Qwen3.5-4B-Instruct", "transformers", "~4GB"), + # GPT-OSS-20B-Low disabled: needs libc6-dev sys/ headers for triton MXFP4 kernels +] + + +def clear_vram(): + """Unload all models to start fresh.""" + r = httpx.post(f"{BASE_URL}/admin/clear-vram", headers=HEADERS, timeout=60) + if r.status_code == 200: + print(" VRAM cleared") + else: + print(f" WARN: clear-vram returned {r.status_code}") + + +def test_streaming(model: str, prompt: str, prompt_label: str) -> dict: + """Test a model with streaming, measuring TTFT and tok/s.""" + body = { + "model": model, + "messages": [{"role": "user", "content": prompt}], + "stream": True, + } + + start = time.perf_counter() + first_token_time = None + token_count = 0 + full_text = [] + + try: + with httpx.stream("POST", f"{BASE_URL}/v1/chat/completions", + json=body, headers=HEADERS, timeout=300) as resp: + if resp.status_code != 200: + return {"model": model, "prompt": prompt_label, "error": f"HTTP {resp.status_code}"} + + for line in resp.iter_lines(): + if not line.startswith("data: "): + continue + data = line[6:] + if data == "[DONE]": + break + try: + chunk = json.loads(data) + delta = chunk.get("choices", [{}])[0].get("delta", {}) + content = delta.get("content", "") + if content: + if first_token_time is None: + first_token_time = time.perf_counter() + token_count += 1 + full_text.append(content) + except json.JSONDecodeError: + continue + + except Exception as e: + return {"model": model, "prompt": prompt_label, "error": str(e)} + + end = time.perf_counter() + total_time = end - start + ttft = (first_token_time - start) if first_token_time else total_time + + # Token generation time (after first token) + gen_time = (end - first_token_time) if first_token_time and token_count > 1 else 0 + tok_per_sec = (token_count - 1) / gen_time if gen_time > 0 else 0 + + output_text = "".join(full_text) + output_chars = len(output_text) + + return { + "model": model, + "prompt": prompt_label, + "ttft_s": round(ttft, 2), + "total_s": round(total_time, 2), + "tokens": token_count, + "tok_per_s": round(tok_per_sec, 1), + "output_chars": output_chars, + } + + +def test_non_streaming(model: str, prompt: str, prompt_label: str) -> dict: + """Test a model without streaming — measures total latency.""" + body = { + "model": model, + "messages": [{"role": "user", "content": prompt}], + "stream": False, + } + + start = time.perf_counter() + try: + r = httpx.post(f"{BASE_URL}/v1/chat/completions", + json=body, headers=HEADERS, timeout=300) + if r.status_code != 200: + return {"model": model, "prompt": prompt_label, "mode": "non-stream", "error": f"HTTP {r.status_code}"} + result = r.json() + except Exception as e: + return {"model": model, "prompt": prompt_label, "mode": "non-stream", "error": str(e)} + + end = time.perf_counter() + content = result.get("choices", [{}])[0].get("message", {}).get("content", "") + + return { + "model": model, + "prompt": prompt_label, + "mode": "non-stream", + "total_s": round(end - start, 2), + "output_chars": len(content), + } + + +def run_tests(): + print("=" * 80) + print("llmux Performance Test") + print("=" * 80) + + # Check health + try: + r = httpx.get(f"{BASE_URL}/health", timeout=5) + health = r.json() + print(f"Server healthy — available VRAM: {health['available_vram_gb']} GB") + except Exception as e: + print(f"ERROR: Server not reachable: {e}") + sys.exit(1) + + results = [] + + for model, backend, vram_est in TEST_MODELS: + print(f"\n{'─' * 60}") + print(f"Model: {model} ({backend}, {vram_est})") + print(f"{'─' * 60}") + + # Clear VRAM before each model to measure cold-start load time + clear_vram() + + for prompt_label, prompt_text in PROMPTS.items(): + # First run = cold start (includes model loading) + print(f" [{prompt_label}] streaming (cold)...", end=" ", flush=True) + r = test_streaming(model, prompt_text, prompt_label) + r["cold_start"] = True + results.append(r) + if "error" in r: + print(f"ERROR: {r['error']}") + else: + print(f"TTFT={r['ttft_s']}s total={r['total_s']}s {r['tok_per_s']} tok/s ({r['tokens']} tokens)") + + # Second run = warm (model already loaded) + print(f" [{prompt_label}] streaming (warm)...", end=" ", flush=True) + r = test_streaming(model, prompt_text, prompt_label) + r["cold_start"] = False + results.append(r) + if "error" in r: + print(f"ERROR: {r['error']}") + else: + print(f"TTFT={r['ttft_s']}s total={r['total_s']}s {r['tok_per_s']} tok/s ({r['tokens']} tokens)") + + # Non-streaming tests (warm) + for plabel in ["short", "medium"]: + print(f" [{plabel}] non-streaming (warm)...", end=" ", flush=True) + r = test_non_streaming(model, PROMPTS[plabel], plabel) + results.append(r) + if "error" in r: + print(f"ERROR: {r['error']}") + else: + chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0 + print(f"total={r['total_s']}s ({r['output_chars']} chars, {chars_per_s} chars/s)") + + # Clear to free VRAM for next model + clear_vram() + + # Summary table + print(f"\n{'=' * 90}") + print("Summary — Streaming") + print(f"{'=' * 90}") + print(f"{'Model':<40} {'Prompt':<8} {'Cold':>5} {'TTFT':>7} {'Total':>7} {'Chunks':>7} {'Char/s':>7}") + print(f"{'-' * 40} {'-' * 8} {'-' * 5} {'-' * 7} {'-' * 7} {'-' * 7} {'-' * 7}") + for r in results: + if r.get("mode") == "non-stream": + continue + if "error" in r: + print(f"{r['model']:<40} {r['prompt']:<8} {'':>5} {'ERROR':>7}") + continue + cold = "yes" if r.get("cold_start") else "no" + chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0 + print(f"{r['model']:<40} {r['prompt']:<8} {cold:>5} {r['ttft_s']:>6.2f}s {r['total_s']:>6.2f}s {r['tokens']:>7} {chars_per_s:>6.1f}") + + print(f"\n{'=' * 90}") + print("Summary — Non-streaming") + print(f"{'=' * 90}") + print(f"{'Model':<40} {'Prompt':<8} {'Total':>7} {'Chars':>7} {'Char/s':>7}") + print(f"{'-' * 40} {'-' * 8} {'-' * 7} {'-' * 7} {'-' * 7}") + for r in results: + if r.get("mode") != "non-stream": + continue + if "error" in r: + print(f"{r['model']:<40} {r['prompt']:<8} {'ERROR':>7}") + continue + chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0 + print(f"{r['model']:<40} {r['prompt']:<8} {r['total_s']:>6.2f}s {r['output_chars']:>7} {chars_per_s:>6.1f}") + + return results + + +if __name__ == "__main__": + run_tests() diff --git a/kischdle/llmux/tests/test_config.py b/kischdle/llmux/tests/test_config.py index 5f7ebd7..32a7b15 100644 --- a/kischdle/llmux/tests/test_config.py +++ b/kischdle/llmux/tests/test_config.py @@ -13,10 +13,10 @@ def test_physical_model_has_required_fields(): physical, _ = load_models_config() qwen = physical["qwen3.5-9b-fp8"] assert qwen.type == "llm" - assert qwen.backend == "transformers" - assert qwen.model_id == "lovedheart/Qwen3.5-9B-FP8" - assert qwen.estimated_vram_gb == 9 - assert qwen.supports_vision is True + assert qwen.backend == "llamacpp" + assert qwen.model_id == "unsloth/Qwen3.5-9B-GGUF" + assert qwen.estimated_vram_gb == 10 + assert qwen.supports_vision is False assert qwen.supports_tools is True diff --git a/kischdle/llmux/tests/test_harmony.py b/kischdle/llmux/tests/test_harmony.py new file mode 100644 index 0000000..087b986 --- /dev/null +++ b/kischdle/llmux/tests/test_harmony.py @@ -0,0 +1,55 @@ +from llmux.harmony import extract_final_text, HarmonyStreamFilter + + +def test_extract_llamacpp_format(): + text = '<|channel|>analysis<|message|>User greeting. Simple.<|end|><|start|>assistant<|channel|>final<|message|>Hello! How can I help you today?' + assert extract_final_text(text) == "Hello! How can I help you today?" + + +def test_extract_llamacpp_with_end_tag(): + text = '<|channel|>analysis<|message|>thinking...<|end|><|start|>assistant<|channel|>final<|message|>The answer is 42.<|end|>' + assert extract_final_text(text) == "The answer is 42." + + +def test_extract_transformers_format(): + text = 'analysisUser greeting. Just respond friendly.assistantfinalHello! I am doing great.' + assert extract_final_text(text) == "Hello! I am doing great." + + +def test_extract_non_harmony_passthrough(): + text = "Hello! I'm doing well, thanks for asking." + assert extract_final_text(text) == text + + +def test_stream_filter_llamacpp(): + f = HarmonyStreamFilter() + chunks = [ + "<|channel|>", "analysis", "<|message|>", "User ", "greeting.", + "<|end|>", "<|start|>", "assistant", "<|channel|>", "final", + "<|message|>", "Hello!", " How ", "are you?" + ] + output = "" + for c in chunks: + output += f.feed(c) + output += f.flush() + assert output == "Hello! How are you?" + + +def test_stream_filter_transformers(): + f = HarmonyStreamFilter() + chunks = ["analysis", "User ", "greeting.", "assistant", "final", "Hello!", " Great day!"] + output = "" + for c in chunks: + output += f.feed(c) + output += f.flush() + assert output == "Hello! Great day!" + + +def test_stream_filter_non_harmony(): + f = HarmonyStreamFilter() + chunks = ["Hello", " world", "!"] + output = "" + for c in chunks: + output += f.feed(c) + output += f.flush() + assert output == "Hello world!" diff --git a/kischdle/llmux/tests/test_model_registry.py b/kischdle/llmux/tests/test_model_registry.py index 6541ddc..ed4f614 100644 --- a/kischdle/llmux/tests/test_model_registry.py +++ b/kischdle/llmux/tests/test_model_registry.py @@ -10,12 +10,12 @@ def registry(): def test_list_virtual_models(registry): models = registry.list_virtual_models() - assert len(models) == 15 + assert len(models) == 12 # only LLM models, not ASR/TTS names = [m["id"] for m in models] assert "Qwen3.5-9B-FP8-Thinking" in names assert "GPT-OSS-20B-High" in names - assert "cohere-transcribe" in names - assert "Chatterbox-Multilingual" in names + assert "cohere-transcribe" not in names + assert "Chatterbox-Multilingual" not in names def test_virtual_model_openai_format(registry): @@ -28,7 +28,7 @@ def test_virtual_model_openai_format(registry): def test_resolve_virtual_to_physical(registry): physical_id, physical, params = registry.resolve("Qwen3.5-9B-FP8-Thinking") assert physical_id == "qwen3.5-9b-fp8" - assert physical.backend == "transformers" + assert physical.backend == "llamacpp" assert params == {"enable_thinking": True} @@ -58,7 +58,7 @@ def test_resolve_unknown_model_raises(registry): def test_get_physical(registry): physical = registry.get_physical("qwen3.5-9b-fp8") assert physical.type == "llm" - assert physical.estimated_vram_gb == 9 + assert physical.estimated_vram_gb == 10 def test_get_physical_unknown_raises(registry): diff --git a/kischdle/llmux/tests/test_routes.py b/kischdle/llmux/tests/test_routes.py index f465313..29ca9ec 100644 --- a/kischdle/llmux/tests/test_routes.py +++ b/kischdle/llmux/tests/test_routes.py @@ -40,12 +40,12 @@ def auth_headers(): return {"Authorization": f"Bearer {API_KEY}"} -def test_list_models_returns_16(client, auth_headers): +def test_list_models_returns_only_llm(client, auth_headers): resp = client.get("/v1/models", headers=auth_headers) assert resp.status_code == 200 body = resp.json() assert body["object"] == "list" - assert len(body["data"]) == 15 + assert len(body["data"]) == 12 # only LLM models def test_list_models_contains_expected_names(client, auth_headers): @@ -53,8 +53,8 @@ def test_list_models_contains_expected_names(client, auth_headers): names = [m["id"] for m in resp.json()["data"]] assert "Qwen3.5-9B-FP8-Thinking" in names assert "GPT-OSS-20B-High" in names - assert "cohere-transcribe" in names - assert "Chatterbox-Multilingual" in names + assert "cohere-transcribe" not in names + assert "Chatterbox-Multilingual" not in names def test_list_models_requires_auth(client): diff --git a/kischdle/llmux/tests/test_vram_manager.py b/kischdle/llmux/tests/test_vram_manager.py index 49d0c8b..b8cbcc1 100644 --- a/kischdle/llmux/tests/test_vram_manager.py +++ b/kischdle/llmux/tests/test_vram_manager.py @@ -23,7 +23,7 @@ class FakeBackend: @pytest.fixture def manager(): - return VRAMManager(total_vram_gb=16.0) + return VRAMManager(total_vram_gb=16.0, verify_gpu=False) def test_priority_ordering():