fix: Open WebUI integration — Harmony stripping, VRAM eviction, concurrency lock

- Add harmony.py: strip the GPT-OSS-20B analysis/thinking channel from both streaming and non-streaming responses (HarmonyStreamFilter + extract_final_text)
- Add a per-model asyncio.Lock in the llamacpp backend to prevent concurrent C++ access that caused container segfaults (exit 139)
- Fix chat handler swap for streaming: move it inside _stream_generate within the lock scope (was broken by try/finally running before the stream was consumed)
- Filter /v1/models to return only LLM models (hide ASR/TTS from the chat dropdown)
- Correct Qwen3.5-4B estimated_vram_gb: 4 → 9 (actual allocation ~8GB)
- Add GPU memory verification after eviction, with a retry loop, in vram_manager
- Add HF_TOKEN_PATH support in main.py for gated model access
- Add /v1/audio/models and /v1/audio/voices discovery endpoints (no auth)
- Add OOM error handling in both backends and the chat route
- Add AUDIO_STT_SUPPORTED_CONTENT_TYPES for webm/wav/mp3/ogg
- Add performance test script (scripts/perf_test.py)
- Update tests to match the current config (42 tests pass)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 21:50:39 +02:00
parent 06923d51b4
commit 3edc055299
15 changed files with 634 additions and 74 deletions
--- a/kischdle/llmux/tests/test_config.py
+++ b/kischdle/llmux/tests/test_config.py
@@ -13,10 +13,10 @@ def test_physical_model_has_required_fields():
    physical, _ = load_models_config()
    qwen = physical["qwen3.5-9b-fp8"]
    assert qwen.type == "llm"
-    assert qwen.backend == "transformers"
-    assert qwen.model_id == "lovedheart/Qwen3.5-9B-FP8"
-    assert qwen.estimated_vram_gb == 9
-    assert qwen.supports_vision is True
+    assert qwen.backend == "llamacpp"
+    assert qwen.model_id == "unsloth/Qwen3.5-9B-GGUF"
+    assert qwen.estimated_vram_gb == 10
+    assert qwen.supports_vision is False
    assert qwen.supports_tools is True


--- a/kischdle/llmux/tests/test_harmony.py
+++ b/kischdle/llmux/tests/test_harmony.py
@@ -0,0 +1,55 @@
+from llmux.harmony import extract_final_text, HarmonyStreamFilter
+
+
+def test_extract_llamacpp_format():
+    text = '<|channel|>analysis<|message|>User greeting. Simple.<|end|><|start|>assistant<|channel|>final<|message|>Hello! How can I help you today?'
+    assert extract_final_text(text) == "Hello! How can I help you today?"
+
+
+def test_extract_llamacpp_with_end_tag():
+    text = '<|channel|>analysis<|message|>thinking...<|end|><|start|>assistant<|channel|>final<|message|>The answer is 42.<|end|>'
+    assert extract_final_text(text) == "The answer is 42."
+
+
+def test_extract_transformers_format():
+    text = 'analysisUser greeting. Just respond friendly.assistantfinalHello! I am doing great.'
+    assert extract_final_text(text) == "Hello! I am doing great."
+
+
+def test_extract_non_harmony_passthrough():
+    text = "Hello! I'm doing well, thanks for asking."
+    assert extract_final_text(text) == text
+
+
+def test_stream_filter_llamacpp():
+    f = HarmonyStreamFilter()
+    chunks = [
+        "<|channel|>", "analysis", "<|message|>", "User ", "greeting.",
+        "<|end|>", "<|start|>", "assistant", "<|channel|>", "final",
+        "<|message|>", "Hello!", " How ", "are you?"
+    ]
+    output = ""
+    for c in chunks:
+        output += f.feed(c)
+    output += f.flush()
+    assert output == "Hello! How are you?"
+
+
+def test_stream_filter_transformers():
+    f = HarmonyStreamFilter()
+    chunks = ["analysis", "User ", "greeting.", "assistant", "final", "Hello!", " Great day!"]
+    output = ""
+    for c in chunks:
+        output += f.feed(c)
+    output += f.flush()
+    assert output == "Hello! Great day!"
+
+
+def test_stream_filter_non_harmony():
+    f = HarmonyStreamFilter()
+    chunks = ["Hello", " world", "!"]
+    output = ""
+    for c in chunks:
+        output += f.feed(c)
+    output += f.flush()
+    assert output == "Hello world!"
--- a/kischdle/llmux/tests/test_model_registry.py
+++ b/kischdle/llmux/tests/test_model_registry.py
@@ -10,12 +10,12 @@ def registry():

 def test_list_virtual_models(registry):
    models = registry.list_virtual_models()
-    assert len(models) == 15
+    assert len(models) == 12  # only LLM models, not ASR/TTS
    names = [m["id"] for m in models]
    assert "Qwen3.5-9B-FP8-Thinking" in names
    assert "GPT-OSS-20B-High" in names
-    assert "cohere-transcribe" in names
-    assert "Chatterbox-Multilingual" in names
+    assert "cohere-transcribe" not in names
+    assert "Chatterbox-Multilingual" not in names


 def test_virtual_model_openai_format(registry):
@@ -28,7 +28,7 @@ def test_virtual_model_openai_format(registry):
 def test_resolve_virtual_to_physical(registry):
    physical_id, physical, params = registry.resolve("Qwen3.5-9B-FP8-Thinking")
    assert physical_id == "qwen3.5-9b-fp8"
-    assert physical.backend == "transformers"
+    assert physical.backend == "llamacpp"
    assert params == {"enable_thinking": True}


@@ -58,7 +58,7 @@ def test_resolve_unknown_model_raises(registry):
 def test_get_physical(registry):
    physical = registry.get_physical("qwen3.5-9b-fp8")
    assert physical.type == "llm"
-    assert physical.estimated_vram_gb == 9
+    assert physical.estimated_vram_gb == 10


 def test_get_physical_unknown_raises(registry):
--- a/kischdle/llmux/tests/test_routes.py
+++ b/kischdle/llmux/tests/test_routes.py
@@ -40,12 +40,12 @@ def auth_headers():
    return {"Authorization": f"Bearer {API_KEY}"}


-def test_list_models_returns_16(client, auth_headers):
+def test_list_models_returns_only_llm(client, auth_headers):
    resp = client.get("/v1/models", headers=auth_headers)
    assert resp.status_code == 200
    body = resp.json()
    assert body["object"] == "list"
-    assert len(body["data"]) == 15
+    assert len(body["data"]) == 12  # only LLM models


 def test_list_models_contains_expected_names(client, auth_headers):
@@ -53,8 +53,8 @@ def test_list_models_contains_expected_names(client, auth_headers):
    names = [m["id"] for m in resp.json()["data"]]
    assert "Qwen3.5-9B-FP8-Thinking" in names
    assert "GPT-OSS-20B-High" in names
-    assert "cohere-transcribe" in names
-    assert "Chatterbox-Multilingual" in names
+    assert "cohere-transcribe" not in names
+    assert "Chatterbox-Multilingual" not in names


 def test_list_models_requires_auth(client):
--- a/kischdle/llmux/tests/test_vram_manager.py
+++ b/kischdle/llmux/tests/test_vram_manager.py
@@ -23,7 +23,7 @@ class FakeBackend:

@pytest.fixture
 def manager():
-    return VRAMManager(total_vram_gb=16.0)
+    return VRAMManager(total_vram_gb=16.0, verify_gpu=False)


 def test_priority_ordering():