Files
DesTEngSsv006_swd/kischdle/llmux/scripts/perf_test.py
tlg 3edc055299 fix: Open WebUI integration — Harmony stripping, VRAM eviction, concurrency lock
- Add harmony.py: strip GPT-OSS-20B analysis/thinking channel from both
  streaming and non-streaming responses (HarmonyStreamFilter + extract_final_text)
- Add per-model asyncio.Lock in llamacpp backend to prevent concurrent C++
  access that caused container segfaults (exit 139)
- Fix chat handler swap for streaming: move inside _stream_generate within
  lock scope (was broken by try/finally running before stream was consumed)
- Filter /v1/models to return only LLM models (hide ASR/TTS from chat dropdown)
- Correct Qwen3.5-4B estimated_vram_gb: 4 → 9 (actual allocation ~8GB)
- Add GPU memory verification after eviction with retry loop in vram_manager
- Add HF_TOKEN_PATH support in main.py for gated model access
- Add /v1/audio/models and /v1/audio/voices discovery endpoints (no auth)
- Add OOM error handling in both backends and chat route
- Add AUDIO_STT_SUPPORTED_CONTENT_TYPES for webm/wav/mp3/ogg
- Add performance test script (scripts/perf_test.py)
- Update tests to match current config (42 tests pass)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 21:50:39 +02:00

225 lines
8.1 KiB
Python

#!/usr/bin/env python3
"""Performance test for llmux — measures TTFT, tok/s, and total latency for each LLM model."""
import json
import os
import time
import sys

import httpx

BASE_URL = "http://127.0.0.1:8081"
# SECURITY: prefer the LLMUX_API_KEY environment variable; the hard-coded
# fallback keeps the script runnable in the original dev setup, but a secret
# committed to source control should be rotated if this repo is ever shared.
API_KEY = os.environ.get(
    "LLMUX_API_KEY",
    "sk-llmux-openwebui-hMD6pAka1czM53MtTkmmlFP8tF5zuiiDRgt-PCBnj-c",
)
HEADERS = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}

# Test prompts — short and long to measure different characteristics
PROMPTS = {
    "short": "What is 2+2? Answer in one sentence.",
    "medium": "Explain how a CPU works in 3-4 paragraphs.",
}

# Models to test — one virtual model per physical model (avoid duplicating physical loads)
# Each entry: (virtual model name, backend, rough VRAM estimate for display only)
TEST_MODELS = [
    # llama-cpp backend (GGUF)
    ("Qwen3.5-9B-FP8-Instruct", "llamacpp", "~10GB"),
    ("GPT-OSS-20B-Uncensored-Low", "llamacpp", "~13GB"),
    # transformers backend
    ("Qwen3.5-4B-Instruct", "transformers", "~4GB"),
    # GPT-OSS-20B-Low disabled: needs libc6-dev sys/ headers for triton MXFP4 kernels
]
def clear_vram():
    """Ask the server to unload every model so the next test starts cold."""
    resp = httpx.post(f"{BASE_URL}/admin/clear-vram", headers=HEADERS, timeout=60)
    if resp.status_code != 200:
        # Non-fatal: a failed clear only skews cold-start numbers.
        print(f" WARN: clear-vram returned {resp.status_code}")
    else:
        print(" VRAM cleared")
def test_streaming(model: str, prompt: str, prompt_label: str) -> dict:
    """Test a model with streaming, measuring TTFT and tok/s.

    Returns a metrics dict on success; on failure the dict carries an
    "error" key instead of the timing fields.

    NOTE: "tokens" counts SSE content chunks, not true model tokens — the
    summary table labels this column "Chunks" accordingly.
    """
    body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
    }
    start = time.perf_counter()
    first_token_time = None
    token_count = 0
    full_text = []
    try:
        with httpx.stream("POST", f"{BASE_URL}/v1/chat/completions",
                          json=body, headers=HEADERS, timeout=300) as resp:
            if resp.status_code != 200:
                return {"model": model, "prompt": prompt_label, "error": f"HTTP {resp.status_code}"}
            for line in resp.iter_lines():
                # SSE data frames start with "data: "; skip keep-alives/comments.
                if not line.startswith("data: "):
                    continue
                data = line[6:]
                if data == "[DONE]":
                    break
                try:
                    chunk = json.loads(data)
                    delta = chunk.get("choices", [{}])[0].get("delta", {})
                    content = delta.get("content", "")
                    if content:
                        if first_token_time is None:
                            first_token_time = time.perf_counter()
                        token_count += 1
                        full_text.append(content)
                except json.JSONDecodeError:
                    # Tolerate a malformed frame rather than aborting the run.
                    continue
    except Exception as e:
        return {"model": model, "prompt": prompt_label, "error": str(e)}
    end = time.perf_counter()
    total_time = end - start
    # FIX: explicit None checks — truthiness testing a perf_counter() value
    # works by accident (it is never 0.0 in practice) but hides the intent.
    ttft = (first_token_time - start) if first_token_time is not None else total_time
    # Token generation time (after first token); tok/s excludes the first
    # chunk so TTFT (which includes model load on cold start) doesn't skew it.
    if first_token_time is not None and token_count > 1:
        gen_time = end - first_token_time
    else:
        gen_time = 0
    tok_per_sec = (token_count - 1) / gen_time if gen_time > 0 else 0
    output_text = "".join(full_text)
    output_chars = len(output_text)
    return {
        "model": model,
        "prompt": prompt_label,
        "ttft_s": round(ttft, 2),
        "total_s": round(total_time, 2),
        "tokens": token_count,
        "tok_per_s": round(tok_per_sec, 1),
        "output_chars": output_chars,
    }
def test_non_streaming(model: str, prompt: str, prompt_label: str) -> dict:
    """Test a model without streaming — measures total latency.

    Returns a metrics dict tagged "mode": "non-stream"; on failure the dict
    carries an "error" key instead of the timing fields.
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
    }
    start = time.perf_counter()
    try:
        response = httpx.post(f"{BASE_URL}/v1/chat/completions",
                              json=payload, headers=HEADERS, timeout=300)
        if response.status_code != 200:
            return {"model": model, "prompt": prompt_label, "mode": "non-stream", "error": f"HTTP {response.status_code}"}
        result = response.json()
    except Exception as exc:
        return {"model": model, "prompt": prompt_label, "mode": "non-stream", "error": str(exc)}
    end = time.perf_counter()
    content = result.get("choices", [{}])[0].get("message", {}).get("content", "")
    return {
        "model": model,
        "prompt": prompt_label,
        "mode": "non-stream",
        "total_s": round(end - start, 2),
        "output_chars": len(content),
    }
def run_tests():
    """Run cold/warm streaming and non-streaming benchmarks for each model.

    For every entry in TEST_MODELS: clears VRAM, runs each prompt streaming
    twice (cold then warm), runs non-streaming warm tests, clears VRAM again,
    then prints two summary tables. Exits with status 1 if the server is
    unreachable. Returns the list of per-run result dicts.
    """
    print("=" * 80)
    print("llmux Performance Test")
    print("=" * 80)
    # Check health before committing to long-running tests.
    try:
        r = httpx.get(f"{BASE_URL}/health", timeout=5)
        health = r.json()
        print(f"Server healthy — available VRAM: {health['available_vram_gb']} GB")
    except Exception as e:
        print(f"ERROR: Server not reachable: {e}")
        sys.exit(1)
    results = []
    for model, backend, vram_est in TEST_MODELS:
        # FIX: the original separators were f"{'' * 60}" — repeating the EMPTY
        # string, which printed blank lines (a garbled box-drawing character,
        # most likely). Use a visible 60-char dashed rule instead.
        print(f"\n{'-' * 60}")
        print(f"Model: {model} ({backend}, {vram_est})")
        print(f"{'-' * 60}")
        # Clear VRAM before each model to measure cold-start load time
        clear_vram()
        for prompt_label, prompt_text in PROMPTS.items():
            # First run = cold start (includes model loading)
            print(f" [{prompt_label}] streaming (cold)...", end=" ", flush=True)
            r = test_streaming(model, prompt_text, prompt_label)
            r["cold_start"] = True
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                print(f"TTFT={r['ttft_s']}s total={r['total_s']}s {r['tok_per_s']} tok/s ({r['tokens']} tokens)")
            # Second run = warm (model already loaded)
            print(f" [{prompt_label}] streaming (warm)...", end=" ", flush=True)
            r = test_streaming(model, prompt_text, prompt_label)
            r["cold_start"] = False
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                print(f"TTFT={r['ttft_s']}s total={r['total_s']}s {r['tok_per_s']} tok/s ({r['tokens']} tokens)")
        # Non-streaming tests (warm)
        for plabel in ["short", "medium"]:
            print(f" [{plabel}] non-streaming (warm)...", end=" ", flush=True)
            r = test_non_streaming(model, PROMPTS[plabel], plabel)
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
                print(f"total={r['total_s']}s ({r['output_chars']} chars, {chars_per_s} chars/s)")
        # Clear to free VRAM for next model
        clear_vram()
    # Summary table — streaming runs first.
    print(f"\n{'=' * 90}")
    print("Summary — Streaming")
    print(f"{'=' * 90}")
    print(f"{'Model':<40} {'Prompt':<8} {'Cold':>5} {'TTFT':>7} {'Total':>7} {'Chunks':>7} {'Char/s':>7}")
    print(f"{'-' * 40} {'-' * 8} {'-' * 5} {'-' * 7} {'-' * 7} {'-' * 7} {'-' * 7}")
    for r in results:
        if r.get("mode") == "non-stream":
            continue
        if "error" in r:
            print(f"{r['model']:<40} {r['prompt']:<8} {'':>5} {'ERROR':>7}")
            continue
        cold = "yes" if r.get("cold_start") else "no"
        chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
        print(f"{r['model']:<40} {r['prompt']:<8} {cold:>5} {r['ttft_s']:>6.2f}s {r['total_s']:>6.2f}s {r['tokens']:>7} {chars_per_s:>6.1f}")
    print(f"\n{'=' * 90}")
    print("Summary — Non-streaming")
    print(f"{'=' * 90}")
    print(f"{'Model':<40} {'Prompt':<8} {'Total':>7} {'Chars':>7} {'Char/s':>7}")
    print(f"{'-' * 40} {'-' * 8} {'-' * 7} {'-' * 7} {'-' * 7}")
    for r in results:
        if r.get("mode") != "non-stream":
            continue
        if "error" in r:
            print(f"{r['model']:<40} {r['prompt']:<8} {'ERROR':>7}")
            continue
        chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
        print(f"{r['model']:<40} {r['prompt']:<8} {r['total_s']:>6.2f}s {r['output_chars']:>7} {chars_per_s:>6.1f}")
    return results
# Entry point: run the full benchmark suite when executed as a script.
if __name__ == "__main__":
    run_tests()