#!/usr/bin/env python3
"""Performance test for llmux — measures TTFT, tok/s, and total latency for each LLM model."""

import json
import os
import sys
import time

import httpx

# Endpoint and credentials can be overridden from the environment so the
# script can target another instance without editing the source.
BASE_URL = os.environ.get("LLMUX_BASE_URL") or "http://127.0.0.1:8081"
# NOTE(security): the fallback key below is committed in source. It is kept
# only so existing invocations keep working; prefer setting LLMUX_API_KEY and
# rotating/removing this literal.
API_KEY = os.environ.get("LLMUX_API_KEY") or "sk-llmux-openwebui-hMD6pAka1czM53MtTkmmlFP8tF5zuiiDRgt-PCBnj-c"
HEADERS = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}

# Test prompts — short and long to measure different characteristics
PROMPTS = {
    "short": "What is 2+2? Answer in one sentence.",
    "medium": "Explain how a CPU works in 3-4 paragraphs.",
}

# Models to test — one virtual model per physical model (avoid duplicating physical loads)
TEST_MODELS = [
    # llama-cpp backend (GGUF)
    ("Qwen3.5-9B-FP8-Instruct", "llamacpp", "~10GB"),
    ("GPT-OSS-20B-Uncensored-Low", "llamacpp", "~13GB"),
    # transformers backend
    ("Qwen3.5-4B-Instruct", "transformers", "~4GB"),
    # GPT-OSS-20B-Low disabled: needs libc6-dev sys/ headers for triton MXFP4 kernels
]


def _first_choice_text(payload, field: str) -> str:
    """Return ``payload["choices"][0][field]["content"]`` or "" when any part is missing.

    Tolerates an empty ``choices`` list and ``null`` values, which would
    otherwise raise and discard an otherwise valid measurement.
    """
    if not isinstance(payload, dict):
        return ""
    choices = payload.get("choices")
    if not isinstance(choices, list) or not choices:
        return ""
    first = choices[0]
    if not isinstance(first, dict):
        return ""
    part = first.get(field)
    if not isinstance(part, dict):
        return ""
    content = part.get("content")
    return content if isinstance(content, str) else ""


def _chars_per_s(result: dict) -> float:
    """Return output characters per second of total latency (0 when latency rounds to 0)."""
    total_s = result["total_s"]
    return round(result["output_chars"] / total_s, 1) if total_s > 0 else 0


def clear_vram():
    """Unload all models to start fresh.

    Best-effort: a non-200 response or a transport error is reported as a
    warning and never raised, so one failed unload does not abort the run.
    """
    try:
        r = httpx.post(f"{BASE_URL}/admin/clear-vram", headers=HEADERS, timeout=60)
    except httpx.HTTPError as e:
        print(f" WARN: clear-vram failed: {e}")
        return
    if r.status_code == 200:
        print(" VRAM cleared")
    else:
        print(f" WARN: clear-vram returned {r.status_code}")


def test_streaming(model: str, prompt: str, prompt_label: str) -> dict:
    """Test a model with streaming, measuring TTFT and tok/s.

    Args:
        model: Model name sent in the request body.
        prompt: User message content.
        prompt_label: Short label copied into the result for reporting.

    Returns:
        On success a dict with ``model``, ``prompt``, ``ttft_s``, ``total_s``,
        ``tokens``, ``tok_per_s`` and ``output_chars``. ``tokens`` counts
        non-empty content chunks received, so ``tok_per_s`` is chunks per
        second after the first chunk. On failure a dict with ``model``,
        ``prompt`` and ``error``.
    """
    body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
    }
    start = time.perf_counter()
    first_token_time = None
    token_count = 0
    full_text = []
    try:
        with httpx.stream(
            "POST",
            f"{BASE_URL}/v1/chat/completions",
            json=body,
            headers=HEADERS,
            timeout=300,
        ) as resp:
            if resp.status_code != 200:
                return {"model": model, "prompt": prompt_label, "error": f"HTTP {resp.status_code}"}
            for line in resp.iter_lines():
                if not line.startswith("data: "):
                    continue
                data = line[6:]
                if data == "[DONE]":
                    break
                try:
                    chunk = json.loads(data)
                except json.JSONDecodeError:
                    continue
                # Chunks without text (role-only deltas, empty ``choices``)
                # are skipped instead of failing the whole measurement.
                content = _first_choice_text(chunk, "delta")
                if not content:
                    continue
                if first_token_time is None:
                    first_token_time = time.perf_counter()
                token_count += 1
                full_text.append(content)
    except Exception as e:
        # Deliberately broad: any failure for one model becomes an error row
        # so the remaining models are still benchmarked.
        return {"model": model, "prompt": prompt_label, "error": str(e)}
    end = time.perf_counter()

    total_time = end - start
    got_token = first_token_time is not None
    ttft = (first_token_time - start) if got_token else total_time
    # Token generation time (after first token)
    gen_time = (end - first_token_time) if got_token and token_count > 1 else 0
    tok_per_sec = (token_count - 1) / gen_time if gen_time > 0 else 0
    output_text = "".join(full_text)

    return {
        "model": model,
        "prompt": prompt_label,
        "ttft_s": round(ttft, 2),
        "total_s": round(total_time, 2),
        "tokens": token_count,
        "tok_per_s": round(tok_per_sec, 1),
        "output_chars": len(output_text),
    }


def test_non_streaming(model: str, prompt: str, prompt_label: str) -> dict:
    """Test a model without streaming — measures total latency.

    Args:
        model: Model name sent in the request body.
        prompt: User message content.
        prompt_label: Short label copied into the result for reporting.

    Returns:
        On success a dict with ``model``, ``prompt``, ``mode`` ("non-stream"),
        ``total_s`` and ``output_chars``. On failure a dict with ``model``,
        ``prompt``, ``mode`` and ``error``.
    """
    body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
    }
    start = time.perf_counter()
    try:
        r = httpx.post(f"{BASE_URL}/v1/chat/completions", json=body, headers=HEADERS, timeout=300)
        if r.status_code != 200:
            return {"model": model, "prompt": prompt_label, "mode": "non-stream", "error": f"HTTP {r.status_code}"}
        result = r.json()
    except Exception as e:
        # Deliberately broad: one failing request becomes an error row.
        return {"model": model, "prompt": prompt_label, "mode": "non-stream", "error": str(e)}
    end = time.perf_counter()

    # Empty ``choices`` or a null ``content`` counts as zero output characters.
    content = _first_choice_text(result, "message")
    return {
        "model": model,
        "prompt": prompt_label,
        "mode": "non-stream",
        "total_s": round(end - start, 2),
        "output_chars": len(content),
    }


def _run_streaming_case(model: str, prompt_text: str, prompt_label: str, cold: bool) -> dict:
    """Run one streaming measurement, tag it as cold or warm, and print its result line."""
    phase = "cold" if cold else "warm"
    print(f" [{prompt_label}] streaming ({phase})...", end=" ", flush=True)
    r = test_streaming(model, prompt_text, prompt_label)
    r["cold_start"] = cold
    if "error" in r:
        print(f"ERROR: {r['error']}")
    else:
        print(f"TTFT={r['ttft_s']}s total={r['total_s']}s {r['tok_per_s']} tok/s ({r['tokens']} tokens)")
    return r


def _print_streaming_summary(results: list) -> None:
    """Print the summary table for streaming results."""
    print(f"\n{'=' * 90}")
    print("Summary — Streaming")
    print(f"{'=' * 90}")
    print(f"{'Model':<40} {'Prompt':<8} {'Cold':>5} {'TTFT':>7} {'Total':>7} {'Chunks':>7} {'Char/s':>7}")
    print(f"{'-' * 40} {'-' * 8} {'-' * 5} {'-' * 7} {'-' * 7} {'-' * 7} {'-' * 7}")
    for r in results:
        if r.get("mode") == "non-stream":
            continue
        if "error" in r:
            print(f"{r['model']:<40} {r['prompt']:<8} {'':>5} {'ERROR':>7}")
            continue
        cold = "yes" if r.get("cold_start") else "no"
        chars_per_s = _chars_per_s(r)
        print(f"{r['model']:<40} {r['prompt']:<8} {cold:>5} {r['ttft_s']:>6.2f}s {r['total_s']:>6.2f}s {r['tokens']:>7} {chars_per_s:>6.1f}")


def _print_non_streaming_summary(results: list) -> None:
    """Print the summary table for non-streaming results."""
    print(f"\n{'=' * 90}")
    print("Summary — Non-streaming")
    print(f"{'=' * 90}")
    print(f"{'Model':<40} {'Prompt':<8} {'Total':>7} {'Chars':>7} {'Char/s':>7}")
    print(f"{'-' * 40} {'-' * 8} {'-' * 7} {'-' * 7} {'-' * 7}")
    for r in results:
        if r.get("mode") != "non-stream":
            continue
        if "error" in r:
            print(f"{r['model']:<40} {r['prompt']:<8} {'ERROR':>7}")
            continue
        chars_per_s = _chars_per_s(r)
        print(f"{r['model']:<40} {r['prompt']:<8} {r['total_s']:>6.2f}s {r['output_chars']:>7} {chars_per_s:>6.1f}")


def run_tests():
    """Benchmark every model in TEST_MODELS and print per-run lines plus summary tables.

    For each model and prompt: one cold streaming run (VRAM cleared
    immediately before), one warm streaming run, then warm non-streaming runs
    for every prompt. Exits with status 1 if the health check fails.

    Returns:
        The list of result dicts produced by test_streaming and
        test_non_streaming, in execution order.
    """
    print("=" * 80)
    print("llmux Performance Test")
    print("=" * 80)

    # Check health
    try:
        r = httpx.get(f"{BASE_URL}/health", timeout=5)
        r.raise_for_status()
        health = r.json()
        print(f"Server healthy — available VRAM: {health['available_vram_gb']} GB")
    except Exception as e:
        print(f"ERROR: Server not reachable: {e}")
        sys.exit(1)

    results = []
    for model, backend, vram_est in TEST_MODELS:
        print(f"\n{'─' * 60}")
        print(f"Model: {model} ({backend}, {vram_est})")
        print(f"{'─' * 60}")

        for prompt_label, prompt_text in PROMPTS.items():
            # Clear VRAM before every cold run: clearing only once per model
            # would leave the model loaded for later prompts, so their "cold"
            # numbers would not include model loading.
            clear_vram()
            # First run = cold start (includes model loading)
            results.append(_run_streaming_case(model, prompt_text, prompt_label, cold=True))
            # Second run = warm (model already loaded)
            results.append(_run_streaming_case(model, prompt_text, prompt_label, cold=False))

        # Non-streaming tests (warm)
        for plabel, ptext in PROMPTS.items():
            print(f" [{plabel}] non-streaming (warm)...", end=" ", flush=True)
            r = test_non_streaming(model, ptext, plabel)
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                print(f"total={r['total_s']}s ({r['output_chars']} chars, {_chars_per_s(r)} chars/s)")

        # Clear to free VRAM for next model
        clear_vram()

    # Summary tables
    _print_streaming_summary(results)
    _print_non_streaming_summary(results)
    return results


if __name__ == "__main__":
    run_tests()