Files
DesTEngSsv006_swd/kischdle/llmux/scripts/perf_test.py
tlg 3edc055299 fix: Open WebUI integration — Harmony stripping, VRAM eviction, concurrency lock
- Add harmony.py: strip GPT-OSS-20B analysis/thinking channel from both
  streaming and non-streaming responses (HarmonyStreamFilter + extract_final_text)
- Add per-model asyncio.Lock in llamacpp backend to prevent concurrent C++
  access that caused container segfaults (exit 139)
- Fix chat handler swap for streaming: move inside _stream_generate within
  lock scope (was broken by try/finally running before stream was consumed)
- Filter /v1/models to return only LLM models (hide ASR/TTS from chat dropdown)
- Correct Qwen3.5-4B estimated_vram_gb: 4 → 9 (actual allocation ~8GB)
- Add GPU memory verification after eviction with retry loop in vram_manager
- Add HF_TOKEN_PATH support in main.py for gated model access
- Add /v1/audio/models and /v1/audio/voices discovery endpoints (no auth)
- Add OOM error handling in both backends and chat route
- Add AUDIO_STT_SUPPORTED_CONTENT_TYPES for webm/wav/mp3/ogg
- Add performance test script (scripts/perf_test.py)
- Update tests to match current config (42 tests pass)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 21:50:39 +02:00

225 lines
8.1 KiB
Python

#!/usr/bin/env python3
"""Performance test for llmux — measures TTFT, tok/s, and total latency for each LLM model."""
import json
import os
import time
import sys

import httpx

BASE_URL = "http://127.0.0.1:8081"
# SECURITY: prefer the LLMUX_API_KEY environment variable; the hard-coded
# fallback keeps the script runnable in the original dev setup, but a secret
# committed to source control should be rotated if this repo is ever shared.
API_KEY = os.environ.get(
    "LLMUX_API_KEY",
    "sk-llmux-openwebui-hMD6pAka1czM53MtTkmmlFP8tF5zuiiDRgt-PCBnj-c",
)
HEADERS = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}

# Test prompts — short and long to measure different characteristics
PROMPTS = {
    "short": "What is 2+2? Answer in one sentence.",
    "medium": "Explain how a CPU works in 3-4 paragraphs.",
}

# Models to test — one virtual model per physical model (avoid duplicating physical loads)
# Each entry: (virtual model name, backend, rough VRAM estimate for display only)
TEST_MODELS = [
    # llama-cpp backend (GGUF)
    ("Qwen3.5-9B-FP8-Instruct", "llamacpp", "~10GB"),
    ("GPT-OSS-20B-Uncensored-Low", "llamacpp", "~13GB"),
    # transformers backend
    ("Qwen3.5-4B-Instruct", "transformers", "~4GB"),
    # GPT-OSS-20B-Low disabled: needs libc6-dev sys/ headers for triton MXFP4 kernels
]
def clear_vram():
    """Ask the server to unload every model so the next test starts cold."""
    resp = httpx.post(f"{BASE_URL}/admin/clear-vram", headers=HEADERS, timeout=60)
    if resp.status_code != 200:
        # Non-fatal: a failed clear only skews cold-start numbers.
        print(f" WARN: clear-vram returned {resp.status_code}")
    else:
        print(" VRAM cleared")
def test_streaming(model: str, prompt: str, prompt_label: str) -> dict:
    """Test a model with streaming, measuring TTFT and tok/s.

    Returns a metrics dict on success; on failure the dict carries an
    "error" key instead of the timing fields.

    NOTE: "tokens" counts SSE content chunks, not true model tokens — the
    summary table labels this column "Chunks" accordingly.
    """
    body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": True,
    }
    start = time.perf_counter()
    first_token_time = None
    token_count = 0
    full_text = []
    try:
        with httpx.stream("POST", f"{BASE_URL}/v1/chat/completions",
                          json=body, headers=HEADERS, timeout=300) as resp:
            if resp.status_code != 200:
                return {"model": model, "prompt": prompt_label, "error": f"HTTP {resp.status_code}"}
            for line in resp.iter_lines():
                # SSE data frames start with "data: "; skip keep-alives/comments.
                if not line.startswith("data: "):
                    continue
                data = line[6:]
                if data == "[DONE]":
                    break
                try:
                    chunk = json.loads(data)
                    delta = chunk.get("choices", [{}])[0].get("delta", {})
                    content = delta.get("content", "")
                    if content:
                        if first_token_time is None:
                            first_token_time = time.perf_counter()
                        token_count += 1
                        full_text.append(content)
                except json.JSONDecodeError:
                    # Tolerate a malformed frame rather than aborting the run.
                    continue
    except Exception as e:
        return {"model": model, "prompt": prompt_label, "error": str(e)}
    end = time.perf_counter()
    total_time = end - start
    # FIX: explicit None checks — truthiness testing a perf_counter() value
    # works by accident (it is never 0.0 in practice) but hides the intent.
    ttft = (first_token_time - start) if first_token_time is not None else total_time
    # Token generation time (after first token); tok/s excludes the first
    # chunk so TTFT (which includes model load on cold start) doesn't skew it.
    if first_token_time is not None and token_count > 1:
        gen_time = end - first_token_time
    else:
        gen_time = 0
    tok_per_sec = (token_count - 1) / gen_time if gen_time > 0 else 0
    output_text = "".join(full_text)
    output_chars = len(output_text)
    return {
        "model": model,
        "prompt": prompt_label,
        "ttft_s": round(ttft, 2),
        "total_s": round(total_time, 2),
        "tokens": token_count,
        "tok_per_s": round(tok_per_sec, 1),
        "output_chars": output_chars,
    }
def test_non_streaming(model: str, prompt: str, prompt_label: str) -> dict:
    """Test a model without streaming — measures total latency.

    Returns a metrics dict tagged "mode": "non-stream"; on failure the dict
    carries an "error" key instead of the timing fields.
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
    }
    start = time.perf_counter()
    try:
        response = httpx.post(f"{BASE_URL}/v1/chat/completions",
                              json=payload, headers=HEADERS, timeout=300)
        if response.status_code != 200:
            return {"model": model, "prompt": prompt_label, "mode": "non-stream", "error": f"HTTP {response.status_code}"}
        result = response.json()
    except Exception as exc:
        return {"model": model, "prompt": prompt_label, "mode": "non-stream", "error": str(exc)}
    end = time.perf_counter()
    content = result.get("choices", [{}])[0].get("message", {}).get("content", "")
    return {
        "model": model,
        "prompt": prompt_label,
        "mode": "non-stream",
        "total_s": round(end - start, 2),
        "output_chars": len(content),
    }
def run_tests():
    """Run cold/warm streaming and non-streaming benchmarks for each model.

    For every entry in TEST_MODELS: clears VRAM, runs each prompt streaming
    twice (cold then warm), runs non-streaming warm tests, clears VRAM again,
    then prints two summary tables. Exits with status 1 if the server is
    unreachable. Returns the list of per-run result dicts.
    """
    print("=" * 80)
    print("llmux Performance Test")
    print("=" * 80)
    # Check health before committing to long-running tests.
    try:
        r = httpx.get(f"{BASE_URL}/health", timeout=5)
        health = r.json()
        print(f"Server healthy — available VRAM: {health['available_vram_gb']} GB")
    except Exception as e:
        print(f"ERROR: Server not reachable: {e}")
        sys.exit(1)
    results = []
    for model, backend, vram_est in TEST_MODELS:
        # FIX: the original separators were f"{'' * 60}" — repeating the EMPTY
        # string, which printed blank lines (a garbled box-drawing character,
        # most likely). Use a visible 60-char dashed rule instead.
        print(f"\n{'-' * 60}")
        print(f"Model: {model} ({backend}, {vram_est})")
        print(f"{'-' * 60}")
        # Clear VRAM before each model to measure cold-start load time
        clear_vram()
        for prompt_label, prompt_text in PROMPTS.items():
            # First run = cold start (includes model loading)
            print(f" [{prompt_label}] streaming (cold)...", end=" ", flush=True)
            r = test_streaming(model, prompt_text, prompt_label)
            r["cold_start"] = True
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                print(f"TTFT={r['ttft_s']}s total={r['total_s']}s {r['tok_per_s']} tok/s ({r['tokens']} tokens)")
            # Second run = warm (model already loaded)
            print(f" [{prompt_label}] streaming (warm)...", end=" ", flush=True)
            r = test_streaming(model, prompt_text, prompt_label)
            r["cold_start"] = False
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                print(f"TTFT={r['ttft_s']}s total={r['total_s']}s {r['tok_per_s']} tok/s ({r['tokens']} tokens)")
        # Non-streaming tests (warm)
        for plabel in ["short", "medium"]:
            print(f" [{plabel}] non-streaming (warm)...", end=" ", flush=True)
            r = test_non_streaming(model, PROMPTS[plabel], plabel)
            results.append(r)
            if "error" in r:
                print(f"ERROR: {r['error']}")
            else:
                chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
                print(f"total={r['total_s']}s ({r['output_chars']} chars, {chars_per_s} chars/s)")
        # Clear to free VRAM for next model
        clear_vram()
    # Summary table — streaming runs first.
    print(f"\n{'=' * 90}")
    print("Summary — Streaming")
    print(f"{'=' * 90}")
    print(f"{'Model':<40} {'Prompt':<8} {'Cold':>5} {'TTFT':>7} {'Total':>7} {'Chunks':>7} {'Char/s':>7}")
    print(f"{'-' * 40} {'-' * 8} {'-' * 5} {'-' * 7} {'-' * 7} {'-' * 7} {'-' * 7}")
    for r in results:
        if r.get("mode") == "non-stream":
            continue
        if "error" in r:
            print(f"{r['model']:<40} {r['prompt']:<8} {'':>5} {'ERROR':>7}")
            continue
        cold = "yes" if r.get("cold_start") else "no"
        chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
        print(f"{r['model']:<40} {r['prompt']:<8} {cold:>5} {r['ttft_s']:>6.2f}s {r['total_s']:>6.2f}s {r['tokens']:>7} {chars_per_s:>6.1f}")
    print(f"\n{'=' * 90}")
    print("Summary — Non-streaming")
    print(f"{'=' * 90}")
    print(f"{'Model':<40} {'Prompt':<8} {'Total':>7} {'Chars':>7} {'Char/s':>7}")
    print(f"{'-' * 40} {'-' * 8} {'-' * 7} {'-' * 7} {'-' * 7}")
    for r in results:
        if r.get("mode") != "non-stream":
            continue
        if "error" in r:
            print(f"{r['model']:<40} {r['prompt']:<8} {'ERROR':>7}")
            continue
        chars_per_s = round(r['output_chars'] / r['total_s'], 1) if r['total_s'] > 0 else 0
        print(f"{r['model']:<40} {r['prompt']:<8} {r['total_s']:>6.2f}s {r['output_chars']:>7} {chars_per_s:>6.1f}")
    return results
# Entry point: run the full benchmark suite when executed as a script.
if __name__ == "__main__":
    run_tests()