fix: remove unsupported KV cache quantization in llama-cpp backend
GGML_TYPE_Q8_0 for type_k/type_v not supported in this llama-cpp-python version. Keep reduced n_ctx=4096 for VRAM savings. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -38,13 +38,10 @@ class LlamaCppBackend(BaseBackend):
         logger.info(f"Loading GGUF model {model_path} with n_gpu_layers={n_gpu_layers}")
 
         def _load():
-            from llama_cpp import GGML_TYPE_Q8_0
-
             kwargs = {
                 "model_path": model_path,
                 "n_gpu_layers": n_gpu_layers,
                 "n_ctx": 4096,
-                "type_k": GGML_TYPE_Q8_0,
-                "type_v": GGML_TYPE_Q8_0,
                 "verbose": False,
             }
             if physical.mmproj_file:
|||||||
Reference in New Issue
Block a user