From da35e94b1672a96b024dc9a98d47659f049711bde1f0cb705a0c8335c4c7cddf Mon Sep 17 00:00:00 2001 From: tlg Date: Sun, 5 Apr 2026 22:49:16 +0200 Subject: [PATCH] fix: add triton kernels for MXFP4, fix GGUF KV cache quantization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add 'kernels' package to Dockerfile for native MXFP4 execution (fixes gpt-oss-20b OOM: 15.2GB→13.5GB) - Reduce GGUF n_ctx from 8192 to 4096 and quantize KV cache to Q8_0 to reduce VRAM usage - Use GGML_TYPE_Q8_0 constant instead of string for type_k/type_v Co-Authored-By: Claude Opus 4.6 (1M context) --- kischdle/llmux/Dockerfile | 5 +++-- kischdle/llmux/llmux/backends/llamacpp.py | 5 ++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/kischdle/llmux/Dockerfile b/kischdle/llmux/Dockerfile index 3f2e108..4340ab3 100644 --- a/kischdle/llmux/Dockerfile +++ b/kischdle/llmux/Dockerfile @@ -27,10 +27,11 @@ RUN pip install --no-cache-dir --break-system-packages \ "sentencepiece>=0.2.0" \ "protobuf>=5.0.0" -# Install transformers + accelerate (needed for device_map) +# Install transformers + accelerate + kernels (MXFP4/FP8 triton kernels) RUN pip install --no-cache-dir --break-system-packages --no-build-isolation \ "transformers>=5.4.0" \ - "accelerate>=1.0.0" + "accelerate>=1.0.0" \ + "kernels" # Install chatterbox-tts WITHOUT its dependencies (it would downgrade # torch from 2.11 to 2.6 and pull gradio, librosa, etc.) diff --git a/kischdle/llmux/llmux/backends/llamacpp.py b/kischdle/llmux/llmux/backends/llamacpp.py index b362221..f2da464 100644 --- a/kischdle/llmux/llmux/backends/llamacpp.py +++ b/kischdle/llmux/llmux/backends/llamacpp.py @@ -38,10 +38,13 @@ class LlamaCppBackend(BaseBackend): logger.info(f"Loading GGUF model {model_path} with n_gpu_layers={n_gpu_layers}") def _load(): + from llama_cpp import GGML_TYPE_Q8_0 kwargs = { "model_path": model_path, "n_gpu_layers": n_gpu_layers, - "n_ctx": 8192, + "n_ctx": 4096, + "type_k": GGML_TYPE_Q8_0, + "type_v": GGML_TYPE_Q8_0, "verbose": False, } if physical.mmproj_file: