fix: add triton kernels for MXFP4, fix GGUF KV cache quantization
- Add 'kernels' package to Dockerfile for native MXFP4 execution (fixes gpt-oss-20b OOM: 15.2GB→13.5GB)
- Reduce GGUF n_ctx from 8192 to 4096 and quantize KV cache to Q8_0 to reduce VRAM usage
- Use GGML_TYPE_Q8_0 constant instead of string for type_k/type_v

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -27,10 +27,11 @@ RUN pip install --no-cache-dir --break-system-packages \
|
|||||||
"sentencepiece>=0.2.0" \
|
"sentencepiece>=0.2.0" \
|
||||||
"protobuf>=5.0.0"
|
"protobuf>=5.0.0"
|
||||||
|
|
||||||
# Install transformers + accelerate (needed for device_map)
|
# Install transformers + accelerate + kernels (MXFP4/FP8 triton kernels)
|
||||||
RUN pip install --no-cache-dir --break-system-packages --no-build-isolation \
|
RUN pip install --no-cache-dir --break-system-packages --no-build-isolation \
|
||||||
"transformers>=5.4.0" \
|
"transformers>=5.4.0" \
|
||||||
"accelerate>=1.0.0"
|
"accelerate>=1.0.0" \
|
||||||
|
"kernels"
|
||||||
|
|
||||||
# Install chatterbox-tts WITHOUT its dependencies (it would downgrade
|
# Install chatterbox-tts WITHOUT its dependencies (it would downgrade
|
||||||
# torch from 2.11 to 2.6 and pull gradio, librosa, etc.)
|
# torch from 2.11 to 2.6 and pull gradio, librosa, etc.)
|
||||||
|
|||||||
@@ -38,10 +38,13 @@ class LlamaCppBackend(BaseBackend):
|
|||||||
logger.info(f"Loading GGUF model {model_path} with n_gpu_layers={n_gpu_layers}")
|
logger.info(f"Loading GGUF model {model_path} with n_gpu_layers={n_gpu_layers}")
|
||||||
|
|
||||||
def _load():
|
def _load():
|
||||||
|
from llama_cpp import GGML_TYPE_Q8_0
|
||||||
kwargs = {
|
kwargs = {
|
||||||
"model_path": model_path,
|
"model_path": model_path,
|
||||||
"n_gpu_layers": n_gpu_layers,
|
"n_gpu_layers": n_gpu_layers,
|
||||||
"n_ctx": 8192,
|
"n_ctx": 4096,
|
||||||
|
"type_k": GGML_TYPE_Q8_0,
|
||||||
|
"type_v": GGML_TYPE_Q8_0,
|
||||||
"verbose": False,
|
"verbose": False,
|
||||||
}
|
}
|
||||||
if physical.mmproj_file:
|
if physical.mmproj_file:
|
||||||
|
|||||||
Reference in New Issue
Block a user