fix: remove unsupported KV cache quantization in llama-cpp backend
GGML_TYPE_Q8_0 for type_k/type_v not supported in this llama-cpp-python version. Keep reduced n_ctx=4096 for VRAM savings. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -38,13 +38,10 @@ class LlamaCppBackend(BaseBackend):
         logger.info(f"Loading GGUF model {model_path} with n_gpu_layers={n_gpu_layers}")
 
         def _load():
-            from llama_cpp import GGML_TYPE_Q8_0
-
             kwargs = {
                 "model_path": model_path,
                 "n_gpu_layers": n_gpu_layers,
                 "n_ctx": 4096,
-                "type_k": GGML_TYPE_Q8_0,
-                "type_v": GGML_TYPE_Q8_0,
                 "verbose": False,
             }
             if physical.mmproj_file:
|||||||
Reference in New Issue
Block a user