fix: resolve GGUF paths through HF cache, add model_id to GGUF config
llama-cpp-python backend now uses huggingface_hub to resolve GGUF file paths within the HF cache structure instead of assuming a flat /models/ directory. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -10,6 +10,7 @@ physical_models:
|
|||||||
  # Qwen3.5-9B GGUF model served through the llama.cpp backend.
  # NOTE(review): the key says "fp8" but model_file is a Q8_0 quant — confirm naming.
  qwen3.5-9b-fp8-uncensored:
    type: llm
    backend: llamacpp
    # HF repo id; the backend resolves the GGUF filenames below inside the
    # HF cache for this repo (see LlamaCppBackend._resolve_gguf_path).
    model_id: "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"
    # Main model weights (Q8_0 quantization, per the filename).
    model_file: "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf"
    # Multimodal projector weights; when present the backend attaches a
    # vision chat handler at load time.
    mmproj_file: "mmproj-Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-BF16.gguf"
    # Rough VRAM budget in GB used for placement decisions — estimate only.
    estimated_vram_gb: 9
|||||||
@@ -19,23 +19,34 @@ class LlamaCppBackend(BaseBackend):
|
|||||||
self._models_dir = Path(models_dir)
|
self._models_dir = Path(models_dir)
|
||||||
self._loaded: dict[str, dict] = {}
|
self._loaded: dict[str, dict] = {}
|
||||||
|
|
||||||
|
def _resolve_gguf_path(self, physical: PhysicalModel, filename: str) -> str:
|
||||||
|
"""Resolve a GGUF filename to its path in the HF cache."""
|
||||||
|
from huggingface_hub import hf_hub_download
|
||||||
|
# model_id stores the HF repo, model_file/mmproj_file store the filenames
|
||||||
|
return hf_hub_download(
|
||||||
|
repo_id=physical.model_id,
|
||||||
|
filename=filename,
|
||||||
|
cache_dir=str(self._models_dir),
|
||||||
|
local_files_only=True,
|
||||||
|
)
|
||||||
|
|
||||||
async def load(self, model_id: str, n_gpu_layers: int = -1) -> None:
|
async def load(self, model_id: str, n_gpu_layers: int = -1) -> None:
|
||||||
if model_id in self._loaded:
|
if model_id in self._loaded:
|
||||||
return
|
return
|
||||||
physical = _get_physical_config(model_id)
|
physical = _get_physical_config(model_id)
|
||||||
model_path = self._models_dir / physical.model_file
|
model_path = self._resolve_gguf_path(physical, physical.model_file)
|
||||||
logger.info(f"Loading GGUF model {model_path} with n_gpu_layers={n_gpu_layers}")
|
logger.info(f"Loading GGUF model {model_path} with n_gpu_layers={n_gpu_layers}")
|
||||||
|
|
||||||
def _load():
|
def _load():
|
||||||
kwargs = {
|
kwargs = {
|
||||||
"model_path": str(model_path),
|
"model_path": model_path,
|
||||||
"n_gpu_layers": n_gpu_layers,
|
"n_gpu_layers": n_gpu_layers,
|
||||||
"n_ctx": 8192,
|
"n_ctx": 8192,
|
||||||
"verbose": False,
|
"verbose": False,
|
||||||
}
|
}
|
||||||
if physical.mmproj_file:
|
if physical.mmproj_file:
|
||||||
mmproj_path = self._models_dir / physical.mmproj_file
|
mmproj_path = self._resolve_gguf_path(physical, physical.mmproj_file)
|
||||||
kwargs["chat_handler"] = _create_vision_handler(str(mmproj_path))
|
kwargs["chat_handler"] = _create_vision_handler(mmproj_path)
|
||||||
return Llama(**kwargs)
|
return Llama(**kwargs)
|
||||||
|
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
|
|||||||
Reference in New Issue
Block a user