feat: Jinja template thinking toggle, Qwen3.5-9B GGUF Q8_0
- Thinking/Instruct toggle via Jinja template patching in the llama-cpp backend: creates separate chat handlers for thinking-enabled and thinking-disabled modes
- Replace lovedheart/Qwen3.5-9B-FP8 (safetensors, 15.8 GB — OOM) with unsloth/Qwen3.5-9B-GGUF Q8_0 (9.2 GB — fits)
- Enable flash_attn in llama-cpp for better performance
- GGUF path resolution falls back to a flat gguf/ directory

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,10 +1,11 @@
|
||||
physical_models:
|
||||
qwen3.5-9b-fp8:
|
||||
type: llm
|
||||
backend: transformers
|
||||
model_id: "lovedheart/Qwen3.5-9B-FP8"
|
||||
estimated_vram_gb: 9
|
||||
supports_vision: true
|
||||
backend: llamacpp
|
||||
model_id: "unsloth/Qwen3.5-9B-GGUF"
|
||||
model_file: "Qwen3.5-9B-Q8_0.gguf"
|
||||
estimated_vram_gb: 10
|
||||
supports_vision: false
|
||||
supports_tools: true
|
||||
|
||||
qwen3.5-9b-fp8-uncensored:
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import asyncio
|
||||
import gc
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
@@ -20,9 +21,13 @@ class LlamaCppBackend(BaseBackend):
|
||||
self._loaded: dict[str, dict] = {}
|
||||
|
||||
def _resolve_gguf_path(self, physical: PhysicalModel, filename: str) -> str:
|
||||
"""Resolve a GGUF filename to its path in the HF cache."""
|
||||
"""Resolve a GGUF filename — check flat gguf/ dir first, then HF cache."""
|
||||
# Check flat gguf/ directory
|
||||
flat_path = self._models_dir / "gguf" / filename
|
||||
if flat_path.exists():
|
||||
return str(flat_path)
|
||||
# Fall back to HF cache resolution
|
||||
from huggingface_hub import hf_hub_download
|
||||
# model_id stores the HF repo, model_file/mmproj_file store the filenames
|
||||
return hf_hub_download(
|
||||
repo_id=physical.model_id,
|
||||
filename=filename,
|
||||
@@ -42,36 +47,53 @@ class LlamaCppBackend(BaseBackend):
|
||||
"model_path": model_path,
|
||||
"n_gpu_layers": n_gpu_layers,
|
||||
"n_ctx": 4096,
|
||||
"flash_attn": True,
|
||||
"verbose": False,
|
||||
}
|
||||
if physical.mmproj_file:
|
||||
mmproj_path = self._resolve_gguf_path(physical, physical.mmproj_file)
|
||||
kwargs["chat_handler"] = _create_vision_handler(mmproj_path)
|
||||
return Llama(**kwargs)
|
||||
llm = Llama(**kwargs)
|
||||
return llm
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
llm = await loop.run_in_executor(None, _load)
|
||||
self._loaded[model_id] = {"llm": llm, "n_gpu_layers": n_gpu_layers}
|
||||
|
||||
# Create thinking-enabled and thinking-disabled chat handlers from Jinja template
|
||||
think_handler = _create_think_handler(llm, enable_thinking=True)
|
||||
no_think_handler = _create_think_handler(llm, enable_thinking=False)
|
||||
|
||||
self._loaded[model_id] = {
|
||||
"llm": llm,
|
||||
"n_gpu_layers": n_gpu_layers,
|
||||
"think_handler": think_handler,
|
||||
"no_think_handler": no_think_handler,
|
||||
}
|
||||
|
||||
async def unload(self, model_id: str) -> None:
    """Drop a previously loaded GGUF model and release its memory.

    No-op when *model_id* was never loaded (or already unloaded).
    """
    entry = self._loaded.pop(model_id, None)
    if entry is None:
        return
    # Drop the Llama object reference before forcing a collection so the
    # backing model memory can actually be reclaimed.
    del entry["llm"]
    del entry
    gc.collect()
    logger.info(f"Unloaded GGUF model {model_id}")
|
||||
|
||||
async def generate(self, model_id, messages, params, stream=False, tools=None):
|
||||
entry = self._loaded[model_id]
|
||||
llm = entry["llm"]
|
||||
|
||||
effective_messages = list(messages)
|
||||
# Swap chat handler based on thinking mode
|
||||
original_handler = llm.chat_handler
|
||||
if "enable_thinking" in params:
|
||||
if not params["enable_thinking"]:
|
||||
if effective_messages and effective_messages[0].get("role") == "system":
|
||||
effective_messages[0] = dict(effective_messages[0])
|
||||
effective_messages[0]["content"] = "/no_think\n" + effective_messages[0]["content"]
|
||||
if params["enable_thinking"]:
|
||||
handler = entry.get("think_handler")
|
||||
else:
|
||||
effective_messages.insert(0, {"role": "system", "content": "/no_think"})
|
||||
handler = entry.get("no_think_handler")
|
||||
if handler:
|
||||
llm.chat_handler = handler
|
||||
|
||||
effective_messages = list(messages)
|
||||
if "system_prompt_prefix" in params:
|
||||
prefix = params["system_prompt_prefix"]
|
||||
if effective_messages and effective_messages[0].get("role") == "system":
|
||||
@@ -80,10 +102,14 @@ class LlamaCppBackend(BaseBackend):
|
||||
else:
|
||||
effective_messages.insert(0, {"role": "system", "content": prefix})
|
||||
|
||||
try:
|
||||
if stream:
|
||||
return self._stream_generate(llm, effective_messages, model_id, tools)
|
||||
else:
|
||||
return await self._full_generate(llm, effective_messages, model_id, tools)
|
||||
finally:
|
||||
# Restore original handler
|
||||
llm.chat_handler = original_handler
|
||||
|
||||
async def _full_generate(self, llm, messages, model_id, tools):
|
||||
def _run():
|
||||
@@ -116,6 +142,28 @@ class LlamaCppBackend(BaseBackend):
|
||||
return _iter()
|
||||
|
||||
|
||||
def _create_think_handler(llm, enable_thinking: bool):
    """Build a chat handler that pins the model's embedded Jinja chat template
    to thinking-enabled or thinking-disabled mode.

    Returns the handler, or None when the model carries no embedded template
    or any step of the patching fails — callers treat None as "keep the
    default handler".
    """
    mode = "enabled" if enable_thinking else "disabled"
    try:
        from llama_cpp.llama_chat_format import Jinja2ChatFormatter

        template_str = llm.metadata.get("tokenizer.chat_template", "")
        if not template_str:
            logger.warning("Model has no embedded chat template")
            return None

        # Force the template variable by prepending a Jinja `set` statement;
        # it takes effect before the template's own conditionals run.
        flag = "true" if enable_thinking else "false"
        patched = "{%- set enable_thinking = " + flag + " %}\n" + template_str

        # Pull the special-token strings from the underlying llama.cpp model.
        model = llm._model
        eos_text = model.token_get_text(model.token_eos())
        bos_text = model.token_get_text(model.token_bos())

        chat_handler = Jinja2ChatFormatter(
            template=patched,
            eos_token=eos_text,
            bos_token=bos_text,
        ).to_chat_handler()
        logger.info(f"Created chat handler with thinking {mode}")
        return chat_handler
    except Exception as e:
        # Best-effort feature: log and fall back rather than failing the load.
        logger.error(f"Failed to create thinking-{mode} handler: {e}")
        return None
|
||||
|
||||
|
||||
def _create_vision_handler(mmproj_path: str):
    """Return a LLaVA-1.6 chat handler wired to the given multimodal projector file."""
    # Imported lazily so llama_cpp is only touched when a vision model loads.
    from llama_cpp.llama_chat_format import Llava16ChatHandler

    handler = Llava16ChatHandler(clip_model_path=mmproj_path)
    return handler
|
||||
|
||||
Reference in New Issue
Block a user