- Add harmony.py: strip GPT-OSS-20B analysis/thinking channel from both streaming and non-streaming responses (HarmonyStreamFilter + extract_final_text)
- Add per-model asyncio.Lock in llamacpp backend to prevent concurrent C++ access that caused container segfaults (exit 139)
- Fix chat handler swap for streaming: move inside _stream_generate within lock scope (was broken by try/finally running before stream was consumed)
- Filter /v1/models to return only LLM models (hide ASR/TTS from chat dropdown)
- Correct Qwen3.5-4B estimated_vram_gb: 4 → 9 (actual allocation ~8GB)
- Add GPU memory verification after eviction with retry loop in vram_manager
- Add HF_TOKEN_PATH support in main.py for gated model access
- Add /v1/audio/models and /v1/audio/voices discovery endpoints (no auth)
- Add OOM error handling in both backends and chat route
- Add AUDIO_STT_SUPPORTED_CONTENT_TYPES for webm/wav/mp3/ogg
- Add performance test script (scripts/perf_test.py)
- Update tests to match current config (42 tests pass)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
113 lines
3.0 KiB
YAML
---
# Physical model registry: each entry describes one loadable model backend.
# Keys per entry (as shown by this config):
#   type              - llm | asr | tts
#   backend           - llamacpp | transformers | chatterbox
#   model_id          - Hugging Face repo id (quoted: contains '/')
#   model_file        - GGUF filename inside the repo (llamacpp only)
#   mmproj_file       - multimodal projector GGUF (vision-capable llamacpp only)
#   estimated_vram_gb - integer VRAM budget used by the VRAM manager for eviction
#   supports_vision / supports_tools - capability flags (LLM entries)
physical_models:
  qwen3.5-9b-fp8:
    type: llm
    backend: llamacpp
    model_id: "unsloth/Qwen3.5-9B-GGUF"
    model_file: "Qwen3.5-9B-Q8_0.gguf"
    estimated_vram_gb: 10
    supports_vision: false
    supports_tools: true

  qwen3.5-9b-fp8-uncensored:
    type: llm
    backend: llamacpp
    model_id: "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"
    model_file: "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q8_0.gguf"
    # Vision projector; required for supports_vision: true on llamacpp.
    mmproj_file: "mmproj-Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-BF16.gguf"
    estimated_vram_gb: 9
    supports_vision: true
    supports_tools: true

  qwen3.5-4b:
    type: llm
    backend: transformers
    model_id: "Qwen/Qwen3.5-4B"
    # Measured allocation is ~8 GB; was previously (incorrectly) 4.
    estimated_vram_gb: 9
    supports_vision: true
    supports_tools: true

  gpt-oss-20b:
    type: llm
    backend: transformers
    model_id: "openai/gpt-oss-20b"
    estimated_vram_gb: 13
    supports_vision: false
    supports_tools: true

  gpt-oss-20b-uncensored:
    type: llm
    backend: llamacpp
    model_id: "HauhauCS/GPT-OSS-20B-Uncensored-HauhauCS-Aggressive"
    model_file: "GPT-OSS-20B-Uncensored-HauhauCS-MXFP4-Aggressive.gguf"
    estimated_vram_gb: 13
    supports_vision: false
    supports_tools: true

  # Speech-to-text (hidden from the /v1/models chat listing).
  cohere-transcribe:
    type: asr
    backend: transformers
    model_id: "CohereLabs/cohere-transcribe-03-2026"
    estimated_vram_gb: 4
    default_language: "en"

  # Text-to-speech variants (hidden from the /v1/models chat listing).
  chatterbox-multilingual:
    type: tts
    backend: chatterbox
    variant: "multilingual"
    estimated_vram_gb: 2

  chatterbox:
    type: tts
    backend: chatterbox
    variant: "default"
    estimated_vram_gb: 2
# Virtual models: user-facing aliases over a physical model plus per-alias
# generation params.
#   physical - key into physical_models above
#   params   - optional overrides; enable_thinking toggles the Qwen thinking
#              channel, system_prompt_prefix sets GPT-OSS reasoning effort
virtual_models:
  Qwen3.5-9B-FP8-Thinking:
    physical: qwen3.5-9b-fp8
    params: { enable_thinking: true }
  Qwen3.5-9B-FP8-Instruct:
    physical: qwen3.5-9b-fp8
    params: { enable_thinking: false }

  Qwen3.5-9B-FP8-Uncensored-Thinking:
    physical: qwen3.5-9b-fp8-uncensored
    params: { enable_thinking: true }
  Qwen3.5-9B-FP8-Uncensored-Instruct:
    physical: qwen3.5-9b-fp8-uncensored
    params: { enable_thinking: false }

  Qwen3.5-4B-Thinking:
    physical: qwen3.5-4b
    params: { enable_thinking: true }
  Qwen3.5-4B-Instruct:
    physical: qwen3.5-4b
    params: { enable_thinking: false }

  GPT-OSS-20B-Low:
    physical: gpt-oss-20b
    params: { system_prompt_prefix: "Reasoning: low" }
  GPT-OSS-20B-Medium:
    physical: gpt-oss-20b
    params: { system_prompt_prefix: "Reasoning: medium" }
  GPT-OSS-20B-High:
    physical: gpt-oss-20b
    params: { system_prompt_prefix: "Reasoning: high" }

  GPT-OSS-20B-Uncensored-Low:
    physical: gpt-oss-20b-uncensored
    params: { system_prompt_prefix: "Reasoning: low" }
  GPT-OSS-20B-Uncensored-Medium:
    physical: gpt-oss-20b-uncensored
    params: { system_prompt_prefix: "Reasoning: medium" }
  GPT-OSS-20B-Uncensored-High:
    physical: gpt-oss-20b-uncensored
    params: { system_prompt_prefix: "Reasoning: high" }

  # ASR/TTS aliases: no params, pass through to the physical model.
  cohere-transcribe:
    physical: cohere-transcribe
  Chatterbox-Multilingual:
    physical: chatterbox-multilingual
  Chatterbox:
    physical: chatterbox