fix: streaming response + switch to GPT-OSS-20B-Uncensored MXFP4 GGUF
- Fix async generator streaming: _stream_generate now yields directly instead of returning a nested _iter(); the route handler awaits generate() and then passes the resulting async generator to StreamingResponse.
- Replace aoxo/gpt-oss-20b-uncensored (unquantized, OOM) with the HauhauCS MXFP4 GGUF via the llama-cpp backend.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -133,14 +133,11 @@ class LlamaCppBackend(BaseBackend):
|
|||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
stream = await loop.run_in_executor(None, _run)
|
stream = await loop.run_in_executor(None, _run)
|
||||||
|
|
||||||
async def _iter():
|
|
||||||
for chunk in stream:
|
for chunk in stream:
|
||||||
chunk["model"] = model_id
|
chunk["model"] = model_id
|
||||||
yield f"data: {json.dumps(chunk)}\n\n"
|
yield f"data: {json.dumps(chunk)}\n\n"
|
||||||
yield "data: [DONE]\n\n"
|
yield "data: [DONE]\n\n"
|
||||||
|
|
||||||
return _iter()
|
|
||||||
|
|
||||||
|
|
||||||
def _create_think_handler(llm, enable_thinking: bool):
|
def _create_think_handler(llm, enable_thinking: bool):
|
||||||
"""Create a chat handler with thinking enabled or disabled via Jinja template."""
|
"""Create a chat handler with thinking enabled or disabled via Jinja template."""
|
||||||
|
|||||||
@@ -112,9 +112,8 @@ class TransformersLLMBackend(BaseBackend):
|
|||||||
|
|
||||||
chat_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
|
chat_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
|
||||||
created = int(time.time())
|
created = int(time.time())
|
||||||
|
|
||||||
async def _iter():
|
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
token = await loop.run_in_executor(None, lambda: next(streamer, None))
|
token = await loop.run_in_executor(None, lambda: next(streamer, None))
|
||||||
if token is None:
|
if token is None:
|
||||||
@@ -126,8 +125,6 @@ class TransformersLLMBackend(BaseBackend):
|
|||||||
yield f"data: {json.dumps(chunk)}\n\n"
|
yield f"data: {json.dumps(chunk)}\n\n"
|
||||||
thread.join()
|
thread.join()
|
||||||
|
|
||||||
return _iter()
|
|
||||||
|
|
||||||
|
|
||||||
# Physical model config injection
|
# Physical model config injection
|
||||||
_physical_models: dict[str, PhysicalModel] = {}
|
_physical_models: dict[str, PhysicalModel] = {}
|
||||||
|
|||||||
@@ -37,10 +37,12 @@ def create_chat_router(registry, vram_manager, backends, require_api_key):
|
|||||||
messages = body.get("messages", [])
|
messages = body.get("messages", [])
|
||||||
stream = body.get("stream", False)
|
stream = body.get("stream", False)
|
||||||
tools = body.get("tools")
|
tools = body.get("tools")
|
||||||
result = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=stream, tools=tools)
|
|
||||||
|
|
||||||
if stream:
|
if stream:
|
||||||
return StreamingResponse(result, media_type="text/event-stream")
|
# generate() is async def that returns an async generator
|
||||||
|
stream_iter = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=True, tools=tools)
|
||||||
|
return StreamingResponse(stream_iter, media_type="text/event-stream")
|
||||||
|
|
||||||
|
result = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=False, tools=tools)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
return router
|
return router
|
||||||
|
|||||||
Reference in New Issue
Block a user