fix: streaming response fix + GPT-OSS-20B-Uncensored MXFP4 GGUF

- Fix async generator streaming: _stream_generate yields directly
  instead of returning nested _iter(), route handler awaits generate()
  then passes async generator to StreamingResponse
- Replace aoxo/gpt-oss-20b-uncensored (no quant, OOM) with
  HauhauCS MXFP4 GGUF via llama-cpp backend

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
tlg
2026-04-06 22:21:22 +02:00
parent 61308703dc
commit 06923d51b4
3 changed files with 19 additions and 23 deletions

View File

@@ -133,13 +133,10 @@ class LlamaCppBackend(BaseBackend):
         loop = asyncio.get_event_loop()
         stream = await loop.run_in_executor(None, _run)
-        async def _iter():
-            for chunk in stream:
-                chunk["model"] = model_id
-                yield f"data: {json.dumps(chunk)}\n\n"
-            yield "data: [DONE]\n\n"
-        return _iter()
+        for chunk in stream:
+            chunk["model"] = model_id
+            yield f"data: {json.dumps(chunk)}\n\n"
+        yield "data: [DONE]\n\n"

 def _create_think_handler(llm, enable_thinking: bool):

View File

@@ -112,21 +112,18 @@ class TransformersLLMBackend(BaseBackend):
         chat_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
         created = int(time.time())
-        async def _iter():
-            loop = asyncio.get_event_loop()
-            while True:
-                token = await loop.run_in_executor(None, lambda: next(streamer, None))
-                if token is None:
-                    chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]}
-                    yield f"data: {json.dumps(chunk)}\n\n"
-                    yield "data: [DONE]\n\n"
-                    break
-                chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {"content": token}, "finish_reason": None}]}
-                yield f"data: {json.dumps(chunk)}\n\n"
-            thread.join()
-        return _iter()
+        loop = asyncio.get_event_loop()
+        while True:
+            token = await loop.run_in_executor(None, lambda: next(streamer, None))
+            if token is None:
+                chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]}
+                yield f"data: {json.dumps(chunk)}\n\n"
+                yield "data: [DONE]\n\n"
+                break
+            chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {"content": token}, "finish_reason": None}]}
+            yield f"data: {json.dumps(chunk)}\n\n"
+        thread.join()

 # Physical model config injection

View File

@@ -37,10 +37,12 @@ def create_chat_router(registry, vram_manager, backends, require_api_key):
         messages = body.get("messages", [])
         stream = body.get("stream", False)
         tools = body.get("tools")
-        result = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=stream, tools=tools)
         if stream:
-            return StreamingResponse(result, media_type="text/event-stream")
+            # generate() is async def that returns an async generator
+            stream_iter = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=True, tools=tools)
+            return StreamingResponse(stream_iter, media_type="text/event-stream")
+        result = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=False, tools=tools)
         return result
     return router