diff --git a/kischdle/llmux/llmux/backends/llamacpp.py b/kischdle/llmux/llmux/backends/llamacpp.py
index 9834ad5..f04fef5 100644
--- a/kischdle/llmux/llmux/backends/llamacpp.py
+++ b/kischdle/llmux/llmux/backends/llamacpp.py
@@ -133,13 +133,10 @@ class LlamaCppBackend(BaseBackend):
         loop = asyncio.get_event_loop()
         stream = await loop.run_in_executor(None, _run)
 
-        async def _iter():
-            for chunk in stream:
-                chunk["model"] = model_id
-                yield f"data: {json.dumps(chunk)}\n\n"
-            yield "data: [DONE]\n\n"
-
-        return _iter()
+        for chunk in stream:
+            chunk["model"] = model_id
+            yield f"data: {json.dumps(chunk)}\n\n"
+        yield "data: [DONE]\n\n"
 
 
 def _create_think_handler(llm, enable_thinking: bool):
diff --git a/kischdle/llmux/llmux/backends/transformers_llm.py b/kischdle/llmux/llmux/backends/transformers_llm.py
index d290554..5220560 100644
--- a/kischdle/llmux/llmux/backends/transformers_llm.py
+++ b/kischdle/llmux/llmux/backends/transformers_llm.py
@@ -112,21 +112,18 @@
 
         chat_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
         created = int(time.time())
+        loop = asyncio.get_event_loop()
 
-        async def _iter():
-            loop = asyncio.get_event_loop()
-            while True:
-                token = await loop.run_in_executor(None, lambda: next(streamer, None))
-                if token is None:
-                    chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]}
-                    yield f"data: {json.dumps(chunk)}\n\n"
-                    yield "data: [DONE]\n\n"
-                    break
-                chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {"content": token}, "finish_reason": None}]}
-                yield f"data: {json.dumps(chunk)}\n\n"
-            thread.join()
-
-        return _iter()
+        while True:
+            token = await loop.run_in_executor(None, lambda: next(streamer, None))
+            if token is None:
+                chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]}
+                yield f"data: {json.dumps(chunk)}\n\n"
+                yield "data: [DONE]\n\n"
+                break
+            chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {"content": token}, "finish_reason": None}]}
+            yield f"data: {json.dumps(chunk)}\n\n"
+        thread.join()
 
 
 # Physical model config injection
diff --git a/kischdle/llmux/llmux/routes/chat.py b/kischdle/llmux/llmux/routes/chat.py
index 2dbbe0f..6d9bfab 100644
--- a/kischdle/llmux/llmux/routes/chat.py
+++ b/kischdle/llmux/llmux/routes/chat.py
@@ -37,10 +37,12 @@ def create_chat_router(registry, vram_manager, backends, require_api_key):
 
         messages = body.get("messages", [])
         stream = body.get("stream", False)
         tools = body.get("tools")
-        result = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=stream, tools=tools)
-        if stream:
-            return StreamingResponse(result, media_type="text/event-stream")
+        # generate() is async def that returns an async generator
+        stream_iter = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=True, tools=tools)
+        return StreamingResponse(stream_iter, media_type="text/event-stream")
+
+        result = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=False, tools=tools)
         return result
 
     return router