diff --git a/kischdle/llmux/llmux/backends/llamacpp.py b/kischdle/llmux/llmux/backends/llamacpp.py
index 9834ad5..f04fef5 100644
--- a/kischdle/llmux/llmux/backends/llamacpp.py
+++ b/kischdle/llmux/llmux/backends/llamacpp.py
@@ -133,13 +133,10 @@ class LlamaCppBackend(BaseBackend):
         loop = asyncio.get_event_loop()
         stream = await loop.run_in_executor(None, _run)
 
-        async def _iter():
-            for chunk in stream:
-                chunk["model"] = model_id
-                yield f"data: {json.dumps(chunk)}\n\n"
-            yield "data: [DONE]\n\n"
-
-        return _iter()
+        for chunk in stream:
+            chunk["model"] = model_id
+            yield f"data: {json.dumps(chunk)}\n\n"
+        yield "data: [DONE]\n\n"
 
 
 def _create_think_handler(llm, enable_thinking: bool):
diff --git a/kischdle/llmux/llmux/backends/transformers_llm.py b/kischdle/llmux/llmux/backends/transformers_llm.py
index d290554..5220560 100644
--- a/kischdle/llmux/llmux/backends/transformers_llm.py
+++ b/kischdle/llmux/llmux/backends/transformers_llm.py
@@ -112,21 +112,18 @@
 
         chat_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
         created = int(time.time())
+        loop = asyncio.get_event_loop()
 
-        async def _iter():
-            loop = asyncio.get_event_loop()
-            while True:
-                token = await loop.run_in_executor(None, lambda: next(streamer, None))
-                if token is None:
-                    chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]}
-                    yield f"data: {json.dumps(chunk)}\n\n"
-                    yield "data: [DONE]\n\n"
-                    break
-                chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {"content": token}, "finish_reason": None}]}
-                yield f"data: {json.dumps(chunk)}\n\n"
-            thread.join()
-
-        return _iter()
+        while True:
+            token = await loop.run_in_executor(None, lambda: next(streamer, None))
+            if token is None:
+                chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]}
+                yield f"data: {json.dumps(chunk)}\n\n"
+                yield "data: [DONE]\n\n"
+                break
+            chunk = {"id": chat_id, "object": "chat.completion.chunk", "created": created, "model": model_id, "choices": [{"index": 0, "delta": {"content": token}, "finish_reason": None}]}
+            yield f"data: {json.dumps(chunk)}\n\n"
+        thread.join()
 
 
 # Physical model config injection
diff --git a/kischdle/llmux/llmux/routes/chat.py b/kischdle/llmux/llmux/routes/chat.py
index 2dbbe0f..6d9bfab 100644
--- a/kischdle/llmux/llmux/routes/chat.py
+++ b/kischdle/llmux/llmux/routes/chat.py
@@ -37,10 +37,12 @@ def create_chat_router(registry, vram_manager, backends, require_api_key):
 
         messages = body.get("messages", [])
         stream = body.get("stream", False)
         tools = body.get("tools")
-        result = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=stream, tools=tools)
-        if stream:
-            return StreamingResponse(result, media_type="text/event-stream")
+        # generate() is async def that returns an async generator
+        stream_iter = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=True, tools=tools)
+        return StreamingResponse(stream_iter, media_type="text/event-stream")
+
+        result = await backend.generate(model_id=physical_id, messages=messages, params=params, stream=False, tools=tools)
         return result
 
     return router