From aa7a16011809a606a7c70ae49209657c58d97c70172e44bac15f070f745b1008 Mon Sep 17 00:00:00 2001
From: tlg
Date: Sun, 5 Apr 2026 17:59:23 +0200
Subject: [PATCH] fix: proper VRAM cleanup on model unload + CUDA alloc config

- Force gc.collect() before torch.cuda.empty_cache() to ensure all model
  references are released
- Set PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True in container

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 kischdle/llmux/Dockerfile                         |  3 +++
 kischdle/llmux/llmux/backends/transformers_llm.py | 14 ++++++++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/kischdle/llmux/Dockerfile b/kischdle/llmux/Dockerfile
index 2146a22..3f2e108 100644
--- a/kischdle/llmux/Dockerfile
+++ b/kischdle/llmux/Dockerfile
@@ -57,6 +57,9 @@ COPY --from=builder /usr/local/lib/python3.12/dist-packages/llama_cpp_python* /u
 COPY llmux/ /app/llmux/
 WORKDIR /app
 
+# Avoid CUDA memory fragmentation when swapping models
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+
 # Run the server
 EXPOSE 8081
 CMD ["uvicorn", "llmux.main:app", "--host", "0.0.0.0", "--port", "8081"]
diff --git a/kischdle/llmux/llmux/backends/transformers_llm.py b/kischdle/llmux/llmux/backends/transformers_llm.py
index ca1814f..d290554 100644
--- a/kischdle/llmux/llmux/backends/transformers_llm.py
+++ b/kischdle/llmux/llmux/backends/transformers_llm.py
@@ -45,12 +45,18 @@ class TransformersLLMBackend(BaseBackend):
     async def unload(self, model_id: str) -> None:
         if model_id not in self._loaded:
             return
+        import gc
         entry = self._loaded.pop(model_id)
-        del entry["model"]
-        del entry["tokenizer"]
-        if entry.get("processor"):
-            del entry["processor"]
+        model = entry.pop("model")
+        tokenizer = entry.pop("tokenizer")
+        processor = entry.pop("processor", None)
+        del model
+        del tokenizer
+        del processor
+        del entry
+        gc.collect()
         torch.cuda.empty_cache()
+        logger.info(f"Unloaded {model_id}, VRAM freed")
 
     async def generate(self, model_id, messages, params, stream=False, tools=None):
         entry = self._loaded[model_id]
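
Note: a minimal sketch (not part of the patch) for sanity-checking the unload path,
assuming CUDA is available, `backend` is an already-constructed TransformersLLMBackend,
`model_id` is a placeholder, and a load() coroutine exists as the counterpart to
unload() (not shown in this diff):

    import torch

    async def check_unload(backend, model_id):
        # Record allocated VRAM before loading the model.
        baseline = torch.cuda.memory_allocated()
        await backend.load(model_id)    # hypothetical load() counterpart to unload()
        await backend.unload(model_id)
        # After gc.collect() + empty_cache(), allocation should be back near baseline.
        leaked = torch.cuda.memory_allocated() - baseline
        print(f"residual VRAM after unload: {leaked / 1024**2:.1f} MiB")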