fix: proper VRAM cleanup on model unload + CUDA alloc config

- Force gc.collect() before torch.cuda.empty_cache() to ensure all
  model references are released
- Set PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True in container

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
tlg
2026-04-05 17:59:23 +02:00
parent d3285bad8a
commit aa7a160118
2 changed files with 13 additions and 4 deletions

View File

@@ -57,6 +57,9 @@ COPY --from=builder /usr/local/lib/python3.12/dist-packages/llama_cpp_python* /u
# App source copied after the dependency layers above, so code-only changes
# don't invalidate the cached llama_cpp_python install.
COPY llmux/ /app/llmux/
WORKDIR /app
# Avoid CUDA memory fragmentation when swapping models
# (expandable_segments:True lets the PyTorch CUDA caching allocator resize
# segments instead of leaving fixed-size fragmented blocks after a model
# is unloaded; persists into the running container since it is ENV not ARG)
ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Run the server
# EXPOSE is documentation only — the port still has to be published with
# `docker run -p` (or the orchestrator's equivalent).
EXPOSE 8081
# Exec-form CMD: uvicorn runs as PID 1 and receives SIGTERM from `docker stop`
# directly, allowing a graceful shutdown.
# NOTE(review): no USER directive is visible in this fragment — confirm an
# earlier line in the final stage drops root, otherwise uvicorn runs as root.
CMD ["uvicorn", "llmux.main:app", "--host", "0.0.0.0", "--port", "8081"]