From 9b3d4e40e2efeb6840047b054b0fe52be37d64d5 Mon Sep 17 00:00:00 2001
From: llm
Date: Fri, 28 Nov 2025 21:29:07 +0100
Subject: [PATCH] Qwen3-VL mode working; /unload; normal model loading times

---
 .local/share/pytorch_pod/python-apps/ai-model.py | 85 ++++++++++++++-----
 1 file changed, 64 insertions(+), 21 deletions(-)

diff --git a/.local/share/pytorch_pod/python-apps/ai-model.py b/.local/share/pytorch_pod/python-apps/ai-model.py
index 52c806f..f5ce68e 100755
--- a/.local/share/pytorch_pod/python-apps/ai-model.py
+++ b/.local/share/pytorch_pod/python-apps/ai-model.py
@@ -18,6 +18,7 @@ from transformers import (
     AutoTokenizer,
     AutoProcessor,
     AutoModel,
+    AutoModelForVision2Seq,
 )
 from transformers.utils.import_utils import is_flash_attn_2_available
 
@@ -177,6 +178,10 @@ class EmbeddingResponse(BaseModel):
     usage: Usage
 
 
+class PreloadRequest(BaseModel):
+    model: str
+
+
 # -----------------------------------------------------------------------------
 # Helpers
 # -----------------------------------------------------------------------------
@@ -248,28 +253,29 @@ def _load_model_locked(model_id: str):
     # Load Generation Model
     # Check if it is a VL model
     if "VL" in model_id:
-        # Attempt to load as VL
-        # Using AutoModelForVision2Seq or AutoModelForCausalLM
-        # depending on the specific model support in transformers
+        # Use AutoModelForVision2Seq for VL models
+        # The configuration class Qwen3VLConfig requires Vision2Seq or AutoModel
         try:
-            from transformers import Qwen2VLForConditionalGeneration
-
-            model_class = Qwen2VLForConditionalGeneration
-        except ImportError:
-            # Fallback to AutoModel if specific class not available
-            model_class = AutoModelForCausalLM
-
-        # Note: We use AutoModelForCausalLM for broad compatibility.
-        # Qwen2-VL requires Qwen2VLForConditionalGeneration for vision.
-        # We will try AutoModelForCausalLM first.
-
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=dtype,
-            device_map=device_map,
-            attn_implementation=attn_impl,
-            trust_remote_code=True,  # Often needed for new architectures
-        ).eval()
+            print(f"Loading {model_id} with AutoModelForVision2Seq...")
+            model = AutoModelForVision2Seq.from_pretrained(
+                model_id,
+                torch_dtype=dtype,
+                device_map=device_map,
+                attn_implementation=attn_impl,
+                trust_remote_code=True,
+                low_cpu_mem_usage=True,
+            ).eval()
+        except Exception as e:
+            print(f"Vision2Seq failed: {e}. Falling back to AutoModel...")
+            # Fall back to generic AutoModel if Vision2Seq fails
+            model = AutoModel.from_pretrained(
+                model_id,
+                torch_dtype=dtype,
+                device_map=device_map,
+                attn_implementation=attn_impl,
+                trust_remote_code=True,
+                low_cpu_mem_usage=True,
+            ).eval()
 
         # Processor/Tokenizer
         try:
@@ -284,12 +290,14 @@ def _load_model_locked(model_id: str):
         _loaded_model_type = "generation"
     else:
         # Standard Text Model (GPT-OSS)
+        print(f"Loading {model_id} with AutoModelForCausalLM...")
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
             torch_dtype=dtype,
             device_map=device_map,
             attn_implementation=attn_impl,
             trust_remote_code=True,
+            low_cpu_mem_usage=True,
         ).eval()
         processor = AutoTokenizer.from_pretrained(
             model_id, trust_remote_code=True
@@ -367,6 +375,41 @@ def _extract_embeddings(outputs) -> torch.Tensor:
 # -----------------------------------------------------------------------------
 
 
+@app.post("/preload")
+def preload_model(request: PreloadRequest):
+    model_id = request.model.strip()
+    if model_id not in ALLOWED_MODEL_IDS:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Model {model_id} not in allowed models.",
+        )
+
+    with _model_lock:
+        try:
+            _ensure_model_loaded(model_id)
+        except Exception as e:
+            raise HTTPException(
+                status_code=500, detail=f"Failed to load model: {e}"
+            )
+
+    return {
+        "status": "ok",
+        "loaded_model_id": _loaded_model_id,
+        "vram_bytes": _current_vram_info(),
+    }
+
+
+@app.post("/unload")
+def unload_model():
+    with _model_lock:
+        stats = _unload_model_locked()
+    return {
+        "status": "ok",
+        "vram_bytes": _current_vram_info(),
+        "stats": stats,
+    }
+
+
 @app.get("/health")
 def health():
     cuda_ok = bool(torch.cuda.is_available())
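
Note (not part of the patch): a minimal client-side sketch of driving the new
/preload and /unload endpoints. The base URL and the model ID below are
illustrative assumptions, not taken from the patch; substitute whatever host,
port, and ALLOWED_MODEL_IDS entry your deployment actually uses.

    import requests

    BASE = "http://localhost:8000"  # assumed host/port, not specified in the patch

    # Warm the model before the first chat request; /preload blocks until
    # the weights are loaded, so subsequent generation starts immediately.
    resp = requests.post(
        f"{BASE}/preload",
        json={"model": "Qwen/Qwen3-VL-8B-Instruct"},  # assumed allowed model ID
    )
    resp.raise_for_status()
    print(resp.json())  # {"status": "ok", "loaded_model_id": ..., "vram_bytes": ...}

    # Free GPU memory when the model is no longer needed.
    resp = requests.post(f"{BASE}/unload")
    resp.raise_for_status()
    print(resp.json())  # {"status": "ok", "vram_bytes": ..., "stats": ...}

Since both handlers take the same _model_lock as the loader, a /preload issued
while another request holds the lock simply waits rather than racing the load.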