import os

import cv2
import gradio as gr
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

# Qwen-VL-Chat-Int4 is a GPTQ-quantised checkpoint: loading it through
# transformers additionally requires the optimum and auto-gptq packages,
# and realistically a CUDA GPU.
model_id = "Qwen/Qwen-VL-Chat-Int4"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
    trust_remote_code=True,
).eval()


def capture_photo(filename="sitting.jpg"):
    """Grab a single frame from the default webcam and save it to disk."""
    cap = cv2.VideoCapture(0)
    ret, frame = cap.read()
    cap.release()
    if ret:
        cv2.imwrite(filename, frame)
        return filename
    return None


def speak_text(text, lang="zh"):
    """Read the answer aloud via the edge-tts CLI, then play the MP3."""
    voice = "zh-CN-XiaoxiaoNeural" if lang == "zh" else "en-US-AriaNeural"
    os.system(f'edge-tts --text "{text}" --voice "{voice}" --write-media output.mp3')
    # "start" is Windows-only playback; "afplay" is macOS-only.
    os.system("start output.mp3" if os.name == "nt" else "afplay output.mp3")


def analyze_posture(image=None, auto_capture=False):
    if auto_capture:
        image_path = capture_photo()
        if image_path is None:
            return "❌ 無法啟動攝像頭", None  # "could not open the camera"
        image = Image.open(image_path)
    elif image is None:
        return "❌ 請上傳圖片或啟用自動拍照", None  # "please upload an image or enable auto-capture"
    else:
        # Qwen-VL's chat format expects an image path, so persist the uploaded PIL image.
        image_path = "uploaded.jpg"
        image.convert("RGB").save(image_path)

    # Ask (in Chinese) whether the person shows poor sitting posture such as
    # hunching, leaning forward or tilting, and request a bilingual answer.
    question = "請判斷這個人是否坐姿不良,如駝背、前傾或歪斜?用中英文回答。"
    query = tokenizer.from_list_format([
        {"image": image_path},
        {"text": question},
    ])
    answer, _ = model.chat(tokenizer, query=query, history=None, max_new_tokens=512)
    answer = answer.strip()

    # Speak the answer: Chinese voice if the reply contains CJK characters,
    # English voice if it contains the word "please".
    if any("\u4e00" <= ch <= "\u9fff" for ch in answer):
        speak_text(answer, lang="zh")
    if "please" in answer.lower():
        speak_text(answer, lang="en")

    return answer, image


def run_auto_capture():
    return analyze_posture(auto_capture=True)


# Gradio UI: "hunchback detection assistant" with an auto-capture path
# (webcam) and a manual-upload path.
with gr.Blocks(title="駝背識別助手") as demo:
    gr.Markdown("## 🪑 Qwen-VL-Chat-Int4 駝背識別 Demo")
    with gr.Row():
        with gr.Column():
            auto_btn = gr.Button("📷 自動攝像頭拍照並判斷")
            image_input = gr.Image(type="pil", label="或手動上傳圖片")
            submit_btn = gr.Button("📤 上傳並判斷")
        with gr.Column():
            output_text = gr.Textbox(label="🧠 模型判斷結果", lines=6)
            output_image = gr.Image(type="pil", label="分析圖片")

    auto_btn.click(fn=run_auto_capture, outputs=[output_text, output_image])
    submit_btn.click(fn=analyze_posture, inputs=[image_input], outputs=[output_text, output_image])

demo.launch(share=True)