# Gradio demo: posture (slouching / hunchback) detection with Qwen-VL-Chat-Int4,
# with optional webcam capture and text-to-speech feedback via edge-tts.
import gradio as gr
import torch
import cv2
from PIL import Image
from auto_gptq import AutoGPTQForCausalLM
from processing_qwen_vl import QWenVLProcessor  # local module shipped alongside this file
import os

model_id = "Qwen/Qwen-VL-Chat-Int4"

# The processor builds the chat prompt and preprocesses the image in one call.
processor = QWenVLProcessor.from_pretrained(model_id, trust_remote_code=True)

# Load the GPTQ-quantized weights; fall back to CPU if no GPU is available.
model = AutoGPTQForCausalLM.from_quantized(
    model_id,
    device="cuda" if torch.cuda.is_available() else "cpu",
    trust_remote_code=True
).eval()
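# Note: the model card for Qwen/Qwen-VL-Chat-Int4 documents a tokenizer-based flow
# (tokenizer.from_list_format(...) plus model.chat(...)) rather than a processor class.
# The sketch below is untested here and only illustrates that alternative path:
#
#   from transformers import AutoTokenizer
#   tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
#   query = tokenizer.from_list_format([{"image": "sitting.jpg"}, {"text": question}])
#   response, _ = model.chat(tokenizer, query=query, history=None)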

def capture_photo(filename="sitting.jpg"):
    """Grab a single frame from the default webcam (index 0) and save it to disk."""
    cap = cv2.VideoCapture(0)
    ret, frame = cap.read()
    cap.release()
    if ret:
        cv2.imwrite(filename, frame)
        return filename
    return None  # camera unavailable or frame grab failed

def speak_text(text, lang="zh"):
    """Synthesize `text` with the edge-tts CLI and play the resulting MP3."""
    voice = "zh-CN-XiaoxiaoNeural" if lang == "zh" else "en-US-AriaNeural"
    os.system(f'edge-tts --text "{text}" --voice "{voice}" --write-media output.mp3')
    # Playback is platform-specific: `start` on Windows, `afplay` on macOS; Linux needs its own player.
    os.system('start output.mp3' if os.name == 'nt' else 'afplay output.mp3')
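
# Optional cross-platform playback helper. This is only a sketch and is not wired into
# speak_text above; it assumes `mpg123` is installed on Linux and relies on the players
# bundled with Windows/macOS.
def play_audio(path="output.mp3"):
    import sys, subprocess
    if os.name == "nt":
        os.startfile(path)                # Windows: open with the default associated player
    elif sys.platform == "darwin":
        subprocess.run(["afplay", path])  # macOS built-in player
    else:
        subprocess.run(["mpg123", path])  # Linux (assumes mpg123 is available)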

def analyze_posture(image=None, auto_capture=False):
    """Run the VLM on an uploaded or freshly captured photo and return (answer, image)."""
    if auto_capture:
        image_path = capture_photo()
        if image_path is None:
            return "❌ η„‘ζ³•ε•Ÿε‹•ζ”εƒι ­", None  # "Unable to start the webcam"
        image = Image.open(image_path)
    elif image is None:
        return "❌ θ«‹δΈŠε‚³εœ–η‰‡ζˆ–ε•Ÿη”¨θ‡ͺ動拍照", None  # "Please upload an image or enable auto capture"

    # Prompt (Chinese): "Is this person showing poor posture, e.g. hunching, leaning
    # forward, or tilting? Answer in both Chinese and English."
    question = "θ«‹εˆ€ζ–·ι€™ε€‹δΊΊζ˜―ε¦εε§ΏδΈθ‰―οΌŒε¦‚ι§θƒŒγ€ε‰ε‚Ύζˆ–ζ­ͺζ–œοΌŸη”¨δΈ­θ‹±ζ–‡ε›žη­”γ€‚"
    inputs = processor(text=question, images=image, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=512)
    answer = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()

    # Speak the answer in whichever language(s) it appears to contain.
    if "θ«‹" in answer:
        speak_text(answer, lang="zh")
    if "please" in answer.lower():
        speak_text(answer, lang="en")

    return answer, image

def run_auto_capture():
    # Button callback: capture a frame from the webcam, then analyze it.
    return analyze_posture(auto_capture=True)

# Gradio UI ("Hunchback detection assistant"): a webcam-capture button,
# a manual upload path, and text/image outputs.
with gr.Blocks(title="ι§θƒŒθ­˜εˆ₯εŠ©ζ‰‹") as demo:
    gr.Markdown("## πŸͺ‘ Qwen-VL-Chat-Int4 ι§θƒŒθ­˜εˆ₯ Demo")

    with gr.Row():
        with gr.Column():
            auto_btn = gr.Button("πŸ“· θ‡ͺε‹•ζ”εƒι ­ζ‹η…§δΈ¦εˆ€ζ–·")            # "Capture from webcam and analyze"
            image_input = gr.Image(type="pil", label="ζˆ–ζ‰‹ε‹•δΈŠε‚³εœ–η‰‡")  # "Or upload an image manually"
            submit_btn = gr.Button("πŸ“€ δΈŠε‚³δΈ¦εˆ€ζ–·")                    # "Upload and analyze"
        with gr.Column():
            output_text = gr.Textbox(label="🧠 ζ¨‘εž‹εˆ€ζ–·η΅ζžœ", lines=6)  # "Model verdict"
            output_image = gr.Image(type="pil", label="εˆ†ζžεœ–η‰‡")       # "Analyzed image"

    auto_btn.click(fn=run_auto_capture, outputs=[output_text, output_image])
    submit_btn.click(fn=analyze_posture, inputs=[image_input], outputs=[output_text, output_image])

demo.launch(share=True)
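
# Rough dependency/run notes (assumed; not verified against this Space's requirements.txt):
#   pip install gradio torch opencv-python pillow transformers auto-gptq optimum edge-tts
#   python <this_file>.py   # share=True also exposes a temporary public Gradio link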