import gradio as gr
from ultralyticsplus import YOLO
from utils import tts

# YOLOv8-small checkpoint from ultralyticsplus; class 0 is "person" in COCO.
model = YOLO('ultralyticsplus/yolov8s')
CLASS = model.model.names  # id -> label mapping (handy for debugging)
default_bot_voice = "γŠγ―γ„γ‚ˆγ†γ”γ–γ„γΎγ™"  # "Good morning" in Japanese
area_thres = 0.3  # minimum fraction of the frame a person must fill to trigger a greeting
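
# ---------------------------------------------------------------------------
# Hedged sketch (not in the original repo): one plausible implementation of
# the tts() helper imported from utils. utils.py is not shown in this file;
# this sketch only assumes tts() wraps gTTS and returns the synthesized MP3
# as a base64 string, which would explain why the voice output is rendered
# with gr.Textbox. It is defined under a different name so it does not
# shadow the real import.
def tts_sketch(text, language="ja"):
    from gtts import gTTS
    from io import BytesIO
    import base64

    buf = BytesIO()
    gTTS(text=text, lang=language).write_to_fp(buf)  # synthesize MP3 in memory
    return base64.b64encode(buf.getvalue()).decode("utf-8")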

def infer(image):
    # gr.Image(type="pil") already delivers a PIL.Image, so the image can be
    # used directly; no extra conversion step is needed.
    results = model.predict(image, show=False)[0]
    boxes = results.boxes
    area_image = image.width * image.height
    voice_bot = None
    most_close = 0
    out_img = None
    diff_value = 0.5  # fixed placeholder, so the >= 0.5 check below always passes
    if boxes is not None:
        for xyxy, conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            # Skip everything that is not a person (class 0 in COCO).
            if int(cls) != 0:
                continue
            box = xyxy.tolist()
            # Fraction of the frame covered by this detection.
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            # Keep the crop of the person closest to the camera
            # (largest relative box area).
            if area_rate >= most_close:
                out_img = image.crop(tuple(box)).resize((128, 128))
                most_close = area_rate
    print(most_close, diff_value)  # debug output
    # Greet only when the nearest person fills enough of the frame.
    if most_close >= area_thres and diff_value >= 0.5:
        voice_bot = tts(default_bot_voice, language="ja")
    return out_img, voice_bot
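
# ---------------------------------------------------------------------------
# Hedged sketch (not in the original repo): diff_value above is a fixed
# placeholder, so the bot greets every sufficiently close person on every
# call. One plausible refinement is to compare a color histogram of the
# current crop against the previous one and greet only when the person in
# front of the camera changes. Everything below is an assumption, not the
# author's implementation.
def person_changed(prev_crop, cur_crop, thres=0.5):
    import cv2
    import numpy as np

    def hsv_hist(pil_img):
        # Hue/saturation histogram of the crop, normalized for comparison.
        arr = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2HSV)
        hist = cv2.calcHist([arr], [0, 1], None, [50, 60], [0, 180, 0, 256])
        return cv2.normalize(hist, hist).flatten()

    # Bhattacharyya distance: 0.0 for identical histograms, 1.0 for disjoint.
    dist = cv2.compareHist(hsv_hist(prev_crop), hsv_hist(cur_crop),
                           cv2.HISTCMP_BHATTACHARYYA)
    return dist >= thres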
    
# Gradio 3.x-style Interface: the `shape` argument on gr.Image and
# `enable_queue` on launch() existed in that series and were removed in
# later Gradio releases.
iface = gr.Interface(
    fn=infer,
    title="Aisatsu API",
    inputs=[gr.Image(label="image", type="pil", shape=(960, 640))],
    outputs=[gr.Image(label="output image"), gr.Textbox(label="output voice")],
    article="Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>.",
).launch(enable_queue=True, debug=True, share=True)
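
# ---------------------------------------------------------------------------
# Hedged usage sketch (not in the original repo): querying the launched app
# over HTTP. Gradio 3.x Interfaces expose a POST /api/predict endpoint that
# accepts {"data": [...]}, with images sent as base64 data URLs. The URL and
# file name below are placeholders for the local or share link printed by
# launch(). Left commented out because launch(debug=True) blocks this script.
#
# import base64, requests
#
# with open("person.jpg", "rb") as f:
#     img_b64 = base64.b64encode(f.read()).decode("utf-8")
# resp = requests.post(
#     "http://127.0.0.1:7860/api/predict",
#     json={"data": ["data:image/jpeg;base64," + img_b64]},
# )
# out_image_b64, voice_b64 = resp.json()["data"]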