vumichien committed on
Commit
25a3bd2
Β·
1 Parent(s): 178118e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -0
app.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import base64
from base64 import b64encode
from io import BytesIO

import cv2
import gradio as gr
import numpy as np
from gtts import gTTS
from PIL import Image
from speech_recognition import AudioFile, Recognizer
from ultralyticsplus import YOLO

from utils import tts, read_image_file, pil_to_base64, base64_to_pil, get_hist
# Load the YOLOv8-small model published on the ultralyticsplus hub.
model = YOLO('ultralyticsplus/yolov8s')

# Mapping of class index -> class name taken from the loaded model.
CLASS = model.model.names

# Greeting spoken by the bot ("Good morning" in Japanese).
# NOTE(review): the original source contained mojibake here
# ("γŠγ―γ„γ‚ˆγ†γ”γ–γ„γΎγ™" is the UTF-8 byte sequence of this string mis-decoded
# as Latin-1); restored to the intended text. Name kept as-is (`defaul_` typo)
# because `infer` references it.
defaul_bot_voice = "おはようございます"

# Minimum fraction of the frame a detected person's box must cover before the
# bot greets them (i.e. the person is "close enough" to the camera).
area_thres = 0.3
def infer(image):
    """Find the closest person in *image* and greet them if they are near.

    Args:
        image: Input image. The Gradio component passes a ``PIL.Image``
            (``type="pil"``); raw/file input is converted via
            ``read_image_file``.

    Returns:
        tuple: ``(voice_bot, out_img)`` where ``voice_bot`` is the greeting
        produced by ``tts(...)`` (or ``None`` when no person covers at least
        ``area_thres`` of the frame) and ``out_img`` is a 128x128 crop of the
        closest detected person (or ``None`` when no person was detected).
    """
    results = model.predict(image, show=False)[0]
    # BUG FIX(review): the original called `read_image_as_pil`, which is never
    # imported or defined (NameError); `read_image_file` is the helper the
    # file actually imports from utils. The Gradio input is already a PIL
    # image, so only convert when we were handed something else.
    if not hasattr(image, "width"):
        image = read_image_file(image)
    masks, boxes = results.masks, results.boxes
    area_image = image.width * image.height
    voice_bot = None
    most_close = 0  # largest area fraction seen so far
    out_img = None
    diff_value = 0.5  # placeholder similarity score; always passes the check below
    if boxes is not None:
        for xyxy, conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            # Keep only detections of class 0 (person in COCO-style indexing).
            if int(cls) != 0:
                continue
            box = xyxy.tolist()
            # Fraction of the whole frame covered by this detection box.
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            if area_rate >= most_close:
                # Track the person occupying the most screen area.
                out_img = image.crop(tuple(box)).resize((128, 128))
                most_close = area_rate
    print(most_close, diff_value)
    # Greet only when somebody is close enough to the camera.
    if most_close >= area_thres and diff_value >= 0.5:
        voice_bot = tts(defaul_bot_voice, language="ja")
    return voice_bot, out_img
# Build and launch the Gradio demo.
# FIXES(review): the original had a stray ")" inside `outputs` (syntax error),
# listed the outputs in the opposite order from the `(voice_bot, out_img)`
# tuple that `infer` returns, and passed `examples=examples` where `examples`
# was never defined (which also makes `cache_examples=True` fail) — the
# undefined examples/caching options are dropped.
iface = gr.Interface(
    fn=infer,
    title="aisatsu api",
    inputs=[gr.Image(label="image", type="pil", shape=(960, 640))],
    # `infer` returns (voice_bot, out_img): voice first, image second.
    outputs=[gr.Textbox(label="output voice"), gr.Image(label="output image")],
    article="Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>.",
).launch(enable_queue=True)