vumichien committed on
Commit
25a3bd2
Β·
1 Parent(s): 178118e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -0
app.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import base64
from base64 import b64encode
from io import BytesIO

import cv2
import gradio as gr
import numpy as np
from gtts import gTTS
from PIL import Image
from speech_recognition import AudioFile, Recognizer
from ultralyticsplus import YOLO

from utils import tts, read_image_file, pil_to_base64, base64_to_pil, get_hist
# Load the YOLOv8-small model published on the ultralyticsplus hub.
model = YOLO('ultralyticsplus/yolov8s')

# Mapping of class index -> class name taken from the loaded model.
CLASS = model.model.names

# Greeting spoken by the bot ("Good morning" in Japanese).
# NOTE(review): the original source contained mojibake here
# ("γŠγ―γ„γ‚ˆγ†γ”γ–γ„γΎγ™" is the UTF-8 byte sequence of this string mis-decoded
# as Latin-1); restored to the intended text. Name kept as-is (`defaul_` typo)
# because `infer` references it.
defaul_bot_voice = "おはようございます"

# Minimum fraction of the frame a detected person's box must cover before the
# bot greets them (i.e. the person is "close enough" to the camera).
area_thres = 0.3
def infer(image):
    """Find the closest person in *image* and greet them if they are near.

    Args:
        image: Input image. The Gradio component passes a ``PIL.Image``
            (``type="pil"``); raw/file input is converted via
            ``read_image_file``.

    Returns:
        tuple: ``(voice_bot, out_img)`` where ``voice_bot`` is the greeting
        produced by ``tts(...)`` (or ``None`` when no person covers at least
        ``area_thres`` of the frame) and ``out_img`` is a 128x128 crop of the
        closest detected person (or ``None`` when no person was detected).
    """
    results = model.predict(image, show=False)[0]
    # BUG FIX(review): the original called `read_image_as_pil`, which is never
    # imported or defined (NameError); `read_image_file` is the helper the
    # file actually imports from utils. The Gradio input is already a PIL
    # image, so only convert when we were handed something else.
    if not hasattr(image, "width"):
        image = read_image_file(image)
    masks, boxes = results.masks, results.boxes
    area_image = image.width * image.height
    voice_bot = None
    most_close = 0  # largest area fraction seen so far
    out_img = None
    diff_value = 0.5  # placeholder similarity score; always passes the check below
    if boxes is not None:
        for xyxy, conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            # Keep only detections of class 0 (person in COCO-style indexing).
            if int(cls) != 0:
                continue
            box = xyxy.tolist()
            # Fraction of the whole frame covered by this detection box.
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            if area_rate >= most_close:
                # Track the person occupying the most screen area.
                out_img = image.crop(tuple(box)).resize((128, 128))
                most_close = area_rate
    print(most_close, diff_value)
    # Greet only when somebody is close enough to the camera.
    if most_close >= area_thres and diff_value >= 0.5:
        voice_bot = tts(defaul_bot_voice, language="ja")
    return voice_bot, out_img
# Build and launch the Gradio demo.
# FIXES(review): the original had a stray ")" inside `outputs` (syntax error),
# listed the outputs in the opposite order from the `(voice_bot, out_img)`
# tuple that `infer` returns, and passed `examples=examples` where `examples`
# was never defined (which also makes `cache_examples=True` fail) — the
# undefined examples/caching options are dropped.
iface = gr.Interface(
    fn=infer,
    title="aisatsu api",
    inputs=[gr.Image(label="image", type="pil", shape=(960, 640))],
    # `infer` returns (voice_bot, out_img): voice first, image second.
    outputs=[gr.Textbox(label="output voice"), gr.Image(label="output image")],
    article="Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>.",
).launch(enable_queue=True)