# aisatsu-app-api / app.py
import gradio as gr
from ultralyticsplus import YOLO

from utils import tts
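
# Note: utils.tts is assumed here (its source is not shown) to synthesize the
# phrase with gTTS and return the audio as a base64-encoded string, which is
# why the voice output below is a Textbox rather than an Audio component.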
model = YOLO('ultralyticsplus/yolov8s')  # YOLOv8-small checkpoint from the Hugging Face Hub
CLASS = model.model.names  # COCO label map; class 0 is "person"
default_bot_voice = "γŠγ―γ„γ‚ˆγ†γ”γ–γ„γΎγ™"  # "Good morning" in Japanese
area_thres = 0.3  # a person box must cover at least 30% of the frame to trigger the greeting


def infer(image):
    # Gradio supplies a PIL image (type="pil"), so it can be passed to the
    # model and cropped directly without any conversion.
    results = model.predict(image, show=False)[0]
    boxes = results.boxes
    area_image = image.width * image.height
    voice_bot = None
    most_close = 0
    out_img = None
    diff_value = 0.5
    if boxes is not None:
        for xyxy, conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            # Only react to "person" detections (class 0 in COCO).
            if int(cls) != 0:
                continue
            box = xyxy.tolist()
            # Fraction of the frame covered by this person's bounding box.
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            # Keep the largest (i.e. closest) person as the output crop.
            if area_rate >= most_close:
                out_img = image.crop(tuple(box)).resize((128, 128))
                most_close = area_rate
    print(most_close, diff_value)
    # diff_value is currently a constant, so the second condition always holds;
    # only the area threshold actually gates the greeting.
    if most_close >= area_thres and diff_value >= 0.5:
        voice_bot = tts(default_bot_voice, language="ja")
    return out_img, voice_bot


# Gradio 3.x API: gr.Image(shape=...) and launch(enable_queue=...) were
# removed in Gradio 4, so this file assumes a 3.x install.
iface = gr.Interface(
    fn=infer,
    title="aisatsu api",
    inputs=[gr.Image(label="image", type="pil", shape=(960, 640))],
    outputs=[gr.Image(label="output image"), gr.Textbox(label="output voice")],
    article="Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>.",
)
iface.launch(enable_queue=True, debug=True, share=True)
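
# --- Hedged usage sketch (not part of the original app) ---
# Quick local test of infer() without the web UI. Assumes a file
# "sample.jpg" exists next to this script; run it in a separate session with
# infer() imported, since iface.launch() above blocks.
#
#     from PIL import Image
#     img = Image.open("sample.jpg").convert("RGB").resize((960, 640))
#     crop, voice = infer(img)
#     print(crop, voice is not None)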