Spaces:

FreedomIntelligence
/

SoundwaveDemo

Paused

App Files Files Community

SoundwaveDemo / app.py

FanBuCUHK

Upload app.py

c08969e verified 2 months ago

raw

history blame

3.16 kB

	import gradio as gr
	import httpx


	async def call_api(text: str, audio_path: str):
	    """Send a text instruction and an audio file to the backend and return its reply.

	    Args:
	        text: Instruction forwarded to the backend as the ``text`` form field.
	        audio_path: Path of the audio file uploaded as the ``audio_file`` part.

	    Returns:
	        The ``"result"`` entry of the JSON body returned by the backend.

	    Raises:
	        ValueError: If ``audio_path`` is empty or ``None`` (no audio supplied).
	        OSError: If the audio file cannot be opened or read.
	        httpx.HTTPStatusError: If the backend answers with a 4xx/5xx status.
	    """
	    import os

	    # Fail early with a clear message instead of the TypeError that
	    # open(None) would raise when no audio was provided.
	    if not audio_path:
	        raise ValueError("An audio file is required.")

	    # Read the audio file.
	    with open(audio_path, "rb") as audio_file:
	        audio_bytes = audio_file.read()

	    # Only the base name is sent as the upload filename: the backend has no
	    # use for the local directory layout, and the extension is preserved.
	    files = {"audio_file": (os.path.basename(audio_path), audio_bytes)}
	    data = {"text": text}

	    # httpx applies a 5-second timeout by default; allow a longer read
	    # window because the request waits for the backend to produce its reply.
	    timeout = httpx.Timeout(120.0, connect=10.0)

	    # Send to the backend API.
	    async with httpx.AsyncClient(timeout=timeout) as client:
	        response = await client.post(
	            "http://36.151.70.8:30113/process/",
	            files=files,
	            data=data,
	        )

	    # Surface HTTP failures explicitly rather than as a KeyError or a JSON
	    # decoding error on an error page.
	    response.raise_for_status()
	    return response.json()["result"]


	def load_examples():
	    """Return the example inputs for the demo.

	    Returns:
	        A new list on every call; each item is a two-element list
	        ``[instruction, audio_path]``. The trailing comment on each entry is
	        the short task tag carried over from the original listing.
	    """
	    showcase = (
	        ("Can you turn my English into German?",
	         "./show_case/common_voice_en_19664034.mp3"),  # En-De
	        ("Can you identify the initial word that connects to 'currency_name' in this audio clip?",
	         "./show_case/audio-1434542201-headset.wav"),  # ER
	        ("What do you think the speaker's message is intended to be in this audio?",
	         "./show_case/audio-1434542201-headset.wav"),  # IC
	        # NOTE(review): tagged DFake in the original listing, but the prompt
	        # asks what the speaker says - confirm the intended tag.
	        ("What does the person say?",
	         "./show_case/p225_002.wav"),  # DFake
	        ("Assess whether this speech's pronunciation is Real or Fake.",
	         "./show_case/Fake.wav"),  # DFake
	        ("What emotional weight does the speaker's tone carry?\n"
	         "Pick one answer from A, B, C, and D.\n"
	         "A: fear\nB: sadness\nC: joy\nD: neutral",
	         "./show_case/SER(emotion)_example.wav"),  # SER(emotion)
	        ("Choose the most suitable answer from options A, B, C, and D to respond the question in next line, you may only choose A or B or C or D.\n"
	         "The number of speakers delivering this speech is what?\n"
	         "A. 4\nB. 2\nC.1\nD. 3",
	         "./show_case/SNV_example.wav"),  # SNV
	        ("Identify the language of the conversation you just heard.",
	         "./show_case/SLR_example.wav"),  # SLR
	        ("tell the gender of the speaker in this audio.",
	         "./show_case/SGR_018.wav"),  # SGR
	        ("What's the sound we're hearing in this audio from?",
	         "./show_case/Sound_Vocal_example.wav"),  # Sound_vocal
	        ("What is your best guess at the setting of this sound clip?",
	         "./show_case/Scene_example.wav"),  # Sound_cochl
	        ("Choose the most suitable answer from options A, B, C, and D to respond the question in next line, Please think step by step and you may only choose A or B or C or D.\n"
	         "Recognize the segment where 'project' is spoken by the speaker.\n"
	         "A. [5.28, 5.39]\nB. [0.92, 1.39]\nC. [4.75, 5.28]\nD. [3.86, 4.23]",
	         "./show_case/SG_audio_1.wav"),  # SG
	        ("What type of business does the first person's son have?",
	         "./show_case/SFT_Fisher_example.wav"),  # SFT_Fisher
	    )
	    # Hand back plain [instruction, audio_path] lists, freshly built per call.
	    return [list(pair) for pair in showcase]


	# Demo UI: a text instruction and an audio file go in, the value returned by
	# call_api is shown in a text box.
	iface = gr.Interface(
	    fn=call_api,
	    inputs=[
	        gr.Textbox(label="Enter text instruction", value="What does the person say?"),
	        gr.Audio(type="filepath", label="Upload Audio", value="./show_case/p225_002.wav"),
	    ],
	    outputs=gr.Textbox(label="Model output"),
	    # gr.Interface has no add_button() method, so the examples are supplied
	    # here directly instead of through a "Show Example" button.
	    examples=load_examples(),
	    # Do not pre-compute example outputs: that would call the remote backend
	    # once per example while the app is starting.
	    cache_examples=False,
	    allow_flagging="never",
	)

	# Start the server only when this file is executed as a script, so that
	# importing the module does not launch it as a side effect.
	if __name__ == '__main__':
	    iface.launch()