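"""Voice-enabled smolagents demo.

Builds a CodeAgent backed by a Hugging Face model and serves it through
smolagents' GradioUI, with a speech-to-text helper that transcribes microphone
input via the Hugging Face Inference API. Requires an HF_TOKEN in the
environment (or in a .env file loaded by python-dotenv).
"""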
import io
import os

import soundfile as sf
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
from smolagents import CodeAgent, GradioUI, HfApiModel

load_dotenv()

def convert_data_to_audio_filelike(audio_tuple):
    """Convert a (sample_rate, np.ndarray) tuple into an in-memory WAV file."""
    sample_rate, audio_data = audio_tuple
    buffer = io.BytesIO()
    # Encode the raw samples as WAV into the in-memory buffer.
    sf.write(buffer, audio_data, sample_rate, format="WAV")
    buffer.seek(0)  # Rewind so the caller reads from the start.
    return buffer

def speech2text_func(data, model: str = "openai/whisper-small.en") -> str:
    """Transcribe audio to text with a Whisper model via the Hugging Face Inference API."""
    # Gradio microphone input arrives as a (sample_rate, np.ndarray) tuple;
    # convert it to WAV bytes before sending it to the API.
    if isinstance(data, tuple):
        buffer = convert_data_to_audio_filelike(data)
        data = buffer.read()
    client = InferenceClient(
        provider="hf-inference",
        api_key=os.getenv("HF_TOKEN"),
    )
    return client.automatic_speech_recognition(data, model=model).text

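# Illustrative call to speech2text_func outside the Gradio UI (a hedged sketch,
# not executed here; the array below is a hypothetical stand-in for real
# microphone audio in the (sample_rate, np.ndarray) format Gradio produces):
#
#   import numpy as np
#   sample_rate = 16_000
#   waveform = np.zeros(sample_rate, dtype=np.float32)  # one second of silence
#   print(speech2text_func((sample_rate, waveform)))
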
def get_tools():
    """Return the agent's custom tools and whether to add smolagents' base tools."""
    add_base_tools = True
    tools_list = []  # Custom tools for the agent would be appended here.
    return tools_list, add_base_tools

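# get_tools currently returns an empty list; a custom tool could be registered
# with smolagents' @tool decorator and appended to tools_list, e.g. (hedged
# sketch, the tool below is a hypothetical example, not part of this app):
#
#   from smolagents import tool
#
#   @tool
#   def reverse_text(text: str) -> str:
#       """Reverse the given text.
#
#       Args:
#           text: The string to reverse.
#       """
#       return text[::-1]
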
if __name__ == "__main__":
    tools_list, add_base_tools = get_tools()
    model = HfApiModel("Qwen/Qwen2.5-Coder-32B-Instruct", provider=None)
    agent = CodeAgent(
        tools=tools_list,
        model=model,
        add_base_tools=add_base_tools,
        additional_authorized_imports=["web_search"],
    )
    # Launch the web UI, wiring in the speech-to-text callback for voice input.
    GradioUI(agent).launch(speech2text_func=speech2text_func)