Voice_Commands / app.py
zinoubm's picture
finished setup
bb7af57
raw
history blame
2.11 kB
import os
import gradio as gr
import numpy as np
import librosa
import torch
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
from dotenv import load_dotenv
import openai
from run_command_responses import ResponseManager as rs
# Dispatch table: command label -> ResponseManager callable that is invoked
# (with no arguments) once that command has been recognised.
# NOTE(review): the name is a misspelling of "responses"; it is kept as-is
# because other code in this module looks the table up under this name.
resoponses = dict(
    heated_seats_on=rs.activate_heated_seats,
    heated_seats_off=rs.deactivate_heated_seats,
    cooled_seats_on=rs.activate_cooled_seats,
    cooled_seats_off=rs.deactivate_cooled_seats,
    massage_seats_on=rs.activate_massage_seats,
    massage_seats_off=rs.deactivate_massage_seats,
)
# Numeric class id (as emitted by the completion model) -> command label.
# Ids are 1-based and follow the order of the tuple below.
id2label = dict(
    enumerate(
        (
            "massage_seats_on",
            "massage_seats_off",
            "heated_seats_on",
            "heated_seats_off",
            "cooled_seats_on",
            "cooled_seats_off",
        ),
        start=1,
    )
)
# Pull settings from a local .env file into the process environment.
load_dotenv()

# Expose the ffprobe directory inside the local virtualenv on PATH.
# The path is a raw string so that "\f" in "\ffprobe" is not interpreted as a
# form-feed escape, and it is joined with os.pathsep so it becomes its own
# PATH entry instead of being fused onto the last existing one.
_FFPROBE_DIR = r".\env\Lib\site-packages\ffprobe"
os.environ["PATH"] = os.pathsep.join(
    entry for entry in (os.environ.get("PATH", ""), _FFPROBE_DIR) if entry
)

# Credentials and the name of the completion model used for classification;
# both are None when the corresponding variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
MODEL = os.getenv("MODEL")
openai.api_key = OPENAI_API_KEY
# Speech-to-text checkpoint shared by the model and its processor.
_ASR_CHECKPOINT = "facebook/s2t-small-librispeech-asr"

# Loaded once at import time and reused for every transcription request.
model = Speech2TextForConditionalGeneration.from_pretrained(_ASR_CHECKPOINT)
processor = Speech2TextProcessor.from_pretrained(_ASR_CHECKPOINT)
def get_command(command, model, id2label):
    """Classify an utterance into a command label via an OpenAI completion.

    The prompt sent is ``"{command}->"`` and a single token is requested; that
    token is interpreted as an integer key of *id2label*.

    Args:
        command: Text of the utterance to classify.
        model: Name of the OpenAI completion model to query.
        id2label: Mapping from integer class id to command label.

    Returns:
        The label mapped to the predicted id, or ``"unknown"`` when the
        completion is not an integer or the integer is not a key of
        *id2label*.
    """
    completion = openai.Completion.create(
        model=model, prompt=f"{command}->", max_tokens=1, temperature=0
    )
    reply = completion["choices"][0]["text"].strip()

    # The model is free to emit any token; a non-numeric reply means the
    # utterance could not be classified rather than being a fatal error.
    try:
        label_id = int(reply)
    except ValueError:
        return "unknown"

    return id2label.get(label_id, "unknown")
def transcribe(audio):
    """Transcribe a recording, classify it, and run the matching response.

    Args:
        audio: Path to the recorded audio file, or ``None`` when there is
            nothing to process.

    Returns:
        The recognised command label, or ``"unknown"`` when no audio was
        supplied or the utterance did not map to a known command.
    """
    # NOTE(review): presumably the UI passes None when the user submits
    # without recording anything; treat that as an unrecognised command.
    if audio is None:
        return "unknown"

    # Load the file resampled to the 16 kHz rate passed on to the processor.
    waveform, rate = librosa.load(audio, sr=16000)

    inputs = processor(waveform, sampling_rate=rate, return_tensors="pt")
    generated_ids = model.generate(
        inputs["input_features"], attention_mask=inputs["attention_mask"]
    )

    # batch_decode returns one string per batch item; only one clip was
    # submitted, so classify that string instead of the list's repr.
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
    text = transcription[0] if transcription else ""

    result = get_command(text, MODEL, id2label)

    # "unknown" (or any label without a handler) has no entry in the dispatch
    # table; skip the call instead of invoking None.
    handler = resoponses.get(result)
    if handler is not None:
        handler()
    return result
if __name__ == "__main__":
    # Build a microphone-to-text web UI around transcribe() and serve it.
    microphone = gr.Audio(source="microphone", type="filepath")
    demo = gr.Interface(fn=transcribe, inputs=microphone, outputs="text")
    demo.launch()