File size: 2,110 Bytes
0013d95
 
 
 
 
 
 
bb7af57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0013d95
 
 
 
 
 
 
 
bb7af57
 
0013d95
 
 
 
 
 
bb7af57
 
 
 
 
 
 
 
 
0013d95
 
 
 
 
 
 
 
 
bb7af57
 
 
0013d95
 
bb7af57
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
import gradio as gr
import numpy as np
import librosa
import torch
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
from dotenv import load_dotenv
import openai
from run_command_responses import ResponseManager as rs

# Dispatch table: recognized command label -> seat-control handler to run.
# Consumed by transcribe() after get_command() classifies the utterance.
# NOTE(review): "resoponses" is a typo for "responses"; kept as-is because
# transcribe() references it by this exact spelling.
resoponses = {
    "heated_seats_on": rs.activate_heated_seats,
    "heated_seats_off": rs.deactivate_heated_seats,
    "cooled_seats_on": rs.activate_cooled_seats,
    "cooled_seats_off": rs.deactivate_cooled_seats,
    "massage_seats_on": rs.activate_massage_seats,
    "massage_seats_off": rs.deactivate_massage_seats,
}

# Maps the single-token numeric class id emitted by the fine-tuned
# completion model (see get_command) to its command label. Ids start at 1.
id2label = dict(
    enumerate(
        [
            "massage_seats_on",
            "massage_seats_off",
            "heated_seats_on",
            "heated_seats_off",
            "cooled_seats_on",
            "cooled_seats_off",
        ],
        start=1,
    )
)

# Load environment configuration (.env supplies OPENAI_API_KEY and MODEL).
load_dotenv()

# Make the bundled ffprobe binary discoverable by the audio stack.
# Fix: the original literal ".\env\Lib\site-packages\ffprobe" contained "\f",
# which Python interprets as a form-feed character, corrupting the path; a
# raw string keeps every backslash literal. Also prepend os.pathsep so the
# new entry does not fuse with the previous PATH element.
os.environ["PATH"] += os.pathsep + r".\env\Lib\site-packages\ffprobe"

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
MODEL = os.getenv("MODEL")  # fine-tuned completion model id used by get_command

openai.api_key = OPENAI_API_KEY

# Pretrained speech-to-text model and matching processor
# (downloaded from the Hugging Face hub on first run).
model = Speech2TextForConditionalGeneration.from_pretrained(
    "facebook/s2t-small-librispeech-asr"
)
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")

def get_command(command, model, id2label):
    """Classify a transcribed utterance into a seat-control command label.

    Sends the transcription to a fine-tuned OpenAI completion model that
    answers with a single token: the numeric class id. The id is then mapped
    to its label through *id2label*.

    Args:
        command: Transcribed text of the spoken command.
        model: Name/id of the fine-tuned completion model.
        id2label: Mapping of class id (int) to command label (str).

    Returns:
        The matching label, or "unknown" when the returned id is not mapped.
    """
    # Legacy (openai<1.0) Completions API. max_tokens=1 and temperature=0
    # force a single deterministic class-id token.
    completion = openai.Completion.create(
        model=model, prompt=f"{command}->", max_tokens=1, temperature=0
    )
    # Renamed from `id`, which shadowed the builtin; dict.get replaces the
    # two-step membership test of the original.
    class_id = int(completion["choices"][0]["text"].strip())
    return id2label.get(class_id, "unknown")


def transcribe(audio):
    """Transcribe a recorded audio file and execute the matching seat command.

    Args:
        audio: Path to the recorded audio file (Gradio ``type="filepath"``).

    Returns:
        The recognized command label, or "unknown" if no command matched.
    """
    # Downsample to the 16 kHz sampling rate the Speech2Text model expects.
    waveform, rate = librosa.load(audio, sr=16000)
    features = processor(waveform, sampling_rate=rate, return_tensors="pt")
    generated_ids = model.generate(
        features["input_features"], attention_mask=features["attention_mask"]
    )
    # batch_decode returns a list of strings; we decode a single utterance,
    # so take the first element. (The original passed the whole list, which
    # turned the classification prompt into "['...']->".)
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    result = get_command(transcription, MODEL, id2label)
    # Guard against "unknown": resoponses.get() would return None and the
    # original unconditional call raised TypeError in that case.
    handler = resoponses.get(result)
    if handler is not None:
        handler()
    return result


if __name__ == "__main__":
    # Record from the microphone, hand the saved file path to transcribe(),
    # and show the recognized command label as plain text.
    demo = gr.Interface(
        fn=transcribe,
        inputs=gr.Audio(source="microphone", type="filepath"),
        outputs="text",
    )
    demo.launch()