File size: 2,347 Bytes
445397d
 
 
 
 
f16a2a2
445397d
 
f16a2a2
 
445397d
f16a2a2
445397d
f16a2a2
445397d
 
f16a2a2
445397d
 
f16a2a2
445397d
 
 
f16a2a2
445397d
 
f16a2a2
445397d
 
f16a2a2
 
 
 
 
445397d
f16a2a2
445397d
f16a2a2
 
 
 
 
 
 
 
 
 
 
 
 
445397d
f16a2a2
445397d
f16a2a2
445397d
f16a2a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445397d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os

import gradio as gr
import pandas as pd
import torch
from faster_whisper import WhisperModel

# Whisper checkpoint to load
model_size = "large-v2"

# Prefer the first CUDA device when one is available
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# FP16 on GPU for throughput; INT8 on CPU to keep memory and latency low
_backend, _precision = ("cuda", "float16") if device == "cuda:0" else ("cpu", "int8")
model_whisper = WhisperModel(model_size, device=_backend, compute_type=_precision)

# Function to get filename from file object
def get_filename(file_obj):
    """Return the base filename of an uploaded file object.

    Args:
        file_obj: any object with a ``.name`` attribute holding a path
            (e.g. a Gradio temp-file wrapper).

    Returns:
        The final path component as a string.
    """
    # os.path.basename is platform-correct, unlike splitting on "/",
    # which silently fails on Windows-style separators.
    return os.path.basename(file_obj.name)

# Function to transcribe audio to text
def audio_to_transcript(file_obj):
    """Transcribe an audio file and save the segments to a CSV file.

    Args:
        file_obj: either a path string or a file-like object with a
            ``.name`` attribute (Gradio passes either depending on the
            component's ``type``).

    Returns:
        A 3-tuple of (base filename, gr.File update pointing at the CSV,
        DataFrame with ``start``/``end``/``text`` columns).
    """
    # Explicit type dispatch instead of a bare `except:`, which would
    # swallow unrelated errors (including KeyboardInterrupt) from
    # transcription itself and retry with the wrong argument.
    if isinstance(file_obj, str):
        audio_path = file_obj
    else:
        audio_path = file_obj.name
    filename = os.path.basename(audio_path)

    segments, _ = model_whisper.transcribe(audio_path, beam_size=5, vad_filter=True)

    # `segments` is a lazy generator; materialize it once into rows.
    rows = [(segment.start, segment.end, segment.text) for segment in segments]
    df = pd.DataFrame(rows, columns=["start", "end", "text"])

    # splitext strips only the extension, so filenames containing dots
    # (e.g. "my.take2.wav") keep their full stem — split(".")[0] did not.
    csv_file = os.path.splitext(filename)[0] + ".csv"
    df.to_csv(csv_file, encoding="utf-8", index=False)
    path_to_csv = gr.File.update(value=csv_file, visible=True)

    return filename, path_to_csv, df

## Gradio Interface Setup
headers = ["start", "end", "text"]

# Build the components up front so the Interface call stays readable.
_audio_input = gr.File(label="Upload an Audio File", type="filepath")
_result_outputs = [
    gr.Textbox(label="Audio file name"),
    gr.File(label="Transcript CSV file"),
    gr.DataFrame(label="Transcript", headers=headers),
]

iface = gr.Interface(
    fn=audio_to_transcript,
    inputs=_audio_input,
    outputs=_result_outputs,
    allow_flagging="never",
    title="Audio to Transcript",
    description="Upload an audio file, and this tool will return a transcript with time-stamped segments.",
    theme="compact",  # Enhanced UI theme for simplicity
)

iface.launch(debug=True)