File size: 3,601 Bytes
b29e61c
1e5834c
5906e88
e1ef382
1e5834c
4d71280
b29e61c
1e5834c
b3948c4
d2e72fa
b3948c4
1e5834c
 
b3948c4
d2e72fa
b3948c4
1e5834c
b29e61c
1e5834c
 
 
e1ef382
 
 
 
1e5834c
2ffd15f
 
4d71280
c3277a4
1412907
 
1e5834c
1412907
1e5834c
03c7515
 
1e5834c
 
 
d88a33d
1e5834c
1412907
bf18264
 
1e5834c
1412907
 
 
 
 
 
 
 
 
 
2d57870
1412907
 
 
 
 
 
e1ef382
2d57870
 
e1ef382
 
3a3b17f
1e5834c
e1ef382
76d8e4f
e1ef382
1e5834c
e1ef382
bf7c926
 
1e5834c
 
3a3b17f
 
 
bf18264
e1ef382
bf18264
 
 
3a3b17f
bf18264
 
 
 
 
 
3a3b17f
 
 
2d57870
3a3b17f
 
 
 
e1ef382
3a3b17f
76339ca
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import html

import gradio as gr
from huggingface_hub import hf_hub_download
import pandas as pd
import numpy as np

from audio_index import AudioEmbeddingSystem
from search import search, get_prompt

# Hub dataset repository that both artifacts below are fetched from.
_DATASET_REPO_ID = "freddyaboulton/common-voice-english-audio"


def _fetch_dataset_file(filename):
    """Fetch *filename* from the dataset repo and return hf_hub_download's result."""
    return hf_hub_download(
        repo_id=_DATASET_REPO_ID,
        filename=filename,
        repo_type="dataset",
    )


# Fetched in this order at import time: first the database file, then the index.
db_file = _fetch_dataset_file("audio_db_full.sqlite")
index_file = _fetch_dataset_file("audio_faiss_full.index")

# Module-level search backend shared by every request handled by the app.
audio_embedding_system = AudioEmbeddingSystem(
    index_path=index_file,
    db_path=db_file,
)


def audio_search(audio_tuple, prompt: str):
    """Look up the voices closest to and furthest from a recorded clip.

    Args:
        audio_tuple: ``(sample_rate, samples)`` pair, or ``None`` when no clip
            was supplied. ``samples`` must expose a NumPy-style ``dtype``.
        prompt: Value of the suggested-prompt textbox. It is not used here; it
            is accepted because the interface wires that textbox in as a
            second input.

    Returns:
        ``gr.skip()`` when ``audio_tuple`` is ``None``. Otherwise a pandas
        ``Styler`` over the columns ``path``, ``audio``, ``sentence`` and
        ``distance``, sorted by ascending distance, with the ``path`` column
        colour-coded by position.
    """
    if audio_tuple is None:
        return gr.skip()

    sample_rate, array = audio_tuple
    if array.dtype == np.int16:
        # Rescale 16-bit integer samples to float32 in [-1.0, 1.0).
        array = array.astype(np.float32) / 32768.0

    query = (sample_rate, array)
    most_similar = audio_embedding_system.search(query)
    least_similar = audio_embedding_system.search(query, least_similar=True)
    rows = [*most_similar, *least_similar]

    # Index the looked-up dataset rows by path once instead of rescanning them
    # for every hit. Later duplicates overwrite earlier ones, so the last
    # match for a path wins.
    rows_by_path = {found["row"]["path"]: found["row"] for found in search(rows)}

    for row in rows:
        match = rows_by_path.get(row["path"])
        if match is None:
            continue
        row["sentence"] = match["sentence"]
        # Quote and escape the URL so characters such as spaces, quotes or
        # '&' cannot break out of (or corrupt) the src attribute.
        src = html.escape(match["audio"][0]["src"], quote=True)
        row["audio"] = [f'<audio src="{src}" controls />']

    df = pd.DataFrame(rows)[["path", "audio", "sentence", "distance"]].sort_values(
        by="distance", ascending=True
    )

    def style_path_column(col):
        # The frame is already sorted by distance, so position alone picks the
        # colour: the first 5 cells are green, every later cell is red.
        # NOTE(review): 5 presumably mirrors the number of "most similar" hits
        # returned by the search backend - confirm if that default changes.
        closest = 'background-color: rgba(0, 255, 0, 0.3)'
        furthest = 'background-color: rgba(255, 0, 0, 0.3)'
        return [closest if position < 5 else furthest for position in range(len(col))]

    # Style only the 'path' column and hand the Styler back to the caller.
    return df.style.apply(style_path_column, subset=['path'])

# Textbox pre-filled with a prompt the user can read aloud. It is created
# outside the Interface so other code can attach events to it.
sample_text = gr.Textbox(
    value=get_prompt(),
    info="Unsure what to record? Use this prompt. Hit Enter to get a new one from the common voice dataset",
    label="Suggested Prompt",
)

# Input for the user's clip: microphone recording or file upload.
_voice_input = gr.Audio(
    sources=["microphone", "upload"],
    label="Record or upload a clip of your voice",
)

# Result table; the "audio" column is declared as HTML.
_results_table = gr.Dataframe(
    datatype=["str", "html", "str", "number"],
    headers=["path", "audio", "sentence", "distance"],
    show_label=False,
)

# audio_search receives the clip and the textbox value, in that order.
iface = gr.Interface(
    fn=audio_search,
    inputs=[_voice_input, sample_text],
    outputs=_results_table,
)
# Static page fragments. The whitespace inside each literal is kept verbatim
# so the rendered values are unchanged.
_TITLE_HTML = """
        <h1 style='text-align: center; display: flex; align-items: center; justify-content: center;'>
        <img src="/gradio_api/file=Karaoke_Huggy.png" alt="Voice Match" style="height: 100px; margin-right: 10px"> Voice Match
        </h1>
        """

_POWERED_BY_HTML = """
    <h2 style='text-align: center'>
    Powered by <a href="https://huggingface.co/rimelabs/rimecaster">RimeCaster</a>
    </h2>
    """

_INTRO_MARKDOWN = """
        <div style='text-align: center'>
        Record or upload an English clip of your voice and we'll find the most similar (and dissimilar) voices in the <a href="https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0">Common Voice dataset</a>.
        </div>
        """

with gr.Blocks() as demo:
    # Header, attribution and description, followed by the search interface.
    gr.HTML(_TITLE_HTML)
    gr.HTML(_POWERED_BY_HTML)
    gr.Markdown(_INTRO_MARKDOWN)
    iface.render()

    # Submitting the prompt textbox replaces its value with a new prompt.
    sample_text.submit(fn=get_prompt, outputs=sample_text, inputs=None)

# The logo file is listed in allowed_paths; the header's <img> tag points at it.
demo.launch(allowed_paths=["Karaoke_Huggy.png"])