File size: 4,387 Bytes
ae71d4b
 
 
 
27a58ec
ae71d4b
 
 
6e9b076
ae71d4b
 
6e9b076
ae71d4b
 
 
 
 
 
 
b68b40e
68c97ea
82ab075
9d22775
b68b40e
 
 
ae71d4b
 
 
 
 
bc0b828
7e14a8d
ae71d4b
27fd4f5
 
 
 
 
 
 
 
bbb7e65
 
 
 
27fd4f5
caaf71e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae71d4b
 
bbb7e65
ae71d4b
 
 
 
 
 
 
 
 
 
a1ec3a4
ae71d4b
 
 
 
 
 
 
 
 
 
 
 
 
 
6a34194
 
 
 
ae71d4b
 
 
 
 
27fd4f5
 
 
 
 
 
caaf71e
 
27fd4f5
ae71d4b
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import gradio as gr
import librosa
import numpy as np
import torch
import requests

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

speaker_embeddings = {
    "BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    "SLT": "spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
}

def getNews(search_key):
    return requests.get ("https://newsapi.org/v2/everything?q=" +search_key+ "&pageSize=3&apiKey=3bca07c913ec4703a23f6ba03e15b30b").content.decode("utf-8")
#    return requests.get ("https://newsapi.org/v2/top-headlines?country=us&apiKey=3bca07c913ec4703a23f6ba03e15b30b").content.decode("utf-8" )

def getHeadlines():
    return requests.get ("https://newsapi.org/v2/top-headlines?country=us&apiKey=3bca07c913ec4703a23f6ba03e15b30b").content.decode("utf-8")


def predict(text, speaker):
    if len(text.strip()) == 0:
        return (16000, np.zeros(0).astype(np.int16))

    # text = getNews ()
    # inputs = processor(text=text, return_tensors="pt")

    if Interest == "":
        inputs = processor(text=getNews(Interest), 
                           return_tensors="pt")
    else
        inputs = processor(text=getNews(text), 
                           return_tensors="pt")

        
    # limit input length
    input_ids = inputs["input_ids"]
    input_ids = input_ids[..., :model.config.max_text_positions]

    
    if speaker == "Surprise Me!":
        # load one of the provided speaker embeddings at random
        idx = np.random.randint(len(speaker_embeddings))
        key = list(speaker_embeddings.keys())[idx]
        speaker_embedding = np.load(speaker_embeddings[key])

        # randomly shuffle the elements
        np.random.shuffle(speaker_embedding)

        # randomly flip half the values
        x = (np.random.rand(512) >= 0.5) * 1.0
        x[x == 0] = -1.0
        speaker_embedding *= x

        #speaker_embedding = np.random.rand(512).astype(np.float32) * 0.3 - 0.15
    else:
        speaker_embedding = np.load(speaker_embeddings[speaker[:3]])

    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)


title = "SpeechT5: Speech Synthesis"

description = """
The <b>SpeechT5</b> model is pre-trained on text as well as speech inputs, with targets that are also a mix of text and speech.
By pre-training on text and speech at the same time, it learns unified representations for both, resulting in improved modeling capabilities.
"""

article = """
<div style='margin:20px auto;'>

<p>References: <a href="https://arxiv.org/abs/2110.07205">SpeechT5 paper</a> |
<a href="https://github.com/microsoft/SpeechT5/">original GitHub</a> |
<a href="https://huggingface.co/mechanicalsea/speecht5-tts">original weights</a></p>

<p>Speaker embeddings were generated from <a href="http://www.festvox.org/cmu_arctic/">CMU ARCTIC</a> using <a href="https://huggingface.co/mechanicalsea/speecht5-vc/blob/main/manifest/utils/prep_cmu_arctic_spkemb.py">this script</a>.</p>

</div>
"""

examples = [
    ["It is not in the stars to hold our destiny but in ourselves.", "BDL (male)"],
    ["The octopus and Oliver went to the opera in October.", "CLB (female)"],
    ["She sells seashells by the seashore. I saw a kitten eating chicken in the kitchen.", "RMS (male)"],
    ["How much wood would a woodchuck chuck if a woodchuck could chuck wood? He would chuck, he would, as much as he could, and chuck as much wood as a woodchuck would if a woodchuck could chuck wood.", "CLB (female)"],
]

gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="keyword"),
        gr.Radio(label="Interest", choices=[
            "Minecraft",
            "Technology",
            "Politics",
            "KPop",
            "Surprise Me!"
        ],
        value="KPop"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()