Update app.py

app.py CHANGED
@@ -32,22 +32,12 @@ text_to_speech_models = {
 conversational_tokenizers = {}
 conversational_models_loaded = {}
 
-for model_name, model_id in conversational_models.items():
-    conversational_tokenizers[model_name] = AutoTokenizer.from_pretrained(model_id)
-    conversational_models_loaded[model_name] = AutoModelForCausalLM.from_pretrained(model_id)
-
 # Initialize pipelines for Text-to-Image
 text_to_image_pipelines = {}
 
-for model_name, model_id in text_to_image_models.items():
-    text_to_image_pipelines[model_name] = StableDiffusionPipeline.from_pretrained(model_id)
-
 # Initialize pipelines for Text-to-Speech
 text_to_speech_pipelines = {}
 
-for model_name, model_id in text_to_speech_models.items():
-    text_to_speech_pipelines[model_name] = pipeline("text-to-speech", model=model_id)
-
 # Initialize pipelines for other tasks
 visual_qa_pipeline = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
 document_qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
@@ -61,9 +51,16 @@ summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cn
 text_to_audio_pipeline = pipeline("text-to-speech", model="julien-c/ljspeech_tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space")
 audio_classification_pipeline = pipeline("audio-classification", model="facebook/wav2vec2-base")
 
+def load_conversational_model(model_name):
+    if model_name not in conversational_models_loaded:
+        tokenizer = AutoTokenizer.from_pretrained(conversational_models[model_name])
+        model = AutoModelForCausalLM.from_pretrained(conversational_models[model_name])
+        conversational_tokenizers[model_name] = tokenizer
+        conversational_models_loaded[model_name] = model
+    return conversational_tokenizers[model_name], conversational_models_loaded[model_name]
+
 def chat(model_name, user_input, history=[]):
-    tokenizer = conversational_tokenizers[model_name]
-    model = conversational_models_loaded[model_name]
+    tokenizer, model = load_conversational_model(model_name)
 
     # Encode the input
     input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors="pt")
@@ -83,11 +80,15 @@ def chat(model_name, user_input, history=[]):
     return history, history
 
 def generate_image(model_name, prompt):
+    if model_name not in text_to_image_pipelines:
+        text_to_image_pipelines[model_name] = StableDiffusionPipeline.from_pretrained(text_to_image_models[model_name])
     pipeline = text_to_image_pipelines[model_name]
     image = pipeline(prompt).images[0]
     return image
 
 def generate_speech(model_name, text):
+    if model_name not in text_to_speech_pipelines:
+        text_to_speech_pipelines[model_name] = pipeline("text-to-speech", model=text_to_speech_models[model_name])
     pipeline = text_to_speech_pipelines[model_name]
     audio = pipeline(text)
     return audio["audio"]
@@ -235,14 +236,4 @@ with gr.Blocks() as demo:
         text_to_audio_generate = gr.Button("Generate Audio")
         text_to_audio_output = gr.Audio(label="Generated Audio")
 
-        text_to_audio_generate.click(text_to_audio, inputs=text_to_audio_text, outputs=text_to_audio_output)
-
-    with gr.Tab("Audio Classification"):
-        audio_classification_audio = gr.Audio(label="Upload Audio")
-        audio_classification_generate = gr.Button("Classify")
-        audio_classification_output = gr.Textbox(label="Classification Result")
-
-        audio_classification_generate.click(audio_classification, inputs=audio_classification_audio, outputs=audio_classification_output)
-
-# Launch the demo
-demo.launch()
+        text_to_audio_generate.click(text_to_audio, inputs=text_to_audio_text, outputs=text_to_audio_output)
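The diff above moves from eager model loading at import time to load-on-first-use: conversational models go through load_conversational_model, and the image and speech pipelines are created the first time generate_image or generate_speech sees a given model name. For reference, a minimal self-contained sketch of that caching pattern follows; the helper name get_tts_pipeline and the display key "LJSpeech Tacotron2" are illustrative only, while the dict names, the model id, and the pipeline call are taken from app.py as shown above.

from transformers import pipeline

# Registry of display names -> model ids (the display key here is illustrative).
text_to_speech_models = {
    "LJSpeech Tacotron2": "julien-c/ljspeech_tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space",
}
text_to_speech_pipelines = {}  # cache: display name -> loaded pipeline

def get_tts_pipeline(model_name):
    # Nothing is loaded at import time; the model is fetched on first use and cached.
    # Assumes a transformers version whose "text-to-speech" pipeline accepts this model,
    # as app.py itself does.
    if model_name not in text_to_speech_pipelines:
        text_to_speech_pipelines[model_name] = pipeline(
            "text-to-speech", model=text_to_speech_models[model_name]
        )
    return text_to_speech_pipelines[model_name]

if __name__ == "__main__":
    tts = get_tts_pipeline("LJSpeech Tacotron2")  # first call downloads/loads the model
    audio = tts("Hello, world")                   # later calls reuse the cached pipeline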