brightlembo committed
Commit 87fd27b · verified · 1 parent: 9e1bc30

Create app.py

Files changed (1): app.py (+112 −0)
app.py ADDED
import gradio as gr
from transformers import (
    BlipProcessor,
    BlipForQuestionAnswering,
    pipeline,
)
from modelscope.pipelines import pipeline as ms_pipeline
from PIL import Image

def load_models():
    # Load the BLIP visual question answering model
    blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
    blip_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

    # Audio transcription model
    audio_transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small")

    # Text generation model (freely available GPT-2)
    text_generator = pipeline("text-generation", model="gpt2")

    return blip_processor, blip_model, audio_transcriber, text_generator

def analyze_image(image, blip_processor, blip_model):
    # Questions used to probe the image
    questions = [
        "What is in the picture?",
        "What are the main colors?",
        "What is the setting or background?",
        "What is happening in the image?",
    ]

    responses = {}
    for question in questions:
        inputs = blip_processor(images=image, text=question, return_tensors="pt")
        outputs = blip_model.generate(**inputs)
        answer = blip_processor.decode(outputs[0], skip_special_tokens=True)
        responses[question] = answer

    description = f"This image shows {responses['What is in the picture?']}. "
    description += f"The main colors are {responses['What are the main colors?']}. "
    description += f"The setting is {responses['What is the setting or background?']}. "
    description += f"In the scene, {responses['What is happening in the image?']}."

    return description

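# A quick standalone check of the VQA step (illustrative only; "photo.jpg" is
# a hypothetical local file, and load_models() downloads weights on first use):
#
#   processor, model, _, _ = load_models()
#   print(analyze_image(Image.open("photo.jpg"), processor, model))
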
def process_inputs(image, audio, text, models):
    blip_processor, blip_model, audio_transcriber, text_generator = models

    final_prompt = ""

    # Analyze the image, if one was provided
    if image is not None:
        image_description = analyze_image(image, blip_processor, blip_model)
        final_prompt += f"Visual description: {image_description}\n"

    # Transcribe the audio, if present
    if audio is not None:
        audio_text = audio_transcriber(audio)["text"]
        final_prompt += f"Audio content: {audio_text}\n"

    # Append the free-form text, if present
    if text:
        final_prompt += f"Additional context: {text}\n"

    # Expand the combined prompt with GPT-2 (max_length caps prompt plus continuation)
    prompt_enhancement = text_generator(
        final_prompt,
        max_length=200,
        num_return_sequences=1
    )[0]["generated_text"]

    # Build the ModelScope text-to-video pipeline (re-created on every call)
    video_pipeline = ms_pipeline(
        'text-to-video-synthesis',
        model='damo/text-to-video-synthesis'
    )

    result = video_pipeline({
        'text': prompt_enhancement,
        'output_video_path': 'output.mp4'
    })

    return 'output.mp4', prompt_enhancement

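# ModelScope's published text-to-video example returns the rendered clip via
# OutputKeys rather than honoring an 'output_video_path' input, so depending on
# the installed modelscope version the call above may need this form instead
# (a sketch, not verified against every release):
#
#   from modelscope.outputs import OutputKeys
#   result = video_pipeline({'text': prompt_enhancement})
#   return result[OutputKeys.OUTPUT_VIDEO], prompt_enhancement
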
# Gradio interface
def create_interface():
    models = load_models()

    interface = gr.Interface(
        fn=lambda img, audio, txt: process_inputs(img, audio, txt, models),
        inputs=[
            gr.Image(type="pil", label="Upload Image"),
            gr.Audio(type="filepath", label="Upload Audio"),
            gr.Textbox(label="Enter Additional Text")
        ],
        outputs=[
            gr.Video(label="Generated Video"),
            gr.Textbox(label="Generated Prompt")
        ],
        title="Multimodal Content to Video Generator",
        description="Upload an image, audio, or text (or any combination) to generate a video."
    )

    return interface

# Application entry point
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()
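
The file imports gradio, transformers, modelscope, and Pillow, so the Space also needs those packages installed. A minimal requirements.txt sketch (package names assumed, versions unpinned; modelscope's text-to-video pipeline pulls sizable extra dependencies of its own, and Whisper decoding of uploaded audio additionally expects ffmpeg on the system):

    gradio
    torch
    transformers
    modelscope
    pillow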