Spaces:

brightlembo
/

SY23

Sleeping

App Files Files Community

brightlembo committed on Jan 17

Commit

87fd27b

verified ·

1 Parent(s): 9e1bc30

Create app.py

Browse files

Files changed (1) hide show

app.py +112 -0

app.py ADDED Viewed

	@@ -0,0 +1,112 @@

+import gradio as gr
+import torch
+from transformers import (
+    BlipProcessor,
+    BlipForQuestionAnswering,
+    pipeline,
+    AutoTokenizer,
+    AutoModelForCausalLM
+)
+from modelscope.pipelines import pipeline as ms_pipeline
+from PIL import Image
def load_models():
    """Instantiate every model the app needs except the video pipeline.

    Returns:
        tuple: ``(blip_processor, blip_model, audio_transcriber,
        text_generator)`` in that order:

        * the BLIP processor and visual-question-answering model, both
          loaded from the ``Salesforce/blip-vqa-base`` checkpoint;
        * a speech-recognition pipeline backed by ``openai/whisper-small``;
        * a text-generation pipeline backed by ``gpt2``.
    """
    # Visual question answering (processor and model share one checkpoint).
    vqa_checkpoint = "Salesforce/blip-vqa-base"
    processor = BlipProcessor.from_pretrained(vqa_checkpoint)
    vqa_model = BlipForQuestionAnswering.from_pretrained(vqa_checkpoint)

    # Audio transcription.
    speech_to_text = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-small",
    )

    # Prompt expansion with the freely available GPT-2 weights.
    prompt_writer = pipeline("text-generation", model="gpt2")

    return processor, vqa_model, speech_to_text, prompt_writer
def analyze_image(image, blip_processor, blip_model):
    """Describe *image* by asking the BLIP VQA model four fixed questions.

    Each question is encoded together with the image by ``blip_processor``,
    answered by ``blip_model.generate`` and decoded back to text; the four
    answers are then slotted into a fixed English sentence frame.

    Args:
        image: The image forwarded to ``blip_processor`` as ``images=``.
        blip_processor: Callable processor that also exposes ``decode``.
        blip_model: Model exposing ``generate(**inputs)``.

    Returns:
        str: A four-sentence description built from the model's answers.
    """
    # (question put to the model, sentence frame that receives its answer)
    sentence_frames = (
        ("What is in the picture?", "This image shows {}. "),
        ("What are the main colors?", "The main colors are {}. "),
        ("What is the setting or background?", "The setting is {}. "),
        ("What is happening in the image?", "In the scene, {}."),
    )

    fragments = []
    for prompt, frame in sentence_frames:
        encoded = blip_processor(images=image, text=prompt, return_tensors="pt")
        generated = blip_model.generate(**encoded)
        reply = blip_processor.decode(generated[0], skip_special_tokens=True)
        fragments.append(frame.format(reply))

    return "".join(fragments)
# Text-to-video pipeline, created on first use and then shared by every
# request so the model is not re-instantiated per call.
_VIDEO_PIPELINE = None


def _get_video_pipeline():
    """Return the shared ModelScope text-to-video pipeline, building it once."""
    global _VIDEO_PIPELINE
    if _VIDEO_PIPELINE is None:
        _VIDEO_PIPELINE = ms_pipeline(
            'text-to-video-synthesis',
            model='damo/text-to-video-synthesis'
        )
    return _VIDEO_PIPELINE


def process_inputs(image, audio, text, models):
    """Turn any combination of image, audio and text into a generated video.

    The inputs that are present are converted to text (image -> BLIP
    description, audio -> transcription, text -> used as is), concatenated
    into one prompt, expanded by the text generator and finally handed to the
    ModelScope text-to-video pipeline.

    Args:
        image: Image to describe with ``analyze_image``, or ``None``.
        audio: Audio input forwarded to the transcriber, or ``None``.
        text: Additional free-form context; ignored when empty or ``None``.
        models: 4-tuple ``(blip_processor, blip_model, audio_transcriber,
            text_generator)``.

    Returns:
        tuple: ``(video_path, prompt)`` where ``video_path`` is the file path
        reported by the video pipeline and ``prompt`` is the expanded prompt
        that was used to generate it.

    Raises:
        ValueError: If none of ``image``, ``audio`` or ``text`` is provided.
    """
    blip_processor, blip_model, audio_transcriber, text_generator = models

    sections = []
    # Image analysis, when an image was supplied.
    if image is not None:
        image_description = analyze_image(image, blip_processor, blip_model)
        sections.append(f"Visual description: {image_description}\n")
    # Audio transcription, when audio was supplied.
    if audio is not None:
        audio_text = audio_transcriber(audio)["text"]
        sections.append(f"Audio content: {audio_text}\n")
    # User-provided text, when non-empty.
    if text:
        sections.append(f"Additional context: {text}\n")

    # Fail fast instead of running the expensive generation models on an
    # empty prompt.
    if not sections:
        raise ValueError(
            "Provide at least one of: an image, an audio file, or some text."
        )
    final_prompt = "".join(sections)

    # Expand the prompt with the text generator. ``max_new_tokens`` bounds
    # only the continuation; ``max_length`` also counts the prompt tokens, so
    # a long transcription could leave no room for generation at all.
    prompt_enhancement = text_generator(
        final_prompt,
        max_new_tokens=100,
        num_return_sequences=1
    )[0]["generated_text"]

    # Local import: only needed once the video stage is actually reached.
    from modelscope.outputs import OutputKeys

    # The pipeline chooses where the video is written and reports that path
    # in its result; return that path rather than assuming a fixed file name.
    result = _get_video_pipeline()({'text': prompt_enhancement})
    video_path = result[OutputKeys.OUTPUT_VIDEO]
    return video_path, prompt_enhancement
# Gradio interface
def create_interface():
    """Load the models and build the Gradio UI around ``process_inputs``.

    The models are loaded once here and captured by the handler, so each
    request reuses them.

    Returns:
        gr.Interface: Interface taking an image, an audio file and a text
        box, and producing a video plus the prompt used to generate it.
    """
    loaded_models = load_models()

    input_widgets = [
        gr.Image(type="pil", label="Upload Image"),
        gr.Audio(type="filepath", label="Upload Audio"),
        gr.Textbox(label="Enter Additional Text"),
    ]
    output_widgets = [
        gr.Video(label="Generated Video"),
        gr.Textbox(label="Generated Prompt"),
    ]

    return gr.Interface(
        # Bind the preloaded models; Gradio supplies only the three inputs.
        fn=lambda img, audio, txt: process_inputs(img, audio, txt, loaded_models),
        inputs=input_widgets,
        outputs=output_widgets,
        title="Multimodal Content to Video Generator",
        description="Upload an image, audio, or text (or any combination) to generate a video.",
    )
# Application entry point: build the UI and start serving it when this file
# is executed as a script.
if __name__ == "__main__":
    create_interface().launch()