Spaces:

NikhilJoson
/

Multimodal_Chat_JanusPro

Running on Zero

App Files Files Community

NikhilJoson commited on Feb 24

Commit

61e1955

verified ·

1 Parent(s): e200100

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -34

app.py CHANGED Viewed

@@ -1,45 +1,95 @@
-import torch
-import argparse
 import gradio as gr
-from janus import JanusProcessor, JanusForConditionalGeneration
-from transformers import AutoTokenizer
-# Load Model and Processor
-model_id = "allenai/janus-pro-7b"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-processor = JanusProcessor.from_pretrained(model_id)
-model = JanusForConditionalGeneration.from_pretrained(
-    model_id, torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
-).to(device)
-def chat_with_model(history, user_input, image=None):
-    if image is not None:
-        inputs = processor(text=user_input, images=image, return_tensors="pt").to(device)
-    else:
-        inputs = processor(text=user_input, return_tensors="pt").to(device)
-    generated_ids = model.generate(**inputs, max_new_tokens=100)
-    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    history.append((user_input, response))
-    return history, ""
-with gr.Blocks() as demo:
-    gr.Markdown("# Chat with Janus Pro 7B (Multimodal AI)")
-    chat_history = gr.State([])
-    chatbot = gr.Chatbot()
-    user_input = gr.Textbox(label="Your message")
-    image_input = gr.Image(label="Upload an image (optional)", type="pil", optional=True)
-    send_btn = gr.Button("Send")
-    send_btn.click(chat_with_model, inputs=[chat_history, user_input, image_input], outputs=[chatbot, user_input])
-    # gr.Examples([
-    #     ["Describe this image", "example_image.jpg"],
-    #     ["Generate an image of a futuristic city"],
-    # ], inputs=[user_input, image_input])
-demo.launch()

 import gradio as gr
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM
+from janus.models import MultiModalityCausalLM, VLChatProcessor
+from janus.utils.io import load_pil_images
+from PIL import Image
+import numpy as np
+import os
+import time
+from Upsample import RealESRGAN
+import spaces  # Import spaces for ZeroGPU compatibility
+# Load model and processor
+model_path = "deepseek-ai/Janus-Pro-7B"
+config = AutoConfig.from_pretrained(model_path)
+language_config = config.language_config
+language_config._attn_implementation = 'eager'
+vl_gpt = AutoModelForCausalLM.from_pretrained(model_path, language_config=language_config, trust_remote_code=True)
+if torch.cuda.is_available():
+    vl_gpt = vl_gpt.to(torch.bfloat16).cuda()
+else:
+    vl_gpt = vl_gpt.to(torch.float16)
+vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
+tokenizer = vl_chat_processor.tokenizer
+cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+# SR model
+sr_model = RealESRGAN(torch.device('cuda' if torch.cuda.is_available() else 'cpu'), scale=2)
+sr_model.load_weights(f'weights/RealESRGAN_x2.pth', download=False)
+@torch.inference_mode()
+@spaces.GPU(duration=120)
+def multimodal_understanding(image, question, seed, top_p, temperature, progress=gr.Progress(track_tqdm=True)):
+    # Clear CUDA cache before generating
+    torch.cuda.empty_cache()
+    # set seed
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    torch.cuda.manual_seed(seed)
+    conversation = [
+        {
+            "role": "<|User |>",
+            "content": f"<image_placeholder>\n{question}",
+            "images": [image],
+        },
+        {"role": "<|Assistant|>", "content": ""},
+    ]
+    pil_images = [Image.fromarray(image)]
+    prepare_inputs = vl_chat_processor(conversations=conversation, images=pil_images, force_batchify=True
+                                      ).to(cuda_device, dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16)
+    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
+    outputs = vl_gpt.language_model.generate(inputs_embeds=inputs_embeds, attention_mask=prepare_inputs.attention_mask,
+                                             pad_token_id=tokenizer.eos_token_id, bos_token_id=tokenizer.bos_token_id,
+                                             eos_token_id=tokenizer.eos_token_id, max_new_tokens=512, temperature=temperature, top_p=top_p,
+                                             do_sample=False if temperature == 0 else True, use_cache=True,)
+    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
+    return answer
+# Gradio interface
+css = '''
+.gradio-container {max-width: 960px !important}
+'''
+with gr.Blocks(css=css) as demo:
+    gr.Markdown("# Janus Pro 7B Chat Interface")
+    chat_history = gr.Chatbot(label="Chat History")
+    message_input = gr.Textbox(label="Type your message here...")
+    image_input = gr.Image(label="Upload an image (optional)", type="numpy", tool="editor")
+    def respond(message, image):
+        # Here you can add logic to handle the image if provided
+        if image is not None:
+            # Call multimodal understanding with the image and message
+            response = multimodal_understanding(image, message, seed=42, top_p=0.95, temperature=0.1)
+        else:
+            # If no image is provided, just respond with a text-based answer
+            response = "Please provide an image for multimodal understanding."
+        return response
+    def submit_message(message, image):
+        response = respond(message, image)
+        return message, response
+    message_input.submit(submit_message, inputs=[message_input, image_input], outputs=[message_input, chat_history])
+demo.launch(share=True)