Update app.py

app.py CHANGED
@@ -18,7 +18,9 @@ model_path = "deepseek-ai/Janus-Pro-7B"
 config = AutoConfig.from_pretrained(model_path)
 language_config = config.language_config
 language_config._attn_implementation = 'eager'
-vl_gpt = AutoModelForCausalLM.from_pretrained(model_path,
+vl_gpt = AutoModelForCausalLM.from_pretrained(model_path,
+                                              language_config=language_config,
+                                              trust_remote_code=True)
 if torch.cuda.is_available():
     vl_gpt = vl_gpt.to(torch.bfloat16).cuda()
 else:
@@ -33,7 +35,16 @@ sr_model = RealESRGAN(torch.device('cuda' if torch.cuda.is_available() else 'cpu
 sr_model.load_weights(f'weights/RealESRGAN_x2.pth', download=False)
 
 # Patterns for detecting image generation requests
-GENERATION_PATTERNS = [
+GENERATION_PATTERNS = [
+    r"generate (.+)",
+    r"create (.+)",
+    r"draw (.+)",
+    r"make (.+)",
+    r"show (.+)",
+    r"visualize (.+)",
+    r"imagine (.+)",
+    r"picture (.+)",
+]
 
 def is_generation_request(message):
     """Determine if a message is requesting image generation"""
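Note: this hunk only introduces the pattern list; the body of is_generation_request is unchanged and not shown in the diff. For readers following along, a minimal sketch of how such a list is typically matched (an assumption, not the file's actual implementation):

    import re

    def is_generation_request(message):
        """Determine if a message is requesting image generation"""
        # Case-insensitive search; any pattern hit counts as a generation request.
        return any(re.search(p, message, re.IGNORECASE) for p in GENERATION_PATTERNS)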
@@ -82,9 +93,7 @@ def unified_chat(image, message, chat_history, seed, top_p, temperature, cfg_wei
     if chat_history and len(chat_history) > 0:
         # Get the last few turns of conversation for context (limit to last 3 turns)
         recent_context = chat_history[-3:] if len(chat_history) > 3 else chat_history
-
-        context_text = " ".join([f"{user_msg}" for user_msg, _ in recent_context])
-
+        context_text = " ".join([f"User: {user_msg}" for user_msg, _ in recent_context])
 
         # Only use context if it's not too long
         if len(context_text) < 200:  # Arbitrary length limit
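Note: chat_history holds (user_msg, bot_msg) tuples in gr.Chatbot's classic tuple format, so the new line keeps only the user side of the last three turns. A hypothetical example of the string it builds:

    chat_history = [("describe this image", "It shows a cat."),
                    ("what breed is it?", "Likely a tabby.")]
    # -> context_text == "User: describe this image User: what breed is it?"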
@@ -157,10 +166,18 @@ def unified_chat(image, message, chat_history, seed, top_p, temperature, cfg_wei
 
     inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
 
-    outputs = vl_gpt.language_model.generate(
-
-
-
+    outputs = vl_gpt.language_model.generate(
+        inputs_embeds=inputs_embeds,
+        attention_mask=prepare_inputs.attention_mask,
+        pad_token_id=tokenizer.eos_token_id,
+        bos_token_id=tokenizer.bos_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        max_new_tokens=512,
+        do_sample=False if temperature == 0 else True,
+        use_cache=True,
+        temperature=temperature,
+        top_p=top_p,
+    )
 
     answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
 
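Note: in the new generate() call, temperature == 0 is used as a sentinel for greedy decoding (sampling would otherwise divide logits by zero). The ternary is equivalent to:

    do_sample = temperature > 0  # greedy decoding when the temperature slider sits at 0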
@@ -172,7 +189,7 @@ def unified_chat(image, message, chat_history, seed, top_p, temperature, cfg_wei
 
 
 def generate(input_ids, width, height, temperature: float = 1, parallel_size: int = 5, cfg_weight: float = 5,
-             image_token_num_per_image: int =
+             image_token_num_per_image: int = 576, patch_size: int = 16, progress=gr.Progress(track_tqdm=True)):
     # Clear CUDA cache before generating
     torch.cuda.empty_cache()
 
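Note: the new defaults are mutually consistent if one assumes Janus-Pro's usual 384×384 output resolution (not stated in this diff): with patch_size = 16, each image is decoded from a 24×24 grid of image tokens:

    # (384 // 16) ** 2 == 24 ** 2 == 576 == image_token_num_per_image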
@@ -187,9 +204,7 @@ def generate(input_ids, width, height, temperature: float = 1, parallel_size: in
     pkv = None
     for i in range(image_token_num_per_image):
         with torch.no_grad():
-            outputs = vl_gpt.language_model.model(inputs_embeds=inputs_embeds,
-                                                  use_cache=True,
-                                                  past_key_values=pkv)
+            outputs = vl_gpt.language_model.model(inputs_embeds=inputs_embeds, use_cache=True, past_key_values=pkv)
         pkv = outputs.past_key_values
         hidden_states = outputs.last_hidden_state
         logits = vl_gpt.gen_head(hidden_states[:, -1, :])
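Note: this hunk only reflows the forward call onto one line. The loop itself is the standard KV-cache idiom: pkv carries past_key_values, so each iteration feeds a single new embedding rather than the whole prefix. The step that turns logits into the next image token falls outside the hunk; a hypothetical sketch of the usual classifier-free-guidance mixing, assuming conditional rows at even and unconditional rows at odd batch indices as in DeepSeek's Janus demos:

    cond_logits = logits[0::2, :]      # rows conditioned on the prompt
    uncond_logits = logits[1::2, :]    # unconditional rows
    guided = uncond_logits + cfg_weight * (cond_logits - uncond_logits)
    probs = torch.softmax(guided / temperature, dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)  # one image token per sample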
@@ -221,7 +236,7 @@ def unpack(dec, width, height, parallel_size=5):
 
 
 @torch.inference_mode()
-@spaces.GPU(duration=
+@spaces.GPU(duration=120)  # Specify a duration to avoid timeout
 def generate_image(prompt, seed=None, guidance=5, t2i_temperature=1.0, progress=gr.Progress(track_tqdm=True)):
     # Clear CUDA cache and avoid tracking gradients
    torch.cuda.empty_cache()
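Note: on ZeroGPU Spaces, @spaces.GPU(duration=...) requests a GPU slice for at most that many seconds; the default window (60 s at the time of writing) can expire during long image generations, which is what the explicit duration=120 guards against.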
@@ -238,7 +253,8 @@ def generate_image(prompt, seed=None, guidance=5, t2i_temperature=1.0, progress=
     messages = [{'role': '<|User|>', 'content': prompt},
                 {'role': '<|Assistant|>', 'content': ''}]
     text = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(conversations=messages,
-
+                                                                       sft_format=vl_chat_processor.sft_format,
+                                                                       system_prompt='')
     text = text + vl_chat_processor.image_start_tag
 
     input_ids = torch.LongTensor(tokenizer.encode(text))
@@ -276,10 +292,18 @@ def add_image_to_chat(image, chat_history):
 def clear_chat(image):
     return [], image, None
 
+
 
 # Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("# Janus Pro 7B - Unified Chat Interface")
+    gr.Markdown("""
+    ### Tips:
+    1. Upload an image to discuss it
+    2. Type commands like "generate [description]" to create images
+    3. Continue chatting about uploaded or generated images
+    4. Use natural language like "show me a sunset" or "create a portrait"
+    """)
 
     # State variables to maintain context
     chat_history = gr.State([])
@@ -297,17 +321,10 @@ with gr.Blocks() as demo:
 
     with gr.Accordion("Image Generation Options", open=False):
         cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=5, step=0.5, label="CFG Weight")
-        t2i_temperature_input = gr.Slider(minimum=0, maximum=1, value=1
+        t2i_temperature_input = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="Temperature")
 
     clear_button = gr.Button("Clear Chat")
 
-    gr.Markdown("""
-    ### Tips:
-    1. Upload an image to discuss it
-    2. Type commands like "generate [description]" to create images
-    3. Continue chatting about uploaded or generated images
-    4. Use natural language like "show me a sunset" or "create a portrait"
-    """)
 
     with gr.Column(scale=2):
         chat_interface = gr.Chatbot(label="Chat History", height=500)