Update app.py
app.py CHANGED
@@ -85,22 +85,12 @@ def unified_chat(image, message, chat_history, seed, top_p, temperature, cfg_wei
     is_gen_request, extracted_prompt = is_generation_request(message)
 
     if is_gen_request:
-        #
+        # Extract the prompt directly
         context_prompt = extracted_prompt
 
-        #
-
-
-        recent_context = chat_history[-3:] if len(chat_history) > 3 else chat_history
-        context_text = " ".join([f"User: {user_msg}" for user_msg, _ in recent_context])
-        #context_text = " ".join([f"{user}: {user_msg}" for user_msg, _ in recent_context])
-
-        # Only use context if it's not too long
-        if len(context_text) < 200:  # Arbitrary length limit
-            context_prompt = f"{context_text}. {extracted_prompt}"
-
-        # Generate images
-        generated_images = generate_image(prompt=context_prompt, seed=seed, guidance=cfg_weight, t2i_temperature=t2i_temperature)
+        # Generate images with full conversation history
+        generated_images = generate_image(prompt=context_prompt, conversation_history=chat_history,  # Pass the full chat history
+                                          seed=seed, guidance=cfg_weight, t2i_temperature=t2i_temperature)
 
         # Create a response that includes the generated images
         response = f"I've generated the following images based on: '{extracted_prompt}'"
@@ -111,7 +101,7 @@ def unified_chat(image, message, chat_history, seed, top_p, temperature, cfg_wei
         # Return the message, updated history, maintained image context, and generated images
         return "", chat_history, image, generated_images
 
-    #
+    # Rest of the function remains the same...
     # set seed
     torch.manual_seed(seed)
     np.random.seed(seed)
@@ -155,9 +145,8 @@ def unified_chat(image, message, chat_history, seed, top_p, temperature, cfg_wei
     if image is not None:
        pil_images = [Image.fromarray(image)]
 
-        prepare_inputs = vl_chat_processor(
-
-        ).to(cuda_device, dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16)
+        prepare_inputs = vl_chat_processor(conversations=conversation, images=pil_images, force_batchify=True
+        ).to(cuda_device, dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16)
 
     inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
 
@@ -224,7 +213,8 @@ def unpack(dec, width, height, parallel_size=5):
 
 @torch.inference_mode()
 @spaces.GPU(duration=120)  # Specify a duration to avoid timeout
-def generate_image(prompt, seed=None, guidance=5, t2i_temperature=1.0, progress=gr.Progress(track_tqdm=True)):
+def generate_image(prompt, conversation_history=None,  # Add conversation history parameter
+                   seed=None, guidance=5, t2i_temperature=1.0, progress=gr.Progress(track_tqdm=True)):
     # Clear CUDA cache and avoid tracking gradients
     torch.cuda.empty_cache()
     # Set the seed for reproducible results
@@ -236,18 +226,45 @@ def generate_image(prompt, seed=None, guidance=5, t2i_temperature=1.0, progress=
     height = 384
     parallel_size = 1
 
+    # Prepare a richer context-aware prompt
+    full_prompt = prompt
+
+    # Add conversation history context if available
+    if conversation_history and len(conversation_history) > 0:
+        # Build a context string from the last few conversation turns
+        # Limit to last 3-5 turns to keep prompt manageable
+        recent_turns = conversation_history[-5:] if len(conversation_history) > 5 else conversation_history
+
+        context_parts = []
+        for user_msg, assistant_msg in recent_turns:
+            if user_msg and user_msg.strip():
+                context_parts.append(f"User: {user_msg}")
+            if assistant_msg and assistant_msg.strip():
+                context_parts.append(f"Assistant: {assistant_msg}")
+
+        conversation_context = "\n".join(context_parts)
+
+        # Combine conversation context with the prompt
+        full_prompt = f"Based on this conversation:\n{conversation_context}\n\nGenerate: {prompt}"
+
     with torch.no_grad():
-        messages = [{'role': '<|User|>', 'content':
+        messages = [{'role': '<|User|>', 'content': full_prompt},
                     {'role': '<|Assistant|>', 'content': ''}]
         text = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(conversations=messages,
-                                                                           sft_format=vl_chat_processor.sft_format,
-                                                                           system_prompt='')
+                                                                           sft_format=vl_chat_processor.sft_format, system_prompt='')
         text = text + vl_chat_processor.image_start_tag
 
         input_ids = torch.LongTensor(tokenizer.encode(text))
-        output, patches = generate(input_ids,
-
-
+        output, patches = generate(input_ids,
+                                   width // 16 * 16,
+                                   height // 16 * 16,
+                                   cfg_weight=guidance,
+                                   parallel_size=parallel_size,
+                                   temperature=t2i_temperature)
+        images = unpack(patches,
+                        width // 16 * 16,
+                        height // 16 * 16,
+                        parallel_size=parallel_size)
 
         stime = time.time()
         ret_images = [image_upsample(Image.fromarray(images[i])) for i in range(parallel_size)]
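For reference, the context-aware prompt construction this commit adds to generate_image can be read in isolation. Below is a minimal, self-contained sketch of that logic, assuming conversation_history is a list of (user_message, assistant_message) tuples as produced by a Gradio Chatbot component; the helper name build_full_prompt is illustrative and does not exist in app.py.

# Illustrative sketch (not part of app.py): the context-aware prompt
# construction that generate_image now performs, pulled out as a helper.
# Assumes conversation_history is a list of (user_msg, assistant_msg)
# tuples, as produced by a Gradio Chatbot component.
def build_full_prompt(prompt, conversation_history=None):
    full_prompt = prompt
    if conversation_history:
        # Keep only the last 5 turns so the prompt stays manageable
        recent_turns = conversation_history[-5:]

        context_parts = []
        for user_msg, assistant_msg in recent_turns:
            if user_msg and user_msg.strip():
                context_parts.append(f"User: {user_msg}")
            if assistant_msg and assistant_msg.strip():
                context_parts.append(f"Assistant: {assistant_msg}")

        conversation_context = "\n".join(context_parts)
        full_prompt = f"Based on this conversation:\n{conversation_context}\n\nGenerate: {prompt}"
    return full_prompt

if __name__ == "__main__":
    history = [("Draw a cat wearing a hat", "Here is a cat wearing a hat."),
               ("Make the hat blue", "Done, the hat is now blue.")]
    print(build_full_prompt("a watercolor version of the same scene", history))

Limiting the context to the last few non-empty turns keeps the text-to-image prompt short while still carrying the referents (e.g. "the same scene") that the extracted prompt may depend on.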