Spaces:

NikhilJoson
/

Multimodal_Chat_JanusPro

Running on Zero

App Files Files Community

Multimodal_Chat_JanusPro / app.py

NikhilJoson

Update app.py

8e9eece verified 3 months ago

raw

history blame

14.9 kB

	import gradio as gr
	import torch
	from transformers import AutoConfig, AutoModelForCausalLM
	from janus.models import MultiModalityCausalLM, VLChatProcessor
	from janus.utils.io import load_pil_images
	from PIL import Image

	import numpy as np
	import os
	import time
	import re
	from Upsample import RealESRGAN
	import spaces # Import spaces for ZeroGPU compatibility


	# Load model and processor
	model_path = "deepseek-ai/Janus-Pro-7B"
	config = AutoConfig.from_pretrained(model_path)
	language_config = config.language_config
	language_config._attn_implementation = 'eager'
	vl_gpt = AutoModelForCausalLM.from_pretrained(model_path,
	language_config=language_config,
	trust_remote_code=True)
	if torch.cuda.is_available():
	vl_gpt = vl_gpt.to(torch.bfloat16).cuda()
	else:
	vl_gpt = vl_gpt.to(torch.float16)

	vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
	tokenizer = vl_chat_processor.tokenizer
	cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'

	# SR model
	sr_model = RealESRGAN(torch.device('cuda' if torch.cuda.is_available() else 'cpu'), scale=2)
	sr_model.load_weights(f'weights/RealESRGAN_x2.pth', download=False)

	# Patterns for detecting image generation requests
	GENERATION_PATTERNS = [
	r"generate (.+)",
	r"create (.+)",
	r"draw (.+)",
	r"make (.+)",
	r"show (.+)",
	r"visualize (.+)",
	r"imagine (.+)",
	r"picture (.+)",
	]

	def is_generation_request(message):
	"""Determine if a message is requesting image generation"""
	message = message.lower().strip()

	# Check if message explicitly mentions image generation
	for pattern in GENERATION_PATTERNS:
	match = re.match(pattern, message, re.IGNORECASE)
	if match:
	return True, match.group(1)

	# Check for specific keywords suggesting image generation
	image_keywords = ["image", "picture", "photo", "artwork", "illustration", "painting", "drawing"]
	generation_verbs = ["generate", "create", "make", "produce", "show me", "draw"]

	for verb in generation_verbs:
	for keyword in image_keywords:
	if f"{verb} {keyword}" in message or f"{verb} an {keyword}" in message or f"{verb} a {keyword}" in message:
	# Extract the prompt (everything after the keyword)
	pattern = f"{verb}\\s+(?:an?\\s+)?{keyword}\\s+(?:of\|showing\|depicting\|with)?\\s(.)"
	match = re.search(pattern, message, re.IGNORECASE)
	if match and match.group(1):
	return True, match.group(1)
	else:
	# If we can't extract a specific prompt, use the whole message
	return True, message

	return False, None


	@torch.inference_mode()
	@spaces.GPU(duration=120)
	# Unified chat function that handles both image understanding and generation
	def unified_chat(image, message, chat_history, seed, top_p, temperature, cfg_weight, t2i_temperature, progress=gr.Progress(track_tqdm=True)):
	# Clear CUDA cache before generating
	torch.cuda.empty_cache()

	# Check if this is an image generation request
	is_gen_request, extracted_prompt = is_generation_request(message)

	if is_gen_request:
	# Prepare a more detailed prompt by considering context from the conversation
	context_prompt = extracted_prompt

	# Optionally, enhance the prompt with context from previous messages
	if chat_history and len(chat_history) > 0:
	# Get the last few turns of conversation for context (limit to last 3 turns)
	recent_context = chat_history[-3:] if len(chat_history) > 3 else chat_history
	context_text = " ".join([f"User: {user_msg}" for user_msg, _ in recent_context])
	#context_text = " ".join([f"{user}: {user_msg}" for user_msg, _ in recent_context])

	# Only use context if it's not too long
	if len(context_text) < 200: # Arbitrary length limit
	context_prompt = f"{context_text}. {extracted_prompt}"

	# Generate images
	generated_images = generate_image(prompt=context_prompt, seed=seed, guidance=cfg_weight, t2i_temperature=t2i_temperature)

	# Create a response that includes the generated images
	response = f"I've generated the following images based on: '{extracted_prompt}'"

	# Add the images to the chat as the bot's response
	chat_history.append((message, response))

	# Return the message, updated history, maintained image context, and generated images
	return "", chat_history, image, generated_images

	# Regular chat flow (no image generation)
	# set seed
	torch.manual_seed(seed)
	np.random.seed(seed)
	torch.cuda.manual_seed(seed)

	# Process the conversation history and add current message
	conversation = []

	# Check if we have existing history
	if chat_history:
	# Add previous conversation turns
	for user_msg, assistant_msg in chat_history:
	conversation.append({
	"role": "<\|User\|>",
	"content": user_msg,
	"images": [], # No images for previous turns
	})
	conversation.append({
	"role": "<\|Assistant\|>",
	"content": assistant_msg,
	})

	# Add the current user message with image (if provided)
	user_content = message
	images_list = []

	# Only include image placeholder if image is provided or this is the first message
	if image is not None:
	user_content = f"<image_placeholder>\n{message}"
	images_list = [image]

	conversation.append({
	"role": "<\|User\|>",
	"content": user_content,
	"images": images_list,
	})
	conversation.append({"role": "<\|Assistant\|>", "content": ""})

	# Process images (if any)
	pil_images = []
	if image is not None:
	pil_images = [Image.fromarray(image)]

	prepare_inputs = vl_chat_processor(
	conversations=conversation, images=pil_images, force_batchify=True
	).to(cuda_device, dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16)

	inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

	outputs = vl_gpt.language_model.generate(
	inputs_embeds=inputs_embeds,
	attention_mask=prepare_inputs.attention_mask,
	pad_token_id=tokenizer.eos_token_id,
	bos_token_id=tokenizer.bos_token_id,
	eos_token_id=tokenizer.eos_token_id,
	max_new_tokens=512,
	do_sample=False if temperature == 0 else True,
	use_cache=True,
	temperature=temperature,
	top_p=top_p,
	)

	answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)

	# Update chat history
	chat_history.append((message, answer))

	# Keep the last uploaded image in context
	return "", chat_history, image, None


	def generate(input_ids, width, height, temperature: float = 1, parallel_size: int = 5, cfg_weight: float = 5,
	image_token_num_per_image: int = 576, patch_size: int = 16, progress=gr.Progress(track_tqdm=True)):
	# Clear CUDA cache before generating
	torch.cuda.empty_cache()

	tokens = torch.zeros((parallel_size * 2, len(input_ids)), dtype=torch.int).to(cuda_device)
	for i in range(parallel_size * 2):
	tokens[i, :] = input_ids
	if i % 2 != 0:
	tokens[i, 1:-1] = vl_chat_processor.pad_id
	inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)
	generated_tokens = torch.zeros((parallel_size, image_token_num_per_image), dtype=torch.int).to(cuda_device)

	pkv = None
	for i in range(image_token_num_per_image):
	with torch.no_grad():
	outputs = vl_gpt.language_model.model(inputs_embeds=inputs_embeds, use_cache=True, past_key_values=pkv)
	pkv = outputs.past_key_values
	hidden_states = outputs.last_hidden_state
	logits = vl_gpt.gen_head(hidden_states[:, -1, :])
	logit_cond = logits[0::2, :]
	logit_uncond = logits[1::2, :]
	logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
	probs = torch.softmax(logits / temperature, dim=-1)
	next_token = torch.multinomial(probs, num_samples=1)
	generated_tokens[:, i] = next_token.squeeze(dim=-1)
	next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)

	img_embeds = vl_gpt.prepare_gen_img_embeds(next_token)
	inputs_embeds = img_embeds.unsqueeze(dim=1)

	patches = vl_gpt.gen_vision_model.decode_code(generated_tokens.to(dtype=torch.int),
	shape=[parallel_size, 8, width // patch_size, height // patch_size])

	return generated_tokens.to(dtype=torch.int), patches

	def unpack(dec, width, height, parallel_size=5):
	dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
	dec = np.clip((dec + 1) / 2 * 255, 0, 255)

	visual_img = np.zeros((parallel_size, width, height, 3), dtype=np.uint8)
	visual_img[:, :, :] = dec

	return visual_img



	@torch.inference_mode()
	@spaces.GPU(duration=120) # Specify a duration to avoid timeout
	def generate_image(prompt, seed=None, guidance=5, t2i_temperature=1.0, progress=gr.Progress(track_tqdm=True)):
	# Clear CUDA cache and avoid tracking gradients
	torch.cuda.empty_cache()
	# Set the seed for reproducible results
	if seed is not None:
	torch.manual_seed(seed)
	torch.cuda.manual_seed(seed)
	np.random.seed(seed)
	width = 384
	height = 384
	parallel_size = 1

	with torch.no_grad():
	messages = [{'role': '<\|User\|>', 'content': prompt},
	{'role': '<\|Assistant\|>', 'content': ''}]
	text = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(conversations=messages,
	sft_format=vl_chat_processor.sft_format,
	system_prompt='')
	text = text + vl_chat_processor.image_start_tag

	input_ids = torch.LongTensor(tokenizer.encode(text))
	output, patches = generate(input_ids, width // 16 * 16, height // 16 * 16, cfg_weight=guidance,
	parallel_size=parallel_size, temperature=t2i_temperature)
	images = unpack(patches, width // 16 * 16, height // 16 * 16, parallel_size=parallel_size)

	stime = time.time()
	ret_images = [image_upsample(Image.fromarray(images[i])) for i in range(parallel_size)]
	print(f'upsample time: {time.time() - stime}')
	return ret_images


	@spaces.GPU(duration=60)
	def image_upsample(img: Image.Image) -> Image.Image:
	if img is None:
	raise Exception("Image not uploaded")

	width, height = img.size

	if width >= 4096 or height >= 4096:
	raise Exception("The image is too large.")

	global sr_model
	result = sr_model.predict(img.convert('RGB'))
	return result


	# Helper function to add uploaded image to the chat context
	def add_image_to_chat(image, chat_history):
	return image, chat_history


	# Helper function to clear chat history but maintain the image
	def clear_chat(image):
	return [], image, None



	# Gradio interface
	with gr.Blocks() as demo:
	gr.Markdown("# Janus Pro 7B - Unified Chat Interface")
	gr.Markdown("""
	### Tips:
	1. Upload an image to discuss it
	2. Type commands like "generate [description]" to create images
	3. Continue chatting about uploaded or generated images
	4. Use natural language like "show me a sunset" or "create a portrait"
	""")

	# State variables to maintain context
	chat_history = gr.State([])
	current_image = gr.State(None)

	with gr.Row():
	with gr.Column(scale=1):
	image_input = gr.Image(label="Upload Image (optional)")
	upload_button = gr.Button("Add Image to Chat")

	with gr.Accordion("Chat Options", open=False):
	und_seed_input = gr.Number(label="Seed", precision=0, value=42)
	top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
	temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="temperature")

	with gr.Accordion("Image Generation Options", open=False):
	cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=5, step=0.5, label="CFG Weight")
	t2i_temperature_input = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="Temperature")

	clear_button = gr.Button("Clear Chat")


	with gr.Column(scale=2):
	chat_interface = gr.Chatbot(label="Chat History", height=500)
	message_input = gr.Textbox(
	label="Your message",
	placeholder="Ask about an image, continue chatting, or generate new images by typing 'generate [description]'",
	lines=2
	)
	chat_button = gr.Button("Send")
	generated_images = gr.Gallery(label="Generated Images", visible=True, columns=2, rows=2)

	# Chat interface interactions
	upload_button.click(add_image_to_chat, inputs=[image_input, chat_history], outputs=[current_image, chat_history])

	chat_button.click(
	unified_chat,
	inputs=[current_image, message_input, chat_interface, und_seed_input, top_p, temperature, cfg_weight_input, t2i_temperature_input],
	outputs=[message_input, chat_interface, current_image, generated_images]
	)

	# Also trigger on Enter key
	message_input.submit(
	unified_chat,
	inputs=[current_image, message_input, chat_interface, und_seed_input, top_p, temperature, cfg_weight_input, t2i_temperature_input],
	outputs=[message_input, chat_interface, current_image, generated_images]
	)

	clear_button.click(
	clear_chat,
	inputs=[current_image],
	outputs=[chat_interface, current_image, generated_images]
	)

	# Examples for the unified interface
	examples = gr.Examples(
	label="Example queries",
	examples=[
	["What's in this image?"],
	["Generate a cute kitten with big eyes"],
	["Show me a mountain landscape at sunset"],
	["Can you explain what's happening in this picture?"],
	["Create an astronaut riding a horse"],
	["Generate a futuristic cityscape with flying cars"],
	],
	inputs=message_input,
	)

	demo.launch(share=True)