Upload app.py with huggingface_hub
app.py CHANGED
@@ -1,36 +1,15 @@
 import gradio as gr
 import torch
-from transformers import AutoConfig
+from transformers import AutoConfig
 from transformers import AutoTokenizer, AutoModel
-from janus.models import MultiModalityCausalLM, VLChatProcessor
-from janus.utils.io import load_pil_images
 from PIL import Image
 
 import numpy as np
-import os
-import time
 from Upsample import RealESRGAN
 import spaces  # Import spaces for ZeroGPU compatibility
 from einops import rearrange
 
 
-# Load model and processor
-model_path = "deepseek-ai/Janus-Pro-7B"
-config = AutoConfig.from_pretrained(model_path)
-language_config = config.language_config
-language_config._attn_implementation = 'eager'
-vl_gpt = AutoModelForCausalLM.from_pretrained(model_path,
-                                              language_config=language_config,
-                                              trust_remote_code=True)
-if torch.cuda.is_available():
-    vl_gpt = vl_gpt.to(torch.bfloat16).cuda()
-else:
-    vl_gpt = vl_gpt.to(torch.float16)
-
-vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
-tokenizer = vl_chat_processor.tokenizer
-cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
 # SR model
 sr_model = RealESRGAN(torch.device('cuda' if torch.cuda.is_available() else 'cpu'), scale=2)
 sr_model.load_weights(f'weights/RealESRGAN_x2.pth', download=False)
@@ -66,7 +45,7 @@ print(f"Image token: {harmon_tokenizer.decode(image_token_idx)}", flush=True)
 if torch.cuda.is_available():
     harmon_model = harmon_model.to(torch.bfloat16).cuda()
 else:
-    harmon_model = harmon_model.to(torch.float16)
+    harmon_model = harmon_model.to(torch.float32)
 
 
 def expand2square(pil_img, background_color):
@@ -103,7 +82,7 @@ def multimodal_understanding(image, question, seed, top_p, temperature, progress
     image = expand2square(
         image, (127, 127, 127))
     image = image.resize(size=(image_size, image_size))
-    image = torch.from_numpy(np.array(image)).to(dtype=harmon_model.dtype, device=cuda_device)
+    image = torch.from_numpy(np.array(image)).to(dtype=harmon_model.dtype, device=harmon_model.device)
     image = rearrange(image, 'h w c -> c h w')[None]
     image = 2 * (image / 255) - 1
 
@@ -112,7 +91,7 @@ def multimodal_understanding(image, question, seed, top_p, temperature, progress
     image_length = (image_size // 16) ** 2 + harmon_model.mar.buffer_size
     prompt = prompt.replace('<image>', '<image>' * image_length)
     input_ids = harmon_tokenizer.encode(
-        prompt, add_special_tokens=True, return_tensors='pt').to(cuda_device)
+        prompt, add_special_tokens=True, return_tensors='pt').to(harmon_model.device)
     _, z_enc = harmon_model.extract_visual_feature(harmon_model.encode(image))
     inputs_embeds = z_enc.new_zeros(*input_ids.shape, harmon_model.llm.config.hidden_size)
     inputs_embeds[input_ids == image_token_idx] = z_enc.flatten(0, 1)
@@ -163,7 +142,7 @@ def generate_image(prompt,
     prompts += [PROMPT_TEMPLATE['INSTRUCTION'].format(input=negative_prompt)] * len(prompts)
 
     inputs = harmon_tokenizer(
-        prompts, add_special_tokens=True, return_tensors='pt', padding=True).to(cuda_device)
+        prompts, add_special_tokens=True, return_tensors='pt', padding=True).to(harmon_model.device)
 
     with torch.no_grad():
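For context on the pattern this commit applies: tensors are now placed with harmon_model.device instead of the removed module-level cuda_device string, and the CPU fallback dtype becomes torch.float32. Below is a minimal, hypothetical sketch of that pattern; ToyModel and its device property are illustrative stand-ins, not code from app.py.

import torch
import torch.nn as nn

class ToyModel(nn.Module):
    """Hypothetical stand-in for harmon_model; only the placement pattern matters."""

    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(4, 4)

    @property
    def device(self):
        # Derive the device from the weights (Hugging Face models expose
        # .device the same way), so callers never track a separate string.
        return next(self.parameters()).device

model = ToyModel()
if torch.cuda.is_available():
    # Mirrors the GPU branch in the diff: bfloat16 weights on CUDA.
    model = model.to(torch.bfloat16).cuda()
else:
    # Mirrors the new CPU branch: float32, since many float16 kernels
    # are slow or unsupported on CPU.
    model = model.to(torch.float32)

# Inputs follow the model, like the .to(harmon_model.device) calls above.
x = torch.randn(1, 4).to(dtype=next(model.parameters()).dtype, device=model.device)
print(model.proj(x).shape)  # torch.Size([1, 4])

Deriving the device from the model keeps tensor placement correct even if the model is later moved, whereas a module-level device string has to be kept in sync by hand.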