wusize committed
Commit 0706fd4 · verified · 1 parent: 1858ad7

Upload app.py with huggingface_hub

Files changed (1):
  1. app.py +5 -26
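
The commit message says the file was uploaded with huggingface_hub. As context, here is a minimal sketch of how such an upload is typically done with `HfApi.upload_file`; the repo id and commit message target below are placeholders, not details taken from this commit:

```python
from huggingface_hub import HfApi

# Minimal sketch: push a single local file to a Space repo via huggingface_hub.
# "username/space-name" is a placeholder; the actual Space id is not part of this diff.
api = HfApi()
api.upload_file(
    path_or_fileobj="app.py",        # local file to upload
    path_in_repo="app.py",           # destination path inside the repo
    repo_id="username/space-name",   # placeholder repo id
    repo_type="space",
    commit_message="Upload app.py with huggingface_hub",
)
```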
app.py CHANGED
@@ -1,36 +1,15 @@
 import gradio as gr
 import torch
-from transformers import AutoConfig, AutoModelForCausalLM
+from transformers import AutoConfig
 from transformers import AutoTokenizer, AutoModel
-from janus.models import MultiModalityCausalLM, VLChatProcessor
-from janus.utils.io import load_pil_images
 from PIL import Image
 
 import numpy as np
-import os
-import time
 from Upsample import RealESRGAN
 import spaces # Import spaces for ZeroGPU compatibility
 from einops import rearrange
 
 
-# Load model and processor
-model_path = "deepseek-ai/Janus-Pro-7B"
-config = AutoConfig.from_pretrained(model_path)
-language_config = config.language_config
-language_config._attn_implementation = 'eager'
-vl_gpt = AutoModelForCausalLM.from_pretrained(model_path,
-                                              language_config=language_config,
-                                              trust_remote_code=True)
-if torch.cuda.is_available():
-    vl_gpt = vl_gpt.to(torch.bfloat16).cuda()
-else:
-    vl_gpt = vl_gpt.to(torch.float16)
-
-vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
-tokenizer = vl_chat_processor.tokenizer
-cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
 # SR model
 sr_model = RealESRGAN(torch.device('cuda' if torch.cuda.is_available() else 'cpu'), scale=2)
 sr_model.load_weights(f'weights/RealESRGAN_x2.pth', download=False)
@@ -66,7 +45,7 @@ print(f"Image token: {harmon_tokenizer.decode(image_token_idx)}", flush=True)
 if torch.cuda.is_available():
     harmon_model = harmon_model.to(torch.bfloat16).cuda()
 else:
-    harmon_model = harmon_model.to(torch.float16)
+    harmon_model = harmon_model.to(torch.float32)
 
 
 def expand2square(pil_img, background_color):
@@ -103,7 +82,7 @@ def multimodal_understanding(image, question, seed, top_p, temperature, progress
     image = expand2square(
         image, (127, 127, 127))
     image = image.resize(size=(image_size, image_size))
-    image = torch.from_numpy(np.array(image)).to(dtype=harmon_model.dtype, device=cuda_device)
+    image = torch.from_numpy(np.array(image)).to(dtype=harmon_model.dtype, device=harmon_model.device)
     image = rearrange(image, 'h w c -> c h w')[None]
     image = 2 * (image / 255) - 1
 
@@ -112,7 +91,7 @@ def multimodal_understanding(image, question, seed, top_p, temperature, progress
     image_length = (image_size // 16) ** 2 + harmon_model.mar.buffer_size
     prompt = prompt.replace('<image>', '<image>' * image_length)
     input_ids = harmon_tokenizer.encode(
-        prompt, add_special_tokens=True, return_tensors='pt').to(cuda_device)
+        prompt, add_special_tokens=True, return_tensors='pt').to(harmon_model.device)
     _, z_enc = harmon_model.extract_visual_feature(harmon_model.encode(image))
     inputs_embeds = z_enc.new_zeros(*input_ids.shape, harmon_model.llm.config.hidden_size)
     inputs_embeds[input_ids == image_token_idx] = z_enc.flatten(0, 1)
@@ -163,7 +142,7 @@ def generate_image(prompt,
     prompts += [PROMPT_TEMPLATE['INSTRUCTION'].format(input=negative_prompt)] * len(prompts)
 
     inputs = harmon_tokenizer(
-        prompts, add_special_tokens=True, return_tensors='pt', padding=True).to(cuda_device)
+        prompts, add_special_tokens=True, return_tensors='pt', padding=True).to(harmon_model.device)
 
     with torch.no_grad():
 
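
Aside from dropping the unused Janus-Pro (`vl_gpt`) loading block and its imports, the recurring change in the later hunks replaces the removed module-level `cuda_device` string with `harmon_model.device`, so input tensors follow the model's actual placement and dtype; the CPU fallback also moves from float16 to float32, which avoids fp16 ops that are slow or unsupported on CPU. A minimal sketch of that pattern, using a toy `nn.Linear` as a stand-in for `harmon_model` (the real model is loaded elsewhere in app.py and exposes its own `.device`):

```python
import torch
import torch.nn as nn

# Toy stand-in for harmon_model.
model = nn.Linear(4, 4)
if torch.cuda.is_available():
    model = model.to(torch.bfloat16).cuda()
else:
    model = model.to(torch.float32)  # fp16 is a poor fit for CPU inference

# Derive device/dtype from the model instead of keeping a separate
# `cuda_device` global that can drift out of sync with the weights.
device = next(model.parameters()).device
x = torch.randn(1, 4).to(dtype=model.weight.dtype, device=device)
print(model(x).shape)
```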