NikhilJoson committed
Commit 418b14d · verified · 1 Parent(s): 8e9eece

Update app.py

Files changed (1)
  1. app.py +35 -29
app.py CHANGED
@@ -4,6 +4,9 @@ from transformers import AutoConfig, AutoModelForCausalLM
 from janus.models import MultiModalityCausalLM, VLChatProcessor
 from janus.utils.io import load_pil_images
 from PIL import Image
+from diffusers import FluxPipeline
+pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
+pipe.to("cuda")
 
 import numpy as np
 import os
@@ -18,9 +21,7 @@ model_path = "deepseek-ai/Janus-Pro-7B"
 config = AutoConfig.from_pretrained(model_path)
 language_config = config.language_config
 language_config._attn_implementation = 'eager'
-vl_gpt = AutoModelForCausalLM.from_pretrained(model_path,
-                                              language_config=language_config,
-                                              trust_remote_code=True)
+vl_gpt = AutoModelForCausalLM.from_pretrained(model_path, language_config=language_config, trust_remote_code=True)
 if torch.cuda.is_available():
     vl_gpt = vl_gpt.to(torch.bfloat16).cuda()
 else:
@@ -234,34 +235,39 @@ def unpack(dec, width, height, parallel_size=5):
 @torch.inference_mode()
 @spaces.GPU(duration=120)  # Specify a duration to avoid timeout
 def generate_image(prompt, seed=None, guidance=5, t2i_temperature=1.0, progress=gr.Progress(track_tqdm=True)):
-    # Clear CUDA cache and avoid tracking gradients
-    torch.cuda.empty_cache()
-    # Set the seed for reproducible results
-    if seed is not None:
-        torch.manual_seed(seed)
-        torch.cuda.manual_seed(seed)
-        np.random.seed(seed)
-    width = 384
-    height = 384
-    parallel_size = 1
+    # # Clear CUDA cache and avoid tracking gradients
+    # torch.cuda.empty_cache()
+    # # Set the seed for reproducible results
+    # if seed is not None:
+    #     torch.manual_seed(seed)
+    #     torch.cuda.manual_seed(seed)
+    #     np.random.seed(seed)
+    # width = 384
+    # height = 384
+    # parallel_size = 1
 
-    with torch.no_grad():
-        messages = [{'role': '<|User|>', 'content': prompt},
-                    {'role': '<|Assistant|>', 'content': ''}]
-        text = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(conversations=messages,
-                                                                           sft_format=vl_chat_processor.sft_format,
-                                                                           system_prompt='')
-        text = text + vl_chat_processor.image_start_tag
+    # with torch.no_grad():
+    #     messages = [{'role': '<|User|>', 'content': prompt},
+    #                 {'role': '<|Assistant|>', 'content': ''}]
+    #     text = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(conversations=messages,
+    #                                                                        sft_format=vl_chat_processor.sft_format,
+    #                                                                        system_prompt='')
+    #     text = text + vl_chat_processor.image_start_tag
 
-        input_ids = torch.LongTensor(tokenizer.encode(text))
-        output, patches = generate(input_ids, width // 16 * 16, height // 16 * 16, cfg_weight=guidance,
-                                   parallel_size=parallel_size, temperature=t2i_temperature)
-        images = unpack(patches, width // 16 * 16, height // 16 * 16, parallel_size=parallel_size)
-
-        stime = time.time()
-        ret_images = [image_upsample(Image.fromarray(images[i])) for i in range(parallel_size)]
-        print(f'upsample time: {time.time() - stime}')
-        return ret_images
+    #     input_ids = torch.LongTensor(tokenizer.encode(text))
+    #     output, patches = generate(input_ids, width // 16 * 16, height // 16 * 16, cfg_weight=guidance,
+    #                                parallel_size=parallel_size, temperature=t2i_temperature)
+    #     images = unpack(patches, width // 16 * 16, height // 16 * 16, parallel_size=parallel_size)
+
+    #     stime = time.time()
+    #     ret_images = [image_upsample(Image.fromarray(images[i])) for i in range(parallel_size)]
+    #     print(f'upsample time: {time.time() - stime}')
+    #     return ret_images
+
+    # Depending on the variant being used, the pipeline call will slightly vary.
+    # Refer to the pipeline documentation for more details.
+    image = pipe(prompt=prompt, guidance_scale=guidance, height=768, width=768, num_inference_steps=16,).images[0]
+    return image
 
 
 @spaces.GPU(duration=60)
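
The new generate_image body keeps the comment that the pipe(...) call varies with the FLUX variant being loaded. As a reference sketch only (not part of this commit), the snippet below shows how the call typically differs between the two public FLUX.1 variants when using diffusers' FluxPipeline; the guidance_scale, num_inference_steps, and max_sequence_length values follow the respective model cards, and the prompt, seed, and output filename are illustrative.

import torch
from diffusers import FluxPipeline

# FLUX.1-schnell is timestep-distilled: its model card recommends
# guidance_scale=0.0, roughly 4 steps, and max_sequence_length <= 256.
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16
).to("cuda")
image = pipe(
    prompt="a red fox standing in fresh snow",          # illustrative prompt
    guidance_scale=0.0,
    num_inference_steps=4,
    height=768,
    width=768,
    max_sequence_length=256,
    generator=torch.Generator("cuda").manual_seed(0),   # seeded for reproducibility
).images[0]
image.save("schnell.png")

# FLUX.1-dev is guidance-distilled: its model card recommends a meaningful
# guidance_scale (around 3.5) and more steps (around 50).
# pipe = FluxPipeline.from_pretrained(
#     "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
# ).to("cuda")
# image = pipe(prompt="a red fox standing in fresh snow", guidance_scale=3.5,
#              num_inference_steps=50, height=768, width=768).images[0]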