Any sample pictures? Did you try to use this on 2 x RTX 3090?
How to run on 2 x RTX 3090? (48 GB VRAM total)
Environment: Python (Conda).
import torch
import argparse
import gc
import os
from pathlib import Path
from transformers import LlamaForCausalLM, PreTrainedTokenizerFast
from hi_diffusers import HiDreamImagePipeline, HiDreamImageTransformer2DModel
from hi_diffusers.schedulers.flash_flow_match import FlashFlowMatchEulerDiscreteScheduler
# Environment configuration
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"

# Memory cleanup helper
def clear_gpu_memory():
    torch.cuda.empty_cache()
    gc.collect()
    torch.cuda.synchronize()
# Arguments
parser = argparse.ArgumentParser()
parser.add_argument("--model_type", type=str, default="fast")
parser.add_argument("--prompt", type=str, default='A cat holding a sign that says "Hi-Dreams.ai".')
parser.add_argument("--output", type=str, default="output.png")
args = parser.parse_args()
# Constants
MODEL_PREFIX = "HiDream-ai"
LLAMA_MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
MODEL_TYPE = args.model_type
PROMPT = args.prompt
OUTPUT = args.output
print(f"Sprawdzanie dostępnych GPU...")
for i in range(torch.cuda.device_count()):
free_mem = torch.cuda.mem_get_info(i)[0] / (10243)
total_mem = torch.cuda.get_device_properties(i).total_memory / (10243)
print(f"GPU {i}: {torch.cuda.get_device_name(i)}, pamięć wolna: {free_mem:.2f} GB / {total_mem:.2f} GB")
# Select model type and parameters
print(f"Loading model type: {MODEL_TYPE}")
if MODEL_TYPE == "fast":
    model_path = f"{MODEL_PREFIX}/HiDream-I1-Fast"
    guidance_scale = 0.0
    num_inference_steps = 16
    shift = 3.0
elif MODEL_TYPE == "dev":
    model_path = f"{MODEL_PREFIX}/HiDream-I1-Dev"
    guidance_scale = 0.0
    num_inference_steps = 28
    shift = 6.0
else:  # "full"
    model_path = f"{MODEL_PREFIX}/HiDream-I1-Full"
    guidance_scale = 5.0
    num_inference_steps = 30  # reduced from 50 to save memory
    shift = 3.0
# Load components
try:
    print("Loading tokenizer...")
    tokenizer = PreTrainedTokenizerFast.from_pretrained(
        LLAMA_MODEL_NAME, use_fast=False)

    # Free memory before loading the large model
    clear_gpu_memory()

    print("Loading LLaMA on GPU 0...")
    # LLaMA text encoder on GPU 0
    text_encoder = LlamaForCausalLM.from_pretrained(
        LLAMA_MODEL_NAME,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        output_hidden_states=True,
        output_attentions=True).to("cuda:0")

    # Free memory after loading LLaMA
    clear_gpu_memory()

    print("Loading scheduler...")
    scheduler = FlashFlowMatchEulerDiscreteScheduler(
        num_train_timesteps=1000,
        shift=shift,
        use_dynamic_shifting=False)

    # Load the transformer on GPU 1, in half precision
    print("Loading transformer on GPU 1...")
    transformer = HiDreamImageTransformer2DModel.from_pretrained(
        model_path,
        subfolder="transformer",
        torch_dtype=torch.float16).to("cuda:1")

    # Free memory after loading the transformer
    clear_gpu_memory()

    print("Creating pipeline...")
    pipe = HiDreamImagePipeline.from_pretrained(
        model_path,
        scheduler=scheduler,
        tokenizer_4=tokenizer,
        text_encoder_4=text_encoder,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True
    ).to("cuda:0")

    # Attach the transformer that lives on GPU 1
    pipe.transformer = transformer

    # Enable memory optimizations
    if hasattr(pipe, "enable_vae_slicing"):
        pipe.enable_vae_slicing()
    if hasattr(pipe, "enable_attention_slicing"):
        pipe.enable_attention_slicing()

    print("Models loaded successfully!")

    # Image generation
    print(f"Generating image for: {PROMPT}")

    # Fixed seed for reproducibility
    generator = torch.Generator("cuda").manual_seed(42)

    # Generate at a lower resolution to save memory
    width, height = 768, 768
    print(f"Using resolution: {width}x{height}")

    with torch.cuda.amp.autocast(dtype=torch.float16):
        images = pipe(
            PROMPT,
            height=height,
            width=width,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps,
            generator=generator,
            num_images_per_prompt=1
        ).images

    # Save the image
    image = images[0]
    image.save(OUTPUT)
    print(f"Image saved to: {OUTPUT}")

except Exception as e:
    print(f"An error occurred: {e}")
    import traceback
    traceback.print_exc()
python run4.py --model_type full --prompt "A beautiful sunset over mountains" --output sunset.png
CUDA out of memory. Tried to allocate 34.00 MiB. GPU 1 has a total capacity of 23.60 GiB of which 11.00 MiB is free. Including non-PyTorch memory, this process has 23.58 GiB memory in use. Of the allocated memory 23.03 GiB is allocated by PyTorch, and 307.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Traceback (most recent call last):
File "/mnt/data2/hidream/HiDream-I1/run4.py", line 90, in
torch_dtype=torch.float16).to("cuda:1")
^^^^^^^^^^^^
File "/root/anaconda3/envs/janus/lib/python3.12/site-packages/diffusers/models/modeling_utils.py", line 1353, in to
return super().to(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/janus/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1343, in to
return self._apply(convert)
^^^^^^^^^^^^^^^^^^^^
File "/root/anaconda3/envs/janus/lib/python3.12/site-packages/torch/nn/modules/module.py", line 903, in _apply
module._apply(fn)
File "/root/anaconda3/envs/janus/lib/python3.12/site-packages/torch/nn/modules/module.py", line 903, in _apply
module._apply(fn)
File "/root/anaconda3/envs/janus/lib/python3.12/site-packages/torch/nn/modules/module.py", line 903, in _apply
module._apply(fn)
[Previous line repeated 4 more times]
File "/root/anaconda3/envs/janus/lib/python3.12/site-packages/torch/nn/modules/module.py", line 930, in _apply
param_applied = fn(param)
^^^^^^^^^
File "/root/anaconda3/envs/janus/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1329, in convert
return t.to(
^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 34.00 MiB. GPU 1 has a total capacity of 23.60 GiB of which 11.00 MiB is free. Including non-PyTorch memory, this process has 23.58 GiB memory in use. Of the allocated memory 23.03 GiB is allocated by PyTorch, and 307.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
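The failure is expected from the weight size alone: assuming the roughly 17B-parameter figure from the HiDream-I1 model card, the fp16 transformer weights by themselves exceed a single 3090's 23.6 GiB, so the `.to("cuda:1")` call cannot succeed no matter how the other components are split across the two cards. A quick back-of-the-envelope check (the 17B figure is taken from the model card, not from this thread):

# Rough estimate of the transformer's fp16 footprint on one RTX 3090
params = 17e9                 # ~17B parameters (assumption from the model card)
bytes_per_param = 2           # float16
weights_gib = params * bytes_per_param / 1024 ** 3
print(f"{weights_gib:.1f} GiB needed vs. 23.6 GiB available")  # ~31.7 GiB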
Multiple GPU inference is not currently supported. For information on running inference with less GPU memory, please refer to this GitHub issue: https://github.com/HiDream-ai/HiDream-I1/issues/7.
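For anyone who wants to experiment on this hardware before that issue is resolved, below is a minimal, untested sketch of a lower-memory single-GPU variant. It assumes HiDreamImagePipeline inherits the standard diffusers DiffusionPipeline offload hooks (an assumption, not something confirmed by the maintainers) and that accelerate is installed. enable_sequential_cpu_offload() streams submodules through the GPU one at a time instead of keeping whole models resident, so peak VRAM drops well below the transformer's fp16 footprint, at a large cost in speed and with enough system RAM required to hold all weights.

# Untested low-VRAM sketch. ASSUMPTION: HiDreamImagePipeline supports the
# standard diffusers offload hooks (enable_sequential_cpu_offload); it is a
# custom pipeline from hi_diffusers, so this may not hold.
import torch
from transformers import LlamaForCausalLM, PreTrainedTokenizerFast
from hi_diffusers import HiDreamImagePipeline
from hi_diffusers.schedulers.flash_flow_match import FlashFlowMatchEulerDiscreteScheduler

LLAMA_MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
MODEL_PATH = "HiDream-ai/HiDream-I1-Fast"  # the Fast variant needs the fewest steps

tokenizer = PreTrainedTokenizerFast.from_pretrained(LLAMA_MODEL_NAME, use_fast=False)

# Keep the text encoder on the CPU; the offload hook moves it to the GPU
# only while the prompt is being encoded.
text_encoder = LlamaForCausalLM.from_pretrained(
    LLAMA_MODEL_NAME,
    torch_dtype=torch.float16,
    output_hidden_states=True,
    output_attentions=True,
)

scheduler = FlashFlowMatchEulerDiscreteScheduler(
    num_train_timesteps=1000, shift=3.0, use_dynamic_shifting=False)

pipe = HiDreamImagePipeline.from_pretrained(
    MODEL_PATH,
    scheduler=scheduler,
    tokenizer_4=tokenizer,
    text_encoder_4=text_encoder,
    torch_dtype=torch.float16,
)

# Do NOT call pipe.to("cuda") here: sequential offload manages device
# placement itself and keeps only the currently executing submodule on GPU.
pipe.enable_sequential_cpu_offload()

image = pipe(
    'A cat holding a sign that says "Hi-Dreams.ai".',
    height=768,
    width=768,
    guidance_scale=0.0,
    num_inference_steps=16,
    generator=torch.Generator("cpu").manual_seed(42),
).images[0]
image.save("output.png")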