import gradio as gr
import spaces
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
from datetime import datetime
import os
import torch
import gc

# Set PyTorch memory allocation configuration
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"

DESCRIPTION = "[Sparrow Qwen2-VL-2B Backend](https://github.com/katanaml/sparrow)"

def process_image(image_filepath, max_width=800, max_height=1000):
    if image_filepath is None:
        raise ValueError("No image provided. Please upload an image before submitting.")

    img = Image.open(image_filepath)
    width, height = img.size

    # Calculate new dimensions while maintaining aspect ratio
    if width > max_width or height > max_height:
        aspect_ratio = width / height
        # Start from the original size so the checks below always see defined values
        new_width, new_height = width, height
        if width > max_width:
            new_width = max_width
            new_height = int(new_width / aspect_ratio)
        if new_height > max_height:
            new_height = max_height
            new_width = int(new_height * aspect_ratio)
    else:
        new_width, new_height = width, height

    # Resize the image if needed
    if new_width != width or new_height != height:
        img = img.resize((new_width, new_height), Image.LANCZOS)

    # Generate temporary filename - use /tmp folder for better space management
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"/tmp/image_{timestamp}.jpg"  # Use jpg for smaller file size

    # Convert to RGB so uploads with an alpha channel can be written as JPEG
    if img.mode != "RGB":
        img = img.convert("RGB")

    # Save with optimized compression
    img.save(filename, format='JPEG', quality=85, optimize=True)

    return os.path.abspath(filename), new_width, new_height

# Initialize model with memory optimizations but without 4-bit quantization
model = None
processor = None


def load_model():
    # Load model with memory optimizations
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct",
        torch_dtype=torch.float16,               # Use fp16 for memory efficiency
        device_map="auto",
        attn_implementation="flash_attention_2"  # Use FlashAttention if available
    )
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
    return model, processor

@spaces.GPU
def run_inference(input_imgs, text_input):
    global model, processor

    # Lazy load model
    if model is None or processor is None:
        model, processor = load_model()

    results = []

    # Process images one at a time to avoid OOM issues
    for image in input_imgs:
        # Clear cache before processing each image
        torch.cuda.empty_cache()
        gc.collect()

        # Process image with reduced dimensions
        image_path, width, height = process_image(image)

        try:
            # Create messages with optimized image
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "image": image_path,
                            "resized_height": height,
                            "resized_width": width
                        },
                        {
                            "type": "text",
                            "text": text_input
                        }
                    ]
                }
            ]

            # Prepare inputs with memory optimization
            text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            image_inputs, video_inputs = process_vision_info(messages)

            # Clear unused memory
            del messages
            torch.cuda.empty_cache()

            # Process inputs with truncation to control memory usage
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                truncation=True,      # Add truncation
                max_length=768,       # Limit context length
                return_tensors="pt",
            )

            # Move to GPU efficiently
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

            # Clean up variables to free memory
            del text, image_inputs, video_inputs
            torch.cuda.empty_cache()

            # Generate with optimized parameters
            with torch.inference_mode():  # More efficient than no_grad
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=1024,  # Reduced from 4096
                    do_sample=False,      # Deterministic generation uses less memory
                    use_cache=True,       # Use KV cache
                    num_beams=1           # Disable beam search to save memory
                )

            # Process output efficiently
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
            ]
            raw_output = processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True
            )

            results.append(raw_output[0])
            print(f"Processed: {image_path}")

            # Clear tensors from GPU memory
            del inputs, generated_ids, generated_ids_trimmed
            torch.cuda.empty_cache()
            gc.collect()
        finally:
            # Clean up temporary files
            if os.path.exists(image_path):
                os.remove(image_path)

    return results

# Gradio interface
css = """
#output {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
}
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Tab(label="Qwen2-VL-2B Input"):
        with gr.Row():
            with gr.Column():
                input_imgs = gr.Files(file_types=["image"], label="Upload Document Images")
                text_input = gr.Textbox(label="Query")
                submit_btn = gr.Button(value="Submit", variant="primary")
            with gr.Column():
                # elem_id matches the #output rule in the CSS block above
                output_text = gr.Textbox(label="Response", elem_id="output")

        submit_btn.click(run_inference, [input_imgs, text_input], [output_text])

# Use smaller queue size to manage memory
demo.queue(api_open=True, max_size=3)
demo.launch(debug=True)
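
# --- Usage sketch (not part of the running app) ---
# Because the queue is started with api_open=True, the Space can also be called
# programmatically via gradio_client. The Space id, the endpoint name
# ("/run_inference", Gradio's default naming for the click handler above) and the
# argument order are assumptions; check the Space's "Use via API" page for the
# exact signature.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("katanaml/sparrow-qwen2-vl-2b")           # hypothetical Space id
#   result = client.predict(
#       [handle_file("invoice_page.jpg")],                    # input_imgs
#       "retrieve invoice_number and total, return JSON",     # text_input
#       api_name="/run_inference",
#   )
#   print(result)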