Upload 2 files
- app.py +157 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,157 @@
# ----------------------------
# STEP 1: Imports
# ----------------------------
import os
import sys
import re
import time

import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import cv2
import gradio as gr

# Add the Depth Anything V2 repo to the import path. The repo must sit next
# to app.py (e.g. cloned as ./Depth-Anything-V2); an absolute local path
# such as C:\Users\... cannot resolve on a Hugging Face Space.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "Depth-Anything-V2"))

from huggingface_hub import hf_hub_download
from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
from depth_anything_v2.dpt import DepthAnythingV2
# ----------------------------
# STEP 2: Load Models
# ----------------------------

# Device config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load Kosmos-2 (grounded vision-language model used for captioning)
print("Loading Kosmos-2...")
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
model_kosmos = Kosmos2ForConditionalGeneration.from_pretrained(
    "microsoft/kosmos-2-patch14-224"
).to(device)

# Load Depth Anything V2 (ViT-L config) and fetch its checkpoint from the Hub
print("Loading Depth Anything V2...")
model_config = {
    'encoder': 'vitl',
    'features': 256,
    'out_channels': [256, 512, 1024, 1024],
}
model_depth = DepthAnythingV2(**model_config)
checkpoint_path = hf_hub_download(
    repo_id="depth-anything/Depth-Anything-V2-Large",
    filename="depth_anything_v2_vitl.pth",
    repo_type="model"
)
state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
model_depth.load_state_dict(state_dict)
model_depth = model_depth.to(device).eval()
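# For reference, the other encoder configs published with Depth Anything V2
# (per the upstream repo); a smaller encoder trades accuracy for memory and
# startup time, which can matter on CPU-only Spaces hardware:
#   'vits': features=64,  out_channels=[48, 96, 192, 384]
#   'vitb': features=128, out_channels=[96, 192, 384, 768]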
# ----------------------------
# STEP 3: Caption Generator
# ----------------------------

def generate_caption(image_array):
    try:
        print("Resizing image for Kosmos-2...")
        resized = cv2.resize(image_array.astype("uint8"), (224, 224))
        pil_image = Image.fromarray(resized)

        prompt = "<grounding> An image of"
        inputs = processor(text=prompt, images=pil_image, return_tensors="pt").to(device)

        print("Running caption generation...")
        start = time.time()

        outputs = model_kosmos.generate(
            pixel_values=inputs["pixel_values"],
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            image_embeds=None,
            image_embeds_position_mask=inputs["image_embeds_position_mask"],
            max_new_tokens=32,  # kept small for speed
        )

        end = time.time()
        print(f"Captioning took: {end - start:.2f} seconds")

        # Kosmos-2 wraps grounded noun phrases in <phrase> tags; extract them
        raw_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        phrases = re.findall(r"<phrase>(.*?)</phrase>", raw_text)

        if phrases:
            return ", ".join(phrases) if len(phrases) > 1 else phrases[0]
        return "No description found."

    except Exception as e:
        print(f"Captioning error: {e}")
        return f"Error: {e}"
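# Alternative sketch, not wired into the app (the function name is
# hypothetical): instead of regex-scraping <phrase> tags, the Kosmos-2
# processor provides post_process_generation(), which strips the grounding
# markup and returns a (clean_text, entities) pair.
def generate_caption_via_postprocess(image_array):
    pil_image = Image.fromarray(cv2.resize(image_array.astype("uint8"), (224, 224)))
    inputs = processor(text="<grounding> An image of", images=pil_image,
                       return_tensors="pt").to(device)
    outputs = model_kosmos.generate(
        pixel_values=inputs["pixel_values"],
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        image_embeds=None,
        image_embeds_position_mask=inputs["image_embeds_position_mask"],
        max_new_tokens=32,
    )
    raw_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    caption, entities = processor.post_process_generation(raw_text)
    return caption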
# ----------------------------
# STEP 4: Depth Captioning Pipeline
# ----------------------------
def depth_caption_pipeline(uploaded_image):
    try:
        print("Image uploaded.")
        image_np = np.array(uploaded_image.convert("RGB"))

        print("Estimating depth...")
        with torch.no_grad():
            # RGB -> BGR; infer_image expects an OpenCV-style BGR array
            depth_map = model_depth.infer_image(image_np[:, :, ::-1])
        depth_norm = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min()) * 255.0
        depth_gray = depth_norm.astype(np.uint8)

        print("Segmenting image...")
        # Depth Anything V2 predicts relative inverse depth (larger = closer),
        # so the top 30% band is foreground and the bottom 30% is background.
        top30 = np.percentile(depth_gray.flatten(), 70)
        bottom30 = np.percentile(depth_gray.flatten(), 30)
        top_mask_3d = np.stack([(depth_gray > top30)] * 3, axis=-1)
        mid_mask_3d = np.stack([((depth_gray >= bottom30) & (depth_gray <= top30))] * 3, axis=-1)
        bottom_mask_3d = np.stack([(depth_gray < bottom30)] * 3, axis=-1)

        # Keep each layer's pixels; zero out (blacken) everything else
        top_image = np.where(top_mask_3d, image_np, 0)
        mid_image = np.where(mid_mask_3d, image_np, 0)
        bottom_image = np.where(bottom_mask_3d, image_np, 0)

        print("Generating captions...")
        caption_top = generate_caption(top_image)
        caption_mid = generate_caption(mid_image)
        caption_bottom = generate_caption(bottom_image)

        print("Completed successfully.")
        return (
            Image.fromarray(top_image.astype("uint8")),
            Image.fromarray(mid_image.astype("uint8")),
            Image.fromarray(bottom_image.astype("uint8")),
            caption_top,
            caption_mid,
            caption_bottom,
        )

    except Exception as e:
        print(f"Pipeline error: {e}")
        return (None, None, None, f"Error: {e}", f"Error: {e}", f"Error: {e}")
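# Minimal sketch, not called by the app: the same 30/40/30 percentile split
# factored into a hypothetical helper. Because larger inverse-depth values
# mean "closer", the high-percentile band is the foreground layer.
def split_by_depth(image_np, depth_gray, low_pct=30, high_pct=70):
    low, high = np.percentile(depth_gray, [low_pct, high_pct])
    masks = (depth_gray > high,                           # foreground
             (depth_gray >= low) & (depth_gray <= high),  # midground
             depth_gray < low)                            # background
    return [np.where(m[..., None], image_np, 0) for m in masks]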
# ----------------------------
# STEP 5: Gradio Interface
# ----------------------------
demo = gr.Interface(
    fn=depth_caption_pipeline,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=[
        gr.Image(label="Foreground (Top 30%)"),
        gr.Image(label="Midground (Mid 40%)"),
        gr.Image(label="Background (Bottom 30%)"),
        gr.Textbox(label="Caption - Foreground"),
        gr.Textbox(label="Caption - Midground"),
        gr.Textbox(label="Caption - Background"),
    ],
    title="Depth-Aware Image Captioning",
    description="Upload an image to generate layer-wise captions using Depth Anything + Kosmos-2. Powered by vision-language grounding."
)

print("Launching Gradio App...")
demo.launch(debug=True, share=True)
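# Note: these launch flags matter mostly for local runs. On Hugging Face
# Spaces the platform serves the app itself and Gradio ignores share=True
# (printing a warning), so a bare demo.launch() is sufficient there.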
requirements.txt
ADDED
@@ -0,0 +1,8 @@
torch
transformers
gradio
Pillow
numpy
matplotlib
opencv-python-headless  # headless build avoids libGL import errors on Spaces
huggingface_hub
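A minimal way to run this commit locally, assuming a working Python
environment (the clone URL is the published upstream repo):

    git clone https://github.com/DepthAnything/Depth-Anything-V2
    pip install -r requirements.txt
    python app.py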