devleenaaaaaa committed
Commit 05fc032 · verified · 1 Parent(s): d6d4394

Upload 2 files

Files changed (2)
  1. app.py +157 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,157 @@
+ # ----------------------------
+ # STEP 1: Imports
+ # ----------------------------
+ import os
+ import sys
+ import time
+ import torch
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from PIL import Image
+ import re
+ import cv2
+ import gradio as gr
+
+ # Add Depth Anything repo to path
+ sys.path.append(r"C:\Users\Devleena\Desktop\New folder (3)\Depth-Anything-V2")
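+ # NOTE: this absolute path only exists on the author's machine; for a deployed
+ # Space the Depth-Anything-V2 repo would need to be cloned or vendored into the
+ # app directory so that depth_anything_v2 is importable.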
+
+ from huggingface_hub import hf_hub_download
+ from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
+ from depth_anything_v2.dpt import DepthAnythingV2  # Corrected import
+
+ # ----------------------------
+ # STEP 2: Load Models
+ # ----------------------------
+
+ # Device config
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(f"🚀 Using device: {device}")
+
+ # Load Kosmos-2
+ print("📦 Loading Kosmos-2...")
+ processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
+ model_kosmos = Kosmos2ForConditionalGeneration.from_pretrained(
+     "microsoft/kosmos-2-patch14-224"
+ ).to(device)
+
+ # Load Depth Anything V2
+ print("📦 Loading Depth Anything V2...")
+ model_config = {
+     'encoder': 'vitl',
+     'features': 256,
+     'out_channels': [256, 512, 1024, 1024],
+ }
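+ # This config matches the ViT-L ("vitl") checkpoint loaded below; the smaller
+ # "vits"/"vitb" variants use different 'features'/'out_channels' values, so the
+ # config and checkpoint have to be swapped together.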
+ model_depth = DepthAnythingV2(**model_config)
+ checkpoint_path = hf_hub_download(
+     repo_id="depth-anything/Depth-Anything-V2-Large",
+     filename="depth_anything_v2_vitl.pth",
+     repo_type="model"
+ )
+ state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
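+ # weights_only=True (safer unpickling) requires PyTorch >= 1.13; on older
+ # versions the flag would need to be dropped.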
+ model_depth.load_state_dict(state_dict)
+ model_depth = model_depth.to(device).eval()
+
+ # ----------------------------
+ # STEP 3: Caption Generator
+ # ----------------------------
+
+ def generate_caption(image_array):
+     try:
+         print("🔍 Resizing image for Kosmos-2...")
+         resized = cv2.resize(image_array.astype("uint8"), (224, 224))
+         pil_image = Image.fromarray(resized)
+
+         prompt = "<grounding> An image of"
+         inputs = processor(text=prompt, images=pil_image, return_tensors="pt").to(device)
+
+         print("✍️ Running caption generation...")
+         start = time.time()
+
+         outputs = model_kosmos.generate(
+             pixel_values=inputs["pixel_values"],
+             input_ids=inputs["input_ids"],
+             attention_mask=inputs["attention_mask"],
+             image_embeds=None,
+             image_embeds_position_mask=inputs["image_embeds_position_mask"],
+             max_new_tokens=32,  # reduced for speed
+         )
+
+         end = time.time()
+         print(f"⏱️ Captioning took: {end - start:.2f} seconds")
+
+         raw_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+         phrases = re.findall(r"<phrase>(.*?)</phrase>", raw_text)
+
+         if phrases:
+             return ", ".join(phrases)
+         return "No description found."
+
+     except Exception as e:
+         print(f"❌ Captioning error: {e}")
+         return f"Error: {e}"
+
+
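+ # Note: instead of the regex above, the Kosmos-2 processor also provides
+ # processor.post_process_generation(raw_text), which returns the cleaned
+ # caption together with the grounded entities and their bounding boxes.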
+ # ----------------------------
+ # STEP 4: Depth Captioning Pipeline
+ # ----------------------------
+ def depth_caption_pipeline(uploaded_image):
+     try:
+         print("📥 Image uploaded.")
+         image_np = np.array(uploaded_image.convert("RGB"))
+
+         print("🧠 Estimating depth...")
+         with torch.no_grad():
+             depth_map = model_depth.infer_image(image_np[:, :, ::-1])  # model expects BGR
+         depth_norm = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min()) * 255.0
+         depth_gray = depth_norm.astype(np.uint8)
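+         # Assumes the depth map is not constant (max > min), otherwise the
+         # normalization divides by zero; Depth Anything predicts relative
+         # inverse depth, so larger values mean closer pixels.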
+
+         print("🔪 Segmenting image...")
+         top30 = np.percentile(depth_gray.flatten(), 70)
+         bottom30 = np.percentile(depth_gray.flatten(), 30)
+         top_mask_3d = np.stack([(depth_gray > top30)] * 3, axis=-1)
+         mid_mask_3d = np.stack([((depth_gray >= bottom30) & (depth_gray <= top30))] * 3, axis=-1)
+         bottom_mask_3d = np.stack([(depth_gray < bottom30)] * 3, axis=-1)
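+         # The 70th/30th percentiles split pixels into the nearest ~30%
+         # (foreground), middle ~40% (midground), and farthest ~30% (background);
+         # each mask is stacked to 3 channels so it broadcasts over the RGB image.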
+
+         top_image = np.where(top_mask_3d, image_np, 0)
+         mid_image = np.where(mid_mask_3d, image_np, 0)
+         bottom_image = np.where(bottom_mask_3d, image_np, 0)
+
+         print("📝 Generating captions...")
+         caption_top = generate_caption(top_image)
+         caption_mid = generate_caption(mid_image)
+         caption_bottom = generate_caption(bottom_image)
+
+         print("✅ Completed successfully.")
+         return (
+             Image.fromarray(top_image.astype("uint8")),
+             Image.fromarray(mid_image.astype("uint8")),
+             Image.fromarray(bottom_image.astype("uint8")),
+             caption_top,
+             caption_mid,
+             caption_bottom,
+         )
+
+     except Exception as e:
+         print(f"❌ Pipeline error: {e}")
+         return (None, None, None, f"Error: {e}", f"Error: {e}", f"Error: {e}")
+
+ # ----------------------------
+ # STEP 5: Gradio Interface
+ # ----------------------------
+ demo = gr.Interface(
+     fn=depth_caption_pipeline,
+     inputs=gr.Image(type="pil", label="📤 Upload an Image"),
+     outputs=[
+         gr.Image(label="Foreground (Top 30%)"),
+         gr.Image(label="Midground (Mid 40%)"),
+         gr.Image(label="Background (Bottom 30%)"),
+         gr.Textbox(label="Caption - Foreground"),
+         gr.Textbox(label="Caption - Midground"),
+         gr.Textbox(label="Caption - Background"),
+     ],
+     title="Depth-Aware Image Captioning",
+     description="Upload an image to generate layer-wise captions using Depth Anything + Kosmos-2. Powered by vision-language grounding."
+ )
+
+ print("🚀 Launching Gradio App...")
+ demo.launch(debug=True, share=True)
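+ # Note: share=True only matters for local runs; on a Hugging Face Space the app
+ # is already publicly served and Gradio ignores the flag with a warning.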
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ torch
+ transformers
+ gradio
+ Pillow
+ numpy
+ matplotlib
+ opencv-python
+ huggingface_hub
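+ # depth_anything_v2 is not installed from PyPI; app.py imports it from a local
+ # clone of the Depth-Anything-V2 repo (see the sys.path.append in app.py).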