Upload 2 files
- app.py +157 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,157 @@
# ----------------------------
# STEP 1: Imports
# ----------------------------
import os
import sys
import re
import time

import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import cv2
import gradio as gr

# Add the Depth Anything V2 repo to the import path. The repo must sit next
# to app.py (e.g. cloned as ./Depth-Anything-V2); an absolute local path
# such as C:\Users\... cannot resolve on a Hugging Face Space.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "Depth-Anything-V2"))

from huggingface_hub import hf_hub_download
from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
from depth_anything_v2.dpt import DepthAnythingV2
# ----------------------------
# STEP 2: Load Models
# ----------------------------

# Device config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load Kosmos-2 (grounded vision-language model used for captioning)
print("Loading Kosmos-2...")
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
model_kosmos = Kosmos2ForConditionalGeneration.from_pretrained(
    "microsoft/kosmos-2-patch14-224"
).to(device)

# Load Depth Anything V2 (ViT-L config) and fetch its checkpoint from the Hub
print("Loading Depth Anything V2...")
model_config = {
    'encoder': 'vitl',
    'features': 256,
    'out_channels': [256, 512, 1024, 1024],
}
model_depth = DepthAnythingV2(**model_config)
checkpoint_path = hf_hub_download(
    repo_id="depth-anything/Depth-Anything-V2-Large",
    filename="depth_anything_v2_vitl.pth",
    repo_type="model"
)
state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
model_depth.load_state_dict(state_dict)
model_depth = model_depth.to(device).eval()
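# For reference, the other encoder configs published with Depth Anything V2
# (per the upstream repo); a smaller encoder trades accuracy for memory and
# startup time, which can matter on CPU-only Spaces hardware:
#   'vits': features=64,  out_channels=[48, 96, 192, 384]
#   'vitb': features=128, out_channels=[96, 192, 384, 768]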
# ----------------------------
# STEP 3: Caption Generator
# ----------------------------

def generate_caption(image_array):
    try:
        print("Resizing image for Kosmos-2...")
        resized = cv2.resize(image_array.astype("uint8"), (224, 224))
        pil_image = Image.fromarray(resized)

        prompt = "<grounding> An image of"
        inputs = processor(text=prompt, images=pil_image, return_tensors="pt").to(device)

        print("Running caption generation...")
        start = time.time()

        outputs = model_kosmos.generate(
            pixel_values=inputs["pixel_values"],
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            image_embeds=None,
            image_embeds_position_mask=inputs["image_embeds_position_mask"],
            max_new_tokens=32,  # kept small for speed
        )

        end = time.time()
        print(f"Captioning took: {end - start:.2f} seconds")

        # Kosmos-2 wraps grounded noun phrases in <phrase> tags; extract them
        raw_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        phrases = re.findall(r"<phrase>(.*?)</phrase>", raw_text)

        if phrases:
            return ", ".join(phrases) if len(phrases) > 1 else phrases[0]
        return "No description found."

    except Exception as e:
        print(f"Captioning error: {e}")
        return f"Error: {e}"
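# Alternative sketch, not wired into the app (the function name is
# hypothetical): instead of regex-scraping <phrase> tags, the Kosmos-2
# processor provides post_process_generation(), which strips the grounding
# markup and returns a (clean_text, entities) pair.
def generate_caption_via_postprocess(image_array):
    pil_image = Image.fromarray(cv2.resize(image_array.astype("uint8"), (224, 224)))
    inputs = processor(text="<grounding> An image of", images=pil_image,
                       return_tensors="pt").to(device)
    outputs = model_kosmos.generate(
        pixel_values=inputs["pixel_values"],
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        image_embeds=None,
        image_embeds_position_mask=inputs["image_embeds_position_mask"],
        max_new_tokens=32,
    )
    raw_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    caption, entities = processor.post_process_generation(raw_text)
    return caption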
# ----------------------------
# STEP 4: Depth Captioning Pipeline
# ----------------------------
def depth_caption_pipeline(uploaded_image):
    try:
        print("Image uploaded.")
        image_np = np.array(uploaded_image.convert("RGB"))

        print("Estimating depth...")
        with torch.no_grad():
            # RGB -> BGR; infer_image expects an OpenCV-style BGR array
            depth_map = model_depth.infer_image(image_np[:, :, ::-1])
        depth_norm = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min()) * 255.0
        depth_gray = depth_norm.astype(np.uint8)

        print("Segmenting image...")
        # Depth Anything V2 predicts relative inverse depth (larger = closer),
        # so the top 30% band is foreground and the bottom 30% is background.
        top30 = np.percentile(depth_gray.flatten(), 70)
        bottom30 = np.percentile(depth_gray.flatten(), 30)
        top_mask_3d = np.stack([(depth_gray > top30)] * 3, axis=-1)
        mid_mask_3d = np.stack([((depth_gray >= bottom30) & (depth_gray <= top30))] * 3, axis=-1)
        bottom_mask_3d = np.stack([(depth_gray < bottom30)] * 3, axis=-1)

        # Keep each layer's pixels; zero out (blacken) everything else
        top_image = np.where(top_mask_3d, image_np, 0)
        mid_image = np.where(mid_mask_3d, image_np, 0)
        bottom_image = np.where(bottom_mask_3d, image_np, 0)

        print("Generating captions...")
        caption_top = generate_caption(top_image)
        caption_mid = generate_caption(mid_image)
        caption_bottom = generate_caption(bottom_image)

        print("Completed successfully.")
        return (
            Image.fromarray(top_image.astype("uint8")),
            Image.fromarray(mid_image.astype("uint8")),
            Image.fromarray(bottom_image.astype("uint8")),
            caption_top,
            caption_mid,
            caption_bottom,
        )

    except Exception as e:
        print(f"Pipeline error: {e}")
        return (None, None, None, f"Error: {e}", f"Error: {e}", f"Error: {e}")
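# Minimal sketch, not called by the app: the same 30/40/30 percentile split
# factored into a hypothetical helper. Because larger inverse-depth values
# mean "closer", the high-percentile band is the foreground layer.
def split_by_depth(image_np, depth_gray, low_pct=30, high_pct=70):
    low, high = np.percentile(depth_gray, [low_pct, high_pct])
    masks = (depth_gray > high,                           # foreground
             (depth_gray >= low) & (depth_gray <= high),  # midground
             depth_gray < low)                            # background
    return [np.where(m[..., None], image_np, 0) for m in masks]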
# ----------------------------
# STEP 5: Gradio Interface
# ----------------------------
demo = gr.Interface(
    fn=depth_caption_pipeline,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=[
        gr.Image(label="Foreground (Top 30%)"),
        gr.Image(label="Midground (Mid 40%)"),
        gr.Image(label="Background (Bottom 30%)"),
        gr.Textbox(label="Caption - Foreground"),
        gr.Textbox(label="Caption - Midground"),
        gr.Textbox(label="Caption - Background"),
    ],
    title="Depth-Aware Image Captioning",
    description="Upload an image to generate layer-wise captions using Depth Anything + Kosmos-2. Powered by vision-language grounding."
)

print("Launching Gradio App...")
demo.launch(debug=True, share=True)
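# Note: these launch flags matter mostly for local runs. On Hugging Face
# Spaces the platform serves the app itself and Gradio ignores share=True
# (printing a warning), so a bare demo.launch() is sufficient there.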
requirements.txt
ADDED
@@ -0,0 +1,8 @@
torch
transformers
gradio
Pillow
numpy
matplotlib
opencv-python-headless  # headless build avoids libGL import errors on Spaces
huggingface_hub
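A minimal way to run this commit locally, assuming a working Python
environment (the clone URL is the published upstream repo):

    git clone https://github.com/DepthAnything/Depth-Anything-V2
    pip install -r requirements.txt
    python app.py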