import cv2
import torch
import numpy as np
import time
from midas.model_loader import default_models, load_model
import os
import urllib.request
import spaces
MODEL_FILE_URL = {
    "midas_v21_small_256": "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt",
    "dpt_hybrid_384": "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt",
    "dpt_large_384": "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt",
    "dpt_swin2_large_384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt",
    "dpt_beit_large_512": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt",
}
class MonocularDepthEstimator:
    def __init__(self,
                 model_type="midas_v21_small_256",
                 model_weights_path="models/",
                 optimize=False,
                 side_by_side=False,
                 height=None,
                 square=False,
                 grayscale=False):

        # params
        print("Initializing parameters...")
        self.model_type = model_type
        self.model_weights_path = model_weights_path
        self.is_optimize = optimize
        self.is_square = square
        self.is_grayscale = grayscale
        self.height = height
        self.side_by_side = side_by_side
        self.device = 'cuda'  # ZeroGPU will always use CUDA

        # Model will be loaded in make_prediction
        self.model = None
        self.transform = None
        self.net_w = None
        self.net_h = None
        # Download the model weights if they are not present locally
        if not os.path.exists(model_weights_path + model_type + ".pt"):
            print("Model file not found. Downloading...")
            os.makedirs(model_weights_path, exist_ok=True)
            urllib.request.urlretrieve(MODEL_FILE_URL[model_type], model_weights_path + model_type + ".pt")
            print("Model file downloaded successfully.")
    def load_model_if_needed(self):
        """Load model if not already loaded"""
        if self.model is None:
            self.model, self.transform, self.net_w, self.net_h = load_model(
                self.device,
                self.model_weights_path + self.model_type + ".pt",
                self.model_type,
                self.is_optimize,
                self.height,
                self.is_square
            )
            print("Net width and height: ", (self.net_w, self.net_h))
    @spaces.GPU
    def predict(self, image, target_size):
        """GPU-accelerated prediction"""

        # Load model if not loaded
        self.load_model_if_needed()

        # convert img to tensor and load to gpu
        img_tensor = torch.from_numpy(image).to(self.device).unsqueeze(0)

        # optional channels-last / half-precision optimization on CUDA
        if self.is_optimize and self.device == 'cuda':
            img_tensor = img_tensor.to(memory_format=torch.channels_last)
            img_tensor = img_tensor.half()

        prediction = self.model.forward(img_tensor)
        prediction = (
            torch.nn.functional.interpolate(
                prediction.unsqueeze(1),
                size=target_size[::-1],
                mode="bicubic",
                align_corners=False,
            )
            .squeeze()
            .cpu()
            .numpy()
        )

        return prediction
    def process_prediction(self, depth_map):
        """Process prediction (CPU operation, no GPU needed)"""
        depth_min = depth_map.min()
        depth_max = depth_map.max()
        normalized_depth = 255 * (depth_map - depth_min) / (depth_max - depth_min)
        grayscale_depthmap = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2)
        depth_colormap = cv2.applyColorMap(np.uint8(grayscale_depthmap), cv2.COLORMAP_INFERNO)

        return normalized_depth / 255, depth_colormap / 255
    @spaces.GPU(duration=30)
    def make_prediction(self, image):
        """Main prediction function with GPU acceleration"""
        image = image.copy()
        # make sure the model and its input transform are loaded before use
        self.load_model_if_needed()
        with torch.no_grad():
            original_image_rgb = np.flip(image, 2)  # in [0, 255] (flip required to get RGB)
            # resizing the image to feed to the model
            image_transformed = self.transform({"image": original_image_rgb / 255})["image"]

            # monocular depth prediction
            pred = self.predict(image_transformed, target_size=original_image_rgb.shape[1::-1])

            # process the model predictions
            depthmap, depth_colormap = self.process_prediction(pred)

            return depthmap, depth_colormap
    @spaces.GPU(duration=60)
    def run(self, input_path):
        """Video processing with GPU acceleration"""
        cap = cv2.VideoCapture(input_path)

        if not cap.isOpened():
            print("Error opening video file")
            return

        with torch.no_grad():
            while cap.isOpened():
                inference_start_time = time.time()
                ret, frame = cap.read()

                if ret:
                    _, depth_colormap = self.make_prediction(frame)
                    inference_end_time = time.time()
                    fps = round(1 / (inference_end_time - inference_start_time))
                    cv2.putText(depth_colormap, f'FPS: {fps}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (10, 255, 100), 2)
                    cv2.imshow('MiDaS Depth Estimation - Press Escape to close window ', depth_colormap)
                    if cv2.waitKey(1) == 27:  # Escape key
                        break
                else:
                    break

        cap.release()
        cv2.destroyAllWindows()
if __name__ == "__main__":
    depth_estimator = MonocularDepthEstimator(model_type="dpt_hybrid_384")
    depth_estimator.run("assets/videos/testvideo2.mp4")
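
    # A minimal single-image usage sketch, assuming a hypothetical test image at
    # "assets/images/test.png" and a local display window; uncomment to try it.
    # image_bgr = cv2.imread("assets/images/test.png")
    # depth, depth_colormap = depth_estimator.make_prediction(image_bgr)
    # cv2.imshow("MiDaS depth (single image)", depth_colormap)
    # cv2.waitKey(0)
    # cv2.destroyAllWindows()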