import cv2
import torch
import numpy as np
import time
from midas.model_loader import default_models, load_model
import os
import urllib.request
import spaces

MODEL_FILE_URL = {
    "midas_v21_small_256" : "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt",
    "dpt_hybrid_384" : "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt",
    "dpt_large_384" : "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt",
    "dpt_swin2_large_384" : "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt",
    "dpt_beit_large_512" : "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt",  
}
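
# Note: the checkpoints above are ordered roughly from lightest/fastest
# (midas_v21_small_256) to heaviest/most accurate (dpt_beit_large_512),
# per the MiDaS release notes.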

class MonocularDepthEstimator:
    def __init__(self,
        model_type="midas_v21_small_256",
        model_weights_path="models/", 
        optimize=False, 
        side_by_side=False, 
        height=None, 
        square=False, 
        grayscale=False):

        # params
        print("Initializing parameters...")
        self.model_type = model_type
        self.model_weights_path = model_weights_path
        self.is_optimize = optimize
        self.is_square = square
        self.is_grayscale = grayscale
        self.height = height
        self.side_by_side = side_by_side
        self.device = 'cuda'  # ZeroGPU will always use CUDA
        
        # Model is loaded lazily via load_model_if_needed (see below)
        self.model = None
        self.transform = None
        self.net_w = None
        self.net_h = None

        # Download the checkpoint if it is not already present
        os.makedirs(model_weights_path, exist_ok=True)
        if not os.path.exists(model_weights_path + model_type + ".pt"):
            print("Model file not found. Downloading...")
            urllib.request.urlretrieve(MODEL_FILE_URL[model_type], model_weights_path + model_type + ".pt")
            print("Model file downloaded successfully.")

    def load_model_if_needed(self):
        """Load model if not already loaded"""
        if self.model is None:
            self.model, self.transform, self.net_w, self.net_h = load_model(
                self.device, 
                self.model_weights_path + self.model_type + ".pt",
                self.model_type, 
                self.is_optimize, 
                self.height, 
                self.is_square
            )
            print("Net width and height: ", (self.net_w, self.net_h))

    @spaces.GPU
    def predict(self, image, target_size):
        """GPU-accelerated prediction"""
        # Load model if not loaded
        self.load_model_if_needed()

        # convert img to tensor and load to gpu
        img_tensor = torch.from_numpy(image).to(self.device).unsqueeze(0)

        if self.is_optimize and self.device == 'cuda':
            img_tensor = img_tensor.to(memory_format=torch.channels_last)
            img_tensor = img_tensor.half()
        
        prediction = self.model.forward(img_tensor)
        prediction = (
            torch.nn.functional.interpolate(
                prediction.unsqueeze(1),
                size=target_size[::-1],
                mode="bicubic",
                align_corners=False,
            )
            .squeeze()
            .cpu()
            .numpy()
        )

        return prediction

    def process_prediction(self, depth_map):
        """Process prediction (CPU operation, no GPU needed)"""
        depth_min = depth_map.min()
        depth_max = depth_map.max()
        normalized_depth = 255 * (depth_map - depth_min) / (depth_max - depth_min)
        
        grayscale_depthmap = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2)
        depth_colormap = cv2.applyColorMap(np.uint8(grayscale_depthmap), cv2.COLORMAP_INFERNO)  
            
        return normalized_depth/255, depth_colormap/255

    @spaces.GPU(duration=30)
    def make_prediction(self, image):
        """Main prediction function with GPU acceleration"""
        image = image.copy()
        with torch.no_grad():
            # the transform is created together with the model, so make sure
            # both are loaded before using self.transform below
            self.load_model_if_needed()
            original_image_rgb = np.flip(image, 2)  # in [0, 255] (flip from BGR to RGB)
            # resizing the image to feed to the model
            image_transformed = self.transform({"image": original_image_rgb/255})["image"]

            # monocular depth prediction
            pred = self.predict(image_transformed, target_size=original_image_rgb.shape[1::-1])

            # process the model predictions
            depthmap, depth_colormap = self.process_prediction(pred)
        return depthmap, depth_colormap

    @spaces.GPU(duration=60)
    def run(self, input_path):
        """Video processing with GPU acceleration"""
        cap = cv2.VideoCapture(input_path)

        if not cap.isOpened():
            print("Error opening video file")
            return

        with torch.no_grad():
            while cap.isOpened():
                inference_start_time = time.time()
                ret, frame = cap.read()

                if ret:
                    _, depth_colormap = self.make_prediction(frame)
                    inference_end_time = time.time()
                    fps = round(1 / (inference_end_time - inference_start_time))
                    cv2.putText(depth_colormap, f'FPS: {fps}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (10, 255, 100), 2)
                    cv2.imshow('MiDaS Depth Estimation - Press Escape to close window ', depth_colormap)

                    if cv2.waitKey(1) == 27:  # Escape key
                        break
                else:
                    break

        cap.release()
        cv2.destroyAllWindows()

if __name__ == "__main__":
    depth_estimator = MonocularDepthEstimator(model_type="dpt_hybrid_384")
    depth_estimator.run("assets/videos/testvideo2.mp4")