Alessio Grancini committed
Commit 0dd36da · verified · 1 Parent(s): 8dda9f5

Update monocular_depth_estimator.py

Files changed (1)
  1. monocular_depth_estimator.py +38 -67
monocular_depth_estimator.py CHANGED
@@ -5,6 +5,7 @@ import time
 from midas.model_loader import default_models, load_model
 import os
 import urllib.request
+import spaces

 MODEL_FILE_URL = {
     "midas_v21_small_256" : "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt",
@@ -24,45 +25,47 @@ class MonocularDepthEstimator:
                  square=False,
                  grayscale=False):

-        # model type
-        # MiDaS 3.1:
-        # For highest quality: dpt_beit_large_512
-        # For moderately less quality, but better speed-performance trade-off: dpt_swin2_large_384
-        # For embedded devices: dpt_swin2_tiny_256, dpt_levit_224
-        # For inference on Intel CPUs, OpenVINO may be used for the small legacy model: openvino_midas_v21_small .xml, .bin
-
-        # MiDaS 3.0:
-        # Legacy transformer models dpt_large_384 and dpt_hybrid_384
-
-        # MiDaS 2.1:
-        # Legacy convolutional models midas_v21_384 and midas_v21_small_256
-
         # params
-        print("Initializing parameters and model...")
+        print("Initializing parameters...")
+        self.model_type = model_type
+        self.model_weights_path = model_weights_path
         self.is_optimize = optimize
         self.is_square = square
         self.is_grayscale = grayscale
         self.height = height
         self.side_by_side = side_by_side
+        self.device = 'cuda' # ZeroGPU will always use CUDA
+
+        # Model will be loaded in make_prediction
+        self.model = None
+        self.transform = None
+        self.net_w = None
+        self.net_h = None

-        # select device
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        print("Running inference on : %s" % self.device)
-
-        # loading model
+        # Download model if not exists
         if not os.path.exists(model_weights_path+model_type+".pt"):
             print("Model file not found. Downloading...")
-            # Download the model file
             urllib.request.urlretrieve(MODEL_FILE_URL[model_type], model_weights_path+model_type+".pt")
             print("Model file downloaded successfully.")

-        self.model, self.transform, self.net_w, self.net_h = load_model(self.device, model_weights_path+model_type+".pt",
-                                                                        model_type, optimize, height, square)
-        print("Net width and height: ", (self.net_w, self.net_h))
-
+    def load_model_if_needed(self):
+        """Load model if not already loaded"""
+        if self.model is None:
+            self.model, self.transform, self.net_w, self.net_h = load_model(
+                self.device,
+                self.model_weights_path + self.model_type + ".pt",
+                self.model_type,
+                self.is_optimize,
+                self.height,
+                self.is_square
+            )
+            print("Net width and height: ", (self.net_w, self.net_h))

-    def predict(self, image, model, target_size):
-
+    @spaces.GPU
+    def predict(self, image, target_size):
+        """GPU-accelerated prediction"""
+        # Load model if not loaded
+        self.load_model_if_needed()

         # convert img to tensor and load to gpu
         img_tensor = torch.from_numpy(image).to(self.device).unsqueeze(0)
@@ -71,7 +74,7 @@ class MonocularDepthEstimator:
             img_tensor = img_tensor.to(memory_format=torch.channels_last)
             img_tensor = img_tensor.half()

-        prediction = model.forward(img_tensor)
+        prediction = self.model.forward(img_tensor)
         prediction = (
             torch.nn.functional.interpolate(
                 prediction.unsqueeze(1),
@@ -87,30 +90,19 @@ class MonocularDepthEstimator:
         return prediction

     def process_prediction(self, depth_map):
-        """
-        Take an RGB image and depth map and place them side by side. This includes a proper normalization of the depth map
-        for better visibility.
-        Args:
-            original_img: the RGB image
-            depth_img: the depth map
-            is_grayscale: use a grayscale colormap?
-        Returns:
-            the image and depth map place side by side
-        """
-
-        # normalizing depth image
+        """Process prediction (CPU operation, no GPU needed)"""
         depth_min = depth_map.min()
         depth_max = depth_map.max()
         normalized_depth = 255 * (depth_map - depth_min) / (depth_max - depth_min)

-        # normalized_depth *= 3
-        # grayscale_depthmap = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2) / 3
         grayscale_depthmap = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2)
         depth_colormap = cv2.applyColorMap(np.uint8(grayscale_depthmap), cv2.COLORMAP_INFERNO)

         return normalized_depth/255, depth_colormap/255

+    @spaces.GPU(duration=30)
     def make_prediction(self, image):
+        """Main prediction function with GPU acceleration"""
         image = image.copy()
         with torch.no_grad():
             original_image_rgb = np.flip(image, 2) # in [0, 255] (flip required to get RGB)
@@ -118,25 +110,23 @@ class MonocularDepthEstimator:
             image_tranformed = self.transform({"image": original_image_rgb/255})["image"]

             # monocular depth prediction
-            pred = self.predict(image_tranformed, self.model, target_size=original_image_rgb.shape[1::-1])
+            pred = self.predict(image_tranformed, target_size=original_image_rgb.shape[1::-1])

             # process the model predictions
             depthmap, depth_colormap = self.process_prediction(pred)
             return depthmap, depth_colormap

+    @spaces.GPU(duration=60)
     def run(self, input_path):
-
-        # input video
+        """Video processing with GPU acceleration"""
         cap = cv2.VideoCapture(input_path)

-        # Check if camera opened successfully
         if not cap.isOpened():
             print("Error opening video file")
+            return

         with torch.no_grad():
             while cap.isOpened():
-
-                # Capture frame-by-frame
                 inference_start_time = time.time()
                 ret, frame = cap.read()

@@ -147,33 +137,14 @@ class MonocularDepthEstimator:
                     cv2.putText(depth_colormap, f'FPS: {fps}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (10, 255, 100), 2)
                     cv2.imshow('MiDaS Depth Estimation - Press Escape to close window ', depth_colormap)

-                    # Press ESC on keyboard to exit
                     if cv2.waitKey(1) == 27: # Escape key
                         break
-
                 else:
                     break

-
-        # When everything done, release
-        # the video capture object
         cap.release()
-
-        # Closes all the frames
         cv2.destroyAllWindows()

-
-
 if __name__ == "__main__":
-    # params
-    INPUT_PATH = "assets/videos/testvideo2.mp4"
-
-    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
-
-    # set torch options
-    torch.backends.cudnn.enabled = True
-    torch.backends.cudnn.benchmark = True
-
     depth_estimator = MonocularDepthEstimator(model_type="dpt_hybrid_384")
-    depth_estimator.run(INPUT_PATH)
-
+    depth_estimator.run("assets/videos/testvideo2.mp4")
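
This commit appears to migrate the estimator to Hugging Face ZeroGPU: a GPU is attached only while a spaces.GPU-decorated function runs, so model loading moves out of __init__ (which now only stores parameters and downloads the weights) into the decorated predict via load_model_if_needed. As a minimal sketch of how the updated class might be served, the snippet below wires it into a Gradio app; the estimate_depth wrapper and the gr.Interface setup are illustrative assumptions, not part of this commit.

import gradio as gr
import numpy as np
from monocular_depth_estimator import MonocularDepthEstimator

# Safe at import time on a CPU-only host: __init__ no longer touches the GPU,
# it just stores parameters and downloads the weights if missing.
estimator = MonocularDepthEstimator(model_type="midas_v21_small_256")

def estimate_depth(image: np.ndarray):
    # Hypothetical wrapper (not in this commit). make_prediction carries
    # @spaces.GPU(duration=30), so ZeroGPU attaches a GPU around this call
    # and the model is lazily loaded on first use.
    # make_prediction expects a BGR frame (it flips to RGB internally),
    # while Gradio supplies RGB, so reverse the channels first.
    depthmap, depth_colormap = estimator.make_prediction(image[..., ::-1])
    # process_prediction returns floats in [0, 1] with the colormap in BGR
    # order; convert to uint8 RGB for display.
    return (depth_colormap[..., ::-1] * 255).astype(np.uint8)

demo = gr.Interface(fn=estimate_depth, inputs=gr.Image(), outputs=gr.Image())

if __name__ == "__main__":
    demo.launch()

Serving single images through make_prediction fits ZeroGPU's short GPU windows; the run loop, which relies on cv2.imshow, is better suited to local testing than to a headless Space.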