new
- app.py +135 -0
- data/image_paths.npy +3 -0
- data/labels.npy +3 -0
- models/gaze_estimation_model.pth +3 -0
- requirements.txt +80 -0
- scripts/__init__.py +0 -0
- scripts/gaze_tracker.py +148 -0
- scripts/inference.py +62 -0
- utils/ear_utils.py +64 -0
- utils/preprocess.py +8 -0
app.py
ADDED
@@ -0,0 +1,135 @@
import gradio as gr
import cv2
import numpy as np
import tempfile
import os
from scripts.inference import GazePredictor
from utils.ear_utils import BlinkDetector

def smooth_values(history, current_value, window_size=5):
    if current_value is not None:
        history.append(current_value)
        if len(history) > window_size:
            history.pop(0)
    return np.mean(history, axis=0) if isinstance(current_value, np.ndarray) and history else current_value if current_value is not None else 0

MODEL_PATH = os.path.join("models", "gaze_estimation_model.pth")

def analyze_video(input_video):
    cap = cv2.VideoCapture(input_video)
    gaze_predictor = GazePredictor(MODEL_PATH)
    blink_detector = BlinkDetector()
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    temp_fd, temp_path = tempfile.mkstemp(suffix='.mp4')
    os.close(temp_fd)
    out = None

    GAZE_STABILITY_THRESHOLD = 0.5
    TIME_THRESHOLD = 15
    BLINK_RATE_THRESHOLD = 1
    EYE_CLOSURE_THRESHOLD = 10
    HEAD_STABILITY_THRESHOLD = 0.05

    gaze_history = []
    head_history = []
    ear_history = []
    stable_gaze_time = 0
    stable_head_time = 0
    eye_closed_time = 0
    blink_count = 0
    start_time = 0
    is_unconscious = False

    frame_count = 0
    fps = cap.get(cv2.CAP_PROP_FPS) or 20

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_count += 1
        if start_time == 0:
            start_time = frame_count / fps

        head_pose_gaze, gaze_h, gaze_v = gaze_predictor.predict_gaze(frame)
        current_gaze = np.array([gaze_h, gaze_v])
        smoothed_gaze = smooth_values(gaze_history, current_gaze)

        ear, left_eye, right_eye, head_pose, left_iris, right_iris = blink_detector.detect_blinks(frame)
        if ear is None:
            cv2.putText(frame, "No face detected", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            smoothed_head = smooth_values(head_history, None)
            smoothed_ear = smooth_values(ear_history, None)
        else:
            smoothed_head = smooth_values(head_history, head_pose)
            smoothed_ear = smooth_values(ear_history, ear)
            if smoothed_ear >= blink_detector.EAR_THRESHOLD:
                cv2.drawMarker(frame, left_iris, (0, 255, 0), markerType=cv2.MARKER_CROSS, markerSize=10, thickness=2)
                cv2.drawMarker(frame, right_iris, (0, 255, 0), markerType=cv2.MARKER_CROSS, markerSize=10, thickness=2)

        cv2.putText(frame, f"Gaze H: {smoothed_gaze[0]:.2f}", (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(frame, f"Gaze V: {smoothed_gaze[1]:.2f}", (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(frame, f"Head Pose: {smoothed_head:.2f}", (10, 120), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(frame, f"EAR: {smoothed_ear:.2f}", (10, 150), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        if len(gaze_history) > 1:
            gaze_diff = np.sqrt(np.sum((smoothed_gaze - gaze_history[-2])**2))
            if gaze_diff < GAZE_STABILITY_THRESHOLD:
                if stable_gaze_time == 0:
                    stable_gaze_time = frame_count / fps
            else:
                stable_gaze_time = 0

        if len(head_history) > 1 and head_pose is not None:
            head_diff = abs(smoothed_head - head_history[-2])
            if head_diff < HEAD_STABILITY_THRESHOLD:
                if stable_head_time == 0:
                    stable_head_time = frame_count / fps
            else:
                stable_head_time = 0

        if ear is not None and smoothed_ear < blink_detector.EAR_THRESHOLD:
            if eye_closed_time == 0:
                eye_closed_time = frame_count / fps
            elif (frame_count / fps) - eye_closed_time > EYE_CLOSURE_THRESHOLD:
                cv2.putText(frame, "Eyes Closed", (10, 210), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        else:
            if eye_closed_time > 0 and (frame_count / fps) - eye_closed_time < 0.5:
                blink_count += 1
            eye_closed_time = 0

        elapsed_minutes = ((frame_count / fps) - start_time) / 60 if start_time > 0 else 0
        blink_rate = blink_count / elapsed_minutes if elapsed_minutes > 0 else 0
        cv2.putText(frame, f"Blink Rate: {blink_rate:.1f}/min", (10, 240), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        unconscious_conditions = [
            stable_gaze_time > 0 and (frame_count / fps) - stable_gaze_time > TIME_THRESHOLD,
            blink_rate < BLINK_RATE_THRESHOLD and elapsed_minutes > 1,
            eye_closed_time > 0 and (frame_count / fps) - eye_closed_time > EYE_CLOSURE_THRESHOLD,
            stable_head_time > 0 and (frame_count / fps) - stable_head_time > TIME_THRESHOLD
        ]
        if sum(unconscious_conditions) >= 2:
            cv2.putText(frame, "Unconscious Detected", (10, 270), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            is_unconscious = True
        else:
            is_unconscious = False

        if out is None:
            h, w = frame.shape[:2]
            out = cv2.VideoWriter(temp_path, fourcc, fps, (w, h))
        out.write(frame)
    cap.release()
    if out:
        out.release()
    return temp_path

iface = gr.Interface(
    fn=analyze_video,
    inputs=gr.Video(),
    outputs=gr.Video(),
    title="Gaze Tracker",
    description="Upload a video to analyze gaze and drowsiness."
)

if __name__ == "__main__":
    iface.launch()
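For a quick sanity check outside the Gradio UI, analyze_video can also be called directly from a Python shell. A minimal sketch, assuming the LFS-tracked model has been pulled and that some local clip exists (the sample.mp4 name below is only a placeholder):

    from app import analyze_video

    # Runs the full per-frame pipeline and returns the path to the annotated MP4.
    output_path = analyze_video("sample.mp4")
    print(f"Annotated video written to {output_path}")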
data/image_paths.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:83e055c17ac696ca8a0349e9a0280e93a7f02142c86c1b22a51a16da52a8ae83
size 1670048
data/labels.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:657cc5cfec2c850eee6343f394ac681cd04361b7c455133d161421119cbce12d
size 70688
models/gaze_estimation_model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3b21ca13bee8bd9209ec0e95d3cee3c95f745abf84b88eeb9feb80a9f3316c61
size 94378602
requirements.txt
ADDED
@@ -0,0 +1,80 @@
absl-py==2.2.2
astunparse==1.6.3
attrs==25.3.0
certifi==2025.1.31
cffi==1.17.1
charset-normalizer==3.4.1
contourpy==1.3.2
cycler==0.12.1
filelock==3.18.0
flatbuffers==25.2.10
fonttools==4.57.0
fsspec==2025.3.2
gast==0.6.0
google-pasta==0.2.0
grpcio==1.71.0
h5py==3.13.0
idna==3.10
imutils==0.5.4
jax==0.6.0
jaxlib==0.6.0
Jinja2==3.1.6
keras==3.9.2
kiwisolver==1.4.8
libclang==18.1.1
Markdown==3.8
markdown-it-py==3.0.0
MarkupSafe==3.0.2
matplotlib==3.8.3
mdurl==0.1.2
mediapipe==0.10.21
ml_dtypes==0.5.1
mpmath==1.3.0
namex==0.0.8
networkx==3.4.2
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==8.9.2.26
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.19.3
nvidia-nvjitlink-cu12==12.8.93
nvidia-nvtx-cu12==12.1.105
opencv-contrib-python==4.11.0.86
opencv-python==4.10.0.84
opt_einsum==3.4.0
optree==0.15.0
packaging==25.0
pillow==11.2.1
playsound==1.2.2
protobuf==4.25.6
pycparser==2.22
pygame==2.6.1
Pygments==2.19.1
pyparsing==3.2.3
python-dateutil==2.9.0.post0
requests==2.32.3
rich==14.0.0
scipy==1.15.2
sentencepiece==0.2.0
setuptools==79.0.0
six==1.17.0
sounddevice==0.5.1
sympy==1.13.3
tensorboard==2.19.0
tensorboard-data-server==0.7.2
tensorflow==2.19.0
termcolor==3.0.1
torch==2.2.1
torchvision==0.17.1
typing_extensions==4.13.2
urllib3==2.4.0
Werkzeug==3.1.3
wheel==0.45.1
wrapt==1.17.2
gradio==4.27.0
scripts/__init__.py
ADDED
File without changes
scripts/gaze_tracker.py
ADDED
@@ -0,0 +1,148 @@
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import cv2
import time
import numpy as np
from scripts.inference import GazePredictor
from utils.ear_utils import BlinkDetector
from pygame import mixer

def smooth_values(history, current_value, window_size=5):
    if current_value is not None:
        history.append(current_value)
        if len(history) > window_size:
            history.pop(0)
    return np.mean(history, axis=0) if isinstance(current_value, np.ndarray) and history else current_value if current_value is not None else 0

def track_gaze(model_path):
    gaze_predictor = GazePredictor(model_path)
    blink_detector = BlinkDetector()
    cap = cv2.VideoCapture(0)

    if not cap.isOpened():
        print("Error: Could not open webcam.")
        return

    GAZE_STABILITY_THRESHOLD = 0.5
    TIME_THRESHOLD = 15
    BLINK_RATE_THRESHOLD = 1
    EYE_CLOSURE_THRESHOLD = 10
    HEAD_STABILITY_THRESHOLD = 0.05

    gaze_history = []
    head_history = []
    ear_history = []
    stable_gaze_time = 0
    stable_head_time = 0
    eye_closed_time = 0
    blink_count = 0
    start_time = time.time()
    is_unconscious = False

    # Initialize pygame mixer
    mixer.init()
    ALARM_PATH = os.path.normpath(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "sounds", "alarm.wav")))
    if not os.path.exists(ALARM_PATH):
        print(f"Warning: Alarm sound file not found at {ALARM_PATH}. No sound will play.")

    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to capture frame")
            break

        head_pose_gaze, gaze_h, gaze_v = gaze_predictor.predict_gaze(frame)
        current_gaze = np.array([gaze_h, gaze_v])
        smoothed_gaze = smooth_values(gaze_history, current_gaze)

        ear, left_eye, right_eye, head_pose, left_iris, right_iris = blink_detector.detect_blinks(frame)
        if ear is None:
            cv2.putText(frame, "No face detected", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            print("No face detected")
            smoothed_head = smooth_values(head_history, None)
            smoothed_ear = smooth_values(ear_history, None)
        else:
            print(f"EAR: {ear:.2f}, Head Pose: {head_pose:.2f}, Gaze: [{smoothed_gaze[0]:.2f}, {smoothed_gaze[1]:.2f}]")
            smoothed_head = smooth_values(head_history, head_pose)
            smoothed_ear = smooth_values(ear_history, ear)
            if smoothed_ear >= blink_detector.EAR_THRESHOLD:
                cv2.drawMarker(frame, left_iris, (0, 255, 0), markerType=cv2.MARKER_CROSS, markerSize=10, thickness=2)
                cv2.drawMarker(frame, right_iris, (0, 255, 0), markerType=cv2.MARKER_CROSS, markerSize=10, thickness=2)

        cv2.putText(frame, f"Gaze H: {smoothed_gaze[0]:.2f}", (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(frame, f"Gaze V: {smoothed_gaze[1]:.2f}", (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(frame, f"Head Pose: {smoothed_head:.2f}", (10, 120), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.putText(frame, f"EAR: {smoothed_ear:.2f}", (10, 150), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        if len(gaze_history) > 1:
            gaze_diff = np.sqrt(np.sum((smoothed_gaze - gaze_history[-2])**2))
            print(f"Gaze Diff: {gaze_diff:.2f}")
            if gaze_diff < GAZE_STABILITY_THRESHOLD:
                if stable_gaze_time == 0:
                    stable_gaze_time = time.time()
                elif time.time() - stable_gaze_time > TIME_THRESHOLD:
                    cv2.putText(frame, "Gaze Fixed", (10, 180), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            else:
                stable_gaze_time = 0

        if len(head_history) > 1 and head_pose is not None:
            head_diff = abs(smoothed_head - head_history[-2])
            print(f"Head Diff: {head_diff:.2f}")
            if head_diff < HEAD_STABILITY_THRESHOLD:
                if stable_head_time == 0:
                    stable_head_time = time.time()
            else:
                stable_head_time = 0

        if ear is not None and smoothed_ear < blink_detector.EAR_THRESHOLD:
            if eye_closed_time == 0:
                eye_closed_time = time.time()
            elif time.time() - eye_closed_time > EYE_CLOSURE_THRESHOLD:
                cv2.putText(frame, "Eyes Closed", (10, 210), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        else:
            if eye_closed_time > 0 and time.time() - eye_closed_time < 0.5:
                blink_count += 1
            eye_closed_time = 0

        elapsed_minutes = (time.time() - start_time) / 60
        blink_rate = blink_count / elapsed_minutes if elapsed_minutes > 0 else 0
        cv2.putText(frame, f"Blink Rate: {blink_rate:.1f}/min", (10, 240), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        unconscious_conditions = [
            stable_gaze_time > 0 and time.time() - stable_gaze_time > TIME_THRESHOLD,
            blink_rate < BLINK_RATE_THRESHOLD and elapsed_minutes > 1,
            eye_closed_time > 0 and time.time() - eye_closed_time > EYE_CLOSURE_THRESHOLD,
            stable_head_time > 0 and time.time() - stable_head_time > TIME_THRESHOLD
        ]
        print(f"Conditions: {unconscious_conditions}")
        if sum(unconscious_conditions) >= 2:
            if not is_unconscious and os.path.exists(ALARM_PATH):
                print(f"Attempting to play alarm at {ALARM_PATH}")
                try:
                    mixer.music.load(ALARM_PATH)
                    mixer.music.play()
                except Exception as e:
                    print(f"Error playing alarm sound: {e}")
            print("Unconscious detected!")
            cv2.putText(frame, "Unconscious Detected", (10, 270), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            is_unconscious = True
        else:
            is_unconscious = False

        cv2.imshow("Gaze Tracking", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()
    mixer.quit()

if __name__ == "__main__":
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
    MODEL_PATH = os.path.join(SCRIPT_DIR, "..", "models", "gaze_estimation_model.pth")
    if not os.path.exists(MODEL_PATH):
        print(f"Error: Missing model file at {MODEL_PATH}")
        sys.exit(1)
    track_gaze(MODEL_PATH)
scripts/inference.py
ADDED
@@ -0,0 +1,62 @@
import torch
import numpy as np
import cv2
from torchvision import transforms, models
from utils.preprocess import preprocess_frame

class GazeEstimationModel(torch.nn.Module):
    def __init__(self):
        super(GazeEstimationModel, self).__init__()
        # Initialize ResNet-50 as the backbone
        self.backbone = models.resnet50(pretrained=False)
        # Modify the final fully connected layer for 3 outputs (head_pose, gaze_h, gaze_v)
        self.backbone.fc = torch.nn.Linear(self.backbone.fc.in_features, 3)

    def forward(self, x):
        return self.backbone(x)

class GazePredictor:
    def __init__(self, model_path):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Initialize the custom model
        self.model = GazeEstimationModel()

        # Load the state dictionary
        state_dict = torch.load(model_path, map_location=self.device)

        # Check if state_dict has 'backbone.' prefix and strip it if necessary
        new_state_dict = {}
        for key, value in state_dict.items():
            new_key = key.replace("backbone.", "")  # Remove 'backbone.' prefix
            new_state_dict[new_key] = value

        # Load the adjusted state dictionary into the model
        try:
            self.model.backbone.load_state_dict(new_state_dict)
        except RuntimeError as e:
            print("Error loading state dict directly:", e)
            print("Trying to load state dict with strict=False...")
            self.model.backbone.load_state_dict(new_state_dict, strict=False)

        # Move to device and set to evaluation mode
        self.model.to(self.device)
        self.model.eval()

        # Define preprocessing transform
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def predict_gaze(self, frame):
        preprocessed = preprocess_frame(frame)
        preprocessed = preprocessed[0]
        preprocessed = self.transform(preprocessed).float().unsqueeze(0)
        preprocessed = preprocessed.to(self.device)
        with torch.no_grad():
            outputs = self.model(preprocessed)
        outputs = outputs.cpu().numpy()[0]
        print("Model outputs:", outputs)  # Debug print
        head_pose, gaze_h, gaze_v = outputs
        return head_pose, gaze_h, gaze_v
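GazePredictor bundles model construction, checkpoint loading, and per-frame inference, so it can also be exercised on its own. A minimal sketch, assuming the repository root is on the import path and that you substitute a frame of your own (frame.jpg below is only a placeholder):

    import cv2
    from scripts.inference import GazePredictor

    predictor = GazePredictor("models/gaze_estimation_model.pth")
    frame = cv2.imread("frame.jpg")  # any BGR image, e.g. a frame grabbed from cv2.VideoCapture
    head_pose, gaze_h, gaze_v = predictor.predict_gaze(frame)
    print(f"head pose {head_pose:.2f}, gaze ({gaze_h:.2f}, {gaze_v:.2f})")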
utils/ear_utils.py
ADDED
@@ -0,0 +1,64 @@
import cv2
import numpy as np
import mediapipe as mp
from scipy.spatial import distance as dist

def eye_aspect_ratio(eye_landmarks, landmarks, image_shape):
    eye = [
        landmarks[eye_landmarks[0]],  # P1 (left)
        landmarks[eye_landmarks[1]],  # P2 (top-left)
        landmarks[eye_landmarks[2]],  # P3 (top-right)
        landmarks[eye_landmarks[3]],  # P4 (right)
        landmarks[eye_landmarks[4]],  # P5 (bottom-right)
        landmarks[eye_landmarks[5]]   # P6 (bottom-left)
    ]
    eye = [(int(p.x * image_shape[1]), int(p.y * image_shape[0])) for p in eye]

    A = dist.euclidean(eye[1], eye[5])
    B = dist.euclidean(eye[2], eye[4])
    C = dist.euclidean(eye[0], eye[3])
    ear = (A + B) / (2.0 * C)
    return ear, eye

class BlinkDetector:
    def __init__(self):
        self.mp_face_mesh = mp.solutions.face_mesh
        self.face_mesh = self.mp_face_mesh.FaceMesh(
            max_num_faces=1,
            refine_landmarks=True,  # Required for iris landmarks
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )
        self.EAR_THRESHOLD = 0.25
        self.EAR_CONSEC_FRAMES = 3

    def detect_blinks(self, frame):
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.face_mesh.process(image_rgb)

        if not results.multi_face_landmarks:
            return None, None, None, None, None, None

        landmarks = results.multi_face_landmarks[0].landmark
        h, w = frame.shape[:2]

        LEFT_EYE = [33, 160, 158, 133, 153, 144]
        RIGHT_EYE = [362, 385, 387, 263, 373, 380]
        LEFT_IRIS = 473  # Left iris center
        RIGHT_IRIS = 468  # Right iris center

        left_ear, left_eye_points = eye_aspect_ratio(LEFT_EYE, landmarks, (h, w))
        right_ear, right_eye_points = eye_aspect_ratio(RIGHT_EYE, landmarks, (h, w))
        avg_ear = (left_ear + right_ear) / 2.0

        nose_tip = landmarks[1]
        head_pose = (nose_tip.x - 0.5) * 2

        # Iris coordinates
        left_iris = (int(landmarks[LEFT_IRIS].x * w), int(landmarks[LEFT_IRIS].y * h))
        right_iris = (int(landmarks[RIGHT_IRIS].x * w), int(landmarks[RIGHT_IRIS].y * h))

        return avg_ear, left_eye_points, right_eye_points, head_pose, left_iris, right_iris

    def __del__(self):
        self.face_mesh.close()
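eye_aspect_ratio follows the usual eye-aspect-ratio formulation (Soukupová and Čech, 2016): with the six landmarks ordered P1..P6 as in the comments above, EAR = (||P2 - P6|| + ||P3 - P5||) / (2 * ||P1 - P4||), which is exactly the A, B, and C distances in the code. The ratio sits roughly around 0.3 for an open eye and collapses toward zero as the lids close; for example, vertical gaps of about 2 px each against an 8 px eye width give (2 + 2) / (2 * 8) = 0.25, the EAR_THRESHOLD used by BlinkDetector.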
utils/preprocess.py
ADDED
@@ -0,0 +1,8 @@
import cv2
import numpy as np
from tensorflow.keras.preprocessing.image import img_to_array

def preprocess_frame(frame, target_size=(224, 224)):
    frame = cv2.resize(frame, target_size)
    frame = img_to_array(frame) / 255.0
    return np.expand_dims(frame, axis=0)