show top 5 result
app.py CHANGED
@@ -2,14 +2,16 @@ import spaces # Import spaces immediately for HF ZeroGPU support.
 import os
 import cv2
 import torch
-import
+import gradio as gr
 import numpy as np
+import matplotlib.pyplot as plt
+from io import BytesIO
 from PIL import Image
-
+
 from transformers import AutoFeatureExtractor, AutoModelForVideoClassification
 
 # Specify the model checkpoint for TimeSformer.
-MODEL_NAME = "
+MODEL_NAME = "facebook/timesformer-base-finetuned-k400"
 
 def extract_frames(video_path, num_frames=16, target_size=(224, 224)):
     """
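The body of `extract_frames` (the unchanged lines between this hunk and the next) is outside the diff. For context, here is a minimal sketch of how such a helper is commonly written, assuming uniform frame sampling with OpenCV and the signature shown above; it is an illustration, not the Space's actual implementation:

```python
import cv2
import numpy as np

def extract_frames(video_path, num_frames=16, target_size=(224, 224)):
    """Uniformly sample `num_frames` RGB frames from a video and resize them."""
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total <= 0:
        cap.release()
        return []
    # Pick evenly spaced frame indices across the whole clip.
    indices = np.linspace(0, total - 1, num_frames, dtype=int)
    frames = []
    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ok, frame = cap.read()
        if not ok:
            continue
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV decodes as BGR
        frame = cv2.resize(frame, target_size)
        frames.append(frame)
    cap.release()
    return frames
```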
@@ -40,21 +42,23 @@ def extract_frames(video_path, num_frames=16, target_size=(224, 224)):
 def classify_video(video_path):
     """
     Loads the TimeSformer model and feature extractor inside the GPU context,
-    extracts frames from the video, runs inference, and returns
+    extracts frames from the video, runs inference, and returns:
+      1. A text string of the top 5 predicted action labels with their class IDs and probabilities.
+      2. A bar chart image showing the distribution over the top predictions.
     """
     # Load the feature extractor and model inside the GPU context.
     feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
     model = AutoModelForVideoClassification.from_pretrained(MODEL_NAME)
     model.eval()
-
+
     # Determine the device.
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     model.to(device)
 
-    # Extract frames from the video
+    # Extract frames from the video.
     frames = extract_frames(video_path, num_frames=16, target_size=(224, 224))
     if len(frames) == 0:
-        return "No frames extracted from video."
+        return "No frames extracted from video.", None
 
     # Preprocess the frames.
     inputs = feature_extractor(frames, return_tensors="pt")
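The unchanged lines between this hunk and the next are not shown; they presumably move the preprocessed batch onto the model's device before inference, and on a ZeroGPU Space the function as a whole would normally sit behind the `@spaces.GPU` decorator (which the `import spaces` comment at the top of the file suggests). A small sketch of the device-move step, with `move_to_device` as a hypothetical helper name:

```python
import torch

def move_to_device(inputs, device):
    # The feature extractor returns a dict-like batch (e.g. {"pixel_values": tensor}),
    # so each tensor is moved onto the same device as the model before the forward pass.
    return {k: v.to(device) for k, v in inputs.items()}

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    batch = {"pixel_values": torch.randn(1, 16, 3, 224, 224)}  # 16 frames of 3x224x224
    print({k: v.device for k, v in move_to_device(batch, device).items()})
```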
@@ -64,8 +68,8 @@ def classify_video(video_path):
     with torch.no_grad():
         outputs = model(**inputs)
 
-    #
-    logits = outputs.logits  # shape: [batch_size, num_classes] with batch_size=1
+    # Get logits and compute probabilities.
+    logits = outputs.logits  # shape: [batch_size, num_classes] with batch_size=1.
     probs = torch.nn.functional.softmax(logits, dim=-1)[0]
 
     # Get the top 5 predictions.
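The line that actually picks the top 5 sits just outside this hunk; it is presumably a `torch.topk` call over the softmaxed probabilities. A runnable sketch of that step, using a dummy logits tensor with the 400 Kinetics classes:

```python
import torch

# Softmax the (1, num_classes) logits, drop the batch dimension, then take the top 5.
probs = torch.nn.functional.softmax(torch.randn(1, 400), dim=-1)[0]
top_probs, top_indices = torch.topk(probs, k=5)
print(top_probs.cpu().numpy(), top_indices.cpu().numpy())
```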
@@ -73,31 +77,52 @@
     top_probs = top_probs.cpu().numpy()
     top_indices = top_indices.cpu().numpy()
 
-    # Retrieve the label mapping from
+    # Retrieve the label mapping from model config.
     id2label = model.config.id2label if hasattr(model.config, "id2label") else {}
+
+    # Prepare textual results showing both ID and label.
     results = []
+    x_labels = []
     for idx, prob in zip(top_indices, top_probs):
         label = id2label.get(str(idx), f"Class {idx}")
-        results.append(f"{label}: {prob:.3f}")
+        results.append(f"ID {idx} - {label}: {prob:.3f}")
+        x_labels.append(f"ID {idx}\n{label}")
+    results_text = "\n".join(results)
+
+    # Create a bar chart for the distribution.
+    fig, ax = plt.subplots(figsize=(8, 4))
+    ax.bar(x_labels, top_probs, color="skyblue")
+    ax.set_ylabel("Probability")
+    ax.set_title("Top 5 Prediction Distribution")
+    plt.xticks(rotation=45, ha="right")
+    plt.tight_layout()
+
+    buf = BytesIO()
+    plt.savefig(buf, format="png")
+    buf.seek(0)
+    plt.close(fig)
 
-    return
+    return results_text, buf
 
 def process_video(video_file):
     if video_file is None:
-        return "No video provided."
-
-    return
+        return "No video provided.", None
+    result_text, plot_img = classify_video(video_file)
+    return result_text, plot_img
 
 # Gradio interface definition.
 demo = gr.Interface(
     fn=process_video,
-    inputs=gr.Video(
-    outputs=
+    inputs=gr.Video(source="upload", label="Upload Video Clip"),
+    outputs=[
+        gr.Textbox(label="Predicted Actions"),
+        gr.Image(label="Prediction Distribution")
+    ],
     title="Video Human Detection Demo using TimeSformer",
     description=(
         "Upload a video clip to see the top predicted human action labels using the TimeSformer model "
-        "(fine-tuned on Kinetics-400).
-        "
+        "(fine-tuned on Kinetics-400). The output shows each prediction along with its class ID and probability, "
+        "and a bar chart displays the distribution of the top 5 predictions."
    )
 )
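Two robustness notes on the new code in this hunk, both hedged because they depend on library versions. First, `model.config.id2label` usually has integer keys in recent transformers releases, so the `id2label.get(str(idx), ...)` lookup may always fall through to the `Class {idx}` default; trying both key types is safer. Second, `gr.Image` reliably accepts PIL images, numpy arrays, or file paths, and handing it the raw `BytesIO` buffer may not render on every Gradio version. The helpers below (`lookup_label`, `figure_to_pil`) are hypothetical names used only for illustration:

```python
from io import BytesIO

import matplotlib.pyplot as plt
from PIL import Image

def lookup_label(id2label, idx):
    # Try integer keys first (how transformers normally stores id2label),
    # then string keys, before falling back to a generic class name.
    return id2label.get(int(idx), id2label.get(str(idx), f"Class {int(idx)}"))

def figure_to_pil(fig):
    # Render the Matplotlib figure to an in-memory PNG and decode it with PIL,
    # which gr.Image accepts across Gradio versions.
    buf = BytesIO()
    fig.savefig(buf, format="png")
    plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
```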
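Two things the diff does not settle: the Gradio version and how the app is launched. `gr.Video(source="upload")` is Gradio 3.x syntax; on Gradio 4.x the parameter is `sources` and takes a list. The diff also ends before any `demo.launch()` call, which a Space's app.py normally needs. A standalone stub, assuming Gradio 4.x, with a `lambda` standing in for the real `process_video`:

```python
import gradio as gr

demo = gr.Interface(
    fn=lambda video: "No video provided." if video is None else f"Received: {video}",
    inputs=gr.Video(sources=["upload"], label="Upload Video Clip"),  # Gradio 4.x: `sources`, not `source`
    outputs=gr.Textbox(label="Predicted Actions"),
    title="Video Human Detection Demo using TimeSformer",
)

if __name__ == "__main__":
    demo.launch()
```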