import spaces
import os
import cv2
import torch
import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from io import BytesIO
from PIL import Image

from transformers import AutoFeatureExtractor, AutoModelForVideoClassification

# TimeSformer checkpoints on the Hugging Face Hub are published under the "facebook" organization.
MODEL_NAME = "facebook/timesformer-base-finetuned-k400"
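# The Kinetics-400 checkpoint classifies clips into 400 human-action categories;
# classify_video below reports the five most probable ones.
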
def extract_frames(video_path, num_frames=16, target_size=(224, 224)):
    """
    Extract up to `num_frames` uniformly-sampled frames from the video.
    If the video has fewer frames, all frames are returned.
    """
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frames = []
    if total_frames <= 0:
        cap.release()
        return frames
    indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    current_frame = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if current_frame in indices:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, target_size)
            frames.append(Image.fromarray(frame))
        current_frame += 1
    cap.release()
    return frames

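# Note: on Hugging Face ZeroGPU Spaces, a GPU is attached only while a
# @spaces.GPU-decorated function is running. That is why the model and feature
# extractor are loaded inside classify_video (and therefore re-loaded on each
# request) rather than once at import time.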
@spaces.GPU
def classify_video(video_path):
    """
    Loads the TimeSformer model and feature extractor inside the GPU context,
    extracts frames from the video, runs inference, and returns:
      1. A text string of the top 5 predicted action labels with their class IDs and probabilities.
      2. A bar chart image showing the distribution over the top predictions.
    """
    feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
    model = AutoModelForVideoClassification.from_pretrained(MODEL_NAME)
    model.eval()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    frames = extract_frames(video_path, num_frames=16, target_size=(224, 224))
    if len(frames) == 0:
        return "No frames extracted from video.", None

    # Preprocess the sampled frames and move the tensors to the model's device.
    inputs = feature_extractor(frames, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Convert logits to a probability distribution over the 400 Kinetics classes.
    logits = outputs.logits
    probs = torch.nn.functional.softmax(logits, dim=-1)[0]

    top_probs, top_indices = torch.topk(probs, k=5)
    top_probs = top_probs.cpu().numpy()
    top_indices = top_indices.cpu().numpy()

    # `id2label` keys are integers in the loaded config, so look up with int(idx).
    id2label = model.config.id2label if hasattr(model.config, "id2label") else {}

    results = []
    x_labels = []
    for idx, prob in zip(top_indices, top_probs):
        label = id2label.get(int(idx), f"Class {idx}")
        results.append(f"ID {idx} - {label}: {prob:.3f}")
        x_labels.append(f"ID {idx}\n{label}")
    results_text = "\n".join(results)

    # Render the top-5 distribution as a bar chart and convert it to a PIL image for gr.Image.
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.bar(x_labels, top_probs, color="skyblue")
    ax.set_ylabel("Probability")
    ax.set_title("Top 5 Prediction Distribution")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()

    buf = BytesIO()
    fig.savefig(buf, format="png")
    plt.close(fig)
    buf.seek(0)
    plot_image = Image.open(buf).convert("RGB")

    return results_text, plot_image

def process_video(video_file):
    if video_file is None:
        return "No video provided.", None
    result_text, plot_img = classify_video(video_file)
    return result_text, plot_img

demo = gr.Interface(
    fn=process_video,
    inputs=gr.Video(label="Upload Video Clip"),
    outputs=[
        gr.Textbox(label="Predicted Actions"),
        gr.Image(label="Prediction Distribution"),
    ],
    title="Video Human Action Recognition Demo using TimeSformer",
    description=(
        "Upload a video clip to see the top predicted human action labels using the TimeSformer model "
        "(fine-tuned on Kinetics-400). The output shows each prediction along with its class ID and probability, "
        "and a bar chart displays the distribution of the top 5 predictions."
    ),
)

if __name__ == "__main__":
    demo.launch()
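
# To try the demo outside of Hugging Face Spaces, a typical local setup might be
# (package names are the usual pip distributions; versions are not pinned here):
#   pip install spaces gradio transformers torch opencv-python pillow numpy matplotlib
# then launch with `python app.py` (assuming this file is saved as app.py).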