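# Page-header text captured together with this file: "Spaces: Sleeping / Sleeping".
# NOTE(review): this looks like hosting-page status residue rather than program code - confirm and drop if so.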
import json
import os
from pathlib import Path
from typing import Any, Dict, List

import numpy as np
import pandas as pd
import streamlit as st
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
class VideoRetrieval:
    """Text-to-clip retrieval over pre-computed video feature matrices.

    The query text is embedded with a sentence-transformer model, compared by
    cosine similarity against every matrix in ``self.features``, and the
    per-feature scores are blended using ``FEATURE_WEIGHTS``.
    """

    # Blend weight applied to the cosine similarity of each feature type.
    FEATURE_WEIGHTS: Dict[str, float] = {
        'visual_features': 0.4,
        'scene_features': 0.3,
        'object_features': 0.3,
    }

    def __init__(self):
        """Load the text encoder, then the feature matrices and clip metadata."""
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.load_data()

    def load_data(self) -> None:
        """Populate ``self.features`` and ``self.clips_df`` from disk.

        The paths below are placeholders; in practice they would point at the
        actual pre-computed feature files.
        """
        # Load pre-computed features, keyed by feature type.
        self.features = {
            'visual_features': np.load('path_to_visual_features.npy'),
            'scene_features': np.load('path_to_scene_features.npy'),
            'object_features': np.load('path_to_object_features.npy'),
        }
        # Load clip metadata.
        # NOTE(review): retrieve_clips assumes row i of this table describes
        # row i of every feature matrix - TODO confirm against the data files.
        self.clips_df = pd.read_csv('clips_metadata.csv')

    def encode_query(self, query_text: str) -> np.ndarray:
        """Encode the text query into an embedding with the text model."""
        return self.text_model.encode(query_text)

    def compute_similarity(
        self,
        query_embedding: np.ndarray,
        feature_type: str = 'visual_features',
    ) -> np.ndarray:
        """Compute cosine similarity between the query and one feature matrix.

        Args:
            query_embedding: Query vector; it is reshaped to a single row.
            feature_type: Key into ``self.features`` selecting the matrix.

        Returns:
            One similarity score per row of the selected feature matrix.

        Raises:
            KeyError: If ``feature_type`` is not a key of ``self.features``.
        """
        # NOTE(review): requires the feature matrix to have as many columns as
        # the query embedding has entries - TODO confirm for the real files.
        similarities = cosine_similarity(
            query_embedding.reshape(1, -1),
            self.features[feature_type],
        )
        # The query is a single row, so the result has exactly one row.
        return similarities[0]

    def retrieve_clips(self, query_text: str, top_k: int = 3) -> List[Dict[str, Any]]:
        """Retrieve the top-k most relevant clips for a text query.

        Args:
            query_text: Natural-language description of the wanted scene.
            top_k: Maximum number of clips to return. Values of zero or below
                yield an empty list.

        Returns:
            A list of dicts with keys ``clip_id``, ``movie_title``,
            ``description``, ``timestamp`` and ``similarity_score``, ordered
            from highest to lowest combined score.
        """
        # Guard first: a slice of [-0:] would select *every* clip, and a
        # negative top_k would select an arbitrary tail of the ranking.
        if top_k <= 0:
            return []

        query_embedding = self.encode_query(query_text)

        # Weighted sum of the per-feature similarity vectors.
        combined_similarities = sum(
            self.compute_similarity(query_embedding, feat_type) * weight
            for feat_type, weight in self.FEATURE_WEIGHTS.items()
        )

        # argsort is ascending: take the last top_k entries and reverse them
        # so the best match comes first.
        top_indices = np.argsort(combined_similarities)[-top_k:][::-1]

        results = []
        for idx in top_indices:
            row = self.clips_df.iloc[idx]  # fetch the metadata row once
            results.append({
                'clip_id': row['clip_id'],
                'movie_title': row['movie_title'],
                'description': row['description'],
                'timestamp': row['timestamp'],
                'similarity_score': combined_similarities[idx],
            })
        return results
# Streamlit UI | |
def main():
    """Render the Streamlit page: search form, ranked results and sidebar.

    The retrieval system is created once and kept in ``st.session_state`` so
    it is reused instead of being rebuilt each time this function runs.
    """
    st.title("Movie Scene Retrieval System")
    st.write("""
    Search for movie scenes using natural language descriptions.
    The system will retrieve the most relevant 2-3 minute clips based on your query.
    """)

    # Initialize the retrieval system only if this session has none yet.
    if "retrieval_system" not in st.session_state:
        st.session_state.retrieval_system = VideoRetrieval()
    retrieval_system = st.session_state.retrieval_system

    # Search interface
    query = st.text_input(
        "Enter your scene description:",
        "A dramatic confrontation between two characters in a dark room",
    )
    num_results = st.slider("Number of results to show:", min_value=1, max_value=5, value=3)

    if st.button("Search"):
        with st.spinner("Searching for relevant clips..."):
            results = retrieval_system.retrieve_clips(query, top_k=num_results)

        for i, result in enumerate(results, 1):
            st.subheader(f"Result {i}: {result['movie_title']}")
            col1, col2 = st.columns([2, 1])
            with col1:
                st.write("**Scene Description:**")
                st.write(result['description'])
                st.write(f"**Timestamp:** {result['timestamp']}")
            with col2:
                st.write("**Similarity Score:**")
                # st.progress only accepts floats in [0.0, 1.0], while a
                # weighted cosine score can be negative; clamp so a poor
                # match shows an empty bar instead of raising.
                score = float(result['similarity_score'])
                st.progress(min(max(score, 0.0), 1.0))
            # In practice, you would have a way to play the video clip here
            st.write("---")

    # Additional features
    with st.sidebar:
        st.header("About")
        st.write("""
        This system uses pre-computed visual features from several expert models to retrieve
        relevant movie clips based on natural language descriptions. Features include:
        - Visual scene understanding
        - Character interaction analysis
        - Object detection
        - Action recognition
        """)
        st.header("Feature Weights")
        st.write("Current weights used for similarity computation:")
        # NOTE(review): these percentages are written out by hand; keep them
        # in step with the weights used in VideoRetrieval.retrieve_clips.
        st.write("- Visual Features: 40%")
        st.write("- Scene Features: 30%")
        st.write("- Object Features: 30%")
# Launch the UI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
# Requirements.txt
# NOTE(review): every package below is listed twice, once pinned and once
# unpinned. Keep only one form when copying this into requirements.txt -
# TODO confirm which set is intended.
'''
streamlit==1.22.0
pandas==1.5.3
numpy==1.23.5
sentence-transformers==2.2.2
scikit-learn==1.2.2
torch==2.0.0
streamlit
pandas
numpy
sentence-transformers
scikit-learn
torch
'''