# app.py — committed by Sambhavnoobcoder ("Upload app.py", commit 936e46b, 6.49 kB).
import gradio as gr
import numpy as np
import pandas as pd
import torch
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
def load_model():
    """
    Load the sentence-embedding model and place it on the available device.
    :return: The "sentence-transformers/all-MiniLM-L6-v2" SentenceTransformer,
        moved to "cuda" when torch reports CUDA as available, otherwise "cpu".
    """
    # Prefer the GPU whenever torch can see one.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return encoder.to(device)
def encode_and_calculate_similarity(model):
    """
    Embed the "soup" text of every row in df_merged and compare all rows pairwise.
    :param model: An object whose ``encode`` method accepts a list of strings.
    :return: The output of ``cosine_similarity`` applied to the embeddings.
    """
    soups = df_merged["soup"].tolist()
    embeddings = model.encode(soups)
    return cosine_similarity(embeddings)
def svd():
    """
    Train the SVD collaborative-filtering model on the module-level df_ratings.

    Prints 5-fold cross-validation RMSE/MAE, then fits the model on the
    complete rating set.
    :return: The SVD model fitted on all ratings.
    """
    ratings = df_ratings[["userId", "movieId", "rating"]]
    # Reader() assumes a 1-5 scale by default. Take the bounds from the data
    # instead so predictions are clipped to the range the ratings actually
    # span (for example half-star ratings below 1).
    rating_scale = (
        float(ratings["rating"].min()),
        float(ratings["rating"].max()),
    )
    reader = Reader(rating_scale=rating_scale)
    data = Dataset.load_from_df(ratings, reader)
    algo = SVD()
    # Diagnostic only: the fold scores are printed (verbose=True) and the
    # return value is discarded.
    cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)
    # The model that is returned is trained on every rating, not on a fold.
    trainset = data.build_full_trainset()
    algo.fit(trainset)
    return algo
def get_sorted_movie_indices(title: str, cos_sim: np.ndarray) -> "list[int] | None":
    """
    Retrieve the indices of all other movies, most similar first, for a given movie.
    :param title: The title of the movie to find similar movies for (matched case-insensitively
        against the module-level ``movie_indices`` lookup).
    :param cos_sim: The cosine similarity matrix of movies.
    :return: A list of movie indices sorted by descending similarity, excluding the queried
        movie itself, or None (after printing a message) when the title cannot be found.
    """
    # A non-string title (e.g. None from an empty input) is treated like an unknown title.
    key = title.lower() if isinstance(title, str) else None
    if key is None or key not in movie_indices.index:
        print(f"Movie '{title}' not found. Please enter a valid movie title.")
        return None

    movie_index = movie_indices.loc[key]
    # If there are multiple movies with the same title the lookup yields a Series;
    # pick the first one by position (the Series is labelled by title, not by 0..n).
    if isinstance(movie_index, pd.Series):
        movie_index = movie_index.iloc[0]
    movie_index = int(movie_index)

    # Pairwise similarity scores of all movies with that movie.
    similarities = np.asarray(cos_sim[movie_index])
    # Stable sort on the negated scores gives descending similarity with ties kept in
    # ascending index order.
    ranked = np.argsort(-similarities, kind="stable")
    # Drop the queried movie explicitly: it is not guaranteed to be ranked first when
    # another movie has an identical similarity score.
    return ranked[ranked != movie_index].tolist()
def get_qualified_movies(
    df: pd.DataFrame,
    df_qualified: pd.DataFrame,
    sorted_movie_indices: "list[int] | None",
) -> pd.DataFrame:
    """
    Keep only the movies whose id appears in the qualified movies chart.
    :param df: The DataFrame containing movie details.
    :param df_qualified: The DataFrame containing qualified movie details (its "id" column is used).
    :param sorted_movie_indices: Row labels of ``df`` sorted by similarity scores. None or an
        empty list yields an empty result.
    :return: A new Pandas DataFrame with the detail columns of the qualified movies, in the
        order given by ``sorted_movie_indices``.
    """
    movie_details = [
        "id",
        "title",
        "genres",
        "original_language",
        "production_countries",
        "release_date",
        "runtime",
    ]
    # An unknown title upstream produces None; treat it as "no candidates"
    # rather than failing inside the label lookup.
    if sorted_movie_indices is None:
        sorted_movie_indices = []
    sorted_movies = df.loc[sorted_movie_indices, movie_details]
    is_qualified = sorted_movies["id"].isin(df_qualified["id"])
    # Explicit copy so callers can add columns to the result without pandas
    # flagging it as an assignment on a slice of ``df``.
    return sorted_movies[is_qualified].copy()
def predict_user_rating(
    userId: int, qualified_movies: pd.DataFrame, indices_map: pd.DataFrame
) -> pd.DataFrame:
    """
    Predict the user rating for qualified movies using SVD and return the sorted DataFrame.
    :param userId: The ID of the user.
    :param qualified_movies: A Pandas DataFrame containing qualified movies data (needs an "id" column).
        It is not modified.
    :param indices_map: A Pandas DataFrame indexed by the same ids, with a "movieId" column holding
        the id passed to the SVD model.
    :return: A new Pandas DataFrame with a "predicted_user_rating" column added, sorted by that
        column in descending order.
    """
    # Pull the lookup column out once instead of building a full row per movie.
    movie_ids = indices_map["movieId"]
    # If an id occurs more than once keep its first row, so every lookup
    # yields a single value rather than a Series.
    movie_ids = movie_ids[~movie_ids.index.duplicated(keep="first")]

    def estimate(row_id):
        # ``svd`` is the fitted module-level model.
        return round(svd.predict(userId, movie_ids.loc[row_id]).est, 2)

    predicted = qualified_movies["id"].apply(estimate)
    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    rated_movies = qualified_movies.assign(predicted_user_rating=predicted)
    return rated_movies.sort_values(by=["predicted_user_rating"], ascending=False)
def get_movie_recommendations_hybrid(title: str, userId: int) -> pd.DataFrame:
    """
    Get movie recommendations based on a given title and user ID.
    :param title: The title of the movie to find similar movies for.
    :param userId: The ID of the user.
    :return: A Pandas DataFrame containing the recommended movies (at most 5 rows) with
        display-friendly column names; empty when the title is not found.
    """
    # Map internal column names to display names by name rather than by position.
    display_names = {
        "id": "ID",
        "title": "Title",
        "genres": "Genres",
        "original_language": "Language",
        "production_countries": "Production Countries",
        "release_date": "Release Date",
        "runtime": "Runtime",
        "predicted_user_rating": "Predicted User Rating",
    }
    # Get recommended movie indices based on the given title
    sorted_movie_indices = get_sorted_movie_indices(title, cos_sim)
    if sorted_movie_indices is None:
        # Unknown title: the lookup has already printed a message. Return an
        # empty table instead of failing later inside DataFrame indexing.
        return pd.DataFrame(columns=list(display_names.values()))
    # Filter out bad movies and select the top 50 qualified movies.
    # Copy so that adding the prediction column never touches a slice of df_merged.
    qualified_movies = (
        get_qualified_movies(df_merged, df_qualified, sorted_movie_indices)
        .head(50)
        .copy()
    )
    # Predict user ratings for qualified movies and select the top recommended movies
    recommended_movies = predict_user_rating(
        userId, qualified_movies, indices_map
    ).head(5)
    return recommended_movies.rename(columns=display_names)
if __name__ == "__main__":
    # --- Data -------------------------------------------------------------
    df_qualified = pd.read_csv("data/qualified_movies.csv")
    df_ratings = pd.read_csv("data/ratings_small.csv")
    df_merged = pd.read_csv("data/df_merged.csv")

    # --- Content-based side: embeddings and pairwise similarity ------------
    model = load_model()
    cos_sim = encode_and_calculate_similarity(model)

    # Lower-cased title -> row label of df_merged, used for title lookups.
    lowercase_titles = df_merged["title"].apply(lambda name: name.lower())
    movie_indices = pd.Series(df_merged.index, index=lowercase_titles).drop_duplicates()

    # --- Collaborative side --------------------------------------------------
    # From here on the name ``svd`` refers to the fitted model, not the
    # training function; predict_user_rating reads it as a global.
    svd = svd()
    indices_map = df_merged.set_index("id")

    # --- User interface ----------------------------------------------------
    title_choices = df_merged["title"].unique().tolist()
    example_titles = [
        "Captain America: The First Avenger",
        "The Conjuring",
        "Toy Story",
        "Final Destination 5",
    ]

    with gr.Blocks(theme=gr.themes.Soft(text_size="lg")) as demo:
        gr.Markdown("\n# Movie Recommendation System\n")
        title = gr.Dropdown(
            choices=title_choices,
            label="Movie Title",
            value="Iron Man",
        )
        user_id = gr.Number(
            value=1,
            label="User ID",
            info="Please enter a number between 1 and 671!",
        )
        recommend_button = gr.Button("Get Movie Recommendations")
        recommended_movies = gr.DataFrame(label="Movie Recommendations")
        # Wire the button to the hybrid recommender.
        recommend_button.click(
            get_movie_recommendations_hybrid,
            inputs=[title, user_id],
            outputs=recommended_movies,
        )
        examples = gr.Examples(examples=example_titles, inputs=[title])

    demo.launch()