import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os
from huggingface_hub import hf_hub_download
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
repo_id = "Navanihk/recommendationsystemmovie"
cache_dir = '/tmp/hf_cache'
os.makedirs(cache_dir, exist_ok=True)
def stemmed_tokenizer(text):
    ps = PorterStemmer()
    words = word_tokenize(text)
    return [ps.stem(word) for word in words]
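# word_tokenize relies on NLTK's 'punkt' tokenizer data. The guard below is a
# minimal bootstrap sketch, not part of the original code; it assumes that
# downloading the data at import time is acceptable in this environment.
import nltk
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)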
# Initialize an empty dictionary to store user history
user_history = {}
# Function to save user history to a pickle file
def save_user_history():
    # Join the path properly; plain string concatenation would drop the separator
    with open(os.path.join(cache_dir, 'user_history.pkl'), 'wb') as file:
        pickle.dump(user_history, file)
# Function to load user history from a pickle file
def load_user_history():
    global user_history
    # Use the same path for the existence check and the open call
    history_path = os.path.join(cache_dir, 'user_history.pkl')
    if os.path.exists(history_path):
        with open(history_path, 'rb') as file:
            user_history = pickle.load(file)
# Movie data: try the Hugging Face Hub first, then fall back to a local CSV
def load_data():
    try:
        # Download the CSV file from the Hub
        csv_path = hf_hub_download(repo_id=repo_id, filename="movieswithposter_updated.csv", cache_dir=cache_dir)
        # Load as DataFrame
        movies_data = pd.read_csv(csv_path)
        return movies_data
    except Exception as e:
        print(f"Error loading data from Hugging Face: {e}")
        # Fall back to a local file if available
        if os.path.exists('./movieswithposter_updated.csv'):
            return pd.read_csv('./movieswithposter_updated.csv')
        else:
            raise
# Load movie data
movies_data = load_data()
# Pre-process data
selected_features = ['genres', 'keywords', 'tagline', 'cast', 'director']
for feature in selected_features:
    movies_data[feature] = movies_data[feature].fillna('')
# Combine features
combined_features = movies_data['genres'] + ' ' + movies_data['keywords'] + ' ' + movies_data['tagline'] + ' ' + movies_data['cast'] + ' ' + movies_data['director']
# Try to fetch the pre-trained vectorizer and similarity matrix from the Hub.
# Note that hf_hub_download raises on failure rather than returning None, so
# the fallback must live in an except block, not an else branch.
try:
    vectorizer_path = hf_hub_download(repo_id=repo_id, filename="model_vectorizer.pkl", cache_dir=cache_dir)
    similarity_path = hf_hub_download(repo_id=repo_id, filename="model_similarity.pkl", cache_dir=cache_dir)
    # Load the vectorizer and similarity matrix
    with open(vectorizer_path, 'rb') as vec_file, open(similarity_path, 'rb') as sim_file:
        vectorizer = pickle.load(vec_file)
        similarity = pickle.load(sim_file)
except Exception as e:
    print(f"Error loading pre-trained model: {e}. Training from scratch.")
    # Train the model if it couldn't be downloaded
    vectorizer = TfidfVectorizer(stop_words='english', tokenizer=stemmed_tokenizer)
    feature_vectors = vectorizer.fit_transform(combined_features)
    with open('feature_vector.pkl', 'wb') as file:
        pickle.dump(feature_vectors, file)
    # Calculate cosine similarity
    similarity = cosine_similarity(feature_vectors)
    # Save the model (vectorizer and similarity matrix)
    with open('model_vectorizer.pkl', 'wb') as vec_file, open('model_similarity.pkl', 'wb') as sim_file:
        pickle.dump(vectorizer, vec_file)
        pickle.dump(similarity, sim_file)
# Function to recommend movies based on both user input and history
def recommend_movieswithhistory(user_id, movie_name):
    # Add the movie to the user's history
    add_to_history(user_id, movie_name)
    print(user_id, movie_name)
    # Fetch the user's history
    history = get_history(user_id)
    if len(history) == 0:
        print("No history found for the user.")
        return
    print(f"Movies suggested for you based on your past choices: {history}\n")
    # Create an aggregate similarity score across all movies in history
    combined_similarity = np.zeros(similarity.shape[0])
    list_of_all_titles = movies_data['title'].tolist()
    for past_movie in history:
        # Find a close match for each movie in the user's history
        find_close_match = difflib.get_close_matches(past_movie, list_of_all_titles)
        if find_close_match:
            close_match = find_close_match[0]
            # Find the index of the movie in the dataset
            index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]
            # Accumulate the similarity scores
            combined_similarity += similarity[index_of_the_movie]
    # Sort movies based on the combined similarity score
    sorted_similar_movies = sorted(enumerate(combined_similarity), key=lambda x: x[1], reverse=True)
    # Recommend the top movies that the user hasn't already seen
    i = 1
    movie_return = []
    for index, score in sorted_similar_movies:
        dataFromtitle = movies_data[movies_data.index == index]
        title = dataFromtitle['title'].values[0]
        if title not in history:  # Don't recommend movies the user has already interacted with
            print(i, '.', title, "(Score:", round(score, 2), ")")
            movie_return.append({'title': title, 'image': dataFromtitle['poster'].values[0]})
            i += 1
        if i > 35:  # Limit recommendations to the top 35
            break
    return movie_return
# Function to add a movie to user history
def add_to_history(user_id, movie_title):
    if user_id not in user_history:
        user_history[user_id] = []
    user_history[user_id].append(movie_title)
    save_user_history()  # Save the updated history after adding a movie
# Function to get movies from user history
def get_history(user_id):
    return user_history.get(user_id, [])
# Load the user history at the start of the program
load_user_history()
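# Minimal usage sketch, assuming this file is run as a script. The user id
# 'user_1' and the seed title 'Iron Man' are illustrative values only, not
# part of the original module.
if __name__ == '__main__':
    recommendations = recommend_movieswithhistory('user_1', 'Iron Man')
    if recommendations:
        print(f"Returned {len(recommendations)} recommendations.")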