# Duplicated from l1n4n/FEIR-viz-tool (commit 7a29bed).
import sys
from pathlib import Path
import string
import random
import torch
import numpy as np
import pickle
import pandas as pd
import os
import json
import re
# Data: loads the recommendation matrices and metadata and maintains a working sub-sample.
class Data:
    """Holds the recommendation matrices and metadata for the viz tool.

    Matrices (loaded from pickles as torch tensors):
      U -- user-job utility matrix
      P -- recommendation matrix (one row per user, one column per job)
      S -- secondary score matrix; falls back to U when no Sfile is given

    A square ``sub_sample_size`` x ``sub_sample_size`` working copy of each
    matrix (``U_sub``/``P_sub``/``S_sub``) is kept so the UI stays responsive;
    ``update()`` grows these working copies with extra jobs/users on demand.
    """

    def __init__(self, Ufile, Pfile, Sfile, job_meta_file, user_meta_file, user_groups=None, sub_sample_size=1000):
        (self.U, self.P, self.S,
         self.job_metadata, self.job_metadata_reverse,
         self.user_metadata) = self.load_data(Pfile, Sfile, Ufile, job_meta_file, user_meta_file)
        # Work on a square sub-sample of each matrix, not the full data.
        self.U_sub = self.sub_sample(self.U, sub_sample_size)
        self.P_sub = self.sub_sample(self.P, sub_sample_size)
        self.S_sub = self.sub_sample(self.S, sub_sample_size)
        self.lookup_dict = {}     # per-session cache, cleared on every update()
        self.user_temp_data = {}  # per-session cache, cleared on every update()
        self.user_groups = user_groups

    def load_data(self, Pfile, Sfile, Ufile, job_meta_file, user_meta_file):
        """Load the pickled matrices and metadata.

        Returns (U, recommendations, S, job_metadata, job_metadata_reverse,
        user_metadata).  When ``Sfile`` is falsy, S aliases U; when
        ``job_meta_file`` is falsy, placeholder 'Job <i>' names are generated
        for every column of the recommendation matrix; when
        ``user_meta_file`` is None, user_metadata is None.

        NOTE(review): ``pickle.load`` executes arbitrary code from the file —
        these pickles are assumed to be produced by this project, never by
        untrusted users.
        """
        # Bug fix: the original called pickle.load(open(...)) six times and
        # never closed the file handles; use context managers instead.
        with open(Ufile, 'rb') as fh:
            U = torch.from_numpy(pickle.load(fh))
        with open(Pfile, 'rb') as fh:
            recommendations = torch.from_numpy(pickle.load(fh))
        m, n = recommendations.shape
        if Sfile:
            with open(Sfile, 'rb') as fh:
                S = torch.from_numpy(pickle.load(fh))
        else:
            S = U  # no separate score matrix: alias the utility matrix
        if job_meta_file:
            with open(job_meta_file, 'rb') as fh:
                job_metadata = pickle.load(fh)
        else:
            job_metadata = {i: 'Job {}'.format(i) for i in range(n)}
        # Reverse lookup: capitalized job title -> column index.
        job_metadata_reverse = {v.capitalize(): k for k, v in job_metadata.items()}
        if user_meta_file is not None:
            with open(user_meta_file, 'rb') as fh:
                user_metadata = pickle.load(fh)
        else:
            user_metadata = None
        return U, recommendations, S, job_metadata, job_metadata_reverse, user_metadata

    def sub_sample(self, M, sample_size=500):
        """Return a detached copy of the top-left sample_size x sample_size
        corner of M, or M itself when either dimension is already within
        bounds (both dimensions must exceed sample_size for cropping)."""
        if len(M) > sample_size and len(M[0]) > sample_size:
            M = M[:sample_size, :sample_size].clone()
        return M

    def update(self, new_user_num, new_job_num):
        """Grow the working sub-matrices by the requested number of jobs and
        users, refresh the metadata accordingly, and invalidate the
        per-session caches so stale lookups are never served against the
        resized matrices."""
        if new_job_num > 0 or new_user_num > 0:
            self.U_sub = self.add_jobs_users(self.U_sub, self.U, new_job_num, new_user_num)
            self.P_sub = self.add_jobs_users(self.P_sub, self.P, new_job_num, new_user_num)
            self.S_sub = self.add_jobs_users(self.S_sub, self.S, new_job_num, new_user_num)
            print('U_sub shape: ', self.U_sub.shape)
            print('P_sub shape: ', self.P_sub.shape)
            print('S_sub shape: ', self.S_sub.shape)
            self.update_job_metadata(new_job_num)
            self.update_user_metadata(new_user_num)
        # NOTE(review): the caches are reset unconditionally, matching the
        # intent of the original refactoring notes — confirm against callers.
        self.lookup_dict = {}
        self.user_temp_data = {}

    def add_jobs(self, M_sub, M, new_job_num):
        """Extend M_sub with ``new_job_num`` extra job columns.

        Real columns are reused from the full matrix M when it still has
        spare ones; otherwise random values in [0, 1) are synthesized.
        Returns M_sub unchanged when new_job_num == 0.
        """
        if new_job_num == 0:
            return M_sub
        n_rows, n_cols = M_sub.shape[0], M_sub.shape[1]
        if len(M[0]) > n_cols + new_job_num:
            # The full matrix already has these columns: reuse the real data.
            return M[:n_rows, :n_cols + new_job_num].clone()
        # Bug fix: build the random block with torch (the original used
        # np.concatenate on torch tensors, which silently converts the result
        # to an ndarray and breaks later .clone() calls) and size it from
        # M_sub's rows, not M's (the original used M.shape[0], which raises a
        # shape mismatch whenever M_sub has fewer rows than M).
        new_cols = torch.rand(n_rows, new_job_num).to(M_sub.dtype)
        return torch.cat((M_sub, new_cols), dim=1)

    def add_users(self, M_sub, M, new_user_num):
        """Extend M_sub with ``new_user_num`` extra user rows.

        Real rows are reused from the full matrix M when it still has spare
        ones; otherwise random values in [0, 1) are synthesized.  Returns
        M_sub unchanged when new_user_num == 0.
        """
        if new_user_num == 0:
            return M_sub
        n_rows, n_cols = M_sub.shape[0], M_sub.shape[1]
        if len(M) > n_rows + new_user_num:
            # The full matrix already has these rows: reuse the real data.
            return M[:n_rows + new_user_num, :n_cols].clone()
        # Bug fix: same torch/numpy mix-up as add_jobs, and the random block
        # is sized from M_sub's columns (the original used M.shape[1], which
        # mismatches after random job columns have been appended).
        new_rows = torch.rand(new_user_num, n_cols).to(M_sub.dtype)
        return torch.cat((M_sub, new_rows), dim=0)

    def add_jobs_users(self, M_sub, M, new_job_num, new_user_num):
        """Add job columns first, then user rows, to M_sub."""
        M_updated = self.add_jobs(M_sub, M, new_job_num)
        M_updated = self.add_users(M_updated, M, new_user_num)
        print('M_updated shape: ', M_updated.shape)
        return M_updated

    def tweak_P(self, this_user):
        """Randomly mark the newest job (last column) as recommended: with
        80% probability for ``this_user``, and always for one randomly
        chosen user."""
        if random.random() > 0.2:
            self.P_sub[int(this_user), -1] = 1.
        # NOTE(review): in the original paste the indentation was lost; the
        # random-user tweak is treated as unconditional here — confirm.
        user_indices = np.random.randint(0, self.P_sub.shape[0], 1)
        self.P_sub[user_indices, -1] = 1.

    def update_job_metadata(self, new_job_num):
        """Append placeholder 'Job <n>' metadata for synthesized job columns.

        Only runs when the sub-matrix has grown past the full matrix, i.e.
        the new columns are random jobs with no real metadata to reuse.
        """
        if len(self.P_sub[0]) > len(self.P[0]):
            for _ in range(new_job_num):
                self.job_metadata[len(self.job_metadata)] = 'Job {}'.format(len(self.job_metadata))
                # NOTE(review): the reverse-map key is built from the reverse
                # map's own length, which can drift from job_metadata's length
                # if capitalized titles collided in load_data — kept as-is.
                self.job_metadata_reverse['Job {}'.format(len(self.job_metadata_reverse))] = len(self.job_metadata_reverse)

    def update_user_metadata(self, new_user_num):
        """Append randomly generated metadata rows (Id/Sex/Edu) for
        synthesized users; only when the sub-matrix outgrew the full one."""
        if new_user_num > 0 and len(self.P_sub) > len(self.P):
            start = len(self.user_metadata)
            new_user_metadata = pd.DataFrame({
                'Id': [str(i) for i in range(start, start + new_user_num)],
                'Sex': np.random.choice([0, 1], size=new_user_num, p=[.4, .6]),
                'Edu': np.random.choice([0, 1, 2], size=new_user_num, p=[.2, .6, 0.2]),
            })
            new_user_metadata['Sex'] = new_user_metadata['Sex'].map({0: 'F', 1: 'M'})
            new_user_metadata['Edu'] = new_user_metadata['Edu'].map({0: 'High school', 1: 'College', 2: 'Graduate+'})
            self.user_metadata = pd.concat([self.user_metadata, new_user_metadata], ignore_index=True)