# Source code for PyBMF.datasets.MovieLensUserData

import os
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, lil_matrix, hstack
from ..utils import binarize
from .MovieLensData import MovieLensData


class MovieLensUserData(MovieLensData):
    '''Load MovieLens dataset with user profiles.

    Extends ``MovieLensData`` with a second matrix ``Y`` that encodes each
    user's profile (gender, age bucket, occupation, state derived from the
    zip code), so that the dataset exposes two matrices sharing the user
    factor.

    Parameters
    ----------
    path : str
        Path to the cached dataset.
    size : str in {'100k', '1m'}
        MovieLens dataset size.
    '''
    def __init__(self, path=None, size='1m'):
        super().__init__(path=path, size=size)
        # this dataset carries more than one matrix (see ``load_data``)
        self.is_single = False
        self.name = self.name + '_user'

    def read_data(self):
        '''Read data.

        Calls the parent reader first (ratings and titles), then reads the
        user profile table into ``self.df_profiles`` and the occupation list
        into ``self.df_occupations``. The profile columns are re-encoded in
        place:

        - ``gender``: 0 for ``'F'``, 1 otherwise.
        - ``age``: ``int(age / 15)``.
        - ``occupation``: occupation names are replaced by their row index
          in ``self.df_occupations``; non-string values are kept as they are.
        - ``zip``: state returned by ``uszipcode`` for the zip code, or
          ``'NA'`` when the lookup yields no state.

        Raises
        ------
        ValueError
            If ``self.size`` is neither ``'100k'`` nor ``'1m'``.
        KeyError
            If a profile holds an occupation name that is missing from the
            occupation list.
        '''
        # ratings and titles
        super().read_data()

        # profiles
        if self.size == '100k':
            path = os.path.join(self.root, "ml-100k", "u.user")
            sep, parser = '|', 'c'
            names = ['uid', 'age', 'gender', 'occupation', 'zip']
        elif self.size == '1m':
            path = os.path.join(self.root, "ml-1m", "users.dat")
            # multi-character separator requires the python parser
            sep, parser = '::', 'python'
            names = ['uid', 'gender', 'age', 'occupation', 'zip']
        else:
            raise ValueError(
                "size must be '100k' or '1m', got {!r}".format(self.size))
        self.df_profiles = pd.read_table(
            path, delimiter=sep, engine=parser, header=None, names=names)

        # occupations
        # NOTE(review): the occupation list is always taken from the ml-100k
        # folder, also when size == '1m'. Confirm that ml-100k is available
        # in that case and that its list matches the integer occupation
        # codes used by ml-1m.
        path = os.path.join(self.root, "ml-100k", "u.occupation")
        self.df_occupations = pd.read_table(
            path, delimiter='|', header=None, names=['occupation'])

        # preprocessing
        self.df_profiles['gender'] = self.df_profiles['gender'].apply(
            lambda x: 0 if x == 'F' else 1)
        self.df_profiles['age'] = self.df_profiles['age'].apply(
            lambda x: int(x / 15))

        # name -> first row index in the occupation list, built once instead
        # of filtering the DataFrame for every profile row
        occupation_codes = {}
        for code, title in zip(self.df_occupations.index,
                               self.df_occupations['occupation']):
            occupation_codes.setdefault(title, code)
        self.df_profiles['occupation'] = self.df_profiles['occupation'].apply(
            lambda x: occupation_codes[x] if isinstance(x, str) else x)

        # third-party dependency, imported lazily as in the original code
        from uszipcode import SearchEngine
        search = SearchEngine()
        state_cache = {}

        def lookup_state(zipcode):
            # one database query per distinct zip code
            if zipcode not in state_cache:
                result = search.by_zipcode(zipcode)
                # an unresolved zip code may come back as None or as a result
                # without a state; both become 'NA' so the column stays
                # sortable in ``get_user_profile``
                state = getattr(result, 'state', None)
                state_cache[zipcode] = state if state else 'NA'
            return state_cache[zipcode]

        self.df_profiles['zip'] = self.df_profiles['zip'].apply(lookup_state)

    def load_data(self):
        '''Load data.

        Runs the parent loader, then appends the user-profile matrix.
        Afterwards ``self.Xs`` is ``[X, Y]``, ``self.factors`` is
        ``[[0, 1], [0, 2]]`` (both matrices share factor 0) and
        ``self.factor_info`` holds the user, movie and profile info lists.
        '''
        super().load_data()

        X = self.X
        user_info, movie_info = self.factor_info

        # NOTE(review): rows of Y follow the row order of ``df_profiles``;
        # confirm this matches the user order of X set by the parent loader.
        Y, profile_alias = self.get_user_profile()

        # profile columns keep their natural order and ids
        profile_order = np.arange(len(profile_alias))
        profile_idmap = np.arange(len(profile_alias))
        profile_info = [profile_order, profile_idmap, profile_alias]

        self.Xs = [X, Y]
        self.factors = [[0, 1], [0, 2]]
        self.factor_info = [user_info, movie_info, profile_info]

    def get_user_profile(self):
        '''Get user profile.

        Returns
        -------
        Y : scipy.sparse.csr_matrix
            One row per row of ``self.df_profiles``. The first column is the
            gender code; it is followed by one indicator column per distinct
            value of ``age``, ``occupation`` and ``zip`` (in that order,
            values sorted within each attribute).
        attr_list : numpy.ndarray
            Label of every column of ``Y``, in column order.
        '''
        profiles = self.df_profiles

        # gender
        attr_list = np.array(['gender'])
        Y = csr_matrix(profiles['gender'].values).T

        for attr in ('age', 'occupation', 'zip'):
            # attribute values
            attr_vals = sorted(profiles[attr].unique())

            # new sub-matrix
            Z = lil_matrix((Y.shape[0], len(attr_vals)))
            for col, val in enumerate(attr_vals):
                # positional row numbers, so a non-default DataFrame index
                # cannot misplace the entries
                rows = np.flatnonzero((profiles[attr] == val).to_numpy())
                Z[rows, col] = 1

            if attr == 'age':
                labels = ['age_' + str(s) for s in attr_vals]
            elif attr == 'occupation':
                # name only the occupation codes that actually occur, so the
                # labels stay aligned with the columns of Z
                codes = np.asarray(attr_vals, dtype=int)
                labels = self.df_occupations['occupation'].values[codes]
            else:
                labels = attr_vals

            Y = hstack((Y, Z), format='csr')
            attr_list = np.append(attr_list, labels)

        return Y, attr_list