class MovieLensGenreCastData(MovieLensData):
    '''Load MovieLens dataset with IMDB genre and cast information.

    On top of the data prepared by ``MovieLensData``, this class builds two
    sparse attribute matrices (genre-by-movie and cast-by-movie) from the
    IMDB information table.

    Parameters
    ----------
    path : str
        Path to the cached dataset.
    size : str in {'100k', '1m'}
        MovieLens dataset size.
    '''

    def __init__(self, path=None, size='1m'):
        super().__init__(path=path, size=size)
        # This dataset exposes several matrices (see ``load_data``).
        self.is_single = False
        self.name = self.name + '_genre_cast'

    def read_data(self):
        '''Read data.

        Reads the ratings and titles through the parent class, then loads the
        IMDB information table into ``self.df_info`` and adds an
        ``'imdb_cast'`` column holding the de-duplicated union of the
        ``'director'``, ``'actor'`` and ``'actress'`` lists of each row.
        '''
        # ratings and titles
        super().read_data()

        # genres and cast
        path = os.path.join(
            self.root, "MovieLens-IMDB-Dataset", f"ml_{self.size}_imdb.pickle"
        )
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        self.df_info = pd.read_pickle(path)

        # preprocessing: generate 'cast' column
        def merge_list(row, columns=('director', 'actor', 'actress')):
            merged = []
            for c in columns:
                # Cells that are not lists (e.g. missing values) are skipped.
                if isinstance(row[c], list):
                    merged.extend(row[c])
            # Drop duplicates while keeping first-seen order, so the result
            # does not depend on hash randomisation.
            return list(dict.fromkeys(merged))

        self.df_info['imdb_cast'] = self.df_info.apply(merge_list, axis=1)

    def load_data(self):
        '''Load data.

        After the parent class has loaded the rating matrix, this sets:

        - ``self.Xs``: ``[X, Y, Z]`` where ``X`` is the parent's matrix and
          ``Y`` / ``Z`` are the genre and cast attribute matrices restricted
          to the movies listed in the parent's movie id map.
        - ``self.factors``: ``[[0, 1], [2, 1], [3, 1]]``.
        - ``self.factor_info``: ``[user_info, movie_info, genre_info,
          cast_info]``, each new entry being ``[order, idmap, alias]``.
        '''
        super().load_data()

        X = self.X
        user_info, movie_info = self.factor_info

        Y, genre_alias = self.get_attribute_info('imdb_genres')
        Z, cast_alias = self.get_attribute_info('imdb_cast')

        genre_order = np.arange(len(genre_alias))
        genre_idmap = np.arange(len(genre_alias))
        genre_info = [genre_order, genre_idmap, genre_alias]

        cast_order = np.arange(len(cast_alias))
        # The numeric id is whatever follows the first two characters of the
        # alias (presumably an IMDb-style 'nm' prefix -- TODO confirm).
        cast_idmap = np.array(
            [int(cast_id[2:]) for cast_id in cast_alias], dtype=int
        )
        cast_info = [cast_order, cast_idmap, cast_alias]

        # align the 3 matrices
        movie_ids = self.df_info['ml_id'].tolist()
        # Build the lookup set once instead of scanning the id map per movie.
        known_ids = set(np.asarray(movie_info[1]).ravel().tolist())
        idx = [i for i, movie_id in enumerate(movie_ids) if movie_id in known_ids]
        # NOTE(review): columns are kept in ``df_info`` row order; this only
        # matches the column order of X if ``df_info`` lists the movies in the
        # same order as the movie id map -- confirm.
        Y = Y[:, idx]
        Z = Z[:, idx]

        self.Xs = [X, Y, Z]
        self.factors = [[0, 1], [2, 1], [3, 1]]
        self.factor_info = [user_info, movie_info, genre_info, cast_info]

    def get_attribute_info(self, attribute):
        '''Get attribute information.

        Parameters
        ----------
        attribute : str
            The name of columns in ``df_info``. Each non-missing cell is
            expected to be an iterable of attribute values.

        Returns
        -------
        attr_mat : csr_matrix
            Matrix of shape ``(number of distinct values, len(df_info))``
            built from one ``1.0`` entry per (value, row position) pair.
            Rows of ``df_info`` with a missing cell yield empty columns.
        attr_list : ndarray of str
            The distinct attribute values, sorted; position ``i`` labels
            row ``i`` of ``attr_mat``.
        '''
        column = self.df_info[attribute]

        # Use row *positions* (not index labels) as matrix columns so the
        # result stays consistent with the matrix width and with positional
        # slicing, whatever index ``df_info`` carries.
        present = column.notna().to_numpy()
        positions = np.flatnonzero(present)
        entries = column[present].tolist()

        attr_list = np.array(sorted(set(chain.from_iterable(entries)))).astype(str)
        # Keys are the stringified values, matching ``attr_list``.
        attr_dict = {str(key): i for i, key in enumerate(attr_list)}

        rows = np.array(
            [attr_dict[str(x)] for x in chain.from_iterable(entries)], dtype=int
        )
        # Repeat each row position once per value contained in its cell.
        counts = np.array([len(entry) for entry in entries], dtype=int)
        cols = np.repeat(positions, counts).astype(int)

        values = np.ones(len(rows))
        m, n = len(attr_list), len(self.df_info)
        attr_mat = csr_matrix((values, (rows, cols)), shape=(m, n))
        return attr_mat, attr_list