import os
import pandas as pd
from scipy.sparse import csr_matrix
from ..utils import binarize
from .NetflixData import NetflixData
from itertools import chain
from scipy.sparse import lil_matrix, csr_matrix
import numpy as np
# (Sphinx viewcode "[docs]" link marker was here; it is an extraction artifact, not code.)
class NetflixGenreCastData(NetflixData):
    '''Load Netflix dataset with genre and cast information.

    Genre and cast information comes from Netflix-Prize-IMDB-TMDB-Joint-Dataset on GitHub:
    https://github.com/felixnie/Netflix-Prize-IMDB-TMDB-Joint-Dataset

    Parameters
    ----------
    path : str
        Path to the cached dataset.
    size : str in {'small', 'full'}
        Netflix data 'small' version, size 15MB, users ~10k, items 4945, ratings ~608k.
        Netflix data 'full' version, size 2.43GB, users ~480k, items 17770, ratings ~100M.
    source : str in {'imdb', 'tmdb'}
        Source should be 'imdb' or 'tmdb'.

    Raises
    ------
    ValueError
        If ``source`` is neither 'imdb' nor 'tmdb'.
    '''

    def __init__(self, path=None, size='small', source='imdb'):
        # Explicit raise instead of ``assert``: assertions are stripped under
        # ``python -O``, which would let an invalid source through and only
        # fail later on a missing ``<source>_genres`` column. Checked before
        # the base-class initializer runs so a bad argument fails fast.
        if source not in ('imdb', 'tmdb'):
            raise ValueError("Source should be 'imdb' or 'tmdb'.")
        super().__init__(path=path, size=size)
        self.is_single = False
        self.name = self.name + '_genre_cast_' + source
        self.source = source

    def read_data(self):
        '''Read data.

        Runs the base-class reader, then loads the genre/cast table from
        ``<self.root>/Netflix-Prize-IMDB-TMDB-Joint-Dataset/netflix_all.pickle``
        into ``self.df_info``.
        '''
        # ratings and titles
        super().read_data()
        # genres and cast
        path = os.path.join(self.root, "Netflix-Prize-IMDB-TMDB-Joint-Dataset", "netflix_all.pickle")
        # NOTE(security): unpickling can execute arbitrary code. Only load this
        # file when it comes from a trusted copy of the dataset.
        self.df_info = pd.read_pickle(path)

    def load_data(self):
        '''Load data.

        After the base-class loader has produced ``self.X`` and a two-entry
        ``self.factor_info``, this builds the genre matrix ``Y`` and the cast
        matrix ``Z`` from ``self.df_info`` and sets:

        - ``self.Xs = [X, Y, Z]``
        - ``self.factors = [[0, 1], [2, 1], [3, 1]]``
        - ``self.factor_info = [user_info, movie_info, genre_info, cast_info]``

        The integers in ``self.factors`` appear to index ``self.factor_info``
        (0 user, 1 movie, 2 genre, 3 cast), one pair per matrix in ``self.Xs``.
        '''
        # X and factor_info from ratings
        super().load_data()
        X = self.X
        user_info, movie_info = self.factor_info

        Y, genre_alias = self.get_attribute_info(self.source + '_genres')
        Z, cast_alias = self.get_attribute_info(self.source + '_cast')

        # Each info entry is [order, idmap, alias].
        genre_order = np.arange(len(genre_alias))
        genre_idmap = np.arange(len(genre_alias))
        genre_info = [genre_order, genre_idmap, genre_alias]

        cast_order = np.arange(len(cast_alias))
        # NOTE(review): this requires every cast label to parse as an integer.
        cast_idmap = cast_alias.astype(int)
        cast_info = [cast_order, cast_idmap, cast_alias]

        # align the 3 matrices: keep only the df_info rows (columns of Y and Z)
        # whose netflix_id is present in the movie id map of X.
        movie_ids = self.df_info['netflix_id'].values
        movie_idmap = movie_info[1]
        # Build the lookup once; testing membership against the raw sequence
        # inside the loop would rescan it for every movie.
        known_ids = set(movie_idmap)
        idx = [i for i, movie_id in enumerate(movie_ids) if movie_id in known_ids]
        # NOTE(review): the kept columns stay in df_info row order. This assumes
        # that order matches the movie order of X -- TODO confirm.
        Y = Y[:, idx]
        Z = Z[:, idx]

        self.Xs = [X, Y, Z]
        self.factors = [[0, 1], [2, 1], [3, 1]]
        self.factor_info = [user_info, movie_info, genre_info, cast_info]

    def get_attribute_info(self, attribute):
        '''Get attribute information.

        Parameters
        ----------
        attribute : str
            The name of columns in ``df_info``. Each non-missing cell is
            iterated to obtain that row's attribute values.

        Returns
        -------
        attr_mat : scipy.sparse.csr_matrix
            Matrix of shape ``(number of distinct values, len(df_info))``.
            Entry ``(r, c)`` counts how often value ``r`` occurs in row ``c``
            of ``df_info``; rows with a missing cell stay empty.
        attr_list : numpy.ndarray of str
            The distinct values in sorted order, converted to strings.
            ``attr_list[r]`` labels row ``r`` of ``attr_mat``.
        '''
        column = self.df_info[attribute]
        present = column.notna().values
        # Matrix columns are *positions* of rows in df_info (callers slice them
        # positionally), so derive them from the mask rather than from index
        # labels, which only coincide with positions for a default RangeIndex.
        col_positions = np.flatnonzero(present)
        entries = column[present].tolist()

        flat_values = list(chain.from_iterable(entries))
        distinct = sorted(set(flat_values))
        # Key the lookup on the raw values so non-string values resolve too;
        # the string conversion below is only for the returned labels.
        attr_dict = {key: i for i, key in enumerate(distinct)}
        attr_list = np.array(distinct).astype(str)

        rows = np.array([attr_dict[x] for x in flat_values], dtype=int)
        # Repeat each row position once per value listed in that row.
        counts = np.array([len(entry) for entry in entries], dtype=int)
        cols = np.repeat(col_positions, counts)
        values = np.ones(len(rows))

        m, n = len(attr_list), len(self.df_info)
        attr_mat = csr_matrix((values, (rows, cols)), shape=(m, n))
        return attr_mat, attr_list