import numpy as np
from ..utils import show_matrix, matmul, to_sparse
from ..utils import header, record, eval, binarize
import pandas as pd
from itertools import product
from .BaseModelTools import BaseModelTools
import time
class BaseModel(BaseModelTools):
    '''The base class for all the models.

    Initialize the model with parameters.

    This is a template: ``__init__()`` and ``_fit()`` raise
    ``NotImplementedError`` and must be overridden by concrete models.
    Helpers such as ``set_params()``, ``set_config()``, ``_init_factors()``
    and the other underscore-prefixed routines called below are not defined
    in this class; they are expected to come from ``BaseModelTools`` or from
    the subclass.
    '''

    def __init__(self, **kwargs):
        raise NotImplementedError('This is a template class.')

    def check_params(self, **kwargs):
        '''Check and load model parameters and experiment configurations.

        Called upon model initialization and fitting. The same ``kwargs`` are
        forwarded to both ``set_params()`` and ``set_config()``.

        .. code-block:: python

            # include this in your model class:
            def __init__(self, k, tol, alpha):
                self.check_params(k=k, tol=tol, alpha=alpha)
            def fit(self, X_train, X_val=None, X_test=None, **kwargs):
                self.check_params(**kwargs)

            # call them when initializing and fitting:
            model = MyModel(k=10, W='mask', alpha=0.1, seed=1997)
            model.fit(X_train, X_val, X_test, seed=2024, task='prediction', verbose=False, display=True)

        Parameters
        ----------
        **kwargs : dict
            Model parameters and/or experiment configurations.
        '''
        self.set_params(**kwargs)
        self.set_config(**kwargs)

    def fit(self, X_train, X_val=None, X_test=None, **kwargs):
        '''Fit the model to observations, with validation and test if necessary.

        For a fitting procedure, implement and append your ``_fit()`` and ``finish()``.
        Simply overwrite this method if you want to drop or add any parts of the procedures.

        Parameters
        ----------
        X_train : ndarray
            Training data.
        X_val : ndarray
            Validation data.
        X_test : ndarray
            Test data.
        **kwargs : dict
            Other parameters.
        '''
        # these are the common routines when the fitting starts:
        self.check_params(**kwargs)
        self.load_dataset(X_train=X_train, X_val=X_val, X_test=X_test)
        self.init_model()

        # attach these in your models:
        # self._fit()
        # self.finish(show_logs=self.show_logs, save_model=self.save_model, show_result=self.show_result)

    def init_model(self):
        '''Initialize the model.

        Called after params are set and datasets are loaded.
        Simply overwrite this method if you want to drop or add any parts of the procedures.
        '''
        self._init_factors()
        self._init_logs()
        self._start_timer()
        self._make_name()

        # attach more in your models if needed, for example:
        # if you have more initialization methods:
        #     self.init_U() or self.init_Us()
        # if you accept masking matrices:
        #     self.init_W() or self.init_Ws()
        # if you want to force the model to work with dense `ndarray`:
        #     self._to_dense()

    def _fit(self):
        '''Where the tedious fitting procedure takes place.

        Raises
        ------
        NotImplementedError
            Always; concrete models must override this method.
        '''
        raise NotImplementedError('This is a template method.')

    def finish(self, show_logs=True, save_model=True, show_result=True):
        '''Called when the fitting is over.

        The default finishing procedure.
        Simply overwrite this method if you want to drop or add any parts of the procedures.
        You can attach this to the end of ``fit()`` or simply call from outside.

        Parameters
        ----------
        show_logs : bool
            Whether to call ``_show_logs()``.
        save_model : bool
            Whether to call ``_save_model()``.
        show_result : bool
            Whether to call ``_show_result()``.
        '''
        # the timer is stopped first so that saving and displaying are not timed
        self._stop_timer()
        if save_model:
            self._save_model()
        if show_result:
            self._show_result()
        if show_logs:
            self._show_logs()

    def load_dataset(self, X_train, X_val=None, X_test=None):
        '''Load train and validation data.

        For matrices that are modified frequently, ``lil`` (LIst of List) or ``coo`` is preferred.
        For matrices that are not getting modified, ``csr`` or ``csc`` is preferred.

        Every provided matrix is passed through ``to_sparse(..., 'csr')`` and
        stored as ``self.X_train``, ``self.X_val`` and ``self.X_test``.
        ``self.m`` and ``self.n`` are set from the shape of the training matrix.

        Parameters
        ----------
        X_train : ndarray, spmatrix
            Data for matrix factorization.
        X_val : ndarray, spmatrix
            Data for model selection.
        X_test : ndarray, spmatrix
            Data for prediction.

        Raises
        ------
        TypeError
            If ``X_train`` is ``None``.
        '''
        if X_train is None:
            raise TypeError("Missing training data.")
        # validation and test data are optional; only a notice is printed
        if X_val is None:
            print("[I] Missing validation data.")
        if X_test is None:
            print("[W] Missing testing data.")

        self.X_train = to_sparse(X_train, 'csr')
        self.X_val = None if X_val is None else to_sparse(X_val, 'csr')
        self.X_test = None if X_test is None else to_sparse(X_test, 'csr')

        self.m, self.n = self.X_train.shape

    def predict_X(self, U=None, V=None, u=None, v=None, us=None, vs=None, boolean=True):
        '''Update the prediction ``X_pd``.

        The factors are copied before any thresholding, so neither the
        arguments nor ``self.U`` / ``self.V`` are modified in place.
        Per-factor thresholds (``us`` / ``vs``) take precedence over the
        shared ones (``u`` / ``v``). The result of ``matmul(U, V.T, ...)`` is
        stored in ``self.X_pd``; nothing is returned.

        Parameters
        ----------
        U : ndarray, spmatrix
            Factor matrix. Falls back to ``self.U`` when ``None``.
        V : ndarray, spmatrix
            Factor matrix. Falls back to ``self.V`` when ``None``.
        u : float
            The shared threshold for factors in ``U``.
        v : float
            The shared threshold for factors in ``V``.
        us : list of k floats
            The thresholds for each factor in ``U``.
        vs : list of k floats
            The thresholds for each factor in ``V``.
        boolean : bool
            Whether to apply Boolean multiplication.

        Raises
        ------
        AssertionError
            If ``us`` (or ``vs``) does not hold one threshold per column of
            ``U`` (or ``V``).
        '''
        U = (self.U if U is None else U).copy()
        V = (self.V if V is None else V).copy()

        if us is not None:
            assert len(us) == U.shape[1], "`us` must have one threshold per column of U."
            for i in range(U.shape[1]):
                U[:, i] = binarize(U[:, i], us[i])
        elif u is not None:
            U = binarize(U, u)

        if vs is not None:
            assert len(vs) == V.shape[1], "`vs` must have one threshold per column of V."
            for i in range(V.shape[1]):
                V[:, i] = binarize(V[:, i], vs[i])
        elif v is not None:
            V = binarize(V, v)

        self.X_pd = matmul(U, V.T, boolean=boolean, sparse=True)

    def show_matrix(self, settings=None, scaling=None, pixels=None, **kwargs):
        '''The ``show_matrix()`` wrapper for BMF models.

        If ``settings`` is missing, show the factors ``U``, ``V`` and ``X_pd`` by default.

        Parameters
        ----------
        settings : list of tuple, optional
            What to display. The default built below uses
            ``(matrix, [i, j], title)`` entries.
        scaling : optional
            Falls back to ``self.scaling`` when ``None``.
        pixels : optional
            Falls back to ``self.pixels`` when ``None``.
        **kwargs : dict
            Forwarded unchanged to ``utils.show_matrix()``.
        '''
        scaling = self.scaling if scaling is None else scaling
        pixels = self.pixels if pixels is None else pixels

        if settings is None:
            settings = [(self.X_pd, [0, 0], "X"), (self.U, [0, 1], "U"), (self.V.T, [1, 0], "V")]

        # inside a method body this name resolves to the module-level helper
        # imported from ``utils``, not to this method
        show_matrix(settings=settings, scaling=scaling, pixels=pixels, **kwargs)

    def evaluate(self, df_name,
                 head_info=None, train_info=None, val_info=None, test_info=None,
                 metrics=None,
                 train_metrics=None, val_metrics=None, test_metrics=None, verbose=False):
        '''Evaluate a BMF model on the given train, val and test dataset.

        The training data is always evaluated; validation and test data are
        evaluated only when ``self.X_val`` / ``self.X_test`` are not ``None``.
        The assembled columns and values are handed to ``record()`` together
        with ``self.logs`` and ``df_name``.

        Parameters
        ----------
        df_name : str
            The name of ``dataframe`` to record with.
        head_info : dict, optional
            The names and values of shared information at the head of each record.
        train_info : dict, optional
            The names and values of external information measured on training data.
        val_info : dict, optional
            The names and values of external information measured on validation data.
        test_info : dict, optional
            The names and values of external information measured on testing data.
        metrics : list of str, default: ['Recall', 'Precision', 'Accuracy', 'F1']
            The metrics to be measured. For metric names check ``utils.get_metrics``.
        train_metrics : list of str, optional
            The metrics to be measured on training data. Will use `metrics` instead if it's ``None``.
        val_metrics : list of str, optional
            The metrics to be measured on validation data. Will use `metrics` instead if it's ``None``.
        test_metrics : list of str, optional
            The metrics to be measured on testing data. Will use `metrics` instead if it's ``None``.
        verbose : bool
            Passed through to ``record()``.
        '''
        # ``None`` sentinels replace mutable default arguments, which would
        # otherwise be shared between calls
        head_info = {} if head_info is None else head_info
        train_info = {} if train_info is None else train_info
        val_info = {} if val_info is None else val_info
        test_info = {} if test_info is None else test_info
        if metrics is None:
            metrics = ['Recall', 'Precision', 'Accuracy', 'F1']

        train_metrics = metrics if train_metrics is None else train_metrics
        val_metrics = metrics if val_metrics is None else val_metrics
        test_metrics = metrics if test_metrics is None else test_metrics

        columns = header(list(head_info.keys()), levels=3)
        results = list(head_info.values())

        c, r = self._evaluate('train', train_info, train_metrics)
        columns += c
        results += r

        if self.X_val is not None:
            c, r = self._evaluate('val', val_info, val_metrics)
            columns += c
            results += r

        if self.X_test is not None:
            c, r = self._evaluate('test', test_info, test_metrics)
            columns += c
            results += r

        record(df_dict=self.logs, df_name=df_name, columns=columns, records=results, verbose=verbose)

    def _evaluate(self, name, info, metrics):
        '''Evaluate on a given dataset.

        Parameters
        ----------
        name : str in ['train', 'val', 'test']
            Which matrix to evaluate; ``self.X_<name>`` is used as ground truth.
        info : dict of list
            The extra information to be recorded.
        metrics : list of str
            The metrics to be evaluated and recorded.

        Returns
        -------
        columns : list of tuple
            One ``(name, 0, key)`` tuple per ``info`` key followed by one per metric.
        results : list
            The ``info`` values followed by the measured metrics, in the same order.
        '''
        X_gt = getattr(self, 'X_' + name)
        # NOTE: ``eval`` here is the helper imported from ``utils``, which
        # shadows the builtin of the same name in this module
        scores = eval(X_gt=X_gt, X_pd=self.X_pd, metrics=metrics, task=self.task)

        # ``list(metrics)`` lets callers pass any sequence of metric names
        columns = list(product([name], [0], list(info.keys()) + list(metrics)))
        results = list(info.values()) + scores

        return columns, results