Source code for PyBMF.utils.evaluate_utils

from .boolean_utils import multiply, matmul, dot
from .sparse_utils import to_dense, to_triplet, to_sparse
from .metrics import get_metrics
from scipy.sparse import spmatrix, issparse, csr_matrix
import numpy as np
import pandas as pd
from tqdm import tqdm
# from p_tqdm import t_imap
from IPython.display import display


[docs] def eval(metrics, task, X_gt, X_pd=None, U=None, V=None): '''Evaluate with given metrics. Parameters ---------- X_gt : ndarray or spmatrix X_pd : ndarray or spmatrix, optional U : spmatrix, optional V : spmatrix, optional metrics : list of str List of metric names. task : str in {'prediction', 'reconstruction'} If `task` == 'prediction', it ignores the missing values and only use the triplet from the `spmatrix`. The triplet may contain zeros, depending on whether negative sampling has been used. If `task` == 'reconstruction', it uses the whole matrix, which considers all missing values as zeros in `spmatrix`. ''' using_matrix = X_pd is not None using_factors = U is not None and V is not None assert using_matrix or using_factors, "[E] User should provide either `U`, `V` or `X_pd`." if task == 'prediction': U_idx, V_idx, gt_data = to_triplet(X_gt) if using_factors and len(gt_data) < 5000: # faster only if the amount of samples is small pd_data = np.zeros(len(gt_data), dtype=int) for i in tqdm(range(len(gt_data)), leave=False, position=1, desc="[I] Making predictions"): pd_data[i] = dot(U[U_idx], V[V_idx], boolean=True) else: if not using_matrix: X_pd = matmul(U=U, V=V.T, sparse=True, boolean=True) pd_data = np.zeros(len(gt_data), dtype=float) # for i in tqdm(range(len(gt_data)), leave=False, position=1, desc="[I] Making predictions"): for i in range(len(gt_data)): pd_data[i] = X_pd[U_idx[i], V_idx[i]] elif task == 'reconstruction': gt_data = to_sparse(X_gt, type='csr') if not using_matrix: pd_data = matmul(U=U, V=V.T, sparse=True, boolean=True) else: pd_data = to_sparse(X_pd, type='csr') # # debug # gt_data = to_dense(X_gt) # pd_data = to_dense(X_pd) results = get_metrics(gt=gt_data, pd=pd_data, metrics=metrics) return results
[docs] def record(df_dict, df_name, columns, records, verbose=False): '''Create and add records to a dataframe in a logs dict. Parameters ---------- df_dict : dict df_name : str columns : list of str or str tuple records : list verbose : bool, default: False caption : str, optional ''' if not df_name in df_dict: # create dataframe in logs dict if isinstance(columns[0], tuple): # using multi-level headers time = header(['time'], levels=len(columns[0])) columns = pd.MultiIndex.from_tuples(time + columns) else: columns = ['time'] + columns df_dict[df_name] = pd.DataFrame(columns=columns) ts = [pd.Timestamp.now().strftime("%d/%m/%y %I:%M:%S")] records = ts + records # add timestamp df_dict[df_name].loc[len(df_dict[df_name].index)] = records # add record if verbose: # print the last 5 lines display(df_dict[df_name].tail())