Source code for PyBMF.models.Hyper

from .BaseModel import BaseModel
from mlxtend.frequent_patterns import apriori
import numpy as np
import pandas as pd
from scipy.sparse import lil_matrix, hstack
from tqdm import tqdm
from ..utils import matmul, get_prediction, get_residual



[docs]
class Hyper(BaseModel):
    '''The Hyper algorithm.
    
    Hyper is an exact decomposition algorithm. Hyper+ is used after fitting a Hyper model. It's a relaxation of the exact decomposition.

    .. topic:: Reference

        Summarizing Transactional Databases with Overlapped Hyperrectangles. Xiang et al. SIGKDD 2011.

    Parameters
    ----------
    min_support : float
        The 'alpha' in the paper. 
        The min support of frequent itemsets.
    '''
    def __init__(self, min_support):
        self.check_params(min_support=min_support)
    


[docs]
    def fit(self, X_train, X_val=None, X_test=None, **kwargs):
        super().fit(X_train, X_val, X_test, **kwargs)

        self._fit()
        self.finish(show_logs=self.show_logs, save_model=self.save_model, show_result=self.show_result)




[docs]
    def init_model(self):
        super().init_model()

        self.init_itemsets()
        self.init_transactions()
        self.sort_by_cost()




[docs]
    def init_itemsets(self):
        '''Initialize candidate itemsets with Apriori.

        I : list of int list
        '''
        X_df = pd.DataFrame.sparse.from_spmatrix(self.X_train.astype(bool))
        itemsets = apriori(X_df, min_support=self.min_support)
        itemsets['length'] = itemsets['itemsets'].apply(lambda x: len(x))
        itemsets = itemsets[itemsets['length'] > 1]
        L = len(itemsets)
        if L == 0:
            print("[W] No itemset discovered outside singletons. Try to decrease min_support.")
        else:
            print(f"[I] Found {L} itemsets, max size: {itemsets['length'].max()}")
        self.I = [[i] for i in range(self.n)]
        for i in range(L):
            self.I.append(list(itemsets['itemsets'].values[i]))




[docs]
    def init_transactions(self):
        '''Initialize transactions with cost.

        T : list of int list
        c : list of float
        X_rs : spmatrix
        '''
        self.T = []
        self.c = []
        i = 0
        progress = tqdm(range(len(self.I)), position=0, desc="[I] Initializing transactions")
        for _ in progress:
            t, c = self.find_hyper(I=self.I[i], X_gt=self.X_train, X_rs=self.X_train)
            if t == []:
                self.I.pop(i)
                # progress.reset(total=len(self.I))
            else:
                self.T.append(t)
                self.c.append(c)
                i += 1




[docs]
    def sort_by_cost(self):
        '''Sort `T`, `I` and `c` lists in the ascending order of cost `c`.
        '''
        order = np.argsort(self.c)
        self.T = [self.T[i] for i in order]
        self.I = [self.I[i] for i in order]
        self.c = [self.c[i] for i in order]



    def _fit(self):
        self.T_final, self.I_final = [], []
        self.X_rs = get_residual(X=self.X_train, U=self.U, V=self.V)

        k = 0
        progress = tqdm(range(len(self.I)), position=0, desc=f"[I] Finding exact decomposition")
        for _ in progress:
            self.T[0], self.c[0] = self.find_hyper(I=self.I[0], X_gt=self.X_train, X_rs=self.X_rs)
            while self.T[0] == []:
                self.T.pop(0)
                self.I.pop(0)
                self.c.pop(0)
                # progress.reset(total=len(self.I))
            
            n_iter = 0
            while self.c[0] > self.c[1]:
                self.sort_by_cost()
                self.T[0], self.c[0] = self.find_hyper(I=self.I[0], X_gt=self.X_train, X_rs=self.X_rs)
                n_iter += 1

            # record lists T, I
            self.T_final.append(self.T[0])
            self.I_final.append(self.I[0])

            # update factors U, V
            U = lil_matrix((self.m, 1))
            V = lil_matrix((self.n, 1))
            U[self.T[0]] = 1
            V[self.I[0]] = 1

            self.set_factors(k, u=U, v=V)

            # evaluate
            self.X_pd = get_prediction(U=self.U, V=self.V, boolean=True)
            self.X_rs = get_residual(X=self.X_train, U=self.U, V=self.V)
            self.evaluate(df_name='updates', head_info={
                'k': k, 
                'iter': n_iter, 
                'shape': [int(U.sum()), int(V.sum())],
                })

            if self.X_rs.sum() == 0:
                break

            self.T[0], self.c[0] = self.find_hyper(I=self.I[0], X_gt=self.X_train, X_rs=self.X_rs)
            self.sort_by_cost()
            k += 1

        self.k = self.U.shape[1]
        self.T = self.T_final
        self.I = self.I_final



[docs]
    @staticmethod
    def find_hyper(I, X_gt, X_rs):
        '''
        queue : list
            The indices of rows with non-zero uncoverage, in descending order. Row must be a support of I.
        '''
        covered = X_gt[:, I].sum(axis=1)
        covered = np.array(covered).flatten()

        uncovered = X_rs[:, I].sum(axis=1)
        uncovered = np.array(uncovered).flatten()

        idx = np.argsort(uncovered)[::-1]
        exact = (covered == len(I)) & (uncovered > 0)
        exact = exact[idx]
        queue = idx[exact].tolist()

        if len(queue) == 0:
            return [], np.inf
        t = queue.pop(0)
        T = [t]
        cost_old = Hyper.cost(T, I, X_rs)
        while len(queue) > 0:
            t = queue.pop(0)
            cost_new = Hyper.cost(T + [t], I, X_rs)
            if cost_new <= cost_new:
                T.append(t)
                cost_old = cost_new
            else:
                break
        return T, cost_old




[docs]
    @staticmethod
    def cost(T, I, X_rs):
        '''The cost function (gamma) in Hyper.
        '''
        cost = len(T) + len(I)
        cost = cost / X_rs[T, :][:, I].sum()
        return cost