Source code for PyBMF.models.Hyper

from .BaseModel import BaseModel
from mlxtend.frequent_patterns import apriori
import numpy as np
import pandas as pd
from scipy.sparse import lil_matrix, hstack
from tqdm import tqdm
from ..utils import matmul, get_prediction, get_residual


[docs] class Hyper(BaseModel): '''The Hyper algorithm. Hyper is an exact decomposition algorithm. Hyper+ is used after fitting a Hyper model. It's a relaxation of the exact decomposition. .. topic:: Reference Summarizing Transactional Databases with Overlapped Hyperrectangles. Xiang et al. SIGKDD 2011. Parameters ---------- min_support : float The 'alpha' in the paper. The min support of frequent itemsets. ''' def __init__(self, min_support): self.check_params(min_support=min_support)
[docs] def fit(self, X_train, X_val=None, X_test=None, **kwargs): super().fit(X_train, X_val, X_test, **kwargs) self._fit() self.finish(show_logs=self.show_logs, save_model=self.save_model, show_result=self.show_result)
[docs] def init_model(self): super().init_model() self.init_itemsets() self.init_transactions() self.sort_by_cost()
[docs] def init_itemsets(self): '''Initialize candidate itemsets with Apriori. I : list of int list ''' X_df = pd.DataFrame.sparse.from_spmatrix(self.X_train.astype(bool)) itemsets = apriori(X_df, min_support=self.min_support) itemsets['length'] = itemsets['itemsets'].apply(lambda x: len(x)) itemsets = itemsets[itemsets['length'] > 1] L = len(itemsets) if L == 0: print("[W] No itemset discovered outside singletons. Try to decrease min_support.") else: print(f"[I] Found {L} itemsets, max size: {itemsets['length'].max()}") self.I = [[i] for i in range(self.n)] for i in range(L): self.I.append(list(itemsets['itemsets'].values[i]))
[docs] def init_transactions(self): '''Initialize transactions with cost. T : list of int list c : list of float X_rs : spmatrix ''' self.T = [] self.c = [] i = 0 progress = tqdm(range(len(self.I)), position=0, desc="[I] Initializing transactions") for _ in progress: t, c = self.find_hyper(I=self.I[i], X_gt=self.X_train, X_rs=self.X_train) if t == []: self.I.pop(i) # progress.reset(total=len(self.I)) else: self.T.append(t) self.c.append(c) i += 1
[docs] def sort_by_cost(self): '''Sort `T`, `I` and `c` lists in the ascending order of cost `c`. ''' order = np.argsort(self.c) self.T = [self.T[i] for i in order] self.I = [self.I[i] for i in order] self.c = [self.c[i] for i in order]
def _fit(self): self.T_final, self.I_final = [], [] self.X_rs = get_residual(X=self.X_train, U=self.U, V=self.V) k = 0 progress = tqdm(range(len(self.I)), position=0, desc=f"[I] Finding exact decomposition") for _ in progress: self.T[0], self.c[0] = self.find_hyper(I=self.I[0], X_gt=self.X_train, X_rs=self.X_rs) while self.T[0] == []: self.T.pop(0) self.I.pop(0) self.c.pop(0) # progress.reset(total=len(self.I)) n_iter = 0 while self.c[0] > self.c[1]: self.sort_by_cost() self.T[0], self.c[0] = self.find_hyper(I=self.I[0], X_gt=self.X_train, X_rs=self.X_rs) n_iter += 1 # record lists T, I self.T_final.append(self.T[0]) self.I_final.append(self.I[0]) # update factors U, V U = lil_matrix((self.m, 1)) V = lil_matrix((self.n, 1)) U[self.T[0]] = 1 V[self.I[0]] = 1 self.set_factors(k, u=U, v=V) # evaluate self.X_pd = get_prediction(U=self.U, V=self.V, boolean=True) self.X_rs = get_residual(X=self.X_train, U=self.U, V=self.V) self.evaluate(df_name='updates', head_info={ 'k': k, 'iter': n_iter, 'shape': [int(U.sum()), int(V.sum())], }) if self.X_rs.sum() == 0: break self.T[0], self.c[0] = self.find_hyper(I=self.I[0], X_gt=self.X_train, X_rs=self.X_rs) self.sort_by_cost() k += 1 self.k = self.U.shape[1] self.T = self.T_final self.I = self.I_final
[docs] @staticmethod def find_hyper(I, X_gt, X_rs): ''' queue : list The indices of rows with non-zero uncoverage, in descending order. Row must be a support of I. ''' covered = X_gt[:, I].sum(axis=1) covered = np.array(covered).flatten() uncovered = X_rs[:, I].sum(axis=1) uncovered = np.array(uncovered).flatten() idx = np.argsort(uncovered)[::-1] exact = (covered == len(I)) & (uncovered > 0) exact = exact[idx] queue = idx[exact].tolist() if len(queue) == 0: return [], np.inf t = queue.pop(0) T = [t] cost_old = Hyper.cost(T, I, X_rs) while len(queue) > 0: t = queue.pop(0) cost_new = Hyper.cost(T + [t], I, X_rs) if cost_new <= cost_new: T.append(t) cost_old = cost_new else: break return T, cost_old
[docs] @staticmethod def cost(T, I, X_rs): '''The cost function (gamma) in Hyper. ''' cost = len(T) + len(I) cost = cost / X_rs[T, :][:, I].sum() return cost