Source code for discrimintools.discriminant_analysis._stepdisc

# -*- coding: utf-8 -*-
from numpy import linalg, exp, zeros
from pandas import DataFrame
from scipy import stats
from collections import OrderedDict, namedtuple
from sklearn.utils.validation import check_is_fitted

#intern functions
from ._base import _BaseDA
from ._discrim import DISCRIM
from ._candisc import CANDISC
from .functions.utils import check_is_dataframe
from .functions.splitmix import splitmix
from .functions.tab_disjunctive import tab_disjunctive

[docs] class STEPDISC(_BaseDA): """ Stepwise Discriminant Analysis (STEPDISC) Given a classification variable and several quantitative variables, the :class:`~discrimintools.STEPDISC` class performs a stepwise discriminant analysis to select a subset of the quantitative variables for use in discriminating among the classes. The set of variables that make up each class is assumed to be multivariate normal with a common covariance matrix. The :class:`~discrimintools.STEPDISC` class can use forward selection and backward elimination, which is a useful prelude to further analyses with the :class:`~discrimintools.CANDISC` class or the :class:`~discrimintools.DISCRIM` class. With :class:`~discrimintools.STEPDISC`, variables are chosen to enter or leave the model according to the significance level of an `F` test from an analysis of covariance, where the variables already chosen act as covariates and the variable under consideration is the dependent variable. Two selection methods are available: 'forward' and 'backward': 1. Forward selection begins with no variables in the model. At each step, :class:`~discrimintools.STEPDISC` enters the variable that contributes most to the discriminatory power of the model as measured by Wilks' lambda, the likelihood ratio criterion. When none of the unselected variables meet the entry criterion, the forward selection process stops. 2. Backward elimination begins with all variables in the model except those that are linearly dependent on previous variables in the VAR statement. At each step, the variable that contributes least to the discriminatory power of the model as measured by Wilks' lambda is removed. When all remaining variables meet the criterion to stay in the model, the backward elimination process stops. Parameters ---------- method : {'backward','forward'}, default='forward' The feature selection method to be used, possible values: - "forward" for forward selection, - "backward" for backward elimination alpha : float, default = 1e-2 The significance level for adding or retaining variables in stepwise variable selection. lambda_init : None or float, default = None Initial Wilks Lambda. verbose : bool, default=True If `True`, print intermediary steps during feature selection (default) Returns ------- call_ : NamedTuple Call informations: - obj : class An object of class CANDISC, DISCRIM - alpha : float The significance level for adding or retaining variables in stepwise variable selection. - target : str Name of target. - classes : list Names of classes - priors : Series of shape (n_classes,) Priors probabilities. disc_ : class An object of class CANDISC or DISCRIM model_ : str, default = "stepdisc" Name of model fitted. summary_ : NamedTuple Stepwise summary informations: - summary : DataFrame of shape (n_selected, 6) Summary of stepwise selection - selected : list Selected variables - removed : list Removed variables See also -------- :class:`~discrimintools.CANDISC` Canonical Discriminant Analysis (CANDISC) :class:`~discrimintools.DISCRIM` Discriminant Analysis (linear and quadratic). :class:`~discrimintools.summaryCANDISC` Printing summaries of Canonical Discriminant Analysis model. :class:`~discrimintools.summaryDISCRIM` Printing summaries of Discriminant Analysis (linear and quadratic) model. References ---------- [1] Ricco Rakotomalala (2008), « `STEPDISC - Feature selection for LDA`_ », Université Lumière Lyon 2. [2] Ricco Rakotomalala (2012), « `Linear Discriminant Analysis - Tools comparison`_ », Université Lumière Lyon 2. [3] Ricco Rakotomalala (2014), « `Linear discriminant analysis (slides)`_ », Université Lumière Lyon 2. [4] Ricco Rakotomalala (2020), « `Pratique de l'Analyse Discriminante Linéaire`_ », Version 1.0, Université Lumière Lyon 2. [5] SAS/STAT 13.1 User's Guide (2013), « `The STEPDISC Procedure`_ », Chapter 93. .. _STEPDISC - Feature selection for LDA: https://eric.univ-lyon2.fr/ricco/tanagra/fichiers/en_Tanagra_Stepdisc.pdf .. _Linear Discriminant Analysis - Tools comparison: https://eric.univ-lyon2.fr/ricco/tanagra/fichiers/en_Tanagra_LDA_Comparisons.pdf .. _Linear discriminant analysis (slides): https://eric.univ-lyon2.fr/ricco/cours/slides/en/analyse_discriminante.pdf .. _Pratique de l'Analyse Discriminante Linéaire: https://hal.science/hal-04868585v1/file/Pratique_Analyse_Discriminante_Lineaire.pdf .. _The STEPDISC Procedure: https://support.sas.com/documentation/onlinedoc/stat/131/stepdisc.pdf Examples -------- >>> from discrimintools.datasets import load_heart >>> from discrimintools import DISCRIM, STEPDISC >>> D = load_heart("train") # load training data >>> y, X = D["disease"], D.drop(columns=["disease"]) # split into X and y >>> clf = DISCRIM(method="linear") >>> clf.fit(X,y) >>> clf2 = STEPDISC(method="forward",alpha=0.01,verbose=True) >>> clf2.fit(clf) STEPDISC() """
[docs] def __init__( self, method="forward", alpha=1e-2, lambda_init = None, verbose = True ): self.method = method self.alpha = alpha self.lambda_init = lambda_init self.verbose = verbose
def decision_function(self,X): """ Apply decision function to an input data Parameters ---------- X : DataFrame of shape (n_samples, n_features) Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features. Returns ------- C : DataFrame of shape (n_samples, n_classes) Decision function values related to each class, per sample. """ #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #check if the estimator is fitted by verifying the presence of fitted attributes #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- check_is_fitted(self) #check if model has been updated if not hasattr(self,"disc_"): raise TypeError("\nSince only one feature is selected, {} procedure cannot be updated, therefore decision_function cannot be applied.".format(self.call_.obj.__class__.__name__)) #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #check if X is an instance of class pd.DataFrame #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- check_is_dataframe(X=X) #set index name as None X.index.name = None #check if X contains original features if not set(self.call_.obj.call_.Xtot.columns).issubset(X.columns): raise ValueError("The names of the features is not the same as the ones in the active features of the {} result".format(self.call_.obj.model_)) #select original features X = X[self.call_.obj.call_.Xtot.columns] #split X split_X = splitmix(X) #extract elements X_quanti, X_quali, n_quanti, n_quali = split_X.quanti, split_X.quali, split_X.k1, split_X.k2 #initialize DataFrame Xcod = DataFrame(index=X.index,columns=self.call_.obj.call_.X.columns).astype(float) #check if numerics variables if n_quanti > 0: #replace with numerics columns Xcod.loc[:,X_quanti.columns] = X_quanti #check if categorical variables if n_quali > 0: #active categorics categorics = [x for x in self.call_.obj.call_.X.columns if x not in self.call_.obj.call_.Xtot.columns] #replace with dummies Xcod.loc[:,categorics] = tab_disjunctive(X=X_quali,dummies_cols=categorics,prefix=True,sep="") #remove non selected variables Xcod = Xcod.loc[:,list(self.call_.obj.cov_.total.index)] return self.disc_.decision_function(Xcod) def fit(self,obj): """ Fit Stepwise Discriminant Analysis procedure Parameters ---------- obj : class An object of class CANDISC, DISCRIM Returns ------- self : object Returns the instance itself """ # Wilks lambda def wilks(Vb, Wb) -> float: """ Compute Wilk's Lambda Parameters --------- Vb : DataFrame of shape (n_features, n_features) Biaised total covariance matrix Wb : DataFrame of shape (n_features, n_features) Biaised pooled within-class covariance matrix Returns ------- value : Wilk's Lambda """ return linalg.det(Wb)/linalg.det(Vb) # Wilks lambda def wilks_log(Vb,Wb): """ Compute Wilk's Lambda Parameters ---------- Vb : DataFrame of shape (n_features, n_features) Biaised total covariance matrix Wb : DataFrame of shape (n_features, n_features) Biaised pooled within-class covariance matrix Returns ------- value : Wilks Lambda """ detVb, detWb = linalg.slogdet(Vb), linalg.slogdet(Wb) # intra-classes biaisée return exp((detWb[0]*detWb[1])-(detVb[0]*detVb[1])) #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #check if valid discriminan analysis model #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- if obj.model_ not in ["discrim", "candisc"]: raise TypeError("'model' must be an object of class DISCRIM, CANDISC") #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #check if linear discriminant analysis #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- if obj.model_ == "discrim" and obj.method != "linear": raise TypeError("Only applied for linear discriminant analysis.") #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #check if alpha has a valid value #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- if self.alpha is None: alpha = 1e-2 elif not isinstance(self.alpha,float): raise TypeError("{} is not supported".format(type(self.alpha))) elif self.alpha < 0 or self.alpha > 1: raise ValueError("the 'alpha' value {} is not within the required range of 0 and 1.".format(self.alpha)) else: alpha = self.alpha #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #initialize all elements #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #extract active elements : number of samples, number of features and number of classes n_samples, n_features, n_classes = obj.call_.n_samples, obj.call_.n_features, obj.call_.n_classes #extract total covariance and pooled withon covariance matrices tcovb, pwcovb = obj.cov_.btotal, obj.cov_.bpooled #initialize var_names, col_names = obj.call_.features, ["Wilks' Lambda","Partial R-Square","F Value","Num DF","Den DF","Pr>F"] #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #forward procedure #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- if self.method.lower() == "forward": entered_var_list, entered_var_summary = [], [] #wilk's lambda value for q = 0 lw_init = 1 #iteration for q in range(n_features): infos_vars = [] for name in var_names: #update entered var list var_select = entered_var_list + [name] # Wilks Lambda lw = wilks_log(tcovb.loc[var_select, var_select],pwcovb.loc[var_select, var_select]) #degree of Freedom ddl1, ddl2 = n_classes - 1, n_samples - n_classes - q #fisher statistic f_value = ddl2/ddl1 * ((lw_init/lw)-1) #partial R squared partiel_rsq = 1-(lw/lw_init) #critical probability p_value = stats.f.sf(f_value, ddl1, ddl2) #concatenate infos_vars.append((lw, partiel_rsq, f_value, ddl1, ddl2, p_value)) #convert to DataFrame infos_step_results = DataFrame(infos_vars, index=var_names, columns=col_names) #to print step result if self.verbose: print("\n====================== Step {} forward selection results =======================".format(q+1)) print(infos_step_results) #apply criteria decision entered_var = infos_step_results["Wilks' Lambda"].idxmin() #test if infos_step_results.loc[entered_var, "Pr>F"] > alpha: if self.verbose: print("\nNo variable can enter\n") break else: #print if self.verbose: print("\nVariable {} will enter\n".format(entered_var)) entered_var_list.append(entered_var) #update var names var_names.remove(entered_var) #update lw init lw_init = infos_step_results.loc[entered_var,"Wilks' Lambda"] #append summary entered_var_summary.append(list(infos_step_results.loc[entered_var])) stepdisc_summary = DataFrame(entered_var_summary, index=entered_var_list, columns=col_names) #selected variables selected_vars = list(stepdisc_summary.index) #removed variables removed_vars = [x for x in var_names if x not in selected_vars] #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #backward procedure #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- elif self.method.lower() == "backward": removed_var_list, removed_var_summary = [], [] #wilk's lambda value for pour q = p if self.lambda_init is None: lw_init = wilks(tcovb,pwcovb) else: lw_init = self.lambda_init for q in range(n_features, -1, -1): infos_vars = [] for name in var_names: #update lisrt of variables var_select = [var for var in var_names if var != name] #calcul du Lambda de Wilks (q-1) lw = wilks_log(tcovb.loc[var_select,var_select],pwcovb.loc[var_select, var_select]) #degree of freedom ddl1, ddl2 = n_classes - 1, n_samples - n_classes - q + 1 #fisher statistic f_value = ddl2/ddl1*((lw/lw_init)-1) #partial Rsquared partiel_rsq = 1 - (lw_init/lw) #critical probability p_value = stats.f.sf(f_value, ddl1, ddl2) #add to list infos_vars.append((lw, partiel_rsq,f_value,ddl1,ddl2,p_value)) #convert to DataFrame infos_step_results = DataFrame(infos_vars, index=var_names, columns=col_names) #po print step result if self.verbose: print("\n====================== Step {} backward selection results =======================".format(n_features - q + 1)) print(infos_step_results) #apply criteria decision removed_var = infos_step_results["Wilks' Lambda"].idxmin() if infos_step_results.loc[removed_var, "Pr>F"] < alpha: if self.verbose: print("\nNo variable can be removed\n") break else: #print if self.verbose: print("\nVariable {} will be removed\n".format(removed_var)) removed_var_list.append(removed_var) #remove variables in global list var_names.remove(removed_var) #add removed_var_summary.append(list(infos_step_results.loc[removed_var])) #update init wilk's lambda lw_init = infos_step_results.loc[removed_var,"Wilks' Lambda"] stepdisc_summary = DataFrame(removed_var_summary, index=removed_var_list, columns=col_names) #excluded variables removed_vars = list(stepdisc_summary.index) #selected variables selected_vars = [x for x in var_names if x not in removed_vars] else: raise ValueError("method should be one of 'backward', 'forward'.") #summary of stepwise discriminant analysis summary_ = OrderedDict(summary=stepdisc_summary,selected=selected_vars,removed=removed_vars) #convert to namedtuple self.summary_ = namedtuple("summary",summary_.keys())(*summary_.values()) #initialize call_ = OrderedDict(obj=obj,alpha=alpha) #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #fit model with selected features #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- if len(selected_vars) > 1: #update X Xnew = obj.call_.X.loc[:,selected_vars] if obj.model_ == "discrim": self.disc_ = DISCRIM(method="linear",priors=obj.call_.priors,classes=obj.call_.classes,warn_message=False).fit(Xnew,obj.call_.y) elif obj.model_ == "candisc": self.disc_ = CANDISC(n_components=obj.call_.n_components,classes=obj.call_.classes,warn_message=False).fit(Xnew,obj.call_.y) #update call call_ = OrderedDict(**call_,**OrderedDict(target=self.disc_.call_.target,classes=self.disc_.call_.classes,priors=self.disc_.call_.priors)) else: if self.verbose: print("\nSince only one feature is selected, {} procedure cannot be updated.".format(obj.__class__.__name__)) #convert to namedtuple self.call_ = namedtuple("call",call_.keys())(*call_.values()) #set model name self.model_ = "stepdisc" return self def fit_transform(self,obj): """ Fits transformer to ``X`` and returns a transformed version of samples. Parameters ---------- X : DataFrame of shape (n_samples, n_features) Training data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features y : Series of shape (n_samples,) Target values. True labels for ``X``. Returns ------- X_new : pandas DataFrame of shape (n_samples, n_components) or (n_samples, n_classes) Transformed samples, where ``n_components`` (resp. ``n_classes``) is the number of components (resp. classes). """ self.fit(obj) #check if model has been updated if not hasattr(self,"disc_"): raise TypeError("\nSince only one feature is selected, {} procedure cannot be updated, therefore fit_transform cannot be applied.".format(obj.__class__.__name__)) return self.transform(obj.call_.Xtot) def transform(self,X): """ Project data to maximize class separation or dimensionality reduction Parameters ---------- X : DataFrame of shape (n_samples, n_features) New data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features. Returns ------- X_new : DataFrame of shape (n_samples, n_components) or (n_samples, n_classes) Transformed data. """ #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #check if the estimator is fitted by verifying the presence of fitted attributes #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- check_is_fitted(self) #check if model has been updated if not hasattr(self,"disc_"): raise TypeError("\nSince only one feature is selected, {} procedure cannot be updated, therefore fit_transform cannot be applied.".format(self.call_.obj.__class__.__name__)) #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #check if X is an instance of class pd.DataFrame #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- check_is_dataframe(X=X) #set index name as None X.index.name = None #check if X contains original features if not set(self.call_.obj.call_.Xtot.columns).issubset(X.columns): raise ValueError("The names of the features is not the same as the ones in the active features of the {} result".format(self.call_.obj.model_)) #select original features X = X[self.call_.obj.call_.Xtot.columns] #split X split_X = splitmix(X) #extract elements X_quanti, X_quali, n_samples, n_quanti, n_quali = split_X.quanti, split_X.quali, split_X.n, split_X.k1, split_X.k2 #initialize DataFrame Xcod = DataFrame(index=X.index,columns=self.call_.obj.call_.X.columns).astype(float) #check if numerics variables if n_quanti > 0: #replace with numerics columns Xcod.loc[:,X_quanti.columns] = X_quanti #check if categorical variables if n_quali > 0: #active categorics categorics = [x for x in self.call_.obj.call_.X.columns if x not in self.call_.obj.call_.Xtot.columns] #replace with dummies Xcod.loc[:,categorics] = tab_disjunctive(X=X_quali,dummies_cols=categorics,prefix=True,sep="") #remove non selected variables Xcod = Xcod.loc[:,list(self.call_.obj.cov_.total.index)] return self.disc_.transform(Xcod)