Source code for discrimintools.discriminant_analysis._mpca


from numpy import average, cov, sqrt, array, ones, linalg, insert, diff, c_,cumsum,nan,diag,dot, unique
from pandas import Series, DataFrame, concat
from itertools import repeat, chain
from collections import OrderedDict, namedtuple
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

#intern functions
from .functions.utils import check_is_dataframe
from .functions.preprocessing import preprocessing
from .functions.splitmix import splitmix
from .functions.recodecat import recodecat
from .functions.gsvd import gsvd
from .functions.tab_disjunctive import tab_disjunctive

[docs] class MPCA(TransformerMixin,BaseEstimator): """ Mixed Principal Components Analysis (MPCA) Mixed Principal Components Analysis (MPCA) is a standardized principal component analysis of both quantitative variables and the transformation of the dummy variables associated to qualitative variables on quantitative variables through orthogonal projections of configurations of statistical units in the individual-space with a relational inner product. For more, see [1]_. Parameters ---------- n_components : int or None Number of components to keep. If ``None``, keep all the components. Returns ------- call_ : NamedTuple Call informations: - Xtot : DataFrame of shape (n_samples, n_colums) Input data. - X : DataFrame of shape (n_samples, columns) Processing data. - dummies : DataFrame of shape (n_samples, n_categories) Disjunctive data. - Xcod : DataFrame of shape (n_samples, n_vars) Training data. - Xc : DataFrame of shape (n_samples, n_vars) Centered recode data. - Z : DataFrame of shape (n_samples, n_vars) Standardize recode data. - center : Series of shape (n_vars,) Average of recode data. - xc_center : Series of shape (n_vars,) Average of centered recode data. - xc_scale : Series of shape (n_vars,) Standard deviation of the centered recode data. - k1 : int Number of numerics columns. - k2 : int Number of categorical. - ind_weights : Series of shape (n_samples,) Individuals weights. - var_weights : Series of shape (n_vars,) Columns weights. - denom : Series of shape (n_vars,) number of variables. - max_components : int Maximum number of components. - n_components : int Number of components kept. eig_ : DataFrame of shape (max_components, 4) The eigenvalues, the difference between each eigenvalue, the percentage of variance and the cumulative percentage of variance. ind_ : NamedTuple Individuals informations: - coord : DataFrame of shape (n_samples, n_components) The individuals coordinates. model_ : str, defaut = 'mpca' The model fitted. svd_ : NamedTuple Generalized singular values decomposition: - vs : 1-D array of shape (max_components,) The singular values. - U : 2-D array of shape (n_samples, n_components) The left singular vectors. - V : 2-D array of shape (n_vars, n_components) The right singular vectors. var_ : NamedTuple Variables informations: - coord : DataFrame of shape (n_vars, n_components) The variables coordinates. See also -------- :class:`~discrimintools.GFA` General Factor Analysis (GFA) :class:`~discrimintools.GFALDA` General Factor Analysis Linear Discriminant Analysis (GFALDA) :class:`~discrimintools.MDA Mixed Discriminant Analysis (MDA) :class:`~discrimintools.summaryGFA` Printing summaries of General Factor Analysis model. :class:`~discrimintools.summaryGFALDA` Printing summaries of General Factor Analysis Linear Discriminant Analysis model. :class:`~discrimintools.summaryMDA` Printing summaries of Mixed Discriminant Analysis model. :class:`~discrimintools.summaryMPCA` Printing summaries of Mixed Principal Component Analysis model. References ---------- .. [1] Abdesselam R. (2006), « `Analyse en Composantes Principales Mixtes`_ », CREM UMR CNRS 6211 [2] Bry X. (1996), « Analyses factorielles multiple », Economica [3] Bry X. (1999), « Analyses factorielles simples », Economica [4] Escofier B., Pagès J. (2023), « Analyses Factorielles Simples et Multiples », 5ed, Dunod [5] Saporta Gilbert (2011), « `Probabilités, Analyse des données et Statistiques`_ », Editions TECHNIP, 3ed. [6] Husson, F., Le, S. and Pages, J. (2010), « Exploratory Multivariate Analysis by Example Using R », Chapman and Hall. [7] Lebart Ludovic, Piron Marie, & Morineau Alain (2006), « `Statistique Exploratoire Multidimensionnelle`_ », Dunod, Paris 4ed. [8] Pagès J. (2013), « Analyse factorielle multiple avec R : Pratique R », EDP sciences [9] Rakotomalala, R. (2020), « `Pratique des Méthodes Factorielles avec Python`_ », Université Lumière Lyon 2. Version 1.0. [10] Tenenhaus, M. (2006), « Statistique : Méthodes pour décrire, expliquer et prévoir », Dunod. .. _Analyse en Composantes Principales Mixtes: https://www.researchgate.net/profile/Rafik-Abdesselam/publication/5087866_Analyse_en_composantes_principales_mixte/links/0c960525d0d312c1b2000000/Analyse-en-composantes-principales-mixte.pdf .. _Statistique Exploratoire Multidimensionnelle: https://horizon.documentation.ird.fr/exl-doc/pleins_textes/2023-12/010038111.pdf .. _Pratique des Méthodes Factorielles avec Python: https://hal.science/hal-04868625v1/document .. _Probabilités, Analyse des données et Statistiques: https://en.pdfdrive.to/dl/probabilites-analyses-des-donnees-et-statistiques Examples -------- >>> from discrimintools.datasets import load_heart >>> from discrimintools import MPCA >>> D = load_heart("subset") >>> X = D.drop(columns=["disease"]) >>> clf = MPCA() >>> clf.fit(X) MPCA() """
[docs] def __init__( self,n_components=2 ): self.n_components = n_components
def fit(self,X,y=None): """ Fit the Mixed Principal Component Analysis Model Parameters ---------- X : DataFrame of shape (n_samples, n_columns) Training data, where ``n_samples`` is the number of samples and ``n_columns`` is the number of columns. y : None y is ignored. Returns ------- self : object Returns the instance itself """ #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #check if X is an instance of class pd.DataFrame #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- check_is_dataframe(X) #make a copy of original data Xtot = X.copy() #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #preprocessing #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- X = preprocessing(X) #split X split_X = splitmix(X=X) #extract all elements X_quanti, X_quali, n_samples, n_quanti, n_quali = split_X.quanti, split_X.quali, split_X.n, split_X.k1, split_X.k2 #check if mixed data if any(x == 0 for x in [n_quanti, n_quali]): raise TypeError("MPCA require mixed data.") #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #data preparation #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #set individuals weights ind_weights = Series(ones(n_samples)/n_samples,index=X.index,name="weight") #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #center numerics variables #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #compute weighted average mean and standard deviation center1 = Series(average(X_quanti,axis=0,weights=ind_weights),index=X_quanti.columns,name="center") #center quantitatives variables X1c = X_quanti.sub(center1,axis=1) #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #treatment of categorics variables #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #recode categorical variables rec = recodecat(X=X_quali) X_quali, dummies = rec.X, rec.dummies #covariance matrix between X and between X and Y Vx, Vylx = cov(X_quanti,rowvar=False,ddof=0,aweights=ind_weights), dummies.T.dot(diag(ind_weights)).dot(X1c) #compute the mean center2 = Series(dot(dot(dot(dot(Vylx,linalg.pinv(Vx,hermitian=True)),X1c.T),diag(ind_weights)),ones(n_samples)).T[0],index=dummies.columns,name="center") #center the disjunctive table X2c = dummies.sub(center2,axis=1) #concatenate Xcod, Xc, center = concat((X_quanti,dummies),axis=1), concat((X1c,X2c),axis=1), concat((center1,center2),axis=0) #variables weights var_weights = Series(ones(Xc.shape[1]),index=Xc.columns,name="weight") #denom denom = Series(list(chain(*[repeat(i,k) for i, k in zip([n_quanti, n_quali],[n_quanti, dummies.shape[1]])])),index=Xc.columns,name="denom") #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #standardize Z = (xc - mu)/sigma #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #compute weighted average and standard deviation xc_center = Series(average(Xc,axis=0,weights=ind_weights),index=Xc.columns,name="weight") xc_scale = Series(array([sqrt(cov(Xc.iloc[:,i],ddof=0,aweights=ind_weights)) for i in range(Xc.shape[1])]),index=Xc.columns,name="weight") #standardization: Z = (xc - mu)/sigma Z = Xc.sub(xc_center,axis=1).div(xc_scale,axis=1) #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #set number of components #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #QR decomposition (to set maximum number of components) Q, R = linalg.qr(Z) max_components = int(min(linalg.matrix_rank(Q),linalg.matrix_rank(R), n_samples - 1, Z.shape[1] - n_quali)) #set number of components if self.n_components is None: n_components = max_components elif not isinstance(self.n_components,int): raise TypeError("'n_components' must be an integer.") elif self.n_components < 1: raise ValueError("'n_components' must be equal or greater than 1.") else: n_components = min(self.n_components,max_components) #store call informations call_ = OrderedDict(Xtot=Xtot,X=X,dummies=dummies,Xcod=Xcod,Xc=Xc,Z=Z,center=center,xc_center=xc_center,xc_scale=xc_scale,k1=n_quanti,k2=n_quali,ind_weights=ind_weights,var_weights=var_weights, denom=denom,max_components=max_components,n_components=n_components) #convert to namedtuple self.call_ = namedtuple("call",call_.keys())(*call_.values()) #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #fit generalized singular values decomposition #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #fit generalized singular values decomposition (GSVD) self.svd_ = gsvd(X=Z,row_weights=ind_weights,col_weights=var_weights,n_components=n_components) #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #eigen values informations #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- eigen_values = self.svd_.vs[:max_components]**2 difference, proportion = insert(-diff(eigen_values),len(eigen_values)-1,nan), 100*eigen_values/sum(eigen_values) #convert to DataFrame self.eig_ = DataFrame(c_[eigen_values,difference,proportion,cumsum(proportion)],columns=["Eigenvalue","Difference","Proportion (%)","Cumulative (%)"],index = ["Can"+str(x+1) for x in range(max_components)]) #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #coordinates for the individuals #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #coordinates for the individuals ind_coord = DataFrame(self.svd_.U.dot(diag(self.svd_.vs[:n_components])),index=Z.index,columns=["Can"+str(x+1) for x in range(n_components)]) #convert to ordered dictionary ind_ = OrderedDict(coord=ind_coord) #convert to namedtuple self.ind_ = namedtuple("ind",ind_.keys())(*ind_.values()) #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #coordinates for the variables #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #coordinates for the variables var_coord = DataFrame(self.svd_.V.dot(diag(self.svd_.vs[:n_components])),index=Z.columns,columns=["Can"+str(x+1) for x in range(n_components)]) #convert to ordered dictionary var_ = OrderedDict(coord=var_coord) #convert to namedtuple self.var_ = namedtuple("var",var_.keys())(*var_.values()) #set model name self.model_ = "mpca" return self def fit_transform(self,X,y=None): """ Fit the model with X and apply the dimensionality reduction on X Parameters ---------- X : DataFrame of shape (n_samples, n_columns) Training data, where ``n_samples`` is the number of samples and ``n_columns`` is the number of columns. y : None. y is ignored. Returns ------- X_new : DataFrame of shape (n_samples, n_components) Transformed values, where ``n_components`` is the number of components. """ self.fit(X,y) return self.ind_.coord def transform(self,X): """ Apply the dimensionality reduction on X X is projected on the principal components previously extracted from a training set. Parameters ---------- X : Dataframe of shape (n_samples, n_columns) New data, where ``n_samples`` is the number of samples and ``n_columns`` is the number of columns. Returns ------- X_new : Dataframe of shape (n_samples, n_components) Projection of X in the principal components where ``n_components`` is the number of the components. """ #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #check if the estimator is fitted by verifying the presence of fitted attributes #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- check_is_fitted(self) #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #check if X is an instance of class pd.DataFrame #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- check_is_dataframe(X) #set index name as None X.index.name = None #check if X contains active columns if not set(self.call_.X.columns).issubset(X.columns): raise ValueError("The names of the columns is not the same as the ones in the active columns of the MPCA result") #select active columns X = X[self.call_.X.columns] #split X split_X = splitmix(X) X_quanti, X_quali, n_quanti, n_quali = split_X.quanti, split_X.quali, split_X.k1, split_X.k2 #check if mixed data if any(x == 0 for x in [n_quanti, n_quali]): raise TypeError("MPCA require mixed data.") #check if len are equal if self.call_.k1 != n_quanti: raise TypeError("The number of numerics variables must be the same") #check if len are equal if self.call_.k2 != n_quali: raise TypeError("The number of categorics variables must be the same") #test if X contains all active categorics new = [x for x in unique(X_quali.values) if x not in self.call_.dummies.columns] if len(new) > 0: raise ValueError("The following categories are not in the active dataset: "+",".join(new)) #disjunctive table for new individuals dummies = tab_disjunctive(X=X_quali,dummies_cols=self.call_.dummies.columns) #concatenate Xcod = concat((X_quanti,dummies),axis=1) #standardization: Z = (Xcod - mu1 - mu2)/sigma Z = Xcod.sub(self.call_.center,axis=1).sub(self.call_.xc_center,axis=1).div(self.call_.xc_scale,axis=1) #apply transition relation coord = Z.mul(self.call_.var_weights,axis=1).dot(self.svd_.V) coord.columns = ["Can"+str(x+1) for x in range(self.call_.n_components)] return coord