from numpy import average, cov, sqrt, array, ones, linalg, insert, diff, c_,cumsum,nan,diag,dot, unique
from pandas import Series, DataFrame, concat
from itertools import repeat, chain
from collections import OrderedDict, namedtuple
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
#intern functions
from .functions.utils import check_is_dataframe
from .functions.preprocessing import preprocessing
from .functions.splitmix import splitmix
from .functions.recodecat import recodecat
from .functions.gsvd import gsvd
from .functions.tab_disjunctive import tab_disjunctive
[docs]
class MPCA(TransformerMixin,BaseEstimator):
"""
Mixed Principal Components Analysis (MPCA)
Mixed Principal Components Analysis (MPCA) is a standardized principal component analysis of both quantitative variables and the transformation
of the dummy variables associated to qualitative variables on quantitative variables through orthogonal projections of configurations of statistical units in the
individual-space with a relational inner product. For more, see [1]_.
Parameters
----------
n_components : int or None
Number of components to keep. If ``None``, keep all the components.
Returns
-------
call_ : NamedTuple
Call informations:
- Xtot : DataFrame of shape (n_samples, n_colums)
Input data.
- X : DataFrame of shape (n_samples, columns)
Processing data.
- dummies : DataFrame of shape (n_samples, n_categories)
Disjunctive data.
- Xcod : DataFrame of shape (n_samples, n_vars)
Training data.
- Xc : DataFrame of shape (n_samples, n_vars)
Centered recode data.
- Z : DataFrame of shape (n_samples, n_vars)
Standardize recode data.
- center : Series of shape (n_vars,)
Average of recode data.
- xc_center : Series of shape (n_vars,)
Average of centered recode data.
- xc_scale : Series of shape (n_vars,)
Standard deviation of the centered recode data.
- k1 : int
Number of numerics columns.
- k2 : int
Number of categorical.
- ind_weights : Series of shape (n_samples,)
Individuals weights.
- var_weights : Series of shape (n_vars,)
Columns weights.
- denom : Series of shape (n_vars,)
number of variables.
- max_components : int
Maximum number of components.
- n_components : int
Number of components kept.
eig_ : DataFrame of shape (max_components, 4)
The eigenvalues, the difference between each eigenvalue, the percentage of variance and the cumulative percentage of variance.
ind_ : NamedTuple
Individuals informations:
- coord : DataFrame of shape (n_samples, n_components)
The individuals coordinates.
model_ : str, defaut = 'mpca'
The model fitted.
svd_ : NamedTuple
Generalized singular values decomposition:
- vs : 1-D array of shape (max_components,)
The singular values.
- U : 2-D array of shape (n_samples, n_components)
The left singular vectors.
- V : 2-D array of shape (n_vars, n_components)
The right singular vectors.
var_ : NamedTuple
Variables informations:
- coord : DataFrame of shape (n_vars, n_components)
The variables coordinates.
See also
--------
:class:`~discrimintools.GFA`
General Factor Analysis (GFA)
:class:`~discrimintools.GFALDA`
General Factor Analysis Linear Discriminant Analysis (GFALDA)
:class:`~discrimintools.MDA
Mixed Discriminant Analysis (MDA)
:class:`~discrimintools.summaryGFA`
Printing summaries of General Factor Analysis model.
:class:`~discrimintools.summaryGFALDA`
Printing summaries of General Factor Analysis Linear Discriminant Analysis model.
:class:`~discrimintools.summaryMDA`
Printing summaries of Mixed Discriminant Analysis model.
:class:`~discrimintools.summaryMPCA`
Printing summaries of Mixed Principal Component Analysis model.
References
----------
.. [1] Abdesselam R. (2006), « `Analyse en Composantes Principales Mixtes`_ », CREM UMR CNRS 6211
[2] Bry X. (1996), « Analyses factorielles multiple », Economica
[3] Bry X. (1999), « Analyses factorielles simples », Economica
[4] Escofier B., Pagès J. (2023), « Analyses Factorielles Simples et Multiples », 5ed, Dunod
[5] Saporta Gilbert (2011), « `Probabilités, Analyse des données et Statistiques`_ », Editions TECHNIP, 3ed.
[6] Husson, F., Le, S. and Pages, J. (2010), « Exploratory Multivariate Analysis by Example Using R », Chapman and Hall.
[7] Lebart Ludovic, Piron Marie, & Morineau Alain (2006), « `Statistique Exploratoire Multidimensionnelle`_ », Dunod, Paris 4ed.
[8] Pagès J. (2013), « Analyse factorielle multiple avec R : Pratique R », EDP sciences
[9] Rakotomalala, R. (2020), « `Pratique des Méthodes Factorielles avec Python`_ », Université Lumière Lyon 2. Version 1.0.
[10] Tenenhaus, M. (2006), « Statistique : Méthodes pour décrire, expliquer et prévoir », Dunod.
.. _Analyse en Composantes Principales Mixtes: https://www.researchgate.net/profile/Rafik-Abdesselam/publication/5087866_Analyse_en_composantes_principales_mixte/links/0c960525d0d312c1b2000000/Analyse-en-composantes-principales-mixte.pdf
.. _Statistique Exploratoire Multidimensionnelle: https://horizon.documentation.ird.fr/exl-doc/pleins_textes/2023-12/010038111.pdf
.. _Pratique des Méthodes Factorielles avec Python: https://hal.science/hal-04868625v1/document
.. _Probabilités, Analyse des données et Statistiques: https://en.pdfdrive.to/dl/probabilites-analyses-des-donnees-et-statistiques
Examples
--------
>>> from discrimintools.datasets import load_heart
>>> from discrimintools import MPCA
>>> D = load_heart("subset")
>>> X = D.drop(columns=["disease"])
>>> clf = MPCA()
>>> clf.fit(X)
MPCA()
"""
[docs]
def __init__(
self,n_components=2
):
self.n_components = n_components
def fit(self,X,y=None):
"""
Fit the Mixed Principal Component Analysis Model
Parameters
----------
X : DataFrame of shape (n_samples, n_columns)
Training data, where ``n_samples`` is the number of samples and ``n_columns`` is the number of columns.
y : None
y is ignored.
Returns
-------
self : object
Returns the instance itself
"""
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if X is an instance of class pd.DataFrame
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
check_is_dataframe(X)
#make a copy of original data
Xtot = X.copy()
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#preprocessing
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
X = preprocessing(X)
#split X
split_X = splitmix(X=X)
#extract all elements
X_quanti, X_quali, n_samples, n_quanti, n_quali = split_X.quanti, split_X.quali, split_X.n, split_X.k1, split_X.k2
#check if mixed data
if any(x == 0 for x in [n_quanti, n_quali]):
raise TypeError("MPCA require mixed data.")
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#data preparation
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#set individuals weights
ind_weights = Series(ones(n_samples)/n_samples,index=X.index,name="weight")
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#center numerics variables
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#compute weighted average mean and standard deviation
center1 = Series(average(X_quanti,axis=0,weights=ind_weights),index=X_quanti.columns,name="center")
#center quantitatives variables
X1c = X_quanti.sub(center1,axis=1)
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#treatment of categorics variables
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#recode categorical variables
rec = recodecat(X=X_quali)
X_quali, dummies = rec.X, rec.dummies
#covariance matrix between X and between X and Y
Vx, Vylx = cov(X_quanti,rowvar=False,ddof=0,aweights=ind_weights), dummies.T.dot(diag(ind_weights)).dot(X1c)
#compute the mean
center2 = Series(dot(dot(dot(dot(Vylx,linalg.pinv(Vx,hermitian=True)),X1c.T),diag(ind_weights)),ones(n_samples)).T[0],index=dummies.columns,name="center")
#center the disjunctive table
X2c = dummies.sub(center2,axis=1)
#concatenate
Xcod, Xc, center = concat((X_quanti,dummies),axis=1), concat((X1c,X2c),axis=1), concat((center1,center2),axis=0)
#variables weights
var_weights = Series(ones(Xc.shape[1]),index=Xc.columns,name="weight")
#denom
denom = Series(list(chain(*[repeat(i,k) for i, k in zip([n_quanti, n_quali],[n_quanti, dummies.shape[1]])])),index=Xc.columns,name="denom")
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#standardize Z = (xc - mu)/sigma
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#compute weighted average and standard deviation
xc_center = Series(average(Xc,axis=0,weights=ind_weights),index=Xc.columns,name="weight")
xc_scale = Series(array([sqrt(cov(Xc.iloc[:,i],ddof=0,aweights=ind_weights)) for i in range(Xc.shape[1])]),index=Xc.columns,name="weight")
#standardization: Z = (xc - mu)/sigma
Z = Xc.sub(xc_center,axis=1).div(xc_scale,axis=1)
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#set number of components
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#QR decomposition (to set maximum number of components)
Q, R = linalg.qr(Z)
max_components = int(min(linalg.matrix_rank(Q),linalg.matrix_rank(R), n_samples - 1, Z.shape[1] - n_quali))
#set number of components
if self.n_components is None:
n_components = max_components
elif not isinstance(self.n_components,int):
raise TypeError("'n_components' must be an integer.")
elif self.n_components < 1:
raise ValueError("'n_components' must be equal or greater than 1.")
else:
n_components = min(self.n_components,max_components)
#store call informations
call_ = OrderedDict(Xtot=Xtot,X=X,dummies=dummies,Xcod=Xcod,Xc=Xc,Z=Z,center=center,xc_center=xc_center,xc_scale=xc_scale,k1=n_quanti,k2=n_quali,ind_weights=ind_weights,var_weights=var_weights,
denom=denom,max_components=max_components,n_components=n_components)
#convert to namedtuple
self.call_ = namedtuple("call",call_.keys())(*call_.values())
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#fit generalized singular values decomposition
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#fit generalized singular values decomposition (GSVD)
self.svd_ = gsvd(X=Z,row_weights=ind_weights,col_weights=var_weights,n_components=n_components)
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#eigen values informations
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
eigen_values = self.svd_.vs[:max_components]**2
difference, proportion = insert(-diff(eigen_values),len(eigen_values)-1,nan), 100*eigen_values/sum(eigen_values)
#convert to DataFrame
self.eig_ = DataFrame(c_[eigen_values,difference,proportion,cumsum(proportion)],columns=["Eigenvalue","Difference","Proportion (%)","Cumulative (%)"],index = ["Can"+str(x+1) for x in range(max_components)])
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#coordinates for the individuals
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#coordinates for the individuals
ind_coord = DataFrame(self.svd_.U.dot(diag(self.svd_.vs[:n_components])),index=Z.index,columns=["Can"+str(x+1) for x in range(n_components)])
#convert to ordered dictionary
ind_ = OrderedDict(coord=ind_coord)
#convert to namedtuple
self.ind_ = namedtuple("ind",ind_.keys())(*ind_.values())
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#coordinates for the variables
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#coordinates for the variables
var_coord = DataFrame(self.svd_.V.dot(diag(self.svd_.vs[:n_components])),index=Z.columns,columns=["Can"+str(x+1) for x in range(n_components)])
#convert to ordered dictionary
var_ = OrderedDict(coord=var_coord)
#convert to namedtuple
self.var_ = namedtuple("var",var_.keys())(*var_.values())
#set model name
self.model_ = "mpca"
return self
def fit_transform(self,X,y=None):
"""
Fit the model with X and apply the dimensionality reduction on X
Parameters
----------
X : DataFrame of shape (n_samples, n_columns)
Training data, where ``n_samples`` is the number of samples and ``n_columns`` is the number of columns.
y : None.
y is ignored.
Returns
-------
X_new : DataFrame of shape (n_samples, n_components)
Transformed values, where ``n_components`` is the number of components.
"""
self.fit(X,y)
return self.ind_.coord
def transform(self,X):
"""
Apply the dimensionality reduction on X
X is projected on the principal components previously extracted from a training set.
Parameters
----------
X : Dataframe of shape (n_samples, n_columns)
New data, where ``n_samples`` is the number of samples and ``n_columns`` is the number of columns.
Returns
-------
X_new : Dataframe of shape (n_samples, n_components)
Projection of X in the principal components where ``n_components`` is the number of the components.
"""
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if the estimator is fitted by verifying the presence of fitted attributes
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
check_is_fitted(self)
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if X is an instance of class pd.DataFrame
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
check_is_dataframe(X)
#set index name as None
X.index.name = None
#check if X contains active columns
if not set(self.call_.X.columns).issubset(X.columns):
raise ValueError("The names of the columns is not the same as the ones in the active columns of the MPCA result")
#select active columns
X = X[self.call_.X.columns]
#split X
split_X = splitmix(X)
X_quanti, X_quali, n_quanti, n_quali = split_X.quanti, split_X.quali, split_X.k1, split_X.k2
#check if mixed data
if any(x == 0 for x in [n_quanti, n_quali]):
raise TypeError("MPCA require mixed data.")
#check if len are equal
if self.call_.k1 != n_quanti:
raise TypeError("The number of numerics variables must be the same")
#check if len are equal
if self.call_.k2 != n_quali:
raise TypeError("The number of categorics variables must be the same")
#test if X contains all active categorics
new = [x for x in unique(X_quali.values) if x not in self.call_.dummies.columns]
if len(new) > 0:
raise ValueError("The following categories are not in the active dataset: "+",".join(new))
#disjunctive table for new individuals
dummies = tab_disjunctive(X=X_quali,dummies_cols=self.call_.dummies.columns)
#concatenate
Xcod = concat((X_quanti,dummies),axis=1)
#standardization: Z = (Xcod - mu1 - mu2)/sigma
Z = Xcod.sub(self.call_.center,axis=1).sub(self.call_.xc_center,axis=1).div(self.call_.xc_scale,axis=1)
#apply transition relation
coord = Z.mul(self.call_.var_weights,axis=1).dot(self.svd_.V)
coord.columns = ["Can"+str(x+1) for x in range(self.call_.n_components)]
return coord