Source code for discrimintools.discriminant_analysis._gfa

# -*- coding: utf-8 -*-
from numpy import average, cov, sqrt, array, ones, zeros, linalg, insert, diff, c_,cumsum,nan,diag,unique
from pandas import Series, DataFrame, concat, get_dummies
from itertools import repeat, chain
from collections import OrderedDict, namedtuple
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

#intern functions
from .functions.utils import check_is_dataframe
from .functions.preprocessing import preprocessing
from .functions.splitmix import splitmix
from .functions.concat_empty import concat_empty
from .functions.gsvd import gsvd
from .functions.tab_disjunctive import tab_disjunctive

[docs] class GFA(TransformerMixin,BaseEstimator): """ General Factor Analysis (GFA) General factor analysis refers to dimensionality reduction technique such as PCA, CA, MCA, FAMD which helps to reduce the number of features in a dataset while keeping the most important informations. For more about generalized factor analysis, see `scientisttools <https://pypi.org/project/scientisttools/>`_. Parameters ---------- n_components : int or None, default = 2 Number of components to keep. If None, keep all the components. Returns ------- call_ : NamedTuple Call informations: - Xtot : DataFrame of shape (n_samples, n_columns) Input data. - X : DataFrame of shape (n_samples, n_features) Training data. - dummies : None or DataFrame Disjunctive data. - k1 : int. Number of numerics columns. - k2 : int. Number of categorical columns. - Z : DataFrame of shape (n_samples, n_vars) Standardize data. - Zc : DataFrame of shape (n_samples, n_vars) Csentered standardize data. - ind_weights : Series of shape (n_samples,) Individuals weights. - var_weights : Series of shape (n_vars,) Variables weights. - center : Series of shape (n_vars,) Mean of variables. - z_center : Series of shape (n_vars,) Mean of recoding variables. - scale : Series of shape (n_vars,) Scale of variables. - denom : Series of shape (n_vars,) Number of variables. - max_components : int. Maximum number of components. - n_components : int: Number of components kept. eig_ : DataFrame of shape (max_components, 4) The eigenvalues, the difference between each eigenvalue, the percentage of variance and the cumulative percentage of variance. ind_ : NamedTuple Individuals informations: - coord : DataFrame of shape (n_samples, n_components) The individuals coordinates. model_ : str, default = 'gfa' The model fitted. svd_ : NamedTuple Generalized singular values decomposition: * vs : 1-D array of shape (max_components,) The singular values. * U : 2-D array of shape (n_samples, n_components) The left singular vectors. * V : 2-D array of shape (n_vars, n_components) The right singular vectors. var_ : NamedTuple Variables informations: - coord : DataFrame of shape (n_vars, n_components) The variables coordinates. See also -------- :class:`~discrimintools.GFALDA` General Factor Analysis Linear Discriminant Analysis (GFALDA) :class:`~discrimintools.MDA` Mixed Discriminant Analysis (MDA) :class:`~discrimintools.MPCA` Mixed Principal Component Analysis (MPCA) :class:`~discrimintools.summaryGFA` Printing summaries of General Factor Analysis model. :class:`~discrimintools.summaryGFALDA` Printing summaries of General Factor Analysis Linear Discriminant Analysis model. :class:`~discrimintools.summaryMDA` Printing summaries of Mixed Discriminant Analysis model. :class:`~discrimintools.summaryMPCA` Printing summaries of Mixed Principal Component Analysis model. References ---------- [1] Bry X. (1996), « Analyses factorielles multiple », Economica. [2] Bry X. (1999), « Analyses factorielles simples », Economica. [3] Escofier B., Pagès J. (2023), « Analyses Factorielles Simples et Multiples », 5ed, Dunod [5] Husson, F., Le, S. and Pages, J. (2010), « Exploratory Multivariate Analysis by Example Using R, Chapman and Hall. [6] Lebart Ludovic, Piron Marie, & Morineau Alain (2006), « `Statistique Exploratoire Multidimensionnelle`_ », Dunod, Paris 4ed. [7] Pagès J. (2013), « Analyse factorielle multiple avec R : Pratique R », EDP sciences [8] Rakotomalala, R. (2020), « `Pratique des Méthodes Factorielles avec Python`_ », Université Lumière Lyon 2. Version 1.0. [8] Saporta Gilbert (2011), « `Probabilités, Analyse des données et Statistiques`_ », Editions TECHNIP, 3ed. [9] Tenenhaus, M. (2006), « Statistique : Méthodes pour décrire, expliquer et prévoir. Dunod. .. _Statistique Exploratoire Multidimensionnelle: https://horizon.documentation.ird.fr/exl-doc/pleins_textes/2023-12/010038111.pdf .. _Pratique des Méthodes Factorielles avec Python: https://hal.science/hal-04868625v1/document .. _Probabilités, Analyse des données et Statistiques: https://en.pdfdrive.to/dl/probabilites-analyses-des-donnees-et-statistiques Examples -------- >>> from discrimintools.datasets import load_alcools, load_canines, load_heart >>> from discrimintools import GFA >>> #principal components analysis (PCA) >>> D = load_alcools("train") # load training data >>> X = D.drop(columns=["TYPE"]) # extract X >>> clf = GFA() >>> clf.fit(X) GFA() >>> #multiple correspondence analysis (MCA) >>> D = load_canines("train") # load training data >>> X = D.drop(columns=["Fonction"]) # extract X >>> clf = GFA() >>> clf.fit(X) GFA() >>> #factor analysis of mixed data (FAMD) >>> D = load_heart("subset") # load subset data >>> X = D.drop(columns=["disease"]) # extract X >>> clf = GFA() >>> clf.fit(X) GFA() """
[docs] def __init__( self,n_components=2 ): self.n_components = n_components
def fit(self,X,y=None): """ Fit the General Factor Analysis Model Parameters ---------- X : DataFrame of shape (n_samples, n_columns) Training data, where ``n_samples`` is the number of samples and ``n_columns`` is the number of columns. y : None y is ignored. Returns ------- self : object Returns the instance itself """ #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #check if X is an instance of class pd.DataFrame #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- check_is_dataframe(X) #make a copy of original data Xtot = X.copy() #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #preprocessing #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- X = preprocessing(X) #split X split_X = splitmix(X=X) #extract all elements X_quanti, X_quali, n_samples, n_quanti, n_quali = split_X.quanti, split_X.quali, split_X.n, split_X.k1, split_X.k2 #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #data preparation #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #individuals weights ind_weights = Series(ones(n_samples)/n_samples,index=X.index,name="weight") #initialize Xcod, center, scale, var_weights, denom, dummies = None, None, None, None, None, None if n_quanti > 0: #compute weighted average and standard deviation center1 = Series(average(X_quanti,axis=0,weights=ind_weights),index=X_quanti.columns,name="center") scale1 = Series([sqrt(cov(X_quanti.iloc[:,i],ddof=0,aweights=ind_weights)) for i in range(n_quanti)],index=X_quanti.columns,name="scale") #numerics variables weights var_weights1 = Series(ones(n_quanti),index=X_quanti.columns,name="scale") #denom denom1 = Series(repeat(n_quanti, n_quanti),index=X_quanti.columns,name="denon") #concatenate Xcod, center, scale = concat_empty(Xcod,X_quanti,axis=1), concat_empty(center,center1,axis=0), concat_empty(scale,scale1,axis=0) var_weights, denom = concat_empty(var_weights,var_weights1,axis=0), concat_empty(denom,denom1,axis=0) if n_quali > 0: #disjunctive table dummies = concat((get_dummies(X_quali[q],dtype=int) for q in X_quali.columns),axis=1) #proportion of levels center2 = Series(array(dummies.mul(ind_weights,axis=0).sum(axis=0)),index=dummies.columns,name="center") #denom denom2 = Series(repeat(n_quali, dummies.shape[1]),index=dummies.columns,name="denon") #concatenate Xcod, center, denom = concat_empty(Xcod,dummies,axis=1), concat_empty(center,center2,axis=0), concat_empty(denom,denom2,axis=0) #if no numerics variables - MCA scaling if n_quanti == 0: #scale scale2 = Series(center2,index=dummies.columns,name="scale") #levels weights var_weights2 = Series([x*y for x,y in zip(center,list(chain(*[repeat(i,k) for i, k in zip(ones(n_quali)/n_quali,[X_quali[q].nunique() for q in X_quali.columns])])))],index=dummies.columns,name="weight") #if numerics variables - FAMD scaling else: #scale scale2 = Series(sqrt(center2),index=dummies.columns,name="scale") #levels weights var_weights2 = Series(ones(dummies.shape[1]),index=dummies.columns,name="weight") #concatenate scale, var_weights = concat_empty(scale,scale2,axis=0), concat_empty(var_weights,var_weights2,axis=0) #standardization: Z = (X - center)/scale Z = Xcod.sub(center,axis=1).div(scale,axis=1) #center if both numerics and categorics features z_center = Series(zeros(Z.shape[1]),index=Z.columns,name="center") if all(x > 0 for x in [n_quanti, n_quali]): z_center = Series(average(Z,axis=0,weights=ind_weights),index=Z.columns,name="center") Zc = Z.sub(z_center,axis=1) #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #set number of components #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #QR decomposition (to set maximum number of components) Q, R = linalg.qr(Zc) max_components = int(min(linalg.matrix_rank(Q), linalg.matrix_rank(R), n_samples - 1, Zc.shape[1] - n_quali)) #set number of components if self.n_components is None: n_components = max_components elif not isinstance(self.n_components,int): raise ValueError("'n_components' must be an integer.") elif self.n_components < 1: raise ValueError("'n_components' must be equal or greater than 1.") else: n_components = min(self.n_components,max_components) #store call informations call_ = OrderedDict(Xtot=Xtot,X=X,dummies=dummies,Xcod=Xcod,k1=n_quanti,k2=n_quali,Z=Z,Zc=Zc,ind_weights=ind_weights,var_weights=var_weights, center=center,z_center=z_center,scale=scale,denom=denom,max_components=max_components,n_components=n_components) #convert to namedtuple self.call_ = namedtuple("call",call_.keys())(*call_.values()) #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #fit generalized singular values decomposition #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #fit generalized singular values decomposition (GSVD) self.svd_ = gsvd(X=Zc,row_weights=ind_weights,col_weights=var_weights,n_components=n_components) #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #eigen values informations #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- eigen_values = self.svd_.vs[:max_components]**2 difference, proportion = insert(-diff(eigen_values),len(eigen_values)-1,nan), 100*eigen_values/sum(eigen_values) #convert to DataFrame self.eig_ = DataFrame(c_[eigen_values,difference,proportion,cumsum(proportion)],columns=["Eigenvalue","Difference","Proportion (%)","Cumulative (%)"],index = ["Can"+str(x+1) for x in range(max_components)]) #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #coordinates for the individuals #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #coordinates for the individuals ind_coord = DataFrame(self.svd_.U.dot(diag(self.svd_.vs[:n_components])),index=Z.index,columns=["Can"+str(x+1) for x in range(n_components)]) #convert to ordered dictionary ind_ = OrderedDict(coord=ind_coord) #convert to NamedTuple self.ind_ = namedtuple("ind",ind_.keys())(*ind_.values()) #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #coordinates for variables #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #coordinates for the variables var_coord = DataFrame(self.svd_.V.dot(diag(self.svd_.vs[:n_components])),index=Z.columns,columns=["Can"+str(x+1) for x in range(n_components)]) #update levels if both numerics and categorics variables if all(x > 0 for x in [n_quanti, n_quali]): var_coord.iloc[n_quanti:,:] = var_coord.iloc[n_quanti:,:].div(sqrt(center2),axis=0).mul(self.svd_.vs[:n_components],axis=1) #convert to ordered dictionary var_ = OrderedDict(coord=var_coord) #convert to NamedTuple self.var_ = namedtuple("var",var_.keys())(*var_.values()) #set model name self.model_ = "gfa" return self def fit_transform(self,X,y=None): """ Fit the model with X and apply the dimensionality reduction on X Parameters ---------- X : DataFrame of shape (n_samples, n_columns) Training data, where ``n_samples`` is the number of samples and ``n_columns`` is the number of columns. y : None y is ignored. Returns ------- X_new : DataFrame of shape (n_samples, n_components) Transformed values, where ``n_components`` is the number of components. """ self.fit(X,y) return self.ind_.coord def transform(self,X): """ Apply the dimensionality reduction on X X is projected on the principal components previously extracted from a training set. Parameters ---------- X : Dataframe of shape (n_samples, n_columns) New data, where ``n_samples`` is the number of samples and ``n_columns`` is the number of columns. Returns ------- X_new : Dataframe of shape (n_samples, n_components) Projection of X in the principal components, where ``n_components`` is the number of components. """ #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #check if the estimator is fitted by verifying the presence of fitted attributes #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- check_is_fitted(self) #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- #check if X is an instance of class pd.DataFrame #--------------------------------------------------------------------------------------------------------------------------------------------------------------------- check_is_dataframe(X) #set index name as None X.index.name = None #check if X contains active columns if not set(self.call_.X.columns).issubset(X.columns): raise ValueError("The names of the columns is not the same as the ones in the active columns of the GFA result") #select active columns X = X[self.call_.X.columns] #split X split_X = splitmix(X) X_quanti, X_quali, n_quanti, n_quali = split_X.quanti, split_X.quali, split_X.k1, split_X.k2 #create code variables Xcod = None #check if numerics variables if X_quanti is not None : if self.call_.k1 != n_quanti: raise TypeError("The number of numerics variables must be the same") #concatenate Xcod = concat_empty(Xcod,X_quanti,axis=1) #check if categorics variables if X_quali is not None: if self.call_.k2 != n_quali: raise TypeError("The number of categorics variables must be the same") #test if X contains all active categorics new = [x for x in unique(X_quali.values) if x not in self.call_.dummies.columns] if len(new) > 0: raise ValueError("The following categories are not in the active dataset: "+",".join(new)) #disjunctive table for new individuals dummies = tab_disjunctive(X=X_quali,dummies_cols=self.call_.dummies.columns) #concatenate Xcod = concat_empty(Xcod,dummies,axis=1) #standardization : Z = (x - center)/scale - z_center Z = Xcod.sub(self.call_.center,axis=1).div(self.call_.scale,axis=1).sub(self.call_.z_center,axis=1) #apply transition relation coord = Z.mul(self.call_.var_weights,axis=1).dot(self.svd_.V) coord.columns = ["Can"+str(x+1) for x in range(self.call_.n_components)] return coord