# Source code for discrimintools.discriminant_analysis._gfalda

# -*- coding: utf-8 -*-
from numpy import log
from pandas import DataFrame, concat
from collections import OrderedDict, namedtuple
from sklearn.pipeline import Pipeline
from scipy.spatial.distance import pdist,squareform
from sklearn.utils.validation import check_is_fitted

#internal imports
from ._base import _BaseDA
from ._gfa import GFA
from ._discrim import DISCRIM

class GFALDA(_BaseDA):
    """
    General Factor Analysis Linear Discriminant Analysis (GFALDA)

    Performs a linear discriminant analysis on principal components. It is a classical linear discriminant analysis (LDA)
    carried out on the principal components of a general factor analysis (GFA) of the explanatory variables.

    General factor analysis linear discriminant analysis (GFALDA) consists of two steps:

    1. Computation of the general factor analysis (GFA) of the explanatory variables:

        - if all features are numeric, GFA is a principal component analysis (PCA),
        - if all features are categorical, GFA is a multiple correspondence analysis (MCA),
        - if features are mixed, GFA is a factor analysis of mixed data (FAMD).

    2. Computation of the linear discriminant analysis (LDA) on the principal components extracted in step 1.

    Parameters
    ----------
    n_components : int or None, default = 2
        Number of components to keep. If ``None``, keep all the components.

    priors : str, 1-D array or Series of shape (n_classes,), default = None
        The priors statement specifies the class prior probabilities of group membership, possible values:

        * 'equal' to set the prior probabilities equal.
        * 'prop' to set the prior probabilities proportional to the sample sizes.
        * 1-D array or Series which specifies the prior probability for each level of the classification variable.

    classes : None, tuple or list, default = False
        Names of the levels, in the order they should be returned. The value is forwarded unchanged to the LDA step.
        If ``None``, classes are sorted in unique values in y.
        NOTE(review): the signature default is ``False`` although ``None`` is the documented "not provided" value;
        confirm that the LDA step treats ``False`` like ``None``.

    Attributes
    ----------
    call_ : NamedTuple
        Call informations:

        - Xtot : DataFrame of shape (n_samples, n_columns)
            Input data (the ``X`` passed to ``fit``).
        - X : DataFrame of shape (n_samples, n_features)
            Training data (the ``X`` passed to ``fit``).
        - y : Series of shape (n_samples,)
            Target values. True values for ``X``.
        - target : str
            Name of target.
        - features : list
            Names of features seen during ``fit``.
        - classes : list
            Names of classes.
        - priors : Series of shape (n_classes,)
            Priors probabilities.
        - n_samples : int
            Number of samples.
        - n_features : int
            Number of features.
        - n_classes : int
            Number of target values.
        - max_components : int
            Maximum number of components.
        - n_components : int
            Number of components kept.

    cancoef_ : NamedTuple
        Canonical coefficients:

        - standardized : DataFrame of shape (n_variables, n_components)
            The standardized canonical coefficients.
        - raw : DataFrame of shape (n_variables+1, n_components)
            The raw canonical coefficients (first row is the constant).
        - projection : DataFrame of shape (n_variables, n_components)
            The projection canonical coefficients.

    classes_ : NamedTuple
        Classes informations:

        - infos : class informations taken as-is from the fitted LDA step.
        - coord : DataFrame of shape (n_classes, n_components)
            Class coordinates (mean of the individuals coordinates per class).
        - eucl : DataFrame of shape (n_classes, n_classes)
            The squared Euclidean distance between class centers.
        - gen : DataFrame of shape (n_classes, n_classes)
            The generalized squared distance between classes (``eucl - 2*log(priors)``).

    coef_ : NamedTuple
        Linear discriminant coefficients:

        - standardized : DataFrame of shape (n_variables+1, n_classes)
            The standardized coefficients (first row is the LDA intercept).
        - raw : DataFrame of shape (n_variables+1, n_classes)
            The raw coefficients (first row is the constant).
        - projection : DataFrame of shape (n_variables+1, n_classes)
            The projection coefficients (first row is the LDA intercept).

    ind_ : NamedTuple
        Individuals informations:

        - coord : DataFrame of shape (n_samples, n_components)
            Individuals coordinates.
        - scores : DataFrame of shape (n_samples, n_classes)
            The scores of individuals (computed with the raw coefficients).
        - projection : DataFrame of shape (n_samples, n_classes)
            The projection of individuals (computed with the projection coefficients).
        - eucl : DataFrame of shape (n_samples, n_classes)
            The squared Euclidean distance to each class center.
        - gen : DataFrame of shape (n_samples, n_classes)
            The generalized squared distance to each class (``eucl - 2*log(priors)``).

    model_ : str, default = 'gfalda'
        The model fitted.

    pipe_ : a sequence of data transformers with two named_steps :

        - gfa : general factor analysis (GFA)
        - lda : linear discriminant analysis (LDA)

    See also
    --------
    :class:`~discrimintools.GFA`
        General Factor Analysis (GFA)
    :class:`~discrimintools.MDA`
        Mixed Discriminant Analysis (MDA)
    :class:`~discrimintools.MPCA`
        Mixed Principal Component Analysis (MPCA)
    :class:`~discrimintools.summaryGFA`
        Printing summaries of General Factor Analysis model.
    :class:`~discrimintools.summaryMDA`
        Printing summaries of Mixed Discriminant Analysis model.
    :class:`~discrimintools.summaryMPCA`
        Printing summaries of Mixed Principal Component Analysis model.

    References
    ----------
    [1] Ricco Rakotomalala (2020), « `Pratique de l'Analyse Discriminante Linéaire`_ », Université Lumière Lyon 2, Version 1.0.

    .. _Pratique de l'Analyse Discriminante Linéaire: https://hal.science/hal-04868585v1/file/Pratique_Analyse_Discriminante_Lineaire.pdf

    Examples
    --------
    >>> from discrimintools.datasets import load_alcools, load_vote, load_heart
    >>> from discrimintools import GFALDA
    >>> #PCA + LDA = PCALDA
    >>> D = load_alcools("train")
    >>> y, X = D["TYPE"], D.drop(columns=["TYPE"])
    >>> clf = GFALDA()
    >>> clf.fit(X,y)
    GFALDA()
    >>> #MCA + LDA = DISQUAL
    >>> D = load_vote("train")
    >>> y, X = D["group"], D.drop(columns=["group"])
    >>> clf = GFALDA()
    >>> clf.fit(X,y)
    GFALDA()
    >>> #FAMD + LDA = DISMIX
    >>> D = load_heart("subset")
    >>> y, X = D["disease"], D.drop(columns=["disease"])
    >>> clf = GFALDA()
    >>> clf.fit(X,y)
    GFALDA()
    """
    def __init__(
            self, n_components = 2, priors = None, classes = False
    ):
        # hyper-parameters are stored untouched; they are only consumed in ``fit``
        self.n_components = n_components
        self.priors = priors
        self.classes = classes

    def decision_function(self,X) -> DataFrame:
        """
        Apply decision function to an input data

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.

        Returns
        -------
        C : DataFrame of shape (n_samples, n_classes)
            Decision function values related to each class, per sample (as returned by the fitted pipeline).
        """
        #----------------------------------------------------------------------------------------------------------------
        #check if the estimator is fitted by verifying the presence of fitted attributes
        #----------------------------------------------------------------------------------------------------------------
        check_is_fitted(self)
        return self.pipe_.decision_function(X)

    def fit(self,X,y):
        """
        Fit the General Factor Analysis Linear Discriminant Analysis Model

        Fits a two-step pipeline (GFA, then linear DISCRIM) on ``(X, y)`` and derives from the two fitted steps the
        canonical coefficients, the classification functions expressed on the original variables, and the
        individuals/classes informations stored in ``call_``, ``cancoef_``, ``coef_``, ``ind_`` and ``classes_``.

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Training Data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.

        y : Series of shape (n_samples,)
            Target values. True labels for ``X``.

        Returns
        -------
        self : object
            Fitted estimator
        """
        #----------------------------------------------------------------------------------------------------------------
        #discriminant analysis on principal components (DAPC)
        #----------------------------------------------------------------------------------------------------------------
        self.pipe_ = Pipeline([("gfa",GFA(n_components=self.n_components)),
                               ("lda",DISCRIM(method="linear",priors=self.priors,classes=self.classes,warn_message=False))]).fit(X, y)

        #extract separate fitted models
        gfa, clf = self.pipe_["gfa"], self.pipe_["lda"]

        #convert to ordered dictionary (Xtot and X both reference the training data passed to fit)
        call_ = OrderedDict(Xtot=X,X=X,y=y,target=clf.call_.target,features=list(X.columns),classes=clf.call_.classes,priors=clf.call_.priors,
                            n_samples=clf.call_.n_samples,n_features=clf.call_.n_features,n_classes=clf.call_.n_classes,
                            max_components = gfa.call_.max_components,n_components=gfa.call_.n_components)
        #convert to namedtuple
        self.call_ = namedtuple("call",call_.keys())(*call_.values())

        #----------------------------------------------------------------------------------------------------------------
        #canonical discriminant coefficients
        #----------------------------------------------------------------------------------------------------------------
        #total-sample standardized canonical coefficients
        std_cancoef = DataFrame(gfa.svd_.V,index=gfa.var_.coord.index,columns=gfa.var_.coord.columns)
        #total-sample raw (unstandardized) canonical coefficients: a "Constant" row stacked on top of the scaled coefficients
        raw_cancoef = concat((std_cancoef.T.dot(-(gfa.call_.center/gfa.call_.scale + gfa.call_.z_center)).to_frame("Constant").T,
                              std_cancoef.div(gfa.call_.scale,axis=0)),axis=0)
        #projection function coefficients (no constant row)
        proj_cancoef = gfa.var_.coord.div(gfa.call_.denom,axis=0).div(gfa.svd_.vs[:gfa.call_.n_components],axis=1)
        #convert to ordered dictionary
        cancoef_ = OrderedDict(standardized=std_cancoef,raw=raw_cancoef,projection=proj_cancoef)
        #convert to namedtuple
        self.cancoef_ = namedtuple("cancoef",cancoef_.keys())(*cancoef_.values())

        #----------------------------------------------------------------------------------------------------------------
        #classification functions
        #----------------------------------------------------------------------------------------------------------------
        #standardized coefficients: first row of the LDA coefficients on top of the back-projected component coefficients
        std_coef = concat((clf.coef_.iloc[0,:].to_frame().T,std_cancoef.dot(clf.coef_.iloc[1:,:])),axis=0)
        #raw coefficients (row 0 is the canonical constant combined with the LDA component coefficients)
        raw_coef = raw_cancoef.dot(clf.coef_.iloc[1:,:])
        #add constant to canonical constant
        raw_coef.iloc[0,:] = raw_coef.iloc[0,:].to_frame().T.add(clf.coef_.iloc[0,:].to_frame().T,axis=1)
        #using projection
        proj_coef = concat((clf.coef_.iloc[0,:].to_frame().T,proj_cancoef.dot(clf.coef_.iloc[1:,:])),axis=0)
        #convert to ordered dictionary
        coef_ = OrderedDict(standardized=std_coef,raw=raw_coef,projection=proj_coef)
        #convert to namedtuple
        self.coef_ = namedtuple("coef",coef_.keys())(*coef_.values())

        #----------------------------------------------------------------------------------------------------------------
        ##Individuals informations
        #----------------------------------------------------------------------------------------------------------------
        #individuals coordinates
        ind_coord = gfa.ind_.coord
        #score with unstandardized coefficients
        #NOTE(review): gfa.call_.Xcod is presumably the coded training matrix used by GFA - confirm
        ind_scores = gfa.call_.Xcod.dot(raw_coef.iloc[1:,:]).add(raw_coef.iloc[0,:].values,axis=1)
        #score with projection coefficients
        ind_proj = gfa.call_.Xcod.dot(proj_coef.iloc[1:,:]).add(proj_coef.iloc[0,:].values,axis=1)

        #----------------------------------------------------------------------------------------------------------------
        #classes informations: coordinates, squared euclidean distance and squared generalized distance
        #----------------------------------------------------------------------------------------------------------------
        #classes coordinates (mean of the individuals coordinates within each class)
        class_coord = concat((ind_coord.loc[y[y==k].index,:].mean(axis=0).to_frame(k) for k in clf.call_.classes),axis=1).T
        #squared euclidean distance between classes
        class_eucl = DataFrame(squareform(pdist(class_coord,metric="sqeuclidean")),index=clf.call_.classes,columns=clf.call_.classes)
        #squared generalized distance
        class_gen = class_eucl.sub(2*log(clf.call_.priors),axis=1)
        #convert to ordered dictionary
        classes_ = OrderedDict(infos=clf.classes_.infos,coord=class_coord,eucl=class_eucl,gen=class_gen)
        #convert to namedtuple
        self.classes_ = namedtuple("classes",classes_.keys())(*classes_.values())

        #----------------------------------------------------------------------------------------------------------------
        #individuals additional informations: squared euclidean distance to classes barycenters and squared generalized distance
        #----------------------------------------------------------------------------------------------------------------
        #squared euclidean distance to class center
        ind_eucl = concat((ind_coord.sub(class_coord.loc[k,:],axis=1).pow(2).sum(axis=1).to_frame(k) for k in clf.call_.classes),axis=1)
        #squared generalized distance
        ind_gen = ind_eucl.sub(2*log(clf.call_.priors),axis=1)
        #convert to ordered dictionary
        ind_ = OrderedDict(coord=ind_coord,scores=ind_scores,projection=ind_proj,eucl=ind_eucl,gen=ind_gen)
        #convert to namedtuple
        self.ind_ = namedtuple("ind",ind_.keys())(*ind_.values())

        self.model_ = "gfalda"

        return self

    def fit_transform(self,X,y):
        """
        Fit to data, then transform it

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Training Data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.

        y : Series of shape (n_samples,)
            Target values. True labels for ``X``.

        Returns
        -------
        X_new : DataFrame of shape (n_samples, n_classes)
            Transformed data, where ``n_classes`` is the number of classes. These are the training scores
            stored in ``ind_.scores``.
        """
        self.fit(X,y)
        return self.ind_.scores

    def transform(self,X):
        """
        Project data to maximize class separation

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            New data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.

        Returns
        -------
        X_new : DataFrame of shape (n_samples, n_classes)
            Transformed data, where ``n_classes`` is the number of classes. This is the output of the fitted
            pipeline's ``transform``.
            NOTE(review): ``fit_transform`` returns ``ind_.scores`` instead of going through the pipeline; confirm
            both give the same values on the training data.
        """
        #----------------------------------------------------------------------------------------------------------------
        #check if the estimator is fitted by verifying the presence of fitted attributes
        #----------------------------------------------------------------------------------------------------------------
        check_is_fitted(self)
        return self.pipe_.transform(X)