Source code for discrimintools.discriminant_analysis._dica

# -*- coding: utf-8 -*-
from numpy import array, linalg, c_, log, sqrt, unique, insert, diff, nan, cumsum, diag
from pandas import Series, CategoricalDtype, concat, DataFrame
from pandas.api.types import is_string_dtype
from collections import OrderedDict, namedtuple
from scipy.spatial.distance import pdist,squareform
from sklearn.utils.validation import check_is_fitted

#interns functions
from ._base import _BaseDA
from .functions.utils import check_is_dataframe, check_is_series
from .functions.preprocessing import preprocessing
from .functions.gsvd import gsvd
from .functions.univ_test import univ_test
from .functions.tab_disjunctive import tab_disjunctive


[docs]
class DiCA(_BaseDA):
    """
    Discriminant Correspondence Analysis (DiCA)
    
    Performs Discriminant Correspondence Analysis to classify each observation into groups
    
    This component performs a canonical discriminant analysis (CANDISC) where we want to characterize the groups of individuals (described by a discrete target attribute) from a set of discrete descriptors.
    This approach is based on a correspondence analysis (CA) on an overall crosstab which os a concatenation of individual crosstabs between the target attribute with each predictive attribute (see [4]_).
    We obtain factor scores both for values of the target attribute and the input ones which enable to explain the relationship between the variables. We obtain also a factor score coefficients which enable to calculate the coordinates 
    of new individuals from their indicator vector description.
    
    Parameters
    ----------
    n_components : int, default = 2
        Number of components to keep. If None, keep all the components.
    
    classes : None, tuple or list, default = None
        Name of level in order to return. If None, classes are sorted in unique values in y.

    Returns
    -------
    call_ : NamedTuple
        Call informations:

        - Xtot : DataFrame of shape (n_samples, n_columns)
            Input data.
        - X : DataFrame of shape (n_samples, n_features)
            Training data.
        - y : Series of shape (n_samples,)
            Target values. True values for ``X``.
        - target : str
            Name of target.
        - features : list
            Names of features seen during ``fit``.
        - classes : list
            Names of classes.
        - priors : Series of shape (n_classes,)
            Priors probabilities
        - n_samples : int
            Number of samples.
        - n_features : int
            Number of features.
        - n_classes : int
            Number of target values.
        - N : DataFrame of shape (n_classes,  n_categories)
            The contingence table for correspondence analysis.
        - Z : DataFrame of shape (n_classes, n_categories)
            The standardized data.
        - total : int
            The total size : ``total_size = n_samples * n_features``
        - row_marge : Series of shape (n_classes,)
            The row margins of frequencies table.
        - col_marge : Series of shape (n_categories,)
            The column margins of frequencies table.
        - max_components : int
            Maximum number of components.
        - n_components : int
            Number of components kept.

    cancoef_ : NamedTuple 
        Coefficients of discriminant correspondence analysis:

        - standardize : DataFrame of shape (n_categories, n_components)
            The standardized coefficients.
        - projection : DataFrame of shape (n_categories, n_components)
            The projection coefficients.

    cancorr_ : DataFrame of shape (2, 4)
        The canonical correlations test.

    classes_ : NamedTuple 
        Classes informations:

        - infos : DataFrame of shape (n_classes, 3)
            class level information (frequency, proportion, prior probability).
        - coord : DataFrame of shape (n_classes, n_components)
            The class coordinates.
        - eucl : DataFrame of shape (n_classes, n_classes)
            The squared Euclidean distances between classes.
        - gen : DataFrame of shape (n_classes, n_classes)
            The squared generalized distances between classes.

    eig_ : DataFrame of shape (n_components, 4)
        The eigenvalues, the difference between each eigenvalue, the percentage of variance and the cumulative percentage of variance    
    
    ind_ : NamedTuple
        Individuals informations:

        - coord : DataFrame of shape (n_samples, n_components)
            The coordinates of individuals.
        - eucl : DataFrame of shape (n_samples, n_classes)
            The squared Euclidean distance to origin.
        - gen : DataFrame of shape (n_samples, n_classes)
            The generalized squared distance to origin.

    model_ : str, default = 'dica'
        The model fitted.

    svd_ : Namedtuple 
        Generalized singular value decomposition (GSVD):

        - svd : 1D array of shape (n_components,)
            The singular values.
        - U : 2D array of shape (n_samples, n_components)
            The left singular vectors of generalized singular values decomposition.
        - V : 2D array of shape (n_categories, n_components)
            The right singular vectors of generalized singular values decomposition.
    
    var_ : NamedTuple 
        Variables informations:

        - coord : DataFrame of shape (n_categories, n_components)
            The coordinates of variables.
        - eta2 : DataFrame of shape (n_features, n_components)
            The square correlation ratio - eta2.

    See also
    --------
    :class:`~discrimintools.fviz_dica`
        Visualize Discriminant Correspondence Analysis Analysis.
    :class:`~discrimintools.fviz_dica_biplot`
        Visualize Discriminant Correspondence Analysis (DiCA) - Biplot of individuals and variables.
    :class:`~discrimintools.fviz_dica_ind`
        Visualize Discriminant Correspondence Analysis (DiCA) - Graph of individuals.
    :class:`~discrimintools.fviz_dica_quali_var`
        Visualize Discriminant Correspondence Analysis (DiCA) - Graph of qualitative variables.
    :class:`~discrimintools.fviz_dica_var`
        Visualize Discriminant Correspondence Analysis (DiCA) - Graph of variables/categories.
    :class:`~discrimintools.fviz_dist`
        Visualize distance between barycenter.
    :class:`~discrimintools.summaryDiCA`
        Printing summaries of Discriminant Correspondence Analysis model.
    :class:`~discrimintools.summaryDA`
        Printing summaries of Discriminant Analysis model.

    References
    ----------
    [1] Abdi, H., and Williams, L.J. (2010). `Principal component analysis <https://pzs.dstu.dp.ua/DataMining/pca/bibl/PCA.pdf>`_. Wiley Interdisciplinary Reviews: Computational Statistics, 2, 433-459.
    
    [2] Abdi, H. and Williams, L.J. (2010). `Correspondence analysis <https://personal.utdallas.edu/~herve/abdi-AB2018_CA.pdf>`_. In N.J. Salkind, D.M., Dougherty, & B. Frey (Eds.): Encyclopedia of Research Design. Thousand Oaks (CA): Sage. pp. 267-278.
    
    [3] Abdi, H. (2007). `Singular Value Decomposition (SVD) and Generalized Singular Value Decomposition (GSVD) <https://www.cimat.mx/~alram/met_num/clases/Abdi-SVD2007-pretty.pdf>`_. In N.J. Salkind (Ed.): Encyclopedia of Measurement and Statistics.Thousand Oaks (CA): Sage. pp. 907-912.
    
    .. [4] Abdi, H. (2007). `Discriminant correspondence analysis <https://personal.utdallas.edu/~herve/Abdi-DCA2007-pretty.pdf>`_. In N.J. Salkind (Ed.): Encyclopedia of Measurement and Statistics. Thousand Oaks (CA): Sage. pp. 270-275
    
    [5] Ricco Rakotomalala (2020), `Pratique de l'Analyse Discriminante Linéaire <https://hal.science/hal-04868585v1/file/Pratique_Analyse_Discriminante_Lineaire.pdf>`_, Version 1.0, Université Lumière Lyon 2.

    Examples
    --------
    >>> from discrimintools.datasets import load_divay
    >>> from discrimintools import DiCA
    >>> D = load_divay() # load training data
    >>> y, X = D["Region"], D.drop(columns=["Region"]) # split into X and y
    >>> clf = DiCA()
    >>> clf.fit(X,y)
    DiCA()
    """

[docs]
    def __init__(
            self, n_components = 2, classes = None
    ):
        self.n_components = n_components
        self.classes = classes


    def decision_function(self,X) -> DataFrame:
        """
        Apply decision function to a an input data

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
            
        Returns
        -------
        C : DataFrame of shape (n_samples, n_classes)
            Decision function values related to each class, per sample.
        """
        #canonical coordinates
        coord = self.transform(X)
        #squared euclidean distance to origin
        gsqdist = concat((coord.sub(self.classes_.coord.loc[k,:],axis=1).pow(2).sum(axis=1).to_frame(k) for k in self.call_.classes),axis=1)
        #add priors log-probabilities to squared euclidean distance if priors is not equal
        return -0.5*gsqdist.sub(2*log(self.call_.priors),axis=1)
    
    def fit(self,X,y):
        """
        Fit the Discriminant Correspondence Analysis model

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Training Data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
        
        y : Series of shape (n_samples,)
            Target values. True labels for ``X``.
        
        Returns
        -------
        self : object
            Fitted estimator
        """ 
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if X is an instance of class pd.DataFrame
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_dataframe(X)

        #check if all features are categorics
        if not all(is_string_dtype(X[k]) for k in X.columns):
            raise TypeError("All features must be categorics")
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if y is an instance of class pd.Series
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_series(y)

        #check if len are equal
        if X.shape[0] != y.shape[0]:
            raise ValueError("The number of samples in X must be equal to the number of samples in y")

        #check if all elements in y are string
        if not all(isinstance(kq, str) for kq in y):
            raise TypeError("All elements in y must be a string")
        
        #set y name if None
        if y.name is None or isinstance(y.name, int):
            y.name = "group"

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if all columns in X are categorics
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        if not all (is_string_dtype(X[k]) for k in X.columns):
            raise TypeError("X mus contain categorical columns")

        #make a copy of original data
        Xtot = X.copy(deep=True)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #preprocessing
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        X = preprocessing(X=X)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #set classes
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #unique element in y
        uq_y = sorted(list(y.unique()))
        #number of classes
        n_classes = len(uq_y)
        if self.classes is not None and isinstance(self.classes, (list,tuple)):
            if len(list(set(self.classes) & set(uq_y))) != n_classes:
                raise ValueError("Insert good classes")
            classes = [str(k) for k in self.classes]
        else:
            classes = uq_y

        #convert y to categorical data type
        y = y.astype(CategoricalDtype(categories=classes,ordered=True))

        #number of samples and number of features
        n_samples, n_features = X.shape
        #set target and features names
        target, features = y.name, list(X.columns)

        #count and proportion
        n_k, p_k = y.value_counts(normalize=False).loc[classes],  y.value_counts(normalize=True).loc[classes]
        #convert to pandas Series
        priors = Series(array(y.value_counts(normalize=True).loc[classes]),index=classes,name="priors")

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #correspondence analysis (CA)
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #disjunctive table
        dummies_y, dummies_x = tab_disjunctive(y).loc[:,classes], tab_disjunctive(X)
        #matrix N for correspondence analysis
        N = dummies_y.T.dot(dummies_x)
        #total
        total = n_samples*n_features
        #frequencies table
        freq = N.div(total)
        #columns and rows margins calcul
        col_marge, row_marge = freq.sum(axis=0), freq.sum(axis=1)
        col_marge.name, row_marge.name = "Margin", "Margin"

        #standardization: z_ij = (fij/(fi.*f.j)) - 1
        Z = freq.div(row_marge,axis=0).div(col_marge,axis=1).sub(1)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #set number of components
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #QR decomposition (to set maximum number of components)
        Q, R = linalg.qr(Z)
        max_components = int(min(linalg.matrix_rank(Q),linalg.matrix_rank(R), n_classes - 1, dummies_x.shape[1] - 1))
        #set number of components
        if self.n_components is None:
            n_components = max_components
        elif not isinstance(self.n_components,int):
            raise TypeError("'n_components' must be an integer.")
        elif self.n_components < 1:
            raise TypeError("'n_components' must be equal or greater than 1.")
        else:
            n_components = min(self.n_components,max_components)

        #convert to ordered dictionary
        call_ = OrderedDict(Xtot=Xtot,X=X,y=y,target=target,features=features,classes=classes,priors=priors,n_samples=n_samples,n_features=n_features,n_classes=n_classes,
                            N=N,Z=Z,total=total,row_marge=row_marge,col_marge=col_marge,max_components=max_components,n_components=n_components)
        #convert to namedtuple
        self.call_ = namedtuple("call",call_.keys())(*call_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #fit generalized singular values decomposition
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #fit generalized singular values decomposition (GSVD)
        self.svd_ = gsvd(X=Z,row_weights=row_marge,col_weights=col_marge,n_components=n_components)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #eigen values informations
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        eigen_values = self.svd_.vs[:max_components]**2
        difference, proportion = insert(-diff(eigen_values),len(eigen_values)-1,nan), 100*eigen_values/sum(eigen_values)
        #convert to DataFrame
        self.eig_ = DataFrame(c_[eigen_values,difference,proportion,cumsum(proportion)],columns=["Eigenvalue","Difference","Proportion (%)","Cumulative (%)"],index = ["Can"+str(x+1) for x in range(max_components)])

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #classes informations: coordinates, squared euclidean distance and squared generalized distance
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #class level information
        class_infos = DataFrame(c_[n_k,p_k,priors],columns=["Frequency","Proportion","Prior Probability"],index=classes)
        class_infos["Frequency"] = class_infos["Frequency"].astype(int)
        #classes coordinates
        class_coord = DataFrame(self.svd_.U.dot(diag(self.svd_.vs[:n_components])),index=Z.index,columns=["Can"+str(x+1) for x in range(n_components)])
        #squared euclidean distance between classes
        class_eucl = DataFrame(squareform(pdist(class_coord,metric="sqeuclidean")),index=classes,columns=classes)
        #squared generalized distance
        class_gen = class_eucl.sub(2*log(priors),axis=1)
        #convert to ordered dictionary
        classes_ = OrderedDict(infos=class_infos,coord=class_coord,eucl=class_eucl,gen=class_gen)
        #convert to namedtuple
        self.classes_ = namedtuple("classes",classes_.keys())(*classes_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #variables informations: coordinates
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #coordinates for the variables
        var_coord = DataFrame(self.svd_.V.dot(diag(self.svd_.vs[:n_components])),index=Z.columns,columns=["Can"+str(x+1) for x in range(n_components)])
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #canonical discriminant functions
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #total-sample standardized canonical coefficients
        std_cancoef = DataFrame(self.svd_.V,index=Z.columns,columns=self.eig_.index[:n_components])
        #projection function coefficients
        proj_cancoef = var_coord.div(n_features*self.svd_.vs[:n_components],axis=1)
        #convert to ordered dictionary
        cancoef_ = OrderedDict(standardized=std_cancoef,projection=proj_cancoef)
        #convert to namedtuple
        self.cancoef_ = namedtuple("cancoef",cancoef_.keys())(*cancoef_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #individuals informations: individuals coordinates, squared euclidean distance and squared generalized distance
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #individuals coordinates
        ind_coord = dummies_x.dot(proj_cancoef)
        #squared euclidean distance to class center
        ind_eucl = concat((ind_coord.sub(classes_["coord"].loc[k,:],axis=1).pow(2).sum(axis=1).to_frame(k) for k in classes),axis=1)
        #squared generalized distance
        ind_gen = ind_eucl.sub(2*log(priors),axis=1)
        #convert to ordered dictionary
        ind_ = OrderedDict(coord=ind_coord,eucl=ind_eucl,gen=ind_gen)
        #convert to namedtuple
        self.ind_ = namedtuple("ind",ind_.keys())(*ind_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #update variables informations: correlation ratio
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #variables square correlation ratio
        var_sqeta = concat(((univ_test(ind_coord,X[q])["R-Square"]).to_frame(q) for q in list(X.columns)),axis=1).T
        #convert to ordered dictionary
        var_ = OrderedDict(coord=var_coord,eta2=var_sqeta)
        #convert to namedtuple
        self.var_ = namedtuple("var",var_.keys())(*var_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #canonical correlation informations
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #total sum of squared
        tss = ind_coord.pow(2).sum(axis=0)
        #eta squared
        sqeta = self.eig_.iloc[:n_components,0].mul(n_samples).div(tss)
        #canonical correlation
        self.cancorr_ = DataFrame(c_[self.eig_.iloc[:n_components,0],tss,sqeta,sqrt(sqeta)],columns=["Eigenvalue","Total SS", "Eta Sq.","Canonical Correlation"],index=self.eig_.index[:n_components])
        
        self.model_ = "dica"
        return self
        
    def fit_transform(self,X,y):
        """
        Fit to data, then transform it

        Fits transformer to ``X`` and returns a transformed version of ``X``.

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Training Data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
        
        y : Series of shape (n_samples,)
            Target values. True labels for ``X``.

        Returns
        -------
        X_new : DataFrame of shape (n_samples, n_components)
            Transformed data, where ``n_components`` is the number of components
        """
        self.fit(X,y)
        return self.ind_.coord
    
    def transform(self,X):
        """ 
        Apply the dimensionality reduction on X 
        
        X is projected on the canonical components previously extracted from a training set.

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            New data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.

        Returns
        -------
        X_new : DataFrame of shape (n_samples, n_components)
            Transformed data, where ``n_components`` is the number of components.
        """
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if the estimator is fitted by verifying the presence of fitted attributes
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_fitted(self)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if X is an instance of class pd.DataFrame
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_dataframe(X=X)

        #set index name as None
        X.index.name = None

        #check if X contains original features
        if not set(self.call_.Xtot.columns).issubset(X.columns):
            raise ValueError("The names of the features is not the same as the ones in the active features of the DiCA result")
        
        #select original features
        X = X[self.call_.Xtot.columns]

        #check if all variables are discrete
        if not all(is_string_dtype(X[k]) for k in X.columns):
            raise TypeError("All features must be categorics")
        
        #test if X contains all active categorics
        new = [x for x in unique(X.values) if x not in self.call_.N.columns]
        if len(new) > 0:
            raise ValueError("The following categories are not in the active dataset: "+",".join(new))
        
        #disjunctive table for new individuals
        dummies = tab_disjunctive(X=X,dummies_cols=self.call_.N.columns,prefix=False)   
        #multiply by canonical coefficients
        return  dummies.dot(self.cancoef_.projection)