# Source code for discrimintools.discriminant_analysis._candisc

# -*- coding: utf-8 -*-
from numpy import linalg,sum, sqrt, zeros, dot, transpose,nan, log,c_,real, insert, cumsum,diff,log,array,diag, average,corrcoef
from pandas import DataFrame, Series, concat, CategoricalDtype
from pandas.api.types import is_string_dtype
from collections import namedtuple, OrderedDict
from functools import reduce
from statsmodels.multivariate.manova import MANOVA
from scipy.spatial.distance import pdist,squareform
from sklearn.utils.validation import check_is_fitted

#internal functions
from ._base import _BaseDA
from .functions.utils import check_is_dataframe, check_is_series, check_is_bool
from .functions.preprocessing import preprocessing
from .functions.model_matrix import model_matrix
from .functions.describe import describe
from .functions.sscp import sscp
from .functions.cov_to_cor_test import cov_to_cor_test
from .functions.distance import sqmahalanobis
from .functions.wcorrcoef import wcorrcoef
from .functions.univ_test import univ_test
from .functions.diagnostics import diagnostics
from .functions.lrtest import lrtest
from .functions.splitmix import splitmix
from .functions.tab_disjunctive import tab_disjunctive


# [docs]
class CANDISC(_BaseDA):
    """
    Canonical Discriminant Analysis (CANDISC)

    Canonical discriminant analysis is a dimension-reduction technique related to principal component analysis and canonical correlation. 
    The methodology that is used in deriving the canonical coefficients parallels that of a one-way multivariate analysis of variance (MANOVA). 
    MANOVA tests for equality of the mean vector across class levels. Canonical discriminant analysis finds linear combinations of the quantitative variables
    that provide maximal separation between classes or groups. Given a classification variable and several
    quantitative variables, the CANDISC procedure derives `canonical variables`, which are linear combinations
    of the quantitative variables that summarize between-class variation in much the same way that principal
    components summarize total variation.

    The :class:`~discrimintools.CANDISC` procedure performs a canonical discriminant analysis, computes squared Mahalanobis
    distances between class means, and performs both univariate and multivariate one-way analyses of variance.
    
    Parameters
    ----------
    n_components : int or `None <https://docs.python.org/3/library/constants.html#None>`_, default = 2
        Number of components to keep. If `None <https://docs.python.org/3/library/constants.html#None>`_, all components are kept.

    classes : `None <https://docs.python.org/3/library/constants.html#None>`_, tuple or list, default = `None <https://docs.python.org/3/library/constants.html#None>`_
        Names of the class levels, in the order in which they should be returned. If `None <https://docs.python.org/3/library/constants.html#None>`_, the sorted unique values of ``y`` are used.
    
    warn_message : bool, default = True
        Show warning messages. Raise a warning without making the program crash.

    Returns
    -------
    call_ : NamedTuple
        Call informations:

        - Xtot : DataFrame of shape (n_samples, n_columns)
            Input data.
        - X : DataFrame of shape (n_samples, n_features)
            Training data.
        - y : Series of shape (n_samples,)
            Target values. True values for ``X``.
        - target : str
            Name of target.
        - features : list
            Names of features seen during ``fit``.
        - classes : list
            Names of classes.
        - priors : Series of shape (n_classes,)
            Priors probabilities.
        - n_samples : int
            Number of samples.
        - n_features : int
            Number of features.
        - n_classes : int
            Number of target values
        - max_components : int
            Maximum number of components.
        - n_components : int
            Number of components kept.

    cancoef_: NamedTuple
        Canonical coefficients:

        - raw : DataFrame of shape (n_features + 1, n_components)
            Raw canonical coefficients.
        - total : DataFrame of shape (n_features, n_components)
            Total canonical coefficients.
        - pooled : DataFrame of shape (n_features, n_components)
            Pooled canonical coefficients.

    cancorr_ : DataFrame of shape (n_components, 10)
        The canonical correlations test.

    classes_ : NamedTuple
        Classes informations:

        - infos : DataFrame of shape (n_classes, 3)
            class level information (frequency, proportion, prior probability).
        - center : DataFrame of shape (n_classes, n_features) 
            Class means.
        - total : DataFrame of shape (n_features, n_classes)
            Total-sample standardized class means.
        - pooled : DataFrame of shape (n_features, n_classes)
            Pooled-within class standardized class means.
        - mahal : DataFrame of shape (n_classes, n_classes)
            Squared Mahalanobis distances between classes.
        - coord : DataFrame of shape (n_classes, n_components)
            Class coordinates.
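        - eucl : DataFrame of shape (n_classes, n_classes)
            The pairwise squared Euclidean distances between class coordinates.
        - gen : DataFrame of shape (n_classes, n_classes)
            The pairwise generalized squared distances between classes (squared Euclidean distance minus twice the log prior probability).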

    coef_ : DataFrame of shape (n_features + 1, n_classes)
        Linear classification functions coefficients.

    corr_ : NamedTuple 
        Correlation coefficients test:

        - total : DataFrame of shape (C^{2}_{n_features}, 7)
            Total-sample correlation coefficients test.
        - within : dict 
            Within-class correlation coefficients test.
        - pooled : DataFrame of shape (C^{2}_{n_features}, 7)
            Pooled within-class correlation coefficients test.
        - between : DataFrame of shape (C^{2}_{n_features}, 7)
            Between-class correlation coefficients test.

    cov_ : NamedTuple
        Covariance matrices:

        - total : DataFrame of shape (n_features, n_features)
            Total-sample covariance matrix.
        - btotal : DataFrame of shape (n_features, n_features)
            Biased total-sample covariance matrix.
        - within : dict 
            Within-class covariance matrices.
        - bwithin : dict
            Biased within-class covariance matrices.
        - pooled : DataFrame of shape (n_features, n_features)
            Pooled within-class covariance matrix.
        - bpooled : DataFrame of shape (n_features, n_features)
            Biased pooled within-class covariance matrix.
        - between : DataFrame of shape (n_features, n_features)
            Between-class covariance matrix
        - bbetween : DataFrame of shape (n_features, n_features)
            biased between-class covariance matrix.

    eig_ : DataFrame of shape (n_components, 4)
        The eigenvalues, the difference between each eigenvalue, the percentage of variance and the cumulative percentage of variance

    ind_ : NamedTuple
        Individuals informations:

        - coord : DataFrame of shape (n_samples, n_components)
            The coordinates of individuals.
        - mahal : DataFrame of shape (n_samples, n_classes)
            The squared Mahalanobis distance from each individual to each class center.
        - eucl : DataFrame of shape (n_samples, n_classes)
            The squared Euclidean distance to origin.
        - gen : DataFrame shape (n_samples, n_classes) 
            The generalized squared distance to origin.
        - scores : DataFrame of shape (n_samples, n_classes) 
            The total scores of individuals.

    model_ : str, default = "candisc"
        Name of model fitted.

    sscp_ : NamedTuple 
        Sum of square cross product (SSCP) matrices:

        -  total : DataFrame of shape (n_features, n_features)
            Total-sample SSCP matrix.
        -  within : dict
            Within-class SSCP matrices
        - pooled: DataFrame of shape (n_features, n_features)
            Pooled within-class SSCP matrix.
        - between : DataFrame of shape (n_features, n_features)
            Between-class SSCP matrix.

    statistics_ : NamedTuple
        Statistics results:

        - anova : DataFrame of shape (n_features, 11)
            Analysis of variance test.
        - manova : DataFrame of shape (4, 5)
            Multivariate analysis of variance test.
        - average_rsq : DataFrame of shape (1, 2)
            Average R-square.
        - performance : DataFrame of shape (3, 3)
            The model global performance.

    summary_ : NamedTuple
        Summary informations:

        - infos : DataFrame of shape (3, 4)
            Summary informations (total sample size, number of features, number of classes, 
            total degree of freedom, within-class degree of freedom, between-class degree of freedom).
        - total : DataFrame of shape (n_features, 8)
            Total-sample statistics, see `pandas.Describe`_.
        - within : dict
            Within-class statistics

    svd_ : Namedtuple 
        Singular value decomposition:

        -  value : 1D array of shape (n_components,)
            The eigenvalues
        -  vector : 2D array of shape (n_classes, n_components)
            The eigenvectors
    
    var_ : NamedTuple 
        Variables informations (correlation):

        - total : DataFrame of shape (n_features, n_components)
            The total-sample correlation of variables with canonical dimensions.
        - pooled : DataFrame of shape (n_features, n_components)
            The pooled-within class correlation of variables with canonical dimensions.
        - between : DataFrame of shape (n_features, n_components)
            The between-class correlation of variables with canonical dimensions.

    See also
    --------
    :class:`~discrimintools.fviz_candisc`
        Visualize Canonical Discriminant Analysis.
    :class:`~discrimintools.fviz_candisc_biplot`
        Visualize Canonical Discriminant Analysis (CANDISC) - Biplot of individuals and variables.
    :class:`~discrimintools.fviz_candisc_ind`
        Visualize Canonical Discriminant Analysis (CANDISC) - Graph of individuals.
    :class:`~discrimintools.fviz_candisc_var`
        Visualize Canonical Discriminant Analysis (CANDISC) - Graph of variables.
    :class:`~discrimintools.fviz_dist`
        Visualize distance between barycenter.
    :class:`~discrimintools.summaryCANDISC`
        Printing summaries of Canonical Discriminant Analysis model.
    :class:`~discrimintools.summaryDA`
        Printing summaries of Discriminant Analysis model.

    References
    ----------
    [1] Lebart Ludovic, Piron Marie, & Morineau Alain (2006), « `Statistique Exploratoire Multidimensionnelle`_ », Dunod, Paris 4ed.

    [2] Ricco Rakotomalala (2020), « `Pratique de l'Analyse Discriminante Linéaire`_ », Version 1.0, Université Lumière Lyon 2.
    
    [3] Saporta Gilbert (2011), « `Probabilités, Analyse de données et Statistiques`_ »,  Editions TECHNIP, 3ed.

    [4] Tenenhaus Michel (2007), « Statistique - Méthodes pour décrire, expliquer et prévoir », Dunod.

    [5] Tenenhaus Michel (1996), « Méthodes statistiques en gestion », Dunod.

    [6] Tuffery Stephane (2017), « Data Mining et Statistique décisionelle », Editions TECHNIP, 5ed.
    
    [7] Tuffery Stephane (2025), « Data Science, Statistique et Machine learning », Editions TECHNIP, 6ed.

    [8] SAS/STAT User's Guide (2013), « `The CANDISC Procedure`_ », Chapter 31.

    .. _Statistique Exploratoire Multidimensionnelle: https://horizon.documentation.ird.fr/exl-doc/pleins_textes/2023-12/010038111.pdf
    .. _Pratique de l'Analyse Discriminante Linéaire: https://hal.science/hal-04868585v1/file/Pratique_Analyse_Discriminante_Lineaire.pdf
    .. _Probabilités, Analyse de données et Statistiques: https://en.pdfdrive.to/dl/probabilites-analyses-des-donnees-et-statistiques
    .. _The CANDISC Procedure: https://support.sas.com/documentation/onlinedoc/stat/131/candisc.pdf
    .. _pandas.Describe: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html
    
    Examples
    --------
    >>> from discrimintools.datasets import load_wine
    >>> from discrimintools import CANDISC
    >>> D = load_wine() # load training data
    >>> y, X = D["Quality"], D.drop(columns=["Quality"]) # split into X and y
    >>> clf = CANDISC()
    >>> clf.fit(X,y)
    CANDISC()
    >>> XTest = load_wine("test") # load test data
    >>> print(clf.predict(XTest))
    1958    bad
    Name: prediction, dtype: object
    """

    # [docs]
    def __init__(
            self, n_components = 2, classes = None, warn_message = True
    ):
        self.n_components = n_components
        self.classes = classes
        self.warn_message = warn_message


    def decision_function(self,X):
        """
        Evaluate the decision function on an input data

        Each sample is projected with ``self.transform``. Its score for a class is
        ``-0.5 * d2 + log(prior)``, where ``d2`` is the squared Euclidean distance
        between the projected sample and the coordinates of that class.

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.

        Returns
        -------
        C : DataFrame of shape (n_samples, n_classes)
            Decision function values, one column per class and one row per sample.
        """
        #coordinates of the samples on the canonical axes
        scores = self.transform(X=X)
        #coordinates of the classes on the canonical axes
        centroids = self.classes_.coord
        #squared euclidean distance from every sample to every class centroid
        per_class = []
        for label in self.call_.classes:
            deviation = scores.sub(centroids.loc[label,:],axis=1)
            per_class.append(deviation.pow(2).sum(axis=1).to_frame(label))
        sqdist = concat(per_class,axis=1)
        #subtract twice the priors log-probabilities (aligned on class columns), then rescale
        log_priors = log(self.call_.priors)
        return -0.5*sqdist.sub(2*log_priors,axis=1)

    def fit(self,X,y):
        """
        Fit the Canonical Discriminant Analysis model

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Training Data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
        
        y : Series of shape (n_samples,)
            Target values. True labels for ``X``.
        
        Returns
        -------
        self : object
            Fitted estimator
        """
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if X is an instance of class pd.DataFrame
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_dataframe(X)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if y is an instance of class pd.Series
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_series(y)

        #check if len are equal
        if X.shape[0] != y.shape[0]:
            raise ValueError("The number of samples in X must be equal to the number of samples in y")
        
        #check if all elements in y are string
        if not all(isinstance(kq, str) for kq in y):
            raise TypeError("All elements in y must be a string")
        
        #set y name if None
        if y.name is None or isinstance(y.name, int):
            y.name = "group"

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if warn_message is a bool
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_bool(self.warn_message)

        #make a copy of original data
        Xtot = X.copy(deep=True)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #preprocessing
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        X = preprocessing(X)

        #warning message to inform
        if self.warn_message:
            if any(is_string_dtype(X[k]) for k in X.columns):
                print("\nCategorical features have been encoded into binary variables.\n")
        
        #encode categorical variables into binary without first level.
        X = model_matrix(X=X)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #set classes
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #unique element in y
        uq_y = sorted(list(y.unique()))
        #number of class
        n_classes = len(uq_y)
        if self.classes is not None and isinstance(self.classes, (list,tuple)):
            if len(list(set(self.classes) & set(uq_y))) != n_classes:
                raise ValueError("Insert good classes")
            classes = [str(k) for k in self.classes]
        else:
            classes = uq_y
        #convert y to categorical data type
        y = y.astype(CategoricalDtype(categories=classes,ordered=True))

        #number of samples and number of features
        n_samples, n_features = X.shape
        #set target and features names
        target, features = y.name, list(X.columns)
        
        #define subset of X
        X_k = {k : X.loc[y[y==k].index,:] for k in classes}
        #count and proportion
        n_k, p_k = y.value_counts(normalize=False).loc[classes],  y.value_counts(normalize=True).loc[classes]
        #set piors
        priors = Series(array(p_k),index=classes,name="priors")

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #set number of components to kept
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #maximum components
        max_components = int(min(n_classes - 1, n_features))
        #set number of components
        if self.n_components is None:
            n_components = max_components
        elif not isinstance(self.n_components,int):
            raise ValueError("'n_components' must be an integer.")
        elif self.n_components < 1:
            raise ValueError("'n_components' must be equal or greater than 1.")
        else:
            n_components = min(self.n_components, max_components)

        #convert to ordered dictionary
        call_ = OrderedDict(Xtot=Xtot,X=X,y=y,target=target,features=features,classes=classes,priors=priors,n_samples=n_samples,n_features=n_features,n_classes=n_classes,
                            max_components=max_components,n_components=n_components)
        #convert to namedtuple
        self.call_ = namedtuple("call",call_.keys())(*call_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #sample statistics
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #summary information
        summary_infos = DataFrame({"infos" : ["Total Sample Size","Variables","Classes"],
                                   "Value" : [n_samples, n_features, n_classes],
                                   "DF" : ["DF Total", "DF Within Classes", "DF Between Classes"],
                                   "DF value" : [n_samples-1, n_samples - n_classes, n_classes-1]})
        #total-sample and within-class summaries
        tsummary, wsummary = describe(X), {k : describe(X_k[k]) for k in classes}
        #convert to ordered dictionary
        summary_ = OrderedDict(infos=summary_infos,total=tsummary,within=wsummary)
        #convert to namedtuple
        self.summary_ = namedtuple("summary",summary_.keys())(*summary_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #sum of square cross product (SSCP) matrix
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #total-sample and within-class SSCP matrices
        tsscp, wsscp = sscp(X=X), {k: sscp(X_k[k]) for k in classes}
        #pooled within-class SSCCP matrix
        pwsscp = reduce(lambda i , j : i + j, wsscp.values())
        #between-class SSCP matrix
        bsscp = tsscp - pwsscp
        #convert to ordered dictionary
        sscp_ = OrderedDict(total=tsscp,within=wsscp,pooled=pwsscp,between=bsscp)
        #convert to namedtuple
        self.sscp_ = namedtuple("sscp",sscp_.keys())(*sscp_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #covariance matrices
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #total-sample and biased total-sample covariance matrices
        tcov, tcovb = tsscp.div(n_samples - 1), tsscp.div(n_samples)
        #within-class and biased within-class covariance matrices
        wcov, wcovb = {k : wsscp[k].div(n_k[k]-1) for k in classes}, {k : wsscp[k].div(n_k[k]) for k in classes}
        #pooled within-class and biased pooled within-class covariance matrices
        pwcov, pwcovb = pwsscp.div(n_samples - n_classes), pwsscp.div(n_samples)
        #inverse of within-class and pooled within-class covariance matrices
        invpwcov = linalg.inv(pwcov)
        #between-class and biased between-class covariance matrices
        bcov, bcovb  = bsscp.div(n_samples*(n_classes-1)/n_classes), bsscp.div(n_samples)
        #convert to ordered dictionary
        cov_ = OrderedDict(total=tcov,btotal=tcovb,within=wcov,bwithin=wcovb,pooled=pwcov,bpooled=pwcovb,between=bcov,bbetween=bcovb)
        #convert to namedtuple
        self.cov_ = namedtuple("cov",cov_.keys())(*cov_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #correlation coefficients test
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #total sample and within-class correlation coefficients
        tcortest, wcortest  = cov_to_cor_test(X=tcovb,n_samples=n_samples), {k: cov_to_cor_test(X=wcovb[k],n_samples=n_k[k]) for k in classes}
        #pooled within-class and between-class correlation coefficients
        pwcortest, bcortest = cov_to_cor_test(X=pwcov,n_samples=n_samples-n_classes + 1), cov_to_cor_test(X=bcov,n_samples=n_classes)
        #convert to ordered dictionary
        cortest_ = OrderedDict(total=tcortest,within=wcortest,pooled=pwcortest,between=bcortest)
        #convert to namedtuple
        self.corr_ = namedtuple("corr",cortest_.keys())(*cortest_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #classes informations
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #class level information
        class_infos = DataFrame(c_[n_k,p_k,priors],columns=["Frequency","Proportion","Prior Probability"],index=classes)
        class_infos["Frequency"] = class_infos["Frequency"].astype(int)
        #within-class average
        class_center = concat((X_k[k].mean(axis=0).to_frame(k) for k in classes),axis=1).T
        #total-sample standardized class means
        tcenter = class_center.sub(tsummary["mean"],axis=1).div(sqrt(diag(tcov)),axis=1).T
        #pooled within-class standardized class means
        pcenter = class_center.sub(tsummary["mean"],axis=1).div(sqrt(diag(pwcov)),axis=1).T
        #squared mahalanobis distances between class - pairwise squared distances between groups
        class_mahal = concat((sqmahalanobis(X=class_center,VI=invpwcov,mu=class_center.loc[k,:]).to_frame(k) for k in classes),axis=1)
        #convert to ordered dictionary
        classes_ = OrderedDict(infos=class_infos,center=class_center,total=tcenter,pooled=pcenter,mahal=class_mahal)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #eigen decomposition
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #matrix C
        C = class_center.sub(tsummary["mean"],axis=1).mul(sqrt(priors),axis=0).T
        #eigen decomposition
        eig = linalg.eig(dot(dot(C.T,linalg.inv(tcovb)),C))
        #gestion des nombres complexes
        lambd = array(sorted(real(eig.eigenvalues),reverse=True))[:n_components]
        #find index
        idx = [list(real(eig.eigenvalues)).index(x) for x in lambd]
        #reorder eigen vectors
        vector = real(eig.eigenvectors)[:,idx]
        #convert to namedtuple
        self.svd_ = namedtuple("svd",["value","vector"])(lambd,vector)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #eigen values informations
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #eigenvalue informations
        rho = array([x/(1-x) for x in lambd])
        difference, proportion = insert(-diff(rho),len(rho)-1,nan), 100*rho/sum(rho)
        #store all informations
        self.eig_ = DataFrame(c_[rho,difference,proportion,cumsum(proportion)],columns=["Eigenvalue","Difference","Proportion","Cumulative"],index = ["Can"+str(x+1) for x in range(n_components)])

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #canonical coefficients - canonical discriminant coefficients
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #coefficients of features (without intercept)
        rawcoef = DataFrame(dot(dot(linalg.inv(tcovb),C),vector),index=features,columns=["Can"+str(x+1) for x in range(n_components)]).mul(sqrt((n_samples-n_classes)/(n_samples*lambd*(1-lambd))),axis=1)
        #intercept
        rawintercept = - rawcoef.T.dot(tsummary["mean"].values.reshape(-1,1))
        rawintercept.columns = ["Constant"]
        #total-sample standardized canonical coefficients
        tcan_coef = rawcoef.mul(sqrt(diag(tcov)),axis=0)
        #pooled within-class standardized canonical coefficients
        pcan_coef = rawcoef.mul(sqrt(diag(pwcov)),axis=0)
        #convert to ordered dictionary
        cancoef_ = OrderedDict(raw=concat((rawintercept.T,rawcoef),axis=0),total=tcan_coef,pooled=pcan_coef)
        #convert to namedtuple
        self.cancoef_ = namedtuple("cancoef",cancoef_.keys())(*cancoef_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #canonical correlation
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #canonical correlation
        cancorr = DataFrame(c_[sqrt(lambd),lambd],columns=["Canonical Correlation","Squared Canonical Correlation"])
        #likelohood ratio test
        lr_test = DataFrame(zeros((n_components,8)),columns=["Likelihood Ratio","Approximate F value","Num DF","Den DF","Pr>F","Chi-Square","DF","Pr>Chi2"])
        for i in range(n_components):
            lr_test.iloc[-i,:] = lrtest(n_samples=n_samples,n_features=n_features,n_classes=n_classes,eigen=lambd[-(i+1):])
        lr_test = lr_test.sort_index(ascending=False).reset_index(drop=True)
        lr_test["Num DF"], lr_test["Den DF"], lr_test["DF"] = lr_test["Num DF"].astype(int), lr_test["Den DF"].astype(int), lr_test["DF"].astype(int)
        #convert to CANDISC attribute
        self.cancorr_ = concat((cancorr,lr_test),axis=1)
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #individuals informations: squared mahalanobis distance & coordinates
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #individuals scores
        ind_coord = X.sub(tsummary["mean"],axis=1).dot(rawcoef)
        #squared mahalanobis distance to origin
        ind_mahal = concat((sqmahalanobis(X=X,VI=invpwcov,mu=class_center.loc[k,:]).to_frame(k) for k in classes),axis=1)
        #convert to ordered dictionary
        ind_ = OrderedDict(coord=ind_coord,mahal=ind_mahal)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #others classes informations: coordinates of classes, coordinates of center of classes
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #coordinates of the classes
        class_coord = concat((ind_coord.loc[y[y==k].index,:].mean(axis=0).to_frame(k) for k in classes),axis=1).T
        #squared euclidean distance between classes
        class_eucl = DataFrame(squareform(pdist(class_coord,metric="sqeuclidean")),index=classes,columns=classes)
        #squared generalized distance
        class_gen = class_eucl.sub(2*log(priors),axis=1)
        #updated classes_ dictionary
        classes_ = OrderedDict(**classes_, **OrderedDict(coord=class_coord,eucl=class_eucl,gen=class_gen))
        #convert to namedtuple
        self.classes_ = namedtuple("classes",classes_.keys())(*classes_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #variables coordinates: total, within & between
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #total correlation - total canonical Structure
        var_tcoord = DataFrame(corrcoef(x=X,y=ind_coord,rowvar=False)[:n_features,n_features:],index=features,columns=ind_coord.columns)
        #pooled within correlation - Pooled Within Canonical Structure
        z1, z2 = ind_coord.sub(class_coord.loc[y.values,:].values), X.sub(class_center.loc[y.values,:].values,axis=1)
        var_pcoord = DataFrame(transpose(corrcoef(x=z1,y=z2,rowvar=False)[:n_components,n_components:]),index=features,columns=ind_coord.columns)
        #between correlation - between Canonical Structure
        var_bcoord = concat((Series([wcorrcoef(class_center[k],class_coord[l],priors) for l in ind_coord.columns],index=ind_coord.columns,name=k) for k in features),axis=1).T
        #convert to ordered dictionary
        var_ = OrderedDict(total=var_tcoord,pooled=var_pcoord,between=var_bcoord) 
        #convert to namedtuple
        self.var_ = namedtuple("var",var_.keys())(*var_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #classification function
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #coefficients of classification function
        self.coef_ = self.cancoef_.raw.dot(class_coord.T)
        #update intercept
        self.coef_.iloc[0,:] = self.coef_.iloc[0,:].sub(0.5*class_coord.pow(2).sum(axis=1)).add(log(p_k))
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #individuals informations: scores, squared euclidean distance and squared generalized distance
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #squared euclidean distance to class center
        ind_eucl = concat((ind_coord.sub(class_coord.loc[k,:],axis=1).pow(2).sum(axis=1).to_frame(k) for k in classes),axis=1)
        #squared generalized distance
        ind_gen = ind_eucl.sub(2*log(priors),axis=1)
        #individuals scores
        ind_scores = X.dot(self.coef_.iloc[1:,:]).add(self.coef_.iloc[0,:].values,axis=1)
        #update individuals dictionary
        ind_ = OrderedDict(**ind_,**OrderedDict(eucl=ind_eucl,gend=ind_gen,scores=ind_scores))
        #convert to namedtuple
        self.ind_ = namedtuple("ind",ind_.keys())(*ind_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #others statistics
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #standard deviation
        stdev = DataFrame(c_[sqrt(diag(tcov)),sqrt(diag(pwcov)),sqrt(diag(bcov))],columns=["Total Std. Dev.","Pooled Std. Dev.","Between Std. Dev."],index=features)
        #compute univariate analysis of variance - ANOVA
        anova = concat((stdev,univ_test(X,y)),axis=1)
        #average R-Square
        unwavg_rsq, wavg_rsq = average(anova.loc[:,"R-Square"].values), average(anova.loc[:,"R-Square"].values,weights=diag(tcovb))
        avg_rsq = DataFrame([[unwavg_rsq,wavg_rsq]],index=["Average R-Square"],columns=["Unweighted","Weighted by Variance"])
        #compute multivariate analysis of variance - MANOVA
        manova = MANOVA.from_formula(formula="{}~{}".format("+".join(features),"+".join([target])), data=concat((X,y),axis=1)).mv_test(skip_intercept_test=True).summary_frame
        manova.index = manova.index.droplevel()
        #performance
        performance = diagnostics(Vb=tcovb,Wb=pwcovb,n_samples=n_samples,n_classes=n_classes)
        #convert to ordered dictionary
        statistics_ = OrderedDict(anova=anova,manova=manova,average_rsq=avg_rsq,performance=performance)   
        #convert to namedtuple
        self.statistics_ = namedtuple("statistics",statistics_.keys())(*statistics_.values())
        
        #set model name
        self.model_ = "candisc"
        return self
    
    def fit_transform(self,X,y):
        """
        Fit the model on ``(X, y)`` and return the canonical coordinates of the training individuals

        This is a convenience wrapper: it calls ``fit`` and then reads the coordinates that ``fit``
        stored in ``self.ind_.coord``.

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Training Data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
        
        y : Series of shape (n_samples,)
            Target values. True labels for ``X``.
        
        Returns
        -------
        X_new : DataFrame of shape (n_samples, n_components)
            Transformed data, where ``n_components`` is the number of components
        """
        #run the estimation; ``fit`` is what populates ``self.ind_``
        self.fit(X,y)
        #canonical coordinates of the training individuals, as stored by ``fit``
        train_coord = self.ind_.coord
        return train_coord
    
    def transform(self,X):
        """
        Apply the dimensionality reduction on X

        X is projected on the canonical components previously extracted from a training set.
        The input DataFrame is left untouched: the index name is cleared on an internal copy only.

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            New data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
            It must contain every column of ``self.call_.Xtot``; extra columns are ignored.
            
        Returns
        -------
        X_new : DataFrame of shape (n_samples, n_components)
            Transformed data, where ``n_components`` is the number of components.

        Raises
        ------
        ValueError
            If at least one column of ``self.call_.Xtot`` is missing from ``X``.
        """
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if the estimator is fitted by verifying the presence of fitted attributes
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_fitted(self)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if X is an instance of class pd.DataFrame
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_dataframe(X=X)

        #columns the model was fitted on, before any recoding
        active_cols = self.call_.Xtot.columns

        #check if X contains original features
        if not set(active_cols).issubset(X.columns):
            raise ValueError("The names of the features is not the same as the ones in the active features of the CANDISC result")
        
        #select original features (column selection returns a new DataFrame object)
        X = X[active_cols]

        #set index name as None on a copy of the index: the Index object may be shared with the
        #caller's DataFrame, so renaming it in place would silently alter the caller's data
        unnamed_index = X.index.copy()
        unnamed_index.name = None
        X.index = unnamed_index

        #split X into its quantitative and qualitative parts
        split_X = splitmix(X)
        #extract elements
        X_quanti, X_quali, n_quanti, n_quali = split_X.quanti, split_X.quali, split_X.k1, split_X.k2

        #initialize DataFrame with the columns of the fitted design matrix (NaN-filled, float dtype)
        Xcod = DataFrame(index=X.index,columns=self.call_.X.columns).astype(float)

        #check if numerics variables
        if n_quanti > 0:
            #replace with numerics columns
            Xcod.loc[:,X_quanti.columns] = X_quanti
        
        #check if categorical variables      
        if n_quali > 0:
            #active categorics: design-matrix columns that are not original feature names
            categorics = [x for x in self.call_.X.columns if x not in active_cols]
            #replace with dummies
            Xcod.loc[:,categorics] = tab_disjunctive(X=X_quali,dummies_cols=categorics,prefix=True,sep="")
        
        #multiply by raw canonical coefficients: first row of ``cancoef_.raw`` is used as the intercept,
        #the remaining rows as the per-column coefficients
        return Xcod.dot(self.cancoef_.raw.iloc[1:,:]).add(self.cancoef_.raw.iloc[0,:],axis=1)