Source code for discrimintools.discriminant_analysis._discrim

# -*- coding: utf-8 -*-
from numpy import ndarray, array, c_, diag, dot, linalg, log, sqrt, average, ones
from pandas import DataFrame, concat, CategoricalDtype, Series
from pandas.api.types import is_string_dtype
from functools import reduce
from collections import OrderedDict, namedtuple
from statsmodels.multivariate.manova import MANOVA
from sklearn.utils.validation import check_is_fitted

#intern function
from ._base import _BaseDA
from .functions.utils import check_is_dataframe, check_is_series, check_is_bool
from .functions.preprocessing import preprocessing
from .functions.model_matrix import model_matrix
from .functions.ldavip import ldavip
from .functions.describe import describe
from .functions.sscp import sscp
from .functions.box_m_test import box_m_test
from .functions.cov_to_cor_test import cov_to_cor_test
from .functions.distance import sqmahalanobis
from .functions.cov_infos import cov_infos
from .functions.univ_test import univ_test
from .functions.diagnostics import diagnostics
from .functions.splitmix import splitmix
from .functions.tab_disjunctive import tab_disjunctive


[docs]
class DISCRIM(_BaseDA):
    """
    Discriminant Analysis (DISCRIM)
    
    Performs a discriminant analysis (linear and quadratic) on a set of observations (training data) containing one or more numerics variables and a classification variables defining groups of observations.
    The derived discriminant criterion from the training data can be applied to a testing dataset.

    Parameters
    ----------
    method : {'linear', 'quad'}, default = 'linear'
        The discriminant analysis method to performs, possible values: 

        - 'linear' for linear discriminant analysis (LDA).
        - 'quad' for quadratic discriminant analysis (QDA)
    
    priors : str or array-like or Series of shape (n_classes,), default = None
        The priors statement specifies the class prior probabilities of group membership, possibles values: 

        - 'equal' to set the prior probabilities equal.
        - 'prop' to set the prior probabilities proportional to the sample sizes.
        - numpy 1-D array or Series which specify the prior probability for each level of the classification variable.
    
    classes : None, tuple or list, default = None
        Name of level in order to return. If None, classes are sorted in unique values in y.
    
    var_select : bool, default = False
        Whether to applied feature selection based on variable importance (contribution) in prediction for linear discriminant analysis

    level : float, default = None
        Significance level for the variable importance critical probability. You can specify the `level` option only when both method = 'linear' and var_select=True are also specified.
        If you specify both method = 'linear' and var_select=True but omit the `level` option, DISCRIM uses :math:`5e-2` as the significance level for the variabe importance.
    
    tol : float, default = None
        Significance level for the test of homogeneity. You can specify the `tol` option only when method = 'quad' is also specified.
        If you specify method = 'quad' but omit the `tol` option, DISCRIM uses :math:`1e-1` as the significance level for the test.
    
    warn_message : bool, default = True
        Show warning messages. Raise a warning without making the program crash.

    Returns
    -------
    call_ :  NamedTuple
        Call informations:

        - Xtot : DataFrame of shape (n_samples, n_columns)
            Input data.
        - X : DataFrame of shape (n_samples, n_features)
            Training data.
        - y : Series of shape (n_samples,)
            Target values. True values for ``X``.
        - target : str
            Name of target.
        - features : list
            Names of features seen during ``fit``.
        - classes : list
            Names of classes.
        - priors : Series of shape (n_classes,)
            Priors probabilities.
        - n_samples : int
            Number of samples.
        - n_features : int
            Number of features.
        - n_classes : int
            Number of target values.

    classes_ : Namedtuple 
        Classes informations:

        - infos : DataFrame of shape (n_classes, 3)
            class level information (frequency, proportion, prior probability).
        - center : DataFrame of shape (n_classes, n_features) 
            Class means.
        - total : DataFrame of shape (n_features, n_classes)
            Total-sample standardized class means.
        - pooled : DataFrame of shape (n_features, n_classes)
            Pooled-within class standardized class means.
        - mahal : DataFrame of shape (n_classes, n_classes)
            Squared Mahalanobis distances between classes.
        - gen : DataFrame of shape (n_classes, n_classes)
            Generalized Squared distances between classes.

    coef_ : DataFrame of shape (n_features, n_classes)
        Linear classification functions coefficients.

    corr_ : NamedTuple 
        Correlation coefficients test:

        - total : DataFrame of shape (C^{2}_{n_features}, 7)
            Total-sample correlation coefficients test.
        - within : dict 
            Within-class correlation coefficients test.
        - pooled : DataFrame of shape (C^{2}_{n_features}, 7)
            Pooled within-class correlation coefficients test.
        - between : DataFrame of shape (C^{2}_{n_features}, 7)
            Between-class correlation coefficients test.

    cov_ : NamedTuple
        Covariance matrices:

        - total : DataFrame of shape (n_features, n_features)
            Total-sample covariance matrix.
        * btotal : DataFrame of shape (n_features, n_features)
            Biased total-sample covariance matrix.
        - within : dict 
            Within-class covariance matrices.
        - bwithin : dict
            Biased within-class covariance matrices.
        - pooled : DataFrame of shape (n_features, n_features)
            pooled within-class covariance matrix.
        - bpooled : DataFrame of shape (n_features, n_features)
            biased pooled within-class covariance matrix.
        - between : DataFrame of shape (n_features, n_features)
            Between-class covariance matrix
        - bbetween : DataFrale of shape (n_features, n_features)
            biased between-class covariance matrix.
        - test : DataFrame of shape (1, 7)
            Box's M test.

    ind_ : NamedTuple
        Individuals informations:

        - scores : DataFrame of shape (n_samples, n_classes) 
            The total scores of individuals.
        - mahal : DataFrame of shape (n_samples, n_classes) 
            Squared Mahalanobis distance to origin.
        * gen : DataFrame shape (n_samples, n_classes) 
            Generalized squared distance to origin.

    model_ : str, default = "discrim"
        Name of model fitted.

    sscp_ : NamedTuple 
        Sum of square cross product (SSCP) matrices:

        - total : DataFrame of shape (n_features, n_features)
            Total-sample SSCP matrix
        - within : dict
            Within-class SSCP matrices
        - pooled: DataFrame of shape (n_features, n_features)
            Pooled within-class SSCP matrix
        - between : DataFrame of shape (n_features, n_features)
            Between-class SSCP matrix.

    statistics_ : NamedTuple
        Statistics results:

        - anova : DataFrame of shape (n_features, 11)
            Analysis of variance test.
        - manova : DataFrame of shape (4, 5)
            Multivariate analysis of variance test.
        - average_rsq : DataFrame of shape (1, 2)
            Average R-square.
        - performance : DataFrame of shape (3, 3)
            The model global performance. Only if linear discriminant analysis.

    summary_ : NamedTuple
        Summary informations:

        - infos : DataFrame of shape (3, 4)
            Summary informations (total sample size, number of features, number of classes, 
            total degree of freedom, within-class degree of freedom, between-class degree of freedom).
        - total : DataFrame of shape (n_features, 8)
            Total-sample statistics, see https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html
        - within : dict
            Within-class statistics.
    
    vip_ : NamedTuple
        Variable importance for prediction:

        - vip : DataFrame of shape (n_features, 6)
            Variable importance for prediction.
        - selected : list
            Selected variables.
    
    See also
    --------
    :class:`~discrimintools.GFALDA`
        General Factor Analysis Linear Discriminant Analysis
    :class:`~discrimintools.CPLS`
        Partial Least Squares for Classification
    :class:`~discrimintools.PLSDA`
        Partial Least Squares Discriminant Analysis
    :class:`~discrimintools.PLSLDA`
        Partial Least Squares Linear Discriminant Analysis
    :class:`~discrimintools.summaryDISCRIM`
        Printing summaries of Discriminant Analysis (linear & quadratic) model.
    :class:`~discrimintools.summaryDA`
        Printing summaries of Discriminant Analysis model.
        
    References
    ----------
    [1] Bardos M. (2001), « Analyse discriminante - Application au risque et scoring financier », Dunod.
    
    [2] Lebart Ludovic, Piron Marie, & Morineau Alain (2006), « `Statistique Exploratoire Multidimensionnelle <https://horizon.documentation.ird.fr/exl-doc/pleins_textes/2023-12/010038111.pdf>`_ », Dunod, Paris 4ed.
    
    [3] Ricco Rakotomalala (2020), « `Pratique de l'Analyse Discriminante Linéaire <https://hal.science/hal-04868585v1/file/Pratique_Analyse_Discriminante_Lineaire.pdf>`_ », Université Lumière Lyon 2, Version 1.0.
    
    [4] Saporta Gilbert (2011), « `Probabilités, Analyse des données et Statistiques <https://en.pdfdrive.to/dl/probabilites-analyses-des-donnees-et-statistiques>`_ », Editions TECHNIP, 3ed.
    
    [5] Tenenhaus Michel (1996), « Méthodes statistiques en gestion », Dunod.
    
    [6] Tuffery Stephane (2017), « Data Mining et Statistique décisionelle », Editions TECHNIP, 5ed.
    
    [7] Tuffery Stephane (2025), « Data Science, Statistique et Machine learning », Editions TECHNIP, 6ed.

    [8] SAS/STAT 13.2 User's Guide (2014), « `The DISCRIM Procedure <https://support.sas.com/documentation/onlinedoc/stat/132/discrim.pdf>`_ », Chapter 35.

    Examples
    --------
    >>> from discrimintools.datasets import load_alcools
    >>> from discrimintools import DISCRIM
    >>> D = load_alcools() # load training data
    >>> y, X = D["TYPE"], D.drop(columns["TYPE"]) # split into X and y
    >>> #linear discriminant analysis (LDA)
    >>> clf = DISCRIM()
    >>> clf.fit(X,y)
    DISCRIM(priors='prop')
    >>> #quadratic discriminant analysis
    >>> clf2 = DISCRIM(method='quad')
    >>> clf2.fit(X,y)
    DISCRIM(method='quad',priors='prop')
    ```
    """

[docs]
    def __init__(
            self, method = 'linear', priors = None, classes = None, var_select = False, level = None, tol = None, warn_message = True
    ):
        self.method = method 
        self.priors = priors
        self.classes = classes
        self.var_select = var_select
        self.level = level
        self.tol = tol
        self.warn_message = warn_message


    def decision_function(self,X):
        """
        Apply decision function to an input data

        The decision function is equal to the `log-posterior`_ of the model.

        .. log-posterior: https://online.stat.psu.edu/stat857/node/80/

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
            
        Returns
        -------
        C : DataFrame of shape (n_samples, n_classes)
            Decision function values related to each class, per sample.
        """
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if the estimator is fitted by verifying the presence of fitted attributes
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_fitted(self)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if X is an instance of class pd.DataFrame
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_dataframe(X=X)

        #set index name as None
        X.index.name = None

        #check if X contains original features
        if not set(self.call_.Xtot.columns).issubset(X.columns):
            raise ValueError("The names of the features is not the same as the ones in the active features of the DISCRIM result")
        #select original features
        X = X[self.call_.Xtot.columns]

        #split X
        split_X = splitmix(X)
        #extract elements
        X_quanti, X_quali, n_samples, n_quanti, n_quali = split_X.quanti, split_X.quali, split_X.n, split_X.k1, split_X.k2

        #initialize DataFrame
        Xcod = DataFrame(index=X.index,columns=self.call_.X.columns).astype(float)

        #check if numerics variables
        if n_quanti > 0:
            #replace with numerics columns
            Xcod.loc[:,X_quanti.columns] = X_quanti
        
        #check if categorical variables      
        if n_quali > 0:
            #active categorics
            categorics = [x for x in self.call_.X.columns if x not in self.call_.Xtot.columns]
            #replace with dummies
            Xcod.loc[:,categorics] = tab_disjunctive(X=X_quali,dummies_cols=categorics,prefix=True,sep="")

        #remove non selected variables
        Xcod = Xcod.loc[:,list(self.cov_.total.index)] 

        #chi-square pvalue and tolerance threshold
        p_value, tol = self.cov_.test.iloc[0,6], self.call_.tol
        #quadratic discriminant analysis
        if self.method == "quad" and  p_value <= tol:
            #inverse of within-class covariance matrix
            invwcov = {k : linalg.inv(self.cov_.within[k]) for k in self.call_.classes}
            #squared distance of individuals to origin
            mahal = concat((sqmahalanobis(X=Xcod,VI=invwcov[k],mu=self.classes_.center.loc[k,:]).to_frame(k) for k in self.call_.classes),axis=1)
            #generalized squared distance of individuals to origin
            gsqdist = mahal.add(self.cov_.infos.loc[self.call_.classes,"Natural Log of the Determinant"],axis=1) 
        #linear discriminant analysis
        elif (self.method == "linear") or (self.method == "quad" and  p_value > tol):
            #inverse of pooled within-class covariance matrix
            invpwcov = linalg.inv(self.cov_.pooled)
            #generalized squared to distance of individuals to origin
            gsqdist = concat((sqmahalanobis(X=Xcod,VI=invpwcov,mu=self.classes_.center.loc[k,:]).to_frame(k) for k in self.call_.classes),axis=1)
        
        #remove priors log-probabilities
        if not (isinstance(self.priors, str) and self.priors == "equal"):
            gsqdist = gsqdist.sub(2*log(self.call_.priors),axis=1)
        return -0.5*gsqdist
    
    def feature_importance(self,level=5e-2,all_vars=True):
        """
        Variables Importance for Prediction in Linear Discriminant Analysis (LDAVIP)

        Parameters
        ----------
        level : float, default=5e-2
            Significance level for the variable importance critical probability.
            If None :math:`5e-2` is used as the significance level for the variabe importance.

        all_vars : bool, default=True
            If to test all subset of variables.

        Returns
        ------- 
        vip : DataFrame of shape (n_features, 6)
            Variable importance for prediction.
        """
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if the estimator is fitted by verifying the presence of fitted attributes
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_fitted(self)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if linear discriminant analysis model
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        if self.method != "linear":
            raise NotImplementedError("'feature_importance' method cannot be used for QDA method.")
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if var_select is False
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        if self.var_select:
            raise  NotImplementedError("'feature_importance' method cannot be used if var_select=True.")
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if level is not None
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        if level is None:
            level = 5e-2
        elif not isinstance(level,float):
            raise TypeError("{} is not supported".format(type(level)))
        elif level < 0 or level > 1:
            raise ValueError("the 'level' value {} is not within the required range of 0 and 1.".format(level))
        else:
            level = level
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if boolean
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_bool(all_vars)
        
        return ldavip(X=self.call_.X,y=self.call_.y,level=level,all_vars=all_vars).vip
    
    def fit(self,X,y):
        """
        Fit the Discriminant Analysis model.

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Training Data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
        
        y : Series of shape (n_samples,)
            Target values. True labels for ``X``.
        
        Returns
        -------
        self : object
            Fitted estimator.
        """
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if X is an instance of class pd.DataFrame
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_dataframe(X)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if y is an instance of class pd.Series
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_series(y)

        #check if len are equal
        if X.shape[0] != y.shape[0]:
            raise ValueError("The number of samples in X must be equal to the number of samples in y")

        #check if all elements in y are string
        if not all(isinstance(kq, str) for kq in y):
            raise TypeError("All elements in y must be a string")
        
        #set y name if None
        if y.name is None or isinstance(y.name, int):
            y.name = "group"
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if method is 'linear' or 'quad'
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        if self.method not in ['linear','quad']:
            raise ValueError("method must be one of 'linear', 'quad', got {}".format(self.method))
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if priors is not None
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        if self.priors is None:
            self.priors = "prop"
        elif not isinstance(self.priors,(str,list,tuple,ndarray,Series)):
            raise TypeError("{} is not supported".format(type(self.priors)))
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if var_select is a bool
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_bool(self.var_select)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if level is not None
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        if self.method == 'linear':
            if self.level is None:
                level = 5e-2
            elif not isinstance(self.level,float):
                raise TypeError("{} is not supported".format(type(self.level)))
            elif self.level < 0 or self.level > 1:
                raise ValueError("the 'level' value {} is not within the required range of 0 and 1.".format(self.level))
            else:
                level = self.level
        else:
            level = None
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if tol is not None (for quadratic discriminant analysis)
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        if self.method == 'quad':
            if self.tol is None:
                tol = 1e-1
            elif not isinstance(self.tol,float):
                raise TypeError("{} is not supported".format(type(self.tol)))
            elif self.tol < 0 or self.tol > 1:
                raise ValueError("the 'tol' value {} is not within the required range of 0 and 1.".format(self.tol))
            else:
                tol = self.tol
        else:
            tol = None
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if warn_message is a bool
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_bool(self.warn_message)

        #make a copy of original data
        Xtot = X.copy(deep=True)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #preprocessing
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        X = preprocessing(X)
         
        #warning message to inform
        if self.warn_message:
            if any(is_string_dtype(X[k]) for k in X.columns):
                print("\nCategorical features have been encoded into binary variables.\n")

        #encode categorical variables into binary without first level.
        X = model_matrix(X=X)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #set classes
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #unique element in y
        uq_y = sorted(list(y.unique()))
        #number of classes
        n_classes = len(uq_y)
        if self.classes is not None and isinstance(self.classes, (list,tuple)):
            if len(list(set(self.classes) & set(uq_y))) != n_classes:
                raise ValueError("Insert good classes")
            classes = [str(k) for k in self.classes]
        else:
            classes = uq_y

        #convert y to categorical data type
        y = y.astype(CategoricalDtype(categories=classes,ordered=True))
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #variables importance for prediction in linear discriminant analysis
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        if self.method == 'linear':
            vip = ldavip(X,y,level,all_vars=False)
            #check if at least one variable selected
            if self.var_select and len(vip.selected) > 0:
                #update X with selected variables
                X = X.loc[:,vip.selected]
                #update variable importance
                vip = ldavip(X,y,level,all_vars=False)
            #set to attribute
            self.vip_ = vip

        #number of samples and number of features
        n_samples, n_features = X.shape
        #set target and features names
        target, features = y.name, X.columns.tolist()

        #define subset of X
        X_k = {k : X.loc[y[y==k].index,:] for k in classes}
        #count and proportion
        n_k, p_k = y.value_counts(normalize=False).loc[classes],  y.value_counts(normalize=True).loc[classes]

        #set piors probabilities
        if isinstance(self.priors,str):
            if self.priors == "prop":
                priors = array(p_k)
            elif self.priors == "equal":
                priors = ones(n_classes)/n_classes
            else:
                raise TypeError("Specify a right value for piors")
        elif isinstance(self.priors,(list,tuple,ndarray,Series)):
            priors = array([x/sum(self.priors) for x in self.priors])

        #check if any value in priors is negative
        if any(x < 0 for x in priors):
            raise ValueError("priors must be non-negative")

        #convert to pandas Series
        priors = Series(priors,index=classes,name="priors")

        #convert to ordered dictionary
        call_ = OrderedDict(Xtot=Xtot,X=X,y=y,target=target,features=features,classes=classes,priors=priors,n_samples=n_samples,n_features=n_features,n_classes=n_classes,level=level,tol=tol)
        #convert to namedtuple
        self.call_ = namedtuple("call",call_.keys())(*call_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #sample statistics
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #summary information
        summary_infos = DataFrame({"Infos" : ["Total Sample Size","Variables","Classes"],
                                   "Value" : [n_samples,n_features,n_classes],
                                   "DF" : ["DF Total", "DF Within Classes", "DF Between Classes"],
                                   "DF value" : [n_samples - 1, n_samples - n_classes, n_classes - 1]})
        #total-sample and within-class summaries
        tsummary, wsummary = describe(X), {k : describe(X_k[k]) for k in classes}
        #convert to dictionary
        summary_ = OrderedDict(infos=summary_infos,total=tsummary,within=wsummary)
        #convert to namedtuple
        self.summary_ = namedtuple("summary",summary_.keys())(*summary_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #sum of square cross product (SSCP) matrix
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #total-sample and within-class SSCP matrix
        tsscp, wsscp = sscp(X=X), {k: sscp(X_k[k]) for k in classes}
        #pooled within-class SSCCP matrix
        pwsscp = reduce(lambda i , j : i + j, wsscp.values())
        #between-class SSCP matrix
        bsscp = tsscp - pwsscp
        #convert to dictionary
        sscp_ = OrderedDict(total=tsscp,within=wsscp,pooled=pwsscp,between=bsscp)
        #convert to namedtuple
        self.sscp_ = namedtuple("sscp",sscp_.keys())(*sscp_.values())
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #covariance matrices
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #total-sample and biased total-sample covariance matrices
        tcov, tcovb = tsscp.div(n_samples - 1), tsscp.div(n_samples)
        #within-class and biased within-class covariance matrices
        wcov, wcovb = {k : wsscp[k].div(n_k[k]-1) for k in classes}, {k : wsscp[k].div(n_k[k]) for k in classes}
        #test of equality of covariance matrix - homogeneity of variance and covariance
        wcov_test = box_m_test(wcov.values(),list(n_k))
        #chi-square pvalue
        p_value = wcov_test.iloc[0,6]
        #print warning message
        if self.warn_message:
            if self.method == "quad" and  p_value > tol:
                print("\nSince the Chi-Square value is not significant at the {} level, a pooled covariance matrix will be used in the discriminant function.\nReference: Morrison, D.F. (1976) Multivariate Statistical Methods p252.".format(tol))
            elif self.method == "quad" and  p_value <= tol:
                print("\nSince the Chi-Square value is significant at the {} level, the within covariance matrices will be used in the discriminant function.\nReference: Morrison, D.F. (1976) Multivariate Statistical Methods p252.".format(tol))

        #pooled within-class and biased pooled within-class covariance matrices
        pwcov, pwcovb = pwsscp.div(n_samples - n_classes), pwsscp.div(n_samples)
        #between-class and biased between-class covariance matrices
        bcov, bcovb  = bsscp.div(n_samples*(n_classes-1)/n_classes), bsscp.div(n_samples)
        ##covariance matrices informations - rank and natural log of the determinant
        cov_info = cov_infos(X=pwcov).to_frame("Pooled").T
        if self.method == "quad":
            #within-class covariance matrices informations
            wcov_infos = concat((cov_infos(X=wcov[k]).to_frame(k) for k in classes),axis=1).T
            #concatenate
            cov_info = concat((cov_info,wcov_infos),axis=0)
        #convert to integer
        cov_info["Rank"] = cov_info["Rank"].astype(int)
        #convert to dictionary
        cov_ = OrderedDict(infos=cov_info,total=tcov,btotal=tcovb,within=wcov,bwithin=wcovb,pooled=pwcov,bpooled=pwcovb,between=bcov,bbetween=bcovb,test=wcov_test)
        #convert to namedtuple
        self.cov_ = namedtuple("cov",cov_.keys())(*cov_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #correlation coefficients test
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #total sample and within-class correlation coefficients
        tcortest, wcortest  = cov_to_cor_test(X=tcovb,n_samples=n_samples), {k: cov_to_cor_test(X=wcovb[k],n_samples=n_k[k]) for k in classes}
        #pooled within-class and between-class correlation coefficients
        pwcortest, bcortest = cov_to_cor_test(X=pwcov,n_samples=n_samples-n_classes+1), cov_to_cor_test(X=bcov,n_samples=n_classes)
        #convert to dictionary
        cortest_ = OrderedDict(total=tcortest,within=wcortest,pooled=pwcortest,between=bcortest)
        #convert to namedtuple
        self.corr_ = namedtuple("corr",cortest_.keys())(*cortest_.values())
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #classes and individuals informations
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #class level information
        class_infos = DataFrame(c_[n_k,p_k,priors],columns=["Frequency","Proportion","Prior Probability"],index=classes)
        class_infos["Frequency"] = class_infos["Frequency"].astype(int)
        #within-class average
        wcenter = concat((X_k[k].mean(axis=0).to_frame(k) for k in classes),axis=1).T
        #total-sample standardized class means
        tcenter = wcenter.sub(tsummary["mean"],axis=1).div(tsummary["std"],axis=1).T
        #pooled within-class standardized class means
        pcenter = wcenter.sub(tsummary["mean"],axis=1).div(sqrt(diag(pwcov)),axis=1).T
        #convert to dictionary
        classes_, ind_ = OrderedDict(infos=class_infos,center=wcenter,total=tcenter,pooled=pcenter), OrderedDict()
        
        #squared distance and generalized squared distance
        if self.method == "quad" and  p_value <= tol:
            #inverse of within-class covariance matrices
            invwcov = {k : linalg.inv(wcov[k]) for k in classes}
            #squared distance of classes to origin
            class_mahal = concat((sqmahalanobis(X=wcenter,VI=invwcov[k],mu=wcenter.loc[k,:]).to_frame(k) for k in classes),axis=1)
            #generalized squared distance of classes to origin
            class_gen = class_mahal.add(wcov_infos.loc[classes,"Natural Log of the Determinant"],axis=1) 

            #squared distance of individuals to origin
            ind_mahal = concat((sqmahalanobis(X=X,VI=invwcov[k],mu=wcenter.loc[k,:]).to_frame(k) for k in classes),axis=1)
            #generalized squared distance of individuals to origin
            ind_gen = ind_mahal.add(wcov_infos.loc[classes,"Natural Log of the Determinant"],axis=1) 
        elif (self.method == "linear") or (self.method == "quad" and  p_value > tol):
            #inverse pooled within-class covariance matrix
            invpwcov = linalg.inv(pwcov)
            #squared distance of classes to origin
            class_mahal = concat((sqmahalanobis(X=wcenter,VI=invpwcov,mu=wcenter.loc[k,:]).to_frame(k) for k in classes),axis=1)
            #generalized squared distance of classes to origin
            class_gen = class_mahal.copy()

            #squared distance of individuals to origin
            ind_mahal = concat((sqmahalanobis(X=X,VI=invpwcov,mu=wcenter.loc[k,:]).to_frame(k) for k in classes),axis=1)
            #generalized squared to distance of individuals to origin
            ind_gen = ind_mahal.copy()

            #coefficients of features
            coef = DataFrame(dot(invpwcov,wcenter.T),index=X.columns,columns=classes)
            #intercept
            intercept = - 0.5*diag(dot(dot(wcenter,invpwcov),wcenter.T))
            if not (isinstance(self.priors,str) and self.priors == "equal"):
                intercept = intercept + log(priors)
            #convert to DataFrame
            intercept = Series(intercept,index=classes).to_frame("Constant").T
            #scores of individuals
            ind_scores = X.dot(coef).add(intercept.values,axis=1)
            #concatenate
            self.coef_ = concat((intercept,coef),axis=0)

            #add score to ordered dictionary
            ind_ = OrderedDict(**ind_,**OrderedDict(scores=ind_scores))

        #remove priors log-probabilities
        if not (isinstance(self.priors, str) and self.priors == "equal"):
            class_gen, ind_gen = class_gen.sub(2*log(priors),axis=1), ind_gen.sub(2*log(priors),axis=1)

        #convert to ordered dictionary
        classes_, ind_ = OrderedDict(**classes_, **OrderedDict(mahal=class_mahal,gen=class_gen)), OrderedDict(**ind_,**OrderedDict(mahal=ind_mahal,gen=ind_gen))
        #convert to namedtuple
        self.classes_, self.ind_ = namedtuple("classes",classes_.keys())(*classes_.values()), namedtuple("ind",ind_.keys())(*ind_.values())   
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #multivariate goodness of fit - diagnostic test
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #standard deviation
        stdev = DataFrame(c_[sqrt(diag(tcov)),sqrt(diag(pwcov)),sqrt(diag(bcov))],columns=["Total Std. Dev.","Pooled Std. Dev.","Between Std. Dev."],index=features)
        #compute univariate analysis of variance - ANOVA
        anova = concat((stdev,univ_test(X,y)),axis=1)
        #average R-Square
        unwavg_rsq, wavg_rsq = average(anova.loc[:,"R-Square"].values), average(anova.loc[:,"R-Square"].values,weights=diag(tcovb))
        avg_rsq = DataFrame([[unwavg_rsq,wavg_rsq]],index=["Average R-Square"],columns=["Unweighted","Weighted by Variance"])
        #compute multivariate analysis of variance - MANOVA
        manova = MANOVA.from_formula(formula="{}~{}".format("+".join(features),"+".join([target])), data=concat((X,y),axis=1)).mv_test(skip_intercept_test=True).summary_frame
        manova.index = manova.index.droplevel()
        #convert to dictionary
        statistics_ = OrderedDict(anova=anova,manova=manova,average_rsq=avg_rsq) 
    
        #add if linear discriminant analysis
        if self.method == "linear":
            #performance
            performance = diagnostics(Vb=tcovb,Wb=pwcovb,n_samples=n_samples,n_classes=n_classes)
            #update classes
            statistics_ = OrderedDict(**statistics_,**OrderedDict(performance=performance))
        
        #convert to namedtuple
        self.statistics_ = namedtuple("statistics",statistics_.keys())(*statistics_.values()) 

        self.model_ = "discrim"
        return self
    
    def fit_transform(self,X,y):
        """
        Fit to data, then transform it

        Fits transformer to ``X`` and returns a transformed version of samples.

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Training data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
        
        y : Series of shape (n_samples,)
            Target values. True labels for ``X``.
        
        Returns
        -------
        X_new : DataFrame of shape (n_samples, n_classes)
            Transformed samples. 
        """
        #fit discriminant analysis model
        self.fit(X,y)
        #chi-square pvalue and tolerance threshold
        p_value, tol = self.cov_.test.iloc[0,6], self.call_.tol
        if self.method == "quad" and  p_value <= tol:
            raise NotImplementedError("Since the Chi-Square value is significant at the {} level.'fit_transform' method cannot be used.".format(tol))
        return self.ind_.scores
    
    def transform(self,X):
        """
        Project data to maximize class separation

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            New data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
        
        Returns
        -------
        X_new : DataFrame of shape (n_samples, n_classes)
            Transformed data, where ``n_samples`` is the number of samples and ``n_classes`` is the number of classes.
        """
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if the estimator is fitted by verifying the presence of fitted attributes
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_fitted(self)

        #chi-square pvalue and tolerance threshokd
        p_value, tol = self.cov_.test.iloc[0,6], self.call_.tol
        if self.method == "quad" and  p_value <= tol:
            raise NotImplementedError("Since the Chi-Square value is significant at the {} level.'transform' method cannot be used.".format(tol))

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if X is an instance of class pd.DataFrame
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_dataframe(X=X)

        #set index name as None
        X.index.name = None

        #check if X contains original features
        if not set(self.call_.Xtot.columns).issubset(X.columns):
            raise ValueError("The names of the features is not the same as the ones in the active features of the DISCRIM result")
        
        #select original features
        X = X[self.call_.Xtot.columns]

        #split X
        split_X = splitmix(X)
        #extract elements
        X_quanti, X_quali, n_quanti, n_quali = split_X.quanti, split_X.quali, split_X.k1, split_X.k2

        #initialize DataFrame
        Xcod = DataFrame(index=X.index,columns=self.call_.X.columns).astype(float)

        #check if numerics variables
        if n_quanti > 0:
            #replace with numerics columns
            Xcod.loc[:,X_quanti.columns] = X_quanti
        
        #check if categorical variables      
        if n_quali > 0:
            #active categorics
            categorics = [x for x in self.call_.X.columns if x not in self.call_.Xtot.columns]
            #replace with dummies
            Xcod.loc[:,categorics] = tab_disjunctive(X=X_quali,dummies_cols=categorics,prefix=True,sep="")
        
        #remove non selected variables
        Xcod = Xcod.loc[:,list(self.cov_.total.index)] 
        #multiply by linear discriminant analysis coefficients
        return Xcod.dot(self.coef_.iloc[1:,:]).add(self.coef_.iloc[0,:].values,axis=1)