Source code for discrimintools.discriminant_analysis._plslogit

# -*- coding: utf-8 -*-
from numpy import array, linalg, c_, cumsum, log, where
from pandas import DataFrame, CategoricalDtype, get_dummies, Series, concat
from pandas.api.types import is_string_dtype
from collections import OrderedDict, namedtuple
from sklearn.cross_decomposition import PLSRegression
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel
from scipy.spatial.distance import pdist,squareform
from sklearn.utils.validation import check_is_fitted

#interns functions
from ._base import _BaseDA
from .functions.utils import check_is_dataframe, check_is_series, check_is_bool
from .functions.preprocessing import preprocessing
from .functions.model_matrix import model_matrix
from .functions.plsrvip import plsrvip
from .functions.splitmix import splitmix
from .functions.tab_disjunctive import tab_disjunctive


[docs]
class PLSLOGIT(_BaseDA):
    """
    Partial Least Squares Logistic Regression (PLSLOGIT)

    Performs partial least squares logistic regression (PLSLOGIT). It's a classical logistic regression (binary, multinomial, ordinal) carried out on the scores of a partial least scores of explanatory variables.
    Partial least squares logistic regression consists in three steps:
    
    1. Recode the target variable into ``n_classes`` dummy variables.
    2. Computation of partial least squares regression using `PLSRegression <https://scikit-learn.org/stable/modules/generated/sklearn.cross_decomposition.PLSRegression.html>`_.
    3. Computation of logistic regression (binary, multinomial, ordinal) on ``x_scores`` extract in step 2 using `Statsmodels <https://www.statsmodels.org/v0.14.4/index.html>`_.   

    Parameters
    ----------
    n_components : int or None, default = 2
        Number of components to keep. Should be in ``[1, n_features]``.

    scale : bool, defaul = True
        Whether to scale ``X`` and ``y``.

    classes : None, tuple or list, default = None
        Name of level in order to return. If ``None``, classes are sorted using unique values in y.

    max_iter : int, default = 500
        The maximum number of iterations for NIPALS method
    
    tol : float, default = 1e-06
        The tolerance used as convergence criteria in the NIPALS method.

    var_select : bool, default = True
        Whether to applied feature selection based on variables importance in Projection for Partial Least-Squares Regression

    threshold : float, default = 1.0
        You can use VIP to select predictor variables when multicollinearity exists among variables. 
        Variables with a VIP score greater than 1 are considered important for the projection of the PLS regression.

    multi_class : None, str.
        You can choose between ``multinomial`` or ``ordinal`` logistic regression. Only for multiclass problem.

    warn_message : bool, default = True
        Whether to show warning messages.

    kwargs : 
        Additionals parameters to used in ``fit`` for logistic regression. see `statsmodels <https://www.statsmodels.org/v0.14.4/index.html>`_.

    Returns
    -------
    call_ : NamedTuple
        Call informations:

        - Xtot : DataFrame of shape (n_samples, n_columns)
                Input data.
        - X : DataFrame of shape (n_samples, n_features)
                Training data.
        - y : Series of shape (n_samples,)
                Target values. True values for ``X``.
        - target : str
                 Name of target.
        - features : list
            Names of features seen during ``fit``.
        - classes : list
            Names of classes
        - priors : Series of shape (n_classes,)
            Priors probabilities
        - center : Series of shape (n_features,)
            The average of `X`
        - scale : Series of shape (n_features,)
            The standard deviation of ``X``.
        - n_samples : int
            Number of samples.
        - n_features : int
            Number of features.
        - max_components : int
            Maximum number of components.
        - n_components : int
            Number of components kept.
        - n_classes : int
            Number of target values.
        - max_iter : int
            Maximum number of iterations.
        - tol : float
            The tolerance used as convergence criteria.
        - threshold : float,
            The tolerance for variable importance in projection.
        - multi_class : None, str
            The multiclass logistic regression applied.

    cancoef_ : NamedTuple
        Canonical coefficients:

        - standardized : DataFrame of shape (n_variables, n_components)
            The standardized canonical coefficients
        - raw : DataFrame of shape (n_variables + 1, n_components)
            The raw canonical coefficients

    classes_ : NamedTuple
        Classes informations:

        - infos : DataFrame of shape (n_classes, 3)
            class level information (frequency, proportion, prior probability).
        - coord : DataFrame of shape (n_classes, n_components)
            Class coordinates.
        - eucl : DataFrame of shape (n_classes, n_classes)
            The squared Euclidean distance to origin.
        - gen : DataFrame shape (n_classes, n_classes) 
            The generalized squared distance to origin.

    coef_ : NamedTuple
        Partial least squares logit model coefficients:

        - standardized : DataFrame of shape (n_variables, n_classes - 1)
            The standardized coefficients.
        - raw : DataFrame of shape (n_variables+1, n_classes - 1)
            The raw coefficients.

    explained_variance_ : DataFrame of shape (n_components, 2)
        The explained variance and the cumulative explained variance.

    ind_ : NamedTuple
        Individuals informations:

        - coord : DataFrame of shape (n_samples, n_components)
            The transformed training simples.
        - scores : DataFrame of shape (n_samples,) or (n_samples, n_classes - 1)
            The total scores of individuals.
        - eucl : DataFrame of shape (n_samples, n_classes)
            The squared Euclidean distance to origin.
        - gen : DataFrame shape (n_samples, n_classes) 
            The generalized squared distance to origin.

    logit_ : class
        An object of class Logit.

    logit_coef_ : DataFrame of shape (n_components + 1,) or (n_components + 1, n_classes - 1)
        Logistic regression model coefficients.

    model_ : str, default = 'plslogit'
        The model fitted name.

    var_ : NamedTuple
        Variables informations:

        - weights : DataFrame of shape (n_features, n_components)
            The left singular vectors of the cross-covariance matrices of each iteration.

        - loadings : DataFrame of shape (n_features, n_components)
            The loadings of `X`.

        - rotations : DataFrame of shape (n_features, n_components)
            The projection matrix used to transform X.

    See also
    --------
    :class:`~discrimintools.PLSDA`
        Partial Least Squares Linear Discriminant Analysis
    :class:`~discrimintools.summaryPLSLOGIT`
        Printing summaries of Partial Least Squares Linear Logistic Regression model.
    :class:`~discrimintools.summaryDA`
        Printing summaries of Discriminant Analysis model.

    References
    ----------
    [1] Droesbeke J. J., Lejeune M., Saporta G.  (2005), « `Modèles statistiques pour données qualitatives <https://www.editionstechnip.com/fr/catalogue-detail/995/modeles-statistiques-pour-donnees-qualitatives.html>`_ », Editions TECHNIP.

    [2] Tuffery S. (2017), « Data Mining et Statistique décisionnelle : La science des données », Editions TECHNIP.

    [3] Tuffery S. (2024), « `Modélisation prédictive et Apprentissage statistique avec R <https://www.editionstechnip.com/fr/catalogue-detail/2145/modelisation-predictive-et-apprentissage-statistique-avec-r.html>`_ », Editions TECHNIP, 5ed;

    [4] Tuffery R. (2025), « `Data Science, Statistique et Machine Learning <https://www.editionstechnip.com/fr/catalogue-detail/1005/data-science-statistique-et-machine-learning.html>`_ », Editions TECHNIP, 6ed.

    Examples
    --------
    >>> from discrimintools.datasets import load_dataset, load_vins
    >>> from discrimintools import PLSLOGIT
    >>> #pls + logit
    >>> D = load_dataset("breast")
    >>> y, X = D["Class"], D.drop(columns=["Class"])
    >>> clf = PLSLOGIT()
    >>> clf.fit(X,y)
    PLSLOGIT()
    >>> D = load_vins("train")
    >>> y, X = D["Qualite"], D.drop(columns=["Qualite"])
    >>> #pls + multinomial
    >>> clf = PLSLOGIT(classes=('Mediocre','Moyen','Bon'))
    >>> clf.fit(X,y)
    PLSLOGIT(classes=('Mediocre','Moyen','Bon'))
    >>> "pls + ordinal
    >>> clf = PLSLOGIT(multi_class="ordinal",classes=('Mediocre','Moyen','Bon'),method='bfgs')
    >>> clf.fit(X,y)
    PLSLOGIT(multi_class="ordinal",classes=('Mediocre','Moyen','Bon'),method='bfgs')
    """

[docs]
    def __init__(
            self, n_components = 2, scale = True, classes = None, max_iter = 500, tol = 1e-10, var_select = False, threshold = 1.0, multi_class=None, warn_message = True, **kwargs
    ):
        self.n_components = n_components
        self.scale = scale
        self.classes = classes
        self.max_iter = max_iter
        self.tol = tol
        self.var_select = var_select
        self.threshold = threshold
        self.multi_class = multi_class
        self.warn_message = warn_message
        self.kwargs = kwargs


    def decision_function(self,X):
        """
        Apply decision function to an input data

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.

        Returns
        -------
        C : DataFrame of shape (n_samples, ) or (n_samples, n_classes - 1)
            Decision function values
        """
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if the estimator is fitted by verifying the presence of fitted attributes
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_fitted(self)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if X is an instance of class pd.DataFrame
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_dataframe(X)

        #set index name as None
        X.index.name = None

        #check if X contains original features
        if not set(self.call_.Xtot.columns).issubset(X.columns):
            raise ValueError("The names of the features is not the same as the ones in the active features of the PLSLOGIT result")
        
        #select original features
        X = X[self.call_.Xtot.columns]

        #split X
        split_X = splitmix(X)
        #extract elements
        X_quanti, X_quali, n_quanti, n_quali = split_X.quanti, split_X.quali, split_X.k1, split_X.k2

        #initialize DataFrame
        Xcod = DataFrame(index=X.index,columns=self.call_.X.columns).astype(float)

        #check if numerics variables
        if n_quanti > 0:
            #replace with numerics columns
            Xcod.loc[:,X_quanti.columns] = X_quanti
        
        #check if categorical variables      
        if n_quali > 0:
            #active categorics
            categorics = [x for x in self.call_.X.columns if x not in self.call_.Xtot.columns]
            #replace with dummies
            Xcod.loc[:,categorics] = tab_disjunctive(X=X_quali,dummies_cols=categorics,prefix=True,sep="")

        #remove non selected variables
        Xcod = Xcod.loc[:,list(self.call_.center.index)]
        #multiply by coefficients
        if self.call_.n_classes == 2:
            C = Xcod.dot(self.coef_.raw.iloc[1:]).add(self.coef_.raw.iloc[0])
        else:
            C = Xcod.dot(self.coef_.raw.iloc[1:,:]).add(self.coef_.raw.iloc[0,:].values,axis=1)
        return C

    def fit(self,X,y):
        """
        Fit Partial Least Squares Logistic Regression Model

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Training data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
        
        y : Series of shape (n_samples,)
            Target values. True values for ``X``.

        Returns
        -------
        self : object
            Fitted estimator
        """
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if X is an instance of class pd.DataFrame
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_dataframe(X)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if y is an instance of class pd.Series
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_series(y)

        #check if len are equal
        if X.shape[0] != y.shape[0]:
            raise ValueError("The number of samples in X must be equal to the number of samples in y")
        
        #check if all elements in y are string
        if not all(isinstance(kq, str) for kq in y):
            raise TypeError("All elements in y must be a string")
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if max_iter is not None
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        if self.max_iter is None:
            max_iter = 500
        elif not isinstance(self.max_iter,(int,float)):
            raise TypeError("{} is not supported".format(type(self.max_iter)))
        elif self.max_iter < 0:
            raise ValueError("max_iter' must be positive")
        else:
            max_iter = self.max_iter
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if tol is not None
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        if self.tol is None:
            tol = 1e-10
        elif not isinstance(self.tol,(int,float)):
            raise TypeError("{} is not supported".format(type(self.tol)))
        elif self.tol < 0 or self.tol > 1:
            raise ValueError("the 'tol' value {} is not within the required range of 0 and 1.".format(self.tol))
        else:
            tol = self.tol

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if var_select is a bool
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_bool(self.var_select)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if threshold is not None
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        if self.threshold is None:
            threshold = 1
        elif not isinstance(self.threshold,(int,float)):
            raise TypeError("{} is not supported".format(type(self.threshold)))
        elif self.threshold < 0 :
            raise ValueError("the 'threshold' value must be positive.")
        else:
            threshold = self.threshold

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if warn_message is a bool
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_bool(self.warn_message)

        #make a copy of original data
        Xtot = X.copy(deep=True)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #preprocessing
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        X = preprocessing(X)

        #set y name if None
        if y.name is None or isinstance(y.name, int):
            y.name = "group"

        #warning message to inform
        if self.warn_message:
            if any(is_string_dtype(X[k]) for k in X.columns):
                print("\nCategorical features have been encoded into binary variables.\n")

        #recode to dummy if categorics variables
        X = model_matrix(X=X)

        #unique element in y
        uq_y = sorted(y.unique().tolist())
        #number of classes
        n_classes = len(uq_y)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if multi_class is assigned for multiclass logistic regression
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        if n_classes > 2:
            if self.multi_class is None:
                multi_class = "multinomial"
            elif self.multi_class not in ["multinomial","ordinal"]:
                raise ValueError("'multi_class' should be one of 'multinomial', 'ordinal'.")
            else:
                multi_class = self.multi_class
        else:
            multi_class = None

        #class of categories
        if self.classes is not None and isinstance(self.classes, (list,tuple)):
            if len(list(set(self.classes) & set(uq_y))) != n_classes:
                raise ValueError("Insert good classes")
            classes = [str(x) for x in self.classes]
        else:
            classes = uq_y

        #convert y to categorical data type
        y = y.astype(CategoricalDtype(categories=classes,ordered=True))
        #set target name
        target = y.name
        #set piors
        priors = Series(array(y.value_counts(normalize=True).loc[classes]),index=classes,name="priors")
        #number of samples and features
        n_samples, n_features, features = X.shape[0], X.shape[1], list(X.columns)

        #create disjunctive table
        Y = get_dummies(y,dtype=int)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #partial least square regression (PLSR)
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #QR decomposition of X
        Q, R = linalg.qr(X)
        #maximum number of components
        max_components = int(min(linalg.matrix_rank(Q), linalg.matrix_rank(R), n_samples - 1, n_features))
        #set number of components
        if self.n_components is None:
            n_components = max_components
        elif not isinstance(self.n_components,int):
            raise TypeError("'n_components' must be an integer.")
        elif self.n_components < 1:
            raise ValueError("'n_components' must be equal or greater than 1.")
        else:
            n_components = min(self.n_components, max_components)

       #partial least squares regression
        plsr = PLSRegression(n_components=n_components,scale=self.scale,max_iter=max_iter,tol=tol).fit(X,Y)
        #variable importance in projection
        vip = plsrvip(obj=plsr,threshold=threshold)

        #if selected variables
        if self.var_select:
            #update X
            X = X[vip.selected]
            #update partial least squares regression
            plsr = PLSRegression(n_components=n_components,scale=self.scale,max_iter=max_iter,tol=tol).fit(X,Y)
            #update variable importance in projection
            vip = plsrvip(obj=plsr,threshold=threshold)
        
        #update number of features and features in
        n_features, features = plsr.n_features_in_, plsr.feature_names_in_

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #convert to pandas DataFrame
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #center and scale
        x_center, x_scale = Series(plsr._x_mean,index=plsr.feature_names_in_,name="center"), Series(plsr._x_std,index=plsr.feature_names_in_,name="scale")

        #convert to ordered dictionary
        call_ = OrderedDict(Xtot=Xtot,X=X,y=y,target=target,features=features,classes=classes,priors=priors,center=x_center,scale=x_scale,
                            n_samples=n_samples,n_features=n_features,max_components=max_components,n_components=n_components,n_classes=n_classes,
                            max_iter=max_iter,tol=tol,threshold=threshold,multi_class=multi_class)
        #convert to namedtuple
        self.call_ = namedtuple("call",call_.keys())(*call_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #variables informations : weights, loadings and rotations
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #variables weights
        var_weights = DataFrame(plsr.x_weights_,index=features,columns=["Can{}".format(x+1) for x in range(n_components)])
        #variables loadings
        var_loadings = DataFrame(plsr.x_loadings_,index=features,columns=var_weights.columns)
        #variables rotations
        var_rotations = DataFrame(plsr.x_rotations_,index=features,columns=var_weights.columns)
        #convert to ordered dictionary
        var_ = OrderedDict(weights=var_weights,loadings=var_loadings,rotations=var_rotations)
        #convert to namedtuple
        self.var_  = namedtuple("var",var_.keys())(*var_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #canonical discriminant coefficients
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #total-sample standardized canonical coefficients
        std_cancoef = var_loadings
        #total-sample raw (unstandardized) canonical coefficients
        raw_cancoef = concat((std_cancoef.T.dot(-x_center/x_scale).to_frame("Constant").T, std_cancoef.div(x_scale,axis=0)),axis=0)
        #convert to ordered dictionary
        cancoef_ = OrderedDict(standardized=std_cancoef,raw=raw_cancoef)
        #convert to namedtuple
        self.cancoef_ = namedtuple("cancoef",cancoef_.keys())(*cancoef_.values())
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #individuals informations : coordinates and scores
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #individuals coordinates
        ind_coord = DataFrame(plsr.x_scores_,index=X.index,columns=var_weights.columns)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #logistic regression
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #add constant to PLS components
        x_coord = sm.add_constant(ind_coord)
        if n_classes == 2:
            #binary logistic regression
            clf = sm.Logit(y.cat.codes,x_coord).fit(disp=self.warn_message,**self.kwargs)

            #binary logistic coefficients
            clf_coef = clf.params
            #set name
            clf_coef.name = classes[1]
            
            #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
            #classification functions
            #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
            #standardize coefficients
            std_coef = concat((clf.params.iloc[:1],std_cancoef.dot(clf.params.iloc[1:])),axis=0)
            #concatenate
            raw_coef = raw_cancoef.dot(clf.params.iloc[1:])
            #update constante
            raw_coef.update(raw_coef.iloc[:1].add(clf.params.iloc[0]))
            #set name
            std_coef.name, raw_coef.name = classes[1], classes[1]
            #convert to ordered dictionary
            coef_ = OrderedDict(standardized=std_coef,raw=raw_coef)
            #convert to namedtuple
            self.coef_ = namedtuple("coef",coef_.keys())(*coef_.values())

            #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
            #individuals scores - decision function
            #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
            ind_scores = X.dot(raw_coef.iloc[1:]).add(raw_coef.iloc[0])
        else:
            if multi_class == "multinomial":
                #multinomial logistic regression
                clf = sm.MNLogit(y,x_coord).fit(disp=self.warn_message,**self.kwargs)
                #coefficients of multinomial logistic regression
                clf_coef = clf.params
                clf_coef.columns = classes[1:]
            else:
                #cumulative logistic regression
                clf = OrderedModel(y,ind_coord,distr='logit').fit(disp=self.warn_message,**self.kwargs)

                #coefficients of cumulative logistic regression
                clf_coef, clf_cst = clf.params.iloc[:n_components], clf.params.iloc[n_components:]
                clf_cst.name = "Constant"
                #replicate coeffiicients
                clf_coef = concat((clf_coef for x in clf_cst.index),axis=1)
                clf_coef.columns = clf_cst.index
                #concatenate
                clf_coef = concat((clf_cst.to_frame().T,clf_coef),axis=0)
            
            #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
            #classification functions
            #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
            #standardize coefficients
            std_coef = concat((clf_coef.iloc[0,:].to_frame().T,std_cancoef.dot(clf_coef.iloc[1:,:])),axis=0)
            #concatenate
            raw_coef = raw_cancoef.iloc[1:,:].dot(clf_coef.iloc[1:,:])
            #update constante
            raw_cst = raw_cancoef.iloc[0,:].to_frame().T.dot(clf_coef.iloc[1:,:]).add(clf_coef.iloc[0,:].values,axis=1)
            #concatenate
            raw_coef = concat((raw_cst,raw_coef),axis=0)
            #convert to ordered dictionary
            coef_ = OrderedDict(standardized=std_coef,raw=raw_coef)
            #convert to namedtuple
            self.coef_ = namedtuple("coef",coef_.keys())(*coef_.values())

            #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
            #individuals scores - decision functions
            #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
            ind_scores = X.dot(raw_coef.iloc[1:,:]).add(raw_coef.iloc[0,:],axis=1)
            
        #store logistic informations
        self.logit_, self.logit_coef_ = clf, clf_coef

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #classes informations: coordinates, squared euclidean distance and squared generalized distance
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #count and proportion
        n_k, p_k = y.value_counts(normalize=False).loc[classes],  y.value_counts(normalize=True).loc[classes]
        #class level information
        class_infos = DataFrame(c_[n_k,p_k,priors],columns=["Frequency","Proportion","Prior Probability"],index=classes)
        class_infos["Frequency"] = class_infos["Frequency"].astype(int)
        #classes coordinates
        class_coord = concat((ind_coord.loc[y[y==k].index,:].mean(axis=0).to_frame(k) for k in classes),axis=1).T
        #squared euclidean distance between classes
        class_eucl = DataFrame(squareform(pdist(class_coord,metric="sqeuclidean")),index=classes,columns=classes)
        #squared generalized distance
        class_gen = class_eucl.sub(2*log(priors),axis=1)
        #convert to ordered dictionary
        classes_ = OrderedDict(infos=class_infos,coord=class_coord,eucl=class_eucl,gen=class_gen)
        #convert to namedtuple
        self.classes_ = namedtuple("classes",classes_.keys())(*classes_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #individuals additional informations: squared euclidean distance between classes barycenters and squared generalized distance
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #squared euclidean distance to class center
        ind_eucl = concat((ind_coord.sub(class_coord.loc[k,:],axis=1).pow(2).sum(axis=1).to_frame(k) for k in classes),axis=1)
        #squared generalized distance
        ind_gen = ind_eucl.sub(2*log(priors),axis=1)
        #convert to ordered dictionary
        ind_ = OrderedDict(coord=ind_coord,scores=ind_scores,eucl=ind_eucl,gen=ind_gen)
        #convert to namedtuple
        self.ind_ = namedtuple("ind",ind_.keys())(*ind_.values())

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #explained variance for X by each components 
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #total variance in Z
        total_var = X.sub(x_center,axis=1).div(x_scale,axis=1).var(axis=0,ddof=1).sum()
        #explained variance
        explained_var = 100*(ind_coord.var(axis=0,ddof=1)*var_loadings.pow(2).sum(axis=0))/total_var
        #convert to DataFrame
        self.explained_variance_ = DataFrame(c_[explained_var,cumsum(explained_var)],columns=["Proportion (%)","Cumulative (%)"],index=explained_var.index)

        #set model name
        self.model_ = "plslogit"
        return self
    
    def predict(self,X):
        """
        Predict class labels for samples in X

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
            The data for which we want to get the predictions.
        
        Returns
        -------
        y_pred : Series of shape (n_samples,) 
            Predicted labels for ``X``.
        """
        #estimated probabilities
        y_prob = self.predict_proba(X)
        if self.call_.n_classes == 2:
            y_pred = Series(array(self.call_.classes)[where(y_prob >= 0.5,1,0)],index=X.index,name="prediction")
        else:
            y_pred = y_prob.idxmax(axis=1)
            y_pred.name = "prediction"
        return y_pred
    
    def predict_log_proba(self, X):
        """
        Predict logarithm of probability estimates.

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.

        Returns
        -------
        C : DataFrame of shape (n_samples,) or (n_samples, n_classes)
            Posterior log-probabilities
        """
        #estimated log-probabilities
        return self.predict_proba(X).transform(log)
    
    def predict_proba(self,X):
        """
        Probability estimates.

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
            The data for which we want to get the predictions.
        
        Returns
        -------
        C : DataFrame of shape (n_samples,) or (n_samples, n_classes)
            Estimated probabilities.
        """
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if the estimator is fitted by verifying the presence of fitted attributes
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_fitted(self)

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        #check if X is an instance of class pd.DataFrame
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------------
        check_is_dataframe(X)

        #set index name as None
        X.index.name = None

        #check if X contains original features
        if not set(self.call_.Xtot.columns).issubset(X.columns):
            raise ValueError("The names of the features is not the same as the ones in the active features of the PLSLOGIT result")
        #select original features
        X = X[self.call_.Xtot.columns]

        #split X
        split_X = splitmix(X)
        #extract elements
        X_quanti, X_quali, n_quanti, n_quali = split_X.quanti, split_X.quali, split_X.k1, split_X.k2

        #initialize DataFrame
        Xcod = DataFrame(index=X.index,columns=self.call_.X.columns).astype(float)

        #check if numerics variables
        if n_quanti > 0:
            #replace with numerics columns
            Xcod.loc[:,X_quanti.columns] = X_quanti
        
        #check if categorical variables      
        if n_quali > 0:
            #active categorics
            categorics = [x for x in self.call_.X.columns if x not in self.call_.Xtot.columns]
            #replace with dummies
            Xcod.loc[:,categorics] = tab_disjunctive(X=X_quali,dummies_cols=categorics,prefix=True,sep="")
        
        #remove non selected variables
        Xcod = Xcod.loc[:,list(self.call_.center.index)]
        #standardize : Z = (X - center)/scale and apply rotation
        coord = Xcod.sub(self.call_.center,axis=1).div(self.call_.scale,axis=1).dot(self.var_.rotations)
        #add constant
        if not (self.call_.n_classes > 2 and self.call_.multi_class == "ordinal"):
            coord = sm.add_constant(coord)
        #predicted probabilities
        y_prob = self.logit_.predict(coord)
        #set columns in case of multi class
        if self.call_.n_classes > 2:
            y_prob.columns = self.call_.classes
        return y_prob