Source code for discrimintools.discriminant_analysis._candisc

# -*- coding: utf-8 -*-
from numpy import linalg,sum, sqrt, zeros, dot, transpose,nan, log,c_,real, insert, cumsum,diff,log,array,diag, average,corrcoef
from pandas import DataFrame, Series, concat, CategoricalDtype
from pandas.api.types import is_string_dtype
from collections import namedtuple, OrderedDict
from functools import reduce
from statsmodels.multivariate.manova import MANOVA
from scipy.spatial.distance import pdist,squareform
from sklearn.utils.validation import check_is_fitted

#internal functions
from ._base import _BaseDA
from .functions.utils import check_is_dataframe, check_is_series, check_is_bool
from .functions.preprocessing import preprocessing
from .functions.model_matrix import model_matrix
from .functions.describe import describe
from .functions.sscp import sscp
from .functions.cov_to_cor_test import cov_to_cor_test
from .functions.distance import sqmahalanobis
from .functions.wcorrcoef import wcorrcoef
from .functions.univ_test import univ_test
from .functions.diagnostics import diagnostics
from .functions.lrtest import lrtest
from .functions.splitmix import splitmix
from .functions.tab_disjunctive import tab_disjunctive

class CANDISC(_BaseDA):
    """
    Canonical Discriminant Analysis (CANDISC)

    Canonical discriminant analysis is a dimension-reduction technique related to principal component analysis and canonical correlation.
    The methodology that is used in deriving the canonical coefficients parallels that of a one-way multivariate analysis of variance (MANOVA).
    MANOVA tests for equality of the mean vector across class levels. Canonical discriminant analysis finds linear combinations of the
    quantitative variables that provide maximal separation between classes or groups. Given a classification variable and several quantitative
    variables, the CANDISC procedure derives `canonical variables`, which are linear combinations of the quantitative variables that summarize
    between-class variation in much the same way that principal components summarize total variation.

    The :class:`~discrimintools.CANDISC` procedure performs a canonical discriminant analysis, computes squared Mahalanobis distances between
    class means, and performs both univariate and multivariate one-way analyses of variance.

    Parameters
    ----------
    n_components : int or `None <https://docs.python.org/3/library/constants.html#None>`_, default = 2
        Number of components to keep. If `None <https://docs.python.org/3/library/constants.html#None>`_, all components are kept.
        The value is capped at ``min(n_classes - 1, n_features)``.

    classes : `None <https://docs.python.org/3/library/constants.html#None>`_, tuple or list, default = `None <https://docs.python.org/3/library/constants.html#None>`_
        Names of the levels, in the order in which they must be returned. It must contain every label found in ``y`` exactly once.
        If `None <https://docs.python.org/3/library/constants.html#None>`_, classes are the sorted unique values of ``y``.

    warn_message : bool, default = True
        Show informative messages (printed, the program does not crash).

    Attributes
    ----------
    call_ : NamedTuple
        Call information:

        - Xtot : DataFrame of shape (n_samples, n_columns)
            Input data.
        - X : DataFrame of shape (n_samples, n_features)
            Training data.
        - y : Series of shape (n_samples,)
            Target values. True values for ``X``.
        - target : str
            Name of target.
        - features : list
            Names of features seen during ``fit``.
        - classes : list
            Names of classes.
        - priors : Series of shape (n_classes,)
            Prior probabilities.
        - n_samples : int
            Number of samples.
        - n_features : int
            Number of features.
        - n_classes : int
            Number of target values.
        - max_components : int
            Maximum number of components.
        - n_components : int
            Number of components kept.

    cancoef_ : NamedTuple
        Canonical coefficients:

        - raw : DataFrame of shape (n_features + 1, n_components)
            Raw canonical coefficients (first row, ``Constant``, is the intercept).
        - total : DataFrame of shape (n_features, n_components)
            Total canonical coefficients.
        - pooled : DataFrame of shape (n_features, n_components)
            Pooled canonical coefficients.

    cancorr_ : DataFrame of shape (n_components, 10)
        The canonical correlations test.

    classes_ : NamedTuple
        Classes information:

        - infos : DataFrame of shape (n_classes, 3)
            Class level information (frequency, proportion, prior probability).
        - center : DataFrame of shape (n_classes, n_features)
            Class means.
        - total : DataFrame of shape (n_features, n_classes)
            Total-sample standardized class means.
        - pooled : DataFrame of shape (n_features, n_classes)
            Pooled-within class standardized class means.
        - mahal : DataFrame of shape (n_classes, n_classes)
            Squared Mahalanobis distances between classes.
        - coord : DataFrame of shape (n_classes, n_components)
            Class coordinates.
        - eucl : DataFrame of shape (n_classes, n_classes)
            Pairwise squared Euclidean distances between class coordinates.
        - gen : DataFrame shape (n_classes, n_classes)
            Generalized squared distances between classes (squared Euclidean distance minus twice the log-prior).

    coef_ : DataFrame of shape (n_features + 1, n_classes)
        Linear classification functions coefficients.

    corr_ : NamedTuple
        Correlation coefficients test:

        - total : DataFrame of shape (C^{2}_{n_features}, 7)
            Total-sample correlation coefficients test.
        - within : dict
            Within-class correlation coefficients test.
        - pooled : DataFrame of shape (C^{2}_{n_features}, 7)
            Pooled within-class correlation coefficients test.
        - between : DataFrame of shape (C^{2}_{n_features}, 7)
            Between-class correlation coefficients test.

    cov_ : NamedTuple
        Covariance matrices:

        - total : DataFrame of shape (n_features, n_features)
            Total-sample covariance matrix.
        - btotal : DataFrame of shape (n_features, n_features)
            Biased total-sample covariance matrix.
        - within : dict
            Within-class covariance matrices.
        - bwithin : dict
            Biased within-class covariance matrices.
        - pooled : DataFrame of shape (n_features, n_features)
            Pooled within-class covariance matrix.
        - bpooled : DataFrame of shape (n_features, n_features)
            Biased pooled within-class covariance matrix.
        - between : DataFrame of shape (n_features, n_features)
            Between-class covariance matrix.
        - bbetween : DataFrame of shape (n_features, n_features)
            Biased between-class covariance matrix.

    eig_ : DataFrame of shape (n_components, 4)
        The eigenvalues, the difference between each eigenvalue, the percentage of variance and the cumulative percentage of variance.

    ind_ : NamedTuple
        Individuals information:

        - coord : DataFrame of shape (n_samples, n_components)
            The coordinates of individuals.
        - mahal : DataFrame of shape (n_samples, n_classes)
            The squared Mahalanobis distance to each class center.
        - eucl : DataFrame of shape (n_samples, n_classes)
            The squared Euclidean distance to each class center (canonical space).
        - gen : DataFrame shape (n_samples, n_classes)
            The generalized squared distance to each class center. Also available under the historical name ``gend``.
        - scores : DataFrame of shape (n_samples, n_classes)
            The total scores of individuals.

    model_ : str, default = "candisc"
        Name of model fitted.

    sscp_ : NamedTuple
        Sum of square cross product (SSCP) matrices:

        - total : DataFrame of shape (n_features, n_features)
            Total-sample SSCP matrix.
        - within : dict
            Within-class SSCP matrices.
        - pooled : DataFrame of shape (n_features, n_features)
            Pooled within-class SSCP matrix.
        - between : DataFrame of shape (n_features, n_features)
            Between-class SSCP matrix.

    statistics_ : NamedTuple
        Statistics results:

        - anova : DataFrame of shape (n_features, 11)
            Analysis of variance test.
        - manova : DataFrame of shape (4, 5)
            Multivariate analysis of variance test.
        - average_rsq : DataFrame of shape (1, 2)
            Average R-square.
        - performance : DataFrame of shape (3, 3)
            The model global performance.

    summary_ : NamedTuple
        Summary information:

        - infos : DataFrame of shape (3, 4)
            Summary information (total sample size, number of features, number of classes, total degree of freedom,
            within-class degree of freedom, between-class degree of freedom).
        - total : DataFrame of shape (n_features, 8)
            Total-sample statistics, see `pandas.Describe`_.
        - within : dict
            Within-class statistics.

    svd_ : NamedTuple
        Eigen decomposition:

        - value : 1D array of shape (n_components,)
            The eigenvalues.
        - vector : 2D array of shape (n_classes, n_components)
            The eigenvectors.

    var_ : NamedTuple
        Variables information (correlation):

        - total : DataFrame of shape (n_features, n_components)
            The total-sample correlation of variables with canonical dimensions.
        - pooled : DataFrame of shape (n_features, n_components)
            The pooled-within class correlation of variables with canonical dimensions.
        - between : DataFrame of shape (n_features, n_components)
            The between-class correlation of variables with canonical dimensions.

    See also
    --------
    :class:`~discrimintools.fviz_candisc`
        Visualize Canonical Discriminant Analysis.
    :class:`~discrimintools.fviz_candisc_biplot`
        Visualize Canonical Discriminant Analysis (CANDISC) - Biplot of individuals and variables.
    :class:`~discrimintools.fviz_candisc_ind`
        Visualize Canonical Discriminant Analysis (CANDISC) - Graph of individuals.
    :class:`~discrimintools.fviz_candisc_var`
        Visualize Canonical Discriminant Analysis (CANDISC) - Graph of variables.
    :class:`~discrimintools.fviz_dist`
        Visualize distance between barycenter.
    :class:`~discrimintools.summaryCANDISC`
        Printing summaries of Canonical Discriminant Analysis model.
    :class:`~discrimintools.summaryDA`
        Printing summaries of Discriminant Analysis model.

    References
    ----------
    [1] Lebart Ludovic, Piron Marie, & Morineau Alain (2006), « `Statistique Exploratoire Multidimensionnelle`_ », Dunod, Paris 4ed.

    [2] Ricco Rakotomalala (2020), « `Pratique de l'Analyse Discriminante Linéaire`_ », Version 1.0, Université Lumière Lyon 2.

    [3] Saporta Gilbert (2011), « `Probabilités, Analyse de données et Statistiques`_ », Editions TECHNIP, 3ed.

    [4] Tenenhaus Michel (2007), « Statistique - Méthodes pour décrire, expliquer et prévoir », Dunod.

    [5] Tenenhaus Michel (1996), « Méthodes statistiques en gestion », Dunod.

    [6] Tuffery Stephane (2017), « Data Mining et Statistique décisionelle », Editions TECHNIP, 5ed.

    [7] Tuffery Stephane (2025), « Data Science, Statistique et Machine learning », Editions TECHNIP, 6ed.

    [8] SAS/STAT User's Guide (2013), « `The CANDISC Procedure`_ », Chapter 31.

    .. _Statistique Exploratoire Multidimensionnelle: https://horizon.documentation.ird.fr/exl-doc/pleins_textes/2023-12/010038111.pdf
    .. _Pratique de l'Analyse Discriminante Linéaire: https://hal.science/hal-04868585v1/file/Pratique_Analyse_Discriminante_Lineaire.pdf
    .. _Probabilités, Analyse de données et Statistiques: https://en.pdfdrive.to/dl/probabilites-analyses-des-donnees-et-statistiques
    .. _The CANDISC Procedure: https://support.sas.com/documentation/onlinedoc/stat/131/candisc.pdf
    .. _pandas.Describe: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html

    Examples
    --------
    >>> from discrimintools.datasets import load_wine
    >>> from discrimintools import CANDISC
    >>> D = load_wine() # load training data
    >>> y, X = D["Quality"], D.drop(columns=["Quality"]) # split into X and y
    >>> clf = CANDISC()
    >>> clf.fit(X,y)
    CANDISC()
    >>> XTest = load_wine("test") # load test data
    >>> print(clf.predict(XTest))
    1958    bad
    Name: prediction, dtype: object
    """

    def __init__(self, n_components=2, classes=None, warn_message=True):
        # parameters are stored untouched; all validation happens in ``fit``
        self.n_components = n_components
        self.classes = classes
        self.warn_message = warn_message

    @staticmethod
    def _to_namedtuple(typename, **fields):
        """Pack keyword arguments, in the order given, into a namedtuple called ``typename``."""
        return namedtuple(typename, fields.keys())(*fields.values())

    def decision_function(self, X):
        """
        Apply decision function to an input data

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.

        Returns
        -------
        C : DataFrame of shape (n_samples, n_classes)
            Decision function values related to each class, per sample.
        """
        #raw canonical coordinates
        coord = self.transform(X=X)
        #squared euclidean distance to each class center in the canonical space
        sqdist = concat(
            (coord.sub(self.classes_.coord.loc[k, :], axis=1).pow(2).sum(axis=1).to_frame(k) for k in self.call_.classes),
            axis=1,
        )
        #add priors log-probabilities: -0.5 * (d^2 - 2*log(prior))
        return -0.5 * sqdist.sub(2 * log(self.call_.priors), axis=1)

    def fit(self, X, y):
        """
        Fit the Canonical Discriminant Analysis model

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Training Data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.

        y : Series of shape (n_samples,)
            Target values. True labels for ``X``. All elements must be strings. ``y`` is not modified.

        Returns
        -------
        self : object
            Fitted estimator

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of samples, if ``y`` holds fewer than two classes,
            if ``classes`` does not match the labels of ``y``, or if ``n_components`` is invalid.
        TypeError
            If an element of ``y`` is not a string.
        """
        #-----------------------------------------------------------------------------------------------------------------------------
        #inputs validation
        #-----------------------------------------------------------------------------------------------------------------------------
        #check if X is an instance of class pd.DataFrame
        check_is_dataframe(X)
        #check if y is an instance of class pd.Series
        check_is_series(y)

        #check if len are equal
        if X.shape[0] != y.shape[0]:
            raise ValueError("The number of samples in X must be equal to the number of samples in y")

        #check if all elements in y are string
        if not all(isinstance(kq, str) for kq in y):
            raise TypeError("All elements in y must be a string")

        #set y name if None - ``rename`` returns a new Series, so the caller's object is left untouched
        if y.name is None or isinstance(y.name, int):
            y = y.rename("group")

        #check if warn_message is a bool
        check_is_bool(self.warn_message)

        #make a copy of original data
        Xtot = X.copy(deep=True)

        #-----------------------------------------------------------------------------------------------------------------------------
        #preprocessing
        #-----------------------------------------------------------------------------------------------------------------------------
        X = preprocessing(X)

        #message to inform that categorical features are going to be encoded
        if self.warn_message and any(is_string_dtype(X[k]) for k in X.columns):
            print("\nCategorical features have been encoded into binary variables.\n")

        #encode categorical variables into binary without first level.
        X = model_matrix(X=X)

        #-----------------------------------------------------------------------------------------------------------------------------
        #set classes
        #-----------------------------------------------------------------------------------------------------------------------------
        #unique element in y
        uq_y = sorted(y.unique())
        #number of class
        n_classes = len(uq_y)

        #with a single class there is no between-class variation: max_components would be 0 and several divisions by zero follow
        if n_classes < 2:
            raise ValueError("y must contain at least two classes")

        if isinstance(self.classes, (list, tuple)):
            #user-supplied classes must be exactly the labels of y (no missing, extra or duplicated label),
            #otherwise empty groups would silently produce NaN statistics
            if len(self.classes) != n_classes or set(self.classes) != set(uq_y):
                raise ValueError("Insert good classes: 'classes' must list every label found in y exactly once (expected {})".format(uq_y))
            classes = [str(k) for k in self.classes]
        else:
            classes = uq_y

        #convert y to categorical data type
        y = y.astype(CategoricalDtype(categories=classes, ordered=True))

        #number of samples and number of features
        n_samples, n_features = X.shape
        #set target and features names
        target, features = y.name, list(X.columns)

        #define subset of X
        X_k = {k: X.loc[y[y == k].index, :] for k in classes}
        #count and proportion
        n_k = y.value_counts(normalize=False).loc[classes]
        p_k = y.value_counts(normalize=True).loc[classes]
        #set priors (empirical proportions)
        priors = Series(array(p_k), index=classes, name="priors")

        #-----------------------------------------------------------------------------------------------------------------------------
        #set number of components to kept
        #-----------------------------------------------------------------------------------------------------------------------------
        #maximum components
        max_components = int(min(n_classes - 1, n_features))

        #set number of components
        if self.n_components is None:
            n_components = max_components
        elif not isinstance(self.n_components, int):
            raise ValueError("'n_components' must be an integer.")
        elif self.n_components < 1:
            raise ValueError("'n_components' must be equal or greater than 1.")
        else:
            n_components = min(self.n_components, max_components)

        self.call_ = self._to_namedtuple(
            "call", Xtot=Xtot, X=X, y=y, target=target, features=features, classes=classes, priors=priors,
            n_samples=n_samples, n_features=n_features, n_classes=n_classes,
            max_components=max_components, n_components=n_components,
        )

        #-----------------------------------------------------------------------------------------------------------------------------
        #sample statistics
        #-----------------------------------------------------------------------------------------------------------------------------
        #summary information
        summary_infos = DataFrame({
            "infos": ["Total Sample Size", "Variables", "Classes"],
            "Value": [n_samples, n_features, n_classes],
            "DF": ["DF Total", "DF Within Classes", "DF Between Classes"],
            "DF value": [n_samples - 1, n_samples - n_classes, n_classes - 1],
        })
        #total-sample and within-class summaries
        tsummary, wsummary = describe(X), {k: describe(X_k[k]) for k in classes}
        self.summary_ = self._to_namedtuple("summary", infos=summary_infos, total=tsummary, within=wsummary)

        #-----------------------------------------------------------------------------------------------------------------------------
        #sum of square cross product (SSCP) matrix
        #-----------------------------------------------------------------------------------------------------------------------------
        #total-sample and within-class SSCP matrices
        tsscp, wsscp = sscp(X=X), {k: sscp(X_k[k]) for k in classes}
        #pooled within-class SSCP matrix
        pwsscp = reduce(lambda i, j: i + j, wsscp.values())
        #between-class SSCP matrix
        bsscp = tsscp - pwsscp
        self.sscp_ = self._to_namedtuple("sscp", total=tsscp, within=wsscp, pooled=pwsscp, between=bsscp)

        #-----------------------------------------------------------------------------------------------------------------------------
        #covariance matrices
        #-----------------------------------------------------------------------------------------------------------------------------
        #total-sample and biased total-sample covariance matrices
        tcov, tcovb = tsscp.div(n_samples - 1), tsscp.div(n_samples)
        #within-class and biased within-class covariance matrices
        wcov = {k: wsscp[k].div(n_k[k] - 1) for k in classes}
        wcovb = {k: wsscp[k].div(n_k[k]) for k in classes}
        #pooled within-class and biased pooled within-class covariance matrices
        pwcov, pwcovb = pwsscp.div(n_samples - n_classes), pwsscp.div(n_samples)
        #inverse of the pooled within-class covariance matrix
        invpwcov = linalg.inv(pwcov)
        #inverse of the biased total-sample covariance matrix (used twice below)
        invtcovb = linalg.inv(tcovb)
        #between-class and biased between-class covariance matrices
        bcov, bcovb = bsscp.div(n_samples * (n_classes - 1) / n_classes), bsscp.div(n_samples)
        self.cov_ = self._to_namedtuple(
            "cov", total=tcov, btotal=tcovb, within=wcov, bwithin=wcovb,
            pooled=pwcov, bpooled=pwcovb, between=bcov, bbetween=bcovb,
        )

        #-----------------------------------------------------------------------------------------------------------------------------
        #correlation coefficients test
        #-----------------------------------------------------------------------------------------------------------------------------
        #total sample and within-class correlation coefficients
        tcortest = cov_to_cor_test(X=tcovb, n_samples=n_samples)
        wcortest = {k: cov_to_cor_test(X=wcovb[k], n_samples=n_k[k]) for k in classes}
        #pooled within-class and between-class correlation coefficients
        pwcortest = cov_to_cor_test(X=pwcov, n_samples=n_samples - n_classes + 1)
        bcortest = cov_to_cor_test(X=bcov, n_samples=n_classes)
        self.corr_ = self._to_namedtuple("corr", total=tcortest, within=wcortest, pooled=pwcortest, between=bcortest)

        #-----------------------------------------------------------------------------------------------------------------------------
        #classes informations
        #-----------------------------------------------------------------------------------------------------------------------------
        #class level information
        class_infos = DataFrame(c_[n_k, p_k, priors], columns=["Frequency", "Proportion", "Prior Probability"], index=classes)
        class_infos["Frequency"] = class_infos["Frequency"].astype(int)
        #within-class average
        class_center = concat((X_k[k].mean(axis=0).to_frame(k) for k in classes), axis=1).T
        #class means centered on the total-sample means
        centered_center = class_center.sub(tsummary["mean"], axis=1)
        #total-sample standardized class means
        tcenter = centered_center.div(sqrt(diag(tcov)), axis=1).T
        #pooled within-class standardized class means
        pcenter = centered_center.div(sqrt(diag(pwcov)), axis=1).T
        #squared mahalanobis distances between class - pairwise squared distances between groups
        class_mahal = concat(
            (sqmahalanobis(X=class_center, VI=invpwcov, mu=class_center.loc[k, :]).to_frame(k) for k in classes),
            axis=1,
        )
        #first part of the classes information, completed once the canonical coordinates are known
        classes_ = dict(infos=class_infos, center=class_center, total=tcenter, pooled=pcenter, mahal=class_mahal)

        #-----------------------------------------------------------------------------------------------------------------------------
        #eigen decomposition
        #-----------------------------------------------------------------------------------------------------------------------------
        #matrix C: centered class means weighted by the square root of the priors, shape (n_features, n_classes)
        C = centered_center.mul(sqrt(priors), axis=0).T
        #eigen decomposition - tuple unpacking works with every NumPy version
        eigvals, eigvecs = linalg.eig(dot(dot(C.T, invtcovb), C))
        #handle complex numbers: keep the real part only
        eigvals, eigvecs = real(eigvals), real(eigvecs)
        #indices of the largest eigenvalues; a stable argsort keeps tied eigenvalues on distinct eigenvectors
        order = (-eigvals).argsort(kind="stable")[:n_components]
        lambd, vector = eigvals[order], eigvecs[:, order]
        self.svd_ = namedtuple("svd", ["value", "vector"])(lambd, vector)

        #-----------------------------------------------------------------------------------------------------------------------------
        #eigen values informations
        #-----------------------------------------------------------------------------------------------------------------------------
        #names of the canonical dimensions
        can_names = ["Can" + str(x + 1) for x in range(n_components)]
        #eigenvalues of the discriminant problem derived from the squared canonical correlations
        rho = lambd / (1 - lambd)
        #difference with the next eigenvalue (NaN for the last one) and percentage of variance
        difference, proportion = insert(-diff(rho), len(rho) - 1, nan), 100 * rho / sum(rho)
        #store all informations
        self.eig_ = DataFrame(
            c_[rho, difference, proportion, cumsum(proportion)],
            columns=["Eigenvalue", "Difference", "Proportion", "Cumulative"],
            index=can_names,
        )

        #-----------------------------------------------------------------------------------------------------------------------------
        #canonical coefficients - canonical discriminant coefficients
        #-----------------------------------------------------------------------------------------------------------------------------
        #coefficients of features (without intercept)
        rawcoef = DataFrame(dot(dot(invtcovb, C), vector), index=features, columns=can_names)
        rawcoef = rawcoef.mul(sqrt((n_samples - n_classes) / (n_samples * lambd * (1 - lambd))), axis=1)
        #intercept
        rawintercept = - rawcoef.T.dot(tsummary["mean"].values.reshape(-1, 1))
        rawintercept.columns = ["Constant"]
        #total-sample standardized canonical coefficients
        tcan_coef = rawcoef.mul(sqrt(diag(tcov)), axis=0)
        #pooled within-class standardized canonical coefficients
        pcan_coef = rawcoef.mul(sqrt(diag(pwcov)), axis=0)
        self.cancoef_ = self._to_namedtuple(
            "cancoef", raw=concat((rawintercept.T, rawcoef), axis=0), total=tcan_coef, pooled=pcan_coef,
        )

        #-----------------------------------------------------------------------------------------------------------------------------
        #canonical correlation
        #-----------------------------------------------------------------------------------------------------------------------------
        #canonical correlation
        cancorr = DataFrame(c_[sqrt(lambd), lambd], columns=["Canonical Correlation", "Squared Canonical Correlation"])
        #likelihood ratio test: row i tests the i-th canonical correlation together with all the smaller ones kept
        #NOTE(review): only the ``n_components`` retained eigenvalues enter the test - confirm this is intended
        lr_test = DataFrame(
            zeros((n_components, 8)),
            columns=["Likelihood Ratio", "Approximate F value", "Num DF", "Den DF", "Pr>F", "Chi-Square", "DF", "Pr>Chi2"],
        )
        for i in range(n_components):
            lr_test.iloc[i, :] = lrtest(n_samples=n_samples, n_features=n_features, n_classes=n_classes, eigen=lambd[i:])
        for col in ("Num DF", "Den DF", "DF"):
            lr_test[col] = lr_test[col].astype(int)
        #convert to CANDISC attribute
        self.cancorr_ = concat((cancorr, lr_test), axis=1)

        #-----------------------------------------------------------------------------------------------------------------------------
        #individuals informations: squared mahalanobis distance & coordinates
        #-----------------------------------------------------------------------------------------------------------------------------
        #individuals coordinates
        ind_coord = X.sub(tsummary["mean"], axis=1).dot(rawcoef)
        #squared mahalanobis distance to each class center
        ind_mahal = concat(
            (sqmahalanobis(X=X, VI=invpwcov, mu=class_center.loc[k, :]).to_frame(k) for k in classes),
            axis=1,
        )

        #-----------------------------------------------------------------------------------------------------------------------------
        #others classes informations: coordinates of classes, distances between classes
        #-----------------------------------------------------------------------------------------------------------------------------
        #coordinates of the classes
        class_coord = concat((ind_coord.loc[y[y == k].index, :].mean(axis=0).to_frame(k) for k in classes), axis=1).T
        #squared euclidean distance between classes
        class_eucl = DataFrame(squareform(pdist(class_coord, metric="sqeuclidean")), index=classes, columns=classes)
        #squared generalized distance
        class_gen = class_eucl.sub(2 * log(priors), axis=1)
        #complete classes information
        classes_.update(coord=class_coord, eucl=class_eucl, gen=class_gen)
        self.classes_ = self._to_namedtuple("classes", **classes_)

        #-----------------------------------------------------------------------------------------------------------------------------
        #variables coordinates: total, within & between
        #-----------------------------------------------------------------------------------------------------------------------------
        #total correlation - total canonical structure
        var_tcoord = DataFrame(
            corrcoef(x=X, y=ind_coord, rowvar=False)[:n_features, n_features:],
            index=features, columns=ind_coord.columns,
        )
        #pooled within correlation - pooled within canonical structure (both sides centered on their class mean)
        z1 = ind_coord.sub(class_coord.loc[y.values, :].values)
        z2 = X.sub(class_center.loc[y.values, :].values, axis=1)
        var_pcoord = DataFrame(
            transpose(corrcoef(x=z1, y=z2, rowvar=False)[:n_components, n_components:]),
            index=features, columns=ind_coord.columns,
        )
        #between correlation - between canonical structure
        var_bcoord = concat(
            (Series([wcorrcoef(class_center[k], class_coord[l], priors) for l in ind_coord.columns], index=ind_coord.columns, name=k)
             for k in features),
            axis=1,
        ).T
        self.var_ = self._to_namedtuple("var", total=var_tcoord, pooled=var_pcoord, between=var_bcoord)

        #-----------------------------------------------------------------------------------------------------------------------------
        #classification function
        #-----------------------------------------------------------------------------------------------------------------------------
        #coefficients of classification function
        self.coef_ = self.cancoef_.raw.dot(class_coord.T)
        #update intercept
        self.coef_.iloc[0, :] = self.coef_.iloc[0, :].sub(0.5 * class_coord.pow(2).sum(axis=1)).add(log(p_k))

        #-----------------------------------------------------------------------------------------------------------------------------
        #individuals informations: scores, squared euclidean distance and squared generalized distance
        #-----------------------------------------------------------------------------------------------------------------------------
        #squared euclidean distance to class center
        ind_eucl = concat((ind_coord.sub(class_coord.loc[k, :], axis=1).pow(2).sum(axis=1).to_frame(k) for k in classes), axis=1)
        #squared generalized distance
        ind_gen = ind_eucl.sub(2 * log(priors), axis=1)
        #individuals scores
        ind_scores = X.dot(self.coef_.iloc[1:, :]).add(self.coef_.iloc[0, :].values, axis=1)
        #``gend`` is the historical field name and keeps its position; ``gen`` is the documented name
        #(consistent with ``classes_.gen``) and is appended so that existing accesses keep working
        self.ind_ = self._to_namedtuple(
            "ind", coord=ind_coord, mahal=ind_mahal, eucl=ind_eucl, gend=ind_gen, scores=ind_scores, gen=ind_gen,
        )

        #-----------------------------------------------------------------------------------------------------------------------------
        #others statistics
        #-----------------------------------------------------------------------------------------------------------------------------
        #standard deviation
        stdev = DataFrame(
            c_[sqrt(diag(tcov)), sqrt(diag(pwcov)), sqrt(diag(bcov))],
            columns=["Total Std. Dev.", "Pooled Std. Dev.", "Between Std. Dev."],
            index=features,
        )
        #compute univariate analysis of variance - ANOVA
        anova = concat((stdev, univ_test(X, y)), axis=1)
        #average R-Square
        rsq = anova.loc[:, "R-Square"].values
        unwavg_rsq, wavg_rsq = average(rsq), average(rsq, weights=diag(tcovb))
        avg_rsq = DataFrame([[unwavg_rsq, wavg_rsq]], index=["Average R-Square"], columns=["Unweighted", "Weighted by Variance"])
        #compute multivariate analysis of variance - MANOVA
        manova = MANOVA.from_formula(
            formula="{}~{}".format("+".join(features), "+".join([target])),
            data=concat((X, y), axis=1),
        ).mv_test(skip_intercept_test=True).summary_frame
        manova.index = manova.index.droplevel()
        #performance
        performance = diagnostics(Vb=tcovb, Wb=pwcovb, n_samples=n_samples, n_classes=n_classes)
        self.statistics_ = self._to_namedtuple(
            "statistics", anova=anova, manova=manova, average_rsq=avg_rsq, performance=performance,
        )

        #set model name
        self.model_ = "candisc"

        return self

    def fit_transform(self, X, y):
        """
        Fit to data, then transform it

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            Training Data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.

        y : Series of shape (n_samples,)
            Target values. True labels for ``X``.

        Returns
        -------
        X_new : DataFrame of shape (n_samples, n_components)
            Transformed data, where ``n_components`` is the number of components
        """
        #fit the model
        self.fit(X, y)
        return self.ind_.coord

    def transform(self, X):
        """
        Apply the dimensionality reduction on X

        X is projected on the canonical components previously extracted from a training set.

        Parameters
        ----------
        X : DataFrame of shape (n_samples, n_features)
            New data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
            It must contain at least the columns seen during ``fit``. ``X`` is not modified.

        Returns
        -------
        X_new : DataFrame of shape (n_samples, n_components)
            Transformed data, where ``n_components`` is the number of components.

        Raises
        ------
        ValueError
            If a column seen during ``fit`` is missing from ``X``.
        """
        #check if the estimator is fitted by verifying the presence of fitted attributes
        check_is_fitted(self)
        #check if X is an instance of class pd.DataFrame
        check_is_dataframe(X=X)

        #check if X contains original features
        if not set(self.call_.Xtot.columns).issubset(X.columns):
            raise ValueError("The names of the features is not the same as the ones in the active features of the CANDISC result")

        #select original features
        X = X[self.call_.Xtot.columns]
        #set index name as None on a private copy of the index, the Index object being shared with the caller's data
        index = X.index.copy()
        index.name = None
        X = X.set_axis(index, axis=0)

        #split X into numeric and categorical parts
        split_X = splitmix(X)
        #extract elements
        X_quanti, X_quali, n_quanti, n_quali = split_X.quanti, split_X.quali, split_X.k1, split_X.k2

        #initialize DataFrame with the columns of the encoded training data
        Xcod = DataFrame(index=X.index, columns=self.call_.X.columns).astype(float)

        #check if numerics variables
        if n_quanti > 0:
            #replace with numerics columns
            Xcod.loc[:, X_quanti.columns] = X_quanti

        #check if categorical variables
        if n_quali > 0:
            #active categorics: encoded columns, i.e. training columns that are not original columns
            categorics = [x for x in self.call_.X.columns if x not in self.call_.Xtot.columns]
            #replace with dummies
            Xcod.loc[:, categorics] = tab_disjunctive(X=X_quali, dummies_cols=categorics, prefix=True, sep="")

        #multiply by raw canonical coefficients and add the intercept
        return Xcod.dot(self.cancoef_.raw.iloc[1:, :]).add(self.cancoef_.raw.iloc[0, :], axis=1)