# -*- coding: utf-8 -*-
from numpy import ndarray, array, c_, diag, dot, linalg, log, sqrt, average, ones
from pandas import DataFrame, concat, CategoricalDtype, Series
from pandas.api.types import is_string_dtype
from functools import reduce
from collections import OrderedDict, namedtuple
from statsmodels.multivariate.manova import MANOVA
from sklearn.utils.validation import check_is_fitted
#intern function
from ._base import _BaseDA
from .functions.utils import check_is_dataframe, check_is_series, check_is_bool
from .functions.preprocessing import preprocessing
from .functions.model_matrix import model_matrix
from .functions.ldavip import ldavip
from .functions.describe import describe
from .functions.sscp import sscp
from .functions.box_m_test import box_m_test
from .functions.cov_to_cor_test import cov_to_cor_test
from .functions.distance import sqmahalanobis
from .functions.cov_infos import cov_infos
from .functions.univ_test import univ_test
from .functions.diagnostics import diagnostics
from .functions.splitmix import splitmix
from .functions.tab_disjunctive import tab_disjunctive
[docs]
class DISCRIM(_BaseDA):
"""
Discriminant Analysis (DISCRIM)
Performs a discriminant analysis (linear and quadratic) on a set of observations (training data) containing one or more numerics variables and a classification variables defining groups of observations.
The derived discriminant criterion from the training data can be applied to a testing dataset.
Parameters
----------
method : {'linear', 'quad'}, default = 'linear'
The discriminant analysis method to performs, possible values:
- 'linear' for linear discriminant analysis (LDA).
- 'quad' for quadratic discriminant analysis (QDA)
priors : str or array-like or Series of shape (n_classes,), default = None
The priors statement specifies the class prior probabilities of group membership, possibles values:
- 'equal' to set the prior probabilities equal.
- 'prop' to set the prior probabilities proportional to the sample sizes.
- numpy 1-D array or Series which specify the prior probability for each level of the classification variable.
classes : None, tuple or list, default = None
Name of level in order to return. If None, classes are sorted in unique values in y.
var_select : bool, default = False
Whether to applied feature selection based on variable importance (contribution) in prediction for linear discriminant analysis
level : float, default = None
Significance level for the variable importance critical probability. You can specify the `level` option only when both method = 'linear' and var_select=True are also specified.
If you specify both method = 'linear' and var_select=True but omit the `level` option, DISCRIM uses :math:`5e-2` as the significance level for the variabe importance.
tol : float, default = None
Significance level for the test of homogeneity. You can specify the `tol` option only when method = 'quad' is also specified.
If you specify method = 'quad' but omit the `tol` option, DISCRIM uses :math:`1e-1` as the significance level for the test.
warn_message : bool, default = True
Show warning messages. Raise a warning without making the program crash.
Returns
-------
call_ : NamedTuple
Call informations:
- Xtot : DataFrame of shape (n_samples, n_columns)
Input data.
- X : DataFrame of shape (n_samples, n_features)
Training data.
- y : Series of shape (n_samples,)
Target values. True values for ``X``.
- target : str
Name of target.
- features : list
Names of features seen during ``fit``.
- classes : list
Names of classes.
- priors : Series of shape (n_classes,)
Priors probabilities.
- n_samples : int
Number of samples.
- n_features : int
Number of features.
- n_classes : int
Number of target values.
classes_ : Namedtuple
Classes informations:
- infos : DataFrame of shape (n_classes, 3)
class level information (frequency, proportion, prior probability).
- center : DataFrame of shape (n_classes, n_features)
Class means.
- total : DataFrame of shape (n_features, n_classes)
Total-sample standardized class means.
- pooled : DataFrame of shape (n_features, n_classes)
Pooled-within class standardized class means.
- mahal : DataFrame of shape (n_classes, n_classes)
Squared Mahalanobis distances between classes.
- gen : DataFrame of shape (n_classes, n_classes)
Generalized Squared distances between classes.
coef_ : DataFrame of shape (n_features, n_classes)
Linear classification functions coefficients.
corr_ : NamedTuple
Correlation coefficients test:
- total : DataFrame of shape (C^{2}_{n_features}, 7)
Total-sample correlation coefficients test.
- within : dict
Within-class correlation coefficients test.
- pooled : DataFrame of shape (C^{2}_{n_features}, 7)
Pooled within-class correlation coefficients test.
- between : DataFrame of shape (C^{2}_{n_features}, 7)
Between-class correlation coefficients test.
cov_ : NamedTuple
Covariance matrices:
- total : DataFrame of shape (n_features, n_features)
Total-sample covariance matrix.
* btotal : DataFrame of shape (n_features, n_features)
Biased total-sample covariance matrix.
- within : dict
Within-class covariance matrices.
- bwithin : dict
Biased within-class covariance matrices.
- pooled : DataFrame of shape (n_features, n_features)
pooled within-class covariance matrix.
- bpooled : DataFrame of shape (n_features, n_features)
biased pooled within-class covariance matrix.
- between : DataFrame of shape (n_features, n_features)
Between-class covariance matrix
- bbetween : DataFrale of shape (n_features, n_features)
biased between-class covariance matrix.
- test : DataFrame of shape (1, 7)
Box's M test.
ind_ : NamedTuple
Individuals informations:
- scores : DataFrame of shape (n_samples, n_classes)
The total scores of individuals.
- mahal : DataFrame of shape (n_samples, n_classes)
Squared Mahalanobis distance to origin.
* gen : DataFrame shape (n_samples, n_classes)
Generalized squared distance to origin.
model_ : str, default = "discrim"
Name of model fitted.
sscp_ : NamedTuple
Sum of square cross product (SSCP) matrices:
- total : DataFrame of shape (n_features, n_features)
Total-sample SSCP matrix
- within : dict
Within-class SSCP matrices
- pooled: DataFrame of shape (n_features, n_features)
Pooled within-class SSCP matrix
- between : DataFrame of shape (n_features, n_features)
Between-class SSCP matrix.
statistics_ : NamedTuple
Statistics results:
- anova : DataFrame of shape (n_features, 11)
Analysis of variance test.
- manova : DataFrame of shape (4, 5)
Multivariate analysis of variance test.
- average_rsq : DataFrame of shape (1, 2)
Average R-square.
- performance : DataFrame of shape (3, 3)
The model global performance. Only if linear discriminant analysis.
summary_ : NamedTuple
Summary informations:
- infos : DataFrame of shape (3, 4)
Summary informations (total sample size, number of features, number of classes,
total degree of freedom, within-class degree of freedom, between-class degree of freedom).
- total : DataFrame of shape (n_features, 8)
Total-sample statistics, see https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html
- within : dict
Within-class statistics.
vip_ : NamedTuple
Variable importance for prediction:
- vip : DataFrame of shape (n_features, 6)
Variable importance for prediction.
- selected : list
Selected variables.
See also
--------
:class:`~discrimintools.GFALDA`
General Factor Analysis Linear Discriminant Analysis
:class:`~discrimintools.CPLS`
Partial Least Squares for Classification
:class:`~discrimintools.PLSDA`
Partial Least Squares Discriminant Analysis
:class:`~discrimintools.PLSLDA`
Partial Least Squares Linear Discriminant Analysis
:class:`~discrimintools.summaryDISCRIM`
Printing summaries of Discriminant Analysis (linear & quadratic) model.
:class:`~discrimintools.summaryDA`
Printing summaries of Discriminant Analysis model.
References
----------
[1] Bardos M. (2001), « Analyse discriminante - Application au risque et scoring financier », Dunod.
[2] Lebart Ludovic, Piron Marie, & Morineau Alain (2006), « `Statistique Exploratoire Multidimensionnelle <https://horizon.documentation.ird.fr/exl-doc/pleins_textes/2023-12/010038111.pdf>`_ », Dunod, Paris 4ed.
[3] Ricco Rakotomalala (2020), « `Pratique de l'Analyse Discriminante Linéaire <https://hal.science/hal-04868585v1/file/Pratique_Analyse_Discriminante_Lineaire.pdf>`_ », Université Lumière Lyon 2, Version 1.0.
[4] Saporta Gilbert (2011), « `Probabilités, Analyse des données et Statistiques <https://en.pdfdrive.to/dl/probabilites-analyses-des-donnees-et-statistiques>`_ », Editions TECHNIP, 3ed.
[5] Tenenhaus Michel (1996), « Méthodes statistiques en gestion », Dunod.
[6] Tuffery Stephane (2017), « Data Mining et Statistique décisionelle », Editions TECHNIP, 5ed.
[7] Tuffery Stephane (2025), « Data Science, Statistique et Machine learning », Editions TECHNIP, 6ed.
[8] SAS/STAT 13.2 User's Guide (2014), « `The DISCRIM Procedure <https://support.sas.com/documentation/onlinedoc/stat/132/discrim.pdf>`_ », Chapter 35.
Examples
--------
>>> from discrimintools.datasets import load_alcools
>>> from discrimintools import DISCRIM
>>> D = load_alcools() # load training data
>>> y, X = D["TYPE"], D.drop(columns["TYPE"]) # split into X and y
>>> #linear discriminant analysis (LDA)
>>> clf = DISCRIM()
>>> clf.fit(X,y)
DISCRIM(priors='prop')
>>> #quadratic discriminant analysis
>>> clf2 = DISCRIM(method='quad')
>>> clf2.fit(X,y)
DISCRIM(method='quad',priors='prop')
```
"""
[docs]
def __init__(
self, method = 'linear', priors = None, classes = None, var_select = False, level = None, tol = None, warn_message = True
):
self.method = method
self.priors = priors
self.classes = classes
self.var_select = var_select
self.level = level
self.tol = tol
self.warn_message = warn_message
def decision_function(self,X):
"""
Apply decision function to an input data
The decision function is equal to the `log-posterior`_ of the model.
.. log-posterior: https://online.stat.psu.edu/stat857/node/80/
Parameters
----------
X : DataFrame of shape (n_samples, n_features)
Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
Returns
-------
C : DataFrame of shape (n_samples, n_classes)
Decision function values related to each class, per sample.
"""
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if the estimator is fitted by verifying the presence of fitted attributes
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
check_is_fitted(self)
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if X is an instance of class pd.DataFrame
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
check_is_dataframe(X=X)
#set index name as None
X.index.name = None
#check if X contains original features
if not set(self.call_.Xtot.columns).issubset(X.columns):
raise ValueError("The names of the features is not the same as the ones in the active features of the DISCRIM result")
#select original features
X = X[self.call_.Xtot.columns]
#split X
split_X = splitmix(X)
#extract elements
X_quanti, X_quali, n_samples, n_quanti, n_quali = split_X.quanti, split_X.quali, split_X.n, split_X.k1, split_X.k2
#initialize DataFrame
Xcod = DataFrame(index=X.index,columns=self.call_.X.columns).astype(float)
#check if numerics variables
if n_quanti > 0:
#replace with numerics columns
Xcod.loc[:,X_quanti.columns] = X_quanti
#check if categorical variables
if n_quali > 0:
#active categorics
categorics = [x for x in self.call_.X.columns if x not in self.call_.Xtot.columns]
#replace with dummies
Xcod.loc[:,categorics] = tab_disjunctive(X=X_quali,dummies_cols=categorics,prefix=True,sep="")
#remove non selected variables
Xcod = Xcod.loc[:,list(self.cov_.total.index)]
#chi-square pvalue and tolerance threshold
p_value, tol = self.cov_.test.iloc[0,6], self.call_.tol
#quadratic discriminant analysis
if self.method == "quad" and p_value <= tol:
#inverse of within-class covariance matrix
invwcov = {k : linalg.inv(self.cov_.within[k]) for k in self.call_.classes}
#squared distance of individuals to origin
mahal = concat((sqmahalanobis(X=Xcod,VI=invwcov[k],mu=self.classes_.center.loc[k,:]).to_frame(k) for k in self.call_.classes),axis=1)
#generalized squared distance of individuals to origin
gsqdist = mahal.add(self.cov_.infos.loc[self.call_.classes,"Natural Log of the Determinant"],axis=1)
#linear discriminant analysis
elif (self.method == "linear") or (self.method == "quad" and p_value > tol):
#inverse of pooled within-class covariance matrix
invpwcov = linalg.inv(self.cov_.pooled)
#generalized squared to distance of individuals to origin
gsqdist = concat((sqmahalanobis(X=Xcod,VI=invpwcov,mu=self.classes_.center.loc[k,:]).to_frame(k) for k in self.call_.classes),axis=1)
#remove priors log-probabilities
if not (isinstance(self.priors, str) and self.priors == "equal"):
gsqdist = gsqdist.sub(2*log(self.call_.priors),axis=1)
return -0.5*gsqdist
def feature_importance(self,level=5e-2,all_vars=True):
"""
Variables Importance for Prediction in Linear Discriminant Analysis (LDAVIP)
Parameters
----------
level : float, default=5e-2
Significance level for the variable importance critical probability.
If None :math:`5e-2` is used as the significance level for the variabe importance.
all_vars : bool, default=True
If to test all subset of variables.
Returns
-------
vip : DataFrame of shape (n_features, 6)
Variable importance for prediction.
"""
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if the estimator is fitted by verifying the presence of fitted attributes
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
check_is_fitted(self)
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if linear discriminant analysis model
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
if self.method != "linear":
raise NotImplementedError("'feature_importance' method cannot be used for QDA method.")
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if var_select is False
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
if self.var_select:
raise NotImplementedError("'feature_importance' method cannot be used if var_select=True.")
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if level is not None
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
if level is None:
level = 5e-2
elif not isinstance(level,float):
raise TypeError("{} is not supported".format(type(level)))
elif level < 0 or level > 1:
raise ValueError("the 'level' value {} is not within the required range of 0 and 1.".format(level))
else:
level = level
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if boolean
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
check_is_bool(all_vars)
return ldavip(X=self.call_.X,y=self.call_.y,level=level,all_vars=all_vars).vip
def fit(self,X,y):
"""
Fit the Discriminant Analysis model.
Parameters
----------
X : DataFrame of shape (n_samples, n_features)
Training Data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
y : Series of shape (n_samples,)
Target values. True labels for ``X``.
Returns
-------
self : object
Fitted estimator.
"""
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if X is an instance of class pd.DataFrame
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
check_is_dataframe(X)
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if y is an instance of class pd.Series
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
check_is_series(y)
#check if len are equal
if X.shape[0] != y.shape[0]:
raise ValueError("The number of samples in X must be equal to the number of samples in y")
#check if all elements in y are string
if not all(isinstance(kq, str) for kq in y):
raise TypeError("All elements in y must be a string")
#set y name if None
if y.name is None or isinstance(y.name, int):
y.name = "group"
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if method is 'linear' or 'quad'
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
if self.method not in ['linear','quad']:
raise ValueError("method must be one of 'linear', 'quad', got {}".format(self.method))
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if priors is not None
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
if self.priors is None:
self.priors = "prop"
elif not isinstance(self.priors,(str,list,tuple,ndarray,Series)):
raise TypeError("{} is not supported".format(type(self.priors)))
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if var_select is a bool
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
check_is_bool(self.var_select)
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if level is not None
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
if self.method == 'linear':
if self.level is None:
level = 5e-2
elif not isinstance(self.level,float):
raise TypeError("{} is not supported".format(type(self.level)))
elif self.level < 0 or self.level > 1:
raise ValueError("the 'level' value {} is not within the required range of 0 and 1.".format(self.level))
else:
level = self.level
else:
level = None
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if tol is not None (for quadratic discriminant analysis)
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
if self.method == 'quad':
if self.tol is None:
tol = 1e-1
elif not isinstance(self.tol,float):
raise TypeError("{} is not supported".format(type(self.tol)))
elif self.tol < 0 or self.tol > 1:
raise ValueError("the 'tol' value {} is not within the required range of 0 and 1.".format(self.tol))
else:
tol = self.tol
else:
tol = None
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if warn_message is a bool
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
check_is_bool(self.warn_message)
#make a copy of original data
Xtot = X.copy(deep=True)
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#preprocessing
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
X = preprocessing(X)
#warning message to inform
if self.warn_message:
if any(is_string_dtype(X[k]) for k in X.columns):
print("\nCategorical features have been encoded into binary variables.\n")
#encode categorical variables into binary without first level.
X = model_matrix(X=X)
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#set classes
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#unique element in y
uq_y = sorted(list(y.unique()))
#number of classes
n_classes = len(uq_y)
if self.classes is not None and isinstance(self.classes, (list,tuple)):
if len(list(set(self.classes) & set(uq_y))) != n_classes:
raise ValueError("Insert good classes")
classes = [str(k) for k in self.classes]
else:
classes = uq_y
#convert y to categorical data type
y = y.astype(CategoricalDtype(categories=classes,ordered=True))
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#variables importance for prediction in linear discriminant analysis
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
if self.method == 'linear':
vip = ldavip(X,y,level,all_vars=False)
#check if at least one variable selected
if self.var_select and len(vip.selected) > 0:
#update X with selected variables
X = X.loc[:,vip.selected]
#update variable importance
vip = ldavip(X,y,level,all_vars=False)
#set to attribute
self.vip_ = vip
#number of samples and number of features
n_samples, n_features = X.shape
#set target and features names
target, features = y.name, X.columns.tolist()
#define subset of X
X_k = {k : X.loc[y[y==k].index,:] for k in classes}
#count and proportion
n_k, p_k = y.value_counts(normalize=False).loc[classes], y.value_counts(normalize=True).loc[classes]
#set piors probabilities
if isinstance(self.priors,str):
if self.priors == "prop":
priors = array(p_k)
elif self.priors == "equal":
priors = ones(n_classes)/n_classes
else:
raise TypeError("Specify a right value for piors")
elif isinstance(self.priors,(list,tuple,ndarray,Series)):
priors = array([x/sum(self.priors) for x in self.priors])
#check if any value in priors is negative
if any(x < 0 for x in priors):
raise ValueError("priors must be non-negative")
#convert to pandas Series
priors = Series(priors,index=classes,name="priors")
#convert to ordered dictionary
call_ = OrderedDict(Xtot=Xtot,X=X,y=y,target=target,features=features,classes=classes,priors=priors,n_samples=n_samples,n_features=n_features,n_classes=n_classes,level=level,tol=tol)
#convert to namedtuple
self.call_ = namedtuple("call",call_.keys())(*call_.values())
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#sample statistics
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#summary information
summary_infos = DataFrame({"Infos" : ["Total Sample Size","Variables","Classes"],
"Value" : [n_samples,n_features,n_classes],
"DF" : ["DF Total", "DF Within Classes", "DF Between Classes"],
"DF value" : [n_samples - 1, n_samples - n_classes, n_classes - 1]})
#total-sample and within-class summaries
tsummary, wsummary = describe(X), {k : describe(X_k[k]) for k in classes}
#convert to dictionary
summary_ = OrderedDict(infos=summary_infos,total=tsummary,within=wsummary)
#convert to namedtuple
self.summary_ = namedtuple("summary",summary_.keys())(*summary_.values())
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#sum of square cross product (SSCP) matrix
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#total-sample and within-class SSCP matrix
tsscp, wsscp = sscp(X=X), {k: sscp(X_k[k]) for k in classes}
#pooled within-class SSCCP matrix
pwsscp = reduce(lambda i , j : i + j, wsscp.values())
#between-class SSCP matrix
bsscp = tsscp - pwsscp
#convert to dictionary
sscp_ = OrderedDict(total=tsscp,within=wsscp,pooled=pwsscp,between=bsscp)
#convert to namedtuple
self.sscp_ = namedtuple("sscp",sscp_.keys())(*sscp_.values())
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#covariance matrices
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#total-sample and biased total-sample covariance matrices
tcov, tcovb = tsscp.div(n_samples - 1), tsscp.div(n_samples)
#within-class and biased within-class covariance matrices
wcov, wcovb = {k : wsscp[k].div(n_k[k]-1) for k in classes}, {k : wsscp[k].div(n_k[k]) for k in classes}
#test of equality of covariance matrix - homogeneity of variance and covariance
wcov_test = box_m_test(wcov.values(),list(n_k))
#chi-square pvalue
p_value = wcov_test.iloc[0,6]
#print warning message
if self.warn_message:
if self.method == "quad" and p_value > tol:
print("\nSince the Chi-Square value is not significant at the {} level, a pooled covariance matrix will be used in the discriminant function.\nReference: Morrison, D.F. (1976) Multivariate Statistical Methods p252.".format(tol))
elif self.method == "quad" and p_value <= tol:
print("\nSince the Chi-Square value is significant at the {} level, the within covariance matrices will be used in the discriminant function.\nReference: Morrison, D.F. (1976) Multivariate Statistical Methods p252.".format(tol))
#pooled within-class and biased pooled within-class covariance matrices
pwcov, pwcovb = pwsscp.div(n_samples - n_classes), pwsscp.div(n_samples)
#between-class and biased between-class covariance matrices
bcov, bcovb = bsscp.div(n_samples*(n_classes-1)/n_classes), bsscp.div(n_samples)
##covariance matrices informations - rank and natural log of the determinant
cov_info = cov_infos(X=pwcov).to_frame("Pooled").T
if self.method == "quad":
#within-class covariance matrices informations
wcov_infos = concat((cov_infos(X=wcov[k]).to_frame(k) for k in classes),axis=1).T
#concatenate
cov_info = concat((cov_info,wcov_infos),axis=0)
#convert to integer
cov_info["Rank"] = cov_info["Rank"].astype(int)
#convert to dictionary
cov_ = OrderedDict(infos=cov_info,total=tcov,btotal=tcovb,within=wcov,bwithin=wcovb,pooled=pwcov,bpooled=pwcovb,between=bcov,bbetween=bcovb,test=wcov_test)
#convert to namedtuple
self.cov_ = namedtuple("cov",cov_.keys())(*cov_.values())
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#correlation coefficients test
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#total sample and within-class correlation coefficients
tcortest, wcortest = cov_to_cor_test(X=tcovb,n_samples=n_samples), {k: cov_to_cor_test(X=wcovb[k],n_samples=n_k[k]) for k in classes}
#pooled within-class and between-class correlation coefficients
pwcortest, bcortest = cov_to_cor_test(X=pwcov,n_samples=n_samples-n_classes+1), cov_to_cor_test(X=bcov,n_samples=n_classes)
#convert to dictionary
cortest_ = OrderedDict(total=tcortest,within=wcortest,pooled=pwcortest,between=bcortest)
#convert to namedtuple
self.corr_ = namedtuple("corr",cortest_.keys())(*cortest_.values())
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#classes and individuals informations
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#class level information
class_infos = DataFrame(c_[n_k,p_k,priors],columns=["Frequency","Proportion","Prior Probability"],index=classes)
class_infos["Frequency"] = class_infos["Frequency"].astype(int)
#within-class average
wcenter = concat((X_k[k].mean(axis=0).to_frame(k) for k in classes),axis=1).T
#total-sample standardized class means
tcenter = wcenter.sub(tsummary["mean"],axis=1).div(tsummary["std"],axis=1).T
#pooled within-class standardized class means
pcenter = wcenter.sub(tsummary["mean"],axis=1).div(sqrt(diag(pwcov)),axis=1).T
#convert to dictionary
classes_, ind_ = OrderedDict(infos=class_infos,center=wcenter,total=tcenter,pooled=pcenter), OrderedDict()
#squared distance and generalized squared distance
if self.method == "quad" and p_value <= tol:
#inverse of within-class covariance matrices
invwcov = {k : linalg.inv(wcov[k]) for k in classes}
#squared distance of classes to origin
class_mahal = concat((sqmahalanobis(X=wcenter,VI=invwcov[k],mu=wcenter.loc[k,:]).to_frame(k) for k in classes),axis=1)
#generalized squared distance of classes to origin
class_gen = class_mahal.add(wcov_infos.loc[classes,"Natural Log of the Determinant"],axis=1)
#squared distance of individuals to origin
ind_mahal = concat((sqmahalanobis(X=X,VI=invwcov[k],mu=wcenter.loc[k,:]).to_frame(k) for k in classes),axis=1)
#generalized squared distance of individuals to origin
ind_gen = ind_mahal.add(wcov_infos.loc[classes,"Natural Log of the Determinant"],axis=1)
elif (self.method == "linear") or (self.method == "quad" and p_value > tol):
#inverse pooled within-class covariance matrix
invpwcov = linalg.inv(pwcov)
#squared distance of classes to origin
class_mahal = concat((sqmahalanobis(X=wcenter,VI=invpwcov,mu=wcenter.loc[k,:]).to_frame(k) for k in classes),axis=1)
#generalized squared distance of classes to origin
class_gen = class_mahal.copy()
#squared distance of individuals to origin
ind_mahal = concat((sqmahalanobis(X=X,VI=invpwcov,mu=wcenter.loc[k,:]).to_frame(k) for k in classes),axis=1)
#generalized squared to distance of individuals to origin
ind_gen = ind_mahal.copy()
#coefficients of features
coef = DataFrame(dot(invpwcov,wcenter.T),index=X.columns,columns=classes)
#intercept
intercept = - 0.5*diag(dot(dot(wcenter,invpwcov),wcenter.T))
if not (isinstance(self.priors,str) and self.priors == "equal"):
intercept = intercept + log(priors)
#convert to DataFrame
intercept = Series(intercept,index=classes).to_frame("Constant").T
#scores of individuals
ind_scores = X.dot(coef).add(intercept.values,axis=1)
#concatenate
self.coef_ = concat((intercept,coef),axis=0)
#add score to ordered dictionary
ind_ = OrderedDict(**ind_,**OrderedDict(scores=ind_scores))
#remove priors log-probabilities
if not (isinstance(self.priors, str) and self.priors == "equal"):
class_gen, ind_gen = class_gen.sub(2*log(priors),axis=1), ind_gen.sub(2*log(priors),axis=1)
#convert to ordered dictionary
classes_, ind_ = OrderedDict(**classes_, **OrderedDict(mahal=class_mahal,gen=class_gen)), OrderedDict(**ind_,**OrderedDict(mahal=ind_mahal,gen=ind_gen))
#convert to namedtuple
self.classes_, self.ind_ = namedtuple("classes",classes_.keys())(*classes_.values()), namedtuple("ind",ind_.keys())(*ind_.values())
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#multivariate goodness of fit - diagnostic test
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#standard deviation
stdev = DataFrame(c_[sqrt(diag(tcov)),sqrt(diag(pwcov)),sqrt(diag(bcov))],columns=["Total Std. Dev.","Pooled Std. Dev.","Between Std. Dev."],index=features)
#compute univariate analysis of variance - ANOVA
anova = concat((stdev,univ_test(X,y)),axis=1)
#average R-Square
unwavg_rsq, wavg_rsq = average(anova.loc[:,"R-Square"].values), average(anova.loc[:,"R-Square"].values,weights=diag(tcovb))
avg_rsq = DataFrame([[unwavg_rsq,wavg_rsq]],index=["Average R-Square"],columns=["Unweighted","Weighted by Variance"])
#compute multivariate analysis of variance - MANOVA
manova = MANOVA.from_formula(formula="{}~{}".format("+".join(features),"+".join([target])), data=concat((X,y),axis=1)).mv_test(skip_intercept_test=True).summary_frame
manova.index = manova.index.droplevel()
#convert to dictionary
statistics_ = OrderedDict(anova=anova,manova=manova,average_rsq=avg_rsq)
#add if linear discriminant analysis
if self.method == "linear":
#performance
performance = diagnostics(Vb=tcovb,Wb=pwcovb,n_samples=n_samples,n_classes=n_classes)
#update classes
statistics_ = OrderedDict(**statistics_,**OrderedDict(performance=performance))
#convert to namedtuple
self.statistics_ = namedtuple("statistics",statistics_.keys())(*statistics_.values())
self.model_ = "discrim"
return self
def fit_transform(self,X,y):
"""
Fit to data, then transform it
Fits transformer to ``X`` and returns a transformed version of samples.
Parameters
----------
X : DataFrame of shape (n_samples, n_features)
Training data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
y : Series of shape (n_samples,)
Target values. True labels for ``X``.
Returns
-------
X_new : DataFrame of shape (n_samples, n_classes)
Transformed samples.
"""
#fit discriminant analysis model
self.fit(X,y)
#chi-square pvalue and tolerance threshold
p_value, tol = self.cov_.test.iloc[0,6], self.call_.tol
if self.method == "quad" and p_value <= tol:
raise NotImplementedError("Since the Chi-Square value is significant at the {} level.'fit_transform' method cannot be used.".format(tol))
return self.ind_.scores
def transform(self,X):
"""
Project data to maximize class separation
Parameters
----------
X : DataFrame of shape (n_samples, n_features)
New data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features.
Returns
-------
X_new : DataFrame of shape (n_samples, n_classes)
Transformed data, where ``n_samples`` is the number of samples and ``n_classes`` is the number of classes.
"""
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if the estimator is fitted by verifying the presence of fitted attributes
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
check_is_fitted(self)
#chi-square pvalue and tolerance threshokd
p_value, tol = self.cov_.test.iloc[0,6], self.call_.tol
if self.method == "quad" and p_value <= tol:
raise NotImplementedError("Since the Chi-Square value is significant at the {} level.'transform' method cannot be used.".format(tol))
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
#check if X is an instance of class pd.DataFrame
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------
check_is_dataframe(X=X)
#set index name as None
X.index.name = None
#check if X contains original features
if not set(self.call_.Xtot.columns).issubset(X.columns):
raise ValueError("The names of the features is not the same as the ones in the active features of the DISCRIM result")
#select original features
X = X[self.call_.Xtot.columns]
#split X
split_X = splitmix(X)
#extract elements
X_quanti, X_quali, n_quanti, n_quali = split_X.quanti, split_X.quali, split_X.k1, split_X.k2
#initialize DataFrame
Xcod = DataFrame(index=X.index,columns=self.call_.X.columns).astype(float)
#check if numerics variables
if n_quanti > 0:
#replace with numerics columns
Xcod.loc[:,X_quanti.columns] = X_quanti
#check if categorical variables
if n_quali > 0:
#active categorics
categorics = [x for x in self.call_.X.columns if x not in self.call_.Xtot.columns]
#replace with dummies
Xcod.loc[:,categorics] = tab_disjunctive(X=X_quali,dummies_cols=categorics,prefix=True,sep="")
#remove non selected variables
Xcod = Xcod.loc[:,list(self.cov_.total.index)]
#multiply by linear discriminant analysis coefficients
return Xcod.dot(self.coef_.iloc[1:,:]).add(self.coef_.iloc[0,:].values,axis=1)