{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## CANDISC - heart dataset" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "#disable warnings\n", "from warnings import simplefilter, filterwarnings\n", "simplefilter(action='ignore', category=FutureWarning)\n", "filterwarnings(\"ignore\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### heart dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 150 entries, 0 to 149\n", "Data columns (total 14 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 disease 150 non-null object \n", " 1 age 150 non-null int64 \n", " 2 sex 150 non-null object \n", " 3 chestpain 150 non-null object \n", " 4 restbpress 150 non-null int64 \n", " 5 cholesteral 150 non-null int64 \n", " 6 sugar 150 non-null object \n", " 7 electro 150 non-null object \n", " 8 maxHeartRate 150 non-null int64 \n", " 9 ExerciseAngina 150 non-null object \n", " 10 oldpeak 150 non-null float64\n", " 11 slope 150 non-null object \n", " 12 vesselsColored 150 non-null int64 \n", " 13 thal 150 non-null object \n", "dtypes: float64(1), int64(5), object(8)\n", "memory usage: 17.6+ KB\n", "None\n" ] } ], "source": [ "#heart dataset\n", "from discrimintools.datasets import load_heart\n", "D = load_heart(\"train\")\n", "print(D.info())" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "#split into X and y\n", "y, X = D[\"disease\"], D.drop(columns=[\"disease\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### instantiation & training" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Categorical features have been encoded into binary variables.\n", "\n" ] }, { "data": { "text/html": 
[ "
CANDISC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "CANDISC()" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from discrimintools import CANDISC\n", "clf = CANDISC(n_components=2)\n", "clf.fit(X,y)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Evaluation of prediction on training data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Observation Profile:\n", " Read Used\n", "Number of Observations 150 150\n", "\n", "Number of Observations Classified into disease:\n", "prediction absence presence Total\n", "disease \n", "absence 75 7 82\n", "presence 12 56 68\n", "Total 87 63 150\n", "\n", "Percent Classified into disease:\n", "prediction absence presence Total\n", "disease \n", "absence 91.463415 8.536585 100.0\n", "presence 17.647059 82.352941 100.0\n", "Total 58.000000 42.000000 100.0\n", "Priors 0.546667 0.453333 NaN\n", "\n", "Error Count Estimates for disease:\n", " absence presence Total\n", "Rate 0.085366 0.176471 0.126667\n", "Priors 0.546667 0.453333 NaN\n", "\n", "Classification Report for disease:\n", " precision recall f1-score support\n", "absence 0.862069 0.914634 0.887574 82.000000\n", "presence 0.888889 0.823529 0.854962 68.000000\n", "accuracy 0.873333 0.873333 0.873333 0.873333\n", "macro avg 0.875479 0.869082 0.871268 150.000000\n", "weighted avg 0.874227 0.873333 0.872790 150.000000\n" ] } ], "source": [ "#eval_predict function\n", "eval_train = clf.eval_predict(X,y,verbose=True)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy : 87.0%\n" ] } ], "source": [ "#score function\n", "print(\"Accuracy : {}%\".format(100*round(clf.score(X,y),2)))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Error rate : 13.0%\n" ] } ], "source": [ "#error rate\n", 
"print(\"Error rate : {}%\".format(100-100*round(clf.score(X,y),2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### summary" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Canonical Discriminant Analysis - Results \n", "\n", "Summary Information:\n", " infos Value DF DF value\n", "0 Total Sample Size 150 DF Total 149\n", "1 Variables 18 DF Within Classes 148\n", "2 Classes 2 DF Between Classes 1\n", "\n", "Class Level Information:\n", " Frequency Proportion Prior Probability\n", "absence 82 0.5467 0.5467\n", "presence 68 0.4533 0.4533\n", "\n", "Total-Sample Class Means:\n", " absence presence\n", "age 53.0244 56.3824\n", "sexmale 0.5488 0.8529\n", "chestpainatypicalAngina 0.2073 0.0441\n", "chestpainnonAnginal 0.4390 0.1324\n", "chestpaintypicalAngina 0.1098 0.0294\n", "restbpress 129.3902 135.6471\n", "cholesteral 243.6951 249.1912\n", "sugarlow 0.8293 0.8529\n", "electrosttAbnormality 0.0000 0.0147\n", "electroventricHypertrophy 0.3780 0.6029\n", "maxHeartRate 159.3049 139.2794\n", "ExerciseAnginayes 0.1220 0.5000\n", "oldpeak 0.6415 1.6279\n", "slopeflat 0.3049 0.5882\n", "slopeupsloping 0.6220 0.2941\n", "vesselsColored 0.3171 1.1029\n", "thalnormal 0.7683 0.3088\n", "thalreversableEffect 0.1829 0.6176\n", "\n", "Importance of components:\n", " Eigenvalue Difference Proportion Cumulative\n", "Can1 1.5613 NaN 100.0 100.0\n", "\n", "Raw Canonical and Classification Functions Coefficients:\n", " Can1 absence presence\n", "Constant 1.0245 -0.0847 -3.1163\n", "age -0.0031 -0.0035 0.0042\n", "sexmale -0.6604 -0.7464 0.9000\n", "chestpainatypicalAngina 0.9829 1.1110 -1.3397\n", "chestpainnonAnginal 0.7308 0.8260 -0.9960\n", "chestpaintypicalAngina 1.8507 2.0918 -2.5224\n", "restbpress -0.0092 -0.0104 0.0126\n", "cholesteral 0.0014 0.0015 -0.0018\n", "sugarlow -0.5250 -0.5933 0.7155\n", "electrosttAbnormality -1.1995 -1.3557 1.6348\n", "electroventricHypertrophy 
-0.2464 -0.2784 0.3358\n", "maxHeartRate 0.0111 0.0125 -0.0151\n", "ExerciseAnginayes -0.4188 -0.4734 0.5708\n", "oldpeak -0.2847 -0.3218 0.3881\n", "slopeflat -0.4727 -0.5343 0.6443\n", "slopeupsloping -0.2029 -0.2293 0.2766\n", "vesselsColored -0.5928 -0.6700 0.8079\n", "thalnormal 0.4137 0.4676 -0.5638\n", "thalreversableEffect -0.5253 -0.5937 0.7159\n", "\n", "Test of H0: The canonical correlations in the current row and all that follow are zero\n", " Canonical Correlation Squared Canonical Correlation Likelihood Ratio \\\n", "0 0.7808 0.6096 0.3904 \n", "\n", " Approximate F value Num DF Den DF Pr>F Chi-Square DF Pr>Chi2 \n", "0 11.3629 18 131 0.0 130.7323 18 0.0 \n", "\n", "Classification Summary for Calibration Data:\n", "\n", "Observation Profile:\n", " Read Used\n", "Number of Observations 150 150\n", "\n", "Number of Observations Classified into disease:\n", "prediction absence presence Total\n", "disease \n", "absence 75 7 82\n", "presence 12 56 68\n", "Total 87 63 150\n", "\n", "Percent Classified into disease:\n", "prediction absence presence Total\n", "disease \n", "absence 91.4634 8.5366 100.0\n", "presence 17.6471 82.3529 100.0\n", "Total 58.0000 42.0000 100.0\n", "Priors 0.5467 0.4533 NaN\n", "\n", "Error Count Estimates for disease:\n", " absence presence Total\n", "Rate 0.0854 0.1765 0.1267\n", "Priors 0.5467 0.4533 NaN\n", "\n", "Classification Report for disease:\n", " precision recall f1-score support\n", "absence 0.8621 0.9146 0.8876 82.0000\n", "presence 0.8889 0.8235 0.8550 68.0000\n", "accuracy 0.8733 0.8733 0.8733 0.8733\n", "macro avg 0.8755 0.8691 0.8713 150.0000\n", "weighted avg 0.8742 0.8733 0.8728 150.0000\n" ] } ], "source": [ "from discrimintools import summaryCANDISC\n", "summaryCANDISC(clf,detailed=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Evaluation of prediction on testing dataset\n", "\n", "#### Testing data" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "\n", "Index: 120 entries, 150 to 269\n", "Data columns (total 14 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 disease 120 non-null object \n", " 1 age 120 non-null int64 \n", " 2 sex 120 non-null object \n", " 3 chestpain 120 non-null object \n", " 4 restbpress 120 non-null int64 \n", " 5 cholesteral 120 non-null int64 \n", " 6 sugar 120 non-null object \n", " 7 electro 120 non-null object \n", " 8 maxHeartRate 120 non-null int64 \n", " 9 ExerciseAngina 120 non-null object \n", " 10 oldpeak 120 non-null float64\n", " 11 slope 120 non-null object \n", " 12 vesselsColored 120 non-null int64 \n", " 13 thal 120 non-null object \n", "dtypes: float64(1), int64(5), object(8)\n", "memory usage: 14.1+ KB\n", "None\n" ] } ], "source": [ "#testing data\n", "DTest = load_heart(\"test\")\n", "print(DTest.info())" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Observation Profile:\n", " Read Used\n", "Number of Observations 120 120\n", "\n", "Number of Observations Classified into disease:\n", "prediction absence presence Total\n", "disease \n", "absence 59 9 68\n", "presence 11 41 52\n", "Total 70 50 120\n", "\n", "Percent Classified into disease:\n", "prediction absence presence Total\n", "disease \n", "absence 86.764706 13.235294 100.0\n", "presence 21.153846 78.846154 100.0\n", "Total 58.333333 41.666667 100.0\n", "Priors 0.546667 0.453333 NaN\n", "\n", "Error Count Estimates for disease:\n", " absence presence Total\n", "Rate 0.132353 0.211538 0.16825\n", "Priors 0.546667 0.453333 NaN\n", "\n", "Classification Report for disease:\n", " precision recall f1-score support\n", "absence 0.842857 0.867647 0.855072 68.000000\n", "presence 0.820000 0.788462 0.803922 52.000000\n", "accuracy 0.833333 0.833333 0.833333 0.833333\n", "macro avg 0.831429 0.828054 0.829497 120.000000\n", "weighted avg 
0.832952 0.833333 0.832907 120.000000\n" ] } ], "source": [ "#split into X and y\n", "yTest, XTest = DTest[\"disease\"], DTest.drop(columns=[\"disease\"])\n", "eval_test = clf.eval_predict(XTest,yTest,verbose=True)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }