{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## STEPDISC LDA - alcools dataset" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "#disable warnings\n", "from warnings import simplefilter, filterwarnings\n", "simplefilter(action='ignore', category=FutureWarning)\n", "filterwarnings(\"ignore\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### alcools dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 52 entries, 0 to 51\n", "Data columns (total 9 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 TYPE 52 non-null object \n", " 1 MEOH 52 non-null float64\n", " 2 ACET 52 non-null float64\n", " 3 BU1 52 non-null float64\n", " 4 BU2 52 non-null float64\n", " 5 ISOP 52 non-null int64 \n", " 6 MEPR 52 non-null float64\n", " 7 PRO1 52 non-null float64\n", " 8 ACAL 52 non-null float64\n", "dtypes: float64(7), int64(1), object(1)\n", "memory usage: 3.8+ KB\n", "None\n" ] } ], "source": [ "#alcools dataset\n", "from discrimintools.datasets import load_alcools\n", "D = load_alcools(\"train\")\n", "print(D.info())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### instantiation and training" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "====================== Step 1 forward selection results =======================\n", " Wilks' Lambda Partial R-Square F Value Num DF Den DF Pr>F\n", "MEOH 0.282629 0.717371 62.186129 2 49 3.587597e-14\n", "ACET 0.971855 0.028145 0.709531 2 49 4.968583e-01\n", "BU1 0.286173 0.713827 61.112585 2 49 4.868527e-14\n", "BU2 0.914588 0.085412 2.288014 2 49 1.122087e-01\n", "ISOP 0.887731 0.112269 3.098457 2 49 5.406192e-02\n", "MEPR 0.691854 0.308146 10.912106 2 49 1.203236e-04\n", "PRO1 0.835465 0.164535 4.824978 2 
49 1.222491e-02\n", "ACAL 0.979642 0.020358 0.509127 2 49 6.041644e-01\n", "\n", "Variable MEOH will enter\n", "\n", "\n", "====================== Step 2 forward selection results =======================\n", " Wilks' Lambda Partial R-Square F Value Num DF Den DF Pr>F\n", "ACET 0.253614 0.102660 2.745708 2 48 0.074297\n", "BU1 0.192547 0.318729 11.228252 2 48 0.000100\n", "BU2 0.244101 0.136320 3.788072 2 48 0.029680\n", "ISOP 0.264061 0.065697 1.687604 2 48 0.195751\n", "MEPR 0.221217 0.217287 6.662572 2 48 0.002796\n", "PRO1 0.255676 0.095365 2.530037 2 48 0.090232\n", "ACAL 0.235697 0.166054 4.778852 2 48 0.012803\n", "\n", "Variable BU1 will enter\n", "\n", "\n", "====================== Step 3 forward selection results =======================\n", " Wilks' Lambda Partial R-Square F Value Num DF Den DF Pr>F\n", "ACET 0.178725 0.071786 1.817445 2 47 0.173671\n", "BU2 0.170291 0.115585 3.071236 2 47 0.055772\n", "ISOP 0.174351 0.094502 2.452583 2 47 0.097018\n", "MEPR 0.147786 0.232468 7.117602 2 47 0.001994\n", "PRO1 0.176100 0.085419 2.194821 2 47 0.122666\n", "ACAL 0.173496 0.098943 2.580493 2 47 0.086432\n", "\n", "Variable MEPR will enter\n", "\n", "\n", "====================== Step 4 forward selection results =======================\n", " Wilks' Lambda Partial R-Square F Value Num DF Den DF Pr>F\n", "ACET 0.138022 0.066069 1.627088 2 46 0.207606\n", "BU2 0.131479 0.110340 2.852582 2 46 0.067944\n", "ISOP 0.129820 0.121570 3.183082 2 46 0.050730\n", "PRO1 0.136572 0.075879 1.888507 2 46 0.162842\n", "ACAL 0.127365 0.138180 3.687719 2 46 0.032702\n", "\n", "No variable can enter\n", "\n" ] }, { "data": { "text/html": [ "
STEPDISC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "STEPDISC()" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from discrimintools import DISCRIM, STEPDISC\n", "#split into X and y\n", "y, X = D[\"TYPE\"], D.drop(columns=[\"TYPE\"])\n", "clf = DISCRIM().fit(X,y)\n", "clf2 = STEPDISC(method=\"forward\",alpha=0.01,verbose=True)\n", "clf2.fit(clf)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Selected variables" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['MEOH', 'BU1', 'MEPR']\n" ] } ], "source": [ "#selected variables\n", "print(clf2.summary_.selected)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### summary" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Stepwise Discriminant Analysis - Results \n", "\n", "====================== Before forward selection =======================\n", "\n", " Discriminant Analysis - Results \n", "\n", "Summary Information:\n", " Infos Value DF DF value\n", "0 Total Sample Size 52 DF Total 51\n", "1 Variables 8 DF Within Classes 49\n", "2 Classes 3 DF Between Classes 2\n", "\n", "Class Level Information:\n", " Frequency Proportion Prior Probability\n", "KIRSCH 17 0.3269 0.3269\n", "MIRAB 15 0.2885 0.2885\n", "POIRE 20 0.3846 0.3846\n", "\n", "Pooled Covariance Matrix Information:\n", " Rank Natural Log of the Determinant\n", "Pooled 8 58.3267\n", "\n", "Linear Discriminant Function for TYPE:\n", " KIRSCH MIRAB POIRE\n", "Constant -5.0165 -18.8407 -24.7649\n", "MEOH 0.0034 0.0290 0.0334\n", "ACET 0.0064 0.0164 0.0075\n", "BU1 -0.0637 0.4054 0.3180\n", "BU2 -0.0009 0.0714 0.1150\n", "ISOP 0.0231 0.0298 -0.0085\n", "MEPR 0.0375 -0.1289 0.0618\n", "PRO1 0.0020 -0.0054 -0.0083\n", "ACAL 0.0662 -0.2264 -0.1303\n", "\n", "Classification Summary for Calibration Data:\n", "\n", "Observation Profile:\n", " Read 
Used\n", "Number of Observations 52 52\n", "\n", "Number of Observations Classified into TYPE:\n", "prediction KIRSCH MIRAB POIRE Total\n", "TYPE \n", "KIRSCH 17 0 0 17\n", "MIRAB 0 14 1 15\n", "POIRE 0 2 18 20\n", "Total 17 16 19 52\n", "\n", "Percent Classified into TYPE:\n", "prediction KIRSCH MIRAB POIRE Total\n", "TYPE \n", "KIRSCH 100.0000 0.0000 0.0000 100.0\n", "MIRAB 0.0000 93.3333 6.6667 100.0\n", "POIRE 0.0000 10.0000 90.0000 100.0\n", "Total 32.6923 30.7692 36.5385 100.0\n", "Priors 0.3269 0.2885 0.3846 NaN\n", "\n", "Error Count Estimates for TYPE:\n", " KIRSCH MIRAB POIRE Total\n", "Rate 0.0000 0.0667 0.1000 0.0577\n", "Priors 0.3269 0.2885 0.3846 NaN\n", "\n", "Classification Report for TYPE:\n", " precision recall f1-score support\n", "KIRSCH 1.0000 1.0000 1.0000 17.0000\n", "MIRAB 0.8750 0.9333 0.9032 15.0000\n", "POIRE 0.9474 0.9000 0.9231 20.0000\n", "accuracy 0.9423 0.9423 0.9423 0.9423\n", "macro avg 0.9408 0.9444 0.9421 52.0000\n", "weighted avg 0.9437 0.9423 0.9425 52.0000\n", "\n", "====================== After forward selection =======================\n", "\n", " Discriminant Analysis - Results \n", "\n", "Summary Information:\n", " Infos Value DF DF value\n", "0 Total Sample Size 52 DF Total 51\n", "1 Variables 3 DF Within Classes 49\n", "2 Classes 3 DF Between Classes 2\n", "\n", "Class Level Information:\n", " Frequency Proportion Prior Probability\n", "KIRSCH 17 0.3269 0.3269\n", "MIRAB 15 0.2885 0.2885\n", "POIRE 20 0.3846 0.3846\n", "\n", "Pooled Covariance Matrix Information:\n", " Rank Natural Log of the Determinant\n", "Pooled 3 19.4106\n", "\n", "Linear Discriminant Function for TYPE:\n", " KIRSCH MIRAB POIRE\n", "Constant -3.6107 -14.7754 -18.3711\n", "MEOH 0.0069 0.0213 0.0226\n", "BU1 -0.0766 0.4010 0.3735\n", "MEPR 0.0867 -0.0325 0.0467\n", "\n", "Classification Summary for Calibration Data:\n", "\n", "Observation Profile:\n", " Read Used\n", "Number of Observations 52 52\n", "\n", "Number of Observations Classified into 
TYPE:\n", "prediction KIRSCH MIRAB POIRE Total\n", "TYPE \n", "KIRSCH 17 0 0 17\n", "MIRAB 0 12 3 15\n", "POIRE 0 4 16 20\n", "Total 17 16 19 52\n", "\n", "Percent Classified into TYPE:\n", "prediction KIRSCH MIRAB POIRE Total\n", "TYPE \n", "KIRSCH 100.0000 0.0000 0.0000 100.0\n", "MIRAB 0.0000 80.0000 20.0000 100.0\n", "POIRE 0.0000 20.0000 80.0000 100.0\n", "Total 32.6923 30.7692 36.5385 100.0\n", "Priors 0.3269 0.2885 0.3846 NaN\n", "\n", "Error Count Estimates for TYPE:\n", " KIRSCH MIRAB POIRE Total\n", "Rate 0.0000 0.2000 0.2000 0.1346\n", "Priors 0.3269 0.2885 0.3846 NaN\n", "\n", "Classification Report for TYPE:\n", " precision recall f1-score support\n", "KIRSCH 1.0000 1.0000 1.0000 17.0000\n", "MIRAB 0.7500 0.8000 0.7742 15.0000\n", "POIRE 0.8421 0.8000 0.8205 20.0000\n", "accuracy 0.8654 0.8654 0.8654 0.8654\n", "macro avg 0.8640 0.8667 0.8649 52.0000\n", "weighted avg 0.8672 0.8654 0.8658 52.0000\n" ] } ], "source": [ "from discrimintools import summarySTEPDISC\n", "summarySTEPDISC(clf2,detailed=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Evaluation of prediction on testing dataset\n", "\n", "#### Testing data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 50 entries, 0 to 49\n", "Data columns (total 9 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 TYPE 50 non-null object \n", " 1 MEOH 50 non-null int64 \n", " 2 ACET 50 non-null int64 \n", " 3 BU1 50 non-null float64\n", " 4 BU2 50 non-null float64\n", " 5 ISOP 50 non-null int64 \n", " 6 MEPR 50 non-null int64 \n", " 7 PRO1 50 non-null int64 \n", " 8 ACAL 50 non-null float64\n", "dtypes: float64(3), int64(5), object(1)\n", "memory usage: 3.6+ KB\n" ] } ], "source": [ "#testing data\n", "DTest = load_alcools(\"test\")\n", "DTest.info()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "Observation Profile:\n", " Read Used\n", "Number of Observations 50 50\n", "\n", "Number of Observations Classified into TYPE:\n", "prediction KIRSCH MIRAB POIRE Total\n", "TYPE \n", "KIRSCH 14 0 0 14\n", "MIRAB 0 12 5 17\n", "POIRE 2 8 9 19\n", "Total 16 20 14 50\n", "\n", "Percent Classified into TYPE:\n", "prediction KIRSCH MIRAB POIRE Total\n", "TYPE \n", "KIRSCH 100.000000 0.000000 0.000000 100.0\n", "MIRAB 0.000000 70.588235 29.411765 100.0\n", "POIRE 10.526316 42.105263 47.368421 100.0\n", "Total 32.000000 40.000000 28.000000 100.0\n", "Priors 0.326923 0.288462 0.384615 NaN\n", "\n", "Error Count Estimates for TYPE:\n", " KIRSCH MIRAB POIRE Total\n", "Rate 0.000000 0.294118 0.526316 0.287271\n", "Priors 0.326923 0.288462 0.384615 NaN\n", "\n", "Classification Report for TYPE:\n", " precision recall f1-score support\n", "KIRSCH 0.875000 1.000000 0.933333 14.0\n", "MIRAB 0.600000 0.705882 0.648649 17.0\n", "POIRE 0.642857 0.473684 0.545455 19.0\n", "accuracy 0.700000 0.700000 0.700000 0.7\n", "macro avg 0.705952 0.726522 0.709146 50.0\n", "weighted avg 0.693286 0.700000 0.689147 50.0\n" ] } ], "source": [ "#split into X and y\n", "yTest, XTest = DTest[\"TYPE\"], DTest.drop(columns=[\"TYPE\"])\n", "eval_test = clf2.eval_predict(XTest,yTest,verbose=True)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" } }, "nbformat": 4, "nbformat_minor": 2 }