{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## STEPDISC CANDISC - oliveoil dataset" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "#disable warnings\n", "from warnings import simplefilter, filterwarnings\n", "simplefilter(action='ignore', category=FutureWarning)\n", "filterwarnings(\"ignore\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### oliveoil dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 569 entries, 0 to 568\n", "Data columns (total 9 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 CLASSE 569 non-null object\n", " 1 palmitic 569 non-null int64 \n", " 2 palmitoleic 569 non-null int64 \n", " 3 stearic 569 non-null int64 \n", " 4 oleic 569 non-null int64 \n", " 5 linoleic 569 non-null int64 \n", " 6 linolenic 569 non-null int64 \n", " 7 arachidic 569 non-null int64 \n", " 8 eicosenoic 569 non-null int64 \n", "dtypes: int64(8), object(1)\n", "memory usage: 40.1+ KB\n", "None\n" ] } ], "source": [ "#oliveoil dataset\n", "from discrimintools.datasets import load_oliveoil\n", "D = load_oliveoil(\"train\")\n", "print(D.info())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Forward selection" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "====================== Step 1 forward selection results =======================\n", " Wilks' Lambda Partial R-Square F Value Num DF Den DF \\\n", "palmitic 0.538509 0.461491 242.524854 2 566 \n", "palmitoleic 0.604905 0.395095 184.841942 2 566 \n", "stearic 0.998272 0.001728 0.489942 2 566 \n", "oleic 0.473479 0.526521 314.703134 2 566 \n", "linoleic 0.550371 0.449629 231.198312 2 566 \n", "linolenic 0.687722 0.312278 128.503464 2 566 \n", "arachidic 0.662890 0.337110 143.918675 
2 566 \n", "eicosenoic 0.202071 0.797929 1117.498522 2 566 \n", "\n", " Pr>F \n", "palmitic 8.465810e-77 \n", "palmitoleic 1.650063e-62 \n", "stearic 6.129213e-01 \n", "oleic 1.288711e-92 \n", "linoleic 4.032628e-74 \n", "linolenic 9.724383e-47 \n", "arachidic 2.936859e-51 \n", "eicosenoic 2.867939e-197 \n", "\n", "Variable eicosenoic will enter\n", "\n", "\n", "====================== Step 2 forward selection results =======================\n", " Wilks' Lambda Partial R-Square F Value Num DF Den DF \\\n", "palmitic 0.130129 0.356025 156.181332 2 565 \n", "palmitoleic 0.123582 0.388421 179.418881 2 565 \n", "stearic 0.184593 0.086494 26.748148 2 565 \n", "oleic 0.102388 0.493307 275.036355 2 565 \n", "linoleic 0.094108 0.534283 324.091029 2 565 \n", "linolenic 0.195821 0.030927 9.015798 2 565 \n", "arachidic 0.139761 0.308355 125.946400 2 565 \n", "\n", " Pr>F \n", "palmitic 1.012924e-54 \n", "palmitoleic 4.708670e-61 \n", "stearic 7.960766e-12 \n", "oleic 3.895257e-84 \n", "linoleic 1.756295e-94 \n", "linolenic 1.398527e-04 \n", "arachidic 5.848639e-46 \n", "\n", "Variable linoleic will enter\n", "\n", "\n", "====================== Step 3 forward selection results =======================\n", " Wilks' Lambda Partial R-Square F Value Num DF Den DF \\\n", "palmitic 0.064624 0.313297 128.658103 2 564 \n", "palmitoleic 0.054167 0.424414 207.935167 2 564 \n", "stearic 0.088816 0.056230 16.801620 2 564 \n", "oleic 0.070818 0.247485 92.743401 2 564 \n", "linolenic 0.078805 0.162609 54.760308 2 564 \n", "arachidic 0.064452 0.315126 129.754781 2 564 \n", "\n", " Pr>F \n", "palmitic 9.306291e-47 \n", "palmitoleic 2.244706e-68 \n", "stearic 8.170662e-08 \n", "oleic 1.504050e-35 \n", "linolenic 1.843977e-22 \n", "arachidic 4.386814e-47 \n", "\n", "Variable palmitoleic will enter\n", "\n", "\n", "====================== Step 4 forward selection results =======================\n", " Wilks' Lambda Partial R-Square F Value Num DF Den DF \\\n", "palmitic 0.051660 0.046294 13.664275 2 
563 \n", "stearic 0.053220 0.017479 5.007993 2 563 \n", "oleic 0.051666 0.046172 13.626478 2 563 \n", "linolenic 0.045867 0.153235 50.941613 2 563 \n", "arachidic 0.039318 0.274139 106.315330 2 563 \n", "\n", " Pr>F \n", "palmitic 1.604028e-06 \n", "stearic 6.985163e-03 \n", "oleic 1.662907e-06 \n", "linolenic 4.626894e-21 \n", "arachidic 6.764564e-40 \n", "\n", "Variable arachidic will enter\n", "\n", "\n", "====================== Step 5 forward selection results =======================\n", " Wilks' Lambda Partial R-Square F Value Num DF Den DF \\\n", "palmitic 0.036354 0.075390 22.912027 2 562 \n", "stearic 0.038407 0.023180 6.668090 2 562 \n", "oleic 0.037676 0.041772 12.249610 2 562 \n", "linolenic 0.034623 0.119397 38.099399 2 562 \n", "\n", " Pr>F \n", "palmitic 2.718440e-10 \n", "stearic 1.373760e-03 \n", "oleic 6.205177e-06 \n", "linolenic 3.042803e-16 \n", "\n", "Variable linolenic will enter\n", "\n", "\n", "====================== Step 6 forward selection results =======================\n", " Wilks' Lambda Partial R-Square F Value Num DF Den DF Pr>F\n", "palmitic 0.032986 0.047282 13.920669 2 561 0.000001\n", "stearic 0.034059 0.016309 4.650441 2 561 0.009929\n", "oleic 0.034059 0.016292 4.645700 2 561 0.009975\n", "\n", "Variable palmitic will enter\n", "\n", "\n", "====================== Step 7 forward selection results =======================\n", " Wilks' Lambda Partial R-Square F Value Num DF Den DF Pr>F\n", "stearic 0.032434 0.016756 4.771705 2 560 0.008813\n", "oleic 0.032173 0.024665 7.080984 2 560 0.000918\n", "\n", "Variable oleic will enter\n", "\n", "\n", "====================== Step 8 forward selection results =======================\n", " Wilks' Lambda Partial R-Square F Value Num DF Den DF Pr>F\n", "stearic 0.03196 0.00662 1.862669 2 559 0.15622\n", "\n", "No variable can enter\n", "\n" ] }, { "data": { "text/html": [ "
STEPDISC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "STEPDISC()" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from discrimintools import CANDISC, STEPDISC\n", "#split into X and y\n", "y, X = D[\"CLASSE\"], D.drop(columns=[\"CLASSE\"])\n", "clf = CANDISC(n_components=2).fit(X,y)\n", "clf2 = STEPDISC(method=\"forward\",alpha=0.01,verbose=True)\n", "clf2.fit(clf)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Selected variables" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['eicosenoic', 'linoleic', 'palmitoleic', 'arachidic', 'linolenic', 'palmitic', 'oleic']\n" ] } ], "source": [ "#selected variables\n", "print(clf2.summary_.selected)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### summary" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Stepwise Discriminant Analysis - Results \n", "\n", "====================== Before forward selection =======================\n", "\n", " Canonical Discriminant Analysis - Results \n", "\n", "Summary Information:\n", " infos Value DF DF value\n", "0 Total Sample Size 569 DF Total 568\n", "1 Variables 8 DF Within Classes 566\n", "2 Classes 3 DF Between Classes 2\n", "\n", "Class Level Information:\n", " Frequency Proportion Prior Probability\n", "Centre_North 150 0.2636 0.2636\n", "Sardinia 97 0.1705 0.1705\n", "South 322 0.5659 0.5659\n", "\n", "Total-Sample Class Means:\n", " Centre_North Sardinia South\n", "palmitic 1094.8333 1112.0619 1332.3696\n", "palmitoleic 83.8933 96.3505 154.8882\n", "stearic 231.0400 226.3505 228.7081\n", "oleic 7791.9733 7266.9072 7099.5311\n", "linoleic 727.8800 1197.3608 1034.0093\n", "linolenic 21.7467 27.0103 38.0373\n", "arachidic 37.5467 73.0000 63.1025\n", "eicosenoic 1.9733 1.9278 27.3323\n", "\n", "Importance of components:\n", " Eigenvalue Difference Proportion 
Cumulative\n", "Can1 8.4718 6.1684 78.6232 78.6232\n", "Can2 2.3034 NaN 21.3768 100.0000\n", "\n", "Raw Canonical and Classification Functions Coefficients:\n", " Can1 Can2 Centre_North Sardinia South\n", "Constant -13.0646 -56.9169 -70.0899 194.6549 -37.1812\n", "palmitic 0.0028 0.0089 0.0072 -0.0344 0.0070\n", "palmitoleic 0.0131 0.0184 -0.0095 -0.0959 0.0333\n", "stearic -0.0028 0.0043 0.0171 -0.0029 -0.0071\n", "oleic 0.0006 0.0062 0.0094 -0.0199 0.0016\n", "linoleic 0.0011 -0.0013 -0.0061 -0.0001 0.0029\n", "linolenic 0.0411 0.0058 -0.1257 -0.1523 0.1045\n", "arachidic -0.0173 -0.0347 -0.0063 0.1565 -0.0442\n", "eicosenoic 0.1631 0.0101 -0.5231 -0.5673 0.4146\n", "\n", "====================== After forward selection =======================\n", "\n", " Canonical Discriminant Analysis - Results \n", "\n", "Summary Information:\n", " infos Value DF DF value\n", "0 Total Sample Size 569 DF Total 568\n", "1 Variables 7 DF Within Classes 566\n", "2 Classes 3 DF Between Classes 2\n", "\n", "Class Level Information:\n", " Frequency Proportion Prior Probability\n", "Centre_North 150 0.2636 0.2636\n", "Sardinia 97 0.1705 0.1705\n", "South 322 0.5659 0.5659\n", "\n", "Total-Sample Class Means:\n", " Centre_North Sardinia South\n", "eicosenoic 1.9733 1.9278 27.3323\n", "linoleic 727.8800 1197.3608 1034.0093\n", "palmitoleic 83.8933 96.3505 154.8882\n", "arachidic 37.5467 73.0000 63.1025\n", "linolenic 21.7467 27.0103 38.0373\n", "palmitic 1094.8333 1112.0619 1332.3696\n", "oleic 7791.9733 7266.9072 7099.5311\n", "\n", "Importance of components:\n", " Eigenvalue Difference Proportion Cumulative\n", "Can1 8.4496 6.1603 78.6823 78.6823\n", "Can2 2.2893 NaN 21.3177 100.0000\n", "\n", "Raw Canonical and Classification Functions Coefficients:\n", " Can1 Can2 Centre_North Sardinia South\n", "Constant -32.1378 -27.9172 46.3443 174.6732 -85.3694\n", "eicosenoic 0.1649 0.0067 -0.5330 -0.5656 0.4187\n", "linoleic 0.0029 -0.0040 -0.0171 0.0018 0.0074\n", "palmitoleic 0.0150 0.0155 
-0.0210 -0.0940 0.0381\n", "arachidic -0.0157 -0.0374 -0.0170 0.1583 -0.0398\n", "linolenic 0.0435 0.0020 -0.1403 -0.1498 0.1105\n", "palmitic 0.0048 0.0058 -0.0050 -0.0324 0.0121\n", "oleic 0.0025 0.0034 -0.0021 -0.0179 0.0064\n" ] } ], "source": [ "from discrimintools import summarySTEPDISC\n", "summarySTEPDISC(clf2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Evaluation of prediction on testing dataset" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Observation Profile:\n", " Read Used\n", "Number of Observations 3 3\n", "\n", "Number of Observations Classified into CLASSE:\n", "prediction Centre_North Sardinia South Total\n", "CLASSE \n", "Centre_North 1 0 0 1\n", "Sardinia 0 1 0 1\n", "South 0 0 1 1\n", "Total 1 1 1 3\n", "\n", "Percent Classified into CLASSE:\n", "prediction Centre_North Sardinia South Total\n", "CLASSE \n", "Centre_North 100.000000 0.000000 0.000000 100.0\n", "Sardinia 0.000000 100.000000 0.000000 100.0\n", "South 0.000000 0.000000 100.000000 100.0\n", "Total 33.333333 33.333333 33.333333 100.0\n", "Priors 0.263620 0.170475 0.565905 NaN\n", "\n", "Error Count Estimates for CLASSE:\n", " Centre_North Sardinia South Total\n", "Rate 0.00000 0.000000 0.000000 0.0\n", "Priors 0.26362 0.170475 0.565905 NaN\n", "\n", "Classification Report for CLASSE:\n", " precision recall f1-score support\n", "Centre_North 1.0 1.0 1.0 1.0\n", "Sardinia 1.0 1.0 1.0 1.0\n", "South 1.0 1.0 1.0 1.0\n", "accuracy 1.0 1.0 1.0 1.0\n", "macro avg 1.0 1.0 1.0 3.0\n", "weighted avg 1.0 1.0 1.0 3.0\n" ] } ], "source": [ "#testing data\n", "DTest = load_oliveoil(\"test\")\n", "#split into X and y\n", "yTest, XTest = DTest[\"CLASSE\"], DTest.drop(columns=[\"CLASSE\"])\n", "#evaluation of prediction on testing dataset\n", "eval_test = clf2.eval_predict(XTest,yTest,verbose=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Backward selection" ] 
}, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "====================== Step 1 backward selection results =======================\n", " Wilks' Lambda Partial R-Square F Value Num DF Den DF Pr>F\n", "stearic 1.0 0.96804 8465.862059 2 559 0.0\n", "\n", "No variable can be removed\n", "\n", "\n", "Since only one feature is selected, CANDISC procedure cannot be updated.\n" ] }, { "data": { "text/html": [ "
STEPDISC(method='backward')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "STEPDISC(method='backward')" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#backward selection\n", "clf2 = STEPDISC(method=\"backward\",alpha=0.01,verbose=True)\n", "clf2.fit(clf)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Selected variables" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['stearic']\n" ] } ], "source": [ "#selected variables\n", "print(clf2.summary_.selected)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Summary" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Stepwise Discriminant Analysis - Results \n", "\n", "====================== Before backward selection =======================\n", "\n", " Canonical Discriminant Analysis - Results \n", "\n", "Summary Information:\n", " infos Value DF DF value\n", "0 Total Sample Size 569 DF Total 568\n", "1 Variables 8 DF Within Classes 566\n", "2 Classes 3 DF Between Classes 2\n", "\n", "Class Level Information:\n", " Frequency Proportion Prior Probability\n", "Centre_North 150 0.2636 0.2636\n", "Sardinia 97 0.1705 0.1705\n", "South 322 0.5659 0.5659\n", "\n", "Total-Sample Class Means:\n", " Centre_North Sardinia South\n", "palmitic 1094.8333 1112.0619 1332.3696\n", "palmitoleic 83.8933 96.3505 154.8882\n", "stearic 231.0400 226.3505 228.7081\n", "oleic 7791.9733 7266.9072 7099.5311\n", "linoleic 727.8800 1197.3608 1034.0093\n", "linolenic 21.7467 27.0103 38.0373\n", "arachidic 37.5467 73.0000 63.1025\n", "eicosenoic 1.9733 1.9278 27.3323\n", "\n", "Importance of components:\n", " Eigenvalue Difference Proportion Cumulative\n", "Can1 8.4718 6.1684 78.6232 78.6232\n", "Can2 2.3034 NaN 21.3768 100.0000\n", "\n", "Raw Canonical and Classification Functions Coefficients:\n", " Can1 Can2 Centre_North Sardinia South\n", 
"Constant -13.0646 -56.9169 -70.0899 194.6549 -37.1812\n", "palmitic 0.0028 0.0089 0.0072 -0.0344 0.0070\n", "palmitoleic 0.0131 0.0184 -0.0095 -0.0959 0.0333\n", "stearic -0.0028 0.0043 0.0171 -0.0029 -0.0071\n", "oleic 0.0006 0.0062 0.0094 -0.0199 0.0016\n", "linoleic 0.0011 -0.0013 -0.0061 -0.0001 0.0029\n", "linolenic 0.0411 0.0058 -0.1257 -0.1523 0.1045\n", "arachidic -0.0173 -0.0347 -0.0063 0.1565 -0.0442\n", "eicosenoic 0.1631 0.0101 -0.5231 -0.5673 0.4146\n", "\n", "No model has been updated.\n" ] } ], "source": [ "from discrimintools import summarySTEPDISC\n", "summarySTEPDISC(clf2)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" } }, "nbformat": 4, "nbformat_minor": 2 }