# Source code for discrimintools.datasets.load_dataset

# -*- coding: utf-8 -*-
from pandas import read_excel, read_csv
from pathlib import Path

# Directory of bundled example data: the "data" folder next to this module file.
DATASETS_DIR = Path(__file__).parent.joinpath("data")


# [docs]
def load_dataset(name="iris"):
    """
    Load an example dataset

    Note
    ----
    This function provides quick access to a small number of example datasets that are useful.
    Each dataset is read from a file stored in ``DATASETS_DIR``, and a short description
    (with a usage example for most datasets) is attached to the returned object as ``data.__doc__``.

    Parameters
    ----------
    name : str, default = 'iris'
        The name of the dataset. Possible values are:

        - 'breast' for the Breast Cancer dataset.
        - 'cultivar' for the Chemical composition of three cultivars of wine.
        - 'fish' for the fish dataset
        - 'iris' for the iris dataset
        - 'sonar' for the sonar dataset
        - 'wavenoise' for the wavenoise dataset

    Returns
    -------
    data : DataFrame of shape (n_samples, n_columns)
        The dataset loaded.

    Raises
    ------
    ValueError
        If ``name`` is not one of the possible values listed above.
    """
    if name == "breast":
        # The source sheet names its target column "class"; it is exposed as "Class".
        data = read_excel(DATASETS_DIR/"breast_cancer.xlsx",header=0,sheet_name=0).rename(columns={"class" : "Class"})
        data.__doc__ = """
        Breast Cancer dataset

        """
    elif name == "cultivar":
        data = read_excel(DATASETS_DIR/"cultivar.xlsx",header=0,sheet_name=0)
        data.__doc__= """
        Wine: Chemical composition of three cultivars of wine

        For more, see https://archive.ics.uci.edu/dataset/109/wine

        Examples
        --------
        >>> from discrimintools.datasets import load_dataset
        >>> from discrimintools import CANDISC
        >>> D = load_dataset("cultivar") # load data
        >>> y, X = D["Cultivar"], D.drop(columns=["Cultivar"]) # split into X and y
        >>> clf = CANDISC()
        >>> clf.fit(X,y)
        CANDISC()
        """
    elif name == "fish":
        # The first column of the sheet is used as the row index.
        data = read_excel(DATASETS_DIR/"fish.xlsx",header=0,index_col=0)
        data.__doc__ = """
        Fish dataset

        Note
        ----
        The data in this example are measurements of 159 fish caught in Finland's Lake Laengelmaevesi; this data set
        is available from the Puranen. For each of the seven species (bream, roach, whitefish, parkki, perch, pike, and
        smelt), the weight, length, height, and width of each fish are tallied. Three different length measurements are
        recorded: from the nose of the fish to the beginning of its tail, from the nose to the notch of its tail, and from
        the nose to the end of its tail. The height and width are recorded as percentages of the third length variable.

        """
    elif name == "iris":
        data = read_csv(DATASETS_DIR/"iris.csv",header=0,sep=",")
        data.__doc__ = """
        Iris dataset

        Examples
        --------
        >>> from discrimintools.datasets import load_dataset
        >>> from discrimintools import CANDISC
        >>> D = load_dataset("iris") # load data
        >>> y, X = D["Species"], D.drop(columns=["Species"]) # split into X and y
        >>> clf = CANDISC()
        >>> clf.fit(X,y)
        CANDISC()
        """
    elif name == "sonar":
        data = read_excel(DATASETS_DIR/"sonar.xlsx",sheet_name="Feuil1",header=0,index_col=None)
        data.__doc__ = """
        Sonar dataset

        Examples
        --------
        >>> from discrimintools.datasets import load_dataset
        >>> from discrimintools import DISCRIM, STEPDISC
        >>> DTrain = load_dataset("sonar") # load data
        >>> yTrain, XTrain = DTrain["Class"], DTrain.drop(columns=["Class"]) # split into X and y
        >>> #linear discriminant analysis (LDA)
        >>> clf = DISCRIM()
        >>> clf.fit(XTrain,yTrain)
        DISCRIM(priors="prop")
        >>> #stepwise discriminant analysis (STEPDISC)
        >>> clf2 = STEPDISC(method="backward")
        >>> clf2.fit(clf)
        STEPDISC(method="backward")
        """
    elif name == "wavenoise":
        data = read_excel(DATASETS_DIR/"wavenoise.xlsx",sheet_name="Feuil1",header=0,index_col=None)
        data.__doc__ = """
        Wave Noise dataset

        Examples
        --------
        >>> from discrimintools.datasets import load_dataset
        >>> from discrimintools import DISCRIM, STEPDISC
        >>> D = load_dataset("wavenoise") # load data
        >>> y, X = D["classe"], D.drop(columns=["classe"]) # split into X and y
        >>> #linear discriminant analysis (LDA)
        >>> clf = DISCRIM()
        >>> clf.fit(X,y)
        DISCRIM(priors='prop')
        >>> #stepwise discriminant analysis (STEPDISC)
        >>> clf2 = STEPDISC(method="backward")
        >>> clf2.fit(clf)
        STEPDISC(method="backward")
        """
    else:
        # Spell out the accepted names so a typo in `name` is easy to correct.
        supported = ("breast", "cultivar", "fish", "iris", "sonar", "wavenoise")
        raise ValueError(
            "{} not supported. Possible values are: {}".format(
                name, ", ".join(repr(option) for option in supported)
            )
        )

    return data