# -*- coding: utf-8 -*-
"""
Autoprot Analysis Functions.

@author: Wignand, Julian, Johannes

@documentation: Julian
"""
import os
import warnings
from typing import Union, Literal

import matplotlib
import matplotlib.colors as clrs
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gprofiler import GProfiler
from numpy.typing import ArrayLike
from scipy import cluster as clst
from scipy.spatial import distance
from scipy.stats import zscore
from sklearn import cluster as clstsklearn
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

from .. import r_helper

gp = GProfiler(
    user_agent="autoprot",
    return_dataframe=True)
RFUNCTIONS, R = r_helper.return_r_path()

# check where this is actually used and make it local
cmap = sns.diverging_palette(150, 275, s=80, l=55, n=9)


class _Cluster:
    r"""
    Base class for clustering pipelines.
    """

    def __init__(self, data: Union[np.ndarray, pd.DataFrame], clabels: Union[None, list] = None,
                 rlabels: Union[None, list] = None, zs: Union[None, int] = None,
                 linkage: Union[None, ArrayLike] = None):
        """
        Initialise the class.

        Parameters
        ----------
        data : np.ndarray or pd.DataFrame
            The data to be clustered.
        clabels : list or None
            Column labels. Must be present in the input df.
            Defaults to a RangeIndex (0, 1, 2, ..., n).
        rlabels : list or None
            Row labels. Must be present in the input df.
            Defaults to a RangeIndex if no indexing information is part of
            the input data and no index is provided.
        zs : int or None, optional
            Axis along which to calculate the zscore.
            The default is None.
        linkage : scipy.cluster.hierarchy.linkage object, optional
            Precalculated linkage object.
            The default is None.

        Returns
        -------
        None.

        """

        def _sanitize_data(data: Union[np.ndarray, pd.DataFrame], clabels: list, rlabels: list,
                           zs: Union[int, None]) -> tuple[np.ndarray, list, list]:
            """
            Check if data contains missing values and remove them.

            Parameters
            ----------
            data : np.ndarray or pd.DataFrame
                The data to be clustered.
            clabels : list
                Column labels.
            rlabels : list
                Row labels.
            zs : int or None
                Axis along which to calculate the zscore.
                The default is None.

            Returns
            -------
            np.ndarray, list, list
                The cleaned data, row labels and column labels.

            Raises
            ------
            ValueError
                If the data is not a DataFrame or np.ndarray.
            """

            # make sure this is a DataFrame
            dataframe = pd.DataFrame(data, index=rlabels, columns=clabels)

            # if the zscore is to be calculated (i.e. if zs != None)
            # a dataframe with zscores instead of values is calculated
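            # e.g. zs=0 computes z-scores down each column and zs=1 across each
            # row; zscore([[1., 2.], [3., 4.]], axis=1) -> [[-1., 1.], [-1., 1.]]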
            if zs is not None:
                temp = dataframe.copy(deep=True).to_numpy()
                temp_transformed = zscore(temp, axis=zs)
                dataframe = pd.DataFrame(temp_transformed, index=dataframe.index, columns=dataframe.columns)

            print(f'Removing {dataframe.isnull().values.sum()} NaN values from the dataframe to prepare for clustering.')
            # no NaN values may remain during cluster analysis: drop every
            # column that contains missing values
            dataframe.dropna(how='any', axis=1, inplace=True)

            return dataframe.values, dataframe.index.tolist(), dataframe.columns.tolist()

        self.data, self.rlabels, self.clabels = _sanitize_data(data=data, clabels=clabels, rlabels=rlabels, zs=zs)

        # the linkage object for hierarchical clustering
        self.linkage = linkage
        # the number of clusters
        self.nclusters = None
        # list of len(data) with IDs of clusters corresponding to rows
        self.clusterId = None
        # the standard colormap
        self.cmap = matplotlib.colormaps['viridis']
        # type of clustering (base class is None)
        self.type = None

    def vis_cluster(self, col_cluster=False, make_traces=False, make_heatmap=False, file=None, row_colors=None,
                    colors: list = None, ytick_labels="", ret_figs: bool = False, make_clustermap: bool = True,
                    **kwargs):
        """
        Visualise the clustering.

        Parameters
        ----------
        make_clustermap : bool, optional
            Whether to make a clustermap. The default is True.
        col_cluster : bool, optional
            Whether to cluster the columns. The default is False.
        make_traces : bool, optional
            Whether to generate traces of each cluster. The default is False.
        make_heatmap : bool, optional
            Whether to generate a summary heatmap.
            The default is False.
        file : str, optional
            Path to the output plot file. The default is None.
        row_colors : dict, optional
            Dictionary mapping a row title to a list of colours.
            The list must have the same length as the data has rows.
            Generates an additional column in the heatmap showing
            the indicated column's values as colours.
            The default is None.
        colors : list of str, optional
            Colors for the annotated clusters.
            Has to be the same size as the number of clusters.
            The default is None.
        ytick_labels : list of str, optional
            Labels for the y ticks. The default is "".
        ret_figs : bool, optional
            Whether to return the figure. The default is False.
        **kwargs :
            Passed to seaborn.clustermap.
            See https://seaborn.pydata.org/generated/seaborn.clustermap.html
            May also contain 'z_score', which is then also used when
            generating the cluster traces.

        Returns
        -------
        figs : list of matplotlib.figure.Figure or seaborn.matrix.ClusterGrid or None
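
        Examples
        --------
        A minimal sketch (assuming a fitted HCA or KMeans instance ``c`` and
        a colour series ``labels`` with one colour per data row):

        >>> c.vis_cluster(row_colors={'species': labels},
        ...               make_traces=True, make_heatmap=True)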

        """

        def make_cluster_traces(file, colors: list, zs=None, ret_fig: bool = False):
            """
            Plot RMSD vs colname line plots.

            Shaded areas representing groups of RMSDs are plotted.

            Parameters
            ----------
            file : str
                Filename with extension to save the file to.
                Will be extended to FNAME_traces.EXT.
            colors : list of str or None
                Colours for the traces. If None, the same predefined colours
                will be used for all n traces.
            zs : int or None, optional
                Axis along which to standardise the data by z-score transformation.
                The default is None.
            ret_fig : bool, optional
                Whether to return the figure. The default is False.

            Returns
            -------
            fig : matplotlib.figure.Figure or None
            """
            fig = plt.figure(figsize=(5, 5 * self.nclusters))
            temp = pd.DataFrame(self.data.copy())
            if zs is not None:
                # seaborn's z_score kwarg uses 0 for rows and 1 for columns,
                # whereas scipy's zscore takes the axis to compute along, hence 1 - zs
                temp = pd.DataFrame(zscore(temp, axis=1 - zs))
            temp["cluster"] = self.clusterId
            labels = list(set(self.clusterId))
            for idx, i in enumerate(labels):
                ax = plt.subplot(self.nclusters, 1, idx + 1)
                temp2 = temp[temp["cluster"] == i].drop("cluster", axis=1)
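                # score each row by closeness to the cluster mean: the negative
                # log of the Euclidean distance, so larger values mean the row
                # lies nearer the cluster centre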
                temp2["distance"] = temp2.apply(lambda x: -np.log(np.sqrt(sum((x - temp2.mean()) ** 2))), 1)

                if temp2.shape[0] == 1:
                    ax.set_title(f"Cluster {i}")
                    ax.set_ylabel("")
                    ax.set_xlabel("")
                    ax.plot(range(temp2.shape[1] - 1), temp2.drop("distance", axis=1).values.reshape(-1))

                    plt.xticks(range(len(self.clabels)), self.clabels)
                    continue
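                # bin the rows into 5 bands by distance to the cluster mean;
                # bands closer to the mean are drawn more opaquely below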
                temp2["distance"] = pd.cut(temp2["distance"], 5)
                if colors is None:
                    color = ["#C72119", "#D67155", "#FFC288", "#FFE59E", "#FFFDBF"]
                else:
                    color = [colors[i]] * 5
                color = color[::-1]
                alpha = [0.1, 0.2, 0.25, 0.4, 0.6]
                grouped = temp2.groupby("distance", observed=False)
                ax.set_title(f"Cluster {i}")
                if zs is None:
                    ax.set_ylabel("value")
                else:
                    ax.set_ylabel("z-score")
                ax.set_xlabel("Condition")
                for jdx, (_, group) in enumerate(grouped):
                    for j in range(group.shape[0]):
                        ax.plot(range(temp2.shape[1] - 1), group.drop("distance", axis=1).iloc[j], color=color[jdx],
                                alpha=alpha[jdx])

                plt.xticks(range(len(self.clabels)), self.clabels, rotation=90)
                plt.tight_layout()
                if file is not None:
                    name, ext = os.path.splitext(file)
                    filet = f"{name}_traces{ext}"
                    plt.savefig(filet)

            if ret_fig:
                return fig
            else:
                return None

        def make_cluster_heatmap(file=None, ret_fig: bool = False):
            """
            Make summary heatmap of clustering.

            Parameters
            ----------
            file : str
                Path to write summary.
            ret_fig : bool, optional
                Whether to return the figure. The default is False.

            Returns
            -------
            fig : matplotlib.figure.Figure or None
            """
            temp = pd.DataFrame(self.data, index=self.rlabels, columns=self.clabels)
            temp["cluster"] = self.clusterId
            grouped = temp.groupby("cluster")[self.clabels].mean()
            # label each summary row with its cluster ID and size
            ylabel = [f"Cluster {i} (n={j})" for i, j in temp.groupby("cluster").size().items()]

            fig = plt.figure()
            plt.title("Summary Of Clustering")
            sns.heatmap(grouped, cmap=self.cmap)
            plt.yticks([i + 0.5 for i in range(len(ylabel))], ylabel, rotation=0)
            plt.tight_layout()
            if file is not None:
                name, ext = os.path.splitext(file)
                filet = f"{name}_summary{ext}"
                plt.savefig(filet)

            if ret_fig:
                return fig
            else:
                return None

        # initialise the figure variables
        clustermap = None
        traces = None
        heatmap = None

        # map each cluster ID onto a colour for the clustermap row annotation
        norm = clrs.Normalize(vmin=self.clusterId.min(), vmax=self.clusterId.max())
        if colors is not None and len(colors) == self.nclusters:
            cmap = clrs.ListedColormap(colors)
            mapper = plt.cm.ScalarMappable(norm=norm, cmap=cmap)
        else:
            mapper = plt.cm.ScalarMappable(norm=norm, cmap=self.cmap)
        a = mapper.to_rgba(self.clusterId)
        # noinspection PyTypeChecker
        cluster_colors = np.apply_along_axis(func1d=clrs.to_hex, axis=1, arr=a)
        if "cmap" not in kwargs.keys():
            kwargs["cmap"] = self.cmap
        if row_colors is not None:
            row_colors_df = pd.DataFrame(row_colors)
            row_colors_df['Cluster'] = cluster_colors
            row_colors_df.index = self.rlabels
        else:
            row_colors_df = pd.DataFrame(cluster_colors, columns=['Cluster'], index=self.rlabels)

        if make_clustermap:
            value_type = 'z-score' if "z_score" in kwargs else 'value'
            if self.type == 'KMeans':
                # KMeans has no linkage and should not be re-clustered by seaborn
                clustermap = sns.clustermap(data=pd.DataFrame(self.data, index=self.rlabels, columns=self.clabels),
                                            row_colors=row_colors_df, row_cluster=False, col_cluster=False,
                                            dendrogram_ratio=0.05, cbar_pos=None, yticklabels=ytick_labels, **kwargs)
            else:
                clustermap = sns.clustermap(pd.DataFrame(self.data, index=self.rlabels, columns=self.clabels),
                                            row_linkage=self.linkage,
                                            row_colors=row_colors_df, col_cluster=col_cluster, yticklabels=ytick_labels,
                                            cbar_kws={'label': value_type}, **kwargs)

        if file is not None:
            plt.savefig(file)
        if make_traces:
            if "z_score" in kwargs:
                traces = make_cluster_traces(file, zs=kwargs["z_score"], colors=colors, ret_fig=ret_figs)
            else:
                traces = make_cluster_traces(file, colors=colors, ret_fig=ret_figs)
        if make_heatmap:
            heatmap = make_cluster_heatmap(file, ret_fig=ret_figs)

        if ret_figs:
            figs = [x for x in [clustermap, traces, heatmap] if x is not None]
            return figs[0] if len(figs) == 1 else figs

    def return_cluster(self):
        """Return dataframe with clustered data."""
        temp = pd.DataFrame(self.data, index=self.rlabels, columns=self.clabels)
        temp["cluster"] = self.clusterId
        return temp
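
    # Hypothetical usage, assuming a fitted instance ``c``:
    #     clustered = c.return_cluster()
    #     clustered.groupby("cluster").mean()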

    def write_cluster_files(self, root_dir):
        """
        Generate a folder with text files for each cluster.

        Parameters
        ----------
        root_dir : str
            Path to target dir.
            If the folder is named clusterResults, text files will be saved
            within.
            Else a new folder clusterResults will be created.

        Returns
        -------
        None.
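
        Examples
        --------
        A hypothetical call, assuming the clustering has been run; one file
        per cluster is written to ``root_dir/clusterResults/``:

        >>> c.write_cluster_files('results')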

        """
        path = os.path.join(root_dir, "clusterResults")
        os.makedirs(path, exist_ok=True)

        temp = pd.DataFrame(self.data, index=self.rlabels, columns=self.clabels)
        temp["cluster"] = self.clusterId
        for cluster in temp["cluster"].unique():
            pd.DataFrame(temp[temp["cluster"] == cluster].index).to_csv(f"{path}/cluster_{cluster}.tsv", header=False,
                                                                        index=False)

    def clustering_evaluation(self, pred, figsize, start, up_to, plot: bool):
        """Summarise the cluster scores and set self.nclusters to the best Davies-Bouldin result."""
        pred = np.array(pred)
        print(f"Best Davies-Bouldin score at {start + list(pred[:, 0]).index(min(pred[:, 0]))} with {min(pred[:, 0])}")
        print(f"Best Silhouette score at {start + list(pred[:, 1]).index(max(pred[:, 1]))} with {max(pred[:, 1])}")
        print(f"Best Calinski-Harabasz score at {start + list(pred[:, 2]).index(max(pred[:, 2]))} with {max(pred[:, 2])}")
        self.nclusters = start + list(pred[:, 0]).index(min(pred[:, 0]))
        print(f"Using the Davies-Bouldin score for setting the number of clusters: {self.nclusters}")
        print("You may manually overwrite this by setting self.nclusters")
        if plot:
            plt.figure(figsize=figsize)
            plt.subplot(131)
            plt.title("Davies-Bouldin score")
            plt.plot(pred[:, 0])
            plt.xticks(range(up_to - start), [str(x) for x in range(start, up_to)], rotation=90)
            plt.grid(axis='x')
            plt.subplot(132)
            plt.title("Silhouette score")
            plt.plot(pred[:, 1])
            plt.xticks(range(up_to - start), [str(x) for x in range(start, up_to)], rotation=90)
            plt.grid(axis='x')
            plt.subplot(133)
            plt.title("Calinski-Harabasz score")
            plt.plot(pred[:, 2])
            plt.xticks(range(up_to - start), [str(x) for x in range(start, up_to)], rotation=90)
            plt.grid(axis='x')


class HCA(_Cluster):
    # noinspection PyUnresolvedReferences
    r"""
    Conduct hierarchical cluster analysis.

    Notes
    -----
    The user provides a dataframe and can afterwards apply various metrics
    and methods to perform and evaluate the clustering.

    Standard workflow: make_linkage() -> find_nclusters() -> make_cluster()

    Examples
    --------
    First grab a dataset that will be used for clustering such as the iris
    dataset. Extract the species labelling from the dataframe as it cannot
    be used for clustering; it will be used later to evaluate the result.
    Initialise the clustering class with the data, find the optimum number
    of clusters and generate the final clustering with the auto_run method.

    .. plot::
        :context: close-figs

        df = sns.load_dataset('iris')
        labels = df.pop('species')
        c = ana.HCA(df)
        c.auto_run()

    Finally, visualise the clustering using the vis_cluster method and
    include the previously extracted labelling column from the original
    dataframe.

    .. plot::
        :context: close-figs

        labels.replace(['setosa', 'virginica', 'versicolor'], ["teal", "purple", "salmon"], inplace=True)
        rc = {"species" : labels}
        c.vis_cluster(row_colors={'species': labels})

    HCA separates setosa quite well, but virginica and versicolor are
    harder. When we manually pick the true number of clusters, HCA performs
    only slightly better on this dataset.
    Note that you can change the default cmap for the class by changing the
    cmap attribute.

    .. plot::
        :context: close-figs

        c.nclusters = 3
        c.make_cluster()
        c.cmap = 'coolwarm'
        c.vis_cluster(row_colors={'species': labels}, make_traces=True, file=None, make_heatmap=True)
    """

    def __init__(self, *args, **kwargs):
        """Initialise the subclass and set the type."""
        super().__init__(*args, **kwargs)
        self.type = 'HCA'

    def make_linkage(self, method='single',
                     metric: Literal['braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', 'cosine',
                                     'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon', 'kulczynski1',
                                     'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao',
                                     'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule',
                                     'spearman', 'pearson', 'kendall'] = 'euclidean'):
        """
        Perform hierarchical clustering on the data.

        Parameters
        ----------
        method : str
            Which method is used for the clustering.
            Possible are 'single', 'average' and 'complete' and all values
            accepted as method by scipy.cluster.hierarchy.linkage.
            See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
        metric : str or function
            Which metric is used to calculate distance.
            Possible values are 'pearson', 'spearman', 'kendall' and all
            metrics implemented in scipy.spatial.distance.pdist.
            See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html

        Returns
        -------
        None.
        """

        def as_dist(c):
            # noinspection PyUnresolvedReferences
            """
            Convert a matrix (i.e. a correlation matrix) into a condensed
            distance matrix for hierarchical clustering.

            Parameters
            ----------
            c : np.ndarray
                Input matrix.

            Returns
            -------
            list
                List corresponding to the upper-triangular (off-diagonal)
                elements of the input matrix.

            Examples
            --------
            >>> a = np.array([[ 1.        , -0.35153114, -0.74736506, -0.48917666],
            ...               [-0.35153114,  1.        ,  0.23810227,  0.15958285],
            ...               [-0.74736506,  0.23810227,  1.        , -0.03960706],
            ...               [-0.48917666,  0.15958285, -0.03960706,  1.        ]])
            >>> as_dist(a)
            [-0.35153114, -0.74736506, -0.48917666, 0.23810227, 0.15958285, -0.03960706]
            """
            return [c[i][j] for i in (range(c.shape[0])) for j in (range(c.shape[1])) if i < j]

        if self.linkage is not None:
            warnings.warn('Linkage is already present, using the already defined linkage. If you want to reset the '
                          'linkage, manually set HCA.linkage = None', UserWarning)
            # leave the function
            return None

        # first calculate a distance metric between the points
        if metric in {"pearson", "spearman", "kendall"}:
            metric: Literal["pearson", "spearman", "kendall"]
            corr = pd.DataFrame(self.data).T.corr(metric).values
            dist = as_dist(1 - corr)
        else:
            # noinspection PyTypeChecker
            dist = distance.pdist(X=self.data, metric=metric)

        # perform hierarchical clustering using the distance metric
        # the returned matrix self.linkage contains (n-1) x 4 elements
        # with each row representing
        # cluster1, cluster2, distance_between_1_and_2,
        # number_of_observations_in_the_cluster
        self.linkage = clst.hierarchy.linkage(dist, method=method)
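
    # A minimal usage sketch (hypothetical dataframe ``df``; assumes a
    # correlation-based distance is wanted):
    #     c = HCA(df)
    #     c.make_linkage(method='average', metric='pearson')
    #     c.find_nclusters()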

    def find_nclusters(self, start=2, up_to=20, figsize=(15, 5), plot=True):
        """
        Evaluate the number of clusters.

        Parameters
        ----------
        start : int, optional
            The minimum number of clusters to plot. The default is 2.
        up_to : int, optional
            The maximum number of clusters to plot. The default is 20.
        figsize : tuple of float or int, optional
            The size of the plotted figure. The default is (15, 5).
        plot : bool, optional
            Whether to plot the corresponding figures for the cluster scores.

        Notes
        -----
        Davies-Bouldin score:
            The score is defined as the average similarity measure of each
            cluster with its most similar cluster, where similarity is the
            ratio of within-cluster distances to between-cluster distances.
            Thus, clusters which are farther apart and less dispersed will
            result in a better score.
            The minimum score is zero, with lower values indicating better
            clustering.
        Silhouette score:
            The Silhouette Coefficient is calculated using the mean
            intra-cluster distance (a) and the mean nearest-cluster distance
            (b) for each sample. The Silhouette Coefficient for a sample is
            (b - a) / max(a, b). To clarify, b is the distance between a
            sample and the nearest cluster that the sample is not a part of.
            Note that the Silhouette Coefficient is only defined if the
            number of labels is 2 <= n_labels <= n_samples - 1.
            The best value is 1 and the worst value is -1. Values near 0
            indicate overlapping clusters. Negative values generally
            indicate that a sample has been assigned to the wrong cluster,
            as a different cluster is more similar.
        Calinski-Harabasz score:
            Also known as the Variance Ratio Criterion. The score is defined
            as the ratio of between-cluster dispersion to within-cluster
            dispersion; higher values indicate better-defined clusters.

        Returns
        -------
        None.
        """
        up_to += 1
        pred = []
        for i in range(start, up_to):
            # return the assigned cluster labels for each data point
            cluster = clst.hierarchy.fcluster(self.linkage, t=i, criterion='maxclust')
            # calculate scores based on assigned cluster labels and
            # the original data points
            pred.append((davies_bouldin_score(self.data, cluster),
                         silhouette_score(self.data, cluster),
                         calinski_harabasz_score(self.data, cluster)))
        self.clustering_evaluation(pred, figsize, start, up_to, plot)

    def make_cluster(self):
        """
        Form flat clusters from the hierarchical clustering of linkage.

        Returns
        -------
        None.
        """
        if self.nclusters is None:
            raise AttributeError('No. of clusters is None. Perform find_nclusters before.')

        # self.clusterId is an array of length x
        # with x = number of original data points containing the ID
        # of the corresponding cluster
        self.clusterId = \
            clst.hierarchy.fcluster(self.linkage,  # the hierarchical clustering
                                    t=self.nclusters,  # max number of clusters
                                    criterion="maxclust")  # forms a maximum of n=t clusters
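
    # Note: scipy's fcluster labels clusters starting at 1, whereas the
    # k-means labels produced by the KMeans subclass start at 0.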

    def auto_run(self, start_processing=1, stop_processing=5):
        """
        Automatically run the clustering pipeline with standard settings.

        Parameters
        ----------
        start_processing : int, optional
            Step of the pipeline to start. The default is 1.
        stop_processing : int, optional
            Step of the pipeline to stop. The default is 5.

        Notes
        -----
        The pipeline currently consists of
        (1) make_linkage, (2) find_nclusters and (3) make_cluster.

        Returns
        -------
        None.
        """
        if start_processing <= 1:
            self.make_linkage()
        if start_processing <= 2 <= stop_processing:
            self.find_nclusters()
        if start_processing <= 3 <= stop_processing:
            self.make_cluster()


class KMeans(_Cluster):
    # noinspection PyUnresolvedReferences
    """
    Perform KMeans clustering on a dataset.

    Returns
    -------
    None.

    Notes
    -----
    By default the class uses scipy.cluster.vq.kmeans2
    (see https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.vq.kmeans2.html#scipy.cluster.vq.kmeans2);
    sklearn.cluster.KMeans can be selected via the algo argument.

    References
    ----------
    D. Arthur and S. Vassilvitskii,
    "k-means++: the advantages of careful seeding",
    Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete
    Algorithms, 2007.

    Examples
    --------
    First grab a dataset that will be used for clustering such as the iris
    dataset. Extract the species labelling from the dataframe as it cannot
    be used for clustering; it will be used later to evaluate the result.
    Initialise the clustering class with the data, find the optimum number
    of clusters and generate the final clustering with the auto_run method.

    .. plot::
        :context: close-figs

        df = sns.load_dataset('iris')
        labels = df.pop('species')
        c = ana.KMeans(df)
        c.auto_run()

    Finally, visualise the clustering using the vis_cluster method and
    include the previously extracted labelling column from the original
    dataframe.

    .. plot::
        :context: close-figs

        labels.replace(['setosa', 'virginica', 'versicolor'], ["teal", "purple", "salmon"], inplace=True)
        rc = {"species" : labels}
        c.vis_cluster(row_colors={'species': labels})

    As you can see, KMeans separates setosa quite well, but virginica and
    versicolor are harder. When we manually pick the number of clusters, it
    gets a bit better.

    .. plot::
        :context: close-figs

        c.nclusters = 3
        c.make_cluster()
        c.vis_cluster(row_colors={'species': labels}, make_traces=True, file=None, make_heatmap=True)
    """

    def __init__(self, *args, **kwargs):
        """Initialise the subclass and set the type."""
        super().__init__(*args, **kwargs)
        self.type = 'KMeans'

    def find_nclusters(self, start=2, up_to=20, figsize=(15, 5), plot=True, algo='scipy'):
        """
        Evaluate the number of clusters.

        Parameters
        ----------
        start : int, optional
            The minimum number of clusters to plot. The default is 2.
        up_to : int, optional
            The maximum number of clusters to plot. The default is 20.
        figsize : tuple of float or int, optional
            The size of the plotted figure. The default is (15, 5).
        plot : bool, optional
            Whether to plot the corresponding figures for the cluster scores.
        algo : str, optional
            Algorithm to use for KMeans clustering.
            Either "scipy" or "sklearn".

        Notes
        -----
        Davies-Bouldin score:
            The score is defined as the average similarity measure of each
            cluster with its most similar cluster, where similarity is the
            ratio of within-cluster distances to between-cluster distances.
            Thus, clusters which are farther apart and less dispersed will
            result in a better score.
            The minimum score is zero, with lower values indicating better
            clustering.
        Silhouette score:
            The Silhouette Coefficient is calculated using the mean
            intra-cluster distance (a) and the mean nearest-cluster distance
            (b) for each sample. The Silhouette Coefficient for a sample is
            (b - a) / max(a, b). To clarify, b is the distance between a
            sample and the nearest cluster that the sample is not a part of.
            Note that the Silhouette Coefficient is only defined if the
            number of labels is 2 <= n_labels <= n_samples - 1.
            The best value is 1 and the worst value is -1. Values near 0
            indicate overlapping clusters. Negative values generally
            indicate that a sample has been assigned to the wrong cluster,
            as a different cluster is more similar.
        Calinski-Harabasz score:
            Also known as the Variance Ratio Criterion. The score is defined
            as the ratio of between-cluster dispersion to within-cluster
            dispersion; higher values indicate better-defined clusters.

        Returns
        -------
        None.
        """
        up_to += 1
        pred = []
        for i in range(start, up_to):
            if algo == 'scipy':
                # return the assigned cluster labels for each data point
                _, cluster = clst.vq.kmeans2(data=self.data, k=i, minit='++')
            elif algo == 'sklearn':
                model = clstsklearn.KMeans(n_clusters=i)
                model.fit(self.data)
                cluster = model.labels_
            else:
                raise ValueError('Provide either "sklearn" or "scipy" as parameter for the algo kwarg.')
            # calculate scores based on assigned cluster labels and
            # the original data points
            pred.append((davies_bouldin_score(self.data, cluster),
                         silhouette_score(self.data, cluster),
                         calinski_harabasz_score(self.data, cluster)))
        self.clustering_evaluation(pred, figsize, start, up_to, plot)

    def make_cluster(self, algo='scipy', **kwargs):
        """
        Perform k-means clustering and store the resulting labels in self.clusterId.

        Parameters
        ----------
        algo : str, optional
            Algorithm to use for KMeans clustering.
            Either "scipy" or "sklearn".
        **kwargs :
            Passed to either the scipy or sklearn kmeans implementation.

        Returns
        -------
        None.
        """
        if algo == 'scipy':
            centroids, self.clusterId = clst.vq.kmeans2(data=self.data, k=self.nclusters, minit='++', **kwargs)
        elif algo == 'sklearn':
            # initialise the model
            model = clstsklearn.KMeans(n_clusters=self.nclusters, n_init='auto', **kwargs)
            model.fit(self.data)
            self.clusterId = model.labels_
        else:
            raise ValueError('Provide either "sklearn" or "scipy" as parameter for the algo kwarg.')
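
    # Hypothetical follow-up, assuming nclusters has already been set
    # (e.g. via find_nclusters):
    #     c.make_cluster(algo='sklearn')
    #     c.return_cluster().head()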

    def auto_run(self, start_processing=1, stop_processing=5):
        """
        Automatically run the clustering pipeline with standard settings.

        Parameters
        ----------
        start_processing : int, optional
            Step of the pipeline to start. The default is 1.
        stop_processing : int, optional
            Step of the pipeline to stop. The default is 5.

        Notes
        -----
        The pipeline currently consists of
        (1) find_nclusters and (2) make_cluster.

        Returns
        -------
        None.
        """
        if start_processing <= 1:
            self.find_nclusters()
        if start_processing <= 2 <= stop_processing:
            self.make_cluster()