Source code for sknetwork.clustering.kmeans

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on October 2019
@author: Nathan de Lara <ndelara@enst.fr>
@author: Thomas Bonald <bonald@enst.fr>
"""
from typing import Union, Tuple

import numpy as np
from scipy import sparse

from sknetwork.clustering.base import BaseClustering
from sknetwork.clustering.postprocess import reindex_labels
from sknetwork.embedding.base import BaseEmbedding
from sknetwork.embedding.spectral import Spectral
from sknetwork.utils.format import is_square
from sknetwork.utils.check import check_n_clusters, check_format
from sknetwork.utils.kmeans import KMeansDense


def get_embedding(input_matrix: Union[sparse.csr_matrix, np.ndarray], method: BaseEmbedding,
                  co_embedding: bool = False) -> Tuple[np.ndarray, bool]:
    """Fit the embedding method on the input matrix and return the embedding.

    Parameters
    ----------
    input_matrix :
        Adjacency matrix or biadjacency matrix of the graph.
    method :
        Embedding method. It is fitted in place on ``input_matrix``.
    co_embedding : bool
        If ``True``, return the co-embedding of rows and columns, i.e. ``method.embedding_row_``
        stacked on top of ``method.embedding_col_``.
        Otherwise, return ``method.embedding_``.

    Returns
    -------
    embedding : np.ndarray
        Embedding computed by ``method``.
    bipartite : bool
        ``True`` if the input matrix is not square or if ``co_embedding`` is ``True``.
    """
    bipartite = (not is_square(input_matrix)) or co_embedding
    if co_embedding:
        try:
            method.fit(input_matrix, force_bipartite=True)
        except TypeError:
            # The embedding method does not accept the ``force_bipartite`` keyword:
            # fall back to a plain fit. Any other error is a genuine failure and propagates.
            method.fit(input_matrix)
        # Rows first, then columns.
        embedding = np.vstack((method.embedding_row_, method.embedding_col_))
    else:
        method.fit(input_matrix)
        embedding = method.embedding_
    return embedding, bipartite


class KMeans(BaseClustering):
    """K-means clustering applied in the embedding space.

    Parameters
    ----------
    n_clusters :
        Number of desired clusters (default = 2).
    embedding_method :
        Embedding method (default = Spectral embedding in dimension 10).
        If ``None``, a new ``Spectral(10)`` instance is created for this object.
    co_cluster :
        If ``True``, co-cluster rows and columns, considered as different nodes (default = ``False``).
    sort_clusters :
        If ``True``, sort labels in decreasing order of cluster size.
    return_membership :
        If ``True``, return the membership matrix of nodes to each cluster (soft clustering).
    return_aggregate :
        If ``True``, return the adjacency matrix of the graph between clusters.

    Attributes
    ----------
    labels_ : np.ndarray
        Labels of the nodes.
    labels_row_ : np.ndarray
        Labels of the rows (for bipartite graphs).
    labels_col_ : np.ndarray
        Labels of the columns (for bipartite graphs).
    membership_ : sparse.csr_matrix
        Membership matrix of the nodes, shape (n_nodes, n_clusters).
    membership_row_ : sparse.csr_matrix
        Membership matrix of the rows (for bipartite graphs).
    membership_col_ : sparse.csr_matrix
        Membership matrix of the columns (for bipartite graphs).
    aggregate_ : sparse.csr_matrix
        Aggregate adjacency matrix or biadjacency matrix between clusters.

    Example
    -------
    >>> from sknetwork.clustering import KMeans
    >>> from sknetwork.data import karate_club
    >>> kmeans = KMeans(n_clusters=3)
    >>> adjacency = karate_club()
    >>> labels = kmeans.fit_transform(adjacency)
    >>> len(set(labels))
    3
    """
    def __init__(self, n_clusters: int = 2, embedding_method: Union[BaseEmbedding, None] = None,
                 co_cluster: bool = False, sort_clusters: bool = True, return_membership: bool = True,
                 return_aggregate: bool = True):
        super().__init__(sort_clusters=sort_clusters, return_membership=return_membership,
                         return_aggregate=return_aggregate)
        self.n_clusters = n_clusters
        # Build the default embedding per instance: a default evaluated in the signature
        # would be a single object shared (and refitted) by every KMeans using the default.
        self.embedding_method = Spectral(10) if embedding_method is None else embedding_method
        self.co_cluster = co_cluster
        # Set by ``fit`` from the flag returned by ``get_embedding``.
        self.bipartite = None

    def fit(self, input_matrix: Union[sparse.csr_matrix, np.ndarray]) -> 'KMeans':
        """Apply embedding method followed by K-means.

        Parameters
        ----------
        input_matrix :
            Adjacency matrix or biadjacency matrix of the graph.

        Returns
        -------
        self: :class:`KMeans`
        """
        self._init_vars()

        # input
        # NOTE(review): the return value of check_format is discarded; confirm whether it
        # returns a converted matrix that should be used in the steps below.
        check_format(input_matrix)
        if self.co_cluster:
            # Rows and columns are distinct nodes, so clusters are bounded by n_row + n_col.
            check_n_clusters(self.n_clusters, np.sum(input_matrix.shape))
        else:
            check_n_clusters(self.n_clusters, input_matrix.shape[0])

        # embedding
        embedding, self.bipartite = get_embedding(input_matrix, self.embedding_method, self.co_cluster)

        # clustering
        kmeans = KMeansDense(self.n_clusters)
        kmeans.fit(embedding)

        # sort
        if self.sort_clusters:
            labels = reindex_labels(kmeans.labels_)
        else:
            labels = kmeans.labels_

        # output
        self.labels_ = labels
        if self.co_cluster:
            # The co-embedding stacks rows then columns; presumably this splits the labels
            # back into row and column parts (helper defined outside this file).
            self._split_vars(input_matrix.shape)
        self._secondary_outputs(input_matrix)

        return self