Source code for sknetwork.clustering.kcenters

"""
Created in March 2024
@author: Laurène David <laurene.david@ip-paris.fr>
@author: Thomas Bonald <bonald@enst.fr>
"""

from typing import Union

import numpy as np
from scipy import sparse

from sknetwork.clustering import BaseClustering
from sknetwork.ranking import PageRank
from sknetwork.clustering import get_modularity
from sknetwork.classification.pagerank import PageRankClassifier
from sknetwork.utils.format import get_adjacency, directed2undirected


[docs]class KCenters(BaseClustering):
    """K-center clustering algorithm. The center of each cluster is obtained by the PageRank algorithm.

    Parameters
    ----------
    n_clusters : int
        Number of clusters.
    directed : bool, default False
        If ``True``, the graph is considered directed.
    center_position : str, default "row"
        Force centers to correspond to the nodes on the rows or columns of the biadjacency matrix.
        Can be ``row``, ``col`` or ``both``. Only considered for bipartite graphs.
    n_init : int, default 5
        Number of reruns of the k-centers algorithm with different centers.
        The run that produce the best modularity is chosen as the final result.
    max_iter : int, default 20
        Maximum number of iterations of the k-centers algorithm for a single run.

    Attributes
    ----------
    labels_ : np.ndarray, shape (n_nodes,)
        Label of each node.
    labels_row_, labels_col_ : np.ndarray
        Labels of rows and columns, for bipartite graphs.
    centers_ : np.ndarray, shape (n_nodes,)
        Cluster centers.
    centers_row_, centers_col_ : np.ndarray
        Cluster centers of rows and columns, for bipartite graphs.

    Example
    -------
    >>> from sknetwork.clustering import KCenters
    >>> from sknetwork.data import karate_club
    >>> kcenters = KCenters(n_clusters=2)
    >>> adjacency = karate_club()
    >>> labels = kcenters.fit_predict(adjacency)
    >>> len(set(labels))
    2

    """
    def __init__(self, n_clusters: int, directed: bool = False, center_position: str = "row", n_init: int = 5,
                 max_iter: int = 20):
        super(BaseClustering, self).__init__()
        self.n_clusters = n_clusters
        self.directed = directed
        self.bipartite = None
        self.center_position = center_position
        self.n_init = n_init
        self.max_iter = max_iter
        self.labels_ = None
        self.centers_ = None
        self.centers_row_ = None
        self.centers_col_ = None

    def _compute_mask_centers(self, input_matrix: Union[sparse.csr_matrix, np.ndarray]):
        """Generate mask to filter nodes that can be cluster centers.

        Parameters
        ----------
        input_matrix :
            Adjacency matrix or biadjacency matrix of the graph.

        Return
        ------
        mask : np.array, shape (n_nodes,)
            Mask for possible cluster centers.

        """
        n_row, n_col = input_matrix.shape
        if self.bipartite:
            n_nodes = n_row + n_col
            mask = np.zeros(n_nodes, dtype=bool)
            if self.center_position == "row":
                mask[:n_row] = True
            elif self.center_position == "col":
                mask[n_row:] = True
            elif self.center_position == "both":
                mask[:] = True
            else:
                raise ValueError('Unknown center position')
        else:
            mask = np.ones(n_row, dtype=bool)

        return mask

    @staticmethod
    def _init_centers(adjacency: Union[sparse.csr_matrix, np.ndarray], mask: np.ndarray, n_clusters: int):
        """
        Kcenters++ initialization to select cluster centers.
        This algorithm is an adaptation of the Kmeans++ algorithm to graphs.

        Parameters
        ----------
        adjacency :
            Adjacency matrix of the graph.
        mask :
            Initial mask for allowed positions of centers.
        n_clusters : int
            Number of centers to initialize.

        Returns
        ---------
        centers : np.array, shape (n_clusters,)
            Initial cluster centers.
        """
        mask = mask.copy()
        n_nodes = adjacency.shape[0]
        nodes = np.arange(n_nodes)
        centers = []

        # Choose the first center uniformly at random
        center = np.random.choice(nodes[mask])
        mask[center] = 0
        centers.append(center)

        pagerank = PageRank()
        weights = {center: 1}

        for k in range(n_clusters - 1):
            # select nodes that are far from existing centers
            ppr_scores = pagerank.fit_predict(adjacency, weights)
            ppr_scores = ppr_scores[mask]

            if min(ppr_scores) == 0:
                center = np.random.choice(nodes[mask][ppr_scores == 0])
            else:
                probs = 1 / ppr_scores
                probs = probs / np.sum(probs)
                center = np.random.choice(nodes[mask], p=probs)

            mask[center] = 0
            centers.append(center)
            weights.update({center: 1})

        centers = np.array(centers)
        return centers

[docs]    def fit(self, input_matrix: Union[sparse.csr_matrix, np.ndarray], force_bipartite: bool = False) -> "KCenters":
        """Compute the clustering of the graph by k-centers.

        Parameters
        ----------
        input_matrix :
            Adjacency matrix or biadjacency matrix of the graph.
        force_bipartite :
            If ``True``, force the input matrix to be considered as a biadjacency matrix even if square.

        Returns
        -------
        self : :class:`KCenters`
        """

        if self.n_clusters < 2:
            raise ValueError("The number of clusters must be at least 2.")

        if self.n_init < 1:
            raise ValueError("The n_init parameter must be at least 1.")

        if self.directed:
            input_matrix = directed2undirected(input_matrix)

        adjacency, self.bipartite = get_adjacency(input_matrix, force_bipartite=force_bipartite)
        n_row = input_matrix.shape[0]
        n_nodes = adjacency.shape[0]
        nodes = np.arange(n_nodes)

        mask = self._compute_mask_centers(input_matrix)
        if self.n_clusters > np.sum(mask):
            raise ValueError("The number of clusters is to high. This might be due to the center_position parameter.")

        pagerank_clf = PageRankClassifier()
        pagerank = PageRank()

        labels_ = []
        centers_ = []
        modularity_ = []

        # Restarts
        for i in range(self.n_init):

            # Initialization
            centers = self._init_centers(adjacency, mask, self.n_clusters)
            prev_centers = None
            labels = None
            n_iter = 0

            while not np.equal(prev_centers, centers).all() and (n_iter < self.max_iter):

                # Assign nodes to centers
                labels_center = {center: label for label, center in enumerate(centers)}
                labels = pagerank_clf.fit_predict(adjacency, labels_center)

                # Find new centers
                prev_centers = centers.copy()
                new_centers = []

                for label in np.unique(labels):
                    mask_cluster = labels == label
                    mask_cluster &= mask
                    scores = pagerank.fit_predict(adjacency, weights=mask_cluster)
                    scores[~mask_cluster] = 0
                    new_centers.append(nodes[np.argmax(scores)])

                n_iter += 1

            # Store results
            if self.bipartite:
                labels_row = labels[:n_row]
                labels_col = labels[n_row:]
                modularity = get_modularity(input_matrix, labels_row, labels_col)
            else:
                modularity = get_modularity(adjacency, labels)

            labels_.append(labels)
            centers_.append(centers)
            modularity_.append(modularity)

        # Select restart with the highest modularity
        idx_max = np.argmax(modularity_)
        self.labels_ = np.array(labels_[idx_max])
        self.centers_ = np.array(centers_[idx_max])

        if self.bipartite:
            self._split_vars(input_matrix.shape)

            # Define centers based on center position
            if self.center_position == "row":
                self.centers_row_ = self.centers_
            elif self.center_position == "col":
                self.centers_col_ = self.centers_ - n_row
            else:
                self.centers_row_ = self.centers_[self.centers_ < n_row]
                self.centers_col_ = self.centers_[~np.isin(self.centers_, self.centers_row_)] - n_row

        return self