Source code for sknetwork.clustering.propagation_clustering

#!/usr/bin/env python3
# coding: utf-8
"""
Created on May, 2020
@author: Thomas Bonald <tbonald@enst.fr>
"""
from typing import Union

import numpy as np
from scipy import sparse

from sknetwork.classification.propagation import Propagation
from sknetwork.clustering.base import BaseClustering
from sknetwork.utils.format import check_format, get_adjacency


[docs]class PropagationClustering(BaseClustering, Propagation):
    """Clustering by label propagation.

    Parameters
    ----------
    n_iter : int
        Maximum number of iterations (-1 for infinity).
    node_order : str
        * `'random'`: node labels are updated in random order.
        * `'increasing'`: node labels are updated by increasing order of weight.
        * `'decreasing'`: node labels are updated by decreasing order of weight.
        * Otherwise, node labels are updated by index order.
    weighted : bool
        If ``True``, the vote of each neighbor is proportional to the edge weight.
        Otherwise, all votes have weight 1.
    sort_clusters : bool
        If ``True``, sort labels in decreasing order of cluster size.
    return_probs : bool
        If ``True``, return the probability distribution over clusters (soft clustering).
    return_aggregate : bool
        If ``True``, return the aggregate adjacency matrix or biadjacency matrix between clusters.

    Attributes
    ----------
    labels_ : np.ndarray, shape (n_labels,)
        Label of each node.
    probs_ : sparse.csr_matrix, shape (n_row, n_labels)
        Probability distribution over labels.
    labels_row_, labels_col_ : np.ndarray
        Labels of rows and columns, for bipartite graphs.
    probs_row_, probs_col_ : sparse.csr_matrix, shape (n_row, n_labels)
        Probability distributions over labels for rows and columns (for bipartite graphs).
    aggregate_ : sparse.csr_matrix
        Aggregate adjacency matrix or biadjacency matrix between clusters.

    Example
    -------
    >>> from sknetwork.clustering import PropagationClustering
    >>> from sknetwork.data import karate_club
    >>> propagation = PropagationClustering()
    >>> graph = karate_club(metadata=True)
    >>> adjacency = graph.adjacency
    >>> labels = propagation.fit_predict(adjacency)
    >>> len(set(labels))
    2

    References
    ----------
    Raghavan, U. N., Albert, R., & Kumara, S. (2007).
    `Near linear time algorithm to detect community structures in large-scale networks.
    <https://arxiv.org/pdf/0709.2938.pdf>`_
    Physical review E, 76(3), 036106.
    """
    def __init__(self, n_iter: int = 5, node_order: str = 'decreasing', weighted: bool = True,
                 sort_clusters: bool = True, return_probs: bool = True, return_aggregate: bool = True):
        Propagation.__init__(self, n_iter, node_order, weighted)
        BaseClustering.__init__(self, sort_clusters, return_probs, return_aggregate)
        self.bipartite = None

[docs]    def fit(self, input_matrix: Union[sparse.csr_matrix, np.ndarray]) -> 'PropagationClustering':
        """Clustering by label propagation.

        Parameters
        ----------
        input_matrix : sparse.csr_matrix, np.ndarray
            Adjacency matrix or biadjacency matrix of the graph.

        Returns
        -------
        self: :class:`PropagationClustering`
        """
        self._init_vars()

        # input
        input_matrix = check_format(input_matrix)
        adjacency, bipartite = get_adjacency(input_matrix)

        # propagation
        Propagation.fit(self, adjacency)

        # output
        _, self.labels_ = np.unique(self.labels_, return_inverse=True)
        if bipartite:
            self._split_vars(input_matrix.shape)
            self.bipartite = True
        self._secondary_outputs(input_matrix)

        return self