Source code for sknetwork.clustering.metrics

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu July 10 2018
@author: Nathan de Lara <ndelara@enst.fr>
@author: Thomas Bonald <bonald@enst.fr>
"""
from typing import Union, Tuple

import numpy as np
from scipy import sparse

from sknetwork.linalg.normalization import diag_pinv
from sknetwork.utils.check import check_format, get_probs, check_square
from sknetwork.utils.format import bipartite2directed
from sknetwork.utils.membership import membership_matrix


[docs]def modularity(adjacency: Union[sparse.csr_matrix, np.ndarray], labels: np.ndarray,
               weights: Union[str, np.ndarray] = 'degree', weights_in: Union[str, np.ndarray] = 'degree',
               resolution: float = 1, return_all: bool = False) -> Union[float, Tuple[float, float, float]]:
    """Modularity of a clustering.

    The modularity of a clustering is

    :math:`Q = \\dfrac{1}{w} \\sum_{i,j}\\left(A_{ij} - \\gamma \\dfrac{d_id_j}{w}\\right)\\delta_{c_i,c_j}`
    for graphs,

    :math:`Q = \\dfrac{1}{w} \\sum_{i,j}\\left(A_{ij} - \\gamma \\dfrac{d^+_id^-_j}{w}\\right)\\delta_{c_i,c_j}`
    for directed graphs,

    where

    * :math:`c_i` is the cluster of node :math:`i`,\n
    * :math:`d_i` is the weight of node :math:`i`,\n
    * :math:`d^+_i, d^-_i` are the out-weight, in-weight of node :math:`i` (for digraphs),\n
    * :math:`w = 1^TA1` is the total weight,\n
    * :math:`\\delta` is the Kronecker symbol,\n
    * :math:`\\gamma \\ge 0` is the resolution parameter.

    Parameters
    ----------
    adjacency:
        Adjacency matrix of the graph.
    labels:
        Labels of nodes, vector of size :math:`n` .
    weights :
        Weights of nodes.
        ``'degree'`` (default), ``'uniform'`` or custom weights.
    weights_in :
        In-weights of nodes.
        ``None`` (default), ``'degree'``, ``'uniform'`` or custom weights.
        If ``None``, taken equal to weights.
    resolution:
        Resolution parameter (default = 1).
    return_all:
        If ``True``, return modularity, fit, diversity.

    Returns
    -------
    modularity : float
    fit: float, optional
    diversity: float, optional

    Example
    -------
    >>> from sknetwork.clustering import modularity
    >>> from sknetwork.data import house
    >>> adjacency = house()
    >>> labels = np.array([0, 0, 1, 1, 0])
    >>> np.round(modularity(adjacency, labels), 2)
    0.11
    """
    adjacency = check_format(adjacency).astype(float)
    check_square(adjacency)

    if len(labels) != adjacency.shape[0]:
        raise ValueError('Dimension mismatch between labels and adjacency matrix.')

    probs_row = get_probs(weights, adjacency)
    probs_col = get_probs(weights_in, adjacency.T)
    membership = membership_matrix(labels)

    fit: float = membership.multiply(adjacency.dot(membership)).data.sum() / adjacency.data.sum()
    div: float = membership.T.dot(probs_col).dot(membership.T.dot(probs_row))
    mod: float = fit - resolution * div
    if return_all:
        return mod, fit, div
    else:
        return mod


[docs]def bimodularity(biadjacency: Union[sparse.csr_matrix, np.ndarray], labels: np.ndarray, labels_col: np.ndarray,
                 weights: Union[str, np.ndarray] = 'degree', weights_col: Union[str, np.ndarray] = 'degree',
                 resolution: float = 1, return_all: bool = False) -> Union[float, Tuple[float, float, float]]:
    """Bimodularity of the clustering (for bipartite graphs).

    The bimodularity of a clustering is

    :math:`Q = \\sum_{i}\\sum_{j}\\left(\\dfrac{B_{ij}}{w} - \\gamma \\dfrac{d_{1,i}d_{2,j}}{w^2}\\right)
    \\delta_{c_{1,i},c_{2,j}}`

    where

    * :math:`c_{1,i}, c_{2,j}` are the clusters of nodes :math:`i` (row) and :math:`j` (column),\n
    * :math:`d_{1,i}, d_{2,j}` are the weights of nodes :math:`i` (row) and :math:`j` (column),\n
    * :math:`w = 1^TB1` is the total weight,\n
    * :math:`\\delta` is the Kronecker symbol,\n
    * :math:`\\gamma \\ge 0` is the resolution parameter.

    Parameters
    ----------
    biadjacency :
        Biadjacency matrix of the graph (shape :math:`n_1 \\times n_2`).
    labels :
        Labels of rows, vector of size :math:`n_1`.
    labels_col:
        Labels of columns, vector of size :math:`n_2`.
    weights :
        Weights of nodes.
        ``'degree'`` (default), ``'uniform'`` or custom weights.
    weights_col :
        Weights of columns.
        ``'degree'`` (default), ``'uniform'`` or custom weights.
    resolution:
        Resolution parameter (default = 1).
    return_all:
        If ``True``, return modularity, fit, diversity.

    Returns
    -------
    modularity : float
    fit: float, optional
    diversity: float, optional

    Example
    -------
    >>> from sknetwork.clustering import bimodularity
    >>> from sknetwork.data import star_wars
    >>> biadjacency = star_wars()
    >>> labels = np.array([1, 1, 0, 0])
    >>> labels_col = np.array([1, 0, 0])
    >>> np.round(bimodularity(biadjacency, labels, labels_col), 2)
    0.22
    """
    biadjacency = check_format(biadjacency).astype(float)
    n_row, n_col = biadjacency.shape

    if len(labels) != n_row:
        raise ValueError('Dimension mismatch between labels and biadjacency matrix.')
    if len(labels_col) != n_col:
        raise ValueError('Dimension mismatch between labels_col and biadjacency matrix.')

    adjacency = bipartite2directed(biadjacency)

    weights_ = get_probs(weights, biadjacency)
    weights_ = np.hstack((weights_, np.zeros(n_col)))
    weights_col_ = get_probs(weights_col, biadjacency.T)
    weights_col_ = np.hstack((np.zeros(n_row), weights_col_))

    labels_ = np.hstack((labels, labels_col))

    return modularity(adjacency, labels_, weights_, weights_col_, resolution, return_all)


[docs]def comodularity(adjacency: Union[sparse.csr_matrix, np.ndarray], labels: np.ndarray, resolution: float = 1,
                 return_all: bool = False) -> Union[float, Tuple[float, float, float]]:
    """Modularity of a clustering in the normalized co-neighborhood graph.

    Quality metric of a clustering given by:

    :math:`Q = \\dfrac{1}{w}\\sum_{i,j}\\left((AD_2^{-1}A^T)_{ij} - \\gamma \\dfrac{d_id_j}{w}\\right)
    \\delta_{c_i,c_j}`

    where

    * :math:`c_i` is the cluster of node `i`,\n
    * :math:`D_2` is the diagonal matrix of the weights of columns,\n
    * :math:`\\delta` is the Kronecker symbol,\n
    * :math:`\\gamma \\ge 0` is the resolution parameter.

    Parameters
    ----------
    adjacency :
        Adjacency matrix or biadjacency matrix of the graph.
    labels :
       Labels of the nodes.
    resolution :
        Resolution parameter (default = 1).
    return_all :
        If ``True``, return modularity, fit, diversity.

    Returns
    -------
    modularity : float
    fit : float, optional
    diversity: float, optional

    Example
    -------
    >>> from sknetwork.clustering import comodularity
    >>> from sknetwork.data import house
    >>> adjacency = house()
    >>> labels = np.array([0, 0, 1, 1, 0])
    >>> np.round(comodularity(adjacency, labels), 2)
    0.06

    Notes
    -----
    Does not require the computation of the adjacency matrix of the normalized co-neighborhood graph.
    """
    adjacency = check_format(adjacency).astype(float)

    n_row, n_col = adjacency.shape
    total_weight = adjacency.data.sum()
    probs = adjacency.dot(np.ones(n_col)) / total_weight

    weights_col = adjacency.T.dot(np.ones(n_col))
    diag_col = diag_pinv(np.sqrt(weights_col))
    normalized_adjacency = (adjacency.dot(diag_col)).T.tocsr()

    if len(labels) != n_row:
        raise ValueError('The number of labels must match the number of rows.')

    membership = membership_matrix(labels)
    fit: float = ((normalized_adjacency.dot(membership)).data ** 2).sum() / total_weight
    div: float = np.linalg.norm(membership.T.dot(probs)) ** 2
    mod: float = fit - resolution * div

    if return_all:
        return mod, fit, div
    else:
        return mod


[docs]def normalized_std(labels: np.ndarray) -> float:
    """Normalized standard deviation of cluster sizes.

    A score of 1 means perfectly balanced clustering.

    Parameters
    ----------
    labels :
        Labels of nodes.

    Returns
    -------
    float

    Example
    -------
    >>> from sknetwork.clustering import normalized_std
    >>> labels = np.array([0, 0, 1, 1])
    >>> normalized_std(labels)
    1.0
    """
    n = labels.shape[0]
    unique_clusters, counts = np.unique(labels, return_counts=True)
    n_clust = len(unique_clusters)
    if n_clust < 2:
        raise ValueError('There must be at least two different clusters.')
    return 1 - np.std(counts) / np.sqrt(n ** 2 * (n_clust - 1) / n_clust ** 2)