#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu July 10 2018
@author: Nathan de Lara <ndelara@enst.fr>
@author: Thomas Bonald <bonald@enst.fr>
"""
from typing import Union, Tuple
import numpy as np
from scipy import sparse
from sknetwork.linalg.normalization import diag_pinv
from sknetwork.utils.check import check_format, get_probs, check_square
from sknetwork.utils.format import bipartite2directed
from sknetwork.utils.membership import membership_matrix
[docs]def modularity(adjacency: Union[sparse.csr_matrix, np.ndarray], labels: np.ndarray,
weights: Union[str, np.ndarray] = 'degree', weights_in: Union[str, np.ndarray] = 'degree',
resolution: float = 1, return_all: bool = False) -> Union[float, Tuple[float, float, float]]:
"""Modularity of a clustering.
The modularity of a clustering is
:math:`Q = \\dfrac{1}{w} \\sum_{i,j}\\left(A_{ij} - \\gamma \\dfrac{d_id_j}{w}\\right)\\delta_{c_i,c_j}`
for graphs,
:math:`Q = \\dfrac{1}{w} \\sum_{i,j}\\left(A_{ij} - \\gamma \\dfrac{d^+_id^-_j}{w}\\right)\\delta_{c_i,c_j}`
for directed graphs,
where
* :math:`c_i` is the cluster of node :math:`i`,\n
* :math:`d_i` is the weight of node :math:`i`,\n
* :math:`d^+_i, d^-_i` are the out-weight, in-weight of node :math:`i` (for digraphs),\n
* :math:`w = 1^TA1` is the total weight,\n
* :math:`\\delta` is the Kronecker symbol,\n
* :math:`\\gamma \\ge 0` is the resolution parameter.
Parameters
----------
adjacency:
Adjacency matrix of the graph.
labels:
Labels of nodes, vector of size :math:`n` .
weights :
Weights of nodes.
``'degree'`` (default), ``'uniform'`` or custom weights.
weights_in :
In-weights of nodes.
``None`` (default), ``'degree'``, ``'uniform'`` or custom weights.
If ``None``, taken equal to weights.
resolution:
Resolution parameter (default = 1).
return_all:
If ``True``, return modularity, fit, diversity.
Returns
-------
modularity : float
fit: float, optional
diversity: float, optional
Example
-------
>>> from sknetwork.clustering import modularity
>>> from sknetwork.data import house
>>> adjacency = house()
>>> labels = np.array([0, 0, 1, 1, 0])
>>> np.round(modularity(adjacency, labels), 2)
0.11
"""
adjacency = check_format(adjacency).astype(float)
check_square(adjacency)
if len(labels) != adjacency.shape[0]:
raise ValueError('Dimension mismatch between labels and adjacency matrix.')
probs_row = get_probs(weights, adjacency)
probs_col = get_probs(weights_in, adjacency.T)
membership = membership_matrix(labels)
fit: float = membership.multiply(adjacency.dot(membership)).data.sum() / adjacency.data.sum()
div: float = membership.T.dot(probs_col).dot(membership.T.dot(probs_row))
mod: float = fit - resolution * div
if return_all:
return mod, fit, div
else:
return mod
[docs]def bimodularity(biadjacency: Union[sparse.csr_matrix, np.ndarray], labels: np.ndarray, labels_col: np.ndarray,
weights: Union[str, np.ndarray] = 'degree', weights_col: Union[str, np.ndarray] = 'degree',
resolution: float = 1, return_all: bool = False) -> Union[float, Tuple[float, float, float]]:
"""Bimodularity of the clustering (for bipartite graphs).
The bimodularity of a clustering is
:math:`Q = \\sum_{i}\\sum_{j}\\left(\\dfrac{B_{ij}}{w} - \\gamma \\dfrac{d_{1,i}d_{2,j}}{w^2}\\right)
\\delta_{c_{1,i},c_{2,j}}`
where
* :math:`c_{1,i}, c_{2,j}` are the clusters of nodes :math:`i` (row) and :math:`j` (column),\n
* :math:`d_{1,i}, d_{2,j}` are the weights of nodes :math:`i` (row) and :math:`j` (column),\n
* :math:`w = 1^TB1` is the total weight,\n
* :math:`\\delta` is the Kronecker symbol,\n
* :math:`\\gamma \\ge 0` is the resolution parameter.
Parameters
----------
biadjacency :
Biadjacency matrix of the graph (shape :math:`n_1 \\times n_2`).
labels :
Labels of rows, vector of size :math:`n_1`.
labels_col:
Labels of columns, vector of size :math:`n_2`.
weights :
Weights of nodes.
``'degree'`` (default), ``'uniform'`` or custom weights.
weights_col :
Weights of columns.
``'degree'`` (default), ``'uniform'`` or custom weights.
resolution:
Resolution parameter (default = 1).
return_all:
If ``True``, return modularity, fit, diversity.
Returns
-------
modularity : float
fit: float, optional
diversity: float, optional
Example
-------
>>> from sknetwork.clustering import bimodularity
>>> from sknetwork.data import star_wars
>>> biadjacency = star_wars()
>>> labels = np.array([1, 1, 0, 0])
>>> labels_col = np.array([1, 0, 0])
>>> np.round(bimodularity(biadjacency, labels, labels_col), 2)
0.22
"""
biadjacency = check_format(biadjacency).astype(float)
n_row, n_col = biadjacency.shape
if len(labels) != n_row:
raise ValueError('Dimension mismatch between labels and biadjacency matrix.')
if len(labels_col) != n_col:
raise ValueError('Dimension mismatch between labels_col and biadjacency matrix.')
adjacency = bipartite2directed(biadjacency)
weights_ = get_probs(weights, biadjacency)
weights_ = np.hstack((weights_, np.zeros(n_col)))
weights_col_ = get_probs(weights_col, biadjacency.T)
weights_col_ = np.hstack((np.zeros(n_row), weights_col_))
labels_ = np.hstack((labels, labels_col))
return modularity(adjacency, labels_, weights_, weights_col_, resolution, return_all)
[docs]def comodularity(adjacency: Union[sparse.csr_matrix, np.ndarray], labels: np.ndarray, resolution: float = 1,
return_all: bool = False) -> Union[float, Tuple[float, float, float]]:
"""Modularity of a clustering in the normalized co-neighborhood graph.
Quality metric of a clustering given by:
:math:`Q = \\dfrac{1}{w}\\sum_{i,j}\\left((AD_2^{-1}A^T)_{ij} - \\gamma \\dfrac{d_id_j}{w}\\right)
\\delta_{c_i,c_j}`
where
* :math:`c_i` is the cluster of node `i`,\n
* :math:`D_2` is the diagonal matrix of the weights of columns,\n
* :math:`\\delta` is the Kronecker symbol,\n
* :math:`\\gamma \\ge 0` is the resolution parameter.
Parameters
----------
adjacency :
Adjacency matrix or biadjacency matrix of the graph.
labels :
Labels of the nodes.
resolution :
Resolution parameter (default = 1).
return_all :
If ``True``, return modularity, fit, diversity.
Returns
-------
modularity : float
fit : float, optional
diversity: float, optional
Example
-------
>>> from sknetwork.clustering import comodularity
>>> from sknetwork.data import house
>>> adjacency = house()
>>> labels = np.array([0, 0, 1, 1, 0])
>>> np.round(comodularity(adjacency, labels), 2)
0.06
Notes
-----
Does not require the computation of the adjacency matrix of the normalized co-neighborhood graph.
"""
adjacency = check_format(adjacency).astype(float)
n_row, n_col = adjacency.shape
total_weight = adjacency.data.sum()
probs = adjacency.dot(np.ones(n_col)) / total_weight
weights_col = adjacency.T.dot(np.ones(n_col))
diag_col = diag_pinv(np.sqrt(weights_col))
normalized_adjacency = (adjacency.dot(diag_col)).T.tocsr()
if len(labels) != n_row:
raise ValueError('The number of labels must match the number of rows.')
membership = membership_matrix(labels)
fit: float = ((normalized_adjacency.dot(membership)).data ** 2).sum() / total_weight
div: float = np.linalg.norm(membership.T.dot(probs)) ** 2
mod: float = fit - resolution * div
if return_all:
return mod, fit, div
else:
return mod
[docs]def normalized_std(labels: np.ndarray) -> float:
"""Normalized standard deviation of cluster sizes.
A score of 1 means perfectly balanced clustering.
Parameters
----------
labels :
Labels of nodes.
Returns
-------
float
Example
-------
>>> from sknetwork.clustering import normalized_std
>>> labels = np.array([0, 0, 1, 1])
>>> normalized_std(labels)
1.0
"""
n = labels.shape[0]
unique_clusters, counts = np.unique(labels, return_counts=True)
n_clust = len(unique_clusters)
if n_clust < 2:
raise ValueError('There must be at least two different clusters.')
return 1 - np.std(counts) / np.sqrt(n ** 2 * (n_clust - 1) / n_clust ** 2)