Source code for sknetwork.embedding.svd

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created in May 2018
@author: Nathan de Lara <nathan.delara@polytechnique.org>
@author: Thomas Bonald <bonald@enst.fr>
"""
from typing import Union

import numpy as np
from scipy import sparse

from sknetwork.embedding.base import BaseEmbedding
from sknetwork.linalg import SVDSolver, LanczosSVD, safe_sparse_dot, diagonal_pseudo_inverse, normalize, Regularizer, SparseLR
from sknetwork.utils.check import check_format, check_adjacency_vector, check_nonnegative, check_n_components


[docs]class GSVD(BaseEmbedding): """Graph embedding by Generalized Singular Value Decomposition of the adjacency or biadjacency matrix :math:`A`. This is equivalent to the Singular Value Decomposition of the matrix :math:`D_1^{- \\alpha_1}AD_2^{- \\alpha_2}` where :math:`D_1, D_2` are the diagonal matrices of row weights and columns weights, respectively, and :math:`\\alpha_1, \\alpha_2` are parameters. Parameters ----------- n_components : int Dimension of the embedding. regularization : ``None`` or float (default = ``None``) Regularization factor :math:`\\alpha` so that the matrix is :math:`A + \\alpha \\frac{11^T}{n}`. factor_row : float (default = 0.5) Power factor :math:`\\alpha_1` applied to the diagonal matrix of row weights. factor_col : float (default = 0.5) Power factor :math:`\\alpha_2` applied to the diagonal matrix of column weights. factor_singular : float (default = 0.) Parameter :math:`\\alpha` applied to the singular values on right singular vectors. The embedding of rows and columns are respectively :math:`D_1^{- \\alpha_1}U \\Sigma^{1-\\alpha}` and :math:`D_2^{- \\alpha_2}V \\Sigma^\\alpha` where: * :math:`U` is the matrix of left singular vectors, shape (n_row, n_components) * :math:`V` is the matrix of right singular vectors, shape (n_col, n_components) * :math:`\\Sigma` is the diagonal matrix of singular values, shape (n_components, n_components) normalized : bool (default = ``True``) If ``True``, normalized the embedding so that each vector has norm 1 in the embedding space, i.e., each vector lies on the unit sphere. solver : ``'lanczos'`` (Lanczos algorithm, default) or :class:`SVDSolver` (custom solver) Which solver to use. Attributes ---------- embedding_ : array, shape = (n, n_components) Embedding of the nodes. embedding_row_ : array, shape = (n_row, n_components) Embedding of the rows, for bipartite graphs. embedding_col_ : array, shape = (n_col, n_components) Embedding of the columns, for bipartite graphs. singular_values_ : np.ndarray, shape = (n_components) Singular values. singular_vectors_left_ : np.ndarray, shape = (n_row, n_components) Left singular vectors. singular_vectors_right_ : np.ndarray, shape = (n_col, n_components) Right singular vectors. weights_col_ : np.ndarray, shape = (n2) Weights applied to columns. Example ------- >>> from sknetwork.embedding import GSVD >>> from sknetwork.data import karate_club >>> gsvd = GSVD() >>> adjacency = karate_club() >>> embedding = gsvd.fit_transform(adjacency) >>> embedding.shape (34, 2) References ---------- Abdi, H. (2007). `Singular value decomposition (SVD) and generalized singular value decomposition. <https://www.cs.cornell.edu/cv/ResearchPDF/Generalizing%20The%20Singular%20Value%20Decomposition.pdf>`_ Encyclopedia of measurement and statistics, 907-912. """ def __init__(self, n_components=2, regularization: Union[None, float] = None, factor_row: float = 0.5, factor_col: float = 0.5, factor_singular: float = 0., normalized: bool = True, solver: Union[str, SVDSolver] = 'lanczos'): super(GSVD, self).__init__() self.n_components = n_components if regularization == 0: self.regularization = None else: self.regularization = regularization self.factor_row = factor_row self.factor_col = factor_col self.factor_singular = factor_singular self.normalized = normalized self.solver = solver self.singular_values_ = None self.singular_vectors_left_ = None self.singular_vectors_right_ = None self.regularization_ = None self.weights_col_ = None
[docs] def fit(self, input_matrix: Union[sparse.csr_matrix, np.ndarray]) -> 'GSVD': """Compute the embedding of the graph. Parameters ---------- input_matrix : Adjacency matrix or biadjacency matrix of the graph. Returns ------- self: :class:`GSVD` """ self._init_vars() adjacency = check_format(input_matrix).asfptype() n_row, n_col = adjacency.shape n_components = check_n_components(self.n_components, min(n_row, n_col) - 1) if isinstance(self.solver, str): self.solver = LanczosSVD() regularization = self.regularization if regularization: adjacency_reg = Regularizer(adjacency, regularization) else: adjacency_reg = adjacency weights_row = adjacency_reg.dot(np.ones(n_col)) weights_col = adjacency_reg.T.dot(np.ones(n_row)) diag_row = diagonal_pseudo_inverse(np.power(weights_row, self.factor_row)) diag_col = diagonal_pseudo_inverse(np.power(weights_col, self.factor_col)) self.solver.fit(safe_sparse_dot(diag_row, safe_sparse_dot(adjacency_reg, diag_col)), n_components) singular_values = self.solver.singular_values_ index = np.argsort(-singular_values) singular_values = singular_values[index] singular_vectors_left = self.solver.singular_vectors_left_[:, index] singular_vectors_right = self.solver.singular_vectors_right_[:, index] singular_left_diag = sparse.diags(np.power(singular_values, 1 - self.factor_singular)) singular_right_diag = sparse.diags(np.power(singular_values, self.factor_singular)) embedding_row = diag_row.dot(singular_vectors_left) embedding_col = diag_col.dot(singular_vectors_right) embedding_row = singular_left_diag.dot(embedding_row.T).T embedding_col = singular_right_diag.dot(embedding_col.T).T if self.normalized: embedding_row = normalize(embedding_row, p=2) embedding_col = normalize(embedding_col, p=2) self.embedding_row_ = embedding_row self.embedding_col_ = embedding_col self.embedding_ = embedding_row self.singular_values_ = singular_values self.singular_vectors_left_ = singular_vectors_left self.singular_vectors_right_ = singular_vectors_right self.weights_col_ = weights_col return self
@staticmethod def _check_adj_vector(adjacency_vectors): check_nonnegative(adjacency_vectors)
[docs] def predict(self, adjacency_vectors: Union[sparse.csr_matrix, np.ndarray]) -> np.ndarray: """Predict the embedding of new rows, defined by their adjacency vectors. Parameters ---------- adjacency_vectors : Adjacency vectors of nodes. Array of shape (n_col,) (single vector) or (n_vectors, n_col) Returns ------- embedding_vectors : np.ndarray Embedding of the nodes. """ self._check_fitted() singular_vectors_right = self.singular_vectors_right_ singular_values = self.singular_values_ n_row, _ = self.embedding_row_.shape n_col, _ = self.embedding_col_.shape adjacency_vectors = check_adjacency_vector(adjacency_vectors, n_col) self._check_adj_vector(adjacency_vectors) # regularization if self.regularization: adjacency_vectors = Regularizer(adjacency_vectors, self.regularization) # weighting weights_row = adjacency_vectors.dot(np.ones(n_col)) diag_row = diagonal_pseudo_inverse(np.power(weights_row, self.factor_row)) diag_col = diagonal_pseudo_inverse(np.power(self.weights_col_, self.factor_col)) adjacency_vectors = safe_sparse_dot(diag_row, safe_sparse_dot(adjacency_vectors, diag_col)) # projection in the embedding space averaging = adjacency_vectors embedding_vectors = diag_row.dot(averaging.dot(singular_vectors_right)) # scaling embedding_vectors /= np.power(singular_values, self.factor_singular) if self.normalized: embedding_vectors = normalize(embedding_vectors, p=2) if len(embedding_vectors) == 1: embedding_vectors = embedding_vectors.ravel() return embedding_vectors
[docs]class SVD(GSVD): """Graph embedding by Singular Value Decomposition of the adjacency or biadjacency matrix of the graph. Parameters ---------- n_components : int Dimension of the embedding. regularization : ``None`` or float (default = ``None``) Regularization factor :math:`\\alpha` so that the matrix is :math:`A + \\alpha \\frac{11^T}{n}`. factor_singular : float (default = 0.) Power factor :math:`\\alpha` applied to the singular values on right singular vectors. The embedding of rows and columns are respectively :math:`U \\Sigma^{1-\\alpha}` and :math:`V \\Sigma^\\alpha` where: * :math:`U` is the matrix of left singular vectors, shape (n_row, n_components) * :math:`V` is the matrix of right singular vectors, shape (n_col, n_components) * :math:`\\Sigma` is the diagonal matrix of singular values, shape (n_components, n_components) normalized : bool (default = ``False``) If ``True``, normalized the embedding so that each vector has norm 1 in the embedding space, i.e., each vector lies on the unit sphere. solver : ``'lanczos'`` (Lanczos algorithm, default) or :class:`SVDSolver` (custom solver) Which solver to use. Attributes ---------- embedding_ : array, shape = (n, n_components) Embedding of the nodes. embedding_row_ : array, shape = (n_row, n_components) Embedding of the rows, for bipartite graphs. embedding_col_ : array, shape = (n_col, n_components) Embedding of the columns, for bipartite graphs. singular_values_ : np.ndarray, shape = (n_components) Singular values. singular_vectors_left_ : np.ndarray, shape = (n_row, n_components) Left singular vectors. singular_vectors_right_ : np.ndarray, shape = (n_col, n_components) Right singular vectors. Example ------- >>> from sknetwork.embedding import SVD >>> from sknetwork.data import karate_club >>> svd = SVD() >>> adjacency = karate_club() >>> embedding = svd.fit_transform(adjacency) >>> embedding.shape (34, 2) References ---------- Abdi, H. (2007). `Singular value decomposition (SVD) and generalized singular value decomposition. <https://www.cs.cornell.edu/cv/ResearchPDF/Generalizing%20The%20Singular%20Value%20Decomposition.pdf>`_ Encyclopedia of measurement and statistics. """ def __init__(self, n_components=2, regularization: Union[None, float] = None, factor_singular: float = 0., normalized: bool = False, solver: Union[str, SVDSolver] = 'lanczos'): super(SVD, self).__init__(n_components=n_components, regularization=regularization, factor_singular=factor_singular, factor_row=0., factor_col=0., normalized=normalized, solver=solver) @staticmethod def _check_adj_vector(adjacency_vectors: np.ndarray): return
[docs]class PCA(SVD): """Graph embedding by Principal Component Analysis of the adjacency or biadjacency matrix. Parameters ---------- n_components : int Dimension of the embedding. normalized : bool (default = ``False``) If ``True``, normalized the embedding so that each vector has norm 1 in the embedding space, i.e., each vector lies on the unit sphere. solver : ``'lanczos'`` (Lanczos algorithm, default) or :class:`SVDSolver` (custom solver) Which solver to use. Attributes ---------- embedding_ : array, shape = (n, n_components) Embedding of the nodes. embedding_row_ : array, shape = (n_row, n_components) Embedding of the rows, for bipartite graphs. embedding_col_ : array, shape = (n_col, n_components) Embedding of the columns, for bipartite graphs. singular_values_ : np.ndarray, shape = (n_components) Singular values. singular_vectors_left_ : np.ndarray, shape = (n_row, n_components) Left singular vectors. singular_vectors_right_ : np.ndarray, shape = (n_col, n_components) Right singular vectors. Example ------- >>> from sknetwork.embedding import PCA >>> from sknetwork.data import karate_club >>> pca = PCA() >>> adjacency = karate_club() >>> embedding = pca.fit_transform(adjacency) >>> embedding.shape (34, 2) References ---------- Jolliffe, I.T. (2002). `Principal Component Analysis` Series: Springer Series in Statistics. """ def __init__(self, n_components=2, normalized: bool = False, solver: Union[str, SVDSolver] = 'lanczos'): super(PCA, self).__init__() self.n_components = n_components self.normalized = normalized if isinstance(solver, str): self.solver = LanczosSVD() else: self.solver = solver
[docs] def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'PCA': """Compute the embedding of the graph. Parameters ---------- adjacency : Adjacency or biadjacency matrix of the graph. Returns ------- self: :class:`PCA` """ adjacency = check_format(adjacency).asfptype() n_row, n_col = adjacency.shape adjacency_centered = SparseLR(adjacency, (-np.ones(n_row), adjacency.T.dot(np.ones(n_row)) / n_row)) svd = self.solver svd.fit(adjacency_centered, self.n_components) self.embedding_row_ = svd.singular_vectors_left_ self.embedding_col_ = svd.singular_vectors_right_ self.embedding_ = svd.singular_vectors_left_ self.singular_values_ = svd.singular_values_ self.singular_vectors_left_ = svd.singular_vectors_left_ self.singular_vectors_right_ = svd.singular_vectors_right_ return self