Source code for sknetwork.linalg.ppr_solver

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Apr 2020
@author: Nathan de Lara <ndelara@enst.fr>
"""
from typing import Union

import numpy as np
from scipy import sparse
from scipy.sparse.linalg import eigs, LinearOperator, bicgstab

from sknetwork.linalg.diteration import diffusion
from sknetwork.linalg.push import push_pagerank
from sknetwork.linalg.normalization import normalize
from sknetwork.linalg.polynome import Polynome


class RandomSurferOperator(LinearOperator):
    """Random surfer as a LinearOperator

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph as a CSR or a LinearOperator.
    damping_factor : float
        Probability to continue the random walk.
    seeds :
        Probability vector for seeds.

    Attributes
    ----------
    a : sparse.csr_matrix
        Scaled transposed transition matrix.
    b : np.ndarray
        Scaled restart probability vector.
    """

    def __init__(self, adjacency: Union[sparse.csr_matrix, LinearOperator], seeds: np.ndarray, damping_factor):
        super(RandomSurferOperator, self).__init__(shape=adjacency.shape, dtype=float)

        n = adjacency.shape[0]
        out_degrees = adjacency.dot(np.ones(n)).astype(bool)

        if hasattr(adjacency, 'left_sparse_dot'):
            self.a = damping_factor * normalize(adjacency).T
        else:
            self.a = (damping_factor * normalize(adjacency)).T.tocsr()
        self.b = (np.ones(n) - damping_factor * out_degrees) * seeds

    def _matvec(self, x: np.ndarray):
        return self.a.dot(x) + self.b * x.sum()


[docs]def get_pagerank(adjacency: Union[sparse.csr_matrix, LinearOperator], seeds: np.ndarray, damping_factor: float,
                 n_iter: int, tol: float = 1e-6, solver: str = 'piteration') -> np.ndarray:
    """Solve the Pagerank problem. Formally,

    :math:`x = \\alpha Px + (1-\\alpha)y`,

    where :math:`P = (D^{-1}A)^T` is the transition matrix and :math:`y` is the personalization probability vector.

    Parameters
    ----------
    adjacency : sparse.csr_matrix
        Adjacency matrix of the graph.
    seeds : np.ndarray
        Personalization array. Must be a valid probability vector.
    damping_factor : float
        Probability to continue the random walk.
    n_iter : int
        Number of iterations for some of the solvers such as ``'piteration'`` or ``'diteration'``.
    tol : float
        Tolerance for the convergence of some solvers such as ``'bicgstab'`` or ``'lanczos'`` or ``'push'``.
    solver : :obj:`str`
        Which solver to use: ``'piteration'``, ``'diteration'``, ``'bicgstab'``, ``'lanczos'``, ``̀'RH'``, ``'push'``.

    Returns
    -------
    pagerank : np.ndarray
        Probability vector.

    Examples
    --------
    >>> from sknetwork.data import house
    >>> adjacency = house()
    >>> seeds = np.array([1, 0, 0, 0, 0])
    >>> scores = get_pagerank(adjacency, seeds, damping_factor=0.85, n_iter=10)
    >>> np.round(scores, 2)
    array([0.29, 0.24, 0.12, 0.12, 0.24])


    References
    ----------
    * Hong, D. (2012). `Optimized on-line computation of pagerank algorithm.
      <https://arxiv.org/pdf/1202.6158.pdf>`_
      arXiv preprint arXiv:1202.6158.
    * Van der Vorst, H. A. (1992). `Bi-CGSTAB:
      <https://en.wikipedia.org/wiki/Biconjugate_gradient_stabilized_method>`_
      A fast and smoothly converging variant of Bi-CG for the solution of nonsymmetric linear systems.
      SIAM Journal on scientific and Statistical Computing, 13(2), 631-644.
    * Lanczos, C. (1950).
      `An iteration method for the solution of the eigenvalue problem of linear differential and integral operators.
      <http://www.cs.umd.edu/~oleary/lanczos1950.pdf>`_
      Los Angeles, CA: United States Governm. Press Office.
    * Whang, J. , Lenharth, A. , Dhillon, I. , & Pingali, K. . (2015).
      `Scalable Data-Driven PageRank: Algorithms, System Issues, and Lessons Learned`. 9233, 438-450.
      <https://www.cs.utexas.edu/users/inderjit/public_papers/scalable_pagerank_europar15.pdf>
    """
    n = adjacency.shape[0]

    if solver == 'diteration':
        if not isinstance(adjacency, sparse.csr_matrix):
            raise ValueError('D-iteration is not compatible with linear operators.')
        adjacency = normalize(adjacency, p=1)
        indptr = adjacency.indptr.astype(np.int32)
        indices = adjacency.indices.astype(np.int32)
        data = adjacency.data.astype(np.float32)
        damping_factor = np.float32(damping_factor)
        n_iter = np.int32(n_iter)
        tol = np.float32(tol)

        scores = np.zeros(n, dtype=np.float32)
        fluid = (1 - damping_factor) * seeds.astype(np.float32)
        diffusion(indptr, indices, data, scores, fluid, damping_factor, n_iter, tol)

    elif solver == 'push':
        n = adjacency.shape[0]
        damping_factor = np.float32(damping_factor)
        tol = np.float32(tol)
        degrees = adjacency.dot(np.ones(n)).astype(np.int32)
        rev_adjacency = adjacency.transpose().tocsr()

        indptr = adjacency.indptr.astype(np.int32)
        indices = adjacency.indices.astype(np.int32)
        rev_indptr = rev_adjacency.indptr.astype(np.int32)
        rev_indices = rev_adjacency.indices.astype(np.int32)

        scores = push_pagerank(n, degrees, indptr, indices,
                               rev_indptr, rev_indices,
                               seeds.astype(np.float32),
                               damping_factor, tol)

    elif solver == 'RH':
        coeffs = np.ones(n_iter + 1)
        polynome = Polynome(damping_factor * normalize(adjacency, p=1).T.tocsr(), coeffs)
        scores = polynome.dot(seeds)

    else:
        rso = RandomSurferOperator(adjacency, seeds, damping_factor)
        v0 = rso.b
        if solver == 'bicgstab':
            scores, info = bicgstab(sparse.eye(n, format='csr') - rso.a, rso.b, atol=tol, x0=v0)
        elif solver == 'lanczos':
            # noinspection PyTypeChecker
            _, scores = sparse.linalg.eigs(rso, k=1, tol=tol, v0=v0)
            scores = abs(scores.flatten().real)
        elif solver == 'piteration':
            scores = v0
            for i in range(n_iter):
                scores_ = rso.dot(scores)
                scores_ /= scores_.sum()
                if np.linalg.norm(scores - scores_, ord=1) < tol:
                    break
                else:
                    scores = scores_
        else:
            raise ValueError('Unknown solver.')

    return scores / scores.sum()