Source code for sknetwork.classification.knn

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Nov, 2019
@author: Nathan de Lara <ndelara@enst.fr>
@author: Thomas Bonald <tbonald@enst.fr>
"""
from typing import Optional, Union

import numpy as np
from scipy import sparse
from scipy.spatial import cKDTree

from sknetwork.classification.base import BaseClassifier
from sknetwork.embedding.base import BaseEmbedding
from sknetwork.embedding.svd import GSVD
from sknetwork.linalg.normalization import normalize
from sknetwork.utils.check import check_n_neighbors, check_n_jobs
from sknetwork.utils.format import get_adjacency_seeds


class KNN(BaseClassifier):
    """Node classification by K-nearest neighbors in the embedding space.

    For bigraphs, classify rows only (see ``BiKNN`` for joint classification of rows and columns).

    Parameters
    ----------
    embedding_method :
        Which algorithm to use to project the nodes in vector space.
        Default is ``GSVD(10)``; a new instance is created for each classifier when omitted.
    n_neighbors :
        Number of nearest neighbors to consider.
    factor_distance :
        Power weighting factor :math:`\\alpha` applied to the distance to each neighbor.
        Neighbor at distance :math:`d` has weight :math:`1 / d^\\alpha`. Default is 2.
    leaf_size :
        Leaf size passed to KDTree.
    p :
        Which Minkowski p-norm to use. Default is 2 (Euclidean distance).
    tol_nn :
        Tolerance in nearest neighbors search; the k-th returned value is guaranteed to be no further than
        ``1 + tol_nn`` times the distance to the actual k-th nearest neighbor.
    n_jobs :
        Number of jobs to schedule for parallel processing. If -1 is given all processors are used.

    Attributes
    ----------
    labels_ : np.ndarray, shape (n_row,)
        Label of each node.
    membership_ : sparse.csr_matrix, shape (n_row, n_labels)
        Membership matrix.
    labels_row_ : np.ndarray
        Labels of rows, for bipartite graphs.
    labels_col_ : np.ndarray
        Labels of columns, for bipartite graphs.
    membership_row_ : sparse.csr_matrix, shape (n_row, n_labels)
        Membership matrix of rows, for bipartite graphs.
    membership_col_ : sparse.csr_matrix, shape (n_col, n_labels)
        Membership matrix of columns, for bipartite graphs.

    Example
    -------
    >>> from sknetwork.classification import KNN
    >>> from sknetwork.embedding import GSVD
    >>> from sknetwork.data import karate_club
    >>> knn = KNN(GSVD(3), n_neighbors=1)
    >>> graph = karate_club(metadata=True)
    >>> adjacency = graph.adjacency
    >>> labels_true = graph.labels
    >>> seeds = {0: labels_true[0], 33: labels_true[33]}
    >>> labels_pred = knn.fit_transform(adjacency, seeds)
    >>> np.round(np.mean(labels_pred == labels_true), 2)
    0.97
    """
    def __init__(self, embedding_method: Optional[BaseEmbedding] = None, n_neighbors: int = 5,
                 factor_distance: float = 2, leaf_size: int = 16, p: float = 2, tol_nn: float = 0.01,
                 n_jobs: Optional[int] = None):
        super().__init__()

        # Build the default embedding per instance: a default written in the signature is evaluated
        # once, so every classifier created without this argument would share (and re-fit) one object.
        if embedding_method is None:
            embedding_method = GSVD(10)
        self.embedding_method = embedding_method
        self.n_neighbors = n_neighbors
        self.factor_distance = factor_distance
        self.leaf_size = leaf_size
        self.p = p
        self.tol_nn = tol_nn
        self.n_jobs = check_n_jobs(n_jobs)
        if self.n_jobs is None:
            # -1 is the value the KD-tree query understands as "use all processors".
            self.n_jobs = -1
        self.bipartite = None

    def _instantiate_vars(self, seeds: np.ndarray):
        """Split nodes into seeds (label >= 0) and nodes to classify (label < 0).

        Parameters
        ----------
        seeds :
            Array of labels, cast to ``int``; negative entries mark unlabeled nodes.

        Returns
        -------
        index_seed : np.ndarray
            Indices of the seed nodes.
        index_remain : np.ndarray
            Indices of the nodes without label.
        labels_seed : np.ndarray
            Labels of the seed nodes, in the order of ``index_seed``.
        """
        labels = seeds.astype(int)
        index_seed = np.argwhere(labels >= 0).ravel()
        index_remain = np.argwhere(labels < 0).ravel()
        labels_seed = labels[index_seed]
        return index_seed, index_remain, labels_seed

    def _fit_core(self, n, labels_seed, embedding, index_seed, index_remain):
        """Classify the unlabeled nodes from their nearest seeds in the embedding space.

        Parameters
        ----------
        n :
            Number of rows of the membership matrix.
        labels_seed :
            Labels of the seed nodes.
        embedding :
            Embedding of the nodes, indexed by node along the first axis.
        index_seed :
            Indices of the seed nodes.
        index_remain :
            Indices of the nodes to classify.

        Returns
        -------
        membership : sparse.csr_matrix
            Normalized membership matrix, with ``n`` rows and ``max(labels_seed) + 1`` columns.
        labels : np.ndarray
            Column of highest membership for each row.
        """
        n_seeds = len(labels_seed)
        embedding_seed = embedding[index_seed]
        embedding_remain = embedding[index_remain]
        n_neighbors = check_n_neighbors(self.n_neighbors, n_seeds)

        tree = cKDTree(embedding_seed, self.leaf_size)
        # ``workers`` is the current name of the former ``n_jobs`` keyword
        # (deprecated in SciPy 1.6, removed in SciPy 1.9).
        distances, neighbors = tree.query(embedding_remain, n_neighbors, self.tol_nn, self.p, workers=self.n_jobs)

        if n_neighbors == 1:
            # for a single neighbor the query returns 1-D arrays; restore the neighbor axis
            distances = distances[:, np.newaxis]
            neighbors = neighbors[:, np.newaxis]

        labels_neighbor = labels_seed[neighbors]

        has_zero_distance = (np.min(distances, axis=1) == 0)
        weights_neighbor = np.zeros_like(distances).astype(float)
        # take all seeds at distance zero, if any (this also avoids a division by zero)
        weights_neighbor[has_zero_distance] = (distances[has_zero_distance] == 0).astype(float)
        # assign weights with respect to distances for other
        weights_neighbor[~has_zero_distance] = 1 / np.power(distances[~has_zero_distance], self.factor_distance)

        # form the corresponding matrix: weighted votes of the neighbors for the nodes to classify,
        # followed by one unit entry per seed on its own label
        row = np.concatenate((np.repeat(index_remain, n_neighbors), index_seed))
        col = np.concatenate((labels_neighbor.ravel(), labels_seed))
        data = np.concatenate((weights_neighbor.ravel(), np.ones(len(index_seed))))

        # entries sharing the same (row, col), i.e. neighbors with the same label, are summed
        membership = normalize(sparse.csr_matrix((data, (row, col)), shape=(n, np.max(labels_seed) + 1)))

        labels = np.argmax(membership.toarray(), axis=1)

        return membership, labels

    def fit(self, input_matrix: Union[sparse.csr_matrix, np.ndarray], seeds: Union[np.ndarray, dict] = None,
            seeds_row: Union[np.ndarray, dict] = None, seeds_col: Union[np.ndarray, dict] = None) -> 'KNN':
        """Node classification by k-nearest neighbors in the embedding space.

        Parameters
        ----------
        input_matrix :
            Adjacency matrix or biadjacency matrix of the graph.
        seeds :
            Seed nodes. Can be a dict {node: label} or an array where "-1" means no label.
        seeds_row, seeds_col :
            Seeds of rows and columns (for bipartite graphs).

        Returns
        -------
        self: :class:`KNN`
        """
        adjacency, seeds, self.bipartite = get_adjacency_seeds(input_matrix, seeds=seeds, seeds_row=seeds_row,
                                                               seeds_col=seeds_col)
        index_seed, index_remain, labels_seed = self._instantiate_vars(seeds)
        embedding = self.embedding_method.fit_transform(adjacency)
        membership, labels = self._fit_core(adjacency.shape[0], labels_seed, embedding, index_seed, index_remain)

        self.membership_ = membership
        self.labels_ = labels

        if self.bipartite:
            self._split_vars(input_matrix.shape)

        return self