Source code for sknetwork.data.parse

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created in December 2018
@author: Quentin Lutz <qlutz@enst.fr>
@author: Nathan de Lara <nathan.delara@polytechnique.org>
@author: Thomas Bonald <bonald@enst.fr>
"""

from csv import reader
from typing import Dict, List, Tuple, Union, Optional
from xml.etree import ElementTree

import numpy as np
from scipy import sparse

from sknetwork.data.base import Bunch
from sknetwork.utils.format import directed2undirected


[docs]def from_edge_list(edge_list: Union[np.ndarray, List[Tuple]], directed: bool = False,
                   bipartite: bool = False, weighted: bool = True, reindex: bool = False, shape: Optional[tuple] = None,
                   sum_duplicates: bool = True, matrix_only: bool = None) -> Union[Bunch, sparse.csr_matrix]:
    """Load a graph from an edge list.

    Parameters
    ----------
    edge_list : Union[np.ndarray, List[Tuple]]
        The edge list to convert, given as a NumPy array of size (n, 2) or (n, 3) or a list of tuples of
        length 2 or 3.
    directed : bool
        If ``True``, considers the graph as directed.
    bipartite : bool
        If ``True``, returns a biadjacency matrix.
    weighted : bool
        If ``True``, returns a weighted graph.
    reindex : bool
        If ``True``, reindex nodes and returns the original node indices as names.
        Reindexing is enforced if nodes are not integers.
    shape : tuple
        Shape of the adjacency or biadjacency matrix.
        If not specified or if nodes are reindexed, the shape is the smallest compatible with node indices.
    sum_duplicates : bool
        If ``True`` (default), sums weights of duplicate edges.
        Otherwise, the weight of each edge is that of the first occurrence of this edge.
    matrix_only : bool
        If ``True``, returns only the adjacency or biadjacency matrix.
        Otherwise, returns a ``Dataset`` object with graph attributes (e.g., node names).
        If not specified (default), selects the most appropriate format.
    Returns
    -------
    graph : :class:`Dataset` (including node names) or sparse matrix

    Examples
    --------
    >>> edges = [(0, 1), (1, 2), (2, 0)]
    >>> adjacency = from_edge_list(edges)
    >>> adjacency.shape
    (3, 3)
    >>> edges = [('Alice', 'Bob'), ('Bob', 'Carol'), ('Carol', 'Alice')]
    >>> graph = from_edge_list(edges)
    >>> adjacency = graph.adjacency
    >>> adjacency.shape
    (3, 3)
    >>> print(graph.names)
    ['Alice' 'Bob' 'Carol']
    """
    edge_array = np.array([])
    weights = None
    if isinstance(edge_list, list):
        try:
            edge_array = np.array([[edge[0], edge[1]] for edge in edge_list])
            if len(edge_list) and len(edge_list[0]) == 3:
                weights = np.array([edge[2] for edge in edge_list])
            else:
                raise ValueError()
        except ValueError:
            ValueError('Edges must be given as tuples of fixed size (2 or 3).')
    elif isinstance(edge_list, np.ndarray):
        if edge_list.ndim != 2 or edge_list.shape[1] not in [2, 3]:
            raise ValueError('The edge list must be given as an array of shape (n_edges, 2) or '
                             '(n_edges, 3).')
        edge_array = edge_list[:, :2]
        if edge_list.shape[1] == 3:
            weights = edge_list[:, 2]
    else:
        raise TypeError('The edge list must be given as a NumPy array or a list of tuples.')
    return from_edge_array(edge_array=edge_array, weights=weights, directed=directed, bipartite=bipartite,
                           weighted=weighted, reindex=reindex, shape=shape, sum_duplicates=sum_duplicates,
                           matrix_only=matrix_only)


[docs]def from_adjacency_list(adjacency_list: Union[List[List], Dict[str, List]], directed: bool = False,
                        bipartite: bool = False, weighted: bool = True, reindex: bool = False,
                        shape: Optional[tuple] = None, sum_duplicates: bool = True, matrix_only: bool = None) \
                        -> Union[Bunch, sparse.csr_matrix]:
    """Load a graph from an adjacency list.

    Parameters
    ----------
    adjacency_list : Union[List[List], Dict[str, List]]
        Adjacency list (neighbors of each node) or dictionary (node: neighbors).
    directed : bool
        If ``True``, considers the graph as directed.
    bipartite : bool
        If ``True``, returns a biadjacency matrix.
    weighted : bool
        If ``True``, returns a weighted graph.
    reindex : bool
        If ``True``, reindex nodes and returns the original node indices as names.
        Reindexing is enforced if nodes are not integers.
    shape : tuple
        Shape of the adjacency or biadjacency matrix.
        If not specified or if nodes are reindexed, the shape is the smallest compatible with node indices.
    sum_duplicates : bool
        If ``True`` (default), sums weights of duplicate edges.
        Otherwise, the weight of each edge is that of the first occurrence of this edge.
    matrix_only : bool
        If ``True``, returns only the adjacency or biadjacency matrix.
        Otherwise, returns a ``Dataset`` object with graph attributes (e.g., node names).
        If not specified (default), selects the most appropriate format.
    Returns
    -------
    graph : :class:`Dataset` or sparse matrix

    Example
    -------
    >>> edges = [[1, 2], [0, 2, 3], [0, 1]]
    >>> adjacency = from_adjacency_list(edges)
    >>> adjacency.shape
    (4, 4)
    """
    edge_list = []
    if isinstance(adjacency_list, list):
        for i, neighbors in enumerate(adjacency_list):
            for j in neighbors:
                edge_list.append((i, j))
    elif isinstance(adjacency_list, dict):
        for i, neighbors in adjacency_list.items():
            for j in neighbors:
                edge_list.append((i, j))
    else:
        raise TypeError('The adjacency list must be given as a list of lists or a dict of lists.')
    return from_edge_list(edge_list=edge_list, directed=directed, bipartite=bipartite, weighted=weighted,
                          reindex=reindex, shape=shape, sum_duplicates=sum_duplicates, matrix_only=matrix_only)


def from_edge_array(edge_array: np.ndarray, weights: np.ndarray = None, directed: bool = False, bipartite: bool = False,
                    weighted: bool = True, reindex: bool = False, shape: Optional[tuple] = None,
                    sum_duplicates: bool = True, matrix_only: bool = None) -> Union[Bunch, sparse.csr_matrix]:
    """Load a graph from an edge array of shape (n_edges, 2) and weights (optional).

    Parameters
    ----------
    edge_array : np.ndarray
        Array of edges.
    weights : np.ndarray
        Array of weights.
    directed : bool
        If ``True``, considers the graph as directed.
    bipartite : bool
        If ``True``, returns a biadjacency matrix.
    weighted : bool
        If ``True``, returns a weighted graph.
    reindex : bool
        If ``True``, reindex nodes and returns the original node indices as names.
        Reindexing is enforced if nodes are not integers.
    shape : tuple
        Shape of the adjacency or biadjacency matrix.
        If not specified or if nodes are reindexed, the shape is the smallest compatible with node indices.
    sum_duplicates : bool
        If ``True`` (default), sums weights of duplicate edges.
        Otherwise, the weight of each edge is that of the first occurrence of this edge.
    matrix_only : bool
        If ``True``, returns only the adjacency or biadjacency matrix.
        Otherwise, returns a ``Dataset`` object with graph attributes (e.g., node names).
        If not specified (default), selects the most appropriate format.

    Returns
    -------
    graph : :class:`Dataset` or sparse matrix
    """
    try:
        edge_array = edge_array.astype(float)
    except ValueError:
        pass
    if edge_array.dtype == float and (edge_array == edge_array.astype(int)).all():
        edge_array = edge_array.astype(int)
    if weights is None:
        weights = np.ones(len(edge_array))
    if weights.dtype not in [bool, int, float]:
        try:
            weights = weights.astype(float)
        except ValueError:
            raise ValueError('Weights must be numeric.')
    if all(weights == weights.astype(int)):
        weights = weights.astype(int)
    if not weighted:
        weights = weights.astype(bool)

    if not sum_duplicates:
        _, index = np.unique(edge_array, axis=0, return_index=True)
        edge_array = edge_array[index]
        weights = weights[index]
    graph = Bunch()
    if bipartite:
        row = edge_array[:, 0]
        col = edge_array[:, 1]
        if row.dtype != int or reindex:
            names_row, row = np.unique(row, return_inverse=True)
            graph.names_row = names_row
            graph.names = names_row
            n_row = len(names_row)
        elif shape is not None:
            n_row = max(shape[0], max(row) + 1)
        else:
            n_row = max(row) + 1
        if col.dtype != int or reindex:
            names_col, col = np.unique(col, return_inverse=True)
            graph.names_col = names_col
            n_col = len(names_col)
        elif shape is not None:
            n_col = max(shape[1], max(col) + 1)
        else:
            n_col = max(col) + 1
        matrix = sparse.csr_matrix((weights, (row, col)), shape=(n_row, n_col))
        matrix.sum_duplicates()
        graph.biadjacency = matrix
    else:
        nodes = edge_array.ravel()
        if nodes.dtype != int or reindex:
            names, nodes = np.unique(nodes, return_inverse=True)
            graph.names = names
            n = len(names)
            edge_array = nodes.reshape(-1, 2)
        elif shape is not None:
            n = max(shape[0], max(nodes) + 1)
        else:
            n = max(nodes) + 1
        row = edge_array[:, 0]
        col = edge_array[:, 1]
        matrix = sparse.csr_matrix((weights, (row, col)), shape=(n, n))
        if not directed:
            matrix = directed2undirected(matrix)
        matrix.sum_duplicates()
        graph.adjacency = matrix
    if matrix_only or (matrix_only is None and len(graph) == 1):
        return matrix
    else:
        return graph


[docs]def from_csv(file_path: str, delimiter: str = None, sep: str = None, comments: str = '#%',
             data_structure: str = None, directed: bool = False, bipartite: bool = False, weighted: bool = True,
             reindex: bool = False, shape: Optional[tuple] = None, sum_duplicates: bool = True,
             matrix_only: bool = None) -> Union[Bunch, sparse.csr_matrix]:
    """Load a graph from a CSV or TSV file.
    The delimiter can be specified (e.g., ' ' for space-separated values).

    Parameters
    ----------
    file_path : str
        Path to the CSV file.
    delimiter : str
        Delimiter used in the file. Guessed if not specified.
    sep : str
        Alias for delimiter.
    comments : str
        Characters for comment lines.
    data_structure : str
        If 'edge_list', consider each row of the file as an edge (tuple of size 2 or 3).
        If 'adjacency_list', consider each row of the file as an adjacency list (list of neighbors,
        in the order of node indices; an empty line means no neighbor).
        If 'adjacency_dict', consider each row of the file as an adjacency dictionary with key
        given by the first column (node: list of neighbors).
        If ``None`` (default), data_structure is guessed from the first rows of the file.
    directed : bool
        If ``True``, considers the graph as directed.
    bipartite : bool
        If ``True``, returns a biadjacency matrix of shape (n1, n2).
    weighted : bool
        If ``True``, returns a weighted graph (e.g., counts the number of occurrences of each edge).
    reindex : bool
        If ``True``, reindex nodes and returns the original node indices as names.
        Reindexing is enforced if nodes are not integers.
    shape : tuple
        Shape of the adjacency or biadjacency matrix.
        If not specified or if nodes are reindexed, the shape is the smallest compatible with node indices.
    sum_duplicates : bool
        If ``True`` (default), sums weights of duplicate edges.
        Otherwise, the weight of each edge is that of the first occurrence of this edge.
    matrix_only : bool
        If ``True``, returns only the adjacency or biadjacency matrix.
        Otherwise, returns a ``Dataset`` object with graph attributes (e.g., node names).
        If not specified (default), selects the most appropriate format.

    Returns
    -------
    graph: :class:`Dataset` or sparse matrix
    """
    header_length, delimiter_guess, comment_guess, data_structure_guess = scan_header(file_path, delimiters=delimiter,
                                                                                      comments=comments)
    if delimiter is None:
        if sep is not None:
            delimiter = sep
        else:
            delimiter = delimiter_guess
    if data_structure is None:
        data_structure = data_structure_guess
    if data_structure == 'edge_list':
        try:
            array = np.genfromtxt(file_path, delimiter=delimiter, comments=comment_guess)
            if np.isnan(array).any():
                raise TypeError()
            edge_array = array[:, :2].astype(int)
            if array.shape[1] == 3:
                weights = array[:, 2]
            else:
                weights = None
            return from_edge_array(edge_array=edge_array, weights=weights, directed=directed, bipartite=bipartite,
                                   weighted=weighted, reindex=reindex, shape=shape, sum_duplicates=sum_duplicates,
                                   matrix_only=matrix_only)
        except TypeError:
            pass
    with open(file_path, 'r', encoding='utf-8') as f:
        for i in range(header_length):
            f.readline()
        csv_reader = reader(f, delimiter=delimiter)
        if data_structure == 'edge_list':
            edge_list = [tuple(row) for row in csv_reader]
            return from_edge_list(edge_list=edge_list, directed=directed, bipartite=bipartite,
                                  weighted=weighted, reindex=reindex, shape=shape, sum_duplicates=sum_duplicates,
                                  matrix_only=matrix_only)
        elif data_structure == 'adjacency_list':
            adjacency_list = [row for row in csv_reader]
            return from_adjacency_list(adjacency_list=adjacency_list, directed=directed, bipartite=bipartite,
                                       weighted=weighted, reindex=reindex, shape=shape, sum_duplicates=sum_duplicates,
                                       matrix_only=matrix_only)
        elif data_structure == 'adjacency_dict':
            adjacency_list = {row[0]: row[1:] for row in csv_reader}
            return from_adjacency_list(adjacency_list=adjacency_list, directed=directed, bipartite=bipartite,
                                       weighted=weighted, reindex=reindex, shape=shape, sum_duplicates=sum_duplicates,
                                       matrix_only=matrix_only)


def scan_header(file_path: str, delimiters: str = None, comments: str = '#%', n_scan: int = 100):
    """Infer some properties of the graph from the first lines of a CSV file .
    Parameters
    ----------
    file_path : str
        Path to the CSV file.
    delimiters : str
        Possible delimiters.
    comments : str
        Possible comment characters.
    n_scan : int
        Number of rows scanned for inference.

    Returns
    -------
    header_length : int
        Length of the header (comments and blank lines)
    delimiter_guess : str
        Guessed delimiter.
    comment_guess : str
        Guessed comment character.
    data_structure_guess : str
        Either 'edge_list' or 'adjacency_list'.
    """
    header_length = 0
    if delimiters is None:
        delimiters = '\t,; '
    comment_guess = comments[0]
    count = {delimiter: [] for delimiter in delimiters}
    rows = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for row in f.readlines():
            if row.startswith(tuple(comments)) or row == '':
                if len(row):
                    comment_guess = row[0]
                header_length += 1
            else:
                rows.append(row.rstrip())
                for delimiter in delimiters:
                    count[delimiter].append(row.count(delimiter))
                if len(rows) == n_scan:
                    break
    means = [np.mean(count[delimiter]) for delimiter in delimiters]
    stds = [np.std(count[delimiter]) for delimiter in delimiters]
    index = np.argwhere((np.array(means) > 0) * (np.array(stds) == 0)).ravel()
    if len(index) == 1:
        delimiter_guess = delimiters[int(index)]
    else:
        delimiter_guess = delimiters[int(np.argmax(means))]
    length = {len(row.split(delimiter_guess)) for row in rows}
    if length == {2} or length == {3}:
        data_structure_guess = 'edge_list'
    else:
        data_structure_guess = 'adjacency_list'
    return header_length, delimiter_guess, comment_guess, data_structure_guess


def load_labels(file: str) -> np.ndarray:
    """Parser for files with a single entry on each row.

    Parameters
    ----------
    file : str
        The path to the dataset

    Returns
    -------
    labels: np.ndarray
        Labels.
    """
    rows = []
    with open(file, 'r', encoding='utf-8') as f:
        for row in f:
            rows.append(row.strip())
    return np.array(rows)


def load_header(file: str):
    """Check if the graph is directed, bipartite, weighted."""
    directed, bipartite, weighted = False, False, True
    with open(file, 'r', encoding='utf-8') as f:
        row = f.readline()
        if 'bip' in row:
            bipartite = True
        if 'unweighted' in row:
            weighted = False
        if 'asym' in row:
            directed = True
    return directed, bipartite, weighted


def load_metadata(file: str, delimiter: str = ': ') -> Bunch:
    """Extract metadata from the file."""
    metadata = Bunch()
    with open(file, 'r', encoding='utf-8') as f:
        for row in f:
            parts = row.split(delimiter)
            key, value = parts[0], ': '.join(parts[1:]).strip('\n')
            metadata[key] = value
    return metadata


[docs]def from_graphml(file_path: str, weight_key: str = 'weight', max_string_size: int = 512) -> Bunch:
    """Load graph from GraphML file.

    Hyperedges and nested graphs are not supported.

    Parameters
    ----------
    file_path: str
        Path to the GraphML file.
    weight_key: str
        The key to be used as a value for edge weights
    max_string_size: int
        The maximum size for string features of the data

    Returns
    -------
    data: :class:`Bunch`
        The dataset in a Dataset with the adjacency as a CSR matrix.
    """
    # see http://graphml.graphdrawing.org/primer/graphml-primer.html
    # and http://graphml.graphdrawing.org/specification/dtd.html#top
    tree = ElementTree.parse(file_path)
    n_nodes = 0
    n_edges = 0
    symmetrize = None
    naming_nodes = True
    default_weight = 1
    weight_type = bool
    weight_id = None
    # indices in the graph tree
    node_indices = []
    edge_indices = []
    data = Bunch()
    graph = None
    file_description = None
    attribute_descriptions = Bunch()
    attribute_descriptions.node = Bunch()
    attribute_descriptions.edge = Bunch()
    keys = {}
    for file_element in tree.getroot():
        if file_element.tag.endswith('graph'):
            graph = file_element
            symmetrize = (graph.attrib['edgedefault'] == 'undirected')
            for index, element in enumerate(graph):
                if element.tag.endswith('node'):
                    node_indices.append(index)
                    n_nodes += 1
                elif element.tag.endswith('edge'):
                    edge_indices.append(index)
                    if 'directed' in element.attrib:
                        if element.attrib['directed'] == 'true':
                            n_edges += 1
                        else:
                            n_edges += 2
                    elif symmetrize:
                        n_edges += 2
                    else:
                        n_edges += 1
            if 'parse.nodeids' in graph.attrib:
                naming_nodes = not (graph.attrib['parse.nodeids'] == 'canonical')
    for file_element in tree.getroot():
        if file_element.tag.endswith('key'):
            attribute_name = file_element.attrib['attr.name']
            attribute_type = java_type_to_python_type(file_element.attrib['attr.type'])
            if attribute_name == weight_key:
                weight_type = java_type_to_python_type(file_element.attrib['attr.type'])
                weight_id = file_element.attrib['id']
                for key_element in file_element:
                    if key_element.tag == 'default':
                        default_weight = attribute_type(key_element.text)
            else:
                default_value = None
                if file_element.attrib['for'] == 'node':
                    size = n_nodes
                    if 'node_attribute' not in data:
                        data.node_attribute = Bunch()
                    for key_element in file_element:
                        if key_element.tag.endswith('desc'):
                            attribute_descriptions.node[attribute_name] = key_element.text
                        elif key_element.tag.endswith('default'):
                            default_value = attribute_type(key_element.text)
                    if attribute_type == str:
                        local_type = '<U' + str(max_string_size)
                    else:
                        local_type = attribute_type
                    if default_value:
                        data.node_attribute[attribute_name] = np.full(size, default_value, dtype=local_type)
                    else:
                        data.node_attribute[attribute_name] = np.zeros(size, dtype=local_type)
                elif file_element.attrib['for'] == 'edge':
                    size = n_edges
                    if 'edge_attribute' not in data:
                        data.edge_attribute = Bunch()
                    for key_element in file_element:
                        if key_element.tag.endswith('desc'):
                            attribute_descriptions.edge[attribute_name] = key_element.text
                        elif key_element.tag.endswith('default'):
                            default_value = attribute_type(key_element.text)
                    if attribute_type == str:
                        local_type = '<U' + str(max_string_size)
                    else:
                        local_type = attribute_type
                    if default_value:
                        data.edge_attribute[attribute_name] = np.full(size, default_value, dtype=local_type)
                    else:
                        data.edge_attribute[attribute_name] = np.zeros(size, dtype=local_type)
                keys[file_element.attrib['id']] = [attribute_name, attribute_type]
        elif file_element.tag.endswith('desc'):
            file_description = file_element.text
    if file_description or attribute_descriptions.node or attribute_descriptions.edge:
        data.meta = Bunch()
        if file_description:
            data.meta['description'] = file_description
        if attribute_descriptions.node or attribute_descriptions.edge:
            data.meta['attributes'] = attribute_descriptions
    if graph is not None:
        row = np.zeros(n_edges, dtype=int)
        col = np.zeros(n_edges, dtype=int)
        dat = np.full(n_edges, default_weight, dtype=weight_type)
        data.names = None
        if naming_nodes:
            data.names = np.zeros(n_nodes, dtype='<U512')

        node_map = {}
        # deal with nodes first
        for number, index in enumerate(node_indices):
            node = graph[index]
            if naming_nodes:
                name = node.attrib['id']
                data.names[number] = name
                node_map[name] = number
            for node_attribute in node:
                if node_attribute.tag.endswith('data'):
                    data.node_attribute[keys[node_attribute.attrib['key']][0]][number] = \
                        keys[node_attribute.attrib['key']][1](node_attribute.text)
        # deal with edges
        edge_index = -1
        for index in edge_indices:
            edge_index += 1
            duplicate = False
            edge = graph[index]
            if naming_nodes:
                node1 = node_map[edge.attrib['source']]
                node2 = node_map[edge.attrib['target']]
            else:
                node1 = int(edge.attrib['source'][1:])
                node2 = int(edge.attrib['target'][1:])
            row[edge_index] = node1
            col[edge_index] = node2
            for edge_attribute in edge:
                if edge_attribute.tag.endswith('data'):
                    if edge_attribute.attrib['key'] == weight_id:
                        dat[edge_index] = weight_type(edge_attribute.text)
                    else:
                        data.edge_attribute[keys[edge_attribute.attrib['key']][0]][edge_index] = \
                            keys[edge_attribute.attrib['key']][1](edge_attribute.text)
            if 'directed' in edge.attrib:
                if edge.attrib['directed'] != 'true':
                    duplicate = True
            elif symmetrize:
                duplicate = True
            if duplicate:
                edge_index += 1
                row[edge_index] = node2
                col[edge_index] = node1
                for edge_attribute in edge:
                    if edge_attribute.tag.endswith('data'):
                        if edge_attribute.attrib['key'] == weight_id:
                            dat[edge_index] = weight_type(edge_attribute.text)
                        else:
                            data.edge_attribute[keys[edge_attribute.attrib['key']][0]][edge_index] = \
                                keys[edge_attribute.attrib['key']][1](edge_attribute.text)
        data.adjacency = sparse.csr_matrix((dat, (row, col)), shape=(n_nodes, n_nodes))
        if data.names is None:
            data.pop('names')
        return data
    else:
        raise ValueError(f'No graph defined in {file_path}.')


def java_type_to_python_type(value: str) -> type:
    if value == 'boolean':
        return bool
    elif value == 'int':
        return int
    elif value == 'string':
        return str
    elif value in ('long', 'float', 'double'):
        return float


def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False