# Source code for proteinshake.representations.graph

import os
from sklearn.neighbors import kneighbors_graph, radius_neighbors_graph
from tqdm import tqdm
import numpy as np

from proteinshake.utils import tokenize, error

class Graph():
    """ Graph representation of a protein.

    Nodes are the tokenized atom or residue types of the protein. Edges are computed
    from the 3D coordinates with either a k-nearest-neighbor or an epsilon-neighborhood graph.
    The resolution is 'atom' if the protein object has an 'atom' entry, otherwise 'residue'.

    Parameters
    ----------
    protein: dict
        A protein object.
    construction: str
        Whether to use knn or eps construction. Must be 'knn' or 'eps'.
    k: int
        The number of neighbors to be used in the k-NN graph. Only used if `construction` is 'knn'.
    eps: float
        The epsilon radius to be used in graph construction (in Angstrom). Only used if `construction` is 'eps'.
    weighted_edges: bool
        If `True`, edges are attributed with their euclidean distance. If `False`, edges are unweighted.

    Attributes
    ----------
    protein_dict: dict
        The protein object this graph was built from.
    resolution: str
        The resolution that was used, 'atom' or 'residue'.
    data: tuple
        A pair `(nodes, adj)` of the tokenized node types and the adjacency matrix returned by scikit-learn.
    weighted_edges: bool
        Whether the adjacency matrix stores distances instead of plain connectivity.

    Raises
    ------
    ValueError
        If `construction` is neither 'knn' nor 'eps'.

    """

    def __init__(self, protein, construction, k, eps, weighted_edges):
        # Validate first: an unknown construction would otherwise leave `adj` unbound
        # and only surface later as an obscure UnboundLocalError.
        if construction not in ('eps', 'knn'):
            raise ValueError(f"Unknown graph construction '{construction}', expected 'knn' or 'eps'.")
        resolution = 'atom' if 'atom' in protein else 'residue'
        # scikit-learn stores distances in 'distance' mode and ones in 'connectivity' mode.
        mode = 'distance' if weighted_edges else 'connectivity'
        coords = np.stack([protein[resolution]['x'], protein[resolution]['y'], protein[resolution]['z']], axis=1)
        nodes = tokenize(protein[resolution][f'{resolution}_type'], resolution=resolution)
        if construction == 'eps':
            adj = radius_neighbors_graph(coords, radius=eps, mode=mode)
        else:
            # A point cannot have more neighbors than there are other points, so cap k.
            # NOTE(review): a single-node protein yields n_neighbors == 0 -- confirm scikit-learn accepts that.
            n_neighbors = min(len(coords) - 1, k)
            adj = kneighbors_graph(coords, n_neighbors=n_neighbors, mode=mode)
        self.protein_dict = protein
        self.resolution = resolution
        self.data = (nodes, adj)
        self.weighted_edges = weighted_edges



class GraphDataset():
    """ Graph representation of a protein structure dataset.
    Converts a protein object to a graph by using a k-nearest-neighbor or epsilon-neighborhood approach. Define either `k` or `eps` to determine which one is used. If both are given, `k` takes precedence.

    The graphs are built lazily: `graphs` is a generator and can therefore only be iterated once.

    Parameters
    ----------
    proteins: iterable
        Protein objects from a Dataset. Must support `len()`, which is used as the dataset size.
    root: str
        Root directory. Processed graphs are placed under `{root}/processed/graph/`, which is created if missing.
    name: str
        Name of the dataset, used as prefix of the processed file name.
    resolution: str, default 'residue'
        Resolution label ('atom' or 'residue') used in the processed file name.
    eps: float
        The epsilon radius to be used in graph construction (in Angstrom).
    k: int
        The number of neighbors to be used in the k-NN graph.
    weighted_edges: bool, default False
        If `True`, edges are attributed with their euclidean distance. If `False`, edges are unweighted.
    verbosity: int, default 2
        Verbosity level, passed on to error reporting and to the framework datasets.

    Attributes
    ----------
    path: str
        Path prefix of the processed dataset (without the framework specific extension).
    graphs: generator
        Lazily constructed `Graph` objects, one per protein.
    size: int
        The number of proteins in the dataset.

    """

    def __init__(self, proteins, root, name, resolution='residue', eps=None, k=None, weighted_edges=False, verbosity=2):
        self.verbosity = verbosity
        if eps is None and k is None:
            error('You must specify eps or k in the graph construction.', verbosity=self.verbosity)
        # k wins over eps when both are supplied.
        construction = 'knn' if k is not None else 'eps'
        param = k if construction == 'knn' else eps
        weighted = '_weighted' if weighted_edges else ''
        # The construction parameters are encoded in the file name so that different settings do not collide.
        self.path = f'{root}/processed/graph/{name}_{resolution}_{construction}_{param}{weighted}'
        self.graphs = (Graph(protein, construction, k, eps, weighted_edges) for protein in proteins)
        self.size = len(proteins)
        os.makedirs(os.path.dirname(self.path), exist_ok=True)

    def pyg(self, *args, **kwargs):
        """ Returns a `PygGraphDataset` of the graphs, stored at `path` + '.pyg'.

        Additional arguments are forwarded to `PygGraphDataset`.
        """
        # Imported here so that the framework is only required when it is actually used.
        from proteinshake.frameworks.pyg import PygGraphDataset
        return PygGraphDataset(self.graphs, self.size, self.path+'.pyg', *args, verbosity=self.verbosity, **kwargs)

    def dgl(self, *args, **kwargs):
        """ Returns a `DGLGraphDataset` of the graphs, stored at `path` + '.dgl'.

        Additional arguments are forwarded to `DGLGraphDataset`.
        """
        # Imported here so that the framework is only required when it is actually used.
        from proteinshake.frameworks.dgl import DGLGraphDataset
        return DGLGraphDataset(self.graphs, self.size, self.path+'.dgl', *args, verbosity=self.verbosity, **kwargs)

    def nx(self, *args, **kwargs):
        """ Returns a `NetworkxGraphDataset` of the graphs, stored at `path` + '.nx'.

        Additional arguments are forwarded to `NetworkxGraphDataset`.
        """
        # Imported here so that the framework is only required when it is actually used.
        from proteinshake.frameworks.nx import NetworkxGraphDataset
        return NetworkxGraphDataset(self.graphs, self.size, self.path+'.nx', *args, verbosity=self.verbosity, **kwargs)