Source code for proteinshake.tasks.structure_similarity

import itertools

from scipy.stats import spearmanr
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split

from proteinshake.datasets import TMAlignDataset
from proteinshake.tasks import Task

class StructureSimilarityTask(Task):
    """ Predict the structural similarity between two proteins. This is a pair-wise protein-level regression task.
    Ground truth is computed using the TMAlign software. Split indices are stored as pairs of indices into
    the underlying dataset.

    The regression target of a pair is looked up through ``self.dataset.lddt`` using the IDs of both proteins.


    .. admonition:: Task Summary

        * **Input:** pair of proteins
        * **Output:** Local Distance Difference Test score (lDDT)
        * **Evaluation:** Spearman correlation (custom task)


    """

    DatasetClass = TMAlignDataset

    # Descriptive metadata; the names `type` and `input` are part of the
    # class interface and are kept even though they shadow builtins.
    type = 'Regression'
    input = 'Protein and Protein'
    output = 'Local Distance Difference Test'

    def __init__(self, *args, **kwargs):
        """ Forward all arguments unchanged to the parent ``Task`` constructor. """
        super().__init__(*args, **kwargs)

    def update_index(self):
        """ Transform to pairwise indexing.

        Replaces each of the train/val/test split indices with the array of
        all unordered pairs drawn from that split (see :meth:`compute_pairs`).
        """
        self.train_index = self.compute_pairs(self.train_index)
        self.val_index = self.compute_pairs(self.val_index)
        self.test_index = self.compute_pairs(self.test_index)

    def compute_targets(self):
        """ Compute the regression target of every pair in each split.

        Stores the results as numpy arrays in ``train_targets``,
        ``val_targets`` and ``test_targets``.
        """
        def pair_targets(index):
            # Each entry of `index` is one pair; `self.proteins[pair]` is
            # unpacked into the two positional arguments of `target`.
            return np.array([self.target(*self.proteins[pair]) for pair in index])

        self.train_targets = pair_targets(self.train_index)
        self.val_targets = pair_targets(self.val_index)
        self.test_targets = pair_targets(self.test_index)

    @property
    def task_in(self):
        """ Input kinds of the task: a pair of proteins. """
        return ('protein', 'protein')

    @property
    def task_type(self):
        """ Task type descriptor: level and prediction kind. """
        return ('protein_pair', 'regression')

    @property
    def task_out(self):
        """ Output kind of the task. """
        # NOTE: this is a plain string, not a one-element tuple.
        return 'regression'

    @property
    def target_dim(self):
        """ Dimensionality of the prediction target. """
        # NOTE: this is a plain int, not a one-element tuple.
        return 1

    def compute_pairs(self, index):
        """ Return all unordered pairs of entries of ``index``.

        Parameters
        ----------
        index:
            One-dimensional sequence of dataset indices.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(n * (n - 1) / 2, 2)`` where row ``k`` holds the
            two entries of the ``k``-th pair. Pairs are ordered as
            ``(0, 1), (0, 2), ..., (1, 2), ...`` by position in ``index``.
            An input with fewer than two entries yields shape ``(0, 2)``.
        """
        index = np.asarray(index)
        # Strict upper triangle == all position pairs (i, j) with i < j, in
        # row-major order, i.e. the same order itertools.combinations yields.
        first, second = np.triu_indices(len(index), k=1)
        return index[np.column_stack((first, second))]

    def target(self, protein1, protein2):
        """ Return the lDDT value stored in the dataset for a protein pair.

        Both arguments are protein dictionaries providing ``['protein']['ID']``.
        """
        pdbid_1 = protein1['protein']['ID']
        pdbid_2 = protein2['protein']['ID']
        return self.dataset.lddt(pdbid_1, pdbid_2)

    def dummy_output(self):
        """ Return one uniform random number in [0, 1) per test target. """
        import random
        return [random.random() for _ in range(len(self.test_targets))]

    @property
    def default_metric(self):
        """ Name of the metric (key of :meth:`evaluate`'s result) used by default. """
        return 'spearman'

    def evaluate(self, y_true, y_pred):
        """ Score predictions against ground truth.

        Returns
        -------
        dict
            ``'mse'``: mean squared error, ``'spearman'``: Spearman rank
            correlation coefficient between ``y_true`` and ``y_pred``.
        """
        return {
            'mse': metrics.mean_squared_error(y_true, y_pred),
            # spearmanr returns (correlation, p-value); keep the correlation.
            'spearman': spearmanr(y_true, y_pred)[0],
        }