Source code for proteinshake.datasets.protein_ligand_interface

# -*- coding: utf-8 -*-
import glob
import os
import re
import os.path as osp

from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem

import numpy as np

from proteinshake.datasets import Dataset
from proteinshake.utils import extract_tar, download_url

RDLogger.DisableLog('rdApp.*') # disable warnings

class ProteinLigandInterfaceDataset(Dataset):
    """ Proteins bound to small molecules from PDBBind with annotated binding site, ligand and affinity information.

    .. admonition:: Please cite

        Wang, Renxiao, et al. "The PDBbind database: Collection of binding affinities for protein− ligand complexes with known three-dimensional structures." Journal of medicinal chemistry 47.12 (2004): 2977-2980.

    .. admonition:: Source

        Raw data was obtained and modified with permission from `PDBbind-CN <http://www.pdbbind.org.cn/>`_, originally licensed under the `End User Agreement for Access to the PDBbind-CN Database and Web Site <http://www.pdbbind.org.cn/enroll.php>`_.

      
    Parameters
    ----------
    root: str
        Root directory where the dataset should be saved.
    name: str
        The name of the dataset.
    version: str
        PDBBind version to use.

    .. list-table:: Dataset stats
       :widths: 100
       :header-rows: 1

       * - # proteins
       * - 4642

    .. list-table:: Annotations
        :widths: 20 55 25
        :header-rows: 1

        * - Attribute
        - Key
        - Sample value
        * - Dissociation constant (kd)
        - :code:`protein['protein']['kd']`
        - :code:`77.0`
        * - Affinity
        - :code:`protein['protein']['neglog_aff']`
        - :code:`4.11000`
        * - Resolution (Angstroms)
        - :code:`protein['protein']['resolution']`
        - :code:`2.20`
        * - Year solved
        - :code:`protein['protein']['year']`
        - :code:`2016`
        * - Ligand identifier (PDB code)
        - :code:`protein['protein']['ligands_id']`
        - :code:`IEE`
        * - Ligand SMILES 
        - :code:`protein['protein']['ligand_smiles']`
        - :code:`'Cc1ccc(CNc2cc(Cl)nc(N)n2)cc1'`
        * - Molecular ingerprints
        - :code:`protein['protein']['fp_maccs']`, :code:`protein['protein']['fp_morgan_r2']`
        - :code:`'[..,0, 0, 1, 0, 1, 0, 0, 0,..]`
        * - Molecular ingerprints
        - :code:`protein['protein']['fp_maccs']`, :code:`protein['protein']['fp_morgan_r2']`
        - :code:`'[..,0, 0, 1, 0, 1, 0, 0, 0,..]`
        * - Binding site (1 if in binding site, 0 else)
        - :code:`protein['residue']['binding_site']`
        - :code:`'[..,0, 0, 1, 0, 1, 0, 0, 0,..]`
    """

    description = ''

    def __init__(self, version='2020', **kwargs):
        self.version = version
        super().__init__(**kwargs)

    def get_raw_files(self):
        return glob.glob(f'{self.root}/raw/files/*_protein.pdb')[:self.limit]

    def get_id_from_filename(self, filename):
        return filename[:4]

    def affinity_parse(self, s):
        """ Parse the affinity string. e.g. `Kd=30uM`.
        Parameters
        ----------
        s: str
            Affinity measurement string to parse.
        Returns
        -------
        dict
            Dictionary containing parsed affinity information. `value` key stores
            the float value of the measurement. `operator` is the logical operator
            (e.g. `=`, `>`) applied to the value, `unit` is `uM, nM, pM` and
            `measure` is the type experimental measurement (e.g. `Kd, Ki, IC50`)
        """
        operator = "".join(re.findall(r"[=|<|>|~]", s))
        measures = ['Kd', 'Ki', 'IC50']
        for m in measures:
            if s.startswith(m):
                measure = m
                break
        value = float(re.search(r"\d+[.,]?\d*", s).group())
        unit = re.search(r"[m|u|n|f|p]M", s).group()

        return {'operator': operator,
                'measure': measure,
                'value': value,
                'unit': unit
                }
def parse_pdbbind_PL_index(self, index_path): """ > INDEX_refined_data.2020 # ============================================================================== # List of the protein-ligand complexes in the PDBbind refined set v.2020 # 5316 protein-ligand complexes in total, which are ranked by binding data # Latest update: July 2021 # PDB code, resolution, release year, -logKd/Ki, Kd/Ki, reference, ligand name # ============================================================================== 2r58 2.00 2007 2.00 Kd=10mM // 2r58.pdf (MLY) 3c2f 2.35 2008 2.00 Kd=10.1mM // 3c2f.pdf (PRP) 3g2y 1.31 2009 2.00 Ki=10mM // 3g2y.pdf (GF4) 3pce 2.06 1998 2.00 Ki=10mM // 3pce.pdf (3HP) 4qsu 1.90 2014 2.00 Kd=10mM // 4qsu.pdf (TDR) 4qsv 1.90 2014 2.00 Kd=10mM // 4qsv.pdf (THM) """ data = {} with open(index_path, 'r') as ind_file: for line in ind_file: if line.startswith("#"): continue pre, post = line.split("//") pdbid, res, date, neglog, kd = pre.split() kd = self.affinity_parse(kd) lig_id = post.split("(")[1].rstrip(")") # remove peptide ligands if lig_id.endswith('-mer'): continue data[pdbid] = { 'resolution': float(res), 'date': int(date), 'kd': kd, 'neglog_aff': float(neglog), 'ligand_id': lig_id } return data