# -*- coding: utf-8 -*-
import glob
import os
import re
import os.path as osp
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem
import numpy as np
from proteinshake.datasets import Dataset
from proteinshake.utils import extract_tar, download_url
# Module-level side effect: switch off RDKit log output for every logger
# matching 'rdApp.*' as soon as this module is imported.
RDLogger.DisableLog('rdApp.*') # disable warnings
class ProteinLigandInterfaceDataset(Dataset):
""" Proteins bound to small molecules from PDBBind with annotated binding site, ligand and affinity information.
.. admonition:: Please cite
Wang, Renxiao, et al. "The PDBbind database: Collection of binding affinities for protein−ligand complexes with known three-dimensional structures." Journal of medicinal chemistry 47.12 (2004): 2977-2980.
.. admonition:: Source
Raw data was obtained and modified with permission from `PDBbind-CN <http://www.pdbbind.org.cn/>`_, originally licensed under the `End User Agreement for Access to the PDBbind-CN Database and Web Site <http://www.pdbbind.org.cn/enroll.php>`_.
Parameters
----------
root: str
Root directory where the dataset should be saved.
name: str
The name of the dataset.
version: str
PDBBind version to use.
.. list-table:: Dataset stats
:widths: 100
:header-rows: 1
* - # proteins
* - 4642
.. list-table:: Annotations
:widths: 20 55 25
:header-rows: 1
* - Attribute
- Key
- Sample value
* - Dissociation constant (kd)
- :code:`protein['protein']['kd']`
- :code:`77.0`
* - Affinity
- :code:`protein['protein']['neglog_aff']`
- :code:`4.11000`
* - Resolution (Angstroms)
- :code:`protein['protein']['resolution']`
- :code:`2.20`
* - Year solved
- :code:`protein['protein']['year']`
- :code:`2016`
* - Ligand identifier (PDB code)
- :code:`protein['protein']['ligands_id']`
- :code:`IEE`
* - Ligand SMILES
- :code:`protein['protein']['ligand_smiles']`
- :code:`'Cc1ccc(CNc2cc(Cl)nc(N)n2)cc1'`
* - Molecular fingerprints
- :code:`protein['protein']['fp_maccs']`, :code:`protein['protein']['fp_morgan_r2']`
- :code:`[..,0, 0, 1, 0, 1, 0, 0, 0,..]`
* - Molecular fingerprints
- :code:`protein['protein']['fp_maccs']`, :code:`protein['protein']['fp_morgan_r2']`
- :code:`[..,0, 0, 1, 0, 1, 0, 0, 0,..]`
* - Binding site (1 if in binding site, 0 else)
- :code:`protein['residue']['binding_site']`
- :code:`[..,0, 0, 1, 0, 1, 0, 0, 0,..]`
"""
# Class-level description string, left empty for this dataset.
# NOTE(review): presumably read by the base `Dataset` class or by the
# documentation tooling — confirm where it is consumed.
description = ''
def __init__(self, version='2020', **kwargs):
    """ Store the PDBBind version, then run the base `Dataset` initialiser.

    Parameters
    ----------
    version: str
        PDBBind version to use.
    kwargs:
        Forwarded unchanged to the parent class constructor.
    """
    # NOTE(review): assigned before super().__init__() — presumably the base
    # initialiser already relies on self.version; confirm before reordering.
    self.version = version
    super().__init__(**kwargs)
def get_raw_files(self):
    """ List the raw protein PDB files of the dataset.

    Returns
    -------
    list
        Paths matching `{self.root}/raw/files/*_protein.pdb`, sorted
        lexicographically and truncated to the first `self.limit` entries
        (every match is returned when `self.limit` is `None`).
    """
    # glob yields matches in arbitrary, filesystem-dependent order. Sorting
    # before slicing makes `limit` select the same subset on every machine
    # and on every run.
    matches = sorted(glob.glob(f'{self.root}/raw/files/*_protein.pdb'))
    return matches[:self.limit]
def get_id_from_filename(self, filename):
    """ Extract the protein identifier from a raw file name.

    Parameters
    ----------
    filename: str
        File name such as `1abc_protein.pdb`. A path with leading
        directories is accepted as well.

    Returns
    -------
    str
        The first four characters of the file name.
    """
    # Drop any directory part first, so that a full path (as produced by
    # `get_raw_files`) yields the identifier and not the start of the path.
    return osp.basename(filename)[:4]
def affinity_parse(self, s):
    """ Parse an affinity string, e.g. `Kd=30uM` or `IC50<=1.5nM`.

    Parameters
    ----------
    s: str
        Affinity measurement string to parse. It must start with the type
        of measurement (`Kd`, `Ki` or `IC50`).

    Returns
    -------
    dict
        Dictionary containing parsed affinity information. `value` key stores
        the float value of the measurement. `operator` is the comparison
        operator (e.g. `=`, `>`, `<=`, `~`) applied to the value, `unit` is
        one of `mM, uM, nM, pM, fM` and `measure` is the type of experimental
        measurement (`Kd`, `Ki` or `IC50`).

    Raises
    ------
    ValueError
        If `s` does not start with a known measure, or if no numeric value
        or no unit can be found after the measure name.
    """
    measure = next((m for m in ('Kd', 'Ki', 'IC50') if s.startswith(m)), None)
    if measure is None:
        raise ValueError(
            f"Cannot parse affinity {s!r}: expected it to start with Kd, Ki or IC50"
        )

    # Only inspect what follows the measure name. Searching the whole string
    # would pick up the digits of `IC50` itself as the measured value.
    remainder = s[len(measure):]

    # Inside a character class `|` is a literal, so it is not listed here.
    operator = "".join(re.findall(r"[=<>~]", remainder))
    number = re.search(r"\d+[.,]?\d*", remainder)
    unit = re.search(r"[munfp]M", remainder)
    if number is None or unit is None:
        raise ValueError(
            f"Cannot parse affinity {s!r}: missing numeric value or unit"
        )

    # A comma-separated number is matched but rejected by float() with a
    # ValueError: whether the comma is a decimal or a thousands separator is
    # ambiguous, so it is deliberately not guessed.
    return {'operator': operator,
            'measure': measure,
            'value': float(number.group()),
            'unit': unit.group()
            }