Source code for proteinshake.datasets.protein_ligand_decoys
# -*- coding: utf-8 -*-
import glob
import os
from biopandas.pdb import PandasPdb
from unittest.mock import patch
from proteinshake.datasets.dataset import Dataset, AA_THREE_TO_ONE
from proteinshake.utils import download_url, progressbar
# Residue-name lookup used while parsing the receptor PDB files: the standard
# three-letter -> one-letter table plus five extra residue names, each mapped
# to cysteine ('C') or histidine ('H').
# NOTE(review): these look like force-field style variant names for CYS/HIS
# found in the DUDE-Z receptor files -- confirm against the raw data.
EXTENDED_AA_THREE_TO_ONE = dict(
    AA_THREE_TO_ONE,
    CYZ='C',
    CYX='C',
    HIP='H',
    HID='H',
    HIE='H',
)
class ProteinLigandDecoysDataset(Dataset):
""" Proteins (targets) from DUDE-Z with annotated ligands and decoys.
Each molecule is encoded as a SMILES string, meant to be used in a virtual screen setting.
In this setting a model is given a protein and a ligand and outputs a score reflecting the likelihood that the given molecule is a binder.
Then, this score is used to sort the union of all the ligands and decoys.
A good model places true ligands at the top of this list.
This is known as enrichment factor analysis.
.. admonition:: Please cite
Stein, Reed M et al. “Property-Unmatched Decoys in Docking Benchmarks.” Journal of chemical information and modeling vol. 61,2 (2021): 699-714. doi:10.1021/acs.jcim.0c00598
.. admonition:: Source
Raw data was obtained and modified from `DUDE-Z <https://dudez.docking.org/>`_.
.. list-table:: Dataset stats
:widths: 100
:header-rows: 1
* - # proteins
* - 38
.. list-table:: Annotations
:widths: 25 35 45
:header-rows: 1
* - Attribute
- Key
- Sample value
* - Non-binders SMILES
- :code:`protein['protein']['decoys_smiles']`
- :code:`['O=C(CSc1nnc(COc2ccccc2)o1)NC1CCCCC1', 'C[N@H+]1CC[C@@](N)(C(=O)NC[C@@H]2CC[C@@H](C[NH3+])CC2)C1',..]`
* - Non-binders identifiers
- :code:`protein['protein']['decoys_ids']`
- :code:`['ZINC000000087599', 'ZINC000648138664',..]`
* - Binders SMILES
- :code:`protein['protein']['ligands_smiles']`
- :code:`['CC1=CC2=C(NC(=O)[C@H](CC3CC3)C2)C(=O)N1CC(=O)NCC1=CC=C(N)N=C1C', 'ClC1=CC=CC(CC2=NC3=C(NCCC4CCCC[NH2+]4)N=CC=C3O2)=C1',..]`
* - Binders identifiers
- :code:`protein['protein']['ligands_ids']`
- :code:`['CHEMBL10785', 'CHEMBL439678', 'CHEMBL278985',..]`
* - Pfam accession code
- :code:`protein['protein']['Pfam']`
- ``['PF00102']``
"""
description = 'Proteins with ligands and decoys'
def pdb2df(self, path):
    """Parse a PDB file with the parent parser, using the extended residue table.

    For the duration of the parent ``pdb2df`` call, the ``AA_THREE_TO_ONE``
    name in ``proteinshake.datasets.dataset`` is swapped for
    ``EXTENDED_AA_THREE_TO_ONE``; the original object is put back when the
    call finishes, whether it returns or raises.

    Args:
        path: Forwarded unchanged to the parent ``pdb2df``.

    Returns:
        Whatever the parent ``pdb2df`` returns.
    """
    residue_table_patch = patch(
        'proteinshake.datasets.dataset.AA_THREE_TO_ONE',
        EXTENDED_AA_THREE_TO_ONE,
    )
    with residue_table_patch:
        return super().pdb2df(path)
def get_raw_files(self):
    """Return the paths of the downloaded receptor PDB files.

    Looks for ``*.pdb`` files in ``{self.root}/raw/files`` and keeps at most
    ``self.limit`` of them (``None`` keeps all).

    Returns:
        list[str]: Matching paths in lexicographic order.
    """
    pdb_paths = glob.glob(f'{self.root}/raw/files/*.pdb')
    # glob makes no ordering guarantee; sort before truncating so the subset
    # selected by ``limit`` is the same on every machine and every run.
    pdb_paths.sort()
    return pdb_paths[:self.limit]
def get_id_from_filename(self, filename):
    """Derive the protein identifier from a raw file name.

    The identifier is the part of the file name before its first dot, e.g.
    ``'AA2AR.pdb'`` -> ``'AA2AR'``.

    Args:
        filename (str): A file name, optionally preceded by directories.

    Returns:
        str: The file name up to (not including) its first ``.``.
    """
    # Drop any directory part first: a dot inside a directory name (or a
    # leading './') must not be mistaken for the extension separator.
    base_name = os.path.basename(filename)
    return base_name.split(".")[0]
def download(self):
    """Download the receptor, ligand and decoy files for every listed target.

    For each target, three files are fetched from ``dudez.docking.org`` into
    ``{self.root}/raw/files/``. The remote file names are the same for every
    target (``rec.crg.pdb``, ``ligands.smi``, ``decoys.smi``), so each file
    is moved to a target-specific name right after it is downloaded:

    * ``{target_id}.pdb``         -- receptor structure
    * ``ligands_{target_id}.smi`` -- ligands
    * ``decoys_{target_id}.smi``  -- decoys
    """
    targets = ['AA2AR', 'ABL1', 'ACES', 'ADA', 'ADRB2', 'AMPC', 'ANDR', 'CSF1R', 'CXCR4', 'DEF', 'DRD4', 'EGFR', 'FA7', 'FA10', 'FABP4', 'FGFR1', 'FKB1A', 'GLCM', 'HDAC8', 'HIVPR', 'HMDH', 'HS90A', 'ITAL', 'KITH', 'KIT', 'LCK', 'MAPK2', 'MK01', 'MT1', 'NRAM', 'PARP1', 'PLK1', 'PPARA', 'PTN1', 'PUR2', 'RENI', 'ROCK1', 'SRC', 'THRB', 'TRY1', 'TRYB1', 'UROK', 'XIAP']
    raw_dir = f'{self.root}/raw/files'
    for target_id in progressbar(targets, desc='Downloading', verbosity=self.verbosity):
        # (remote URL, name the download lands under, target-specific final name)
        remote_files = (
            # receptor
            (f"https://dudez.docking.org/DOCKING_GRIDS_AND_POSES/{target_id}/rec.crg.pdb",
             'rec.crg.pdb', f'{target_id}.pdb'),
            # ligands
            (f"https://dudez.docking.org/property_matched/{target_id}_new_DUDE_1/ligands.smi",
             'ligands.smi', f'ligands_{target_id}.smi'),
            # decoys
            (f"https://dudez.docking.org/property_matched/{target_id}_new_DUDE_1/decoys.smi",
             'decoys.smi', f'decoys_{target_id}.smi'),
        )
        for url, downloaded_name, final_name in remote_files:
            download_url(url, f'{raw_dir}/', verbosity=0)
            # os.replace overwrites an existing destination on every platform;
            # os.rename raises on Windows if the target file is already there
            # (e.g. when download() is re-run after a partial download).
            os.replace(f'{raw_dir}/{downloaded_name}', f'{raw_dir}/{final_name}')