# Source code for proteinshake.datasets.scop
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm
from proteinshake.datasets import RCSBDataset
from proteinshake.utils import download_url, progressbar
class SCOPDataset(RCSBDataset):
    """ Proteins with annotated SCOP class.

    .. admonition:: Please cite

      Murzin, Alexey G., et al. "SCOP: a structural classification of proteins database for the investigation of sequences and structures." Journal of molecular biology 247.4 (1995): 536-540.

      Berman, H M et al. “The Protein Data Bank.” Nucleic acids research vol. 28,1 (2000): 235-42. doi:10.1093/nar/28.1.235

    .. admonition:: Source

      Raw data was obtained and modified from `RCSB Protein Data Bank <https://www.rcsb.org/>`_, originally licensed under `CC0 1.0 <https://creativecommons.org/publicdomain/zero/1.0/>`_.

    .. list-table:: Dataset stats
       :widths: 100
       :header-rows: 1

       * - # proteins
       * - 10066

    .. list-table:: Annotations
       :widths: 25 45 35
       :header-rows: 1

       * - Attribute
         - Key
         - Sample value
       * - Protein type
         - :code:`protein['protein']['SCOP-TP']`
         - :code:`'1'`
       * - Protein class
         - :code:`protein['protein']['SCOP-CL']`
         - ``'1000000'``
       * - Superfamily
         - :code:`protein['protein']['SCOP-SF']`
         - ``'3000001'``
       * - Family
         - :code:`protein['protein']['SCOP-FA']`
         - ``'4002873'``
    """

    def _parse_scop(self, path):
        """ Parse a SCOP classification file into a per-protein lookup table.

        Parameters
        ----------
        path: str
            Location of the space-separated classification file. Lines starting with `#` are skipped.

        Returns
        -------
        dict
            Maps each `FA-PDBID` value to a dict built from its `SCOPCLA` column,
            i.e. the comma-separated `KEY=VALUE` pairs split into `{KEY: VALUE}`.
            When the same ID appears on several rows, the last row wins.
        """
        names = ['FA-DOMID', 'FA-PDBID', 'FA-PDBREG', 'FA-UNIID', 'FA-UNIREG', 'SF-DOMID', 'SF-PDBID', 'SF-PDBREG', 'SF-UNIID', 'SF-UNIREG', 'SCOPCLA']
        df = pd.read_csv(path, sep=' ', comment='#', names=names, dtype=str)
        return {
            pdb_id: dict(pair.split("=") for pair in scopcla.split(","))
            for pdb_id, scopcla in zip(df['FA-PDBID'], df['SCOPCLA'])
        }

    def download(self):
        """ Download the SCOP annotations and the structure of every annotated protein.

        The classification file is saved to `{self.root}/raw/scop.txt` and its parsed
        content is stored in `self.scop`. Each annotated ID (truncated to `self.limit`)
        is then passed to `self.download_from_rcsb` using `self.n_jobs` parallel jobs.
        Any result that is not `True` is counted as a failed download and reported.
        """
        # get the annots
        download_url('http://scop.mrc-lmb.cam.ac.uk/files/scop-cla-latest.txt', f'{self.root}/raw/scop.txt')
        self.scop = self._parse_scop(f'{self.root}/raw/scop.txt')
        # `_parse_scop` returns a dict keyed by PDB ID, so the keys are already
        # the unique IDs (in order of first appearance).
        ids = list(self.scop)
        # get the proteins
        if self.n_jobs == 1:
            print('Warning: Downloading an RCSB dataset with use_precompute = False is very slow. Consider increasing n_jobs.')
        ids = ids[:self.limit] # for testing
        # NOTE(review): assumes the base dataset stores its verbosity level as
        # `self.verbosity` (the bare name `verbosity` was undefined here) -- confirm.
        failed = Parallel(n_jobs=self.n_jobs)(
            delayed(self.download_from_rcsb)(pdb_id)
            for pdb_id in progressbar(ids, desc='Downloading PDBs', verbosity=self.verbosity)
        )
        failed = [f for f in failed if f is not True]
        if failed:
            print(f'Failed to download {len(failed)} PDB files.')

    def add_protein_attributes(self, protein):
        """ We annotate the protein with the scop classifications at each level.

        SCOPCLA - SCOP domain classification. The abbreviations denote: TP=protein type, CL=protein class, CF=fold, SF=superfamily, FA=family

        Parameters
        ----------
        protein: dict
            A protein dictionary; its upper-cased `protein['protein']['ID']` is looked up in `self.scop`.

        Returns
        -------
        dict or None
            The same dictionary with one `SCOP-<level>` entry added under `protein['protein']`
            per classification level, or `None` if the ID has no SCOP annotation.
        """
        protein_id = protein['protein']['ID'].upper()
        if protein_id not in self.scop:
            return None
        for cla, val in self.scop[protein_id].items():
            protein['protein']['SCOP-' + cla] = val
        return protein