Source code for proteinshake.datasets.protein_family

import json

from proteinshake.datasets import RCSBDataset

class ProteinFamilyDataset(RCSBDataset):
    """ Proteins with annotated protein families (Pfam).
    Each protein in the dataset has a `Pfam` attribute which stores the list of protein families.

    .. admonition:: Please cite

      Berman, H M et al. “The Protein Data Bank.” Nucleic acids research vol. 28,1 (2000): 235-42. doi:10.1093/nar/28.1.235

    .. admonition:: Source

      Raw data was obtained and modified from `RCSB Protein Data Bank <https://www.rcsb.org/>`_, originally licensed under `CC0 1.0 <https://creativecommons.org/publicdomain/zero/1.0/>`_.



    .. list-table:: Dataset stats
       :widths: 100
       :header-rows: 1

       * - # proteins
       * - 31109


   .. list-table:: Annotations
      :widths: 25 35 45
      :header-rows: 1

      * - Attribute
        - Key
        - Sample value
      * - Pfam accession code
        - :code:`protein['protein']['Pfam']`
        - `['PF00102']`

    """

    description = 'Protein Families'

    def __init__(self, pfam_version='34.0', query=[['rcsb_polymer_entity_annotation.type','exact_match','Pfam']], **kwargs):
        self.pfam_version = pfam_version
        super().__init__(query=query, **kwargs)

    def add_protein_attributes(self, protein):
        with open(f'{self.root}/raw/files/{protein["protein"]["ID"]}.annot.json','r') as file:
            annot = json.load(file)
        pfams = []
        for a in annot['rcsb_polymer_entity_annotation']:
            if a['type'] == 'Pfam':
                # pfams.append(a['name'])
                pfams.append(a['annotation_id'])
        protein['protein']['Pfam'] = pfams
        return protein
def describe(self): desc = super().describe() desc['property'] = "Protein Family (Pfam)" desc['values'] = f"{len(set((p['Pfam'][0] for p in self.proteins)))} (root)" desc['type'] = 'Categorical, Hierarchical' return desc