# Source code for proteinshake.utils.embeddings

"""
Some basic functions for embedding protein sequences. Supply to a representation class.

All embeddings take a sequence string as an input and return a numpy array: `onehot` returns shape n x d (d being the alphabet size), `tokenize` returns shape n.
"""

import numpy as np

# Residue alphabet: the 20 one-letter amino-acid codes. A character's position
# in this string is the class index used by `onehot` and `tokenize`.
residue_alphabet = 'ARNDCEQGHILKMFPSTWYV'
# Atom alphabet, used for any resolution other than 'residue'. Items are looked
# up by their first character only; presumably these are element symbols
# (N, C, O, S, H) -- confirm against the callers.
atom_alphabet = 'NCOSH'

def onehot(sequence, resolution='residue'):
    """ Compute the one-hot encoding of a protein sequence.

    Parameters
    ----------
    sequence: str
        The protein sequence. For any resolution other than 'residue', each
        item of the sequence is looked up by its first character only.
    resolution: str, default 'residue'
        Resolution of the protein. 'residue' selects the residue alphabet;
        any other value selects the atom alphabet.

    Returns
    -------
    ndarray
        Float array of shape (n, d) with one row per sequence item, where d
        is the size of the selected alphabet. An empty sequence yields an
        array of shape (0, d).

    Raises
    ------
    ValueError
        If a symbol is not contained in the selected alphabet.
    """
    if resolution == 'residue':
        alphabet = residue_alphabet
        symbols = sequence
    else:
        alphabet = atom_alphabet
        # Only the first character of each item is matched against the alphabet.
        symbols = [item[0] for item in sequence]
    # Explicit integer dtype keeps the index array valid for fancy indexing
    # even when the sequence is empty (np.array([]) would default to float).
    indices = np.array([alphabet.index(symbol) for symbol in symbols], dtype=int)
    # Build the identity matrix once and select rows, instead of allocating a
    # fresh d x d matrix for every symbol.
    return np.eye(len(alphabet))[indices]
def tokenize(sequence, resolution='residue'):
    """ Tokenizes the sequence.

    Parameters
    ----------
    sequence: str
        The protein sequence. For any resolution other than 'residue', each
        item of the sequence is looked up by its first character only.
    resolution: str, default 'residue'
        Resolution of the protein. 'residue' selects the residue alphabet;
        any other value selects the atom alphabet.

    Returns
    -------
    ndarray
        Integer array of shape (n,) holding, for each sequence item, its
        index in the selected alphabet. An empty sequence yields an empty
        integer array.

    Raises
    ------
    ValueError
        If a symbol is not contained in the selected alphabet.
    """
    if resolution == 'residue':
        alphabet = residue_alphabet
        symbols = sequence
    else:
        alphabet = atom_alphabet
        # Only the first character of each item is matched against the alphabet.
        symbols = [item[0] for item in sequence]
    # Explicit integer dtype: without it an empty sequence would produce a
    # float64 array, which is unusable as token indices.
    return np.array([alphabet.index(symbol) for symbol in symbols], dtype=int)