# Source code for proteinshake.utils.embeddings
"""
Basic functions for embedding protein sequences, meant to be supplied to a representation class.
Each function takes a sequence as input and returns a numpy array: `onehot` returns an array of shape n x d, `tokenize` returns an array of n integer indices.
"""
import numpy as np
# Residue vocabulary: the position of a one-letter code in this string is the
# index that `onehot` and `tokenize` assign to that residue.
residue_alphabet = 'ARNDCEQGHILKMFPSTWYV'
# Atom vocabulary: at atom resolution only the first character of each
# sequence element is looked up in this string.
atom_alphabet = 'NCOSH'
def onehot(sequence, resolution='residue'):
    """ Compute the one-hot encoding of a protein sequence.

    Parameters
    ----------
    sequence: str
        The protein sequence. At residue resolution each element is looked up
        in ``residue_alphabet``; otherwise the first character of each element
        is looked up in ``atom_alphabet``.
    resolution: str, default 'residue'
        Resolution of the protein. 'residue' or 'atom'. Any value other than
        'residue' is treated as atom resolution.

    Returns
    -------
    ndarray
        Array of shape ``(n, d)`` with one row per sequence element and one
        column per alphabet symbol. An empty sequence yields shape ``(0, d)``.

    Raises
    ------
    ValueError
        If an element is not found in the selected alphabet.
    """
    if resolution == 'residue':
        alphabet = residue_alphabet
        indices = [alphabet.index(aa) for aa in sequence]
    else:
        alphabet = atom_alphabet
        indices = [alphabet.index(aa[0]) for aa in sequence]
    # Build the identity matrix once and select rows with a single integer-array
    # index. The explicit integer dtype keeps the index valid when `indices` is
    # empty, so an empty sequence gives a (0, d) array instead of an error.
    return np.eye(len(alphabet))[np.asarray(indices, dtype=int)]
def tokenize(sequence, resolution='residue'):
    """ Tokenizes the sequence.

    Parameters
    ----------
    sequence: str
        The protein sequence. At residue resolution each element is looked up
        in ``residue_alphabet``; otherwise the first character of each element
        is looked up in ``atom_alphabet``.
    resolution: str, default 'residue'
        Resolution of the protein. 'residue' or 'atom'. Any value other than
        'residue' is treated as atom resolution.

    Returns
    -------
    ndarray
        1-D integer array holding the alphabet index of each sequence element.
        An empty sequence yields an empty integer array.

    Raises
    ------
    ValueError
        If an element is not found in the selected alphabet.
    """
    if resolution == 'residue':
        tokens = [residue_alphabet.index(aa) for aa in sequence]
    else:
        tokens = [atom_alphabet.index(aa[0]) for aa in sequence]
    # Pin the dtype: without it an empty list becomes a float64 array, which
    # cannot be used as indices downstream.
    return np.array(tokens, dtype=int)