Source code for scripts.data.preprocessing.parseEtab

""" Functions to parse :code:`.etab` files."""
import argparse
import pickle

import numpy as np

from terminator.utils.common import AA_to_int


[docs]def parseEtab(filename, save=True):
    """Given an etab file, parse the corresponding Potts model parameters.

    This assumes that full chain protein etabs are provided, so it is not designed to
    work with etabs computed on partial chains.

    Args
    ====
    filename : str
        path to :code:`.etab` file
    save ; bool, default=True
        whether or not to save the outputs to a file

    Returns
    =======
    potts_dict : dict
        Sparse representation of etab that maps pairs of
        (residue1_id : int, residue2_id : int) to 22 x 22 matrices representing
        pair energies. Self energies are incorperated into the diagonals of
        self-interaction matricies, with key (residue_id, residue_id). The dimension is 22 because
        0 is used as a padding index and 21 is used to represent X for unknown residues.
    potts_selfE : np.ndarray
        Numpy array specifying self energies.
        Shape: all_chains_length x 22.
    potts : np.ndarray
        Dense representation of potts model mapping pairs of residue ids
        (the first two dimensions) to 22 x 22 matricies representing pair
        energies. Self energies are incorperated into the diagonals of
        self-interactions matricies. Edges are also duplicated e.g. interaction energies
        between residues 1 and 2 can be found at both :code`potts[1][2]`
        and :code:`potts[2][1]`.
        Shape: all_chains_length x all_chains_length x 22 x 22

    """

    if filename.split('.')[-1] != 'etab':
        raise ValueError('Input file is not an etab file!')

    selfE = []
    pairE = []
    id_to_resid = {}
    fp = open(filename, 'r')
    # this loop requires that all self energies
    # occur before pair energies
    for idx, line in enumerate(fp):
        l = line.strip().split(' ')

        # self energies
        if len(l) == 3:
            id = l[0]
            resid = idx // 20
            id_to_resid[id] = resid

            residue = AA_to_int[l[1]]
            E = float(l[2])

            selfE.append({'resid': resid, 'residue': residue, 'E': E})

        # couplings
        elif len(l) == 5:
            id0 = l[0]
            id1 = l[1]
            resid0 = id_to_resid[id0]
            resid1 = id_to_resid[id1]

            residue0 = AA_to_int[l[2]]
            residue1 = AA_to_int[l[3]]
            E = float(l[4])

            pairE.append({'resid0': resid0, 'resid1': resid1, 'residue0': residue0, 'residue1': residue1, 'E': E})
        else:
            raise ValueError("Something doesn't look right at line %d: %s" % (idx, line))

    fp.close()

    # number of amino acids in all chains
    L = max(id_to_resid.values()) + 1

    potts_selfE = np.zeros((L, 22))
    potts = np.zeros((L, L, 22, 22))

    potts_dict = {}

    for data in selfE:
        resid = data['resid']
        residue = data['residue']

        # self E
        potts_selfE[resid][residue] = data['E']
        # dense potts
        slice = potts[resid][resid]
        slice[residue][residue] = data['E']
        # sparse potts
        key = (resid, resid)
        idx = residue * 22 + residue
        if key not in potts_dict.keys():
            potts_dict[key] = np.zeros(22 * 22)
        potts_dict[key][idx] = data['E']

    for data in pairE:
        resid0 = data['resid0']
        resid1 = data['resid1']
        residue0 = data['residue0']
        residue1 = data['residue1']
        # dense potts
        slice0 = potts[resid0][resid1]
        slice0[residue0][residue1] = data['E']
        slice1 = potts[resid1][resid0]
        slice1[residue1][residue0] = data['E']

        # sparse potts
        key1 = (resid0, resid1)
        idx1 = residue0 * 22 + residue1
        if key1 not in potts_dict.keys():
            potts_dict[key1] = np.zeros(22 * 22)
        potts_dict[key1][idx1] = data['E']

        key2 = (resid1, resid0)
        idx2 = residue1 * 22 + residue0
        if key2 not in potts_dict.keys():
            potts_dict[key2] = np.zeros(22 * 22)
        potts_dict[key2][idx2] = data['E']

    if save:
        np.save(filename, potts)
        np.save(filename[:-5] + '_selfE.etab', potts_selfE)
        with open('potts_dict.etab') as fp:
            pickle.dump(potts_dict, fp)

    # testing that the values in the potts parameters is correct
    """
    np.set_printoptions(threshold=np.inf, precision=2, linewidth=300)
    print(potts[0][0])
    """

    return potts_dict, potts_selfE, potts