Source code for scripts.data.preprocessing.parseEtab

""" Functions to parse :code:`.etab` files."""
import argparse
import pickle

import numpy as np

from terminator.utils.common import AA_to_int


[docs]def parseEtab(filename, save=True): """Given an etab file, parse the corresponding Potts model parameters. This assumes that full chain protein etabs are provided, so it is not designed to work with etabs computed on partial chains. Args ==== filename : str path to :code:`.etab` file save ; bool, default=True whether or not to save the outputs to a file Returns ======= potts_dict : dict Sparse representation of etab that maps pairs of (residue1_id : int, residue2_id : int) to 22 x 22 matrices representing pair energies. Self energies are incorperated into the diagonals of self-interaction matricies, with key (residue_id, residue_id). The dimension is 22 because 0 is used as a padding index and 21 is used to represent X for unknown residues. potts_selfE : np.ndarray Numpy array specifying self energies. Shape: all_chains_length x 22. potts : np.ndarray Dense representation of potts model mapping pairs of residue ids (the first two dimensions) to 22 x 22 matricies representing pair energies. Self energies are incorperated into the diagonals of self-interactions matricies. Edges are also duplicated e.g. interaction energies between residues 1 and 2 can be found at both :code`potts[1][2]` and :code:`potts[2][1]`. Shape: all_chains_length x all_chains_length x 22 x 22 """ if filename.split('.')[-1] != 'etab': raise ValueError('Input file is not an etab file!') selfE = [] pairE = [] id_to_resid = {} fp = open(filename, 'r') # this loop requires that all self energies # occur before pair energies for idx, line in enumerate(fp): l = line.strip().split(' ') # self energies if len(l) == 3: id = l[0] resid = idx // 20 id_to_resid[id] = resid residue = AA_to_int[l[1]] E = float(l[2]) selfE.append({'resid': resid, 'residue': residue, 'E': E}) # couplings elif len(l) == 5: id0 = l[0] id1 = l[1] resid0 = id_to_resid[id0] resid1 = id_to_resid[id1] residue0 = AA_to_int[l[2]] residue1 = AA_to_int[l[3]] E = float(l[4]) pairE.append({'resid0': resid0, 'resid1': resid1, 'residue0': residue0, 'residue1': residue1, 'E': E}) else: raise ValueError("Something doesn't look right at line %d: %s" % (idx, line)) fp.close() # number of amino acids in all chains L = max(id_to_resid.values()) + 1 potts_selfE = np.zeros((L, 22)) potts = np.zeros((L, L, 22, 22)) potts_dict = {} for data in selfE: resid = data['resid'] residue = data['residue'] # self E potts_selfE[resid][residue] = data['E'] # dense potts slice = potts[resid][resid] slice[residue][residue] = data['E'] # sparse potts key = (resid, resid) idx = residue * 22 + residue if key not in potts_dict.keys(): potts_dict[key] = np.zeros(22 * 22) potts_dict[key][idx] = data['E'] for data in pairE: resid0 = data['resid0'] resid1 = data['resid1'] residue0 = data['residue0'] residue1 = data['residue1'] # dense potts slice0 = potts[resid0][resid1] slice0[residue0][residue1] = data['E'] slice1 = potts[resid1][resid0] slice1[residue1][residue0] = data['E'] # sparse potts key1 = (resid0, resid1) idx1 = residue0 * 22 + residue1 if key1 not in potts_dict.keys(): potts_dict[key1] = np.zeros(22 * 22) potts_dict[key1][idx1] = data['E'] key2 = (resid1, resid0) idx2 = residue1 * 22 + residue0 if key2 not in potts_dict.keys(): potts_dict[key2] = np.zeros(22 * 22) potts_dict[key2][idx2] = data['E'] if save: np.save(filename, potts) np.save(filename[:-5] + '_selfE.etab', potts_selfE) with open('potts_dict.etab') as fp: pickle.dump(potts_dict, fp) # testing that the values in the potts parameters is correct """ np.set_printoptions(threshold=np.inf, precision=2, linewidth=300) print(potts[0][0]) """ return potts_dict, potts_selfE, potts