Source code for scripts.data.preprocessing.parseTERM

"""Functions to parse TERM data from :code:`.dat` files"""
import argparse
import json
import os
import pickle

import numpy as np
from scipy.linalg import block_diag

from terminator.utils.common import seq_to_ints

HEAD_LEN = len('* TERM ')


def parseTERMdata(filename):
    """Parse all relevant data from a TERM data dump.

    Reads the header of the :code:`.dat` file (sequence, selection and
    per-residue structural values) and then every TERM record that follows.

    Args
    ====
    filename : str
        path to :code:`.dat` file

    Returns
    =======
    output : dict
        Dictionary with the following keys:

        - ``'sequence'``: the first line of the file passed through
          :code:`seq_to_ints`
        - ``'selection'``: list of int parsed from the second line
        - ``'ppoe'``: :code:`np.ndarray` built from the per-residue rows of
          phi, psi, omega, and environment values
        - ``'terms'``: list of dicts, one per TERM, as returned by
          :code:`_parseTERM`
        - ``'chain_lens'``: list of int, number of residue rows per chain

    Raises
    ======
    AssertionError
        If the chain lengths do not sum to the sequence length.
    """
    # the context manager guarantees the file is closed even if parsing fails
    with open(filename, 'r') as fp:
        # parse initial PDB parameters
        # PDB sequence and selected residues
        seq = seq_to_ints(fp.readline().strip())
        selection = [int(i) for i in fp.readline().strip().split(' ')]

        # parse phi, psi, omega, and environ vals
        # keep track of chain len based on phi=999
        ppoe = []
        chain_lens = []
        current_chain_len = 0
        current_line = fp.readline()
        # readline() returns '' at EOF; stop there instead of indexing into
        # an empty string when the file contains no TERM section
        while current_line and not current_line.startswith('*'):
            data = [float(i) for i in current_line.strip().split(' ')]
            # if phi = 999. start new chain
            if data[0] == 999:
                chain_lens.append(current_chain_len)
                current_chain_len = 0
            ppoe.append(data)
            current_chain_len += 1
            current_line = fp.readline()

        # append last chain len
        chain_lens.append(current_chain_len)
        # the first chain len will always be 0, so pop that off
        chain_lens.pop(0)
        ppoe = np.array(ppoe)
        assert sum(chain_lens) == len(seq), "sum of chain lens != total seq len"

        # parse TERMs from rest of file; _parseTERM hands back the line it
        # stopped on, which is '' once the file is exhausted
        terms = []
        while current_line != '':
            term, current_line = _parseTERM(fp, current_line)
            terms.append(term)

    output = {}
    output['sequence'] = seq
    output['selection'] = selection
    output['ppoe'] = ppoe
    output['terms'] = terms
    output['chain_lens'] = chain_lens
    return output
def _parseTERM(fp, lastline):
    """Helper function that parses a single TERM record from the :code:`.dat` file.

    Args
    ====
    fp : open file pointer
        file positioned on the line right after a TERM header line
    lastline : str
        the TERM header line that was already consumed from :code:`fp`;
        its last space-separated token is read as the TERM index

    Returns
    =======
    term_dict : dict
        Dictionary with keys ``'idx'``, ``'focus'``, ``'contact_idx'``,
        ``'labels'``, ``'rmsds'`` and ``'ppoe'`` describing the TERM and
        all of its matches.
    current_line : str
        The last (stripped) line read from :code:`fp`: either the next
        header line starting with ``*`` or an empty string.
    """
    # idx: index of TERM, taken from the end of the header line
    header_tokens = lastline.strip().split(' ')
    term_idx = int(header_tokens[-1])

    # focus: residues in TERM, plus their offsets relative to the TERM center
    focus = [int(token) for token in fp.readline().strip().split()]
    offsets = contact_idx(focus)
    n_focus = len(focus)

    # one entry per structure match
    match_labels = []
    match_rmsds = []
    match_ppoes = []

    line = fp.readline().strip()
    while line and not line.startswith('*'):
        fields = line.split(' ')
        label = seq_to_ints(fields[0])
        rmsd = float(fields[1])
        values = [float(value) for value in fields[2:]]
        # the flat list of values is laid out as 4 rows of focus-length
        grid = np.array(values).reshape((4, n_focus))

        match_labels.append(label)
        match_rmsds.append(rmsd)
        match_ppoes.append(grid)

        line = fp.readline().strip()

    # stack the per-match data into numpy arrays
    term_dict = {
        'idx': term_idx,
        'focus': focus,
        'contact_idx': offsets,
        'labels': np.concatenate([match_labels]),
        'rmsds': np.concatenate([match_rmsds]),
        'ppoe': np.concatenate([match_ppoes]),
    }
    return term_dict, line
def contact_idx(focus):
    """Assign an index per TERM residue relative to the center of its segment.

    The central element gets 0; indices increase going N->C and decrease
    going C->N. A focus whose residues span a contiguous range is treated
    as a single segment (first order TERM). Otherwise the focus is split at
    each gap in residue numbering and every segment is indexed
    independently (second order TERM).

    For an odd-length segment the middle residue is 0. For an even-length
    segment both middle residues are 0.

    Args
    ====
    focus : list of int
        List of ints representing global indices (e.g. within the protein)
        for residues in a TERM.

    Returns
    =======
    list of int
        Context indices derived from the focus, same length as
        :code:`focus`. An empty focus yields an empty list.
    """
    indices = []
    remaining = list(focus)
    # an empty focus skips the loop entirely instead of indexing focus[-1]
    while remaining:
        length = len(remaining)
        if remaining[-1] - remaining[0] + 1 == length:
            # the rest of the focus is one contiguous segment
            segment_len = length
        else:
            # split at the first gap; everything before it is contiguous
            segment_len = next(
                i for i in range(1, length)
                if remaining[i] - remaining[i - 1] != 1
            )

        half = segment_len // 2
        if segment_len % 2 == 1:
            # odd: a single center element gets 0
            indices.extend(range(-half, half + 1))
        else:
            # even: both center elements get 0
            indices.extend(range(-half + 1, 1))
            indices.extend(range(half))

        remaining = remaining[segment_len:]
    return indices