"""Functions to parse TERM data from :code:`.dat` files"""
import argparse
import json
import os
import pickle
import numpy as np
from scipy.linalg import block_diag
from terminator.utils.common import seq_to_ints
HEAD_LEN = len('* TERM ')
[docs]def parseTERMdata(filename):
"""Function that parses all relavent data from TERM data dumps.
Returns the sequence numerically encoded, the selection,
full sequence ppoe, and all TERMs found.
Args
====
filename : str
path to :code:`.dat` file
Returns
======
output : dict
Dictionary containing information about the dTERMen run e.g. sequence,
structural information, and chain lengths, as well as a list of
all data mined from TERM matches.
"""
fp = open(filename, 'r')
# parse initial PDB parameters
# PDB sequence and selected residues
seq = fp.readline().strip()
seq = seq_to_ints(seq)
selection = fp.readline().strip().split(' ')
selection = [int(i) for i in selection]
# parse phi, psi, omega, and environ vals
# keep track of chain len based on phi=999
ppoe = []
chain_lens = []
current_chain_len = 0
current_line = fp.readline()
while current_line[0] != '*':
data = current_line.strip().split(' ')
data = [float(i) for i in data]
# if phi = 999. start new chain
if data[0] == 999:
chain_lens.append(current_chain_len)
current_chain_len = 0
ppoe.append(data)
current_chain_len += 1
current_line = fp.readline()
# append last chain len
chain_lens.append(current_chain_len)
# the first chain len will always be 0, so pop that off
chain_lens.pop(0)
ppoe = np.array(ppoe)
assert sum(chain_lens) == len(seq), "sum of chain lens != total seq len"
# parse TERMs from rest of file
terms = []
while current_line != '':
term, current_line = _parseTERM(fp, current_line)
terms.append(term)
fp.close()
output = {}
output['sequence'] = seq
output['selection'] = selection
output['ppoe'] = ppoe
output['terms'] = terms
output['chain_lens'] = chain_lens
return output
def _parseTERM(fp, lastline):
"""Helper function that parses a singluar TERM within the :code:`.dat` file.
Args
====
fp : open file pointer
file which currently points to the beginning of a TERM
lastline : str
the contents of the last line that the previous fp pointed to
Returns
=======
term_dict : dict
Dictionary containing TERM match information, such as sequences, RMSD, and
structural information such as torsion angles and environment values.
current_line : str
The last line read from fp.
"""
term_dict = {}
# idx: index of TERM
term_dict['idx'] = int(lastline.strip().split(' ')[-1])
# print(term_dict['idx'])
# focus: residues in TERM
focus = fp.readline().strip().split()
focus = [int(i) for i in focus]
term_dict['focus'] = focus
term_dict['contact_idx'] = contact_idx(focus)
focus_len = len(focus)
# parse each individual structure match, append to term
term_labels = []
term_rmsds = []
term_ppoe = []
current_line = fp.readline().strip()
while current_line != '' and current_line[0] != '*':
data = current_line.split(' ')
label, rmsd, ppoe = seq_to_ints(data[0]), float(data[1]), [float(i) for i in data[2:]]
ppoe = np.array(ppoe).reshape((4, focus_len))
term_labels.append(label)
term_rmsds.append(rmsd)
term_ppoe.append(ppoe)
current_line = fp.readline().strip()
# reshape as numpy arrays
term_dict['labels'] = np.concatenate([term_labels])
term_dict['rmsds'] = np.concatenate([term_rmsds])
term_dict['ppoe'] = np.concatenate([term_ppoe])
return term_dict, current_line