# Source code for scripts.data.preprocessing.packageTensors

"""Functions that generate features from dTERMen files :code:`.dat` and :code:`.red.pdb`"""
import os
import pickle
import sys

import numpy as np

from terminator.utils.common import seq_to_ints

# for autosummary import purposes
sys.path.insert(0, os.path.dirname(__file__))
from parseCoords import parseCoords
from parseTERM import parseTERMdata

# Size of the residue alphabet used for sequence labels (the unknown residue X is counted).
NUM_AA = 21  # including X
# Tiny positive stand-in for a true zero: a literal 0 is reserved for padding, so
# values that are genuinely zero (e.g. the RMSD given to a dummy TERM) use this instead.
ZERO = 1e-10  # 0 is used for padding


def dumpTrainingTensors(in_path, out_path=None, cutoff=1000, save=True, coords_only=False, dummy_terms=None):
    """Generate features from dTERMen :code:`.dat` and :code:`.red.pdb`, and dump the output into a file if requested.

    Args
    ----
    in_path : str
        Prefix to :code:`.dat` and :code:`.red.pdb` files
    out_path : str or None
        Prefix to the output :code:`.features` and :code:`.length`.
        Can be None if :code:`save=False`.
    cutoff : int, default=1000
        Max number of matches (rows) to featurize per TERM.
        Forced to 1 when :code:`dummy_terms='replace'`.
    save : bool, default=True
        Whether or not to save the training tensors
    coords_only : bool, default=False
        Whether to use only backbone-derived features. TERM-derived entries
        (except :code:`term_lens`) are replaced by dummy arrays in the output.
    dummy_terms : str or None
        Method by which to incorporate dummy TERMs. Options include :code:`'replace'`, which means
        replacing TERM features with those derived from a dummy TERM,
        or :code:`'include'`, which includes the dummy TERM into the mined TERM matches.

    Returns
    -------
    dict
        Dictionary of features for TERMinator

    Raises
    ------
    AssertionError
        If :code:`dummy_terms` is not one of the accepted options, if the per-TERM
        lengths disagree with the feature tensor, if the number of residues with
        coordinates differs from the sequence length, or if :code:`save=True`
        without an :code:`out_path`.
    """
    if dummy_terms is not None:
        assert dummy_terms in ['replace', 'include'], f"dummy_terms={dummy_terms} is an invalid argument"
    if dummy_terms == 'replace':
        # each TERM is reduced to its single dummy match
        cutoff = 1

    coords, _ = parseCoords(in_path + '.red.pdb', save=False)
    data = parseTERMdata(in_path + '.dat')

    selection = data['selection']

    # embed target ppoe: dihedrals become (sin, cos) pairs, the rest is passed through
    struct_ppoe = data['ppoe']
    struct_ppo = struct_ppoe[:, :3]
    struct_ppo_rads = np.radians(struct_ppo)
    struct_env = struct_ppoe[:, 3:]
    # 999 marks a missing dihedral; zero out its embedding
    struct_is_999 = (struct_ppo == 999)
    struct_sin_ppo = np.sin(struct_ppo_rads)
    struct_sin_ppo[struct_is_999] = 0
    struct_cos_ppo = np.cos(struct_ppo_rads)
    struct_cos_ppo[struct_is_999] = 0
    struct_embedded_ppoe = np.concatenate([struct_sin_ppo, struct_cos_ppo, struct_env], axis=1)

    term_msas = []
    term_features = []
    term_focuses = []
    term_contact_idxs = []
    term_lens = []

    # compute TERM features
    for term_data in data['terms']:
        focus = term_data['focus']

        # only keep data for residues that are in the selection; the membership
        # test is done once and reused for every per-residue quantity below
        in_selection = [residue in selection for residue in focus]
        # integer index array (stays integer-typed even when nothing is selected)
        take = np.flatnonzero(np.asarray(in_selection, dtype=bool))

        # MSA of the matches, restricted to selected residues
        msa = np.take(term_data['labels'], take, axis=-1)
        if dummy_terms is None:
            # cutoff MSAs at top N
            term_msas.append(msa[:cutoff])
        elif dummy_terms == 'replace':
            # replace the whole TERM with one sequence of only X (index 20)
            term_msas.append(np.ones_like(msa[:1]).astype(int) * 20)
        elif dummy_terms == "include":
            # the all-X dummy sequence goes first, followed by the top N - 1 matches
            dummy_seq = np.ones_like(msa[:1]).astype(int) * 20
            term_msas.append(np.concatenate([dummy_seq, msa[:cutoff - 1]]))

        # add focus
        focus_take = [residue for residue, keep in zip(focus, in_selection) if keep]
        term_focuses += focus_take

        # add contact idx
        contact_idx_take = [idx for idx, keep in zip(term_data['contact_idx'], in_selection) if keep]
        term_contact_idxs += contact_idx_take

        # append term len, the len of the (selected) focus
        term_lens.append(len(focus_take))

        # process ppoe; the dummy match is the target structure's own ppoe at the focus residues
        if dummy_terms is None:
            ppoe = term_data['ppoe']
        else:
            dummy_ppoe = np.expand_dims(struct_ppoe[focus].transpose(1, 0), 0)
            if dummy_terms == "replace":
                ppoe = dummy_ppoe
            else:
                ppoe = np.concatenate([dummy_ppoe, term_data['ppoe']], axis=0)

        # apply take BEFORE deriving any feature, so that the feature length of
        # this TERM agrees with the entry recorded in term_lens
        ppoe = np.take(ppoe, take, axis=-1)
        term_len = ppoe.shape[2]

        # project to sin, cos
        ppo_rads = ppoe[:, :3] / 180 * np.pi
        is_999 = (ppoe[:, :3] == 999)
        sin_ppo = np.sin(ppo_rads)
        cos_ppo = np.cos(ppo_rads)
        # zero out dihedrals where there is no dihedral angle
        sin_ppo[is_999] = 0
        cos_ppo[is_999] = 0
        env = ppoe[:, 3:]

        # one RMSD per match; the dummy match is the structure itself, so its
        # RMSD is "zero" (ZERO rather than 0, which is reserved for padding)
        if dummy_terms is None:
            rmsd = np.asarray(term_data['rmsds'])
        elif dummy_terms == "include":
            rmsd = np.concatenate([np.array([ZERO]), term_data['rmsds']], axis=0)
        else:
            rmsd = np.array([ZERO])
        # broadcast each match's RMSD across the TERM's residues
        rmsd_arr = np.repeat(np.expand_dims(rmsd, 1), term_len, axis=1)
        rmsd_arr = np.expand_dims(rmsd_arr, 1)

        # select features, cutoff at top N
        sin_ppo, cos_ppo, env, rmsd_arr = sin_ppo[:cutoff], cos_ppo[:cutoff], env[:cutoff], rmsd_arr[:cutoff]
        # size by the rows actually present, so TERMs with fewer than `cutoff` matches still stack
        term_len_arr = np.full((sin_ppo.shape[0], 1, term_len), float(term_len))
        features = np.concatenate([sin_ppo, cos_ppo, env, rmsd_arr, term_len_arr], axis=1)

        # pytorch does row vector computation
        # swap rows and columns
        term_features.append(features.transpose(0, 2, 1))

    msa_tensor = np.concatenate(term_msas, axis=-1)
    features_tensor = np.concatenate(term_features, axis=1)
    len_tensor = np.array(term_lens)
    term_focuses = np.array(term_focuses)
    term_contact_idxs = np.array(term_contact_idxs)

    # check that sum of term lens is as long as the feature tensor
    assert sum(len_tensor) == features_tensor.shape[1]

    # manipulate coords to right shape: chains are stacked in sorted chain-id order
    pdb = os.path.basename(in_path)
    if len(coords) == 1:
        chain = next(iter(coords.keys()))
        coords_tensor = coords[chain]
    else:
        chains = sorted(coords.keys())
        coords_tensor = np.vstack([coords[c] for c in chains])

    assert coords_tensor.shape[0] == len(data['sequence']), "num aa coords != seq length"

    output = {
        'pdb': pdb,
        'coords': coords_tensor,
        'ppoe': struct_embedded_ppoe,
        'features': features_tensor,
        'msas': msa_tensor,
        'focuses': term_focuses,
        'contact_idxs': term_contact_idxs,
        'term_lens': len_tensor,
        'sequence': np.array(data['sequence']),
        'seq_len': len(data['selection']),
        'chain_lens': data['chain_lens'],
    }

    if coords_only:
        # keep the same keys, but blank out everything derived from TERM matches
        dummy_arr_3d = np.zeros([1, 1, 1])
        dummy_arr_2d = np.zeros([1, 1])
        output.update({
            'ppoe': dummy_arr_3d,
            'features': dummy_arr_3d,
            'msas': dummy_arr_2d,
            'focuses': dummy_arr_2d,
            'contact_idxs': dummy_arr_2d,
        })

    if save:
        assert out_path, "out_path required if save=True"
        with open(out_path + '.features', 'wb') as fp:
            pickle.dump(output, fp)
        # .length holds the total TERM length and the sequence length, one per line
        with open(out_path + '.length', 'w') as fp:
            fp.write(str(len(term_focuses)) + '\n')
            fp.write(str(len(data['selection'])))

    print('Done with', pdb)

    return output


def dumpCoordsTensors(in_path, out_path=None, save=True):
    """Build a coordinates-only feature file, with dummy arrays standing in for every TERM-based entry.

    Args
    ----
    in_path : str
        Prefix to :code:`.red.pdb` file
    out_path : str or None
        Prefix to the output :code:`.features` and :code:`.length`.
        Can be None if :code:`save=False`.
    save : bool, default=True
        Whether or not to save the training tensors

    Returns
    -------
    dict
        Dictionary of features for TERMinator
    """
    coords, seq = parseCoords(in_path + '.red.pdb', save=False)

    # chains are always handled in sorted chain-id order
    chain_ids = sorted(coords.keys())
    if len(coords) == 1:
        # a single chain is used as-is, without stacking
        coords_tensor = coords[chain_ids[0]]
    else:
        per_chain = [coords[chain_id] for chain_id in chain_ids]
        coords_tensor = np.vstack(per_chain)
    chain_lens = [len(coords[chain_id]) for chain_id in chain_ids]

    pdb = os.path.basename(in_path)

    # placeholders that keep the TERM-based keys present in the output
    placeholder_3d = np.zeros([1, 1, 1])
    placeholder_2d = np.zeros([1, 1])
    placeholder_lens = np.ones([1]).astype(int)

    output = {
        'pdb': pdb,
        'coords': coords_tensor,
        'ppoe': placeholder_3d,
        'features': placeholder_3d,
        'msas': placeholder_2d,
        'focuses': placeholder_2d,
        'contact_idxs': placeholder_2d,
        'term_lens': placeholder_lens,
        'sequence': np.array(seq_to_ints(seq)),
        'seq_len': len(seq),
        'chain_lens': chain_lens,
    }

    if save:
        assert out_path, "out_path required if save=True"
        features_file = out_path + '.features'
        length_file = out_path + '.length'
        with open(features_file, 'wb') as fp:
            pickle.dump(output, fp)
        # first line is the (dummy) total TERM length of 1, second is the sequence length
        with open(length_file, 'w') as fp:
            fp.write(str(1) + '\n')
            fp.write(str(len(seq)))

    print('Done with', pdb)

    return output