Source code for scripts.data.preprocessing.packageTensors

"""Functions that generate features from dTERMen files :code:`.dat` and :code:`.red.pdb`"""
import os
import pickle
import sys

import numpy as np

from terminator.utils.common import seq_to_ints

# for autosummary import purposes
sys.path.insert(0, os.path.dirname(__file__))
from parseCoords import parseCoords
from parseTERM import parseTERMdata

NUM_AA = 21  # including X
ZERO = 1e-10  # 0 is used for padding


def dumpTrainingTensors(in_path, out_path=None, cutoff=1000, save=True, coords_only=False, dummy_terms=None):
    """Generate features from dTERMen :code:`.dat` and :code:`.red.pdb`, and
    dump the output into a file if requested.

    Args
    ----
    in_path : str
        Prefix to :code:`.dat` and :code:`.red.pdb` files
    out_path : str or None
        Prefix to the output :code:`.features` and :code:`.length`. Can be None if :code:`save=False`.
    cutoff : int, default=1000
        Max number of matches kept per TERM. Forced to 1 when :code:`dummy_terms='replace'`.
    save : bool, default=True
        Whether or not to save the training tensors
    coords_only : bool, default=False
        Whether to use only backbone-derived features. When set, all TERM-derived
        entries except :code:`term_lens` are replaced by zero-filled placeholder arrays.
    dummy_terms : str or None
        Method by which to incorporate dummy TERMs. Options include :code:`'replace'`,
        which means replacing TERM features with those derived from a dummy TERM, or
        :code:`'include'`, which includes the dummy TERM into the mined TERM matches.

    Returns
    -------
    dict
        Dictionary of features for TERMinator, with keys :code:`pdb`, :code:`coords`,
        :code:`ppoe`, :code:`features`, :code:`msas`, :code:`focuses`, :code:`contact_idxs`,
        :code:`term_lens`, :code:`sequence`, :code:`seq_len` and :code:`chain_lens`.

    Raises
    ------
    AssertionError
        If :code:`dummy_terms` is not one of the accepted values, if the per-TERM lengths
        do not add up to the feature tensor length, if the number of residues with
        coordinates differs from the sequence length, or if :code:`save=True` without
        an :code:`out_path`.

    Notes
    -----
    TERMs that have no focus residue inside the selection are skipped, since they
    would only contribute zero-length entries.
    """
    if dummy_terms is not None:
        assert dummy_terms in ['replace', 'include'], f"dummy_terms={dummy_terms} is an invalid argument"

    if dummy_terms == 'replace':
        # each TERM is represented by exactly one (dummy) match
        cutoff = 1

    coords, _ = parseCoords(in_path + '.red.pdb', save=False)
    data = parseTERMdata(in_path + '.dat')
    # etab, self_etab, _ = parseEtab(in_path + '.etab', save=False)

    selection = data['selection']
    # membership is tested once per focus residue of every TERM, so build the
    # lookup set a single time (assumes residue indices are hashable ints -- TODO confirm)
    selected = set(selection)

    # embed target ppoe: the first three columns are angles in degrees, the rest
    # are passed through untouched
    struct_ppoe = data['ppoe']
    struct_ppo = struct_ppoe[:, :3]
    struct_ppo_rads = np.radians(struct_ppo)
    struct_env = struct_ppoe[:, 3:]
    # zero out dihedral embeddings where there are no dihedrals (marked by 999)
    struct_is_999 = (struct_ppo == 999)
    struct_sin_ppo = np.sin(struct_ppo_rads)
    struct_sin_ppo[struct_is_999] = 0
    struct_cos_ppo = np.cos(struct_ppo_rads)
    struct_cos_ppo[struct_is_999] = 0
    struct_embedded_ppoe = np.concatenate([struct_sin_ppo, struct_cos_ppo, struct_env], axis=1)

    term_msas = []
    term_features = []
    term_focuses = []
    term_contact_idxs = []
    term_lens = []
    # compute TERM features
    for term_data in data['terms']:
        focus = term_data['focus']
        # only take data for residues that are in the selection
        take = [i for i, residue in enumerate(focus) if residue in selected]
        if not take:
            # nothing of this TERM lies inside the selection
            continue

        msa = np.take(term_data['labels'], take, axis=-1)
        if dummy_terms is None:
            # cutoff MSAs at top N
            term_msas.append(msa[:cutoff])
        elif dummy_terms == 'replace':
            # replace the whole TERM with one sequence of only X
            term_msas.append(np.ones_like(msa[:1]).astype(int) * 20)
        elif dummy_terms == "include":
            # the dummy sequence takes the first slot, leaving cutoff - 1 for real matches
            dummy_seq = np.ones_like(msa[:1]).astype(int) * 20
            term_msas.append(np.concatenate([dummy_seq, msa[:cutoff - 1]]))

        # add focus
        focus_take = [focus[i] for i in take]
        term_focuses += focus_take
        # add contact idx, filtered by the same selection test as the focus
        contact_idx = term_data['contact_idx']
        term_contact_idxs += [idx for idx, residue in zip(contact_idx, focus) if residue in selected]
        # append term len, the len of the (selected) focus
        term_lens.append(len(focus_take))

        # process ppoe, laid out as (match, feature, residue)
        if dummy_terms is None:
            ppoe = term_data['ppoe']
        else:
            # the dummy match is the target structure's own ppoe at the focus residues
            dummy_ppoe = np.expand_dims(struct_ppoe[focus].transpose(1, 0), 0)
            if dummy_terms == "replace":
                ppoe = dummy_ppoe
            else:
                ppoe = np.concatenate([dummy_ppoe, term_data['ppoe']], axis=0)

        # apply take BEFORE deriving any feature, so that the feature width
        # agrees with the MSA, focus and term_lens entries built above
        ppoe = np.take(ppoe, take, axis=-1)
        term_len = ppoe.shape[2]

        # project to sin, cos
        ppo_rads = ppoe[:, :3] / 180 * np.pi
        is_999 = (ppoe[:, :3] == 999)
        sin_ppo = np.sin(ppo_rads)
        cos_ppo = np.cos(ppo_rads)
        # zero out dihedrals where there is no dihedral angle
        sin_ppo[is_999] = 0
        cos_ppo[is_999] = 0
        env = ppoe[:, 3:]

        # one rmsd per match; ZERO (not 0, which is reserved for padding) marks
        # the dummy match
        if dummy_terms is None:
            rmsd = np.asarray(term_data['rmsds'])
        elif dummy_terms == "include":
            rmsd = np.concatenate([np.array([ZERO]), term_data['rmsds']], axis=0)
        else:
            # 'replace': the only match is the dummy one
            rmsd = np.array([ZERO])
        # broadcast every match's rmsd over all residues: (match, 1, residue)
        rmsd_arr = np.repeat(rmsd[:cutoff, None, None], term_len, axis=2)
        # constant channel holding the TERM length
        term_len_arr = np.full((cutoff, 1, term_len), float(term_len))

        # select features, cutoff at top N
        selected_features = [sin_ppo[:cutoff], cos_ppo[:cutoff], env[:cutoff], rmsd_arr, term_len_arr]

        features = np.concatenate(selected_features, axis=1)

        # pytorch does row vector computation
        # swap rows and columns
        features = features.transpose(0, 2, 1)
        term_features.append(features)

    msa_tensor = np.concatenate(term_msas, axis=-1)
    features_tensor = np.concatenate(term_features, axis=1)
    len_tensor = np.array(term_lens)
    term_focuses = np.array(term_focuses)
    term_contact_idxs = np.array(term_contact_idxs)

    # check that sum of term lens is as long as the feature tensor
    assert sum(len_tensor) == features_tensor.shape[1]

    pdb = os.path.basename(in_path)

    # manipulate coords to right shape: chains are stacked in sorted key order
    if len(coords) == 1:
        chain = next(iter(coords.keys()))
        coords_tensor = coords[chain]
    else:
        chains = sorted(coords.keys())
        coords_tensor = np.vstack([coords[c] for c in chains])
    assert coords_tensor.shape[0] == len(data['sequence']), "num aa coords != seq length"

    if coords_only:
        # TERM-derived entries become placeholders; the same placeholder object
        # is deliberately shared between keys
        dummy_arr_3d = np.zeros([1, 1, 1])
        dummy_arr_2d = np.zeros([1, 1])
        output = {
            'pdb': pdb,
            'coords': coords_tensor,
            'ppoe': dummy_arr_3d,
            'features': dummy_arr_3d,
            'msas': dummy_arr_2d,
            'focuses': dummy_arr_2d,
            'contact_idxs': dummy_arr_2d,
            'term_lens': len_tensor,
            'sequence': np.array(data['sequence']),
            'seq_len': len(data['selection']),
            'chain_lens': data['chain_lens']
        }
    else:
        output = {
            'pdb': pdb,
            'coords': coords_tensor,
            'ppoe': struct_embedded_ppoe,
            'features': features_tensor,
            'msas': msa_tensor,
            'focuses': term_focuses,
            'contact_idxs': term_contact_idxs,
            'term_lens': len_tensor,
            'sequence': np.array(data['sequence']),
            'seq_len': len(data['selection']),
            'chain_lens': data['chain_lens']
        }

    if save:
        assert out_path, "out_path required if save=True"

        with open(out_path + '.features', 'wb') as fp:
            pickle.dump(output, fp)
        # the .length file holds the total TERM residue count, then the sequence length
        with open(out_path + '.length', 'w') as fp:
            fp.write(str(len(term_focuses)) + '\n')
            fp.write(str(len(data['selection'])))

    print('Done with', pdb)
    return output


def dumpCoordsTensors(in_path, out_path=None, save=True):
    """Build a feature dictionary from coordinates alone, filling every
    TERM-based entry with a placeholder array.

    Args
    ----
    in_path : str
        Prefix to :code:`.red.pdb` file
    out_path : str or None
        Prefix to the output :code:`.features` and :code:`.length`. Can be None if :code:`save=False`.
    save : bool, default=True
        Whether or not to save the training tensors

    Returns
    -------
    dict
        Dictionary of features for TERMinator
    """
    backbone, residues = parseCoords(in_path + '.red.pdb', save=False)

    # chains are always handled in sorted key order
    ordered_chains = sorted(backbone.keys())
    if len(backbone) == 1:
        # a lone chain is used as-is, without stacking
        coords_tensor = backbone[ordered_chains[0]]
    else:
        coords_tensor = np.vstack([backbone[name] for name in ordered_chains])

    chain_lens = [len(backbone[name]) for name in ordered_chains]
    pdb = os.path.basename(in_path)
    num_residues = len(residues)

    # placeholders standing in for the TERM-derived tensors; one object per
    # rank is shared by all keys of that rank
    placeholder_3d = np.zeros([1, 1, 1])
    placeholder_2d = np.zeros([1, 1])
    placeholder_lens = np.ones([1]).astype(int)

    output = {
        'pdb': pdb,
        'coords': coords_tensor,
        'ppoe': placeholder_3d,
        'features': placeholder_3d,
        'msas': placeholder_2d,
        'focuses': placeholder_2d,
        'contact_idxs': placeholder_2d,
        'term_lens': placeholder_lens,
        'sequence': np.array(seq_to_ints(residues)),
        'seq_len': num_residues,
        'chain_lens': chain_lens
    }

    if save:
        assert out_path, "out_path required if save=True"
        with open(out_path + '.features', 'wb') as features_file:
            pickle.dump(output, features_file)
        # first line: placeholder TERM length (always 1); second: sequence length
        with open(out_path + '.length', 'w') as length_file:
            length_file.write(f"1\n{num_residues}")

    print('Done with', pdb)
    return output