Source code for scripts.data.preprocessing.generateDataset

"""Generate feature files for TERMinator.

Usage:
    .. code-block::

        python generateDataset.py \\
            --in_folder <input_folder> \\
            --out_folder <output_folder> \\
            [--cutoff <matches_cutoff>] \\
            [-n <num_processes>] \\
            [-u] \\ # update existing files
            [--coords_only] \\
            [--dummy_terms [None, 'replace', 'include']]

    :code:`--in_folder <input_folder>` should be structured as :code:`<input_folder>/<pdb_id>/<pdb_id>.<ext>`.
    For full feature generation, :code:`ext` must include :code:`.dat` and :code:`.red.pdb`, while
    if running using :code:`--coords_only` only :code:`.red.pdb` is required.
    If you use :code:`scripts/data/preprocessing/cleanStructs.py`, this structure is automatically built.

    :code:`--out_folder <output_folder>` will be structured as :code:`<output_folder>/<pdb_id>/<pdb_id>.<ext>`,
    where :code:`<ext>` includes :code:`.features`, which specifies protein and TERM features, and
    :code:`.length`, which contains two integers. The first integer specifies the number of TERM residues
    in the protein, while the second integer specifies the sequence length of the protein.

    :code:`--cutoff <matches_cutoff>` restricts the number of matches featurized to the top :code:`<matches_cutoff>`,
    ranked by increasing RMSD. Defaults to 50.

    :code:`-n <num_processes>` specifies how many processes to use while processing. Defaults to 1.

    :code:`[-u]` is an optional flag which, if specified, forces rewriting of existing feature files.

    :code:`--coords_only` is an optional flag which, if specified, generates only backbone-derived features.
    Running this mode does not require prior TERM mining, but does require you clean the backbone using
    :code:`scripts/data/preprocessing/cleanStructs.py`.

    :code:`--dummy_terms` allows specifying how dummy TERMs are incorporated into features. Dummy TERMs are
    constructs where there is one TERM match with a degenerate X sequence and structural features derived from
    the target structure. By default, it is set to :code:`None`, or no dummy TERMs. If set to :code:`'replace'`,
    only the dummy TERM is included. If set to :code:`'include'`, the first match is set to the dummy TERM match
    and the remaining TERMs are those parsed from the :code:`.dat` file.

See :code:`python generateDataset.py --help` for more info.
"""
import argparse
import functools
import glob
import multiprocessing as mp
import os
import sys
import traceback

# for autosummary import purposes
sys.path.insert(0, os.path.dirname(__file__))
from packageTensors import dumpCoordsTensors, dumpTrainingTensors


# when subprocesses fail you usually don't get an error...
def generateDatasetParallel(in_folder,
                            out_folder,
                            cutoff=50,
                            num_cores=1,
                            update=True,
                            coords_only=False,
                            dummy_terms=None):
    """Parallelize :code:`dataGen` over a list of files.

    Every :code:`<subfolder>/*.red.pdb` file found one level below
    :code:`in_folder` is submitted to a process pool, and each subfolder is
    mirrored under :code:`out_folder`. The call blocks until all submitted
    tasks have finished. Exceptions raised inside workers are reported through
    :code:`_raise_error` and are not re-raised here.

    Note that this function changes the process's working directory to
    :code:`in_folder` and does not restore it; the submitted file paths are
    relative to that directory.

    Args
    ----
    in_folder : str
        Path to input directory in proper structure
    out_folder : str
        Path to the output folder. Created (including missing parents) if it
        does not exist.
    cutoff : int
        Max number of matches to featurize per TERM (forwarded to :code:`dataGen`)
    num_cores : int
        Number of processes to parallelize with
    update : bool
        Whether or not to overwrite existing files. If :code:`False`, inputs
        whose :code:`.features` output already exists are skipped.
    coords_only : bool
        Whether to use only backbone-derived features
    dummy_terms : str or None
        Method by which to incorporate dummy TERMs. Options include :code:`'replace'`, which means
        replacing TERM features with those derived from a dummy TERM, or :code:`'include'`, which
        includes the dummy TERM into the mined TERM matches.
    """
    print('num cores', num_cores)
    print(('warning! it seems that if subprocesses fail right now you don\'t get an error message. '
           'be wary of this if the number of files you\'re getting seems off'))

    # make folder where the dataset files are gonna be placed;
    # makedirs(exist_ok=True) avoids the check-then-create race and also
    # handles an output path whose parent directories do not exist yet
    os.makedirs(out_folder, exist_ok=True)

    # generate absolute paths so relative references keep working after chdir
    in_folder = os.path.abspath(in_folder)
    out_folder = os.path.abspath(out_folder)

    # globbing below (and the relative paths handed to workers) are resolved
    # against in_folder
    os.chdir(in_folder)

    process_func = functools.partial(dataGen, cutoff=cutoff, coords_only=coords_only, dummy_terms=dummy_terms)

    # the context manager guarantees the pool is torn down even if submitting
    # tasks raises; on the normal path we close + join first so every queued
    # task finishes before the pool is terminated on exit
    with mp.Pool(num_cores, maxtasksperchild=10) as pool:
        # process folder by folder
        for folder in glob.glob("*"):
            # folders that aren't directories aren't folders!
            if not os.path.isdir(folder):
                continue

            os.makedirs(os.path.join(out_folder, folder), exist_ok=True)

            for file in glob.glob(folder + '/*.red.pdb'):
                name = file[:-len(".red.pdb")]
                # when not updating, leave already-generated feature files alone
                if not update and os.path.exists(os.path.join(out_folder, name) + '.features'):
                    continue

                pool.apply_async(process_func, args=(file, out_folder), error_callback=_raise_error)

        pool.close()
        pool.join()
def _raise_error(error):
    """Report a worker exception on stderr without crashing the caller.

    Despite its name, this function does not raise: it only prints the
    exception (with whatever traceback and chained causes it carries) and
    returns :code:`None`. It is used as the :code:`error_callback` of
    :code:`Pool.apply_async`.

    Args
    ----
    error : BaseException
        The exception instance to report.
    """
    # use the exception's real type and traceback rather than a hard-coded
    # `Exception` / `None`, so no available traceback information is dropped
    traceback.print_exception(type(error), error, error.__traceback__)


# inner loop we wanna parallelize
def dataGen(file, out_folder, cutoff, coords_only, dummy_terms):
    """Wrapper function for parallelization which deals with paths and other args.

    Strips the :code:`.red.pdb` suffix from :code:`file` to obtain the
    structure's path prefix, joins that prefix onto :code:`out_folder` to get
    the output path, and dispatches to :code:`dumpCoordsTensors` or
    :code:`dumpTrainingTensors`.

    Args
    ----
    file : str
        The .red.pdb file for the protein to featurize. Expected to end in
        :code:`.red.pdb` and to be a relative path, since it is joined onto
        :code:`out_folder`.
    out_folder : str
        Path to the output folder
    cutoff : int
        Max number of matches to featurize per TERM (ignored when :code:`coords_only` is set)
    coords_only : bool
        Whether to use only backbone-derived features
    dummy_terms : str or None
        Method by which to incorporate dummy TERMs. Options include :code:`'replace'`, which means
        replacing TERM features with those derived from a dummy TERM, or :code:`'include'`, which
        includes the dummy TERM into the mined TERM matches.

    Raises
    ------
    Exception
        Any exception raised by the featurization call is re-raised after the
        output path has been printed to stderr.
    """
    name = file[:-len(".red.pdb")]
    out_file = os.path.join(out_folder, name)
    print('out file', out_file)
    try:
        if coords_only:
            dumpCoordsTensors(name, out_path=out_file)
        else:
            dumpTrainingTensors(name, out_path=out_file, cutoff=cutoff, coords_only=coords_only, dummy_terms=dummy_terms)
    except Exception:
        # record which structure failed, then propagate the original exception
        # unchanged (bare `raise` keeps its traceback as-is)
        print(out_file, file=sys.stderr)
        raise
def _build_arg_parser():
    """Build the command-line argument parser for this script.

    Returns
    -------
    argparse.ArgumentParser
        Parser exposing :code:`--in_folder`, :code:`--out_folder`, :code:`--cutoff`,
        :code:`-n`, :code:`-u`, :code:`--coords_only` and :code:`--dummy_terms`.
    """
    # the text is a description, not a program name: pass it by keyword so it
    # does not end up as `prog` in the usage line
    parser = argparse.ArgumentParser(description='Generate features data files from dTERMen .dat files')
    parser.add_argument('--in_folder',
                        help='input folder containing .dat/.red.pdb files in proper directory structure',
                        required=True)
    parser.add_argument('--out_folder', help='folder where features will be placed', required=True)
    parser.add_argument('--cutoff', dest='cutoff', help='max number of match entries per TERM', default=50, type=int)
    parser.add_argument('-n', dest='num_cores', help='number of processes to use', default=1, type=int)
    parser.add_argument('-u',
                        dest='update',
                        help='if added, update existing files. else, files that already exist will not be overwritten',
                        default=False,
                        action='store_true')
    parser.add_argument('--coords_only',
                        dest='coords_only',
                        help='if added, only include coordinates-relevant data in the feature files',
                        default=False,
                        action='store_true')
    parser.add_argument('--dummy_terms', help='option for how to use dummy TERMs in the feature files', default=None)
    return parser


if __name__ == '__main__':
    # per the original author's note, the "spawn" start method is used to
    # avoid the run stalling when worker processes crash
    mp.set_start_method("spawn")
    args = _build_arg_parser().parse_args()
    generateDatasetParallel(args.in_folder,
                            args.out_folder,
                            cutoff=args.cutoff,
                            num_cores=args.num_cores,
                            update=args.update,
                            coords_only=args.coords_only,
                            dummy_terms=args.dummy_terms)