# Source code for scripts.data.preprocessing.cleanStructs

"""Convert .pdb files into protein backbone .red.pdb files.

Usage:
    .. code-block::

        python cleanStructs.py \\
            --in_list_path <pdb_paths_file> \\
            --out_folder <output_folder> \\
            [-n <num_processes>]

    :code:`<pdb_paths_file>` should be a file of paths to .pdb files, with one path per line

    :code:`<output_folder>` will be where the outputted .red.pdb files are dumped, and will
    be structured as :code:`<output_folder>/<pdb_id>/<pdb_id>.red.pdb`

See :code:`python cleanStructs.py --help` for more info.
"""
import argparse
import multiprocessing as mp
import os
import sys
import traceback

import numpy as np

# pylint: disable=unspecified-encoding


def extractBackbone(filename, outpath):
    """Extract the protein backbone atoms of a PDB file into a reduced PDB file.

    Lines starting with ``TER`` or ``END`` are always candidates for output.
    ``ATOM``/``HETATM`` lines are grouped per residue, keyed by
    ``(chain, residue index)`` as read from fixed character columns. A residue
    is kept when it has all of N, CA, C and O; when O is missing but OXT is
    present, the OXT line is kept in place of O. Residues lacking a complete
    backbone are dropped. Every other line is reported on stdout and skipped.

    Args
    ----
    filename : str
        Input .pdb file
    outpath : str
        Path of the output file. It is written exactly as given; nothing is
        appended to it.

    Raises
    ------
    IndexError
        If an ATOM/HETATM line is too short to contain the chain column
        (the offending line is printed first).
    ValueError
        If a residue relying on the OXT fallback does not have exactly three
        N/CA/C lines (e.g. duplicated atoms).
    """
    backbone_atoms = ('N', 'CA', 'C', 'O')
    record_types = ('ATOM', 'HETATM')

    with open(filename, 'r') as fp:
        entry_lines = fp.readlines()

    residues = {}
    valid_entry_lines = []
    for line_num, line in enumerate(entry_lines):
        data = line.strip()
        if data.startswith(('TER', 'END')):
            valid_entry_lines.append(line_num)
            continue

        if data[0:6].strip() not in record_types:
            print(f"Skipping line: {data}")
            continue

        element = data[13:16].strip()
        residx = data[22:27].strip()
        try:
            # Plain indexing is the only access that can fail on a short line.
            chain = data[21]
        except IndexError:
            print(data)
            raise

        residue = residues.get((chain, residx))
        if residue is None:
            residue = {"atoms": set(), "line_numbers": [], "oxt_num": None}
            residues[(chain, residx)] = residue

        if element in backbone_atoms:
            residue["atoms"].add(element)
            residue["line_numbers"].append(line_num)
        elif element == 'OXT':
            residue["oxt_num"] = line_num

    for residue in residues.values():
        atoms = residue["atoms"]
        if atoms.issuperset(backbone_atoms):
            # N, CA, C and O are all present: keep them and ignore any OXT.
            valid_entry_lines.extend(residue["line_numbers"])
        elif atoms.issuperset(('N', 'CA', 'C')) and residue["oxt_num"] is not None:
            # O is missing (otherwise the first branch would have matched),
            # so OXT stands in for it.
            if len(residue["line_numbers"]) != 3:
                raise ValueError(
                    "expected exactly 3 N/CA/C lines for a residue using OXT as O, "
                    f"got line numbers {residue['line_numbers']}"
                )
            valid_entry_lines.extend(residue["line_numbers"])
            valid_entry_lines.append(residue["oxt_num"])

    valid_entry_lines.sort()

    with open(outpath, 'w') as fp:
        # Starting as True drops a leading TER, i.e. when everything before
        # the first TER was filtered out.
        prev_is_ter = True
        for cur_line_num in valid_entry_lines:
            cur_line = entry_lines[cur_line_num]
            # Same prefix test as the parser above, so TER records carrying
            # serial/residue fields are recognised as well as a bare "TER".
            cur_is_ter = cur_line.strip().startswith('TER')
            redundant = cur_is_ter and prev_is_ter
            prev_is_ter = cur_is_ter
            if redundant:
                # prevent redundant TER if we filter out a whole section
                continue
            fp.write(cur_line)


def _raise_error(error):
    """Print an exception to stderr without re-raising it.

    Used as the ``error_callback`` for asynchronous pool tasks so a failed
    task is reported instead of crashing the parent process.

    Args
    ----
    error : BaseException
        The exception to report.
    """
    # Pass the real exception type and its own traceback (if any) so that no
    # available stack information is thrown away.
    traceback.print_exception(type(error), error, error.__traceback__)


# inner loop we want to parallelize
def dataGen(in_path, out_folder):
    """Wrapper for :code:`extractBackbone` for path manipulation and error catching.

    The output is written to :code:`<out_folder>/<name>/<name>.red.pdb`, where
    :code:`<name>` is the base name of :code:`in_path` without its extension.

    Args
    ----
    in_path : str
        input .pdb to :code:`extractBackbone`
    out_folder : str
        output directory to dump .red.pdb files into

    Raises
    ------
    FileNotFoundError
        If :code:`extractBackbone` returned without producing the output file.
    Exception
        Any error raised by :code:`extractBackbone` is re-raised after the
        output path has been printed to stderr.
    """
    base = os.path.basename(in_path)
    # Strip the real extension rather than blindly dropping the last four
    # characters, which mangles names that do not end in a 4-character suffix.
    stem, ext = os.path.splitext(base)
    name = stem if ext else base

    data_folder = os.path.join(out_folder, name)
    # exist_ok avoids a check-then-create race between worker processes.
    os.makedirs(data_folder, exist_ok=True)

    out_file = os.path.join(data_folder, f"{name}.red.pdb")
    print('out file', out_file)

    try:
        extractBackbone(in_path, out_file)
        if not os.path.exists(out_file):
            raise FileNotFoundError(f"expected output file was not created: {out_file}")
    except Exception:
        # Identify which output failed before propagating the error.
        print(out_file, file=sys.stderr)
        raise


# when subprocesses fail you usually don't get an error...
def generateCoordsDir(in_list, out_folder, num_cores=1):
    """Parallelize :code:`dataGen` over a list of files.

    Each input path is made absolute and submitted to a process pool; failures
    in individual tasks are reported through :code:`_raise_error` and do not
    stop the remaining tasks.

    Args
    ----
    in_list : list of paths
        List of input paths to :code:`dataGen`.
    out_folder : str
        Path to the output folder. It is created (including missing parent
        directories) if it does not exist.
    num_cores : int
        Number of worker processes in the pool. Defaults to 1.
    """
    print('num cores', num_cores)
    print(('warning! it seems that if subprocesses fail right now you don\'t get an error message. '
           'be wary of this if the number of files you\'re getting seems off'))

    # make folder where the dataset files are going to be placed
    os.makedirs(out_folder, exist_ok=True)

    # use absolute paths so workers do not depend on relative references
    out_folder = os.path.abspath(out_folder)

    pool = mp.Pool(num_cores, maxtasksperchild=10)
    try:
        for in_file in in_list:
            pool.apply_async(dataGen,
                             args=(os.path.abspath(in_file), out_folder),
                             error_callback=_raise_error)
        pool.close()
    except BaseException:
        # Submission failed part-way: stop the workers instead of leaking them.
        pool.terminate()
        raise
    finally:
        pool.join()

    print("Done")


if __name__ == '__main__':
    # Per the original author's note, forcing the "spawn" start method is
    # meant to fix stalling when worker processes crash.
    mp.set_start_method("spawn")

    parser = argparse.ArgumentParser('Extract backbone from a list of PDB files')
    parser.add_argument('--in_list_path',
                        help='file that contains paths to PDB files to clean, with one path per line.',
                        required=True)
    parser.add_argument('--out_folder',
                        help=('folder where cleaned .red.pdb files will be placed. '
                              'folder organization is <out_folder>/<pdb_id>/<pdb_id>.red.pdb'),
                        required=True)
    parser.add_argument('-n', dest='num_cores', help='number of cores to use', default=1, type=int)
    args = parser.parse_args()

    with open(args.in_list_path) as fp:
        # Drop blank lines (e.g. a trailing newline): an empty path would
        # resolve to the current directory and be submitted as a bogus task.
        in_list = [path for path in (raw.strip() for raw in fp) if path]

    generateCoordsDir(in_list, args.out_folder, num_cores=args.num_cores)