Source code for scripts.data.preprocessing.split_datasets

"""Split dataset into multiple training folds"""
import argparse
import glob
import os
import random

INPUT_DATA = '/pool001/users/vsundar/TERMinator/'
MIN_PROT_LEN = 30


[docs]def main(args): dataset_files = os.path.join(INPUT_DATA, args.dataset) num_folds = args.folds pdb_ids = [] for filename in glob.glob(os.path.join(dataset_files, '*', '*.features')): prefix = os.path.splitext(filename)[0] with open(f'{prefix}.length') as fp: fp.readline() seq_len = int(fp.readline().strip()) if seq_len < MIN_PROT_LEN: continue pdb_ids += [prefix[-4:]] random.shuffle(pdb_ids) out_folder = os.path.join(dataset_files, args.outfolder) if not os.path.isdir(out_folder): os.makedirs(out_folder) len_fold = int(len(pdb_ids) / num_folds) folds = [] for i in range(num_folds): if i == num_folds - 1: endval = len(pdb_ids) else: endval = (i + 1) * len_fold folds += [pdb_ids[int(i * len_fold):endval]] with open(os.path.join(out_folder, f'fold_{i}.in'), 'w') as f: for pdb_id in folds[i]: f.write(pdb_id + '\n') with open(os.path.join(out_folder, 'holdout_fold.in'), 'w') as f: for pdb_id in folds[-1]: f.write(pdb_id + '\n') for i in range(num_folds - 1): test_fold = i val_fold = i - 1 if val_fold == -1: val_fold += num_folds - 1 training_folds = list(set(range(num_folds - 1)) - set([test_fold, val_fold])) print(training_folds, val_fold, test_fold) with open(os.path.join(out_folder, f'train_fold{i}.in'), 'w') as f: for fold in training_folds: for pdb_id in folds[fold]: f.write(pdb_id + '\n') with open(os.path.join(out_folder, f'val_fold{i}.in'), 'w') as f: for pdb_id in folds[val_fold]: f.write(pdb_id + '\n') with open(os.path.join(out_folder, f'test_fold{i}.in'), 'w') as f: for pdb_id in folds[test_fold]: f.write(pdb_id + '\n')
if __name__ == '__main__': parser = argparse.ArgumentParser('Split Data for TERMinator') parser.add_argument('--dataset', help='input folder .features files in proper directory structure. prefix is $ifsdata/', default='features_singlechain') parser.add_argument('--folds', help='number of folds', default='11', type=int) parser.add_argument('--outfolder', help='folder to store fold splits in', default='fold_splits') args = parser.parse_args() main(args)