# SPA_glycosylation_model.py

import SPA
from pandas import read_csv
import numpy as np
import os

def Bydlinski_setup(exp_name):
    """
    Loads the training / testing data of one Bydlinski experiment and ...
    creates + enters the folder where that experiment's results will be stored

    Parameters
    ----------
    exp_name : str
        Prefix of the y-data files ('{exp_name}_training-y.csv' and '{exp_name}_test-y.csv') and of the results folder.

    Returns
    -------
    glyco_labels : list
        Column names of the training y file.
    X_train, y_train, X_test, y_test : numpy arrays
        Contents of the four .csv files (the first column of each file is used as the index and is not part of the arrays).

    Raises
    ------
    FileExistsError
        If the folder '{exp_name}_results' already exists in the current working directory.
    """
    # All paths are relative to the current working directory; the X data is shared by every experiment
    train_features = read_csv('../datasets/Training-X.csv', index_col = 0)
    train_targets = read_csv(f'../datasets/{exp_name}_training-y.csv', index_col = 0)
    test_features = read_csv('../datasets/Test-X.csv', index_col = 0)
    test_targets = read_csv(f'../datasets/{exp_name}_test-y.csv', index_col = 0)

    # The results folder becomes the working directory, so everything written later lands inside it
    results_dir = f'{exp_name}_results'
    os.mkdir(results_dir)
    os.chdir(results_dir)

    return (train_targets.columns.tolist(), train_features.values, train_targets.values,
            test_features.values, test_targets.values)

def Kotidis_setup():
    """
    Loads the training / testing data stored in the NN_modelNSD files and ...
    creates + enters the folder where the results will be stored

    Returns
    -------
    glyco_labels : list
        Column names of the training y file.
    X_train, y_train, X_test, y_test : numpy arrays
        Contents of the four .csv files (the first column of each file is used as the index and is not part of the arrays).

    Raises
    ------
    FileExistsError
        If the folder 'NN_modelNSD_results' already exists in the current working directory.
    """
    # All paths are relative to the current working directory
    train_features = read_csv('../datasets/NN_modelNSD_training-X.csv', index_col = 0)
    train_targets = read_csv('../datasets/NN_modelNSD_training-y.csv', index_col = 0)
    test_features = read_csv('../datasets/NN_modelNSD_test-X.csv', index_col = 0)
    test_targets = read_csv('../datasets/NN_modelNSD_test-y.csv', index_col = 0)

    # The results folder becomes the working directory, so everything written later lands inside it
    results_dir = 'NN_modelNSD_results'
    os.mkdir(results_dir)
    os.chdir(results_dir)

    return (train_targets.columns.tolist(), train_features.values, train_targets.values,
            test_features.values, test_targets.values)

def run_SPA(exp_name, glyco_labels, X_train, y_train, X_test, y_test, nested = False):
    """
    Manipulates the data according to SPA's requirements, then ...
    runs SPA once per glycan, renaming each run's output files so the next run does not clash with them

    All files are read from / written to the current working directory.

    Parameters
    ----------
    exp_name : str
        Name of the experiment. Used as the prefix of the renamed result files.
        'NN_modelNSD' selects '../../group_names_NN.txt' as the group file; any other name selects '../../group_names.txt'.
    glyco_labels : list of str
        One label per column of y_train / y_test. An empty list is a no-op.
    X_train, X_test : 2D numpy arrays
        Predictor data.
    y_train, y_test : 2D numpy arrays
        Response data; column i is paired with glyco_labels[i].
    nested : bool, optional, default = False
        Forwarded to SPA.main_SPA() as nested_cv.
    """
    train_file = 'Current_training.csv'
    test_file = 'Current_testing.csv'
    # The group file depends only on the experiment, so it is selected once
    if exp_name == 'NN_modelNSD':
        group_file = '../../group_names_NN.txt'
    else:
        group_file = '../../group_names.txt'

    try:
        # SPA can test only one y variable at a time, so we will split the data and call SPA multiple times
        for exp_idx, current_glyco in enumerate(glyco_labels):
            print(f'Beginning glyco run {exp_idx+1} out of {len(glyco_labels)} | {exp_name}-{current_glyco}')
            # Save the data as X data + one y variable (as the last column) for SPA
            np.savetxt(train_file, np.column_stack((X_train, y_train[:, exp_idx])), delimiter = ',')
            np.savetxt(test_file, np.column_stack((X_test, y_test[:, exp_idx])), delimiter = ',')

            # Run SPA
            _ = SPA.main_SPA(train_file, test_data = test_file, model_name = ['EN', 'RR', 'PLS'],
                            cv_method = 'groupkfold', group_name = group_file, K_fold = 4, nested_cv = nested)

            # os.listdir() returns a snapshot, so renaming does not disturb a live directory iterator
            for file_name in os.listdir():
                if file_name.startswith('SPA_results'):
                    _, ext = os.path.splitext(file_name)
                    os.rename(file_name, f'{exp_name}_{current_glyco}_results{ext}')
    finally:
        # Removing temp files, even if SPA failed midway; they do not exist when glyco_labels is empty
        for temp_file in (train_file, test_file):
            try:
                os.remove(temp_file)
            except FileNotFoundError:
                pass

def get_mean_and_std():
    """
    Finds all the .json files in the current working directory, then ...
    collects the means and stdevs for each training and testing set, putting these ...
    values in 'train_mean_std.csv' and 'test_mean_std.csv'

    The .json files are scanned line by line (in alphabetical order of file name). A line containing
    'train_nontrans_mean', 'train_nontrans_std', 'test_nontrans_mean', or 'test_nontrans_std' contributes
    the number written after its first ':' (a trailing comma is ignored, so the order of the keys does not matter).

    Each output file has a commented header with the glycan names, a row with the means, and a row with the stdevs.
    The glycan name is read from the file name, which is expected to look like '<site>_<id>_<glycan>_results.json'.

    Nothing is written if the folder has no .json files.

    Raises
    ------
    IndexError
        If a .json file name has fewer than three '_'-separated fields.
    ValueError
        If the text after ':' is not a number, or if the number of means and stdevs collected differ.
    """
    train_mean = []
    train_std = []
    test_mean = []
    test_std = []
    # (text searched in each line, list that receives the value) - checked in this order, first match wins
    collectors = (('train_nontrans_mean', train_mean),
                  ('train_nontrans_std', train_std),
                  ('test_nontrans_mean', test_mean),
                  ('test_nontrans_std', test_std))

    # Collecting the means and stdevs
    glyco_names = []
    for file_name in sorted(os.listdir()): # Sorted so that the column order is reproducible
        if not file_name.endswith('.json'):
            continue
        # Drop the extension and the '_results' suffix; what follows the 2nd underscore is the glycan name
        stem = file_name[: -len('.json')]
        if stem.endswith('_results'):
            stem = stem[: -len('_results')]
        glyco_names.append(stem.split('_', 2)[2])
        with open(file_name) as f:
            for line in f:
                for key, values in collectors:
                    if key in line:
                        _, _, number = line.partition(':')
                        values.append(float(number.strip().rstrip(',')))
                        break

    if not glyco_names:
        print('No .json files were found. The mean and std files were not created.')
        return

    # Saving to .csv files
    header = ','.join(glyco_names)
    np.savetxt('train_mean_std.csv', np.vstack((train_mean, train_std)), delimiter = ',', fmt = '%.3g', header = header)
    np.savetxt('test_mean_std.csv', np.vstack((test_mean, test_std)), delimiter = ',', fmt = '%.3g', header = header)

if __name__ == '__main__':
    # Input setup (to allow passing --nested flag)
    import argparse
    parser = argparse.ArgumentParser(description = 'Runs SPA on the Bydlinski and Kotidis datasets to predict the N-glycosylation glycan distribution')
    parser.add_argument('--nested', metavar='True | [False]', type=bool, nargs='?', default = False, const = True, choices = {True, False},
                        help = 'Set this flag to run nested cross validation (instead of regular cross validation)')
    nested = parser.parse_args().nested

    # Folder setup (for organization)
    folder_name = f'SPA_results{"_nested"*(nested)}'
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)
    os.chdir(folder_name)
    # Bydlinski data
    for exp_name in ('Asn_24', 'Asn_38', 'Asn_83', 'Asn_110', 'Asn_168', 'Asn_538', 'Asn_745', 'Fc_DAO', 'Fc_EPO'):
        try:
            glyco_labels, X_train, y_train, X_test, y_test = Bydlinski_setup(exp_name)
            run_SPA(exp_name, glyco_labels, X_train, y_train, X_test, y_test, nested)
            get_mean_and_std()
            os.chdir('..')
        except FileExistsError: # Error gets raised during Bydlinski_setup()
            print(f'The folder {exp_name}_results already exists. Proceeding to the next site...')
    # Kotidis data
    try:
        glyco_labels, X_train, y_train, X_test, y_test = Kotidis_setup()
        run_SPA('NN_modelNSD', glyco_labels, X_train, y_train, X_test, y_test, nested)
        get_mean_and_std()
        os.chdir('..')
    except FileExistsError: # Error gets raised during Kotidis_setup()
        print(f'The folder NN_modelNSD_results already exists.')