From d1bbc3f31b012c9796acef5802c1fca52f36fa69 Mon Sep 17 00:00:00 2001 From: iquasere Date: Mon, 4 Nov 2024 11:26:31 +0000 Subject: [PATCH] Fix on providing empty "Name" values in "Experiments" - now properly --- workflow/mosca.py | 13 +++++++++++++ workflow/rules/common.smk | 16 ---------------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/workflow/mosca.py b/workflow/mosca.py index 6c8abe2..3c5fd25 100644 --- a/workflow/mosca.py +++ b/workflow/mosca.py @@ -49,6 +49,13 @@ def save_config(config_data, filename, output_format): def validate_exps(exps_data): + def set_name(files, data_type): + filename = files.split('/')[-1] + if data_type == 'protein': + return filename # which is the foldername (e.g. input/mp1 -> mp1) + if ',' in files: + return filename.split(',')[0].split('_R')[0] + return filename.split('.fa')[0] exps = pd.DataFrame(exps_data) reserved_words = [ 'if', 'else', 'repeat', 'while', 'function', 'for', 'in', 'next', 'break', 'TRUE', 'FALSE', 'NULL', 'Inf', @@ -62,6 +69,12 @@ def validate_exps(exps_data): if not bool(good_pattern.match(name)): sys.exit(f'INVALID "NAME" in "experiments": {name} starts with a number or has a special character.\n' f'Please use only letters, numbers, dots (.) and underscores (_).') + for i in range(len(exps)): + if pd.isnull(exps.iloc[i]['Name']) or exps.iloc[i]['Name'] == '': + exps.iloc[i, exps.columns.get_loc('Name')] = set_name( + exps.iloc[i]['Files'], exps.iloc[i]['Data type']) + # if not config['do_assembly']: + # EXPS.iloc[i]['Sample'] = EXPS.iloc[i]['Name'] if exps['Name'].duplicated().any(): sys.exit(f'ERROR: Multiple rows with same "Name" value: {",".join(exps["Name"].duplicated().any())}.') diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 26c7c39..85b72d5 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -10,22 +10,6 @@ validate(config, schema="../schemas/config.schema.yaml") OUTPUT = config["output"] EXPS = pd.DataFrame(config["experiments"]) - -def set_name(files, data_type): - filename = files.split('/')[-1] - if data_type == 'protein': - return filename # which is the foldername (e.g. input/mp1 -> mp1) - if ',' in files: - return filename.split(',')[0].split('_R')[0] - return filename.split('.fa')[0] - -for i in range(len(EXPS)): - if pd.isnull(EXPS.iloc[i]['Name']) or EXPS.iloc[i]['Name'] == '': - EXPS.iloc[i, EXPS.columns.get_loc('Name')] = set_name( - EXPS.iloc[i]['Files'], EXPS.iloc[i]['Data type']) - #if not config['do_assembly']: - # EXPS.iloc[i]['Sample'] = EXPS.iloc[i]['Name'] - pathlib.Path(f"{OUTPUT}").mkdir(parents=True, exist_ok=True) EXPS.to_csv(f"{OUTPUT}/exps.tsv", sep = '\t', index = False)