From 61a97e6ebc5b11bb1c2787b54dc1102df71f7f44 Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Tue, 16 Jan 2024 19:01:42 +0100
Subject: [PATCH 01/18] Fix __main__.py pylinting errors

---
 jasentool/__main__.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/jasentool/__main__.py b/jasentool/__main__.py
index d2221e7..6c64c3b 100644
--- a/jasentool/__main__.py
+++ b/jasentool/__main__.py
@@ -1,15 +1,17 @@
+"""__main__ file that handles help and cli execution"""
+
 import sys
-import os
 from jasentool import __author__, __copyright__, __version__
 from jasentool.cli import get_main_parser
 from jasentool.main import OptionsParser

 def print_help():
-    print('''
+    """Print help string for jasentool software"""
+    print(f'''

-    ...::: Jasentool v%s :::...
-Author(s): %s
+    ...::: Jasentool v{__version__} :::...
+Author(s): {__author__}

 Description:
   This software is a mongodb tool that fetches, inserts and
@@ -32,9 +34,10 @@ def print_help():
   fix        Fix output files from bjorn.
   converge   Converge tuberculosis mutation catalogues.
   qc         Extract QC values after alignment.
-''' % (__version__, __author__))
+''')

 def main():
+    """Main function that handles cli"""
     args = None
     if len(sys.argv) == 1:
         print_help()
@@ -57,14 +60,14 @@ def main():
     except KeyboardInterrupt:
         print('Controlled exit resulting from interrupt signal.')
         sys.exit(1)
-    except Exception as e:
+    except Exception as error_code:
         error_message = 'Uncontrolled exit resulting from an unexpected error.\n\n'
         error_message += '-' * 80 + '\n'
-        error_message += 'EXCEPTION: {}\n'.format(type(e).__name__)
-        error_message += 'MESSAGE: {}\n'.format(e)
+        error_message += f'EXCEPTION: {type(error_code).__name__}\n'
+        error_message += f'MESSAGE: {error_code}\n'
         error_message += '-' * 80 + '\n\n'
         print(error_message)
         sys.exit(1)

 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()

From db40159d26e51a097aa0e02fcdfdb6efb2214f9b Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:26:39 +0100
Subject: [PATCH 02/18] Fix __init__.py pylinting errors

---
 jasentool/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/jasentool/__init__.py b/jasentool/__init__.py
index 3853a2a..e44e71f 100644
--- a/jasentool/__init__.py
+++ b/jasentool/__init__.py
@@ -1,3 +1,5 @@
+"""Information regarding jasentool for setup.py"""
+
 __author__ = 'Ryan James Kennedy'
 __author_email__ = 'ryan.kennedy@skane.se'
 __copyright__ = 'Copyright 2023'
@@ -6,7 +8,7 @@
 __license__ = 'GPL3'
 __maintainer__ = 'Ryan James Kennedy'
 __maintainer_email__ = 'ryan.kennedy@skane.se'
-__name__ = 'jasentool'
+__software_name__ = 'jasentool'
 __python_requires__ = '>=3.11'
 __status__ = 'Production'
 __title__ = 'jasentool'
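A quick way to eyeball the f-string help refactored in patch 01 (a sketch; it assumes
the package is importable from the repo root):

    python -c "from jasentool.__main__ import print_help; print_help()"

The interpolated {__version__} and {__author__} values are the dunder constants
touched in patch 02.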
From e21bb2555dfe2ede8817c2536d2f2cc7539e5b0a Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:27:02 +0100
Subject: [PATCH 03/18] Fix cli.py pylinting errors

---
 jasentool/cli.py | 115 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 91 insertions(+), 24 deletions(-)

diff --git a/jasentool/cli.py b/jasentool/cli.py
index 0c53b53..2b114ea 100644
--- a/jasentool/cli.py
+++ b/jasentool/cli.py
@@ -1,117 +1,184 @@
-import os
+"""Command line interface module"""
+
 import argparse
 from contextlib import contextmanager

 @contextmanager
 def subparser(parser, name, desc):
-    yield parser.add_parser(name, conflict_handler='resolve', help=desc, formatter_class=argparse.RawDescriptionHelpFormatter)
+    """Yield subparser"""
+    yield parser.add_parser(name, conflict_handler='resolve', help=desc,
+                            formatter_class=argparse.RawDescriptionHelpFormatter)

 @contextmanager
 def mutex_group(parser, required):
-    group = parser.add_argument_group(f'mutually exclusive {"required" if required else "optional"} arguments')
+    """Yield mutually exclusive group"""
+    arg_type = "required" if required else "optional"
+    group = parser.add_argument_group(f'mutually exclusive {arg_type} arguments')
     yield group.add_mutually_exclusive_group(required=required)

 @contextmanager
 def arg_group(parser, name):
+    """Yield argument group"""
     yield parser.add_argument_group(name)

 def __query(group, required):
+    """Add query argument to group"""
     group.add_argument('-q', '--query', required=required, nargs='+', help='sample query')

 def __sample_id(group, required):
+    """Add sample_id argument to group"""
     group.add_argument('--sample_id', required=required, type=str, help='sample ID')

 def __input_dir(group, required, help):
+    """Add input_dir argument to group"""
     group.add_argument('--input_dir', required=required, help=help)

 def __input_file(group, required, help):
-    group.add_argument('-i', '--input_file', nargs='+', help=help)
+    """Add input_file argument to group"""
+    group.add_argument('-i', '--input_file', required=required, nargs='+', help=help)

 def __csv_file(group, required, help):
+    """Add csv_file argument to group"""
     group.add_argument('--csv_file', required=required, help=help)

 def __sh_file(group, required, help):
+    """Add sh_file argument to group"""
     group.add_argument('--sh_file', required=required, help=help)

 def __bam_file(group, required):
+    """Add bam_file argument to group"""
     group.add_argument('--bam_file', required=required, type=str, help='input bam file')

 def __bed_file(group, required):
+    """Add bed_file argument to group"""
     group.add_argument('--bed_file', required=required, type=str, help='input bed file')

 def __baits_file(group, required):
-    group.add_argument('--baits_file', required=required, type=str, default=None, help='input bam file')
+    """Add baits_file argument to group"""
+    group.add_argument('--baits_file', required=required, type=str, default=None,
+                       help='input baits file')

 def __reference(group, required, help):
+    """Add reference argument to group"""
     group.add_argument('--reference', required=required, type=str, help=help)

 def __output_file(group, required, help):
+    """Add output_file argument to group"""
     group.add_argument('-o', '--output_file', required=required, type=str, help=help)

 def __output_dir(group, required):
-    group.add_argument('--output_dir', required=required, type=str, help='directory to output files')
+    """Add output_dir argument to group"""
+    group.add_argument('--output_dir', required=required, type=str,
                        help='directory to output files')

 def __analysis_dir(group, required):
-    group.add_argument('--analysis_dir', required=required, type=str, help='analysis results dir containing jasen results')
+    """Add analysis_dir argument to group"""
+    group.add_argument('--analysis_dir', required=required, type=str,
+                       help='analysis results dir containing jasen results')

 def __restore_dir(group, required):
-    group.add_argument('--restore_dir', required=required, type=str, default='/fs2/seqdata/restored', help='directory user wishes spring files to be restored to')
+    """Add restore_dir argument to group"""
+    group.add_argument('--restore_dir', required=required, type=str,
+                       default='/fs2/seqdata/restored',
+                       help='directory user wishes spring files to be restored to')

 def __remote_dir(group, required):
-    group.add_argument('--remote_dir', required=required, type=str, default='/fs1/bjorn/jasen', help='directory user wishes spring files to be restored to')
+    """Add remote_dir argument to group"""
+    group.add_argument('--remote_dir', required=required, type=str,
+                       default='/fs1/bjorn/jasen',
+                       help='directory user wishes spring files to be restored to')

 def __restore_file(group, required):
-    group.add_argument('--restore_file', required=required, type=str, help='filepath bash shell script (.sh) to be output')
+    """Add restore_file argument to group"""
+    group.add_argument('--restore_file', required=required, type=str,
+                       help='filepath bash shell script (.sh) to be output')

 def __missing_log(group, required):
-    group.add_argument('--missing_log', required=required, type=str, default='missing_samples.log', help='file containing missing files')
+    """Add missing_log argument to group"""
+    group.add_argument('--missing_log', required=required, type=str,
+                       default='missing_samples.log',
+                       help='file containing missing files')

 def __assay(group, required):
-    group.add_argument('--assay', required=required, type=str, default='jasen-saureus-dev', help='assay for jasen to run')
+    """Add assay argument to group"""
+    group.add_argument('--assay', required=required, type=str,
+                       default='jasen-saureus-dev',
+                       help='assay for jasen to run')

 def __platform(group, required):
-    group.add_argument('--platform', required=required, type=str, default='illumina', help='sequencing platform for jasen to run')
+    """Add platform argument to group"""
+    group.add_argument('--platform', required=required, type=str,
+                       default='illumina',
+                       help='sequencing platform for jasen to run')

 def __uri(group):
-    group.add_argument('--address', '--uri', default='mongodb://localhost:27017/', help='Mongodb host address. Use: `sudo lsof -iTCP -sTCP:LISTEN | grep mongo` to get address')
+    """Add mongodb address argument to group"""
+    group.add_argument('--address', '--uri',
+                       default='mongodb://localhost:27017/',
+                       help='Mongodb host address. \
+                       Use: `sudo lsof -iTCP -sTCP:LISTEN | grep mongo` to get address')

 def __db_name(group, required):
-    group.add_argument('--db_name', required=required, help='Mongodb database name address. Use: `show dbs` to get db name')
+    """Add db_name argument to group"""
+    group.add_argument('--db_name', required=required,
+                       help='Mongodb database name. \
+                       Use: `show dbs` to get db name')

 def __db_collection(group, required):
-    group.add_argument('--db_collection', required=required, help='Mongodb collection name. Use: `show collections` to get db collection')
+    """Add db_collection argument to group"""
+    group.add_argument('--db_collection', required=required,
+                       help='Mongodb collection name. \
+                       Use: `show collections` to get db collection')

 def __out_format(group, required):
-    group.add_argument('-f', '--out_format', required=required, type=str, default="bed", help='output format')
+    """Add out_format argument to group"""
+    group.add_argument('-f', '--out_format', required=required, type=str,
+                       default="bed", help='output format')

 def __accession(group, required):
+    """Add accession argument to group"""
     group.add_argument('-a', '--accession', required=required, type=str, help='accession number')

 def __remote_hostname(group, required):
-    group.add_argument('--remote_hostname', required=required, type=str, default='rs-fs1.lunarc.lu.se', help='remote hostname')
+    """Add remote_hostname argument to group"""
+    group.add_argument('--remote_hostname', required=required, type=str,
+                       default='rs-fs1.lunarc.lu.se', help='remote hostname')

 def __prefix(group):
-    group.add_argument('--prefix', type=str, default='jasentool_results_', help='prefix for all output files')
+    """Add prefix argument to group"""
+    group.add_argument('--prefix', type=str, default='jasentool_results_',
+                       help='prefix for all output files')

 def __auto_start(group, required):
-    group.add_argument('--auto_start', required=required, dest='auto_start', action='store_true', default=False, help='automatically start')
+    """Add auto_start argument to group"""
+    group.add_argument('--auto_start', required=required, dest='auto_start', action='store_true',
+                       default=False, help='automatically start')

 def __remote(group, required):
-    group.add_argument('--remote', required=required, dest='remote', action='store_true', default=False, help='remote copy')
+    """Add remote argument to group"""
+    group.add_argument('--remote', required=required, dest='remote', action='store_true',
+                       default=False, help='remote copy')

 def __combined_output(group):
-    group.add_argument('--combined_output', dest='combined_output', action='store_true', help='combine all of the outputs into one output')
+    """Add combined_output argument to group"""
+    group.add_argument('--combined_output', dest='combined_output', action='store_true',
+                       help='combine all of the outputs into one output')

 def __sample_sheet(group, required):
-    group.add_argument('--sample_sheet', required=required, dest='sample_sheet', action='store_true', help='sample sheet input')
+    """Add sample_sheet argument to group"""
+    group.add_argument('--sample_sheet', required=required, dest='sample_sheet',
+                       action='store_true', help='sample sheet input')

 def __cpus(group):
+    """Add cpus argument to group"""
     group.add_argument('--cpus', dest='cpus', type=int, default=2, help='input cpus')

 def __help(group):
+    """Add help argument to group"""
     group.add_argument('-h', '--help', action='help', help='show help message')

 def get_main_parser():
+    """Get/build the main argument parser"""
     main_parser = argparse.ArgumentParser(prog='jasentool', conflict_handler='resolve')
     sub_parsers = main_parser.add_subparsers(help='--', dest='subparser_name')
     with subparser(sub_parsers, 'find', 'Find sample from given mongo db') as parser:
@@ -209,4 +276,4 @@ def get_main_parser():
             __cpus(group)
             __help(group)

-    return main_parser
\ No newline at end of file
+    return main_parser
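The rebuilt parser can be smoke-tested without a mongo instance; this sketch assumes
the `find` subcommand wires up the query/db flags shown above (all values are
placeholders):

    from jasentool.cli import get_main_parser

    parser = get_main_parser()
    args = parser.parse_args(['find', '-q', 'sample_1',
                              '--db_name', 'cgviz', '--db_collection', 'sample'])
    print(args.subparser_name, args.query)   # -> find ['sample_1']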
From fc412eb6fbf236cb68fb63b52005d34882854b7b Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:27:32 +0100
Subject: [PATCH 04/18] Fix converge.py pylinting errors

---
 jasentool/converge.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/jasentool/converge.py b/jasentool/converge.py
index 4c50728..6d7d3b8 100644
--- a/jasentool/converge.py
+++ b/jasentool/converge.py
@@ -1,3 +1,5 @@
+"""Module to converge mutation catalogues"""
+
 import os
 import pandas as pd
 from jasentool.who import WHO
@@ -5,7 +7,8 @@ from jasentool.tbprofiler import Tbprofiler
 from jasentool.utils import Utils

-class Converge(object):
+class Converge:
+    """Class that converges mutation catalogues"""
     def __init__(self, download_dir):
         self.download_dir = download_dir
         self.fohm_fpath = os.path.join(os.path.dirname(__file__), "data/dbs/fohm.csv")
@@ -45,6 +48,7 @@ def compare_columns(self, tbdb_df, who_df, column_names):
         return intersection_df, unique_tbdb_df, unique_who_df

     def run(self):
+        """Run the retrieval and convergence of mutation catalogues"""
         utils = Utils()
         # Download the genome
         mycobacterium_genome = Genome("NC_000962.3", "AL123456.3", self.download_dir, "h37rv")
@@ -55,14 +59,18 @@ def run(self):
         tbprofiler = Tbprofiler(self.tbdb_filepath)
         #h37rv_gb_filepath = mycobacterium_genome.download_genbank()
         who_df = who._parse(fasta_filepath, gff_filepath, self.download_dir)
-        tbdb_df = tbprofiler._parse(fasta_filepath, gff_filepath, self.download_dir)
-        #tbdb_df, who_df = pd.read_csv("/data/bnf/dev/ryan/pipelines/jasen/converge/tbdb.csv"), pd.read_csv("/data/bnf/dev/ryan/pipelines/jasen/converge/who.csv")
+        tbdb_df = tbprofiler._parse(self.download_dir)
+        #tbdb_df = pd.read_csv("/data/bnf/dev/ryan/pipelines/jasen/converge/tbdb.csv")
+        #who_df = pd.read_csv("/data/bnf/dev/ryan/pipelines/jasen/converge/who.csv")
         fohm_df = pd.read_csv(self.fohm_fpath)
-        intersection_df, unique_tbdb_df, unique_who_df = self.compare_columns(tbdb_df, who_df, ['Drug', 'Gene', 'Mutation'])
-        fohm_tbdb_df = pd.concat([intersection_df, unique_tbdb_df, fohm_df], ignore_index=True).drop_duplicates()
+        column_names = ['Drug', 'Gene', 'Mutation']
+        intersection_df, unique_tbdb_df, unique_who_df = self.compare_columns(tbdb_df, who_df, column_names)
+        dfs_to_concat = [intersection_df, unique_tbdb_df, fohm_df]
+        fohm_tbdb_df = pd.concat(dfs_to_concat, ignore_index=True).drop_duplicates()
         intersection_df.to_csv(self.intersection_outfpath, index=False)
         unique_tbdb_df.to_csv(self.unique_tbdb_outfpath, index=False)
         unique_who_df.to_csv(self.unique_who_outfpath, index=False)
         fohm_tbdb_df.to_csv(self.fohm_tbdb_outfpath, index=False)
-        converged_df = pd.concat([intersection_df, unique_tbdb_df, unique_who_df, fohm_df], ignore_index=True).drop_duplicates()
+        dfs_to_converge = [intersection_df, unique_tbdb_df, unique_who_df, fohm_df]
+        converged_df = pd.concat(dfs_to_converge, ignore_index=True).drop_duplicates()
         converged_df.to_csv(self.convereged_outfpath, index=False)
From 4adfcab789bdafbf1a834f8531e3ea83d35246ba Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:27:56 +0100
Subject: [PATCH 05/18] Fix convert.py pylinting errors

---
 jasentool/convert.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/jasentool/convert.py b/jasentool/convert.py
index 80974d8..99c3201 100644
--- a/jasentool/convert.py
+++ b/jasentool/convert.py
@@ -1,8 +1,12 @@
-class Convert(object):
+"""Module that converts file type"""
+
+class Convert:
+    """Convert class for converting files into desired format"""
     @staticmethod
     def targets2bed(target_file, accn):
+        """Convert cgmlst locus targets to bed file format"""
         bed_output = ""
-        with open(target_file, 'r') as fin:
+        with open(target_file, 'r', encoding="utf-8") as fin:
             for line in fin:
                 if line.startswith("Locus"):
                     continue
@@ -11,4 +15,4 @@ def targets2bed(target_file, accn):
             length = int(line_split[4])
             end = start + length
             bed_output += f"{accn}\t{start}\t{end}\n"
-        return bed_output
\ No newline at end of file
+        return bed_output

From 966679acc281ea450f932986d750aa6b2e9a720e Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:28:41 +0100
Subject: [PATCH 06/18] Fix database.py pylinting errors

---
 jasentool/database.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/jasentool/database.py b/jasentool/database.py
index ee2848c..3fa991e 100644
--- a/jasentool/database.py
+++ b/jasentool/database.py
@@ -1,11 +1,14 @@
+"""Module for handling mongodb requests"""
 import pymongo

-class Database(object):
+class Database:
+    """Class that assists in handling mongodb requests"""
     uri = "mongodb://localhost:27017/"
     db = None

     @staticmethod
     def initialize(db_name):
+        """Initialize mongodb client"""
         client = pymongo.MongoClient(Database.uri)
         Database.db = client[db_name] # Database Name
         Database.db_name = db_name # Database Name
@@ -13,30 +16,37 @@ def initialize(db_name):

     @staticmethod
     def insert(collection, data):
+        """Insert data into mongodb"""
         Database.db[collection].insert(data)

     @staticmethod
     def find(collection, query, fields):
+        """Find data in mongodb"""
         return Database.db[collection].find(query, fields)

     @staticmethod
     def find_one(collection, query):
+        """Find one entry in mongodb"""
         return Database.db[collection].find_one(query)
-    
+
     @staticmethod
     def get_pvl(collection, query):
+        """Get pvl result data from mongodb"""
         return Database.db[collection].find(query, {"_id": 0, "aribavir.lukS_PV.present": 1})
-    
+
     @staticmethod
     def get_mlst(collection, query):
+        """Get mlst result data from mongodb"""
         return Database.db[collection].find(query, {"_id": 0, "mlst": 1})
-    
+
     @staticmethod
     def get_cgmlst(collection, query):
+        """Get cgmlst result data from mongodb"""
         return Database.db[collection].find(query, {"_id": 0, "alleles": 1})
-    
+
     @staticmethod
     def get_meta_fields():
+        """Get respective metadata from mongodb"""
         fields = {
             "id": 1,
             "mlst.sequence_type": 1,
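Because every method on the class is static, callers use it without instantiation;
a minimal sketch (database and collection names are placeholders):

    from jasentool.database import Database

    Database.initialize('cgviz')    # binds Database.db to the named database
    hits = list(Database.find('sample', {'id': 'sample_1'}, {}))

One caveat worth flagging for a later pass: `Database.db[collection].insert(data)`
uses the long-deprecated pymongo `insert`, which modern pymongo replaces with
`insert_one`/`insert_many`.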
"" out_fpaths = [] - with open(input_file, 'r') as shfile: + with open(input_file, 'r', encoding="utf-8") as shfile: for line in shfile: line = line.rstrip() if line.startswith('/fs2/sw/bnf-scripts/start_nextflow_analysis.pl'): for assay in assays: output_txt = "" - line = f'/fs2/sw/bnf-scripts/start_nextflow_analysis.pl $SCRIPTPATH/{os.path.splitext(output_fpath)[0]}_{assay}.csv' + line = '/fs2/sw/bnf-scripts/start_nextflow_analysis.pl ' + \ + f'$SCRIPTPATH/{os.path.splitext(output_fpath)[0]}_{assay}.csv' out_fpath = f'{os.path.splitext(output_fpath)[0]}_{assay}.sh' output_txt += output_content+line+'\n' utils.write_out_txt(output_txt, out_fpath) From 62983603877764060cb4bb808f47bb03d5b1e369 Mon Sep 17 00:00:00 2001 From: ryanjameskennedy Date: Wed, 17 Jan 2024 14:29:19 +0100 Subject: [PATCH 08/18] Fix fohm.py pylinting errors --- jasentool/fohm.py | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/jasentool/fohm.py b/jasentool/fohm.py index c767772..d9e95e4 100644 --- a/jasentool/fohm.py +++ b/jasentool/fohm.py @@ -1,38 +1,43 @@ +"""Module that handles FoHM excel sheet""" + import os import pandas as pd +from openpyxl import load_workbook class Fohm: + """Class for processing FoHM TB mutation catalogue""" def __init__(self, download_dir): self.download_dir = download_dir self.fohm_filepath = os.path.join(download_dir, "fohm.csv") - def convert_colour(): - from openpyxl import load_workbook - excel_file = 'color_codes.xlsx' - wb = load_workbook(excel_file, data_only = True) - sh = wb['Sheet1'] - color_in_hex = sh['A2'].fill.start_color.index # this gives you Hexadecimal value of the color - print ('HEX =',color_in_hex) + def convert_colour(self, excel_filepath): + """Convert coloured cells to hex value""" + excel_catalogue = load_workbook(excel_filepath, data_only = True) + mutation_sheet = excel_catalogue['Sheet1'] + color_in_hex = mutation_sheet['A2'].fill.start_color.index + print ('HEX =', color_in_hex) print('RGB =', tuple(int(color_in_hex[i:i+2], 16) for i in (0, 2, 4))) # Color in RGB - + def read_file(self, csv_filepath, xlsx_filepath): + """Read excel and csv files""" catalogue = pd.read_csv(csv_filepath, header=True) catalogue = pd.read_excel(xlsx_filepath, sheet_name='Mutation_catalogue', header=[0,1]).set_index([('variant (common_name)', 'Unnamed: 2_level_1')]) return catalogue def convert2hgvs(self, mutation): - if mutation[:3].isalpha() and mutation[0].isupper(): - return f'p.{mutation}' - elif mutation[0].isalpha() and mutation[0].islower() and not mutation[1].isalpha(): - if 'Stop' in mutation: - mutation.replace('Stop', '*') - ref = mutation[0].upper() - alt = mutation[-1].upper() - pos = mutation[1:-1] - return f'c.{pos}{ref}>{alt}' - else: - return mutation - + """Convert mutation format to hgvs format""" + if mutation[:3].isalpha() and mutation[0].isupper(): + return f'p.{mutation}' + if mutation[0].isalpha() and mutation[0].islower() and not mutation[1].isalpha(): + if 'Stop' in mutation: + mutation.replace('Stop', '*') + ref = mutation[0].upper() + alt = mutation[-1].upper() + pos = mutation[1:-1] + return f'c.{pos}{ref}>{alt}' + return mutation + def _parse(self): + """Parse the mutation catalogue""" catalogue = pd.read_csv(self.fohm_filepath, header=True) catalogue['Mutation'] = catalogue.Mutation.apply(self.convert2hgvs) From 4a95a44dd1bb709ee67ff52d099b0e77686c0659 Mon Sep 17 00:00:00 2001 From: ryanjameskennedy Date: Wed, 17 Jan 2024 14:29:43 +0100 Subject: [PATCH 09/18] Fix genome.py pylinting 
From 4a95a44dd1bb709ee67ff52d099b0e77686c0659 Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:29:43 +0100
Subject: [PATCH 09/18] Fix genome.py pylinting errors

---
 jasentool/genome.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/jasentool/genome.py b/jasentool/genome.py
index 5298943..3910c24 100644
--- a/jasentool/genome.py
+++ b/jasentool/genome.py
@@ -1,8 +1,11 @@
+"""Module for genomes and files related to the genomes"""
+
 import os
 from Bio import Entrez, SeqIO
 from jasentool.utils import Utils

 class Genome:
+    """Class for handling genome download in multiple formats (fasta, genbank, gff) from NCBI"""
     def __init__(self, refseq_accn, genbank_accn, download_dir, prefix, email="rjkennedyy@gmail.com"):
         Entrez.email = email
         self.refseq_accn = refseq_accn
@@ -14,9 +17,11 @@ def __init__(self, refseq_accn, genbank_accn, download_dir, prefix, email="rjken
         self.gff_filepath = os.path.join(download_dir, f"{prefix}.gff")

     def download_fasta(self):
+        """Download genome in fasta format"""
         try:
             # Fetch the fasta record from NCBI
-            fasta_handle = Entrez.efetch(db="nucleotide", id=self.refseq_accn, rettype="fasta", retmode="text")
+            fasta_handle = Entrez.efetch(db="nucleotide", id=self.refseq_accn,
+                                         rettype="fasta", retmode="text")
             fasta_record = SeqIO.read(fasta_handle, "fasta")
             fasta_handle.close()

@@ -25,14 +30,16 @@ def download_fasta(self):

             print(f"Fasta downloaded and saved to {self.fasta_filepath}")

-        except Exception as e:
-            print(f"Error downloading the genome: {e}")
+        except Exception as error_code:
+            print(f"Error downloading the genome: {error_code}")
         return self.fasta_filepath

     def download_genbank(self):
+        """Download genome in genbank format"""
         try:
             # Fetch the GenBank record from NCBI
-            genbank_handle = Entrez.efetch(db="nucleotide", id=self.genbank_accn, rettype="gb", retmode="text")
+            genbank_handle = Entrez.efetch(db="nucleotide", id=self.genbank_accn,
+                                           rettype="gb", retmode="text")
             genbank_record = SeqIO.read(genbank_handle, "genbank")
             genbank_handle.close()

@@ -41,11 +48,12 @@ def download_genbank(self):

             print(f"Genbank file downloaded and saved to {self.genbank_filepath}")

-        except Exception as e:
-            print(f"Error downloading the genbank file: {e}")
+        except Exception as error_code:
+            print(f"Error downloading the genbank file: {error_code}")
         return self.genbank_filepath
-    
+
     def download_gff(self):
+        """Download gff of genome genes"""
         utils = Utils()
         h37rv_url = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_000195955.2/download?include_annotation_type=GENOME_GFF&filename=GCF_000195955.2.zip"
         try:
@@ -54,6 +62,6 @@ def download_gff(self):
             source = os.path.join(self.download_dir, "ncbi_dataset/data/GCF_000195955.2/genomic.gff")
             destination = os.path.join(self.download_dir, "h37rv.gff")
             utils.copy_file(source, destination)
-        except Exception as e:
-            print(f"Error downloading the gff file: {e}")
+        except Exception as error_code:
+            print(f"Error downloading the gff file: {error_code}")
         return self.gff_filepath
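Usage mirrors the call site in converge.py (patch 04):

    genome = Genome("NC_000962.3", "AL123456.3", "/tmp/downloads", "h37rv")
    fasta_filepath = genome.download_fasta()
    gff_filepath = genome.download_gff()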
"""Class that parses through cli arguments and executes respective modules""" def __init__(self, version): + """Initiate OptionsParser class""" self.version = version self._check_python() @@ -36,7 +40,7 @@ def _get_output_fpaths(self, input_files, output_dir, output_file, prefix, combi output_fpaths = [] if output_dir: output_dir = os.path.expanduser(output_dir) - if combined_output: + if combined_output: output_fpaths = [os.path.join(output_dir, prefix + "combined_outputs")] else: output_fpaths = [os.path.join(output_dir, prefix + os.path.basename(os.path.splitext(input_fpath)[0])) for input_fpath in input_files] @@ -48,33 +52,41 @@ def _get_output_fpaths(self, input_files, output_dir, output_file, prefix, combi return output_fpaths def find(self, options): + """Find entry in mongodb""" Database.initialize(options.db_name) - output_fpaths = self._get_output_fpaths(options.query, options.output_dir, options.output_file, options.prefix, options.combined_output) + output_fpaths = self._get_output_fpaths(options.query, options.output_dir, + options.output_file, options.prefix, + options.combined_output) for query_idx, query in enumerate(options.query): find = list(Database.find(options.db_collection, {"id": query}, {})) if not find: find = list(Database.find(options.db_collection, {"sample_id": query}, {})) - pp = pprint.PrettyPrinter(indent=4) - pp.pprint(find) - #with open(output_fpaths[query_idx], 'w+') as fout: - #json.dump(find, fout) + sample_pp = pprint.PrettyPrinter(indent=4) + sample_pp.pprint(find) + with open(output_fpaths[query_idx], 'w+', encoding="utf-8") as fout: + json.dump(find, fout) def insert(self, options): + """Insert entry in mongodb""" Database.initialize(options.db_name) input_files = self._input_to_process(options.input_file, options.input_dir) for input_file in input_files: - with open(input_file, 'r') as fin: + with open(input_file, 'r', encoding="utf-8") as fin: input_sample = json.load(fin) Database.insert(options.db_collection, input_sample) def validate(self, options): + """Execute validation of old vs new pipeline results""" Database.initialize(options.db_name) input_files = self._input_to_process(options.input_file, options.input_dir) - output_fpaths = self._get_output_fpaths(input_files, options.output_dir, options.output_file, options.prefix, options.combined_output) + output_fpaths = self._get_output_fpaths(input_files, options.output_dir, + options.output_file, options.prefix, + options.combined_output) validate = Validate() validate.run(input_files, output_fpaths, options.db_collection, options.combined_output) def missing(self, options): + """Execute search for missing samples from new pipeline results""" utils = Utils() missing = Missing() db = Database() @@ -98,6 +110,7 @@ def missing(self, options): utils.write_out_txt(bash_script, bash_fpath) def convert(self, options): + """Execute conversion of file formats""" utils = Utils() convert = Convert() input_file = options.input_file[0] @@ -108,6 +121,7 @@ def convert(self, options): utils.write_out_txt(output_txt, output_fpath) def fix(self, options): + """Execute fixing of file to desired format(s)""" utils = Utils() fix = Fix() csv_files, assays = fix.fix_csv(options.csv_file, options.output_file) @@ -118,15 +132,18 @@ def fix(self, options): utils.start_remote_pipelines(batch_files, options.remote_dir) def converge(self, options): + """Execute convergence of mutation catalogues""" converge = Converge(options.output_dir) converge.run() def qc(self, options): + """Execute retrieval of qc results""" qc 
= QC(options) json_result = qc.run() qc.write_json_result(json_result, options.output_file) def parse_options(self, options): + """Options parser""" if options.subparser_name == 'find': self.find(options) @@ -147,6 +164,6 @@ def parse_options(self, options): elif options.subparser_name == 'converge': self.converge(options) - + elif options.subparser_name == 'qc': self.qc(options) From cd22f6a81a371111035fe96b9be36ce8bef4e976 Mon Sep 17 00:00:00 2001 From: ryanjameskennedy Date: Wed, 17 Jan 2024 14:30:24 +0100 Subject: [PATCH 11/18] Fix missing.py pylinting errors --- jasentool/missing.py | 142 +++++++++++++++++++++++++++++++------------ 1 file changed, 104 insertions(+), 38 deletions(-) diff --git a/jasentool/missing.py b/jasentool/missing.py index a855299..eea135a 100644 --- a/jasentool/missing.py +++ b/jasentool/missing.py @@ -1,16 +1,18 @@ +"""Module to find samples that have not been run via jasen""" + import os import re -import csv -import pandas as pd -class Missing(object): +class Missing: + """Class for locating expected samples that are missing from a given directory""" @staticmethod def rm_double_dmltplx(read_files): + """Exclude files that have been demultiplexed twice""" first_reads = read_files[0] for read_file in read_files[1:]: errors = 0 - for i in range(len(first_reads)): - if first_reads[i] != read_file[i]: + for idx, _ in enumerate(first_reads): + if first_reads[idx] != read_file[idx]: errors += 1 if errors == 1: return [first_reads, read_file] @@ -18,24 +20,30 @@ def rm_double_dmltplx(read_files): @staticmethod def find_files(search_term, parent_dir): + """Find files in given directory using regex search term""" try: search_files = os.listdir(parent_dir) except FileNotFoundError: print(f"WARN: {parent_dir} does not exist! Trying to fix.") finally: search_files = os.listdir(parent_dir) - found_files = sorted([os.path.join(parent_dir, search_file) for search_file in search_files if re.search(search_term, search_file) and not search_file.endswith("~")]) + found_files = sorted([os.path.join(parent_dir, search_file) + for search_file in search_files + if re.search(search_term, search_file) and + not search_file.endswith("~") + ]) return found_files @staticmethod def edit_read_paths(reads, restore_dir): - restore_dirs = set([restore_dir.rstrip("/"), "/fs2/seqdata/restored"]) + """Edit read paths to show intended location to be coppied to""" filename = os.path.join(restore_dir, reads.split("BaseCalls/")[1]) read1, read2 = [filename.rstrip(".spring") + f"_R{i}_001.fastq.gz" for i in [1, 2]] return os.path.join(restore_dir, reads.split("BaseCalls/")[1]), [read1, read2] - + @staticmethod def check_file_cp(reads, restore_dir): + """Check that file not already coppied to restore directory""" checked_reads = [] restore_dirs = set([restore_dir.rstrip("/"), "/fs2/seqdata/restored"]) for filepath in reads: @@ -43,18 +51,26 @@ def check_file_cp(reads, restore_dir): if filepath.startswith("/fs") and os.path.exists(filepath): checked_reads.append(filepath) else: - for dir in restore_dirs: - read_fpath = os.path.join(dir, filename) - if os.path.exists(read_fpath) and not os.path.isdir(read_fpath) and len(checked_reads) != 2: + for directory in restore_dirs: + read_fpath = os.path.join(directory, filename) + if ( + os.path.exists(read_fpath) and + not os.path.isdir(read_fpath) and + len(checked_reads) != 2 + ): checked_reads.append(read_fpath) if len(checked_reads) == 0: - checked_reads = [os.path.join(restore_dir, os.path.basename(read_filepath)) for read_filepath in reads] + 
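Patch 10's `parse_options` is the single dispatch point, so the whole tool reduces
to the pattern below (a sketch; `argv` stands in for any of the subcommand
invocations defined in cli.py):

    from jasentool import __version__
    from jasentool.cli import get_main_parser
    from jasentool.main import OptionsParser

    options = get_main_parser().parse_args(argv)   # argv: e.g. ['find', ...]
    OptionsParser(__version__).parse_options(options)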
From cd22f6a81a371111035fe96b9be36ce8bef4e976 Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:30:24 +0100
Subject: [PATCH 11/18] Fix missing.py pylinting errors

---
 jasentool/missing.py | 142 +++++++++++++++++++++++++++++++------------
 1 file changed, 104 insertions(+), 38 deletions(-)

diff --git a/jasentool/missing.py b/jasentool/missing.py
index a855299..eea135a 100644
--- a/jasentool/missing.py
+++ b/jasentool/missing.py
@@ -1,16 +1,18 @@
+"""Module to find samples that have not been run via jasen"""
+
 import os
 import re
-import csv
-import pandas as pd

-class Missing(object):
+class Missing:
+    """Class for locating expected samples that are missing from a given directory"""
     @staticmethod
     def rm_double_dmltplx(read_files):
+        """Exclude files that have been demultiplexed twice"""
         first_reads = read_files[0]
         for read_file in read_files[1:]:
             errors = 0
-            for i in range(len(first_reads)):
-                if first_reads[i] != read_file[i]:
+            for idx, _ in enumerate(first_reads):
+                if first_reads[idx] != read_file[idx]:
                     errors += 1
             if errors == 1:
                 return [first_reads, read_file]
@@ -18,24 +20,30 @@ def rm_double_dmltplx(read_files):

     @staticmethod
     def find_files(search_term, parent_dir):
+        """Find files in given directory using regex search term"""
         try:
             search_files = os.listdir(parent_dir)
         except FileNotFoundError:
             print(f"WARN: {parent_dir} does not exist! Trying to fix.")
         finally:
             search_files = os.listdir(parent_dir)
-        found_files = sorted([os.path.join(parent_dir, search_file) for search_file in search_files if re.search(search_term, search_file) and not search_file.endswith("~")])
+        found_files = sorted([os.path.join(parent_dir, search_file)
+                              for search_file in search_files
+                              if re.search(search_term, search_file) and
+                              not search_file.endswith("~")
+                              ])
         return found_files

     @staticmethod
     def edit_read_paths(reads, restore_dir):
-        restore_dirs = set([restore_dir.rstrip("/"), "/fs2/seqdata/restored"])
+        """Edit read paths to show intended location to be copied to"""
         filename = os.path.join(restore_dir, reads.split("BaseCalls/")[1])
         read1, read2 = [filename.rstrip(".spring") + f"_R{i}_001.fastq.gz" for i in [1, 2]]
         return os.path.join(restore_dir, reads.split("BaseCalls/")[1]), [read1, read2]
-    
+
     @staticmethod
     def check_file_cp(reads, restore_dir):
+        """Check that file not already copied to restore directory"""
         checked_reads = []
         restore_dirs = set([restore_dir.rstrip("/"), "/fs2/seqdata/restored"])
         for filepath in reads:
@@ -43,18 +51,26 @@ def check_file_cp(reads, restore_dir):
             if filepath.startswith("/fs") and os.path.exists(filepath):
                 checked_reads.append(filepath)
             else:
-                for dir in restore_dirs:
-                    read_fpath = os.path.join(dir, filename)
-                    if os.path.exists(read_fpath) and not os.path.isdir(read_fpath) and len(checked_reads) != 2:
+                for directory in restore_dirs:
+                    read_fpath = os.path.join(directory, filename)
+                    if (
+                        os.path.exists(read_fpath) and
+                        not os.path.isdir(read_fpath) and
+                        len(checked_reads) != 2
+                    ):
                         checked_reads.append(read_fpath)
         if len(checked_reads) == 0:
-            checked_reads = [os.path.join(restore_dir, os.path.basename(read_filepath)) for read_filepath in reads]
+            checked_reads = [
+                os.path.join(restore_dir, os.path.basename(read_filepath))
+                for read_filepath in reads
+            ]
         return checked_reads

     @staticmethod
     def parse_sample_sheet(sample_sheet, restore_dir):
+        """Parse sample sheets for sample meta data"""
         csv_dict = {}
-        with open(sample_sheet, "r") as fin:
+        with open(sample_sheet, "r", encoding="utf-8") as fin:
             for line in fin:
                 if line.endswith("saureus\n"):
                     line = line.rstrip()
@@ -69,34 +85,78 @@ def parse_sample_sheet(sample_sheet, restore_dir):
                     except IndexError:
                         clarity_group_id = clarity_id
                     if ":" in line:
-                        parent_dir = os.path.join(line.split(":")[0].rstrip("SampleSheet.csv"), "Data/Intensities/BaseCalls/")
+                        parent_dir = os.path.join(
+                            line.split(":")[0].rstrip("SampleSheet.csv"),
+                            "Data/Intensities/BaseCalls/"
+                        )
                     else:
-                        parent_dir = os.path.join(os.path.dirname(sample_sheet), "Data/Intensities/BaseCalls/")
+                        parent_dir = os.path.join(
+                            os.path.dirname(sample_sheet),
+                            "Data/Intensities/BaseCalls/"
+                        )
                     try:
                         paired_reads = Missing.find_files(r'^' + clarity_id, parent_dir)
                         if len(paired_reads) == 2 and paired_reads[0].endswith(".gz"):
                             restored_reads_fpaths = Missing.check_file_cp(paired_reads, restore_dir)
-                            csv_dict[sample_id] = [clarity_group_id, species, restored_reads_fpaths, None, paired_reads]
+                            csv_dict[sample_id] = [
+                                clarity_group_id,
+                                species,
+                                restored_reads_fpaths,
+                                None,
+                                paired_reads
+                            ]
                         elif len(paired_reads) == 1 and paired_reads[0].endswith(".spring"):
                             spring_fpaths = paired_reads
-                            (restored_spring_fpaths, paired_reads) = list(map(Missing.edit_read_paths, spring_fpaths, [restore_dir]*len(spring_fpaths)))[0]
-                            csv_dict[sample_id] = [clarity_group_id, species, paired_reads, spring_fpaths, restored_spring_fpaths]
+                            (restored_spring_fpaths, paired_reads) = list(map(
+                                Missing.edit_read_paths,
+                                spring_fpaths,
+                                [restore_dir]*len(spring_fpaths)
+                            ))[0]
+                            csv_dict[sample_id] = [
+                                clarity_group_id,
+                                species,
+                                paired_reads,
+                                spring_fpaths,
+                                restored_spring_fpaths
+                            ]
                         elif len(paired_reads) == 4 and paired_reads[0].endswith(".gz"):
                             paired_reads = Missing.rm_double_dmltplx(paired_reads)
                             if len(paired_reads) == 2:
                                 restored_reads_fpaths = Missing.check_file_cp(paired_reads, restore_dir)
-                                csv_dict[sample_id] = [clarity_group_id, species, restored_reads_fpaths, None, paired_reads]
+                                csv_dict[sample_id] = [
+                                    clarity_group_id,
+                                    species,
+                                    restored_reads_fpaths,
+                                    None,
+                                    paired_reads
+                                ]
                             elif len(paired_reads) == 4:
                                 paired_reads_string = '\n-'.join(paired_reads)
-                                print(f"There are 4 sets of reads related to sample {sample_id} from the {parent_dir}: \n-{paired_reads_string}\n")
+                                print(f"There are 4 sets of reads related to sample {sample_id} from the {parent_dir}: "
+                                      f"\n-{paired_reads_string}\n")
                         elif len(paired_reads) == 3:
-                            paired_reads = [paired_read for paired_read in paired_reads if paired_read.endswith(".fastq.gz")]
+                            paired_reads = [paired_read for paired_read in paired_reads
+                                            if paired_read.endswith(".fastq.gz")]
                             restored_reads_fpaths = Missing.check_file_cp(paired_reads, restore_dir)
-                            csv_dict[sample_id] = [clarity_group_id, species, restored_reads_fpaths, None, paired_reads]
+                            csv_dict[sample_id] = [
+                                clarity_group_id,
+                                species,
+                                restored_reads_fpaths,
+                                None,
+                                paired_reads
+                            ]
                         elif len(paired_reads) == 6:
-                            paired_reads = [paired_read for paired_read in paired_reads if paired_read.endswith(".fastq.gz")]
+                            paired_reads = [paired_read for paired_read in paired_reads
+                                            if paired_read.endswith(".fastq.gz")]
                             restored_reads_fpaths = Missing.check_file_cp(paired_reads, restore_dir)
-                            csv_dict[sample_id] = [clarity_group_id, species, restored_reads_fpaths, None, paired_reads]
+                            csv_dict[sample_id] = [
+                                clarity_group_id,
+                                species,
+                                restored_reads_fpaths,
+                                None,
+                                paired_reads
+                            ]
                         #elif len(paired_reads) == 0:
                             #print(f"The sample {sample_id} doesn't have read/spring files in the {parent_dir} ({paired_reads}).")
                         #else:
@@ -109,7 +169,11 @@ def parse_sample_sheet(sample_sheet, restore_dir):

     @staticmethod
     def check_format(fpath):
-        if fpath.startswith("/fs1") and not os.path.exists(os.path.join(fpath, "Data/Intensities/BaseCalls")):
+        """Check that filepath has the correct prefix and that it exists"""
+        if (
+            fpath.startswith("/fs1") and
+            not os.path.exists(os.path.join(fpath, "Data/Intensities/BaseCalls"))
+        ):
             print(f"WARN: {fpath} does not exist! Fixing by removing '/fs1' prefix.")
             fpath = fpath.replace("/fs1", "")
         if fpath.startswith("NovaSeq"):
@@ -121,22 +185,23 @@ def check_format(fpath):
             data_fpath = "/data" + fpath
         if os.path.exists(os.path.join(fs2_fpath, "Data/Intensities/BaseCalls")):
             return fs2_fpath
-        elif os.path.exists(os.path.join(isilon_fpath, "Data/Intensities/BaseCalls")):
+        if os.path.exists(os.path.join(isilon_fpath, "Data/Intensities/BaseCalls")):
             return isilon_fpath
-        elif os.path.exists(os.path.join(data_fpath, "Data/Intensities/BaseCalls")):
+        if os.path.exists(os.path.join(data_fpath, "Data/Intensities/BaseCalls")):
             return data_fpath
-        elif os.path.exists(fpath):
-            return fpath.rstrip("Data/Intensities/BaseCalls/")#.replace("Data/Intensities/BaseCalls", "")
-        else:
-            print(f"WARN: Base calls for {fpath} cannot be found.")
+        if os.path.exists(fpath):
+            return fpath.rstrip("Data/Intensities/BaseCalls/")
+        print(f"WARN: Base calls for {fpath} cannot be found.")
         return fpath

     @staticmethod
     def parse_dir(dir_fpath):
+        """Return filenames in directory"""
         return [filename.split("_")[0] for filename in os.listdir(dir_fpath)]
-    
+
     @staticmethod
     def filter_csv_dict(csv_dict, missing_samples):
+        """Filter out missing samples"""
         filtered_csv_dict = {}
         not_found = []
         for missing_sample in missing_samples:
@@ -148,9 +213,10 @@ def filter_csv_dict(csv_dict, missing_samples):
         print(f"{len(not_found)} samples could not be found")
         print(f"{len(filtered_csv_dict.keys())} samples remain after filtering")
         return filtered_csv_dict, not_found
-    
+
     @staticmethod
     def find_missing(meta_dict, analysis_dir_fnames, restore_dir):
+        """Find missing samples from jasen results directory"""
         sample_run = ""
         missing_samples = []
         csv_dict = {}
@@ -158,18 +224,16 @@ def find_missing(meta_dict, analysis_dir_fnames, restore_dir):
         for sample in meta_dict:
             if sample["id"] not in analysis_dir_fnames:
                 missing_samples.append(sample["id"])
-                if sample_run != sample["run"]: #if sample run changes based on 
+                if sample_run != sample["run"]: #if sample run changes based on
                     ss_dict = {}
                     sample_run_dir = Missing.check_format(sample["run"])
                     sample_sheets = Missing.find_files(r'.csv$', sample_run_dir)
                     if sample_sheets:
                         for sample_sheet in sample_sheets:
                             ss_dict |= Missing.parse_sample_sheet(sample_sheet, restore_dir)
-                            if not sample_sheet:
-                                print(f"sample sheets yielded nothing from {sample['run']}")
                         csv_dict |= ss_dict
                     else:
-                        print(f"WARN: No sample sheets exist in the following path path {sample['run']}!")
+                        print(f"WARN: No sample sheets exist in the following path: {sample['run']}!")
                     sample_run = sample["run"]

         print(f"{len(csv_dict.keys())} samples found")
@@ -180,6 +244,7 @@ def find_missing(meta_dict, analysis_dir_fnames, restore_dir):

     @staticmethod
     def create_bash_script(csv_dict, restore_dir):
+        """Create shell script that executes copying of files and starts nextflow analysis"""
         spring_command = ""
         shell_script_path = 'SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"\n'
         shell_fail_count = "FAIL=0\n"
@@ -189,7 +254,7 @@ def create_bash_script(csv_dict, restore_dir):
             unspring_command = ""
             try:
                 spring_fpaths, restored_fpaths = csv_dict[sample][3][0], csv_dict[sample][4]
-                read1, read2 = csv_dict[sample][2]
+                read1, _ = csv_dict[sample][2]
                 if not os.path.exists(restored_fpaths) and not os.path.exists(read1):
                     jcp_command = f'/fs2/sw/bnf-scripts/jcp {spring_fpaths} {restore_dir}/ && '
                     unspring_command = f'/fs2/sw/bnf-scripts/unspring_file.pl {restored_fpaths} {restore_dir}/ WAIT &\nPIDS="$PIDS $!"\n'
@@ -203,6 +268,7 @@ def create_bash_script(csv_dict, restore_dir):

     @staticmethod
     def remove_empty_files(csv_dict):
+        """Remove fastq filepaths if the file size is < 10 mb"""
         empty_files_dict = {}
         for sample in csv_dict:
             try:
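The sample-sheet walker above leans on `find_files`; it can be exercised on its own
(the clarity id and run path here are hypothetical):

    from jasentool.missing import Missing

    reads = Missing.find_files(r'^ACC1234',
                               '/fs2/seqdata/200101_RUN1/Data/Intensities/BaseCalls/')
    # -> sorted matches for the id, with editor backup files ('~') excluded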
From be0a496776ae810fa13542c95f9fb3c187797a34 Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:30:56 +0100
Subject: [PATCH 12/18] Fix qc.py pylinting errors

---
 jasentool/qc.py | 50 ++++++++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 21 deletions(-)

diff --git a/jasentool/qc.py b/jasentool/qc.py
index 48868c3..3e357cb 100644
--- a/jasentool/qc.py
+++ b/jasentool/qc.py
@@ -1,8 +1,11 @@
+"""Module for retrieving qc results"""
+
 import os
 import json
 import subprocess

 class QC:
+    """Class for retrieving qc results"""
     def __init__(self, args):
         self.results = {}
         self.bam = args.bam
@@ -14,11 +17,13 @@ def __init__(self, args):
         self.paired = self.is_paired()

     def write_json_result(self, json_result, output_filepath):
-        with open(output_filepath, 'w') as json_file:
+        """Write out json file"""
+        with open(output_filepath, 'w', encoding="utf-8") as json_file:
             json_file.write(json_result)

-    def parse_basecov_bed(self, fn, thresholds):
-        with open(fn) as cov_fh:
+    def parse_basecov_bed(self, basecov_fpath, thresholds):
+        """Parse base coverage bed file"""
+        with open(basecov_fpath, "r", encoding="utf-8") as cov_fh:
             head_str = cov_fh.readline().strip().lstrip("#")
             head = head_str.split("\t")
             cov_field = head.index("COV")
@@ -29,12 +34,12 @@ def parse_basecov_bed(self, fn, thresholds):
             tot, cnt = 0, 0
             levels = {}
             for line in cov_fh:
-                a = line.strip().split("\t")
-                tot += int(a[2])
+                line = line.strip().split("\t")
+                tot += int(line[2])
                 cnt += 1
                 tot_bases += 1
                 for min_val in thresholds:
-                    if int(a[cov_field]) >= min_val:
+                    if int(line[cov_field]) >= min_val:
                         above_cnt[min_val] += 1

             above_pct = {min_val: 100 * (above_cnt[min_val] / tot_bases) for min_val in thresholds}
@@ -46,34 +51,37 @@ def parse_basecov_bed(self, fn, thresholds):
             q3_num = 3 * cnt / 4
             median_num = cnt / 2
             sum_val = 0
-            q1, q3, median = None, None, None
+            quartile1, quartile3, median = None, None, None
             iqr_median = "9999"
-            for l in sorted(levels):
-                sum_val += levels[l]
-                if sum_val >= q1_num and not q1:
-                    q1 = l
+            for level in sorted(levels):
+                sum_val += levels[level]
+                if sum_val >= q1_num and not quartile1:
+                    quartile1 = level
                 if sum_val >= median_num and not median:
-                    median = l
-                if sum_val >= q3_num and not q3:
-                    q3 = l
+                    median = level
+                if sum_val >= q3_num and not quartile3:
+                    quartile3 = level

-            if q1 and q3 and median:
-                iqr_median = (q3 - q1) / median
+            if quartile1 and quartile3 and median:
+                iqr_median = (quartile3 - quartile1) / median

             return above_pct, mean_cov, iqr_median

     def is_paired(self):
-        line = subprocess.check_output(f"samtools view {self.bam} | head -n 1| awk '{{print $2}}'", shell=True, text=True)
+        """Check if reads are paired"""
+        line = subprocess.check_output(f"samtools view {self.bam} | head -n 1| awk '{{print $2}}'",
+                                       shell=True, text=True)
         remainder = int(line) % 2
         is_paired = 1 if remainder else 0
         return is_paired

     def system_p(self, *cmd):
+        """Execute subprocess"""
         print(f"RUNNING: {' '.join(cmd)}")
         print()
         subprocess.run(cmd, check=True)

     def run(self):
+        """Run QC info extraction"""
         if self.baits and self.reference:
             print("Calculating HS-metrics...")
             dict_file = self.reference
@@ -85,11 +93,11 @@ def run(self):
             self.system_p(f"picard BedToIntervalList -I {self.baits} -O {self.baits}.interval_list -SD {dict_file}")
             self.system_p(f"picard CollectHsMetrics -I {self.bam} -O {self.bam}.hsmetrics -R {self.reference} -BAIT_INTERVALS {self.baits}.interval_list -TARGET_INTERVALS {self.bed}.interval_list")

-            with open(f"{self.bam}.hsmetrics") as hs:
-                for line in hs:
+            with open(f"{self.bam}.hsmetrics", "r", encoding="utf-8") as fin:
+                for line in fin:
                     if line.startswith("## METRICS CLASS"):
-                        next(hs)
-                        vals = next(hs).split("\t")
+                        next(fin)
+                        vals = next(fin).split("\t")
                         self.results['pct_on_target'] = vals[18]
                         self.results['fold_enrichment'] = vals[26]
                         self.results['median_coverage'] = vals[23]
@@ -104,7 +112,7 @@ def run(self):
         if self.paired:
             print("Collect insert sizes...")
             self.system_p(f"picard CollectInsertSizeMetrics -I {self.bam} -O {self.bam}.inssize -H {self.bam}.ins.pdf -STOP_AFTER 1000000")
-            with open(f"{self.bam}.inssize") as ins:
+            with open(f"{self.bam}.inssize", "r", encoding="utf-8") as ins:
                 for line in ins:
                     if line.startswith("## METRICS CLASS"):
                         next(ins)
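A note on the `is_paired` logic in patch 12: SAM FLAG bit 0x1 marks a template as
having multiple segments, so an odd FLAG on the first record implies paired reads:

    remainder = 99 % 2    # FLAG 99 (paired, proper pair) -> 1 -> paired
    remainder = 16 % 2    # FLAG 16 (unpaired, reverse strand) -> 0 -> unpaired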
From 5f32f5352606bfca29941a581c90cacdf816da8e Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:31:14 +0100
Subject: [PATCH 13/18] Fix tbprofiler.py pylinting errors

---
 jasentool/tbprofiler.py | 88 +++++++++++++++--------------------------
 1 file changed, 32 insertions(+), 56 deletions(-)

diff --git a/jasentool/tbprofiler.py b/jasentool/tbprofiler.py
index 64ad372..1df43bf 100644
--- a/jasentool/tbprofiler.py
+++ b/jasentool/tbprofiler.py
@@ -1,20 +1,23 @@
+"""Module that handles TBProfiler's database (tbdb)"""
+
 import os
 import re
 import sys
 import pandas as pd
-from tqdm import tqdm
 from jasentool.utils import Utils

-class Tbprofiler(object):
+class Tbprofiler:
+    """Class that handles TBProfiler tb mutation catalogue"""
     def __init__(self, tbdb_dir):
         self.tbdb_filepath = os.path.join(tbdb_dir, "tbdb.csv")
         self.chr_name = "Chromosome"
-        self.aa_long2short = self.get_aa_dict()
+        self.aa_long2short = Utils.get_aa_dict()

     def fasta2dict(self, filepath):
+        """Convert fasta to dictionary"""
         fa_dict = {}
         seq_name = ""
-        with open(filepath, 'r') as fin:
+        with open(filepath, 'r', encoding="utf-8") as fin:
             for line in fin:
                 line = line.rstrip()
                 if line.startswith(">"):
@@ -22,7 +25,7 @@ def fasta2dict(self, filepath):
                     fa_dict[seq_name] = []
                 else:
                     fa_dict[seq_name].append(line)
-        return {seq: "".join(fa_dict[seq]) for seq in fa_dict}
+        return {seq: "".join(val) for seq, val in fa_dict.items()}

     def reverse_complement(self, seq):
         """Return reverse complement of a sequence"""
@@ -33,64 +36,38 @@ def complement(seq):
         return complement(seq[::-1])

     def write_gene_pos(self, infile, genes, outfile):
+        """Write out gene positions"""
         output_txt = ""
-        with open(infile, "r") as fin:
+        with open(infile, "r", encoding="utf-8") as fin:
             for line in fin:
                 row = line.strip().split()
                 rv, gene, chr_start, chr_end, gene_start, gene_end = [row[0], row[1]]+[int(row[i]) for i in range(2,6)]
                 if rv in genes:
-                    y = 0
+                    offset = 0
                     for i, chr_pos in enumerate(range(chr_start, chr_end+1)):
                         x = 1 if gene_start< gene_end else -1
                         if gene_start+(x*i) == 0:
-                            y = 1 if gene_start< gene_end else -1
-                            output_txt += "%s\t%s\t%s\t%s\n" % (self.chr_name, chr_pos, rv, gene_start+(x*i)+y)
+                            offset = 1 if gene_start< gene_end else -1
+                        output_txt += f"{self.chr_name}\t{chr_pos}\t{rv}\t{gene_start+(x*i)+offset}\n"
-        with open(outfile, "w") as fout:
+        with open(outfile, "w", encoding="utf-8") as fout:
             fout.write(output_txt)

-    def get_aa_dict(self):
-        return {
-            'Ala': 'A',
-            'Arg': 'R',
-            'Asn': 'N',
-            'Asp': 'D',
-            'Asx': 'B',
-            'Cys': 'C',
-            'Glu': 'E',
-            'Gln': 'Q',
-            'Glx': 'Z',
-            'Gly': 'G',
-            'His': 'H',
-            'Ile': 'I',
-            'Leu': 'L',
-            'Lys': 'K',
-            'Met': 'M',
-            'Phe': 'F',
-            'Pro': 'P',
-            'Ser': 'S',
-            'Thr': 'T',
-            'Trp': 'W',
-            'Tyr': 'Y',
-            'Val': 'V',
-            "Stop":"*",
-            "-":"-"
-        }
-
     def parse_mutation(self, mut, gene, fasta_dict, gene_info):
+        """Parse mutation and determine type"""
         # AA change
         re_obj = re.search("p.([A-Z][a-z][a-z])([0-9]+)([A-Z][a-z][a-z])", mut)
         if re_obj:
             ref_aa = self.aa_long2short[re_obj.group(1)]
             alt_aa = self.aa_long2short[re_obj.group(3)]
             codon_num = re_obj.group(2)
-            return ["%s%s>%s%s" % (codon_num, ref_aa, codon_num, alt_aa)]
+            return [f"{codon_num}{ref_aa}>{codon_num}{alt_aa}"]
         # Stop codon
         re_obj = re.search("p.([A-Z][a-z][a-z])([0-9]+)(\*)", mut)
         if re_obj:
             ref_aa = self.aa_long2short[re_obj.group(1)]
             alt_aa = re_obj.group(3)
             codon_num = re_obj.group(2)
-            return ["%s%s>%s%s" % (codon_num, ref_aa, codon_num, alt_aa)]
+            return [f"{codon_num}{ref_aa}>{codon_num}{alt_aa}"]
         # Deletion single base
         re_obj = re.search("c.([\-0-9]+)del", mut)
         if re_obj:
@@ -101,7 +78,7 @@ def parse_mutation(self, mut, gene, fasta_dict, gene_info):
             else:
                 chr_start_nt = gene_info[gene]["start"] - gene_info[gene]["gene_start"] + gene_start_nt - (0 if gene_start_nt<0 else 1)
             seq = fasta_dict["Chromosome"][chr_start_nt-2:chr_start_nt]
-            return ["%s%s>%s" % (chr_start_nt-1,seq,seq[0])]
+            return [f"{chr_start_nt-1}{seq}>{seq[0]}"]
         # Deletion multi base
         re_obj = re.search("c.([\-0-9]+)_([\-0-9]+)del", mut)
         if re_obj:
@@ -115,7 +92,7 @@ def parse_mutation(self, mut, gene, fasta_dict, gene_info):
             chr_start_nt = gene_info[gene]["start"] - gene_info[gene]["gene_start"] + gene_start_nt - (0 if gene_start_nt<0 else 1)
             chr_end_nt = chr_start_nt+del_len-1
             seq = fasta_dict["Chromosome"][chr_start_nt-2:chr_end_nt]
-            return ["%s%s>%s" % (chr_start_nt-1, seq, seq[0])]
+            return [f"{chr_start_nt-1}{seq}>{seq[0]}"]
         # Insertion
         re_obj = re.search("c.([0-9]+)_([0-9]+)ins([A-Z]+)", mut)
         if re_obj:
@@ -128,7 +105,7 @@ def parse_mutation(self, mut, gene, fasta_dict, gene_info):
             else:
                 chr_start_nt = gene_info[gene]["start"] - gene_info[gene]["gene_start"] + gene_start_nt - 1
             seq_start = fasta_dict["Chromosome"][chr_start_nt-1]
-            return ["%s%s>%s" % (chr_start_nt,seq_start,seq_start+seq_ins)]
+            return [f"{chr_start_nt}{seq_start}>{seq_start+seq_ins}"]
         # Promoter Mutation
         ## c.-16G>C
         re_obj = re.search("c.(\-[0-9]+)([A-Z])>([A-Z])",mut)
@@ -139,11 +116,10 @@ def parse_mutation(self, mut, gene, fasta_dict, gene_info):
             strand = gene_info[gene]["strand"]

             if strand == "+":
-                chr_pos = gene_info[gene]["start"] - (gene_info[gene]["gene_start"] - nt_pos)
-                return ["%s%s>%s" % (nt_pos,ref_nt,alt_nt)]
-            else:
-                chr_pos = gene_info[gene]["end"] + (gene_info[gene]["gene_end"] - nt_pos)
-                return ["%s%s>%s" % (nt_pos, self.reverse_complement(ref_nt), self.reverse_complement(alt_nt))]
+                #chr_pos = gene_info[gene]["start"] - (gene_info[gene]["gene_start"] - nt_pos)
+                return [f"{nt_pos}{ref_nt}>{alt_nt}"]
+            #chr_pos = gene_info[gene]["end"] + (gene_info[gene]["gene_end"] - nt_pos)
+            return [f"{nt_pos}{self.reverse_complement(ref_nt)}>{self.reverse_complement(alt_nt)}"]
         # ncRNA Mutation
         ## r.514a>c
         re_obj = re.search("r.([0-9]+)([a-z]+)>([a-z]+)",mut)
@@ -151,7 +127,7 @@ def parse_mutation(self, mut, gene, fasta_dict, gene_info):
             nt_pos = re_obj.group(1)
             ref_nt = re_obj.group(2)
             alt_nt = re_obj.group(3)
-            return ["%s%s>%s" % (nt_pos,ref_nt.upper(),alt_nt.upper())]
+            return [f"{nt_pos}{ref_nt.upper()}>{alt_nt.upper()}"]
         # frameshift
         re_obj = re.search("frameshift",mut)
         if re_obj:
@@ -166,26 +142,26 @@ def parse_mutation(self, mut, gene, fasta_dict, gene_info):
         if re_obj:
             start = int(re_obj.group(1))
             end = int(re_obj.group(2))
-            return ["any_missense_codon_%s" % i for i in range(start,end+1)]
+            return [f"any_missense_codon_{pos}" for pos in range(start,end+1)]
         # Codon single
         ## any_missense_codon_425
         re_obj = re.search("any_missense_codon_([0-9]+)",mut)
         if re_obj:
             start = int(re_obj.group(1))
-            return ["any_missense_codon_%s" % start]
+            return [f"any_missense_codon_{start}"]
         # Indel range
         re_obj = re.search("any_indel_nucleotide_([0-9]+)_([0-9]+)",mut)
         if re_obj:
             start = int(re_obj.group(1))
             end = int(re_obj.group(2))
-            return ["any_indel_nucleotide_%s" % i for i in range(start,end+1)]
+            return [f"any_indel_nucleotide_{pos}" for pos in range(start,end+1)]
         # large_deletion
-        re_obj = re.search("large_deletion",mut)
+        re_obj = re.search("large_deletion", mut)
         if re_obj:
             return ["large_deletion"]
-        sys.exit("%s is not a valid formatted mutation... Exiting!" % mut)
-
-    def _parse(self, fasta_filepath, gff_filepath, download_dir):
+        sys.exit(f"{mut} is not a valid formatted mutation... Exiting!")
+
+    def _parse(self, download_dir):
         utils = Utils()
         tbdb_url = "https://raw.githubusercontent.com/jodyphelan/tbdb/master/tbdb.csv"
         tbdb_filepath = os.path.join(download_dir, "tbdb.csv")
From c7f30395eba769b14e5b90ef6f4b04fd423218c6 Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:31:29 +0100
Subject: [PATCH 14/18] Fix utils.py pylinting errors

---
 jasentool/utils.py | 74 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 58 insertions(+), 16 deletions(-)

diff --git a/jasentool/utils.py b/jasentool/utils.py
index 9cc85b2..7e17ac4 100644
--- a/jasentool/utils.py
+++ b/jasentool/utils.py
@@ -1,34 +1,40 @@
-#!/usr/bin/env python3
+"""Module for utility tools"""

 import os
 import csv
 import shutil
 import pathlib
-import requests
 import subprocess
-import pandas as pd
 from time import sleep
-from zipfile import ZipFile 
+from zipfile import ZipFile
+import requests

-class Utils(object):
+class Utils:
+    """Class containing utilities used throughout jasentool"""
     @staticmethod
     def write_out_csv(csv_dict, assay, platform, out_fpath):
-        with open(out_fpath, 'w+') as csvfile:
+        """Write out file as csv"""
+        with open(out_fpath, 'w+', encoding="utf-8") as csvfile:
             fieldnames = ["id", "group", "species", "assay", "platform", "read1", "read2"] #header
             writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
             writer.writeheader()
             for sample in csv_dict:
-                row_dict = {"id":sample, "group": csv_dict[sample][0], "species": csv_dict[sample][1], "assay": assay, "platform": platform, "read1": csv_dict[sample][2][0], "read2": csv_dict[sample][2][1]} #write rows to CSV
+                row_dict = {"id":sample, "group": csv_dict[sample][0],
+                            "species": csv_dict[sample][1], "assay": assay,
+                            "platform": platform, "read1": csv_dict[sample][2][0],
+                            "read2": csv_dict[sample][2][1]} #write rows to CSV
                 writer.writerow(row_dict)

     @staticmethod
     def write_out_txt(output_txt, out_fpath):
-        with open(out_fpath, 'w+') as fout:
+        """Write out file as text"""
+        with open(out_fpath, 'w+', encoding="utf-8") as fout:
             fout.write(output_txt)

     @staticmethod
     def pipeline_ready(batch_file):
-        assays = ['saureus']
+        """Check if pipeline exists"""
+        assays = ['saureus', 'ecoli', 'mtuberculosis']
         for assay in assays:
             if assay in batch_file:
                 return True
@@ -36,6 +42,7 @@ def pipeline_ready(batch_file):

     @staticmethod
     def copy_batch_and_csv_files(batch_files, csv_files, remote_dir, remote_hostname, remote=False):
+        """Copy shell and csv files to desired (remote) location"""
         if remote:
             # Copy files to remote server using ssh/scp
             process = subprocess.run(
@@ -51,21 +58,24 @@ def copy_batch_and_csv_files(batch_files, csv_files, remote_dir, remote_hostname
         else:
             # Copy files to a local directory
             pathlib.Path(remote_dir).mkdir(parents=True, exist_ok=True)
-            for fn in batch_files + csv_files:
-                shutil.copy(fn, remote_dir)
+            for fin in batch_files + csv_files:
+                shutil.copy(fin, remote_dir)

     @staticmethod
     def start_remote_pipelines(batch_files, remote_hostname, remote_dir):
+        """Start nextflow pipelines on a remote server"""
         for batch_file in batch_files:
             if Utils.pipeline_ready(batch_file):
                 sleep(10.0) # Avoid maxing SSH auth connections
                 process = subprocess.Popen(
-                    ["ssh", remote_hostname, "bash", f"{remote_dir}/{os.path.basename(batch_file)}"],
+                    ["ssh", remote_hostname,
+                     "bash", f"{remote_dir}/{os.path.basename(batch_file)}"],
                     close_fds=True
                 )

     @staticmethod
     def download_and_save_file(url, output_filepath):
+        """Download the file and save it to the user-specified path"""
         try:
             # Make a request to the URL
             response = requests.get(url, stream=True)
@@ -79,18 +89,50 @@ def download_and_save_file(url, output_filepath):

             print(f"File downloaded and saved to: {output_filepath}")

-        except requests.exceptions.RequestException as e:
-            print(f"Error downloading the file: {e}")
+        except requests.exceptions.RequestException as error_code:
+            print(f"Error downloading the file: {error_code}")

     @staticmethod
     def unzip(zip_file, outdir):
+        """Unzip zip file"""
         with ZipFile(zip_file, 'r') as zip_object:
             zip_object.extractall(path=outdir)

     @staticmethod
     def copy_file(source, destination):
+        """Copy file from source to destination"""
         try:
             shutil.copy(source, destination)
             print(f"File copied from {source} to {destination}")
-        except Exception as e:
-            print(f"Error copying file: {e}")
+        except Exception as error_code:
+            print(f"Error copying file: {error_code}")
+
+    @staticmethod
+    def get_aa_dict():
+        """Amino acid one letter translations"""
+        return {
+            'Ala': 'A',
+            'Arg': 'R',
+            'Asn': 'N',
+            'Asp': 'D',
+            'Asx': 'B',
+            'Cys': 'C',
+            'Glu': 'E',
+            'Gln': 'Q',
+            'Glx': 'Z',
+            'Gly': 'G',
+            'His': 'H',
+            'Ile': 'I',
+            'Leu': 'L',
+            'Lys': 'K',
+            'Met': 'M',
+            'Phe': 'F',
+            'Pro': 'P',
+            'Ser': 'S',
+            'Thr': 'T',
+            'Trp': 'W',
+            'Tyr': 'Y',
+            'Val': 'V',
+            "Stop":"*",
+            "-":"-"
+        }
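Both tbprofiler.py (patch 13) and who.py (patch 16) now consume this relocated
lookup table, so a quick sanity check:

    from jasentool.utils import Utils

    aa = Utils.get_aa_dict()
    assert aa['Ser'] == 'S' and aa['Stop'] == '*'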
json from jasentool.database import Database from jasentool.utils import Utils -class Validate(object): +class Validate: + """Class to validate old pipeline (cgviz) with new pipeline (jasen)""" def get_sample_id(self, results): + """Get sample ID from mongodb""" return results["sample_id"] def get_species_name(self, results): + """Get species name from mongodb""" return results["species_prediction"][0]["scientific_name"] def _check_exists(self, db_collection, sample_id): - return (True if list(Database.find(db_collection, {"id": sample_id}, {})) else False) + """Check if sample ID exists in mongodb""" + return bool(list(Database.find(db_collection, {"id": sample_id}, {}))) def search(self, search_query, search_kw, search_list): + """Search for query in list of arrays""" return [element for element in search_list if element[search_kw] == search_query] def get_virulence_results(self, results): + """Get virulence results""" return self.search("VIRULENCE", "type", results["element_type_result"]) def get_pvl(self, results): + """Get pvl result""" virulence_results = self.get_virulence_results(results) - return (True if self.search("lukS-PV", "gene_symbol", virulence_results[0]["result"]["genes"]) else False) + return bool(self.search("lukS-PV", "gene_symbol", virulence_results[0]["result"]["genes"])) def get_mlst(self, results): + """Get mlst result""" return self.search("mlst", "type", results["typing_result"]) def get_cgmlst(self, results): + """Get cgmlst result""" return self.search("cgmlst", "type", results["typing_result"]) def get_mdb_cgv_data(self, db_collection, sample_id): + """Get sample mongodb data""" mdb_pvl = list(Database.get_pvl(db_collection, {"id": sample_id, "metadata.QC": "OK"})) mdb_mlst = list(Database.get_mlst(db_collection, {"id": sample_id, "metadata.QC": "OK"})) mdb_cgmlst = list(Database.get_cgmlst(db_collection, {"id": sample_id, "metadata.QC": "OK"})) @@ -39,20 +49,24 @@ def get_mdb_cgv_data(self, db_collection, sample_id): mdb_mlst_seqtype = str(mdb_mlst[0]["mlst"]["sequence_type"]) if mdb_mlst[0]["mlst"]["sequence_type"] != "-" else str(None) mdb_mlst_alleles = mdb_mlst[0]["mlst"]["alleles"] mdb_cgmlst_alleles = mdb_cgmlst[0]["alleles"] - return {"pvl": mdb_pvl_present, "mlst_seqtype": mdb_mlst_seqtype, "mlst_alleles": mdb_mlst_alleles, "cgmlst_alleles": mdb_cgmlst_alleles} + return {"pvl": mdb_pvl_present, "mlst_seqtype": mdb_mlst_seqtype, + "mlst_alleles": mdb_mlst_alleles, "cgmlst_alleles": mdb_cgmlst_alleles} except IndexError: return False def get_fin_data(self, sample_json): + """Get sample input file data""" fin_pvl_present = self.get_pvl(sample_json) fin_mlst = self.get_mlst(sample_json) fin_cgmlst = self.get_cgmlst(sample_json) fin_mlst_seqtype = str(fin_mlst[0]["result"]["sequence_type"]) fin_mlst_alleles = fin_mlst[0]["result"]["alleles"] fin_cgmlst_alleles = list(fin_cgmlst[0]["result"]["alleles"].values()) - return {"pvl": fin_pvl_present, "mlst_seqtype": fin_mlst_seqtype, "mlst_alleles": fin_mlst_alleles, "cgmlst_alleles": fin_cgmlst_alleles} + return {"pvl": fin_pvl_present, "mlst_seqtype": fin_mlst_seqtype, + "mlst_alleles": fin_mlst_alleles, "cgmlst_alleles": fin_cgmlst_alleles} def compare_mlst_alleles(self, old_mlst_alleles, new_mlst_alleles): + """Parse through mlst alleles of old and new pipeline and compare results""" match_count, total_count = 0, 0 for allele in old_mlst_alleles: if str(old_mlst_alleles[allele]) == str(new_mlst_alleles[allele]): @@ -61,18 +75,21 @@ def compare_mlst_alleles(self, old_mlst_alleles, new_mlst_alleles): return 
100*(match_count/total_count) def compare_cgmlst_alleles(self, old_cgmlst_alleles, new_cgmlst_alleles): + """Parse through cgmlst alleles of old and new pipeline and compare results""" match_count, total_count = 0, 0 - for allele in range(0, len(old_cgmlst_alleles)): - if str(old_cgmlst_alleles[allele]) == str(new_cgmlst_alleles[allele]): + for idx, old_allele in enumerate(old_cgmlst_alleles): + if str(old_allele) == str(new_cgmlst_alleles[idx]): match_count += 1 total_count += 1 return 100*(match_count/total_count) def compare_data(self, sample_id, old_data, new_data): + """Compare data between old pipeline and new pipeline""" pvl_comp = int(old_data["pvl"] == new_data["pvl"]) mlst_seqtype_comp = int(old_data["mlst_seqtype"] == new_data["mlst_seqtype"]) if mlst_seqtype_comp == 0: - mlst_at_list = [f'{old_data["mlst_alleles"][gene]},{new_data["mlst_alleles"][gene]}' for gene in sorted(old_data["mlst_alleles"].keys())] + mlst_at_list = [f'{old_data["mlst_alleles"][gene]},{new_data["mlst_alleles"][gene]}' + for gene in sorted(old_data["mlst_alleles"].keys())] mlst_at_str = ",".join(mlst_at_list) print(f'{sample_id},{old_data["mlst_seqtype"]},{new_data["mlst_seqtype"]},{mlst_at_str}') mlst_alleles = self.compare_mlst_alleles(old_data["mlst_alleles"], new_data["mlst_alleles"]) @@ -80,10 +97,11 @@ def compare_data(self, sample_id, old_data, new_data): return f"{sample_id},{pvl_comp},{mlst_seqtype_comp},{mlst_alleles},{cgmlst_alleles}" def run(self, input_files, output_fpaths, db_collection, combined_output): + """Execute validation of new pipeline (jasen)""" utils = Utils() csv_output = "sample_id,pvl,mlst_seqtype,mlst_allele_matches(%),cgmlst_allele_matches(%)" for input_idx, input_file in enumerate(input_files): - with open(input_file, 'r') as fin: + with open(input_file, 'r', encoding="utf-8") as fin: sample_json = json.load(fin) sample_id = self.get_sample_id(sample_json) if not self._check_exists(db_collection, sample_id): @@ -91,7 +109,7 @@ def run(self, input_files, output_fpaths, db_collection, combined_output): continue mdb_data_dict = self.get_mdb_cgv_data(db_collection, sample_id) if mdb_data_dict: - species_name = self.get_species_name(sample_json) + #species_name = self.get_species_name(sample_json) fin_data_dict = self.get_fin_data(sample_json) compared_data_output = self.compare_data(sample_id, mdb_data_dict, fin_data_dict) csv_output += "\n" + compared_data_output From ca04dae9e67f809d08de694ddd885f141c1c7880 Mon Sep 17 00:00:00 2001 From: ryanjameskennedy Date: Wed, 17 Jan 2024 14:32:05 +0100 Subject: [PATCH 16/18] Fix who.py pylinting errors --- jasentool/who.py | 172 ++++++++++++++++++++++++----------------------- 1 file changed, 89 insertions(+), 83 deletions(-) diff --git a/jasentool/who.py b/jasentool/who.py index 0d89bf4..8e983de 100644 --- a/jasentool/who.py +++ b/jasentool/who.py @@ -1,49 +1,27 @@ +"""Module for handling WHO mutation catalogue""" + import os import re import pandas as pd from tqdm import tqdm from jasentool.utils import Utils -class WHO(object): +class WHO: + """Class for handling WHO tb mutation catalogue""" def __init__(self): - self.aa_dict_1 = self.get_aa_dict() + self.aa_dict_1 = Utils.get_aa_dict() self.aa_dict_2 = self.inv_dict() self.nucleotide_complements = self.get_nt_complements() self.drug_dict = self.get_drug_dict() self.re_c, self.re_p, self.re_d, self.re_i = self.setup_re() self.re_attr = re.compile('Name=([^;]+).*locus_tag=([^;|\n]+)') - def get_aa_dict(self): - return { - 'Ala': 'A', - 'Arg': 'R', - 'Asn': 'N', - 'Asp': 'D', - 'Asx': 
'B', - 'Cys': 'C', - 'Glu': 'E', - 'Gln': 'Q', - 'Glx': 'Z', - 'Gly': 'G', - 'His': 'H', - 'Ile': 'I', - 'Leu': 'L', - 'Lys': 'K', - 'Met': 'M', - 'Phe': 'F', - 'Pro': 'P', - 'Ser': 'S', - 'Thr': 'T', - 'Trp': 'W', - 'Tyr': 'Y', - 'Val': 'V', - '*': '!', - } - def inv_dict(self): + """Invert amino acid dictionary""" return {v: k for k, v in self.aa_dict_1.items()} def get_nt_complements(self): + """Get nucleotide complements""" return { 'C': 'G', 'G': 'C', @@ -52,6 +30,7 @@ def get_nt_complements(self): } def get_drug_dict(self): + """Get drug 3 letter code translation dictionary""" return { 'RIF': 'rifampicin', 'INH': 'isoniazid', @@ -71,31 +50,34 @@ def get_drug_dict(self): } def setup_re(self): - # Setup the regular expressions - re_c = re.compile('^(\w+)_([actg])(-*\d+)([actg])$') #regex pattern for - re_p = re.compile('^(\w+)_([A-Z])(\d+)([A-Z!])$') #regex pattern for protein - re_d = re.compile('^(\w+)_(-*\d+)_del_(\d+)_([actg]+)_([actg]+)$') #regex pattern for deletions - re_i = re.compile('^(\w+)_(-*\d+)_ins_(\d+)_([actg]+)_([actg]+)$') #regex pattern for insertions + """Setup the regular expressions""" + re_c = re.compile('^(\w+)_([actg])(-*\d+)([actg])$') #regex pattern for nucleotide changes + re_p = re.compile('^(\w+)_([A-Z])(\d+)([A-Z!])$') #regex for protein + re_d = re.compile('^(\w+)_(-*\d+)_del_(\d+)_([actg]+)_([actg]+)$') #regex for deletions + re_i = re.compile('^(\w+)_(-*\d+)_ins_(\d+)_([actg]+)_([actg]+)$') #regex for insertions return re_c, re_p, re_d, re_i def lower_row(self, row): + """Lowercase string in row""" return row.str.lower() def read_files(self, gff_filepath, xlsx_filepath, h37rv_filepath): + """Read gff, excel & genome files""" # Load the reference GFF file - gff = pd.read_csv(gff_filepath, names=['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'], sep='\t', header=None) + columns = ['seqid', 'source', 'type', 'start', 'end', + 'score', 'strand', 'phase', 'attributes'] + gff = pd.read_csv(gff_filepath, names=columns, sep='\t', header=None) # Load the WHO catalogue catalogue = pd.read_excel(xlsx_filepath, sheet_name='Catalogue_master_file', header=2) # Load the reference genome to impute missing data from deletions h37rv = '' - with open(h37rv_filepath, 'r') as fin: + with open(h37rv_filepath, 'r', encoding="utf-8") as fin: for line in fin: h37rv += line.replace('\n', '') return gff, catalogue, h37rv def process_variant(self, variant, gff_dict): - '''Translates variants in the WHO catalogue format to HGVS''' - + """Translates variants in the WHO catalogue format to HGVS""" c_match = self.re_c.match(variant) if c_match: if gff_dict[c_match[1]]['type'] == 'rRNA': @@ -106,37 +88,37 @@ def process_variant(self, variant, gff_dict): v_type = 'c' ref = c_match[2].upper() alt = c_match[4].upper() - return (c_match[1], v_type, '{}.{}{}>{}'.format(v_type, c_match[3], ref, alt), False, None) + return (c_match[1], v_type, f'{v_type}.{c_match[3]}{ref}>{alt}', False, None) p_match = self.re_p.match(variant) if p_match: - return (p_match[1], 'p', 'p.{}{}{}'.format(self.aa_dict_2[p_match[2].upper()], p_match[3], self.aa_dict_2[p_match[4].upper()]), False, None) + return (p_match[1], 'p', f'p.{self.aa_dict_2[p_match[2].upper()]}{p_match[3]}{self.aa_dict_2[p_match[4].upper()]}', False, None) d_match = self.re_d.match(variant) if d_match: if int(d_match[3]) != len(d_match[4]) - len(d_match[5]): return (None, None, None, True, 'length mismatch') - starts = [pos for pos in range(1, len(d_match[4]) + 1 - int(d_match[3])) if 
d_match[4][:pos]+d_match[4][pos+int(d_match[3]):] == d_match[5]]
+            starts = [pos for pos in range(1, len(d_match[4])+1-int(d_match[3]))
+                      if d_match[4][:pos]+d_match[4][pos+int(d_match[3]):] == d_match[5]]
             if not starts:
                 return (None, None, None, True, 'invalid indel')
             if not gff_dict[d_match[1]]['strand']:
                 hgvs = []
                 for start in starts:
                     if int(d_match[3]) == 1:
-                        hgvs.append('c.{}del'.format(int(d_match[2])+start))
+                        hgvs.append(f'c.{int(d_match[2])+start}del')
                     else:
-                        hgvs.append('c.{}_{}del'.format(int(d_match[2])+start, int(d_match[2])+start-1+int(d_match[3])))
-                return (d_match[1], 'c', '|'.join(hgvs), False, None)
-            else:
-                hgvs = []
-                for start in starts:
-                    if int(d_match[3]) == 1:
-                        hgvs.append('c.{}del'.format(int(d_match[2]) - start - int(d_match[3]) + 1))
-                    else:
-                        v = 'c.{}_{}del'.format(int(d_match[2]) - start - int(d_match[3]) + 1, int(d_match[2]) - start)
-                        hgvs.append(v)
+                        hgvs.append(f'c.{int(d_match[2])+start}_{int(d_match[2])+start-1+int(d_match[3])}del')
                 return (d_match[1], 'c', '|'.join(hgvs), False, None)
+            hgvs = []
+            for start in starts:
+                if int(d_match[3]) == 1:
+                    hgvs.append(f'c.{int(d_match[2]) - start - int(d_match[3]) + 1}del')
+                else:
+                    hgvs_var = f'c.{int(d_match[2])-start-int(d_match[3])+1}_{int(d_match[2])-start}del'
+                    hgvs.append(hgvs_var)
+            return (d_match[1], 'c', '|'.join(hgvs), False, None)
 
         i_match = self.re_i.match(variant)
         if i_match:
@@ -148,18 +130,26 @@ def process_variant(self, variant, gff_dict):
             if not gff_dict[i_match[1]]['strand']:
                 hgvs = []
                 for start in starts:
-                    hgvs.append('c.{}_{}ins{}'.format(int(i_match[2])+start-1, int(i_match[2])+start, ''.join([i.upper() for i in i_match[5][start:start+int(i_match[3])]])))
-                return (i_match[1], 'c', '|'.join(hgvs), False, None)
-            else:
-                hgvs = []
-                for start in starts:
-                    v = 'c.{}_{}ins{}'.format(int(i_match[2])-start, int(i_match[2]) - start+1, ''.join([self.nucleotide_complements[i.upper()] for i in i_match[5][start:start+int(i_match[3])][::-1]]))
-                    hgvs.append(v)
+                    start_pos = int(i_match[2])+start-1
+                    end_pos = int(i_match[2])+start
+                    seq = ''.join([i.upper() for i in i_match[5][start:start+int(i_match[3])]])
+                    hgvs_var = f'c.{start_pos}_{end_pos}ins{seq}'
+                    hgvs.append(hgvs_var)
                 return (i_match[1], 'c', '|'.join(hgvs), False, None)
+            hgvs = []
+            for start in starts:
+                start_pos = int(i_match[2])-start
+                end_pos = int(i_match[2]) - start + 1
+                seq = ''.join([self.nucleotide_complements[i.upper()]
+                               for i in i_match[5][start:start+int(i_match[3])][::-1]])
+                hgvs_var = f'c.{start_pos}_{end_pos}ins{seq}'
+                hgvs.append(hgvs_var)
+            return (i_match[1], 'c', '|'.join(hgvs), False, None)
         return (None, None, None, True, 'does not match indel or variant')
 
     def extract_info(self, info_string):
+        """Extract gene name and locus tag from provided string"""
         if pd.notna(info_string):
             match = self.re_attr.search(info_string)
             if match:
@@ -169,8 +159,7 @@ def extract_info(self, info_string):
             return pd.Series([None, None])
 
     def get_gene_info(self, gff):
-        # Get the gene information from the GFF file
-        # Apply the function to the 'attributes' column
+        """Get gene info from the GFF file and apply the function to the 'attributes' column"""
         gff[['locus_tag', 'name']] = gff.attributes.apply(self.extract_info)
 
         gff_dict = {}
@@ -191,13 +180,14 @@ def get_gene_info(self, gff):
         return gff_dict
 
     def prep_catalogue(self, catalogue):
-        # Prepare the WHO catalogue dataframe
+        """Prepare the WHO catalogue dataframe"""
         classified = []
-        v = re.compile('^(.*) \((.*)\)')
+        variant_re = re.compile('^(.*) 
\((.*)\)')
+        who_mut_cat_url = 'https://www.who.int/publications/i/item/9789240028173'
         for var, row in catalogue[catalogue[('FINAL CONFIDENCE GRADING', 'Unnamed: 51_level_1')].apply(lambda conf: conf != 'combo')].iterrows():
             drug_key = row[('drug', 'Unnamed: 0_level_1')]
             drug = self.drug_dict[drug_key]
-            v_match = v.match(var)
+            v_match = variant_re.match(var)
             if v_match:
                 # Include all variants listed
                 variants = [v_match[1]] + [i.strip() for i in v_match[2].split(',')]
@@ -206,14 +196,16 @@ def prep_catalogue(self, catalogue):
             else:
                 variants = [var]
             category = ' '.join(row[('FINAL CONFIDENCE GRADING', 'Unnamed: 51_level_1')].split(' ')[1:])
-            genome_pos = '{:.0f}'.format(row[('Genome position', 'Unnamed: 3_level_1')])
+            #genome_pos = '{:.0f}'.format(row[('Genome position', 'Unnamed: 3_level_1')])
             for variant in variants:
-                classified.append([variant, drug, 'resistance', '', 'https://www.who.int/publications/i/item/9789240028173', category])
-        classified = pd.DataFrame(classified, columns=['variant', 'Drug', 'Confers', 'Interaction', 'Literature', 'WHO Confidence'])
+                entry = [variant, drug, 'resistance', '', who_mut_cat_url, category]
+                classified.append(entry)
+        column_names = ['variant', 'Drug', 'Confers', 'Interaction', 'Literature', 'WHO Confidence']
+        classified = pd.DataFrame(classified, columns=column_names)
         return classified
 
     def var2hgvs(self, classified, gff_dict):
-        # Convert the variants to HGVS format
+        """Convert the variants to HGVS format"""
        for idx, row in tqdm(classified.iterrows(), total=classified.shape[0]):
             gene, var_type, variant, fail, fail_reason = self.process_variant(row.variant, gff_dict)
             classified.loc[idx, 'gene'] = gene
@@ -224,27 +216,37 @@ def var2hgvs(self, classified, gff_dict):
         return classified
 
     def impute_del(self, classified, gff_dict, h37rv):
-        # Impute missing data for deletions
+        """Impute missing data for deletions"""
         length_mismatch = classified[classified.fail_reason == 'length mismatch'].sort_values(by='variant', key=self.lower_row)
         for idx, row in tqdm(length_mismatch.iterrows(), total=length_mismatch.shape[0]):
             d_match = self.re_d.match(row.variant)
             if d_match:
                 if not gff_dict[d_match[1]]['strand']:
-                    indexing_correction = -1 if int(d_match[2]) < 0 else -2 # correct for 0 based python indexing (-1 if promotor, -2 if within gene)
+                    # Correct for 0-based Python indexing (-1 if promoter, -2 if within gene)
+                    indexing_correction = -1 if int(d_match[2]) < 0 else -2
+
                     start = int(gff_dict[d_match[1]]['start']) + int(d_match[2]) + int(indexing_correction)
-                    end = start + int(d_match[3]) + len(d_match[5]) # add the lenght of the alt allele to account for the bases not part of the indel
+
+                    # Add the length of the alt allele to account for the bases not part of the indel
+                    end = start + int(d_match[3]) + len(d_match[5])
                     try:
-                        complete_variant = '{}_{}_del_{}_{}_{}'.format(d_match[1], d_match[2], d_match[3], h37rv[start:end].lower(), d_match[5])
+                        complete_variant = f'{d_match[1]}_{d_match[2]}_del_{d_match[3]}_{h37rv[start:end].lower()}_{d_match[5]}'
                     except TypeError:
                         print(f"{start}: {type(start)}\n{end}: {type(end)}")
                     classified.loc[idx, 'complete_variant'] = complete_variant
                     classified.loc[idx, 'complete_variant_fail'] = False
                 else:
-                    indexing_correction = -1 if int(d_match[2]) < 0 else 0 # correct for 0 based python indexing (-1 if promotor, 0 if within gene)
-                    start = int(gff_dict[d_match[1]]['end']) - int(d_match[2]) + int(indexing_correction) # subtract d_match[2] instead of adding as this is the opposite strand
-                    end = start + int(d_match[3]) + 
len(d_match[5]) # add the lenght of the alt allele to account for the bases not part of the indel
-                    complete_variant = '{}_{}_del_{}_{}_{}'.format(d_match[1], d_match[2], d_match[3], h37rv[start:end].lower(), d_match[5])
+                    # Correct for 0-based Python indexing (-1 if promoter, 0 if within gene)
+                    indexing_correction = -1 if int(d_match[2]) < 0 else 0
+
+                    # Subtract d_match[2] instead of adding as this is the opposite strand
+                    start = int(gff_dict[d_match[1]]['end']) - int(d_match[2]) + int(indexing_correction)
+
+                    # Add the length of the alt allele to account for the bases not part of the indel
+                    end = start + int(d_match[3]) + len(d_match[5])
+
+                    complete_variant = f'{d_match[1]}_{d_match[2]}_del_{d_match[3]}_{h37rv[start:end].lower()}_{d_match[5]}'
                     classified.loc[idx, 'complete_variant'] = complete_variant
                     classified.loc[idx, 'complete_variant_fail'] = False
                     continue
@@ -259,10 +261,11 @@ def impute_del(self, classified, gff_dict, h37rv):
                 pass
             continue
         return classified
-    
+
     def imp2hgvs(self, classified, gff_dict):
-        # Convert imputed deletions to HGVS format
-        for idx, row in tqdm(classified[classified.complete_variant_fail == False].iterrows(), total=classified[classified.complete_variant_fail == False].shape[0]):
+        """Convert imputed deletions to HGVS format"""
+        for idx, row in tqdm(classified[classified.complete_variant_fail == False].iterrows(),
+                             total=classified[classified.complete_variant_fail == False].shape[0]):
             if row.complete_variant_fail:
                 continue
             gene, var_type, variant, fail, fail_reason = self.process_variant(row.complete_variant, gff_dict)
@@ -274,17 +277,19 @@ def imp2hgvs(self, classified, gff_dict):
         return classified
 
     def write_out_csv(self, classified, csv_outpath):
-        # Write results to csv file
+        """Write results to csv file"""
         classified.to_csv(csv_outpath, index=False)
 
     def _parse(self, fasta_filepath, gff_filepath, download_dir):
+        """Parse WHO excel file"""
         utils = Utils()
         #who_url = "https://apps.who.int/iris/bitstream/handle/10665/341906/WHO-UCN-GTB-PCI-2021.7-eng.xlsx"
         who_url = "https://raw.githubusercontent.com/GTB-tbsequencing/mutation-catalogue-2023/main/Final%20Result%20Files/WHO-UCN-TB-2023.5-eng.xlsx"
         who_filepath = os.path.join(download_dir, "who.xlsx")
         utils.download_and_save_file(who_url, who_filepath)
-        gff, catalogue, h37rv = self.read_files(gff_filepath, who_filepath, fasta_filepath)
-        gff_dict = self.get_gene_info(gff)
+        _, catalogue, _ = self.read_files(gff_filepath, who_filepath, fasta_filepath)
+        #gff, catalogue, h37rv = self.read_files(gff_filepath, who_filepath, fasta_filepath)
+        #gff_dict = self.get_gene_info(gff)
         catalogue.columns = catalogue.columns.str.title()
         catalogue.rename(columns={'Final Confidence Grading': 'WHO Confidence'}, inplace=True)
         catalogue['Confers'] = 'resistance'
@@ -292,7 +297,8 @@ def _parse(self, fasta_filepath, gff_filepath, download_dir):
         catalogue['Literature'] = 'https://www.who.int/publications/i/item/9789240082410'
         catalogue['WHO Confidence'] = catalogue['WHO Confidence'].apply(lambda x: ' '.join(x.split(' ')[1:]))
         catalogue['Drug'] = catalogue['Drug'].apply(lambda x: x.lower())
-        catalogue = catalogue.loc[:, ["Drug","Confers","Interaction","Literature","WHO Confidence","Gene","Mutation"]]
+        catalogue = catalogue.loc[:, ["Drug", "Confers", "Interaction", "Literature",
+                                      "WHO Confidence", "Gene", "Mutation"]]
         csv_outpath = os.path.join(download_dir, "who.csv")
         self.write_out_csv(catalogue, csv_outpath)
         return catalogue

From c9b8876eac95d83b5c89819bc588cdc7f5e945c7 Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:32:20 +0100
Subject: [PATCH 17/18] Fix setup.py pylinting errors

---
 setup.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 5c1e4e1..30d5385 100644
--- a/setup.py
+++ b/setup.py
@@ -1,3 +1,5 @@
+"""setup.py for pip installation of jasentool"""
+
 import re
 import sys
 
@@ -9,10 +11,10 @@
         'Try runnning: python -m ensurepip'
     )
 
-with open("README.md", 'r') as fin:
+with open("README.md", 'r', encoding="utf-8") as fin:
     long_description = fin.read()
 
-with open("jasentool/__init__.py", 'r') as fin:
+with open("jasentool/__init__.py", 'r', encoding="utf-8") as fin:
     version_line_regex = re.compile(r'^\s*__version__\s*=\s*[\'"]([^\'"]+)[\'"]')
     for line in fin:
         match = version_line_regex.match(line)
@@ -33,6 +35,6 @@
     ],
     install_requires=["pymongo", "openpyxl", "biopython"],
     entry_points={"console_scripts": ["jasentool=jasentool.__main__:main"]},
-    packages=find_packages(exclude=("tests")),
+    packages=find_packages(exclude=("tests",)),
     package_data={"jasentool": ["data/dbs/*"]},
-)
\ No newline at end of file
+)

From 7422c5535531df3bd6a82125343a6d8acd12df42 Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:33:06 +0100
Subject: [PATCH 18/18] Add pylint GA workflow

---
 .github/workflows/pylint.yml | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 .github/workflows/pylint.yml

diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
new file mode 100644
index 0000000..a68dccd
--- /dev/null
+++ b/.github/workflows/pylint.yml
@@ -0,0 +1,24 @@
+name: Pylint
+
+on: [push]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pylint pytest
+        pip install -e .
+    - name: Analysing the code with pylint
+      run: |
+        pylint --disable=W1401,R0914,W0718 --fail-under 9 $(git ls-files '*.py')
\ No newline at end of file
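
A quick illustration of how two pieces of this series fit together: patch 14 moves the
amino-acid one-letter table into Utils.get_aa_dict(), and patch 16's who.py inverts it
(inv_dict) before rendering protein variants in HGVS form via the re_p pattern from
setup_re(). The sketch below is illustrative only: it assumes jasentool is installed
(pip install -e ., as in the workflow above), and katG_S315T is a hypothetical
WHO-style input, not a value taken from these patches.

    import re

    from jasentool.utils import Utils

    # Invert the three-letter -> one-letter table, as WHO.inv_dict() does.
    one_to_three = {one: three for three, one in Utils.get_aa_dict().items()}

    # Same shape as WHO.setup_re()'s re_p: <gene>_<ref><codon><alt>.
    re_p = re.compile(r'^(\w+)_([A-Z])(\d+)([A-Z!])$')

    match = re_p.match("katG_S315T")
    if match:
        gene, ref, pos, alt = match.groups()
        # e.g. prints: katG p.Ser315Thr
        print(gene, f"p.{one_to_three[ref]}{pos}{one_to_three[alt]}")

One caveat worth noting: get_aa_dict() maps 'Stop' to '*', so the '!' stop notation
that re_p still accepts has no entry in the inverted table and would need to be
handled separately.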