From 61a97e6ebc5b11bb1c2787b54dc1102df71f7f44 Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Tue, 16 Jan 2024 19:01:42 +0100
Subject: [PATCH 01/18] Fix __main__.py pylinting errors

---
 jasentool/__main__.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/jasentool/__main__.py b/jasentool/__main__.py
index d2221e7..6c64c3b 100644
--- a/jasentool/__main__.py
+++ b/jasentool/__main__.py
@@ -1,15 +1,17 @@
+"""__main__ file that handles help and cli execution"""
+
 import sys
-import os
 from jasentool import __author__, __copyright__, __version__
 from jasentool.cli import get_main_parser
 from jasentool.main import OptionsParser

 def print_help():
-    print('''
+    """Print help string for jasentool software"""
+    print(f'''

-    ...::: Jasentool v%s :::...
-Author(s): %s
+    ...::: Jasentool v{__version__} :::...
+Author(s): {__author__}

 Description:
   This software is a mongodb tool that fetches, inserts and
@@ -32,9 +34,10 @@ def print_help():
   fix        Fix output files from bjorn.
   converge   Converge tuberculosis mutation catalogues.
   qc         Extract QC values after alignment.
-''' % (__version__, __author__))
+''')

 def main():
+    """Main function that handles cli"""
     args = None
     if len(sys.argv) == 1:
         print_help()
@@ -57,14 +60,14 @@ def main():
     except KeyboardInterrupt:
         print('Controlled exit resulting from interrupt signal.')
         sys.exit(1)
-    except Exception as e:
+    except Exception as error_code:
         error_message = 'Uncontrolled exit resulting from an unexpected error.\n\n'
         error_message += '-' * 80 + '\n'
-        error_message += 'EXCEPTION: {}\n'.format(type(e).__name__)
-        error_message += 'MESSAGE: {}\n'.format(e)
+        error_message += f'EXCEPTION: {type(error_code).__name__}\n'
+        error_message += f'MESSAGE: {error_code}\n'
         error_message += '-' * 80 + '\n\n'
         print(error_message)
         sys.exit(1)

 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()

From db40159d26e51a097aa0e02fcdfdb6efb2214f9b Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:26:39 +0100
Subject: [PATCH 02/18] Fix __init__.py pylinting errors

---
 jasentool/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/jasentool/__init__.py b/jasentool/__init__.py
index 3853a2a..e44e71f 100644
--- a/jasentool/__init__.py
+++ b/jasentool/__init__.py
@@ -1,3 +1,5 @@
+"""Information regarding jasentool for setup.py"""
+
 __author__ = 'Ryan James Kennedy'
 __author_email__ = 'ryan.kennedy@skane.se'
 __copyright__ = 'Copyright 2023'
@@ -6,7 +8,7 @@
 __license__ = 'GPL3'
 __maintainer__ = 'Ryan James Kennedy'
 __maintainer_email__ = 'ryan.kennedy@skane.se'
-__name__ = 'jasentool'
+__software_name__ = 'jasentool'
 __python_requires__ = '>=3.11'
 __status__ = 'Production'
 __title__ = 'jasentool'
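A quick way to eyeball the f-string help refactored in patch 01 (a sketch; it assumes
the package is importable from the repo root):

    python -c "from jasentool.__main__ import print_help; print_help()"

The interpolated {__version__} and {__author__} values are the dunder constants
touched in patch 02.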
From e21bb2555dfe2ede8817c2536d2f2cc7539e5b0a Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:27:02 +0100
Subject: [PATCH 03/18] Fix cli.py pylinting errors

---
 jasentool/cli.py | 115 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 91 insertions(+), 24 deletions(-)

diff --git a/jasentool/cli.py b/jasentool/cli.py
index 0c53b53..2b114ea 100644
--- a/jasentool/cli.py
+++ b/jasentool/cli.py
@@ -1,117 +1,184 @@
-import os
+"""Command line interface module"""
+
 import argparse
 from contextlib import contextmanager

 @contextmanager
 def subparser(parser, name, desc):
-    yield parser.add_parser(name, conflict_handler='resolve', help=desc, formatter_class=argparse.RawDescriptionHelpFormatter)
+    """Yield subparser"""
+    yield parser.add_parser(name, conflict_handler='resolve', help=desc,
+                            formatter_class=argparse.RawDescriptionHelpFormatter)

 @contextmanager
 def mutex_group(parser, required):
-    group = parser.add_argument_group(f'mutually exclusive {"required" if required else "optional"} arguments')
+    """Yield mutually exclusive group"""
+    arg_type = "required" if required else "optional"
+    group = parser.add_argument_group(f'mutually exclusive {arg_type} arguments')
     yield group.add_mutually_exclusive_group(required=required)

 @contextmanager
 def arg_group(parser, name):
+    """Yield argument group"""
     yield parser.add_argument_group(name)

 def __query(group, required):
+    """Add query argument to group"""
     group.add_argument('-q', '--query', required=required, nargs='+', help='sample query')

 def __sample_id(group, required):
+    """Add sample_id argument to group"""
     group.add_argument('--sample_id', required=required, type=str, help='sample ID')

 def __input_dir(group, required, help):
+    """Add input_dir argument to group"""
     group.add_argument('--input_dir', required=required, help=help)

 def __input_file(group, required, help):
-    group.add_argument('-i', '--input_file', nargs='+', help=help)
+    """Add input_file argument to group"""
+    group.add_argument('-i', '--input_file', required=required, nargs='+', help=help)

 def __csv_file(group, required, help):
+    """Add csv_file argument to group"""
     group.add_argument('--csv_file', required=required, help=help)

 def __sh_file(group, required, help):
+    """Add sh_file argument to group"""
     group.add_argument('--sh_file', required=required, help=help)

 def __bam_file(group, required):
+    """Add bam_file argument to group"""
     group.add_argument('--bam_file', required=required, type=str, help='input bam file')

 def __bed_file(group, required):
+    """Add bed_file argument to group"""
     group.add_argument('--bed_file', required=required, type=str, help='input bed file')

 def __baits_file(group, required):
-    group.add_argument('--baits_file', required=required, type=str, default=None, help='input bam file')
+    """Add baits_file argument to group"""
+    group.add_argument('--baits_file', required=required, type=str, default=None,
+                       help='input baits file')

 def __reference(group, required, help):
+    """Add reference argument to group"""
     group.add_argument('--reference', required=required, type=str, help=help)

 def __output_file(group, required, help):
+    """Add output_file argument to group"""
     group.add_argument('-o', '--output_file', required=required, type=str, help=help)

 def __output_dir(group, required):
-    group.add_argument('--output_dir', required=required, type=str, help='directory to output files')
+    """Add output_dir argument to group"""
+    group.add_argument('--output_dir', required=required, type=str,
                        help='directory to output files')

 def __analysis_dir(group, required):
-    group.add_argument('--analysis_dir', required=required, type=str, help='analysis results dir containing jasen results')
+    """Add analysis_dir argument to group"""
+    group.add_argument('--analysis_dir', required=required, type=str,
+                       help='analysis results dir containing jasen results')

 def __restore_dir(group, required):
-    group.add_argument('--restore_dir', required=required, type=str, default='/fs2/seqdata/restored', help='directory user wishes spring files to be restored to')
+    """Add restore_dir argument to group"""
+    group.add_argument('--restore_dir', required=required, type=str,
+                       default='/fs2/seqdata/restored',
+                       help='directory user wishes spring files to be restored to')

 def __remote_dir(group, required):
-    group.add_argument('--remote_dir', required=required, type=str, default='/fs1/bjorn/jasen', help='directory user wishes spring files to be restored to')
+    """Add remote_dir argument to group"""
+    group.add_argument('--remote_dir', required=required, type=str,
+                       default='/fs1/bjorn/jasen',
+                       help='directory user wishes spring files to be restored to')

 def __restore_file(group, required):
-    group.add_argument('--restore_file', required=required, type=str, help='filepath bash shell script (.sh) to be output')
+    """Add restore_file argument to group"""
+    group.add_argument('--restore_file', required=required, type=str,
+                       help='filepath bash shell script (.sh) to be output')

 def __missing_log(group, required):
-    group.add_argument('--missing_log', required=required, type=str, default='missing_samples.log', help='file containing missing files')
+    """Add missing_log argument to group"""
+    group.add_argument('--missing_log', required=required, type=str,
+                       default='missing_samples.log',
+                       help='file containing missing files')

 def __assay(group, required):
-    group.add_argument('--assay', required=required, type=str, default='jasen-saureus-dev', help='assay for jasen to run')
+    """Add assay argument to group"""
+    group.add_argument('--assay', required=required, type=str,
+                       default='jasen-saureus-dev',
+                       help='assay for jasen to run')

 def __platform(group, required):
-    group.add_argument('--platform', required=required, type=str, default='illumina', help='sequencing platform for jasen to run')
+    """Add platform argument to group"""
+    group.add_argument('--platform', required=required, type=str,
+                       default='illumina',
+                       help='sequencing platform for jasen to run')

 def __uri(group):
-    group.add_argument('--address', '--uri', default='mongodb://localhost:27017/', help='Mongodb host address. Use: `sudo lsof -iTCP -sTCP:LISTEN | grep mongo` to get address')
+    """Add mongodb address argument to group"""
+    group.add_argument('--address', '--uri',
+                       default='mongodb://localhost:27017/',
+                       help='Mongodb host address. \
+                       Use: `sudo lsof -iTCP -sTCP:LISTEN | grep mongo` to get address')

 def __db_name(group, required):
-    group.add_argument('--db_name', required=required, help='Mongodb database name address. Use: `show dbs` to get db name')
+    """Add db_name argument to group"""
+    group.add_argument('--db_name', required=required,
+                       help='Mongodb database name. \
+                       Use: `show dbs` to get db name')

 def __db_collection(group, required):
-    group.add_argument('--db_collection', required=required, help='Mongodb collection name. Use: `show collections` to get db collection')
+    """Add db_collection argument to group"""
+    group.add_argument('--db_collection', required=required,
+                       help='Mongodb collection name. \
+                       Use: `show collections` to get db collection')

 def __out_format(group, required):
-    group.add_argument('-f', '--out_format', required=required, type=str, default="bed", help='output format')
+    """Add out_format argument to group"""
+    group.add_argument('-f', '--out_format', required=required, type=str,
+                       default="bed", help='output format')

 def __accession(group, required):
+    """Add accession argument to group"""
     group.add_argument('-a', '--accession', required=required, type=str, help='accession number')

 def __remote_hostname(group, required):
-    group.add_argument('--remote_hostname', required=required, type=str, default='rs-fs1.lunarc.lu.se', help='remote hostname')
+    """Add remote_hostname argument to group"""
+    group.add_argument('--remote_hostname', required=required, type=str,
+                       default='rs-fs1.lunarc.lu.se', help='remote hostname')

 def __prefix(group):
-    group.add_argument('--prefix', type=str, default='jasentool_results_', help='prefix for all output files')
+    """Add prefix argument to group"""
+    group.add_argument('--prefix', type=str, default='jasentool_results_',
+                       help='prefix for all output files')

 def __auto_start(group, required):
-    group.add_argument('--auto_start', required=required, dest='auto_start', action='store_true', default=False, help='automatically start')
+    """Add auto_start argument to group"""
+    group.add_argument('--auto_start', required=required, dest='auto_start', action='store_true',
+                       default=False, help='automatically start')

 def __remote(group, required):
-    group.add_argument('--remote', required=required, dest='remote', action='store_true', default=False, help='remote copy')
+    """Add remote argument to group"""
+    group.add_argument('--remote', required=required, dest='remote', action='store_true',
+                       default=False, help='remote copy')

 def __combined_output(group):
-    group.add_argument('--combined_output', dest='combined_output', action='store_true', help='combine all of the outputs into one output')
+    """Add combined_output argument to group"""
+    group.add_argument('--combined_output', dest='combined_output', action='store_true',
+                       help='combine all of the outputs into one output')

 def __sample_sheet(group, required):
-    group.add_argument('--sample_sheet', required=required, dest='sample_sheet', action='store_true', help='sample sheet input')
+    """Add sample_sheet argument to group"""
+    group.add_argument('--sample_sheet', required=required, dest='sample_sheet',
+                       action='store_true', help='sample sheet input')

 def __cpus(group):
+    """Add cpus argument to group"""
     group.add_argument('--cpus', dest='cpus', type=int, default=2, help='input cpus')

 def __help(group):
+    """Add help argument to group"""
     group.add_argument('-h', '--help', action='help', help='show help message')

 def get_main_parser():
+    """Get/build the main argument parser"""
     main_parser = argparse.ArgumentParser(prog='jasentool', conflict_handler='resolve')
     sub_parsers = main_parser.add_subparsers(help='--', dest='subparser_name')
     with subparser(sub_parsers, 'find', 'Find sample from given mongo db') as parser:
@@ -209,4 +276,4 @@ def get_main_parser():
             __cpus(group)
             __help(group)

-    return main_parser
\ No newline at end of file
+    return main_parser
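The rebuilt parser can be smoke-tested without a mongo instance; this sketch assumes
the `find` subcommand wires up the query/db flags shown above (all values are
placeholders):

    from jasentool.cli import get_main_parser

    parser = get_main_parser()
    args = parser.parse_args(['find', '-q', 'sample_1',
                              '--db_name', 'cgviz', '--db_collection', 'sample'])
    print(args.subparser_name, args.query)   # -> find ['sample_1']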
From fc412eb6fbf236cb68fb63b52005d34882854b7b Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:27:32 +0100
Subject: [PATCH 04/18] Fix converge.py pylinting errors

---
 jasentool/converge.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/jasentool/converge.py b/jasentool/converge.py
index 4c50728..6d7d3b8 100644
--- a/jasentool/converge.py
+++ b/jasentool/converge.py
@@ -1,3 +1,5 @@
+"""Module to converge mutation catalogues"""
+
 import os
 import pandas as pd
 from jasentool.who import WHO
@@ -5,7 +7,8 @@ from jasentool.tbprofiler import Tbprofiler
 from jasentool.utils import Utils

-class Converge(object):
+class Converge:
+    """Class that converges mutation catalogues"""
     def __init__(self, download_dir):
         self.download_dir = download_dir
         self.fohm_fpath = os.path.join(os.path.dirname(__file__), "data/dbs/fohm.csv")
@@ -45,6 +48,7 @@ def compare_columns(self, tbdb_df, who_df, column_names):
         return intersection_df, unique_tbdb_df, unique_who_df

     def run(self):
+        """Run the retrieval and convergence of mutation catalogues"""
         utils = Utils()
         # Download the genome
         mycobacterium_genome = Genome("NC_000962.3", "AL123456.3", self.download_dir, "h37rv")
@@ -55,14 +59,18 @@ def run(self):
         tbprofiler = Tbprofiler(self.tbdb_filepath)
         #h37rv_gb_filepath = mycobacterium_genome.download_genbank()
         who_df = who._parse(fasta_filepath, gff_filepath, self.download_dir)
-        tbdb_df = tbprofiler._parse(fasta_filepath, gff_filepath, self.download_dir)
-        #tbdb_df, who_df = pd.read_csv("/data/bnf/dev/ryan/pipelines/jasen/converge/tbdb.csv"), pd.read_csv("/data/bnf/dev/ryan/pipelines/jasen/converge/who.csv")
+        tbdb_df = tbprofiler._parse(self.download_dir)
+        #tbdb_df = pd.read_csv("/data/bnf/dev/ryan/pipelines/jasen/converge/tbdb.csv")
+        #who_df = pd.read_csv("/data/bnf/dev/ryan/pipelines/jasen/converge/who.csv")
         fohm_df = pd.read_csv(self.fohm_fpath)
-        intersection_df, unique_tbdb_df, unique_who_df = self.compare_columns(tbdb_df, who_df, ['Drug', 'Gene', 'Mutation'])
-        fohm_tbdb_df = pd.concat([intersection_df, unique_tbdb_df, fohm_df], ignore_index=True).drop_duplicates()
+        column_names = ['Drug', 'Gene', 'Mutation']
+        intersection_df, unique_tbdb_df, unique_who_df = self.compare_columns(tbdb_df, who_df, column_names)
+        dfs_to_concat = [intersection_df, unique_tbdb_df, fohm_df]
+        fohm_tbdb_df = pd.concat(dfs_to_concat, ignore_index=True).drop_duplicates()
         intersection_df.to_csv(self.intersection_outfpath, index=False)
         unique_tbdb_df.to_csv(self.unique_tbdb_outfpath, index=False)
         unique_who_df.to_csv(self.unique_who_outfpath, index=False)
         fohm_tbdb_df.to_csv(self.fohm_tbdb_outfpath, index=False)
-        converged_df = pd.concat([intersection_df, unique_tbdb_df, unique_who_df, fohm_df], ignore_index=True).drop_duplicates()
+        dfs_to_converge = [intersection_df, unique_tbdb_df, unique_who_df, fohm_df]
+        converged_df = pd.concat(dfs_to_converge, ignore_index=True).drop_duplicates()
         converged_df.to_csv(self.convereged_outfpath, index=False)
From 4adfcab789bdafbf1a834f8531e3ea83d35246ba Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:27:56 +0100
Subject: [PATCH 05/18] Fix convert.py pylinting errors

---
 jasentool/convert.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/jasentool/convert.py b/jasentool/convert.py
index 80974d8..99c3201 100644
--- a/jasentool/convert.py
+++ b/jasentool/convert.py
@@ -1,8 +1,12 @@
-class Convert(object):
+"""Module that converts file type"""
+
+class Convert:
+    """Convert class for converting files into desired format"""
     @staticmethod
     def targets2bed(target_file, accn):
+        """Convert cgmlst locus targets to bed file format"""
         bed_output = ""
-        with open(target_file, 'r') as fin:
+        with open(target_file, 'r', encoding="utf-8") as fin:
             for line in fin:
                 if line.startswith("Locus"):
                     continue
@@ -11,4 +15,4 @@ def targets2bed(target_file, accn):
             length = int(line_split[4])
             end = start + length
             bed_output += f"{accn}\t{start}\t{end}\n"
-        return bed_output
\ No newline at end of file
+        return bed_output

From 966679acc281ea450f932986d750aa6b2e9a720e Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:28:41 +0100
Subject: [PATCH 06/18] Fix database.py pylinting errors

---
 jasentool/database.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/jasentool/database.py b/jasentool/database.py
index ee2848c..3fa991e 100644
--- a/jasentool/database.py
+++ b/jasentool/database.py
@@ -1,11 +1,14 @@
+"""Module for handling mongodb requests"""
 import pymongo

-class Database(object):
+class Database:
+    """Class that assists in handling mongodb requests"""
     uri = "mongodb://localhost:27017/"
     db = None

     @staticmethod
     def initialize(db_name):
+        """Initialize mongodb client"""
         client = pymongo.MongoClient(Database.uri)
         Database.db = client[db_name] # Database Name
         Database.db_name = db_name # Database Name
@@ -13,30 +16,37 @@ def initialize(db_name):

     @staticmethod
     def insert(collection, data):
+        """Insert data into mongodb"""
         Database.db[collection].insert(data)

     @staticmethod
     def find(collection, query, fields):
+        """Find data in mongodb"""
         return Database.db[collection].find(query, fields)

     @staticmethod
     def find_one(collection, query):
+        """Find one entry in mongodb"""
         return Database.db[collection].find_one(query)
-    
+
     @staticmethod
     def get_pvl(collection, query):
+        """Get pvl result data from mongodb"""
         return Database.db[collection].find(query, {"_id": 0, "aribavir.lukS_PV.present": 1})
-    
+
     @staticmethod
     def get_mlst(collection, query):
+        """Get mlst result data from mongodb"""
         return Database.db[collection].find(query, {"_id": 0, "mlst": 1})
-    
+
     @staticmethod
     def get_cgmlst(collection, query):
+        """Get cgmlst result data from mongodb"""
         return Database.db[collection].find(query, {"_id": 0, "alleles": 1})
-    
+
     @staticmethod
     def get_meta_fields():
+        """Get respective metadata from mongodb"""
         fields = {
             "id": 1,
             "mlst.sequence_type": 1,
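Because every method on the class is static, callers use it without instantiation;
a minimal sketch (database and collection names are placeholders):

    from jasentool.database import Database

    Database.initialize('cgviz')    # binds Database.db to the named database
    hits = list(Database.find('sample', {'id': 'sample_1'}, {}))

One caveat worth flagging for a later pass: `Database.db[collection].insert(data)`
uses the long-deprecated pymongo `insert`, which modern pymongo replaces with
`insert_one`/`insert_many`.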
"" out_fpaths = [] - with open(input_file, 'r') as shfile: + with open(input_file, 'r', encoding="utf-8") as shfile: for line in shfile: line = line.rstrip() if line.startswith('/fs2/sw/bnf-scripts/start_nextflow_analysis.pl'): for assay in assays: output_txt = "" - line = f'/fs2/sw/bnf-scripts/start_nextflow_analysis.pl $SCRIPTPATH/{os.path.splitext(output_fpath)[0]}_{assay}.csv' + line = '/fs2/sw/bnf-scripts/start_nextflow_analysis.pl ' + \ + f'$SCRIPTPATH/{os.path.splitext(output_fpath)[0]}_{assay}.csv' out_fpath = f'{os.path.splitext(output_fpath)[0]}_{assay}.sh' output_txt += output_content+line+'\n' utils.write_out_txt(output_txt, out_fpath) From 62983603877764060cb4bb808f47bb03d5b1e369 Mon Sep 17 00:00:00 2001 From: ryanjameskennedy Date: Wed, 17 Jan 2024 14:29:19 +0100 Subject: [PATCH 08/18] Fix fohm.py pylinting errors --- jasentool/fohm.py | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/jasentool/fohm.py b/jasentool/fohm.py index c767772..d9e95e4 100644 --- a/jasentool/fohm.py +++ b/jasentool/fohm.py @@ -1,38 +1,43 @@ +"""Module that handles FoHM excel sheet""" + import os import pandas as pd +from openpyxl import load_workbook class Fohm: + """Class for processing FoHM TB mutation catalogue""" def __init__(self, download_dir): self.download_dir = download_dir self.fohm_filepath = os.path.join(download_dir, "fohm.csv") - def convert_colour(): - from openpyxl import load_workbook - excel_file = 'color_codes.xlsx' - wb = load_workbook(excel_file, data_only = True) - sh = wb['Sheet1'] - color_in_hex = sh['A2'].fill.start_color.index # this gives you Hexadecimal value of the color - print ('HEX =',color_in_hex) + def convert_colour(self, excel_filepath): + """Convert coloured cells to hex value""" + excel_catalogue = load_workbook(excel_filepath, data_only = True) + mutation_sheet = excel_catalogue['Sheet1'] + color_in_hex = mutation_sheet['A2'].fill.start_color.index + print ('HEX =', color_in_hex) print('RGB =', tuple(int(color_in_hex[i:i+2], 16) for i in (0, 2, 4))) # Color in RGB - + def read_file(self, csv_filepath, xlsx_filepath): + """Read excel and csv files""" catalogue = pd.read_csv(csv_filepath, header=True) catalogue = pd.read_excel(xlsx_filepath, sheet_name='Mutation_catalogue', header=[0,1]).set_index([('variant (common_name)', 'Unnamed: 2_level_1')]) return catalogue def convert2hgvs(self, mutation): - if mutation[:3].isalpha() and mutation[0].isupper(): - return f'p.{mutation}' - elif mutation[0].isalpha() and mutation[0].islower() and not mutation[1].isalpha(): - if 'Stop' in mutation: - mutation.replace('Stop', '*') - ref = mutation[0].upper() - alt = mutation[-1].upper() - pos = mutation[1:-1] - return f'c.{pos}{ref}>{alt}' - else: - return mutation - + """Convert mutation format to hgvs format""" + if mutation[:3].isalpha() and mutation[0].isupper(): + return f'p.{mutation}' + if mutation[0].isalpha() and mutation[0].islower() and not mutation[1].isalpha(): + if 'Stop' in mutation: + mutation.replace('Stop', '*') + ref = mutation[0].upper() + alt = mutation[-1].upper() + pos = mutation[1:-1] + return f'c.{pos}{ref}>{alt}' + return mutation + def _parse(self): + """Parse the mutation catalogue""" catalogue = pd.read_csv(self.fohm_filepath, header=True) catalogue['Mutation'] = catalogue.Mutation.apply(self.convert2hgvs) From 4a95a44dd1bb709ee67ff52d099b0e77686c0659 Mon Sep 17 00:00:00 2001 From: ryanjameskennedy Date: Wed, 17 Jan 2024 14:29:43 +0100 Subject: [PATCH 09/18] Fix genome.py pylinting 
From 4a95a44dd1bb709ee67ff52d099b0e77686c0659 Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:29:43 +0100
Subject: [PATCH 09/18] Fix genome.py pylinting errors

---
 jasentool/genome.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/jasentool/genome.py b/jasentool/genome.py
index 5298943..3910c24 100644
--- a/jasentool/genome.py
+++ b/jasentool/genome.py
@@ -1,8 +1,11 @@
+"""Module for genomes and files related to the genomes"""
+
 import os
 from Bio import Entrez, SeqIO
 from jasentool.utils import Utils

 class Genome:
+    """Class for handling genome download in multiple formats (fasta, genbank, gff) from NCBI"""
     def __init__(self, refseq_accn, genbank_accn, download_dir, prefix, email="rjkennedyy@gmail.com"):
         Entrez.email = email
         self.refseq_accn = refseq_accn
@@ -14,9 +17,11 @@ def __init__(self, refseq_accn, genbank_accn, download_dir, prefix, email="rjken
         self.gff_filepath = os.path.join(download_dir, f"{prefix}.gff")

     def download_fasta(self):
+        """Download genome in fasta format"""
         try:
             # Fetch the fasta record from NCBI
-            fasta_handle = Entrez.efetch(db="nucleotide", id=self.refseq_accn, rettype="fasta", retmode="text")
+            fasta_handle = Entrez.efetch(db="nucleotide", id=self.refseq_accn,
+                                         rettype="fasta", retmode="text")
             fasta_record = SeqIO.read(fasta_handle, "fasta")
             fasta_handle.close()

@@ -25,14 +30,16 @@ def download_fasta(self):

             print(f"Fasta downloaded and saved to {self.fasta_filepath}")

-        except Exception as e:
-            print(f"Error downloading the genome: {e}")
+        except Exception as error_code:
+            print(f"Error downloading the genome: {error_code}")
         return self.fasta_filepath

     def download_genbank(self):
+        """Download genome in genbank format"""
         try:
             # Fetch the GenBank record from NCBI
-            genbank_handle = Entrez.efetch(db="nucleotide", id=self.genbank_accn, rettype="gb", retmode="text")
+            genbank_handle = Entrez.efetch(db="nucleotide", id=self.genbank_accn,
+                                           rettype="gb", retmode="text")
             genbank_record = SeqIO.read(genbank_handle, "genbank")
             genbank_handle.close()

@@ -41,11 +48,12 @@ def download_genbank(self):

             print(f"Genbank file downloaded and saved to {self.genbank_filepath}")

-        except Exception as e:
-            print(f"Error downloading the genbank file: {e}")
+        except Exception as error_code:
+            print(f"Error downloading the genbank file: {error_code}")
         return self.genbank_filepath
-    
+
     def download_gff(self):
+        """Download gff of genome genes"""
         utils = Utils()
         h37rv_url = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_000195955.2/download?include_annotation_type=GENOME_GFF&filename=GCF_000195955.2.zip"
         try:
@@ -54,6 +62,6 @@ def download_gff(self):
             source = os.path.join(self.download_dir, "ncbi_dataset/data/GCF_000195955.2/genomic.gff")
             destination = os.path.join(self.download_dir, "h37rv.gff")
             utils.copy_file(source, destination)
-        except Exception as e:
-            print(f"Error downloading the gff file: {e}")
+        except Exception as error_code:
+            print(f"Error downloading the gff file: {error_code}")
         return self.gff_filepath
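Usage mirrors the call site in converge.py (patch 04):

    genome = Genome("NC_000962.3", "AL123456.3", "/tmp/downloads", "h37rv")
    fasta_filepath = genome.download_fasta()
    gff_filepath = genome.download_gff()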
"""Class that parses through cli arguments and executes respective modules""" def __init__(self, version): + """Initiate OptionsParser class""" self.version = version self._check_python() @@ -36,7 +40,7 @@ def _get_output_fpaths(self, input_files, output_dir, output_file, prefix, combi output_fpaths = [] if output_dir: output_dir = os.path.expanduser(output_dir) - if combined_output: + if combined_output: output_fpaths = [os.path.join(output_dir, prefix + "combined_outputs")] else: output_fpaths = [os.path.join(output_dir, prefix + os.path.basename(os.path.splitext(input_fpath)[0])) for input_fpath in input_files] @@ -48,33 +52,41 @@ def _get_output_fpaths(self, input_files, output_dir, output_file, prefix, combi return output_fpaths def find(self, options): + """Find entry in mongodb""" Database.initialize(options.db_name) - output_fpaths = self._get_output_fpaths(options.query, options.output_dir, options.output_file, options.prefix, options.combined_output) + output_fpaths = self._get_output_fpaths(options.query, options.output_dir, + options.output_file, options.prefix, + options.combined_output) for query_idx, query in enumerate(options.query): find = list(Database.find(options.db_collection, {"id": query}, {})) if not find: find = list(Database.find(options.db_collection, {"sample_id": query}, {})) - pp = pprint.PrettyPrinter(indent=4) - pp.pprint(find) - #with open(output_fpaths[query_idx], 'w+') as fout: - #json.dump(find, fout) + sample_pp = pprint.PrettyPrinter(indent=4) + sample_pp.pprint(find) + with open(output_fpaths[query_idx], 'w+', encoding="utf-8") as fout: + json.dump(find, fout) def insert(self, options): + """Insert entry in mongodb""" Database.initialize(options.db_name) input_files = self._input_to_process(options.input_file, options.input_dir) for input_file in input_files: - with open(input_file, 'r') as fin: + with open(input_file, 'r', encoding="utf-8") as fin: input_sample = json.load(fin) Database.insert(options.db_collection, input_sample) def validate(self, options): + """Execute validation of old vs new pipeline results""" Database.initialize(options.db_name) input_files = self._input_to_process(options.input_file, options.input_dir) - output_fpaths = self._get_output_fpaths(input_files, options.output_dir, options.output_file, options.prefix, options.combined_output) + output_fpaths = self._get_output_fpaths(input_files, options.output_dir, + options.output_file, options.prefix, + options.combined_output) validate = Validate() validate.run(input_files, output_fpaths, options.db_collection, options.combined_output) def missing(self, options): + """Execute search for missing samples from new pipeline results""" utils = Utils() missing = Missing() db = Database() @@ -98,6 +110,7 @@ def missing(self, options): utils.write_out_txt(bash_script, bash_fpath) def convert(self, options): + """Execute conversion of file formats""" utils = Utils() convert = Convert() input_file = options.input_file[0] @@ -108,6 +121,7 @@ def convert(self, options): utils.write_out_txt(output_txt, output_fpath) def fix(self, options): + """Execute fixing of file to desired format(s)""" utils = Utils() fix = Fix() csv_files, assays = fix.fix_csv(options.csv_file, options.output_file) @@ -118,15 +132,18 @@ def fix(self, options): utils.start_remote_pipelines(batch_files, options.remote_dir) def converge(self, options): + """Execute convergence of mutation catalogues""" converge = Converge(options.output_dir) converge.run() def qc(self, options): + """Execute retrieval of qc results""" qc 
= QC(options) json_result = qc.run() qc.write_json_result(json_result, options.output_file) def parse_options(self, options): + """Options parser""" if options.subparser_name == 'find': self.find(options) @@ -147,6 +164,6 @@ def parse_options(self, options): elif options.subparser_name == 'converge': self.converge(options) - + elif options.subparser_name == 'qc': self.qc(options) From cd22f6a81a371111035fe96b9be36ce8bef4e976 Mon Sep 17 00:00:00 2001 From: ryanjameskennedy Date: Wed, 17 Jan 2024 14:30:24 +0100 Subject: [PATCH 11/18] Fix missing.py pylinting errors --- jasentool/missing.py | 142 +++++++++++++++++++++++++++++++------------ 1 file changed, 104 insertions(+), 38 deletions(-) diff --git a/jasentool/missing.py b/jasentool/missing.py index a855299..eea135a 100644 --- a/jasentool/missing.py +++ b/jasentool/missing.py @@ -1,16 +1,18 @@ +"""Module to find samples that have not been run via jasen""" + import os import re -import csv -import pandas as pd -class Missing(object): +class Missing: + """Class for locating expected samples that are missing from a given directory""" @staticmethod def rm_double_dmltplx(read_files): + """Exclude files that have been demultiplexed twice""" first_reads = read_files[0] for read_file in read_files[1:]: errors = 0 - for i in range(len(first_reads)): - if first_reads[i] != read_file[i]: + for idx, _ in enumerate(first_reads): + if first_reads[idx] != read_file[idx]: errors += 1 if errors == 1: return [first_reads, read_file] @@ -18,24 +20,30 @@ def rm_double_dmltplx(read_files): @staticmethod def find_files(search_term, parent_dir): + """Find files in given directory using regex search term""" try: search_files = os.listdir(parent_dir) except FileNotFoundError: print(f"WARN: {parent_dir} does not exist! Trying to fix.") finally: search_files = os.listdir(parent_dir) - found_files = sorted([os.path.join(parent_dir, search_file) for search_file in search_files if re.search(search_term, search_file) and not search_file.endswith("~")]) + found_files = sorted([os.path.join(parent_dir, search_file) + for search_file in search_files + if re.search(search_term, search_file) and + not search_file.endswith("~") + ]) return found_files @staticmethod def edit_read_paths(reads, restore_dir): - restore_dirs = set([restore_dir.rstrip("/"), "/fs2/seqdata/restored"]) + """Edit read paths to show intended location to be coppied to""" filename = os.path.join(restore_dir, reads.split("BaseCalls/")[1]) read1, read2 = [filename.rstrip(".spring") + f"_R{i}_001.fastq.gz" for i in [1, 2]] return os.path.join(restore_dir, reads.split("BaseCalls/")[1]), [read1, read2] - + @staticmethod def check_file_cp(reads, restore_dir): + """Check that file not already coppied to restore directory""" checked_reads = [] restore_dirs = set([restore_dir.rstrip("/"), "/fs2/seqdata/restored"]) for filepath in reads: @@ -43,18 +51,26 @@ def check_file_cp(reads, restore_dir): if filepath.startswith("/fs") and os.path.exists(filepath): checked_reads.append(filepath) else: - for dir in restore_dirs: - read_fpath = os.path.join(dir, filename) - if os.path.exists(read_fpath) and not os.path.isdir(read_fpath) and len(checked_reads) != 2: + for directory in restore_dirs: + read_fpath = os.path.join(directory, filename) + if ( + os.path.exists(read_fpath) and + not os.path.isdir(read_fpath) and + len(checked_reads) != 2 + ): checked_reads.append(read_fpath) if len(checked_reads) == 0: - checked_reads = [os.path.join(restore_dir, os.path.basename(read_filepath)) for read_filepath in reads] + 
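Patch 10's `parse_options` is the single dispatch point, so the whole tool reduces
to the pattern below (a sketch; `argv` stands in for any of the subcommand
invocations defined in cli.py):

    from jasentool import __version__
    from jasentool.cli import get_main_parser
    from jasentool.main import OptionsParser

    options = get_main_parser().parse_args(argv)   # argv: e.g. ['find', ...]
    OptionsParser(__version__).parse_options(options)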
From cd22f6a81a371111035fe96b9be36ce8bef4e976 Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:30:24 +0100
Subject: [PATCH 11/18] Fix missing.py pylinting errors

---
 jasentool/missing.py | 142 +++++++++++++++++++++++++++++++------------
 1 file changed, 104 insertions(+), 38 deletions(-)

diff --git a/jasentool/missing.py b/jasentool/missing.py
index a855299..eea135a 100644
--- a/jasentool/missing.py
+++ b/jasentool/missing.py
@@ -1,16 +1,18 @@
+"""Module to find samples that have not been run via jasen"""
+
 import os
 import re
-import csv
-import pandas as pd

-class Missing(object):
+class Missing:
+    """Class for locating expected samples that are missing from a given directory"""
     @staticmethod
     def rm_double_dmltplx(read_files):
+        """Exclude files that have been demultiplexed twice"""
         first_reads = read_files[0]
         for read_file in read_files[1:]:
             errors = 0
-            for i in range(len(first_reads)):
-                if first_reads[i] != read_file[i]:
+            for idx, _ in enumerate(first_reads):
+                if first_reads[idx] != read_file[idx]:
                     errors += 1
             if errors == 1:
                 return [first_reads, read_file]
@@ -18,24 +20,30 @@ def rm_double_dmltplx(read_files):

     @staticmethod
     def find_files(search_term, parent_dir):
+        """Find files in given directory using regex search term"""
         try:
             search_files = os.listdir(parent_dir)
         except FileNotFoundError:
             print(f"WARN: {parent_dir} does not exist! Trying to fix.")
         finally:
             search_files = os.listdir(parent_dir)
-        found_files = sorted([os.path.join(parent_dir, search_file) for search_file in search_files if re.search(search_term, search_file) and not search_file.endswith("~")])
+        found_files = sorted([os.path.join(parent_dir, search_file)
+                              for search_file in search_files
+                              if re.search(search_term, search_file) and
+                              not search_file.endswith("~")
+                              ])
         return found_files

     @staticmethod
     def edit_read_paths(reads, restore_dir):
-        restore_dirs = set([restore_dir.rstrip("/"), "/fs2/seqdata/restored"])
+        """Edit read paths to show intended location to be copied to"""
         filename = os.path.join(restore_dir, reads.split("BaseCalls/")[1])
         read1, read2 = [filename.rstrip(".spring") + f"_R{i}_001.fastq.gz" for i in [1, 2]]
         return os.path.join(restore_dir, reads.split("BaseCalls/")[1]), [read1, read2]
-    
+
     @staticmethod
     def check_file_cp(reads, restore_dir):
+        """Check that file not already copied to restore directory"""
         checked_reads = []
         restore_dirs = set([restore_dir.rstrip("/"), "/fs2/seqdata/restored"])
         for filepath in reads:
@@ -43,18 +51,26 @@ def check_file_cp(reads, restore_dir):
             if filepath.startswith("/fs") and os.path.exists(filepath):
                 checked_reads.append(filepath)
             else:
-                for dir in restore_dirs:
-                    read_fpath = os.path.join(dir, filename)
-                    if os.path.exists(read_fpath) and not os.path.isdir(read_fpath) and len(checked_reads) != 2:
+                for directory in restore_dirs:
+                    read_fpath = os.path.join(directory, filename)
+                    if (
+                        os.path.exists(read_fpath) and
+                        not os.path.isdir(read_fpath) and
+                        len(checked_reads) != 2
+                    ):
                         checked_reads.append(read_fpath)
         if len(checked_reads) == 0:
-            checked_reads = [os.path.join(restore_dir, os.path.basename(read_filepath)) for read_filepath in reads]
+            checked_reads = [
+                os.path.join(restore_dir, os.path.basename(read_filepath))
+                for read_filepath in reads
+            ]
         return checked_reads

     @staticmethod
     def parse_sample_sheet(sample_sheet, restore_dir):
+        """Parse sample sheets for sample meta data"""
         csv_dict = {}
-        with open(sample_sheet, "r") as fin:
+        with open(sample_sheet, "r", encoding="utf-8") as fin:
             for line in fin:
                 if line.endswith("saureus\n"):
                     line = line.rstrip()
@@ -69,34 +85,78 @@ def parse_sample_sheet(sample_sheet, restore_dir):
                     except IndexError:
                         clarity_group_id = clarity_id
                     if ":" in line:
-                        parent_dir = os.path.join(line.split(":")[0].rstrip("SampleSheet.csv"), "Data/Intensities/BaseCalls/")
+                        parent_dir = os.path.join(
+                            line.split(":")[0].rstrip("SampleSheet.csv"),
+                            "Data/Intensities/BaseCalls/"
+                        )
                     else:
-                        parent_dir = os.path.join(os.path.dirname(sample_sheet), "Data/Intensities/BaseCalls/")
+                        parent_dir = os.path.join(
+                            os.path.dirname(sample_sheet),
+                            "Data/Intensities/BaseCalls/"
+                        )
                     try:
                         paired_reads = Missing.find_files(r'^' + clarity_id, parent_dir)
                         if len(paired_reads) == 2 and paired_reads[0].endswith(".gz"):
                             restored_reads_fpaths = Missing.check_file_cp(paired_reads, restore_dir)
-                            csv_dict[sample_id] = [clarity_group_id, species, restored_reads_fpaths, None, paired_reads]
+                            csv_dict[sample_id] = [
+                                clarity_group_id,
+                                species,
+                                restored_reads_fpaths,
+                                None,
+                                paired_reads
+                            ]
                         elif len(paired_reads) == 1 and paired_reads[0].endswith(".spring"):
                             spring_fpaths = paired_reads
-                            (restored_spring_fpaths, paired_reads) = list(map(Missing.edit_read_paths, spring_fpaths, [restore_dir]*len(spring_fpaths)))[0]
-                            csv_dict[sample_id] = [clarity_group_id, species, paired_reads, spring_fpaths, restored_spring_fpaths]
+                            (restored_spring_fpaths, paired_reads) = list(map(
+                                Missing.edit_read_paths,
+                                spring_fpaths,
+                                [restore_dir]*len(spring_fpaths)
+                            ))[0]
+                            csv_dict[sample_id] = [
+                                clarity_group_id,
+                                species,
+                                paired_reads,
+                                spring_fpaths,
+                                restored_spring_fpaths
+                            ]
                         elif len(paired_reads) == 4 and paired_reads[0].endswith(".gz"):
                             paired_reads = Missing.rm_double_dmltplx(paired_reads)
                             if len(paired_reads) == 2:
                                 restored_reads_fpaths = Missing.check_file_cp(paired_reads, restore_dir)
-                                csv_dict[sample_id] = [clarity_group_id, species, restored_reads_fpaths, None, paired_reads]
+                                csv_dict[sample_id] = [
+                                    clarity_group_id,
+                                    species,
+                                    restored_reads_fpaths,
+                                    None,
+                                    paired_reads
+                                ]
                             elif len(paired_reads) == 4:
                                 paired_reads_string = '\n-'.join(paired_reads)
-                                print(f"There are 4 sets of reads related to sample {sample_id} from the {parent_dir}: \n-{paired_reads_string}\n")
+                                print(f"There are 4 sets of reads related to sample {sample_id} from the {parent_dir}: "
+                                      f"\n-{paired_reads_string}\n")
                         elif len(paired_reads) == 3:
-                            paired_reads = [paired_read for paired_read in paired_reads if paired_read.endswith(".fastq.gz")]
+                            paired_reads = [paired_read for paired_read in paired_reads
+                                            if paired_read.endswith(".fastq.gz")]
                             restored_reads_fpaths = Missing.check_file_cp(paired_reads, restore_dir)
-                            csv_dict[sample_id] = [clarity_group_id, species, restored_reads_fpaths, None, paired_reads]
+                            csv_dict[sample_id] = [
+                                clarity_group_id,
+                                species,
+                                restored_reads_fpaths,
+                                None,
+                                paired_reads
+                            ]
                         elif len(paired_reads) == 6:
-                            paired_reads = [paired_read for paired_read in paired_reads if paired_read.endswith(".fastq.gz")]
+                            paired_reads = [paired_read for paired_read in paired_reads
+                                            if paired_read.endswith(".fastq.gz")]
                             restored_reads_fpaths = Missing.check_file_cp(paired_reads, restore_dir)
-                            csv_dict[sample_id] = [clarity_group_id, species, restored_reads_fpaths, None, paired_reads]
+                            csv_dict[sample_id] = [
+                                clarity_group_id,
+                                species,
+                                restored_reads_fpaths,
+                                None,
+                                paired_reads
+                            ]
                         #elif len(paired_reads) == 0:
                             #print(f"The sample {sample_id} doesn't have read/spring files in the {parent_dir} ({paired_reads}).")
                         #else:
@@ -109,7 +169,11 @@ def parse_sample_sheet(sample_sheet, restore_dir):

     @staticmethod
     def check_format(fpath):
-        if fpath.startswith("/fs1") and not os.path.exists(os.path.join(fpath, "Data/Intensities/BaseCalls")):
+        """Check that filepath has the correct prefix and that it exists"""
+        if (
+            fpath.startswith("/fs1") and
+            not os.path.exists(os.path.join(fpath, "Data/Intensities/BaseCalls"))
+        ):
             print(f"WARN: {fpath} does not exist! Fixing by removing '/fs1' prefix.")
             fpath = fpath.replace("/fs1", "")
         if fpath.startswith("NovaSeq"):
@@ -121,22 +185,23 @@ def check_format(fpath):
             data_fpath = "/data" + fpath
         if os.path.exists(os.path.join(fs2_fpath, "Data/Intensities/BaseCalls")):
             return fs2_fpath
-        elif os.path.exists(os.path.join(isilon_fpath, "Data/Intensities/BaseCalls")):
+        if os.path.exists(os.path.join(isilon_fpath, "Data/Intensities/BaseCalls")):
             return isilon_fpath
-        elif os.path.exists(os.path.join(data_fpath, "Data/Intensities/BaseCalls")):
+        if os.path.exists(os.path.join(data_fpath, "Data/Intensities/BaseCalls")):
             return data_fpath
-        elif os.path.exists(fpath):
-            return fpath.rstrip("Data/Intensities/BaseCalls/")#.replace("Data/Intensities/BaseCalls", "")
-        else:
-            print(f"WARN: Base calls for {fpath} cannot be found.")
+        if os.path.exists(fpath):
+            return fpath.rstrip("Data/Intensities/BaseCalls/")
+        print(f"WARN: Base calls for {fpath} cannot be found.")
         return fpath

     @staticmethod
     def parse_dir(dir_fpath):
+        """Return filenames in directory"""
         return [filename.split("_")[0] for filename in os.listdir(dir_fpath)]
-    
+
     @staticmethod
     def filter_csv_dict(csv_dict, missing_samples):
+        """Filter out missing samples"""
         filtered_csv_dict = {}
         not_found = []
         for missing_sample in missing_samples:
@@ -148,9 +213,10 @@ def filter_csv_dict(csv_dict, missing_samples):
         print(f"{len(not_found)} samples could not be found")
         print(f"{len(filtered_csv_dict.keys())} samples remain after filtering")
         return filtered_csv_dict, not_found
-    
+
     @staticmethod
     def find_missing(meta_dict, analysis_dir_fnames, restore_dir):
+        """Find missing samples from jasen results directory"""
         sample_run = ""
         missing_samples = []
         csv_dict = {}
@@ -158,18 +224,16 @@ def find_missing(meta_dict, analysis_dir_fnames, restore_dir):
         for sample in meta_dict:
             if sample["id"] not in analysis_dir_fnames:
                 missing_samples.append(sample["id"])
-                if sample_run != sample["run"]: #if sample run changes based on 
+                if sample_run != sample["run"]: #if sample run changes based on
                     ss_dict = {}
                     sample_run_dir = Missing.check_format(sample["run"])
                     sample_sheets = Missing.find_files(r'.csv$', sample_run_dir)
                     if sample_sheets:
                         for sample_sheet in sample_sheets:
                             ss_dict |= Missing.parse_sample_sheet(sample_sheet, restore_dir)
-                            if not sample_sheet:
-                                print(f"sample sheets yielded nothing from {sample['run']}")
                         csv_dict |= ss_dict
                     else:
-                        print(f"WARN: No sample sheets exist in the following path path {sample['run']}!")
+                        print(f"WARN: No sample sheets exist in the following path: {sample['run']}!")
                     sample_run = sample["run"]

         print(f"{len(csv_dict.keys())} samples found")
@@ -180,6 +244,7 @@ def find_missing(meta_dict, analysis_dir_fnames, restore_dir):

     @staticmethod
     def create_bash_script(csv_dict, restore_dir):
+        """Create shell script that executes copying of files and starts nextflow analysis"""
         spring_command = ""
         shell_script_path = 'SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"\n'
         shell_fail_count = "FAIL=0\n"
@@ -189,7 +254,7 @@ def create_bash_script(csv_dict, restore_dir):
             unspring_command = ""
             try:
                 spring_fpaths, restored_fpaths = csv_dict[sample][3][0], csv_dict[sample][4]
-                read1, read2 = csv_dict[sample][2]
+                read1, _ = csv_dict[sample][2]
                 if not os.path.exists(restored_fpaths) and not os.path.exists(read1):
                     jcp_command = f'/fs2/sw/bnf-scripts/jcp {spring_fpaths} {restore_dir}/ && '
                     unspring_command = f'/fs2/sw/bnf-scripts/unspring_file.pl {restored_fpaths} {restore_dir}/ WAIT &\nPIDS="$PIDS $!"\n'
@@ -203,6 +268,7 @@ def create_bash_script(csv_dict, restore_dir):

     @staticmethod
     def remove_empty_files(csv_dict):
+        """Remove fastq filepaths if the file size is < 10 mb"""
         empty_files_dict = {}
         for sample in csv_dict:
             try:
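The sample-sheet walker above leans on `find_files`; it can be exercised on its own
(the clarity id and run path here are hypothetical):

    from jasentool.missing import Missing

    reads = Missing.find_files(r'^ACC1234',
                               '/fs2/seqdata/200101_RUN1/Data/Intensities/BaseCalls/')
    # -> sorted matches for the id, with editor backup files ('~') excluded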
From be0a496776ae810fa13542c95f9fb3c187797a34 Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:30:56 +0100
Subject: [PATCH 12/18] Fix qc.py pylinting errors

---
 jasentool/qc.py | 50 ++++++++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 21 deletions(-)

diff --git a/jasentool/qc.py b/jasentool/qc.py
index 48868c3..3e357cb 100644
--- a/jasentool/qc.py
+++ b/jasentool/qc.py
@@ -1,8 +1,11 @@
+"""Module for retrieving qc results"""
+
 import os
 import json
 import subprocess

 class QC:
+    """Class for retrieving qc results"""
     def __init__(self, args):
         self.results = {}
         self.bam = args.bam
@@ -14,11 +17,13 @@ def __init__(self, args):
         self.paired = self.is_paired()

     def write_json_result(self, json_result, output_filepath):
-        with open(output_filepath, 'w') as json_file:
+        """Write out json file"""
+        with open(output_filepath, 'w', encoding="utf-8") as json_file:
             json_file.write(json_result)

-    def parse_basecov_bed(self, fn, thresholds):
-        with open(fn) as cov_fh:
+    def parse_basecov_bed(self, basecov_fpath, thresholds):
+        """Parse base coverage bed file"""
+        with open(basecov_fpath, "r", encoding="utf-8") as cov_fh:
             head_str = cov_fh.readline().strip().lstrip("#")
             head = head_str.split("\t")
             cov_field = head.index("COV")
@@ -29,12 +34,12 @@ def parse_basecov_bed(self, fn, thresholds):
             tot, cnt = 0, 0
             levels = {}
             for line in cov_fh:
-                a = line.strip().split("\t")
-                tot += int(a[2])
+                line = line.strip().split("\t")
+                tot += int(line[2])
                 cnt += 1
                 tot_bases += 1
                 for min_val in thresholds:
-                    if int(a[cov_field]) >= min_val:
+                    if int(line[cov_field]) >= min_val:
                         above_cnt[min_val] += 1

             above_pct = {min_val: 100 * (above_cnt[min_val] / tot_bases) for min_val in thresholds}
@@ -46,34 +51,37 @@ def parse_basecov_bed(self, fn, thresholds):
             q3_num = 3 * cnt / 4
             median_num = cnt / 2
             sum_val = 0
-            q1, q3, median = None, None, None
+            quartile1, quartile3, median = None, None, None
             iqr_median = "9999"
-            for l in sorted(levels):
-                sum_val += levels[l]
-                if sum_val >= q1_num and not q1:
-                    q1 = l
+            for level in sorted(levels):
+                sum_val += levels[level]
+                if sum_val >= q1_num and not quartile1:
+                    quartile1 = level
                 if sum_val >= median_num and not median:
-                    median = l
-                if sum_val >= q3_num and not q3:
-                    q3 = l
+                    median = level
+                if sum_val >= q3_num and not quartile3:
+                    quartile3 = level

-            if q1 and q3 and median:
-                iqr_median = (q3 - q1) / median
+            if quartile1 and quartile3 and median:
+                iqr_median = (quartile3 - quartile1) / median

             return above_pct, mean_cov, iqr_median

     def is_paired(self):
-        line = subprocess.check_output(f"samtools view {self.bam} | head -n 1| awk '{{print $2}}'", shell=True, text=True)
+        """Check if reads are paired"""
+        line = subprocess.check_output(f"samtools view {self.bam} | head -n 1| awk '{{print $2}}'",
+                                       shell=True, text=True)
         remainder = int(line) % 2
         is_paired = 1 if remainder else 0
         return is_paired

     def system_p(self, *cmd):
+        """Execute subprocess"""
         print(f"RUNNING: {' '.join(cmd)}")
         print()
         subprocess.run(cmd, check=True)

     def run(self):
+        """Run QC info extraction"""
         if self.baits and self.reference:
             print("Calculating HS-metrics...")
             dict_file = self.reference
@@ -85,11 +93,11 @@ def run(self):
             self.system_p(f"picard BedToIntervalList -I {self.baits} -O {self.baits}.interval_list -SD {dict_file}")
             self.system_p(f"picard CollectHsMetrics -I {self.bam} -O {self.bam}.hsmetrics -R {self.reference} -BAIT_INTERVALS {self.baits}.interval_list -TARGET_INTERVALS {self.bed}.interval_list")

-            with open(f"{self.bam}.hsmetrics") as hs:
-                for line in hs:
+            with open(f"{self.bam}.hsmetrics", "r", encoding="utf-8") as fin:
+                for line in fin:
                     if line.startswith("## METRICS CLASS"):
-                        next(hs)
-                        vals = next(hs).split("\t")
+                        next(fin)
+                        vals = next(fin).split("\t")
                         self.results['pct_on_target'] = vals[18]
                         self.results['fold_enrichment'] = vals[26]
                         self.results['median_coverage'] = vals[23]
@@ -104,7 +112,7 @@ def run(self):
         if self.paired:
             print("Collect insert sizes...")
             self.system_p(f"picard CollectInsertSizeMetrics -I {self.bam} -O {self.bam}.inssize -H {self.bam}.ins.pdf -STOP_AFTER 1000000")
-            with open(f"{self.bam}.inssize") as ins:
+            with open(f"{self.bam}.inssize", "r", encoding="utf-8") as ins:
                 for line in ins:
                     if line.startswith("## METRICS CLASS"):
                         next(ins)
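A note on the `is_paired` logic in patch 12: SAM FLAG bit 0x1 marks a template as
having multiple segments, so an odd FLAG on the first record implies paired reads:

    remainder = 99 % 2    # FLAG 99 (paired, proper pair) -> 1 -> paired
    remainder = 16 % 2    # FLAG 16 (unpaired, reverse strand) -> 0 -> unpaired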
From 5f32f5352606bfca29941a581c90cacdf816da8e Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:31:14 +0100
Subject: [PATCH 13/18] Fix tbprofiler.py pylinting errors

---
 jasentool/tbprofiler.py | 88 +++++++++++++++--------------------------
 1 file changed, 32 insertions(+), 56 deletions(-)

diff --git a/jasentool/tbprofiler.py b/jasentool/tbprofiler.py
index 64ad372..1df43bf 100644
--- a/jasentool/tbprofiler.py
+++ b/jasentool/tbprofiler.py
@@ -1,20 +1,23 @@
+"""Module that handles TBProfiler's database (tbdb)"""
+
 import os
 import re
 import sys
 import pandas as pd
-from tqdm import tqdm
 from jasentool.utils import Utils

-class Tbprofiler(object):
+class Tbprofiler:
+    """Class that handles TBProfiler tb mutation catalogue"""
     def __init__(self, tbdb_dir):
         self.tbdb_filepath = os.path.join(tbdb_dir, "tbdb.csv")
         self.chr_name = "Chromosome"
-        self.aa_long2short = self.get_aa_dict()
+        self.aa_long2short = Utils.get_aa_dict()

     def fasta2dict(self, filepath):
+        """Convert fasta to dictionary"""
         fa_dict = {}
         seq_name = ""
-        with open(filepath, 'r') as fin:
+        with open(filepath, 'r', encoding="utf-8") as fin:
             for line in fin:
                 line = line.rstrip()
                 if line.startswith(">"):
@@ -22,7 +25,7 @@ def fasta2dict(self, filepath):
                     fa_dict[seq_name] = []
                 else:
                     fa_dict[seq_name].append(line)
-        return {seq: "".join(fa_dict[seq]) for seq in fa_dict}
+        return {seq: "".join(val) for seq, val in fa_dict.items()}

     def reverse_complement(self, seq):
         """Return reverse complement of a sequence"""
@@ -33,64 +36,38 @@ def complement(seq):
         return complement(seq[::-1])

     def write_gene_pos(self, infile, genes, outfile):
+        """Write out gene positions"""
         output_txt = ""
-        with open(infile, "r") as fin:
+        with open(infile, "r", encoding="utf-8") as fin:
             for line in fin:
                 row = line.strip().split()
                 rv, gene, chr_start, chr_end, gene_start, gene_end = [row[0], row[1]]+[int(row[i]) for i in range(2,6)]
                 if rv in genes:
-                    y = 0
+                    offset = 0
                     for i, chr_pos in enumerate(range(chr_start, chr_end+1)):
                         x = 1 if gene_start< gene_end else -1
                         if gene_start+(x*i) == 0:
-                            y = 1 if gene_start< gene_end else -1
-                            output_txt += "%s\t%s\t%s\t%s\n" % (self.chr_name, chr_pos, rv, gene_start+(x*i)+y)
+                            offset = 1 if gene_start< gene_end else -1
+                        output_txt += f"{self.chr_name}\t{chr_pos}\t{rv}\t{gene_start+(x*i)+offset}\n"
-        with open(outfile, "w") as fout:
+        with open(outfile, "w", encoding="utf-8") as fout:
             fout.write(output_txt)

-    def get_aa_dict(self):
-        return {
-            'Ala': 'A',
-            'Arg': 'R',
-            'Asn': 'N',
-            'Asp': 'D',
-            'Asx': 'B',
-            'Cys': 'C',
-            'Glu': 'E',
-            'Gln': 'Q',
-            'Glx': 'Z',
-            'Gly': 'G',
-            'His': 'H',
-            'Ile': 'I',
-            'Leu': 'L',
-            'Lys': 'K',
-            'Met': 'M',
-            'Phe': 'F',
-            'Pro': 'P',
-            'Ser': 'S',
-            'Thr': 'T',
-            'Trp': 'W',
-            'Tyr': 'Y',
-            'Val': 'V',
-            "Stop":"*",
-            "-":"-"
-        }
-
     def parse_mutation(self, mut, gene, fasta_dict, gene_info):
+        """Parse mutation and determine type"""
         # AA change
         re_obj = re.search("p.([A-Z][a-z][a-z])([0-9]+)([A-Z][a-z][a-z])", mut)
         if re_obj:
             ref_aa = self.aa_long2short[re_obj.group(1)]
             alt_aa = self.aa_long2short[re_obj.group(3)]
             codon_num = re_obj.group(2)
-            return ["%s%s>%s%s" % (codon_num, ref_aa, codon_num, alt_aa)]
+            return [f"{codon_num}{ref_aa}>{codon_num}{alt_aa}"]
         # Stop codon
         re_obj = re.search("p.([A-Z][a-z][a-z])([0-9]+)(\*)", mut)
         if re_obj:
             ref_aa = self.aa_long2short[re_obj.group(1)]
             alt_aa = re_obj.group(3)
             codon_num = re_obj.group(2)
-            return ["%s%s>%s%s" % (codon_num, ref_aa, codon_num, alt_aa)]
+            return [f"{codon_num}{ref_aa}>{codon_num}{alt_aa}"]
         # Deletion single base
         re_obj = re.search("c.([\-0-9]+)del", mut)
         if re_obj:
@@ -101,7 +78,7 @@ def parse_mutation(self, mut, gene, fasta_dict, gene_info):
             else:
                 chr_start_nt = gene_info[gene]["start"] - gene_info[gene]["gene_start"] + gene_start_nt - (0 if gene_start_nt<0 else 1)
             seq = fasta_dict["Chromosome"][chr_start_nt-2:chr_start_nt]
-            return ["%s%s>%s" % (chr_start_nt-1,seq,seq[0])]
+            return [f"{chr_start_nt-1}{seq}>{seq[0]}"]
         # Deletion multi base
         re_obj = re.search("c.([\-0-9]+)_([\-0-9]+)del", mut)
         if re_obj:
@@ -115,7 +92,7 @@ def parse_mutation(self, mut, gene, fasta_dict, gene_info):
             chr_start_nt = gene_info[gene]["start"] - gene_info[gene]["gene_start"] + gene_start_nt - (0 if gene_start_nt<0 else 1)
             chr_end_nt = chr_start_nt+del_len-1
             seq = fasta_dict["Chromosome"][chr_start_nt-2:chr_end_nt]
-            return ["%s%s>%s" % (chr_start_nt-1, seq, seq[0])]
+            return [f"{chr_start_nt-1}{seq}>{seq[0]}"]
         # Insertion
         re_obj = re.search("c.([0-9]+)_([0-9]+)ins([A-Z]+)", mut)
         if re_obj:
@@ -128,7 +105,7 @@ def parse_mutation(self, mut, gene, fasta_dict, gene_info):
             else:
                 chr_start_nt = gene_info[gene]["start"] - gene_info[gene]["gene_start"] + gene_start_nt - 1
             seq_start = fasta_dict["Chromosome"][chr_start_nt-1]
-            return ["%s%s>%s" % (chr_start_nt,seq_start,seq_start+seq_ins)]
+            return [f"{chr_start_nt}{seq_start}>{seq_start+seq_ins}"]
         # Promoter Mutation
         ## c.-16G>C
         re_obj = re.search("c.(\-[0-9]+)([A-Z])>([A-Z])",mut)
@@ -139,11 +116,10 @@ def parse_mutation(self, mut, gene, fasta_dict, gene_info):
             strand = gene_info[gene]["strand"]

             if strand == "+":
-                chr_pos = gene_info[gene]["start"] - (gene_info[gene]["gene_start"] - nt_pos)
-                return ["%s%s>%s" % (nt_pos,ref_nt,alt_nt)]
-            else:
-                chr_pos = gene_info[gene]["end"] + (gene_info[gene]["gene_end"] - nt_pos)
-                return ["%s%s>%s" % (nt_pos, self.reverse_complement(ref_nt), self.reverse_complement(alt_nt))]
+                #chr_pos = gene_info[gene]["start"] - (gene_info[gene]["gene_start"] - nt_pos)
+                return [f"{nt_pos}{ref_nt}>{alt_nt}"]
+            #chr_pos = gene_info[gene]["end"] + (gene_info[gene]["gene_end"] - nt_pos)
+            return [f"{nt_pos}{self.reverse_complement(ref_nt)}>{self.reverse_complement(alt_nt)}"]
         # ncRNA Mutation
         ## r.514a>c
         re_obj = re.search("r.([0-9]+)([a-z]+)>([a-z]+)",mut)
@@ -151,7 +127,7 @@ def parse_mutation(self, mut, gene, fasta_dict, gene_info):
             nt_pos = re_obj.group(1)
             ref_nt = re_obj.group(2)
             alt_nt = re_obj.group(3)
-            return ["%s%s>%s" % (nt_pos,ref_nt.upper(),alt_nt.upper())]
+            return [f"{nt_pos}{ref_nt.upper()}>{alt_nt.upper()}"]
         # frameshift
         re_obj = re.search("frameshift",mut)
         if re_obj:
@@ -166,26 +142,26 @@ def parse_mutation(self, mut, gene, fasta_dict, gene_info):
         if re_obj:
             start = int(re_obj.group(1))
             end = int(re_obj.group(2))
-            return ["any_missense_codon_%s" % i for i in range(start,end+1)]
+            return [f"any_missense_codon_{pos}" for pos in range(start,end+1)]
         # Codon single
         ## any_missense_codon_425
         re_obj = re.search("any_missense_codon_([0-9]+)",mut)
         if re_obj:
             start = int(re_obj.group(1))
-            return ["any_missense_codon_%s" % start]
+            return [f"any_missense_codon_{start}"]
         # Indel range
         re_obj = re.search("any_indel_nucleotide_([0-9]+)_([0-9]+)",mut)
         if re_obj:
             start = int(re_obj.group(1))
             end = int(re_obj.group(2))
-            return ["any_indel_nucleotide_%s" % i for i in range(start,end+1)]
+            return [f"any_indel_nucleotide_{pos}" for pos in range(start,end+1)]
         # large_deletion
-        re_obj = re.search("large_deletion",mut)
+        re_obj = re.search("large_deletion", mut)
         if re_obj:
             return ["large_deletion"]
-        sys.exit("%s is not a valid formatted mutation... Exiting!" % mut)
-
-    def _parse(self, fasta_filepath, gff_filepath, download_dir):
+        sys.exit(f"{mut} is not a valid formatted mutation... Exiting!")
+
+    def _parse(self, download_dir):
         utils = Utils()
         tbdb_url = "https://raw.githubusercontent.com/jodyphelan/tbdb/master/tbdb.csv"
         tbdb_filepath = os.path.join(download_dir, "tbdb.csv")
From c7f30395eba769b14e5b90ef6f4b04fd423218c6 Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:31:29 +0100
Subject: [PATCH 14/18] Fix utils.py pylinting errors

---
 jasentool/utils.py | 74 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 58 insertions(+), 16 deletions(-)

diff --git a/jasentool/utils.py b/jasentool/utils.py
index 9cc85b2..7e17ac4 100644
--- a/jasentool/utils.py
+++ b/jasentool/utils.py
@@ -1,34 +1,40 @@
-#!/usr/bin/env python3
+"""Module for utility tools"""

 import os
 import csv
 import shutil
 import pathlib
-import requests
 import subprocess
-import pandas as pd
 from time import sleep
-from zipfile import ZipFile 
+from zipfile import ZipFile
+import requests

-class Utils(object):
+class Utils:
+    """Class containing utilities used throughout jasentool"""
     @staticmethod
     def write_out_csv(csv_dict, assay, platform, out_fpath):
-        with open(out_fpath, 'w+') as csvfile:
+        """Write out file as csv"""
+        with open(out_fpath, 'w+', encoding="utf-8") as csvfile:
             fieldnames = ["id", "group", "species", "assay", "platform", "read1", "read2"] #header
             writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
             writer.writeheader()
             for sample in csv_dict:
-                row_dict = {"id":sample, "group": csv_dict[sample][0], "species": csv_dict[sample][1], "assay": assay, "platform": platform, "read1": csv_dict[sample][2][0], "read2": csv_dict[sample][2][1]} #write rows to CSV
+                row_dict = {"id":sample, "group": csv_dict[sample][0],
+                            "species": csv_dict[sample][1], "assay": assay,
+                            "platform": platform, "read1": csv_dict[sample][2][0],
+                            "read2": csv_dict[sample][2][1]} #write rows to CSV
                 writer.writerow(row_dict)

     @staticmethod
     def write_out_txt(output_txt, out_fpath):
-        with open(out_fpath, 'w+') as fout:
+        """Write out file as text"""
+        with open(out_fpath, 'w+', encoding="utf-8") as fout:
             fout.write(output_txt)

     @staticmethod
     def pipeline_ready(batch_file):
-        assays = ['saureus']
+        """Check if pipeline exists"""
+        assays = ['saureus', 'ecoli', 'mtuberculosis']
         for assay in assays:
             if assay in batch_file:
                 return True
@@ -36,6 +42,7 @@ def pipeline_ready(batch_file):

     @staticmethod
     def copy_batch_and_csv_files(batch_files, csv_files, remote_dir, remote_hostname, remote=False):
+        """Copy shell and csv files to desired (remote) location"""
         if remote:
             # Copy files to remote server using ssh/scp
             process = subprocess.run(
@@ -51,21 +58,24 @@ def copy_batch_and_csv_files(batch_files, csv_files, remote_dir, remote_hostname
         else:
             # Copy files to a local directory
             pathlib.Path(remote_dir).mkdir(parents=True, exist_ok=True)
-            for fn in batch_files + csv_files:
-                shutil.copy(fn, remote_dir)
+            for fin in batch_files + csv_files:
+                shutil.copy(fin, remote_dir)

     @staticmethod
     def start_remote_pipelines(batch_files, remote_hostname, remote_dir):
+        """Start nextflow pipelines on a remote server"""
         for batch_file in batch_files:
             if Utils.pipeline_ready(batch_file):
                 sleep(10.0) # Avoid maxing SSH auth connections
                 process = subprocess.Popen(
-                    ["ssh", remote_hostname, "bash", f"{remote_dir}/{os.path.basename(batch_file)}"],
+                    ["ssh", remote_hostname,
+                     "bash", f"{remote_dir}/{os.path.basename(batch_file)}"],
                     close_fds=True
                 )

     @staticmethod
     def download_and_save_file(url, output_filepath):
+        """Download the file and save it to the user-specified path"""
         try:
             # Make a request to the URL
             response = requests.get(url, stream=True)
@@ -79,18 +89,50 @@ def download_and_save_file(url, output_filepath):

             print(f"File downloaded and saved to: {output_filepath}")

-        except requests.exceptions.RequestException as e:
-            print(f"Error downloading the file: {e}")
+        except requests.exceptions.RequestException as error_code:
+            print(f"Error downloading the file: {error_code}")

     @staticmethod
     def unzip(zip_file, outdir):
+        """Unzip zip file"""
         with ZipFile(zip_file, 'r') as zip_object:
             zip_object.extractall(path=outdir)

     @staticmethod
     def copy_file(source, destination):
+        """Copy file from source to destination"""
         try:
             shutil.copy(source, destination)
             print(f"File copied from {source} to {destination}")
-        except Exception as e:
-            print(f"Error copying file: {e}")
+        except Exception as error_code:
+            print(f"Error copying file: {error_code}")
+
+    @staticmethod
+    def get_aa_dict():
+        """Amino acid one letter translations"""
+        return {
+            'Ala': 'A',
+            'Arg': 'R',
+            'Asn': 'N',
+            'Asp': 'D',
+            'Asx': 'B',
+            'Cys': 'C',
+            'Glu': 'E',
+            'Gln': 'Q',
+            'Glx': 'Z',
+            'Gly': 'G',
+            'His': 'H',
+            'Ile': 'I',
+            'Leu': 'L',
+            'Lys': 'K',
+            'Met': 'M',
+            'Phe': 'F',
+            'Pro': 'P',
+            'Ser': 'S',
+            'Thr': 'T',
+            'Trp': 'W',
+            'Tyr': 'Y',
+            'Val': 'V',
+            "Stop":"*",
+            "-":"-"
+        }
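Both tbprofiler.py (patch 13) and who.py (patch 16) now consume this relocated
lookup table, so a quick sanity check:

    from jasentool.utils import Utils

    aa = Utils.get_aa_dict()
    assert aa['Ser'] == 'S' and aa['Stop'] == '*'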
json from jasentool.database import Database from jasentool.utils import Utils -class Validate(object): +class Validate: + """Class to validate old pipeline (cgviz) with new pipeline (jasen)""" def get_sample_id(self, results): + """Get sample ID from mongodb""" return results["sample_id"] def get_species_name(self, results): + """Get species name from mongodb""" return results["species_prediction"][0]["scientific_name"] def _check_exists(self, db_collection, sample_id): - return (True if list(Database.find(db_collection, {"id": sample_id}, {})) else False) + """Check if sample ID exists in mongodb""" + return bool(list(Database.find(db_collection, {"id": sample_id}, {}))) def search(self, search_query, search_kw, search_list): + """Search for query in list of arrays""" return [element for element in search_list if element[search_kw] == search_query] def get_virulence_results(self, results): + """Get virulence results""" return self.search("VIRULENCE", "type", results["element_type_result"]) def get_pvl(self, results): + """Get pvl result""" virulence_results = self.get_virulence_results(results) - return (True if self.search("lukS-PV", "gene_symbol", virulence_results[0]["result"]["genes"]) else False) + return bool(self.search("lukS-PV", "gene_symbol", virulence_results[0]["result"]["genes"])) def get_mlst(self, results): + """Get mlst result""" return self.search("mlst", "type", results["typing_result"]) def get_cgmlst(self, results): + """Get cgmlst result""" return self.search("cgmlst", "type", results["typing_result"]) def get_mdb_cgv_data(self, db_collection, sample_id): + """Get sample mongodb data""" mdb_pvl = list(Database.get_pvl(db_collection, {"id": sample_id, "metadata.QC": "OK"})) mdb_mlst = list(Database.get_mlst(db_collection, {"id": sample_id, "metadata.QC": "OK"})) mdb_cgmlst = list(Database.get_cgmlst(db_collection, {"id": sample_id, "metadata.QC": "OK"})) @@ -39,20 +49,24 @@ def get_mdb_cgv_data(self, db_collection, sample_id): mdb_mlst_seqtype = str(mdb_mlst[0]["mlst"]["sequence_type"]) if mdb_mlst[0]["mlst"]["sequence_type"] != "-" else str(None) mdb_mlst_alleles = mdb_mlst[0]["mlst"]["alleles"] mdb_cgmlst_alleles = mdb_cgmlst[0]["alleles"] - return {"pvl": mdb_pvl_present, "mlst_seqtype": mdb_mlst_seqtype, "mlst_alleles": mdb_mlst_alleles, "cgmlst_alleles": mdb_cgmlst_alleles} + return {"pvl": mdb_pvl_present, "mlst_seqtype": mdb_mlst_seqtype, + "mlst_alleles": mdb_mlst_alleles, "cgmlst_alleles": mdb_cgmlst_alleles} except IndexError: return False def get_fin_data(self, sample_json): + """Get sample input file data""" fin_pvl_present = self.get_pvl(sample_json) fin_mlst = self.get_mlst(sample_json) fin_cgmlst = self.get_cgmlst(sample_json) fin_mlst_seqtype = str(fin_mlst[0]["result"]["sequence_type"]) fin_mlst_alleles = fin_mlst[0]["result"]["alleles"] fin_cgmlst_alleles = list(fin_cgmlst[0]["result"]["alleles"].values()) - return {"pvl": fin_pvl_present, "mlst_seqtype": fin_mlst_seqtype, "mlst_alleles": fin_mlst_alleles, "cgmlst_alleles": fin_cgmlst_alleles} + return {"pvl": fin_pvl_present, "mlst_seqtype": fin_mlst_seqtype, + "mlst_alleles": fin_mlst_alleles, "cgmlst_alleles": fin_cgmlst_alleles} def compare_mlst_alleles(self, old_mlst_alleles, new_mlst_alleles): + """Parse through mlst alleles of old and new pipeline and compare results""" match_count, total_count = 0, 0 for allele in old_mlst_alleles: if str(old_mlst_alleles[allele]) == str(new_mlst_alleles[allele]): @@ -61,18 +75,21 @@ def compare_mlst_alleles(self, old_mlst_alleles, new_mlst_alleles): return 
100*(match_count/total_count) def compare_cgmlst_alleles(self, old_cgmlst_alleles, new_cgmlst_alleles): + """Parse through cgmlst alleles of old and new pipeline and compare results""" match_count, total_count = 0, 0 - for allele in range(0, len(old_cgmlst_alleles)): - if str(old_cgmlst_alleles[allele]) == str(new_cgmlst_alleles[allele]): + for idx, old_allele in enumerate(old_cgmlst_alleles): + if str(old_allele) == str(new_cgmlst_alleles[idx]): match_count += 1 total_count += 1 return 100*(match_count/total_count) def compare_data(self, sample_id, old_data, new_data): + """Compare data between old pipeline and new pipeline""" pvl_comp = int(old_data["pvl"] == new_data["pvl"]) mlst_seqtype_comp = int(old_data["mlst_seqtype"] == new_data["mlst_seqtype"]) if mlst_seqtype_comp == 0: - mlst_at_list = [f'{old_data["mlst_alleles"][gene]},{new_data["mlst_alleles"][gene]}' for gene in sorted(old_data["mlst_alleles"].keys())] + mlst_at_list = [f'{old_data["mlst_alleles"][gene]},{new_data["mlst_alleles"][gene]}' + for gene in sorted(old_data["mlst_alleles"].keys())] mlst_at_str = ",".join(mlst_at_list) print(f'{sample_id},{old_data["mlst_seqtype"]},{new_data["mlst_seqtype"]},{mlst_at_str}') mlst_alleles = self.compare_mlst_alleles(old_data["mlst_alleles"], new_data["mlst_alleles"]) @@ -80,10 +97,11 @@ def compare_data(self, sample_id, old_data, new_data): return f"{sample_id},{pvl_comp},{mlst_seqtype_comp},{mlst_alleles},{cgmlst_alleles}" def run(self, input_files, output_fpaths, db_collection, combined_output): + """Execute validation of new pipeline (jasen)""" utils = Utils() csv_output = "sample_id,pvl,mlst_seqtype,mlst_allele_matches(%),cgmlst_allele_matches(%)" for input_idx, input_file in enumerate(input_files): - with open(input_file, 'r') as fin: + with open(input_file, 'r', encoding="utf-8") as fin: sample_json = json.load(fin) sample_id = self.get_sample_id(sample_json) if not self._check_exists(db_collection, sample_id): @@ -91,7 +109,7 @@ def run(self, input_files, output_fpaths, db_collection, combined_output): continue mdb_data_dict = self.get_mdb_cgv_data(db_collection, sample_id) if mdb_data_dict: - species_name = self.get_species_name(sample_json) + #species_name = self.get_species_name(sample_json) fin_data_dict = self.get_fin_data(sample_json) compared_data_output = self.compare_data(sample_id, mdb_data_dict, fin_data_dict) csv_output += "\n" + compared_data_output From ca04dae9e67f809d08de694ddd885f141c1c7880 Mon Sep 17 00:00:00 2001 From: ryanjameskennedy Date: Wed, 17 Jan 2024 14:32:05 +0100 Subject: [PATCH 16/18] Fix who.py pylinting errors --- jasentool/who.py | 172 ++++++++++++++++++++++++----------------------- 1 file changed, 89 insertions(+), 83 deletions(-) diff --git a/jasentool/who.py b/jasentool/who.py index 0d89bf4..8e983de 100644 --- a/jasentool/who.py +++ b/jasentool/who.py @@ -1,49 +1,27 @@ +"""Module for handling WHO mutation catalogue""" + import os import re import pandas as pd from tqdm import tqdm from jasentool.utils import Utils -class WHO(object): +class WHO: + """Class for handling WHO tb mutation catalogue""" def __init__(self): - self.aa_dict_1 = self.get_aa_dict() + self.aa_dict_1 = Utils.get_aa_dict() self.aa_dict_2 = self.inv_dict() self.nucleotide_complements = self.get_nt_complements() self.drug_dict = self.get_drug_dict() self.re_c, self.re_p, self.re_d, self.re_i = self.setup_re() self.re_attr = re.compile('Name=([^;]+).*locus_tag=([^;|\n]+)') - def get_aa_dict(self): - return { - 'Ala': 'A', - 'Arg': 'R', - 'Asn': 'N', - 'Asp': 'D', - 'Asx': 
'B', - 'Cys': 'C', - 'Glu': 'E', - 'Gln': 'Q', - 'Glx': 'Z', - 'Gly': 'G', - 'His': 'H', - 'Ile': 'I', - 'Leu': 'L', - 'Lys': 'K', - 'Met': 'M', - 'Phe': 'F', - 'Pro': 'P', - 'Ser': 'S', - 'Thr': 'T', - 'Trp': 'W', - 'Tyr': 'Y', - 'Val': 'V', - '*': '!', - } - def inv_dict(self): + """Invert amino acid dictionary""" return {v: k for k, v in self.aa_dict_1.items()} def get_nt_complements(self): + """Get nucleotide complements""" return { 'C': 'G', 'G': 'C', @@ -52,6 +30,7 @@ def get_nt_complements(self): } def get_drug_dict(self): + """Get drug 3 letter code translation dictionary""" return { 'RIF': 'rifampicin', 'INH': 'isoniazid', @@ -71,31 +50,34 @@ def get_drug_dict(self): } def setup_re(self): - # Setup the regular expressions - re_c = re.compile('^(\w+)_([actg])(-*\d+)([actg])$') #regex pattern for - re_p = re.compile('^(\w+)_([A-Z])(\d+)([A-Z!])$') #regex pattern for protein - re_d = re.compile('^(\w+)_(-*\d+)_del_(\d+)_([actg]+)_([actg]+)$') #regex pattern for deletions - re_i = re.compile('^(\w+)_(-*\d+)_ins_(\d+)_([actg]+)_([actg]+)$') #regex pattern for insertions + """Setup the regular expressions""" + re_c = re.compile('^(\w+)_([actg])(-*\d+)([actg])$') #regex pattern for nucleotide changes + re_p = re.compile('^(\w+)_([A-Z])(\d+)([A-Z!])$') #regex for protein + re_d = re.compile('^(\w+)_(-*\d+)_del_(\d+)_([actg]+)_([actg]+)$') #regex for deletions + re_i = re.compile('^(\w+)_(-*\d+)_ins_(\d+)_([actg]+)_([actg]+)$') #regex for insertions return re_c, re_p, re_d, re_i def lower_row(self, row): + """Lowercase string in row""" return row.str.lower() def read_files(self, gff_filepath, xlsx_filepath, h37rv_filepath): + """Read gff, excel & genome files""" # Load the reference GFF file - gff = pd.read_csv(gff_filepath, names=['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'], sep='\t', header=None) + columns = ['seqid', 'source', 'type', 'start', 'end', + 'score', 'strand', 'phase', 'attributes'] + gff = pd.read_csv(gff_filepath, names=columns, sep='\t', header=None) # Load the WHO catalogue catalogue = pd.read_excel(xlsx_filepath, sheet_name='Catalogue_master_file', header=2) # Load the reference genome to impute missing data from deletions h37rv = '' - with open(h37rv_filepath, 'r') as fin: + with open(h37rv_filepath, 'r', encoding="utf-8") as fin: for line in fin: h37rv += line.replace('\n', '') return gff, catalogue, h37rv def process_variant(self, variant, gff_dict): - '''Translates variants in the WHO catalogue format to HGVS''' - + """Translates variants in the WHO catalogue format to HGVS""" c_match = self.re_c.match(variant) if c_match: if gff_dict[c_match[1]]['type'] == 'rRNA': @@ -106,37 +88,37 @@ def process_variant(self, variant, gff_dict): v_type = 'c' ref = c_match[2].upper() alt = c_match[4].upper() - return (c_match[1], v_type, '{}.{}{}>{}'.format(v_type, c_match[3], ref, alt), False, None) + return (c_match[1], v_type, f'{v_type}.{c_match[3]}{ref}>{alt}', False, None) p_match = self.re_p.match(variant) if p_match: - return (p_match[1], 'p', 'p.{}{}{}'.format(self.aa_dict_2[p_match[2].upper()], p_match[3], self.aa_dict_2[p_match[4].upper()]), False, None) + return (p_match[1], 'p', f'p.{self.aa_dict_2[p_match[2].upper()]}{p_match[3]}{self.aa_dict_2[p_match[4].upper()]}', False, None) d_match = self.re_d.match(variant) if d_match: if int(d_match[3]) != len(d_match[4]) - len(d_match[5]): return (None, None, None, True, 'length mismatch') - starts = [pos for pos in range(1, len(d_match[4]) + 1 - int(d_match[3])) if 
d_match[4][:pos]+d_match[4][pos+int(d_match[3]):] == d_match[5]]
+            starts = [pos for pos in range(1, len(d_match[4])+1-int(d_match[3]))
+                      if d_match[4][:pos]+d_match[4][pos+int(d_match[3]):] == d_match[5]]
             if not starts:
                 return (None, None, None, True, 'invalid indel')
             if not gff_dict[d_match[1]]['strand']:
                 hgvs = []
                 for start in starts:
                     if int(d_match[3]) == 1:
-                        hgvs.append('c.{}del'.format(int(d_match[2])+start))
+                        hgvs.append(f'c.{int(d_match[2])+start}del')
                     else:
-                        hgvs.append('c.{}_{}del'.format(int(d_match[2])+start, int(d_match[2])+start-1+int(d_match[3])))
-                return (d_match[1], 'c', '|'.join(hgvs), False, None)
-            else:
-                hgvs = []
-                for start in starts:
-                    if int(d_match[3]) == 1:
-                        hgvs.append('c.{}del'.format(int(d_match[2]) - start - int(d_match[3]) + 1))
-                    else:
-                        v = 'c.{}_{}del'.format(int(d_match[2]) - start - int(d_match[3]) + 1, int(d_match[2]) - start)
-                        hgvs.append(v)
+                        hgvs.append(f'c.{int(d_match[2])+start}_{int(d_match[2])+start-1+int(d_match[3])}del')
                 return (d_match[1], 'c', '|'.join(hgvs), False, None)
+            hgvs = []
+            for start in starts:
+                if int(d_match[3]) == 1:
+                    hgvs.append(f'c.{int(d_match[2]) - start - int(d_match[3]) + 1}del')
+                else:
+                    hgvs_var = f'c.{int(d_match[2])-start-int(d_match[3])+1}_{int(d_match[2])-start}del'
+                    hgvs.append(hgvs_var)
+            return (d_match[1], 'c', '|'.join(hgvs), False, None)
 
         i_match = self.re_i.match(variant)
         if i_match:
@@ -148,18 +130,26 @@ def process_variant(self, variant, gff_dict):
             if not gff_dict[i_match[1]]['strand']:
                 hgvs = []
                 for start in starts:
-                    hgvs.append('c.{}_{}ins{}'.format(int(i_match[2])+start-1, int(i_match[2])+start, ''.join([i.upper() for i in i_match[5][start:start+int(i_match[3])]])))
-                return (i_match[1], 'c', '|'.join(hgvs), False, None)
-            else:
-                hgvs = []
-                for start in starts:
-                    v = 'c.{}_{}ins{}'.format(int(i_match[2])-start, int(i_match[2]) - start+1, ''.join([self.nucleotide_complements[i.upper()] for i in i_match[5][start:start+int(i_match[3])][::-1]]))
-                    hgvs.append(v)
+                    start_pos = int(i_match[2])+start-1
+                    end_pos = int(i_match[2])+start
+                    seq = ''.join([i.upper() for i in i_match[5][start:start+int(i_match[3])]])
+                    hgvs_var = f'c.{start_pos}_{end_pos}ins{seq}'
+                    hgvs.append(hgvs_var)
                 return (i_match[1], 'c', '|'.join(hgvs), False, None)
+            hgvs = []
+            for start in starts:
+                start_pos = int(i_match[2])-start
+                end_pos = int(i_match[2]) - start + 1
+                seq = ''.join([self.nucleotide_complements[i.upper()]
+                               for i in i_match[5][start:start+int(i_match[3])][::-1]])
+                hgvs_var = f'c.{start_pos}_{end_pos}ins{seq}'
+                hgvs.append(hgvs_var)
+            return (i_match[1], 'c', '|'.join(hgvs), False, None)
         return (None, None, None, True, 'does not match indel or variant')
 
     def extract_info(self, info_string):
+        """Extract gene name and locus tag from provided string"""
         if pd.notna(info_string):
             match = self.re_attr.search(info_string)
             if match:
@@ -169,8 +159,7 @@ def extract_info(self, info_string):
             return pd.Series([None, None])
 
     def get_gene_info(self, gff):
-        # Get the gene information from the GFF file
-        # Apply the function to the 'attributes' column
+        """Get gene info from the GFF file and apply the function to the 'attributes' column"""
         gff[['locus_tag', 'name']] = gff.attributes.apply(self.extract_info)
 
         gff_dict = {}
@@ -191,13 +180,14 @@ def get_gene_info(self, gff):
         return gff_dict
 
     def prep_catalogue(self, catalogue):
-        # Prepare the WHO catalogue dataframe
+        """Prepare the WHO catalogue dataframe"""
         classified = []
-        v = re.compile('^(.*) \((.*)\)')
+        variant_re = re.compile('^(.*) 
\((.*)\)')
+        who_mut_cat_url = 'https://www.who.int/publications/i/item/9789240028173'
         for var, row in catalogue[catalogue[('FINAL CONFIDENCE GRADING', 'Unnamed: 51_level_1')].apply(lambda conf: conf != 'combo')].iterrows():
             drug_key = row[('drug', 'Unnamed: 0_level_1')]
             drug = self.drug_dict[drug_key]
-            v_match = v.match(var)
+            v_match = variant_re.match(var)
             if v_match:
                 # Include all variants listed
                 variants = [v_match[1]] + [i.strip() for i in v_match[2].split(',')]
@@ -206,14 +196,16 @@ def prep_catalogue(self, catalogue):
             else:
                 variants = [var]
             category = ' '.join(row[('FINAL CONFIDENCE GRADING', 'Unnamed: 51_level_1')].split(' ')[1:])
-            genome_pos = '{:.0f}'.format(row[('Genome position', 'Unnamed: 3_level_1')])
+            #genome_pos = '{:.0f}'.format(row[('Genome position', 'Unnamed: 3_level_1')])
             for variant in variants:
-                classified.append([variant, drug, 'resistance', '', 'https://www.who.int/publications/i/item/9789240028173', category])
-        classified = pd.DataFrame(classified, columns=['variant', 'Drug', 'Confers', 'Interaction', 'Literature', 'WHO Confidence'])
+                entry = [variant, drug, 'resistance', '', who_mut_cat_url, category]
+                classified.append(entry)
+        column_names = ['variant', 'Drug', 'Confers', 'Interaction', 'Literature', 'WHO Confidence']
+        classified = pd.DataFrame(classified, columns=column_names)
         return classified
 
     def var2hgvs(self, classified, gff_dict):
-        # Convert the variants to HGVS format
+        """Convert the variants to HGVS format"""
        for idx, row in tqdm(classified.iterrows(), total=classified.shape[0]):
             gene, var_type, variant, fail, fail_reason = self.process_variant(row.variant, gff_dict)
             classified.loc[idx, 'gene'] = gene
@@ -224,27 +216,37 @@ def var2hgvs(self, classified, gff_dict):
         return classified
 
     def impute_del(self, classified, gff_dict, h37rv):
-        # Impute missing data for deletions
+        """Impute missing data for deletions"""
         length_mismatch = classified[classified.fail_reason == 'length mismatch'].sort_values(by='variant', key=self.lower_row)
         for idx, row in tqdm(length_mismatch.iterrows(), total=length_mismatch.shape[0]):
             d_match = self.re_d.match(row.variant)
             if d_match:
                 if not gff_dict[d_match[1]]['strand']:
-                    indexing_correction = -1 if int(d_match[2]) < 0 else -2 # correct for 0 based python indexing (-1 if promotor, -2 if within gene)
+                    # Correct for 0-based Python indexing (-1 if promoter, -2 if within gene)
+                    indexing_correction = -1 if int(d_match[2]) < 0 else -2
+
                     start = int(gff_dict[d_match[1]]['start']) + int(d_match[2]) + int(indexing_correction)
-                    end = start + int(d_match[3]) + len(d_match[5]) # add the lenght of the alt allele to account for the bases not part of the indel
+
+                    # Add the length of the alt allele to account for the bases not part of the indel
+                    end = start + int(d_match[3]) + len(d_match[5])
                     try:
-                        complete_variant = '{}_{}_del_{}_{}_{}'.format(d_match[1], d_match[2], d_match[3], h37rv[start:end].lower(), d_match[5])
+                        complete_variant = f'{d_match[1]}_{d_match[2]}_del_{d_match[3]}_{h37rv[start:end].lower()}_{d_match[5]}'
                     except TypeError:
                         print(f"{start}: {type(start)}\n{end}: {type(end)}")
                     classified.loc[idx, 'complete_variant'] = complete_variant
                     classified.loc[idx, 'complete_variant_fail'] = False
                 else:
-                    indexing_correction = -1 if int(d_match[2]) < 0 else 0 # correct for 0 based python indexing (-1 if promotor, 0 if within gene)
-                    start = int(gff_dict[d_match[1]]['end']) - int(d_match[2]) + int(indexing_correction) # subtract d_match[2] instead of adding as this is the opposite strand
-                    end = start + int(d_match[3]) + 
len(d_match[5]) # add the lenght of the alt allele to account for the bases not part of the indel
-                    complete_variant = '{}_{}_del_{}_{}_{}'.format(d_match[1], d_match[2], d_match[3], h37rv[start:end].lower(), d_match[5])
+                    # Correct for 0-based Python indexing (-1 if promoter, 0 if within gene)
+                    indexing_correction = -1 if int(d_match[2]) < 0 else 0
+
+                    # Subtract d_match[2] instead of adding as this is the opposite strand
+                    start = int(gff_dict[d_match[1]]['end']) - int(d_match[2]) + int(indexing_correction)
+
+                    # Add the length of the alt allele to account for the bases not part of the indel
+                    end = start + int(d_match[3]) + len(d_match[5])
+
+                    complete_variant = f'{d_match[1]}_{d_match[2]}_del_{d_match[3]}_{h37rv[start:end].lower()}_{d_match[5]}'
                     classified.loc[idx, 'complete_variant'] = complete_variant
                     classified.loc[idx, 'complete_variant_fail'] = False
                     continue
@@ -259,10 +261,11 @@ def impute_del(self, classified, gff_dict, h37rv):
                 pass
             continue
         return classified
-    
+
     def imp2hgvs(self, classified, gff_dict):
-        # Convert imputed deletions to HGVS format
-        for idx, row in tqdm(classified[classified.complete_variant_fail == False].iterrows(), total=classified[classified.complete_variant_fail == False].shape[0]):
+        """Convert imputed deletions to HGVS format"""
+        for idx, row in tqdm(classified[classified.complete_variant_fail == False].iterrows(),
+                             total=classified[classified.complete_variant_fail == False].shape[0]):
             if row.complete_variant_fail:
                 continue
             gene, var_type, variant, fail, fail_reason = self.process_variant(row.complete_variant, gff_dict)
@@ -274,17 +277,19 @@ def imp2hgvs(self, classified, gff_dict):
         return classified
 
     def write_out_csv(self, classified, csv_outpath):
-        # Write results to csv file
+        """Write results to csv file"""
         classified.to_csv(csv_outpath, index=False)
 
     def _parse(self, fasta_filepath, gff_filepath, download_dir):
+        """Parse WHO excel file"""
         utils = Utils()
         #who_url = "https://apps.who.int/iris/bitstream/handle/10665/341906/WHO-UCN-GTB-PCI-2021.7-eng.xlsx"
         who_url = "https://raw.githubusercontent.com/GTB-tbsequencing/mutation-catalogue-2023/main/Final%20Result%20Files/WHO-UCN-TB-2023.5-eng.xlsx"
         who_filepath = os.path.join(download_dir, "who.xlsx")
         utils.download_and_save_file(who_url, who_filepath)
-        gff, catalogue, h37rv = self.read_files(gff_filepath, who_filepath, fasta_filepath)
-        gff_dict = self.get_gene_info(gff)
+        _, catalogue, _ = self.read_files(gff_filepath, who_filepath, fasta_filepath)
+        #gff, catalogue, h37rv = self.read_files(gff_filepath, who_filepath, fasta_filepath)
+        #gff_dict = self.get_gene_info(gff)
         catalogue.columns = catalogue.columns.str.title()
         catalogue.rename(columns={'Final Confidence Grading': 'WHO Confidence'}, inplace=True)
         catalogue['Confers'] = 'resistance'
@@ -292,7 +297,8 @@ def _parse(self, fasta_filepath, gff_filepath, download_dir):
         catalogue['Literature'] = 'https://www.who.int/publications/i/item/9789240082410'
         catalogue['WHO Confidence'] = catalogue['WHO Confidence'].apply(lambda x: ' '.join(x.split(' ')[1:]))
         catalogue['Drug'] = catalogue['Drug'].apply(lambda x: x.lower())
-        catalogue = catalogue.loc[:, ["Drug","Confers","Interaction","Literature","WHO Confidence","Gene","Mutation"]]
+        catalogue = catalogue.loc[:, ["Drug", "Confers", "Interaction", "Literature",
+                                      "WHO Confidence", "Gene", "Mutation"]]
         csv_outpath = os.path.join(download_dir, "who.csv")
         self.write_out_csv(catalogue, csv_outpath)
         return catalogue

From c9b8876eac95d83b5c89819bc588cdc7f5e945c7 Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:32:20 +0100
Subject: [PATCH 17/18] Fix setup.py pylinting errors

---
 setup.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 5c1e4e1..30d5385 100644
--- a/setup.py
+++ b/setup.py
@@ -1,3 +1,5 @@
+"""setup.py for pip installation of jasentool"""
+
 import re
 import sys
 
@@ -9,10 +11,10 @@
         'Try runnning: python -m ensurepip'
     )
 
-with open("README.md", 'r') as fin:
+with open("README.md", 'r', encoding="utf-8") as fin:
     long_description = fin.read()
 
-with open("jasentool/__init__.py", 'r') as fin:
+with open("jasentool/__init__.py", 'r', encoding="utf-8") as fin:
     version_line_regex = re.compile(r'^\s*__version__\s*=\s*[\'"]([^\'"]+)[\'"]')
     for line in fin:
         match = version_line_regex.match(line)
@@ -33,6 +35,6 @@
     ],
     install_requires=["pymongo", "openpyxl", "biopython"],
     entry_points={"console_scripts": ["jasentool=jasentool.__main__:main"]},
-    packages=find_packages(exclude=("tests")),
+    packages=find_packages(exclude=("tests",)),
     package_data={"jasentool": ["data/dbs/*"]},
-)
\ No newline at end of file
+)

From 7422c5535531df3bd6a82125343a6d8acd12df42 Mon Sep 17 00:00:00 2001
From: ryanjameskennedy
Date: Wed, 17 Jan 2024 14:33:06 +0100
Subject: [PATCH 18/18] Add pylint GA workflow

---
 .github/workflows/pylint.yml | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 .github/workflows/pylint.yml

diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
new file mode 100644
index 0000000..a68dccd
--- /dev/null
+++ b/.github/workflows/pylint.yml
@@ -0,0 +1,24 @@
+name: Pylint
+
+on: [push]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pylint pytest
+        pip install -e .
+    - name: Analysing the code with pylint
+      run: |
+        pylint --disable=W1401,R0914,W0718 --fail-under 9 $(git ls-files '*.py')
\ No newline at end of file
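
A quick illustration of how two pieces of this series fit together: patch 14 moves the
amino-acid one-letter table into Utils.get_aa_dict(), and patch 16's who.py inverts it
(inv_dict) before rendering protein variants in HGVS form via the re_p pattern from
setup_re(). The sketch below is illustrative only: it assumes jasentool is installed
(pip install -e ., as in the workflow above), and katG_S315T is a hypothetical
WHO-style input, not a value taken from these patches.

    import re

    from jasentool.utils import Utils

    # Invert the three-letter -> one-letter table, as WHO.inv_dict() does.
    one_to_three = {one: three for three, one in Utils.get_aa_dict().items()}

    # Same shape as WHO.setup_re()'s re_p: <gene>_<ref><codon><alt>.
    re_p = re.compile(r'^(\w+)_([A-Z])(\d+)([A-Z!])$')

    match = re_p.match("katG_S315T")
    if match:
        gene, ref, pos, alt = match.groups()
        # e.g. prints: katG p.Ser315Thr
        print(gene, f"p.{one_to_three[ref]}{pos}{one_to_three[alt]}")

One caveat worth noting: get_aa_dict() maps 'Stop' to '*', so the '!' stop notation
that re_p still accepts has no entry in the inverted table and would need to be
handled separately.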