abridge

#! /usr/bin/env python3

########################################################################################################################################################
# The software 'abridge' will compress aligned files to a bare minimum needed for generating assemblies and producing read counts
#
# Changelist
#
# Write meaningful outputs
########################################################################################################################################################

from argparse import RawTextHelpFormatter
import argparse
import logging
import os
import pprint
import sys
import re
import time
import multiprocessing
import random
import glob
import time
import subprocess
from shutil import which
from pprint import pformat


def parseCommandLineArguments():
    """
    Build the abridge command-line parser and parse sys.argv.

    Exactly one input (--inputalignedfilename or --inputabrfilename) and exactly
    one mode (--compress, --decompress, --header or one of the hidden coverage
    modes) must be supplied; argparse enforces both through mutually exclusive
    groups. Arguments whose help is argparse.SUPPRESS are hidden from --help.

    Returns:
        argparse.Namespace holding every option declared below.
    """
    parser = argparse.ArgumentParser( prog = "abridge", description = """
    Compress alignments for storage, decompress from compressed file, and generate coverages.
    The main purpose of this software is to achieve high compression and fast decompression. 
    Hence, ABRIDGE is not optimized to retrieve data from random locations
    In addition to that, ABRIDGE can be used to generate coverage information.
    Mandatory inputs to ABRIDGE include the alignment file, the reference and the name
    of the output file. Users can optionally provide a temporary directory where all 
    intermediate operations will be executed. 
    """, formatter_class = RawTextHelpFormatter )
    
    required_named = parser.add_argument_group( 'Required arguments' )
    optional_named = parser.add_argument_group( 'Optional arguments' )
    
    ################################################################################################################################################################################################################################################
    # Required arguments
    ################################################################################################################################################################################################################################################
    required_named.add_argument( "--outputfilename", help = "Enter the name of the output filename", required = True )
    required_named.add_argument( "--reference", help = "Enter a single fasta file for the reference", required = True )

    # Exactly one kind of input file may be given per invocation
    input_group = parser.add_mutually_exclusive_group( required = True )
    input_group.add_argument( "--inputalignedfilename", help = "Enter the name of the alignment file you wish to compress. Only alignments in SAM or BAM format are accepted. Ensure that the file is sorted by coordinate. Also, files must have the header section with the reference information available. You can compress only one file at a time." )
    input_group.add_argument( "--inputabrfilename", help = "Enter the name of the compressed alignment file you wish to decompress. The file must have been compressed using abridge. You can decompress only one file at a time." )

    # Exactly one mode of operation may be given per invocation
    compress_decompress_group = parser.add_mutually_exclusive_group( required = True )
    compress_decompress_group.add_argument( "--compress", help = "Set this option if you wish to compress the alignment file", action = "store_true" )
    compress_decompress_group.add_argument( "--decompress", help = "Set this option if you wish to decompress the alignment file", action = "store_true" )
    compress_decompress_group.add_argument( "--header", help = "Print only the header of reference sequences during decompression", action = "store_true" )
    # compress_decompress_group.add_argument( "-r", "--random", help = "Retrieve alignments from random locations", action = "store_true" )

    ################################################################################################################################################################################################################################################
    # Optional arguments    
    ################################################################################################################################################################################################################################################
    optional_named.add_argument( "--temp_directory", help = """Enter the name of the temporary directory where all intermediate operations will be executed. 
                                                            All error files will be stored in this directory. If you do not wish to retrieve those please ignore this parameter.
                                                            Providing an argument will prevent clean up of intermediate files""")
    
    # Compression
    optional_named.add_argument( "--skip_shortening_read_names", help = "Request abridge to skip the step to reduce the size of read names. Please note that setting this option will enable faster compression but lead to a poor ratio of compression", action = "store_true" )
    optional_named.add_argument( "--ignore_alignment_scores", help = "Request abridge to ignore the mapping quality scores and the alignment score (provided it is available through the AS tag)", action = "store_true" )
    optional_named.add_argument( "--ignore_all_quality_scores", help = "Ignore all quality scores", action = "store_true" )
    optional_named.add_argument( "--ignore_quality_scores_for_matched_bases", help = "Request abridge to ignore the quality scores of matched bases", action = 'store_true' )
    optional_named.add_argument( "--ignore_soft_clippings", help = "Ignore all soft clippings. Read will be trimmed down to only the portion which were aligned to the reference", action = "store_true" )
    optional_named.add_argument( "--ignore_mismatches", help = "All mismatches will be ignored", action = "store_true" )
    optional_named.add_argument( "--ignore_unmapped_reads", help = "Request abridge to discard all reads that are unmapped", action = "store_true" )
    optional_named.add_argument( "--spring_or_fclqc", help = "Enter which compressor to use. Enter SPRING or FCLQC", default = "FCLQC")
    
    # Decompression
    optional_named.add_argument( "--quality", help = "Enter dummy quality scores while decompressing", default = 'I' )
    optional_named.add_argument( "--ignore_sequence", help = "Dummy nucleotide sequence will be produced during decompression. A string of A's will be generated", action = "store_true" )
    
    optional_named.add_argument( "--quiet", help = "Prevent abridge from printing any log information. By default logging is enabled", action = "store_true" )
    # type = int keeps options.cpu an integer whether it is supplied or defaulted
    # (it used to be a str when supplied and an int when left at the default)
    optional_named.add_argument( "--cpu", help = "Enter the number of CPU cores to be used. This option will be used during compression or decompression.", type = int, default = 1 )
    optional_named.add_argument( "--run_diagnostics", help = "abridge will run diagnostics on the cigar compression and decompression. It will exit on discovery of any discrepancies", action = "store_true" )
    optional_named.add_argument( "--force", help = "Setting this argument will remove the output directory and start the computation from scratch", action = "store_true" )
    
    # Coverage generation
    # optional_named.add_argument( "-aq", "--save_exact_quality_scores", help = "Adjust quality scores for matched bases to achieve better encoding. For more details please check ...", action = "store_true" )
    # optional_named.add_argument( "-p", "--positions", help = "Enter the position as chromosome:start-end from which reads will be retrieved" )
    # optional_named.add_argument( "-rp", "--read_prefix", help = "Enter a read prefix for decompression - valid only for random access" )
    # optional_named.add_argument( "-l", "--level", help = "This can accept an integer from the set (1,2,3). If level is set to 1 then abridge will perform the fastest but low compression. abridge will use brotli to compress. Decompression will be fast. Setting level to 2 will prompt abridge to perform the medium level compression using 7z. Compression will take time but decompression will be fast. If level is set to 3 then abridge will perform the best compression using 7paq. Both compression and decompression will take average time to complete", type = int, default = 2 )
    
    # Suppressed arguments: hidden from --help
    parser.add_argument( "--logfilename", help = argparse.SUPPRESS )  # Name of the logfile
    parser.add_argument( "--files_for_removal", help = argparse.SUPPRESS )  # Files will be removed later
    parser.add_argument( "--softwares", help = argparse.SUPPRESS )  # Software paths
    parser.add_argument( "--single_ended", help = argparse.SUPPRESS )
    parser.add_argument( "--reference_to_length", help = argparse.SUPPRESS )
    parser.add_argument( "--compile_programs", action = "store_true", help = argparse.SUPPRESS )  # Force abridge to compile the C programs
    parser.add_argument( "--outputs_and_errors_directory", action = "store_true", help = argparse.SUPPRESS)
    parser.add_argument( "--remove_temp_directory", action = "store_true", help = argparse.SUPPRESS)

    # Future enhancements: hidden modes that share the mutually exclusive mode group
    compress_decompress_group.add_argument( "--generate_overlapping_coverage", help = argparse.SUPPRESS, action = "store_true" )  # Future - This option can be used in conjuction with --positions to construct coverage from a specific location # help="Enter the name of the compressed file from which you wish to generate an overlapping coverage of reads ",
    compress_decompress_group.add_argument( "--generate_non_overlapping_coverage", help = argparse.SUPPRESS, action = "store_true" )  # help="Enter the name of the compressed file from which you wish to generate a non-overlapping coverage of reads "

    # Options for generating coverage
    optional_named.add_argument( "--d", help = argparse.SUPPRESS, action = "store_true" )  # help = "Report the depth at each position in each A feature. Positions reported are one based.  Each position and depth follow the complete A feature.",
    optional_named.add_argument( "--bg", help = argparse.SUPPRESS, action = "store_true" )
    optional_named.add_argument( "--bga", help = argparse.SUPPRESS, action = "store_true" )
    optional_named.add_argument( "--split", help = argparse.SUPPRESS, action = "store_true" )  # help = "Treat \"split\" BAM or BED12 entries as distinct BED intervals.",
    optional_named.add_argument( "--max_memory", help = argparse.SUPPRESS, default = 10 )  # help="Enter the maximum memory allowed (in GB)"
    optional_named.add_argument( "--produce_tags", help = argparse.SUPPRESS, nargs = "*" )  # help="Enter a comma separated list of tags that you want abridge to produce during decompression. By default abridge will generate NH, MD and XS tags."
    return parser.parse_args()
        
def configureLogger( options ):
    """
    Point the root logger at options.logfilename, starting from a fresh file.

    Any log file left over from an earlier run is deleted first. The deletion
    uses os.remove rather than a shell `rm -f`, so paths containing spaces or
    shell metacharacters are handled correctly.

    Args:
        options: namespace whose `logfilename` attribute is the log file path.
    """
    try:
        os.remove( options.logfilename )
    except OSError:
        # Covers "no previous log file" as well as files that cannot be
        # deleted; like the former `rm -f` this is best effort, and logging
        # then simply appends to whatever is there.
        pass
    logging.basicConfig( format = '%(asctime)s - %(message)s', datefmt = '%d-%b-%y %H:%M:%S', level = logging.DEBUG, filename = options.logfilename )

"""def runDockerCommand( logging, name, version, image_location, container_name, volumes, command , cpus = 1, memory = '1g' ):
    
    #Runs the command in a docker container
    

    # Runs the main command
    docker_cmd = f" docker run "
    # docker_cmd += f" -ti "
    docker_cmd += f" --rm "
    docker_cmd += f" --cpus={cpus}"
    docker_cmd += f" --memory='{memory}'"
    # docker_cmd += f" --name {container_name}"
    for mapping in volumes:
        docker_cmd += f" -v {mapping}"
    docker_cmd += f" {image_location}:{version} "
    docker_cmd += f" bash -c '{command}'"
    logging.info( f"Running command - {docker_cmd}" )
    os.system( docker_cmd )

def runSingularityCommand( logging, name, version, image_location, container_name, volumes, command , cpus = 1, memory = '1g' ):
    
    #Runs the command in a Singularity container
    

    # Runs the main command
    singularity_cmd = f" singularity exec  "
    # singularity_cmd += f" --hostname {container_name}"
    for mapping in volumes:
        singularity_cmd += f" -B {mapping}"
    singularity_cmd += f" {image_location} "
    singularity_cmd += f" bash -c '{command}'"
    os.system( singularity_cmd )
    logging.info( f"Running command - {singularity_cmd}" )

def executeCommand(options, software, version,  singularity_sif_location, mem, cmd):
    
    #Executes the command within the requested framework
    
    if options.framework == "docker":
        runDockerCommand( logging,
                          name = software,
                            version = version,
                            image_location = f"ghcr.io/sagnikbanerjee15/dockerized_tools_and_pipelines/{software}",
                            container_name = f"{options.temp_directory.split('/')[-1]}",
                            volumes = 
                            command = cmd,
                            cpus = options.cpu,
                            memory = mem
            )
    elif options.framework == "singularity":
        runSingularityCommand( logging,
                            name = software,
                            version = version,
                            image_location = singularity_sif_location,
                            container_name = f"{options.temp_directory.split('/')[-1]}",
                            volumes = 
                            command = cmd,
                            cpus = options.cpu,
                            memory = mem
                            )
"""
        
def _exitWithError( options, message ):
    """Print a fatal validation message, log it unless --quiet, and exit with a non-zero status."""
    print( message )
    if not options.quiet:
        logging.info( message )
    sys.exit( 1 )


def _linkIntoDirectory( source, link_path ):
    """Create (or refresh) a symlink at link_path that points to the absolute path of source."""
    target = os.path.abspath( source )
    if target == link_path:
        # The caller handed us the link itself; re-linking would make it point to itself
        return
    if os.path.islink( link_path ):
        # A link left by an earlier run may point at a different input; replace it
        os.remove( link_path )
    try:
        os.symlink( target, link_path )
    except FileExistsError:
        # A regular file already has this name. `ln -s` refused to overwrite
        # it as well, so leave it untouched.
        pass


def validateCommandLineArguments( options ):
    """
    Validate the parsed options and prepare the working directory.

    All argument checks run before anything is created on disk. On the first
    violation a message is printed (and logged unless --quiet) and the process
    exits with status 1.

    Side effects on success:
      * options.temp_directory is created. When the user did not supply one, a
        random 10-letter name is generated and options.remove_temp_directory
        is set to True; a user-supplied directory sets it to False, matching
        the --temp_directory help text ("Providing an argument will prevent
        clean up of intermediate files").
      * The reference is symlinked to <temp_directory>/reference.fasta and
        options.reference is set to that path.
      * For --compress / --decompress the input is symlinked to
        <temp_directory>/inputaligned.<extension> and the matching option is
        replaced by the absolute path of that link.
      * For the coverage modes options.temp_directory is switched to a
        timestamped directory next to the compressed input.
      * options.outputs_and_errors_directory is set and created.
      * --ignore_all_quality_scores implies
        options.ignore_quality_scores_for_matched_bases.

    Args:
        options: argparse.Namespace produced by parseCommandLineArguments.
    """
    import string  # imported locally: `string` is not among the module-level imports

    length_of_temp_directory_name = 10
    coverage_requested = bool( options.generate_overlapping_coverage or options.generate_non_overlapping_coverage )

    ######################################################################################
    # Argument checks - nothing is written to disk until all of them pass
    ######################################################################################
    if options.compress and options.inputalignedfilename is None:
        _exitWithError( options, "For compression you need to provide a single alignment file in either SAM or BAM format using --inputalignedfilename" )
    if options.decompress and options.inputabrfilename is None:
        _exitWithError( options, "For decompression you need to provide one abridge compressed file using --inputabrfilename" )
    if coverage_requested and options.inputabrfilename is None:
        _exitWithError( options, "For generating coverage you need to provide one abridge compressed file using --inputabrfilename" )

    if options.inputalignedfilename is not None:
        inputfile = options.inputalignedfilename
    else:
        inputfile = options.inputabrfilename
    if not os.path.exists( inputfile ):
        _exitWithError( options, f"The input file {inputfile} does not exist. Exiting..." )

    # The extension is taken from the file name only, so dots in directory names are ignored
    extension = os.path.basename( inputfile ).split( "." )[-1]
    if options.compress and extension not in ( "sam", "bam" ):
        _exitWithError( options, f"The input file {options.inputalignedfilename} needs to be in either sam or bam format. Exiting..." )
    if options.decompress and extension != "abridge":
        _exitWithError( options, f"The input file {options.inputabrfilename} needs to be in abridge format. Exiting..." )

    if not coverage_requested and True in [options.d, options.bg, options.bga, options.split]:
        _exitWithError( options, "Incorrect arguments. Please provide arguments --d --bg --bga or --split only when you wish to generate coverage" )
    if options.generate_overlapping_coverage and options.generate_non_overlapping_coverage:
        _exitWithError( options, "You can either generate overlapping or non-overlapping coverage. If you need to generate both please run abridge twice each time with either option" )
    if [options.d, options.bg, options.bga].count( True ) > 1:
        _exitWithError( options, "You can specify only one among --d, --bg or --bga" )

    ######################################################################################
    # Working directory
    ######################################################################################
    if options.temp_directory is None:
        random_string = ''.join( random.choices( string.ascii_letters, k = length_of_temp_directory_name ) )
        while os.path.exists( random_string ):
            # Name already taken in the current directory; draw another one
            random_string = ''.join( random.choices( string.ascii_letters, k = length_of_temp_directory_name ) )
        options.temp_directory = random_string
        # abridge created this directory itself, so it may be cleaned up afterwards
        options.remove_temp_directory = True
    else:
        # A user-supplied directory is kept so that its contents can be inspected
        options.remove_temp_directory = False
    os.makedirs( options.temp_directory, exist_ok = True )

    # Set up symlink to reference
    _linkIntoDirectory( options.reference, os.path.abspath( f"{options.temp_directory}/reference.fasta" ) )
    options.reference = f"{options.temp_directory}/reference.fasta"

    # Set up symlink to the input file
    if options.compress:
        abs_path = os.path.abspath( f"{options.temp_directory}/inputaligned.{extension}" )
        _linkIntoDirectory( options.inputalignedfilename, abs_path )
        options.inputalignedfilename = abs_path
    if options.decompress:
        abs_path = os.path.abspath( f"{options.temp_directory}/inputaligned.{extension}" )
        _linkIntoDirectory( options.inputabrfilename, abs_path )
        options.inputabrfilename = abs_path

    if coverage_requested:
        # Work next to the compressed file; "." stands in for an input given
        # without a directory part, so the directory never lands at the filesystem root
        options.temp_directory = os.path.join( os.path.dirname( options.inputabrfilename ) or ".", str( int( time.time() ) ) )
        os.makedirs( options.temp_directory, exist_ok = True )

    options.outputs_and_errors_directory = f"{options.temp_directory}/output_and_errors"
    os.makedirs( options.outputs_and_errors_directory, exist_ok = True )

    if options.ignore_all_quality_scores:
        options.ignore_quality_scores_for_matched_bases = True

def cleanUp( options ):
    """
    Delete the reference files kept inside the temporary directory.

    options.reference is first re-pointed at <temp_directory>/<reference file
    name>; every path that starts with that prefix is then removed, whether it
    is a file, a symlink or a directory. Removal is best effort: paths that
    cannot be deleted are skipped silently, as `rm -rf` invoked through
    os.system did before.

    Args:
        options: namespace providing `reference` and `temp_directory`.
    """
    import shutil  # imported locally: the module level only brings in shutil.which

    # Remove the reference files
    reference_filename_without_location = options.reference.split( "/" )[-1]
    options.reference = f"{options.temp_directory}/{reference_filename_without_location}"
    # glob.escape keeps wildcard characters in the directory name literal
    for path in glob.glob( glob.escape( options.reference ) + "*" ):
        if os.path.isdir( path ) and not os.path.islink( path ):
            shutil.rmtree( path, ignore_errors = True )
        else:
            try:
                os.remove( path )
            except OSError:
                pass

    # NOTE(review): options.files_for_removal is not processed here. The previous
    # implementation returned before reaching its removal loop, so that behaviour
    # is preserved - TODO confirm whether the loop should be re-enabled.
    return

def runCommand( eachpinput ):
    """
    Execute one shell command described by a two-element sequence.

    Args:
        eachpinput: pair whose first element is the command line handed to
            os.system; the second element is unpacked but otherwise unused.
    """
    command_line, _unused = eachpinput
    os.system( command_line )

def constructFileNames( options ):
    """
    Assemble the file paths used by each external program.

    Entries are only produced when options.compress is set; otherwise an empty
    dictionary is returned. Intermediate files live under
    options.temp_directory, and every program gets an 'output' and an 'error'
    file under options.outputs_and_errors_directory.

    Args:
        options: namespace providing compress, temp_directory,
            outputs_and_errors_directory, inputalignedfilename, reference and
            outputfilename.

    Returns:
        dict mapping a program name to a dict of {role: path}.
    """
    programs_to_files = {}

    # Compression
    if options.compress == True:
        work_dir = options.temp_directory
        log_dir = options.outputs_and_errors_directory

        def capturedStreams( program ):
            # Files that receive the stdout and stderr of one program
            return {
                'output': f"{log_dir}/{program}.output",
                'error': f"{log_dir}/{program}.error",
            }

        # compute_information_for_better_memory_management
        programs_to_files['compute_information_for_better_memory_management'] = {
            'summary_information_outputfilename': f"{work_dir}/summary_information_about_aligned_reads",
            'input_alignment_filename': options.inputalignedfilename,
            **capturedStreams( 'compute_information_for_better_memory_management' ),
        }

        # compress_alignment_file - only the base name of the input is needed here
        _, _, _, aligned_base, _ = seggregateInformationFromFilename( options.inputalignedfilename )
        compress_entry = {
            'input_alignment_filename': options.inputalignedfilename,
            'output_abridge_filename': f"{work_dir}/{aligned_base}_icigars",
            'reference_filename': options.reference,
            'unmapped_filename': f"{work_dir}/{aligned_base}_unmapped",
            'name_of_file_with_quality_scores': f"{work_dir}/{aligned_base}_quality_scores",
            'name_of_file_with_read_names_to_short_read_names_and_NH': f"{work_dir}/sorted_read_names_with_NH_and_short_read_names_sorted_by_pos",
            'samformatflag_dictionary_filename': f"{work_dir}/samformatflag_dictionary_filename",
            **capturedStreams( 'compress_alignment_file' ),
        }
        programs_to_files['compress_alignment_file'] = compress_entry

        # SPRING reads the quality-score and unmapped files written by compress_alignment_file
        spring_inputs = (
            ( 'spring_compress_aligned_reads', compress_entry['name_of_file_with_quality_scores'] ),
            ( 'spring_compress_unaligned_reads', compress_entry['unmapped_filename'] ),
        )
        for program, spring_input in spring_inputs:
            programs_to_files[program] = {
                'input_filename': spring_input,
                'output_filename': f"{spring_input}.spring",
                **capturedStreams( program ),
            }

        # fclqc
        programs_to_files['fclqc_compress'] = {
            'parameter_file': f"{work_dir}/parameter.json",
            'outputfilename': f"{work_dir}/{aligned_base}_quality_scores_fclqc_compressed",
            **capturedStreams( 'fclqc_compress' ),
        }

        # zpaq writes the final archive to the user-requested output file
        programs_to_files['zpaq_compress'] = {
            'compressed_file': f"{options.outputfilename}",
            'reference_sha512_filename': f"{work_dir}/reference_sha512",
            **capturedStreams( 'zpaq_compress' ),
        }

    # Decompress

    return programs_to_files

def compressSamFile( options , tags_present ):
    """
    """
    pool = multiprocessing.Pool( processes = int( options.cpu ) )
    programs_to_files = constructFileNames(options)

    ######################################################################################
    # Find maximum number of reads mapped to a single location
    # Find maximum read length
    # Find total number of alignments
    ######################################################################################

    # for input_filename in options.inputalignedfilename:
    input_filename = options.inputalignedfilename
    if options.quiet == False:
        logging.info( f"Starting compression for {input_filename}" )

    input_alignment_file_format = "BAM" if "bam" in options.inputalignedfilename else "sam"
    cmd = f"(/usr/bin/time --verbose "
    cmd += f" compute_information_for_better_memory_management  "
    cmd += f" --input_alignment_filename {programs_to_files['compute_information_for_better_memory_management']['input_alignment_filename']} "
    cmd += f" --input_alignment_file_format {input_alignment_file_format}"
    cmd += f" --summary_information_outputfilename {programs_to_files['compute_information_for_better_memory_management']['summary_information_outputfilename']}"
    cmd += f")"
    cmd += f"1> {programs_to_files['compute_information_for_better_memory_management']['output']}  "
    cmd += f"2> {programs_to_files['compute_information_for_better_memory_management']['error'] }"
    if options.quiet == False:
        logging.info( f"Running command - {cmd}" )
    
    os.system(cmd)
    
    ######################################################################################
    #  Execute the compress SAM file command for single ended reads
    ######################################################################################
    all_commands = []
    # for input_filename in options.inputalignedfilename:
    input_filename = options.inputalignedfilename
    if options.quiet == False:
        logging.info( f"Starting compression for {input_filename}" )
    #print([row.split(":")[-1].strip() for row in open( programs_to_files['compute_information_for_better_memory_management']['summary_information_outputfilename'], "r" ).read().split("\n")])
    max_input_reads_in_a_single_nucl_loc, total_number_of_alignments, max_read_length = [int(row.split(":")[-1].strip()) for row in open( programs_to_files['compute_information_for_better_memory_management']['summary_information_outputfilename'], "r" ).read().split("\n")[:-2]]
    samformatflags = open( programs_to_files['compute_information_for_better_memory_management']['summary_information_outputfilename'], "r" ).read().split("\n")[-2].split()[-1].split(",")
    # N ommitted since it represents splices, D omitted since it represents Deletion
    all_possible_alphabets_for_samformatflag = ['A', 'B', 'C', 'E', 'F', 'G', 'I',  'J', 'K', 'L', 'M', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    fhw = open(programs_to_files['compress_alignment_file']['samformatflag_dictionary_filename'], "w")
    for i in range(len(samformatflags)):
        fhw.write(samformatflags[i] + "\t" + all_possible_alphabets_for_samformatflag[i]+"\n")
    fhw.close()

    cmd = f"(/usr/bin/time --verbose "
    cmd += f" compress_alignment_file "
    cmd += f" --input_alignment_filename {programs_to_files['compress_alignment_file']['input_alignment_filename'] }"
    if input_filename[-4:] == ".sam":
        cmd += f" --input_alignment_file_format SAM"
    elif input_filename[-4:] == ".bam":
        cmd += f" --input_alignment_file_format BAM"
    cmd += f" --output_abridge_filename {programs_to_files['compress_alignment_file']['output_abridge_filename'] } "
    cmd += f" --reference_filename {programs_to_files['compress_alignment_file']['reference_filename'] }"   
    cmd += f" --unmapped_filename {programs_to_files['compress_alignment_file']['unmapped_filename'] } "
    cmd += f" --max_read_length {max_read_length}"
    cmd += f" --name_of_file_with_quality_scores {programs_to_files['compress_alignment_file']['name_of_file_with_quality_scores'] } "
    cmd += f" --name_of_file_with_read_names_to_short_read_names_and_NH {programs_to_files['compress_alignment_file']['name_of_file_with_read_names_to_short_read_names_and_NH'] } "
    cmd += f" --samformatflag_dictionary_filename {programs_to_files['compress_alignment_file']['samformatflag_dictionary_filename']}"
    if options.single_ended==True:
        cmd += f" --ended SE "
    else:
        cmd += f" --ended PE "
    cmd += f" --AS_tag_presence {tags_present['AS']}"

    if options.ignore_soft_clippings == True:
        cmd += f" --flag_ignore_soft_clippings"
    if options.ignore_mismatches == True:  
        cmd += f" --flag_ignore_mismatches "
    if options.ignore_all_quality_scores == True:
        cmd += f" --flag_ignore_all_quality_scores "
    if options.ignore_unmapped_reads == True:
        cmd += f" --flag_ignore_unmapped_sequences "
    if options.run_diagnostics == True:
        cmd += f" --run_diagnostics "
    if options.ignore_quality_scores_for_matched_bases == True:
        cmd += f" --flag_ignore_quality_scores_for_matched_bases "
    if options.ignore_alignment_scores == True:
        cmd += f" --flag_ignore_alignment_scores "
    if options.skip_shortening_read_names == True:
        cmd += f" --skip_shortening_read_names"
    cmd += f" --max_reads_in_a_single_nucl_loc {max_input_reads_in_a_single_nucl_loc}"
    cmd += f" --spring_or_fclqc {options.spring_or_fclqc}"    

    cmd += f") "
    cmd += f" 1> {programs_to_files['compress_alignment_file']['output'] }"
    cmd += f" 2> {programs_to_files['compress_alignment_file']['error'] }"
    if options.quiet == False:
        logging.info( f"Running command - {cmd}" )
    
    os.system(cmd)
  
    ######################################################################################
    # Compress Quality scores file using fclqc
    ######################################################################################
    number_of_quality_scores = total_number_of_alignments + 100

    if options.spring_or_fclqc == "FCLQC":
        # Create the parameter file for fclqc
        fhw = open( f"{options.temp_directory}/parameter.json", "w" )
        fhw.write( "{\n" )
        fhw.write( f"\t\"precision\": 52,\n" )
        fhw.write( f"\t\"file_size\": {number_of_quality_scores},\n" )
        fhw.write( f"\t\"thread_num\": {options.cpu},\n" )
        fhw.write( f"\t\"first_line\": 1,\n" )
        fhw.write( f"\t\"last_line\": {number_of_quality_scores*4}\n" )
        fhw.write( "}\n" )
        fhw.close()

        if options.quiet == False:
            logging.info( "Starting compression with fclqc" )
        cmd = f"(/usr/bin/time --verbose "
        cmd += f" main "
        cmd += f" -c {programs_to_files['compress_alignment_file']['name_of_file_with_quality_scores'] } "
        cmd += f" {programs_to_files['fclqc_compress']['outputfilename']}"
        cmd += f" {options.temp_directory}/parameter.json"
        cmd += f") "
        cmd += f" 1> {programs_to_files['fclqc_compress']['output']} "
        cmd += f" 2> {programs_to_files['fclqc_compress']['error']}"
        if options.quiet == False:
            logging.info( f"Running command - {cmd}" )
        os.system(cmd)
    elif options.spring_or_fclqc == "SPRING":
        # Compress with SPRING
        cmd = f"(/usr/bin/time --verbose "
        cmd += " spring "
        cmd += f" --compress "
        cmd += f" --num-threads {options.cpu} "
        cmd += f" --no-ids "
        cmd += f" --input-file {programs_to_files['spring_compress_aligned_reads']['input_filename']}"
        cmd += f" --output-file {programs_to_files['spring_compress_aligned_reads']['output_filename']}"
        cmd += f") "
        cmd += f" 1> {programs_to_files['spring_compress_aligned_reads']['output']} "
        cmd += f" 2> {programs_to_files['spring_compress_aligned_reads']['error']}"
        if options.quiet == False:
            logging.info( f"Running command - {cmd}" )
        os.system(cmd) 

        cmd = f"(/usr/bin/time --verbose "
        cmd += " spring "
        cmd += f" --compress "
        cmd += f" --num-threads {options.cpu} "
        cmd += f" --no-ids "
        cmd += f" --allow-read-reordering "
        cmd += f" --input-file {programs_to_files['spring_compress_unaligned_reads']['input_filename']}"
        cmd += f" --output-file {programs_to_files['spring_compress_unaligned_reads']['output_filename']}"
        cmd += f") "
        cmd += f" 1> {programs_to_files['spring_compress_unaligned_reads']['output']} "
        cmd += f" 2> {programs_to_files['spring_compress_unaligned_reads']['error']}"
        if options.quiet == False:
            logging.info( f"Running command - {cmd}" )
        os.system(cmd)
    
    ######################################################################################
    # Generate SHA512 for reference file
    ######################################################################################
    reference_sha512_filename = f"{options.temp_directory}/reference_sha512"
    cmd = f"sha512sum {options.reference} > {programs_to_files['zpaq_compress']['reference_sha512_filename']}.temp"
    os.system( cmd )
    open( reference_sha512_filename, "w" ).write( open( f"{programs_to_files['zpaq_compress']['reference_sha512_filename']}.temp", "r" ).read().strip().split()[0] )
    os.system( f"rm {programs_to_files['zpaq_compress']['reference_sha512_filename']}.temp" )

    ######################################################################################
    # Compress files
    ######################################################################################
    if options.quiet == False:
        logging.info( "Starting compression with zpaq" )

    if options.spring_or_fclqc == "FCLQC":
        cmd = f"(/usr/bin/time --verbose "
        cmd += f" zpaq a "
        cmd += f" {programs_to_files['zpaq_compress']['compressed_file']} "  # Final name of the compressed file ending in .abridge
        cmd += f" {programs_to_files['compress_alignment_file']['output_abridge_filename']} "  # The non redundant version of the sam file
        cmd += f" {programs_to_files['compress_alignment_file']['unmapped_filename']} "  # Set of unmapped reads
        cmd += f" {programs_to_files['zpaq_compress']['reference_sha512_filename']} "  # The 128 bit hash of the reference
        cmd += f" {programs_to_files['fclqc_compress']['outputfilename']}* "  # FCLQC compressed quality scores file
        #cmd += f" {programs_to_files['fclqc_compress']['outputfilename']}.prob "  # FCLQC compressed quality probability file
        cmd += f" -m3 -t{options.cpu} -f "
        cmd += f" -noattributes "
        cmd += f") "
        cmd += f"1> {programs_to_files['zpaq_compress']['output']} "
        cmd += f"2> {programs_to_files['zpaq_compress']['error']} "
        if options.quiet == False:
            logging.info( f"Running command - {cmd}" )
        os.system(cmd)
    elif options.spring_or_fclqc == "SPRING": 
        cmd = f"(/usr/bin/time --verbose "
        cmd += f" zpaq a "
        cmd += f" {programs_to_files['zpaq_compress']['compressed_file']} "  # Final name of the compressed file ending in .abridge
        cmd += f" {programs_to_files['compress_alignment_file']['output_abridge_filename']} "  # The non redundant version of the sam file
        cmd += f" {programs_to_files['zpaq_compress']['reference_sha512_filename']} "  # The 128 bit hash of the reference
        cmd += f" {programs_to_files['spring_compress_aligned_reads']['output_filename']} "  # SPRING compressed file for aligned reads
        cmd += f" {programs_to_files['spring_compress_unaligned_reads']['output_filename']} "  # SPRING compressed file for unaligned reads
        cmd += f" -m3 -t{options.cpu} -f "
        cmd += f" -noattributes "
        cmd += f") "
        cmd += f"1> {programs_to_files['zpaq_compress']['output']} "
        cmd += f"2> {programs_to_files['zpaq_compress']['error']} "
        if options.quiet == False:
            logging.info( f"Running command - {cmd}" )
        os.system(cmd) 

def findEndedness( options, logging ):
    """
    Decide whether the input alignments are single-ended or paired-ended.

    Only the first alignment record (the first line that does not start with '@')
    of options.inputalignedfilename is inspected. The lowest bit of its FLAG column
    (second whitespace separated field) is taken as the paired-ended marker.
    Because a single record is inspected, a mixture of single and paired ended
    alignments further down the file is not detected here.

    The result is stored in options.single_ended:
    - True  when the lowest FLAG bit is 0
    - False when the lowest FLAG bit is 1

    If the file holds no alignment record, a message is printed (and logged unless
    options.quiet is set) and the program exits with status 1.
    """
    endedness = []
    # The context manager guarantees that the handle is released even if parsing fails
    with open( f"{options.inputalignedfilename}", "r" ) as fhr:
        for line in fhr:
            if line[0] != '@':
                samformatflag = int( line.strip().split()[1] )
                endedness.append( samformatflag % 2 )
                break

    if not endedness:
        # Nothing to decide on - stop here instead of failing later on endedness[0]
        message = f"No alignment records found in {options.inputalignedfilename}. Exiting..."
        print( message )
        if options.quiet == False:
            logging.info( message )
        sys.exit( 1 )

    options.single_ended = endedness[0] == 0

def collectReferenceSequenceNameAndLength( options, logging ):
    """
    Collect the name and length of every reference sequence listed in the header file.

    Every line starting with "@SQ" is split on tabs; the value of the "SN:" field is
    used as the reference name and the value of the "LN:" field as its length.
    Additional fields on the line are ignored and the two fields may appear in any
    order. Names are kept verbatim, so names containing ':' are not truncated.

    The mapping {reference name: length} is stored in options.reference_to_length.

    Raises ValueError if an @SQ line lacks the SN or LN field, or if LN is not an integer.
    """
    reference_to_length = {}
    # NOTE(review): the header path is built from options.inputalignedfilename[0][:-3].
    # That only yields "<input minus 3 characters>header" when inputalignedfilename is a
    # list; if it is a plain string the expression collapses to "header" - confirm
    # against the caller.
    with open( f"{options.inputalignedfilename[0][:-3]}header", "r" ) as fhr:
        for line in fhr:
            if line[:3] != "@SQ":
                continue
            reference_name = None
            reference_length = None
            for field in line.strip().split( "\t" )[1:]:
                if field.startswith( "SN:" ):
                    reference_name = field[3:]
                elif field.startswith( "LN:" ):
                    reference_length = int( field[3:] )
            if reference_name is None or reference_length is None:
                raise ValueError( f"Malformed @SQ header line: {line.strip()}" )
            reference_to_length[reference_name] = reference_length
    options.reference_to_length = reference_to_length

def findChromosomes( filename ):
    """
    Return the distinct values found in the first column of a file.

    The first line of the file is skipped. For every other line the first
    whitespace separated token is collected. Duplicates are removed, hence the
    order of the returned list is not meaningful.
    """
    with open( filename, "r" ) as handle:
        # Discard the first line; the default keeps an empty file from raising
        next( handle, None )
        unique_names = { record.split()[0] for record in handle }
    return list( unique_names )

def decompressSamFile( options  ):
    """
    Restore a SAM file from the archive given in options.inputabrfilename.

    The work is delegated to external programs launched through the shell:
    - `samtools faidx` is run on options.reference
    - `zpaq l` lists the archive; the directory recorded on the 4th line of that
      listing is used to decide what to extract and where the extracted files end up
    - `zpaq x` extracts into options.temp_directory and the extracted files are
      moved to the top level of that directory
    - a parameter.json file is written for fclqc. When a *fclqc001.enc file was
      extracted and no quality scores file exists yet, `main -d` is run and its
      output is renamed to the quality scores file. When no *fclqc001.enc file is
      present an empty quality scores file is created instead
    - the seven "name:value" flags on the first line of the compressed alignment
      file are parsed
    - `maxReadsInEachLine` is run, followed by `decompressSamFilePairedEnded` when
      the dictionary file exists and `decompressSamFileSingleEnded` otherwise

    The decompressed alignments are written to
    {options.temp_directory}/<archive name up to ".abridge">_decompressed.sam

    No worker pool is created here: every step is a blocking shell command.
    """
    input_filename = options.inputabrfilename[:-8] + ".sam"
    delete_these_files, other_files, error_files = constructFileNames( input_filename, options )
    name_of_max_input_reads_file, name_of_file_with_max_commas, name_of_file_max_read_length, name_of_total_number_of_alignments_file, frequency_of_flags_filename = delete_these_files
    outputfilename, index_outputfilename, unmapped_outputfilename, name_of_file_with_quality_scores, name_of_file_with_quality_scores_rle, name_of_file_dictionary, fclqc_output_filename = other_files

    # The names below replace the ones returned by constructFileNames()
    outputfilename = f"{options.temp_directory}/inputalignedfilename_compressed"
    unmapped_outputfilename = f"{options.temp_directory}/inputalignedfilename_unmapped"
    name_of_file_with_quality_scores = f"{options.temp_directory}/inputalignedfilename_qual"

    cmd = f"samtools faidx {options.reference}"
    os.system(cmd)

    cmd = f"zpaq l {options.inputabrfilename} > {options.temp_directory}/file_list  2> {options.outputs_and_errors_directory}/zpaq_file_list.error"
    os.system(cmd)

    # The 4th line of the listing ends with the path of an archived file; its parent
    # directory is the directory the archive was created from.
    # NOTE(review): a listing with fewer than 4 lines leaves compression_directory
    # unset and the next statement fails - confirm whether that needs a clearer error.
    with open( f"{options.temp_directory}/file_list", "r" ) as fhr:
        for line_num, line in enumerate( fhr, start = 1 ):
            if line_num == 4:
                compression_directory = "/".join( line.strip().split()[-1].split( "/" )[:-1] )
                break

    cmd = f"zpaq x {options.inputabrfilename} {'/'.join(compression_directory.split('/')[:-1])} -to {options.temp_directory} -t{options.cpu} 2> {options.outputs_and_errors_directory}/zpaq_decompress.error"
    os.system(cmd)

    # Move files
    cmd = f"mv {options.temp_directory}/{compression_directory.split('/')[-1]}/* {options.temp_directory}"
    os.system(cmd)

    # Create the parameter file for fclqc
    with open( f"{options.temp_directory}/parameter.json", "w" ) as fhw:
        fhw.write( "{\n" )
        fhw.write( f"\t\"precision\": 52,\n" )
        fhw.write( f"\t\"file_size\": 1,\n" )
        fhw.write( f"\t\"thread_num\": {options.cpu},\n" )
        fhw.write( f"\t\"first_line\": 1,\n" )
        fhw.write( f"\t\"last_line\": 1\n" )
        fhw.write( "}\n" )

    if len( glob.glob( f"{options.temp_directory}/*fclqc001.enc" ) ) > 0:
        enc_file_to_be_decompressed = glob.glob( f"{options.temp_directory}/*fclqc001.enc" )[0]
    else:
        # No encoded quality scores were archived - provide an empty quality scores file
        os.system( f"touch {name_of_file_with_quality_scores}" )

    if os.path.exists( f"{name_of_file_with_quality_scores}" ) == False and len( glob.glob( f"{options.temp_directory}/*fclqc001.enc" ) ) > 0:
        cmd = f"(/usr/bin/time --verbose "
        cmd += f" main "
        cmd += f" -d "
        cmd += f" {enc_file_to_be_decompressed.split('001')[0]} "
        cmd += f" {fclqc_output_filename}_decompressed "
        cmd += f" {options.temp_directory}/parameter.json "
        cmd += f")"
        cmd += f"1> /dev/null "
        cmd += f"2> {options.outputs_and_errors_directory}/fclqc_decompress.error"

        os.system(cmd)

    cmd = f"mv {fclqc_output_filename}_decompressed001.dec {name_of_file_with_quality_scores}"

    if os.path.exists( f"{name_of_file_with_quality_scores}" ) == False and len( glob.glob( f"{options.temp_directory}/*fclqc001.enc" ) ) > 0:
        os.system(cmd)

    # The first line of the compressed file carries seven "name:value" flags.
    # They are parsed (which also checks that exactly seven are present) but are
    # not used further in this function.
    with open( outputfilename, "rb" ) as fhr:
        all_flags = fhr.readline().split()
    flag_ignore_mismatches, flag_ignore_soft_clippings, flag_ignore_unmapped_sequences, flag_ignore_all_quality_scores, flag_ignore_quality_scores_for_matched_bases, flag_ignore_alignment_scores, skip_shortening_read_names = [
        int( flag.decode( 'ascii' ).split( ":" )[-1] ) for flag in all_flags
    ]

    name_of_input_file_without_location = options.inputabrfilename.split( "/" )[-1].split( ".abridge" )[0]
    max_reads_in_each_line_filename = f"{options.temp_directory}/max_reads_in_each_line_filename"
    output_sam_filename = f"{options.temp_directory}/{name_of_input_file_without_location}_decompressed.sam"

    cmd = f" maxReadsInEachLine "
    cmd += f" {outputfilename} "
    cmd += f" {max_reads_in_each_line_filename} "

    os.system(cmd)

    with open( max_reads_in_each_line_filename, "r" ) as fhr:
        max_reads_in_each_line = fhr.readline().strip()

    cmd = f"(/usr/bin/time --verbose "
    # The presence of the dictionary file selects the paired ended decompressor
    if os.path.exists( name_of_file_dictionary ) == False:
        cmd += f" decompressSamFileSingleEnded "  # argv[0] - Name of the program
    else:
        cmd += f" decompressSamFilePairedEnded "  # argv[0] - Name of the program
    cmd += f" --reference {options.reference} "  # argv[1] - reference file
    cmd += f" --outputfilename {output_sam_filename} "  # argv[2] - decompressed SAM filename
    cmd += f" --compressedfile {outputfilename} "  # argv[3] - Name of the  filename
    cmd += f" --mockquality {options.quality} "  # argv[4] - Quality of reads
    if options.ignore_sequence:
        cmd += f" --ignoresequence"  # argv[5] - Whether or not to produce sequences from reference file
    cmd += f" --unmappedreadsfilename {unmapped_outputfilename}"  # argv[6] - Name of file with unmapped reads
    cmd += f" --qualityscoresfilename {name_of_file_with_quality_scores} "  # argv[7] - Name of file with quality scores
    if os.path.exists( name_of_file_dictionary ) == True:
        cmd += f" --dictionaryfilename {name_of_file_dictionary} "  # argv[8]

    cmd += f" --maxreadsineachline {max_reads_in_each_line} "  # argv[9]
    cmd += f") "
    if options.keep_intermediate_error_files == True:
        cmd += f"2> {options.outputs_and_errors_directory}/{name_of_input_file_without_location}_decompress.error"
    else:
        cmd += f"2> /dev/null"

    os.system(cmd)

def generateCoverage( _outputfilename, _index_outputfilename, options ):
    """
    Launch the external coverage generator on a decompressed abridge file.

    The program paths are taken from options.softwares. When options.compile_programs
    is set, both programs are first built with gcc.

    The boolean switches d, bg, bga, split, generate_overlapping_coverage and
    generate_non_overlapping_coverage of `options` are overwritten with 1 or 0,
    because they are handed to the external program as positional arguments.

    The existence of {temp_directory}/<input name>.sam.dictionary decides the
    `single` argument (0 when it exists, 1 otherwise). When it exists, the
    dictionary file name and the first line printed by
    findMaximumNumberOfReadsInEachLine are appended as further arguments.
    """
    coverage_program = options.softwares["generateCoverage"]
    max_reads_program = options.softwares["findMaximumNumberOfReadsInEachLine"]

    ######################################################################################
    # Compiling programs - will be removed during final version
    ######################################################################################
    if options.compile_programs == True:
        for program in ( coverage_program, max_reads_program ):
            os.system( f"gcc {program}.c -o {program} -g -Ofast" )

    # Turn every switch into 1/0 in place
    for switch_name in ( "d", "bg", "bga", "split", "generate_overlapping_coverage", "generate_non_overlapping_coverage" ):
        setattr( options, switch_name, 1 if getattr( options, switch_name ) == True else 0 )

    sample_name = options.inputabrfilename.split( "/" )[-1].split( ".abridge" )[0]
    dictionary_name = f"{options.temp_directory}/{sample_name}.sam.dictionary"
    dictionary_found = os.path.exists( dictionary_name )
    single = 0 if dictionary_found else 1

    max_reads_in_each_line_filename = f"{options.temp_directory}/{sample_name}.max_read_in_each_line"

    pieces = [
        "(/usr/bin/time --verbose ",
        f" {coverage_program} ",
        f" {_outputfilename} ",  # argv[1]
        f" {_index_outputfilename} ",  # argv[2]
        f" {options.d} ",  # argv[3]
        f" {options.bg} ",  # argv[4]
        f" {options.bga} ",  # argv[5]
        f" {options.split} ",  # argv[6]
        f" {options.generate_overlapping_coverage} ",  # argv[7]
        f" {options.generate_non_overlapping_coverage} ",  # argv[8]
        f" {single} ",  # argv[9]
    ]
    if dictionary_found:
        pieces.append( f" {dictionary_name} " )  # argv[10]
        helper_cmd = "".join( [
            f" {max_reads_program} ",
            f" {_outputfilename} ",
            f" 1> {max_reads_in_each_line_filename} ",
            " 2> /dev/null ",
        ] )
        if options.quiet == False:
            logging.info( f"Running command - {helper_cmd}" )
        os.system( helper_cmd )
        with open( max_reads_in_each_line_filename, "r" ) as handle:
            max_reads_in_each_line = handle.readline().strip()
        pieces.append( f" {max_reads_in_each_line} " )  # argv[11]
    pieces.append( ") " )

    # Standard error is either kept in a file named after the switch values or dropped
    if options.keep_intermediate_error_files == True:
        pieces.append( f" 2> {options.outputs_and_errors_directory}/{sample_name}_coverage_generation_{options.d}_{options.bg}_{options.bga}_{options.split}_{options.generate_overlapping_coverage}_{options.generate_non_overlapping_coverage}.error" )
    else:
        pieces.append( "2> /dev/null" )

    cmd = "".join( pieces )
    if options.quiet == False:
        logging.info( f"Running command - {cmd}" )
    os.system( cmd )

def retrieveAlignmentsRandomly( chromosome, start, end, options ):
    """
    DEPRECATED - WILL BE REMOVED

    Runs two external programs through the shell:
    1. deCompressQualityScoresFile turns <quality scores file>.rle into the
       quality scores file
    2. randomRetrievalPairedEnded (when the .sam.dictionary file exists in the
       temporary directory) or randomRetrievalSingleEnded (otherwise) is called
       with the chromosome, start and end that were passed in

    The second program is given the output name
    {temp_directory}/<input name>_<options.positions with ':' replaced by '_'>.sam
    All program paths come from options.softwares; when options.compile_programs
    is set they are built with gcc first.
    """
    single_ended_program = options.softwares["randomRetrievalSingleEnded"]
    paired_ended_program = options.softwares["randomRetrievalPairedEnded"]
    quality_decoder = options.softwares["deCompressQualityScoresFile"]

    ######################################################################################
    # Compiling programs - will be removed during final version
    ######################################################################################
    if options.compile_programs == True:
        build_plan = (
            ( single_ended_program, "-g -Ofast" ),
            ( paired_ended_program, "-g -Ofast" ),
            ( quality_decoder, "-Ofast" ),
        )
        for program, gcc_flags in build_plan:
            os.system( f"gcc {program}.c -o {program} {gcc_flags}" )

    # Every intermediate file lives in the temporary directory and starts with the
    # name of the input file stripped of its last 8 characters
    sample_name = options.inputabrfilename.split( "/" )[-1][:-8]
    prefix = f"{options.temp_directory}/{sample_name}"
    concise_path = f"{prefix}.sam_concise"
    concise_index_path = f"{prefix}.sam_concise_index"
    quality_path = f"{prefix}.sam_concisequal"
    region_sam_path = f"{prefix}_" + options.positions.replace( ":", "_" ) + ".sam"
    dictionary_name = f"{prefix}.sam.dictionary"

    decode_parts = [
        "(/usr/bin/time --verbose ",
        f" {quality_decoder} ",
        f" {quality_path}.rle ",
        f" {quality_path} ",
        ") ",
        "1> /dev/null ",
    ]
    if options.keep_intermediate_error_files == True:
        decode_parts.append( f"2> {options.outputs_and_errors_directory}/{sample_name}_RLE_decompress.error" )
    else:
        decode_parts.append( "2> /dev/null" )
    os.system( "".join( decode_parts ) )

    # The dictionary file is looked up only after the quality scores were decoded
    paired = os.path.exists( dictionary_name )
    retrieval_parts = [
        f" {paired_ended_program} " if paired else f" {single_ended_program} ",  # argv[0]
        f" {concise_index_path} ",  # argv[1]
        f" {options.reference} ",  # argv[2]
        f" {options.reference}.fai ",  # argv[3]
        f" {chromosome} ",  # argv[4]
        f" {concise_path}  ",  # argv[5]
        f" {start} ",  # argv[6]
        f" {end} ",  # argv[7]
        f" {region_sam_path}",  # argv[8]
        f" {options.quality}",  # argv[9]
        f" {options.ignore_sequence}",  # argv[10]
        f" {options.read_prefix}" if options.read_prefix is not None else " \"\" ",  # argv[11]
        f" {quality_path}",  # argv[12]
    ]
    if paired:
        retrieval_parts.append( f" {dictionary_name} " )  # argv[13]
    os.system( "".join( retrieval_parts ) )

def createPositionSortedSAMInputFileForCompressing( options,  remove_these_files,  logging ):
    """
    Make sure the alignment file in options.inputalignedfilename is coordinate sorted.

    The extension of the file decides what happens:
    - "bam": the first lines are dumped with samtools into {temp_directory}/header
      (that path is appended to remove_these_files). Unless the first line contains
      "SO:coordinate", the file is sorted with samtools, the original is unlinked
      and the sorted file is moved to the original name.
    - "sam": unless the first line contains "SO:coordinate", the file is sorted via
      samtools into the temporary directory, the original is unlinked and
      options.inputalignedfilename is pointed to {temp_directory}/inputaligned.sam
    Any other extension leaves everything untouched.

    Each shell command is logged before it is executed.
    """
    def log_and_run( command ):
        # Record the command in the log, then hand it over to the shell
        logging.info(f"Running command - {command}")
        os.system(command)

    extension = seggregateInformationFromFilename(options.inputalignedfilename)[-1]

    if extension == "bam":
        header_path = f"{options.temp_directory}/header"
        dump_header = f"samtools view -h -@ {options.cpu} {options.inputalignedfilename}|head > {options.temp_directory}/header"
        remove_these_files.append( header_path )
        log_and_run( dump_header )

        with open( header_path, "r" ) as handle:
            first_line = handle.readline().strip()
        if "SO:coordinate" in first_line:
            return

        # Need to sort by coordinate
        log_and_run( f"samtools sort -@ {options.cpu} -o {options.inputalignedfilename}.sorted {options.inputalignedfilename}  " )
        log_and_run( f"unlink {options.inputalignedfilename}" )
        log_and_run( f"mv {options.inputalignedfilename}.sorted {options.inputalignedfilename}" )

    elif extension == "sam":
        with open( options.inputalignedfilename, "r" ) as handle:
            first_line = handle.readline().strip()
        if "SO:coordinate" in first_line:
            return

        # Sort Samfile by coordinate
        log_and_run( f"samtools view -bS -@ {options.cpu} {options.inputalignedfilename}|samtools sort -@ {options.cpu}|samtools view -h -@ {options.cpu} > {options.temp_directory}/inputaligned.sam.temp " )
        log_and_run( f"unlink {options.inputalignedfilename}" )

        # From here on the sorted copy in the temporary directory is the input file
        rename_sorted = f"mv {options.temp_directory}/inputaligned.sam.temp {options.temp_directory}/inputaligned.sam"
        options.inputalignedfilename = f"{options.temp_directory}/inputaligned.sam"
        log_and_run( rename_sorted )
            

def returnTagList(options, samfilename, tagfilename):
    """
    Returns a dictionary of tags.
    For example:
    {"NH": 1 #NH tag is present
    "MD": 0 #MD tag is absent
    }
    Assume that the same tag is present for all the alignments if it is present for one

    Only the first alignment record of samfilename (first line not starting with '@')
    is inspected, and only its optional fields (tab separated columns 12 onwards) are
    searched. Read names, sequences and quality strings can therefore never be
    mistaken for a tag. The tags looked for are NH:i:, MD:Z: and AS:i:

    A file without any alignment record reports every tag as absent.

    options and tagfilename are accepted for compatibility with callers; they are not used.
    """
    tag_presence = {"NH":0, "MD":0, "AS": 0}
    tag_prefixes = {"NH": "NH:i:", "MD": "MD:Z:", "AS": "AS:i:"}

    first_alignment = None
    with open(samfilename, "r") as fhr:
        for line in fhr:
            if line[0] != '@':
                first_alignment = line
                break

    # Empty file or header only - there is no record to inspect
    if first_alignment is None:
        return tag_presence

    # The first 11 columns are the mandatory SAM fields; tags start after them
    optional_fields = first_alignment.rstrip("\n").split("\t")[11:]
    for tag, prefix in tag_prefixes.items():
        if any(field.startswith(prefix) for field in optional_fields):
            tag_presence[tag] = 1

    return tag_presence

def alterReadNamesAndAddNHTag( options,  logging ):
    """
    Build the table that associates every read with its NH value and a short read name.

    The tags of the input file are looked up first. If the NH tag is present and
    options.skip_shortening_read_names is set, an empty table is created and the
    function returns right away.

    Otherwise five files are produced one after the other in options.temp_directory.
    A step is skipped whenever its output file already exists:
    1. sorted_read_names - long read name, line number
    2. NH_vals - long read name, number of occurrences
    3. sorted_read_names_with_NH - long read name, line number, number of occurrences
    4. sorted_read_names_with_NH_and_short_read_names - same, plus the short read name
    5. sorted_read_names_with_NH_and_short_read_names_sorted_by_pos - step 4 ordered by line number

    Returns the dictionary of tags found in the input file.
    """
    tmp = options.temp_directory

    # Check which tags are already present in the aligned file
    tags_present = returnTagList(options, options.inputalignedfilename, f"{tmp}/temp_file_tags")

    read_names = f"{tmp}/sorted_read_names"
    nh_values = f"{tmp}/NH_vals"
    names_with_nh = f"{tmp}/sorted_read_names_with_NH"
    names_with_short = f"{tmp}/sorted_read_names_with_NH_and_short_read_names"
    names_by_position = f"{tmp}/sorted_read_names_with_NH_and_short_read_names_sorted_by_pos"

    if tags_present["NH"] == 1 and options.skip_shortening_read_names == True:
        # Creating a dummy file
        open(names_by_position,"w").close()
        logging.info("Skipping the generation of short read names")
        return tags_present

    def run_if_missing( target, command ):
        # Log and execute the command only when its output is not there yet
        if not os.path.exists( target ):
            logging.info(f"Running command - {command}")
            os.system(command)

    # Sort read names - brings together multiple alignments of the same read and
    # the mates of paired ended reads
    run_if_missing(
        read_names,
        f"printReadNamesAndLineNumbers -i {options.inputalignedfilename}|sort " + f"> {read_names}",
    )
    logging.info( "Read names sorted" )

    # Count how often each read name occurs. Note that uniquely mapped PE reads
    # will have a value of 2.
    run_if_missing(
        nh_values,
        f"cat {read_names}" + "|cut -f1|uniq -c|awk -vOFS=\"\\t\" '{print $2,$1}'" + f"> {nh_values}",
    )
    logging.info( "NH value has been calculated" )

    # Merge the two files to attach the NH value to every read name.
    # This step runs the command first and logs it afterwards.
    join_cmd = f"join {read_names} {nh_values}" + " |awk -vOFS=\"\\t\" '{print $1,$2,$3}' " + f" > {names_with_nh}"
    if not os.path.exists( names_with_nh ):
        os.system(join_cmd)
        logging.info(f"Running cmd - {join_cmd}")
    logging.info( "Read names have been associated with their corresponding NH values" )

    # Generate short name of reads
    shorten_cmd = "associateShortNamesToReads " + f" --inputfilename {names_with_nh} " + f" --outputfilename {names_with_short} "
    if options.skip_shortening_read_names == True:
        shorten_cmd += " --skipshorteningreadname "
    run_if_missing( names_with_short, shorten_cmd )
    logging.info( "Short read names have been generated" )

    # Order the table by the line number of the long read in the input file.
    # The short names are not written back to the input file; this table is
    # produced as a separate file instead.
    run_if_missing(
        names_by_position,
        f"sort -nk2,2 {names_with_short} " + f" > {names_by_position}",
    )
    logging.info( "Reads sorted according to their actual position" )

    return tags_present

def seggregateInformationFromFilename(filename):
    """
    Break a filename into its parts. Returns a tuple of five strings:

    - The filename exactly as it was passed in
    - The location of the file (everything before the last "/", empty if there is none)
    - The actual filename (everything after the last "/")
    - The basename (actual filename up to its last ".", empty if it has no ".")
    - The file extension (everything after the last "." of the whole filename;
      the whole filename if it has no ".")
    """
    location, _, leaf = filename.rpartition("/")
    stem = leaf.rpartition(".")[0]
    # The extension is cut from the complete filename, not from the leaf
    suffix = filename.rpartition(".")[2]
    return filename, location, leaf, stem, suffix

def _addMissingMDTag( options ):
    """
    Adds MD tags to the alignments in options.inputalignedfilename using samtools.

    Steps (each one is a shell command run through os.system):
      1. Obtain a temporary BAM: a SAM input is converted with `samtools view -b`,
         any other input is moved as-is (presumably it is already BAM - verify
         against createPositionSortedSAMInputFileForCompressing).
      2. `samtools calmd` writes a BAM with MD tags computed against options.reference.
      3. That BAM is written back, as SAM text with header, to options.inputalignedfilename.
      4. Both temporary BAM files are removed.

    options - parsed command line options; inputalignedfilename, reference and cpu are read.
    """
    # Strip the 4 character suffix (".sam" / ".bam") to build the temporary names
    prefix = options.inputalignedfilename[:-4]
    temp_bam = f"{prefix}.temp.bam"
    md_added_bam = f"{prefix}.MD_added.bam"

    # The extension is the LAST three characters of the name. The previous test
    # compared everything except the last three characters with "sam", which is
    # never true for a real path, so SAM inputs were moved instead of converted.
    if options.inputalignedfilename.endswith( "sam" ):
        cmd  = f" samtools "
        cmd += f" view "
        # -b requests BAM output explicitly so the header is always written and
        # the result does not depend on samtools guessing the format from the name
        cmd += f" -b "
        cmd += f" -@ {options.cpu} "
        cmd += f" -o {temp_bam} "
        cmd += f" {options.inputalignedfilename} "
        logging.info(f"Running command - {cmd}")
        os.system(cmd)
    else:
        cmd = f"mv {options.inputalignedfilename} {temp_bam}"
        logging.info(f"Running command - {cmd}")
        os.system(cmd)

    cmd  = f" samtools "
    cmd += f" calmd -bAr "
    cmd += f" -@ {options.cpu} "
    cmd += f" {temp_bam} "
    cmd += f" {options.reference} "
    cmd += f" > {md_added_bam}"
    logging.info(f"Running command - {cmd}")
    os.system(cmd)

    # Write the MD-tagged alignments back under the original input name
    cmd  = f"samtools "
    cmd += f" view "
    cmd += f" -h "
    cmd += f" -@ {options.cpu} "
    cmd += f" {md_added_bam} "
    cmd += f" > {options.inputalignedfilename}"
    logging.info(f"Running command - {cmd}")
    os.system(cmd)

    cmd = f"rm {temp_bam} {md_added_bam} "
    logging.info(f"Running command - {cmd}")
    os.system(cmd)


def main():
    """
    Entry point of abridge.

    Parses and validates the command line, creates the temporary directory,
    configures logging (unless --quiet) and then runs either the compression
    or the decompression workflow depending on the selected options.
    """
    commandLineArg = sys.argv
    if len( commandLineArg ) == 1:
        print( "Please use the --help option to get usage information" )
    options = parseCommandLineArguments()
    validateCommandLineArguments( options )
    #if options.force == True:
    #   os.system( f"rm -rf {options.temp_directory}" )
    os.system( f"mkdir -p {options.temp_directory}" )
    
    if options.logfilename is None:
        options.logfilename = f"{options.temp_directory}/progress.log"

    if options.quiet == False:
        configureLogger( options )
        logging.info( "Logger has been configured" )
        logging.info( "validateCommandLineArguments() execution is complete" )

    remove_these_files = []
    
    # The individual path components are currently only needed by the disabled
    # "single line fasta" step below; the calls are kept so that step can be re-enabled.
    reference_fullpath, reference_onlypath, reference_onlyfilename, reference_onlybasename, reference_onlyextension = seggregateInformationFromFilename(options.reference)
    if options.compress == True:  
        inputalignedfilename_fullpath, inputalignedfilename_onlypath, inputalignedfilename_onlyfilename, inputalignedfilename_onlybasename, inputalignedfilename_onlyextension = seggregateInformationFromFilename(options.inputalignedfilename)
    elif options.decompress == True:
        inputabrfilename_fullpath, inputabrfilename_onlypath, inputabrfilename_onlyfilename, inputabrfilename_onlybasename, inputabrfilename_onlyextension = seggregateInformationFromFilename(options.inputabrfilename)
    
    ################################################################################################################################################################################################################################################
    # Convert to single line fasta file (currently disabled)
    ################################################################################################################################################################################################################################################
    #if os.path.exists( f"{options.temp_directory}/{reference_onlyfilename}" ) == False:
    #    cmd = f"perl -pe \'/^>/ ? print \"\\n\" : chomp\' {options.reference} | tail -n +2 > {options.temp_directory}/{reference_onlyfilename}"
    #    os.system(cmd)
        
    #options.reference = f"{options.temp_directory}/{reference_onlyfilename}"
    ################################################################################################################################################################################################################################################
    
    if options.compress == True:
        createPositionSortedSAMInputFileForCompressing( options,  remove_these_files,  logging )
        if options.quiet == False:
            logging.info( f"Position sorted SAM input file has been generated" )
        
        tags_present = returnTagList(options, options.inputalignedfilename, f"{options.temp_directory}/temp_file_tags")
        print(tags_present)
        if tags_present["MD"] == 0:
            ################################################################################################################################################################################################################################################
            # Use samtools to add MD tag
            ################################################################################################################################################################################################################################################
            _addMissingMDTag( options )
            
        tags_present = alterReadNamesAndAddNHTag( options,  logging )
        if options.quiet == False:
            logging.info( f"Read names have been altered and NH tags have been added" )
            
        findEndedness( options, logging )
        if options.quiet == False:
            logging.info( f"findEndedness() execution is complete" )
        
        compressSamFile( options, tags_present )
        if options.quiet == False:
            logging.info( f"compressSamFile() execution is complete" )

    if options.decompress == True:
        ################################################################################################################################################################################################################################################
        # Generate index for reference
        ################################################################################################################################################################################################################################################
        cmd = f"samtools faidx {options.reference}"
        os.system(cmd)
        ################################################################################################################################################################################################################################################

        decompressSamFile( options )
        if options.quiet == False:
            logging.info( f"decompressSamFile() execution is complete" )

    # NOTE(review): this unconditional return makes everything below unreachable
    # (header extraction, coverage generation and cleanUp() never run). The code is
    # kept untouched because the intent of disabling it cannot be confirmed from
    # here; either remove the return after fixing that code or delete the dead code.
    return


    if options.header == True:
        cmd = f"cat {options.inputalignedfilename[0][:-3]}|grep ^@ > {options.inputalignedfilename[0][:-3]}.headers"
        os.system( cmd )
        # print(open(f"{options.inputalignedfilename[0][:-3]}.headers","r").read())
        os.system( f"rm -f {options.inputalignedfilename[0][:-3]}.headers" )
    if options.generate_overlapping_coverage == True or options.generate_non_overlapping_coverage == True:
        input_filename = options.inputabrfilename
        name_of_input_file_without_location = input_filename.split( "/" )[-1][:-8]
        _outputfilename = options.temp_directory + "/" + name_of_input_file_without_location + ".sam_concise"
        _index_outputfilename = options.temp_directory + "/" + name_of_input_file_without_location + ".sam_concise_index"

        cmd = f"7za e {input_filename} -y "
        cmd += f" -o{options.temp_directory}"
        cmd += f" 1> /dev/null 2>/dev/null"
        if options.quiet == False:
            logging.info( f"Running command - {cmd}" )
        os.system( cmd )

        if options.inputabrfilename[-3:] == ".br":
            cmd = f"brotli -d -k -f {_outputfilename}.br {_index_outputfilename}.br"
            os.system( cmd )

        generateCoverage( _outputfilename, _index_outputfilename, options )
        os.system( f"rm -rf {options.temp_directory}" )

    cleanUp( options )
    if options.quiet == False:
        logging.info( f"cleanUp() execution is complete" )


# Run the workflow only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()