From c3e2df0159ca0cdfa8e32f959e17e65ffac483f8 Mon Sep 17 00:00:00 2001 From: Samarendra Date: Wed, 19 Jun 2024 15:29:59 +0200 Subject: [PATCH 1/8] code formatted by black --- gaftools/__main__.py | 5 +- gaftools/cli/__init__.py | 2 + gaftools/cli/index.py | 119 +++++---- gaftools/cli/order_gfa.py | 162 ++++++++---- gaftools/cli/phase.py | 80 +++--- gaftools/cli/realign.py | 215 +++++++++++----- gaftools/cli/sort.py | 108 ++++---- gaftools/cli/stat.py | 110 ++++---- gaftools/cli/view.py | 127 +++++----- gaftools/conversion.py | 126 ++++++---- gaftools/gaf.py | 96 ++++--- gaftools/gfa.py | 104 +++++--- gaftools/utils.py | 42 ++-- setup.py | 2 +- tests/test_extract_path.py | 24 +- tests/test_gaf.py | 21 +- tests/test_gaf_gzipped.py | 24 +- tests/test_gfa_class.py | 87 ++++--- tests/test_index.py | 72 +++--- tests/test_parsegaf_tags.py | 12 +- tests/test_run_ordergfa.py | 14 +- tests/test_run_realign.py | 37 ++- tests/test_run_stat.py | 70 +++--- tests/test_sort.py | 35 ++- tests/test_view.py | 483 ++++++++++++++++-------------------- 25 files changed, 1229 insertions(+), 948 deletions(-) diff --git a/gaftools/__main__.py b/gaftools/__main__.py index ddf26db..1429760 100644 --- a/gaftools/__main__.py +++ b/gaftools/__main__.py @@ -1,5 +1,5 @@ -#The code has been taken from WhatsHap. -#Link to WhatsHap: https://github.com/whatshap/whatshap +# The code has been taken from WhatsHap. +# Link to WhatsHap: https://github.com/whatshap/whatshap import sys @@ -91,5 +91,6 @@ def main(argv=sys.argv[1:]): logger.debug("Command line error. Traceback:", exc_info=True) sys.exit(1) + if __name__ == "__main__": main() diff --git a/gaftools/cli/__init__.py b/gaftools/cli/__init__.py index 1fbcc70..acb4c40 100644 --- a/gaftools/cli/__init__.py +++ b/gaftools/cli/__init__.py @@ -9,9 +9,11 @@ logger = logging.getLogger(__name__) + class CommandLineError(Exception): """An anticipated command-line error occurred. This ends up as a user-visible error message""" + def log_memory_usage(include_children=False): if sys.platform == "linux": if include_children: diff --git a/gaftools/cli/index.py b/gaftools/cli/index.py index 39f8353..fe0e45f 100644 --- a/gaftools/cli/index.py +++ b/gaftools/cli/index.py @@ -24,31 +24,28 @@ logger = logging.getLogger(__name__) -def run(gaf_path, - gfa_path, - output=None - ): +def run(gaf_path, gfa_path, output=None): timers = StageTimer() if output == None: - output = gaf_path+".gvi" - - #Detecting if GAF has stable or unstable coordinate - gaf=GAF(gaf_path) - stable=None + output = gaf_path + ".gvi" + + # Detecting if GAF has stable or unstable coordinate + gaf = GAF(gaf_path) + stable = None # checking format in the first 10 lines. for i, gaf_line in enumerate(gaf.read_file()): if i == 10: break if i == 0: - stable=gaf_line.detect_path_format() - assert stable==gaf_line.detect_path_format() + stable = gaf_line.detect_path_format() + assert stable == gaf_line.detect_path_format() gaf.close() - + nodes = {} reference = defaultdict(lambda: []) ref_contig = [] - + gfa_file = GFA(graph_file=gfa_path, low_memory=True) contigs = list(gfa_file.contigs.keys()) ref_contig = [contig for contig in gfa_file.contigs if gfa_file.contigs[contig] == 0] @@ -60,12 +57,12 @@ def run(gaf_path, reference[contig].append(gfa_file[node]) nodes = gfa_file.nodes del gfa_file - + if utils.is_file_gzipped(gaf_path): - gaf_file = libcbgzf.BGZFile(gaf_path,"rb") + gaf_file = libcbgzf.BGZFile(gaf_path, "rb") else: - gaf_file = open(gaf_path,"rt") - + gaf_file = open(gaf_path, "rt") + out_dict = {} offset = 0 while True: @@ -74,67 +71,98 @@ def run(gaf_path, if not mapping: break try: - val = mapping.rstrip().split('\t') + val = mapping.rstrip().split("\t") except TypeError: - val = mapping.decode("utf-8").rstrip().split('\t') - + val = mapping.decode("utf-8").rstrip().split("\t") + if stable: with timers("convert_coord"): alignment = convert_coord(val, reference) else: - alignment = list(re.split('>|<', val[5]))[1:] + alignment = list(re.split(">|<", val[5]))[1:] for a in alignment: try: - out_dict[(nodes[a].id, nodes[a].tags['SN'][1], int(nodes[a].tags['SO'][1]), int(nodes[a].tags['SO'][1]) + int(nodes[a].tags['LN'][1]))].append(offset) + out_dict[ + ( + nodes[a].id, + nodes[a].tags["SN"][1], + int(nodes[a].tags["SO"][1]), + int(nodes[a].tags["SO"][1]) + int(nodes[a].tags["LN"][1]), + ) + ].append(offset) except KeyError: - out_dict[(nodes[a].id, nodes[a].tags['SN'][1], int(nodes[a].tags['SO'][1]), int(nodes[a].tags['SO'][1]) + int(nodes[a].tags['LN'][1]))] = [offset] + out_dict[ + ( + nodes[a].id, + nodes[a].tags["SN"][1], + int(nodes[a].tags["SO"][1]), + int(nodes[a].tags["SO"][1]) + int(nodes[a].tags["LN"][1]), + ) + ] = [offset] out_dict["ref_contig"] = ref_contig - + gaf_file.close() - - with open(output, 'wb') as handle: + + with open(output, "wb") as handle: with timers("write_file"): pickle.dump(out_dict, handle, protocol=pickle.HIGHEST_PROTOCOL) - + logger.info("\n== SUMMARY ==") total_time = timers.total() log_memory_usage() - logger.info("Time to sort gfa: %9.2f s", timers.elapsed('sort_gfa')) - logger.info("Time to store contig info: %9.2f s", timers.elapsed('store_contig_info')) + logger.info("Time to sort gfa: %9.2f s", timers.elapsed("sort_gfa")) + logger.info( + "Time to store contig info: %9.2f s", timers.elapsed("store_contig_info") + ) logger.info("Total time: %9.2f s", total_time) def convert_coord(line, ref): unstable_coord = [] - gaf_contigs = list(filter(None, re.split('(>)|(<)', line[5]))) + gaf_contigs = list(filter(None, re.split("(>)|(<)", line[5]))) for nd in gaf_contigs: if nd == ">" or nd == "<": continue - if ':' in nd and '-' in nd: - tmp = nd.rstrip().split(':') + if ":" in nd and "-" in nd: + tmp = nd.rstrip().split(":") query_contig_name = tmp[0] - (query_start, query_end) = tmp[1].rstrip().split('-') + (query_start, query_end) = tmp[1].rstrip().split("-") else: query_start = line[7] - query_end = line[8] + query_end = line[8] query_contig_name = nd - - '''Find the matching nodes from the reference genome here''' - start, end = utils.search_intervals(ref[query_contig_name], int(query_start), int(query_end), 0, len(ref[query_contig_name])) - for node in ref[query_contig_name][start:end+1]: + """Find the matching nodes from the reference genome here""" + start, end = utils.search_intervals( + ref[query_contig_name], int(query_start), int(query_end), 0, len(ref[query_contig_name]) + ) + + for node in ref[query_contig_name][start : end + 1]: cases = -1 - if int(node.tags['SO'][1]) <= int(query_start) < int(node.tags['SO'][1]) + int(node.tags['LN'][1]): + if ( + int(node.tags["SO"][1]) + <= int(query_start) + < int(node.tags["SO"][1]) + int(node.tags["LN"][1]) + ): cases = 1 - elif int(node.tags['SO'][1]) < int(query_end) <= int(node.tags['SO'][1]) + int(node.tags['LN'][1]): + elif ( + int(node.tags["SO"][1]) + < int(query_end) + <= int(node.tags["SO"][1]) + int(node.tags["LN"][1]) + ): cases = 2 - elif int(query_start) < int(node.tags['SO'][1]) < int(node.tags['SO'][1]) + int(node.tags['LN'][1]) < int(query_end): + elif ( + int(query_start) + < int(node.tags["SO"][1]) + < int(node.tags["SO"][1]) + int(node.tags["LN"][1]) + < int(query_end) + ): cases = 3 - - if cases != -1: + + if cases != -1: unstable_coord.append(node.id) - + return unstable_coord @@ -148,7 +176,8 @@ def add_arguments(parser): help='Reference rGFA file') arg('-o', '--output', default=None, help='Path to the output Indexed GAF file. If omitted, use .gvi') - + + # fmt: on def validate(args, parser): return True diff --git a/gaftools/cli/order_gfa.py b/gaftools/cli/order_gfa.py index b1c88ff..9ce91cb 100644 --- a/gaftools/cli/order_gfa.py +++ b/gaftools/cli/order_gfa.py @@ -14,14 +14,42 @@ logger = logging.getLogger(__name__) -DEFAULT_CHROMOSOME = ["chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", "chrY", "chrM"] +DEFAULT_CHROMOSOME = [ + "chr1", + "chr2", + "chr3", + "chr4", + "chr5", + "chr6", + "chr7", + "chr8", + "chr9", + "chr10", + "chr11", + "chr12", + "chr13", + "chr14", + "chr15", + "chr16", + "chr17", + "chr18", + "chr19", + "chr20", + "chr21", + "chr22", + "chrX", + "chrY", + "chrM", +] + + def run_order_gfa( - gfa_filename, - outdir, - chromosome_order=None, - with_sequence=False, + gfa_filename, + outdir, + chromosome_order=None, + with_sequence=False, ): - + if not chromosome_order is None: chromosome_order = chromosome_order.split(sep=",") @@ -49,10 +77,12 @@ def run_order_gfa( # and returns a dict of chromosome_name: {nodes...} components = name_comps(graph, components) # if not chromosome_order is None: # user gave a list - if chromosome_order != ['']: # user gave a list + if chromosome_order != [""]: # user gave a list for c in chromosome_order: if c not in set(components.keys()): - logger.error(f"The chromosome name provided {c} did not match with a component in the graph") + logger.error( + f"The chromosome name provided {c} did not match with a component in the graph" + ) logger.error(f" What was Found: {','.join(sorted(components.keys()))}") sys.exit(1) @@ -60,8 +90,10 @@ def run_order_gfa( try: assert set(components.keys()) == set(DEFAULT_CHROMOSOME) except AssertionError: - logger.error(f"chromosome order was not provided, so the default was taken, but the default did not match" - f" what was found in the graph, which is {','.join(sorted(components.keys()))}") + logger.error( + f"chromosome order was not provided, so the default was taken, but the default did not match" + f" what was found in the graph, which is {','.join(sorted(components.keys()))}" + ) sys.exit(1) chromosome_order = DEFAULT_CHROMOSOME # running index for the bubble index (BO) already used @@ -70,52 +102,69 @@ def run_order_gfa( # todo output final GFA with all the chromosomes ordered out_files = [] for chromosome in chromosome_order: - logger.info('Processing %s', chromosome) + logger.info("Processing %s", chromosome) component_nodes = components[chromosome] # Initialize files # f_gfa = open(outdir+'/'+gfa_filename.split("/")[-1][:-4]+'-'+chromosome+'.gfa', 'w') - scaffold_nodes, inside_nodes, node_order, bo, bubble_count = decompose_and_order(graph, component_nodes, - chromosome, bo) + scaffold_nodes, inside_nodes, node_order, bo, bubble_count = decompose_and_order( + graph, component_nodes, chromosome, bo + ) # skip a chromosome if something went wrong if scaffold_nodes: - f_gfa = outdir + os.sep + gfa_filename.split(os.sep)[-1].split(".")[0] + '-' + chromosome + ".gfa" + f_gfa = ( + outdir + + os.sep + + gfa_filename.split(os.sep)[-1].split(".")[0] + + "-" + + chromosome + + ".gfa" + ) out_files.append(f_gfa) - f_colors = open(outdir + os.sep + gfa_filename.split(os.sep)[-1][:-4] + '-' + chromosome + '.csv', 'w') - f_colors.write('Name,Color,SN,SO,BO,NO\n') + f_colors = open( + outdir + os.sep + gfa_filename.split(os.sep)[-1][:-4] + "-" + chromosome + ".csv", + "w", + ) + f_colors.write("Name,Color,SN,SO,BO,NO\n") total_bubbles += bubble_count for node_name in sorted(component_nodes): node = graph.nodes[node_name] bo_tag, no_tag = node_order[node_name] - node.tags['BO'] = ("i", bo_tag) - node.tags['NO'] = ("i", no_tag) + node.tags["BO"] = ("i", bo_tag) + node.tags["NO"] = ("i", no_tag) if node_name in scaffold_nodes: - color = 'orange' + color = "orange" elif node_name in inside_nodes: - color = 'blue' + color = "blue" else: - color = 'gray' + color = "gray" if "SN" in node.tags: - sn_tag = node.tags['SN'][1] + sn_tag = node.tags["SN"][1] else: sn_tag = "NA" if "SO" in node.tags: - so_tag = node.tags['SO'][1] + so_tag = node.tags["SO"][1] else: so_tag = "NA" - f_colors.write('{},{},{},{},{},{}\n'.format(node_name, color, sn_tag, so_tag, bo_tag, no_tag)) + f_colors.write( + "{},{},{},{},{},{}\n".format(node_name, color, sn_tag, so_tag, bo_tag, no_tag) + ) - graph.write_gfa(set_of_nodes=component_nodes, output_file=f_gfa, append=False, order_bo=True) + graph.write_gfa( + set_of_nodes=component_nodes, output_file=f_gfa, append=False, order_bo=True + ) f_colors.close() else: logger.warning(f"Chromosome {chromosome} was skipped") - final_gfa = outdir + os.sep + gfa_filename.split(os.sep)[-1].split(".")[0] + '-complete' + ".gfa" + final_gfa = ( + outdir + os.sep + gfa_filename.split(os.sep)[-1].split(".")[0] + "-complete" + ".gfa" + ) with open(final_gfa, "w") as outfile: # outputting all the S lines first for f in out_files: @@ -130,9 +179,9 @@ def run_order_gfa( if l.startswith("L"): outfile.write(l) - logger.info('Total bubbles: %d', total_bubbles) + logger.info("Total bubbles: %d", total_bubbles) + - def decompose_and_order(graph, component, component_name, bo_start=0): """ This function takes the graph and a component @@ -147,13 +196,15 @@ def decompose_and_order(graph, component, component_name, bo_start=0): new_graph = graph.graph_from_comp(component) start = time.perf_counter() all_biccs, artic_points = new_graph.biccs() - logger.info(f" It took {time.perf_counter() - start} seconds to find the Biconnected Components") + logger.info( + f" It took {time.perf_counter() - start} seconds to find the Biconnected Components" + ) bubbles = [] scaffold_graph = GFA() scaffold_node_types = dict() for n in artic_points: scaffold_graph.add_node(n) - scaffold_node_types[n] = 's' + scaffold_node_types[n] = "s" inside_nodes = set() for bc in all_biccs: @@ -171,7 +222,7 @@ def decompose_and_order(graph, component, component_name, bo_start=0): bubble_index = len(bubbles) bubbles.append(bc_inside_nodes) scaffold_graph.add_node(str(bubble_index)) - scaffold_node_types[str(bubble_index)] = 'b' + scaffold_node_types[str(bubble_index)] = "b" for end_node in bc_end_nodes: scaffold_graph.add_edge(str(bubble_index), "+", end_node, "+", 0) @@ -186,7 +237,8 @@ def decompose_and_order(graph, component, component_name, bo_start=0): assert len(degree_one) == 2 except AssertionError: logger.warning( - f"Error: In Chromosome {component_name}, we found more or less than two nodes with degree 1. Skipping this chromosome") + f"Error: In Chromosome {component_name}, we found more or less than two nodes with degree 1. Skipping this chromosome" + ) # hacky but for now maybe ok return None, None, None, None, None @@ -194,16 +246,19 @@ def decompose_and_order(graph, component, component_name, bo_start=0): assert len(degree_two) == len(scaffold_graph) - 2 except AssertionError: logger.warning( - f"Error: In Chromosome {component_name}, the number of nodes with degree 2 did not mach the expected number") + f"Error: In Chromosome {component_name}, the number of nodes with degree 2 did not mach the expected number" + ) return None, None, None, None, None # the scaffold graph should be a line graph here traversal = scaffold_graph.dfs(degree_one[0]) - traversal_scaffold_only = [node_name for node_name in traversal if scaffold_node_types[node_name] == 's'] + traversal_scaffold_only = [ + node_name for node_name in traversal if scaffold_node_types[node_name] == "s" + ] # check that all scaffold nodes carry the same sequence name (SN), i.e. all came for the linear reference - assert len(set(new_graph[n].tags['SN'] for n in traversal_scaffold_only)) == 1 + assert len(set(new_graph[n].tags["SN"] for n in traversal_scaffold_only)) == 1 # I save tags as key:(type, value), so "SO":(i, '123') - coordinates = list(int(new_graph[n].tags['SO'][1]) for n in traversal_scaffold_only) + coordinates = list(int(new_graph[n].tags["SO"][1]) for n in traversal_scaffold_only) # make sure that the traversal is in ascending order if coordinates[0] > coordinates[-1]: @@ -217,9 +272,9 @@ def decompose_and_order(graph, component, component_name, bo_start=0): bo = bo_start for node in traversal: node_type = scaffold_node_types[node] - if node_type == 's': + if node_type == "s": node_order[node] = (bo, 0) - elif node_type == 'b': + elif node_type == "b": for i, n in enumerate(sorted(bubbles[int(node)])): node_order[n] = (bo, i + 1) else: @@ -254,22 +309,33 @@ def name_comps(graph, components): if most_freq <= count: current_tag, most_freq = tag, count if current_tag == "": - raise ValueError("Were not able to assign a chromosome to component, SN tags could be missing") + raise ValueError( + "Were not able to assign a chromosome to component, SN tags could be missing" + ) named_comps[current_tag] = comp return named_comps def add_arguments(parser): arg = parser.add_argument - arg('--chromosome_order', default="", - help='Order in which to arrange chromosomes in terms of BO sorting. ' - 'Expecting comma-separated list. Default: chr1,...,chr22,chrX,chrY,chrM') - arg('--with-sequence', default=False, action='store_true', - help='Retain sequences in output (default is to strip sequences)') - arg('gfa_filename', metavar='GRAPH', - help='Input rGFA file') - arg('--outdir', default="./out", - help='Output Directory to store all the GFA and CSV files. Default location is a "out" folder from the directory of execution.') + arg( + "--chromosome_order", + default="", + help="Order in which to arrange chromosomes in terms of BO sorting. " + "Expecting comma-separated list. Default: chr1,...,chr22,chrX,chrY,chrM", + ) + arg( + "--with-sequence", + default=False, + action="store_true", + help="Retain sequences in output (default is to strip sequences)", + ) + arg("gfa_filename", metavar="GRAPH", help="Input rGFA file") + arg( + "--outdir", + default="./out", + help='Output Directory to store all the GFA and CSV files. Default location is a "out" folder from the directory of execution.', + ) def main(args): diff --git a/gaftools/cli/phase.py b/gaftools/cli/phase.py index e742f26..c55a63a 100644 --- a/gaftools/cli/phase.py +++ b/gaftools/cli/phase.py @@ -13,6 +13,7 @@ from gaftools.timer import StageTimer from gaftools.gaf import GAF + class Node: def __init__(self, chr_name, haplotype, phase_set): self.chr_name = chr_name @@ -22,6 +23,7 @@ def __init__(self, chr_name, haplotype, phase_set): logger = logging.getLogger(__name__) + def run(gaf_file, tsv_file, output=sys.stdout): timers = StageTimer() @@ -33,61 +35,81 @@ def run(gaf_file, tsv_file, output=sys.stdout): def add_phase_info(gaf_path, tsv_path, out_path): - '''This function adds phasing information to the gaf file using .tsv file of the WhatsHap + """This function adds phasing information to the gaf file using .tsv file of the WhatsHap Haplotag... The information is added as two tags ("ps:Z" and "ht:Z") before the cigar string - ''' + """ import itertools - + logger.info("INFO: Adding phasing information...") - - tsv_file = open(tsv_path,"r") - + + tsv_file = open(tsv_path, "r") + phase = {} for line in tsv_file: - line_elements = line.rstrip().split('\t') - + line_elements = line.rstrip().split("\t") + if line_elements[0] not in phase: tmp = Node(line_elements[3], line_elements[1], line_elements[2]) phase[line_elements[0]] = tmp - + gaf_out = open(out_path, "w") - + line_count = 0 missing_in_tsv = 0 phased = 0 gaf_file = GAF(gaf_path) for gaf_line in gaf_file.read_file(): - + if line_count != 0: gaf_out.write("\n") - + line_count += 1 - - gaf_out.write("%s\t%s\t%s\t%s\t+\t%s\t%d\t%d\t%d\t%d\t%d\t%d" - %(gaf_line.query_name, gaf_line.query_length, gaf_line.query_start, gaf_line.query_end, - gaf_line.path, gaf_line.path_length, gaf_line.path_start, gaf_line.path_end, - gaf_line.residue_matches, gaf_line.alignment_block_length, gaf_line.mapping_quality)) - + + gaf_out.write( + "%s\t%s\t%s\t%s\t+\t%s\t%d\t%d\t%d\t%d\t%d\t%d" + % ( + gaf_line.query_name, + gaf_line.query_length, + gaf_line.query_start, + gaf_line.query_end, + gaf_line.path, + gaf_line.path_length, + gaf_line.path_start, + gaf_line.path_end, + gaf_line.residue_matches, + gaf_line.alignment_block_length, + gaf_line.mapping_quality, + ) + ) + in_tsv = True if gaf_line.query_name not in phase: missing_in_tsv += 1 in_tsv = False - + if in_tsv and phase[gaf_line.query_name].haplotype != "none": - gaf_out.write("\tps:Z:%s-%s\tht:Z:%s\t" % (phase[gaf_line.query_name].chr_name, - phase[gaf_line.query_name].phase_set, phase[gaf_line.query_name].haplotype)) - phased +=1 + gaf_out.write( + "\tps:Z:%s-%s\tht:Z:%s\t" + % ( + phase[gaf_line.query_name].chr_name, + phase[gaf_line.query_name].phase_set, + phase[gaf_line.query_name].haplotype, + ) + ) + phased += 1 else: gaf_out.write("\tps:Z:none\tht:Z:none") - for k in gaf_line.tags.keys(): - gaf_out.write("\t%s:%s"%(k,gaf_line.tags[k])) - - gaf_out.write("\t%s"%gaf_line.cigar) - - logger.info("INFO: Added phasing info (ps:Z and ht:Z) for %d reads out of %d GAF lines - (%d reads are missing in .tsv)" %(phased, line_count, missing_in_tsv)) + gaf_out.write("\t%s:%s" % (k, gaf_line.tags[k])) + + gaf_out.write("\t%s" % gaf_line.cigar) + + logger.info( + "INFO: Added phasing info (ps:Z and ht:Z) for %d reads out of %d GAF lines - (%d reads are missing in .tsv)" + % (phased, line_count, missing_in_tsv) + ) gaf_file.close() tsv_file.close() gaf_out.close() @@ -105,9 +127,11 @@ def add_arguments(parser): arg('-o', '--output', default=sys.stdout, help='Output GAF file. If omitted, output is directed to standard output.') + # fmt: on def validate(args, parser): return True + def main(args): run(**vars(args)) diff --git a/gaftools/cli/realign.py b/gaftools/cli/realign.py index 98c292a..b5a18b0 100644 --- a/gaftools/cli/realign.py +++ b/gaftools/cli/realign.py @@ -17,27 +17,19 @@ from pywfa.align import WavefrontAligner, cigartuples_to_str - logger = logging.getLogger(__name__) -def run_realign( - gaf, - graph, - fasta, - output=None, - ext=False, - cores=1 -): +def run_realign(gaf, graph, fasta, output=None, ext=False, cores=1): timers = StageTimer() if output is None: output = sys.stdout else: - output = open(output, 'w') + output = open(output, "w") if cores > mp.cpu_count(): - logger.warning('Number of cores requested is greater than the total number of CPU cores.') + logger.warning("Number of cores requested is greater than the total number of CPU cores.") cores = min(mp.cpu_count() - 1, cores) # realign_gaf(gaf, graph, fasta, output, ext, cores) @@ -73,7 +65,9 @@ def filter_duplicates(aln): if cnt == cnt2 or line2.duplicate == True: continue - sim = overlap_ratio(line.query_start, line.query_end, line2.query_start, line2.query_end) + sim = overlap_ratio( + line.query_start, line.query_end, line2.query_start, line2.query_end + ) if sim > 0.75: if line.score > line2.score: @@ -91,14 +85,23 @@ def write_alignments(aln, output): for gaf_line in mappings: if not gaf_line.duplicate: # Write the alignment back to the GAF - output.write("%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d" % (gaf_line.query_name, - gaf_line.query_length, - gaf_line.query_start, - gaf_line.query_end, gaf_line.strand, - gaf_line.path, gaf_line.path_length, - gaf_line.path_start, gaf_line.path_end, - match, cigar_len, - gaf_line.mapping_quality)) + output.write( + "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d" + % ( + gaf_line.query_name, + gaf_line.query_length, + gaf_line.query_start, + gaf_line.query_end, + gaf_line.strand, + gaf_line.path, + gaf_line.path_length, + gaf_line.path_start, + gaf_line.path_end, + match, + cigar_len, + gaf_line.mapping_quality, + ) + ) for k in gaf_line.tags.keys(): output.write("\t%s%s" % (k, gaf_line.tags[k])) @@ -115,10 +118,12 @@ def wfa_alignment(seq_batch, queue): # Thus, we write the original alignment back to the GAF instead of realignment for gaf_line, ref, query in seq_batch: if gaf_line.query_end - gaf_line.query_start > 60_000: - out_string = (f"{gaf_line.query_name}\t{gaf_line.query_length}\t{gaf_line.query_start}\t" - f"{gaf_line.query_end}\t{gaf_line.strand}\t{gaf_line.path}\t{gaf_line.path_length}\t" - f"{gaf_line.path_start}\t{gaf_line.path_end}\t{gaf_line.residue_matches}" - f"\t{gaf_line.alignment_block_length}\t{gaf_line.mapping_quality}") + out_string = ( + f"{gaf_line.query_name}\t{gaf_line.query_length}\t{gaf_line.query_start}\t" + f"{gaf_line.query_end}\t{gaf_line.strand}\t{gaf_line.path}\t{gaf_line.path_length}\t" + f"{gaf_line.path_start}\t{gaf_line.path_end}\t{gaf_line.residue_matches}" + f"\t{gaf_line.alignment_block_length}\t{gaf_line.mapping_quality}" + ) for k in gaf_line.tags.keys(): out_string += f"\t{k}{gaf_line.tags[k]}" queue.put(out_string + "\n") @@ -149,10 +154,12 @@ def wfa_alignment(seq_batch, queue): assert False cigar_len += op_len - out_string = (f"{gaf_line.query_name}\t{gaf_line.query_length}\t{gaf_line.query_start}\t" - f"{gaf_line.query_end}\t{gaf_line.strand}\t{gaf_line.path}\t{gaf_line.path_length}\t" - f"{gaf_line.path_start}\t{gaf_line.path_end}\t{match}" - f"\t{cigar_len}\t{gaf_line.mapping_quality}") + out_string = ( + f"{gaf_line.query_name}\t{gaf_line.query_length}\t{gaf_line.query_start}\t" + f"{gaf_line.query_end}\t{gaf_line.strand}\t{gaf_line.path}\t{gaf_line.path_length}\t" + f"{gaf_line.path_start}\t{gaf_line.path_end}\t{match}" + f"\t{cigar_len}\t{gaf_line.mapping_quality}" + ) cigar = aligner.cigarstring.replace("M", "=") gaf_line.tags["cg:Z:"] = cigar # queue.put(out_string + "\n") @@ -173,14 +180,20 @@ def wfa_alignment_old(aln, gaf_line, ref, query, path_start, extended, queue): # gaf_line.query_length, gaf_line.query_start, gaf_line.query_end, gaf_line.strand, # gaf_line.path, gaf_line.path_length, gaf_line.path_start, gaf_line.path_end, # gaf_line.residue_matches, gaf_line.alignment_block_length, gaf_line.mapping_quality)) - out_string = "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d" % (gaf_line.query_name, - gaf_line.query_length, gaf_line.query_start, - gaf_line.query_end, gaf_line.strand, - gaf_line.path, gaf_line.path_length, - gaf_line.path_start, gaf_line.path_end, - gaf_line.residue_matches, - gaf_line.alignment_block_length, - gaf_line.mapping_quality) + out_string = "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d" % ( + gaf_line.query_name, + gaf_line.query_length, + gaf_line.query_start, + gaf_line.query_end, + gaf_line.strand, + gaf_line.path, + gaf_line.path_length, + gaf_line.path_start, + gaf_line.path_end, + gaf_line.residue_matches, + gaf_line.alignment_block_length, + gaf_line.mapping_quality, + ) for k in gaf_line.tags.keys(): # output.write("\t%s%s"%(k, gaf_line.tags[k])) @@ -192,7 +205,9 @@ def wfa_alignment_old(aln, gaf_line, ref, query, path_start, extended, queue): else: aligner = WavefrontAligner(ref) if extended: - res = aligner(query, clip_cigar=True, min_aligned_bases_left=30, min_aligned_bases_right=30) + res = aligner( + query, clip_cigar=True, min_aligned_bases_left=30, min_aligned_bases_right=30 + ) else: res = aligner(query, clip_cigar=False) @@ -224,19 +239,47 @@ def wfa_alignment_old(aln, gaf_line, ref, query, path_start, extended, queue): queue.put("NA") if gaf_line.query_name in aln: - aln[gaf_line.query_name].append(gaftools.gaf.Alignment("", gaf_line.query_length, res.text_start, - res.text_end, gaf_line.strand, gaf_line.path, - gaf_line.path_length, - path_start + res.pattern_start, - path_start + res.pattern_end, match, 0, 0, False, - cigar, - cigar_len, res.score)) + aln[gaf_line.query_name].append( + gaftools.gaf.Alignment( + "", + gaf_line.query_length, + res.text_start, + res.text_end, + gaf_line.strand, + gaf_line.path, + gaf_line.path_length, + path_start + res.pattern_start, + path_start + res.pattern_end, + match, + 0, + 0, + False, + cigar, + cigar_len, + res.score, + ) + ) else: - aln[gaf_line.query_name] = [gaftools.gaf.Alignment("", gaf_line.query_length, res.text_start, - res.text_end, gaf_line.strand, gaf_line.path, - gaf_line.path_length, path_start + res.pattern_start, - path_start + res.pattern_end, match, 0, 0, False, - cigar, cigar_len, res.score)] + aln[gaf_line.query_name] = [ + gaftools.gaf.Alignment( + "", + gaf_line.query_length, + res.text_start, + res.text_end, + gaf_line.strand, + gaf_line.path, + gaf_line.path_length, + path_start + res.pattern_start, + path_start + res.pattern_end, + match, + 0, + 0, + False, + cigar, + cigar_len, + res.score, + ) + ] else: cigar = aligner.cigarstring.replace("M", "=") @@ -245,13 +288,20 @@ def wfa_alignment_old(aln, gaf_line, ref, query, path_start, extended, queue): # gaf_line.query_length, gaf_line.query_start, gaf_line.query_end, gaf_line.strand, # gaf_line.path, gaf_line.path_length, gaf_line.path_start, gaf_line.path_end, # match, cigar_len, gaf_line.mapping_quality)) - out_string = "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d" % (gaf_line.query_name, - gaf_line.query_length, - gaf_line.query_start, gaf_line.query_end, - gaf_line.strand, - gaf_line.path, gaf_line.path_length, - gaf_line.path_start, gaf_line.path_end, - match, cigar_len, gaf_line.mapping_quality) + out_string = "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d" % ( + gaf_line.query_name, + gaf_line.query_length, + gaf_line.query_start, + gaf_line.query_end, + gaf_line.strand, + gaf_line.path, + gaf_line.path_length, + gaf_line.path_start, + gaf_line.path_end, + match, + cigar_len, + gaf_line.mapping_quality, + ) # queue.put(out_string + "\n") for k in gaf_line.tags.keys(): @@ -283,7 +333,7 @@ def realign_gaf(gaf, graph, fasta, output, cores=1): gaf_file = GAF(gaf) for line in gaf_file.read_file(): path_sequence = graph_obj.extract_path(line.path) - ref = path_sequence[line.path_start:line.path_end] + ref = path_sequence[line.path_start : line.path_end] query = fastafile.fetch(line.query_name, line.query_start, line.query_end) seq_batch.append((line, ref, query)) @@ -291,7 +341,15 @@ def realign_gaf(gaf, graph, fasta, output, cores=1): continue else: # logger.info(f"{time.time()} finished preparing one batch and adding a process for it") - processes.append(mp.Process(target=wfa_alignment, args=(seq_batch, queue,))) + processes.append( + mp.Process( + target=wfa_alignment, + args=( + seq_batch, + queue, + ), + ) + ) seq_batch = [] if len(processes) == cores: @@ -313,7 +371,15 @@ def realign_gaf(gaf, graph, fasta, output, cores=1): gaf_file.close() if len(seq_batch) > 0: # leftover alignments to re-align - processes.append(mp.Process(target=wfa_alignment, args=(seq_batch, queue,))) + processes.append( + mp.Process( + target=wfa_alignment, + args=( + seq_batch, + queue, + ), + ) + ) # leftover batches if len(processes) != 0: # logger.info("in the leftovers") @@ -334,6 +400,7 @@ def realign_gaf(gaf, graph, fasta, output, cores=1): # display_top(snapshot, limit=5) fastafile.close() + def realign_gaf_old(gaf, graph, fasta, output, extended, cores=1): """ Uses pyWFA (https://github.com/kcleal/pywfa) @@ -380,13 +447,36 @@ def realign_gaf_old(gaf, graph, fasta, output, extended, cores=1): # making a process processes.append( - mp.Process(target=wfa_alignment_old, args=(aln, line, ref, query, path_start, output, extended, queue,))) + mp.Process( + target=wfa_alignment_old, + args=( + aln, + line, + ref, + query, + path_start, + output, + extended, + queue, + ), + ) + ) # wfa_alignment(aln, line, ref, query, path_start, output, extended) else: - ref = path_sequence[line.path_start:line.path_end] + ref = path_sequence[line.path_start : line.path_end] query = fastafile.fetch(line.query_name, line.query_start, line.query_end) - processes.append(mp.Process(target=wfa_alignment, args=(line, ref, query, queue,))) + processes.append( + mp.Process( + target=wfa_alignment, + args=( + line, + ref, + query, + queue, + ), + ) + ) # wfa_alignment([], line, ref, query, 0, output, False) if len(processes) == cores: # start running the processes and checking when they're done before starting the next batch @@ -432,6 +522,7 @@ def realign_gaf_old(gaf, graph, fasta, output, extended, cores=1): filter_duplicates(aln) write_alignments(aln, output) + # fmt: off def add_arguments(parser): arg = parser.add_argument diff --git a/gaftools/cli/sort.py b/gaftools/cli/sort.py index a9b4072..df8c730 100644 --- a/gaftools/cli/sort.py +++ b/gaftools/cli/sort.py @@ -1,4 +1,4 @@ -''' +""" Sorting GAF alignments using BO and NO tags of the corresponding graph The script uses the BO and NO tags defined by the order_gfa command. Using the bubble ordering done, the alignments are sorted. @@ -9,7 +9,7 @@ 1. bo:i: - This is the BO tags from the GFA carried forward. The sorting uses a BO tag for each alignment, the BO tag for the start node (or end node based on overall orientation) of the alignment path. 2. sn:Z: - The name of the reference contig it mapped to. 3. iv:i: - 1 if the alignment path has an inversion. 0 otherwise. -''' +""" import sys import logging @@ -27,48 +27,50 @@ logger = logging.getLogger(__name__) timers = StageTimer() + def run_sort(gfa, gaf, outgaf=None, outind=None, bgzip=False): - + if outgaf == None: writer = sys.stdout index_file = None else: if bgzip: - writer = libcbgzf.BGZFile(outgaf, 'wb') + writer = libcbgzf.BGZFile(outgaf, "wb") else: writer = open(outgaf, "w") if outind: index_file = outind else: - index_file = outgaf+".gsi" + index_file = outgaf + ".gsi" index_dict = defaultdict(lambda: [None, None]) with timers("read_gfa"): gfa_file = GFA(graph_file=gfa, low_memory=True) with timers("total_sort"): sort(gaf, gfa_file.nodes, writer, index_dict, index_file) writer.close() - - + memory_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss logger.info("\nMemory Information") logger.info(" Maximum memory usage: %.3f GB", memory_kb / 1e6) logger.info("\nTime Summary:") - logger.info(" Time to parse GFA file: %.3f seconds"%(timers.elapsed("read_gfa"))) - logger.info(" Total time to sort GAF file: %.3f seconds"%(timers.elapsed("total_sort"))) - logger.info(" Time to parse GAF file: %.3f seconds"%(timers.elapsed("read_gaf"))) - logger.info(" Time to sort GAF file: %.3f seconds"%(timers.elapsed("sort_gaf"))) - logger.info(" Time to write GAF file: %.3f seconds"%(timers.elapsed("write_gaf"))) - + logger.info(" Time to parse GFA file: %.3f seconds" % (timers.elapsed("read_gfa"))) + logger.info( + " Total time to sort GAF file: %.3f seconds" % (timers.elapsed("total_sort")) + ) + logger.info(" Time to parse GAF file: %.3f seconds" % (timers.elapsed("read_gaf"))) + logger.info(" Time to sort GAF file: %.3f seconds" % (timers.elapsed("sort_gaf"))) + logger.info(" Time to write GAF file: %.3f seconds" % (timers.elapsed("write_gaf"))) + def sort(gaf, nodes, writer, index_dict, index_file): - + logger.info("Parsing GAF file and sorting it") if utils.is_file_gzipped(gaf): - reader = libcbgzf.BGZFile(gaf, 'rb') + reader = libcbgzf.BGZFile(gaf, "rb") else: - reader = open(gaf, 'r') - - Alignment = namedtuple('Alignment', ['offset', 'BO', 'NO', 'start', 'inv', 'sn']) + reader = open(gaf, "r") + + Alignment = namedtuple("Alignment", ["offset", "BO", "NO", "start", "inv", "sn"]) gaf_alignments = [] count_inverse = 0 # First pass: Store all the alignment lines as minimally. Just storing line offset and alignment string. @@ -81,18 +83,20 @@ def sort(gaf, nodes, writer, index_dict, index_file): try: line = line.rstrip().split("\t") except TypeError: - line = line.decode('utf8').rstrip().split("\t") + line = line.decode("utf8").rstrip().split("\t") bo, no, start, inv, sn = process_alignment(line, nodes, offset) if inv == 1: count_inverse += 1 - gaf_alignments.append(Alignment(offset=offset, BO=bo, NO=no, start=start, inv=inv, sn=sn)) - - logger.info("\tNumber of alignments with inversions: %d"%(count_inverse)) + gaf_alignments.append( + Alignment(offset=offset, BO=bo, NO=no, start=start, inv=inv, sn=sn) + ) + + logger.info("\tNumber of alignments with inversions: %d" % (count_inverse)) # Sorting the alignments based on BO and NO tag with timers("sort_gaf"): logger.info("\tSorting the alignments...") gaf_alignments.sort(key=functools.cmp_to_key(compare_gaf)) - + # Writing the sorted file with timers("write_gaf"): logger.debug("Writing Output File...") @@ -101,12 +105,12 @@ def sort(gaf, nodes, writer, index_dict, index_file): reader.seek(off) line = reader.readline() if type(line) == bytes: - line = line.decode('utf-8').rstrip() + line = line.decode("utf-8").rstrip() elif type(line) == str: line = line.rstrip() else: - raise RuntimeError('GAF alignments not in string or byte format.') - line += "\tbo:i:%d\tsn:Z:%s\tiv:i:%d\n"%(alignment.BO, alignment.sn, alignment.inv) + raise RuntimeError("GAF alignments not in string or byte format.") + line += "\tbo:i:%d\tsn:Z:%s\tiv:i:%d\n" % (alignment.BO, alignment.sn, alignment.inv) if index_file != None: out_off = writer.tell() if index_dict[alignment.sn][0] == None: @@ -116,7 +120,7 @@ def sort(gaf, nodes, writer, index_dict, index_file): index_dict[alignment.sn][1] = out_off write_to_file(line, writer) if index_file != None: - index_dict.pop('unknown') + index_dict.pop("unknown") index_dict = dict(index_dict) with open(index_file, "wb") as ind: pkl.dump(index_dict, ind) @@ -124,7 +128,7 @@ def sort(gaf, nodes, writer, index_dict, index_file): def process_alignment(line, nodes, offset): - path = list(filter(None, re.split('(>)|(<)', line[5]))) + path = list(filter(None, re.split("(>)|(<)", line[5]))) orient = None # If there is no scaffold node present in the alignment, then assuming that it is in the correct orientation. # TODO: Need to find a way to deal with such alignments. @@ -132,26 +136,26 @@ def process_alignment(line, nodes, offset): no = None start = None orient_list = [] - inv = 0 # Whether the alignment has an inversion + inv = 0 # Whether the alignment has an inversion sn = None for n in path: if n in [">", "<"]: orient = n continue - - sn_tag = nodes[n].tags['SN'][1] - bo_tag = int(nodes[n].tags['BO'][1]) - no_tag = int(nodes[n].tags['NO'][1]) - sr_tag = int(nodes[n].tags['SR'][1]) + + sn_tag = nodes[n].tags["SN"][1] + bo_tag = int(nodes[n].tags["BO"][1]) + no_tag = int(nodes[n].tags["NO"][1]) + sr_tag = int(nodes[n].tags["SR"][1]) # Finding the chromosome where the alignment is if sn == None and sr_tag == 0: sn = sn_tag elif sr_tag == 0: assert sn == sn_tag - + if bo_tag == -1 or no_tag == -1: - logger.debug("[ERR]\tOF:i:%d\tND:Z:%s"%(offset,n)) + logger.debug("[ERR]\tOF:i:%d\tND:Z:%s" % (offset, n)) continue # Skipping the non-scaffold nodes if no_tag != 0: @@ -159,23 +163,23 @@ def process_alignment(line, nodes, offset): # Only keeping the orientation of the scaffold nodes in the path matching orient_list.append(orient) # If there are scaffold nodes in both direction, this indicates an inversion - if orient_list.count('>') !=0 and orient_list.count('<') != 0: + if orient_list.count(">") != 0 and orient_list.count("<") != 0: inv = 1 # TODO: What to do when the start node that we have is an untagged node? # One argument is to just sort them to the end and completely ignore them since they dont belong to any vcf bubble. # That works for my tools but not ideal for general purpose. - if orient_list.count('>') < orient_list.count('<'): + if orient_list.count(">") < orient_list.count("<"): l = int(line[6]) e = int(line[8]) - start = l-e + start = l - e n = path[-1] - bo = int(nodes[n].tags['BO'][1]) - no = int(nodes[n].tags['NO'][1]) + bo = int(nodes[n].tags["BO"][1]) + no = int(nodes[n].tags["NO"][1]) else: start = int(line[7]) n = path[1] - bo = int(nodes[n].tags['BO'][1]) - no = int(nodes[n].tags['NO'][1]) + bo = int(nodes[n].tags["BO"][1]) + no = int(nodes[n].tags["NO"][1]) if sn == None: sn = "unknown" return bo, no, start, inv, sn @@ -199,19 +203,19 @@ def compare_gaf(al1, al2): return -1 if al1.BO > al2.BO: return 1 - + # Comparing NO tags if al1.NO < al2.NO: return -1 if al1.BO > al2.BO: return 1 - + # Comparing start position in node if al1.start < al2.start: return -1 if al1.start > al2.start: return 1 - + # This will be execulted only when two reads start at the exact same position at the same node. Then we use offset values. So the read which comes first in the GAF file will come higher up. if al1.offset < al2.offset: return -1 @@ -225,7 +229,7 @@ def write_to_file(line, writer): except TypeError: writer.write(str.encode(line)) - + # fmt: off def add_arguments(parser): arg = parser.add_argument @@ -242,13 +246,17 @@ def add_arguments(parser): "If it is given and --outind is not specified, it will have same file name with .gsi extension)") arg("--bgzip", action='store_true', help="Flag to bgzip the output. Can only be given with --output.") - + + # fmt: on def validate(args, parser): if args.bgzip and not args.outgaf: - parser.error("--bgzip flag has been specified but not output path has been defined. Please define the output path.") + parser.error( + "--bgzip flag has been specified but not output path has been defined. Please define the output path." + ) if args.outind and not args.outgaf: parser.error("index path specified but no output gaf path. Please provide an output path.") - + + def main(args): run_sort(**vars(args)) diff --git a/gaftools/cli/stat.py b/gaftools/cli/stat.py index 84a1b34..08ed448 100644 --- a/gaftools/cli/stat.py +++ b/gaftools/cli/stat.py @@ -17,17 +17,17 @@ def run_stat( cigar_stat=False, output=None, ): - '''This function outputs some statistics of a GAF file. If you run with "--cigar" option, then + """This function outputs some statistics of a GAF file. If you run with "--cigar" option, then cigar related statistics are also output. This increases the running time more than 10 folds because I need to iterate through the cigar of each alignment making it run in O(n^2) - ''' + """ timers = StageTimer() if output is None: output = sys.stdout else: - output = open(output, 'w') + output = open(output, "w") read_names = set() total_aligned_bases = 0 @@ -45,99 +45,114 @@ def run_stat( total_match = 0 total_match_large = 0 total_perfect = 0 - + reads = {} gaf_file = GAF(gaf_path) for alignment_count, mapping in enumerate(gaf_file.read_file(), 1): - #hashed_readname = hash(mapping.query_name) - #read_names.add(hashed_readname) - + # hashed_readname = hash(mapping.query_name) + # read_names.add(hashed_readname) + if not (mapping.is_primary) or (mapping.mapping_quality <= 0): total_secondary += 1 continue - map_ratio = float(mapping.query_end - mapping.query_start)/(mapping.query_length) - seq_identity = (float(mapping.residue_matches)/mapping.alignment_block_length) + map_ratio = float(mapping.query_end - mapping.query_start) / (mapping.query_length) + seq_identity = float(mapping.residue_matches) / mapping.alignment_block_length if mapping.query_name not in reads: - reads[mapping.query_name] = Read(mapping.query_name, mapping.query_length, map_ratio, seq_identity) + reads[mapping.query_name] = Read( + mapping.query_name, mapping.query_length, map_ratio, seq_identity + ) else: reads[mapping.query_name].aln_count += 1 - #reads[mapping.query_name].total_map_ratio += map_ratio - #reads[mapping.query_name].total_seq_identity += seq_identity - + # reads[mapping.query_name].total_map_ratio += map_ratio + # reads[mapping.query_name].total_seq_identity += seq_identity + if reads[mapping.query_name].highest_map_ratio < map_ratio: reads[mapping.query_name].highest_map_ratio = map_ratio if reads[mapping.query_name].highest_seq_identity < seq_identity: reads[mapping.query_name].highest_seq_identity = seq_identity - total_aligned_bases += mapping.residue_matches total_mapq += mapping.mapping_quality total_primary += 1 if cigar_stat: - '''Cigar string analysis''' + """Cigar string analysis""" cigar = mapping.cigar all_cigars = ["".join(x) for _, x in itertools.groupby(cigar, key=str.isdigit)] if len(all_cigars) == 2: total_perfect += 1 - for cnt in range(0, len(all_cigars)-1, 2): - if all_cigars[cnt+1] == "D": + for cnt in range(0, len(all_cigars) - 1, 2): + if all_cigars[cnt + 1] == "D": total_del += 1 if int(all_cigars[cnt]) >= 50: total_del_large += 1 - elif all_cigars[cnt+1] == "I": + elif all_cigars[cnt + 1] == "I": total_ins += 1 if int(all_cigars[cnt]) >= 50: total_ins_large += 1 - elif all_cigars[cnt+1] == "X": + elif all_cigars[cnt + 1] == "X": total_x += 1 if int(all_cigars[cnt]) >= 50: total_x_large += 1 - elif all_cigars[cnt+1] == "=": + elif all_cigars[cnt + 1] == "=": total_match += 1 if int(all_cigars[cnt]) >= 50: total_match_large += 1 gaf_file.close() - #avg_total_seq_identity = 0.0 - #avg_total_map_ratio = 0.0 + # avg_total_seq_identity = 0.0 + # avg_total_map_ratio = 0.0 avg_highest_seq_identity = 0.0 avg_highest_map_ratio = 0.0 - - for k,v in reads.items(): - #avg_total_seq_identity += float(v.total_seq_identity) / v.aln_count - #avg_total_map_ratio += float(v.total_map_ratio) / v.aln_count + + for k, v in reads.items(): + # avg_total_seq_identity += float(v.total_seq_identity) / v.aln_count + # avg_total_map_ratio += float(v.total_map_ratio) / v.aln_count avg_highest_seq_identity += float(v.highest_seq_identity) avg_highest_map_ratio += float(v.highest_map_ratio) - #avg_total_seq_identity /= len(reads) - #avg_total_map_ratio /= len(reads) + # avg_total_seq_identity /= len(reads) + # avg_total_map_ratio /= len(reads) avg_highest_seq_identity /= len(reads) - avg_highest_map_ratio /= len(reads) + avg_highest_map_ratio /= len(reads) print() print("Total alignments:", alignment_count, file=output) print("\tPrimary:", total_primary, file=output) print("\tSecondary:", total_secondary, file=output) print("Reads with at least one alignment:", len(reads), file=output) print("Total aligned bases:", str(total_aligned_bases), file=output) - print("Average mapping quality:", round((total_mapq/alignment_count), 1), file=output) - #print("Average total sequence identity:", round(avg_total_seq_identity, 2), file=output) + print("Average mapping quality:", round((total_mapq / alignment_count), 1), file=output) + # print("Average total sequence identity:", round(avg_total_seq_identity, 2), file=output) print("Average highest sequence identity:", round(avg_highest_seq_identity, 3), file=output) - #print("Average total map ratio:", round(avg_total_map_ratio,2), file=output) + # print("Average total map ratio:", round(avg_total_map_ratio,2), file=output) print("Average highest map ratio:", round(avg_highest_map_ratio, 3), file=output) - + if cigar_stat: - print("Cigar string statistics:\n\tTotal deletion regions: %d (%d >50bps)\n\tTotal insertion regions: %d (%d >50bps)\n\tTotal substitution regions: %d (%d >50bps)\n\tTotal match regions: %d (%d >50bps)" % - (total_del, total_del_large, total_ins, total_ins_large, total_x, total_x_large, - total_match, total_match_large), file=output) - + print( + "Cigar string statistics:\n\tTotal deletion regions: %d (%d >50bps)\n\tTotal insertion regions: %d (%d >50bps)\n\tTotal substitution regions: %d (%d >50bps)\n\tTotal match regions: %d (%d >50bps)" + % ( + total_del, + total_del_large, + total_ins, + total_ins_large, + total_x, + total_x_large, + total_match, + total_match_large, + ), + file=output, + ) + print("Total perfect alignments (exact match):", total_perfect, file=output) - + print() - print("* Numbers are based on primary alignments and the ones with >0 mapping quality", file=output) + print( + "* Numbers are based on primary alignments and the ones with >0 mapping quality", + file=output, + ) logger.info("\n== SUMMARY ==") total_time = timers.total() log_memory_usage() @@ -147,12 +162,15 @@ def run_stat( def add_arguments(parser): arg = parser.add_argument # Positional arguments - arg('gaf_path', metavar='GAF', - help='Input GAF file (can be bgzip-compressed)') - arg('-o', '--output', default=None, - help='Output file. If omitted, use standard output.') - arg('--cigar', dest='cigar_stat', default=False, action='store_true', - help='Outputs cigar related statistics (requires more time)') + arg("gaf_path", metavar="GAF", help="Input GAF file (can be bgzip-compressed)") + arg("-o", "--output", default=None, help="Output file. If omitted, use standard output.") + arg( + "--cigar", + dest="cigar_stat", + default=False, + action="store_true", + help="Outputs cigar related statistics (requires more time)", + ) def validate(args, parser): @@ -161,5 +179,3 @@ def validate(args, parser): def main(args): run_stat(**vars(args)) - - diff --git a/gaftools/cli/view.py b/gaftools/cli/view.py index 125bf9e..b56090e 100644 --- a/gaftools/cli/view.py +++ b/gaftools/cli/view.py @@ -15,47 +15,56 @@ from gaftools.timer import StageTimer from gaftools.gaf import GAF from gaftools.gfa import GFA -from gaftools.conversion import StableNode, stable_to_unstable, unstable_to_stable, to_stable, to_unstable +from gaftools.conversion import ( + StableNode, + stable_to_unstable, + unstable_to_stable, + to_stable, + to_unstable, +) logger = logging.getLogger(__name__) -def run(gaf_path, - gfa=None, - output=None, - index=None, - nodes=[], - regions=[], - format=None - ): - + +def run(gaf_path, gfa=None, output=None, index=None, nodes=[], regions=[], format=None): + timers = StageTimer() - + if output == None: writer = sys.stdout else: - writer = open(output, 'w') + writer = open(output, "w") # Need to detect the format of the input gaf - gaf=GAF(gaf_path) - gaf_format=None + gaf = GAF(gaf_path) + gaf_format = None # checking format in the first 10 lines. for i, gaf_line in enumerate(gaf.read_file()): if i == 10: break if i == 0: - gaf_format=gaf_line.detect_path_format() - assert gaf_format==gaf_line.detect_path_format() + gaf_format = gaf_line.detect_path_format() + assert gaf_format == gaf_line.detect_path_format() gaf.close() - ref_contig=[] - gfa_nodes=None - contig_len={} + ref_contig = [] + gfa_nodes = None + contig_len = {} # if format is given, prepare some objects for use later if format: - if format == 'stable': + if format == "stable": if gaf_format == True: - raise CommandLineError('Input GAF already has stable coordinates. Please remove the --format stable option') + raise CommandLineError( + "Input GAF already has stable coordinates. Please remove the --format stable option" + ) gfa_file = GFA(graph_file=gfa, low_memory=True) - gfa_nodes = {id: StableNode(contig_id=gfa_file[id].tags['SN'][1], start=int(gfa_file[id].tags['SO'][1]), end=int(gfa_file[id].tags['SO'][1])+int(gfa_file[id].tags['LN'][1])) for id in gfa_file.nodes} + gfa_nodes = { + id: StableNode( + contig_id=gfa_file[id].tags["SN"][1], + start=int(gfa_file[id].tags["SO"][1]), + end=int(gfa_file[id].tags["SO"][1]) + int(gfa_file[id].tags["LN"][1]), + ) + for id in gfa_file.nodes + } ref_contig = [contig for contig in gfa_file.contigs if gfa_file.contigs[contig] == 0] # TODO: Update based on Fawaz's branch for contig in gfa_file.contigs: @@ -63,9 +72,11 @@ def run(gaf_path, print(contig_len) del gfa_file else: - assert format == 'unstable' + assert format == "unstable" if gaf_format == False: - raise CommandLineError('Input GAF already has unstable coordinates. Please remove the --format unstable option') + raise CommandLineError( + "Input GAF already has unstable coordinates. Please remove the --format unstable option" + ) reference = defaultdict(lambda: []) # Assuming that the gfa is sorted using the order_gfa function. gfa_file = GFA(graph_file=gfa, low_memory=True) @@ -79,23 +90,25 @@ def run(gaf_path, # now find out what lines to view and how to view if len(nodes) != 0 or len(regions) != 0: if index == None: - index = gaf_path+".gvi" + index = gaf_path + ".gvi" if not os.path.exists(index): - raise CommandLineError("No index found. Please provide the path to the index or create one with gaftools index.") + raise CommandLineError( + "No index found. Please provide the path to the index or create one with gaftools index." + ) ind = None - with open(index, 'rb') as tmp: + with open(index, "rb") as tmp: ind = pickle.load(tmp) - - ind_key = sorted(list(ind.keys()), key = lambda x: (x[1], x[2])) + + ind_key = sorted(list(ind.keys()), key=lambda x: (x[1], x[2])) ind_dict = {} for i in ind_key: ind_dict[i[0]] = i - + if regions: assert nodes == [] nodes = get_unstable(regions, ind) - offsets=ind[ind_dict[nodes[0]]] + offsets = ind[ind_dict[nodes[0]]] for nd in nodes[1:]: # extracting all the lines that touches at least one of the nodes offsets = list(set(offsets) | set(ind[ind_dict[nd]])) @@ -105,12 +118,12 @@ def run(gaf_path, gaf = GAF(gaf_path) # if format specified, have to make the changes. if format: - if format == 'stable': + if format == "stable": for ofs in offsets: line = gaf.read_line(ofs) print(to_stable(line, gfa_nodes, ref_contig, contig_len), file=writer) else: - assert format == 'unstable' + assert format == "unstable" for ofs in offsets: line = gaf.read_line(ofs) print(to_unstable(line, reference), file=writer) @@ -124,7 +137,7 @@ def run(gaf_path, # No nodes or regions indicates the entire file will be viewed # converting the format of the entire file if format: - if format == 'stable': + if format == "stable": for line in unstable_to_stable(gaf_path, gfa_nodes, ref_contig, contig_len): print(line, file=writer) else: @@ -139,7 +152,6 @@ def run(gaf_path, else: print(line.rstrip(), file=writer) - logger.info("\n== SUMMARY ==") total_time = timers.total() log_memory_usage() @@ -147,50 +159,51 @@ def run(gaf_path, def get_unstable(regions, index): - """ Takes the regions and returns the node IDs """ + """Takes the regions and returns the node IDs""" contig = [x.split(":")[0] for x in regions] node_dict = {} start = [x.split(":")[1].split("-")[0] for x in regions] end = [x.split(":")[1].split("-")[-1] for x in regions] - + result = [] for n, c in enumerate(contig): - + try: node_list = node_dict[c] except KeyError: - node_list = list(filter(lambda x: (x[1] == c),list(index.keys()))) - node_list.sort(key = lambda x: x[2]) + node_list = list(filter(lambda x: (x[1] == c), list(index.keys()))) + node_list.sort(key=lambda x: x[2]) node_dict[c] = node_list - + node = search([contig[n], start[n], end[n]], node_list) if len(node) > 1: - logger.info("INFO: Region %s spans multiple nodes.\nThe nodes are:"%(node[n])) + logger.info("INFO: Region %s spans multiple nodes.\nThe nodes are:" % (node[n])) for n in node: - logger.info("INFO: %s\t%s\t%d\t%d"%(n[0],n[1],n[2],n[3])) - + logger.info("INFO: %s\t%s\t%d\t%d" % (n[0], n[1], n[2], n[3])) + result.append(node[0][0]) - - return result + + return result + def search(node, node_list): - """ Find the unstable node id from the region """ - + """Find the unstable node id from the region""" + s = 0 pos = 0 - e = len(node_list)-1 + e = len(node_list) - 1 q_s = int(node[1]) q_e = int(node[2]) - while (s != e): - m = int((s+e)/2) + while s != e: + m = int((s + e) / 2) if (q_s >= node_list[m][2]) and (q_s < node_list[m][3]): pos = m break - elif (q_s >= node_list[m][3]): - s = m+1 + elif q_s >= node_list[m][3]: + s = m + 1 else: - e = m-1 + e = m - 1 pos = s # if there is only one node for the entire contig (case for non-reference nodes) # then the above loop is not executed and we extract the only node with pos=0 @@ -225,16 +238,18 @@ def add_arguments(parser): 'Multiple can be provided (Eg. gaftools view .... -r chr1:10-20 -r chr1:50-60 .....).') arg('-f', '--format', dest='format', metavar='FORMAT', help='format of output path (unstable | stable)') - + # fmt: on + def validate(args, parser): - if args.format and (args.format not in ['unstable', 'stable']): + if args.format and (args.format not in ["unstable", "stable"]): parser.error("--format only accepts unstable or stable as input.") if args.nodes and args.regions: parser.error("provide either of the --regions and --nodes options and not both.") if args.format and not args.gfa: parser.error("GFA file has to be provided along with --format.") + def main(args): run(**vars(args)) diff --git a/gaftools/conversion.py b/gaftools/conversion.py index 355e5ca..6aab41f 100644 --- a/gaftools/conversion.py +++ b/gaftools/conversion.py @@ -1,6 +1,7 @@ """ Core scripts for converting coordinates. """ + import logging import re @@ -10,6 +11,7 @@ logger = logging.getLogger(__name__) + # Stable Node representation of the Nodes. # Used in the coversion from unstable to stable coordinate system. class StableNode: @@ -17,20 +19,20 @@ def __init__(self, contig_id, start, end): self.contig_id = contig_id self.start = start self.end = end - + def to_string(self, orient): - return "%s%s:%d-%d"%(orient,self.contig_id,self.start,self.end) + return "%s%s:%d-%d" % (orient, self.contig_id, self.start, self.end) def merge_nodes(node1, node2, orient1, orient2): - + if (node1.contig_id != node2.contig_id) or (orient1 != orient2): return False if (orient1 == ">") and (node1.end != node2.start): return False if (orient1 == "<") and (node1.start != node2.end): return False - if (orient1 == "<"): + if orient1 == "<": node = StableNode(node1.contig_id, node2.start, node1.end) else: node = StableNode(node1.contig_id, node1.start, node2.end) @@ -38,27 +40,29 @@ def merge_nodes(node1, node2, orient1, orient2): def stable_to_unstable(gaf_path, reference): - ''' + """ This function converts a GAF file (mappings to a pangenome graph) into unstable coordinate. It does not expect sorted input however it strictly assumes that SO, LN and SN tags are available in the rGFA. - ''' + """ gaf_input = GAF(gaf_path) for gaf_line in gaf_input.read_file(): yield to_unstable(gaf_line, reference) gaf_input.close() + def unstable_to_stable(gaf_path, nodes, ref_contig, contig_len): - '''This function converts a gaf file (mappings to a pangenome graph). It does not need sorted + """This function converts a gaf file (mappings to a pangenome graph). It does not need sorted input however it strictly assumes that SO, LN and SN tags are available... - ''' + """ gaf_input = GAF(gaf_path) for gaf_line in gaf_input.read_file(): yield to_stable(gaf_line, nodes, ref_contig, contig_len) gaf_input.close() + # separate function for converting lines to unstable coordinate def to_unstable(gaf_line, reference): - gaf_contigs = list(filter(None, re.split('(>)|(<)', gaf_line.path))) + gaf_contigs = list(filter(None, re.split("(>)|(<)", gaf_line.path))) assert len(gaf_contigs) >= 1 unstable_coord = "" @@ -70,10 +74,10 @@ def to_unstable(gaf_line, reference): if nd == ">" or nd == "<": orient = nd continue - if ':' in nd and '-' in nd: - tmp = nd.rstrip().split(':') + if ":" in nd and "-" in nd: + tmp = nd.rstrip().split(":") query_contig_name = tmp[0] - (query_start, query_end) = tmp[1].rstrip().split('-') + (query_start, query_end) = tmp[1].rstrip().split("-") split_contig = True else: query_start = gaf_line.path_start @@ -85,13 +89,19 @@ def to_unstable(gaf_line, reference): orient = ">" else: orient = "<" - - '''Find the matching nodes from the reference genome here''' - start, end = utils.search_intervals(reference[query_contig_name], int(query_start), int(query_end), 0, len(reference[query_contig_name])) + + """Find the matching nodes from the reference genome here""" + start, end = utils.search_intervals( + reference[query_contig_name], + int(query_start), + int(query_end), + 0, + len(reference[query_contig_name]), + ) nodes_tmp = [] - for i in reference[query_contig_name][start:end+1]: - s = int(i.tags['SO'][1]) - e = int(i.tags['SO'][1]) + int(i.tags['LN'][1]) + for i in reference[query_contig_name][start : end + 1]: + s = int(i.tags["SO"][1]) + e = int(i.tags["SO"][1]) + int(i.tags["LN"][1]) cases = -1 if s <= int(query_start) < e: cases = 1 @@ -104,18 +114,18 @@ def to_unstable(gaf_line, reference): cases = 2 elif int(query_start) < s < e < int(query_end): cases = 3 - + if cases != -1: nodes_tmp.append(i.id) - new_total += (e - s) - + new_total += e - s + if orient == "<": for i in reversed(nodes_tmp): - unstable_coord += orient+i + unstable_coord += orient + i else: for i in nodes_tmp: - unstable_coord += orient+i - + unstable_coord += orient + i + if gaf_line.strand == "-": if split_contig: new_total = gaf_line.path_length @@ -132,17 +142,27 @@ def to_unstable(gaf_line, reference): else: new_end = new_start + (gaf_line.path_end - gaf_line.path_start) - new_line = "%s\t%s\t%s\t%s\t+\t%s\t%d\t%d\t%d\t%d\t%d\t%d"%(gaf_line.query_name, gaf_line.query_length, gaf_line.query_start, gaf_line.query_end, - unstable_coord, new_total, new_start, new_end, gaf_line.residue_matches, - gaf_line.alignment_block_length, gaf_line.mapping_quality) - - #Add cigar in reverse + new_line = "%s\t%s\t%s\t%s\t+\t%s\t%d\t%d\t%d\t%d\t%d\t%d" % ( + gaf_line.query_name, + gaf_line.query_length, + gaf_line.query_start, + gaf_line.query_end, + unstable_coord, + new_total, + new_start, + new_end, + gaf_line.residue_matches, + gaf_line.alignment_block_length, + gaf_line.mapping_quality, + ) + + # Add cigar in reverse if gaf_line.strand == "-": new_cigar = utils.reverse_cigar(gaf_line.cigar) - gaf_line.tags['cg:Z:'] = new_cigar + gaf_line.tags["cg:Z:"] = new_cigar for k in gaf_line.tags.keys(): - new_line += "\t%s%s"%(k,gaf_line.tags[k]) + new_line += "\t%s%s" % (k, gaf_line.tags[k]) return new_line @@ -151,12 +171,12 @@ def to_stable(gaf_line, nodes, ref_contig, contig_len): reverse_flag = False new_total = None new_start = None - gaf_nodes = list(filter(None, re.split('(>)|(<)', gaf_line.path))) + gaf_nodes = list(filter(None, re.split("(>)|(<)", gaf_line.path))) node_list = [] stable_coord = "" orient = None - new_line="" - + new_line = "" + for nd in gaf_nodes: if nd == ">" or nd == "<": orient = nd @@ -169,15 +189,15 @@ def to_stable(gaf_line, nodes, ref_contig, contig_len): for i in range(len(node_list) - 1): n1 = out_node[-1][0] o1 = out_node[-1][1] - n2 = node_list[i+1][0] - o2 = node_list[i+1][1] + n2 = node_list[i + 1][0] + o2 = node_list[i + 1][1] node_merge = merge_nodes(n1, n2, o1, o2) if node_merge == False: stable_coord += n1.to_string(o1) out_node.append([n2, o2]) else: out_node[-1] = node_merge - + if len(out_node) == 1 and out_node[0][0].contig_id in ref_contig: if out_node[0][1] == "<": reverse_flag = True @@ -185,7 +205,7 @@ def to_stable(gaf_line, nodes, ref_contig, contig_len): new_start = out_node[0][0].start + gaf_line.path_length - gaf_line.path_end else: new_start = out_node[0][0].start + gaf_line.path_start - + stable_coord = out_node[0][0].contig_id new_total = contig_len[stable_coord] else: @@ -193,18 +213,28 @@ def to_stable(gaf_line, nodes, ref_contig, contig_len): new_start = gaf_line.path_start new_total = gaf_line.path_length - new_line+="%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d" %(gaf_line.query_name, - gaf_line.query_length, gaf_line.query_start, gaf_line.query_end, gaf_line.strand, - stable_coord, new_total, new_start, new_start + gaf_line.path_end - gaf_line.path_start, - gaf_line.residue_matches, gaf_line.alignment_block_length, gaf_line.mapping_quality) - - #Add cigar in reverse + new_line += "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d" % ( + gaf_line.query_name, + gaf_line.query_length, + gaf_line.query_start, + gaf_line.query_end, + gaf_line.strand, + stable_coord, + new_total, + new_start, + new_start + gaf_line.path_end - gaf_line.path_start, + gaf_line.residue_matches, + gaf_line.alignment_block_length, + gaf_line.mapping_quality, + ) + + # Add cigar in reverse if reverse_flag: new_cigar = utils.reverse_cigar(gaf_line.cigar) - gaf_line.tags['cg:Z:'] = new_cigar - + gaf_line.tags["cg:Z:"] = new_cigar + # adding tags in the sequence it was found for k in gaf_line.tags.keys(): - new_line+="\t%s%s"%(k,gaf_line.tags[k]) + new_line += "\t%s%s" % (k, gaf_line.tags[k]) - return new_line \ No newline at end of file + return new_line diff --git a/gaftools/gaf.py b/gaftools/gaf.py index e8cdb13..8712a0f 100644 --- a/gaftools/gaf.py +++ b/gaftools/gaf.py @@ -6,12 +6,30 @@ logger = logging.getLogger(__name__) -complement = str.maketrans('ACGT', 'TGCA') +complement = str.maketrans("ACGT", "TGCA") + class Alignment: - def __init__(self, query_name, query_length, query_start, query_end, strand, path, path_length, path_start, - path_end, residue_matches, alignment_block_length, mapping_quality, is_primary, - cigar, cigar_length=None, score=None, tags=None): + def __init__( + self, + query_name, + query_length, + query_start, + query_end, + strand, + path, + path_length, + path_start, + path_end, + residue_matches, + alignment_block_length, + mapping_quality, + is_primary, + cigar, + cigar_length=None, + score=None, + tags=None, + ): self.query_name = query_name self.query_length = query_length @@ -31,27 +49,38 @@ def __init__(self, query_name, query_length, query_start, query_end, strand, pat self.is_primary = is_primary self.duplicate = False self.tags = tags - + # detect whether the paths are in stable or unstable coordinates # returns True for stable def detect_path_format(self): - if ':' in self.path: + if ":" in self.path: # this is possible for paths of the form :- return True - if '>' in self.path or '<' in self.path: + if ">" in self.path or "<" in self.path: # if : is not present, then path can be of form or unstable coordinate. Check unstable by looking for > or < return False # only possibility left is of form return True - + # return the alignment as a string def __str__(self): - line = "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d"%(self.query_name, self.query_length, self.query_start, self.query_end, - self.strand, self.path, self.path_length, self.path_start, self.path_end, self.residue_matches, - self.alignment_block_length, self.mapping_quality) - self.tags['cg:Z:'] = self.cigar + line = "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d" % ( + self.query_name, + self.query_length, + self.query_start, + self.query_end, + self.strand, + self.path, + self.path_length, + self.path_start, + self.path_end, + self.residue_matches, + self.alignment_block_length, + self.mapping_quality, + ) + self.tags["cg:Z:"] = self.cigar for k in self.tags.keys(): - line += "\t%s%s"%(k,self.tags[k]) + line += "\t%s%s" % (k, self.tags[k]) return line @@ -71,12 +100,12 @@ def __init__(self, filename): self.file = None self.gz_flag = False if utils.is_file_gzipped(filename): - # gaf file has to be gzipped with bgzip to access blocks - self.file = libcbgzf.BGZFile(filename,"rb") + # gaf file has to be gzipped with bgzip to access blocks + self.file = libcbgzf.BGZFile(filename, "rb") self.gz_flag = True else: - self.file = open(filename,"r") - + self.file = open(filename, "r") + # closing the file def close(self): self.file.close() @@ -91,12 +120,12 @@ def read_line(self, offset): def parse_gaf_line(self, line): if not self.gz_flag: - fields = line.rstrip().split('\t') + fields = line.rstrip().split("\t") else: - fields = line.decode("utf-8").rstrip().split('\t') + fields = line.decode("utf-8").rstrip().split("\t") # If the query name has spaces (e.g., GraphAligner), we get rid of the segment after the space - query_name = fields[0].split(' ')[0] + query_name = fields[0].split(" ")[0] if fields[1].isdigit(): query_length = int(fields[1]) else: @@ -130,7 +159,7 @@ def parse_gaf_line(self, line): is_primary = True cigar = "" - #Check if there are additional tags + # Check if there are additional tags tags = {} for k in fields: if re.match("[A-Za-z][A-Za-z0-9]:[AifZHB]:[A-Za-z0-9]+", k): @@ -146,10 +175,24 @@ def parse_gaf_line(self, line): if pattern == "tp:A" and (val != "P" or val != "p"): is_primary = False - - return Alignment(query_name, query_length, query_start, query_end, strand, path, - path_length, path_start, path_end, residue_matches, alignment_block_length, - mapping_quality, is_primary, cigar, tags=tags) + + return Alignment( + query_name, + query_length, + query_start, + query_end, + strand, + path, + path_length, + path_start, + path_end, + residue_matches, + alignment_block_length, + mapping_quality, + is_primary, + cigar, + tags=tags, + ) def compare_aln(ln1, ln2): @@ -163,6 +206,3 @@ def compare_aln(ln1, ln2): return -1 else: return 1 - - - diff --git a/gaftools/gfa.py b/gaftools/gfa.py index ac87646..feb6408 100644 --- a/gaftools/gfa.py +++ b/gaftools/gfa.py @@ -12,7 +12,7 @@ class Node: - __slots__ = ('id', 'seq', 'seq_len', 'start', 'end', 'visited', 'tags') + __slots__ = ("id", "seq", "seq_len", "start", "end", "visited", "tags") def __init__(self, identifier): self.id = identifier @@ -81,7 +81,9 @@ def in_direction(self, other, direction): return True return False else: - raise ValueError(f"Trying to access a wrong direction in node {self.id}, give 0 for start or 1 for end") + raise ValueError( + f"Trying to access a wrong direction in node {self.id}, give 0 for start or 1 for end" + ) def children(self, direction): """ @@ -92,7 +94,9 @@ def children(self, direction): elif direction == 1: return [x[0] for x in self.end] else: - raise ValueError(f"Trying to access a wrong direction in node {self.id}, give 0 for start or 1 for end") + raise ValueError( + f"Trying to access a wrong direction in node {self.id}, give 0 for start or 1 for end" + ) def remove_from_start(self, neighbor, side, overlap): """ @@ -103,7 +107,8 @@ def remove_from_start(self, neighbor, side, overlap): self.start.remove((neighbor, side, overlap)) except KeyError: logging.warning( - f"Could not remove edge {(neighbor, side, overlap)} from {self.id}'s start as it does not exist") + f"Could not remove edge {(neighbor, side, overlap)} from {self.id}'s start as it does not exist" + ) def remove_from_end(self, neighbor, side, overlap): """ @@ -114,7 +119,8 @@ def remove_from_end(self, neighbor, side, overlap): self.end.remove((neighbor, side, overlap)) except KeyError: logging.warning( - f"Could not remove edge {(neighbor, side, overlap)} from {self.id}'s end as it does not exist") + f"Could not remove edge {(neighbor, side, overlap)} from {self.id}'s end as it does not exist" + ) def add_from_start(self, neighbor, side, overlap): """ @@ -149,12 +155,14 @@ class GFA: Graph object containing the important information about the graph """ - __slots__ = ['nodes', 'low_memory', 'edge_tags', 'contigs'] + __slots__ = ["nodes", "low_memory", "edge_tags", "contigs"] def __init__(self, graph_file=None, low_memory=False): self.nodes = dict() self.edge_tags = dict() - self.contigs = defaultdict(lambda: None) # storing contigs with their SR tag to determine primary contig + self.contigs = defaultdict( + lambda: None + ) # storing contigs with their SR tag to determine primary contig self.low_memory = low_memory if graph_file: if not os.path.exists(graph_file): @@ -276,20 +284,23 @@ def add_node(self, node_id, seq="", tags=None): for tag in tags: if not is_correct_tag(tag): raise ValueError( - f"The tag {tag} for node {node_id} did not match the specifications, check sam specification on tags") + f"The tag {tag} for node {node_id} did not match the specifications, check sam specification on tags" + ) tag = tag.split(":") # I am adding the tags as key:value, key is tag_name:type and value is the value at the end # e.g. SN:i:10 will be {"SN": ('i', '10')} self[node_id].tags[tag[0]] = (tag[1], tag[2]) # (type, value) - if 'SN' in self[node_id].tags and 'SR' in self[node_id].tags: - contig_name = self[node_id].tags['SN'][1] - contig_rank = int(self[node_id].tags['SR'][1]) + if "SN" in self[node_id].tags and "SR" in self[node_id].tags: + contig_name = self[node_id].tags["SN"][1] + contig_rank = int(self[node_id].tags["SR"][1]) if self.contigs[contig_name] == None: self.contigs[contig_name] = contig_rank else: assert self.contigs[contig_name] == contig_rank else: - logging.warning(f"You are trying to add node {node_id} and it already exists in the graph") + logging.warning( + f"You are trying to add node {node_id} and it already exists in the graph" + ) def add_edge(self, node1, node1_dir, node2, node2_dir, overlap, tags=None): assert node1_dir in {"+", "-"} @@ -316,10 +327,9 @@ def remove_lonely_nodes(self): for i in nodes_to_remove: self.remove_node(i) - def write_graph(self, set_of_nodes=None, - output_file="output_graph.gfa", - append=False, - order_bo=False): + def write_graph( + self, set_of_nodes=None, output_file="output_graph.gfa", append=False, order_bo=False + ): """ writes a graph file as GFA Can be given a set of nodes to only write those nodes with their edges @@ -328,8 +338,13 @@ def write_graph(self, set_of_nodes=None, if not output_file.endswith(".gfa"): output_file += ".gfa" # print("I am here") - self.write_gfa(self, set_of_nodes=set_of_nodes, output_file=output_file, - append=append, order_bo=order_bo) + self.write_gfa( + self, + set_of_nodes=set_of_nodes, + output_file=output_file, + append=append, + order_bo=order_bo, + ) def output_components(self, output_dir): """ @@ -400,15 +415,16 @@ def sort_bo_no(self, set_of_nodes): """ separate_bubbles = dict() for n in set_of_nodes: - if self[n].tags['BO'][1] not in separate_bubbles: - separate_bubbles[self[n].tags['BO'][1]] = [n] + if self[n].tags["BO"][1] not in separate_bubbles: + separate_bubbles[self[n].tags["BO"][1]] = [n] else: - separate_bubbles[self[n].tags['BO'][1]].append(n) + separate_bubbles[self[n].tags["BO"][1]].append(n) bo_ids = [] for bo, n_list in separate_bubbles.items(): bo_ids.append(bo) - separate_bubbles[bo] = sorted(separate_bubbles[bo], - key=lambda x: int(self.nodes[x].tags['NO'][1])) # sorting the BO bucket by NO + separate_bubbles[bo] = sorted( + separate_bubbles[bo], key=lambda x: int(self.nodes[x].tags["NO"][1]) + ) # sorting the BO bucket by NO sorted_set_of_nodes = [] for bo in sorted(bo_ids): @@ -416,7 +432,9 @@ def sort_bo_no(self, set_of_nodes): sorted_set_of_nodes.append(n_id) return sorted_set_of_nodes - def write_gfa(self, set_of_nodes=None, output_file="output_file.gfa", append=False, order_bo=False): + def write_gfa( + self, set_of_nodes=None, output_file="output_file.gfa", append=False, order_bo=False + ): """ Write a gfa out @@ -441,8 +459,9 @@ def write_gfa(self, set_of_nodes=None, output_file="output_file.gfa", append=Fal if os.path.exists(output_file): f = open(output_file, "a") else: - logging.warning("Trying to append to a non-existent file\n" - "creating an output file") + logging.warning( + "Trying to append to a non-existent file\n" "creating an output file" + ) f = open(output_file, "w+") # going through nodes twice to have the final output as all S line then all L lines @@ -471,10 +490,14 @@ def write_gfa(self, set_of_nodes=None, output_file="output_file.gfa", append=Fal if tags[0] == 0: tags = [] if n[1] == 0: - edge = str("\t".join(["L", str(n1), "-", str(n[0]), "+", overlap] + tags)) + edge = str( + "\t".join(["L", str(n1), "-", str(n[0]), "+", overlap] + tags) + ) edges.append(edge) else: - edge = str("\t".join(["L", str(n1), "-", str(n[0]), "-", overlap] + tags)) + edge = str( + "\t".join(["L", str(n1), "-", str(n[0]), "-", overlap] + tags) + ) edges.append(edge) for n in self.nodes[n1].end: @@ -489,10 +512,14 @@ def write_gfa(self, set_of_nodes=None, output_file="output_file.gfa", append=Fal if tags[0] == 0: tags = [] if n[1] == 0: - edge = str("\t".join(["L", str(n1), "+", str(n[0]), "+", overlap] + tags)) + edge = str( + "\t".join(["L", str(n1), "+", str(n[0]), "+", overlap] + tags) + ) edges.append(edge) else: - edge = str("\t".join(["L", str(n1), "+", str(n[0]), "-", overlap] + tags)) + edge = str( + "\t".join(["L", str(n1), "+", str(n[0]), "-", overlap] + tags) + ) edges.append(edge) for e in edges: @@ -663,16 +690,18 @@ def get_path(self, chrom): nodes_of_chrom.append(n) else: logging.warning( - f"Not able to get path for {chrom}, because node {n} doesn't have an SN tag. Stopping! Returning empty list") + f"Not able to get path for {chrom}, because node {n} doesn't have an SN tag. Stopping! Returning empty list" + ) return list() # sorting based on SO tags, so we get start to end of chromosome - sorted_nodes = sorted(nodes_of_chrom, key=lambda x: int(self.nodes[x].tags['SO'][1])) + sorted_nodes = sorted(nodes_of_chrom, key=lambda x: int(self.nodes[x].tags["SO"][1])) # this sorted list should be a path already spelling the chromosome if self.list_is_path(sorted_nodes): return sorted_nodes else: logging.warning( - f"The sorted nodes with SN tag {chrom} did not create a linear path. Stopping! Returning empty list") + f"The sorted nodes with SN tag {chrom} did not create a linear path. Stopping! Returning empty list" + ) return list() def get_contig_length(self, chrom): @@ -681,11 +710,12 @@ def get_contig_length(self, chrom): """ sorted_nodes = self.get_path(chrom) if not sorted_nodes: - logging.error("Was not able to return the length of the chromosome, check warning message(s)") + logging.error( + "Was not able to return the length of the chromosome, check warning message(s)" + ) sys.exit(1) else: - return sum([int(self.nodes[x].tags['LN'][1]) for x in sorted_nodes]) - + return sum([int(self.nodes[x].tags["LN"][1]) for x in sorted_nodes]) def extract_path(self, path): """ @@ -696,7 +726,7 @@ def extract_path(self, path): if not self.path_exists(path): return "" - for n in re.findall('[><][^><]+', path): + for n in re.findall("[><][^><]+", path): if n[1:] not in self: logging.error(f"The node {n[1:]} in path {path} does not seem to exist in this GFA") return "" diff --git a/gaftools/utils.py b/gaftools/utils.py index edbc7fd..12cd244 100644 --- a/gaftools/utils.py +++ b/gaftools/utils.py @@ -2,7 +2,7 @@ import tracemalloc import linecache -complement = str.maketrans('ACGT', 'TGCA') +complement = str.maketrans("ACGT", "TGCA") tag_regex = r"^[A-Za-z][A-Za-z][:][AifZHB][:][ !-~]*$" types_regex = { @@ -11,8 +11,8 @@ "f": r"^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?$", "Z": r"^[ !-~]*$", "H": r"^([0-9A-F][0-9A-F])*$", - "B": r"^[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)*$" - } + "B": r"^[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)*$", +} def rev_comp(seq): @@ -31,20 +31,20 @@ def is_correct_tag(tag): def is_file_gzipped(src): with open(src, "rb") as inp: - return inp.read(2) == b'\x1f\x8b' + return inp.read(2) == b"\x1f\x8b" def search_intervals(intervals, query_start, query_end, start, end): - '''Given the start-end coordinates in the GFA file for the given contig (SO, SO+LN), it + """Given the start-end coordinates in the GFA file for the given contig (SO, SO+LN), it searches for the given (query_start, query_end) matches. (query_start, query_end) is the start and end location of a mapping in the gaf file. - ''' - - if start <= end: + """ + + if start <= end: mid = start + (end - start) // 2 - if query_end <= int(intervals[mid].tags['SO'][1]): + if query_end <= int(intervals[mid].tags["SO"][1]): return search_intervals(intervals, query_start, query_end, start, mid - 1) - elif query_start >= int(intervals[mid].tags['SO'][1]) + int(intervals[mid].tags['LN'][1]): + elif query_start >= int(intervals[mid].tags["SO"][1]) + int(intervals[mid].tags["LN"][1]): return search_intervals(intervals, query_start, query_end, mid + 1, end) else: return start, end @@ -54,19 +54,22 @@ def search_intervals(intervals, query_start, query_end, start, end): def reverse_cigar(cg): import itertools + cg = cg[5:] all_cigars = ["".join(x) for _, x in itertools.groupby(cg, key=str.isdigit)] new_cigar = "cg:Z:" for i in range(len(all_cigars), 0, -2): - new_cigar += str(all_cigars[i-2]) + str(all_cigars[i-1]) + new_cigar += str(all_cigars[i - 2]) + str(all_cigars[i - 1]) return new_cigar -def display_top(snapshot, key_type='lineno', limit=3): - snapshot = snapshot.filter_traces(( - tracemalloc.Filter(False, ""), - tracemalloc.Filter(False, ""), - )) +def display_top(snapshot, key_type="lineno", limit=3): + snapshot = snapshot.filter_traces( + ( + tracemalloc.Filter(False, ""), + tracemalloc.Filter(False, ""), + ) + ) top_stats = snapshot.statistics(key_type) print("Top %s lines" % limit) @@ -74,15 +77,14 @@ def display_top(snapshot, key_type='lineno', limit=3): frame = stat.traceback[0] # replace "/path/to/module/file.py" with "module/file.py" filename = "/".join(frame.filename.split("/")[-2:]) - print("#%s: %s:%s: %.1f KiB" - % (index, filename, frame.lineno, stat.size / 1024)) + print("#%s: %s:%s: %.1f KiB" % (index, filename, frame.lineno, stat.size / 1024)) line = linecache.getline(frame.filename, frame.lineno).strip() if line: - print(' %s' % line) + print(" %s" % line) other = top_stats[limit:] if other: size = sum(stat.size for stat in other) print("%s other: %.1f KiB" % (len(other), size / 1024)) total = sum(stat.size for stat in top_stats) - print("Total allocated size: %.1f KiB" % (total / 1024)) \ No newline at end of file + print("Total allocated size: %.1f KiB" % (total / 1024)) diff --git a/setup.py b/setup.py index eff5251..e236aac 100644 --- a/setup.py +++ b/setup.py @@ -13,4 +13,4 @@ setup( use_scm_version={"write_to": "gaftools/_version.py"}, install_requires=install_requires, -) \ No newline at end of file +) diff --git a/tests/test_extract_path.py b/tests/test_extract_path.py index 0cf3008..8e76c02 100644 --- a/tests/test_extract_path.py +++ b/tests/test_extract_path.py @@ -1,23 +1,24 @@ """ Tests for 'gaftools realign' """ + from gaftools.gfa import GFA def test_realign_gaf(): graph = GFA("tests/data/smallgraph_withN.gfa") - + s1_seq = "CACCCTAAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTCACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTTAACCCTTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTACCCCTACCCCTACCCCTACCCCTACCCCTAACCCTACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCCTAACCCTAACCCTAACCCTAACCCTAACCCGAACCCGAACCCGAACCCGAACCCGAACCCGAACCCGAACCCGAACCCGAACCCTTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTTAACCCTAACCCTAACCCTAACCCGACCCTGACCCTGACCCTGACCCTGACCCTGGACCCTGACCCTGACCCTGACCCTGACCCTGACCCTGACCCTGACCCTAACCCTGACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCGACCCTGACCCTGACCCTGACCCTGACCCTGACCCTGACCCTGACCCTAACGCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCCTAAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTCTCTGCAGGCGCAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGTCCCCTGGGGGGCGGGGGGAGGTGCGGCGCAGGCGCACAGAGGCGCGTCCCCTGGGGGGCGGGGGGAGGCGCGG" s1_seq_rev = "CCGCGCCTCCCCCCGCCCCCCAGGGGACGCGCCTCTGTGCGCCTGCGCCGCACCTCCCCCCGCCCCCCAGGGGACGCGCCTCTCTGCGCCTGCGCCGGCGCGCCGCGCCTCTCTGCGCCTGCGCCGGCGCGCCGCGCCTCTCTGCGCCTGCGCCGGCGCGCCGCGCCTCTCTGCGCCTGCGCCGGCGCGCCGCGCCTCTCTGCGCCTGCGCCGGCGCGCCGCGCCTCTCTGCGCCTGCGCCGGCGCGCCGCGCCTCTCTGCGCCTGCGCCGGCGCGCCGCGCCTCTCTGCGCCTGCGCCGGCGCGCCGCGCCTCTCTGCGCCTGCGCCGGCGCGCCGCGCCTCTCTGCGCCTGCGCCGGCGCGCCGCGCCTCTCTGCGCCTGCGCCGGCGCGCCGCGCCTCTCTGCGCCTGCGCCGGCGCGCCGCGCCTCTCTGCGCCTGCGCCGGCGCGCCGCGCCTCTCTGCGCCTGCGCCGGCGCGCCGCGCCTCTCTGCGCCTGCGCCGGCGCGCCGCGCCTCTCTGCGCCTGCGCCGGCGCGCCGCGCCTCTCTGCGCCTGCGCCGGCGCGCCGCGCCTCTGCGCCTGCAGAGAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTTAGGGTTAGGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTTAGGGTTAGGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTTAGGGGTTAGGGTTAGGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTTAGGGTTAGCGTTAGGGTCAGGGTCAGGGTCAGGGTCAGGGTCAGGGTCAGGGTCAGGGTCGGGTTGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTCAGGGTTAGGGTCAGGGTCAGGGTCAGGGTCAGGGTCAGGGTCAGGGTCAGGGTCCAGGGTCAGGGTCAGGGTCAGGGTCAGGGTCGGGTTAGGGTTAGGGTTAGGGTTAAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAAGGGTTCGGGTTCGGGTTCGGGTTCGGGTTCGGGTTCGGGTTCGGGTTCGGGTTCGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTGGGGTTGGGGTTGGGGTTGGGGTTGGGGTTGGGGTTGGGGTAGGGTTAGGGGTAGGGGTAGGGGTAGGGGTAGGGGTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAAGGGTTAAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTGAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTTAGGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTTAGGGTTTAGGGTG" s2_seq = "CGCAGGCGCACAGTCANNNNNNNNCACGCCACCGGGACAGGAGCGCGGGGGTGGAGCGTTGTAGGCAGAGAGACGCACGTCCCCGGGGGCGCGGCACAGAGACGGGTTGAACCTCAGTAATCCGAAAAGCCGGGCTCGGGCGCCCCCTGCTTGCAGCCGGGCACTACAGGACCCGCTTACCCGCGGTGCTTTCCCAGTGCGCCCCCTGCTAGCGGCTAGGACAACTGCAGGGCCCTCTTGCTTACAGTGGTGGCCAGCGTCCCCTGCTGGCGCCGGGGCACTGCAGGGCTCTCTTGCTCGCTAAATAGTGGCGGCACGCCGCCTGCTGGCAGCTAGGGACGTTGCAGGG" s2_seq_rev = "CCCTGCAACGTCCCTAGCTGCCAGCAGGCGGCGTGCCGCCACTATTTAGCGAGCAAGAGAGCCCTGCAGTGCCCCGGCGCCAGCAGGGGACGCTGGCCACCACTGTAAGCAAGAGGGCCCTGCAGTTGTCCTAGCCGCTAGCAGGGGGCGCACTGGGAAAGCACCGCGGGTAAGCGGGTCCTGTAGTGCCCGGCTGCAAGCAGGGGGCGCCCGAGCCCGGCTTTTCGGATTACTGAGGTTCAACCCGTCTCTGTGCCGCGCCCCCGGGGACGTGCGTCTCTCTGCCTACAACGCTCCACCCCCGCGCTCCTGTCCCGGTGGCGTGNNNNNNNNTGACTGTGCGCCTGCG" - + s3_seq = "CGCAGG" s3_seq_rev = "CCTGCG" s4_seq = "CGCANNNGG" s4_seq_rev = "CCNNNTGCG" - + path1 = ">s1" path2 = "s8>s9' + assert gaf_lines[0].strand == "+" + assert gaf_lines[0].path == ">s8>s9" assert gaf_lines[0].path_length == 10109 assert gaf_lines[0].path_start == 6166 assert gaf_lines[0].path_end == 6564 assert gaf_lines[0].residue_matches == 398 assert gaf_lines[0].alignment_block_length == 398 assert gaf_lines[0].mapping_quality == 60 - assert gaf_lines[0].cigar == '398=' + assert gaf_lines[0].cigar == "398=" assert gaf_lines[0].is_primary - assert gaf_lines[1].query_name == 'read_s8_s9_deletion15' + assert gaf_lines[1].query_name == "read_s8_s9_deletion15" assert gaf_lines[1].query_length == 383 assert gaf_lines[1].query_start == 0 assert gaf_lines[1].query_end == 383 - assert gaf_lines[1].strand == '+' - assert gaf_lines[1].path == '>s8>s9' + assert gaf_lines[1].strand == "+" + assert gaf_lines[1].path == ">s8>s9" assert gaf_lines[1].path_length == 10109 assert gaf_lines[1].path_start == 6166 assert gaf_lines[1].path_end == 6564 assert gaf_lines[1].residue_matches == 383 assert gaf_lines[1].alignment_block_length == 398 assert gaf_lines[1].mapping_quality == 60 - assert gaf_lines[1].cigar == '189=2D1=13D193=' + assert gaf_lines[1].cigar == "189=2D1=13D193=" assert gaf_lines[1].is_primary diff --git a/tests/test_gaf_gzipped.py b/tests/test_gaf_gzipped.py index 88e1bd0..14d59b6 100644 --- a/tests/test_gaf_gzipped.py +++ b/tests/test_gaf_gzipped.py @@ -4,47 +4,47 @@ from gaftools.gaf import GAF, Alignment + def test_parse_gaf_gz(): - #gaftools realign alignments.gaf smallgraph.gfa reads.fa - gaf = GAF('tests/data/alignments-graphaligner.gaf.gz') + # gaftools realign alignments.gaf smallgraph.gfa reads.fa + gaf = GAF("tests/data/alignments-graphaligner.gaf.gz") gaf_lines = list(gaf.read_file()) assert len(gaf_lines) == 2 for gaf_line in gaf_lines: assert isinstance(gaf_line, Alignment) - assert gaf_lines[0].query_name == 'read_s8_s9' + assert gaf_lines[0].query_name == "read_s8_s9" assert gaf_lines[0].query_length == 398 assert gaf_lines[0].query_start == 0 assert gaf_lines[0].query_end == 398 - assert gaf_lines[0].strand == '+' - assert gaf_lines[0].path == '>s8>s9' + assert gaf_lines[0].strand == "+" + assert gaf_lines[0].path == ">s8>s9" assert gaf_lines[0].path_length == 10109 assert gaf_lines[0].path_start == 6166 assert gaf_lines[0].path_end == 6564 assert gaf_lines[0].residue_matches == 398 assert gaf_lines[0].alignment_block_length == 398 assert gaf_lines[0].mapping_quality == 60 - assert gaf_lines[0].cigar == '398=' + assert gaf_lines[0].cigar == "398=" assert gaf_lines[0].is_primary - assert gaf_lines[1].query_name == 'read_s8_s9_deletion15' + assert gaf_lines[1].query_name == "read_s8_s9_deletion15" assert gaf_lines[1].query_length == 383 assert gaf_lines[1].query_start == 0 assert gaf_lines[1].query_end == 383 - assert gaf_lines[1].strand == '+' - assert gaf_lines[1].path == '>s8>s9' + assert gaf_lines[1].strand == "+" + assert gaf_lines[1].path == ">s8>s9" assert gaf_lines[1].path_length == 10109 assert gaf_lines[1].path_start == 6166 assert gaf_lines[1].path_end == 6564 assert gaf_lines[1].residue_matches == 383 assert gaf_lines[1].alignment_block_length == 398 assert gaf_lines[1].mapping_quality == 60 - assert gaf_lines[1].cigar == '189=2D1=13D193=' + assert gaf_lines[1].cigar == "189=2D1=13D193=" assert gaf_lines[1].is_primary - + """first_line = gaf_lines[0].readline() for t in gaf_lines[0].tags.keys(): matching = list(filter(lambda x: t in x, fields)) assert(len(matching > 0))""" - diff --git a/tests/test_gfa_class.py b/tests/test_gfa_class.py index bd2af1f..222964c 100644 --- a/tests/test_gfa_class.py +++ b/tests/test_gfa_class.py @@ -17,33 +17,33 @@ def test_load_gfa(): graph = GFA("tests/data/test_GFA_class.gfa") assert len(graph) == 4 # check if the node sequences were loaded correctly - assert graph['1'].seq == "AGGTCG" - assert graph['2'].seq == "T" - assert graph['3'].seq == "G" - assert graph['4'].seq == "TGGC" + assert graph["1"].seq == "AGGTCG" + assert graph["2"].seq == "T" + assert graph["3"].seq == "G" + assert graph["4"].seq == "TGGC" # check that connectivity is correct - assert '1' in graph['2'].neighbors() - assert '2' in graph['1'].neighbors() + assert "1" in graph["2"].neighbors() + assert "2" in graph["1"].neighbors() - assert '1' in graph['3'].neighbors() - assert '3' in graph['1'].neighbors() + assert "1" in graph["3"].neighbors() + assert "3" in graph["1"].neighbors() - assert '2' in graph['4'].neighbors() - assert '4' in graph['2'].neighbors() + assert "2" in graph["4"].neighbors() + assert "4" in graph["2"].neighbors() - assert '4' in graph['3'].neighbors() - assert '3' in graph['4'].neighbors() + assert "4" in graph["3"].neighbors() + assert "3" in graph["4"].neighbors() # check directionality - assert graph['1'].in_direction('2', 1) # node 2 connects to node 1 from end of node 1 - assert graph['2'].in_direction('1', 0) # node 1 connects to node 2 from start of node 2 - assert graph['3'].in_direction('1', 1) # node 1 connects to node 3 from end of node 3 - assert graph['3'].in_direction('4', 0) # node 4 connects to node 3 from start of node 3 - assert not graph.nodes['1'].in_direction('2', 0) # checking the negative + assert graph["1"].in_direction("2", 1) # node 2 connects to node 1 from end of node 1 + assert graph["2"].in_direction("1", 0) # node 1 connects to node 2 from start of node 2 + assert graph["3"].in_direction("1", 1) # node 1 connects to node 3 from end of node 3 + assert graph["3"].in_direction("4", 0) # node 4 connects to node 3 from start of node 3 + assert not graph.nodes["1"].in_direction("2", 0) # checking the negative # check children in a certain direction - assert '2' in graph['1'].children(1) - assert '4' in graph['3'].children(0) + assert "2" in graph["1"].children(1) + assert "4" in graph["3"].children(0) # test file not found try: @@ -59,32 +59,32 @@ def test_load_gfa(): def test_node_tags(): graph = GFA("tests/data/test_GFA_class.gfa") - assert 'SG' in graph['1'].tags - assert graph['1'].tags['SG'] == ("Z", "testing_tags") - assert len(graph['2'].tags) == 2 + assert "SG" in graph["1"].tags + assert graph["1"].tags["SG"] == ("Z", "testing_tags") + assert len(graph["2"].tags) == 2 def test_delete_node(): graph = GFA("tests/data/test_GFA_class.gfa") - del graph['1'] - assert '1' not in graph.nodes + del graph["1"] + assert "1" not in graph.nodes # checking that the edges were also removed properly - assert '1' not in graph['2'].neighbors() - assert '1' not in graph['3'].neighbors() + assert "1" not in graph["2"].neighbors() + assert "1" not in graph["3"].neighbors() def test_add_node(): graph = GFA("tests/data/test_GFA_class.gfa") node_line = "S\t5\tCCCC".split() graph.add_node(node_line[1], node_line[2]) - assert '5' in graph - assert graph['5'].seq == "CCCC" - assert graph['5'].seq_len == 4 + assert "5" in graph + assert graph["5"].seq == "CCCC" + assert graph["5"].seq_len == 4 # adding an edge between end of node 4 and start of node 5 with 0 overlap - graph.add_edge('4', "+", '5', "+", 0) - assert '5' in graph['4'].neighbors() - assert graph['4'].in_direction('5', 1) - assert graph['5'].in_direction('4', 0) + graph.add_edge("4", "+", "5", "+", 0) + assert "5" in graph["4"].neighbors() + assert graph["4"].in_direction("5", 1) + assert graph["5"].in_direction("4", 0) def test_path_extraction(): @@ -95,8 +95,8 @@ def test_path_extraction(): assert graph.extract_path("<1") == "CGACCT" # testing non-ACTG characters - graph['1'].seq = graph['1'].seq + "N" - assert graph.extract_path(">1>2>4") == graph['1'].seq + graph['2'].seq + graph['4'].seq + graph["1"].seq = graph["1"].seq + "N" + assert graph.extract_path(">1>2>4") == graph["1"].seq + graph["2"].seq + graph["4"].seq def test_components(): @@ -104,24 +104,24 @@ def test_components(): components = graph.all_components() assert len(components) == 1 assert components[0] == set(graph.nodes.keys()) - graph.add_node('5', 'GGCC') + graph.add_node("5", "GGCC") graph.add_node("6", "CCGG") - graph.add_edge('5', "+", "6", "+", 2) + graph.add_edge("5", "+", "6", "+", 2) components = graph.all_components() assert len(components) == 2 comp1, comp2 = components if len(comp1) > len(comp2): - assert comp1 == {'1', '2', '3', '4'} - assert comp2 == {'5', '6'} + assert comp1 == {"1", "2", "3", "4"} + assert comp2 == {"5", "6"} else: - assert comp2 == {'1', '2', '3', '4'} - assert comp1 == {'5', '6'} + assert comp2 == {"1", "2", "3", "4"} + assert comp1 == {"5", "6"} def test_bicc(): graph = GFA("tests/data/smallgraph-noseq.gfa") biccs, artic_points = graph.biccs() - assert set(artic_points) == {'s8', 's6', 's4', 's2'} + assert set(artic_points) == {"s8", "s6", "s4", "s2"} assert len(biccs) == 5 @@ -133,5 +133,4 @@ def test_contig_length(): def test_dfs(): graph = GFA("tests/data/test_GFA_class.gfa") ordered_dfs = graph.dfs("1") - assert ordered_dfs == ['1', '3', '4', '2'] - + assert ordered_dfs == ["1", "3", "4", "2"] diff --git a/tests/test_index.py b/tests/test_index.py index 1f97269..90464c2 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -5,71 +5,61 @@ import pickle from gaftools.cli.index import run + # parsing the index file which is a pickled dictionary def parse_output(filename): data = None - with open(filename, 'rb') as handle: + with open(filename, "rb") as handle: data = pickle.load(handle) return data + ### Tests ### + # testing index of gaf with stable coordinates def test_index_stable(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gvi' - truth = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf.gvi' - run( - gaf_path=input_gaf, - gfa_path=input_gfa, - output=output - ) + input_gaf = "tests/data/alignments-minigraph-stable-conversioncheck.gaf" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gvi" + truth = "tests/data/alignments-minigraph-stable-conversioncheck.gaf.gvi" + run(gaf_path=input_gaf, gfa_path=input_gfa, output=output) output_dict = parse_output(output) truth_dict = parse_output(truth) - assert (output_dict == truth_dict) + assert output_dict == truth_dict + # testing index of gaf with stable coordinates def test_index_unstable(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gvi' - truth = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gvi' - run( - gaf_path=input_gaf, - gfa_path=input_gfa, - output=output - ) + input_gaf = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gvi" + truth = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gvi" + run(gaf_path=input_gaf, gfa_path=input_gfa, output=output) output_dict = parse_output(output) truth_dict = parse_output(truth) - assert (output_dict == truth_dict) + assert output_dict == truth_dict + # testing index of gzipped gaf with stable coordinates def test_index_stable_gzipped(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gvi' - truth = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz.gvi' - run( - gaf_path=input_gaf, - gfa_path=input_gfa, - output=output - ) + input_gaf = "tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gvi" + truth = "tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz.gvi" + run(gaf_path=input_gaf, gfa_path=input_gfa, output=output) output_dict = parse_output(output) truth_dict = parse_output(truth) - assert (output_dict == truth_dict) + assert output_dict == truth_dict + # testing index of gzipped gaf with stable coordinates def test_index_unstable_gzipped(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gvi' - truth = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz.gvi' - run( - gaf_path=input_gaf, - gfa_path=input_gfa, - output=output - ) + input_gaf = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gvi" + truth = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz.gvi" + run(gaf_path=input_gaf, gfa_path=input_gfa, output=output) output_dict = parse_output(output) truth_dict = parse_output(truth) - assert (output_dict == truth_dict) \ No newline at end of file + assert output_dict == truth_dict diff --git a/tests/test_parsegaf_tags.py b/tests/test_parsegaf_tags.py index 995e42c..7ed4e6d 100644 --- a/tests/test_parsegaf_tags.py +++ b/tests/test_parsegaf_tags.py @@ -6,17 +6,17 @@ def test_parse_gaf_tags(): - #gaftools realign alignments.gaf smallgraph.gfa reads.fa - - gaf_file = open("tests/data/alignments-graphaligner.gaf", "r") + # gaftools realign alignments.gaf smallgraph.gfa reads.fa + + gaf_file = open("tests/data/alignments-graphaligner.gaf", "r") gaf = GAF("tests/data/alignments-graphaligner.gaf") gaf_file2 = list(gaf.read_file()) - + for cnt, line in enumerate(gaf_file): - fields = line.rstrip().split('\t') + fields = line.rstrip().split("\t") for t in gaf_file2[cnt].tags.keys(): matching = list(filter(lambda x: t in x, fields)) - assert(len(matching) > 0) + assert len(matching) > 0 gaf_file.close() diff --git a/tests/test_run_ordergfa.py b/tests/test_run_ordergfa.py index e4084ef..c31b892 100644 --- a/tests/test_run_ordergfa.py +++ b/tests/test_run_ordergfa.py @@ -7,22 +7,22 @@ def test_order_gfa(tmp_path): - input_gfa = 'tests/data/smallgraph.gfa' + input_gfa = "tests/data/smallgraph.gfa" run_order_gfa( - gfa_filename = 'tests/data/smallgraph.gfa', - outdir = str(tmp_path), - chromosome_order='chr1', + gfa_filename="tests/data/smallgraph.gfa", + outdir=str(tmp_path), + chromosome_order="chr1", with_sequence=False, ) - output_gfa = str(tmp_path) + '/smallgraph-chr1.gfa' - graph1 = GFA('tests/data/smallgraph-ordered.gfa', low_memory=True) + output_gfa = str(tmp_path) + "/smallgraph-chr1.gfa" + graph1 = GFA("tests/data/smallgraph-ordered.gfa", low_memory=True) graph2 = GFA(output_gfa, low_memory=True) assert graph1.is_equal_to(graph2) # testing the output CSV file that it makes sense csv_file = [] - with open(str(tmp_path) + '/smallgraph-chr1.csv', "r") as infile: + with open(str(tmp_path) + "/smallgraph-chr1.csv", "r") as infile: for l in infile: if l.startswith("Name"): assert l == "Name,Color,SN,SO,BO,NO\n" diff --git a/tests/test_run_realign.py b/tests/test_run_realign.py index 4d96ee5..3ab78e2 100644 --- a/tests/test_run_realign.py +++ b/tests/test_run_realign.py @@ -2,54 +2,53 @@ Tests for 'gaftools realign' """ - from gaftools.gaf import GAF from gaftools.cli.realign import run_realign def test_order_gfa(tmp_path): - #gaftools realign alignments.gaf smallgraph.gfa reads.fa - input_gaf = 'tests/data/alignments-graphaligner.gaf' - output_gaf = str(tmp_path) + '/output.gaf' + # gaftools realign alignments.gaf smallgraph.gfa reads.fa + input_gaf = "tests/data/alignments-graphaligner.gaf" + output_gaf = str(tmp_path) + "/output.gaf" run_realign( - gaf = input_gaf, - graph = 'tests/data/smallgraph.gfa', - fasta = 'tests/data/reads.fa', - output = output_gaf, + gaf=input_gaf, + graph="tests/data/smallgraph.gfa", + fasta="tests/data/reads.fa", + output=output_gaf, ext=False, - cores=1 + cores=1, ) gaf = GAF(output_gaf) gaf_lines = list(gaf.read_file()) assert len(gaf_lines) == 2 - assert gaf_lines[0].query_name == 'read_s8_s9' + assert gaf_lines[0].query_name == "read_s8_s9" assert gaf_lines[0].query_length == 398 assert gaf_lines[0].query_start == 0 assert gaf_lines[0].query_end == 398 - assert gaf_lines[0].strand == '+' - assert gaf_lines[0].path == '>s8>s9' + assert gaf_lines[0].strand == "+" + assert gaf_lines[0].path == ">s8>s9" assert gaf_lines[0].path_length == 10109 assert gaf_lines[0].path_start == 6166 assert gaf_lines[0].path_end == 6564 assert gaf_lines[0].residue_matches == 398 assert gaf_lines[0].alignment_block_length == 398 assert gaf_lines[0].mapping_quality == 60 - assert gaf_lines[0].cigar == '398=' - #assert gaf_lines[0].is_primary + assert gaf_lines[0].cigar == "398=" + # assert gaf_lines[0].is_primary - assert gaf_lines[1].query_name == 'read_s8_s9_deletion15' + assert gaf_lines[1].query_name == "read_s8_s9_deletion15" assert gaf_lines[1].query_length == 383 assert gaf_lines[1].query_start == 0 assert gaf_lines[1].query_end == 383 - assert gaf_lines[1].strand == '+' - assert gaf_lines[1].path == '>s8>s9' + assert gaf_lines[1].strand == "+" + assert gaf_lines[1].path == ">s8>s9" assert gaf_lines[1].path_length == 10109 assert gaf_lines[1].path_start == 6166 assert gaf_lines[1].path_end == 6564 assert gaf_lines[1].residue_matches == 383 assert gaf_lines[1].alignment_block_length == 398 assert gaf_lines[1].mapping_quality == 60 - assert gaf_lines[1].cigar == '189=15D194=' - #assert gaf_lines[1].is_primary + assert gaf_lines[1].cigar == "189=15D194=" + # assert gaf_lines[1].is_primary diff --git a/tests/test_run_stat.py b/tests/test_run_stat.py index e901511..8c87f5f 100644 --- a/tests/test_run_stat.py +++ b/tests/test_run_stat.py @@ -7,57 +7,57 @@ def parse_output(filename): def parse_line(line): - return tuple(s.strip() for s in line.split(':')) - return [parse_line(l) for l in open(filename)] + return tuple(s.strip() for s in line.split(":")) + return [parse_line(l) for l in open(filename)] def test_stat_graphaligner(tmp_path): - input_gaf = 'tests/data/alignments-graphaligner.gaf' - output = str(tmp_path) + '/output.log' + input_gaf = "tests/data/alignments-graphaligner.gaf" + output = str(tmp_path) + "/output.log" run_stat( - gaf_path = input_gaf, - cigar_stat = False, - output = output, + gaf_path=input_gaf, + cigar_stat=False, + output=output, ) output_lines = parse_output(output) - assert output_lines[0] == ('Total alignments', '2') - assert output_lines[1] == ('Primary', '2') - assert output_lines[2] == ('Secondary', '0') - assert output_lines[3] == ('Reads with at least one alignment', '2') - assert output_lines[4] == ('Total aligned bases', '781') - assert output_lines[5] == ('Average mapping quality', '60.0') + assert output_lines[0] == ("Total alignments", "2") + assert output_lines[1] == ("Primary", "2") + assert output_lines[2] == ("Secondary", "0") + assert output_lines[3] == ("Reads with at least one alignment", "2") + assert output_lines[4] == ("Total aligned bases", "781") + assert output_lines[5] == ("Average mapping quality", "60.0") def test_stat_minigraph_stable(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-stable.gaf' - output = str(tmp_path) + '/output.log' + input_gaf = "tests/data/alignments-minigraph-stable.gaf" + output = str(tmp_path) + "/output.log" run_stat( - gaf_path = input_gaf, - cigar_stat = False, - output = output, + gaf_path=input_gaf, + cigar_stat=False, + output=output, ) output_lines = parse_output(output) - assert output_lines[0] == ('Total alignments', '2') - assert output_lines[1] == ('Primary', '2') - assert output_lines[2] == ('Secondary', '0') - assert output_lines[3] == ('Reads with at least one alignment', '2') - assert output_lines[4] == ('Total aligned bases', '744') - assert output_lines[5] == ('Average mapping quality', '60.0') + assert output_lines[0] == ("Total alignments", "2") + assert output_lines[1] == ("Primary", "2") + assert output_lines[2] == ("Secondary", "0") + assert output_lines[3] == ("Reads with at least one alignment", "2") + assert output_lines[4] == ("Total aligned bases", "744") + assert output_lines[5] == ("Average mapping quality", "60.0") def test_stat_minigraph_unstable(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-unstable.gaf' - output = str(tmp_path) + '/output.log' + input_gaf = "tests/data/alignments-minigraph-unstable.gaf" + output = str(tmp_path) + "/output.log" run_stat( - gaf_path = input_gaf, - cigar_stat = False, - output = output, + gaf_path=input_gaf, + cigar_stat=False, + output=output, ) output_lines = parse_output(output) - assert output_lines[0] == ('Total alignments', '2') - assert output_lines[1] == ('Primary', '2') - assert output_lines[2] == ('Secondary', '0') - assert output_lines[3] == ('Reads with at least one alignment', '2') - assert output_lines[4] == ('Total aligned bases', '744') - assert output_lines[5] == ('Average mapping quality', '60.0') + assert output_lines[0] == ("Total alignments", "2") + assert output_lines[1] == ("Primary", "2") + assert output_lines[2] == ("Secondary", "0") + assert output_lines[3] == ("Reads with at least one alignment", "2") + assert output_lines[4] == ("Total aligned bases", "744") + assert output_lines[5] == ("Average mapping quality", "60.0") diff --git a/tests/test_sort.py b/tests/test_sort.py index 109acf2..bda939d 100644 --- a/tests/test_sort.py +++ b/tests/test_sort.py @@ -7,28 +7,25 @@ def parse_output(filename): def parse_line(line): - return tuple(s.strip() for s in line.split('\t')) + return tuple(s.strip() for s in line.split("\t")) + return [parse_line(l) for l in open(filename)] def test_sort_gzipped_gaf(tmp_path): - ''' + """ Testing if a gzipped GAF file works as an input - ''' - input_gaf = 'tests/data/alignments-more-graphaligner.gaf.gz' - input_gfa = 'tests/data/smallgraph-ordered.gfa' - output = str(tmp_path) + '/output.log' - run_sort( - gfa=input_gfa, - gaf=input_gaf, - outgaf=output - ) + """ + input_gaf = "tests/data/alignments-more-graphaligner.gaf.gz" + input_gfa = "tests/data/smallgraph-ordered.gfa" + output = str(tmp_path) + "/output.log" + run_sort(gfa=input_gfa, gaf=input_gaf, outgaf=output) output_lines = parse_output(output) - assert output_lines[0][0] == 'read_s3' - assert output_lines[1][0] == 'read_s464827_revcomp' - assert output_lines[2][0] == 'read_s4_s5_100_100' - assert output_lines[3][0] == 'read_s4_s6_100_100' - assert output_lines[4][0] == 'read_s4_s5_95_105_revcomp' - assert output_lines[5][0] == 'read_s4_s5_90_110' - assert output_lines[6][0] == 'read_s7' - assert output_lines[7][0] == 'read_s8_s9' \ No newline at end of file + assert output_lines[0][0] == "read_s3" + assert output_lines[1][0] == "read_s464827_revcomp" + assert output_lines[2][0] == "read_s4_s5_100_100" + assert output_lines[3][0] == "read_s4_s6_100_100" + assert output_lines[4][0] == "read_s4_s5_95_105_revcomp" + assert output_lines[5][0] == "read_s4_s5_90_110" + assert output_lines[6][0] == "read_s7" + assert output_lines[7][0] == "read_s8_s9" diff --git a/tests/test_view.py b/tests/test_view.py index 9ad1eba..50d02a8 100644 --- a/tests/test_view.py +++ b/tests/test_view.py @@ -4,518 +4,459 @@ from gaftools.cli.view import run + def parse_output(filename): def parse_line(line): - return tuple(s.strip() for s in line.split('\t')) + return tuple(s.strip() for s in line.split("\t")) + return [parse_line(l) for l in open(filename)] + ###################### # Stable to Unstable # ###################### + # testing whole file conversion from stable to unstable def test_stable_to_unstable(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf' - run( - gaf_path=input_gaf, - gfa=input_gfa, - output=output, - format='unstable' - ) + input_gaf = "tests/data/alignments-minigraph-stable-conversioncheck.gaf" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf" + run(gaf_path=input_gaf, gfa=input_gfa, output=output, format="unstable") output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing file conversion from stable to unstable with nodes specified def test_stable_to_unstable_nodes(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf' - run( - gaf_path=input_gaf, - gfa=input_gfa, - output=output, - format='unstable', - nodes=['s2'] - ) + input_gaf = "tests/data/alignments-minigraph-stable-conversioncheck.gaf" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf" + run(gaf_path=input_gaf, gfa=input_gfa, output=output, format="unstable", nodes=["s2"]) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing file conversion from stable to unstable with regions specified def test_stable_to_unstable_regions(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf' + input_gaf = "tests/data/alignments-minigraph-stable-conversioncheck.gaf" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf" run( gaf_path=input_gaf, gfa=input_gfa, output=output, - format='unstable', - regions=['chr1:3300-3400'] + format="unstable", + regions=["chr1:3300-3400"], ) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing file conversion from stable to unstable with non-reference nodes specified def test_stable_to_unstable_non_reference_nodes(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-unstable-conversioncheck-only-s464827.gaf' - run( - gaf_path=input_gaf, - gfa=input_gfa, - output=output, - format='unstable', - nodes=['s464827'] - ) + input_gaf = "tests/data/alignments-minigraph-stable-conversioncheck.gaf" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-unstable-conversioncheck-only-s464827.gaf" + run(gaf_path=input_gaf, gfa=input_gfa, output=output, format="unstable", nodes=["s464827"]) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing file conversion from stable to unstable with non-reference regions specified def test_stable_to_unstable_non_reference_regions(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-unstable-conversioncheck-only-s464827.gaf' + input_gaf = "tests/data/alignments-minigraph-stable-conversioncheck.gaf" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-unstable-conversioncheck-only-s464827.gaf" run( gaf_path=input_gaf, gfa=input_gfa, output=output, - format='unstable', - regions=['NA20129#1#JAHEPE010000248.1:4927-5000'] + format="unstable", + regions=["NA20129#1#JAHEPE010000248.1:4927-5000"], ) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing file conversion from stable to unstable with multiple nodes specified def test_stable_to_unstable_multiple_nodes(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf' + input_gaf = "tests/data/alignments-minigraph-stable-conversioncheck.gaf" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf" run( - gaf_path=input_gaf, - gfa=input_gfa, - output=output, - format='unstable', - nodes=['s464827', 's2'] + gaf_path=input_gaf, gfa=input_gfa, output=output, format="unstable", nodes=["s464827", "s2"] ) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing file conversion from stable to unstable with multiple regions specified def test_stable_to_unstable_multiple_regions(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf' + input_gaf = "tests/data/alignments-minigraph-stable-conversioncheck.gaf" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf" run( gaf_path=input_gaf, gfa=input_gfa, output=output, - format='unstable', - regions=['NA20129#1#JAHEPE010000248.1:4927-5000', 'chr1:3300-3400'] + format="unstable", + regions=["NA20129#1#JAHEPE010000248.1:4927-5000", "chr1:3300-3400"], ) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing whole gzipped file conversion from stable to unstable def test_gzipped_stable_to_unstable(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf' - run( - gaf_path=input_gaf, - gfa=input_gfa, - output=output, - format='unstable' - ) + input_gaf = "tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf" + run(gaf_path=input_gaf, gfa=input_gfa, output=output, format="unstable") output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing gzipped file conversion from stable to unstable with nodes specified def test_gzipped_stable_to_unstable_nodes(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf' - run( - gaf_path=input_gaf, - gfa=input_gfa, - output=output, - format='unstable', - nodes=['s2'] - ) + input_gaf = "tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf" + run(gaf_path=input_gaf, gfa=input_gfa, output=output, format="unstable", nodes=["s2"]) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing gzipped file conversion from stable to unstable with regions specified def test_gzipped_stable_to_unstable_regions(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf' + input_gaf = "tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf" run( gaf_path=input_gaf, gfa=input_gfa, output=output, - format='unstable', - regions=['chr1:3300-3400'] + format="unstable", + regions=["chr1:3300-3400"], ) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing gzipped file conversion from stable to unstable with non-reference nodes specified def test_gzipped_stable_to_unstable_non_reference_nodes(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-unstable-conversioncheck-only-s464827.gaf' - run( - gaf_path=input_gaf, - gfa=input_gfa, - output=output, - format='unstable', - nodes=['s464827'] - ) + input_gaf = "tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-unstable-conversioncheck-only-s464827.gaf" + run(gaf_path=input_gaf, gfa=input_gfa, output=output, format="unstable", nodes=["s464827"]) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing gzipped file conversion from stable to unstable with non-reference regions specified def test_gzipped_stable_to_unstable_non_reference_regions(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-unstable-conversioncheck-only-s464827.gaf' + input_gaf = "tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-unstable-conversioncheck-only-s464827.gaf" run( gaf_path=input_gaf, gfa=input_gfa, output=output, - format='unstable', - regions=['NA20129#1#JAHEPE010000248.1:4927-5000'] + format="unstable", + regions=["NA20129#1#JAHEPE010000248.1:4927-5000"], ) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing gzipped file conversion from stable to unstable with multiple nodes specified def test_gzipped_stable_to_unstable_multiple_nodes(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf' + input_gaf = "tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf" run( - gaf_path=input_gaf, - gfa=input_gfa, - output=output, - format='unstable', - nodes=['s464827', 's2'] + gaf_path=input_gaf, gfa=input_gfa, output=output, format="unstable", nodes=["s464827", "s2"] ) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing gzipped file conversion from stable to unstable with multiple regions specified def test_gzipped_stable_to_unstable_multiple_regions(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf' + input_gaf = "tests/data/alignments-minigraph-stable-conversioncheck.gaf.gz" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf" run( gaf_path=input_gaf, gfa=input_gfa, output=output, - format='unstable', - regions=['NA20129#1#JAHEPE010000248.1:4927-5000', 'chr1:3300-3400'] + format="unstable", + regions=["NA20129#1#JAHEPE010000248.1:4927-5000", "chr1:3300-3400"], ) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + ###################### # Unstable to Stable # ###################### + # testing whole file conversion from unstable to stable def test_unstable_to_stable(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf' - run( - gaf_path=input_gaf, - gfa=input_gfa, - output=output, - format='stable' - ) + input_gaf = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-stable-conversioncheck.gaf" + run(gaf_path=input_gaf, gfa=input_gfa, output=output, format="stable") output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing file conversion from unstable to stable with nodes specified def test_unstable_to_stable_nodes(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf' - run( - gaf_path=input_gaf, - gfa=input_gfa, - output=output, - format='stable', - nodes=['s2'] - ) + input_gaf = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf" + run(gaf_path=input_gaf, gfa=input_gfa, output=output, format="stable", nodes=["s2"]) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing file conversion from unstable to stable with regions specified def test_unstable_to_stable_regions(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf' + input_gaf = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf" run( gaf_path=input_gaf, gfa=input_gfa, output=output, - format='stable', - regions=['chr1:3300-3400'] + format="stable", + regions=["chr1:3300-3400"], ) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing file conversion from unstable to stable with non-reference nodes specified def test_unstable_to_stable_non_reference_nodes(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-stable-conversioncheck-only-s464827.gaf' - run( - gaf_path=input_gaf, - gfa=input_gfa, - output=output, - format='stable', - nodes=['s464827'] - ) + input_gaf = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-stable-conversioncheck-only-s464827.gaf" + run(gaf_path=input_gaf, gfa=input_gfa, output=output, format="stable", nodes=["s464827"]) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing file conversion from unstable to stable with non-reference regions specified def test_unstable_to_stable_non_reference_regions(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-stable-conversioncheck-only-s464827.gaf' + input_gaf = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-stable-conversioncheck-only-s464827.gaf" run( gaf_path=input_gaf, gfa=input_gfa, output=output, - format='stable', - regions=['NA20129#1#JAHEPE010000248.1:4927-5000'] + format="stable", + regions=["NA20129#1#JAHEPE010000248.1:4927-5000"], ) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing file conversion from unstable to stable with multiple nodes specified def test_unstable_to_stable_multiple_nodes(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf' - run( - gaf_path=input_gaf, - gfa=input_gfa, - output=output, - format='stable', - nodes=['s464827', 's2'] - ) + input_gaf = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf" + run(gaf_path=input_gaf, gfa=input_gfa, output=output, format="stable", nodes=["s464827", "s2"]) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing file conversion from unstable to stable with multiple regions specified def test_unstable_to_stable_multiple_regions(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf' + input_gaf = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf" run( gaf_path=input_gaf, gfa=input_gfa, output=output, - format='stable', - regions=['NA20129#1#JAHEPE010000248.1:4927-5000', 'chr1:3300-3400'] + format="stable", + regions=["NA20129#1#JAHEPE010000248.1:4927-5000", "chr1:3300-3400"], ) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) - - + assert output_lines[n] == truth_lines[n] # testing whole gzipped file conversion from unstable to stable def test_gzipped_unstable_to_stable(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-stable-conversioncheck.gaf' - run( - gaf_path=input_gaf, - gfa=input_gfa, - output=output, - format='stable' - ) + input_gaf = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-stable-conversioncheck.gaf" + run(gaf_path=input_gaf, gfa=input_gfa, output=output, format="stable") output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing gzipped file conversion from unstable to stable with nodes specified def test_gzipped_unstable_to_stable_nodes(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf' - run( - gaf_path=input_gaf, - gfa=input_gfa, - output=output, - format='stable', - nodes=['s2'] - ) + input_gaf = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf" + run(gaf_path=input_gaf, gfa=input_gfa, output=output, format="stable", nodes=["s2"]) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing gzipped file conversion from unstable to stable with regions specified def test_gzipped_unstable_to_stable_regions(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf' + input_gaf = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf" run( gaf_path=input_gaf, gfa=input_gfa, output=output, - format='stable', - regions=['chr1:3300-3400'] + format="stable", + regions=["chr1:3300-3400"], ) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing gzipped file conversion from unstable to stable with non-reference nodes specified def test_gzipped_unstable_to_stable_non_reference_nodes(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-stable-conversioncheck-only-s464827.gaf' - run( - gaf_path=input_gaf, - gfa=input_gfa, - output=output, - format='stable', - nodes=['s464827'] - ) + input_gaf = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-stable-conversioncheck-only-s464827.gaf" + run(gaf_path=input_gaf, gfa=input_gfa, output=output, format="stable", nodes=["s464827"]) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing gzipped file conversion from unstable to stable with non-reference regions specified def test_gzipped_unstable_to_stable_non_reference_regions(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-stable-conversioncheck-only-s464827.gaf' + input_gaf = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-stable-conversioncheck-only-s464827.gaf" run( gaf_path=input_gaf, gfa=input_gfa, output=output, - format='stable', - regions=['NA20129#1#JAHEPE010000248.1:4927-5000'] + format="stable", + regions=["NA20129#1#JAHEPE010000248.1:4927-5000"], ) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing gzipped file conversion from unstable to stable with multiple nodes specified def test_gzipped_unstable_to_stable_multiple_nodes(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf' - run( - gaf_path=input_gaf, - gfa=input_gfa, - output=output, - format='stable', - nodes=['s464827', 's2'] - ) + input_gaf = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf" + run(gaf_path=input_gaf, gfa=input_gfa, output=output, format="stable", nodes=["s464827", "s2"]) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) + assert output_lines[n] == truth_lines[n] + # testing gzipped file conversion from unstable to stable with multiple regions specified def test_gzipped_unstable_to_stable_multiple_regions(tmp_path): - input_gaf = 'tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz' - input_gfa = 'tests/data/smallgraph.gfa' - output = str(tmp_path) + '/output.gaf' - truth = 'tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf' + input_gaf = "tests/data/alignments-minigraph-unstable-conversioncheck.gaf.gz" + input_gfa = "tests/data/smallgraph.gfa" + output = str(tmp_path) + "/output.gaf" + truth = "tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf" run( gaf_path=input_gaf, gfa=input_gfa, output=output, - format='stable', - regions=['NA20129#1#JAHEPE010000248.1:4927-5000', 'chr1:3300-3400'] + format="stable", + regions=["NA20129#1#JAHEPE010000248.1:4927-5000", "chr1:3300-3400"], ) output_lines = parse_output(output) truth_lines = parse_output(truth) for n in range(len(output_lines)): - assert (output_lines[n] == truth_lines[n]) \ No newline at end of file + assert output_lines[n] == truth_lines[n] From 44455aaf78431c6b9fe42981d068ba676d7e5ee4 Mon Sep 17 00:00:00 2001 From: Samarendra Date: Thu, 20 Jun 2024 10:04:18 +0200 Subject: [PATCH 2/8] code passes tox, black, flake8, and twine check --- CHANGES.rst | 2 +- docs/develop.rst | 2 +- gaftools/cli/index.py | 13 ++++----- gaftools/cli/order_gfa.py | 6 ++-- gaftools/cli/phase.py | 9 ++---- gaftools/cli/realign.py | 55 ++++++------------------------------- gaftools/cli/sort.py | 26 +++++++++--------- gaftools/cli/stat.py | 2 +- gaftools/cli/view.py | 23 ++++++++-------- gaftools/conversion.py | 2 +- gaftools/gaf.py | 1 - gaftools/gfa.py | 14 +++++----- pyproject.toml | 15 +++++++++- tests/test_extract_path.py | 4 +-- tests/test_index.py | 2 +- tests/test_run_ordergfa.py | 4 +-- tox.ini | 56 ++++++++++++++++++++++++++++++++++++++ 17 files changed, 129 insertions(+), 107 deletions(-) create mode 100644 tox.ini diff --git a/CHANGES.rst b/CHANGES.rst index 580e70b..0e8459d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -2,6 +2,6 @@ Changes ======= v1.0 (19/06/2024) ----------------- +----------------- * Initial commit. \ No newline at end of file diff --git a/docs/develop.rst b/docs/develop.rst index 01a2e9d..1ae488e 100644 --- a/docs/develop.rst +++ b/docs/develop.rst @@ -1,4 +1,4 @@ -.. _developing +.. _developing: Developing ========== diff --git a/gaftools/cli/index.py b/gaftools/cli/index.py index fe0e45f..f9b9169 100644 --- a/gaftools/cli/index.py +++ b/gaftools/cli/index.py @@ -3,21 +3,18 @@ This script creates an inverse look-up table where: - key: the node information - - value: the offsets in the GAF file where the node is present + - value: the offsets in the GAF file where the node is present """ import re import pickle from pysam import libcbgzf from collections import defaultdict -import re -import pickle import logging import gaftools.utils as utils from gaftools.gaf import GAF from gaftools.gfa import GFA -from gaftools import __version__ from gaftools.timer import StageTimer from gaftools.cli import log_memory_usage @@ -27,7 +24,7 @@ def run(gaf_path, gfa_path, output=None): timers = StageTimer() - if output == None: + if output is None: output = gaf_path + ".gvi" # Detecting if GAF has stable or unstable coordinate @@ -170,11 +167,11 @@ def convert_coord(line, ref): def add_arguments(parser): arg = parser.add_argument # Positional arguments - arg('gaf_path', metavar='GAF', + arg('gaf_path', metavar='GAF', help='Input GAF file (can be bgzip-compressed)') - arg('gfa_path', metavar='rGFA', + arg('gfa_path', metavar='rGFA', help='Reference rGFA file') - arg('-o', '--output', default=None, + arg('-o', '--output', default=None, help='Path to the output Indexed GAF file. If omitted, use .gvi') diff --git a/gaftools/cli/order_gfa.py b/gaftools/cli/order_gfa.py index 9ce91cb..7008c05 100644 --- a/gaftools/cli/order_gfa.py +++ b/gaftools/cli/order_gfa.py @@ -9,7 +9,7 @@ import os import logging import time -from collections import namedtuple, defaultdict +from collections import defaultdict from gaftools.gfa import GFA logger = logging.getLogger(__name__) @@ -50,7 +50,7 @@ def run_order_gfa( with_sequence=False, ): - if not chromosome_order is None: + if chromosome_order is not None: chromosome_order = chromosome_order.split(sep=",") if not os.path.isdir(outdir): @@ -289,7 +289,7 @@ def count_sn(graph, comp): """ counts = defaultdict(int) for n in comp: - if not "SN" in graph[n].tags: + if "SN" not in graph[n].tags: continue counts[graph[n].tags["SN"][1]] += 1 return counts diff --git a/gaftools/cli/phase.py b/gaftools/cli/phase.py index c55a63a..8d55bff 100644 --- a/gaftools/cli/phase.py +++ b/gaftools/cli/phase.py @@ -8,7 +8,6 @@ import logging import sys -from gaftools import __version__ from gaftools.cli import log_memory_usage from gaftools.timer import StageTimer from gaftools.gaf import GAF @@ -39,8 +38,6 @@ def add_phase_info(gaf_path, tsv_path, out_path): Haplotag... The information is added as two tags ("ps:Z" and "ht:Z") before the cigar string """ - import itertools - logger.info("INFO: Adding phasing information...") tsv_file = open(tsv_path, "r") @@ -118,11 +115,11 @@ def add_phase_info(gaf_path, tsv_path, out_path): # fmt: off def add_arguments(parser): arg = parser.add_argument - + # Positional arguments - arg('gaf_file', metavar='GAF', + arg('gaf_file', metavar='GAF', help='Input GAF file (can be bgzip-compressed)') - arg('tsv_file', metavar='phase', + arg('tsv_file', metavar='phase', help='WhatsHap haplotag TSV file. Refer to https://whatshap.readthedocs.io/en/latest/guide.html#whatshap-haplotag') arg('-o', '--output', default=sys.stdout, help='Output GAF file. If omitted, output is directed to standard output.') diff --git a/gaftools/cli/realign.py b/gaftools/cli/realign.py index b5a18b0..fcc5eed 100644 --- a/gaftools/cli/realign.py +++ b/gaftools/cli/realign.py @@ -5,16 +5,14 @@ import logging import sys import pysam -import time import gaftools.gaf import multiprocessing as mp -from gaftools.utils import display_top from gaftools.cli import log_memory_usage from gaftools.timer import StageTimer -from gaftools.gaf import GAF, Alignment +from gaftools.gaf import GAF from gaftools.gfa import GFA -from pywfa.align import WavefrontAligner, cigartuples_to_str +from pywfa.align import WavefrontAligner logger = logging.getLogger(__name__) @@ -59,10 +57,10 @@ def filter_duplicates(aln): if len(mappings) == 1: continue for cnt, line in enumerate(mappings): - if line.duplicate == True: + if line.duplicate: continue for cnt2, line2 in enumerate(mappings): - if cnt == cnt2 or line2.duplicate == True: + if cnt == cnt2 or line2.duplicate: continue sim = overlap_ratio( @@ -79,37 +77,6 @@ def filter_duplicates(aln): break -def write_alignments(aln, output): - # Write the alignment back to the GAF - for read_name, mappings in aln.items(): - for gaf_line in mappings: - if not gaf_line.duplicate: - # Write the alignment back to the GAF - output.write( - "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d" - % ( - gaf_line.query_name, - gaf_line.query_length, - gaf_line.query_start, - gaf_line.query_end, - gaf_line.strand, - gaf_line.path, - gaf_line.path_length, - gaf_line.path_start, - gaf_line.path_end, - match, - cigar_len, - gaf_line.mapping_quality, - ) - ) - - for k in gaf_line.tags.keys(): - output.write("\t%s%s" % (k, gaf_line.tags[k])) - - # print(gaf_line.cigar) - # output.write("\tcg:Z:%s\n" %gaf_line.cigar) - - def wfa_alignment(seq_batch, queue): """ Realigns the sequence using the wavefront alignment algorithm. @@ -518,24 +485,20 @@ def realign_gaf_old(gaf, graph, fasta, output, extended, cores=1): fastafile.close() - if extended: - filter_duplicates(aln) - write_alignments(aln, output) - # fmt: off def add_arguments(parser): arg = parser.add_argument # Positional arguments - arg('gaf', metavar='GAF', + arg('gaf', metavar='GAF', help='Input GAF file (can be bgzip-compressed)') - arg('graph', metavar='rGFA', + arg('graph', metavar='rGFA', help='reference rGFA file') - arg('fasta', metavar='FASTA', + arg('fasta', metavar='FASTA', help='Input FASTA file of the read') - arg('-o', '--output', default=None, + arg('-o', '--output', default=None, help='Output GAF file. If omitted, use standard output.') - arg("-c", "--cores", metavar="CORES", default=1, type=int, + arg("-c", "--cores", metavar="CORES", default=1, type=int, help="Number of cores to use for alignments.") diff --git a/gaftools/cli/sort.py b/gaftools/cli/sort.py index df8c730..f09bf6e 100644 --- a/gaftools/cli/sort.py +++ b/gaftools/cli/sort.py @@ -30,7 +30,7 @@ def run_sort(gfa, gaf, outgaf=None, outind=None, bgzip=False): - if outgaf == None: + if outgaf is None: writer = sys.stdout index_file = None else: @@ -104,22 +104,22 @@ def sort(gaf, nodes, writer, index_dict, index_file): off = alignment.offset reader.seek(off) line = reader.readline() - if type(line) == bytes: + if isinstance(line, bytes): line = line.decode("utf-8").rstrip() - elif type(line) == str: + elif isinstance(line, str): line = line.rstrip() else: raise RuntimeError("GAF alignments not in string or byte format.") line += "\tbo:i:%d\tsn:Z:%s\tiv:i:%d\n" % (alignment.BO, alignment.sn, alignment.inv) - if index_file != None: + if index_file is not None: out_off = writer.tell() - if index_dict[alignment.sn][0] == None: + if index_dict[alignment.sn][0] is None: index_dict[alignment.sn][0] = out_off index_dict[alignment.sn][1] = out_off else: index_dict[alignment.sn][1] = out_off write_to_file(line, writer) - if index_file != None: + if index_file is not None: index_dict.pop("unknown") index_dict = dict(index_dict) with open(index_file, "wb") as ind: @@ -149,7 +149,7 @@ def process_alignment(line, nodes, offset): sr_tag = int(nodes[n].tags["SR"][1]) # Finding the chromosome where the alignment is - if sn == None and sr_tag == 0: + if sn is None and sr_tag == 0: sn = sn_tag elif sr_tag == 0: assert sn == sn_tag @@ -180,7 +180,7 @@ def process_alignment(line, nodes, offset): n = path[1] bo = int(nodes[n].tags["BO"][1]) no = int(nodes[n].tags["NO"][1]) - if sn == None: + if sn is None: sn = "unknown" return bo, no, start, inv, sn @@ -234,17 +234,17 @@ def write_to_file(line, writer): def add_arguments(parser): arg = parser.add_argument # Positional arguments - arg("gaf", metavar='GAF', + arg("gaf", metavar='GAF', help="Input GAF File (can be bgzip-compressed)") - arg("gfa", metavar='GFA', + arg("gfa", metavar='GFA', help="GFA file with the sort keys (BO and NO tagged). This is done with gaftools order_gfa") - arg("--outgaf", default=None, + arg("--outgaf", default=None, help="Output GAF File path (Default: sys.stdout)") - arg("--outind", default=None, + arg("--outind", default=None, help="Output Index File path for the GAF file. " "When --outgaf is not given, no index is created. " "If it is given and --outind is not specified, it will have same file name with .gsi extension)") - arg("--bgzip", action='store_true', + arg("--bgzip", action='store_true', help="Flag to bgzip the output. Can only be given with --output.") diff --git a/gaftools/cli/stat.py b/gaftools/cli/stat.py index 08ed448..b87d696 100644 --- a/gaftools/cli/stat.py +++ b/gaftools/cli/stat.py @@ -29,7 +29,7 @@ def run_stat( else: output = open(output, "w") - read_names = set() + # read_names = set() total_aligned_bases = 0 total_mapq = 0 total_primary = 0 diff --git a/gaftools/cli/view.py b/gaftools/cli/view.py index b56090e..c573f74 100644 --- a/gaftools/cli/view.py +++ b/gaftools/cli/view.py @@ -10,7 +10,6 @@ import sys from collections import defaultdict -from gaftools import __version__ from gaftools.cli import log_memory_usage, CommandLineError from gaftools.timer import StageTimer from gaftools.gaf import GAF @@ -30,7 +29,7 @@ def run(gaf_path, gfa=None, output=None, index=None, nodes=[], regions=[], forma timers = StageTimer() - if output == None: + if output is None: writer = sys.stdout else: writer = open(output, "w") @@ -52,7 +51,7 @@ def run(gaf_path, gfa=None, output=None, index=None, nodes=[], regions=[], forma # if format is given, prepare some objects for use later if format: if format == "stable": - if gaf_format == True: + if gaf_format is True: raise CommandLineError( "Input GAF already has stable coordinates. Please remove the --format stable option" ) @@ -73,7 +72,7 @@ def run(gaf_path, gfa=None, output=None, index=None, nodes=[], regions=[], forma del gfa_file else: assert format == "unstable" - if gaf_format == False: + if gaf_format is False: raise CommandLineError( "Input GAF already has unstable coordinates. Please remove the --format unstable option" ) @@ -89,7 +88,7 @@ def run(gaf_path, gfa=None, output=None, index=None, nodes=[], regions=[], forma # now find out what lines to view and how to view if len(nodes) != 0 or len(regions) != 0: - if index == None: + if index is None: index = gaf_path + ".gvi" if not os.path.exists(index): raise CommandLineError( @@ -221,22 +220,22 @@ def search(node, node_list): def add_arguments(parser): arg = parser.add_argument # Positional arguments - arg('gaf_path', metavar='GAF', + arg('gaf_path', metavar='GAF', help='Input GAF file (can be bgzip-compressed)') - arg('-g', '--gfa', dest='gfa', metavar='GFA', default=None, + arg('-g', '--gfa', dest='gfa', metavar='GFA', default=None, help='Input GFA file (can be gzip-compressed). Required when converting from one coordinate system to another.') - arg('-o', '--output', dest='output', metavar='OUTPUT', default=None, + arg('-o', '--output', dest='output', metavar='OUTPUT', default=None, help='Output file. Default is stdout.') - arg('-i', '--index', default=None, + arg('-i', '--index', default=None, help='Path to GAF Index file. This index is created using gaftools index. ' 'If path is not provided, it is assumed to be in the same directory as GAF file with the same name and .gvi extension (default location of the index script)') - arg('-n', '--node', dest='nodes', metavar='NODE', default=[], action='append', + arg('-n', '--node', dest='nodes', metavar='NODE', default=[], action='append', help='Nodes to search. ' 'Multiple can be provided (Eg. gaftools view .... -n s1 -n s2 -n s3 .....).') - arg('-r', '--region', dest='regions', metavar='REGION', default=[], action='append', + arg('-r', '--region', dest='regions', metavar='REGION', default=[], action='append', help='Regions to search. ' 'Multiple can be provided (Eg. gaftools view .... -r chr1:10-20 -r chr1:50-60 .....).') - arg('-f', '--format', dest='format', metavar='FORMAT', + arg('-f', '--format', dest='format', metavar='FORMAT', help='format of output path (unstable | stable)') # fmt: on diff --git a/gaftools/conversion.py b/gaftools/conversion.py index 6aab41f..8313589 100644 --- a/gaftools/conversion.py +++ b/gaftools/conversion.py @@ -192,7 +192,7 @@ def to_stable(gaf_line, nodes, ref_contig, contig_len): n2 = node_list[i + 1][0] o2 = node_list[i + 1][1] node_merge = merge_nodes(n1, n2, o1, o2) - if node_merge == False: + if node_merge is False: stable_coord += n1.to_string(o1) out_node.append([n2, o2]) else: diff --git a/gaftools/gaf.py b/gaftools/gaf.py index 8712a0f..de49043 100644 --- a/gaftools/gaf.py +++ b/gaftools/gaf.py @@ -2,7 +2,6 @@ import logging from pysam import libcbgzf import gaftools.utils as utils -from gaftools import __version__ logger = logging.getLogger(__name__) diff --git a/gaftools/gfa.py b/gaftools/gfa.py index feb6408..1b91a50 100644 --- a/gaftools/gfa.py +++ b/gaftools/gfa.py @@ -226,7 +226,7 @@ def is_equal_to(self, other, only_topo=False): return False for n_id, node1 in self.nodes.items(): node2 = other[n_id] - if node2 == None: # n_id not in other graph, so not equal graphs + if node2 is None: # n_id not in other graph, so not equal graphs return False if not node1.is_equal_to(node2, only_topo): return False @@ -237,7 +237,7 @@ def remove_node(self, n_id): remove a node and its corresponding edges """ - edges_to_remove = [] + # edges_to_remove = [] starts = [x for x in self.nodes[n_id].start] for n_start in starts: overlap = n_start[2] @@ -293,7 +293,7 @@ def add_node(self, node_id, seq="", tags=None): if "SN" in self[node_id].tags and "SR" in self[node_id].tags: contig_name = self[node_id].tags["SN"][1] contig_rank = int(self[node_id].tags["SR"][1]) - if self.contigs[contig_name] == None: + if self.contigs[contig_name] is None: self.contigs[contig_name] = contig_rank else: assert self.contigs[contig_name] == contig_rank @@ -467,7 +467,7 @@ def write_gfa( # going through nodes twice to have the final output as all S line then all L lines for n in sorted_set_of_nodes: if n not in self.nodes: - logging.warning("Node {} does not exist in the graph, skipped in output".format(n1)) + logging.warning("Node {} does not exist in the graph, skipped in output".format(n)) continue line = self.nodes[n].to_gfa_line() f.write(line + "\n") @@ -756,7 +756,7 @@ def dfs(self, start_node): Performs depth first search from start node given by user return the path as a list """ - if not start_node in self: + if start_node not in self: return [] if len(self) == 1: return [list(self.nodes.keys())[0]] @@ -785,7 +785,7 @@ def biccs(self, set_of_nodes=None): This function modelled after Networkx implementation https://networkx.org/documentation/networkx-1.9/reference/generated/networkx.algorithms.components.biconnected.biconnected_components.html """ - if set_of_nodes == None: + if set_of_nodes is None: set_of_nodes = set(self.nodes.keys()) def edge_stack_to_set(edge_stack): @@ -849,7 +849,7 @@ def next_child(stack_item): edge_stack.append((child, nn)) edge_stack_loc[(child, nn)] = len(edge_stack) - 1 - elif nn == None: + elif nn is None: stack.pop() if len(stack) > 1: if low[child] >= discovery[parent]: diff --git a/pyproject.toml b/pyproject.toml index 96c150e..bd8cdb0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,9 @@ gaftools = "gaftools.__main__:main" dev = [ "pytest", "coverage", + "tox", + "flake8", + "black", "sphinx", "sphinx-issues", "sphinx-rtd-theme", @@ -61,4 +64,14 @@ include = ["gaftools*"] [tool.setuptools_scm] -write_to = "gaftools/_version.py" \ No newline at end of file +write_to = "gaftools/_version.py" + + +[tool.black] +line-length = 100 +target-version = ["py37"] + + +[tool.pytest.ini_options] +addopts = "--doctest-modules" +testpaths = ["tests", "gaftools"] \ No newline at end of file diff --git a/tests/test_extract_path.py b/tests/test_extract_path.py index 8e76c02..b2fd4e3 100644 --- a/tests/test_extract_path.py +++ b/tests/test_extract_path.py @@ -13,10 +13,10 @@ def test_realign_gaf(): s2_seq = "CGCAGGCGCACAGTCANNNNNNNNCACGCCACCGGGACAGGAGCGCGGGGGTGGAGCGTTGTAGGCAGAGAGACGCACGTCCCCGGGGGCGCGGCACAGAGACGGGTTGAACCTCAGTAATCCGAAAAGCCGGGCTCGGGCGCCCCCTGCTTGCAGCCGGGCACTACAGGACCCGCTTACCCGCGGTGCTTTCCCAGTGCGCCCCCTGCTAGCGGCTAGGACAACTGCAGGGCCCTCTTGCTTACAGTGGTGGCCAGCGTCCCCTGCTGGCGCCGGGGCACTGCAGGGCTCTCTTGCTCGCTAAATAGTGGCGGCACGCCGCCTGCTGGCAGCTAGGGACGTTGCAGGG" s2_seq_rev = "CCCTGCAACGTCCCTAGCTGCCAGCAGGCGGCGTGCCGCCACTATTTAGCGAGCAAGAGAGCCCTGCAGTGCCCCGGCGCCAGCAGGGGACGCTGGCCACCACTGTAAGCAAGAGGGCCCTGCAGTTGTCCTAGCCGCTAGCAGGGGGCGCACTGGGAAAGCACCGCGGGTAAGCGGGTCCTGTAGTGCCCGGCTGCAAGCAGGGGGCGCCCGAGCCCGGCTTTTCGGATTACTGAGGTTCAACCCGTCTCTGTGCCGCGCCCCCGGGGACGTGCGTCTCTCTGCCTACAACGCTCCACCCCCGCGCTCCTGTCCCGGTGGCGTGNNNNNNNNTGACTGTGCGCCTGCG" - s3_seq = "CGCAGG" + # s3_seq = "CGCAGG" s3_seq_rev = "CCTGCG" - s4_seq = "CGCANNNGG" + # s4_seq = "CGCANNNGG" s4_seq_rev = "CCNNNTGCG" path1 = ">s1" diff --git a/tests/test_index.py b/tests/test_index.py index 90464c2..be1aed1 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -14,7 +14,7 @@ def parse_output(filename): return data -### Tests ### +# Tests # testing index of gaf with stable coordinates diff --git a/tests/test_run_ordergfa.py b/tests/test_run_ordergfa.py index c31b892..6671542 100644 --- a/tests/test_run_ordergfa.py +++ b/tests/test_run_ordergfa.py @@ -9,7 +9,7 @@ def test_order_gfa(tmp_path): input_gfa = "tests/data/smallgraph.gfa" run_order_gfa( - gfa_filename="tests/data/smallgraph.gfa", + gfa_filename=input_gfa, outdir=str(tmp_path), chromosome_order="chr1", with_sequence=False, @@ -20,8 +20,6 @@ def test_order_gfa(tmp_path): assert graph1.is_equal_to(graph2) # testing the output CSV file that it makes sense - csv_file = [] - with open(str(tmp_path) + "/smallgraph-chr1.csv", "r") as infile: for l in infile: if l.startswith("Name"): diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..66b49b8 --- /dev/null +++ b/tox.ini @@ -0,0 +1,56 @@ +[tox] +envlist = py38,py39,py310,py311,py312,flake8,docs,twinecheck,black +isolated_build = True + + +[testenv] +usedevelop = True +deps = + pytest +commands = pytest + + +[testenv:docs] +basepython = python3.10 +skip_install = true +deps = -r docs/requirements.txt +commands = + sphinx-build -q -W -b html -d {envtmpdir}/doctrees docs {envtmpdir}/html + + +[testenv:twinecheck] +basepython = python3.10 +skip_install = true +deps = + twine + build +commands = + python -m build --sdist --outdir {envtmpdir}/dist + twine check {envtmpdir}/dist/* + + +[testenv:black] +basepython = python3.10 +deps = black==24.3.0 +skip_install = true +commands = black --check gaftools/ tests/ setup.py + + +[testenv:flake8] +basepython = python3.10 +deps = flake8 +skip_install = true +commands = flake8 gaftools/ tests/ setup.py + + +[flake8] +max-line-length = 120 +max-complexity = 33 +# E203 whitespace before ':' -- must be ignored for Black +# +# The following ignores should be removed over time: +# +# E501 line too long +# E741 ambiguous variable name 'l' +# +extend-ignore = E203,E501,E741,E722 \ No newline at end of file From 0424207b9372e9dbdc0267ea18d42739b6af7111 Mon Sep 17 00:00:00 2001 From: Samarendra Date: Thu, 20 Jun 2024 13:15:02 +0200 Subject: [PATCH 3/8] adding pre-commit. replace black and flake8 with ruff. ruff formatting. updating documentation. --- .../add-bubble-info.py | 134 +++++++++++------- .../.vcf_based_sorting_scripts/bubble-sort.py | 104 +++++++------- .attic/bicc_networkx.py | 55 +++---- .attic/find_bubbles.py | 4 +- .attic/graph.py | 5 +- .attic/sort.py | 131 ++++++++--------- .gitignore | 2 +- .pre-commit-config.yaml | 16 +++ .readthedocs.yaml | 2 +- CHANGES.rst | 2 +- README.md | 2 +- docs/Makefile | 3 +- docs/changes.rst | 2 +- docs/develop.rst | 59 +++++++- docs/guide.rst | 36 ++--- docs/index.rst | 2 +- gaftools/cli/index.py | 2 - gaftools/cli/order_gfa.py | 11 +- gaftools/cli/phase.py | 2 - gaftools/cli/sort.py | 2 - gaftools/cli/view.py | 2 - gaftools/conversion.py | 1 - gaftools/gaf.py | 2 - pyproject.toml | 22 +-- requirements.txt | 2 +- ...nigraph-stable-conversioncheck-only-s2.gaf | 2 +- ...ph-stable-conversioncheck-only-s464827.gaf | 2 +- ...ments-minigraph-stable-conversioncheck.gaf | 2 +- ...graph-unstable-conversioncheck-only-s2.gaf | 2 +- ...-unstable-conversioncheck-only-s464827.gaf | 2 +- ...nts-minigraph-unstable-conversioncheck.gaf | 2 +- tests/data/reads-conversioncheck.fa | 2 +- tox.ini | 22 +-- 33 files changed, 360 insertions(+), 281 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.attic/.vcf_based_sorting_scripts/add-bubble-info.py b/.attic/.vcf_based_sorting_scripts/add-bubble-info.py index a7cea04..4ac7878 100644 --- a/.attic/.vcf_based_sorting_scripts/add-bubble-info.py +++ b/.attic/.vcf_based_sorting_scripts/add-bubble-info.py @@ -9,39 +9,52 @@ logger = logging.getLogger(__name__) + def setup_logging(debug): handler = logging.StreamHandler() root = logging.getLogger() root.addHandler(handler) root.setLevel(logging.DEBUG if debug else logging.INFO) + def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--gfa", required=True, help="GFA file which was used to create the VCF in the GFA-to-VCF pipeline") - parser.add_argument("--vcf", required=True, help="Input VCF file (output of the GFA-to-VCF conversion pipeline)") + parser.add_argument( + "--gfa", + required=True, + help="GFA file which was used to create the VCF in the GFA-to-VCF pipeline", + ) + parser.add_argument( + "--vcf", required=True, help="Input VCF file (output of the GFA-to-VCF conversion pipeline)" + ) parser.add_argument("--output", default=None, help="Output GFA File path (Default: sys.stdout)") - parser.add_argument("--bgzip", action='store_true', help="Flag to bgzip the output. Can only be given with --output.") + parser.add_argument( + "--bgzip", + action="store_true", + help="Flag to bgzip the output. Can only be given with --output.", + ) parser.add_argument("--debug", action="store_true", default=False, help="Print debug messages") options = parser.parse_args() setup_logging(options.debug) - + # Checks for argument parsing if options.bgzip and not options.output: - raise RuntimeError("--bgzip flag has been specified but not output path has been defined. Please define the output path.") + raise RuntimeError( + "--bgzip flag has been specified but not output path has been defined. Please define the output path." + ) add_bubble_tags(options) memory_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss logger.info("\n##### Memory Information #####") logger.info("Maximum memory usage: %.3f GB", memory_kb / 1e6) - + def add_bubble_tags(options): """ Adding the bubble tags to the nodes in the GFA file. Sort Keys will be added to the GFA S-lines. For this bubble tag, only the top level bubbles (LV=0) are considered. - + Two tags will be added: 1. BO:i: - This tag gives the bubble order. Odd tag means that the node is a scaffold node (at the start or end of the bubble). Even tag means that the node is inside a bubble. 2. NO:i: - This tag gives the node order. For nodes with the same BO tag, they are distinguished based on this NO tag to order each node uniquely. @@ -59,34 +72,34 @@ def add_bubble_tags(options): General Notes: 1. So inconsistent nodes will have 0 for both BO and NO tag. """ - - if options.output == None: + + if options.output is None: writer = sys.stdout else: if options.bgzip: - writer = libcbgzf.BGZFile(options.output, 'wb') + writer = libcbgzf.BGZFile(options.output, "wb") else: writer = open(options.output, "w") - node = defaultdict(lambda: [-1,-1]) + node = defaultdict(lambda: [-1, -1]) read_vcf(options.vcf, node) modify_gfa(options.gfa, node, writer) writer.close() - + + def write_to_file(line, writer): try: writer.write(line) except TypeError: writer.write(str.encode(line)) - + def modify_gfa(gfa, node, writer): - logger.info("\n##### Parsing GFA file and adding sort keys #####") if is_file_gzipped(gfa): - reader = gzip.open(gfa, 'rt') + reader = gzip.open(gfa, "rt") else: - reader = open(gfa, 'r') - + reader = open(gfa, "r") + S_lines_not_in_vcf = 0 line_count = defaultdict(lambda: 0) while True: @@ -99,31 +112,30 @@ def modify_gfa(gfa, node, writer): continue fields = line.split("\t") n = fields[1] - if node[n] == [-1,-1]: + if node[n] == [-1, -1]: S_lines_not_in_vcf += 1 write_to_file(line, writer) continue bo_tag = "BO:i:" no_tag = "NO:i:" - assert node[n][0] != -1 and node[n][1] != -1, "Node %s has -1,-1 tag"%(n) + assert node[n][0] != -1 and node[n][1] != -1, "Node %s has -1,-1 tag" % (n) bo_tag += str(node[n][0]) no_tag += str(node[n][1]) - new_line = line.rstrip()+"\t"+bo_tag+"\t"+no_tag+"\n" + new_line = line.rstrip() + "\t" + bo_tag + "\t" + no_tag + "\n" write_to_file(new_line, writer) - + for line_type, value in line_count.items(): - logger.info("Number of %s lines: %d"%(line_type, value)) - logger.info("Number of S lines not present in VCF: %d"%(S_lines_not_in_vcf)) - + logger.info("Number of %s lines: %d" % (line_type, value)) + logger.info("Number of S lines not present in VCF: %d" % (S_lines_not_in_vcf)) + def read_vcf(vcf, node): - logger.info("\n##### Parsing VCF file and reading bubble information #####") if is_file_gzipped(vcf): - reader = gzip.open(vcf, 'rt') + reader = gzip.open(vcf, "rt") else: - reader = open(vcf, 'r') - + reader = open(vcf, "r") + total_variants = 0 inconsistent_bubble_starts = 0 inconsistent_nodes = defaultdict(lambda: 0) @@ -132,74 +144,92 @@ def read_vcf(vcf, node): line = reader.readline() if not line: break - if line[0] == '#': + if line[0] == "#": continue fields = line.split("\t") info = fields[7].split(";") lv = None lv = int([x[3:] for x in info if x[0:2] == "LV"][0]) - assert lv != None, "Variant at chromosome %s position %s does not have a LV field. Need LV info to process"%(fields[0], fields[1]) + assert lv is not None, ( + "Variant at chromosome %s position %s does not have a LV field. Need LV info to process" + % (fields[0], fields[1]) + ) # Not considering any bubble at LV>0 if lv != 0: continue total_variants += 1 - + at = None at = [x[3:].split(",") for x in info if x[0:2] == "AT"][0] - assert at != None, "Variant at chromosome %s position %s does not have a AT field. Need AT info to process"%(fields[0], fields[1]) - - flanks = list(filter(None, re.split('(>)|(<)', fields[2]))) + assert at is not None, ( + "Variant at chromosome %s position %s does not have a AT field. Need AT info to process" + % (fields[0], fields[1]) + ) + + flanks = list(filter(None, re.split("(>)|(<)", fields[2]))) assert flanks[0] == flanks[2], "Bad orientation for the bubble." if flanks[0] == "<": flanks = [">", flanks[-1], ">", flanks[1]] assert len(flanks) == 4 - + if bo == 1: - node[flanks[1]][0] = bo #Since there is no previous bubble, this value has to be assigned + node[flanks[1]][0] = ( + bo # Since there is no previous bubble, this value has to be assigned + ) node[flanks[1]][1] = 0 else: - if node[flanks[1]][0] != bo: #By definition, the end node of bubble 1 should be start node of bubble 2. Checking that and counting such inconsistent starts. + if ( + node[flanks[1]][0] != bo + ): # By definition, the end node of bubble 1 should be start node of bubble 2. Checking that and counting such inconsistent starts. inconsistent_bubble_starts += 1 node[flanks[1]][0] = bo node[flanks[1]][1] = 0 - + no = 1 for traversal in at: - tn = list(filter(None, re.split('(>)|(<)', traversal))) - assert (tn[0] == flanks[0] and tn[1] == flanks[1]) and (tn[-2] == flanks[2] and tn[-1] == flanks[3]) + tn = list(filter(None, re.split("(>)|(<)", traversal))) + assert (tn[0] == flanks[0] and tn[1] == flanks[1]) and ( + tn[-2] == flanks[2] and tn[-1] == flanks[3] + ) tn = tn[2:-2] for n in tn: if n == ">" or n == "<": continue - #This condition checks for the case where the scaffold node is also present inside the bubble. + # This condition checks for the case where the scaffold node is also present inside the bubble. if node[n][0] == bo: - node[n][1] = 1 #Assigning an NO tag of 1. Usually scaffold nodes are tagged with 0. Here it is tagged with 1 to show that this is also inside a bubble. Hence its orientation cannot be trusted. + node[n][1] = ( + 1 # Assigning an NO tag of 1. Usually scaffold nodes are tagged with 0. Here it is tagged with 1 to show that this is also inside a bubble. Hence its orientation cannot be trusted. + ) continue - if not (node[n][0] == -1 or node[n][0] == bo+1): - logger.debug("[0] Node %s present in Bubble %s is also present in Bubble %s at level 0. Anomaly found in variant at chromosome %s:%s."%(n, str((bo+1)/2), str(node[n][0]/2), fields[0], fields[1])) #Asserting that the node n has not been called before. This will violate the definition of top level bubble. + if not (node[n][0] == -1 or node[n][0] == bo + 1): + logger.debug( + "[0] Node %s present in Bubble %s is also present in Bubble %s at level 0. Anomaly found in variant at chromosome %s:%s." + % (n, str((bo + 1) / 2), str(node[n][0] / 2), fields[0], fields[1]) + ) # Asserting that the node n has not been called before. This will violate the definition of top level bubble. inconsistent_nodes[0] += 1 node[n][0] = 0 node[n][1] = 0 continue - node[n][0] = bo+1 + node[n][0] = bo + 1 if node[n][1] != -1: continue node[n][1] = no no += 1 - node[flanks[3]][0] = bo+2 #Assigning the sort key to the end node of the buble + node[flanks[3]][0] = bo + 2 # Assigning the sort key to the end node of the buble node[flanks[3]][1] = 0 bo += 2 - logger.info("Total Variant Bubbles Processed: %d"%(total_variants)) - logger.info("Inconsistent Bubble Starts: %d"%(inconsistent_bubble_starts)) + logger.info("Total Variant Bubbles Processed: %d" % (total_variants)) + logger.info("Inconsistent Bubble Starts: %d" % (inconsistent_bubble_starts)) for key, value in inconsistent_nodes.items(): - logger.info("Number of nodes found in 2 separate bubbles (at level %d): %d"%(key, value)) + logger.info("Number of nodes found in 2 separate bubbles (at level %d): %d" % (key, value)) reader.close() def is_file_gzipped(src): with open(src, "rb") as inp: - return inp.read(2) == b'\x1f\x8b' + return inp.read(2) == b"\x1f\x8b" + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/.attic/.vcf_based_sorting_scripts/bubble-sort.py b/.attic/.vcf_based_sorting_scripts/bubble-sort.py index 66a9bb0..790df44 100644 --- a/.attic/.vcf_based_sorting_scripts/bubble-sort.py +++ b/.attic/.vcf_based_sorting_scripts/bubble-sort.py @@ -1,8 +1,8 @@ -''' +""" This uses the BO and NO tags defined by Samarendra Pani through processing the bubbles in the VCF produced by Glenn Hickey for the MC graph. The script to add these BO and NO tags are in add-bubble-info.py in this directory. Here since the bubbles are already defined, there is no need to process the entire alignment to find orientation of alignment since inversions are already taken into consideration. -''' +""" import sys import argparse @@ -17,65 +17,71 @@ logger = logging.getLogger(__name__) timers = StageTimer() + + def setup_logging(debug): handler = logging.StreamHandler() root = logging.getLogger() root.addHandler(handler) root.setLevel(logging.DEBUG if debug else logging.INFO) + def main(): - parser = argparse.ArgumentParser() parser.add_argument("--debug", action="store_true", default=False, help="Print debug messages") - parser.add_argument("--gfa", required=True, help="GFA file with the sort keys (BO and NO tagged)") + parser.add_argument( + "--gfa", required=True, help="GFA file with the sort keys (BO and NO tagged)" + ) parser.add_argument("--gaf", required=True, help="Input GAF File") parser.add_argument("--output", default=None, help="Output GAF File path (Default: sys.stdout)") - parser.add_argument("--bgzip", action='store_true', help="Flag to bgzip the output. Can only be given with --output.") - + parser.add_argument( + "--bgzip", + action="store_true", + help="Flag to bgzip the output. Can only be given with --output.", + ) + options = parser.parse_args() setup_logging(options.debug) - + validate_arguments(options) - + bubble_sort(options) memory_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss logger.info("\nMemory Information") logger.info(" Maximum memory usage: %.3f GB", memory_kb / 1e6) logger.info("\nTime Summary:") - logger.info(" Time to parse GFA file: %.3f"%(timers.elapsed("read_gfa"))) - logger.info(" Total time to sort GAF file: %.3f"%(timers.elapsed("total_sort"))) - logger.info(" Time to parse GAF file: %.3f"%(timers.elapsed("read_gaf"))) - logger.info(" Time to sort GAF file: %.3f"%(timers.elapsed("sort_gaf"))) - logger.info(" Time to write GAF file: %.3f"%(timers.elapsed("write_gaf"))) - + logger.info(" Time to parse GFA file: %.3f" % (timers.elapsed("read_gfa"))) + logger.info(" Total time to sort GAF file: %.3f" % (timers.elapsed("total_sort"))) + logger.info(" Time to parse GAF file: %.3f" % (timers.elapsed("read_gaf"))) + logger.info(" Time to sort GAF file: %.3f" % (timers.elapsed("sort_gaf"))) + logger.info(" Time to write GAF file: %.3f" % (timers.elapsed("write_gaf"))) + def bubble_sort(options): - - if options.output == None: + if options.output is None: writer = sys.stdout else: if options.bgzip: - writer = libcbgzf.BGZFile(options.output, 'wb') + writer = libcbgzf.BGZFile(options.output, "wb") else: writer = open(options.output, "w") - nodes = defaultdict(lambda: [-1,-1]) + nodes = defaultdict(lambda: [-1, -1]) with timers("read_gfa"): read_gfa(options.gfa, nodes) with timers("total_sort"): sort(options.gaf, nodes, writer) writer.close() - + def sort(gaf, nodes, writer): - logger.info("\n##### Parsing GAF file and sorting it #####") if is_file_gzipped(gaf): - reader = gzip.open(gaf, 'rt') + reader = gzip.open(gaf, "rt") else: - reader = open(gaf, 'r') - - Alignment = namedtuple('Alignment', ['offset', 'BO', 'NO', 'start']) + reader = open(gaf, "r") + + Alignment = namedtuple("Alignment", ["offset", "BO", "NO", "start"]) gaf_alignments = [] # First pass: Store all the alignment lines as minimally. Just storing line offset and alignment string. @@ -86,15 +92,15 @@ def sort(gaf, nodes, writer): line = reader.readline() if not line: break - line = line.split('\t') + line = line.split("\t") bo, no, start = process_alignment(line, nodes, offset) gaf_alignments.append(Alignment(offset=offset, BO=bo, NO=no, start=start)) - + # Sorting the alignments based on BO and NO tag with timers("sort_gaf"): logger.debug("Sorting the alignments...") gaf_alignments.sort(key=functools.cmp_to_key(compare_gaf)) - + # Writing the sorted file with timers("write_gaf"): logger.debug("Writing Output File...") @@ -103,24 +109,24 @@ def sort(gaf, nodes, writer): reader.seek(off) line = reader.readline() write_to_file(line, writer) - + reader.close() + def read_gfa(gfa, node): - logger.info("\n##### Parsing GFA file and reading sort key information #####") if is_file_gzipped(gfa): - reader = gzip.open(gfa, 'rt') + reader = gzip.open(gfa, "rt") else: - reader = open(gfa, 'r') - + reader = open(gfa, "r") + total_nodes = 0 tagged_nodes = 0 while True: line = reader.readline() if not line: break - if line[0] != 'S': + if line[0] != "S": continue total_nodes += 1 fields = line.split("\t") @@ -130,14 +136,14 @@ def read_gfa(gfa, node): tagged_nodes += 1 if f.startswith("NO:i:"): node[fields[1]][1] = int(f[5:]) - - logger.info("Total Nodes Processed: %d"%(total_nodes)) - logger.info("Nodes with tags: %d"%(tagged_nodes)) + + logger.info("Total Nodes Processed: %d" % (total_nodes)) + logger.info("Nodes with tags: %d" % (tagged_nodes)) reader.close() def process_alignment(line, nodes, offset): - path = list(filter(None, re.split('(>)|(<)', line[5]))) + path = list(filter(None, re.split("(>)|(<)", line[5]))) orient = None # If there is no scaffold node present in the alignment, then assuming that it is in the correct orientation. # TODO: Need to find a way to deal with such alignments. @@ -150,11 +156,11 @@ def process_alignment(line, nodes, offset): orient = n continue if nodes[n][0] == -1 or nodes[n][1] == -1: - logger.debug("[ERR]\tOF:i:%d\tND:Z:%s"%(offset,n)) - #raise RuntimeError("Found a node which was not in VCF.") - #These nodes exist. + logger.debug("[ERR]\tOF:i:%d\tND:Z:%s" % (offset, n)) + # raise RuntimeError("Found a node which was not in VCF.") + # These nodes exist. continue - if nodes[n][0]%2 == 1 and nodes[n][1] == 0: + if nodes[n][0] % 2 == 1 and nodes[n][1] == 0: # Here we assume that the orientation of the first scaffold node found is the orientation for all the scaffold nodes. (Here we will have to keep in mind that some scaffold nodes are inside the bubbles also and their orientation can be anything there.) # TODO: Have to check for these scaffold nodes that can be present inside bubbles (Cannot trust them) # TODO: Have given the NO tag of 1 to these scaffold nodes to distinguish them from normal scaffold nodes which are tagged with 0. @@ -169,7 +175,7 @@ def process_alignment(line, nodes, offset): if rv: l = int(line[6]) e = int(line[8]) - start = l-e + start = l - e n = path[-1] bo = nodes[n][0] no = nodes[n][1] @@ -178,7 +184,7 @@ def process_alignment(line, nodes, offset): n = path[1] bo = nodes[n][0] no = nodes[n][1] - #print(bo, no, start, sep='\t') + # print(bo, no, start, sep='\t') return bo, no, start @@ -200,19 +206,19 @@ def compare_gaf(al1, al2): return -1 if al1.BO > al2.BO: return 1 - + # Comparing NO tags if al1.NO < al2.NO: return -1 if al1.BO > al2.BO: return 1 - + # Comparing start position in node if al1.start < al2.start: return -1 if al1.start > al2.start: return 1 - + # This will be execulted only when two reads start at the exact same position at the same node. Then we use offset values. So the read which comes first in the GAF file will come higher up. if al1.offset < al2.offset: return -1 @@ -229,12 +235,14 @@ def write_to_file(line, writer): def is_file_gzipped(src): with open(src, "rb") as inp: - return inp.read(2) == b'\x1f\x8b' + return inp.read(2) == b"\x1f\x8b" def validate_arguments(options): if options.bgzip and not options.output: - raise RuntimeError("--bgzip flag has been specified but not output path has been defined. Please define the output path.") + raise RuntimeError( + "--bgzip flag has been specified but not output path has been defined. Please define the output path." + ) if __name__ == "__main__": diff --git a/.attic/bicc_networkx.py b/.attic/bicc_networkx.py index cc7bc6f..907d0ee 100644 --- a/.attic/bicc_networkx.py +++ b/.attic/bicc_networkx.py @@ -1,11 +1,3 @@ -import os -import sys -import pdb -from gaftools.GFA import GFA -from itertools import chain -import networkx as nx - - def biccs_nx(graph): def edge_stack_to_set(edge_stack): out_set = set() @@ -30,8 +22,8 @@ def next_child(stack_item): for n in graph.nodes: if n in visited: continue - discovery = {n:0} - low = {n:0} + discovery = {n: 0} + low = {n: 0} root_children = 0 artic_points = [] components = [] @@ -59,7 +51,7 @@ def next_child(stack_item): visited.add(nn) stack.append([child, nn, 0, graph[nn].neighbors()]) edge_stack.append((child, nn)) - elif nn == None: + elif nn is None: stack.pop() if len(stack) > 1: if low[child] >= discovery[parent]: @@ -72,15 +64,18 @@ def next_child(stack_item): low[parent] = min(low[parent], low[child]) elif stack: root_children += 1 - components.append(edge_stack_to_set(edge_stack[edge_stack.index((parent, child)):])) + components.append( + edge_stack_to_set(edge_stack[edge_stack.index((parent, child)) :]) + ) if root_children > 1: artic_points.append[n] return components, artic_points - - def bi_cc_rec(self, n_id, parent, low, disc, stack, node_ids, all_bi_cc, artic_points, disc_time): + def bi_cc_rec( + self, n_id, parent, low, disc, stack, node_ids, all_bi_cc, artic_points, disc_time + ): # Count of children in current node u = node_ids[n_id] children = 0 @@ -88,23 +83,33 @@ def bi_cc_rec(self, n_id, parent, low, disc, stack, node_ids, all_bi_cc, artic_p disc[u] = disc_time[0] low[u] = disc_time[0] disc_time[0] += 1 - + # Recur for all the vertices adjacent to this vertex for neighbor_id in self.nodes[n_id].neighbors(): v = node_ids[neighbor_id] # If v is not visited yet, then make it a child of u # in DFS tree and recur for it - if disc[v] == -1 : + if disc[v] == -1: parent[v] = n_id children += 1 - stack.append((n_id, neighbor_id)) # store the edge in stack - self.bi_cc_rec(neighbor_id, parent, low, disc, stack, node_ids, all_bi_cc, artic_points, disc_time) - + stack.append((n_id, neighbor_id)) # store the edge in stack + self.bi_cc_rec( + neighbor_id, + parent, + low, + disc, + stack, + node_ids, + all_bi_cc, + artic_points, + disc_time, + ) + # Check if the subtree rooted with v has a connection to # one of the ancestors of u # Case 1 -- per Strongly Connected Components Article low[u] = min(low[u], low[v]) - + # If u is an articulation point, pop # all edges from stack until (u, v) if parent[u] == -1 and children > 1 or parent[u] != -1 and low[v] >= disc[u]: @@ -116,9 +121,9 @@ def bi_cc_rec(self, n_id, parent, low, disc, stack, node_ids, all_bi_cc, artic_p for n in w: one_bi_cc.add(n) all_bi_cc.append(one_bi_cc) - + elif neighbor_id != parent[u] and low[u] > disc[v]: - low[u] = min(low [u], disc[v]) + low[u] = min(low[u], disc[v]) stack.append((n_id, neighbor_id)) def bicc(self): @@ -146,7 +151,9 @@ def bicc(self): for n_id in self.nodes.keys(): i = node_ids[n_id] if disc[i] == -1: - self.bi_cc_rec(n_id, parent, low, disc, stack, node_ids, all_bi_cc, artic_points, disc_time) + self.bi_cc_rec( + n_id, parent, low, disc, stack, node_ids, all_bi_cc, artic_points, disc_time + ) # If stack is not empty, pop all edges from stack if stack: one_bi_cc = set() @@ -155,4 +162,4 @@ def bicc(self): for n in w: one_bi_cc.add(n) all_bi_cc.append(one_bi_cc) - return all_bi_cc, artic_points \ No newline at end of file + return all_bi_cc, artic_points diff --git a/.attic/find_bubbles.py b/.attic/find_bubbles.py index 82c4766..ef60a76 100644 --- a/.attic/find_bubbles.py +++ b/.attic/find_bubbles.py @@ -4,9 +4,10 @@ https://doi.org/10.1093/bioinformatics/btac448 """ + def find_sb_alg(graph, s, direction, only_simple=False): """ - takes the graph and a start node s and add a bubble to the chain + takes the graph and a start node s and add a bubble to the chain if one is found if s was the source """ # I tuples of node ids and the direction @@ -17,7 +18,6 @@ def find_sb_alg(graph, s, direction, only_simple=False): # seen.add(s.id) S = {(s, direction)} while len(S) > 0: - v = S.pop() v = (v[0], v[1]) visited.add(v[0].id) diff --git a/.attic/graph.py b/.attic/graph.py index fe48ebb..84a1587 100644 --- a/.attic/graph.py +++ b/.attic/graph.py @@ -1,10 +1,7 @@ """ -The code has been taken from WhatsHap. -Link to WhatsHap: https://github.com/whatshap/whatshap -""" -""" Find connected components. """ + from abc import abstractmethod from collections import OrderedDict from typing import TypeVar, Generic, Optional, Iterable diff --git a/.attic/sort.py b/.attic/sort.py index 6c5ec48..2e2c380 100644 --- a/.attic/sort.py +++ b/.attic/sort.py @@ -5,7 +5,6 @@ import logging import sys -from gaftools import __version__ from gaftools.cli import log_memory_usage from gaftools.timer import StageTimer @@ -13,26 +12,24 @@ def run(gaf_file, gfa_file, output=sys.stdout): - timers = StageTimer() - if (gaf_file): + if gaf_file: gaf_sort(gaf_file, output) else: gfa_sort(gfa_file, output) - + logger.info("\n== SUMMARY ==") total_time = timers.total() log_memory_usage() logger.info("Total time: %9.2f s", total_time) - -def gfa_sort(gfa_path, out_path = None, return_list = True): - '''This function sorts the given gfa file based on the contig name and start position within the +def gfa_sort(gfa_path, out_path=None, return_list=True): + """This function sorts the given gfa file based on the contig name and start position within the contig. Note that it only sorts S lines and leaves the others. This can be called from the command line or from another funtion by providing "True" to the return_list argument. - ''' + """ import glob from heapq import merge @@ -40,141 +37,133 @@ def gfa_sort(gfa_path, out_path = None, return_list = True): import gzip import os - gfa_lines = [] path = "part*.gfa" chunk_size = 250000 chunk_id = 1 if is_file_gzipped(gfa_path): - open_gfa = gzip.open - is_gzipped = True + open_gfa = gzip.open else: open_gfa = open - - with open_gfa(gfa_path, 'rt') as gfa_file: - f_out = open('part_{}.gfa'.format(chunk_id), 'w') - + + with open_gfa(gfa_path, "rt") as gfa_file: + f_out = open("part_{}.gfa".format(chunk_id), "w") + for line_num, mapping in enumerate(gfa_file, 1): - val = mapping.rstrip().split('\t') + val = mapping.rstrip().split("\t") gfa_lines.append(val) - + if not line_num % chunk_size: gfa_lines.sort(key=functools.cmp_to_key(compare_gfa)) - + for line_count, line in enumerate(gfa_lines): - f_out.write('\t'.join(line) + '\n') - - logger.info('INFO: Splitting %d' %chunk_id) + f_out.write("\t".join(line) + "\n") + + logger.info("INFO: Splitting %d" % chunk_id) f_out.close() gfa_lines = [] chunk_id += 1 - f_out = open('part_{}.gfa'.format(chunk_id), 'w') - + f_out = open("part_{}.gfa".format(chunk_id), "w") if gfa_lines: - logger.info('INFO: Splitting %d' %chunk_id) + logger.info("INFO: Splitting %d" % chunk_id) gfa_lines.sort(key=functools.cmp_to_key(compare_gfa)) for line_count, line in enumerate(gfa_lines): - f_out.write('\t'.join(line) + '\n') + f_out.write("\t".join(line) + "\n") f_out.close() gfa_lines = [] chunks = [] for filename in glob.glob(path): - chunks += [open(filename, 'r')] - + chunks += [open(filename, "r")] + if return_list: gfa_s = [] tmp = merge(*chunks, key=functools.cmp_to_key(compare_gfa2)) for i in tmp: - #print(i) + # print(i) if i[0] == "S": - #print(i.rstrip()) - gfa_s.append(i.rstrip().split('\t')) - + # print(i.rstrip()) + gfa_s.append(i.rstrip().split("\t")) + for part_file in glob.glob(path): if os.path.isfile(part_file): os.remove(part_file) return gfa_s - with open(out_path, 'w') as f_out: + with open(out_path, "w") as f_out: f_out.writelines(merge(*chunks, key=functools.cmp_to_key(compare_gfa2))) - + for part_file in glob.glob(path): if os.path.isfile(part_file): os.remove(part_file) -def gaf_sort(gaf_path, out_path = None): - '''This function sorts a gaf file (mappings to a pangenome graph) in sstable coordinate system based on 1)Contig name 2)Start +def gaf_sort(gaf_path, out_path=None): + """This function sorts a gaf file (mappings to a pangenome graph) in sstable coordinate system based on 1)Contig name 2)Start position of the contig's mapping loci. - ''' + """ - import glob + import glob import functools import gzip from heapq import merge import os - gaf_lines = [] path = "part*.gaf" chunk_size = 500000 chunk_id = 1 if is_file_gzipped(gaf_path): - open_gaf = gzip.open - is_gzipped = True + open_gaf = gzip.open else: open_gaf = open + with open_gaf(gaf_path, "rt") as gaf_file: + f_out = open("part_{}.gaf".format(chunk_id), "w") - with open_gaf(gaf_path, 'rt') as gaf_file: - f_out = open('part_{}.gaf'.format(chunk_id), 'w') - for line_num, mapping in enumerate(gaf_file, 1): - val = mapping.rstrip().split('\t') + val = mapping.rstrip().split("\t") gaf_lines.append(val) - + if not line_num % chunk_size: gaf_lines.sort(key=functools.cmp_to_key(compare_gaf)) - + for line_count, line in enumerate(gaf_lines): - f_out.write('\t'.join(line) + '\n') - - logger.info('INFO: Splitting', chunk_id) + f_out.write("\t".join(line) + "\n") + + logger.info("INFO: Splitting", chunk_id) f_out.close() gaf_lines = [] chunk_id += 1 - f_out = open('part_{}.gaf'.format(chunk_id), 'w') - + f_out = open("part_{}.gaf".format(chunk_id), "w") if gaf_lines: - logger.info('INFO: Splitting', chunk_id) + logger.info("INFO: Splitting", chunk_id) gaf_lines.sort(key=functools.cmp_to_key(compare_gaf)) for line_count, line in enumerate(gaf_lines): - f_out.write('\t'.join(line) + '\n') + f_out.write("\t".join(line) + "\n") f_out.close() gaf_lines = [] chunks = [] for filename in glob.glob(path): - chunks += [open(filename, 'r')] - - with open(out_path, 'w') as f_out: + chunks += [open(filename, "r")] + + with open(out_path, "w") as f_out: f_out.writelines(merge(*chunks, key=functools.cmp_to_key(compare_gaf2))) - + for part_file in glob.glob(path): if os.path.isfile(part_file): os.remove(part_file) def compare_gfa(ln1, ln2): - if not ln1[0] == "S" and not ln2[0] == "S": - #If both are not S lines, then leave it + # If both are not S lines, then leave it return -1 elif ln1[0] == "S" and not ln2[0] == "S": return -1 @@ -198,12 +187,11 @@ def compare_gfa(ln1, ln2): def compare_gfa2(ln1, ln2): - - ln1 = ln1.rstrip().split('\t') - ln2 = ln2.rstrip().split('\t') - #print(ln1, ln2) + ln1 = ln1.rstrip().split("\t") + ln2 = ln2.rstrip().split("\t") + # print(ln1, ln2) if not ln1[0] == "S" and not ln2[0] == "S": - #If both are not S lines, then leave it + # If both are not S lines, then leave it return -1 elif ln1[0] == "S" and not ln2[0] == "S": return -1 @@ -227,16 +215,14 @@ def compare_gfa2(ln1, ln2): def compare_gaf2(ln1, ln2): - - ln1 = ln1.rstrip().split('\t') - ln2 = ln2.rstrip().split('\t') + ln1 = ln1.rstrip().split("\t") + ln2 = ln2.rstrip().split("\t") if ln1[5][0] == ">" or ln1[5][0] == "<": chr1 = ln1[5][1:].lower() else: chr1 = ln1[5].lower() - if ln2[5][0] == ">" or ln2[5][0] == "<": chr2 = ln2[5][1:].lower() else: @@ -255,14 +241,13 @@ def compare_gaf2(ln1, ln2): else: return 1 -def compare_gaf(ln1, ln2): +def compare_gaf(ln1, ln2): if ln1[5][0] == ">" or ln1[5][0] == "<": chr1 = ln1[5][1:].lower() else: chr1 = ln1[5].lower() - if ln2[5][0] == ">" or ln2[5][0] == "<": chr2 = ln2[5][1:].lower() else: @@ -284,7 +269,8 @@ def compare_gaf(ln1, ln2): def is_file_gzipped(src): with open(src, "rb") as inp: - return inp.read(2) == b'\x1f\x8b' + return inp.read(2) == b"\x1f\x8b" + # fmt: off def add_arguments(parser): @@ -295,12 +281,13 @@ def add_arguments(parser): arg('-o', '--output', default=sys.stdout, help='Output GAF file. If omitted, use standard output.') # fmt: on + def validate(args, parser): if args.gaf_file and args.gfa_file: parser.error("Please input either a GAF or GFA file.") if not args.gaf_file and not args.gfa_file: parser.error("Please input a GAF or a GFA file to be sorted") - + def main(args): run(**vars(args)) diff --git a/.gitignore b/.gitignore index d383535..32cff98 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,4 @@ gaftools.egg-info/ .eggs/ gaftools/_version.py docs/_build -.coverage \ No newline at end of file +.coverage diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..c4f28a0 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,16 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: "v4.5.0" + hooks: + - id: end-of-file-fixer + - id: trailing-whitespace +- repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.4.9 + hooks: + # Run the linter. + - id: ruff + entry: ruff check gaftools/ tests/ setup.py + # Run the formatter. + - id: ruff-format + args: [gaftools/, tests/, setup.py] diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 6bd71a8..1449082 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -19,4 +19,4 @@ sphinx: configuration: "docs/conf.py" formats: - - pdf \ No newline at end of file + - pdf diff --git a/CHANGES.rst b/CHANGES.rst index 0e8459d..e553822 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,4 +4,4 @@ Changes v1.0 (19/06/2024) ----------------- -* Initial commit. \ No newline at end of file +* Initial commit. diff --git a/README.md b/README.md index 389adb8..e1cd46c 100644 --- a/README.md +++ b/README.md @@ -2,4 +2,4 @@ This is a suite of scripts developed for working with GAF files and their corresponding rGFA files. -Detailed documentation is available [here](https://gaftools.readthedocs.io/en/latest/index.html). \ No newline at end of file +Detailed documentation is available [here](https://gaftools.readthedocs.io/en/latest/index.html). diff --git a/docs/Makefile b/docs/Makefile index 948c52a..20e5495 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -33,7 +33,7 @@ help: @echo " text to make text files" @echo " man to make manual pages" @echo " changes to make an overview of all changed/added/deprecated items" - + clean: rm -rf $(BUILDDIR)/* @@ -79,4 +79,3 @@ changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." - diff --git a/docs/changes.rst b/docs/changes.rst index 96dd280..58b3592 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -1,3 +1,3 @@ .. _changes: -.. include:: ../CHANGES.rst \ No newline at end of file +.. include:: ../CHANGES.rst diff --git a/docs/develop.rst b/docs/develop.rst index 1ae488e..28d505a 100644 --- a/docs/develop.rst +++ b/docs/develop.rst @@ -26,7 +26,7 @@ Adding a new subcommand For creating a new subcommand under gaftools, add a new script under :code:`gaftools/cli/`. Make sure to follow the same format and add test cases for the subcommand under :code:`tests`. -Since gaftools is purely written in Python, adding a script is enough. +Since gaftools is purely written in Python, adding a script is enough. If the new script adds external dependencies, then add the dependencies to :code:`requirements.txt`. @@ -44,9 +44,33 @@ To generate an html report on the test coverage:: coverage html firefox htmlcov/index.html +To format using :code:`ruff`, run:: + + ruff format gaftools/ test/ setup.py + +To check for syntax and style errors using :code:`ruff`, run:: + + ruff check gaftools/ test/ setup.py + +To check for proper documentation build, proper syntax and styling, and pytest on multiple python versions (installed locally), run:: + + tox + Before creating a pull request - * write test cases for the new code. - * run :code:`pytest`. + +#. write test cases for the new code. +#. run :code:`tox` after fixing formatting and syntax with :code:`black` and :code:`flake8`. +#. create the pull request, if the tox check passes with your local python version. + + +Enabling Pre-Commit +------------------- + +The tox pipeline has been integrated into GitHub as a CI/CD pipeline. +Instead of doing all the checks locally, the checks can be done online. + +There is also pre-commit support which also allows developers to skip the +formatting and syntax checking using :code:`ruff`. Writing Documentation @@ -69,4 +93,31 @@ Adding C++ and Cython scripts Currently, the setup script does not support building of C++ scripts and wrapping them into Python using Cython. If it is required for any future work, refer to the WhatsHap setup structure and its documentation to add C++ and Cython scripts. -Accordingly this documentation for developing will have to be changed. \ No newline at end of file +Accordingly this documentation for developing will have to be changed. + + +Making a Release +---------------- + +#. Update ``CHANGES.rst``: Set the correct version number and ensure that + all nontrivial, user-visible changes are listed. + +#. Ensure you have no uncommitted changes in the working copy. + +#. Run ``tox``. + +#. Tag the current commit with the version number (there must be a ``v`` prefix):: + + git tag -a -m "Version 0.1" v0.1 + +#. Push the tag:: + + git push --tags + +#. Wait for the GitHub Action to finish. It will deploy the sdist and wheels to + PyPI if everything worked correctly. + +If something went wrong, fix the problem and follow the above instructions again, +but with an incremented revision in the version number. That is, go from version +x.y to x.y.1. PyPI will not allow you to change a version that has already been +uploaded. diff --git a/docs/guide.rst b/docs/guide.rst index 218b998..6fde677 100644 --- a/docs/guide.rst +++ b/docs/guide.rst @@ -41,30 +41,30 @@ The index is a reverse look-up table with the keys being nodes in the graph and gaftools order_gfa ------------------ -This subcommand establishes an order to the graph based on the "bubbles" in the graph. +This subcommand establishes an order to the graph based on the "bubbles" in the graph. Here, we define the bubbles as biconnected components, i.e. not the strict definition of a bubble found in other papers. The graph input here has to be an `rGFA `_, with SN and SO tags. The basic idea here is that we first detect all biconnected components (bubbles), and collapse the detected bubbles into one node, which should result in a line graph made from scaffold nodes that belong to the reference and the collapsed bubbles inbetween. We then -order this line graph in ascending order based on the coordinates in the SO tag. Each node in this ordered line graph +order this line graph in ascending order based on the coordinates in the SO tag. Each node in this ordered line graph gets an ascending BO tag from 1 to N (N being the number of nodes in the line graph). For the nodes that represent a collapsed -bubbles, all the nodes in that bubble will get the same BO tag (Figure 1). As for the NO tag, the nodes in a bubble get an ascending +bubbles, all the nodes in that bubble will get the same BO tag (Figure 1). As for the NO tag, the nodes in a bubble get an ascending number from 1 to M (M being the number of nodes in a bubble). However, the NO tag inside a bubble is assigned based on the node id order, i.e. in a lexicographic order of the node IDs. -In the graph shown below, which is a part of a longer graph, when the bubbles collapsed, +In the graph shown below, which is a part of a longer graph, when the bubbles collapsed, this will result in a line graph of 9 nodes. Below we see a chain of 4 bubbles (biconnected components) and 5 scaffold nodes, where the nodes inside -the bubbles are colored blue and the scaffold nodes are colored yellow. The numbers of the node is the -BO tag, where it increases by 1 starting from the first scaffold node on the left (19 to 27), and we see that +the bubbles are colored blue and the scaffold nodes are colored yellow. The numbers of the node is the +BO tag, where it increases by 1 starting from the first scaffold node on the left (19 to 27), and we see that all the nodes in a bubble have the same BO tag .. image:: _static/bo_tags.png :width: 600 -In this figure, we see the same graph but with the NO tags marked on the nodes. Scaffold nodes always +In this figure, we see the same graph but with the NO tags marked on the nodes. Scaffold nodes always have a NO tag of 0, and the nodes inside a bubble are marked with an increasing order of the NO tag. .. image:: _static/no_tags.png @@ -76,16 +76,16 @@ have a NO tag of 0, and the nodes inside a bubble are marked with an increasing gaftools phase -------------- -This subcommands adds the phase information of the GAF reads from a haplotag TSV file generated using -:code:`whatshap haplotag`. +This subcommands adds the phase information of the GAF reads from a haplotag TSV file generated using +:code:`whatshap haplotag`. .. _gaftools-realign: gaftools realign ---------------- - -This subcommand realigns all the alignments in GAF back the rGFA it was originally aligned to using Wavefront Alignment. -This fixes alignment issues found in GraphAligner where large indels are represented as a series of small indels in the + +This subcommand realigns all the alignments in GAF back the rGFA it was originally aligned to using Wavefront Alignment. +This fixes alignment issues found in GraphAligner where large indels are represented as a series of small indels in the CIGAR string. .. _gaftools-sort: @@ -93,7 +93,7 @@ CIGAR string. gaftools sort ------------- -This subcommand sorts the alignments in the GAF file using the BO and NO tags generated by :code:`gaftools order_gfa`. Hence this +This subcommand sorts the alignments in the GAF file using the BO and NO tags generated by :code:`gaftools order_gfa`. Hence this subcommand requires initial processing of the rGFA with :code:`order_gfa`. @@ -102,7 +102,7 @@ subcommand requires initial processing of the rGFA with :code:`order_gfa`. gaftools stat ------------- -This subcommand returns basic statistics of the GAF alignments like number of primary and secondary alignments, total aligned bases, +This subcommand returns basic statistics of the GAF alignments like number of primary and secondary alignments, total aligned bases, average mapping quality, etc. .. _gaftools-view: @@ -110,11 +110,5 @@ average mapping quality, etc. gaftools view ------------- -This subcommand helps view the GAF alignments, convert formatting from stable to unstable and vice-versa, and subsetting +This subcommand helps view the GAF alignments, convert formatting from stable to unstable and vice-versa, and subsetting the files based on nodes or regions given by the user. - - - - - - diff --git a/docs/index.rst b/docs/index.rst index 645491a..60bc3c6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -40,7 +40,7 @@ Requirements gaftools has the following requirement: -* python +* python>=3.8 * pysam * pywfa diff --git a/gaftools/cli/index.py b/gaftools/cli/index.py index f9b9169..0bb24e9 100644 --- a/gaftools/cli/index.py +++ b/gaftools/cli/index.py @@ -22,7 +22,6 @@ def run(gaf_path, gfa_path, output=None): - timers = StageTimer() if output is None: output = gaf_path + ".gvi" @@ -115,7 +114,6 @@ def run(gaf_path, gfa_path, output=None): def convert_coord(line, ref): - unstable_coord = [] gaf_contigs = list(filter(None, re.split("(>)|(<)", line[5]))) for nd in gaf_contigs: diff --git a/gaftools/cli/order_gfa.py b/gaftools/cli/order_gfa.py index 7008c05..1134313 100644 --- a/gaftools/cli/order_gfa.py +++ b/gaftools/cli/order_gfa.py @@ -49,7 +49,6 @@ def run_order_gfa( chromosome_order=None, with_sequence=False, ): - if chromosome_order is not None: chromosome_order = chromosome_order.split(sep=",") @@ -57,8 +56,14 @@ def run_order_gfa( logging.warning(f"The directory {outdir} does not exist, creating one") try: os.makedirs(outdir) - except: # I know broad exceptions are not recommended but I think for here it's fine - logging.error("were not able to create directory") + except PermissionError: + logging.error(f"were not able to create directory {outdir}. Permission Denied") + sys.exit() + except FileNotFoundError: + logging.error(f"were not able to create directory {outdir}, File not found") + sys.exit() + except OSError: + logging.error(f"were not able to create directory {outdir}, OSError") sys.exit() logger.info(f"Reading {gfa_filename}") diff --git a/gaftools/cli/phase.py b/gaftools/cli/phase.py index 8d55bff..9e9f5c6 100644 --- a/gaftools/cli/phase.py +++ b/gaftools/cli/phase.py @@ -24,7 +24,6 @@ def __init__(self, chr_name, haplotype, phase_set): def run(gaf_file, tsv_file, output=sys.stdout): - timers = StageTimer() add_phase_info(gaf_file, tsv_file, output) logger.info("\n== SUMMARY ==") @@ -57,7 +56,6 @@ def add_phase_info(gaf_path, tsv_path, out_path): phased = 0 gaf_file = GAF(gaf_path) for gaf_line in gaf_file.read_file(): - if line_count != 0: gaf_out.write("\n") diff --git a/gaftools/cli/sort.py b/gaftools/cli/sort.py index f09bf6e..7c4ac66 100644 --- a/gaftools/cli/sort.py +++ b/gaftools/cli/sort.py @@ -29,7 +29,6 @@ def run_sort(gfa, gaf, outgaf=None, outind=None, bgzip=False): - if outgaf is None: writer = sys.stdout index_file = None @@ -63,7 +62,6 @@ def run_sort(gfa, gaf, outgaf=None, outind=None, bgzip=False): def sort(gaf, nodes, writer, index_dict, index_file): - logger.info("Parsing GAF file and sorting it") if utils.is_file_gzipped(gaf): reader = libcbgzf.BGZFile(gaf, "rb") diff --git a/gaftools/cli/view.py b/gaftools/cli/view.py index c573f74..7af07b3 100644 --- a/gaftools/cli/view.py +++ b/gaftools/cli/view.py @@ -26,7 +26,6 @@ def run(gaf_path, gfa=None, output=None, index=None, nodes=[], regions=[], format=None): - timers = StageTimer() if output is None: @@ -167,7 +166,6 @@ def get_unstable(regions, index): result = [] for n, c in enumerate(contig): - try: node_list = node_dict[c] except KeyError: diff --git a/gaftools/conversion.py b/gaftools/conversion.py index 8313589..74b3c8c 100644 --- a/gaftools/conversion.py +++ b/gaftools/conversion.py @@ -25,7 +25,6 @@ def to_string(self, orient): def merge_nodes(node1, node2, orient1, orient2): - if (node1.contig_id != node2.contig_id) or (orient1 != orient2): return False if (orient1 == ">") and (node1.end != node2.start): diff --git a/gaftools/gaf.py b/gaftools/gaf.py index de49043..9c96e83 100644 --- a/gaftools/gaf.py +++ b/gaftools/gaf.py @@ -29,7 +29,6 @@ def __init__( score=None, tags=None, ): - self.query_name = query_name self.query_length = query_length self.query_start = query_start @@ -195,7 +194,6 @@ def parse_gaf_line(self, line): def compare_aln(ln1, ln2): - if ln1.query_start < ln2.query_start: return -1 elif ln1.query_start == ln2.query_start: diff --git a/pyproject.toml b/pyproject.toml index bd8cdb0..180192c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ authors = [ ] description = "general purpose utility tool for working with GAF files" readme = 'README.md' -requires-python = ">=3.10" +requires-python = ">=3.8" license = {text = "MIT"} classifiers = [ "Development Status :: 4 - Beta", @@ -43,8 +43,7 @@ dev = [ "pytest", "coverage", "tox", - "flake8", - "black", + "ruff", "sphinx", "sphinx-issues", "sphinx-rtd-theme", @@ -67,11 +66,16 @@ include = ["gaftools*"] write_to = "gaftools/_version.py" -[tool.black] -line-length = 100 -target-version = ["py37"] - - [tool.pytest.ini_options] addopts = "--doctest-modules" -testpaths = ["tests", "gaftools"] \ No newline at end of file +testpaths = ["tests", "gaftools"] + + +[tool.ruff] +line-length = 100 +lint.ignore = [ + "E203", # whitespace before ':' -- must be ignored for Black + "E501", # line too long + "E722", # bare except + "E741", # Ambiguous variable name +] diff --git a/requirements.txt b/requirements.txt index 44003c1..f066c61 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ pysam -pywfa==0.5.1 \ No newline at end of file +pywfa==0.5.1 diff --git a/tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf b/tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf index 875ffd3..ed0d686 100644 --- a/tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf +++ b/tests/data/alignments-minigraph-stable-conversioncheck-only-s2.gaf @@ -1,2 +1,2 @@ read_s2_s464827_s3_s4 400 1 285 + >chr1:3293-3634>NA20129#1#JAHEPE010000248.1:4917-5103 527 194 478 284 284 60 tp:A:P NM:i:0 cm:i:29 s1:i:326 s2:i:109 dv:f:0.0271 cg:Z:284= -read_s1_s2 2128 119 2121 + chr1 21837 1357 3359 2002 2002 60 tp:A:P NM:i:0 cm:i:193 s1:i:1442 s2:i:0 dv:f:0.0061 cg:Z:2002= \ No newline at end of file +read_s1_s2 2128 119 2121 + chr1 21837 1357 3359 2002 2002 60 tp:A:P NM:i:0 cm:i:193 s1:i:1442 s2:i:0 dv:f:0.0061 cg:Z:2002= diff --git a/tests/data/alignments-minigraph-stable-conversioncheck-only-s464827.gaf b/tests/data/alignments-minigraph-stable-conversioncheck-only-s464827.gaf index 62a901e..020cbe5 100644 --- a/tests/data/alignments-minigraph-stable-conversioncheck-only-s464827.gaf +++ b/tests/data/alignments-minigraph-stable-conversioncheck-only-s464827.gaf @@ -1 +1 @@ -read_s2_s464827_s3_s4 400 1 285 + >chr1:3293-3634>NA20129#1#JAHEPE010000248.1:4917-5103 527 194 478 284 284 60 tp:A:P NM:i:0 cm:i:29 s1:i:326 s2:i:109 dv:f:0.0271 cg:Z:284= \ No newline at end of file +read_s2_s464827_s3_s4 400 1 285 + >chr1:3293-3634>NA20129#1#JAHEPE010000248.1:4917-5103 527 194 478 284 284 60 tp:A:P NM:i:0 cm:i:29 s1:i:326 s2:i:109 dv:f:0.0271 cg:Z:284= diff --git a/tests/data/alignments-minigraph-stable-conversioncheck.gaf b/tests/data/alignments-minigraph-stable-conversioncheck.gaf index a23e76a..0aa5f87 100644 --- a/tests/data/alignments-minigraph-stable-conversioncheck.gaf +++ b/tests/data/alignments-minigraph-stable-conversioncheck.gaf @@ -1,3 +1,3 @@ read_s2_s464827_s3_s4 400 1 285 + >chr1:3293-3634>NA20129#1#JAHEPE010000248.1:4917-5103 527 194 478 284 284 60 tp:A:P NM:i:0 cm:i:29 s1:i:326 s2:i:109 dv:f:0.0271 cg:Z:284= read_s1_s2 2128 119 2121 + chr1 21837 1357 3359 2002 2002 60 tp:A:P NM:i:0 cm:i:193 s1:i:1442 s2:i:0 dv:f:0.0061 cg:Z:2002= -read_s5_s6 537 10 532 + chr1 21837 4759 5281 522 522 60 tp:A:P NM:i:0 cm:i:74 s1:i:518 s2:i:0 dv:f:0.0060 cg:Z:522= \ No newline at end of file +read_s5_s6 537 10 532 + chr1 21837 4759 5281 522 522 60 tp:A:P NM:i:0 cm:i:74 s1:i:518 s2:i:0 dv:f:0.0060 cg:Z:522= diff --git a/tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf b/tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf index 325fab0..3e5e5f2 100644 --- a/tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf +++ b/tests/data/alignments-minigraph-unstable-conversioncheck-only-s2.gaf @@ -1,2 +1,2 @@ read_s2_s464827_s3_s4 400 1 285 + >s2>s464827 527 194 478 284 284 60 tp:A:P NM:i:0 cm:i:29 s1:i:326 s2:i:109 dv:f:0.0271 cg:Z:284= -read_s1_s2 2128 119 2121 + >s1>s2 3634 1357 3359 2002 2002 60 tp:A:P NM:i:0 cm:i:193 s1:i:1442 s2:i:0 dv:f:0.0061 cg:Z:2002= \ No newline at end of file +read_s1_s2 2128 119 2121 + >s1>s2 3634 1357 3359 2002 2002 60 tp:A:P NM:i:0 cm:i:193 s1:i:1442 s2:i:0 dv:f:0.0061 cg:Z:2002= diff --git a/tests/data/alignments-minigraph-unstable-conversioncheck-only-s464827.gaf b/tests/data/alignments-minigraph-unstable-conversioncheck-only-s464827.gaf index a675059..de84e97 100644 --- a/tests/data/alignments-minigraph-unstable-conversioncheck-only-s464827.gaf +++ b/tests/data/alignments-minigraph-unstable-conversioncheck-only-s464827.gaf @@ -1 +1 @@ -read_s2_s464827_s3_s4 400 1 285 + >s2>s464827 527 194 478 284 284 60 tp:A:P NM:i:0 cm:i:29 s1:i:326 s2:i:109 dv:f:0.0271 cg:Z:284= \ No newline at end of file +read_s2_s464827_s3_s4 400 1 285 + >s2>s464827 527 194 478 284 284 60 tp:A:P NM:i:0 cm:i:29 s1:i:326 s2:i:109 dv:f:0.0271 cg:Z:284= diff --git a/tests/data/alignments-minigraph-unstable-conversioncheck.gaf b/tests/data/alignments-minigraph-unstable-conversioncheck.gaf index 1310633..5b279ad 100644 --- a/tests/data/alignments-minigraph-unstable-conversioncheck.gaf +++ b/tests/data/alignments-minigraph-unstable-conversioncheck.gaf @@ -1,3 +1,3 @@ read_s2_s464827_s3_s4 400 1 285 + >s2>s464827 527 194 478 284 284 60 tp:A:P NM:i:0 cm:i:29 s1:i:326 s2:i:109 dv:f:0.0271 cg:Z:284= read_s1_s2 2128 119 2121 + >s1>s2 3634 1357 3359 2002 2002 60 tp:A:P NM:i:0 cm:i:193 s1:i:1442 s2:i:0 dv:f:0.0061 cg:Z:2002= -read_s5_s6 537 10 532 + >s5>s6 6941 31 553 522 522 60 tp:A:P NM:i:0 cm:i:74 s1:i:518 s2:i:0 dv:f:0.0060 cg:Z:522= \ No newline at end of file +read_s5_s6 537 10 532 + >s5>s6 6941 31 553 522 522 60 tp:A:P NM:i:0 cm:i:74 s1:i:518 s2:i:0 dv:f:0.0060 cg:Z:522= diff --git a/tests/data/reads-conversioncheck.fa b/tests/data/reads-conversioncheck.fa index 9cc1ac5..ae4d1fa 100644 --- a/tests/data/reads-conversioncheck.fa +++ b/tests/data/reads-conversioncheck.fa @@ -3,4 +3,4 @@ CCCCCTGCTAGCGGCTAGGACAACTGCAGGGCCCTCTTGCTTACAGTGGTGGCCAGCGTCCCCTGCTGGCGCCGGGGCAC >read_s1_s2 ACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTTAACCCTTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTACCCCTACCCCTACCCCTACCCCTACCCCTAACCCTACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCCTAACCCTAACCCTAACCCTAACCCTAACCCGAACCCGAACCCGAACCCGAACCCGAACCCGAACCCGAACCCGAACCCGAACCCTTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTTAACCCTAACCCTAACCCTAACCCGACCCTGACCCTGACCCTGACCCTGACCCTGGACCCTGACCCTGACCCTGACCCTGACCCTGACCCTGACCCTGACCCTAACCCTGACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCGACCCTGACCCTGACCCTGACCCTGACCCTGACCCTGACCCTGACCCTAACGCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCCTAAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTCTCTGCAGGCGCAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGGCGCGCCGGCGCAGGCGCAGAGAGGCGCGTCCCCTGGGGGGCGGGGGGAGGTGCGGCGCAGGCGCACAGAGGCGCGTCCCCTGGGGGGCGGGGGGAGGCGCGGCGCAGGCGCACAGTCACACGCCACCGGGACAGGAGCGCGGGGGTGGAGCGTTGTAGGCAGAGAGACGCACGTC >read_s5_s6 -TCATCCTTACACTAAAAATGCTAAACTATACAATTTCTAGAAGAAACAATAGAAGAAAAGCTATGTGCCTTTGGGTTTGGTAATGAATTTTAACAAATGATACAAAAGGTTGATATACACAAAAGAAATGACATTGTGGTTTTCTTAATATTTAAAGTTTATACTCTGGAAGACACCTTGTTAAGAGAACAAAAAGACAAGCCACATATTGAAGAAAATATTTGCAAAATACACATCTGAGAAAGAATTTGTCTTCAAAATATATAAAAAAATATTAAAACTAAACAATAAGTTAAACAGCCCAACTAAAAATGCACACACATCTGAACAGACACCTCACCAAAGAAGATCTACAGATGGCAAGTAAACATACAAAAAGATGCTCAACATACTAGAGAACTGAAAACCACAATGAGATAGCACAGCTGGTCTATATCTCTTAGAACTGCTAAACTCCCTAAAAAATGACAAATTGCTGGAGGAAAAACAAGAACTCTTTTCATTGCCGGTGGAACACAGTGTACAAGACCAAAATAT \ No newline at end of file +TCATCCTTACACTAAAAATGCTAAACTATACAATTTCTAGAAGAAACAATAGAAGAAAAGCTATGTGCCTTTGGGTTTGGTAATGAATTTTAACAAATGATACAAAAGGTTGATATACACAAAAGAAATGACATTGTGGTTTTCTTAATATTTAAAGTTTATACTCTGGAAGACACCTTGTTAAGAGAACAAAAAGACAAGCCACATATTGAAGAAAATATTTGCAAAATACACATCTGAGAAAGAATTTGTCTTCAAAATATATAAAAAAATATTAAAACTAAACAATAAGTTAAACAGCCCAACTAAAAATGCACACACATCTGAACAGACACCTCACCAAAGAAGATCTACAGATGGCAAGTAAACATACAAAAAGATGCTCAACATACTAGAGAACTGAAAACCACAATGAGATAGCACAGCTGGTCTATATCTCTTAGAACTGCTAAACTCCCTAAAAAATGACAAATTGCTGGAGGAAAAACAAGAACTCTTTTCATTGCCGGTGGAACACAGTGTACAAGACCAAAATAT diff --git a/tox.ini b/tox.ini index 66b49b8..f519691 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py38,py39,py310,py311,py312,flake8,docs,twinecheck,black +envlist = py38,py39,py310,py311,py312,ruff,docs,twinecheck isolated_build = True @@ -29,23 +29,15 @@ commands = twine check {envtmpdir}/dist/* -[testenv:black] +[testenv:ruff] basepython = python3.10 -deps = black==24.3.0 +deps = ruff skip_install = true -commands = black --check gaftools/ tests/ setup.py +commands = ruff check gaftools/ tests/ setup.py -[testenv:flake8] -basepython = python3.10 -deps = flake8 -skip_install = true -commands = flake8 gaftools/ tests/ setup.py - - -[flake8] -max-line-length = 120 -max-complexity = 33 +[ruff] +line-length = 120 # E203 whitespace before ':' -- must be ignored for Black # # The following ignores should be removed over time: @@ -53,4 +45,4 @@ max-complexity = 33 # E501 line too long # E741 ambiguous variable name 'l' # -extend-ignore = E203,E501,E741,E722 \ No newline at end of file +extend-ignore = E203,E501,E741,E722 From 87cfcac125435bcbfdc6e4be1ce532ac6cb319f8 Mon Sep 17 00:00:00 2001 From: Samarendra Date: Thu, 20 Jun 2024 13:30:15 +0200 Subject: [PATCH 4/8] updating docs for pre-commit --- docs/develop.rst | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/docs/develop.rst b/docs/develop.rst index 28d505a..48908d7 100644 --- a/docs/develop.rst +++ b/docs/develop.rst @@ -63,8 +63,8 @@ Before creating a pull request #. create the pull request, if the tox check passes with your local python version. -Enabling Pre-Commit -------------------- +Using Pre-Commit +---------------- The tox pipeline has been integrated into GitHub as a CI/CD pipeline. Instead of doing all the checks locally, the checks can be done online. @@ -72,6 +72,18 @@ Instead of doing all the checks locally, the checks can be done online. There is also pre-commit support which also allows developers to skip the formatting and syntax checking using :code:`ruff`. +To install :code:`pre-commit`, run:: + + pip install pre-commit + cd + pre-commit install + +Now you :code:`pre-commit` will run everytime you commit to the repository. But the pre-commit +run is restricted to the staged files. As an optional step, you can run :code:`pre-commit` on +all the file using:: + + pre-commit run --all-files + Writing Documentation --------------------- From b6f76149630b24243bc45f5d92b75de3a2f033ba Mon Sep 17 00:00:00 2001 From: Samarendra Date: Thu, 20 Jun 2024 14:24:20 +0200 Subject: [PATCH 5/8] checking if ruff excludes .attic folder --- .attic/bicc_networkx.py | 3 +++ .pre-commit-config.yaml | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.attic/bicc_networkx.py b/.attic/bicc_networkx.py index 907d0ee..cf99584 100644 --- a/.attic/bicc_networkx.py +++ b/.attic/bicc_networkx.py @@ -7,6 +7,9 @@ def edge_stack_to_set(edge_stack): return out_set def next_child(stack_item): + + + if not stack_item[3]: return None if stack_item[2] >= len(stack_item[3]): diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c4f28a0..af7f8aa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,14 +3,16 @@ repos: rev: "v4.5.0" hooks: - id: end-of-file-fixer + exclude: .attic - id: trailing-whitespace + exclude: .attic - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. rev: v0.4.9 hooks: # Run the linter. - id: ruff - entry: ruff check gaftools/ tests/ setup.py + exclude: .attic # Run the formatter. - id: ruff-format - args: [gaftools/, tests/, setup.py] + exclude: .attic From ee1dea2c5f8fdd8c103c740437a823415391a021 Mon Sep 17 00:00:00 2001 From: Samarendra Date: Thu, 20 Jun 2024 14:25:12 +0200 Subject: [PATCH 6/8] changing the code back to normal --- .attic/bicc_networkx.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/.attic/bicc_networkx.py b/.attic/bicc_networkx.py index cf99584..907d0ee 100644 --- a/.attic/bicc_networkx.py +++ b/.attic/bicc_networkx.py @@ -7,9 +7,6 @@ def edge_stack_to_set(edge_stack): return out_set def next_child(stack_item): - - - if not stack_item[3]: return None if stack_item[2] >= len(stack_item[3]): From c30093194d96ec0f5cddaeb1a38dd096e7fffff0 Mon Sep 17 00:00:00 2001 From: Samarendra Date: Thu, 20 Jun 2024 14:35:16 +0200 Subject: [PATCH 7/8] adding CI pipeline for linting, building and testing --- .github/workflows/ci.yml | 66 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..9c13121 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,66 @@ +name: CI + +on: [push, pull_request] + +jobs: + lint: + # Run for PRs only if they come from a forked repo (avoids duplicate runs) + if: >- + github.event_name != 'pull_request' || + github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name + timeout-minutes: 10 + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10"] + toxenv: [ruff, docs, twinecheck] + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: python -m pip install tox + - name: Run tox ${{ matrix.toxenv }} + run: tox -e ${{ matrix.toxenv }} + + build: + if: >- + github.event_name != 'pull_request' || + github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.10 + uses: actions/setup-python@v5 + with: + python-version: "3.10" + - name: Install dependencies + run: python -m pip install build + - name: Build temporary sdist and wheel + run: python -m build + + test: + if: >- + github.event_name != 'pull_request' || + github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name + timeout-minutes: 15 + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + os: [ubuntu-latest] + include: + - os: macos-13 + python-version: "3.10" + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: python -m pip install tox + - name: Test with tox + run: tox -e py From 6b18ee84edd0a4fdcba398bcef4e55cbd5f4e874 Mon Sep 17 00:00:00 2001 From: Samarendra Date: Thu, 20 Jun 2024 15:36:24 +0200 Subject: [PATCH 8/8] updating development documentation --- .readthedocs.yaml | 2 +- docs/develop.rst | 11 ++++++++--- docs/index.rst | 10 ++++++++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 1449082..857a1dd 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -3,7 +3,7 @@ version: 2 build: - os: "ubuntu-22.04" + os: "ubuntu-latest" tools: python: "3.10" diff --git a/docs/develop.rst b/docs/develop.rst index 48908d7..6a967f6 100644 --- a/docs/develop.rst +++ b/docs/develop.rst @@ -58,9 +58,9 @@ To check for proper documentation build, proper syntax and styling, and pytest o Before creating a pull request -#. write test cases for the new code. -#. run :code:`tox` after fixing formatting and syntax with :code:`black` and :code:`flake8`. -#. create the pull request, if the tox check passes with your local python version. +#. write test cases for the new code +#. run :code:`tox` after fixing formatting and syntax with :code:`ruff` +#. create the pull request, if the tox check passes with your local python version Using Pre-Commit @@ -84,6 +84,9 @@ all the file using:: pre-commit run --all-files +When the staged files fail the conditions defined in the pre-commit conditions, :code:`ruff` formatting +will attempt to fix it. After adding the files updated by :code:`ruff`, attempt to commit again. If it fails, +then the errors have to be manually fixed. Writing Documentation --------------------- @@ -111,6 +114,8 @@ Accordingly this documentation for developing will have to be changed. Making a Release ---------------- +[No releases made yet. This text is a template.] + #. Update ``CHANGES.rst``: Set the correct version number and ensure that all nontrivial, user-visible changes are listed. diff --git a/docs/index.rst b/docs/index.rst index 60bc3c6..f9f0828 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -30,10 +30,20 @@ We recommend installing inside a conda environment to allow easy removal:: conda activate gaftools-env pip install git+https://github.com/marschall-lab/gaftools +If you already have a clone locally, then run:: + + git clone https://github.com/marschall-lab/gaftools + conda create -n gaftools-env python=3.10 + conda activate gaftools-env + cd gaftools + pip install . + To remove gaftools, the conda environment needs to be removed using:: conda env remove -n gaftools-env +gaftools can be used with python>=3.8 + Requirements ------------