diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index 7d182e466..2742372d0 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -5,7 +5,7 @@ import csv import os import logging -from aligntools import CigarActions +from aligntools import CigarActions, CigarHit, Cigar from gotoh import align_it, align_it_aa from mappy import Alignment, Aligner @@ -173,7 +173,7 @@ def __init__(self, self.coordinate_name = self.consensus = self.amino_consensus = '' self.algorithm = '' self.consensus_offset = 0 - self.alignments: List[Alignment] = [] + self.alignments: List[CigarHit] = [] self.reading_frames: Dict[int, List[SeedAmino]] = {} self.seed_nucs: List[SeedNucleotide] = [] self.amino_alignments: List[AminoAlignment] = [] @@ -270,15 +270,19 @@ def start_contig(self, except KeyError: coordinate_seq = self.projects.getReference(coordinate_name) aligner = Aligner(seq=coordinate_seq, preset='map-ont') - self.alignments = list(aligner.map(self.consensus)) - if self.alignments or 10_000 < len(self.consensus): + mappy_alignments = list(aligner.map(self.consensus)) + if mappy_alignments or 10_000 < len(self.consensus): self.algorithm = 'minimap2' else: self.algorithm = 'gotoh' self.align_gotoh(coordinate_seq, self.consensus) - self.alignments = [alignment - for alignment in self.alignments - if alignment.is_primary] + mappy_alignments = [alignment + for alignment in mappy_alignments + if alignment.is_primary] + self.alignments = [CigarHit(Cigar(x.cigar), + r_st=x.r_st, r_ei=x.r_en - 1, + q_st=x.q_st, q_ei=x.q_en - 1) + for x in mappy_alignments] self.alignments.sort(key=attrgetter('q_st')) if self.overall_alignments_writer is not None: @@ -290,7 +294,7 @@ def start_contig(self, "consensus_offset": self.consensus_offset, "ref_start": alignment.r_st, "ref_end": alignment.r_en, - "cigar_str": alignment.cigar_str} + "cigar_str": str(alignment.cigar)} self.overall_alignments_writer.writerow(row) def align_gotoh(self, coordinate_seq: str, consensus: str): @@ -355,7 +359,7 @@ def find_amino_alignments(self, amino_sections = [] query_end = alignment.q_st + self.consensus_offset ref_end = alignment.r_st - for cigar_index, (size, action) in enumerate(alignment.cigar): + for action in alignment.cigar.iterate_operations(): ref_reading_frame = (ref_end - (start_pos-1)) % 3 conseq_codon_start = query_end - ref_reading_frame reading_frame = -conseq_codon_start % 3 @@ -366,13 +370,13 @@ def find_amino_alignments(self, action, reading_frame) if action == CigarActions.MATCH: - amino_alignment.query_end += size - amino_alignment.ref_end += size + amino_alignment.query_end += 1 + amino_alignment.ref_end += 1 elif action == CigarActions.INSERT: - amino_alignment.query_end += size + amino_alignment.query_end += 1 else: assert action == CigarActions.DELETE - amino_alignment.ref_end += size + amino_alignment.ref_end += 1 query_end = amino_alignment.query_end ref_end = amino_alignment.ref_end if amino_alignment.has_overlap(start_pos, end_pos): @@ -857,40 +861,37 @@ def build_nucleotide_report(self, consensus_nuc_index = alignment.q_st + self.consensus_offset source_amino_index = 0 seed_aminos = self.reading_frames[0] - for section_size, section_action in alignment.cigar: + for section_action in alignment.cigar.iterate_operations(): if section_action == CigarActions.INSERT: - for _ in range(section_size): - if start_pos - 1 <= ref_nuc_index < end_pos: - self.inserts.add(consensus_nuc_index) - consensus_nuc_index += 1 + if start_pos - 1 <= ref_nuc_index < end_pos: + self.inserts.add(consensus_nuc_index) + consensus_nuc_index += 1 continue if section_action == CigarActions.DELETE: coverage = self.get_deletion_coverage(consensus_nuc_index) seed_nuc = SeedNucleotide() seed_nuc.count_nucleotides('-', coverage) - for _ in range(section_size): - if start_pos - 1 <= ref_nuc_index < end_pos: - target_nuc_index = ref_nuc_index - start_pos + 1 - report_nuc = report_nucleotides[target_nuc_index] - report_nuc.seed_nucleotide.add(seed_nuc) - ref_nuc_index += 1 - continue - assert section_action == CigarActions.MATCH, section_action - for _ in range(section_size): if start_pos - 1 <= ref_nuc_index < end_pos: target_nuc_index = ref_nuc_index - start_pos + 1 - while True: - seed_amino = seed_aminos[source_amino_index] - codon_nuc_index = (consensus_nuc_index - - seed_amino.consensus_nuc_index) - if codon_nuc_index < 3: - break - source_amino_index += 1 - seed_nuc = seed_amino.nucleotides[codon_nuc_index] report_nuc = report_nucleotides[target_nuc_index] report_nuc.seed_nucleotide.add(seed_nuc) ref_nuc_index += 1 - consensus_nuc_index += 1 + continue + assert section_action == CigarActions.MATCH, section_action + if start_pos - 1 <= ref_nuc_index < end_pos: + target_nuc_index = ref_nuc_index - start_pos + 1 + while True: + seed_amino = seed_aminos[source_amino_index] + codon_nuc_index = (consensus_nuc_index - + seed_amino.consensus_nuc_index) + if codon_nuc_index < 3: + break + source_amino_index += 1 + seed_nuc = seed_amino.nucleotides[codon_nuc_index] + report_nuc = report_nucleotides[target_nuc_index] + report_nuc.seed_nucleotide.add(seed_nuc) + ref_nuc_index += 1 + consensus_nuc_index += 1 def seed_concordance(self, seed_name, projects, seed_coordinates, excluded_regions, included_regions=None): if self.seed_concordance_writer is None: @@ -925,22 +926,21 @@ def coord_concordance(self, half_window_size: int = 10) -> List[float]: for alignment in coord_alignments: ref_progress = alignment.r_st query_progress = alignment.q_st - for cigar_index, (size, action) in enumerate(alignment.cigar): + for action in alignment.cigar.iterate_operations(): if action == CigarActions.INSERT: - query_progress += size + query_progress += 1 elif action == CigarActions.DELETE: - ref_progress += size + ref_progress += 1 else: assert action == CigarActions.MATCH - for pos in range(0, size): - ref_pos = ref_progress + pos - query_pos = query_progress + pos - if self.consensus[query_pos] == coord_ref[ref_pos]: - query_matches[query_pos] = 1 - else: - query_matches[query_pos] = 0 - ref_progress += size - query_progress += size + ref_pos = ref_progress + query_pos = query_progress + if self.consensus[query_pos] == coord_ref[ref_pos]: + query_matches[query_pos] = 1 + else: + query_matches[query_pos] = 0 + ref_progress += 1 + query_progress += 1 for pos in range(0, len(self.consensus)): start = max(0, pos - half_window_size)