Skip to content

Commit

Permalink
Eliminate overlaps from minimap2 alignments, as part of #589.
Browse files Browse the repository at this point in the history
  • Loading branch information
donkirkby committed Jun 25, 2021
1 parent b2f6b1d commit b38c020
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 1 deletion.
58 changes: 58 additions & 0 deletions micall/tests/test_aln2counts_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -735,6 +735,64 @@ def test_nucleotide_coordinates(default_sequence_report):
assert report_file.getvalue() == expected_report


def test_minimap_overlap(default_sequence_report, projects):
""" Similar sections can fool minimap2 into reporting a section twice.
In this example, positions 1-104 of the read map to pos 4401-4504 of the
reference. Positions 101-200 of the read map to pos 3001-3100 of the ref.
Even though positions 101-104 are in both alignments, only report them once.
"""
seed_name = 'HIV1-B-FR-K03455-seed'
seed_seq = projects.getReference(seed_name)
read_seq = seed_seq[4440:4500] + seed_seq[3000:3060]

# refname,qcut,rank,count,offset,seq
aligned_reads = prepare_reads(f"""\
HIV1-B-FR-K03455-seed,15,0,9,0,{read_seq}
""")

# TODO: Remove repeated query positions: 61, 62, 60.
# A,C,G,T
expected_text = """\
HIV1-B-FR-K03455-seed,INT,15,51,262,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,52,263,0,0,0,9,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,53,264,0,0,0,9,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,54,265,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,55,266,0,0,0,9,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,56,267,0,0,0,9,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,57,268,0,9,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,58,269,0,9,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,59,270,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,60,271,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,61,272,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,62,273,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,60,451,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,61,452,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,62,453,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,63,454,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,64,455,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,65,456,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,66,457,0,0,0,9,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,67,458,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,68,459,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,69,460,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,70,461,9,0,0,0,0,0,0,0,0,9"""
report_file = StringIO()
default_sequence_report.write_nuc_header(report_file)
default_sequence_report.read(aligned_reads)
default_sequence_report.write_nuc_counts()

report = report_file.getvalue()
report_lines = report.splitlines()
expected_size = 124
if len(report_lines) != expected_size:
assert (len(report_lines), report) == (expected_size, '')

key_lines = report_lines[51:74]
key_report = '\n'.join(key_lines)
assert key_report == expected_text


# noinspection DuplicatedCode
def test_contig_coverage_report_huge_gap(default_sequence_report):
""" A gap so big that Gotoh can't bridge it, but minimap2 can. """
Expand Down
44 changes: 44 additions & 0 deletions micall/tests/test_consensus_aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ def assert_alignments(aligner: ConsensusAligner,
__tracebackhide__ = True
wrapped_alignments = tuple(AlignmentWrapper.wrap(alignment)
for alignment in aligner.alignments)
if repr(wrapped_alignments) != repr(expected_alignments):
assert wrapped_alignments == expected_alignments
for i, (wrapped_alignment, expected_alignment) in enumerate(
zip(wrapped_alignments, expected_alignments)):
for field_name in AlignmentWrapper.init_fields:
wrapped = (i, field_name, getattr(wrapped_alignment, field_name))
expected = (i, field_name, getattr(expected_alignment, field_name))
assert wrapped == expected
assert wrapped_alignments == expected_alignments


Expand Down Expand Up @@ -112,6 +120,42 @@ def test_start_contig_multiple_sections(projects):
assert_alignments(aligner, *expected_alignments)


def test_start_contig_overlapping_sections(projects):
""" Similar sections can fool minimap2 into reporting a section twice.
In this example, positions 1-60 of the read map to pos 4441-4500 of the
reference. Positions 55-120 of the read map to pos 2995-3060 of the ref.
Since positions 55-60 are in both alignments, remove them from the second
one.
"""
seed_name = 'HIV1-B-FR-K03455-seed'
seed_seq = projects.getReference(seed_name)
consensus = seed_seq[4440:4500] + seed_seq[3000:3060]
expected_alignments = [AlignmentWrapper(ctg='N/A',
ctg_len=len(seed_seq),
r_st=4440,
r_en=4500,
q_st=0,
q_en=60,
mapq=27),
AlignmentWrapper(ctg='N/A',
ctg_len=len(seed_seq),
r_st=3000,
r_en=3060,
q_st=60,
q_en=120,
mapq=13,
# Unneeded fields forced to -1.
mlen=-1,
blen=-1,
NM=-1)]
aligner = ConsensusAligner(projects)

aligner.start_contig(seed_name, consensus)

assert_alignments(aligner, *expected_alignments)


def test_start_contig_short_consensus(projects):
""" Consensus is too short for minimap2, fall back to Gotoh. """
seed_name = 'SARS-CoV-2-seed'
Expand Down
22 changes: 21 additions & 1 deletion micall/utils/consensus_aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def __new__(cls,
if not blen:
blen = max(q_en-q_st, r_en-r_st)
if not cigar:
cigar = [[blen, CigarActions.MATCH]]
cigar = [[max(q_en-q_st, r_en-r_st), CigarActions.MATCH]]
return super().__new__(cls,
ctg,
ctg_len,
Expand Down Expand Up @@ -201,6 +201,7 @@ def start_contig(self,
for alignment in self.alignments
if alignment.is_primary]
self.alignments.sort(key=attrgetter('q_st'))
self.remove_overlaps()

def align_gotoh(self, coordinate_seq, consensus):
gap_open_penalty = 15
Expand Down Expand Up @@ -244,6 +245,25 @@ def align_gotoh(self, coordinate_seq, consensus):
q_en=consensus_index,
cigar=cigar))

def remove_overlaps(self):
max_query_pos = -1
for alignment_num, alignment in enumerate(self.alignments):
query_start = alignment.q_st
if query_start < max_query_pos:
offset = max_query_pos - alignment.q_st
new_cigar = [section[:] for section in alignment.cigar]
new_cigar[0][0] -= offset
new_alignment = AlignmentWrapper.wrap(alignment,
q_st=max_query_pos,
r_st=alignment.r_st+offset,
cigar=new_cigar,
# Unneeded fields => -1.
mlen=-1,
blen=-1,
NM=-1)
self.alignments[alignment_num] = new_alignment
max_query_pos = alignment.q_en

def clear(self):
self.coordinate_name = self.consensus = self.amino_consensus = ''
self.algorithm = ''
Expand Down

0 comments on commit b38c020

Please sign in to comment.