Skip to content

Commit

Permalink
Close #479 by splitting alignments around large deletions.
Browse files Browse the repository at this point in the history
Also improve test coverage of amplicon_finder.py.
  • Loading branch information
donkirkby committed Jul 12, 2021
1 parent 3eaa6a7 commit bf58a52
Show file tree
Hide file tree
Showing 5 changed files with 185 additions and 7 deletions.
11 changes: 7 additions & 4 deletions micall/core/amplicon_finder.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import typing
from collections import Counter
from csv import DictWriter, DictReader
from pathlib import Path
Expand Down Expand Up @@ -37,11 +38,11 @@ def merge_reads(fastq1_path, fastq2_path, joined_fastq_path, merge_lengths_csv,
writer.writerow(dict(merge_length=merge_length, count=count))


def plot_merge_lengths(read_entropy_path: Path):
def plot_merge_lengths(read_entropy_file: typing.IO) -> plt.Figure:
plots = []
fig, ax1 = plt.subplots()
# noinspection PyTypeChecker
merge_entropy = np.genfromtxt(read_entropy_path,
merge_entropy = np.genfromtxt(read_entropy_file,
delimiter=',',
names=True,
dtype=float)
Expand Down Expand Up @@ -105,7 +106,8 @@ def plot_merge_lengths(read_entropy_path: Path):


def write_merge_lengths_plot(read_entropy_path: str, plot_path: str):
fig = plot_merge_lengths(Path(read_entropy_path))
with open(read_entropy_path) as read_entropy:
fig = plot_merge_lengths(read_entropy)
fig.savefig(plot_path)
plt.close(fig)

Expand Down Expand Up @@ -186,7 +188,8 @@ def main():
'84681A-V3-WYD-T2-1-V3LOOP_S40')
scratch_path = scratch_paths['hcv']
read_entropy_path = scratch_path / 'read_entropy.csv'
f = plot_merge_lengths(read_entropy_path)
with read_entropy_path.open() as read_entropy:
f = plot_merge_lengths(read_entropy)
f.savefig(scratch_path / 'read_entropy.svg')
plt.show()
print('Done.')
Expand Down
30 changes: 30 additions & 0 deletions micall/tests/test_aln2counts_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -789,6 +789,36 @@ def test_minimap_overlap(default_sequence_report, projects):
assert key_report == expected_text


def test_minimap_gap(default_sequence_report, projects):
""" Large gap should still have coverage on either side. """
seed_name = 'HIV1-B-FR-K03455-seed'
seed_seq = projects.getReference(seed_name)
read_seq = seed_seq[789:1282] + seed_seq[1863:2292]

# refname,qcut,rank,count,offset,seq
aligned_reads = prepare_reads(f"""\
HIV1-B-FR-K03455-seed,15,0,9,0,{read_seq}
""")
# A,C,G,T
expected_text = """\
HIV1-B-FR-K03455-seed,HIV1B-gag,15,493,493,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,HIV1B-gag,15,494,1075,9,0,0,0,0,0,0,0,0,9"""
report_file = StringIO()
default_sequence_report.write_nuc_header(report_file)
default_sequence_report.read(aligned_reads)
default_sequence_report.write_nuc_counts()

report = report_file.getvalue()
report_lines = report.splitlines()
expected_size = 961
if len(report_lines) != expected_size:
assert (len(report_lines), report) == (expected_size, '')

key_lines = report_lines[493:495]
key_report = '\n'.join(key_lines)
assert key_report == expected_text


# noinspection DuplicatedCode
def test_contig_coverage_report_huge_gap(default_sequence_report):
""" A gap so big that Gotoh can't bridge it, but minimap2 can. """
Expand Down
18 changes: 17 additions & 1 deletion micall/tests/test_amplicon_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path

from micall.core.amplicon_finder import merge_reads, count_kmers, calculate_entropy_from_counts, calculate_entropy, \
merge_for_entropy
merge_for_entropy, plot_merge_lengths

microtest_path = Path(__file__).parent / 'microtest'

Expand Down Expand Up @@ -137,6 +137,22 @@ def test_entropy_from_fastq():
assert expected_entropy == entropy


def test_plot_merge_lengths():
read_entropy = StringIO("""\
merge_length,count,entropy
100,5,0.2
101,5,0.2
102,5,0.9
103,5,0.2
""")

figure = plot_merge_lengths(read_entropy)

assert len(figure.axes) == 2
assert figure.axes[0].get_title() == 'Count of merged reads at each length'
assert figure.axes[1].get_title() == ''


def test_merge_for_entropy(tmpdir):
read_entropy_csv = StringIO()
expected_read_entropy = """\
Expand Down
74 changes: 74 additions & 0 deletions micall/tests/test_consensus_aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,80 @@ def test_start_contig_overlapping_sections(projects):
assert_alignments(aligner, *expected_alignments)


def test_start_contig_big_deletion(projects):
""" Split alignments around big deletions. """
seed_name = 'HIV1-B-FR-K03455-seed'
seed_seq = projects.getReference(seed_name)
consensus = seed_seq[789:1282] + seed_seq[1863:2278]
expected_alignments = [AlignmentWrapper(ctg='N/A',
ctg_len=len(seed_seq),
r_st=789,
r_en=1282,
q_st=0,
q_en=493,
mapq=60,
# Unneeded fields forced to -1.
mlen=-1,
blen=-1,
NM=-1),
AlignmentWrapper(ctg='N/A',
ctg_len=len(seed_seq),
r_st=1863,
r_en=2278,
q_st=493,
q_en=908,
mapq=60,
# Unneeded fields forced to -1.
mlen=-1,
blen=-1,
NM=-1)]
aligner = ConsensusAligner(projects)

aligner.start_contig(seed_name, consensus)

assert_alignments(aligner, *expected_alignments)


def test_start_contig_insert_and_big_deletion(projects):
""" Split alignments around big deletions. """
seed_name = 'HIV1-B-FR-K03455-seed'
seed_seq = projects.getReference(seed_name)
consensus = (seed_seq[789:900] +
'ACTGAC' +
seed_seq[900:1282] +
seed_seq[1863:2278])
expected_alignments = [AlignmentWrapper(ctg='N/A',
ctg_len=len(seed_seq),
r_st=789,
r_en=1282,
q_st=0,
q_en=499,
cigar=[[111, CigarActions.MATCH],
[6, CigarActions.INSERT],
[382, CigarActions.MATCH]],
mapq=60,
# Unneeded fields forced to -1.
mlen=-1,
blen=-1,
NM=-1),
AlignmentWrapper(ctg='N/A',
ctg_len=len(seed_seq),
r_st=1863,
r_en=2278,
q_st=499,
q_en=914,
mapq=60,
# Unneeded fields forced to -1.
mlen=-1,
blen=-1,
NM=-1)]
aligner = ConsensusAligner(projects)

aligner.start_contig(seed_name, consensus)

assert_alignments(aligner, *expected_alignments)


def test_start_contig_short_consensus(projects):
""" Consensus is too short for minimap2, fall back to Gotoh. """
seed_name = 'SARS-CoV-2-seed'
Expand Down
59 changes: 57 additions & 2 deletions micall/utils/consensus_aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def start_contig(self,
for alignment in self.alignments
if alignment.is_primary]
self.alignments.sort(key=attrgetter('q_st'))
self.remove_overlaps()
self.adjust_alignments()

def align_gotoh(self, coordinate_seq, consensus):
gap_open_penalty = 15
Expand Down Expand Up @@ -279,7 +279,10 @@ def align_gotoh(self, coordinate_seq, consensus):
q_en=consensus_index,
cigar=cigar))

def remove_overlaps(self):
def adjust_alignments(self):
""" Remove big deletions and overlaps between alignments. """

# Overlaps
max_query_pos = -1
for alignment_num, alignment in enumerate(self.alignments):
query_start = alignment.q_st
Expand All @@ -298,6 +301,58 @@ def remove_overlaps(self):
self.alignments[alignment_num] = new_alignment
max_query_pos = alignment.q_en

# Big deletions
new_alignments = []
for alignment in self.alignments:
new_cigar = []
query_start = query_end = alignment.q_st
ref_start = ref_end = alignment.r_st
for size, action in alignment.cigar:
if action == CigarActions.MATCH:
new_cigar.append([size, action])
query_end += size
ref_end += size
elif action == CigarActions.INSERT:
new_cigar.append([size, action])
query_end += size
else:
assert action == CigarActions.DELETE
if size <= 6:
new_cigar.append([size, action])
ref_end += size
else:
new_alignments.append(AlignmentWrapper.wrap(
alignment,
q_st=query_start,
q_en=query_end,
r_st=ref_start,
r_en=ref_end,
cigar=new_cigar,
# Unneeded fields => -1.
mlen=-1,
blen=-1,
NM=-1))
query_start = query_end
ref_end += size
ref_start = ref_end
new_cigar = []
if len(new_cigar) == len(alignment.cigar):
# No change, use original alignment.
new_alignments.append(alignment)
else:
new_alignments.append(AlignmentWrapper.wrap(
alignment,
q_st=query_start,
q_en=query_end,
r_st=ref_start,
r_en=ref_end,
cigar=new_cigar,
# Unneeded fields => -1.
mlen=-1,
blen=-1,
NM=-1))
self.alignments = new_alignments

def clear(self):
self.coordinate_name = self.consensus = self.amino_consensus = ''
self.algorithm = ''
Expand Down

0 comments on commit bf58a52

Please sign in to comment.