Skip to content

Commit

Permalink
Trim boundary codons, as part of #589.
Browse files Browse the repository at this point in the history
  • Loading branch information
donkirkby committed Jun 30, 2021
1 parent b38c020 commit 6f6f098
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 18 deletions.
8 changes: 2 additions & 6 deletions micall/tests/test_aln2counts_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -751,7 +751,6 @@ def test_minimap_overlap(default_sequence_report, projects):
HIV1-B-FR-K03455-seed,15,0,9,0,{read_seq}
""")

# TODO: Remove repeated query positions: 61, 62, 60.
# A,C,G,T
expected_text = """\
HIV1-B-FR-K03455-seed,INT,15,51,262,0,0,9,0,0,0,0,0,0,9
Expand All @@ -764,9 +763,6 @@ def test_minimap_overlap(default_sequence_report, projects):
HIV1-B-FR-K03455-seed,INT,15,58,269,0,9,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,59,270,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,60,271,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,61,272,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,INT,15,62,273,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,60,451,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,61,452,9,0,0,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,62,453,0,0,9,0,0,0,0,0,0,9
HIV1-B-FR-K03455-seed,RT,15,63,454,0,0,9,0,0,0,0,0,0,9
Expand All @@ -784,11 +780,11 @@ def test_minimap_overlap(default_sequence_report, projects):

report = report_file.getvalue()
report_lines = report.splitlines()
expected_size = 124
expected_size = 121
if len(report_lines) != expected_size:
assert (len(report_lines), report) == (expected_size, '')

key_lines = report_lines[51:74]
key_lines = report_lines[51:71]
key_report = '\n'.join(key_lines)
assert key_report == expected_text

Expand Down
30 changes: 30 additions & 0 deletions micall/tests/test_aln2counts_seed_amino.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,36 @@ def test_add_first_time():
assert amino.consensus_nuc_index == 7


def test_add_clipped_start():
amino = SeedAmino(None)
other = SeedAmino(consensus_nuc_index=7)
other.count_aminos('ACT', 5)
expected_counts = {}
expected_nucleotide_counts = [{}, {'C': 5}, {'T': 5}]

amino.add(other, start_nuc=1)

nucleotide_counts = [nuc.counts for nuc in amino.nucleotides]
assert amino.counts == expected_counts
assert nucleotide_counts == expected_nucleotide_counts
assert amino.partial == 0 # Only count partials in the middle.


def test_add_clipped_end():
amino = SeedAmino(None)
other = SeedAmino(consensus_nuc_index=7)
other.count_aminos('ACT', 5)
expected_counts = {}
expected_nucleotide_counts = [{'A': 5}, {'C': 5}, {}]

amino.add(other, end_nuc=1)

nucleotide_counts = [nuc.counts for nuc in amino.nucleotides]
assert amino.counts == expected_counts
assert nucleotide_counts == expected_nucleotide_counts
assert amino.partial == 0 # Only count partials in the middle.


def test_amino_repeat_nuc0():
""" Repeat first nucleotide in the codon.
Columns are: A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*
Expand Down
68 changes: 61 additions & 7 deletions micall/utils/consensus_aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,40 @@ def __repr__(self):
f'{self.q_st}, {self.q_en})')


def clip_seed_aminos(seed_aminos: typing.List[SeedAmino],
region_start_consensus_amino_index: int,
region_end_consensus_amino_index: int,
start_codon_nuc_index: int,
end_codon_nuc_index: int):
""" Extract a section of seed aminos for counting.
Handles boundary codons by clipping as needed.
:param seed_aminos: source seed aminos to copy
:param region_start_consensus_amino_index: first seed amino to copy
:param region_end_consensus_amino_index: one past the last one to copy
:param start_codon_nuc_index: first codon index within the first seed amino
that should be copied
:param end_codon_nuc_index: last codon index to copy within the last codon
"""
assert 0 <= region_start_consensus_amino_index
assert 0 <= region_end_consensus_amino_index
assert 0 <= start_codon_nuc_index
assert 0 <= end_codon_nuc_index
result = seed_aminos[region_start_consensus_amino_index:
region_end_consensus_amino_index]
if start_codon_nuc_index != 0:
old_start_amino = result[0]
start_amino = SeedAmino(old_start_amino.consensus_nuc_index)
start_amino.add(old_start_amino, start_nuc=start_codon_nuc_index)
result[0] = start_amino
if end_codon_nuc_index != 2:
old_end_amino = result[-1]
end_amino = SeedAmino(old_end_amino.consensus_nuc_index)
end_amino.add(old_end_amino, end_nuc=end_codon_nuc_index)
result[-1] = end_amino
return result


class ConsensusAligner:
def __init__(self, projects: ProjectConfig):
self.projects = projects
Expand Down Expand Up @@ -350,6 +384,8 @@ def build_amino_report(self,
frame_offset = -1
region_start_consensus_amino_index = -1
region_end_consensus_amino_index = -1
start_codon_nuc_index = -1
end_codon_nuc_index = -1
source_amino_index = 0
seed_aminos = self.reading_frames[0]
region_seed_aminos: typing.List[SeedAmino] = []
Expand Down Expand Up @@ -382,22 +418,40 @@ def build_amino_report(self,
source_amino_index += 1
if region_start_consensus_amino_index < 0:
region_start_consensus_amino_index = source_amino_index
start_codon_nuc_index = codon_nuc_index
region_end_consensus_amino_index = source_amino_index + 1
end_codon_nuc_index = codon_nuc_index
if (repeat_position == ref_nuc_index + 1 and
start_pos < repeat_position):
codon_nuc_index != 0):
# Shift reading frames.
region_seed_aminos = seed_aminos[
region_start_consensus_amino_index:
region_end_consensus_amino_index]
region_seed_aminos = clip_seed_aminos(
seed_aminos,
region_start_consensus_amino_index,
region_end_consensus_amino_index,
start_codon_nuc_index,
end_codon_nuc_index)
# Reset start positions
region_start_consensus_amino_index = -1
start_codon_nuc_index = -1
# Step back source position
source_amino_index -= 1
# Repeat current position
ref_nuc_index -= 1
consensus_nuc_index -= 1
section_nuc_index -= 1
# Adjust reading frame
frame_offset = (frame_offset + 1) % 3
seed_aminos = self.reading_frames[frame_offset]
ref_nuc_index += 1
consensus_nuc_index += 1
section_nuc_index += 1
region_seed_aminos.extend(
seed_aminos[region_start_consensus_amino_index:
region_end_consensus_amino_index])
if 0 <= region_start_consensus_amino_index:
region_seed_aminos.extend(clip_seed_aminos(
seed_aminos,
region_start_consensus_amino_index,
region_end_consensus_amino_index,
start_codon_nuc_index,
end_codon_nuc_index))
if not region_seed_aminos:
continue
self.amino_consensus = ''.join(seed_amino.get_consensus() or '?'
Expand Down
21 changes: 16 additions & 5 deletions micall/utils/report_amino.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def count_aminos(self, codon_seq, count):
self.deletions += count
elif '-' in codon_seq:
self.partial += count # Partial deletion
elif ' ' not in codon_seq and 'n' not in codon_seq:
elif ' ' not in codon_seq and 'n' not in codon_seq and len(codon_seq) == 3:
amino = translate(codon_seq.upper())
self.counts[amino] += count
elif 'nnn' == codon_seq:
Expand All @@ -62,20 +62,31 @@ def count_aminos(self, codon_seq, count):
seed_nucleotide = self.nucleotides[i]
seed_nucleotide.count_nucleotides(nuc, count)

def add(self, other: 'SeedAmino'):
def add(self, other: 'SeedAmino', start_nuc: int = 0, end_nuc: int = 2):
""" Add counts from another SeedAmino to this one.
:param other: source to copy from
:param start_nuc: first nucleotide index to copy: 0, 1, or 2.
:param end_nuc: last nucleotide index to copy: 0, 1, or 2.
"""
if self.read_count and other.read_count:
self.consensus_nuc_index = None
elif other.read_count:
self.consensus_nuc_index = other.consensus_nuc_index
self.counts += other.counts
if 0 < start_nuc or end_nuc < 2:
prefix = ' ' * start_nuc
for nucs, count in other.codon_counts.items():
self.count_aminos(prefix + nucs[start_nuc:end_nuc+1], count)
else:
self.counts += other.counts
for nuc, other_nuc in zip(self.nucleotides, other.nucleotides):
nuc.add(other_nuc)
self.partial += other.partial
self.deletions += other.deletions
self.read_count += other.read_count
self.low_quality += other.low_quality
self.nucleotides_to_skip = other.nucleotides_to_skip
self.ref_offset = other.ref_offset
for nuc, other_nuc in zip(self.nucleotides, other.nucleotides):
nuc.add(other_nuc)

def get_report(self) -> str:
""" Build a report string with the counts of each amino acid.
Expand Down

0 comments on commit 6f6f098

Please sign in to comment.