Trim boundary codons, as part of #589.

cfe-lab · Jun 30, 2021 · 6f6f098 · 6f6f098
1 parent b38c020
commit 6f6f098
Show file tree

Hide file tree

Showing 4 changed files with 109 additions and 18 deletions.
diff --git a/micall/tests/test_aln2counts_report.py b/micall/tests/test_aln2counts_report.py
@@ -751,7 +751,6 @@ def test_minimap_overlap(default_sequence_report, projects):
 HIV1-B-FR-K03455-seed,15,0,9,0,{read_seq}
 """)
 
-    # TODO: Remove repeated query positions: 61, 62, 60.
     #                                     A,C,G,T
     expected_text = """\
 HIV1-B-FR-K03455-seed,INT,15,51,262,0,0,9,0,0,0,0,0,0,9
@@ -764,9 +763,6 @@ def test_minimap_overlap(default_sequence_report, projects):
 HIV1-B-FR-K03455-seed,INT,15,58,269,0,9,0,0,0,0,0,0,0,9
 HIV1-B-FR-K03455-seed,INT,15,59,270,9,0,0,0,0,0,0,0,0,9
 HIV1-B-FR-K03455-seed,INT,15,60,271,0,0,9,0,0,0,0,0,0,9
-HIV1-B-FR-K03455-seed,INT,15,61,272,9,0,0,0,0,0,0,0,0,9
-HIV1-B-FR-K03455-seed,INT,15,62,273,0,0,9,0,0,0,0,0,0,9
-HIV1-B-FR-K03455-seed,RT,15,60,451,0,0,9,0,0,0,0,0,0,9
 HIV1-B-FR-K03455-seed,RT,15,61,452,9,0,0,0,0,0,0,0,0,9
 HIV1-B-FR-K03455-seed,RT,15,62,453,0,0,9,0,0,0,0,0,0,9
 HIV1-B-FR-K03455-seed,RT,15,63,454,0,0,9,0,0,0,0,0,0,9
@@ -784,11 +780,11 @@ def test_minimap_overlap(default_sequence_report, projects):
 
     report = report_file.getvalue()
     report_lines = report.splitlines()
-    expected_size = 124
+    expected_size = 121
     if len(report_lines) != expected_size:
         assert (len(report_lines), report) == (expected_size, '')
 
-    key_lines = report_lines[51:74]
+    key_lines = report_lines[51:71]
     key_report = '\n'.join(key_lines)
     assert key_report == expected_text
 

diff --git a/micall/tests/test_aln2counts_seed_amino.py b/micall/tests/test_aln2counts_seed_amino.py
@@ -205,6 +205,36 @@ def test_add_first_time():
     assert amino.consensus_nuc_index == 7
 
 
+def test_add_clipped_start():
+    amino = SeedAmino(None)
+    other = SeedAmino(consensus_nuc_index=7)
+    other.count_aminos('ACT', 5)
+    expected_counts = {}
+    expected_nucleotide_counts = [{}, {'C': 5}, {'T': 5}]
+
+    amino.add(other, start_nuc=1)
+
+    nucleotide_counts = [nuc.counts for nuc in amino.nucleotides]
+    assert amino.counts == expected_counts
+    assert nucleotide_counts == expected_nucleotide_counts
+    assert amino.partial == 0  # Only count partials in the middle.
+
+
+def test_add_clipped_end():
+    amino = SeedAmino(None)
+    other = SeedAmino(consensus_nuc_index=7)
+    other.count_aminos('ACT', 5)
+    expected_counts = {}
+    expected_nucleotide_counts = [{'A': 5}, {'C': 5}, {}]
+
+    amino.add(other, end_nuc=1)
+
+    nucleotide_counts = [nuc.counts for nuc in amino.nucleotides]
+    assert amino.counts == expected_counts
+    assert nucleotide_counts == expected_nucleotide_counts
+    assert amino.partial == 0  # Only count partials in the middle.
+
+
 def test_amino_repeat_nuc0():
     """ Repeat first nucleotide in the codon.
     Columns are:       A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,*

diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py
@@ -150,6 +150,40 @@ def __repr__(self):
                 f'{self.q_st}, {self.q_en})')
 
 
+def clip_seed_aminos(seed_aminos: typing.List[SeedAmino],
+                     region_start_consensus_amino_index: int,
+                     region_end_consensus_amino_index: int,
+                     start_codon_nuc_index: int,
+                     end_codon_nuc_index: int):
+    """ Extract a section of seed aminos for counting.
+
+    Handles boundary codons by clipping as needed.
+    :param seed_aminos: source seed aminos to copy
+    :param region_start_consensus_amino_index: first seed amino to copy
+    :param region_end_consensus_amino_index: one past the last one to copy
+    :param start_codon_nuc_index: first codon index within the first seed amino
+        that should be copied
+    :param end_codon_nuc_index: last codon index to copy within the last codon
+    """
+    assert 0 <= region_start_consensus_amino_index
+    assert 0 <= region_end_consensus_amino_index
+    assert 0 <= start_codon_nuc_index
+    assert 0 <= end_codon_nuc_index
+    result = seed_aminos[region_start_consensus_amino_index:
+                         region_end_consensus_amino_index]
+    if start_codon_nuc_index != 0:
+        old_start_amino = result[0]
+        start_amino = SeedAmino(old_start_amino.consensus_nuc_index)
+        start_amino.add(old_start_amino, start_nuc=start_codon_nuc_index)
+        result[0] = start_amino
+    if end_codon_nuc_index != 2:
+        old_end_amino = result[-1]
+        end_amino = SeedAmino(old_end_amino.consensus_nuc_index)
+        end_amino.add(old_end_amino, end_nuc=end_codon_nuc_index)
+        result[-1] = end_amino
+    return result
+
+
 class ConsensusAligner:
     def __init__(self, projects: ProjectConfig):
         self.projects = projects
@@ -350,6 +384,8 @@ def build_amino_report(self,
             frame_offset = -1
             region_start_consensus_amino_index = -1
             region_end_consensus_amino_index = -1
+            start_codon_nuc_index = -1
+            end_codon_nuc_index = -1
             source_amino_index = 0
             seed_aminos = self.reading_frames[0]
             region_seed_aminos: typing.List[SeedAmino] = []
@@ -382,22 +418,40 @@ def build_amino_report(self,
                             source_amino_index += 1
                         if region_start_consensus_amino_index < 0:
                             region_start_consensus_amino_index = source_amino_index
+                            start_codon_nuc_index = codon_nuc_index
                         region_end_consensus_amino_index = source_amino_index + 1
+                        end_codon_nuc_index = codon_nuc_index
                         if (repeat_position == ref_nuc_index + 1 and
-                                start_pos < repeat_position):
+                                codon_nuc_index != 0):
                             # Shift reading frames.
-                            region_seed_aminos = seed_aminos[
-                                                 region_start_consensus_amino_index:
-                                                 region_end_consensus_amino_index]
+                            region_seed_aminos = clip_seed_aminos(
+                                seed_aminos,
+                                region_start_consensus_amino_index,
+                                region_end_consensus_amino_index,
+                                start_codon_nuc_index,
+                                end_codon_nuc_index)
+                            # Reset start positions
                             region_start_consensus_amino_index = -1
+                            start_codon_nuc_index = -1
+                            # Step back source position
+                            source_amino_index -= 1
+                            # Repeat current position
+                            ref_nuc_index -= 1
+                            consensus_nuc_index -= 1
+                            section_nuc_index -= 1
+                            # Adjust reading frame
                             frame_offset = (frame_offset + 1) % 3
                             seed_aminos = self.reading_frames[frame_offset]
                     ref_nuc_index += 1
                     consensus_nuc_index += 1
                     section_nuc_index += 1
-            region_seed_aminos.extend(
-                seed_aminos[region_start_consensus_amino_index:
-                            region_end_consensus_amino_index])
+            if 0 <= region_start_consensus_amino_index:
+                region_seed_aminos.extend(clip_seed_aminos(
+                    seed_aminos,
+                    region_start_consensus_amino_index,
+                    region_end_consensus_amino_index,
+                    start_codon_nuc_index,
+                    end_codon_nuc_index))
             if not region_seed_aminos:
                 continue
             self.amino_consensus = ''.join(seed_amino.get_consensus() or '?'

diff --git a/micall/utils/report_amino.py b/micall/utils/report_amino.py
@@ -51,7 +51,7 @@ def count_aminos(self, codon_seq, count):
             self.deletions += count
         elif '-' in codon_seq:
             self.partial += count  # Partial deletion
-        elif ' ' not in codon_seq and 'n' not in codon_seq:
+        elif ' ' not in codon_seq and 'n' not in codon_seq and len(codon_seq) == 3:
             amino = translate(codon_seq.upper())
             self.counts[amino] += count
         elif 'nnn' == codon_seq:
@@ -62,20 +62,31 @@ def count_aminos(self, codon_seq, count):
                 seed_nucleotide = self.nucleotides[i]
                 seed_nucleotide.count_nucleotides(nuc, count)
 
-    def add(self, other: 'SeedAmino'):
+    def add(self, other: 'SeedAmino', start_nuc: int = 0, end_nuc: int = 2):
+        """ Add counts from another SeedAmino to this one.
+
+        :param other: source to copy from
+        :param start_nuc: first nucleotide index to copy: 0, 1, or 2.
+        :param end_nuc: last nucleotide index to copy: 0, 1, or 2.
+        """
         if self.read_count and other.read_count:
             self.consensus_nuc_index = None
         elif other.read_count:
             self.consensus_nuc_index = other.consensus_nuc_index
-        self.counts += other.counts
+        if 0 < start_nuc or end_nuc < 2:
+            prefix = ' ' * start_nuc
+            for nucs, count in other.codon_counts.items():
+                self.count_aminos(prefix + nucs[start_nuc:end_nuc+1], count)
+        else:
+            self.counts += other.counts
+            for nuc, other_nuc in zip(self.nucleotides, other.nucleotides):
+                nuc.add(other_nuc)
         self.partial += other.partial
         self.deletions += other.deletions
         self.read_count += other.read_count
         self.low_quality += other.low_quality
         self.nucleotides_to_skip = other.nucleotides_to_skip
         self.ref_offset = other.ref_offset
-        for nuc, other_nuc in zip(self.nucleotides, other.nucleotides):
-            nuc.add(other_nuc)
 
     def get_report(self) -> str:
         """ Build a report string with the counts of each amino acid.