Add alignment step in amino acid space, as part of #589.

cfe-lab · Jun 3, 2021 · 7afa861 · 7afa861
1 parent d89f544
commit 7afa861
Show file tree

Hide file tree

Showing 5 changed files with 193 additions and 46 deletions.
diff --git a/micall/core/aln2counts.py b/micall/core/aln2counts.py
@@ -300,31 +300,15 @@ def _map_to_coordinate_ref(self, coordinate_name):
             repeated_ref_pos = None
         if is_amino_coordinate:
             report_aminos = self.reports[coordinate_name]
+            amino_ref = self.projects.getReference(coordinate_name)
         else:
-            report_aminos = None
+            report_aminos = amino_ref = None
         self.consensus_aligner.report_region(region_info['start'],
                                              region_info['end'],
                                              report_nucleotides,
                                              report_aminos,
-                                             repeated_ref_pos)
-
-    @staticmethod
-    def map_amino_sequences(from_seq, to_seq, from_aligned, to_aligned):
-        seq_map = {}  # {from_aa_index: to_aa_index}
-        from_index = to_index = 0
-        for from_aa, to_aa in zip(from_aligned, to_aligned):
-            if (to_index < len(to_seq) and
-                    to_aa == to_seq[to_index]):
-                seq_map[from_index] = to_index
-                to_index += 1
-            if (from_index < len(from_seq) and
-                    from_aa == from_seq[from_index]):
-                from_index += 1
-        return seq_map
-
-    def map_nuc_sequences(self):
-        consensus_length = len(self.consensus_aligner.consensus)
-        return {i: i for i in range(consensus_length)}
+                                             repeated_ref_pos,
+                                             amino_ref)
 
     def process_reads(self,
                       aligned_csv,

diff --git a/micall/tests/test_aln2counts.py b/micall/tests/test_aln2counts.py
@@ -833,7 +833,7 @@ def testSubstitutionAtBoundary(self):
         self.report.read(aligned_reads)
         self.report.write_amino_counts()
 
-        self.assertMultiLineEqual(expected_text, self.report_file.getvalue())
+        self.assertEqual(expected_text, self.report_file.getvalue())
 
     def testCoverageSummary(self):
         """ R1 has coverage 9.
@@ -1211,7 +1211,7 @@ def testDeletionBetweenSeedAndConsensusAminoReport(self):
         self.report.read(aligned_reads)
         self.report.write_amino_counts()
 
-        self.assertMultiLineEqual(expected_text, self.report_file.getvalue())
+        self.assertEqual(expected_text, self.report_file.getvalue())
 
     def testDeletionWithMinorityVariant(self):
         """ Aligned reads are mostly K-R, but some are KFR.

diff --git a/micall/tests/test_aln2counts_report.py b/micall/tests/test_aln2counts_report.py
@@ -569,7 +569,7 @@ def test_duplicated_sars_base_amino(default_sequence_report):
 
     report = report_file.getvalue()
     report_lines = report.splitlines()
-    expected_size = 39
+    expected_size = 40
     if len(report_lines) != expected_size:
         assert (len(report_lines), report) == (expected_size, '')
 
@@ -609,7 +609,7 @@ def test_duplicated_sars_base_amino_offset10(default_sequence_report):
 
     report = report_file.getvalue()
     report_lines = report.splitlines()
-    expected_size = 31
+    expected_size = 38
     if len(report_lines) != expected_size:
         assert (len(report_lines), report) == (expected_size, '')
 
@@ -649,7 +649,7 @@ def test_duplicated_sars_base_amino_offset11(default_sequence_report):
 
     report = report_file.getvalue()
     report_lines = report.splitlines()
-    expected_size = 29
+    expected_size = 37
     if len(report_lines) != expected_size:
         assert (len(report_lines), report) == (expected_size, '')
 
@@ -688,7 +688,7 @@ def test_duplicated_sars_base_nuc(default_sequence_report):
 
     report = report_file.getvalue()
     report_lines = report.splitlines()
-    expected_size = 96
+    expected_size = 115
     if len(report_lines) != expected_size:
         assert (len(report_lines), report) == (expected_size, '')
 

diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py
@@ -349,12 +349,17 @@ def test_report_region(projects):
     # ORF7b runs from 1-based positions 27756 to 27884 (excluding stop codon).
     consensus = seed_seq[27000:27999]
     reading_frames = create_reading_frames(consensus)
+    amino_ref = projects.getReference('SARS-CoV-2-ORF7b')
     aligner = ConsensusAligner(projects)
     aligner.start_contig(seed_name, reading_frames=reading_frames)
 
     report_aminos: typing.List[ReportAmino] = []
     report_nucleotides: typing.List[ReportNucleotide] = []
-    aligner.report_region(27756, 27884, report_nucleotides, report_aminos)
+    aligner.report_region(27756,
+                          27884,
+                          report_nucleotides,
+                          report_aminos,
+                          amino_ref=amino_ref)
 
     assert len(report_aminos) == 43
     assert len(report_nucleotides) == 129  # 27884-27756+1
@@ -385,12 +390,17 @@ def test_report_region_no_overlap(projects):
     seed_seq = projects.getReference(seed_name)
     consensus = seed_seq[27000:27999]
     reading_frames = create_reading_frames(consensus)
+    amino_ref = 'F' * 100
     aligner = ConsensusAligner(projects)
     aligner.start_contig(seed_name, reading_frames=reading_frames)
 
     report_aminos: typing.List[ReportAmino] = []
     report_nucleotides: typing.List[ReportNucleotide] = []
-    aligner.report_region(10_001, 10_300, report_nucleotides, report_aminos)
+    aligner.report_region(10_001,
+                          10_300,
+                          report_nucleotides,
+                          report_aminos,
+                          amino_ref=amino_ref)
 
     assert len(report_aminos) == 100
     assert len(report_nucleotides) == 300
@@ -404,12 +414,17 @@ def test_report_region_after_start(projects):
     seed_seq = projects.getReference(seed_name)
     consensus = seed_seq[28000:28999]
     reading_frames = create_reading_frames(consensus)
+    amino_ref = 'PVSYSLLF*M'
     aligner = ConsensusAligner(projects)
     aligner.start_contig(seed_name, reading_frames=reading_frames)
 
     report_aminos: typing.List[ReportAmino] = []
     report_nucleotides: typing.List[ReportNucleotide] = []
-    aligner.report_region(27998, 28027, report_nucleotides, report_aminos)
+    aligner.report_region(27998,
+                          28027,
+                          report_nucleotides,
+                          report_aminos,
+                          amino_ref=amino_ref)
 
     assert len(report_aminos) == 10
     assert report_aminos[0].seed_amino.consensus_nuc_index is None
@@ -423,12 +438,17 @@ def test_report_region_before_end(projects):
     seed_seq = projects.getReference(seed_name)
     consensus = seed_seq[28000:28999]
     reading_frames = create_reading_frames(consensus)
+    amino_ref = 'QQQGQ'
     aligner = ConsensusAligner(projects)
     aligner.start_contig(seed_name, reading_frames=reading_frames)
 
     report_aminos: typing.List[ReportAmino] = []
     report_nucleotides: typing.List[ReportNucleotide] = []
-    aligner.report_region(28991, 29005, report_nucleotides, report_aminos)
+    aligner.report_region(28991,
+                          29005,
+                          report_nucleotides,
+                          report_aminos,
+                          amino_ref=amino_ref)
 
     assert len(report_aminos) == 5
     assert report_aminos[0].seed_amino.consensus_nuc_index == 990
@@ -442,6 +462,7 @@ def test_report_region_with_repeated_nucleotide(projects):
     # nsp12 runs from 1-based positions 13442 to 16236, with repeat at 13468.
     consensus = seed_seq[13000:13500]
     reading_frames = create_reading_frames(consensus)
+    amino_ref = projects.getReference('SARS-CoV-2-nsp12')
     aligner = ConsensusAligner(projects)
     aligner.start_contig(seed_name, reading_frames=reading_frames)
 
@@ -451,7 +472,8 @@ def test_report_region_with_repeated_nucleotide(projects):
                           16236,
                           report_nucleotides,
                           report_aminos,
-                          repeat_position=13468)
+                          repeat_position=13468,
+                          amino_ref=amino_ref)
 
     amino_consensus = ''.join(report_amino.seed_amino.get_consensus()
                               for report_amino in report_aminos[:21])