Skip to content

Commit

Permalink
Add alignment step in amino acid space, as part of #589.
Browse files Browse the repository at this point in the history
  • Loading branch information
donkirkby committed Jun 3, 2021
1 parent d89f544 commit 7afa861
Show file tree
Hide file tree
Showing 5 changed files with 193 additions and 46 deletions.
24 changes: 4 additions & 20 deletions micall/core/aln2counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,31 +300,15 @@ def _map_to_coordinate_ref(self, coordinate_name):
repeated_ref_pos = None
if is_amino_coordinate:
report_aminos = self.reports[coordinate_name]
amino_ref = self.projects.getReference(coordinate_name)
else:
report_aminos = None
report_aminos = amino_ref = None
self.consensus_aligner.report_region(region_info['start'],
region_info['end'],
report_nucleotides,
report_aminos,
repeated_ref_pos)

@staticmethod
def map_amino_sequences(from_seq, to_seq, from_aligned, to_aligned):
seq_map = {} # {from_aa_index: to_aa_index}
from_index = to_index = 0
for from_aa, to_aa in zip(from_aligned, to_aligned):
if (to_index < len(to_seq) and
to_aa == to_seq[to_index]):
seq_map[from_index] = to_index
to_index += 1
if (from_index < len(from_seq) and
from_aa == from_seq[from_index]):
from_index += 1
return seq_map

def map_nuc_sequences(self):
consensus_length = len(self.consensus_aligner.consensus)
return {i: i for i in range(consensus_length)}
repeated_ref_pos,
amino_ref)

def process_reads(self,
aligned_csv,
Expand Down
4 changes: 2 additions & 2 deletions micall/tests/test_aln2counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -833,7 +833,7 @@ def testSubstitutionAtBoundary(self):
self.report.read(aligned_reads)
self.report.write_amino_counts()

self.assertMultiLineEqual(expected_text, self.report_file.getvalue())
self.assertEqual(expected_text, self.report_file.getvalue())

def testCoverageSummary(self):
""" R1 has coverage 9.
Expand Down Expand Up @@ -1211,7 +1211,7 @@ def testDeletionBetweenSeedAndConsensusAminoReport(self):
self.report.read(aligned_reads)
self.report.write_amino_counts()

self.assertMultiLineEqual(expected_text, self.report_file.getvalue())
self.assertEqual(expected_text, self.report_file.getvalue())

def testDeletionWithMinorityVariant(self):
""" Aligned reads are mostly K-R, but some are KFR.
Expand Down
8 changes: 4 additions & 4 deletions micall/tests/test_aln2counts_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,7 @@ def test_duplicated_sars_base_amino(default_sequence_report):

report = report_file.getvalue()
report_lines = report.splitlines()
expected_size = 39
expected_size = 40
if len(report_lines) != expected_size:
assert (len(report_lines), report) == (expected_size, '')

Expand Down Expand Up @@ -609,7 +609,7 @@ def test_duplicated_sars_base_amino_offset10(default_sequence_report):

report = report_file.getvalue()
report_lines = report.splitlines()
expected_size = 31
expected_size = 38
if len(report_lines) != expected_size:
assert (len(report_lines), report) == (expected_size, '')

Expand Down Expand Up @@ -649,7 +649,7 @@ def test_duplicated_sars_base_amino_offset11(default_sequence_report):

report = report_file.getvalue()
report_lines = report.splitlines()
expected_size = 29
expected_size = 37
if len(report_lines) != expected_size:
assert (len(report_lines), report) == (expected_size, '')

Expand Down Expand Up @@ -688,7 +688,7 @@ def test_duplicated_sars_base_nuc(default_sequence_report):

report = report_file.getvalue()
report_lines = report.splitlines()
expected_size = 96
expected_size = 115
if len(report_lines) != expected_size:
assert (len(report_lines), report) == (expected_size, '')

Expand Down
32 changes: 27 additions & 5 deletions micall/tests/test_consensus_aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,12 +349,17 @@ def test_report_region(projects):
# ORF7b runs from 1-based positions 27756 to 27884 (excluding stop codon).
consensus = seed_seq[27000:27999]
reading_frames = create_reading_frames(consensus)
amino_ref = projects.getReference('SARS-CoV-2-ORF7b')
aligner = ConsensusAligner(projects)
aligner.start_contig(seed_name, reading_frames=reading_frames)

report_aminos: typing.List[ReportAmino] = []
report_nucleotides: typing.List[ReportNucleotide] = []
aligner.report_region(27756, 27884, report_nucleotides, report_aminos)
aligner.report_region(27756,
27884,
report_nucleotides,
report_aminos,
amino_ref=amino_ref)

assert len(report_aminos) == 43
assert len(report_nucleotides) == 129 # 27884-27756+1
Expand Down Expand Up @@ -385,12 +390,17 @@ def test_report_region_no_overlap(projects):
seed_seq = projects.getReference(seed_name)
consensus = seed_seq[27000:27999]
reading_frames = create_reading_frames(consensus)
amino_ref = 'F' * 100
aligner = ConsensusAligner(projects)
aligner.start_contig(seed_name, reading_frames=reading_frames)

report_aminos: typing.List[ReportAmino] = []
report_nucleotides: typing.List[ReportNucleotide] = []
aligner.report_region(10_001, 10_300, report_nucleotides, report_aminos)
aligner.report_region(10_001,
10_300,
report_nucleotides,
report_aminos,
amino_ref=amino_ref)

assert len(report_aminos) == 100
assert len(report_nucleotides) == 300
Expand All @@ -404,12 +414,17 @@ def test_report_region_after_start(projects):
seed_seq = projects.getReference(seed_name)
consensus = seed_seq[28000:28999]
reading_frames = create_reading_frames(consensus)
amino_ref = 'PVSYSLLF*M'
aligner = ConsensusAligner(projects)
aligner.start_contig(seed_name, reading_frames=reading_frames)

report_aminos: typing.List[ReportAmino] = []
report_nucleotides: typing.List[ReportNucleotide] = []
aligner.report_region(27998, 28027, report_nucleotides, report_aminos)
aligner.report_region(27998,
28027,
report_nucleotides,
report_aminos,
amino_ref=amino_ref)

assert len(report_aminos) == 10
assert report_aminos[0].seed_amino.consensus_nuc_index is None
Expand All @@ -423,12 +438,17 @@ def test_report_region_before_end(projects):
seed_seq = projects.getReference(seed_name)
consensus = seed_seq[28000:28999]
reading_frames = create_reading_frames(consensus)
amino_ref = 'QQQGQ'
aligner = ConsensusAligner(projects)
aligner.start_contig(seed_name, reading_frames=reading_frames)

report_aminos: typing.List[ReportAmino] = []
report_nucleotides: typing.List[ReportNucleotide] = []
aligner.report_region(28991, 29005, report_nucleotides, report_aminos)
aligner.report_region(28991,
29005,
report_nucleotides,
report_aminos,
amino_ref=amino_ref)

assert len(report_aminos) == 5
assert report_aminos[0].seed_amino.consensus_nuc_index == 990
Expand All @@ -442,6 +462,7 @@ def test_report_region_with_repeated_nucleotide(projects):
# nsp12 runs from 1-based positions 13442 to 16236, with repeat at 13468.
consensus = seed_seq[13000:13500]
reading_frames = create_reading_frames(consensus)
amino_ref = projects.getReference('SARS-CoV-2-nsp12')
aligner = ConsensusAligner(projects)
aligner.start_contig(seed_name, reading_frames=reading_frames)

Expand All @@ -451,7 +472,8 @@ def test_report_region_with_repeated_nucleotide(projects):
16236,
report_nucleotides,
report_aminos,
repeat_position=13468)
repeat_position=13468,
amino_ref=amino_ref)

amino_consensus = ''.join(report_amino.seed_amino.get_consensus()
for report_amino in report_aminos[:21])
Expand Down
Loading

0 comments on commit 7afa861

Please sign in to comment.