Skip to content

Commit

Permalink
Fix landmark definitions for HIV to be 1-based, as part of #589.
Browse files Browse the repository at this point in the history
  • Loading branch information
donkirkby committed Jun 17, 2021
1 parent f7ea48f commit d786b28
Show file tree
Hide file tree
Showing 6 changed files with 170 additions and 120 deletions.
4 changes: 2 additions & 2 deletions micall/data/landmark_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,12 @@ def get_gene(self,
continue
region_copy = dict(region)
if 'end' not in region_copy:
region_copy['end'] = regions[i+1]['start']
region_copy['end'] = regions[i+1]['start']-1
break
else:
raise ValueError(f'Landmarks not found for gene {gene_name!r} in '
f'{coordinates}.')
if drop_stop_codon:
if drop_stop_codon and region_copy.get('stop') != 'N':
region_copy['end'] -= 3
return region_copy

Expand Down
197 changes: 102 additions & 95 deletions micall/data/landmark_references.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,153 +2,160 @@
- seed_pattern: HIV.*
coordinates: HIV1-B-FR-K03455-seed
landmarks:
# 1-based, inclusive positions in HXB2, including stop codons.
- {name: "5' LTR", start: 1, end: 634, frame: 0, colour: darkgrey}
- {name: gag, full_name: HIV1B-gag, start: 789, end: 2289, frame: 0, colour: lightblue}
- {name: vif, full_name: HIV1B-vif, start: 5040, end: 5616, frame: 0, colour: steelblue}
- {name: tat, start: 8379, end: 8469, frame: 0, colour: plum}
- {name: nef, full_name: HIV1B-nef, start: 8796, end: 9414, frame: 0, colour: mediumaquamarine}
- {name: tat, start: 5831, end: 6045, frame: 1, colour: plum}
- {name: vpu, full_name: HIV1B-vpu, start: 6061, end: 6307, frame: 1, colour: red}
- {name: rev, start: 8379, end: 8653, frame: 1, colour: yellowgreen}
- {name: gag, full_name: HIV1B-gag, start: 790, end: 2292, frame: 0, colour: lightblue}
- {name: vif, full_name: HIV1B-vif, start: 5041, end: 5619, frame: 0, colour: steelblue}

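# This second tat segment (tat2) actually starts at 8379, where it is spliced
# onto the end of the first tat segment in the middle of a codon; the landmark
# below starts at 8380 instead.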
- {name: tat, full_name: tat2, start: 8380, end: 8469, frame: 0, colour: plum}

- {name: nef, full_name: HIV1B-nef, start: 8797, end: 9417, frame: 0, colour: mediumaquamarine}
- {name: tat, start: 5831, end: 6049, frame: 1, colour: plum}
- {name: vpu, full_name: HIV1B-vpu, start: 6062, end: 6310, frame: 1, colour: red}

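# This second rev segment (rev2) actually starts at 8379, where it is spliced
# onto the end of the first rev segment in the middle of a codon; the landmark
# below starts at 8378 instead.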
- {name: rev, full_name: rev2, start: 8378, end: 8653, frame: 1, colour: yellowgreen}

- {name: "3' LTR", start: 9086, end: 9719, frame: 1, colour: darkgrey}
- {name: pol, start: 2085, end: 5096, frame: 2, colour: orange}
- {name: vpr, full_name: HIV1B-vpr, start: 5558, end: 5847, frame: 2, colour: turquoise}
- {name: rev, start: 5970, end: 6045, frame: 2, colour: yellowgreen}
- {name: vpr, full_name: HIV1B-vpr, start: 5559, end: 5850, frame: 2, colour: turquoise}
- {name: rev, start: 5970, end: 6048, frame: 2, colour: yellowgreen, stop: N}
- {name: env, start: 6225, end: 8795, frame: 2, colour: salmon}
- {name: PR, start: 2252, end: 2549, frame: 3, colour: orange}
- {name: RT, start: 2549, end: 3869, frame: 3, colour: darkorange}
- {name: INT, start: 4229, end: 5093, frame: 3, colour: orange}
- {name: V3, full_name: V3LOOP, start: 7109, end: 7217, frame: 3, colour: salmon}
- {name: GP41, start: 7757, end: 8792, frame: 3, colour: salmon}
- {name: GP120, start: 6224, end: 7757, frame: 3} # Not displayed.
- {name: PR, start: 2253, end: 2549, frame: 3, colour: orange, stop: N}
- {name: RT, start: 2550, end: 3869, frame: 3, colour: darkorange, stop: N}
- {name: INT, start: 4230, end: 5096, frame: 3, colour: orange}
- {name: V3, full_name: V3LOOP, start: 7110, end: 7217, frame: 3, colour: salmon, stop: N}
- {name: GP41, start: 7758, end: 8795, frame: 3, colour: salmon}
- {name: GP120, start: 6225, end: 7757, frame: 3, stop: N} # Not displayed.
- seed_pattern: HCV-1b
coordinates: HCV-1b
prefix: HCV1B-Con1-
landmarks:
- {name: "5'", start: 1, colour: darkgrey}
- {name: C, full_name: Core, start: 342, colour: '#1f77b4'}
- {name: E1, start: 915, colour: '#ff7f0e'}
- {name: E2, start: 1491, colour: '#2ca02c'}
- {name: p7, start: 2580, colour: '#d62728'}
- {name: NS2, start: 2769, colour: '#9467bd'}
- {name: NS3, start: 3420, colour: '#8c564b'}
- {name: C, full_name: Core, start: 342, colour: '#1f77b4', stop: N}
- {name: E1, start: 915, colour: '#ff7f0e', stop: N}
- {name: E2, start: 1491, colour: '#2ca02c', stop: N}
- {name: p7, start: 2580, colour: '#d62728', stop: N}
- {name: NS2, start: 2769, colour: '#9467bd', stop: N}
- {name: NS3, start: 3420, colour: '#8c564b', stop: N}
# Reverse order to avoid NS4a's label getting cut off.
- {name: NS4b, start: 5475, colour: '#7f7f7f'}
- {name: NS4a, start: 5313, colour: '#e377c2'}
- {name: NS5a, start: 6258, colour: '#bcbd22'}
- {name: NS5b, start: 7599, colour: '#17becf'}
- {name: NS4b, start: 5475, colour: '#7f7f7f', stop: N}
- {name: NS4a, start: 5313, colour: '#e377c2', stop: N}
- {name: NS5a, start: 6258, colour: '#bcbd22', stop: N}
- {name: NS5b, start: 7599, colour: '#17becf', stop: N}
- {name: "3'", start: 9372, end: 9646, colour: darkgrey}
- seed_pattern: HCV-1.*
coordinates: HCV-1a
prefix: HCV1A-H77-
landmarks:
- {name: "5'", start: 1, colour: darkgrey}
- {name: C, full_name: Core, start: 342, colour: '#1f77b4'}
- {name: E1, start: 915, colour: '#ff7f0e'}
- {name: E2, start: 1491, colour: '#2ca02c'}
- {name: p7, start: 2580, colour: '#d62728'}
- {name: NS2, start: 2769, colour: '#9467bd'}
- {name: NS3, start: 3420, colour: '#8c564b'}
- {name: NS4b, start: 5475, colour: '#7f7f7f'}
- {name: NS4a, start: 5313, colour: '#e377c2'}
- {name: NS5a, start: 6258, colour: '#bcbd22'}
- {name: NS5b, start: 7602, colour: '#17becf'}
- {name: C, full_name: Core, start: 342, colour: '#1f77b4', stop: N}
- {name: E1, start: 915, colour: '#ff7f0e', stop: N}
- {name: E2, start: 1491, colour: '#2ca02c', stop: N}
- {name: p7, start: 2580, colour: '#d62728', stop: N}
- {name: NS2, start: 2769, colour: '#9467bd', stop: N}
- {name: NS3, start: 3420, colour: '#8c564b', stop: N}
- {name: NS4b, start: 5475, colour: '#7f7f7f', stop: N}
- {name: NS4a, start: 5313, colour: '#e377c2', stop: N}
- {name: NS5a, start: 6258, colour: '#bcbd22', stop: N}
- {name: NS5b, start: 7602, colour: '#17becf', stop: N}
- {name: "3'", start: 9375, end: 9646, colour: darkgrey}
- seed_pattern: HCV-2.*
coordinates: HCV-2a
prefix: HCV2-JFH-1-
landmarks:
- {name: "5'", start: 1, colour: darkgrey}
- {name: C, full_name: Core, start: 341, colour: '#1f77b4'}
- {name: E1, start: 914, colour: '#ff7f0e'}
- {name: E2, start: 1490, colour: '#2ca02c'}
- {name: p7, start: 2591, colour: '#d62728'}
- {name: NS2, start: 2780, colour: '#9467bd'}
- {name: NS3, start: 3431, colour: '#8c564b'}
- {name: NS4b, start: 5486, colour: '#7f7f7f'}
- {name: NS4a, start: 5324, colour: '#e377c2'}
- {name: NS5a, start: 6269, colour: '#bcbd22'}
- {name: NS5b, start: 7667, colour: '#17becf'}
- {name: C, full_name: Core, start: 341, colour: '#1f77b4', stop: N}
- {name: E1, start: 914, colour: '#ff7f0e', stop: N}
- {name: E2, start: 1490, colour: '#2ca02c', stop: N}
- {name: p7, start: 2591, colour: '#d62728', stop: N}
- {name: NS2, start: 2780, colour: '#9467bd', stop: N}
- {name: NS3, start: 3431, colour: '#8c564b', stop: N}
- {name: NS4b, start: 5486, colour: '#7f7f7f', stop: N}
- {name: NS4a, start: 5324, colour: '#e377c2', stop: N}
- {name: NS5a, start: 6269, colour: '#bcbd22', stop: N}
- {name: NS5b, start: 7667, colour: '#17becf', stop: N}
- {name: "3'", start: 9441, end: 9646, colour: darkgrey}
- seed_pattern: HCV-3.*
coordinates: HCV-3a
prefix: HCV3-S52-
landmarks:
- {name: "5'", start: 1, colour: darkgrey}
- {name: C, full_name: Core, start: 340, colour: '#1f77b4'}
- {name: E1, start: 913, colour: '#ff7f0e'}
- {name: E2, start: 1489, colour: '#2ca02c'}
- {name: p7, start: 2596, colour: '#d62728'}
- {name: NS2, start: 2785, colour: '#9467bd'}
- {name: NS3, start: 3436, colour: '#8c564b'}
- {name: NS4b, start: 5491, colour: '#7f7f7f'}
- {name: NS4a, start: 5329, colour: '#e377c2'}
- {name: NS5a, start: 6274, colour: '#bcbd22'}
- {name: NS5b, start: 7630, colour: '#17becf'}
- {name: C, full_name: Core, start: 340, colour: '#1f77b4', stop: N}
- {name: E1, start: 913, colour: '#ff7f0e', stop: N}
- {name: E2, start: 1489, colour: '#2ca02c', stop: N}
- {name: p7, start: 2596, colour: '#d62728', stop: N}
- {name: NS2, start: 2785, colour: '#9467bd', stop: N}
- {name: NS3, start: 3436, colour: '#8c564b', stop: N}
- {name: NS4b, start: 5491, colour: '#7f7f7f', stop: N}
- {name: NS4a, start: 5329, colour: '#e377c2', stop: N}
- {name: NS5a, start: 6274, colour: '#bcbd22', stop: N}
- {name: NS5b, start: 7630, colour: '#17becf', stop: N}
- {name: "3'", start: 9404, end: 9646, colour: darkgrey}
- seed_pattern: HCV-4.*
coordinates: HCV-4a
prefix: HCV4-ED43-
landmarks:
- {name: "5'", start: 1, colour: darkgrey}
- {name: C, full_name: Core, start: 341, colour: '#1f77b4'}
- {name: E1, start: 914, colour: '#ff7f0e'}
- {name: E2, start: 1490, colour: '#2ca02c'}
- {name: p7, start: 2579, colour: '#d62728'}
- {name: NS2, start: 2768, colour: '#9467bd'}
- {name: NS3, start: 3419, colour: '#8c564b'}
- {name: NS4b, start: 5474, colour: '#7f7f7f'}
- {name: NS4a, start: 5312, colour: '#e377c2'}
- {name: NS5a, start: 6257, colour: '#bcbd22'}
- {name: NS5b, start: 7592, colour: '#17becf'}
- {name: C, full_name: Core, start: 341, colour: '#1f77b4', stop: N}
- {name: E1, start: 914, colour: '#ff7f0e', stop: N}
- {name: E2, start: 1490, colour: '#2ca02c', stop: N}
- {name: p7, start: 2579, colour: '#d62728', stop: N}
- {name: NS2, start: 2768, colour: '#9467bd', stop: N}
- {name: NS3, start: 3419, colour: '#8c564b', stop: N}
- {name: NS4b, start: 5474, colour: '#7f7f7f', stop: N}
- {name: NS4a, start: 5312, colour: '#e377c2', stop: N}
- {name: NS5a, start: 6257, colour: '#bcbd22', stop: N}
- {name: NS5b, start: 7592, colour: '#17becf', stop: N}
- {name: "3'", start: 9366, end: 9646, colour: darkgrey}
- seed_pattern: HCV-5.*
coordinates: HCV-5a
prefix: HCV5-SA13-
landmarks:
- {name: "5'", start: 1, colour: darkgrey}
- {name: C, full_name: Core, start: 247, colour: '#1f77b4'}
- {name: E1, start: 820, colour: '#ff7f0e'}
- {name: E2, start: 1396, colour: '#2ca02c'}
- {name: p7, start: 2488, colour: '#d62728'}
- {name: NS2, start: 2677, colour: '#9467bd'}
- {name: NS3, start: 3328, colour: '#8c564b'}
- {name: NS4b, start: 5383, colour: '#7f7f7f'}
- {name: NS4a, start: 5221, colour: '#e377c2'}
- {name: NS5a, start: 6166, colour: '#bcbd22'}
- {name: NS5b, start: 7516, colour: '#17becf'}
- {name: C, full_name: Core, start: 247, colour: '#1f77b4', stop: N}
- {name: E1, start: 820, colour: '#ff7f0e', stop: N}
- {name: E2, start: 1396, colour: '#2ca02c', stop: N}
- {name: p7, start: 2488, colour: '#d62728', stop: N}
- {name: NS2, start: 2677, colour: '#9467bd', stop: N}
- {name: NS3, start: 3328, colour: '#8c564b', stop: N}
- {name: NS4b, start: 5383, colour: '#7f7f7f', stop: N}
- {name: NS4a, start: 5221, colour: '#e377c2', stop: N}
- {name: NS5a, start: 6166, colour: '#bcbd22', stop: N}
- {name: NS5b, start: 7516, colour: '#17becf', stop: N}
- {name: "3'", start: 9289, end: 9646, colour: darkgrey}
- seed_pattern: HCV-6.*
coordinates: HCV-6a
prefix: HCV6-EUHK2-
landmarks:
- {name: "5'", start: 1, colour: darkgrey}
- {name: C, full_name: Core, start: 284, colour: '#1f77b4'}
- {name: E1, start: 857, colour: '#ff7f0e'}
- {name: E2, start: 1433, colour: '#2ca02c'}
- {name: p7, start: 2534, colour: '#d62728'}
- {name: NS2, start: 2723, colour: '#9467bd'}
- {name: NS3, start: 3374, colour: '#8c564b'}
- {name: NS4b, start: 5429, colour: '#7f7f7f'}
- {name: NS4a, start: 5267, colour: '#e377c2'}
- {name: NS5a, start: 6212, colour: '#bcbd22'}
- {name: NS5b, start: 7565, colour: '#17becf'}
- {name: C, full_name: Core, start: 284, colour: '#1f77b4', stop: N}
- {name: E1, start: 857, colour: '#ff7f0e', stop: N}
- {name: E2, start: 1433, colour: '#2ca02c', stop: N}
- {name: p7, start: 2534, colour: '#d62728', stop: N}
- {name: NS2, start: 2723, colour: '#9467bd', stop: N}
- {name: NS3, start: 3374, colour: '#8c564b', stop: N}
- {name: NS4b, start: 5429, colour: '#7f7f7f', stop: N}
- {name: NS4a, start: 5267, colour: '#e377c2', stop: N}
- {name: NS5a, start: 6212, colour: '#bcbd22', stop: N}
- {name: NS5b, start: 7565, colour: '#17becf', stop: N}
- {name: "3'", start: 9339, end: 9646, colour: darkgrey}
- seed_pattern: HCV-7.*
coordinates: HCV-7a
prefix: HCV7-QC69-
landmarks:
- {name: "5'", start: 1, colour: darkgrey}
- {name: C, full_name: Core, start: 309, colour: '#1f77b4'}
- {name: E1, start: 882, colour: '#ff7f0e'}
- {name: E2, start: 1458, colour: '#2ca02c'}
- {name: p7, start: 2559, colour: '#d62728'}
- {name: NS2, start: 2748, colour: '#9467bd'}
- {name: NS3, start: 3399, colour: '#8c564b'}
- {name: NS4b, start: 5454, colour: '#7f7f7f'}
- {name: NS4a, start: 5292, colour: '#e377c2'}
- {name: NS5a, start: 6237, colour: '#bcbd22'}
- {name: NS5b, start: 7575, colour: '#17becf'}
- {name: C, full_name: Core, start: 309, colour: '#1f77b4', stop: N}
- {name: E1, start: 882, colour: '#ff7f0e', stop: N}
- {name: E2, start: 1458, colour: '#2ca02c', stop: N}
- {name: p7, start: 2559, colour: '#d62728', stop: N}
- {name: NS2, start: 2748, colour: '#9467bd', stop: N}
- {name: NS3, start: 3399, colour: '#8c564b', stop: N}
- {name: NS4b, start: 5454, colour: '#7f7f7f', stop: N}
- {name: NS4a, start: 5292, colour: '#e377c2', stop: N}
- {name: NS5a, start: 6237, colour: '#bcbd22', stop: N}
- {name: NS5b, start: 7575, colour: '#17becf', stop: N}
- {name: "3'", start: 9349, end: 9646, colour: darkgrey}
- seed_pattern: HLA-B-seed
coordinates: HLA-B-seed
Expand Down
35 changes: 29 additions & 6 deletions micall/tests/test_landmark_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def test_gene_without_end():
""")
expected_gene = dict(name='gene2',
start=789,
end=5040,
end=5039,
frame=0,
colour='lightblue')
reader = LandmarkReader.load(landmarks_yaml)
Expand All @@ -59,10 +59,11 @@ def test_gene_without_end():

def test_load_defaults():
expected_gene = dict(name='PR',
start=2252,
start=2253,
end=2549,
frame=3,
colour='orange')
colour='orange',
stop='N')
reader = LandmarkReader.load()

gene = reader.get_gene('HIV1-B-FR-K03455-seed', 'PR', drop_stop_codon=False)
Expand All @@ -82,7 +83,7 @@ def test_gene_with_prefix():
""")
expected_gene = dict(name='gene2',
start=789,
end=5040,
end=5039,
frame=0,
colour='lightblue')
reader = LandmarkReader.load(landmarks_yaml)
Expand All @@ -103,7 +104,7 @@ def test_gene_drop_stop_codon():
""")
expected_gene = dict(name='gene2',
start=789,
end=5037,
end=5036,
frame=0,
colour='lightblue')
reader = LandmarkReader.load(landmarks_yaml)
Expand All @@ -113,6 +114,28 @@ def test_gene_drop_stop_codon():
assert gene == expected_gene


def test_gene_no_stop_codon():
    """A landmark flagged ``stop: N`` is not shortened by drop_stop_codon.

    gene2 has no explicit end, so its end is taken from the start of the
    next landmark (gene3 at 5040) minus one, giving 5039. Because gene2 is
    marked ``stop: N``, asking for ``drop_stop_codon=True`` must leave that
    end untouched instead of subtracting three more bases.
    """
    # The nesting below is significant: coordinates and landmarks belong to
    # the same mapping as seed_pattern, and each landmark is a sequence item.
    landmarks_yaml = StringIO("""\
- seed_pattern: R1
  coordinates: R1-seed
  landmarks:
    - {name: gene1, start: 1, frame: 0, colour: darkgrey}
    - {name: gene2, start: 789, frame: 0, colour: lightblue, stop: N}
    - {name: gene3, start: 5040, end: 5616, frame: 0, colour: steelblue}
""")
    expected_gene = dict(name='gene2',
                         start=789,
                         end=5039,
                         frame=0,
                         colour='lightblue',
                         stop='N')
    reader = LandmarkReader.load(landmarks_yaml)

    gene = reader.get_gene('R1-seed', 'gene2', drop_stop_codon=True)

    assert gene == expected_gene


def test_gene_missing_prefix():
landmarks_yaml = StringIO("""\
- seed_pattern: R1
Expand Down Expand Up @@ -143,7 +166,7 @@ def test_gene_with_full_name():
expected_gene = dict(name='2',
full_name='gene2',
start=789,
end=5040,
end=5039,
frame=0,
colour='lightblue')
reader = LandmarkReader.load(landmarks_yaml)
Expand Down
16 changes: 8 additions & 8 deletions micall/tests/test_plot_contigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -812,10 +812,10 @@ def test_plot_genome_coverage_blast_aligns_refs():
1,HIV1-G-CM-KP718923-seed,300,1,90,1,10,1653,1662
""")
expected_figure = """\
5' LTR[1-634], gag[789-2289], vif[5040-5616], tat[8379-8469], nef[8796-9414]
tat[5831-6045], vpu[6061-6307], rev[8379-8653], 3' LTR[9086-9719]
pol[2085-5096], vpr[5558-5847], rev[5970-6045], env[6225-8795]
PR[2252-2549], RT[2549-3869], INT[4229-5093], V3[7109-7217], GP41[7757-8792]
5' LTR[1-634], gag[790-2292], vif[5041-5619], tat[8380-8469], nef[8797-9417]
tat[5831-6049], vpu[6062-6310], rev[8378-8653], 3' LTR[9086-9719]
pol[2085-5096], vpr[5559-5850], rev[5970-6048], env[6225-8795]
PR[2253-2549], RT[2550-3869], INT[4230-5096], V3[7110-7217], GP41[7758-8795]
2261--1.1->2270
2261--1.1->2270
Coverage 5x10
Expand Down Expand Up @@ -849,10 +849,10 @@ def test_plot_genome_coverage_g2p():
HIV1-CON-XX-Consensus-seed,HIV1-B-FR-K03455-seed,1,6,7201,7206
""")
expected_figure = """\
5' LTR[1-634], gag[789-2289], vif[5040-5616], tat[8379-8469], nef[8796-9414]
tat[5831-6045], vpu[6061-6307], rev[8379-8653], 3' LTR[9086-9719]
pol[2085-5096], vpr[5558-5847], rev[5970-6045], env[6225-8795]
PR[2252-2549], RT[2549-3869], INT[4229-5093], V3[7109-7217], GP41[7757-8792]
5' LTR[1-634], gag[790-2292], vif[5041-5619], tat[8380-8469], nef[8797-9417]
tat[5831-6049], vpu[6062-6310], rev[8378-8653], 3' LTR[9086-9719]
pol[2085-5096], vpr[5559-5850], rev[5970-6048], env[6225-8795]
PR[2253-2549], RT[2550-3869], INT[4230-5096], V3[7110-7217], GP41[7758-8795]
2261--1.1->2266
Coverage 100x6
[7201-7206], HIV1-CON-XX-Consensus-seed - depth 100(1-9719)
Expand Down
6 changes: 3 additions & 3 deletions micall/utils/consensus_aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,12 @@ def start_contig(self,
self.coordinate_name = coordinate_name
coordinate_seq = self.projects.getReference(coordinate_name)
aligner = Aligner(seq=coordinate_seq, preset='map-ont')
self.alignments = list(aligner.map(consensus))
if self.alignments or 10_000 < len(consensus):
self.alignments = list(aligner.map(self.consensus))
if self.alignments or 10_000 < len(self.consensus):
self.algorithm = 'minimap2'
else:
self.algorithm = 'gotoh'
self.align_gotoh(coordinate_seq, consensus)
self.align_gotoh(coordinate_seq, self.consensus)
self.alignments = [alignment
for alignment in self.alignments
if alignment.is_primary]
Expand Down
Loading

0 comments on commit d786b28

Please sign in to comment.