Fix landmark definitions for HLA to be 1-based, as part of #589.

cfe-lab · Jun 18, 2021 · e73b058 · e73b058
1 parent d786b28
commit e73b058
Show file tree

Hide file tree

Showing 4 changed files with 79 additions and 53 deletions.
diff --git a/micall/data/landmark_reader.py b/micall/data/landmark_reader.py
@@ -42,9 +42,6 @@ def get_gene(self,
         genotype_landmarks, = matches
         regions = sorted(genotype_landmarks['landmarks'], key=itemgetter('start'))
         prefix = genotype_landmarks.get('prefix', '')
-        if (gene_name.startswith('SARS-CoV-2-TRS-B') or
-                gene_name.startswith('SARS-CoV-2-nsp')):
-            drop_stop_codon = False
         if not gene_name.startswith(prefix):
             raise ValueError(f'Gene name {gene_name!r} does not start with '
                              f'prefix {prefix!r}.')

diff --git a/micall/data/landmark_references.yaml b/micall/data/landmark_references.yaml
@@ -159,45 +159,50 @@
     - {name: "3'", start: 9349, end: 9646, colour: darkgrey}
 - seed_pattern: HLA-B-seed
   coordinates: HLA-B-seed
+  prefix: HLA-B-
   landmarks:
-    - {name: exon2, start: 200, end: 470, colour: '#ff7f0e'}
-    - {name: exon3, start: 716, end: 992, colour: '#2ca02c'}
+    # Seed reference starts at exon1, so all positions have an offset of 301.
+    # Exons are spliced together in the middle of a codon, so we end exon2 one
+    # nucleotide early and start exon3 one nucleotide early to make the reading
+    # frame work.
+    - {name: exon2, start: 201, end: 470, colour: '#ff7f0e', stop: N}
+    - {name: exon3, start: 717, end: 992, colour: '#2ca02c', stop: N}
 - seed_pattern: SARS-CoV-2-seed
   coordinates: SARS-CoV-2-seed
   prefix: SARS-CoV-2-
   landmarks:
-    - {name: 5', full_name: 5'UTR, start: 1, end: 265, frame: 0, colour: darkgrey}
+    - {name: 5', full_name: 5'UTR, start: 1, end: 265, frame: 0, colour: darkgrey, stop: N}
     - {name: 3a, full_name: ORF3a, start: 25393, end: 26220, frame: 0, colour: '#2ca02c'}
     - {name: E, start: 26245, end: 26472, frame: 0, colour: '#d62728'}
     - {name: '6', full_name: ORF6, start: 27202, end: 27387, frame: 0, colour: '#8c564b'}
     - {name: 7a, full_name: ORF7a, start: 27394, end: 27759, frame: 0, colour: '#7f7f7f'}
-    - {name: 3', full_name: 3'UTR, start: 29675, end: 29903, frame: 0, colour: darkgrey}
-    - {name: TRS-B-1, start: 21556, end: 21562, frame: 0}  # Not displayed.
-    - {name: TRS-B-2, start: 25385, end: 25392, frame: 0}  # Not displayed.
-    - {name: TRS-B-3, start: 26221, end: 26244, frame: 0}  # Not displayed.
-    - {name: TRS-B-4, start: 26473, end: 26522, frame: 0}  # Not displayed.
-    - {name: TRS-B-5, start: 27192, end: 27201, frame: 0}  # Not displayed.
-    - {name: TRS-B-6, start: 27388, end: 27393, frame: 0}  # Not displayed.
-    - {name: TRS-B-7, start: 27888, end: 27893, frame: 0}  # Not displayed.
-    - {name: TRS-B-8, start: 28260, end: 28273, frame: 0}  # Not displayed.
-    - {name: TRS-B-9, start: 29534, end: 29557, frame: 0}  # Not displayed.
-    - {name: ORF1a, start: 266, end: 13483, frame: 1}  # Not displayed.
+    - {name: 3', full_name: 3'UTR, start: 29675, end: 29903, frame: 0, colour: darkgrey, stop: N}
+    - {name: TRS-B-1, start: 21556, end: 21562, frame: 0, stop: N}  # Not displayed.
+    - {name: TRS-B-2, start: 25385, end: 25392, frame: 0, stop: N}  # Not displayed.
+    - {name: TRS-B-3, start: 26221, end: 26244, frame: 0, stop: N}  # Not displayed.
+    - {name: TRS-B-4, start: 26473, end: 26522, frame: 0, stop: N}  # Not displayed.
+    - {name: TRS-B-5, start: 27192, end: 27201, frame: 0, stop: N}  # Not displayed.
+    - {name: TRS-B-6, start: 27388, end: 27393, frame: 0, stop: N}  # Not displayed.
+    - {name: TRS-B-7, start: 27888, end: 27893, frame: 0, stop: N}  # Not displayed.
+    - {name: TRS-B-8, start: 28260, end: 28273, frame: 0, stop: N}  # Not displayed.
+    - {name: TRS-B-9, start: 29534, end: 29557, frame: 0, stop: N}  # Not displayed.
+    - {name: ORF1a, start: 266, end: 13480, frame: 1, stop: N}  # Not displayed.
     - {name: ORF1b, start: 13468, end: 21555, frame: 1}  # Not displayed.
-    - {name: nsp2, start: 806, end: 2719, frame: 1, colour: darkgrey}
-    - {name: nsp1, start: 266, end: 805, frame: 1, colour: grey}
-    - {name: nsp3, start: 2720, end: 8554, frame: 1, colour: grey}
-    - {name: nsp4, start: 8555, end: 10054, frame: 1, colour: darkgrey}
-    - {name: nsp5, start: 10055, end: 10972, frame: 1, colour: grey}
-    - {name: nsp6, start: 10973, end: 11842, frame: 1, colour: darkgrey}
-    - {name: '7', full_name: nsp7, start: 11843, end: 12091, frame: 1, colour: grey}
-    - {name: '8', full_name: nsp8, start: 12092, end: 12685, frame: 1, colour: darkgrey}
-    - {name: '9', full_name: nsp9, start: 12686, end: 13024, frame: 1, colour: grey}
-    - {name: '10', full_name: nsp10, start: 13025, end: 13441, frame: 1, colour: darkgrey}
-    - {name: nsp12, start: 13442, end: 16236, frame: 1, colour: grey}
-    - {name: nsp13, start: 16237, end: 18039, frame: 1, colour: darkgrey}
-    - {name: nsp14, start: 18040, end: 19620, frame: 1, colour: grey}
-    - {name: nsp15, start: 19621, end: 20658, frame: 1, colour: darkgrey}
-    - {name: nsp16, start: 20659, end: 21552, frame: 1, colour: grey}
+    - {name: nsp2, start: 806, end: 2719, frame: 1, colour: darkgrey, stop: N}
+    - {name: nsp1, start: 266, end: 805, frame: 1, colour: grey, stop: N}
+    - {name: nsp3, start: 2720, end: 8554, frame: 1, colour: grey, stop: N}
+    - {name: nsp4, start: 8555, end: 10054, frame: 1, colour: darkgrey, stop: N}
+    - {name: nsp5, start: 10055, end: 10972, frame: 1, colour: grey, stop: N}
+    - {name: nsp6, start: 10973, end: 11842, frame: 1, colour: darkgrey, stop: N}
+    - {name: '7', full_name: nsp7, start: 11843, end: 12091, frame: 1, colour: grey, stop: N}
+    - {name: '8', full_name: nsp8, start: 12092, end: 12685, frame: 1, colour: darkgrey, stop: N}
+    - {name: '9', full_name: nsp9, start: 12686, end: 13024, frame: 1, colour: grey, stop: N}
+    - {name: '10', full_name: nsp10, start: 13025, end: 13441, frame: 1, colour: darkgrey, stop: N}
+    - {name: nsp12, start: 13442, end: 16236, frame: 1, colour: grey, stop: N}
+    - {name: nsp13, start: 16237, end: 18039, frame: 1, colour: darkgrey, stop: N}
+    - {name: nsp14, start: 18040, end: 19620, frame: 1, colour: grey, stop: N}
+    - {name: nsp15, start: 19621, end: 20658, frame: 1, colour: darkgrey, stop: N}
+    - {name: nsp16, start: 20659, end: 21552, frame: 1, colour: grey, stop: N}
     - {name: S, start: 21563, end: 25384, frame: 1, colour: '#ff7f0e'}
     - {name: N, start: 28274, end: 29533, frame: 1, colour: '#17becf'}
     - {name: '10', full_name: ORF10, start: 29558, end: 29674, frame: 1, colour: darkorange}

diff --git a/micall/projects.json b/micall/projects.json
@@ -32188,7 +32188,7 @@
         "CAATCTTTAATCAGTGTGTAACATTAGGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCC",
         "ACGCGGAGTACGATCGAGTGTACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCT",
         "AATGTGTAAAATTAATTTTAGTAGTGCTATCCCCATGTGATTTTAATAGCTTCTTAGGAGAATGA",
-        "CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+        "CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
       ],
       "seed_group": null
     },
@@ -32199,7 +32199,7 @@
         "CTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTAT",
         "AATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTT",
         "ACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAG",
-        "GT"
+        "GTAAG"
       ],
       "seed_group": null
     },

diff --git a/micall/utils/fetch_sequences.py b/micall/utils/fetch_sequences.py
@@ -191,11 +191,38 @@ def check_hiv_coordinates(project_config, unchecked_ref_names: set):
 def extract_region(landmark_reader: LandmarkReader,
                    coordinate_name: str,
                    coordinate_seq: str,
-                   region_name: str) -> str:
+                   region_name: str,
+                   ref_positions: set = None,
+                   duplicated_base: int = None) -> str:
+    """ Extract a section of reference sequence, based on a gene landmark.
+
+    :param landmark_reader: source of landmark definitions
+    :param coordinate_name: name of coordinate reference for the landmarks
+    :param coordinate_seq: coordinate sequence to extract from
+    :param region_name: gene region to extract
+    :param ref_positions: track which positions have been extracted, so we can
+        check for gaps.
+    :param duplicated_base: the one-based position of a nucleotide that gets
+        duplicated in the extracted sequence.
+    :return: extracted sequence for the gene region
+    """
     region = landmark_reader.get_gene(coordinate_name,
-                                      region_name,
-                                      drop_stop_codon=False)
-    source_nuc_sequence = coordinate_seq[region['start'] - 1:region['end']]
+                                      region_name)
+    full_region = landmark_reader.get_gene(coordinate_name,
+                                           region_name,
+                                           drop_stop_codon=False)
+    check_stop_codons = True
+    if check_stop_codons:
+        region = full_region
+    start = region['start']
+    end = region['end']
+    source_nuc_sequence = coordinate_seq[start - 1:end]
+    if ref_positions is not None:
+        ref_positions.update(range(full_region['start'], full_region['end'] + 1))
+    if start < duplicated_base <= end:
+        source_nuc_sequence = (
+                source_nuc_sequence[:duplicated_base - start + 1] +
+                source_nuc_sequence[duplicated_base - start:])
     return source_nuc_sequence
 
 
@@ -353,7 +380,8 @@ def check_hla_seeds(project_config, unchecked_ref_names: set):
 
 def check_hla_coordinates(project_config, unchecked_ref_names: set):
     print("""\
-HLA coordinate references are translated from the seed reference.
+HLA coordinate references are translated from the seed reference. Coordinates
+come from the features described at https://www.ncbi.nlm.nih.gov/nuccore/AJ458991
 """)
     ref_names = ('HLA-B-exon2', 'HLA-B-exon3')
 
@@ -363,8 +391,10 @@ def check_hla_coordinates(project_config, unchecked_ref_names: set):
 
     source_sequences = {}
     for ref_name in ref_names:
-        region = landmark_reader.get_gene('HLA-B-seed', ref_name[6:])
-        source_nuc_sequence = seed_sequence[region['start']:region['end']]
+        source_nuc_sequence = extract_region(landmark_reader,
+                                             'HLA-B-seed',
+                                             seed_sequence,
+                                             ref_name)
         source_sequences[ref_name] = translate(source_nuc_sequence)
 
     report, error_count = compare_config(ref_names,
@@ -455,18 +485,12 @@ def check_sars_coordinates(project_config, unchecked_ref_names: set):
     source_sequences = {}
     ref_positions = set()
     for ref_name in ref_names:
-        region = landmark_reader.get_gene(seed_name, ref_name)
-        full_region = landmark_reader.get_gene(seed_name,
-                                               ref_name,
-                                               drop_stop_codon=False)
-        start = region['start']
-        end = region['end']
-        ref_positions = ref_positions.union(range(start, full_region['end']+1))
-        source_nuc_sequence = seed_sequence[start-1:end]
-        if start < duplicated_base <= end:
-            source_nuc_sequence = (
-                source_nuc_sequence[:duplicated_base-start+1] +
-                source_nuc_sequence[duplicated_base-start:])
+        source_nuc_sequence = extract_region(landmark_reader,
+                                             seed_name,
+                                             seed_sequence,
+                                             ref_name,
+                                             ref_positions,
+                                             duplicated_base)
         if 'TRS-B' in ref_name or ref_name.endswith('UTR'):
             source_sequences[ref_name] = source_nuc_sequence
         else: