Skip to content

Commit

Permalink
Fix landmark definitions for HLA to be 1-based, as part of #589.
Browse files Browse the repository at this point in the history
  • Loading branch information
donkirkby committed Jun 18, 2021
1 parent d786b28 commit e73b058
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 53 deletions.
3 changes: 0 additions & 3 deletions micall/data/landmark_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,6 @@ def get_gene(self,
genotype_landmarks, = matches
regions = sorted(genotype_landmarks['landmarks'], key=itemgetter('start'))
prefix = genotype_landmarks.get('prefix', '')
if (gene_name.startswith('SARS-CoV-2-TRS-B') or
gene_name.startswith('SARS-CoV-2-nsp')):
drop_stop_codon = False
if not gene_name.startswith(prefix):
raise ValueError(f'Gene name {gene_name!r} does not start with '
f'prefix {prefix!r}.')
Expand Down
63 changes: 34 additions & 29 deletions micall/data/landmark_references.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -159,45 +159,50 @@
- {name: "3'", start: 9349, end: 9646, colour: darkgrey}
- seed_pattern: HLA-B-seed
coordinates: HLA-B-seed
prefix: HLA-B-
landmarks:
- {name: exon2, start: 200, end: 470, colour: '#ff7f0e'}
- {name: exon3, start: 716, end: 992, colour: '#2ca02c'}
# Seed reference starts at exon1, so all positions have an offset of 301.
# Exons are spliced together in the middle of a codon, so we end exon2 one
# nucleotide early and start exon3 one nucleotide early to make the reading
# frame work.
- {name: exon2, start: 201, end: 470, colour: '#ff7f0e', stop: N}
- {name: exon3, start: 717, end: 992, colour: '#2ca02c', stop: N}
- seed_pattern: SARS-CoV-2-seed
coordinates: SARS-CoV-2-seed
prefix: SARS-CoV-2-
landmarks:
- {name: 5', full_name: 5'UTR, start: 1, end: 265, frame: 0, colour: darkgrey}
- {name: 5', full_name: 5'UTR, start: 1, end: 265, frame: 0, colour: darkgrey, stop: N}
- {name: 3a, full_name: ORF3a, start: 25393, end: 26220, frame: 0, colour: '#2ca02c'}
- {name: E, start: 26245, end: 26472, frame: 0, colour: '#d62728'}
- {name: '6', full_name: ORF6, start: 27202, end: 27387, frame: 0, colour: '#8c564b'}
- {name: 7a, full_name: ORF7a, start: 27394, end: 27759, frame: 0, colour: '#7f7f7f'}
- {name: 3', full_name: 3'UTR, start: 29675, end: 29903, frame: 0, colour: darkgrey}
- {name: TRS-B-1, start: 21556, end: 21562, frame: 0} # Not displayed.
- {name: TRS-B-2, start: 25385, end: 25392, frame: 0} # Not displayed.
- {name: TRS-B-3, start: 26221, end: 26244, frame: 0} # Not displayed.
- {name: TRS-B-4, start: 26473, end: 26522, frame: 0} # Not displayed.
- {name: TRS-B-5, start: 27192, end: 27201, frame: 0} # Not displayed.
- {name: TRS-B-6, start: 27388, end: 27393, frame: 0} # Not displayed.
- {name: TRS-B-7, start: 27888, end: 27893, frame: 0} # Not displayed.
- {name: TRS-B-8, start: 28260, end: 28273, frame: 0} # Not displayed.
- {name: TRS-B-9, start: 29534, end: 29557, frame: 0} # Not displayed.
- {name: ORF1a, start: 266, end: 13483, frame: 1} # Not displayed.
- {name: 3', full_name: 3'UTR, start: 29675, end: 29903, frame: 0, colour: darkgrey, stop: N}
- {name: TRS-B-1, start: 21556, end: 21562, frame: 0, stop: N} # Not displayed.
- {name: TRS-B-2, start: 25385, end: 25392, frame: 0, stop: N} # Not displayed.
- {name: TRS-B-3, start: 26221, end: 26244, frame: 0, stop: N} # Not displayed.
- {name: TRS-B-4, start: 26473, end: 26522, frame: 0, stop: N} # Not displayed.
- {name: TRS-B-5, start: 27192, end: 27201, frame: 0, stop: N} # Not displayed.
- {name: TRS-B-6, start: 27388, end: 27393, frame: 0, stop: N} # Not displayed.
- {name: TRS-B-7, start: 27888, end: 27893, frame: 0, stop: N} # Not displayed.
- {name: TRS-B-8, start: 28260, end: 28273, frame: 0, stop: N} # Not displayed.
- {name: TRS-B-9, start: 29534, end: 29557, frame: 0, stop: N} # Not displayed.
- {name: ORF1a, start: 266, end: 13480, frame: 1, stop: N} # Not displayed.
- {name: ORF1b, start: 13468, end: 21555, frame: 1} # Not displayed.
- {name: nsp2, start: 806, end: 2719, frame: 1, colour: darkgrey}
- {name: nsp1, start: 266, end: 805, frame: 1, colour: grey}
- {name: nsp3, start: 2720, end: 8554, frame: 1, colour: grey}
- {name: nsp4, start: 8555, end: 10054, frame: 1, colour: darkgrey}
- {name: nsp5, start: 10055, end: 10972, frame: 1, colour: grey}
- {name: nsp6, start: 10973, end: 11842, frame: 1, colour: darkgrey}
- {name: '7', full_name: nsp7, start: 11843, end: 12091, frame: 1, colour: grey}
- {name: '8', full_name: nsp8, start: 12092, end: 12685, frame: 1, colour: darkgrey}
- {name: '9', full_name: nsp9, start: 12686, end: 13024, frame: 1, colour: grey}
- {name: '10', full_name: nsp10, start: 13025, end: 13441, frame: 1, colour: darkgrey}
- {name: nsp12, start: 13442, end: 16236, frame: 1, colour: grey}
- {name: nsp13, start: 16237, end: 18039, frame: 1, colour: darkgrey}
- {name: nsp14, start: 18040, end: 19620, frame: 1, colour: grey}
- {name: nsp15, start: 19621, end: 20658, frame: 1, colour: darkgrey}
- {name: nsp16, start: 20659, end: 21552, frame: 1, colour: grey}
- {name: nsp2, start: 806, end: 2719, frame: 1, colour: darkgrey, stop: N}
- {name: nsp1, start: 266, end: 805, frame: 1, colour: grey, stop: N}
- {name: nsp3, start: 2720, end: 8554, frame: 1, colour: grey, stop: N}
- {name: nsp4, start: 8555, end: 10054, frame: 1, colour: darkgrey, stop: N}
- {name: nsp5, start: 10055, end: 10972, frame: 1, colour: grey, stop: N}
- {name: nsp6, start: 10973, end: 11842, frame: 1, colour: darkgrey, stop: N}
- {name: '7', full_name: nsp7, start: 11843, end: 12091, frame: 1, colour: grey, stop: N}
- {name: '8', full_name: nsp8, start: 12092, end: 12685, frame: 1, colour: darkgrey, stop: N}
- {name: '9', full_name: nsp9, start: 12686, end: 13024, frame: 1, colour: grey, stop: N}
- {name: '10', full_name: nsp10, start: 13025, end: 13441, frame: 1, colour: darkgrey, stop: N}
- {name: nsp12, start: 13442, end: 16236, frame: 1, colour: grey, stop: N}
- {name: nsp13, start: 16237, end: 18039, frame: 1, colour: darkgrey, stop: N}
- {name: nsp14, start: 18040, end: 19620, frame: 1, colour: grey, stop: N}
- {name: nsp15, start: 19621, end: 20658, frame: 1, colour: darkgrey, stop: N}
- {name: nsp16, start: 20659, end: 21552, frame: 1, colour: grey, stop: N}
- {name: S, start: 21563, end: 25384, frame: 1, colour: '#ff7f0e'}
- {name: N, start: 28274, end: 29533, frame: 1, colour: '#17becf'}
- {name: '10', full_name: ORF10, start: 29558, end: 29674, frame: 1, colour: darkorange}
Expand Down
4 changes: 2 additions & 2 deletions micall/projects.json
Original file line number Diff line number Diff line change
Expand Up @@ -32188,7 +32188,7 @@
"CAATCTTTAATCAGTGTGTAACATTAGGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCC",
"ACGCGGAGTACGATCGAGTGTACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCT",
"AATGTGTAAAATTAATTTTAGTAGTGCTATCCCCATGTGATTTTAATAGCTTCTTAGGAGAATGA",
"CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
"CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
],
"seed_group": null
},
Expand All @@ -32199,7 +32199,7 @@
"CTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTAT",
"AATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTT",
"ACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAG",
"GT"
"GTAAG"
],
"seed_group": null
},
Expand Down
62 changes: 43 additions & 19 deletions micall/utils/fetch_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,11 +191,38 @@ def check_hiv_coordinates(project_config, unchecked_ref_names: set):
def extract_region(landmark_reader: LandmarkReader,
coordinate_name: str,
coordinate_seq: str,
region_name: str) -> str:
region_name: str,
ref_positions: set = None,
duplicated_base: int = None) -> str:
""" Extract a section of reference sequence, based on a gene landmark.
:param landmark_reader: source of landmark definitions
:param coordinate_name: name of coordinate reference for the landmarks
:param coordinate_seq: coordinate sequence to extract from
:param region_name: gene region to extract
:param ref_positions: track which positions have been extracted, so we can
check for gaps.
:param duplicated_base: the one-based position of a nucleotide that gets
duplicated in the extracted sequence.
:return: extracted sequence for the gene region
"""
region = landmark_reader.get_gene(coordinate_name,
region_name,
drop_stop_codon=False)
source_nuc_sequence = coordinate_seq[region['start'] - 1:region['end']]
region_name)
full_region = landmark_reader.get_gene(coordinate_name,
region_name,
drop_stop_codon=False)
check_stop_codons = True
if check_stop_codons:
region = full_region
start = region['start']
end = region['end']
source_nuc_sequence = coordinate_seq[start - 1:end]
if ref_positions is not None:
ref_positions.update(range(full_region['start'], full_region['end'] + 1))
if start < duplicated_base <= end:
source_nuc_sequence = (
source_nuc_sequence[:duplicated_base - start + 1] +
source_nuc_sequence[duplicated_base - start:])
return source_nuc_sequence


Expand Down Expand Up @@ -353,7 +380,8 @@ def check_hla_seeds(project_config, unchecked_ref_names: set):

def check_hla_coordinates(project_config, unchecked_ref_names: set):
print("""\
HLA coordinate references are translated from the seed reference.
HLA coordinate references are translated from the seed reference. Coordinates
come from the features described at https://www.ncbi.nlm.nih.gov/nuccore/AJ458991
""")
ref_names = ('HLA-B-exon2', 'HLA-B-exon3')

Expand All @@ -363,8 +391,10 @@ def check_hla_coordinates(project_config, unchecked_ref_names: set):

source_sequences = {}
for ref_name in ref_names:
region = landmark_reader.get_gene('HLA-B-seed', ref_name[6:])
source_nuc_sequence = seed_sequence[region['start']:region['end']]
source_nuc_sequence = extract_region(landmark_reader,
'HLA-B-seed',
seed_sequence,
ref_name)
source_sequences[ref_name] = translate(source_nuc_sequence)

report, error_count = compare_config(ref_names,
Expand Down Expand Up @@ -455,18 +485,12 @@ def check_sars_coordinates(project_config, unchecked_ref_names: set):
source_sequences = {}
ref_positions = set()
for ref_name in ref_names:
region = landmark_reader.get_gene(seed_name, ref_name)
full_region = landmark_reader.get_gene(seed_name,
ref_name,
drop_stop_codon=False)
start = region['start']
end = region['end']
ref_positions = ref_positions.union(range(start, full_region['end']+1))
source_nuc_sequence = seed_sequence[start-1:end]
if start < duplicated_base <= end:
source_nuc_sequence = (
source_nuc_sequence[:duplicated_base-start+1] +
source_nuc_sequence[duplicated_base-start:])
source_nuc_sequence = extract_region(landmark_reader,
seed_name,
seed_sequence,
ref_name,
ref_positions,
duplicated_base)
if 'TRS-B' in ref_name or ref_name.endswith('UTR'):
source_sequences[ref_name] = source_nuc_sequence
else:
Expand Down

0 comments on commit e73b058

Please sign in to comment.