FIX: mutability map to sample synthetic passengers

bbglab · Nov 10, 2023 · 844ea1a · 844ea1a
1 parent fca1865
commit 844ea1a
Show file tree

Hide file tree

Showing 6 changed files with 277 additions and 37 deletions.
diff --git a/containers_build/boostdm/_old_passengers.py b/containers_build/boostdm/_old_passengers.py
@@ -0,0 +1,181 @@
+# Usage
+# -----
+
+
+# Imports
+# -------
+
+import os
+import itertools
+from collections import namedtuple
+from contextlib import suppress
+
+import bgreference
+import numpy as np
+import pandas as pd
+
+from boostdm.globals import CANONICAL_TRANSCRIPTS_FILE, GENOME_BUILD
+
+
+# Utils
+# -----
+
+# complement of each nucleotide
+CB = dict(zip('ACGT', 'TGCA'))
+# the 32 pyrimidine-centered triplets: C-centered ones first, then T-centered ones,
+# with both flanks running through A, C, G, T
+TRIPLETS = [left + center + right
+            for center in 'CT'
+            for left, right in itertools.product('ACGT', repeat=2)]
+# the three possible substitutions of each nucleotide, in lexicographic order
+CHANGES = {ref: [alt for alt in 'ACGT' if alt != ref] for ref in 'ACGT'}
+
+# canonical transcripts table (headerless, gzipped, tab-separated);
+# only columns 0, 1, 2, 3 and 6 are kept and given names
+cds_data = pd.read_csv(CANONICAL_TRANSCRIPTS_FILE,
+                       sep='\t', header=None, compression='gzip', low_memory=False)
+cds_data = cds_data.loc[:, [0, 1, 2, 3, 6]].rename(
+    columns={0: 'chr', 1: 'start', 2: 'end', 3: 'strand', 6: 'gene'})
+
+
+# Retrieve exons
+# --------------
+
+def retrieve_exons(gene):
+
+    """
+    Collect the CDS segments of a gene from the canonical transcripts table.
+
+    Returns
+        chromosome (as given in the last row found for the gene)
+        CDS of the gene: list of genomic positions of the CDS
+        list of exon sequences: each exon sequence has 1-bp offset at the flanks
+
+    Raises
+        ValueError: GENOME_BUILD is neither 'hg38' nor 'hg19', or the gene has
+                    no rows in the canonical transcripts table
+    """
+
+    if GENOME_BUILD == 'hg38':
+        func = bgreference.hg38
+    elif GENOME_BUILD == 'hg19':
+        func = bgreference.hg19
+    else:
+        # without this branch `func` stayed unbound and the loop below
+        # failed with an UnboundLocalError
+        raise ValueError(f'unsupported genome build: {GENOME_BUILD}')
+
+    df = cds_data[cds_data['gene'] == gene]
+    if df.empty:
+        # without this check the final `row['chr']` failed with an UnboundLocalError
+        raise ValueError(f'gene not found in canonical transcripts: {gene}')
+
+    exons = []
+    cds = []
+    for _, row in df.iterrows():
+        start, end = int(row['start']), int(row['end'])
+        # requested size = (end - start + 1) CDS bases plus one flanking base per side,
+        # so that every CDS position has a complete triplet context
+        exons.append(func(row['chr'], start - 1, size=end - start + 3))
+        cds.extend(range(start, end + 1))
+    return row['chr'], cds, exons
+
+
+# Triplet Utils
+# -------------
+
+def triplet_index(triplet):
+
+    """
+    Position of the triplet in the TRIPLETS list.
+
+    Triplets whose central nucleotide is not C or T are looked up through
+    their reverse complement.
+    """
+    if triplet[1] in ('C', 'T'):
+        return TRIPLETS.index(triplet)
+    return TRIPLETS.index(reverse_complement(triplet))
+
+
+def reverse_complement(triplet):
+
+    """Complement the three nucleotides of the triplet and read them back to front"""
+    return ''.join(CB[triplet[i]] for i in (2, 1, 0))
+
+
+def mut_key_gen():
+
+    """
+    Yield the 96 (reference triplet, alt) pairs.
+
+    Order: reference C before T; for each reference, the alternates in
+    A, C, G, T order skipping the reference itself; for each alternate,
+    the 16 combinations of flanking nucleotides.
+    """
+    for ref in 'CT':
+        for alt in CB:
+            if alt != ref:
+                for left, right in itertools.product(CB, repeat=2):
+                    yield left + ref + right, alt
+
+
+def sequence_triplet_index(segments):
+
+    """
+    Takes segments and returns a single list with one triplet index per
+    inner position of each segment (first and last symbols only act as flanks)
+    """
+    nucleotides = set('ACGT')
+    indices = []
+    for segment in segments:
+        for start in range(len(segment) - 2):
+            context = segment[start: start + 3]
+            if set(context) <= nucleotides:
+                indices.append(triplet_index(context))
+            else:
+                # -1 marks a context with symbols other than A, C, G, T:
+                # no mutation rate is defined for it, and probability_vector
+                # assigns zero rate to such positions
+                indices.append(-1)
+    return indices
+
+
+def give_alt_allele(nucleotide, n):
+
+    """Return the n-th (0-based) entry of CHANGES for the nucleotide, i.e. its n-th alternate in lexicographic order"""
+    alternates = CHANGES[nucleotide]
+    return alternates[n]
+
+
+# Indices of reference triplets in standard 96-channel order
+# ----------------------------------------------------------
+
+# for each of the 96 channels produced by mut_key_gen, the TRIPLETS index of its reference triplet
+TRIPLET_INDEX_SPECTRA = [triplet_index(ref_triplet) for ref_triplet, _ in mut_key_gen()]
+
+
+# Probability
+# -----------
+
+def probability_vector(sequence, total_mutrate):
+
+    """
+    Unfold per-position triplet indices into a normalized vector of
+    per-substitution probabilities.
+
+    sequence: triplet indices as returned by sequence_triplet_index (-1 = undefined context)
+    total_mutrate: rates indexable by channel position; the three rates of a triplet
+                   are read at its first position in TRIPLET_INDEX_SPECTRA and
+                   at +16 and +32 from it
+
+    Returns
+        numpy array of length 3 * len(sequence) that sums to 1; positions with
+        undefined context get probability 0. If all rates are zero the vector
+        is uniform (the same fallback randomize applies), instead of the NaN
+        vector a division by zero would produce.
+    """
+
+    # first channel of each triplet index: same value TRIPLET_INDEX_SPECTRA.index()
+    # would give, but computed once instead of scanning the list for every position
+    first_channel = {}
+    for channel, ind in enumerate(TRIPLET_INDEX_SPECTRA):
+        first_channel.setdefault(ind, channel)
+
+    unfold_rates = []
+    for ind in sequence:
+        if ind == -1:
+            unfold_rates += [0, 0, 0]
+        else:
+            pos = first_channel[ind]
+            unfold_rates.extend(total_mutrate[pos + i * 16] for i in range(3))
+
+    total = sum(unfold_rates)
+    if total == 0:
+        n_outcomes = len(unfold_rates)
+        if n_outcomes == 0:
+            return np.array(unfold_rates, dtype=float)
+        return np.full(n_outcomes, 1 / n_outcomes)
+    return np.array(unfold_rates) / total
+
+
+# record returned by to_mutation; defined once instead of on every call
+Mutation = namedtuple('Mutation', ['pos', 'ref_triplet', 'alt'])
+
+
+def to_mutation(mut_index, cds, sequence, chrom):
+
+    """
+    Translate an index of the unfolded probability vector into a mutation.
+
+    mut_index // 3 is the position within cds and sequence; mut_index % 3
+    selects the alternate among the CHANGES of the reference nucleotide.
+
+    Return:
+        Mutation(pos, ref_triplet, alt)
+
+    Raises:
+        AssertionError: the triplet context is undefined (-1) or the reference
+                        triplet at pos matches the expected triplet on neither strand
+        ValueError: GENOME_BUILD is neither 'hg38' nor 'hg19'
+    """
+
+    if GENOME_BUILD == 'hg38':
+        func = bgreference.hg38
+    elif GENOME_BUILD == 'hg19':
+        func = bgreference.hg19
+    else:
+        raise ValueError(f'unsupported genome build: {GENOME_BUILD}')
+
+    index_in_sequence, change = divmod(mut_index, 3)
+    pos = cds[index_in_sequence]
+
+    context = sequence[index_in_sequence]
+    if context == -1:
+        # -1 would otherwise silently select the last entry of TRIPLETS
+        raise AssertionError(f'undefined triplet context at position {pos}')
+
+    ref_triplet = func(chrom, pos - 1, size=3)
+    seq_triplet = TRIPLETS[context]
+    # raised explicitly rather than with an `assert` statement, so the check is
+    # kept under `python -O`; callers rely on AssertionError to skip mismatches
+    if ref_triplet not in (seq_triplet, reverse_complement(seq_triplet)):
+        raise AssertionError(f'reference triplet {ref_triplet} at position {pos} '
+                             f'does not match expected triplet {seq_triplet}')
+    alt = give_alt_allele(ref_triplet[1], change)
+    return Mutation(pos, ref_triplet, alt)
+
+
+# Main: randomize per gene
+# ------------------------
+
+def randomize(mutrate, chrom, cds, segments, n_randomizations):
+
+    """
+    Draw n_randomizations mutations along the CDS according to mutrate.
+
+    Return:
+        generator of (chromosome, Mutation('pos', 'ref_triplet', 'alt'));
+        draws whose reference check fails in to_mutation are skipped
+    """
+
+    def draw(weights):
+        outcomes = np.arange(len(weights))
+        return np.random.choice(outcomes, size=n_randomizations, p=weights)
+
+    # triplet context of every position, given as indices in the TRIPLETS list
+    sequence = sequence_triplet_index(segments)
+    prob = probability_vector(sequence, mutrate)
+    try:
+        random_draw = draw(prob)
+    except ValueError:
+        # sampling with the given rates failed: fall back to a uniform vector
+        # over the unfolded sequence (x3 as many entries as positions)
+        prob = 1 / (3 * len(sequence)) * np.ones(3 * len(sequence))
+        random_draw = draw(prob)
+
+    for mut_index in random_draw:
+        with suppress(AssertionError):
+            yield chrom, to_mutation(mut_index, cds, sequence, chrom)
+
+
+if __name__ == '__main__':
+
+    # placeholder: running the module as a script does nothing beyond the import-time setup
+    pass
diff --git a/containers_build/boostdm/annotations/cohort.py b/containers_build/boostdm/annotations/cohort.py
@@ -77,6 +77,7 @@ def set_string_chr(row):
 
 
 def oncotree_sisters(cohort):
+
     """Generator of cohorts belonging to the same ttype type as 'cohort'"""
     # TODO re-implement function in terms of oncotree
     tree = Oncotree()
@@ -226,6 +227,7 @@ def initialize_trainset(df, drivers):
 
 
 def build_positive_set(df_expect):
+
     canonical_transcript = retrieve_transcript()
     pos = intersect_region_mutations(canonical_transcript, df_expect)
     pos['response'] = 1
@@ -305,7 +307,7 @@ def build_table(cohort, dndscv_file, dndscv_annotated_file,
     df['pos'] = df.apply(lambda row: int(row['pos']), axis=1)
 
     # Reset index
-    df.reset_index(inplace=True)
+    df.reset_index(drop=True, inplace=True)
 
     # Add features
     df = features(df, cohort, clustl_group_file, hotmaps_group_file, smregions_group_file)
@@ -320,19 +322,15 @@ def build_table(cohort, dndscv_file, dndscv_annotated_file,
 
 @click.command()
 @click.option('--cohort', type=str)
-# @click.option('--drivers-summary', 'summary', type=click.Path(exists=True), help='Drivers summary from IntOGen')
 @click.option('--dndscv-path', 'dndscv_path', type=click.Path(exists=True), help='Cohort dNdsCV out')
 @click.option('--dndscv-annotmuts-path', 'dnds_muts_path', type=click.Path(exists=True), help='Cohort dNdsCV annotmuts out')
 @click.option('--mutrate-path', 'mutrate_path', type=click.Path(exists=True), help='Cohort mutrate out')
-# @click.option('--clustl-path', 'clustl_path', type=click.Path(exists=True), help='Cohort OncodriveCLUSTL out')
 @click.option('--clustl-group-path', 'clustl_group_path', type=click.Path(exists=True), help='Combined OncodriveCLUSTL out')
-# @click.option('--hotmaps-path', 'hotmaps_path', type=click.Path(exists=True), help='Cohort HotMAPS out')
 @click.option('--hotmaps-group-path', 'hotmaps_group_path', type=click.Path(exists=True), help='Combined HotMAPS out')
-# @click.option('--smregions-path', 'smregions_path', type=click.Path(exists=True), help='Cohort smregions out')
 @click.option('--smregions-group-path', 'smregions_group_path', type=click.Path(exists=True), help='Combined smregions out')
 @click.option('--out', type=click.Path())
 @click.option('--seed', type=int, default=None)
-@click.option('--splits', type=int, default=42)
+@click.option('--splits', type=int, default=50)
 @click.option('--threshold', type=float, default=0.85)
 def cli(cohort, dndscv_path, dnds_muts_path, mutrate_path, clustl_group_path,
         hotmaps_group_path, smregions_group_path, out, seed, splits, threshold):

diff --git a/containers_build/boostdm/annotations/gene.py b/containers_build/boostdm/annotations/gene.py
@@ -115,7 +115,7 @@ def build_table(mutations_file, tumor, path_clustl, path_hotmaps, path_smregions
     muts = read_muts(mutations_file)
 
     # Reset index
-    muts.reset_index(inplace=True)
+    muts.reset_index(drop=True, inplace=True)
 
     # annotate mutations
     df = features(muts, tumor, path_clustl, path_hotmaps, path_smregions)
@@ -149,10 +149,6 @@ def cli(gene, ttype, mutations, clustl_group, hotmaps_group, smregions_group):
     df = build_table(mutations, ttype, clustl_group, hotmaps_group, smregions_group)
     df.to_csv(f'{gene}.{ttype}.annotated.tsv.gz', sep='\t', index=False, compression="gzip")
 
-    # for testing only:
-    # df = pd.DataFrame([])
-    # df.to_csv(f'{gene}.{ttype}.annotated.tsv.gz', sep='\t', index=False, compression="gzip")
-
 
 if __name__ == '__main__':
     cli()
diff --git a/containers_build/boostdm/evaluation/data.py b/containers_build/boostdm/evaluation/data.py
@@ -66,6 +66,7 @@ def evaluate(model_evaluations):
 @click.option('--discovery_path', 'discovery_path', type=click.Path(), help='file path to discovery output table')
 @click.option('--output', 'output_file', type=click.Path(), help='output folder')
 def cli(eval_folder, discovery_path, output_file):
+
     models = {}
 
     df_discovery = pd.read_csv(discovery_path, sep='\t')