Skip to content

Commit

Permalink
FIX: mutability map to sample synthetic passengers
Browse files Browse the repository at this point in the history
  • Loading branch information
koszulordie committed Nov 10, 2023
1 parent fca1865 commit 844ea1a
Show file tree
Hide file tree
Showing 6 changed files with 277 additions and 37 deletions.
181 changes: 181 additions & 0 deletions containers_build/boostdm/_old_passengers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
# Usage
# -----


# Imports
# -------

import os
import itertools
from collections import namedtuple
from contextlib import suppress

import bgreference
import numpy as np
import pandas as pd

from boostdm.globals import CANONICAL_TRANSCRIPTS_FILE, GENOME_BUILD


# Utils
# -----

# Watson-Crick complement of each nucleotide (keys kept in A, C, G, T order).
CB = dict(zip('ACGT', 'TGCA'))

# The 32 pyrimidine-centred trinucleotide contexts: the 16 C-centred ones
# first, then the 16 T-centred ones; within each group the 5' and 3' flanks
# run through A, C, G, T with the 3' flank varying fastest.
TRIPLETS = [
    left + centre + right
    for centre in 'CT'
    for left, right in itertools.product(CB, repeat=2)
]

# For every reference nucleotide, its three alternative alleles in
# lexicographic order.
CHANGES = {ref: [alt for alt in CB if alt != ref] for ref in CB}

# Load the canonical-transcript table once at import time. The file is a
# gzipped, header-less TSV; only columns 0-3 and 6 are kept and they are named
# chr / start / end / strand / gene.
cds_data = (
    pd.read_csv(
        CANONICAL_TRANSCRIPTS_FILE,
        sep='\t',
        header=None,
        compression='gzip',
        low_memory=False,
    )
    .loc[:, [0, 1, 2, 3, 6]]
    .copy()
)
cds_data.columns = ['chr', 'start', 'end', 'strand', 'gene']


# Retrieve exons
# --------------

def retrieve_exons(gene):
    """
    Collect the coding positions and flanked exon sequences of a gene.

    Rows of ``cds_data`` whose ``gene`` column equals ``gene`` are visited in
    table order; each row contributes one exon.

    Returns
        chromosome: value of the ``chr`` column of the last row of the gene
        CDS of the gene: list of genomic positions, ``start`` to ``end``
            inclusive for every row, concatenated
        list of exon sequences: each sequence is fetched from ``start - 1``
            with length ``end - start + 3``, i.e. it carries one extra base
            on each flank

    Raises
        ValueError: if GENOME_BUILD is neither 'hg38' nor 'hg19', or if the
            gene has no rows in ``cds_data`` (both cases previously surfaced
            as an UnboundLocalError).
    """

    if GENOME_BUILD == 'hg38':
        func = bgreference.hg38
    elif GENOME_BUILD == 'hg19':
        func = bgreference.hg19
    else:
        raise ValueError('Unsupported genome build: {!r}'.format(GENOME_BUILD))

    df = cds_data[cds_data['gene'] == gene]
    if df.empty:
        raise ValueError('Gene {!r} not found in the canonical transcripts table'.format(gene))

    chrom = None
    exons = []
    cds = []
    for _, row in df.iterrows():
        start, end = int(row['start']), int(row['end'])
        chrom = row['chr']
        # One extra base on each side so that every CDS position has a
        # complete trinucleotide context.
        exons.append(func(chrom, start - 1, size=end - start + 3))
        cds.extend(range(start, end + 1))
    return chrom, cds, exons


# Triplet Utils
# -------------

def triplet_index(triplet):
    """
    Return the position of a trinucleotide in the TRIPLETS list.

    A triplet whose middle base is neither C nor T is first replaced by its
    reverse complement, so that the lookup is always done on the
    pyrimidine-centred form.
    """
    if triplet[1] in ('C', 'T'):
        return TRIPLETS.index(triplet)
    flipped = ''.join(CB[triplet[k]] for k in (2, 1, 0))
    return TRIPLETS.index(flipped)


def reverse_complement(triplet):
    """Return the reverse complement of a trinucleotide (its first three bases)."""
    return ''.join(CB[triplet[k]] for k in (2, 1, 0))


def mut_key_gen():
    """
    Generate the 96 (context, alt) mutation channels.

    Channels are grouped by reference base (C, then T), then by alternative
    allele (A, C, G, T order, skipping the reference itself), and within each
    group the 16 flanking combinations run in A, C, G, T order with the 3'
    flank varying fastest.
    """
    for ref in 'CT':
        for alt in CB:
            if alt != ref:
                for left, right in itertools.product(CB, repeat=2):
                    yield left + ref + right, alt


def sequence_triplet_index(segments):
    """
    Convert flanked segments into one flat list of triplet indices.

    For every interior base of every segment (the first and last base only
    serve as flanking context) the trinucleotide centred on that base is
    mapped through triplet_index. A trinucleotide containing anything other
    than A, C, G or T gets index -1, meaning that no mutation rate is defined
    there; probability_vector assigns zero rates to such positions.
    """
    valid_bases = set('ACGT')
    indices = []
    for segment in segments:
        for start in range(len(segment) - 2):
            window = segment[start: start + 3]
            if set(window) <= valid_bases:
                indices.append(triplet_index(window))
            else:
                indices.append(-1)
    return indices


def give_alt_allele(nucleotide, n):
    """Return the n-th alternative allele of `nucleotide`, alternatives taken in lexicographic order."""
    alternatives = CHANGES[nucleotide]
    return alternatives[n]


# Indices of reference triplets in standard 96-channel order
# ----------------------------------------------------------

# For each of the 96 channels produced by mut_key_gen, the index of its
# reference context within TRIPLETS (the alternative allele is ignored here).
TRIPLET_INDEX_SPECTRA = [triplet_index(context) for context, _ in mut_key_gen()]


# Probability
# -----------

def probability_vector(sequence, total_mutrate):
    """
    Build the normalised per-mutation probability vector of a sequence.

    Every entry of `sequence` (a triplet index, or -1) contributes three
    consecutive slots, one per alternative allele:
      - index -1 contributes three zeros;
      - otherwise the slots are the entries of `total_mutrate` at the first
        channel whose context has that index and at the channels 16 and 32
        positions further on.
    # presumably total_mutrate holds 96 rates in mut_key_gen order -- confirm

    The slots are divided by their sum. No guard is applied when that sum is
    zero; the division then yields non-finite values.
    """
    rates = []
    for triplet_idx in sequence:
        if triplet_idx == -1:
            rates.extend((0, 0, 0))
            continue
        first_channel = TRIPLET_INDEX_SPECTRA.index(triplet_idx)
        rates.extend(total_mutrate[first_channel + 16 * k] for k in range(3))
    return np.array(rates) / sum(rates)  # TODO: beware of zero-values!


# Record describing one sampled mutation. Defined once at module level so that
# every call to to_mutation returns instances of the same class.
Mutation = namedtuple('Mutation', ['pos', 'ref_triplet', 'alt'])


def to_mutation(mut_index, cds, sequence, chrom):
    """
    Translate an index drawn from the probability vector into a mutation.

    The probability vector holds three slots per CDS position, so
    ``mut_index // 3`` selects the position and ``mut_index % 3`` selects the
    alternative allele.

    The three slots of a position are laid out by probability_vector in the
    order of the alternative alleles of the *pyrimidine-centred* context. When
    the reference triplet on the genome is purine-centred (it equals the
    reverse complement of the stored context), the alternative allele is
    therefore picked on the pyrimidine strand and complemented back, so that
    the emitted change is the one whose rate was actually used for sampling.

    Return:
        Mutation(pos, ref_triplet, alt): genomic position, reference
        trinucleotide as read from the genome, and alternative allele

    Raises:
        ValueError: if GENOME_BUILD is neither 'hg38' nor 'hg19'.
        AssertionError: if the position has no defined context (index -1) or
            if the reference triplet matches neither the stored context nor
            its reverse complement. Raised explicitly (not via ``assert``) so
            that the check survives ``python -O``.
    """

    index_in_sequence, change = divmod(mut_index, 3)
    pos = cds[index_in_sequence]
    context_index = sequence[index_in_sequence]

    if GENOME_BUILD == 'hg38':
        func = bgreference.hg38
    elif GENOME_BUILD == 'hg19':
        func = bgreference.hg19
    else:
        raise ValueError('Unsupported genome build: {!r}'.format(GENOME_BUILD))

    # -1 marks a context containing a non-ACGT base; indexing TRIPLETS with it
    # would silently pick the last triplet.
    if context_index == -1:
        raise AssertionError('No trinucleotide context defined at position {}'.format(pos))

    ref_triplet = func(chrom, pos - 1, size=3)
    seq_triplet = TRIPLETS[context_index]

    if ref_triplet == seq_triplet:
        # Pyrimidine-centred reference: slots already follow CHANGES[ref].
        alt = give_alt_allele(ref_triplet[1], change)
    elif ref_triplet == reverse_complement(seq_triplet):
        # Purine-centred reference: choose the change on the pyrimidine strand
        # and complement it back to the forward strand.
        alt = CB[give_alt_allele(seq_triplet[1], change)]
    else:
        raise AssertionError(
            'Reference triplet {!r} at position {} does not match context {!r}'.format(
                ref_triplet, pos, seq_triplet))

    return Mutation(pos, ref_triplet, alt)


# Main: randomize per gene
# ------------------------

def randomize(mutrate, chrom, cds, segments, n_randomizations):
    """
    Sample synthetic mutations along the CDS of a gene.

    The segments are converted to triplet indices, turned into a probability
    vector with `mutrate`, and `n_randomizations` indices are drawn from it
    with np.random.choice. If that draw raises ValueError, a uniform
    distribution over the 3 * len(sequence) slots is used instead.

    Draws for which to_mutation raises AssertionError are skipped, so fewer
    than `n_randomizations` items may be produced.

    Return:
        generator of (chromosome, Mutation('pos', 'ref_triplet', 'alt'))
    """

    # One triplet index (position in TRIPLETS) per CDS base.
    sequence = sequence_triplet_index(segments)
    prob = probability_vector(sequence, mutrate)
    try:
        random_draw = np.random.choice(np.arange(len(prob)), size=n_randomizations, p=prob)
    except ValueError:
        # Three slots (one per alternative allele) for every position.
        n_slots = 3 * len(sequence)
        prob = np.full(n_slots, 1 / n_slots)
        random_draw = np.random.choice(np.arange(len(prob)), size=n_randomizations, p=prob)

    for mut_index in random_draw:
        try:
            mut = to_mutation(mut_index, cds, sequence, chrom)
            yield chrom, mut
        except AssertionError:
            continue


if __name__ == '__main__':

    # No command-line entry point is implemented: running this module as a
    # script does nothing beyond the module-level statements above.
    pass
10 changes: 4 additions & 6 deletions containers_build/boostdm/annotations/cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def set_string_chr(row):


def oncotree_sisters(cohort):

"""Generator of cohorts belonging to the same ttype type as 'cohort'"""
# TODO re-implement function in terms of oncotree
tree = Oncotree()
Expand Down Expand Up @@ -226,6 +227,7 @@ def initialize_trainset(df, drivers):


def build_positive_set(df_expect):

canonical_transcript = retrieve_transcript()
pos = intersect_region_mutations(canonical_transcript, df_expect)
pos['response'] = 1
Expand Down Expand Up @@ -305,7 +307,7 @@ def build_table(cohort, dndscv_file, dndscv_annotated_file,
df['pos'] = df.apply(lambda row: int(row['pos']), axis=1)

# Reset index
df.reset_index(inplace=True)
df.reset_index(drop=True, inplace=True)

# Add features
df = features(df, cohort, clustl_group_file, hotmaps_group_file, smregions_group_file)
Expand All @@ -320,19 +322,15 @@ def build_table(cohort, dndscv_file, dndscv_annotated_file,

@click.command()
@click.option('--cohort', type=str)
# @click.option('--drivers-summary', 'summary', type=click.Path(exists=True), help='Drivers summary from IntOGen')
@click.option('--dndscv-path', 'dndscv_path', type=click.Path(exists=True), help='Cohort dNdsCV out')
@click.option('--dndscv-annotmuts-path', 'dnds_muts_path', type=click.Path(exists=True), help='Cohort dNdsCV annotmuts out')
@click.option('--mutrate-path', 'mutrate_path', type=click.Path(exists=True), help='Cohort mutrate out')
# @click.option('--clustl-path', 'clustl_path', type=click.Path(exists=True), help='Cohort OncodriveCLUSTL out')
@click.option('--clustl-group-path', 'clustl_group_path', type=click.Path(exists=True), help='Combined OncodriveCLUSTL out')
# @click.option('--hotmaps-path', 'hotmaps_path', type=click.Path(exists=True), help='Cohort HotMAPS out')
@click.option('--hotmaps-group-path', 'hotmaps_group_path', type=click.Path(exists=True), help='Combined HotMAPS out')
# @click.option('--smregions-path', 'smregions_path', type=click.Path(exists=True), help='Cohort smregions out')
@click.option('--smregions-group-path', 'smregions_group_path', type=click.Path(exists=True), help='Combined smregions out')
@click.option('--out', type=click.Path())
@click.option('--seed', type=int, default=None)
@click.option('--splits', type=int, default=42)
@click.option('--splits', type=int, default=50)
@click.option('--threshold', type=float, default=0.85)
def cli(cohort, dndscv_path, dnds_muts_path, mutrate_path, clustl_group_path,
hotmaps_group_path, smregions_group_path, out, seed, splits, threshold):
Expand Down
6 changes: 1 addition & 5 deletions containers_build/boostdm/annotations/gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def build_table(mutations_file, tumor, path_clustl, path_hotmaps, path_smregions
muts = read_muts(mutations_file)

# Reset index
muts.reset_index(inplace=True)
muts.reset_index(drop=True, inplace=True)

# annotate mutations
df = features(muts, tumor, path_clustl, path_hotmaps, path_smregions)
Expand Down Expand Up @@ -149,10 +149,6 @@ def cli(gene, ttype, mutations, clustl_group, hotmaps_group, smregions_group):
df = build_table(mutations, ttype, clustl_group, hotmaps_group, smregions_group)
df.to_csv(f'{gene}.{ttype}.annotated.tsv.gz', sep='\t', index=False, compression="gzip")

# for testing only:
# df = pd.DataFrame([])
# df.to_csv(f'{gene}.{ttype}.annotated.tsv.gz', sep='\t', index=False, compression="gzip")


if __name__ == '__main__':
cli()
1 change: 1 addition & 0 deletions containers_build/boostdm/evaluation/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def evaluate(model_evaluations):
@click.option('--discovery_path', 'discovery_path', type=click.Path(), help='file path to discovery output table')
@click.option('--output', 'output_file', type=click.Path(), help='output folder')
def cli(eval_folder, discovery_path, output_file):

models = {}

df_discovery = pd.read_csv(discovery_path, sep='\t')
Expand Down
Loading

0 comments on commit 844ea1a

Please sign in to comment.