gagneurlab · mumichae · Jun 24, 2021 · Jun 25, 2021 · Jun 29, 2021 · Jul 2, 2021
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R
@@ -5,11 +5,14 @@
 #'  log:
 #'    - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "03_filter.Rds")`'
 #'  params:
-#'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
 #'   - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`'
 #'  input:
+#'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
 #'   - theta:  '`sm cfg.getProcessedDataDir()+
 #'                  "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/theta.h5"`'
+#'   - txdb: '`sm cfg.getProcessedDataDir() + "/aberrant_expression/{annotation}/txdb.db"`'
+#'   - addAnnotation:  '`sm cfg.AS.getWorkdir() + "/Counting/fds_annotation.R"`'
+#'   - spliceTypeSetup: '`sm cfg.AS.getWorkdir() + "/spliceTypeConfig.R"`'
 #'  output:
 #'   - fds: '`sm cfg.getProcessedDataDir() +
 #'                "/aberrant_splicing/datasets/savedObjects/{dataset}/fds-object.RDS"`'
@@ -20,7 +23,9 @@
 #'---
 
 saveRDS(snakemake, snakemake@log$snakemake)
-source(snakemake@params$setup, echo=FALSE)
+source(snakemake@input$setup, echo=FALSE)
+source(snakemake@input$spliceTypeSetup, echo=FALSE)
+source(snakemake@input$addAnnotation)
 
 opts_chunk$set(fig.width=12, fig.height=8)
 
@@ -53,5 +58,14 @@ if (params$filter == TRUE) {
     message(paste("filtered to", nrow(fds), "junctions"))
 }
 
+fds <- saveFraserDataSet(fds)
+
+# Add the junction annotations to the fds
+message("load db for annotation")
+txdb <- loadDb(snakemake@input$txdb)
+seqlevelsStyle(txdb) <- seqlevelsStyle(fds)
+fds <- createFDSAnnotations(fds, txdb)
+message("save object after annotation")
+
 fds <- saveFraserDataSet(fds)
 file.create(snakemake@output$done)
diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/fds_annotation.R b/drop/modules/aberrant-splicing-pipeline/Counting/fds_annotation.R
@@ -0,0 +1,68 @@
+### 20210604 klutz
+
+### basic annotations (start, end, none, both) for full fds
+
+# Annotate every junction of `fds` relative to the introns annotated in `txdb`.
+#
+# fds:  dataset object supporting `[`, rowRanges(fds, type=) and K(fds, type=).
+# txdb: annotation database handed to intronsByTranscript().
+#
+# Each junction is compared with the overlapping expressed annotated intron
+# that has the highest median count and labelled "both" (type = "equal" hit),
+# "start" / "end" (type = "start" / "end" hit) or "none" (no overlap or match).
+# Returns `fds` with the labels stored in rowRanges(fds)$annotatedJunction.
+createFDSAnnotations <- function(fds, txdb){
+  message("loading introns")
+  introns <- unique(unlist(intronsByTranscript(txdb)))
+  # reduce the introns to only the actually expressed introns
+  fds_known <- fds[unique(to(findOverlaps(introns, rowRanges(fds, type = "j"), type = "equal"))),]
+  anno_introns <- as.data.table(rowRanges(fds_known, type = "psi5"))
+  anno_introns <- anno_introns[, .(seqnames, start, end, strand)]
+
+  # add row-wise mean/median of the psi5 count matrix as extra columns
+  message("adding median count to introns")
+  sampleCounts <- K(fds_known, type = "psi5")
+  anno_introns[, "meanCount" := rowMeans(sampleCounts)]
+  anno_introns[, "medianCount" := rowMedians(as.matrix(sampleCounts))]
+  anno_introns_ranges <- makeGRangesFromDataFrame(anno_introns, keep.extra.columns = TRUE)
+
+  ### get all fds junctions
+  fds_junctions <- rowRanges(fds, type = "j")
+
+  ### Do the annotation just for the most used intron (highest median expression)
+  message("start calculating annotations")
+  # seq_along + vapply: an fds without junctions yields character(0), not an error
+  annotations <- vapply(seq_along(fds_junctions), function(i){
+    junction <- fds_junctions[i]
+    overlap <- to(findOverlaps(junction, anno_introns_ranges))
+    if(length(overlap) == 0) return("none") #no overlap with any intron
+
+    # overlapping intron with the highest median count (which.max: first on ties)
+    expre <- anno_introns_ranges$medianCount[overlap]
+    best <- anno_introns_ranges[overlap[which.max(expre)]]
+
+    if(length(findOverlaps(junction, best, type = "equal")) > 0) return("both")
+    if(length(findOverlaps(junction, best, type = "start")) > 0) return("start")
+    if(length(findOverlaps(junction, best, type = "end")) > 0) return("end")
+
+    return("none") #overlaps but no start/end match
+  }, character(1))
+
+  # store one label per junction as a metadata column of the row ranges
+  rowRanges(fds)$annotatedJunction <- annotations
+  message("annotations done")
+  return(fds)
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R
@@ -21,6 +21,8 @@
 #'                 "padjBetaBinomial_theta.h5"`'
 #'   - txdb: '`sm cfg.getProcessedDataDir() + "/aberrant_expression/{annotation}/txdb.db"`'
 #'   - gene_name_mapping: '`sm cfg.getProcessedDataDir() + "/aberrant_expression/{annotation}/gene_name_mapping_{annotation}.tsv"`'
+#'   - spliceTypeSetup: '`sm cfg.AS.getWorkdir() + "/spliceTypeConfig.R"`'
+#'   - addAnnotation:  '`sm cfg.AS.getWorkdir() + "/FRASER/fds_annotation.R"`'
 #'  output:
 #'   - resultTableJunc: '`sm cfg.getProcessedResultsDir() + 
 #'                          "/aberrant_splicing/results/{annotation}/fraser/{dataset}/results_per_junction.tsv"`'
@@ -34,6 +36,8 @@
 saveRDS(snakemake, snakemake@log$snakemake)
 source(snakemake@input$setup, echo=FALSE)
 source(snakemake@input$add_HPO_cols)
+source(snakemake@input$spliceTypeSetup, echo=FALSE)
+source(snakemake@input$addAnnotation)
 library(AnnotationDbi)
 
 opts_chunk$set(fig.width=12, fig.height=8)
@@ -65,6 +69,9 @@ seqlevelsStyle(txdb) <- seqlevelsStyle(fds)
 fds <- annotateRangesWithTxDb(fds, txdb = txdb, orgDb = orgdb, feature = 'gene_name', 
                               featureName = 'hgnc_symbol', keytype = 'gene_id')
 
+# Placeholder step: testFct() only adds a constant "test" column to rowRanges(fds)
+fds <- testFct(fds)
+
 # Extract results per junction
 res_junc <- results(fds,
                     padjCutoff=snakemake@params$padjCutoff,

diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/fds_annotation.R b/drop/modules/aberrant-splicing-pipeline/FRASER/fds_annotation.R
@@ -0,0 +1,76 @@
+### 20210604 klutz
+
+### basic annotations (start, end, none, both) for full fds
+
+testFct <- function(fds){
+  # Debug helper: tags every junction row of `fds` with the constant "test".
+  message("start test function")
+  rowRanges(fds)$test <- rep("test", times = length(rowRanges(fds, type="j")))
+  message("end test function")
+  return(fds)
+}
+
+# Annotate every junction of `fds` relative to the introns annotated in `txdb`.
+#
+# fds:  dataset object supporting `[`, rowRanges(fds, type=) and K(fds, type=).
+# txdb: annotation database handed to intronsByTranscript().
+#
+# Each junction is compared with the overlapping expressed annotated intron
+# that has the highest median count and labelled "both" (type = "equal" hit),
+# "start" / "end" (type = "start" / "end" hit) or "none" (no overlap or match).
+# Returns `fds` with the labels stored in rowRanges(fds)$annotatedJunction.
+createFDSAnnotations <- function(fds, txdb){
+  message("loading introns")
+  introns <- unique(unlist(intronsByTranscript(txdb)))
+  # reduce the introns to only the actually expressed introns
+  fds_known <- fds[unique(to(findOverlaps(introns, rowRanges(fds, type = "j"), type = "equal"))),]
+  anno_introns <- as.data.table(rowRanges(fds_known, type = "psi5"))
+  anno_introns <- anno_introns[, .(seqnames, start, end, strand)]
+
+  # add row-wise mean/median of the psi5 count matrix as extra columns
+  message("adding median count to introns")
+  sampleCounts <- K(fds_known, type = "psi5")
+  anno_introns[, "meanCount" := rowMeans(sampleCounts)]
+  anno_introns[, "medianCount" := rowMedians(as.matrix(sampleCounts))]
+  anno_introns_ranges <- makeGRangesFromDataFrame(anno_introns, keep.extra.columns = TRUE)
+
+  ### get all fds junctions
+  fds_junctions <- rowRanges(fds, type = "j")
+
+  ### Do the annotation just for the most used intron (highest median expression)
+  message("start calculating annotations")
+  # seq_along + vapply: an fds without junctions yields character(0), not an error
+  annotations <- vapply(seq_along(fds_junctions), function(i){
+    junction <- fds_junctions[i]
+    overlap <- to(findOverlaps(junction, anno_introns_ranges))
+    if(length(overlap) == 0) return("none") #no overlap with any intron
+
+    # overlapping intron with the highest median count (which.max: first on ties)
+    expre <- anno_introns_ranges$medianCount[overlap]
+    best <- anno_introns_ranges[overlap[which.max(expre)]]
+
+    if(length(findOverlaps(junction, best, type = "equal")) > 0) return("both")
+    if(length(findOverlaps(junction, best, type = "start")) > 0) return("start")
+    if(length(findOverlaps(junction, best, type = "end")) > 0) return("end")
+
+    return("none") #overlaps but no start/end match
+  }, character(1))
+
+  # store one label per junction as a metadata column of the row ranges
+  rowRanges(fds)$annotatedJunction <- annotations
+  message("annotations done")
+  return(fds)
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/drop/modules/aberrant-splicing-pipeline/resource/hg19-blacklist.v2.bed.gz b/drop/modules/aberrant-splicing-pipeline/resource/hg19-blacklist.v2.bed.gz
diff --git a/drop/modules/aberrant-splicing-pipeline/resource/hg38-blacklist.v2.bed.gz b/drop/modules/aberrant-splicing-pipeline/resource/hg38-blacklist.v2.bed.gz
diff --git a/drop/modules/aberrant-splicing-pipeline/spliceTypeConfig.R b/drop/modules/aberrant-splicing-pipeline/spliceTypeConfig.R
@@ -0,0 +1,7 @@
+##--------------------------------------------
+## Required packages for the aberrantSpliceTypes steps; sourced by the pipeline scripts
+message("Load aberrant splice type packages")
+suppressPackageStartupMessages({
+  library(AnnotationDbi)
+  library(rtracklayer) # to import the blacklist file
+})