Strand specific counting wrt samples (gagneurlab#72)

* make strandSpecificity by sample * move strand specific error to validObject * minor changes in strandSpecific function and properties * correct identation --------- Co-authored-by: Christian Mertes <mertes@in.tum.de>
gagneurlab · Apr 25, 2024 · e775832 · e775832
1 parent 6f5c919
commit e775832
Show file tree

Hide file tree

Showing 9 changed files with 70 additions and 37 deletions.
diff --git a/R/AllGenerics.R b/R/AllGenerics.R
@@ -151,25 +151,44 @@ setReplaceMethod("workingDir", "FraserDataSet", function(object, value) {
 #' @export
 #' @rdname fds-methods
 setMethod("strandSpecific", "FraserDataSet", function(object) {
-    return(slot(object, "strandSpecific"))
+    if(!"strand" %in% colnames(colData(object))){
+        warning("Strand is not specified. Please set the used RNA-seq",
+                " protocol by using 'strandSpecific(object) <- c(...)'.",
+                "\n\nWe assume as default a non stranded protocol.")
+        return(rep(0, ncol(object)))
+    }
+    return(colData(object)$strand)
 })
 
 #' @export
 #' @rdname fds-methods
 setReplaceMethod("strandSpecific", "FraserDataSet", function(object, value) {
+    if (length(value) != ncol(object)){
+        if(length(value) == 1){
+            warning("Only one value is provided as strand for all samples.\n",
+                  "  We assume that all samples are of the same provided strand.")
+            strandSpecific(object) <- rep(value, ncol(object))
+        }
+        else{
+            stop("Number of strand values should be equal to the number of samples: (",
+                  paste0(length(value), " != ", ncol(object), ")"))
+        }
+    }
     if(is.logical(value)){
         value <- as.integer(value)
     }
     if(is.character(value)){
-        value <- switch(tolower(value),
-                        'no' = 0L,
-                        'unstranded' = 0L,
-                        'yes' = 1L,
-                        'stranded' = 1L,
-                        'reverse' = 2L,
-                        -1L)
+        val_chr <- tolower(value)
+        value <- sapply(val_chr, switch, 'no' = 0L, 'unstranded' = 0L, 
+                'yes' = 1L, 'stranded' = 1L, 'reverse' = 2L, -1L)
+        if(any(value < 0)){
+          stop("Please specify correct strandness of the samples.", 
+                  " It needs to be one of: 'no', 'unstranded', 'yes',",
+                  " 'stranded' or 'reverse'.", "\n\nIt was specified: ", 
+                  paste(collapse = ", ", val_chr[value < 0]))
+        }
     }
-    slot(object, "strandSpecific") <- value
+    colData(object)$strand <- as.integer(value)
     validObject(object)
     return(object)
 })
@@ -309,7 +328,6 @@ subset.FRASER <- function(x, i, j, by=c("j", "ss"), ..., drop=FALSE){
             subX,
             name            = name(x),
             bamParam        = scanBamParam(x),
-            strandSpecific  = strandSpecific(x),
             workingDir      = workingDir(x),
             nonSplicedReads = nsrObj
     )

diff --git a/R/FraserDataSet-class.R b/R/FraserDataSet-class.R
@@ -14,14 +14,12 @@ setClass("FraserDataSet",
     slots = list(
         name            = "character",
         bamParam        = "ScanBamParam",
-        strandSpecific  = "integer",
         workingDir      = "character",
         nonSplicedReads = "RangedSummarizedExperiment"
     ),
     prototype = list(
         name            = "Data Analysis",
         bamParam        = ScanBamParam(mapqFilter=0),
-        strandSpecific  = 0L,
         workingDir      = "FRASER_output",
         nonSplicedReads = SummarizedExperiment(rowRanges=GRanges())
     )
@@ -73,9 +71,19 @@ validateBamParam <- function(object) {
 }
 
 validateStrandSpecific <- function(object) {
-    if(!isScalarInteger(object@strandSpecific)) {
-        return(paste("The 'strandSpecific' option must be 0L (unstranded),",
-                        "1L (stranded) or 2L (reverse)."))
+    sampleData <- as.data.table(colData(object))
+    if("strand" %in% colnames(sampleData) && 
+            (!is.integer(sampleData[,strand]) 
+                || any(!sampleData[,strand] %in% 0:2))){
+        return(paste("The 'strand' column in the sample annotation in",
+                "'colData(fds)' must be empty or contain only integers:",
+                "0L == 'no', 1L == 'yes', 2L == 'reverse'."))
+    }
+    # Check mixed strand type
+    ss <- strandSpecific(object)
+    if ((any(ss == 0) && any(ss == 1)) || (any(ss == 0) && any(ss == 2))){
+      stop(paste("Data contains a mix of stranded and unstranded samples.\n ",
+                 "Please analyse them separately to ensure consistency during counting."))
     }
     NULL
 }
@@ -85,8 +93,8 @@ validatePairedEnd <- function(object) {
     if("pairedEnd" %in% colnames(sampleData) && 
             any(!is.logical(sampleData[,pairedEnd]))){
         return(paste("The 'pairedEnd' column in the sample annotation in",
-                    "'colData(fds)' must only contain logical values ",
-                    "(TRUE or FALSE)."))
+                "'colData(fds)' must only contain logical values ",
+                "(TRUE or FALSE)."))
     }
     NULL
 }
@@ -114,7 +122,8 @@ validateNonSplicedReadsType <- function(object) {
         return("'nonSplicedReads' must be a RangedSummarizedExperiment object")
     }
     if(length(object) != 0 && dim(object@nonSplicedReads)[2] != dim(object)[2]){
-        return("The nonSplicedReads dimensions are not correct. This is a internal error!")
+        return(paste("The nonSplicedReads dimensions are not correct.",
+                "This is a internal error!"))
     }
     ans <- validObject(object@nonSplicedReads)
     if(!isScalarLogical(ans) || ans == FALSE){
@@ -134,21 +143,24 @@ validateAssays <- function(object){
     NULL
 }
 
-# for non-empty fds objects check if non-spliced reads are overlapping with at least 1 donor/acceptor site
+# for non-empty fds objects check if non-spliced reads are overlapping 
+# with at least 1 donor/acceptor site
 validateNonSplicedReadsSanity <- function(object){
     # fds object must have samples and junctions
     if(all(dim(object) > c(0,0))){
 
         # fds object must be annotated with start/end/spliceSite indexes
-        if("startID" %in% names(mcols(object)) && "endID" %in% names(mcols(object)) &&
-           "spliceSiteID" %in% names(mcols(object, type="theta"))){
+        if(all(c("startID", "endID") %in% names(mcols(object))) &&
+                        "spliceSiteID" %in% names(mcols(object, type="theta"))){
 
-                # check that every spliceSiteID matches either a start or end index
-                if(length( intersect( mcols(object, type="theta")$spliceSiteID, unlist(mcols(object)[, c("startID", "endID")] ) ) )
-                      != nrow(nonSplicedReads(object))){
-                return("The nonSplicedReads do not have corresponding splitReads. This is probably the result of merging")
-                }
+            # check that every spliceSiteID matches either a start or end index
+            if(nrow(nonSplicedReads(object)) != length(intersect(
+                    mcols(object, type="theta")$spliceSiteID, 
+                    unlist(mcols(object)[, c("startID", "endID")])))){
+                return(paste("The nonSplicedReads do not have corresponding",
+                        "splitReads. This is probably the result of merging"))
             }
+        }
     }
     NULL
 }

diff --git a/R/annotationOfRanges.R b/R/annotationOfRanges.R
@@ -270,7 +270,8 @@ findAnnotatedJunction <- function(fds, annotation, annotateNames=TRUE,
     # check if strandspecific data is used
     gr <- rowRanges(fds, type="psi5")
 
-    if(isFALSE(as.logical(stranded))){
+    # 
+    if(isFALSE(as.logical(unique(stranded)))){
         strand(gr) <- "*"
     }
 

diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R
@@ -406,7 +406,6 @@ addCountsToFraserDataSet <- function(fds, splitCounts, nonSplitCounts){
                 splitCounts,
                 name            = name(fds),
                 bamParam        = scanBamParam(fds),
-                strandSpecific  = strandSpecific(fds),
                 workingDir      = workingDir(fds),
                 nonSplicedReads = nonSplitCounts,
                 metadata        = metadata(fds)
@@ -461,14 +460,14 @@ countSplitReads <- function(sampleID, fds, NcpuPerSample=1, genome=NULL,
                     recount=FALSE, keepNonStandardChromosomes=TRUE,
                     bamfile=bamFile(fds[,sampleID]),
                     pairedend=pairedEnd(fds[,sampleID]),
-                    strandmode=strandSpecific(fds),
+                    strandmode=strandSpecific(fds[,sampleID]),
                     cacheFile=getSplitCountCacheFile(sampleID, fds),
                     scanbamparam=scanBamParam(fds),
                     coldata=colData(fds)){
 
     # check for valid fds
     validObject(fds)
-    
+
     # check cache if available
     if(isFALSE(recount) && !is.null(cacheFile) && file.exists(cacheFile)){
         cache <- readRDS(cacheFile)
@@ -865,6 +864,7 @@ countNonSplicedReads <- function(sampleID, splitCountRanges, fds,
     # unstranded case: for counting only non spliced reads we 
     # skip this information
     isPairedEnd <- pairedEnd(fds[,samples(fds) == sampleID])[[1]]
+    strand <- strandSpecific(fds[,samples(fds) == sampleID])[[1]]
     doAutosort <- isPairedEnd
 
     # check cache if available
@@ -906,7 +906,7 @@ countNonSplicedReads <- function(sampleID, splitCountRanges, fds,
             allowMultiOverlap=TRUE,
             checkFragLength=FALSE,
             minMQS=bamMapqFilter(scanBamParam(fds)),
-            strandSpecific=as.integer(strandSpecific(fds)),
+            strandSpecific=strand,
 
             # activating long read mode
             isLongRead=longRead,

diff --git a/R/helper-functions.R b/R/helper-functions.R
@@ -547,7 +547,12 @@ getBPParam <- function(worker, tasks=0, ...){
 }
 
 getStrandString <- function(fds){
-    strand <- switch(strandSpecific(fds)+1L, "no", "yes", "reverse")
+    strand <- sapply(strandSpecific(fds)+1L, switch, "no", "yes (forward)", "yes (reverse)")
+    if (length(unique(strand)) == 2){
+        strand <- "yes (forward and reverse)"
+    } else {
+        strand <- unique(strand)
+    }
     return(strand)
 }
 

diff --git a/R/makeSimulatedDataset.R b/R/makeSimulatedDataset.R
@@ -113,7 +113,6 @@ makeSimulatedFraserDataSet_BetaBinomial <- function(m=200, j=10000, q=10,
             junctionData,
             name            = name(fds),
             bamParam        = scanBamParam(fds),
-            strandSpecific  = strandSpecific(fds),
             workingDir      = workingDir(fds),
             nonSplicedReads = nonSplitData)
 
@@ -364,7 +363,6 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10,
             junctionData,
             name            = name(fds),
             bamParam        = scanBamParam(fds),
-            strandSpecific  = strandSpecific(fds),
             workingDir      = workingDir(fds),
             nonSplicedReads = nonSplitData)
 

diff --git a/R/mergeExternalData.R b/R/mergeExternalData.R
@@ -155,7 +155,6 @@ mergeExternalData <- function(fds, countFiles, sampleIDs, annotation=NULL){
     ans <- new("FraserDataSet",
             name = name(fds),
             bamParam = scanBamParam(fds),
-            strandSpecific = strandSpecific(fds),
             workingDir = workingDir(fds),
             colData = newColData,
             assays = Assays(SimpleList(

diff --git a/man/countRNA.Rd b/man/countRNA.Rd
diff --git a/tests/testthat/test_counting.R b/tests/testthat/test_counting.R
@@ -33,7 +33,7 @@ test_that("Strand spcific counting", {
     fds <- createTestFraserSettings()
     strandSpecific(fds) <- TRUE
     ans <- countSplitReadsPerChromosome("chrUn_gl000218", bamFile(fds)[1], 
-            pairedEnd=TRUE, strandMode=strandSpecific(fds), genome=NULL,
+            pairedEnd=TRUE, strandMode=strandSpecific(fds)[1], genome=NULL,
             scanBamParam=scanBamParam(fds))
     expect_equivalent(ans, GRanges())