Dev (#352)

* add acknowledgements * Update README.md * Update config.yaml * as character sample from vcf * Fix typo in exportCounts input. Rename raw-{dataset} to raw-local-{dataset}. There is no rule that produces raw-{dataset} as output. * remove STRAND column from results * bug fix for AE import count reading * fix sample anno character style * fix sample anno character style * fix sample anno character style * fix sample anno character style * fix sample anno character style * external counts NAs handling * Update filterCounts.R * version bump Co-authored-by: Vicente Yepez <30469316+vyepez88@users.noreply.github.com> Co-authored-by: Spencer Skylar Chan <54919210+schance995@users.noreply.github.com> Co-authored-by: Smith Nicholas <smith@in.tum.de>
gagneurlab · Aug 3, 2022 · 257cb0f · 257cb0f
1 parent a00f59c
commit 257cb0f
Show file tree

Hide file tree

Showing 17 changed files with 52 additions and 33 deletions.
diff --git a/README.md b/README.md
@@ -11,6 +11,8 @@ The manuscript is available in [Nature Protocols](https://www.nature.com/article
 
 
 ## What's new
+Version 1.2.2 fixes some critical bugs that affected the performance of the `aberrantExpression` pipeline, and allows sample IDs to be numeric.
+
 As of version 1.2.1 DROP has a new module that performs RNA-seq variant calling. The input are BAM files and the output either a single-sample or a multi-sample VCF file (option specified by the user) annotated with allele frequencies from gnomAD (if specified by the user). The sample annotation table does not need to be changed, but several new parameters in the config file have to be added and tuned. For more info, refer to the [documentation](https://gagneurlab-drop.readthedocs.io/en/latest/prepare.html#rna-variant-calling-dictionary).
 
 Also, the integration of external split and non-split counts to detect aberrant splicing is now possible. Simply specify in a new column in the sample annotation the directory containing the counts. For more info, refer to the [documentation](https://gagneurlab-drop.readthedocs.io/en/latest/prepare.html#external-count-examples).

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -23,7 +23,7 @@
 author = 'Michaela Müller'
 
 # The full version, including alpha/beta/rc tags
-release_ = '1.2.1'
+release_ = '1.2.2'
 
 
 

diff --git a/drop/__init__.py b/drop/__init__.py
@@ -4,5 +4,5 @@
 from . import utils
 from . import demo
 
-__version__ = "1.2.1"
+__version__ = "1.2.2"
 
diff --git a/drop/cli.py b/drop/cli.py
@@ -17,7 +17,7 @@
 
 @click.group()
 @click_log.simple_verbosity_option(logger)
-@click.version_option('1.2.1',prog_name='drop')
+@click.version_option('1.2.2',prog_name='drop')
 
 
 def main():

diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py
@@ -181,6 +181,9 @@ def subsetSampleAnnotation(self, column, values, subset=None):
         # check if column is valid
         elif column not in sa_cols:
             raise KeyError(f"Column '{column}' not present in sample annotation.")
+        # return empty subset if empty
+        elif subset.empty:
+            return subset
         #subset column for matching values
         else:
             return utils.subsetBy(subset, column, values)
@@ -272,7 +275,6 @@ def getImportCountFiles(self, annotation, group, file_type: str = "GENE_COUNTS_F
         #subset for the annotation_key in the annotation group and the group_key in the group
         subset = self.subsetSampleAnnotation(annotation_key, annotation)
         subset = self.subsetSampleAnnotation(group_key, group, subset)
-
         ans = subset[file_type].tolist()
         if asSet:
             ans = set(ans)

diff --git a/drop/config/submodules/AberrantExpression.py b/drop/config/submodules/AberrantExpression.py
@@ -48,15 +48,16 @@ def getCountFiles(self, annotation, group):
         :return: list of files
         """
 
-        # if sample annotation table does not contain GENE_COUNTS_FILE column. return no external counts
-        if("GENE_COUNTS_FILE" not in self.sampleAnnotation.SAMPLE_ANNOTATION_COLUMNS):
-            return []
 
         bam_IDs = self.sampleAnnotation.getIDsByGroup(group, assay="RNA")
         file_stump = self.processedDataDir / "aberrant_expression" / annotation / "counts" / "{sampleID}.Rds"
         count_files = expand(str(file_stump), sampleID=bam_IDs)
-        extCountFiles = self.sampleAnnotation.getImportCountFiles(annotation, group, file_type="GENE_COUNTS_FILE")
-        count_files.extend(extCountFiles)
+        # if sample annotation table does not contain GENE_COUNTS_FILE column. return no external counts
+        if("GENE_COUNTS_FILE" not in self.sampleAnnotation.SAMPLE_ANNOTATION_COLUMNS):
+            extCountFiles = []
+        else:
+            extCountFiles = self.sampleAnnotation.getImportCountFiles(annotation, group, file_type="GENE_COUNTS_FILE")
+            count_files.extend(extCountFiles)
         return count_files
 
     def getCountParams(self, rnaID):

diff --git a/drop/modules/aberrant-expression-pipeline/Counting/filterCounts.R b/drop/modules/aberrant-expression-pipeline/Counting/filterCounts.R
@@ -36,7 +36,17 @@ ods <- filterExpression(ods, gtfFile=txdb, filter=FALSE,
 # add column for genes with at least 1 gene
 rowData(ods)$counted1sample = rowSums(assay(ods)) > 0
 
-has_external <- !(all(ods@colData$GENE_COUNTS_FILE == "") || is.null(ods@colData$GENE_COUNTS_FILE))
+# External data check
+if (is.null(ods@colData$GENE_COUNTS_FILE)){ #column does not exist in sample annotation table
+    has_external <- FALSE
+}else if(all(is.na(ods@colData$GENE_COUNTS_FILE))){ #column exists but it has no values
+    has_external <- FALSE
+}else if(all(ods@colData$GENE_COUNTS_FILE == "")){ #column exists with non-NA values but this group has all empty strings
+    has_external <- FALSE
+}else{ #column exists with non-NA values and this group has at least 1 non-empty string
+    has_external <- TRUE
+}
+
 if(has_external){
     ods@colData$isExternal <- as.factor(ods@colData$GENE_COUNTS_FILE != "")
 }else{

diff --git a/drop/modules/aberrant-expression-pipeline/Counting/mergeCounts.R b/drop/modules/aberrant-expression-pipeline/Counting/mergeCounts.R
@@ -56,10 +56,10 @@ total_counts <- SummarizedExperiment(assays=list(counts=merged_assays))
 rowRanges(total_counts) <- count_ranges
 
 # Add sample annotation data (colData)
-sample_anno <- fread(snakemake@config$sampleAnnotation)
+sample_anno <- fread(snakemake@config$sampleAnnotation,
+                    colClasses = c(RNA_ID = 'character', DNA_ID = 'character'))
 sample_anno <- sample_anno[, .SD[1], by = RNA_ID]
 col_data <- data.table(RNA_ID = as.character(colnames(total_counts)))
-sample_anno[, RNA_ID := as.character(RNA_ID)]
 col_data <- left_join(col_data, sample_anno, by = "RNA_ID")
 rownames(col_data) <- col_data$RNA_ID
 colData(total_counts) <- as(col_data, "DataFrame")

diff --git a/drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R b/drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R
@@ -55,7 +55,8 @@ if(!is.null(gene_annot_dt$gene_name)){
 }
 
 # Add HPO terms, requires online connection and for there to be annotated HPO terms
-sa <- fread(snakemake@config$sampleAnnotation)
+sa <- fread(snakemake@config$sampleAnnotation, 
+              colClasses = c(RNA_ID = 'character', DNA_ID = 'character'))
 if(!is.null(sa$HPO_TERMS) & nrow(res) > 0){
   if(!all(is.na(sa$HPO_TERMS)) & ! all(sa$HPO_TERMS == '')){
     res <- add_HPO_cols(res, hpo_file = snakemake@params$hpoFile)

diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/exportCounts.R b/drop/modules/aberrant-splicing-pipeline/Counting/exportCounts.R
@@ -10,7 +10,7 @@
 #'  input:
 #'   - annotation: '`sm cfg.getProcessedDataDir() + "/preprocess/{annotation}/txdb.db"`'
 #'   - fds_theta: '`sm cfg.getProcessedDataDir() + 
-#'                    "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/theta.h5"`'
+#'                    "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/theta.h5"`'
 #'  output:
 #'    - k_counts: '`sm expand(cfg.exportCounts.getFilePattern(str_=True, expandStr=True) + "/k_{metric}_counts.tsv.gz", metric=["j", "theta"])`'
 #'    - n_counts: '`sm expand(cfg.exportCounts.getFilePattern(str_=True, expandStr=True) + "/n_{metric}_counts.tsv.gz", metric=["psi5", "psi3", "theta"])`'

diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R
@@ -81,8 +81,7 @@ if(nrow(res_junc_dt) > 0){
 
   # add colData to the results
   res_junc_dt <- merge(res_junc_dt, as.data.table(colData(fds)), by = "sampleID")
-  res_junc_dt[, c("bamFile", "pairedEnd") := NULL]
-  setnames(res_junc_dt, 'STRAND', 'STRAND_SPECIFIC')  # otherwise it's confusing with the 'strand' column from the junction
+  res_junc_dt[, c("bamFile", "pairedEnd", "STRAND") := NULL]
 } else{
   warning("The aberrant splicing pipeline gave 0 results for the ", dataset, " dataset.")
 }
@@ -91,11 +90,11 @@ if(nrow(res_junc_dt) > 0){
 if(length(res_junc) > 0){
   res_genes_dt <- resultsByGenes(res_junc) %>% as.data.table
   res_genes_dt <- merge(res_genes_dt, as.data.table(colData(fds)), by = "sampleID")
-  res_genes_dt[, c("bamFile", "pairedEnd") := NULL]
-  setnames(res_genes_dt, 'STRAND', 'STRAND_SPECIFIC')
-
+  res_genes_dt[, c("bamFile", "pairedEnd", "STRAND") := NULL]
+
   # add HPO overlap information
-  sa <- fread(snakemake@config$sampleAnnotation)
+  sa <- fread(snakemake@config$sampleAnnotation, 
+              colClasses = c(RNA_ID = 'character', DNA_ID = 'character'))
   if(!is.null(sa$HPO_TERMS)){
     if(!all(is.na(sa$HPO_TERMS)) & ! all(sa$HPO_TERMS == '')){
       res_genes_dt <- add_HPO_cols(res_genes_dt, hpo_file = snakemake@params$hpoFile)

diff --git a/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R b/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R
@@ -59,7 +59,8 @@ ggplot(melt_mat, aes(value)) + geom_histogram(fill = 'cadetblue4', binwidth = 0.
 #' * Is the sample a relative of the other?
 #' 
 
-sa <- fread(snakemake@config$sampleAnnotation)[, .(DNA_ID, RNA_ID)]
+sa <- fread(snakemake@config$sampleAnnotation, 
+            colClasses = c(RNA_ID = 'character', DNA_ID = 'character'))[, .(DNA_ID, RNA_ID)]
 sa[, ANNOTATED_MATCH := TRUE]
 colnames(melt_mat)[1:2] <- c('DNA_ID', 'RNA_ID')
 

diff --git a/drop/modules/mae-pipeline/QC/create_matrix_dna_rna_cor.R b/drop/modules/mae-pipeline/QC/create_matrix_dna_rna_cor.R
@@ -27,14 +27,15 @@ suppressPackageStartupMessages({
 })
 
 register(MulticoreParam(snakemake@threads))
-sa <- fread(snakemake@config$sampleAnnotation)
+sa <- fread(snakemake@config$sampleAnnotation, 
+            colClasses = c(RNA_ID = 'character', DNA_ID = 'character'))
 
 # Read the test vcf as GRanges
 gr_test <- readVcf(snakemake@config$mae$qcVcf) %>% granges()
 mcols(gr_test)$GT <- "0/0"
 
 # Obtain the rna and vcf files
-rna_samples <- snakemake@params$rnaIds
+rna_samples <- as.character(snakemake@params$rnaIds)
 mae_res <- snakemake@input$mae_res
 
 vcf_cols <- sa[RNA_ID %in% rna_samples, .(DNA_ID, DNA_VCF_FILE)] %>% unique %>% na.omit()
@@ -46,7 +47,7 @@ N <- length(vcf_files)
 lp <- bplapply(1:N, function(i){
 
   # Read sample vcf file
-  sample <- wes_samples[i]
+  sample <- wes_samples[i] %>% as.character()
   param <-  ScanVcfParam(fixed=NA, info='NT', geno='GT', samples=sample, trimEmpty=TRUE) 
   vcf_sample <- readVcf(vcf_files[i], param = param, row.names = FALSE)
   # Get GRanges and add Genotype

diff --git a/drop/template/config.yaml b/drop/template/config.yaml
@@ -1,10 +1,10 @@
 projectTitle: "DROP: Detection of RNA Outliers Pipeline"
-root:             # root directory of all intermediate output
+root:             # root directory of all output objects and tables
 htmlOutputPath:   # path for HTML rendered reports
 indexWithFolderName: true # whether the root base name should be part of the index name
 
 hpoFile: null  # if null, downloads it from webserver
-sampleAnnotation: # path to sample annotation (see documenation on how to create it)
+sampleAnnotation: # path to sample annotation (see documentation on how to create it)
 
 root: Output
 sampleAnnotation: Data/sample_annotation.tsv
@@ -14,18 +14,18 @@ genomeAssembly: hg19
 genome: Data/hg19.fa
 
 hpoFile: null
-random_seed: true  # just for demo data, remove for proper analysis
+random_seed: true  # just for demo data, remove for analysis
 
 exportCounts:
     # specify which gene annotations to include and which
     # groups to exclude when exporting counts
     geneAnnotations:
         - v29
-    excludeGroups
+    excludeGroups:
         - group1
-genome: # path to genome sequence in fasta format.
-    # You can define multiple reference genomes in yaml format, ncbi: path_to_ncbi, ucsc: path_to_ucsc
-    # the keywords that define the path should be in the GENOME column of the SA table
+genome: # path to reference genome sequence in fasta format.
+    # You can define multiple reference genomes in yaml format, ncbi: path/to/ncbi, ucsc: path/to/ucsc
+    # the keywords that define the path should be in the GENOME column of the sample annotation table
 
 aberrantExpression:
     run: true

diff --git a/drop/utils.py b/drop/utils.py
@@ -86,6 +86,8 @@ def subsetBy(df, column, values):
     if not isinstance(values, str) :
         inner_regex = "(" + "|".join(values) + ")"
 
+    if df[column].isnull().all():
+        return df[[False for i in range(df.shape[0])]]
     return  df[df[column].str.contains("(?:^|,)" + inner_regex + "(?:,|$)", na = False)]
 
 def deep_merge_dict(dict1: dict, dict2: dict, inplace: bool = False):

diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.2.1
+current_version = 1.2.2
 
 commit = True
 

diff --git a/setup.py b/setup.py
@@ -21,7 +21,7 @@
 
 setuptools.setup(
     name="drop",
-    version="1.2.1",
+    version="1.2.2",
 
     author="Vicente A. Yépez, Michaela Müller, Nicholas H. Smith, Daniela Klaproth-Andrade, Luise Schuller, Ines Scheller, Christian Mertes <mertes@in.tum.de>, Julien Gagneur <gagneur@in.tum.de>",
     author_email="yepez@in.tum.de",