Skip to content

Commit

Permalink
Dev (#352)
Browse files Browse the repository at this point in the history
* add acknowledgements

* Update README.md

* Update config.yaml

* as character sample from vcf

* Fix typo in exportCounts input.

Rename raw-{dataset} to raw-local-{dataset}. There is no rule that produces raw-{dataset} as output.

* remove STRAND column from results

* bug fix for AE import count reading

* fix sample anno character style

* fix sample anno character style

* fix sample anno character style

* fix sample anno character style

* fix sample anno character style

* external counts NAs handling

* Update filterCounts.R

* version bump

Co-authored-by: Vicente Yepez <30469316+vyepez88@users.noreply.github.com>
Co-authored-by: Spencer Skylar Chan <54919210+schance995@users.noreply.github.com>
Co-authored-by: Smith Nicholas <smith@in.tum.de>
  • Loading branch information
4 people committed Aug 3, 2022
1 parent a00f59c commit 257cb0f
Show file tree
Hide file tree
Showing 17 changed files with 52 additions and 33 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ The manuscript is available in [Nature Protocols](https://www.nature.com/article


## What's new
Version 1.2.2 fixes some critical bugs that affected the performance of the `aberrantExpression` pipeline, and allows sample IDs to be numeric.

As of version 1.2.1 DROP has a new module that performs RNA-seq variant calling. The input are BAM files and the output either a single-sample or a multi-sample VCF file (option specified by the user) annotated with allele frequencies from gnomAD (if specified by the user). The sample annotation table does not need to be changed, but several new parameters in the config file have to be added and tuned. For more info, refer to the [documentation](https://gagneurlab-drop.readthedocs.io/en/latest/prepare.html#rna-variant-calling-dictionary).

Also, the integration of external split and non-split counts to detect aberrant splicing is now possible. Simply specify in a new column in the sample annotation the directory containing the counts. For more info, refer to the [documentation](https://gagneurlab-drop.readthedocs.io/en/latest/prepare.html#external-count-examples).
Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
author = 'Michaela Müller'

# The full version, including alpha/beta/rc tags
release_ = '1.2.1'
release_ = '1.2.2'



Expand Down
2 changes: 1 addition & 1 deletion drop/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@
from . import utils
from . import demo

__version__ = "1.2.1"
__version__ = "1.2.2"

2 changes: 1 addition & 1 deletion drop/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

@click.group()
@click_log.simple_verbosity_option(logger)
@click.version_option('1.2.1',prog_name='drop')
@click.version_option('1.2.2',prog_name='drop')


def main():
Expand Down
4 changes: 3 additions & 1 deletion drop/config/SampleAnnotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,9 @@ def subsetSampleAnnotation(self, column, values, subset=None):
# check if column is valid
elif column not in sa_cols:
raise KeyError(f"Column '{column}' not present in sample annotation.")
# return empty subset if empty
elif subset.empty:
return subset
#subset column for matching values
else:
return utils.subsetBy(subset, column, values)
Expand Down Expand Up @@ -272,7 +275,6 @@ def getImportCountFiles(self, annotation, group, file_type: str = "GENE_COUNTS_F
#subset for the annotation_key in the annotation group and the group_key in the group
subset = self.subsetSampleAnnotation(annotation_key, annotation)
subset = self.subsetSampleAnnotation(group_key, group, subset)

ans = subset[file_type].tolist()
if asSet:
ans = set(ans)
Expand Down
11 changes: 6 additions & 5 deletions drop/config/submodules/AberrantExpression.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,16 @@ def getCountFiles(self, annotation, group):
:return: list of files
"""

# if sample annotation table does not contain GENE_COUNTS_FILE column. return no external counts
if("GENE_COUNTS_FILE" not in self.sampleAnnotation.SAMPLE_ANNOTATION_COLUMNS):
return []

bam_IDs = self.sampleAnnotation.getIDsByGroup(group, assay="RNA")
file_stump = self.processedDataDir / "aberrant_expression" / annotation / "counts" / "{sampleID}.Rds"
count_files = expand(str(file_stump), sampleID=bam_IDs)
extCountFiles = self.sampleAnnotation.getImportCountFiles(annotation, group, file_type="GENE_COUNTS_FILE")
count_files.extend(extCountFiles)
# if sample annotation table does not contain GENE_COUNTS_FILE column. return no external counts
if("GENE_COUNTS_FILE" not in self.sampleAnnotation.SAMPLE_ANNOTATION_COLUMNS):
extCountFiles = []
else:
extCountFiles = self.sampleAnnotation.getImportCountFiles(annotation, group, file_type="GENE_COUNTS_FILE")
count_files.extend(extCountFiles)
return count_files

def getCountParams(self, rnaID):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,17 @@ ods <- filterExpression(ods, gtfFile=txdb, filter=FALSE,
# add column for genes with at least 1 gene
rowData(ods)$counted1sample = rowSums(assay(ods)) > 0

has_external <- !(all(ods@colData$GENE_COUNTS_FILE == "") || is.null(ods@colData$GENE_COUNTS_FILE))
# External data check
if (is.null(ods@colData$GENE_COUNTS_FILE)){ #column does not exist in sample annotation table
has_external <- FALSE
}else if(all(is.na(ods@colData$GENE_COUNTS_FILE))){ #column exists but it has no values
has_external <- FALSE
}else if(all(ods@colData$GENE_COUNTS_FILE == "")){ #column exists with non-NA values but this group has all empty strings
has_external <- FALSE
}else{ #column exists with non-NA values and this group has at least 1 non-empty string
has_external <- TRUE
}

if(has_external){
ods@colData$isExternal <- as.factor(ods@colData$GENE_COUNTS_FILE != "")
}else{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,10 @@ total_counts <- SummarizedExperiment(assays=list(counts=merged_assays))
rowRanges(total_counts) <- count_ranges

# Add sample annotation data (colData)
sample_anno <- fread(snakemake@config$sampleAnnotation)
sample_anno <- fread(snakemake@config$sampleAnnotation,
colClasses = c(RNA_ID = 'character', DNA_ID = 'character'))
sample_anno <- sample_anno[, .SD[1], by = RNA_ID]
col_data <- data.table(RNA_ID = as.character(colnames(total_counts)))
sample_anno[, RNA_ID := as.character(RNA_ID)]
col_data <- left_join(col_data, sample_anno, by = "RNA_ID")
rownames(col_data) <- col_data$RNA_ID
colData(total_counts) <- as(col_data, "DataFrame")
Expand Down
3 changes: 2 additions & 1 deletion drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ if(!is.null(gene_annot_dt$gene_name)){
}

# Add HPO terms, requires online connection and for there to be annotated HPO terms
sa <- fread(snakemake@config$sampleAnnotation)
sa <- fread(snakemake@config$sampleAnnotation,
colClasses = c(RNA_ID = 'character', DNA_ID = 'character'))
if(!is.null(sa$HPO_TERMS) & nrow(res) > 0){
if(!all(is.na(sa$HPO_TERMS)) & ! all(sa$HPO_TERMS == '')){
res <- add_HPO_cols(res, hpo_file = snakemake@params$hpoFile)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#' input:
#' - annotation: '`sm cfg.getProcessedDataDir() + "/preprocess/{annotation}/txdb.db"`'
#' - fds_theta: '`sm cfg.getProcessedDataDir() +
#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/theta.h5"`'
#' "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/theta.h5"`'
#' output:
#' - k_counts: '`sm expand(cfg.exportCounts.getFilePattern(str_=True, expandStr=True) + "/k_{metric}_counts.tsv.gz", metric=["j", "theta"])`'
#' - n_counts: '`sm expand(cfg.exportCounts.getFilePattern(str_=True, expandStr=True) + "/n_{metric}_counts.tsv.gz", metric=["psi5", "psi3", "theta"])`'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,7 @@ if(nrow(res_junc_dt) > 0){

# add colData to the results
res_junc_dt <- merge(res_junc_dt, as.data.table(colData(fds)), by = "sampleID")
res_junc_dt[, c("bamFile", "pairedEnd") := NULL]
setnames(res_junc_dt, 'STRAND', 'STRAND_SPECIFIC') # otherwise it's confusing with the 'strand' column from the junction
res_junc_dt[, c("bamFile", "pairedEnd", "STRAND") := NULL]
} else{
warning("The aberrant splicing pipeline gave 0 results for the ", dataset, " dataset.")
}
Expand All @@ -91,11 +90,11 @@ if(nrow(res_junc_dt) > 0){
if(length(res_junc) > 0){
res_genes_dt <- resultsByGenes(res_junc) %>% as.data.table
res_genes_dt <- merge(res_genes_dt, as.data.table(colData(fds)), by = "sampleID")
res_genes_dt[, c("bamFile", "pairedEnd") := NULL]
setnames(res_genes_dt, 'STRAND', 'STRAND_SPECIFIC')

res_genes_dt[, c("bamFile", "pairedEnd", "STRAND") := NULL]

# add HPO overlap information
sa <- fread(snakemake@config$sampleAnnotation)
sa <- fread(snakemake@config$sampleAnnotation,
colClasses = c(RNA_ID = 'character', DNA_ID = 'character'))
if(!is.null(sa$HPO_TERMS)){
if(!all(is.na(sa$HPO_TERMS)) & ! all(sa$HPO_TERMS == '')){
res_genes_dt <- add_HPO_cols(res_genes_dt, hpo_file = snakemake@params$hpoFile)
Expand Down
3 changes: 2 additions & 1 deletion drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ ggplot(melt_mat, aes(value)) + geom_histogram(fill = 'cadetblue4', binwidth = 0.
#' * Is the sample a relative of the other?
#'

sa <- fread(snakemake@config$sampleAnnotation)[, .(DNA_ID, RNA_ID)]
sa <- fread(snakemake@config$sampleAnnotation,
colClasses = c(RNA_ID = 'character', DNA_ID = 'character'))[, .(DNA_ID, RNA_ID)]
sa[, ANNOTATED_MATCH := TRUE]
colnames(melt_mat)[1:2] <- c('DNA_ID', 'RNA_ID')

Expand Down
7 changes: 4 additions & 3 deletions drop/modules/mae-pipeline/QC/create_matrix_dna_rna_cor.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,15 @@ suppressPackageStartupMessages({
})

register(MulticoreParam(snakemake@threads))
sa <- fread(snakemake@config$sampleAnnotation)
sa <- fread(snakemake@config$sampleAnnotation,
colClasses = c(RNA_ID = 'character', DNA_ID = 'character'))

# Read the test vcf as GRanges
gr_test <- readVcf(snakemake@config$mae$qcVcf) %>% granges()
mcols(gr_test)$GT <- "0/0"

# Obtain the rna and vcf files
rna_samples <- snakemake@params$rnaIds
rna_samples <- as.character(snakemake@params$rnaIds)
mae_res <- snakemake@input$mae_res

vcf_cols <- sa[RNA_ID %in% rna_samples, .(DNA_ID, DNA_VCF_FILE)] %>% unique %>% na.omit()
Expand All @@ -46,7 +47,7 @@ N <- length(vcf_files)
lp <- bplapply(1:N, function(i){

# Read sample vcf file
sample <- wes_samples[i]
sample <- wes_samples[i] %>% as.character()
param <- ScanVcfParam(fixed=NA, info='NT', geno='GT', samples=sample, trimEmpty=TRUE)
vcf_sample <- readVcf(vcf_files[i], param = param, row.names = FALSE)
# Get GRanges and add Genotype
Expand Down
14 changes: 7 additions & 7 deletions drop/template/config.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
projectTitle: "DROP: Detection of RNA Outliers Pipeline"
root: # root directory of all intermediate output
root: # root directory of all output objects and tables
htmlOutputPath: # path for HTML rendered reports
indexWithFolderName: true # whether the root base name should be part of the index name

hpoFile: null # if null, downloads it from webserver
sampleAnnotation: # path to sample annotation (see documenation on how to create it)
sampleAnnotation: # path to sample annotation (see documentation on how to create it)

root: Output
sampleAnnotation: Data/sample_annotation.tsv
Expand All @@ -14,18 +14,18 @@ genomeAssembly: hg19
genome: Data/hg19.fa

hpoFile: null
random_seed: true # just for demo data, remove for proper analysis
random_seed: true # just for demo data, remove for analysis

exportCounts:
# specify which gene annotations to include and which
# groups to exclude when exporting counts
geneAnnotations:
- v29
excludeGroups
excludeGroups:
- group1
genome: # path to genome sequence in fasta format.
# You can define multiple reference genomes in yaml format, ncbi: path_to_ncbi, ucsc: path_to_ucsc
# the keywords that define the path should be in the GENOME column of the SA table
genome: # path to reference genome sequence in fasta format.
# You can define multiple reference genomes in yaml format, ncbi: path/to/ncbi, ucsc: path/to/ucsc
# the keywords that define the path should be in the GENOME column of the sample annotation table

aberrantExpression:
run: true
Expand Down
2 changes: 2 additions & 0 deletions drop/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ def subsetBy(df, column, values):
if not isinstance(values, str) :
inner_regex = "(" + "|".join(values) + ")"

if df[column].isnull().all():
return df[[False for i in range(df.shape[0])]]
return df[df[column].str.contains("(?:^|,)" + inner_regex + "(?:,|$)", na = False)]

def deep_merge_dict(dict1: dict, dict2: dict, inplace: bool = False):
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 1.2.1
current_version = 1.2.2

commit = True

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

setuptools.setup(
name="drop",
version="1.2.1",
version="1.2.2",

author="Vicente A. Yépez, Michaela Müller, Nicholas H. Smith, Daniela Klaproth-Andrade, Luise Schuller, Ines Scheller, Christian Mertes <mertes@in.tum.de>, Julien Gagneur <gagneur@in.tum.de>",
author_email="yepez@in.tum.de",
Expand Down

0 comments on commit 257cb0f

Please sign in to comment.