Skip to content

Commit

Permalink
Merge pull request #17 from TBradley27/v1.3.0-beta
Browse files Browse the repository at this point in the history
v1.3.0 beta
  • Loading branch information
TBradley27 committed Sep 22, 2020
2 parents 05dbfcd + e205c79 commit 18933df
Show file tree
Hide file tree
Showing 15 changed files with 133 additions and 73 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@ reports/fastqc/
*.rds
*.pl
PCT_parameters/
*.out
4 changes: 2 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ python: "3.6"

branches:
only:
- master
- beta3

os:
- linux

Expand All @@ -25,7 +25,7 @@ install:
- conda config --add channels r
- conda config --add channels conda-forge
- conda config --add channels bioconda
- travis_wait conda create -q -n test-environment snakemake r r-devtools perl-app-cpanminus python=$TRAVIS_PYTHON_VERSION
- travis_wait conda create -q -n test-environment snakemake r r-devtools perl-app-cpanminus perl=5.26.2=h470a237_0 python=$TRAVIS_PYTHON_VERSION

script:
- source activate test-environment
Expand Down
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
[![Build Status](https://travis-ci.com/TBradley27/FilTar.svg?token=ZSHcZ6Hizcm8MJsG95AA&branch=master)](https://travis-ci.com/TBradley27/FilTar/)
[![GitHub release](https://img.shields.io/github/release/TBradley27/FilTar.svg)](https://github.com/TBradley27/FilTar/releases/)
[![Snakemake](https://img.shields.io/badge/snakemake-≥5.4.0-brightgreen.svg?style=flat)](https://snakemake.readthedocs.io)
[![Snakemake](https://img.shields.io/badge/snakemake-≥5.24.0-brightgreen.svg?style=flat)](https://snakemake.readthedocs.io)

# FilTar

Expand Down
14 changes: 13 additions & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,14 @@
#along with this program. If not, see <https://www.gnu.org/licenses/>.

import re
import pandas

# import metadata
metadata = pandas.read_table("metadata.tsv")

configfile: "config/basic.yaml"
configfile: "config/species.yaml"
configfile: "config/dependencies.yaml"

if config['reannotation'] == True:
include: "modules/with_reannotation/Snakefile"
Expand All @@ -43,6 +48,13 @@ elif config['sequence_data_source'] == 'User':
else:
raise Exception("\nPlease enter a value of either 'ENA' or 'SRA' or 'User' for the 'sequence_data_source' key. Default values can be set in config/basic.yaml\n")

if config['prediction_algorithm'] == 'TargetScan7':
pass
elif config['prediction_algorithm'] == 'miRanda':
pass
else:
raise Exception("\nPlease enter a valid name for a miRNA target prediction algorithm. Choose either 'TargetScan7' or 'miRanda'\n")

if config['prediction_algorithm'] == 'TargetScan7' and config['reannotation'] == True:
include: "modules/target_prediction/targetscan/Snakefile"
include: "modules/target_prediction/targetscan/with_reannotation/Snakefile"
Expand Down Expand Up @@ -73,7 +85,7 @@ for transcript in list(config['transcripts']):
raise Exception('\nInvalid transcript identifier "{}". Identifiers must adhere to official Ensembl identifier patterns e.g. "ENSMUST00000189888.6". Please revise.\n'.format(transcript))

include: "modules/data_download/Snakefile"
include: "modules/trim_reads/trim_galore/Snakefile"
include: "modules/trim_reads/Snakefile"
include: "modules/quant_reads/salmon/Snakefile"
include: "modules/mirna/Snakefile"
include: 'modules/get_target_coordinates/Snakefile'
Expand Down
9 changes: 0 additions & 9 deletions config/basic.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,3 @@
tissues:
mmu:
oocyte: [SRS540320]
samples:
SRS540320: [SRR1137901,SRR1137902]

single_end: [SRR1137901,SRR1137902]
paired_end: []

mirnas: [mmu-miR-188-5p] # if blank, all miRNA for a given species are used
transcripts: [] # if blank, all available transcripts for a given species are used

Expand Down
46 changes: 46 additions & 0 deletions config/dependencies.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# trim galore

trim_galore.length: 35
trim_galore.stringency: 4

# salmon

salmon.index_threads: 8 # the number of threads requested for the transcriptome indexing step of salmon
salmon.quant_threads: 4 # the number of threads requested for the transcript quantification step of salmon

# read-to-genome mapping

hisat2.num_threads_for_indexing: 16 # the number of threads to use for indexing a genome
hisat2.num_threads_for_mapping: 8 # the number of threads to use for mapping reads to a genome
samtools.view.num_threads: 8 # the number of threads to use when converting sam files to bam format
samtools.sort.num_threads: 8 # the number of threads to use when using samtools sort

# APAtrap

## Identify distal 3'UTRs for each transcript

APAtrap.utr_extension_size: 10000 # The limit to which the 3'UTR could potentially be extended
APAtrap.window_size: 100 # size of the scanning window
APAtrap.min_window_coverage: 0.05 # min coverage expected in the 3'UTR for each nucleotide, as a proportion of the coverage in the coding region of the transcript
APAtrap.min_proportion_of_valid_nucleotides_in_window: 0.80 # scanning window stops on 3'UTR if this criterion is not met

## APA prediction on reannotated 3'UTRs

APAtrap.min_cov_variation_between_APA_sites: 0.2 # 'The minimum degree of coverage variation between two adjacent APA sites' for those two sites to be called as distinct
APAtrap.min_average_cov: 20 # 'The minimum average coverage required for each called 3'UTR'. Must be 10 or greater
APAtrap.min_distance_between_APA_sites: 100 # 'The minimum distance between the predicted APA sites'. Must be 20 or greater.
APAtrap.predictAPA_window_size: 50 # 'Window size used to scan the profile'. Must be 20 or greater

# miRNA target prediction

## miRanda

miRanda.strict: False # 'demand strict 5' seed pairing'
miRanda.minimum_alignment_score: 140.0 # the minimum alignment score between miRNA and target for the interaction to be reported in the output
miRanda.minimum_energy_score: 1.0 # the minimum (absolute) energy score for the alignment to be reported. units: kcal/mol, sign: (-)
miRanda.5_prime_3_prime_scaling_factor: 4.0 # a scaling factor accounting for the greater importance of the miRNA 5' end (compared to the 3' end) when generating alignments.
miRanda.alignment_gap_open_penalty: -4.0
miRanda.alignment_gap_extension_penalty: -9.0



3 changes: 3 additions & 0 deletions metadata.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
species biological_context sample_accession run_accession pe_or_se
mmu oocyte SRS540320 SRR1137901 single_end
mmu oocyte SRS540320 SRR1137902 single_end
13 changes: 2 additions & 11 deletions modules/data_download/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,5 @@ rule decompress_cdna_file:
shell: "gunzip {input}"

rule download_first_APAtrap_script:
output: 'scripts/identifyDistal3UTR.pl'
shell: 'wget https://sourceforge.net/projects/apatrap/files/Source%20Codes/identifyDistal3UTR.pl/download && mv download {output} && chmod +x {output}'

rule download_second_APAtrap_script:
output: 'scripts/predictAPA.pl'
shell: 'wget https://sourceforge.net/projects/apatrap/files/Source%20Codes/predictAPA.pl/download && mv download {output} && chmod +x {output}'





output: 'scripts/APAtrap/identifyDistal3UTR', 'scripts/APAtrap/predictAPA'
shell: 'wget https://sourceforge.net/projects/apatrap/files/APAtrap_Linux.zip/download && mv download scripts && unzip -d scripts/ scripts/download'
32 changes: 24 additions & 8 deletions modules/quant_reads/salmon/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ rule salmon_index:
get_cDNA_file
output:
directory("results/salmon/indexes/{species}")
threads: 8
threads: config['salmon.index_threads']
conda: "envs/salmon.yaml"
shell:
"salmon index --threads {threads} -t {input} -i {output} --type quasi -k 31"
Expand All @@ -35,18 +35,18 @@ rule salmon_index_for_lib_types:
get_cDNA_file
output:
directory("results/salmon/indexes/lib_type_identification/{species}")
threads: 8
threads: config['salmon.index_threads']
conda: "envs/salmon.yaml"
shell:
"salmon index --threads {threads} -t {input} -i {output} --type quasi -k 31"

def get_input_files (wildcards):
if wildcards.run_accession in config['paired_end']:
if wildcards.run_accession in set(metadata.loc[(metadata['pe_or_se'] == 'paired_end')]['run_accession']):
input_files = ['results/trimmed_fastq/{}_1_val_1.fq.gz'.format(wildcards.run_accession),
'results/trimmed_fastq/{}_2_val_2.fq.gz'.format(wildcards.run_accession)
]
return(input_files)
elif wildcards.run_accession in config['single_end']:
elif wildcards.run_accession in set(metadata.loc[(metadata['pe_or_se'] == 'single_end')]['run_accession']):
input_file = ["results/trimmed_fastq/{}_trimmed.fq.gz".format(wildcards.run_accession)
]
return(input_file)
Expand All @@ -62,7 +62,7 @@ rule salmon_quant_lib_type:
"envs/salmon.yaml"
wildcard_constraints:
run_accession="((?!lib_type).)*" # excludes substring 'lib_type'
threads: 4
threads: config['salmon.quant_threads']
script:
"quant_salmon.py"

Expand All @@ -75,7 +75,7 @@ rule salmon_quant:
"envs/salmon.yaml"
wildcard_constraints:
run_accession="((?!lib_type).)*"
threads: 4
threads: config['salmon.quant_threads']
script:
"quant_salmon.py"

Expand All @@ -86,8 +86,16 @@ rule salmon_get_lib_type:
run_accession="((?!lib_type).)*"
shell: "grep 'expected' {input}/lib_format_counts.json | awk '{{print $2}}' | sed 's/\"//g' | sed 's/,//g' > {output}"

def get_salmon_run_directory_names(wildcards):
    """Return the salmon run directories that belong to one sample.

    Selects the rows of the module-level ``metadata`` table whose ``species``
    and ``sample_accession`` columns equal ``wildcards.species`` and
    ``wildcards.sample``, then expands one
    ``results/salmon/runs/{species}/{run_accession}`` path per distinct run
    accession found in those rows.
    """
    matching_rows = (metadata['species'] == wildcards.species) & (metadata['sample_accession'] == wildcards.sample)

    # De-duplicate while keeping the order of the metadata table. A plain
    # set() of strings iterates in an order that can change between
    # interpreter runs, which made the order of the returned paths (and of
    # any command line built from them) non-reproducible.
    run_accessions = metadata.loc[matching_rows, 'run_accession'].drop_duplicates().tolist()

    return expand('results/salmon/runs/{species}/{run_accession}', species=wildcards.species, run_accession=run_accessions)

rule salmon_quantmerge_by_run:
input: lambda wildcards: expand("results/salmon/runs/{species}/{run_accession}", run_accession=config['samples'][wildcards.sample], species=wildcards.species)
input: get_salmon_run_directory_names
output: temp("results/salmon/samples/{species}/{sample}.quant.tmp.sf")
conda: "envs/salmon.yaml"
shell: "salmon quantmerge --quants {input} --names {input} -o {output}"
Expand All @@ -99,8 +107,16 @@ rule salmon_average_quantmerge_runs:
sample="((?!quant.tmp).)*" # anything not containing quant.tmp
script: "get_average_quant.R"

def get_salmon_sample_directory_names(wildcards):
    """Return the salmon sample-level paths that belong to one biological context.

    Selects the rows of the module-level ``metadata`` table whose ``species``
    and ``biological_context`` columns equal ``wildcards.species`` and
    ``wildcards.tissue``, then expands one
    ``results/salmon/samples/{species}/{sample}`` path per distinct sample
    accession found in those rows.
    """
    matching_rows = (metadata['species'] == wildcards.species) & (metadata['biological_context'] == wildcards.tissue)

    # De-duplicate while keeping the order of the metadata table. A plain
    # set() of strings iterates in an order that can change between
    # interpreter runs, which made the order of the returned paths (and of
    # any command line built from them) non-reproducible.
    sample_accessions = metadata.loc[matching_rows, 'sample_accession'].drop_duplicates().tolist()

    return expand('results/salmon/samples/{species}/{sample}', species=wildcards.species, sample=sample_accessions)

rule salmon_quantmerge_by_sample:
input: lambda wildcards: expand('results/salmon/samples/{species}/{sample}', sample=config['tissues'][wildcards.species][wildcards.tissue], species=wildcards.species)
input: get_salmon_sample_directory_names
output: temp("results/salmon/{species}/{tissue}.sf.tmp")
conda: "envs/salmon.yaml"
shell: "salmon quantmerge --quants {input} --names {input} -o {output}"
Expand Down
14 changes: 10 additions & 4 deletions modules/target_prediction/miRanda/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ def get_dna_file(wildcards):

return('data/' + sci_species_name + '.' + genome_build + '.dna.chromosome.{}.fa'.format(wildcards.chrom))

def get_miRanda_strict_bool(wildcards):
    """Translate the 'miRanda.strict' config option into a command-line fragment.

    Returns ' -strict' (with its leading space) when the option is true and
    an empty string when it is false, so the result can be spliced directly
    into a shell command. ``wildcards`` is accepted because the function is
    used as a rule parameter callback; it is not read.

    Raises:
        Exception: if the configured value is neither true nor false, for
            example the quoted string 'True' instead of a YAML boolean.
    """
    strict = config['miRanda.strict']

    # Equality rather than identity is kept on purpose so that every value
    # accepted before this change (including 1 and 0) is still accepted.
    if strict == True:  # noqa: E712
        return ' -strict'
    if strict == False:  # noqa: E712
        return ''

    # The previous message quoted 'True'/'False', which is exactly the input
    # that ends up here; ask for an unquoted boolean and show what was given.
    raise Exception(
        "Value for 'miRanda.strict' config option must be True or False "
        "(an unquoted boolean); got {!r}".format(strict)
    )

rule fix_fasta_output:
input: "results/msa/{species}_{tissue}_chr{chrom}_3UTR.tmp2.fa"
output: temp("results/msa/{species}_{tissue}_chr{chrom}.fa")
Expand All @@ -31,7 +39,8 @@ rule miRanda:
mirna="data/mirbase_mature_{species}.filtered.fa"
output: temp("results/targets/miRanda/{species}/{tissue}_chr{chrom}.txt")
conda: "envs/miRanda.yaml"
shell: "miranda {input.mirna} {input.utr} > {output}"
params: get_miRanda_strict_bool
shell: "miranda {input.mirna} {input.utr} {params} -sc {config[miRanda.minimum_alignment_score]} -en {config[miRanda.minimum_energy_score]} -scale {config[miRanda.5_prime_3_prime_scaling_factor]} -go {config[miRanda.alignment_gap_open_penalty]} -ge {config[miRanda.alignment_gap_extension_penalty]} > {output}"

rule convert_miRanda_to_tsv:
input: "results/targets/miRanda/{species}/{tissue}_chr{chrom}.txt"
Expand All @@ -49,6 +58,3 @@ rule add_miRanda_header:
input: "results/targets/miRanda/{species}/{tissue}.temp.tsv"
output: temp("results/targets/miRanda/{species}/{tissue}.temp2.tsv")
script: "add_miRanda_header.R"



Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ rule trim_single_end_reads:
conda:
"envs/trim-galore.yaml"
shell:
"trim_galore --output_dir results/trimmed_fastq/ --length 35 --stringency 4 {input}"
"trim_galore --output_dir results/trimmed_fastq/ --length {config[trim_galore.length]} --stringency {config[trim_galore.stringency]} {input}"

rule trim_paired_end_reads:
input:
Expand All @@ -44,4 +44,4 @@ rule trim_paired_end_reads:
conda:
"envs/trim-galore.yaml"
shell:
"trim_galore --output_dir results/trimmed_fastq/ --length 35 --stringency 4 --paired {input[0]} {input[1]}"
"trim_galore --output_dir results/trimmed_fastq/ --length {config[trim_galore.length]} --stringency {config[trim_galore.stringency]} --paired {input[0]} {input[1]}"
14 changes: 0 additions & 14 deletions modules/trim_reads/cutadapt/Snakefile

This file was deleted.

File renamed without changes.
35 changes: 22 additions & 13 deletions modules/with_reannotation/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@ def get_bed6_file(wildcards):

return ( 'results/' + sci_species_name + '.' + genome_build + '.{}.chr{}.filtered.bed6'.format(config['ensembl_release'], wildcards.chrom) )

def get_sample_level_bedgraph_file_names(wildcards):
    """Return the sample-level bedgraph paths for one biological context and chromosome.

    Selects the rows of the module-level ``metadata`` table whose ``species``
    and ``biological_context`` columns equal ``wildcards.species`` and
    ``wildcards.cell_line``, then expands one
    ``results/bam/sample/{species}/{sample}_chr{chrom}.bedgraph`` path per
    distinct sample accession, using ``wildcards.chrom`` for the chromosome.
    """
    matching_rows = (metadata['species'] == wildcards.species) & (metadata['biological_context'] == wildcards.cell_line)

    # De-duplicate while keeping the order of the metadata table. A plain
    # set() of strings iterates in an order that can change between
    # interpreter runs, which made the order of the returned paths (and of
    # any command line built from them) non-reproducible.
    sample_accessions = metadata.loc[matching_rows, 'sample_accession'].drop_duplicates().tolist()

    return expand("results/bam/sample/{species}/{sample}_chr{chrom}.bedgraph", species=wildcards.species, sample=sample_accessions, chrom=wildcards.chrom)

rule get_bed6_file: #three_prime_utrs only
input:
script="scripts/gtf_to_bed.sh",
Expand Down Expand Up @@ -91,9 +99,16 @@ rule split_bedgraph:
output: 'results/bam/run/{species}/{accession}_chr{chrom}.bedgraph'
shell: 'grep -E "^{wildcards.chrom}\s" {input} > {output}'

def get_run_level_bedgraph_file_names(wildcards):
    """Return the run-level bedgraph paths for one sample and chromosome.

    Selects the rows of the module-level ``metadata`` table whose ``species``
    and ``sample_accession`` columns equal ``wildcards.species`` and
    ``wildcards.sample``, then expands one
    ``results/bam/run/{species}/{accession}_chr{chrom}.bedgraph`` path per
    distinct run accession, using ``wildcards.chrom`` for the chromosome.
    """
    matching_rows = (metadata['species'] == wildcards.species) & (metadata['sample_accession'] == wildcards.sample)

    # De-duplicate while keeping the order of the metadata table. A plain
    # set() of strings iterates in an order that can change between
    # interpreter runs, which made the order of the returned paths (and of
    # any command line built from them) non-reproducible.
    run_accessions = metadata.loc[matching_rows, 'run_accession'].drop_duplicates().tolist()

    return expand("results/bam/run/{species}/{accession}_chr{chrom}.bedgraph", species=wildcards.species, accession=run_accessions, chrom=wildcards.chrom)

rule merge_bedgraphs_by_run:
input:
lambda wildcards: expand("results/bam/run/{species}/{accession}_chr{chrom}.bedgraph", species=wildcards.species, accession=config['samples'][wildcards.sample], chrom=wildcards.chrom)
input: get_run_level_bedgraph_file_names
output:
temp("results/bam/sample/{species}/{sample}_chr{chrom}_tmp.bedgraph")
wildcard_constraints:
Expand All @@ -113,8 +128,7 @@ rule avg_merged_bedgraph_by_run:
"get_average_bedgraph.R"

rule merge_bedgraphs_by_sample:
input:
lambda wildcards: expand("results/bam/sample/{species}/{sample}_chr{chrom}.bedgraph", species=wildcards.species, sample=config['tissues'][wildcards.species][wildcards.cell_line], chrom=wildcards.chrom)
input: get_sample_level_bedgraph_file_names
output:
temp("results/bam/tissue/{species}/{cell_line}_chr{chrom}.bedgraph.tmp")
wildcard_constraints:
Expand Down Expand Up @@ -145,17 +159,12 @@ rule filter_bed12:

rule reannotate_3utrs:
input:
script="scripts/identifyDistal3UTR.pl",
script="scripts/APAtrap/identifyDistal3UTR",
bed=get_bed_file,
bedgraphs= 'results/bam/tissue/{species}/{cell_line}_chr{chrom}.bedgraph'
output:
temp("results/bed/{species}_{cell_line}_chr{chrom}.utr.bed")
params:
percentage_cutoff=0.80,
coverage_cutoff=0.05,
window_size=100
shell:
"perl {input.script} -i {input.bedgraphs} -p {params.percentage_cutoff} -c {params.coverage_cutoff} -w {params.window_size} -m {input.bed} -o {output}"
shell: "./{input.script} -i {input.bedgraphs} -p {config[APAtrap.min_proportion_of_valid_nucleotides_in_window]} -c {config[APAtrap.min_window_coverage]} -w {config[APAtrap.window_size]} -e {config[APAtrap.utr_extension_size]} -m {input.bed} -o {output}"

rule get_extended_bed_file:
input:
Expand All @@ -177,13 +186,13 @@ rule aggregrate_extended_bed_files:

rule identify_APA_sites:
input:
script="scripts/predictAPA.pl",
script="scripts/APAtrap/predictAPA",
bedgraphs= "results/bam/tissue/{species}/{tissue}_chr{chrom}.bedgraph",
bed="results/bed/{species}_{tissue}_chr{chrom}.utr.bed"
output:
temp("results/targets/{species}_{tissue}_chr{chrom}.APA.txt")
shell:
"perl {input.script} -i {input.bedgraphs} -g 1 -n 1 -u {input.bed} -o {output}"
"./{input.script} -i {input.bedgraphs} -g 1 -n 1 -d {config[APAtrap.min_cov_variation_between_APA_sites]} -c {config[APAtrap.min_average_cov]} -a {config[APAtrap.min_distance_between_APA_sites]} -w {config[APAtrap.predictAPA_window_size]} -u {input.bed} -o {output}"

rule aggregate_APA_sites:
input: lambda wildcards: expand("results/targets/{species}_{tissue}_chr{chrom}.APA.txt", chrom=config['chromosomes'][wildcards.species], species=wildcards.species, tissue=wildcards.tissue)
Expand Down
Loading

0 comments on commit 18933df

Please sign in to comment.