Merge pull request #17 from TBradley27/v1.3.0-beta

v1.3.0 beta
TBradley27 · Sep 22, 2020 · 18933df · 18933df
2 parents 05dbfcd + e205c79
commit 18933df
Show file tree

Hide file tree

Showing 15 changed files with 133 additions and 73 deletions.
diff --git a/.gitignore b/.gitignore
@@ -29,3 +29,4 @@ reports/fastqc/
 *.rds
 *.pl
 PCT_parameters/
+*.out
diff --git a/.travis.yml b/.travis.yml
@@ -3,8 +3,8 @@ python: "3.6"
 
 branches:
     only:
-        - master
         - beta3
+
 os:
  - linux
 
@@ -25,7 +25,7 @@ install:
    - conda config --add channels r
    - conda config --add channels conda-forge
    - conda config --add channels bioconda
-   - travis_wait conda create -q -n test-environment snakemake r r-devtools perl-app-cpanminus python=$TRAVIS_PYTHON_VERSION
+   - travis_wait conda create -q -n test-environment snakemake r r-devtools perl-app-cpanminus perl=5.26.2=h470a237_0 python=$TRAVIS_PYTHON_VERSION
 
 script:
    - source activate test-environment

diff --git a/README.md b/README.md
@@ -1,6 +1,5 @@
-[![Build Status](https://travis-ci.com/TBradley27/FilTar.svg?token=ZSHcZ6Hizcm8MJsG95AA&branch=master)](https://travis-ci.com/TBradley27/FilTar/)
 [![GitHub release](https://img.shields.io/github/release/TBradley27/FilTar.svg)](https://github.com/TBradley27/FilTar/releases/)
-[![Snakemake](https://img.shields.io/badge/snakemake-≥5.4.0-brightgreen.svg?style=flat)](https://snakemake.readthedocs.io)
+[![Snakemake](https://img.shields.io/badge/snakemake-≥5.24.0-brightgreen.svg?style=flat)](https://snakemake.readthedocs.io)
 
 # FilTar
 

diff --git a/Snakefile b/Snakefile
@@ -15,9 +15,14 @@
 #along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 import re
+import pandas
+
+# import metadata
+metadata = pandas.read_table("metadata.tsv")
 
 configfile: "config/basic.yaml"
 configfile: "config/species.yaml"
+configfile: "config/dependencies.yaml"
 
 if config['reannotation'] == True:
 	include: "modules/with_reannotation/Snakefile"
@@ -43,6 +48,13 @@ elif config['sequence_data_source'] == 'User':
 else:
 	raise Exception("\nPlease enter a value of either 'ENA' or 'SRA' or 'User' for the 'sequence_data_source' key. Default values can be set in config/basic.yaml\n")
 
+if config['prediction_algorithm'] == 'TargetScan7':
+	pass
+elif config['prediction_algorithm'] == 'miRanda':
+	pass
+else:
+	raise Exception("\nPlease enter a valid name for a miRNA target prediction algorithm. Choose either 'TargetScan7' or 'miRanda'\n")
+
 if config['prediction_algorithm'] == 'TargetScan7' and config['reannotation'] == True:
 	include: "modules/target_prediction/targetscan/Snakefile"
 	include: "modules/target_prediction/targetscan/with_reannotation/Snakefile"
@@ -73,7 +85,7 @@ for transcript in list(config['transcripts']):
 		raise Exception('\nInvalid transcript identifier "{}". Identifiers must adhere to official Ensembl identifier patterns e.g. "ENSMUST00000189888.6". Please revise.\n'.format(transcript))
 
 include: "modules/data_download/Snakefile"
-include: "modules/trim_reads/trim_galore/Snakefile"
+include: "modules/trim_reads/Snakefile"
 include: "modules/quant_reads/salmon/Snakefile"
 include: "modules/mirna/Snakefile"
 include: 'modules/get_target_coordinates/Snakefile'

diff --git a/config/basic.yaml b/config/basic.yaml
@@ -1,12 +1,3 @@
-tissues:
-        mmu: 
-                oocyte: [SRS540320]
-samples:
-         SRS540320: [SRR1137901,SRR1137902]
-
-single_end: [SRR1137901,SRR1137902]
-paired_end: []
-
 mirnas: [mmu-miR-188-5p]                 # if blank, all miRNA for a given species are used 
 transcripts: []                          # if blank, all available transcripts for a given species are used
 

diff --git a/config/dependencies.yaml b/config/dependencies.yaml
@@ -0,0 +1,46 @@
+# trim galore
+
+trim_galore.length: 35
+trim_galore.stringency: 4
+
+# salmon
+
+salmon.index_threads: 8 # the number of threads requested for the transcriptome indexing step of salmon
+salmon.quant_threads: 4 # the number of threads requested for the transcript quantification step of salmon
+
+# read-to-genome mapping
+
+hisat2.num_threads_for_indexing: 16 # the number of threads to use for indexing a genome
+hisat2.num_threads_for_mapping: 8 # the number of threads to use for mapping reads to a genome
+samtools.view.num_threads: 8 # the number of threads to use when converting sam files to bam format
+samtools.sort.num_threads: 8 # the number of threads to use when using samtools sort
+
+# APAtrap
+
+## Identify distal 3'UTRs for each transcript
+
+APAtrap.utr_extension_size: 10000 # The limit to which the 3'UTR could potentially be extended 
+APAtrap.window_size: 100 # size of the scanning window 
+APAtrap.min_window_coverage: 0.05 # min coverage expected in the 3'UTR for each nucleotide, as a proportion of the coverage in the coding region of the transcript
+APAtrap.min_proportion_of_valid_nucleotides_in_window: 0.80 # scanning window stops on 3'UTR if this criterion is not met
+
+## APA prediction on reannotated 3'UTRs
+
+APAtrap.min_cov_variation_between_APA_sites: 0.2 # 'The minimum degree of coverage variation between two adjacent APA sites' for those two sites to be called as distinct
+APAtrap.min_average_cov: 20 # 'The minimum average coverage required for each called 3'UTR'. Must be 10 or greater
+APAtrap.min_distance_between_APA_sites: 100 # 'The minimum distance between the predicted APA sites'. Must be 20 or greater.
+APAtrap.predictAPA_window_size: 50 # 'Window size used to scan the profile'. Must be 20 or greater
+
+# miRNA target prediction
+
+## miRanda
+
+miRanda.strict: False # 'demand strict 5' seed pairing'
+miRanda.minimum_alignment_score: 140.0 # the minimum alignment score between miRNA and target for the interaction to be reported in the output
+miRanda.minimum_energy_score: 1.0 # the minimum (absolute) energy score for the alignment to be reported. units: kcal/mol, sign: (-)
+miRanda.5_prime_3_prime_scaling_factor: 4.0 # a scaling factor accounting for the greater importance of the miRNA 5' end (compared to the 3' end) when generating alignments.
+miRanda.alignment_gap_open_penalty: -4.0 # passed to miranda as '-go'. NOTE(review): miRanda's documented defaults appear to be -9.0 (gap open) and -4.0 (gap extend) - confirm these two values are not swapped
+miRanda.alignment_gap_extension_penalty: -9.0 # passed to miranda as '-ge'; see the review note on the gap-open penalty
+
+
+
diff --git a/metadata.tsv b/metadata.tsv
@@ -0,0 +1,3 @@
+species	biological_context	sample_accession	run_accession	pe_or_se
+mmu	oocyte	SRS540320	SRR1137901	single_end
+mmu	oocyte	SRS540320	SRR1137902	single_end
diff --git a/modules/data_download/Snakefile b/modules/data_download/Snakefile
@@ -93,14 +93,5 @@ rule decompress_cdna_file:
         shell: "gunzip {input}"
 
 rule download_first_APAtrap_script:
-	output: 'scripts/identifyDistal3UTR.pl'
-	shell: 'wget https://sourceforge.net/projects/apatrap/files/Source%20Codes/identifyDistal3UTR.pl/download && mv download {output} && chmod +x {output}'
-
-rule download_second_APAtrap_script:
-	output: 'scripts/predictAPA.pl'
-	shell: 'wget https://sourceforge.net/projects/apatrap/files/Source%20Codes/predictAPA.pl/download && mv download {output} && chmod +x {output}'
-
-
-
-
-
+	output: 'scripts/APAtrap/identifyDistal3UTR', 'scripts/APAtrap/predictAPA'
+	shell: 'wget https://sourceforge.net/projects/apatrap/files/APAtrap_Linux.zip/download && mv download scripts && unzip -d scripts/ scripts/download'
diff --git a/modules/quant_reads/salmon/Snakefile b/modules/quant_reads/salmon/Snakefile
@@ -25,7 +25,7 @@ rule salmon_index:
              get_cDNA_file
         output:
             directory("results/salmon/indexes/{species}")
-        threads: 8
+        threads: config['salmon.index_threads']
         conda: "envs/salmon.yaml"
         shell:
             "salmon index --threads {threads} -t {input} -i {output} --type quasi -k 31"
@@ -35,18 +35,18 @@ rule salmon_index_for_lib_types:
              get_cDNA_file
         output:
             directory("results/salmon/indexes/lib_type_identification/{species}")
-        threads: 8
+        threads: config['salmon.index_threads']
         conda: "envs/salmon.yaml"
         shell:
             "salmon index --threads {threads} -t {input} -i {output} --type quasi -k 31"
 
 def get_input_files (wildcards):
-        if wildcards.run_accession in config['paired_end']:
+        if wildcards.run_accession in set(metadata.loc[(metadata['pe_or_se'] == 'paired_end')]['run_accession']):
                 input_files = ['results/trimmed_fastq/{}_1_val_1.fq.gz'.format(wildcards.run_accession),
                                 'results/trimmed_fastq/{}_2_val_2.fq.gz'.format(wildcards.run_accession)
                                 ]
                 return(input_files)
-        elif wildcards.run_accession in config['single_end']:
+        elif wildcards.run_accession in set(metadata.loc[(metadata['pe_or_se'] == 'single_end')]['run_accession']):
                 input_file = ["results/trimmed_fastq/{}_trimmed.fq.gz".format(wildcards.run_accession)
                              ]
                 return(input_file)
@@ -62,7 +62,7 @@ rule salmon_quant_lib_type:
 		"envs/salmon.yaml"
 	wildcard_constraints:
 		run_accession="((?!lib_type).)*" # excludes substring 'lib_type'
-	threads: 4
+	threads: config['salmon.quant_threads']
 	script:
 		"quant_salmon.py"
 
@@ -75,7 +75,7 @@ rule salmon_quant:
 		"envs/salmon.yaml"
 	wildcard_constraints:
 		run_accession="((?!lib_type).)*"
-	threads: 4
+	threads: config['salmon.quant_threads']
 	script:
 		"quant_salmon.py"
 
@@ -86,8 +86,16 @@ rule salmon_get_lib_type:
 		run_accession="((?!lib_type).)*"
 	shell: "grep 'expected' {input}/lib_format_counts.json | awk '{{print $2}}' | sed 's/\"//g' | sed 's/,//g' > {output}"
 
+def get_salmon_run_directory_names(wildcards):
+	metadata_tmp = metadata.loc[(metadata['species'] == wildcards.species) & (metadata['sample_accession'] == wildcards.sample)]
+	metadata_tmp = metadata_tmp['run_accession']
+	metadata_tmp = set(metadata_tmp)
+
+	file_names = expand('results/salmon/runs/{species}/{run_accession}', species=wildcards.species, run_accession=metadata_tmp)
+	return (file_names)
+
 rule salmon_quantmerge_by_run:
-	input:	lambda wildcards: expand("results/salmon/runs/{species}/{run_accession}", run_accession=config['samples'][wildcards.sample], species=wildcards.species)
+	input:	get_salmon_run_directory_names
 	output: temp("results/salmon/samples/{species}/{sample}.quant.tmp.sf")
         conda: "envs/salmon.yaml"
 	shell: "salmon quantmerge --quants {input} --names {input} -o {output}"
@@ -99,8 +107,16 @@ rule salmon_average_quantmerge_runs:
                sample="((?!quant.tmp).)*" # anything not containing quant.tmp
         script: "get_average_quant.R"
 
+def get_salmon_sample_directory_names(wildcards):
+	metadata_tmp = metadata.loc[(metadata['species'] == wildcards.species) & (metadata['biological_context'] == wildcards.tissue)]
+	metadata_tmp = metadata_tmp['sample_accession']
+	metadata_tmp = set(metadata_tmp)
+
+	file_names = expand('results/salmon/samples/{species}/{sample}', species=wildcards.species, sample=metadata_tmp)
+	return (file_names)
+
 rule salmon_quantmerge_by_sample:
-        input:  lambda wildcards: expand('results/salmon/samples/{species}/{sample}', sample=config['tissues'][wildcards.species][wildcards.tissue], species=wildcards.species)
+        input:  get_salmon_sample_directory_names
         output: temp("results/salmon/{species}/{tissue}.sf.tmp")
         conda: "envs/salmon.yaml"
         shell: "salmon quantmerge --quants {input} --names {input} -o {output}"

diff --git a/modules/target_prediction/miRanda/Snakefile b/modules/target_prediction/miRanda/Snakefile
@@ -20,6 +20,14 @@ def get_dna_file(wildcards):
 
         return('data/' + sci_species_name + '.' + genome_build + '.dna.chromosome.{}.fa'.format(wildcards.chrom))
 
+def get_miRanda_strict_bool(wildcards):
+	if config['miRanda.strict']==True:
+		return (' -strict')
+	elif config['miRanda.strict']==False:
+		return ('')
+	else:
+		raise Exception ("Value for 'miRanda.strict' config option must be 'True' or 'False'")
+
 rule fix_fasta_output:
 	input: "results/msa/{species}_{tissue}_chr{chrom}_3UTR.tmp2.fa"
 	output: temp("results/msa/{species}_{tissue}_chr{chrom}.fa")
@@ -31,7 +39,8 @@ rule miRanda:
 		mirna="data/mirbase_mature_{species}.filtered.fa"
 	output: temp("results/targets/miRanda/{species}/{tissue}_chr{chrom}.txt")
 	conda: "envs/miRanda.yaml"
-	shell: "miranda {input.mirna} {input.utr} > {output}"
+	params: get_miRanda_strict_bool
+	shell: "miranda {input.mirna} {input.utr} {params} -sc {config[miRanda.minimum_alignment_score]} -en {config[miRanda.minimum_energy_score]} -scale {config[miRanda.5_prime_3_prime_scaling_factor]} -go {config[miRanda.alignment_gap_open_penalty]} -ge {config[miRanda.alignment_gap_extension_penalty]} > {output}"
 
 rule convert_miRanda_to_tsv:
         input: "results/targets/miRanda/{species}/{tissue}_chr{chrom}.txt"
@@ -49,6 +58,3 @@ rule add_miRanda_header:
 	input: "results/targets/miRanda/{species}/{tissue}.temp.tsv"
 	output: temp("results/targets/miRanda/{species}/{tissue}.temp2.tsv")
 	script: "add_miRanda_header.R"
-
-
-
diff --git a/modules/trim_reads/trim_galore/Snakefile → modules/trim_reads/Snakefile b/modules/trim_reads/trim_galore/Snakefile → modules/trim_reads/Snakefile
@@ -33,7 +33,7 @@ rule trim_single_end_reads:
        conda: 
           "envs/trim-galore.yaml"
        shell:
-          "trim_galore --output_dir results/trimmed_fastq/  --length 35 --stringency 4 {input}"
+          "trim_galore --output_dir results/trimmed_fastq/  --length {config[trim_galore.length]} --stringency {config[trim_galore.stringency]} {input}"
 
 rule trim_paired_end_reads:
        input:
@@ -44,4 +44,4 @@ rule trim_paired_end_reads:
        conda:
           "envs/trim-galore.yaml"
        shell:
-          "trim_galore --output_dir results/trimmed_fastq/ --length 35 --stringency 4 --paired {input[0]} {input[1]}"
+          "trim_galore --output_dir results/trimmed_fastq/ --length  {config[trim_galore.length]} --stringency {config[trim_galore.stringency]}  --paired {input[0]} {input[1]}"
diff --git a/modules/trim_reads/cutadapt/Snakefile b/modules/trim_reads/cutadapt/Snakefile
diff --git a/...m_reads/trim_galore/envs/trim-galore.yaml → modules/trim_reads/envs/trim-galore.yaml b/...m_reads/trim_galore/envs/trim-galore.yaml → modules/trim_reads/envs/trim-galore.yaml
diff --git a/modules/with_reannotation/Snakefile b/modules/with_reannotation/Snakefile
@@ -32,6 +32,14 @@ def get_bed6_file(wildcards):
 
         return ( 'results/' + sci_species_name + '.' + genome_build + '.{}.chr{}.filtered.bed6'.format(config['ensembl_release'], wildcards.chrom) )
 
+def get_sample_level_bedgraph_file_names(wildcards):
+	metadata_tmp = metadata.loc[(metadata['species'] == wildcards.species) & (metadata['biological_context'] == wildcards.cell_line)]
+	metadata_tmp = metadata_tmp['sample_accession']
+	metadata_tmp = set(metadata_tmp)
+
+	file_names = expand("results/bam/sample/{species}/{sample}_chr{chrom}.bedgraph", species=wildcards.species, sample=metadata_tmp, chrom=wildcards.chrom)
+	return (file_names)
+
 rule get_bed6_file: #three_prime_utrs only
         input:
             script="scripts/gtf_to_bed.sh",
@@ -91,9 +99,16 @@ rule split_bedgraph:
         output: 'results/bam/run/{species}/{accession}_chr{chrom}.bedgraph'
         shell: 'grep -E "^{wildcards.chrom}\s" {input} > {output}'
 
+def get_run_level_bedgraph_file_names(wildcards):
+	metadata_tmp = metadata.loc[(metadata['species'] == wildcards.species) & (metadata['sample_accession'] == wildcards.sample)]
+	metadata_tmp = metadata_tmp['run_accession']
+	metadata_tmp = set(metadata_tmp)
+
+	file_names = expand("results/bam/run/{species}/{accession}_chr{chrom}.bedgraph", species=wildcards.species, accession=metadata_tmp, chrom=wildcards.chrom)
+	return (file_names)
+
 rule merge_bedgraphs_by_run:
-        input:
-             lambda wildcards: expand("results/bam/run/{species}/{accession}_chr{chrom}.bedgraph", species=wildcards.species, accession=config['samples'][wildcards.sample], chrom=wildcards.chrom)
+        input: get_run_level_bedgraph_file_names
         output:
              temp("results/bam/sample/{species}/{sample}_chr{chrom}_tmp.bedgraph")
         wildcard_constraints:
@@ -113,8 +128,7 @@ rule avg_merged_bedgraph_by_run:
                 "get_average_bedgraph.R"
 
 rule merge_bedgraphs_by_sample:
-        input:
-             lambda wildcards: expand("results/bam/sample/{species}/{sample}_chr{chrom}.bedgraph", species=wildcards.species, sample=config['tissues'][wildcards.species][wildcards.cell_line], chrom=wildcards.chrom)
+        input: get_sample_level_bedgraph_file_names 
         output:
              temp("results/bam/tissue/{species}/{cell_line}_chr{chrom}.bedgraph.tmp")
         wildcard_constraints:
@@ -145,17 +159,12 @@ rule filter_bed12:
 
 rule reannotate_3utrs:
     input:
-       script="scripts/identifyDistal3UTR.pl",
+       script="scripts/APAtrap/identifyDistal3UTR",
        bed=get_bed_file,
        bedgraphs= 'results/bam/tissue/{species}/{cell_line}_chr{chrom}.bedgraph'
     output:
        temp("results/bed/{species}_{cell_line}_chr{chrom}.utr.bed")
-    params:
-        percentage_cutoff=0.80,
-        coverage_cutoff=0.05,
-        window_size=100
-    shell:
-       "perl {input.script} -i {input.bedgraphs} -p {params.percentage_cutoff} -c {params.coverage_cutoff} -w {params.window_size} -m {input.bed} -o {output}"
+    shell: "./{input.script} -i {input.bedgraphs} -p {config[APAtrap.min_proportion_of_valid_nucleotides_in_window]} -c {config[APAtrap.min_window_coverage]} -w {config[APAtrap.window_size]} -e {config[APAtrap.utr_extension_size]} -m {input.bed} -o {output}"
 
 rule get_extended_bed_file:
          input:
@@ -177,13 +186,13 @@ rule aggregrate_extended_bed_files:
 
 rule identify_APA_sites:
      input:
-        script="scripts/predictAPA.pl",
+        script="scripts/APAtrap/predictAPA",
         bedgraphs= "results/bam/tissue/{species}/{tissue}_chr{chrom}.bedgraph",
         bed="results/bed/{species}_{tissue}_chr{chrom}.utr.bed"
      output:
         temp("results/targets/{species}_{tissue}_chr{chrom}.APA.txt")
      shell:
-        "perl {input.script} -i {input.bedgraphs} -g 1 -n 1 -u {input.bed}  -o {output}"
+        "./{input.script} -i {input.bedgraphs} -g 1 -n 1 -d {config[APAtrap.min_cov_variation_between_APA_sites]} -c {config[APAtrap.min_average_cov]} -a {config[APAtrap.min_distance_between_APA_sites]} -w {config[APAtrap.predictAPA_window_size]} -u {input.bed}  -o {output}"
 
 rule aggregate_APA_sites:
         input: lambda wildcards: expand("results/targets/{species}_{tissue}_chr{chrom}.APA.txt", chrom=config['chromosomes'][wildcards.species], species=wildcards.species, tissue=wildcards.tissue)