diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 570f6c0dc..e5337fdc7 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -5,10 +5,12 @@ report_comment: > report_section_order: "nf-core-circrna-methods-description": order: -1000 - software_versions: + benchmarking: order: -1001 - "nf-core-circrna-summary": + software_versions: order: -1002 + "nf-core-circrna-summary": + order: -1003 export_plots: true diff --git a/assets/schema_input.json b/assets/schema_input.json index 7e9257a06..d79fa8076 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -27,6 +27,12 @@ "pattern": "^\\S+\\.f(ast)?q\\.gz$", "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" }, + "benchmarking": { + "type": "boolean", + "default": false, + "errorMessage": "Benchmarking must be a boolean value", + "meta": ["benchmarking"] + }, "strandedness": { "type": "string", "enum": ["unstranded", "forward", "reverse", "auto"], diff --git a/conf/modules.config b/conf/modules.config index b23ea157f..765827fcd 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -813,6 +813,23 @@ process { ] } + withName: '.*:CIRCRNA_DISCOVERY_BENCHMARKING:ANNOTATION:REMOVE_SCORE_STRAND' { + ext.suffix = "_benchmarking.tidy.bed" + } + + withName: '.*:BENCHMARKING:SORT' { + ext.args = "-k 1,1 -k2,2n -k3,3n -u" + ext.suffix = "combined.bed" + } + + withName: '.*:BENCHMARKING:BEDTOOLS_MERGE' { + ext.args = "-s -c 6 -o distinct" + } + + withName: '.*:BENCHMARKING:BEDTOOLS_GENOMECOV' { + ext.args = "-dz" + } + withName: ADD_BACKSPLICE { ext.args = "-c fastx '{ if (\$name ~ /^circ_/) { \$seq = \$seq substr(\$seq, 1, 25) } print \">\" \$name; print \$seq }'" ext.suffix = "backspliced.fa" diff --git a/modules.json b/modules.json index c1bed4145..5c2c28bac 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + 
"bedtools/genomecov": { + "branch": "master", + "git_sha": "81b90194ce9911dbd55bba2c65c6919f6677abc4", + "installed_by": ["modules"] + }, "bedtools/getfasta": { "branch": "master", "git_sha": "cdcdd5e3d806f0ff3983c40c69e0b07bb44ec299", @@ -20,6 +25,16 @@ "git_sha": "575e1bc54b083fb15e7dd8b5fcc40bea60e8ce83", "installed_by": ["modules"] }, + "bedtools/jaccard": { + "branch": "master", + "git_sha": "3b248b84694d1939ac4bb33df84bf6233a34d668", + "installed_by": ["modules"] + }, + "bedtools/merge": { + "branch": "master", + "git_sha": "a5377837fe9013bde89de8689829e83e84086536", + "installed_by": ["modules"] + }, "bedtools/sort": { "branch": "master", "git_sha": "571a5feac4c9ce0a8df0bc15b94230e7f3e8db47", diff --git a/modules/local/benchmarking/average_tsv/main.nf b/modules/local/benchmarking/average_tsv/main.nf new file mode 100644 index 000000000..79a0b5c13 --- /dev/null +++ b/modules/local/benchmarking/average_tsv/main.nf @@ -0,0 +1,19 @@ +process AVERAGE_TSV { + label "process_single" + + conda "bioconda::pandas=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'biocontainers/pandas:1.5.2' }" + + + input: + path(tsv) + + output: + path(tsv), emit: tsv + path("versions.yml"), emit: versions + + script: + template "average.py" +} diff --git a/modules/local/benchmarking/average_tsv/templates/average.py b/modules/local/benchmarking/average_tsv/templates/average.py new file mode 100644 index 000000000..54f27a37c --- /dev/null +++ b/modules/local/benchmarking/average_tsv/templates/average.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 + +import pandas as pd +import platform + +tsv = "$tsv" +data = pd.read_csv(tsv, sep='\\t') + +data['tool'] = data['tool'].str.replace('tool:', '') +average_values = data.groupby('tool')['pearson_corr'].mean().reset_index() + +output_file_path = tsv +average_values.to_csv(output_file_path, sep='\\t', index=False) + +#version capture +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. 
+ """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +versions = { + "${task.process}" : { + "python": platform.python_version(), + "pandas": pd.__version__ + } +} +with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) + diff --git a/modules/local/benchmarking/location_plots/main.nf b/modules/local/benchmarking/location_plots/main.nf new file mode 100644 index 000000000..4b4747bf5 --- /dev/null +++ b/modules/local/benchmarking/location_plots/main.nf @@ -0,0 +1,17 @@ +process LOCATION_PLOT { + tag "$meta.id" + label "process_single" + + conda "bioconda::seaborn=0.11.2" + container 'community.wave.seqera.io/library/seaborn:0.13.2--ef0811a05c6fcc75' + + input: + tuple val(meta), path(bedfile1), path(bedfile2) + + output: + path("*_mqc.png"), emit: plots + path("versions.yml"), emit: versions + + script: + template "create_plots.py" +} diff --git a/modules/local/benchmarking/location_plots/templates/create_plots.py b/modules/local/benchmarking/location_plots/templates/create_plots.py new file mode 100755 index 000000000..1479bc844 --- /dev/null +++ b/modules/local/benchmarking/location_plots/templates/create_plots.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +import csv +import matplotlib.pyplot as plt +import seaborn as sns +import pandas as pd +from matplotlib.lines import Line2D +import platform + +input_bed_file_1 = '$bedfile1' +input_bed_file_2 = '$bedfile2' + +# Read input files +def read_bed_file(file_path, label): + data = {'chromosome': [], 'start': [], 'strand': [], 'file_label': []} + with open(file_path, 'r') as file: + reader = csv.reader(file, delimiter='\\t') + for row in reader: + data['chromosome'].append(row[0]) + data['start'].append(int(row[1])) + data['strand'].append(row[3]) + data['file_label'].append(label.lower()) + return data 
+ +def combine_data(data1, data2): + for key in data1: + data1[key].extend(data2[key]) + return data1 + +data1 = read_bed_file(input_bed_file_1, 'total') +data2 = read_bed_file(input_bed_file_2, 'polya') + +# Combine the two datasets +combined_data = combine_data(data1, data2) + +# Create a DataFrame +df = pd.DataFrame({ + 'Chromosome': combined_data['chromosome'], + 'Start Location': combined_data['start'], + 'Strand': combined_data['strand'], + 'File Label': combined_data['file_label'] +}) + +# Sort DataFrame to ensure consistent plotting order +df.sort_values(by=['File Label', 'Strand'], inplace=True) + +# Plotting +fig, ax = plt.subplots(figsize=(12, 6)) + +palette = { + "total +": "red", + "total -": "lightcoral", + "polya +": "blue", + "polya -": "lightblue" +} + +# Draw violins +for file_label in df['File Label'].unique(): + sns.violinplot( + x="Chromosome", + y="Start Location", + hue="Strand", + data=df[df['File Label'] == file_label], + palette={"+" : palette[f"{file_label} +"], "-" : palette[f"{file_label} -"]}, + split=True, + ax=ax, + scale="count", + scale_hue=False, + saturation=0.75, + inner=None + ) + +# Set transparency for all violins +for violin in ax.collections: + violin.set_alpha(0.25) + +# Legend +custom_lines = [ + Line2D([0], [0], color=palette[f"{file} {strand}"], lw=4, alpha=0.25) + for file in df['File Label'].unique() + for strand in ["+", "-"] +] +ax.legend( + custom_lines, + [f"{file} : {strand}" for file in df['File Label'].unique() for strand in ["+", "-"]], + title="File : Strand" +) + +plt.title('Start Locations of circRNA by Chromosome and Strand') + +plot_file_name = f"{input_bed_file_1.replace('.bed','')}_{input_bed_file_2.replace('.bed','')}_mqc.png" + +# Save the plot +plt.savefig(plot_file_name, bbox_inches='tight') + +#version capture +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. 
+ indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. + """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +versions = { + "${task.process}" : { + "python": platform.python_version(), + "pandas": pd.__version__, + "seaborn": sns.__version__ + } +} +with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) diff --git a/modules/local/benchmarking/multiqc/main.nf b/modules/local/benchmarking/multiqc/main.nf new file mode 100644 index 000000000..a39675d30 --- /dev/null +++ b/modules/local/benchmarking/multiqc/main.nf @@ -0,0 +1,18 @@ +process BENCHMARKING_MULTIQC { + label "process_single" + + conda "bioconda::pandas=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'biocontainers/pandas:1.5.2' }" + + input: + path(jaccard) + + output: + path("*_mqc.json") , emit: report + path("versions.yml"), emit: versions + + script: + template "benchmarking.py" +} diff --git a/modules/local/benchmarking/multiqc/templates/benchmarking.py b/modules/local/benchmarking/multiqc/templates/benchmarking.py new file mode 100644 index 000000000..1515e8198 --- /dev/null +++ b/modules/local/benchmarking/multiqc/templates/benchmarking.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 + +import pandas as pd +import json +import platform + +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. 
+ """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +df = pd.read_csv("$jaccard", sep='\\t', index_col=0) + +for metric in df.columns: + data = { + "id": f"benchmarking_{metric}", + "parent_id": "benchmarking", + "parent_name": "Benchmarking", + "parent_description": "Benchmarking of the tools", + "section_name": metric.capitalize(), + "description": f"{metric.capitalize()} values of the tools", + "plot_type": "bargraph", + "data": df[[metric]].T.to_dict() + } + + with open(f"benchmarking_{metric}_mqc.json", "w") as f: + json.dump(data, f, indent=4) + +versions = { + "${task.process}" : { + "python": platform.python_version(), + "pandas": pd.__version__ + } +} +with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) diff --git a/modules/local/benchmarking/overlap_plot/main.nf b/modules/local/benchmarking/overlap_plot/main.nf new file mode 100644 index 000000000..161b62036 --- /dev/null +++ b/modules/local/benchmarking/overlap_plot/main.nf @@ -0,0 +1,17 @@ +process OVERLAP_PLOT { + tag "$meta.id" + label "process_single" + + conda "bioconda::seaborn=0.11.2" + container 'community.wave.seqera.io/library/seaborn:0.13.2--ef0811a05c6fcc75' + + input: + tuple val(meta), path(bed) + + output: + path("*_mqc.png") , emit: plots + path("versions.yml"), emit: versions + + script: + template "create_plots.py" +} diff --git a/modules/local/benchmarking/overlap_plot/templates/create_plots.py b/modules/local/benchmarking/overlap_plot/templates/create_plots.py new file mode 100644 index 000000000..fcbae5eaf --- /dev/null +++ b/modules/local/benchmarking/overlap_plot/templates/create_plots.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +import csv +import matplotlib.pyplot as plt +import seaborn as sns +import pandas as pd +import platform + +input_bed_file = '$bed' + 
+# Read input file +def read_bed_file(file_path): + data = {'chromosome': [], 'start': [], 'strand': []} + with open(file_path, 'r') as file: + reader = csv.reader(file, delimiter='\t') + for row in reader: + data['chromosome'].append(row[0]) + data['start'].append(int(row[1])) + data['strand'].append(row[3]) + return data + +# Read the data +data = read_bed_file(input_bed_file) + +# Create a DataFrame +df = pd.DataFrame({ + 'Chromosome': data['chromosome'], + 'Start Location': data['start'], + 'Strand': data['strand'] +}) + +# Plotting +plt.figure(figsize=(12, 6)) +sns.violinplot(x='Chromosome', y='Start Location', hue='Strand', data=df, split=True, inner="quartile") +plt.title('Start Locations of circRNA by Chromosome and Strand') +plt.legend(title="Strand") +plot_file_name = f"{input_bed_file.replace('.bed','')}_overlaps_mqc.png" + +plt.savefig(plot_file_name, bbox_inches='tight') + + +#version capture +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. 
+ """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +versions = { + "${task.process}" : { + "python": platform.python_version(), + "pandas": pd.__version__, + "seaborn": sns.__version__ + } +} +with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) diff --git a/modules/local/benchmarking/png_json/main.nf b/modules/local/benchmarking/png_json/main.nf new file mode 100644 index 000000000..e6205bfc1 --- /dev/null +++ b/modules/local/benchmarking/png_json/main.nf @@ -0,0 +1,20 @@ +process PNG_JSON { + label "process_single" + + conda "bioconda::pandas=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'biocontainers/pandas:1.5.2' }" + + input: + path(png) + val(title) + val(description) + + output: + path("*_mqc.json") , emit: report + path("versions.yml"), emit: versions + + script: + template "json.py" +} diff --git a/modules/local/benchmarking/png_json/templates/json.py b/modules/local/benchmarking/png_json/templates/json.py new file mode 100644 index 000000000..639cfab2f --- /dev/null +++ b/modules/local/benchmarking/png_json/templates/json.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +import json +import os +import base64 +import platform + +png = "$png" +title = "$title" +description = "$description" + +absolute_path = os.path.abspath(png) +filename = os.path.basename(absolute_path) +name = filename.rstrip("_mcq.png") +id = title.lower().replace(" ", "_") + +# Convert the PNG file to a base64 string +with open(absolute_path, "rb") as image_file: + base64_string = base64.b64encode(image_file.read()).decode('utf-8') + +# Create the data structure for JSON +data = { + "id": f"benchmarking_{name}", + "parent_id": id, + 
"parent_name": title, + "parent_description": description, + "section_name": name, + "plot_type": "image", + "data": f'
' +} + +# Write the JSON data to a file +with open(f"{name}_mqc.json", "w") as f: + json.dump(data, f, indent=4) + +print(f"JSON file created for {absolute_path}") + +#version capture +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. + """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +versions = { + "${task.process}" : { + "python": platform.python_version() + } +} +with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) diff --git a/modules/local/benchmarking/seq_depth_plot/main.nf b/modules/local/benchmarking/seq_depth_plot/main.nf new file mode 100644 index 000000000..cdb5c1ed6 --- /dev/null +++ b/modules/local/benchmarking/seq_depth_plot/main.nf @@ -0,0 +1,17 @@ +process SEQ_DEPTH_CORRELLATION { + tag "$meta.id" + label "process_single" + + conda "bioconda::matplotlib=3.5.1 bioconda::seaborn=0.11.2" + container 'https://depot.galaxyproject.org/singularity/bionumpy:0.2.12--pyha8f3691_0' + + input: + tuple val(meta), + path(bed) + path(depth) + output: + path("*.tsv") , emit: report + path("versions.yml"), emit: versions + script: + template "correlation.py" +} diff --git a/modules/local/benchmarking/seq_depth_plot/templates/correlation.py b/modules/local/benchmarking/seq_depth_plot/templates/correlation.py new file mode 100644 index 000000000..78daf86d2 --- /dev/null +++ b/modules/local/benchmarking/seq_depth_plot/templates/correlation.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +import os +import re +import platform +import numpy as np + +meta = "$meta" +depth = "$depth" +bed = "$bed" + +def calculate_correlation(bed_file_path, 
depth_file_path): + # Read the BED file and store circRNA regions + circRNA_regions = [] + with open(bed_file_path, 'r') as bed_file: + for line in bed_file: + if line.startswith('#'): continue + parts = line.strip().split() + chr = parts[0] + start = int(parts[1]) + end = int(parts[2]) + circRNA_regions.append((chr, start, end)) + + # Read the sequence depth file + circRNA_presence = [] + sequencing_depth = [] + + with open(depth_file_path, 'r') as depth_file: + bed_idx = 0 + bed_len = len(circRNA_regions) + + for line in depth_file: + parts = line.strip().split() + chr = parts[0] + pos = int(parts[1]) + depth = int(parts[2]) + + # Move bed_idx to the correct region + while bed_idx < bed_len and (circRNA_regions[bed_idx][0] < chr or (circRNA_regions[bed_idx][0] == chr and circRNA_regions[bed_idx][2] < pos)): + bed_idx += 1 + + # Check if the current position is within a circRNA region + if bed_idx < bed_len and circRNA_regions[bed_idx][0] == chr and circRNA_regions[bed_idx][1] <= pos <= circRNA_regions[bed_idx][2]: + circRNA_presence.append(1) + else: + circRNA_presence.append(0) + + sequencing_depth.append(depth) + + # Calculate the correlation coefficient + correlation = np.corrcoef(sequencing_depth, circRNA_presence)[0, 1] + + return correlation + +id = re.search(r'id:(\\w+)', meta).group(1) + +with open(depth, 'r') as paths: + for path in paths: + path = path.strip() + basename = os.path.splitext(os.path.basename(path))[0] + + if basename == id: + # Calculate correlation + corr = calculate_correlation(bed, path) + # Write the correlation to stats.txt + with open('corr_mqc.tsv', 'w') as outfile: + header = "tool\\tpearson_corr\\n" + outfile.write(header + meta.split(",")[4][:-1] + '\\t' + str(corr)) + +#version capture +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. 
+ + Returns: + str: A string formatted as YAML. + """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +versions = { + "${task.process}" : { + "python": platform.python_version(), + "numpy": np.__version__ + } +} +with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) diff --git a/modules/nf-core/bedtools/genomecov/environment.yml b/modules/nf-core/bedtools/genomecov/environment.yml new file mode 100644 index 000000000..8fbe20c31 --- /dev/null +++ b/modules/nf-core/bedtools/genomecov/environment.yml @@ -0,0 +1,7 @@ +name: bedtools_genomecov +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bedtools=2.31.1 diff --git a/modules/nf-core/bedtools/genomecov/main.nf b/modules/nf-core/bedtools/genomecov/main.nf new file mode 100644 index 000000000..954e8913d --- /dev/null +++ b/modules/nf-core/bedtools/genomecov/main.nf @@ -0,0 +1,74 @@ +process BEDTOOLS_GENOMECOV { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : + 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + + input: + tuple val(meta), path(intervals), val(scale) + path sizes + val extension + val sort + + output: + tuple val(meta), path("*.${extension}"), emit: genomecov + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args_list = args.tokenize() + args += (scale > 0 && scale != 1) ? " -scale $scale" : "" + if (!args_list.contains('-bg') && (scale > 0 && scale != 1)) { + args += " -bg" + } + def sort_cmd = sort ? 
'| bedtools sort' : '' + + def prefix = task.ext.prefix ?: "${meta.id}" + if (intervals.name =~ /\.bam/) { + """ + bedtools \\ + genomecov \\ + -ibam $intervals \\ + $args \\ + $sort_cmd \\ + > ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + } else { + """ + bedtools \\ + genomecov \\ + -i $intervals \\ + -g $sizes \\ + $args \\ + $sort_cmd \\ + > ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bedtools/genomecov/meta.yml b/modules/nf-core/bedtools/genomecov/meta.yml new file mode 100644 index 000000000..2b2385e3e --- /dev/null +++ b/modules/nf-core/bedtools/genomecov/meta.yml @@ -0,0 +1,59 @@ +name: bedtools_genomecov +description: Computes histograms (default), per-base reports (-d) and BEDGRAPH (-bg) summaries of feature coverage (e.g., aligned sequences) for a given genome. +keywords: + - bed + - bam + - genomecov + - bedtools + - histogram +tools: + - bedtools: + description: | + A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types. + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intervals: + type: file + description: BAM/BED/GFF/VCF + pattern: "*.{bam|bed|gff|vcf}" + - scale: + type: integer + description: Number containing the scale factor for the output. Set to 1 to disable. 
Setting to a value other than 1 will also get the -bg bedgraph output format as this is required for this command switch + - sizes: + type: file + description: Tab-delimited table of chromosome names in the first column and chromosome sizes in the second column + - extension: + type: string + description: Extension of the output file (e. g., ".bg", ".bedgraph", ".txt", ".tab", etc.) It is set arbitrarily by the user and corresponds to the file format which depends on arguments. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - genomecov: + type: file + description: Computed genome coverage file + pattern: "*.${extension}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@edmundmiller" + - "@sruthipsuresh" + - "@drpatelh" + - "@sidorov-si" + - "@chris-cheshire" +maintainers: + - "@edmundmiller" + - "@sruthipsuresh" + - "@drpatelh" + - "@sidorov-si" + - "@chris-cheshire" diff --git a/modules/nf-core/bedtools/genomecov/tests/main.nf.test b/modules/nf-core/bedtools/genomecov/tests/main.nf.test new file mode 100644 index 000000000..8213cff95 --- /dev/null +++ b/modules/nf-core/bedtools/genomecov/tests/main.nf.test @@ -0,0 +1,122 @@ +nextflow_process { + name "Test Process BEDTOOLS_GENOMECOV" + script "../main.nf" + process "BEDTOOLS_GENOMECOV" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "bedtools" + tag "bedtools/genomecov" + + test("sarscov2 - no scale") { + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.paired_end.bam", checkIfExists: true), + 1 + ] + // sizes + input[1] = [] + // extension + input[2] = "txt" + input[3] = true + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match("no_scale") } + ) + } + + } + + test("sarscov2 - dummy 
sizes") { + when { + process { + """ + input[0] = [ + [ id:'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.paired_end.bam", checkIfExists: true), + 0.5 + ] + // sizes + input[1] = file('dummy_chromosome_sizes') + // extension + input[2] = 'txt' + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match("dummy_sizes") } + ) + } + + } + + test("sarscov2 - scale") { + when { + process { + """ + input[0] = [ + [ id:'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/bed/baits.bed", checkIfExists: true), + 0.5 + ] + // sizes + input[1] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.sizes", checkIfExists: true) + // extension + input[2] = 'txt' + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match("scale") } + ) + } + + } + + test("stub") { + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.paired_end.bam", checkIfExists: true), + 1 + ] + // sizes + input[1] = [] + // extension + input[2] = 'txt' + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.genomecov[0][1]).name).match("stub") } + ) + } + + } + +} diff --git a/modules/nf-core/bedtools/genomecov/tests/main.nf.test.snap b/modules/nf-core/bedtools/genomecov/tests/main.nf.test.snap new file mode 100644 index 000000000..8f9191e4c --- /dev/null +++ b/modules/nf-core/bedtools/genomecov/tests/main.nf.test.snap @@ -0,0 +1,95 @@ +{ + "dummy_sizes": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.coverage.txt:md5,01291b6e1beab72e046653e709eb0e10" + ] + ], + "1": [ + "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2" + ], + "genomecov": [ + [ + { + "id": "test" + }, + 
"test.coverage.txt:md5,01291b6e1beab72e046653e709eb0e10" + ] + ], + "versions": [ + "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2" + ] + } + ], + "timestamp": "2023-12-05T17:35:58.35232" + }, + "no_scale": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.coverage.txt:md5,66083198daca6c001d328ba9616e9b53" + ] + ], + "1": [ + "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2" + ], + "genomecov": [ + [ + { + "id": "test" + }, + "test.coverage.txt:md5,66083198daca6c001d328ba9616e9b53" + ] + ], + "versions": [ + "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2" + ] + } + ], + "timestamp": "2023-12-05T17:35:51.142496" + }, + "stub": { + "content": [ + "test.coverage.txt" + ], + "timestamp": "2023-12-05T17:36:13.084709" + }, + "scale": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.coverage.txt:md5,de3c59c0ea123bcdbbad27bc0a0a601e" + ] + ], + "1": [ + "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2" + ], + "genomecov": [ + [ + { + "id": "test" + }, + "test.coverage.txt:md5,de3c59c0ea123bcdbbad27bc0a0a601e" + ] + ], + "versions": [ + "versions.yml:md5,5fd44452613992a6f71f2c73d2e117f2" + ] + } + ], + "timestamp": "2023-12-05T17:36:05.962006" + } +} \ No newline at end of file diff --git a/modules/nf-core/bedtools/genomecov/tests/nextflow.config b/modules/nf-core/bedtools/genomecov/tests/nextflow.config new file mode 100644 index 000000000..bdb74ae5a --- /dev/null +++ b/modules/nf-core/bedtools/genomecov/tests/nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: BEDTOOLS_GENOMECOV { + ext.prefix = { "${meta.id}.coverage" } + } + +} diff --git a/modules/nf-core/bedtools/genomecov/tests/tags.yml b/modules/nf-core/bedtools/genomecov/tests/tags.yml new file mode 100644 index 000000000..55fce4780 --- /dev/null +++ b/modules/nf-core/bedtools/genomecov/tests/tags.yml @@ -0,0 +1,2 @@ +bedtools/genomecov: + - "modules/nf-core/bedtools/genomecov/**" diff --git a/modules/nf-core/bedtools/jaccard/environment.yml 
b/modules/nf-core/bedtools/jaccard/environment.yml new file mode 100644 index 000000000..f0f2a38fc --- /dev/null +++ b/modules/nf-core/bedtools/jaccard/environment.yml @@ -0,0 +1,7 @@ +name: bedtools_jaccard +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bedtools=2.31.1 diff --git a/modules/nf-core/bedtools/jaccard/main.nf b/modules/nf-core/bedtools/jaccard/main.nf new file mode 100644 index 000000000..32a41210d --- /dev/null +++ b/modules/nf-core/bedtools/jaccard/main.nf @@ -0,0 +1,49 @@ +process BEDTOOLS_JACCARD { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : + 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + + input: + tuple val(meta), path(input_a), path(input_b) + tuple val(meta2), path(genome_file) + + output: + tuple val(meta), path("*.tsv"), emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def genome = genome_file ? 
"-g ${genome_file}" : "" + """ + bedtools jaccard \\ + -a ${input_a} \\ + -b ${input_b} \\ + ${genome} \\ + ${args} \\ + > ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bedtools/jaccard/meta.yml b/modules/nf-core/bedtools/jaccard/meta.yml new file mode 100644 index 000000000..2b5422841 --- /dev/null +++ b/modules/nf-core/bedtools/jaccard/meta.yml @@ -0,0 +1,57 @@ +name: "bedtools_jaccard" +description: Calculate Jaccard statistic b/w two feature files. +keywords: + - vcf + - gff + - bed + - jaccard + - intersection + - union + - statistics +tools: + - "bedtools": + description: | + A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types. + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/intersect.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input_a: + type: file + description: VCF,GFF or BED file to use with the `-a` option + pattern: "*.{vcf,vcf.gz,bed,bed.gz,gff}" + - input_b: + type: file + description: VCF,GFF or BED file to use with the `-b` option + pattern: "*.{vcf,vcf.gz,bed,bed.gz,gff}" + - meta2: + type: map + description: | + Groovy Map containing genome file information + e.g. [ id:'test', single_end:false ] + - genome_file: + type: file + description: A file containing all the contigs of the genome used to create the input files + pattern: "*.{txt,sizes,fai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - tsv: + type: file + description: TSV file containing the results + pattern: "*.tsv" +authors: + - "@nvnieuwk" +maintainers: + - "@nvnieuwk" diff --git a/modules/nf-core/bedtools/merge/environment.yml b/modules/nf-core/bedtools/merge/environment.yml new file mode 100644 index 000000000..99707878b --- /dev/null +++ b/modules/nf-core/bedtools/merge/environment.yml @@ -0,0 +1,7 @@ +name: bedtools_merge +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bedtools=2.31.1 diff --git a/modules/nf-core/bedtools/merge/main.nf b/modules/nf-core/bedtools/merge/main.nf new file mode 100644 index 000000000..5310647de --- /dev/null +++ b/modules/nf-core/bedtools/merge/main.nf @@ -0,0 +1,47 @@ +process BEDTOOLS_MERGE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : + 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + + input: + tuple val(meta), path(bed) + + output: + tuple val(meta), path('*.bed'), emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ("$bed" == "${prefix}.bed") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + bedtools \\ + merge \\ + -i $bed \\ + $args \\ + > ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bedtools/merge/meta.yml b/modules/nf-core/bedtools/merge/meta.yml new file mode 100644 index 000000000..d7463e3db --- /dev/null +++ b/modules/nf-core/bedtools/merge/meta.yml @@ -0,0 +1,45 @@ +name: bedtools_merge +description: combines overlapping or “book-ended” features in an interval file into a single feature which spans all of the combined features. +keywords: + - bed + - merge + - bedtools + - overlapped bed +tools: + - bedtools: + description: | + A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types. + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/merge.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bed: + type: file + description: Input BED file + pattern: "*.{bed}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bed: + type: file + description: Overlapped bed file with combined features + pattern: "*.{bed}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@edmundmiller" + - "@sruthipsuresh" + - "@drpatelh" +maintainers: + - "@edmundmiller" + - "@sruthipsuresh" + - "@drpatelh" diff --git a/modules/nf-core/bedtools/merge/tests/main.nf.test b/modules/nf-core/bedtools/merge/tests/main.nf.test new file mode 100644 index 000000000..95dba8e54 --- /dev/null +++ b/modules/nf-core/bedtools/merge/tests/main.nf.test @@ -0,0 +1,34 @@ +nextflow_process { + + name "Test Process BEDTOOLS_MERGE" + script "../main.nf" + config "./nextflow.config" + process "BEDTOOLS_MERGE" + + tag "modules" + tag "modules_nfcore" + tag "bedtools" + tag "bedtools/merge" + + test("test_bedtools_merge") { + + when { + process { + """ + input[0] = [ [ id:'test'], + file(params.test_data['sarscov2']['genome']['test_bed'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/bedtools/merge/tests/main.nf.test.snap b/modules/nf-core/bedtools/merge/tests/main.nf.test.snap new file mode 100644 index 000000000..ee6c4e634 --- /dev/null +++ b/modules/nf-core/bedtools/merge/tests/main.nf.test.snap @@ -0,0 +1,35 @@ +{ + "test_bedtools_merge": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_out.bed:md5,0cf6ed2b6f470cd44a247da74ca4fe4e" + ] + ], + "1": [ + "versions.yml:md5,2d134badb4cd1e4e903696c7967f28d6" + ], + "bed": [ + [ + { + "id": "test" + }, + "test_out.bed:md5,0cf6ed2b6f470cd44a247da74ca4fe4e" + ] + ], + "versions": [ + "versions.yml:md5,2d134badb4cd1e4e903696c7967f28d6" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T17:07:09.721153" + } +} \ No newline at end of file diff --git 
a/modules/nf-core/bedtools/merge/tests/nextflow.config b/modules/nf-core/bedtools/merge/tests/nextflow.config new file mode 100644 index 000000000..16444e982 --- /dev/null +++ b/modules/nf-core/bedtools/merge/tests/nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: BEDTOOLS_MERGE { + ext.prefix = { "${meta.id}_out" } + } + +} diff --git a/modules/nf-core/bedtools/merge/tests/tags.yml b/modules/nf-core/bedtools/merge/tests/tags.yml new file mode 100644 index 000000000..60c8cad1c --- /dev/null +++ b/modules/nf-core/bedtools/merge/tests/tags.yml @@ -0,0 +1,2 @@ +bedtools/merge: + - "modules/nf-core/bedtools/merge/**" diff --git a/nextflow.config b/nextflow.config index ff6b0bbcb..1f3b85a27 100644 --- a/nextflow.config +++ b/nextflow.config @@ -22,6 +22,7 @@ params { tool_filter = 1 exon_boundary = 0 save_intermediates = false + benchmarking = false // References genome = null diff --git a/nextflow_schema.json b/nextflow_schema.json index ee8ea89c8..268428f74 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -139,6 +139,13 @@ "help_text": "During annotation, if one of the start or end position of a circular candidate imperfectly overlaps an exon boundary, the script will consider positions within 'exon_boundary' (default 0bp) to be an exonic circRNA. If they fall outside of this range, the candidate is assumed to be an exonic-intronic circRNA, and the entire underlying sequence is taken for miRNA analysis, as opposed to just the exonic sequences for canonical exonic circRNAs. ", "default": 0 }, + "benchmarking": { + "type": "boolean", + "description": "Run the pipeline in benchmarking mode.", + "default": false, + "fa_icon": "fas fa-chart-line", + "help_text": "This enables parallel analysis of polyA-enriched data with the same circRNA quantification tools to compare results." 
+ }, "mirna_tools": { "type": "string", "fa_icon": "fas fa-wrench", diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf index dea819a65..e14290c55 100644 --- a/subworkflows/local/annotation.nf +++ b/subworkflows/local/annotation.nf @@ -1,6 +1,5 @@ include { BEDTOOLS_INTERSECT as INTERSECT_GTF } from '../../modules/nf-core/bedtools/intersect' include { GAWK as INGEST_DATABASE_NAMES } from '../../modules/nf-core/gawk' -include { GNU_SORT as COMBINE_DATABASES } from '../../modules/nf-core/gnu/sort' include { BEDTOOLS_INTERSECT as INTERSECT_DATABASE } from '../../modules/nf-core/bedtools/intersect' include { ANNOTATION as ANNOTATE } from '../../modules/local/annotation' diff --git a/subworkflows/local/benchmarking.nf b/subworkflows/local/benchmarking.nf new file mode 100644 index 000000000..09fbda3d3 --- /dev/null +++ b/subworkflows/local/benchmarking.nf @@ -0,0 +1,119 @@ +include { GNU_SORT as SORT } from '../../modules/nf-core/gnu/sort' +include { BEDTOOLS_MERGE } from '../../modules/nf-core/bedtools/merge' +include { BEDTOOLS_INTERSECT } from '../../modules/nf-core/bedtools/intersect' +include { BEDTOOLS_JACCARD } from '../../modules/nf-core/bedtools/jaccard' +include { BEDTOOLS_GENOMECOV } from '../../modules/nf-core/bedtools/genomecov' +include { BENCHMARKING_MULTIQC as JACCARD_MULTIQC } from '../../modules/local/benchmarking/multiqc' +include { BENCHMARKING_MULTIQC as CORRELATION_MULTIQC } from '../../modules/local/benchmarking/multiqc' +include { PNG_JSON as LOCATION_JSON } from '../../modules/local/benchmarking/png_json' +include { PNG_JSON as OVERLAP_JSON } from '../../modules/local/benchmarking/png_json' +include { LOCATION_PLOT } from '../../modules/local/benchmarking/location_plots' +include { OVERLAP_PLOT } from '../../modules/local/benchmarking/overlap_plot' +include { SEQ_DEPTH_CORRELLATION } from '../../modules/local/benchmarking/seq_depth_plot' +include { AVERAGE_TSV } from '../../modules/local/benchmarking/average_tsv' + + 
+ +workflow BENCHMARKING { + + take: + ch_real_bed + ch_benchmarking_bed + ch_real_bam + ch_benchmarking_bam + ch_trim_report + + main: + + //data preparation + ch_versions = Channel.empty() + + ch_all = ch_real_bed.mix(ch_benchmarking_bed) + .map{ meta, bed -> [[id: meta.tool + "_" + (meta.benchmarking ? "benchmarking" : "real"), + tool: meta.tool, + benchmarking: meta.benchmarking], bed]} + .groupTuple() + + SORT(ch_all) + ch_versions = ch_versions.mix(SORT.out.versions) + + BEDTOOLS_MERGE(SORT.out.sorted).bed.branch{ meta, bed -> + real: !meta.benchmarking + benchmarking: meta.benchmarking + }.set { ch_merged } + ch_versions = ch_versions.mix(BEDTOOLS_MERGE.out.versions) + + ch_joined = ch_merged.real.map{ meta, bed -> [[id: meta.tool], bed]} + .join(ch_merged.benchmarking.map{ meta, bed -> [[id: meta.tool], bed]}) + + //Overlap plot + ch_intersect = BEDTOOLS_INTERSECT(ch_joined,[[], []]) + OVERLAP_PLOT(ch_intersect.intersect) + OVERLAP_JSON(OVERLAP_PLOT.out.plots, "Overlap plots", "Plot the overlap circRNAs found in total and polyA data for the tools") + ch_versions = ch_versions.mix(BEDTOOLS_INTERSECT.out.versions) + ch_versions = ch_versions.mix(OVERLAP_PLOT.out.versions) + ch_versions = ch_versions.mix(OVERLAP_JSON.out.versions) + + //Location plot workflow + LOCATION_PLOT(ch_joined) + LOCATION_JSON(LOCATION_PLOT.out.plots, "Location plots", "Plots the location of the circRNAs found" ) + ch_versions = ch_versions.mix(LOCATION_PLOT.out.versions) + ch_versions = ch_versions.mix(LOCATION_JSON.out.versions) + + //Pearson correlation workflow + ch_meta = ch_real_bam.map { it[0] } + ch_path = ch_real_bam.map { it[1] } + ch_scale = Channel.value(1) + ch_genomecov_inputs = ch_meta.combine(ch_path).combine(ch_scale) + .map { meta, path, scale -> + tuple(meta, path, scale) + } + + ch_genomecov = BEDTOOLS_GENOMECOV(ch_genomecov_inputs, [], "bg",false) + ch_versions = ch_versions.mix(BEDTOOLS_GENOMECOV.out.versions) + + ch_seqdepths = ch_genomecov.genomecov + .map { 
genomecov_result -> genomecov_result[1].toString() } + .collectFile(name: 'genomecov_paths.txt', + newLine: true) + + ch_corr = SEQ_DEPTH_CORRELLATION(ch_real_bed, ch_seqdepths.collect()).report + ch_versions = ch_versions.mix(SEQ_DEPTH_CORRELLATION.out.versions) + + ch_pearson = ch_corr.splitCsv(header: true, sep: "\t") + .map{ values -> [values.tool, values.pearson_corr]} + .collectFile( newLine: true, + storeDir: params.outdir, + seed: "tool\tpearson_corr") { + row -> ["pearson.tsv", row.join("\t")] + } + + AVERAGE_TSV(ch_pearson) + CORRELATION_MULTIQC(AVERAGE_TSV.out.tsv) + ch_versions = ch_versions.mix(AVERAGE_TSV.out.versions) + ch_versions = ch_versions.mix(CORRELATION_MULTIQC.out.versions) + + //Jaccard Workflow + ch_jaccard = BEDTOOLS_JACCARD(ch_joined, [[], []]).tsv + ch_versions = ch_versions.mix(BEDTOOLS_JACCARD.out.versions) + + ch_stats = ch_jaccard.splitCsv(header: true, sep: "\t") + .map{ meta, values -> [meta.id, values.intersection, values.union, values.jaccard, values.n_intersections]} + .collectFile( newLine: true, + storeDir: params.outdir, + seed: "tool\tintersection\tunion\tjaccard\tn_intersections") { + row -> ["jaccard.tsv", row.join("\t")] + } + + JACCARD_MULTIQC(ch_stats) + ch_versions = ch_versions.mix(JACCARD_MULTIQC.out.versions) + + //combine results + ch_reports = JACCARD_MULTIQC.out.report.mix(LOCATION_JSON.out.report) + ch_reports = ch_reports.mix(OVERLAP_JSON.out.report) + ch_reports = ch_reports.mix(CORRELATION_MULTIQC.out.report) + + emit: + reports = ch_reports + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bsj_detection.nf b/subworkflows/local/bsj_detection.nf index 89ddec4ed..d85bf3304 100644 --- a/subworkflows/local/bsj_detection.nf +++ b/subworkflows/local/bsj_detection.nf @@ -229,11 +229,13 @@ workflow BSJ_DETECTION { ) emit: - bed = ch_bsj_bed_combined - bed12 = ch_bsj_bed12_combined - gtf = ch_bsj_gtf_combined - fasta = ch_bsj_fasta_combined - - multiqc_files = ch_multiqc_files - 
versions = ch_versions + bed = ch_bsj_bed_combined + bed12 = ch_bsj_bed12_combined + gtf = ch_bsj_gtf_combined + fasta = ch_bsj_fasta_combined + star_bam = STAR2PASS.out.bam + bed_per_sample_tool = ch_bsj_bed_per_sample_tool + + multiqc_files = ch_multiqc_files + versions = ch_versions } diff --git a/subworkflows/local/detection_tools/star2pass.nf b/subworkflows/local/detection_tools/star2pass.nf index 4e216545b..4545c909b 100644 --- a/subworkflows/local/detection_tools/star2pass.nf +++ b/subworkflows/local/detection_tools/star2pass.nf @@ -29,6 +29,7 @@ workflow STAR2PASS { junction = PASS_2.out.junction sam = PASS_2.out.sam tab = PASS_2.out.tab + bam = PASS_2.out.bam versions = ch_versions } diff --git a/subworkflows/local/mirna_prediction.nf b/subworkflows/local/mirna_prediction.nf index b3fc46cc0..1b6b046bb 100644 --- a/subworkflows/local/mirna_prediction.nf +++ b/subworkflows/local/mirna_prediction.nf @@ -1,8 +1,7 @@ // MODULES -include { BIOAWK as ADD_BACKSPLICE } from '../../modules/nf-core/bioawk' -include { DESEQ2_NORMALIZATION } from '../../modules/local/deseq2/normalization' -include { MIRNA_FILTERING } from '../../modules/local/mirna_filtering' -include { COMPUTE_CORRELATIONS } from '../../modules/local/compute_correlations' +include { DESEQ2_NORMALIZATION } from '../../modules/local/deseq2/normalization' +include { MIRNA_FILTERING } from '../../modules/local/mirna_filtering' +include { COMPUTE_CORRELATIONS } from '../../modules/local/compute_correlations' // SUBWORKFLOWS include { MIRNA_BINDINGSITES } from './mirna/mirna_bindingsites' diff --git a/workflows/circrna/main.nf b/workflows/circrna/main.nf index 048b4da34..b656bd019 100644 --- a/workflows/circrna/main.nf +++ b/workflows/circrna/main.nf @@ -22,17 +22,19 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
fil */ // SUBWORKFLOWS: -include { paramsSummaryMap } from 'plugin/nf-validation' -include { paramsSummaryMultiqc } from '../../subworkflows/nf-core/utils_nfcore_pipeline' -include { validateInputSamplesheet } from '../../subworkflows/local/utils_nfcore_circrna_pipeline' - -include { softwareVersionsToYAML } from '../../subworkflows/nf-core/utils_nfcore_pipeline' -include { PREPARE_GENOME } from '../../subworkflows/local/prepare_genome' -include { BSJ_DETECTION } from '../../subworkflows/local/bsj_detection' -include { ANNOTATION } from '../../subworkflows/local/annotation' -include { QUANTIFICATION } from '../../subworkflows/local/quantification' -include { MIRNA_PREDICTION } from '../../subworkflows/local/mirna_prediction' -include { STATISTICAL_TESTS } from '../../subworkflows/local/statistical_tests' +include { paramsSummaryMap } from 'plugin/nf-validation' +include { paramsSummaryMultiqc } from '../../subworkflows/nf-core/utils_nfcore_pipeline' +include { validateInputSamplesheet } from '../../subworkflows/local/utils_nfcore_circrna_pipeline' + +include { softwareVersionsToYAML } from '../../subworkflows/nf-core/utils_nfcore_pipeline' +include { PREPARE_GENOME } from '../../subworkflows/local/prepare_genome' +include { BSJ_DETECTION } from '../../subworkflows/local/bsj_detection' +include { BSJ_DETECTION as BSJ_DETECTION_BENCHMARKING } from '../../subworkflows/local/bsj_detection' +include { BENCHMARKING } from '../../subworkflows/local/benchmarking' +include { ANNOTATION } from '../../subworkflows/local/annotation' +include { QUANTIFICATION } from '../../subworkflows/local/quantification' +include { MIRNA_PREDICTION } from '../../subworkflows/local/mirna_prediction' +include { STATISTICAL_TESTS } from '../../subworkflows/local/statistical_tests' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -136,8 +138,13 @@ workflow CIRCRNA { // 2. 
BSJ Discovery // + FASTQC_TRIMGALORE.out.reads.branch{ + real: !it[0].benchmarking + benchmarking: it[0].benchmarking + }.set{ ch_reads } + BSJ_DETECTION( - FASTQC_TRIMGALORE.out.reads, + ch_reads.real, ch_fasta, ch_gtf, ch_annotation, @@ -155,13 +162,45 @@ workflow CIRCRNA { ch_versions = ch_versions.mix(BSJ_DETECTION.out.versions) // - // 3. circRNA quantification + // 3. Benchmarking + // + + if (params.benchmarking) { + BSJ_DETECTION_BENCHMARKING( + ch_reads.benchmarking, + ch_fasta, + ch_gtf, + ch_annotation, + bowtie_index, + bowtie2_index, + bwa_index, + chromosomes, + hisat2_index, + star_index, + params.bsj_reads, + params.exon_boundary + ) + + BENCHMARKING( + BSJ_DETECTION.out.bed_per_sample_tool, + BSJ_DETECTION_BENCHMARKING.out.bed_per_sample_tool, + BSJ_DETECTION.out.star_bam, + BSJ_DETECTION_BENCHMARKING.out.star_bam, + FASTQC_TRIMGALORE.out.trim_log + ) + + ch_multiqc_files = ch_multiqc_files.mix(BENCHMARKING.out.reports) + ch_versions = ch_versions.mix(BSJ_DETECTION_BENCHMARKING.out.versions) + } + + // + // 4. Quantification // QUANTIFICATION( ch_gtf, ch_fasta, - FASTQC_TRIMGALORE.out.reads, + ch_reads.real, BSJ_DETECTION.out.bed12, BSJ_DETECTION.out.gtf, params.bootstrap_samples, @@ -172,7 +211,7 @@ workflow CIRCRNA { ch_versions = ch_versions.mix(QUANTIFICATION.out.versions) // - // 4. miRNA prediction + // 5. miRNA prediction // if (params.mature) {