nf-core · Fabian-Boehm · Apr 25, 2024 · Apr 25, 2024 · Apr 25, 2024 · Apr 25, 2024
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -5,10 +5,12 @@ report_comment: >
 report_section_order:
   "nf-core-circrna-methods-description":
     order: -1000
-  software_versions:
+  benchmarking:
     order: -1001
-  "nf-core-circrna-summary":
+  software_versions:
     order: -1002
+  "nf-core-circrna-summary":
+    order: -1003
 
 export_plots: true
 

diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -27,6 +27,12 @@
                 "pattern": "^\\S+\\.f(ast)?q\\.gz$",
                 "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
             },
+            "benchmarking": {
+                "type": "boolean",
+                "default": false,
+                "errorMessage": "Benchmarking must be a boolean value",
+                "meta": ["benchmarking"]
+            },
             "strandedness": {
                 "type": "string",
                 "enum": ["unstranded", "forward", "reverse", "auto"],

diff --git a/conf/modules.config b/conf/modules.config
@@ -813,6 +813,23 @@ process {
         ]
     }
 
+    withName: '.*:CIRCRNA_DISCOVERY_BENCHMARKING:ANNOTATION:REMOVE_SCORE_STRAND' {
+        ext.suffix = "_benchmarking.tidy.bed"  // distinct suffix for the benchmarking copy of this step
+    }
+
+    withName: '.*:BENCHMARKING:SORT' {
+        ext.args = "-k 1,1 -k2,2n -k3,3n -u"  // presumably GNU sort: key on column 1, then columns 2 and 3 numerically; -u drops duplicate keys (confirm against the SORT module)
+        ext.suffix = "combined.bed"
+    }
+
+    withName: '.*:BENCHMARKING:BEDTOOLS_MERGE' {
+        ext.args = "-s -c 6 -o distinct"  // bedtools merge: only merge same-strand features (-s) and report the distinct values of column 6
+    }
+
+    withName: '.*:BENCHMARKING:BEDTOOLS_GENOMECOV' {
+        ext.args = "-dz"  // bedtools genomecov: per-position depth, zero-based coordinates, non-zero positions only
+    }
+
     withName: ADD_BACKSPLICE {
         ext.args = "-c fastx '{ if (\$name ~ /^circ_/) { \$seq = \$seq substr(\$seq, 1, 25) } print \">\" \$name; print \$seq }'"
         ext.suffix = "backspliced.fa"

diff --git a/modules.json b/modules.json
@@ -5,6 +5,11 @@
         "https://github.com/nf-core/modules.git": {
             "modules": {
                 "nf-core": {
+                    "bedtools/genomecov": {
+                        "branch": "master",
+                        "git_sha": "81b90194ce9911dbd55bba2c65c6919f6677abc4",
+                        "installed_by": ["modules"]
+                    },
                     "bedtools/getfasta": {
                         "branch": "master",
                         "git_sha": "cdcdd5e3d806f0ff3983c40c69e0b07bb44ec299",
@@ -20,6 +25,16 @@
                         "git_sha": "575e1bc54b083fb15e7dd8b5fcc40bea60e8ce83",
                         "installed_by": ["modules"]
                     },
+                    "bedtools/jaccard": {
+                        "branch": "master",
+                        "git_sha": "3b248b84694d1939ac4bb33df84bf6233a34d668",
+                        "installed_by": ["modules"]
+                    },
+                    "bedtools/merge": {
+                        "branch": "master",
+                        "git_sha": "a5377837fe9013bde89de8689829e83e84086536",
+                        "installed_by": ["modules"]
+                    },
                     "bedtools/sort": {
                         "branch": "master",
                         "git_sha": "571a5feac4c9ce0a8df0bc15b94230e7f3e8db47",

diff --git a/modules/local/benchmarking/average_tsv/main.nf b/modules/local/benchmarking/average_tsv/main.nf
@@ -0,0 +1,19 @@
+process AVERAGE_TSV {  // runs templates/average.py: mean pearson_corr per tool
+    label "process_single"
+
+    conda "bioconda::pandas=1.5.2"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/pandas:1.5.2' :
+        'biocontainers/pandas:1.5.2' }"
+
+    // pandas 1.5.2 is pinned identically for conda and for both container images above.
+    input:
+    path(tsv)  // tab-separated table; the template reads its 'tool' and 'pearson_corr' columns
+
+    output:
+    path(tsv), emit: tsv  // same file name as the input: the template writes the averages back to that path
+    path("versions.yml"), emit: versions  // python and pandas versions written by the template
+
+    script:
+    template "average.py"
+}
diff --git a/modules/local/benchmarking/average_tsv/templates/average.py b/modules/local/benchmarking/average_tsv/templates/average.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+import platform
+
+tsv = "$tsv"
+data = pd.read_csv(tsv, sep='\\t')
+
+data['tool'] = data['tool'].str.replace('tool:', '')
+average_values = data.groupby('tool')['pearson_corr'].mean().reset_index()
+
+output_file_path = tsv
+average_values.to_csv(output_file_path, sep='\\t', index=False)
+
+#version capture
+def format_yaml_like(data: dict, indent: int = 0) -> str:
+    """Formats a dictionary to a YAML-like string.
+
+    Args:
+        data (dict): The dictionary to format.
+        indent (int): The current indentation level.
+
+    Returns:
+        str: A string formatted as YAML.
+    """
+    yaml_str = ""
+    for key, value in data.items():
+        spaces = "  " * indent
+        if isinstance(value, dict):
+            yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
+        else:
+            yaml_str += f"{spaces}{key}: {value}\\n"
+    return yaml_str
+
+versions = {
+    "${task.process}" : {
+        "python": platform.python_version(),
+        "pandas": pd.__version__
+    }
+}
+with open("versions.yml", "w") as f:
+    f.write(format_yaml_like(versions))
+
diff --git a/modules/local/benchmarking/location_plots/main.nf b/modules/local/benchmarking/location_plots/main.nf
@@ -0,0 +1,17 @@
+process LOCATION_PLOT {  // runs templates/create_plots.py on a pair of BED files
+    tag "$meta.id"
+    label "process_single"
+
+    conda "conda-forge::seaborn=0.13.2"  // same seaborn release as the container image below, so both environments run the template identically
+    container 'community.wave.seqera.io/library/seaborn:0.13.2--ef0811a05c6fcc75'
+
+    input:
+        tuple val(meta), path(bedfile1), path(bedfile2)  // the template labels bedfile1 'total' and bedfile2 'polya'
+
+    output:
+        path("*_mqc.png"), emit: plots  // violin plot image saved by the template
+        path("versions.yml"), emit: versions  // python, pandas and seaborn versions written by the template
+
+    script:
+        template "create_plots.py"
+}
diff --git a/modules/local/benchmarking/location_plots/templates/create_plots.py b/modules/local/benchmarking/location_plots/templates/create_plots.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+import csv
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+from matplotlib.lines import Line2D
+import platform
+
+input_bed_file_1 = '$bedfile1'  # read below under the 'total' label
+input_bed_file_2 = '$bedfile2'  # read below under the 'polya' label
+
+# Read input files
+def read_bed_file(file_path, label):
+    data = {'chromosome': [], 'start': [], 'strand': [], 'file_label': []}
+    with open(file_path, 'r') as file:
+        reader = csv.reader(file, delimiter='\\t')
+        for row in reader:
+            data['chromosome'].append(row[0])
+            data['start'].append(int(row[1]))
+            data['strand'].append(row[3])
+            data['file_label'].append(label.lower())
+    return data
+
+def combine_data(data1, data2):
+    for key in data1:
+        data1[key].extend(data2[key])
+    return data1
+
+data1 = read_bed_file(input_bed_file_1, 'total')
+data2 = read_bed_file(input_bed_file_2, 'polya')
+
+# Combine the two datasets (combine_data extends data1's lists in place and returns data1)
+combined_data = combine_data(data1, data2)
+
+# Create a DataFrame with one row per BED record and display-ready column names
+df = pd.DataFrame({
+    'Chromosome': combined_data['chromosome'],
+    'Start Location': combined_data['start'],
+    'Strand': combined_data['strand'],
+    'File Label': combined_data['file_label']
+})
+
+# Sort DataFrame to ensure consistent plotting order
+df.sort_values(by=['File Label', 'Strand'], inplace=True)
+
+# Plotting: a single shared axes receives the violins of both input files
+fig, ax = plt.subplots(figsize=(12, 6))
+
+palette = {  # keyed "<file label> <strand>"; the labels are the lower-cased ones set by read_bed_file
+    "total +": "red",
+    "total -": "lightcoral",
+    "polya +": "blue",
+    "polya -": "lightblue"
+}
+
+# Draw violins: one split layer per file label on the same axes, with the two strands as the halves
+for file_label in df['File Label'].unique():
+    sns.violinplot(
+        x="Chromosome",
+        y="Start Location",
+        hue="Strand",
+        data=df[df['File Label'] == file_label],
+        palette={"+" : palette[f"{file_label} +"], "-" : palette[f"{file_label} -"]},
+        split=True,
+        ax=ax,
+        scale="count",
+        scale_hue=False,
+        saturation=0.75,
+        inner=None
+    )
+
+# Set transparency for all violins (presumably so the overlaid layers stay visible through each other)
+for violin in ax.collections:
+    violin.set_alpha(0.25)
+
+# Legend: hand-built proxy lines, one per file label / strand pair, in the matching palette colour
+custom_lines = [
+    Line2D([0], [0], color=palette[f"{file} {strand}"], lw=4, alpha=0.25)
+    for file in df['File Label'].unique()
+    for strand in ["+", "-"]
+]
+ax.legend(
+    custom_lines,
+    [f"{file} : {strand}" for file in df['File Label'].unique() for strand in ["+", "-"]],
+    title="File : Strand"
+)
+
+plt.title('Start Locations of circRNA by Chromosome and Strand')
+
+plot_file_name = f"{input_bed_file_1.replace('.bed','')}_{input_bed_file_2.replace('.bed','')}_mqc.png"
+
+# Save the plot; the name joins both input names (every '.bed' removed) and ends in _mqc.png
+plt.savefig(plot_file_name, bbox_inches='tight')
+
+#version capture
+def format_yaml_like(data: dict, indent: int = 0) -> str:
+    """Formats a dictionary to a YAML-like string.
+
+    Args:
+        data (dict): The dictionary to format.
+        indent (int): The current indentation level.
+
+    Returns:
+        str: A string formatted as YAML.
+    """
+    yaml_str = ""
+    for key, value in data.items():
+        spaces = "  " * indent
+        if isinstance(value, dict):
+            yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
+        else:
+            yaml_str += f"{spaces}{key}: {value}\\n"
+    return yaml_str
+
+versions = {
+    "${task.process}" : {
+        "python": platform.python_version(),
+        "pandas": pd.__version__,
+        "seaborn": sns.__version__
+    }
+}
+with open("versions.yml", "w") as f:
+    f.write(format_yaml_like(versions))
diff --git a/modules/local/benchmarking/multiqc/main.nf b/modules/local/benchmarking/multiqc/main.nf
@@ -0,0 +1,18 @@
+process BENCHMARKING_MULTIQC {  // runs templates/benchmarking.py to turn a metrics table into *_mqc.json report sections
+    label "process_single"
+
+    conda "bioconda::pandas=1.5.2"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/pandas:1.5.2' :
+        'biocontainers/pandas:1.5.2' }"
+
+    input:
+    path(jaccard)  // tab-separated table; the template uses the first column as index and every other column as a metric
+
+    output:
+    path("*_mqc.json")  , emit: report  // one benchmarking_<metric>_mqc.json per metric column
+    path("versions.yml"), emit: versions  // python and pandas versions written by the template
+
+    script:
+    template "benchmarking.py"
+}
diff --git a/modules/local/benchmarking/multiqc/templates/benchmarking.py b/modules/local/benchmarking/multiqc/templates/benchmarking.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+import json
+import platform
+
+def format_yaml_like(data: dict, indent: int = 0) -> str:
+    """Formats a dictionary to a YAML-like string.
+
+    Args:
+        data (dict): The dictionary to format.
+        indent (int): The current indentation level.
+
+    Returns:
+        str: A string formatted as YAML.
+    """
+    yaml_str = ""
+    for key, value in data.items():
+        spaces = "  " * indent
+        if isinstance(value, dict):
+            yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
+        else:
+            yaml_str += f"{spaces}{key}: {value}\\n"
+    return yaml_str
+
+df = pd.read_csv("$jaccard", sep='\\t', index_col=0)
+
+for metric in df.columns:
+    data = {
+        "id": f"benchmarking_{metric}",
+        "parent_id": "benchmarking",
+        "parent_name": "Benchmarking",
+        "parent_description": "Benchmarking of the tools",
+        "section_name": metric.capitalize(),
+        "description": f"{metric.capitalize()} values of the tools",
+        "plot_type": "bargraph",
+        "data": df[[metric]].T.to_dict()
+    }
+
+    with open(f"benchmarking_{metric}_mqc.json", "w") as f:
+        json.dump(data, f, indent=4)
+
+versions = {
+    "${task.process}" : {
+        "python": platform.python_version(),
+        "pandas": pd.__version__
+    }
+}
+with open("versions.yml", "w") as f:
+    f.write(format_yaml_like(versions))
diff --git a/modules/local/benchmarking/overlap_plot/main.nf b/modules/local/benchmarking/overlap_plot/main.nf
@@ -0,0 +1,17 @@
+process OVERLAP_PLOT {  // runs this module's templates/create_plots.py on one BED file (template not shown in this chunk)
+    tag "$meta.id"
+    label "process_single"
+
+    conda "conda-forge::seaborn=0.13.2"  // same seaborn release as the container image below, so both environments run the template identically
+    container 'community.wave.seqera.io/library/seaborn:0.13.2--ef0811a05c6fcc75'
+
+    input:
+        tuple val(meta), path(bed)
+
+    output:
+        path("*_mqc.png") , emit: plots  // any PNG the template writes with the _mqc suffix
+        path("versions.yml"), emit: versions
+
+    script:
+        template "create_plots.py"
+}