diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_protocol.sh b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_protocol.sh old mode 100644 new mode 100755 diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_summary.sh b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_summary.sh new file mode 100755 index 00000000..96cae508 --- /dev/null +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/bin/generate_summary.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +sample_IDs_file=${1} +host=${2} +input_dir=${3} +output_file=${4} + +# starting output file +printf "Sample_ID\tTotal_fragments_before\tTotal_fragments_after\tPercent_${host}_reads_removed\n" > ${output_file} + +# looping through all input files and generating columns for final table +for sample in $(cat ${sample_IDs_file}) +do + cat ${input_dir}/${sample}-removal-info.tmp >> ${output_file} +done \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/envs/kraken2.yaml b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/envs/kraken2.yaml index 43312292..2d04cb8f 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/envs/kraken2.yaml +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/envs/kraken2.yaml @@ -1,6 +1,5 @@ channels: - conda-forge - bioconda - - defaults dependencies: - kraken2=2.1.6 diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/main.nf b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/main.nf index da11473f..85501a16 100644 --- 
a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/main.nf +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/main.nf @@ -1,31 +1,29 @@ // main.nf nextflow.enable.dsl=2 -// Terminal text color definitions -c_back_bright_red = "\u001b[41;1m"; -c_reset = "\033[0m"; +include { paramsHelp } from 'plugin/nf-schema' +include { validateParameters } from 'plugin/nf-schema' -include { KRAKEN2_DB } from './modules/kraken2_db.nf' -include { KRAKEN_2 } from './modules/kraken2.nf' -include { SUMMARY } from './modules/summary.nf' +include { KRAKEN2_DB } from './modules/kraken2_db.nf' +include { KRAKEN_2 } from './modules/kraken2.nf' +include { SUMMARY } from './modules/summary.nf' +include { COMPILE_SUMMARY } from './modules/summary.nf' -include { SOFTWARE_VERSIONS } from './modules/utils.nf' -include { GENERATE_PROTOCOL } from './modules/generate_protocol.nf' +include { SOFTWARE_VERSIONS } from './modules/utils.nf' +include { GENERATE_PROTOCOL } from './modules/generate_protocol.nf' workflow { - if (!params.ref_dbs_Dir){ - error("""${c_back_bright_red}INPUT ERROR! - Please supply the path to the directory storing kraken2 reference databases - by passing --ref_dbs_Dir. 
- ${c_reset}""") - } + main: + + // check input parameters + validateParameters() // Capture software versions - software_versions_ch = Channel.empty() + software_versions_ch = channel.empty() // Get host info - host_info = Channel + host_info = channel .fromPath(params.hosts_table) .splitCsv(header:true) .filter { row -> row.name.toLowerCase() == params.host.toLowerCase() } // match host @@ -38,19 +36,18 @@ workflow { // Check if kraken2 database already exists or needs to be built def host_id = params.host.replaceAll(' ', '_').toLowerCase() - def host_db = file("${params.ref_dbs_Dir}/kraken2-${host_id}-db") + def host_db = file("${params.ref_dbs_dir}/kraken2-${host_id}-db") def db_exists = host_db.exists() - if (db_exists) { - database_ch = Channel.value(host_db) - } + if (db_exists) + database_ch = channel.value(host_db) else { - build_ch = host_info.map { name, hostID, species, refseq, genome, fasta -> tuple(name, hostID, fasta) } - KRAKEN2_DB(build_ch) + build_ch = host_info.map { name, hostID, species, refseq, genome, fasta -> tuple(host_id, null, fasta) } + KRAKEN2_DB(build_ch, params.ref_dbs_dir) - database_ch = KRAKEN2_DB.out.first() + database_ch = KRAKEN2_DB.out.krakendb_dir.first() } - Channel + channel .fromPath(params.sample_id_list) .splitText() .map { it.trim() } @@ -66,31 +63,19 @@ workflow { } .set {generated_reads_ch} - KRAKEN_2(database_ch, generated_reads_ch) + KRAKEN_2(database_ch, generated_reads_ch, params.out_suffix) KRAKEN_2.out.version | mix(software_versions_ch) | set{software_versions_ch} - // Generate summary and compile one file + // Generate summary and compile into one file SUMMARY(KRAKEN_2.out.output, KRAKEN_2.out.report) - SUMMARY.out - .collect() - .subscribe { summary_files -> - def outfile = file("${params.outdir}/results/Host-read-removal-summary.tsv") - def header = "Sample_ID\tTotal_fragments_before\tTotal_fragments_after\tPercent_host_reads_removed\n" - outfile.text = header + summary_files.collect { it.text }.join() - - // summary.tmp cleanup - summary_files.each { f -> - def 
tmpFile = f.toFile() - tmpFile.delete() - } - } + COMPILE_SUMMARY(SUMMARY.out.collect(), channel.fromPath(params.sample_id_list), params.host) // Software Version Capturing - combining all captured software versions nf_version = "Nextflow Version ".concat("${nextflow.version}") - nextflow_version_ch = Channel.value(nf_version) + nextflow_version_ch = channel.value(nf_version) // Write software versions to file - software_versions_ch | map { it.text.strip() } + software_versions_ch | map { it -> it.text.strip() } | unique | mix(nextflow_version_ch) | collectFile({it -> it}, newLine: true, cache: false) @@ -99,6 +84,41 @@ workflow { // Protocol always needs name, refseq ID, and genome build protocol_ch = host_info.map { name, hostID, species, refseq, genome, fasta -> tuple(name, refseq, genome) } - GENERATE_PROTOCOL(protocol_ch, SOFTWARE_VERSIONS.out) + def protocol = host_db.resolve('read-removal-protocol-text.txt') + protocol_out = GENERATE_PROTOCOL(protocol_ch, SOFTWARE_VERSIONS.out, channel.value(protocol)) + + publish: + protocol_out = protocol_out + software_versions = SOFTWARE_VERSIONS.out + fastq_out = KRAKEN_2.out.host_removed + kraken2_out = KRAKEN_2.out.output + kraken2_report = KRAKEN_2.out.report + summary_stats = COMPILE_SUMMARY.out.summary_file + +} + +output { + protocol_out { + path "processing_info" + } + software_versions { + path "processing_info" + } + + fastq_out { + path "${params.reads_outdir}" + } + + kraken2_out { + path "results/kraken2-output" + } + + kraken2_report { + path "results/kraken2-output" + } + + summary_stats { + path "results" + } } \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/generate_protocol.nf b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/generate_protocol.nf index 090043e9..23273bf0 100644 --- 
a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/generate_protocol.nf +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/generate_protocol.nf @@ -1,17 +1,22 @@ process GENERATE_PROTOCOL { beforeScript "chmod +x ${projectDir}/bin/*" - tag "Generating your analysis protocol..." - publishDir "${params.outdir}/processing_info" + tag "Generating analysis protocol text..." input: tuple val(host), val(refSeq_ID), val(genome) - path(software_versions) + path software_versions + path protocol output: path("protocol.txt") script: + if (protocol.exists()) + """ + cp ${protocol} protocol.txt + """ + else """ generate_protocol.sh ${software_versions} ${host} "${refSeq_ID}" ${genome} > protocol.txt """ diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2.nf b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2.nf index eb13355d..1585c6d5 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2.nf +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2.nf @@ -1,38 +1,50 @@ process KRAKEN_2 { tag "${meta.id}" - publishDir "${params.reads_outdir}", pattern: '*.fastq.gz' - publishDir "${params.outdir}/results/kraken2-output", pattern: '*.{txt,tsv}' input: path database tuple val(meta), path(reads) + val out_suffix output: - path "${meta.id}*_HRremoved_*.gz", emit: host_removed - path "${meta.id}-kraken2-output.txt", emit: output - path "${meta.id}-kraken2-report.tsv", emit: report + path "${meta.id}*${out_suffix}.fastq.gz", emit: host_removed + path("${meta.id}-kraken2-output.txt"), emit: output + path("${meta.id}-kraken2-report.tsv"), emit: report path("versions.txt"), emit: version script: - def pe_flag = meta.paired_end ? 
"--paired" : "" - def input = meta.paired_end ? "${reads[0]} ${reads[1]}" : "${reads}" - def output = meta.paired_end ? "${meta.id}_R#.fastq" : "${meta.id}.fastq" - """ - kraken2 --db $database --gzip-compressed \ - --threads ${task.cpus} --use-names ${pe_flag} \ + if (meta.paired_end) + """ + kraken2 --db $database --gzip-compressed \ + --threads ${task.cpus} --use-names --paired \ --output ${meta.id}-kraken2-output.txt \ --report ${meta.id}-kraken2-report.tsv \ - --unclassified-out ${output} \ - ${input} + --unclassified-out ${meta.id}_R#.fastq \ + ${reads[0]} ${reads[1]} - # Compress intermediate FASTQ files - gzip ${input} + # Rename and compress files to final output names + mv "${meta.id}_R_1.fastq" "${meta.id}_R1${out_suffix}.fastq" && \ + gzip ${meta.id}_R1${out_suffix}.fastq + + mv "${meta.id}_R_2.fastq" "${meta.id}_R2${out_suffix}.fastq" && \ + gzip ${meta.id}_R2${out_suffix}.fastq - # Rename compressed files to final output names - mv ${meta.id}_R_1.fastq ${meta.id}${params.R1_out_suffix} - mv ${meta.id}_R_2.fastq ${meta.id}${params.R2_out_suffix} + echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt + """ + else + """ + kraken2 --db $database --gzip-compressed \ + --threads ${task.cpus} --use-names \ + --output ${meta.id}-kraken2-output.txt \ + --report ${meta.id}-kraken2-report.tsv \ + --unclassified-out ${meta.id}.fastq \ + ${reads} - echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt - """ + # Rename and compress files to final output names + test -f "${meta.id}.fastq" && mv "${meta.id}.fastq" "${meta.id}${out_suffix}.fastq" && \ + gzip ${meta.id}${out_suffix}.fastq + + echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt + """ } \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2_db.nf 
b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2_db.nf index d97e3f4f..6033893d 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2_db.nf +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/kraken2_db.nf @@ -1,26 +1,70 @@ process KRAKEN2_DB { - tag "Downloading host reads database to ${params.ref_dbs_Dir}" - publishDir "${params.ref_dbs_Dir}", mode: 'copy' + tag "Creating host reads database in ${ref_dbs_dir}" + publishDir "${ref_dbs_dir}", mode: 'copy' input: - tuple val(host), val(host_id), val(fasta_url) + tuple val(host_name), val(host_url) , path(host_fasta) + path ref_dbs_dir output: - path "kraken2-${host_id}-db/" + path("kraken2-${host_name}-db/"), emit: krakendb_dir + path("versions.txt"), emit: version script: - """ - k2 download-taxonomy --db kraken2-${host_id}-db/ + if (host_url != null) + """ + echo "Downloading and unpacking database from ${host_url}" + wget -O ${host_name}.tar.gz --timeout=3600 --tries=0 --continue ${host_url} - # Download FASTA file and uncompress it - wget -q ${fasta_url} -O host_assembly.fasta.gz - gunzip -c host_assembly.fasta.gz > host_assembly.fasta + mkdir kraken2-${host_name}-db/ && tar -zxvf ${host_name}.tar.gz -C kraken2-${host_name}-db/ - kraken2-build --add-to-library host_assembly.fasta --db kraken2-${host_id}-db/ --threads ${task.cpus} --no-masking + # Cleaning up + [ -f ${host_name}.tar.gz ] && rm -rf ${host_name}.tar.gz - kraken2-build --build --db kraken2-${host_id}-db/ --threads ${task.cpus} + echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt + """ + else if (host_fasta != null) + """ + echo "Attempting to build a custom ${host_name} reference database from ${host_fasta}" - kraken2-build --clean --db kraken2-${host_id}-db/ + # install taxonomy + k2 download-taxonomy --db kraken2-${host_name}-db/ + + # add sequence to database's 
genomic library + k2 add-to-library --db kraken2-${host_name}-db/ --threads ${task.cpus} \ + --files ${host_fasta} --no-masking + + # build the kraken2 database + k2 build --db kraken2-${host_name}-db/ --threads ${task.cpus} \ + --kmer-len 35 --minimizer-len 31 + + # remove intermediate files + k2 clean --db kraken2-${host_name}-db/ + + echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt + """ + else if (host_name != null) + """ + echo "Download and build kraken reference for named host: ${host_name}" + + # download genomic sequences + k2 download-library --db kraken2-${host_name}-db/ --threads ${task.cpus} \ + --library ${host_name} --no-masking + + # install taxonomy + k2 download-taxonomy --db kraken2-${host_name}-db/ + + # build the kraken2 database + k2 build --db kraken2-${host_name}-db/ --threads ${task.cpus} \ + --kmer-len 35 --minimizer-len 31 + + # remove intermediate files + k2 clean --db kraken2-${host_name}-db/ - """ + echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt + """ + else + error "Input error, host_name, host_url, and host_fasta are all set to null. 
Please supply at least one valid parameter for database creation" + + } \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/summary.nf b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/summary.nf index 749c273e..53f7c254 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/summary.nf +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/summary.nf @@ -1,14 +1,13 @@ process SUMMARY { tag "${kraken_output.simpleName.replaceFirst(/-kraken2-output$/, '')}" - publishDir "${params.outdir}/results/kraken2-output" input: path kraken_output path kraken_report output: - path "*-removal-info.tmp" + path "*-removal-info.tmp", emit: sample_stats script: """ @@ -18,6 +17,25 @@ process SUMMARY { fragments_retained=\$(grep -w -m 1 "unclassified" $kraken_report | cut -f 2) perc_removed=\$(printf "%.2f\\n" \$(echo "scale=4; 100 - \$fragments_retained / \$total_fragments * 100" | bc -l)) - echo -e "\$meta_id\\t\$total_fragments\\t\$fragments_retained\\t\$perc_removed\\n" > \$meta_id-removal-info.tmp + echo -e "\$meta_id\\t\$total_fragments\\t\$fragments_retained\\t\$perc_removed" > \$meta_id-removal-info.tmp + """ +} + +process COMPILE_SUMMARY { + + tag "Generating summary statistics..." 
+ beforeScript "chmod +x ${projectDir}/bin/*" + + input: + path summary_tmp_files + path sample_IDs_file + val host + + output: + path "${host}-read-removal-summary.tsv", emit: summary_file + + script: + """ + generate_summary.sh ${sample_IDs_file} ${host} ./ ${host}-read-removal-summary.tsv """ } \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/utils.nf b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/utils.nf index 1265c05e..816825d7 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/utils.nf +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/modules/utils.nf @@ -1,7 +1,6 @@ process SOFTWARE_VERSIONS { tag "Writing out software versions..." - publishDir "${params.outdir}/processing_info" input: path(software_versions) diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow.config b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow.config index 4ffca71d..3ffd2169 100644 --- a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow.config +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow.config @@ -1,3 +1,16 @@ +/* +---------------------------------------------------------------------------------------- + GeneLab Data Processing Remove Host Reads Workflow Nextflow config file +---------------------------------------------------------------------------------------- + Default config options for all compute environments +---------------------------------------------------------------------------------------- +*/ + +// Plugins +plugins { + id 'nf-schema@2.6.1' +} + params { is_single = false // Boolean to set if the reads are single-end 
sample_id_list = null // Path to Sample_ID list @@ -9,13 +22,13 @@ params { host = "human" hosts_table = "$projectDir/assets/hosts.csv" - ref_dbs_Dir = null // Path to kraken2 database (or where it will be downloaded to if it's not set up yet), required + ref_dbs_dir = null // Path to kraken2 database (or where it will be downloaded to if it's not set up yet), required - outdir = "${launchDir}" - reads_outdir = "${params.outdir}/${params.host}-removed-reads" // Output directory to hold -removed reads - R1_out_suffix = "_R1_HRremoved_raw.fastq.gz" - R2_out_suffix = "_R2_HRremoved_raw.fastq.gz" - single_out_suffix = "_HRremoved_raw.fastq.gz" + reads_outdir = "${params.host}-removed-reads" // Output directory to hold -removed reads + out_suffix = "_HRrm" + + publish_dir_mode = 'link' // Published outputs may be symlinks if using containerized environments + trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') } profiles { @@ -88,16 +101,27 @@ process { } } -def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') +workflow { + output { + mode = params.publish_dir_mode + } +} + timeline { enabled = true - file = "${params.outdir}/processing_info/execution_timeline_${trace_timestamp}.html" + file = "nextflow_info/execution_timeline_${params.trace_timestamp}.html" } report { enabled = true - file = "${params.outdir}/processing_info/execution_report_${trace_timestamp}.html" + file = "nextflow_info/execution_report_${params.trace_timestamp}.html" } trace { enabled = true - file = "${params.outdir}/processing_info/execution_trace_${trace_timestamp}.txt" + file = "nextflow_info/execution_trace_${params.trace_timestamp}.txt" +} + +validation { + help { + enabled = true + } } \ No newline at end of file diff --git a/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow_schema.json b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow_schema.json new file 
mode 100644 index 00000000..190627e5 --- /dev/null +++ b/Metagenomics/Remove_host_reads/Workflow_Documentation/NF_MGRemoveHostReads/workflow_code/nextflow_schema.json @@ -0,0 +1,75 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com///nextflow_schema.json", + "title": "NASA GeneLab Data Processing Host Read Removal workflow parameters", + "description": "GeneLab workflow for removing human reads in metagenomics sequencing data, as described in GL-DPPD-7105-B", + "type": "object", + "properties": { + "is_single": { + "type": "boolean", + "default": false, + "description": "Are the reads single-ended?" + }, + "sample_id_list": { + "type": "string", + "description": "list of unique sample IDs that identifies the files to process" + }, + "reads_dir": { + "type": "string", + "description": "Path to input reads" + }, + "R1_suffix": { + "type": "string", + "default": "_R1_raw.fastq.gz", + "description": "file suffix for mate1 reads" + }, + "R2_suffix": { + "type": "string", + "default": "_R2_raw.fastq.gz", + "description": "file suffix for mate2 reads" + }, + "single_suffix": { + "type": "string", + "default": "_raw.fastq.gz", + "description": "file suffix for single-ended reads" + }, + "host": { + "type": "string", + "default": "human", + "description": "simple name of host organism" + }, + "hosts_table": { + "type": "string", + "default": "${projectDir}/assets/hosts.csv", + "description": "comma-separated table of host information", + "hidden": true + }, + "ref_dbs_dir": { + "type": "string", + "description": "path to kraken2 databases" + }, + "reads_outdir": { + "type": "string", + "default": "${host}-removed-reads", + "description": "output folder for host-removed reads" + }, + "out_suffix": { + "type": "string", + "default": "_HRrm", + "description": "file suffix for all output read files" + }, + "publish_dir_mode": { + "type": "string", + "default": "link", + "description": "Nextflow workflow publish mode", 
+ "enum": ["copy", "copyNoFollow", "link", "move", "relink", "symlink"], + "hidden": true + }, + "trace_timestamp": { + "type": "string", + "default": "2026-02-13_09-22-42", + "hidden": true + } + }, + "required": ["ref_dbs_dir", "sample_id_list", "reads_dir"] +}