Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
#
# Compile the per-sample host-read-removal summaries into one TSV table.
#
# Usage: <script> <sample_IDs_file> <host> <input_dir> <output_file>
#   sample_IDs_file  text file listing one sample ID per line
#   host             host name, embedded in the last column header
#   input_dir        directory holding the <sample>-removal-info.tmp files
#   output_file      table to create (overwritten if it already exists)

if [ "$#" -ne 4 ]; then
    printf 'Usage: %s <sample_IDs_file> <host> <input_dir> <output_file>\n' "${0}" >&2
    exit 1
fi

sample_IDs_file=${1}
host=${2}
input_dir=${3}
output_file=${4}

# starting output file
# (host is passed as a printf argument, not placed in the format string, so a
#  '%' or backslash in the host name cannot corrupt the header)
printf 'Sample_ID\tTotal_fragments_before\tTotal_fragments_after\tPercent_%s_reads_removed\n' "${host}" > "${output_file}"

# looping through all input files and appending each sample's row to the table
# (read line by line rather than word-splitting $(cat ...), which would also
#  glob-expand IDs; the trailing test keeps a last line with no final newline)
while read -r sample || [ -n "${sample}" ]
do
    # skip blank lines in the sample list
    [ -z "${sample}" ] && continue

    cat "${input_dir}/${sample}-removal-info.tmp" >> "${output_file}"
done < "${sample_IDs_file}"
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Conda environment specification: package channels, then the packages to install.
channels:
- conda-forge
- bioconda
- defaults
dependencies:
# kraken2 is pinned to an exact version.
- kraken2=2.1.6
Original file line number Diff line number Diff line change
@@ -1,31 +1,29 @@
// main.nf
nextflow.enable.dsl=2

// Terminal text color definitions
c_back_bright_red = "\u001b[41;1m";
c_reset = "\033[0m";
include { paramsHelp } from 'plugin/nf-schema'
include { validateParameters } from 'plugin/nf-schema'

include { KRAKEN2_DB } from './modules/kraken2_db.nf'
include { KRAKEN_2 } from './modules/kraken2.nf'
include { SUMMARY } from './modules/summary.nf'
include { KRAKEN2_DB } from './modules/kraken2_db.nf'
include { KRAKEN_2 } from './modules/kraken2.nf'
include { SUMMARY } from './modules/summary.nf'
include { COMPILE_SUMMARY } from './modules/summary.nf'

include { SOFTWARE_VERSIONS } from './modules/utils.nf'
include { GENERATE_PROTOCOL } from './modules/generate_protocol.nf'
include { SOFTWARE_VERSIONS } from './modules/utils.nf'
include { GENERATE_PROTOCOL } from './modules/generate_protocol.nf'

workflow {

if (!params.ref_dbs_Dir){
error("""${c_back_bright_red}INPUT ERROR!
Please supply the path to the directory storing kraken2 reference databases
by passing --ref_dbs_Dir.
${c_reset}""")
}
main:

// check input parameters
validateParameters()

// Capture software versions
software_versions_ch = Channel.empty()
software_versions_ch = channel.empty()

// Get host info
host_info = Channel
host_info = channel
.fromPath(params.hosts_table)
.splitCsv(header:true)
.filter { row -> row.name.toLowerCase() == params.host.toLowerCase() } // match host
Expand All @@ -38,19 +36,18 @@ workflow {

// Check if kraken2 database already exists or needs to be built
def host_id = params.host.replaceAll(' ', '_').toLowerCase()
def host_db = file("${params.ref_dbs_Dir}/kraken2-${host_id}-db")
def host_db = file("${params.ref_dbs_dir}/kraken2-${host_id}-db")
def db_exists = host_db.exists()

if (db_exists) {
database_ch = Channel.value(host_db)
}
if (db_exists)
database_ch = channel.value(host_db)
else {
build_ch = host_info.map { name, hostID, species, refseq, genome, fasta -> tuple(name, hostID, fasta) }
KRAKEN2_DB(build_ch)
build_ch = host_info.map { name, hostID, species, refseq, genome, fasta -> tuple(name, host_id, fasta) }
KRAKEN2_DB(build_ch, params.ref_dbs_dir)
database_ch = KRAKEN2_DB.out.first()
}

Channel
channel
.fromPath(params.sample_id_list)
.splitText()
.map { it.trim() }
Expand All @@ -66,31 +63,19 @@ workflow {
}
.set {generated_reads_ch}

KRAKEN_2(database_ch, generated_reads_ch)
KRAKEN_2(database_ch, generated_reads_ch, params.out_suffix)
KRAKEN_2.out.version | mix(software_versions_ch) | set{software_versions_ch}

// Generate summary and compile one file
// Generate summary and compile into one file
SUMMARY(KRAKEN_2.out.output, KRAKEN_2.out.report)
SUMMARY.out
.collect()
.subscribe { summary_files ->
def outfile = file("${params.outdir}/results/Host-read-removal-summary.tsv")
def header = "Sample_ID\tTotal_fragments_before\tTotal_fragments_after\tPercent_host_reads_removed\n"
outfile.text = header + summary_files.collect { it.text }.join()

// summary.tmp cleanup
summary_files.each { f ->
def tmpFile = f.toFile()
tmpFile.delete()
}
}
COMPILE_SUMMARY(SUMMARY.out.collect(), channel.fromPath(params.sample_id_list), params.host)

// Software Version Capturing - combining all captured software versions
nf_version = "Nextflow Version ".concat("${nextflow.version}")
nextflow_version_ch = Channel.value(nf_version)
nextflow_version_ch = channel.value(nf_version)

// Write software versions to file
software_versions_ch | map { it.text.strip() }
software_versions_ch | map { it -> it.text.strip() }
| unique
| mix(nextflow_version_ch)
| collectFile({it -> it}, newLine: true, cache: false)
Expand All @@ -99,6 +84,41 @@ workflow {
// Protocol always needs name, refseq ID, and genome build
protocol_ch = host_info.map { name, hostID, species, refseq, genome, fasta -> tuple(name, refseq, genome) }

GENERATE_PROTOCOL(protocol_ch, SOFTWARE_VERSIONS.out)
def protocol = host_db.resolve('read-removal-protocol-text.txt')
protocol_out = GENERATE_PROTOCOL(protocol_ch, SOFTWARE_VERSIONS.out, channel.value(protocol))

publish:
protocol_out = protocol_out
software_versions = SOFTWARE_VERSIONS.out
fastq_out = KRAKEN_2.out.host_removed
kraken2_out = KRAKEN_2.out.output
kraken2_report = KRAKEN_2.out.report
summary_stats = COMPILE_SUMMARY.out.summary_file

}

// Publish targets for the names assigned in the workflow's `publish:` section.
// Each entry's `path` names the destination directory for that output
// (presumably resolved against the workflow output directory — confirm
// against the Nextflow version in use).
output {
// Protocol text produced by GENERATE_PROTOCOL
protocol_out {
path "processing_info"
}

// Software versions file (SOFTWARE_VERSIONS.out)
software_versions {
path "processing_info"
}

// Host-removed reads (KRAKEN_2.out.host_removed); destination is user-configurable
fastq_out {
path "${params.reads_outdir}"
}

// Kraken2 per-read output (KRAKEN_2.out.output)
kraken2_out {
path "results/kraken2-output"
}

// Kraken2 report (KRAKEN_2.out.report); shares a directory with kraken2_out
kraken2_report {
path "results/kraken2-output"
}

// Compiled summary table (COMPILE_SUMMARY.out.summary_file)
summary_stats {
path "results"
}
}
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
process GENERATE_PROTOCOL {

beforeScript "chmod +x ${projectDir}/bin/*"
tag "Generating your analysis protocol..."
publishDir "${params.outdir}/processing_info"
tag "Generating analysis protocol text..."

input:
tuple val(host), val(refSeq_ID), val(genome)
path(software_versions)
path software_versions
path protocol

output:
path("protocol.txt")

script:
if (protocol.exists())
"""
cp ${protocol} protocol.txt
"""
else
"""
generate_protocol.sh ${software_versions} ${host} "${refSeq_ID}" ${genome} > protocol.txt
"""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,38 +1,50 @@
/*
 * KRAKEN_2
 *
 * Classifies a sample's reads against the host kraken2 database and keeps the
 * reads that kraken2 leaves unclassified as the "host removed" reads.
 *
 * Inputs:
 *   database    - kraken2 database directory
 *   meta, reads - sample metadata (meta.id, meta.paired_end) and its FASTQ file(s)
 *   out_suffix  - suffix appended to the output read file names, before .fastq.gz
 *
 * Outputs:
 *   host_removed - gzipped FASTQ file(s) of unclassified reads
 *   output       - kraken2 per-read classification output
 *   report       - kraken2 report
 *   version      - versions.txt holding the kraken2 version string
 *
 * Publishing is not configured here; the workflow-level `output {}` block
 * handles it.
 */
process KRAKEN_2 {

    tag "${meta.id}"

    input:
    path database
    tuple val(meta), path(reads)
    val out_suffix

    output:
    path "${meta.id}*${out_suffix}.fastq.gz", emit: host_removed
    path("${meta.id}-kraken2-output.txt"), emit: output
    path("${meta.id}-kraken2-report.tsv"), emit: report
    path("versions.txt"), emit: version

    script:
    if (meta.paired_end)
        """
        kraken2 --db $database --gzip-compressed \
            --threads ${task.cpus} --use-names --paired \
            --output ${meta.id}-kraken2-output.txt \
            --report ${meta.id}-kraken2-report.tsv \
            --unclassified-out ${meta.id}_R#.fastq \
            ${reads[0]} ${reads[1]}

        # Rename and compress files to final output names
        # (the '#' in --unclassified-out yields the _R_1 / _R_2 files renamed here)
        mv "${meta.id}_R_1.fastq" "${meta.id}_R1${out_suffix}.fastq" && \
            gzip ${meta.id}_R1${out_suffix}.fastq

        mv "${meta.id}_R_2.fastq" "${meta.id}_R2${out_suffix}.fastq" && \
            gzip ${meta.id}_R2${out_suffix}.fastq

        echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt
        """
    else
        """
        kraken2 --db $database --gzip-compressed \
            --threads ${task.cpus} --use-names \
            --output ${meta.id}-kraken2-output.txt \
            --report ${meta.id}-kraken2-report.tsv \
            --unclassified-out ${meta.id}.fastq \
            ${reads}

        # Rename and compress files to final output names
        test -f "${meta.id}.fastq" && mv "${meta.id}.fastq" "${meta.id}${out_suffix}.fastq" && \
            gzip ${meta.id}${out_suffix}.fastq

        echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt
        """
}
Original file line number Diff line number Diff line change
@@ -1,26 +1,70 @@
/*
 * KRAKEN2_DB
 *
 * Creates the kraken2 host database directory `kraken2-<host_name>-db/` by one
 * of three routes, tried in this order:
 *   1. host_url   - download a database tarball and unpack it
 *   2. host_fasta - build a custom database from the supplied FASTA file
 *   3. host_name  - download and build the kraken2 library with that name
 *
 * Inputs:
 *   host_name, host_url, host_fasta - host identifier and the optional sources above
 *   ref_dbs_dir                     - directory the finished database is published to
 *
 * Outputs:
 *   krakendb_dir - the created database directory
 *   version      - versions.txt holding the kraken2 version string
 */
process KRAKEN2_DB {
    tag "Creating host reads database in ${ref_dbs_dir}"
    publishDir "${ref_dbs_dir}", mode: 'copy'

    input:
    tuple val(host_name), val(host_url) , path(host_fasta)
    path ref_dbs_dir

    output:
    path("kraken2-${host_name}-db/"), emit: krakendb_dir
    path("versions.txt"), emit: version

    script:
    if (host_url != null)
        """
        echo "Downloading and unpacking database from ${host_url}"
        wget -O ${host_name}.tar.gz --timeout=3600 --tries=0 --continue ${host_url}

        # -f takes the archive name; -C selects the extraction directory
        mkdir kraken2-${host_name}-db/ && tar -zxvf ${host_name}.tar.gz -C kraken2-${host_name}-db/

        # Cleaning up
        [ -f ${host_name}.tar.gz ] && rm -rf ${host_name}.tar.gz

        echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt
        """
    else if (host_fasta != null)
        """
        echo "Attempting to build a custom ${host_name} reference database from ${host_fasta}"

        # install taxonomy
        k2 download-taxonomy --db kraken2-${host_name}-db/

        # add sequence to database's genomic library
        k2 add-to-library --db kraken2-${host_name}-db/ --threads ${task.cpus} \
            --files ${host_fasta} --no-masking

        # build the kraken2 database
        k2 build --db kraken2-${host_name}-db/ --threads ${task.cpus} \
            --kmer-len 35 --minimizer-len 31

        # remove intermediate files
        k2 clean --db kraken2-${host_name}-db/

        echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt
        """
    else if (host_name != null)
        """
        echo "Download and build kraken reference for named host: ${host_name}"

        # download genomic sequences
        k2 download-library --db kraken2-${host_name}-db/ --threads ${task.cpus} \
            --library ${host_name} --no-masking

        # install taxonomy
        k2 download-taxonomy --db kraken2-${host_name}-db/

        # build the kraken2 database
        k2 build --db kraken2-${host_name}-db/ --threads ${task.cpus} \
            --kmer-len 35 --minimizer-len 31

        # remove intermediate files
        k2 clean --db kraken2-${host_name}-db/

        echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt
        """
    else
        error "Input error, host_name, host_url, and host_fasta are all set to null. Please supply at least one valid parameter for database creation"
}
Loading