diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index ea27a584..43e2a189 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -21,7 +21,11 @@ }, // Add the IDs of extensions you want installed when the container is created. - "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] + "extensions": [ + "ms-python.python", + "ms-python.vscode-pylance", + "nf-core.nf-core-extensionpack" + ] } } } diff --git a/.nf-core.yml b/.nf-core.yml index d580fa84..f8f60b75 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -44,7 +44,6 @@ repository_type: pipeline template: name: bacterial-genomics/wf-paired-end-illumina-assembly - -update: - https://github.com/nf-core/modules.git: - nf-core: False +# update: +# https://github.com/nf-core/modules.git: +# nf-core: False diff --git a/CHANGELOG.md b/CHANGELOG.md index b1820d2e..4e63f546 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,65 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## v3.0.0 - November 15, 2024 + +### `Added` + +- Consistent metrics reported for each read cleaning step (@chrisgulvik) +- Added SeqFu for FastQ format validation (@chrisgulvik) +- Checksum (SHA-512) reporting of intermediate and output files (@chrisgulvik) +- Report full input paths for each sample (@chrisgulvik) +- For assembly depth reporting, added stdev depth metrics; added total paired+single mapped stats (@chrisgulvik) + +### `Changed` + +- Default uses SeqKit rather than SeqTk for downsampling (@chrisgulvik) +- Output structure and filenames revised (@chrisgulvik) +- For MLST, exclude by default all MLST databases with a \*\_ (> 1) suffix to ensure the original MLST database version is used for each taxon (e.g., excludes leptospira_2 and leptospira_3). This avoids inconsistent versions within a run, which would occasionally give one sample leptospira and a different sample leptospira_3, making it impossible to immediately compare between samples.
(@chrisgulvik) +- For MLST, store the novel allele FastA when one is detected (@chrisgulvik) +- Sample name in outputs and file content no longer contains assembler name (@chrisgulvik) +- Changed RDP output to exclude unnecessary data columns such as "Phylum\nphylum", "Genus\ngenus" (@chrisgulvik) +- Use both R1 and R2 and only Phred30 bases to estimate input bp for more accurate estimation of genome size (@chrisgulvik) +- Changed default to always store stats and FastA of discarded contigs during biopython filtering (@chrisgulvik) +- Output filenames within `pipeline_info/` changed to show month by name and include day of the week (@chrisgulvik) + +### `Fixed` + +- Order of operations in Trimmomatic process now ensures final output reads have minimum sequence length (default: 50 bp) (@chrisgulvik) +- Fixed issue with missing column header names in the .kraken_summary.tsv output files (@chrisgulvik) +- Fixed trailing tab character in Kraken1 and Kraken2 output TSV summaries, which made pandas XLSX conversion fail due to different column numbers in header and data (@chrisgulvik) +- Fixed RDP VERSION reporting bug by removing spaces (@chrisgulvik) + +### `Updated` + +- Coloring of workflow process now corresponds to tab color in XLSX output summary sheet (@chrisgulvik) +- Docker container version updates (@chrisgulvik) +- Updated descriptions of output files to cover newly created and renamed output files (@chrisgulvik) + +### `Deprecated` + +- Removed gene calling from QUAST output summary (@chrisgulvik) + +## v2.4.0 - August 28, 2024 + +### `Added` + +- Statistics files from the downsampling routine are all stored under their CleanedReads// (@chrisgulvik) +- FastQ outputs for individual steps are now saved (perhaps temporary and to a non-default option) (@chrisgulvik) +- SeqKit downsampling option (already had Seqtk) (@chrisgulvik) +- Use of a default seed value for both SeqKit and Seqtk subsampling (@chrisgulvik) + +### `Fixed` + +- RDP Classifier always failed once before succeeding on retry, so a higher RAM request label was given to make it succeed on the first attempt for speed (@chrisgulvik) +- RDP Classifier data output is now tab-delimited instead of space-delimited (@chrisgulvik) + +### `Updated` + +- TSV output data files have header names with no spaces; underscores replace them (@chrisgulvik) + +### `Deprecated` + ## v2.3.1 - August 23, 2024 ### `Added` diff --git a/CITATIONS.md b/CITATIONS.md index 8af44d65..fe9caab5 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,80 +10,133 @@ ## Pipeline tools -- [SPAdes](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3342519/) +- [AWK](https://www.thriftbooks.com/w/the-awk-programming-language_brian-w-kernighan_alfred-v-aho/254416/?resultid=8ae6bc1c-db73-4eb0-b297-9d28a08ee38f#edition=2350735&idiq=2920572) - > Bankevich A, Nurk S, Antipov D, et al. SPAdes: a new genome assembly algorithm and its applications to single-cell sequencing. J Comput Biol. May 2012;19(5):455-77. doi:10.1089/cmb.2012.0021 + > Aho AV, Kernighan BW, Weinberger PJ. The AWK programming language. 224 pp. Pearson. ISBN-13: 978-0201079814 -- [Trimmomatic](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4103590/) +- [Bakta](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8743544/) - > Bolger AM, Lohse M, Usadel B. Trimmomatic: a flexible trimmer for Illumina sequence data. Bioinformatics. Aug 1 2014;30(15):2114-20. doi:10.1093/bioinformatics/btu170 + > Schwengers O, Jelonek L, Dieckmann MA, Beyvers S, Blom J, Goesmann A.
Bakta: rapid and standardized annotation of bacterial genomes via alignment-free sequence identification. Microb Genom. Nov 2021;7(11):000685. doi: 10.1099/mgen.0.000685 + +- [BEDTools](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4213956/) + + > Quinlan AR. BEDTools: The Swiss-Army Tool for Genome Feature Analysis. Curr Protoc Bioinformatics. Sep 8 2014;47:11.12.1-34. doi:10.1002/0471250953.bi1112s47 - [BLAST+](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-10-421) > Camacho C, Coulouris G, Avagyan V, et al. BLAST+: architecture and applications. BMC Bioinformatics. 2009/12/15 2009;10(1):421. doi:10.1186/1471-2105-10-421 -- [GTDB-Tk](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9710552/) +- [BUSCO](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8476166/) - > Chaumeil PA, Mussig AJ, Hugenholtz P, Parks DH. GTDB-Tk v2: memory friendly classification with the genome taxonomy database. Bioinformatics. Nov 30 2022;38(23):5315-5316. doi:10.1093/bioinformatics/btac672 + > Manni M, Berkeley MR, Seppey M, Simão FA, Zdobnov EM. BUSCO Update: Novel and Streamlined Workflows along with Broader and Deeper Phylogenetic Coverage for Scoring of Eukaryotic, Prokaryotic, and Viral Genomes. Mol Biol Evol. Sep 27 2021;38(10):4647-4654. doi:10.1093/molbev/msab199 -- [BioPython](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2682512/) +- [BWA-MEM](https://arxiv.org/abs/1303.3997) - > Cock PJ, Antao T, Chang JT, et al. Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics. Jun 1 2009;25(11):1422-3. doi:10.1093/bioinformatics/btp163 + > Li H. Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. May 26 2013; arXiv:1303.3997. doi:10.48550/arXiv.1303.3997 -- [QUAST](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3624806/) +- [BioPython](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2682512/) - > Gurevich A, Saveliev V, Vyahhi N, Tesler G. QUAST: quality assessment tool for genome assemblies. Bioinformatics. Apr 15 2013;29(8):1072-5. doi:10.1093/bioinformatics/btt086 + > Cock PJ, Antao T, Chang JT, et al. Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics. Jun 1 2009;25(11):1422-3. doi:10.1093/bioinformatics/btp163 -- [PubMLST](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6192448/) +- [CAT](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6805573/) - > Jolley KA, Bray JE, Maiden MCJ. Open-access bacterial population genomics: BIGSdb software, the PubMLST.org website and their applications. Wellcome Open Res. 2018;3:124. doi:10.12688/wellcomeopenres.14826.1 + > von Meijenfeldt FAB, Arkhipova K, Cambuy DD, Coutinho FH, Dutilh BE. Robust taxonomic classification of uncharted microbial sequences and bins with CAT and BAT. Genome Biol. Oct 22 2019;20(1):217. doi: 10.1186/s13059-019-1817-x -- [RNAmmer](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1888812/) +- [CheckM2](https://pubmed.ncbi.nlm.nih.gov/37500759/) - > Lagesen K, Hallin P, Rødland EA, Staerfeldt HH, Rognes T, Ussery DW. RNAmmer: consistent and rapid annotation of ribosomal RNA genes. Nucleic Acids Res. 2007;35(9):3100-8. doi:10.1093/nar/gkm160 + > Chklovski A, Parks DH, Woodcroft BJ, Tyson GW. CheckM2: a rapid, scalable and accurate tool for assessing microbial genome quality using machine learning. Nat Methods. Aug 2023;20(8):1203-1212. 
doi: 10.1038/s41592-023-01940-w -- [SAMtools](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2723002/) +- [Fastp](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6129281/) - > Li H, Handsaker B, Wysoker A, et al. The Sequence Alignment/Map format and SAMtools. Bioinformatics. Aug 15 2009;25(16):2078-9. doi:10.1093/bioinformatics/btp352 + > Chen S, Zhou Y, Chen Y, Gu J. fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics. Sep 1 2018;34(17):i884-i890. doi: 10.1093/bioinformatics/bty560 -- [BWA-MEM](https://arxiv.org/abs/1303.3997) +- [Fastp latest](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10989850/) - > Li H. Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. May 26 2013; arXiv:1303.3997. doi:10.48550/arXiv.1303.3997 + > Chen S. Ultrafast one-pass FASTQ data preprocessing, quality control, and deduplication using fastp. Imeta. May 8 2023;2(2):e107. doi: 10.1002/imt2.107 - [FLASH](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3198573/) > Magoč T, Salzberg SL. FLASH: fast length adjustment of short reads to improve genome assemblies. Bioinformatics. Nov 1 2011;27(21):2957-63. doi:10.1093/bioinformatics/btr507 -- [BUSCO](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8476166/) +- [GTDB-Tk](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9710552/) - > Manni M, Berkeley MR, Seppey M, Simão FA, Zdobnov EM. BUSCO Update: Novel and Streamlined Workflows along with Broader and Deeper Phylogenetic Coverage for Scoring of Eukaryotic, Prokaryotic, and Viral Genomes. Mol Biol Evol. Sep 27 2021;38(10):4647-4654. doi:10.1093/molbev/msab199 + > Chaumeil PA, Mussig AJ, Hugenholtz P, Parks DH. GTDB-Tk v2: memory friendly classification with the genome taxonomy database. Bioinformatics. Nov 30 2022;38(23):5315-5316. doi:10.1093/bioinformatics/btac672 -- [BEDTools](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4213956/) +- [Hostile](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10749771/) - > Quinlan AR. BEDTools: The Swiss-Army Tool for Genome Feature Analysis. Curr Protoc Bioinformatics. Sep 8 2014;47:11.12.1-34. doi:10.1002/0471250953.bi1112s47 + > Constantinides B, Hunt M, Crook DW. Hostile: accurate decontamination of microbial host sequences. Dec 1 2023;39(12):btad728. doi: 10.1093/bioinformatics/btad728 + +- [Kraken](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4053813/) + + > Wood DE, Salzberg SL. Kraken: ultrafast metagenomic sequence classification using exact alignments. Genome Biology. 2014/03/03 2014;15(3):R46. doi:10.1186/gb-2014-15-3-r46 + +- [Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) + + > Wood DE, Lu J, Langmead B. Improved metagenomic analysis with Kraken 2. Genome Biology. 2019/11/28 2019;20(1):257. doi:10.1186/s13059-019-1891-0 - [mlst](https://github.com/tseemann/mlst) > Seeman T. mlst. Github: +- [Pilon](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4237348/) + + > Walker BJ, Abeel T, Shea T, et al. Pilon: An Integrated Tool for Comprehensive Microbial Variant Detection and Genome Assembly Improvement. PLOS ONE. 2014;9(11):e112963. doi:10.1371/journal.pone.0112963 + +- [PubMLST](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6192448/) + + > Jolley KA, Bray JE, Maiden MCJ. Open-access bacterial population genomics: BIGSdb software, the PubMLST.org website and their applications. Wellcome Open Res. 2018;3:124. doi:10.12688/wellcomeopenres.14826.1 + - [Prokka](https://academic.oup.com/bioinformatics/article/30/14/2068/2390517) > Seemann T. Prokka: rapid prokaryotic genome annotation. Bioinformatics. 
Jul 15 2014;30(14):2068-9. doi:10.1093/bioinformatics/btu153 +- [QUAST](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3624806/) + + > Gurevich A, Saveliev V, Vyahhi N, Tesler G. QUAST: quality assessment tool for genome assemblies. Bioinformatics. Apr 15 2013;29(8):1072-5. doi:10.1093/bioinformatics/btt086 + +- [QUAST latest](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6022658/) + + > Mikheenko A, Prjibelski A, Saveliev V, Antipov D, Gurevich A. Versatile genome assembly evaluation with QUAST-LG. Jul 1 2018;34(13):i142-i150. doi: 10.1093/bioinformatics/bty266 + +- [RDP Classifier](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11008197/) + + > Wang Q, Cole JR. Updated RDP taxonomy and RDP Classifier for more accurate taxonomic classification. Microbiol Resour Announc. Apr 11 2024;13(4):e0106323. doi: 10.1128/mra.01063-23 + +- [RNAmmer](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1888812/) + + > Lagesen K, Hallin P, Rødland EA, Staerfeldt HH, Rognes T, Ussery DW. RNAmmer: consistent and rapid annotation of ribosomal RNA genes. Nucleic Acids Res. 2007;35(9):3100-8. doi:10.1093/nar/gkm160 + +- [SAMtools](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2723002/) + + > Li H, Handsaker B, Wysoker A, et al. The Sequence Alignment/Map format and SAMtools. Bioinformatics. Aug 15 2009;25(16):2078-9. doi:10.1093/bioinformatics/btp352 + +- [SeqFu](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8148589/) + + > Telatin A, Fariselli P, Birolo G. SeqFu: A Suite of Utilities for the Robust and Reproducible Manipulation of Sequence Files. May 7 2021;8(5):59. doi: 10.3390/bioengineering8050059 + +- [SeqKit](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5051824/) + + > Shen W, Le S, Li Y, Hu F. SeqKit: A Cross-Platform and Ultrafast Toolkit for FASTA/Q File Manipulation. Oct 5 2016;11(10):e0163962. doi: 10.1371/journal.pone.0163962 + +- [SeqKit latest](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11183193/) + + > Shen W, Sipos B, Zhao L. SeqKit2: A Swiss army knife for sequence and alignment processing. Imeta. Apr 5 2024;3(3):e191. doi: 10.1002/imt2.191 + - [SKESA](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1540-z) > Souvorov A, Agarwala R, Lipman DJ. SKESA: strategic k-mer extension for scrupulous assemblies. Genome Biology. 2018/10/04 2018;19(1):153. doi:10.1186/s13059-018-1540-z -- [Pilon](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4237348/) +- [SPAdes](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3342519/) - > Walker BJ, Abeel T, Shea T, et al. Pilon: An Integrated Tool for Comprehensive Microbial Variant Detection and Genome Assembly Improvement. PLOS ONE. 2014;9(11):e112963. doi:10.1371/journal.pone.0112963 + > Bankevich A, Nurk S, Antipov D, et al. SPAdes: a new genome assembly algorithm and its applications to single-cell sequencing. J Comput Biol. May 2012;19(5):455-77. doi:10.1089/cmb.2012.0021 -- [Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) +- [SPAdes latest](https://pubmed.ncbi.nlm.nih.gov/32559359/) - > Wood DE, Lu J, Langmead B. Improved metagenomic analysis with Kraken 2. Genome Biology. 2019/11/28 2019;20(1):257. doi:10.1186/s13059-019-1891-0 + > Prjibelski A, Antipov D, Meleshko D, Lapidus A, Korobeynikov A. Using SPAdes De Novo Assembler. Curr Protoc Bioinformatics. Jun 2020;70(1):e102. doi: 10.1002/cpbi.102 -- [Kraken](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4053813/) - > Wood DE, Salzberg SL. Kraken: ultrafast metagenomic sequence classification using exact alignments. Genome Biology. 2014/03/03 2014;15(3):R46. 
doi:10.1186/gb-2014-15-3-r46 +- [Trimmomatic](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4103590/) + + > Bolger AM, Lohse M, Usadel B. Trimmomatic: a flexible trimmer for Illumina sequence data. Bioinformatics. Aug 1 2014;30(15):2114-20. doi:10.1093/bioinformatics/btu170 ## Software packaging/containerisation tools diff --git a/README.md b/README.md index fb8856f5..9106ccbb 100644 --- a/README.md +++ b/README.md @@ -181,8 +181,7 @@ PhiX reference [NC_001422.1](https://www.ncbi.nlm.nih.gov/nuccore/NC_001422.1) c [Default: NaN] ``` -> [!NOTE] -> _If user does not specify inputs for parameters with a default set to `NaN`, these options will not be performed during workflow analysis._ +> [!NOTE] > _If user does not specify inputs for parameters with a default set to `NaN`, these options will not be performed during workflow analysis._ ### Additional parameters @@ -201,9 +200,9 @@ nextflow run \ The most well-tested and supported is a Univa Grid Engine (UGE) job scheduler with Singularity for dependency handling. 1. UGE/SGE - - Additional tips for UGE processing are [here](docs/HPC-UGE-scheduler.md). + - Additional tips for UGE processing are [here](docs/HPC-UGE-scheduler.md). 2. No Scheduler - - It has also been confirmed to work on desktop and laptop environments without a job scheduler using Docker with more tips [here](docs/local-device.md). + - It has also been confirmed to work on desktop and laptop environments without a job scheduler using Docker with more tips [here](docs/local-device.md). ## Output diff --git a/_run_assembly.uge-nextflow b/_run_assembly.uge-nextflow index 9ab5e1c2..eacc0935 100755 --- a/_run_assembly.uge-nextflow +++ b/_run_assembly.uge-nextflow @@ -4,47 +4,52 @@ SCRIPT_NAME="$(basename ${0#_} .uge-nextflow)" # Set profile # Get node number - <=230 = biolinux, >=231 = rosalind -NODE_NUM=$(echo ${HOSTNAME%%.*} | sed 's/node//1') +NODE_NUM="$(echo ${HOSTNAME%%.*} | sed 's/node//1')" if [[ ${NODE_NUM} -ge 231 ]]; then HPC='rosalind_hpc' else HPC='aspen_hpc' fi +time_stamp="$(date '+%Y-%b-%d_%a_%H-%M-%S')" + module load nextflow nextflow \ - -log ${OUT}/pipeline_info/nextflow_log.${SCRIPT_NAME}.txt \ + -log "${OUT}/pipeline_info/nextflow_log.${SCRIPT_NAME}.txt" \ run \ - ${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/main.nf \ - -profile ${HPC} \ - --input ${IN} \ - --outdir ${OUT} \ + "${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/main.nf" \ + -profile "${HPC}" \ + --input "${IN}" \ + --outdir "${OUT}" \ -ansi-log false \ - -N ${USER}@cdc.gov \ - -w ${OUT}/.work \ - --blast_db ${LAB_HOME}/.databases/ncbi \ - --kraken1_db ${LAB_HOME}/.databases/kraken1-db-v1.0.0 \ - --kraken2_db ${LAB_HOME}/.databases/kraken2 \ + -N "${USER}@cdc.gov" \ + -w "${OUT}/.work" \ + -with-dag "${OUT}/pipeline_info/dag.${time_stamp}.html" \ + --blast_db "${LAB_HOME}/.databases/ncbi" \ + --checkm2_db "${LAB_HOME}/.databases/checkm2" \ + --kraken1_db "${LAB_HOME}/.databases/kraken1-db-v1.0.0" \ + --kraken2_db "${LAB_HOME}/.databases/kraken2" \ + --sra_scrubber_db "${LAB_HOME}/.databases/sra-human-scrubber/data/human_filter.db" \ + --subsample_tool seqkit \ + --create_excel_outputs \ -resume # Check for errors and add to errors.tsv # Get nextflow run name -run_name=$(grep "Launching" ${OUT}/pipeline_info/ASM_*.o${SCRIPT_NAME} | cut -d '[' -f 2 | cut -d ']' -f 1) - -time_stamp=$(date '+%Y-%b-%d %a %H:%M:%S') +run_name=$(grep Launching "${OUT}"/pipeline_info/ASM_*.o"${SCRIPT_NAME}" | cut -d '[' -f 2 | cut -d ']' -f 1) # Read each line from nextflow log, find info, and add to errors.tsv
while read -r line; do # If process is already running, clean up error if [[ "${line}" =~ ^Unable[[:space:]]to[[:space:]]acquire[[:space:]]lock.* ]]; then error="You are trying to resume the execution of an already running pipeline." - ASM_OUT=$(realpath ${OUT}/pipeline_info/ASM_*.o*) - echo -e "-\t-\t${error}\t${ASM_OUT}\t${time_stamp}\t${run_name}" >> ${OUT}/pipeline_info/errors.tsv + ASM_OUT="$(realpath ${OUT}/pipeline_info/ASM_*.o*)" + echo -e "-\t-\t${error}\t${ASM_OUT}\t${time_stamp}\t${run_name}" >> "${OUT}/pipeline_info/errors.tsv" else # Workflow ran some processes - sample_name=$(grep "nf-" ${line}/.command.run | cut -d '(' -f 2 | cut -d ')' -f 1) - process=$(grep "NEXTFLOW TASK" ${line}/.command.run | awk -F ':' '{print $NF}' | cut -d ' ' -f 1) - error=$(tail -n 1 ${line}/.command.err | sed -e 's/\[[^][]*\] //g') + sample_name=$(grep "nf-" "${line}/.command.run" | cut -d '(' -f 2 | cut -d ')' -f 1) + process=$(grep 'NEXTFLOW TASK' "${line}/.command.run" | awk -F ':' '{print $NF}' | cut -d ' ' -f 1) + error=$(tail -n 1 "${line}/.command.err" | sed -e 's/\[[^][]*\] //g') # Kraken 2 places "Loading database information... done." in error log if [[ ${process} =~ .*READ_CLASSIFY_KRAKEN_TWO$ ]] \ @@ -55,13 +60,13 @@ while read -r line; do # BBDuk java errors if [[ ${process} =~ .*REMOVE_PHIX_BBDUK$ ]] \ && [[ "${error}" =~ .*at.* ]]; then - error=$(grep -A1 "java.lang" ${line}/.command.err | head -n 2 | tail -n 1) - if [[ ! ${error} ]]; then + error=$(grep -A1 'java.lang' "${line}/.command.err" | head -n 2 | tail -n 1) + if [[ ! "${error}" ]]; then continue elif [[ ${error} =~ ^Mismatch.* ]]; then - error=${error} + error="${error}" else - error=$(grep "java.lang" ${line}/.command.err | awk -F ': ' 'END {print $2}') + error=$(grep 'java.lang' "${line}/.command.err" | awk -F ': ' 'END {print $2}') fi elif [[ ${process} =~ .*REMOVE_PHIX_BBDUK$ ]] \ && [[ "${error}" =~ "Input is being processed as unpaired" ]]; then @@ -75,74 +80,74 @@ while read -r line; do # Check if error is from file checks if [[ ${error} =~ .+Check[[:space:]]failed$ ]]; then - get_previous_process_workdir=$(dirname $(grep "ln -s" ${line}/.command.run | grep "work" | awk 'END {print $(NF-1)}' )) - process=$(grep "nf-" ${get_previous_process_workdir}/.command.run | awk -F 'nf-' '{print $2}' | sed -e 's/_(.*//') + get_previous_process_workdir=$(dirname $(grep "ln -s" "${line}/.command.run" | grep "work" | awk 'END {print $(NF-1)}' )) + process=$(grep "nf-" "${get_previous_process_workdir}/.command.run" | awk -F 'nf-' '{print $2}' | sed -e 's/_(.*//') line="${get_previous_process_workdir}" fi # If process for sample retried and succeeded, ignore if [[ $(find "${OUT}/pipeline_info/process_logs/" -type f -name "${sample_name}.*${process}*.command.out") ]] \ - && [[ $(cat ${line}/.exitcode) = @(0|71|104|134|137|139|140|143|245|250|255) ]]; then + && [[ $(cat "${line}/.exitcode") = @(0|71|104|134|137|139|140|143|245|250|255) ]]; then continue else - echo -e "${sample_name}\t${process}\t${error}\t${line}\t${time_stamp}\t${run_name}" >> ${OUT}/pipeline_info/errors.tsv + echo -e "${sample_name}\t${process}\t${error}\t${line}\t${time_stamp}\t${run_name}" >> "${OUT}/pipeline_info/errors.tsv" fi fi -done < <(nextflow log ${run_name} -filter 'status == "FAILED"') +done < <(nextflow log "${run_name}" -filter 'status == "FAILED"') # Look for errors from process EXTRACT_16S_BIOPYTHON -biopython_rna_errors=( $(find ${OUT}/.work -type f -name ".command.err" -exec grep -l "ERROR: \['16S ribosomal RNA'\]" '{}' \;) ) -if [[ 
$biopython_rna_errors ]]; then - for line in ${biopython_rna_errors[@]}; do - work_dir=$(dirname ${line}) - error=$(tail -n 1 ${line} | sed -e 's/[][]//g' | awk -F '/' '{print $1 $NF}') - sample_name=$(grep "nf-" ${work_dir}/.command.run | cut -d '(' -f 2 | cut -d ')' -f 1) - process=$(grep "NEXTFLOW TASK" ${work_dir}/.command.run | awk -F ':' '{print $NF}' | cut -d ' ' -f 1) +biopython_rna_errors=( $(find "${OUT}/.work" -type f -name ".command.err" -exec grep -l "ERROR: \['16S ribosomal RNA'\]" '{}' \;) ) +if [[ "${biopython_rna_errors}" ]]; then + for line in "${biopython_rna_errors[@]}"; do + work_dir=$(dirname "${line}") + error=$(tail -n 1 "${line}" | sed -e 's/[][]//g' | awk -F '/' '{print $1 $NF}') + sample_name=$(grep "nf-" "${work_dir}/.command.run" | cut -d '(' -f 2 | cut -d ')' -f 1) + process=$(grep "NEXTFLOW TASK" "${work_dir}/.command.run" | awk -F ':' '{print $NF}' | cut -d ' ' -f 1) # Append to errors.tsv - echo -e "${sample_name}\t${process}\t${error}\t${work_dir}\t${time_stamp}\t${run_name}" >> ${OUT}/pipeline_info/errors.tsv + echo -e "${sample_name}\t${process}\t${error}\t${work_dir}\t${time_stamp}\t${run_name}" >> "${OUT}/pipeline_info/errors.tsv" done fi # Parse HPC stdout file for QC check failures QC_FAILURES=() while read -r line; do - QC_FAILURES+=("$line") -done < <(awk '/QC check failed/ {print $(NF-3), "("$NF")"}' ${OUT}/pipeline_info/ASM_*.o${SCRIPT_NAME}) + QC_FAILURES+=("${line}") +done < <(awk '/QC check failed/ {print $(NF-3), "("$NF")"}' "${OUT}"/pipeline_info/ASM_*.o"${SCRIPT_NAME}") -if [[ $QC_FAILURES ]]; then +if [[ "${QC_FAILURES}" ]]; then # Loop over each QC failure for f in "${QC_FAILURES[@]}"; do # Get work directory - short_wd=$(grep "$f" ${OUT}/pipeline_info/ASM_*.o${SCRIPT_NAME} | awk -F '[][]' '{print $2}') - wd_path=$(realpath ${OUT}/.work/${short_wd}*) + short_wd=$(grep "$f" "${OUT}"/pipeline_info/ASM_*.o"${SCRIPT_NAME}" | awk -F '[][]' '{print $2}') + wd_path=$(realpath "${OUT}/.work/${short_wd}"*) # Get first error - error=$(grep "ERROR" ${wd_path}/.command.err | head -n 1 | sed -e 's/\[[^][]*\] //g') + error=$(grep "ERROR" "${wd_path}/.command.err" | head -n 1 | sed -e 's/\[[^][]*\] //g') - process=$(grep "NEXTFLOW TASK" ${wd_path}/.command.run | awk -F ':' '{print $NF}' | cut -d ' ' -f 1) - sample_name=$(echo "$f" | awk -F "[()]" '{print $2}') + process=$(grep "NEXTFLOW TASK" "${wd_path}/.command.run" | awk -F ':' '{print $NF}' | cut -d ' ' -f 1) + sample_name=$(echo "${f}" | awk -F "[()]" '{print $2}') # Append to errors.tsv - echo -e "${sample_name}\t${process}\t${error}\t${wd_path}\t${time_stamp}\t${run_name}" >> ${OUT}/pipeline_info/errors.tsv + echo -e "${sample_name}\t${process}\t${error}\t${wd_path}\t${time_stamp}\t${run_name}" >> "${OUT}/pipeline_info/errors.tsv" done fi -# If errors.tsv found.. +# If errors.tsv found ...
if [[ -f "${OUT}/pipeline_info/errors.tsv" ]]; then # Add column headers - sed -i '1i Sample Name\tProcess\tError\tError Directory\tTimestamp\tRun Name' ${OUT}/pipeline_info/errors.tsv + sed -i '1i Sample Name\tProcess\tError\tError Directory\tTimestamp\tRun Name' "${OUT}/pipeline_info/errors.tsv" # Remove duplicate lines and lines that have an empty first column - awk -F '\t' '!_[$1,$2,$3,$6]++' ${OUT}/pipeline_info/errors.tsv \ - | awk -F '\t' '$1{print $0}' \ - > ${OUT}/pipeline_info/errors_new.tsv + awk -F $'\t' '!_[$1,$2,$3,$6]++' "${OUT}/pipeline_info/errors.tsv" \ + | awk -F $'\t' '$1{print $0}' \ + > "${OUT}/pipeline_info/errors_new.tsv" # Delete original errors.tsv and rename errors_new.tsv - rm ${OUT}/pipeline_info/errors.tsv + rm "${OUT}/pipeline_info/errors.tsv" - mv ${OUT}/pipeline_info/errors_new.tsv \ - ${OUT}/pipeline_info/errors.tsv + mv "${OUT}/pipeline_info/errors_new.tsv" \ + "${OUT}/pipeline_info/errors.tsv" fi # Count lines in Summary.GenomeCoverage.tsv diff --git a/bin/summarize_kraken.sh b/bin/summarize_kraken.sh index 595f9ba0..680ca0fb 100755 --- a/bin/summarize_kraken.sh +++ b/bin/summarize_kraken.sh @@ -1,28 +1,35 @@ #!/usr/bin/env bash summarize_kraken() { - #BASE=$(basename $1 _kraken.tsv) - #echo -en "$BASE\t" + # $1 = input tab-delimited kraken data file (e.g., "*.kraken2_output.tsv") from `kraken --report ` - # report unclassified - UNCL=($(grep $'\tU\t' ${1} | head -n1 | awk '{print $1,$2,$6}')) + # Initialize an array to collect all output sections + output=() + + # Report unclassified + UNCL=($(grep $'\tU\t' "${1}" | head -n1 | awk '{print $1,$2,$6}')) if [[ ${#UNCL[@]} -eq 3 ]]; then - tabline=$(echo ${UNCL[@]} | sed -E 's/ +/%\t/1' | sed -E 's/ +/\t/1') - echo -en "$tabline\t" + tabline=$(echo "${UNCL[@]}" | sed -E 's/ +/%\t/1' | sed -E 's/ +/\t/1') + output+=("$tabline") else - echo -en "0%\t0\tUnclassified\t" + output+=($'0%\t0\tUnclassified') fi # At most top 3 genera while read -r l; do tabline=$(echo "${l}" | sed -E 's/ +/%\t/1' | sed -E 's/ +/\t/1') - echo -en "$tabline\t" - done < <(grep $'\tG\t' ${1} | head -n3 | awk -F $'\t' '{print $1,$2,$6}') + output+=("$tabline") + done < <(grep $'\tG\t' "${1}" | head -n3 | awk -F $'\t' '{print $1,$2,$6}') # At most top 3 species while read -r l; do tabline=$(echo "${l}" | sed -E 's/ +/%\t/1' | sed -E 's/ +/\t/1') - echo -en "$tabline\t" - done < <(grep $'\tS\t' ${1} | head -n3 | awk -F $'\t' '{print $1,$2,$6}') - echo '' + output+=("$tabline") + done < <(grep $'\tS\t' "${1}" | head -n3 | awk -F $'\t' '{print $1,$2,$6}') + + # Join the array with tabs and print the final result + ( + IFS=$'\t' + echo "${output[*]}" + ) } diff --git a/bin/tsv_to_excel.py b/bin/tsv_to_excel.py new file mode 100755 index 00000000..40d17725 --- /dev/null +++ b/bin/tsv_to_excel.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 + +import glob +import os +import re +import sys +from argparse import ArgumentParser +import pandas as pd +from openpyxl import load_workbook +from openpyxl.styles import Color + + +def parseArgs(): + parser = ArgumentParser( + description="Create an Excel file from TSV files and optionally color sheet tabs.", + add_help=False, + epilog='NOTE: sheet tabs are named based on filenames with .tsv removed and Summary*.
removed') + req = parser.add_argument_group('Required') + req.add_argument('tsv_files', + nargs='+', + metavar='FILE', + type=str, + help='One or more TSV files (or wildcard pattern) to convert to single XLSX.') + opt = parser.add_argument_group('Optional') + opt.add_argument('-h', '--help', + action='help', + help='show this help message and exit') + opt.add_argument('-d', '--color-dict', + metavar='FILE', + type=str, + required=False, + help='Path to the color dictionary file. Each line must have Summary*. prefix and .tsv suffix stripped, and colors are RGB, e.g., "MLST=255, 255, 0" [None]') + opt.add_argument('-o', '--outfile', + required=False, + type=str, + metavar='FILE', + default='Summary-Report.xlsx', + help='Microsoft Excel XLSX formatted output file [Summary-Report.xlsx]') + opt.add_argument('-c', '--xlsx-property-category', + required=False, + type=str, + metavar='STR', + default='', + help='Output XLSX file property category [None]') + opt.add_argument('-s', '--xlsx-property-subject', + required=False, + type=str, + metavar='STR', + default='', + help='Output XLSX file property subject [None]') + opt.add_argument('-t', '--xlsx-property-title', + required=False, + type=str, + metavar='STR', + default='', + help='Output XLSX file property title [None]') + return parser.parse_args() + +def get_target_creation_time(file): + """Get the creation time of the target file, following symlinks.""" + target = os.path.realpath(file) + return os.path.getctime(target) + +def adjust_tsv_headers(tsv_file): + """Fix TSV files that have missing header column names and fill in missing data cells with empty.""" + with open(tsv_file, 'r') as f: + lines = f.readlines() + + header = lines[0].strip().split('\t') + data_rows = [line.strip().split('\t') for line in lines[1:]] + + max_cols = max(len(row) for row in data_rows) + + if len(header) < max_cols: + header.extend([header[-1]] * (max_cols - len(header))) + + for i, row in enumerate(data_rows): + if len(row) < max_cols: + data_rows[i].extend([''] * (max_cols - len(row))) + + data = pd.DataFrame(data_rows, columns=header) + return data + +def create_summary_workbook(output_file, tsv_file): + """Create an Excel workbook summary from a TSV file.""" + prefix_stripped = re.sub(r'^Summary.*?\.', '', tsv_file) + sheet_name = prefix_stripped.removesuffix(".tsv") + try: + data = adjust_tsv_headers(tsv_file) + data.to_excel(output_file, sheet_name=sheet_name, index=False) + except pd.errors.ParserError as e: + print(f"ERROR: Skipping {tsv_file} due to parsing error: {e}") + +def rgb_to_hex(rgb): + """Convert an RGB tuple to a hex string.""" + return ''.join(f'{x:02X}' for x in rgb) + +def load_color_dict(color_dict_file): + """Load the color dictionary from a text file.""" + color_dict = {} + with open(color_dict_file, 'r') as ifh: + for line in ifh: + line = line.strip() + + # Skip commented lines + if line.startswith('#') or line.startswith('//') or line.startswith(';'): + continue + + # The RGB color has to be after the equals sign + if '=' in line: + name, rgb_str = line.split('=', 1) + name = name.strip() + rgb_str = rgb_str.strip().strip('{}') + + print(f"Processing XLSX sheet line: name={name}, rgb_str={rgb_str}") + + try: + rgb = tuple(map(int, rgb_str.split(','))) + if len(rgb) == 3: + color_dict[name] = rgb_to_hex(rgb) + else: + print(f"Warning: RGB value for '{name}' lacks 3 components.") + except ValueError: + print(f"Error: Invalid RGB value '{rgb_str}' for '{name}'.") + + return color_dict + +def color_sheet_tabs(filename, color_dict_hex, 
title, category, subject): + """Color tabs in XLSX based on a color dictionary of names and color codes.""" + workbook = load_workbook(filename) + + workbook.properties.title = title + workbook.properties.category = category + workbook.properties.subject = subject + + for sheet_name in workbook.sheetnames: + if sheet_name in color_dict_hex: + hex_color = color_dict_hex[sheet_name] + color = Color(rgb=hex_color) + workbook[sheet_name].sheet_properties.tabColor = color + + workbook.save(filename) + +def main(): + # I/O Handling + opts = parseArgs() + xlsx_outfile = opts.outfile + + _, ext = os.path.splitext(xlsx_outfile) + if ext.lower() != ".xlsx": + sys.stderr.write("ERROR: The file extension must be '.xlsx'.\n") + sys.exit(1) + + # Ensure the directory exists + output_dir = os.path.dirname(os.path.realpath(xlsx_outfile)) + os.makedirs(output_dir, exist_ok=True) + + tsv_files = [] + for pattern in opts.tsv_files: + tsv_files.extend(glob.glob(pattern)) + + if not tsv_files: + print("No TSV files found.") + return + + # Sort TSV files by creation time (Follow symlink targets) + tsv_files.sort(key=get_target_creation_time) + + # Do the work, converting all TSVs into a single XLSX + with pd.ExcelWriter(xlsx_outfile, engine='openpyxl') as output_file: + for tsv_file in tsv_files: + create_summary_workbook(output_file, tsv_file) + + # Optionally color the XLSX tabs + if opts.color_dict is not None: + color_dict_hex = load_color_dict(opts.color_dict) + color_sheet_tabs(xlsx_outfile, color_dict_hex, opts.xlsx_property_title, opts.xlsx_property_category, opts.xlsx_property_subject) + +if __name__ == "__main__": + main() diff --git a/conf/modules.config b/conf/modules.config index 61ae89ab..3d773b2b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,17 +35,34 @@ process { /* * Local modules */ - withName: ANNOTATE_PROKKA { + + /* + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + "Validate Inputs" section + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + withName: INFILE_HANDLING_UNIX { publishDir = [ [ - path: { "${params.outdir}/Annotation/Prokka" }, + path: { "${params.outdir}/Input/Checksums" }, mode: params.publish_dir_mode, - pattern: "*.gbk" + pattern: "*.SHA512-checksums.tsv" ], [ - path: params.qc_filecheck_log_dir, + path: params.process_log_dir, mode: params.publish_dir_mode, - pattern: "*_File.tsv" + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + withName: CALC_STATS_INPUT_FQ_SEQKIT { + publishDir = [ + [ + path: { "${params.outdir}/Input/SeqKit" }, + mode: params.publish_dir_mode, + pattern: "*.Input_Reads.metrics_summary.tsv" ], [ path: params.process_log_dir, @@ -56,17 +73,37 @@ process { ] } - withName: ASSEMBLE_CONTIGS_SKESA { + /* + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + "Human Removal" section + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + + withName: 'REMOVE_HOST_HOSTILE' { publishDir = [ [ - path: { "${params.outdir}/Assembly/${meta.assembler}/${meta.id}/" }, + path: { "${params.outdir}/Host_Remove/Hostile" }, mode: params.publish_dir_mode, - pattern: "*_contigs.fasta" + pattern: "*.Hostile_Removal.tsv" + ], + [ + path: { "${params.outdir}/Host_Remove/Hostile" }, + mode: params.publish_dir_mode, + pattern: "hostile/*.clean_{1,2}.{fq,fastq}{,.gz}", + saveAs: { file -> + def newName = file.toString() // Ensure it's treated as a string
.replaceFirst(/^.*\//, '') // Remove the directory path ("hostile/") + return "${newName}" } ], [ path: params.qc_filecheck_log_dir, mode: params.publish_dir_mode, - pattern: "*_File.tsv" + pattern: "*.Hostile-removed_FastQ_File.tsv" + ], + [ + path: { "${params.outdir}/Host_Remove/Hostile" }, + mode: params.publish_dir_mode, + pattern: "*.SHA512-checksums.tsv" ], [ path: params.process_log_dir, @@ -77,17 +114,43 @@ process { ] } - withName: 'ASSEMBLY:ASSEMBLE_CONTIGS:ASSEMBLE_CONTIGS_SPADES' { + withName: 'CALC_STATS_HOSTILE_REMOVED_FQ_SEQKIT' { publishDir = [ [ - path: { "${params.outdir}/Assembly/${meta.assembler}/${meta.id}/" }, + path: { "${params.outdir}/Host_Remove/Hostile" }, mode: params.publish_dir_mode, - pattern: "*.{log,gfa,gz,fasta}" + pattern: "*.Hostile_Removed_Reads.metrics_summary.tsv" + ], + [ + path: params.process_log_dir, + mode: params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + withName: 'REMOVE_HOST_SRA_HUMAN_SCRUBBER' { + publishDir = [ + [ + path: { "${params.outdir}/Host_Remove/SRA_Human_Scrubber" }, + mode: params.publish_dir_mode, + pattern: "*.SRA_Human_Scrubber_Removal.tsv" + ], + [ + path: { "${params.outdir}/Host_Remove/SRA_Human_Scrubber" }, + mode: params.publish_dir_mode, + pattern: "*_scrubbed.fastq.gz" + ], + [ + path: { "${params.outdir}/Host_Remove/SRA_Human_Scrubber" }, + mode: params.publish_dir_mode, + pattern: "*.stderr.txt" ], [ path: params.qc_filecheck_log_dir, mode: params.publish_dir_mode, - pattern: "*_File.tsv" + pattern: "*.SRA_Human_Scrubber_FastQ_File.tsv" ], [ path: params.process_log_dir, @@ -98,18 +161,45 @@ process { ] } - withName: BEST_16S_BLASTN_BITSCORE_TAXON_PYTHON { + withName: 'CALC_STATS_SCRUB_REMOVED_FQ_SEQKIT' { publishDir = [ [ - path: { "${params.outdir}/SSU/BLAST" }, + path: { "${params.outdir}/Host_Remove/SRA_Human_Scrubber" }, mode: params.publish_dir_mode, - pattern: "*.blast.tsv.gz" + pattern: "*.SRA_Scrubbed_Reads.metrics_summary.tsv" + ], + [ + path: params.process_log_dir, + mode: params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + withName: 'REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR' { + publishDir = [ + [ + + path: { "${params.outdir}/Host_Remove/BBTools_Repair" }, + mode: params.publish_dir_mode, + pattern: "*.BBTools_Repair_Removal.tsv" + ], + [ + path: { "${params.outdir}/Host_Remove/BBTools_Repair" }, + mode: params.publish_dir_mode, + pattern: "*repaired*.fastq.gz" ], [ path: params.qc_filecheck_log_dir, mode: params.publish_dir_mode, pattern: "*_File.tsv" ], + [ + path: { "${params.outdir}/Host_Remove/BBTools_Repair" }, + mode: params.publish_dir_mode, + pattern: "*.SHA512-checksums.tsv" + ], [ path: params.process_log_dir, mode: params.publish_dir_mode, @@ -119,17 +209,34 @@ process { ] } - withName: CALCULATE_COVERAGE_UNIX { + withName: 'CALC_STATS_REPAIR_REMOVED_FQ_SEQKIT' { publishDir = [ [ - path: { "${params.outdir}/Assembly/QA/${meta.id}/"}, + path: { "${params.outdir}/Host_Remove/BBTools_Repair" }, mode: params.publish_dir_mode, - pattern: "*GenomeCoverage.tsv" + pattern: "*.BBTools_Repaired_Reads.metrics_summary.tsv" ], [ - path: params.qc_filecheck_log_dir, + path: params.process_log_dir, mode: params.publish_dir_mode, - pattern: "*_File.tsv" + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + /* + 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + "Downsample" section + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + + withName: ESTIMATE_GENOME_SIZE_KMC { + publishDir = [ + [ + path: { "${params.outdir}/Downsample/KMC" }, + mode: params.publish_dir_mode, + pattern: "*.{stdout.log,genome_size.txt}" ], [ path: params.process_log_dir, @@ -140,17 +247,161 @@ process { ] } - withName: CLASSIFY_16S_RDP { + withName: COUNT_TOTAL_BP_INPUT_READS_SEQKIT { publishDir = [ [ - path: { "${params.outdir}/SSU/RDP" }, + path: { "${params.outdir}/Downsample/${params.subsample_tool}/counts" }, mode: params.publish_dir_mode, - pattern: "*.RDP.tsv" + pattern: "*.input_total_bp.txt" + ], + [ + path: params.process_log_dir, + mode: params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + withName: COUNT_TOTAL_BP_INPUT_READS_SEQTK { + publishDir = [ + [ + path: { "${params.outdir}/Downsample/${params.subsample_tool}/counts" }, + mode: params.publish_dir_mode, + pattern: "*.input_total_bp.txt" + ], + [ + path: params.process_log_dir, + mode: params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + withName: ESTIMATE_ORIGINAL_INPUT_DEPTH_UNIX { + publishDir = [ + [ + path: { "${params.outdir}/Downsample/Unix" }, + mode: params.publish_dir_mode, + pattern: "*.{initial_depth.txt,fraction_of_reads_to_use.txt}" + ], + [ + path: params.process_log_dir, + mode: params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + withName: SUBSAMPLE_READS_TO_DEPTH_SEQKIT { + publishDir = [ + [ + path: { "${params.outdir}/Downsample/${params.subsample_tool}" }, + mode: params.publish_dir_mode, + pattern: "*_R{1,2}.subsampled.{fq,fastq}.gz" + ], + [ + // No direct built-in functionality within the publishDir directive to + // check file sizes or content before saving, so checksums file will + // be stored as existing but empty outfile. + path: { "${params.outdir}/Downsample/${params.subsample_tool}" }, + mode: params.publish_dir_mode, + pattern: "*.SHA512-checksums.tsv" + ], + [ + path: params.process_log_dir, + mode: params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + withName: SUBSAMPLE_READS_TO_DEPTH_SEQTK { + publishDir = [ + [ + path: { "${params.outdir}/Downsample/${params.subsample_tool}" }, + mode: params.publish_dir_mode, + pattern: "*_R{1,2}.subsampled.{fq,fastq}.gz" + ], + [ + // No direct built-in functionality within the publishDir directive to + // check file sizes or content before saving, so checksums file will + // be stored as existing but empty outfile. 
+ path: { "${params.outdir}/Downsample/${params.subsample_tool}" }, + mode: params.publish_dir_mode, + pattern: "*.SHA512-checksums.tsv" + ], + [ + path: params.process_log_dir, + mode: params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + withName: CALC_STATS_DOWNSAMPLE_FQ_SEQKIT { + publishDir = [ + [ + path: { "${params.outdir}/Downsample/${params.subsample_tool}" }, + mode: params.publish_dir_mode, + pattern: "*.Downsampled_Reads.metrics_summary.tsv" + ], + [ + path: params.process_log_dir, + mode: params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + withName: CALC_STATS_DOWNSAMPLE_FQ_SEQTK { + publishDir = [ + [ + path: { "${params.outdir}/Downsample/${params.subsample_tool}" }, + mode: params.publish_dir_mode, + pattern: "*.Downsampled_Reads.metrics_summary.tsv" + ], + [ + path: params.process_log_dir, + mode: params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + /* + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + "Clean Reads" section + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + + withName: REMOVE_PHIX_BBDUK { + publishDir = [ + [ + path: { "${params.outdir}/Clean_Reads/BBDuk" }, + mode: params.publish_dir_mode, + pattern: "*.PhiX.tsv" + ], + [ + path: { "${params.outdir}/Clean_Reads/BBDuk" }, + mode: params.publish_dir_mode, + pattern: "*_noPhiX-R{1,2}.fsq" ], [ path: params.qc_filecheck_log_dir, mode: params.publish_dir_mode, - pattern: "*_File.tsv" + pattern: "*.PhiX_Genome_File.tsv" + ], + [ + path: { "${params.outdir}/Clean_Reads/BBDuk" }, + mode: params.publish_dir_mode, + pattern: "*.SHA512-checksums.tsv" ], [ path: params.process_log_dir, @@ -161,17 +412,43 @@ process { ] } - withName: EXTRACT_16S_BARRNAP { + withName: CALC_STATS_NOPHIX_FQ_SEQKIT { publishDir = [ [ - path: { "${params.outdir}/SSU" }, + path: { "${params.outdir}/Clean_Reads/BBDuk" }, mode: params.publish_dir_mode, - pattern: "16S.*.fa" + pattern: "*.PhiX_Removed_Reads.metrics_summary.tsv" + ], + [ + path: params.process_log_dir, + mode: params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + withName: TRIM_READS_FASTP { + publishDir = [ + [ + path: { "${params.outdir}/Clean_Reads/${params.trim_reads_tool}" }, + mode: params.publish_dir_mode, + pattern: "*.{F,f}astp.*" + ], + [ + path: { "${params.outdir}/Clean_Reads/${params.trim_reads_tool}" }, + mode: params.publish_dir_mode, + pattern: "*{paired,single}.fq" ], [ path: params.qc_filecheck_log_dir, mode: params.publish_dir_mode, - pattern: "*_File.tsv" + pattern: "*.Adapters_FastA_File.tsv" + ], + [ + path: { "${params.outdir}/Clean_Reads/${params.trim_reads_tool}" }, + mode: params.publish_dir_mode, + pattern: "*.SHA512-checksums.tsv" ], [ path: params.process_log_dir, @@ -182,17 +459,43 @@ process { ] } - withName: MAP_CONTIGS_BWA { + withName: TRIM_READS_TRIMMOMATIC { publishDir = [ [ - path: { "${params.outdir}/Assembly" }, + path: { "${params.outdir}/Clean_Reads/${params.trim_reads_tool}" }, mode: params.publish_dir_mode, - pattern: "*.fna" + pattern: "*.Trimmomatic.tsv" + ], + [ + path: { "${params.outdir}/Clean_Reads/${params.trim_reads_tool}" }, + mode: params.publish_dir_mode, + pattern: "*{paired,single}.fq" ], [ path: 
params.qc_filecheck_log_dir, mode: params.publish_dir_mode, - pattern: "*_File.tsv" + pattern: "*.Adapters_FastA_File.tsv" + ], + [ + path: { "${params.outdir}/Clean_Reads/${params.trim_reads_tool}" }, + mode: params.publish_dir_mode, + pattern: "*.SHA512-checksums.tsv" + ], + [ + path: params.process_log_dir, + mode: params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + withName: CALC_STATS_TRIM_FQ_SEQKIT { + publishDir = [ + [ + path: { "${params.outdir}/Clean_Reads/${params.trim_reads_tool}" }, + mode: params.publish_dir_mode, + pattern: "*.Adapter_QC_Trim_Reads.metrics_summary.tsv" ], [ path: params.process_log_dir, @@ -206,12 +509,12 @@ process { withName: OVERLAP_PAIRED_READS_FLASH { publishDir = [ [ - path: { "${params.outdir}/CleanedReads/FLASH" }, + path: { "${params.outdir}/Clean_Reads/FLASh" }, mode: params.publish_dir_mode, - pattern: "*.FLASH.tsv" + pattern: "*.Overlap.tsv" ], [ - path: { "${params.outdir}/CleanedReads" }, + path: { "${params.outdir}/Clean_Reads" }, mode: params.publish_dir_mode, pattern: "*.fq.gz" ], @@ -221,6 +524,10 @@ process { pattern: "*_File.tsv" ], [ + path: { "${params.outdir}/Clean_Reads/FLASh" }, + mode: params.publish_dir_mode, + pattern: "*.SHA512-checksums.tsv" + ], [ path: params.process_log_dir, mode: params.publish_dir_mode, pattern: ".command.{out,err}", @@ -229,17 +536,35 @@ process { ] } - withName: POLISH_ASSEMBLY_BWA_PILON { + withName: CALC_STATS_CLEANEDREADS_FQ_SEQKIT { + // aka overlapped (after FLAsh) publishDir = [ [ - path: { "${params.outdir}/Assembly" }, + path: { "${params.outdir}/Clean_Reads/FLASh" }, mode: params.publish_dir_mode, - pattern: "*.fna" + pattern: "*.Clean_Reads.metrics_summary.tsv" ], [ - path: { "${params.outdir}/Assembly/${meta.assembler}/${meta.id}/" }, + path: params.process_log_dir, mode: params.publish_dir_mode, - pattern: "*.{InDels,SNPs}-corrected.cnt.tsv" + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + /* + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + "Genome Assembly" section + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + + withName: 'ASSEMBLY:ASSEMBLE_CONTIGS:ASSEMBLE_CONTIGS_SKESA' { + publishDir = [ + [ + path: { "${params.outdir}/Assembly/${meta.assembler}/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*_contigs.fasta" ], [ path: params.qc_filecheck_log_dir, @@ -255,12 +580,12 @@ process { ] } - withName: QA_ASSEMBLY_QUAST { + withName: 'ASSEMBLY:ASSEMBLE_CONTIGS:ASSEMBLE_CONTIGS_SPADES' { publishDir = [ [ - path: { "${params.outdir}/Assembly/QA/${meta.id}/" }, + path: { "${params.outdir}/Assembly/${meta.assembler}/${meta.id}" }, mode: params.publish_dir_mode, - pattern: "*{QuastSummary,CleanedReads-Bases}.tsv" + pattern: "*.{log,gfa,gz,fasta}" ], [ path: params.qc_filecheck_log_dir, @@ -276,10 +601,79 @@ process { ] } + withName: FILTER_CONTIGS_BIOPYTHON { + publishDir = [ + [ + path: { "${params.outdir}/Assembly/Discard_Contigs/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.filter-contigs-stats.txt" + ], + [ + path: { "${params.outdir}/Assembly/Discard_Contigs/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.discarded-contigs.fa.gz" + ], + [ + path: params.process_log_dir, + mode: params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + withName: 
POLISH_ASSEMBLY_BWA_PILON { + publishDir = [ + [ + path: { "${params.outdir}/Assembly" }, + mode: params.publish_dir_mode, + pattern: "*.fna" + ], + [ + path: { "${params.outdir}/Assembly/Polish_Contigs/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.{InDels,SNPs}-corrected.cnt.tsv" + ], + [ + path: params.qc_filecheck_log_dir, + mode: params.publish_dir_mode, + pattern: "*_File.tsv" + ], + [ + path: params.process_log_dir, + mode: params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + /* + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + "Taxonomic Classification of Reads" section + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + + withName: KRAKEN1_DB_PREPARATION_UNIX { + publishDir = [ + [ + path: { "${params.outdir}/Taxonomy/Kraken" }, + mode: params.publish_dir_mode, + pattern: "*.Kraken_Database.SHA512-checksums.tsv" + ], + [ + path: params.process_log_dir, + mode: params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + withName: 'ASSEMBLY:READ_CLASSIFY_KRAKEN_ONE' { publishDir = [ [ - path: { "${params.outdir}/Taxonomy/kraken/${meta.id}/" }, + path: { "${params.outdir}/Taxonomy/Kraken/${meta.id}" }, mode: params.publish_dir_mode, pattern: "*.{tsv.gz,tsv}" ], @@ -292,10 +686,26 @@ process { ] } - withName: READ_CLASSIFY_KRAKEN_TWO { + withName: KRAKEN2_DB_PREPARATION_UNIX { publishDir = [ [ - path: { "${params.outdir}/Taxonomy/kraken2/${meta.id}/" }, + path: { "${params.outdir}/Taxonomy/Kraken2" }, + mode: params.publish_dir_mode, + pattern: "*.Kraken2_Database.SHA512-checksums.tsv" + ], + [ + path: params.process_log_dir, + mode: params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + withName: 'ASSEMBLY:READ_CLASSIFY_KRAKEN_TWO' { + publishDir = [ + [ + path: { "${params.outdir}/Taxonomy/Kraken2/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.{tsv.gz,tsv}" ], @@ -308,12 +718,18 @@ process { ] } - withName: TRIM_READS_FASTP { + /* + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + "Depth of Coverage" section + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + + withName: MAP_CONTIGS_BWA { publishDir = [ [ - path: { "${params.outdir}/CleanedReads/fastp" }, + path: { "${params.outdir}/Assembly" }, mode: params.publish_dir_mode, - pattern: "*.fastp.*" + pattern: "*.fna" ], [ path: params.qc_filecheck_log_dir, @@ -329,12 +745,82 @@ process { ] } - withName: TRIM_READS_TRIMMOMATIC { + withName: EXTRACT_READ_ALIGNMENT_DEPTHS_BEDTOOLS { publishDir = [ [ - path: { "${params.outdir}/CleanedReads/Trimmomatic" }, + path: { "${params.outdir}/Assembly/QA/Depth"}, mode: params.publish_dir_mode, - pattern: "*.Trimmomatic.tsv" + pattern: "*Clean_Reads-AlnStats.tsv" + ], + [ + path: params.process_log_dir, + mode: params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + withName: CALCULATE_COVERAGE_UNIX { + publishDir = [ + [ + path: { "${params.outdir}/Assembly/QA/Depth"}, + mode: params.publish_dir_mode, + pattern: "*GenomeCoverage.tsv" + ], + [ + path: params.qc_filecheck_log_dir, + mode: params.publish_dir_mode, + pattern: "*_File.tsv" + ], + [ + path: params.process_log_dir, + mode:
params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + /* + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + "Genotyping" section + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + + withName: MLST_MLST { + publishDir = [ + [ + path: { "${params.outdir}/Assembly/QA/MLST"}, + mode: params.publish_dir_mode, + pattern: "*.MLST.tsv" + ], + [ + path: { "${params.outdir}/Assembly/QA/MLST"}, + mode: params.publish_dir_mode, + pattern: "*.MLST.novel.fasta" + ], + [ + path: params.process_log_dir, + mode: params.publish_dir_mode, + pattern: ".command.{out,err}", + saveAs: { filename -> "${meta.id}.${task.process}${filename}" } + ] + ] + } + + /* + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + "Genome Annotation" section + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + + withName: ANNOTATE_PROKKA { + publishDir = [ + [ + path: { "${params.outdir}/Annotation/Prokka" }, + mode: params.publish_dir_mode, + pattern: "*.gbk" ], [ path: params.qc_filecheck_log_dir, @@ -350,12 +836,18 @@ process { ] } - withName: REMOVE_HOST_HOSTILE { + /* + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + "16S rRNA Gene Classification" section + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + + withName: EXTRACT_16S_BARRNAP { publishDir = [ [ - path: { "${params.outdir}/CleanedReads/Hostile" }, + path: { "${params.outdir}/SSU" }, mode: params.publish_dir_mode, - pattern: "*.Hostile-Removal.tsv" + pattern: "16S.*.fa" ], [ path: params.qc_filecheck_log_dir, @@ -371,12 +863,12 @@ process { ] } - withName: REMOVE_HOST_SRA_HUMAN_SCRUBBER { + withName: CLASSIFY_16S_RDP { publishDir = [ [ - path: { "${params.outdir}/CleanedReads/SRA-Human-Scrubber" }, + path: { "${params.outdir}/SSU/RDP" }, mode: params.publish_dir_mode, - pattern: "*.SRA-Human-Scrubber-Removal.tsv" + pattern: "*.RDP.tsv" ], [ path: params.qc_filecheck_log_dir, @@ -392,12 +884,12 @@ process { ] } - withName: REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR { + withName: BEST_16S_BLASTN_BITSCORE_TAXON_PYTHON { publishDir = [ [ - path: { "${params.outdir}/CleanedReads/SRA-Human-Scrubber" }, + path: { "${params.outdir}/SSU/BLAST" }, mode: params.publish_dir_mode, - pattern: "*.BBTools-Repair-Removal.tsv" + pattern: "*.blast.tsv.gz" ], [ path: params.qc_filecheck_log_dir, @@ -413,12 +905,18 @@ process { ] } - withName: REMOVE_PHIX_BBDUK { + /* + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + "Assembly Assessment" section + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + + withName: ASSESS_ASSEMBLY_CHECKM2 { publishDir = [ [ - path: { "${params.outdir}/CleanedReads/BBDUK" }, + path: { "${params.outdir}/Assembly/QA/CheckM2/${meta.id}" }, mode: params.publish_dir_mode, - pattern: "*.BBDuk.tsv" + pattern: "*.{tsv.gz,log.gz,results.tsv}" ], [ path: params.qc_filecheck_log_dir, @@ -437,9 +935,19 @@ process { withName: CLASSIFY_CONTIGS_CAT { publishDir = [ [ - path: { "${params.outdir}/Assembly/QA/${meta.id}/CAT/" }, + path: { "${params.outdir}/Taxonomy/CAT/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.CAT-Classification.*{tsv,txt}" + ], + [ + path: { "${params.outdir}/Taxonomy/CAT/${meta.id}" }, mode: params.publish_dir_mode, - pattern: "*CAT-Classification*.{tsv,log.gz}" + pattern: 
"*.Contigs.tsv" + ], + [ + path: { "${params.outdir}/Taxonomy/CAT/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.unique-lineages.ORF.tsv" ], [ path: params.process_log_dir, @@ -450,12 +958,12 @@ process { ] } - withName: ASSESS_ASSEMBLY_CHECKM2 { + withName: QA_ASSEMBLY_QUAST { publishDir = [ [ - path: { "${params.outdir}/Assembly/QA/${meta.id}/CheckM2/" }, + path: { "${params.outdir}/Assembly/QA/QUAST" }, mode: params.publish_dir_mode, - pattern: "*.{tsv.gz,log.gz,report.tsv}" + pattern: "*.tsv" ], [ path: params.qc_filecheck_log_dir, @@ -471,12 +979,18 @@ process { ] } + /* + ================================================================================ + misc section + ================================================================================ + */ + withName: CREATE_EXCEL_RUN_SUMMARY_PYTHON { publishDir = [ [ path: { "${params.outdir}/Summaries" }, mode: params.publish_dir_mode, - pattern: "Summary-Report_*.xlsx" + pattern: "Summary-Report.xlsx" ], [ path: params.process_log_dir, @@ -491,12 +1005,17 @@ process { * nf-core modules */ withName: QA_ASSEMBLY_GTDBTK { + ext.args = "--pplacer_cpus 1 --min_perc_aa 20" publishDir = [ [ - path: { "${params.outdir}/Summaries" }, + path: { "${params.outdir}/Taxonomy/GTDB-Tk/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.summary.tsv" + ], + [ + path: { "${params.outdir}/Taxonomy/GTDB-Tk/${meta.id}" }, mode: params.publish_dir_mode, - pattern: "*.summary.tsv", - saveAs: { filename -> "Summary.GTDB-Tk.tsv" } + pattern: "*.classify.tree.gz" ], [ path: params.process_log_dir, @@ -508,13 +1027,22 @@ process { } withName: QA_ASSEMBLY_BUSCO { - ext.args = "-m genome" + ext.args = "--mode genome" publishDir = [ [ - path: { "${params.outdir}/Summaries" }, + path: { "${params.outdir}/Assembly/QA/BUSCO/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*-busco.batch_summary.txt" + ], + [ + path: { "${params.outdir}/Assembly/QA/BUSCO/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "short_summary.*.{json,txt}" + ], + [ + path: { "${params.outdir}/Assembly/QA/BUSCO/${meta.id}" }, mode: params.publish_dir_mode, - pattern: "*.batch_summary.txt", - saveAs: { filename -> "Summary.BUSCO.tsv" } + pattern: "*-busco/*/run_*/full_table.tsv" ], [ path: params.process_log_dir, diff --git a/conf/params.config b/conf/params.config index dfa31ad4..5b2b7e2d 100644 --- a/conf/params.config +++ b/conf/params.config @@ -34,6 +34,9 @@ params { // Downsampling depth = 100 genome_size = "" + subsample_tool = "seqkit" + seqtk_seed = 947266746 + seqkit_seed = 947266746 // Read trimming trim_reads_tool = "trimmomatic" @@ -47,7 +50,7 @@ params { kraken1_db = "https://ccb.jhu.edu/software/kraken/dl/minikraken_20171019_8GB.tgz" kraken2_db = "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_08gb_20231009.tar.gz" checkm2_db = "https://zenodo.org/records/5571251/files/checkm2_database.tar.gz" - sra_scrubber_db = "https://ftp.ncbi.nlm.nih.gov/sra/dbs/human_filter/human_filter.db.20231218v2" + sra_scrubber_db = "https://ftp.ncbi.nlm.nih.gov/sra/dbs/human_filter/human_filter.db.20240718v2" // SKESA skesa_steps = 11 @@ -63,12 +66,13 @@ params { download_cat_db = false // GTDB-Tk - gtdb_db = null - mash_db = "https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh" + // avoid downloading GTDB due to size and duration + // 102G = (compressed file) gtdbtk_r220_data.tar.gz + // 106G = (uncompressed dir) release220/ + gtdb_db = null // 
"https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" + mash_db = null // "https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh" skip_gtdbtk = false - gtdbtk_min_af = 0.65 gtdbtk_pplacer_cpus = 1 - gtdbtk_min_perc_aa = 10 gtdbtk_pplacer_scratch = "" // BUSCO @@ -134,5 +138,5 @@ params { */ // Ignore "Found unexpected parameters" warning profile_cache_dir = "${projectDir}/assets/.cache" - schema_ignore_params = "filter_blast_bitscore,filter_blast_column,min_filesize_filtered_blastn,min_filesize_blastn_output,min_filesize_blastn_db,min_filesize_extracted_ssu_file,min_filesize_renamed_ssu_file,genbank_search_type,genbank_query_qualifier,genbank_query_feature,genbank_query,min_filesize_annotated_genbank,min_filesize_binary_se_alignment,min_filesize_final_assembly,min_filesize_polished_assembly,min_filesize_binary_pe_alignment,min_filesize_filtered_assembly,filter_contigs_no_sort,filter_contigs_deflines,filter_contigs_keep_low_complexity,filter_contigs_length,filter_contigs_gcskew,filter_contigs_discard_file,filter_contigs_coverage,min_filesize_raw_assembly,min_filesize_non_overlapping_fastq,min_filesize_fastq_adapters_removed,min_filesize_adapters,min_filesize_fastq_phix_removed,min_filesize_phix_genome,min_filesize_fastq_input,workflows,available_workflows,max_retry,bigdata,logpath,qc_filecheck_log_dir,process_log_dir,kraken1_db,kraken2_db,blast_db,polish_corrections,skesa_allow_snps,skesa_min_contig_length,skesa_max_snp_length,skesa_fraction,skesa_steps,skesa_vector_percent,skesa_kmer_length,excel_sheet_name,merge_lanes,sge_high_memory,sge_options,sge_queue_size,sge_queue,sge_penv,singularity_cache,sge_process_time,gtdbtk_pplacer_scratch,gtdbtk_min_perc_aa,gtdbtk_pplacer_cpus,gtdbtk_min_af,depth,genome_size,busco_config,adapter_reference,phix_reference,spades_mode,spades_kmer_sizes,validationSchemaIgnoreParams,validationShowHiddenParams,validation-schema-ignore-params,validation-show-hidden-params,mash_db,min_filesize_sra_human_scrubber_db_file,trimmomatic_keep_both_reads,trimmomatic_palindrome_clip_threshold,trimmomatic_simple_clip_threshold,trimmomatic_required_quality,trimmomatic_trailing_quality,trimmomatic_leading_quality,trimmomatic_min_length,trimmomatic_min_adapter_length,trimmomatic_seed_mismatches,trimmomatic_window_size,trimmomatic_phred,create_excel_outputs,rdp_phylomarker,rdp_output_format,min_filesize_rdp_output,ASSEMBLY:READ_CLASSIFY_KRAKEN_ONE,ASSEMBLY:ASSEMBLE_CONTIGS:ASSEMBLE_CONTIGS_SPADES,ASSEMBLY:READ_CLASSIFY_KRAKEN_ONE,ASSEMBLY:ASSEMBLE_CONTIGS:ASSEMBLE_CONTIGS_SKESA,min_filesize_checkm2_report,cat_db,min_filesize_cat_output,download_cat_db,trim_reads_tool" + schema_ignore_params = 
"filter_blast_bitscore,filter_blast_column,min_filesize_filtered_blastn,min_filesize_blastn_output,min_filesize_blastn_db,min_filesize_extracted_ssu_file,min_filesize_renamed_ssu_file,genbank_search_type,genbank_query_qualifier,genbank_query_feature,genbank_query,min_filesize_annotated_genbank,min_filesize_binary_se_alignment,min_filesize_final_assembly,min_filesize_polished_assembly,min_filesize_binary_pe_alignment,min_filesize_filtered_assembly,filter_contigs_no_sort,filter_contigs_deflines,filter_contigs_keep_low_complexity,filter_contigs_length,filter_contigs_gcskew,filter_contigs_coverage,min_filesize_raw_assembly,min_filesize_non_overlapping_fastq,min_filesize_fastq_adapters_removed,min_filesize_adapters,min_filesize_fastq_phix_removed,min_filesize_phix_genome,min_filesize_fastq_input,workflows,available_workflows,max_retry,bigdata,logpath,qc_filecheck_log_dir,process_log_dir,kraken1_db,kraken2_db,blast_db,polish_corrections,skesa_allow_snps,skesa_min_contig_length,skesa_max_snp_length,skesa_fraction,skesa_steps,skesa_vector_percent,skesa_kmer_length,excel_sheet_name,merge_lanes,sge_high_memory,sge_options,sge_queue_size,sge_queue,sge_penv,singularity_cache,sge_process_time,gtdbtk_pplacer_scratch,gtdbtk_min_perc_aa,gtdbtk_pplacer_cpus,gtdbtk_min_af,depth,genome_size,busco_config,adapter_reference,phix_reference,spades_mode,spades_kmer_sizes,validationSchemaIgnoreParams,validationShowHiddenParams,validation-schema-ignore-params,validation-show-hidden-params,mash_db,min_filesize_sra_human_scrubber_db_file,trimmomatic_keep_both_reads,trimmomatic_palindrome_clip_threshold,trimmomatic_simple_clip_threshold,trimmomatic_required_quality,trimmomatic_trailing_quality,trimmomatic_leading_quality,trimmomatic_min_length,trimmomatic_min_adapter_length,trimmomatic_seed_mismatches,trimmomatic_window_size,trimmomatic_phred,create_excel_outputs,rdp_phylomarker,rdp_output_format,min_filesize_rdp_output,ASSEMBLY:READ_CLASSIFY_KRAKEN_ONE,ASSEMBLY:ASSEMBLE_CONTIGS:ASSEMBLE_CONTIGS_SPADES,ASSEMBLY:READ_CLASSIFY_KRAKEN_ONE,ASSEMBLY:ASSEMBLE_CONTIGS:ASSEMBLE_CONTIGS_SKESA,min_filesize_checkm2_report,cat_db,min_filesize_cat_output,download_cat_db,trim_reads_tool,subsample_tool,seqtk_seed,seqkit_seed" } diff --git a/docs/ADD_MODULE_GUIDE.md b/docs/ADD_MODULE_GUIDE.md index e9d5df1e..272b5e67 100644 --- a/docs/ADD_MODULE_GUIDE.md +++ b/docs/ADD_MODULE_GUIDE.md @@ -17,15 +17,15 @@ If you're not used to this workflow with git, you can start with some [docs from The first step is to fork the [wf-paired-end-illumina-workflow](https://github.com/bacterial-genomics/wf-paired-end-illumina-assembly) repository: 1. On the [GitHub repository](https://github.com/bacterial-genomics/wf-paired-end-illumina-assembly) in the top right corner, click **Fork**. - ![GitHub fork](images/github_fork.PNG) + ![GitHub fork](images/github_fork.PNG) 2. Under "Owner", select the dropdown menu and click and owner for the forked repository. 3. By default, forks are named the same as their upstream repositories. Optionally, to further distinguish your fork, in the "Repository name" field, type a name. 4. Unselect "Copy the `main` branch only. The new module should be added to the `dev` branch of the workflow. 5. Click **Create fork**. 6. Then clone your forked repository: - `git clone https://github.com/YOURUSERNAME/wf-paired-end-illumina-assembly.git` + `git clone https://github.com/YOURUSERNAME/wf-paired-end-illumina-assembly.git` 7. 
Then create a new branch on your forked repository: - `git checkout -b NEWBRANCHNAME` + `git checkout -b NEWBRANCHNAME` Please create a new branch with the appropriate branch name for the module you are trying to add. This will make things easier when reviewing and ultimately merging the branches on the repository. diff --git a/docs/images/wf-paired-end-illumina-assembly_workflow.png b/docs/images/wf-paired-end-illumina-assembly_workflow.png index be9a0a72..8ad0f109 100644 Binary files a/docs/images/wf-paired-end-illumina-assembly_workflow.png and b/docs/images/wf-paired-end-illumina-assembly_workflow.png differ diff --git a/docs/images/wf-paired-end-illumina-assembly_workflow.svg b/docs/images/wf-paired-end-illumina-assembly_workflow.svg index af4a0cd4..fecf3d70 100644 --- a/docs/images/wf-paired-end-illumina-assembly_workflow.svg +++ b/docs/images/wf-paired-end-illumina-assembly_workflow.svg @@ -14,7 +14,7 @@ version="1.1" id="svg8" inkscape:version="1.0.1 (3bc2e813f5, 2020-09-07)" - sodipodi:docname="wf-paired-end-illumina-assembly_workflow.svg" + sodipodi:docname="wf-paired-end-illumina-assembly_workflow.colored-XLSX.svg" inkscape:export-filename="C:\Users\ylo1\Downloads\wf-paired-end-illumina-assembly_workflow.png" inkscape:export-xdpi="300" inkscape:export-ydpi="300"> @@ -275,7 +275,7 @@ xlink:href="#linearGradient5668" id="linearGradient30137" gradientUnits="userSpaceOnUse" - gradientTransform="matrix(0,-0.26133059,0.09559887,0,260.33483,102.8218)" + gradientTransform="matrix(0,-0.26133059,0.09559887,0,258.21816,86.41763)" x1="75" y1="103.70081" x2="105" @@ -581,9 +581,9 @@ borderopacity="1.0" inkscape:pageopacity="0.0" inkscape:pageshadow="2" - inkscape:zoom="1.5762172" - inkscape:cx="406.60886" - inkscape:cy="542.07044" + inkscape:zoom="0.7881086" + inkscape:cx="1087.3102" + inkscape:cy="-37.637114" inkscape:document-units="mm" inkscape:current-layer="layer1" showgrid="false" @@ -670,7 +670,7 @@ height="64.886429" width="67.65844" id="rect6854-0" - style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.993346;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" /> + style="fill:#ffffff;fill-opacity:1;stroke:#0000ff;stroke-width:0.993346;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" /> + id="g4154" + transform="translate(-2.5321496,-16.40417)"> + + Depth ofCoverage + id="g25176-6" + transform="translate(172.30372,-113.03812)"> + style="fill:#24af63;fill-opacity:1;stroke:#000000;stroke-width:0.3;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" + id="rect4770-6-1-8-6-6-3-6-9-5-1-87" + width="27" + height="4.9341259" + x="84.505341" + y="178.49588" + ry="2.467063" /> Depth ofCoverage - - - BEDTools - + style="font-size:3.175px;fill:#000000;fill-opacity:1;stroke-width:0.264583">BEDTools - + + Extract 16S rRNA Genes + id="g955" + transform="translate(195.84385,-14.777841)"> + Extract 16S rRNA - - - Biopython - - - - Biopython + + + + Barrnap - + id="tspan7034" + sodipodi:role="line">Barrnap + style="fill:#ffffff;fill-opacity:1;stroke:#9acd32;stroke-width:0.971032;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" /> @@ -1050,7 +1042,7 @@ 
style="font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;font-size:3.52778px;line-height:1;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro Bold';text-align:center;text-anchor:middle;stroke-width:0.264583" y="122.18478" x="97.750854" - sodipodi:role="line">Filter out low coverage, short,Discard low coverage, short, + style="fill:#ffffff;fill-opacity:1;stroke:#ff0000;stroke-width:1;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" /> - - - gunzip, cat - + id="g33553-3" + transform="translate(10.423379,7.372834)"> + FastQ corruptiontests + style="font-size:3.175px;fill:#000000;fill-opacity:1;stroke-width:0.264583" + y="81.567963" + x="22.327847" + id="tspan6550-3" + sodipodi:role="line">SeqFu + FastQ corruptiontests @@ -1556,315 +1544,228 @@ d="m 116.20042,95.620934 1.4e-4,-33.061026 0.44468,2.176628 c 3.97068,19.435018 6.04591,24.199823 13.12774,30.142376 l 1.81557,1.523501 -1.73392,1.727061 c -3.25784,3.244956 -4.74492,5.078976 -6.42222,7.920376 -2.17445,3.68379 -4.02804,9.05562 -5.96216,17.27886 -0.41856,1.77955 -0.87556,3.71203 -1.0155,4.29441 -0.21882,0.91038 -0.25445,-3.57692 -0.25433,-32.002186 z" style="fill:url(#linearGradient24018-7);fill-opacity:1;stroke:#000000;stroke-width:0.135481;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" /> - - - Downsample - - - - Seqtk - - Count reads - - - - - KMC - - Estimategenome size - - - - - Seqtk - - Subsample reads - - - - - AWK - - Estimateoriginal depth - - + id="rect6528-4-6-1" + style="fill:#ffffff;fill-opacity:1;stroke:#a52a2a;stroke-width:1;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" /> Human Removal + id="tspan6538-5-4-3">Downsample + id="g1728" + transform="translate(0,-2.1166667)"> + id="g33553-3-7-7-9" + transform="translate(124.33544,-17.696223)"> BBTools repair + x="23.815245" + id="tspan6550-3-4-6-7" + sodipodi:role="line">KMC Discard brokenEstimatepairs + id="tspan6644-6-8-3-4">genome size + + + + AWK + + Estimateoriginal depth + + + Human Removal + + + BBTools repair + + Discard brokenpairs @@ -2086,7 +1987,7 @@ height="75.415016" width="42.185223" id="rect6528-4-6-19" - style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" /> + style="fill:#ffffff;fill-opacity:1;stroke:#ffff00;stroke-width:1;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" /> @@ -2422,7 +2323,7 @@ height="42.730301" width="107.43295" id="rect6528" - style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.826493;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" /> + style="fill:#ffffff;fill-opacity:1;stroke:#ff00ff;stroke-width:0.826493;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" /> @@ -2613,7 +2514,7 @@ height="4.9341259" width="31.516283" id="rect6548" - 
style="fill:#24af63;fill-opacity:1;stroke:#000000;stroke-width:0.3;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" /> + style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:0.3;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" /> + style="fill:#ffffff;fill-opacity:1;stroke:#00ff00;stroke-width:0.929041;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" /> + style="fill:#ffffff;fill-opacity:1;stroke:#00ffff;stroke-width:1;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" /> Annotation - - - + + Prokka - - - - Prokka + + + + Bakta - + style="font-size:3.175px;fill:#000000;fill-opacity:1;stroke-width:0.264583">Bakta + style="fill:#ffffff;fill-opacity:1;stroke:#800080;stroke-width:1;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" /> + + + + SeqKit + + Subsample reads + + + Seqtk + + + + Count reads + + + SeqKit + + + + Seqtk + + diff --git a/docs/images/workflow_dag_v1.1.0.png b/docs/images/workflow_dag_v1.1.0.png deleted file mode 100644 index 44663a19..00000000 Binary files a/docs/images/workflow_dag_v1.1.0.png and /dev/null differ diff --git a/docs/output.md b/docs/output.md index a88304cb..30f25132 100644 --- a/docs/output.md +++ b/docs/output.md @@ -16,7 +16,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and is used to - [PhiX read removal](#phix-read-removal) - [Adapter clipping and quality trimming](#adapter-clipping-and-quality-trimming) - [Merge overlapping sister reads](#merge-overlapping-sister-reads) to create singletons -- [Taxonomic classification of trimmed reads](#taxonomic-classification-of-trimmed-reads) +- [Taxonomic classification of cleaned reads](#taxonomic-classification-of-cleaned-reads) - [Kraken](#kraken) - [Kraken2](#kraken2) - [Assembly](#assembly) of trimmed reads @@ -57,7 +57,7 @@ Input files are checked for corruption and must meet a minimum file size to be p ### Host read removal -Host read removal can be skipped or performed by Hostile and/or NCBI SRA-Human-Scrubber by specifying `--host_remove {both,hostile,sra-human-scrubber,skip}`. For SRA-Human-Scrubber, reads are repaired using BBTools to discard broken sister reads. Information about the number of reads discarded and retained are saved in the output directory. +Host read removal can be skipped or performed by Hostile and/or NCBI SRA-Human-Scrubber by specifying `--host_remove {both,hostile,sra-human-scrubber,skip}`. When `both` is invoked, they occur in sequential fashion-- first SRA Scrub tool and repair, then hostile occurs. For SRA-Human-Scrubber, reads are repaired using BBTools to discard broken sister reads. Information about the number of reads discarded and retained are saved in the output directory. Please see the [host removal using Hostile documentation](../modules/local/remove_host_hostile/README.md) and [host removal using SRA-Human-Scrubber documentation](../modules/local/remove_host_sra_human_scrubber/README.md) for more information.
@@ -72,13 +72,15 @@ Please see the [host removal using Hostile documentation](../modules/local/remov
Output files -- `CleanedReads/Hostile/` +- `Clean_Reads/Hostile/` - - `[sample].Summary.Hostile-Removal.[tsv,xlsx]`: Summary of the number of reads discarded and retained from Hostile. + - `[sample].fastq.gz`: Human host-removed FastQ files. + - `[sample].Summary.Hostile.tsv`: Summary of the number of reads discarded and retained from Hostile. + - `[sample].Hostile_FastQ.SHA512-checksums.tsv`: Checksum values for each FastQ output from Hostile. -- `CleanedReads/SRA-Human-Scrubber/` - - `[sample].Summary.BBTools-Repair-Removal.[tsv,xlsx]`: Summary of the number of reads discarded and retained after repairing broken sister reads produced from SRA-Human-Scrubber. - - `[sample].Summary.SRA-Human-Scrubber-Removal.[tsv,xlsx]`: Summary of the number of reads discarded and retained from SRA-Human-Scrubber. +- `Clean_Reads/SRA-Human-Scrubber/` + - `[sample].Summary.BBTools_Repair_Removal.tsv`: Summary of the number of reads discarded and retained after repairing broken sister reads produced from SRA-Human-Scrubber. + - `[sample].Summary.SRA_Human_Scrubber_Removal.tsv`: Summary of the number of reads discarded and retained from SRA-Human-Scrubber.
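As a sketch of enabling both host-removal tools described above (only `--host_remove` and its values come from this documentation; the input/output parameter names are assumptions, so check the pipeline's help text):

```bash
# Hedged sketch: run the workflow with both host-removal tools enabled.
# With "--host_remove both", SRA-Human-Scrubber (plus BBTools repair) runs first,
# followed by Hostile. The --input/--outdir parameter names are assumptions.
nextflow run bacterial-genomics/wf-paired-end-illumina-assembly \
  -profile docker \
  --input my_fastq_directory \
  --outdir results \
  --host_remove both
```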
@@ -100,8 +102,10 @@ PhiX reads are commonly used as a positive control for Illumina sequencing. Duri
Output files -- `CleanedReads/BBDUK/` - - `[sample].Summary.PhiX.[tsv,xlsx]`: Number of reads discarded and retained from BBDuk. +- `Clean_Reads/BBDuk/` + - `[sample].noPhiX_FastQ.SHA512-checksums.tsv`: Checksum values for each PhiX-free FastQ output of BBDuk. + - `[sample].PhiX_Removed_Reads.metrics_summary.tsv`: Metrics on FastQ reads output including minimum, average, and maximum lengths as well as total counts and total length. + - `[sample].PhiX.tsv`: Number of reads discarded and retained from BBDuk.
@@ -128,8 +132,10 @@ Please see the [adapter clipping and quality trimming using Trimmomatic document
Output files -- `CleanedReads/Trimmomatic/` - - `[sample].trimmomatic.[tsv,xlsx]`: Summary of the number of reads discarded and retained from Trimmomatic. +- `Clean_Reads/Trimmomatic/` + - `[sample].Adapter_and_Quality_Trimmed_Reads.metrics_summary.tsv`: Metrics on FastQ reads output including minimum, average, and maximum lengths as well as total counts and total length. + - `[sample].Trim_FastQ.SHA512-checksums.tsv`: Checksum values for each FastQ output from Trimmomatic. + - `[sample].Trimmomatic.tsv`: Summary of the number of reads discarded and retained from Trimmomatic.
@@ -151,8 +157,10 @@ fastp is able to clip adapters, perform quality trimming, and retain broken sist
Output files -- `CleanedReads/fastp/` - - `[sample].fastp.[tsv,xlsx]`: Summary of the number of reads discarded and retained from Trimmomatic. +- `Clean_Reads/fastp/` + - `[sample].Adapter_and_Quality_Trimmed_Reads.metrics_summary.tsv`: Metrics on FastQ reads output including minimum, average, and maximum lengths as well as total counts and total length. + - `[sample].Trim_FastQ.SHA512-checksums.tsv`: Checksum values for each FastQ output from Fastp. + - `[sample].Fastp.tsv`: Summary of the number of reads discarded and retained from Fastp.
@@ -172,68 +180,77 @@ Overlapping content between sister reads that are at least 80% similar are colla
Output files -- `CleanedReads/` +- `Clean_Reads/` - `[sample]_single.fq.gz`: Final cleaned singleton reads. - `[sample]_R[1/2].paired.fq.gz`: Final cleaned paired reads. -- `CleanedReads/FLASH/` - - `[sample].overlap.[tsv,xlsx]`: Number of reads that were overlapped into singleton reads. - - `[sample].clean-reads.[tsv,xlsx]`: Number of non-overlapping reads. +- `Clean_Reads/FLASh/` + - `[sample].Clean_Reads_FastQ.metrics_summary.tsv`: Metrics on FastQ reads output including minimum, average, and maximum lengths as well as total counts and total length. + - `[sample].Clean_Reads_FastQ.SHA512-checksums.tsv`: Checksum values for each FastQ output from FLASh. + - `[sample].Overlap.tsv`: Number of reads that were overlapped into singleton reads. + +> [!NOTE] +> FastQ sequences after overlapping with FLASh are stored in `Clean_Reads/` rather than `Clean_Reads/FLASh` as they are the final outputs for read cleaning.
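For an independent spot-check of those read metrics, something like the following works, assuming `seqkit` is available locally; the sample name is hypothetical and paths are relative to the pipeline's output directory:

```bash
# Spot-check the final cleaned reads written to Clean_Reads/.
# Assumes seqkit is installed; "sample1" is a hypothetical sample name.
seqkit stats -a \
  Clean_Reads/sample1_R1.paired.fq.gz \
  Clean_Reads/sample1_R2.paired.fq.gz \
  Clean_Reads/sample1_single.fq.gz
```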
-## Taxonomic classification of trimmed reads +## Taxonomic classification of cleaned reads -These classifiers perform classifications on a read-by-read basis or through the use of k-mers on the cleaned and trimmed FastQ files. The results that are obtained are heavily dependent on the quality and diversity of the database used. Therefore, the results produced from these classifiers should be used as a quality check to evaluate the possibility of sample contamination. +These classifiers perform classifications on a read-by-read basis or through the use of k-mers on the cleaned FastQ files. The results that are obtained are heavily dependent on the quality and diversity of the database used. Therefore, the results produced from these classifiers should be used as a quality check to evaluate the possibility of sample contamination. > [!WARNING] > Taxonomic classification tools will be skipped if the accompanying database is not specified. ### Kraken -Kraken is a k-mer based classification tool that assigns taxonomic labels using the Lowest Common Ancestor (LCA) algorithm. +Kraken is a k-mer based classification tool that assigns taxonomic labels using the Lowest Common Ancestor (LCA) algorithm. It consumes much more RAM than Kraken2, but Kraken can be valuable for precise exact k-mer matches of target organisms with lots of highly similar near taxonomic neighbors (e.g., occurring within a species complex).
Output files -- `Taxonomy/kraken/[sample]/` - - `[sample].kraken_summary.[tsv,xlsx]`: Summary of the unclassified, top 3 genus and top 3 species classifications from the Kraken report. - - `[sample].kraken_output.[tsv,xlsx].gz`: Taxonomic classification in the Kraken report format. +- `Taxonomy/Kraken/[sample]/` + - `[sample].kraken_summary.tsv`: Summary of the unclassified, top 3 genus and top 3 species classifications from the Kraken report. + - `[sample].kraken_output.tsv.gz`: Full taxonomic k-mer matches, not yet filtered for top hits, in the Kraken report format.
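To eyeball more than the top 3 genera and species summarized above, the full report can be filtered and sorted directly; this sketch assumes the standard six-column Kraken report layout and a hypothetical sample name:

```bash
# Show the strongest species-level ("S" rank code) entries in a Kraken report.
# Assumed column layout: pct, clade_reads, direct_reads, rank, taxid, name.
# "sample1" is a hypothetical sample name.
zcat Taxonomy/Kraken/sample1/sample1.kraken_output.tsv.gz \
  | awk -F'\t' '$4 == "S"' \
  | sort -t$'\t' -k1,1nr \
  | head -n 10
```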
### Kraken2 -Kraken2 is a k-mer based classification tool that assigns taxonomic labels using the Lowest Common Ancestor (LCA) algorithm. +Kraken2 is a minimizer based classification tool that assigns taxonomic labels using the Lowest Common Ancestor (LCA) algorithm.
Output files -- `Taxonomy/kraken2/[sample]/` - - `[sample].kraken2_summary.[tsv,xlsx]`: Summary of the unclassified, top 3 genus and top 3 species classifications from the Kraken report. - - `[sample].kraken2_output.[tsv,xlsx].gz`: Taxonomic classification in the Kraken report format. +- `Taxonomy/Kraken2/[sample]/` + - `[sample].kraken2_summary.tsv`: Summary of the unclassified, top 3 genus and top 3 species classifications from the Kraken2 report. + - `[sample].kraken2_output.tsv.gz`: Full taxonomic minimizer matches, not yet filtered for top hits, in the Kraken2 report format. + +> [!TIP] +> Some pure isolates consistently give near neighbor matches, but you might wonder if an assembly used in database creation was contaminated. If you're curious which specific assemblies were used in your Kraken or Kraken2 database creation, you can view each and every one of them! +> +> Many of the pre-computed databases from Dr. Ben Langmead [here](https://benlangmead.github.io/aws-indexes/k2) have a file called "inspect.txt" which has a line-by-line listing of each assembly used to create the database. If you do not have that, you can create it with `kraken-inspect --db /my/db/path --threads 12 > inspect.txt` or `kraken2-inspect --db /my/db/path --threads 12 > inspect.txt` depending on whether you're using Kraken or Kraken2 (respectively).
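Building on that tip, a quick way to interrogate `inspect.txt` once it exists; the report-style column layout (rank code in column 4) is an assumption, and the taxon shown is only an example:

```bash
# Check whether a suspect near-neighbor taxon is represented in the database
# (case-insensitive name match), then count genus- and species-level entries.
grep -i "Burkholderia" inspect.txt | head
awk -F'\t' '$4 == "G"' inspect.txt | wc -l
awk -F'\t' '$4 == "S"' inspect.txt | wc -l
```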
## Assembly -The cleaned and trimmed reads are used to assemble contigs using SPAdes or SKESA `[Default: SPAdes]`. Contigs that have low compositional complexity are discarded. Please see the [contig filtering documentation](../modules/local/filter_contigs_biopython/README.md) for more information. Contigs from SPAdes require polishing to create the final genome assembly, which is done by using BWA, Pilon, and Samtools. Contigs from SKESA do not require this step. +The cleaned reads are used to assemble contigs using SPAdes or SKESA `[Default: SPAdes]`. Contigs that have low compositional complexity are discarded. Please see the [contig filtering documentation](../modules/local/filter_contigs_biopython/README.md) for more information. Contigs from SPAdes also undergo SNP and InDel correction (polishing) to create the final genome assembly, which is done by using BWA, Pilon, and Samtools. Contigs from SKESA do not require this step. > [!IMPORTANT] > Outputs generated by SPAdes and SKESA cannot be compared even when using the same FastQ inputs. > [!TIP] -> For many input FastQ files, SKESA may be useful in decreasing runtime. For input FastQ files that may be heavily contaminated, SPAdes may help maintain contiguity. +> When you have many input FastQ files and want to decrease overall runtime, SKESA is useful: it still yields high SNP-level accuracy, but at the expense of contiguity (shorter contigs). For tasks where contiguity (longer contigs) is important (e.g., gene neighborhood evaluations, operon detection), SPAdes is the more appropriate choice.
QC Steps - The contigs produced from an assembler software package must meet a minimum file size criteria `[Default: 1M]`. This is to prevent the analysis of a highly incomplete bacterial genome. -- The resulting contig file after low compositional complexity contigs are discarded must meet a minimum file size `[Default: 1M]`. This is to prevent the analysis of a highly incomplete bacterial genome. +- The resulting contig file (after filtering out low coverage, short, and low compositional complexity contigs) must meet a minimum file size `[Default: 1M]`. This is to prevent the analysis of a highly incomplete bacterial genome. -- The cleaned paired-end reads are mapped onto the filtered assembly file in sequential steps (`[Default: 3]`), and the resulting binary paired-end alignment file must meet a minimum file size criteria `[Default: 25M]`. This is to prevent the analysis of an assembly file that has an unusually low read sequence amount. +- The cleaned paired-end reads are mapped onto the filtered assembly file in sequential steps (`[Default: 3]`), and the resulting binary paired-end alignment file must meet a minimum file size criteria `[Default: 6M]`. This is to prevent the analysis of an assembly file that has an unusually low read sequence amount. - The assembly file goes through SNP and InDel corrections in sequential steps (`[Default: 3]`), and the resulting assembly file must meet a minimum file size criteria `[Default: 1M]`. This is to prevent further analysis of an unusually incomplete genome. @@ -241,6 +258,12 @@ The cleaned and trimmed reads are used to assemble contigs using SPAdes or SKESA - If singletons (single-end reads) exist after read cleaning, they are mapped onto the assembly file and the resulting binary single-end alignment file must meet a minimum file size criteria `[Default: 1k]`. This is to ensure that read depth calculations can be performed on the single-end reads. +> [!TIP] +> Discarded contigs from filtering are stored in `Assembly/[assembler]/[sample]/[sample]-[assembler].discarded-contigs.fa.gz`. You can view the reason for each individual contig being discarded by `zcat discarded-contigs.fa.gz | grep '>'` and within the contig name there will be 1 or more reasons listed. For example "Failed=complexityFailed=lengthFailed=gc_content" had 3 independent reasons for being removed, whereas "Failed=length" was simply too short of a contig. + +> [!TIP] +> Contig filtering statistics are stored in `Assembly/[assembler]/[sample]/[sample]-[assembler].filter-contigs-stats.txt`. There you'll find total contig counts and cumulative lengths for input, removed, and saved. Also, for coverage statistics there are minimum, average, maximum, 25% quartile, 50% quartile (median), and 75% quartile coverage values. All of these statistics are meant to help guide alternative contig filtering if you have an unusual assembly that requires non-default parameters. +
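Building on those tips, a quick tally of discard reasons might look like this, using a hypothetical sample name and the SPAdes output path as an example:

```bash
# Tally why contigs were discarded during filtering
# (a single contig may list several "Failed=" reasons).
zcat Assembly/SPAdes/sample1/sample1-SPAdes.discarded-contigs.fa.gz \
  | grep '>' \
  | grep -o 'Failed=[a-z_]*' \
  | sort \
  | uniq -c \
  | sort -rn
```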

@@ -261,13 +284,15 @@ SPAdes is a k-mer based software that forms a genome assembly from read sequence Output files - `Assembly/SPAdes/[sample]/` - - `[sample]-SPAdes.log.gz`: SPAdes log file. + - `[sample]-SPAdes_contigs.fasta`: Assembled contigs in FastA format. - `[sample]-SPAdes_graph.gfa`: Assembly graph in gfa format. - - `[sample]-SPAdes_warnings.log`: Log file that lists warnings when forming a genome assembly. + - `[sample]-SPAdes.discarded-contigs.fa.gz`: Post-assembly contigs that were filtered out (i.e., discarded). + - `[sample]-SPAdes.filter-contigs-stats.txt`: Post-assembly contig filtering statistics. + - `[sample]-SPAdes.InDels-corrected.cnt.txt`: Number of InDels corrected in each round of corrections. + - `[sample]-SPAdes.log.gz`: SPAdes log file. - `[sample]-SPAdes_params.txt.gz`: Command used to perform the SPAdes analysis. - - `[sample]-SPAdes_contigs.fasta`: Assembled contigs in FastA format. + - `[sample]-SPAdes_warnings.log`: Log file that lists warnings when forming a genome assembly. - `[sample]-SPAdes.SNPs-corrected.cnt.txt`: Number of SNPs corrected in each round of corrections. - - `[sample]-SPAdes.InDels-corrected.cnt.txt`: Number of InDels corrected in each round of corrections.
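As a quick convergence check on the polishing rounds (the sample name is hypothetical):

```bash
# Inspect how many SNPs and InDels were corrected in each polishing round;
# counts that fall toward zero suggest the corrections converged.
for f in Assembly/SPAdes/sample1/sample1-SPAdes.{SNPs,InDels}-corrected.cnt.txt; do
  echo "== ${f}"
  cat "${f}"
done
```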
@@ -280,6 +305,8 @@ Strategic K-mer Extension for Scrupulous Assemblies (SKESA) is a software that i - `Assembly/SKESA/[sample]/` - `[sample]-SKESA_contigs.fasta`: Assembled contigs in FastA format. + - `[sample]-SKESA.discarded-contigs.fa.gz`: Post-assembly contigs that were filtered out (i.e., discarded). + - `[sample]-SKESA.filter-contigs-stats.txt`: Post-assembly contig filtering statistics. @@ -293,9 +320,9 @@ QUAST is used to perform quality assessment on the assembly file to report metri Output files - `Assembly/QA/[sample]/` - - `[sample]-[assembler].QuastSummary.[tsv,xlsx]`: Assembly metrics such as N50, cumulative length, longest contig length, and GC composition. - - `[sample]-[assembler].GenomeCoverage.[tsv,xlsx]`: Genome coverage information. - - `[sample]-[assembler].CleanedReads-Bases.[tsv,xlsx]`: Number of cleaned bases. + - `[sample]-[assembler].QuastSummary.tsv`: Assembly metrics such as N50, cumulative length, longest contig length, and GC composition. + - `[sample]-[assembler].GenomeCoverage.tsv`: Genome coverage information. + - `[sample]-[assembler].Clean_Reads-Bases.tsv`: Number of cleaned bases. @@ -307,7 +334,7 @@ The final assembly file is scanned against PubMLST typing schemes to determine t Output files - `Summaries/` - - `Summary.MLST.[tsv,xlsx]`: Summary of the MLST results for all samples. + - `Summary.MLST.tsv`: Summary of the MLST results for all samples. @@ -352,15 +379,15 @@ The final assembly file is annotated to identify and label features using Prokka ### 16S ribosomal RNA (rRNA) classification -The GenBank file is parsed for 16S rRNA gene records. If there are no 16S rRNA gene records, Barrnap is used to predict 16S rRNA genes using the assembly file. BLAST is then used to align these gene records to its database, where the best alignment is filtered out based on bit score. +The GenBank file is parsed for 16S rRNA gene records (with BioPython). If there are no 16S rRNA gene records, Barrnap, with relaxed settings, is used to find partial 16S rRNA genes using the assembly file. BLAST+ (blastn) is then used to align these gene records to its database, where the best alignment is selected based on bit score. The default database is an NCBI-curated set of 16S rRNA genes from species type strains, but best matches are not always perfect. > [!NOTE] > Some assembled genomes do not contain identifiable 16S rRNA sequences and therefore 16S is not able to be classified. If the classification of 16S rRNA sequences is required, the sample must be re-sequenced. > [!IMPORTANT] -> The 16S rRNA classification produced should not be used as a definitive classification as some taxa have 16S sequences that are extremely similar between different species. +> The 16S rRNA classification produced should **not** be used as a definitive classification as some taxa have 16S sequences that are extremely similar between different species. > -> For an exact species match, 100% identity and 100% alignment are needed. +> When the top bitscore match is < 100% identity or < 100% alignment, you should be extra cautious about which species you report.
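To inspect the evidence behind a top hit yourself, the raw BLAST table can be sorted by bitscore; this sketch relies on the outfmt 6 column order used by this pipeline's BLAST step (pident in column 3, bitscore in 12, qcovhsp in 13, ssciname in 14) and a hypothetical sample/assembler prefix:

```bash
# Review the highest-bitscore 16S hits with their percent identity and
# query coverage; "sample1-SPAdes" is a hypothetical file prefix.
zcat SSU/BLAST/sample1-SPAdes.blast.tsv.gz \
  | sort -t$'\t' -k12,12nr \
  | awk -F'\t' '{printf "%s%%id\t%s%%cov\tbitscore=%s\t%s\n", $3, $13, $12, $14}' \
  | head -n 5
```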
QC Steps @@ -382,11 +409,16 @@ The GenBank file is parsed for 16S rRNA gene records. If there are no 16S rRNA g - `SSU/` - - `16S-top-species.[tsv,xlsx]`: Summary of the best BLAST alignment for each sample. - `16S.[sample]-[assembler].fa`: 16S rRNA gene sequence of the best BLAST alignment in FastA format. + - `[assembler].16S_top_genus_RDP.tsv`: Summary of the best RDP match for each sample. + - `[assembler].16S_top_species_BLAST.tsv`: Summary of the best BLAST alignment for each sample. - `SSU/BLAST/` - - `[sample]-[assembler].blast.[tsv,xlsx].gz`: BLAST output 16S rRNA gene records in tab-separated value (TSV) format. + + - `[sample]-[assembler].blast.tsv.gz`: Full, not yet bitscore sorted, BLASTn output for each 16S rRNA gene record in tab-separated value (TSV) format using the BLAST outfmt 6 standard with additional taxonomy fields + +- `SSU/RDP/` + - `[sample]-[assembler].rdp.tsv`: RDP classification output for each 16S rRNA gene record in tab-separated value (TSV) format
@@ -398,7 +430,7 @@ GTDB-Tk is a taxonomic classification tool that uses the Genome Database Taxonom Output files - `Summaries/` - - `Summary.GTDB-Tk.[tsv,xlsx]`: Summary of the GTDB-Tk taxonomic classification for each sample. + - `Summary.GTDB-Tk.tsv`: Summary of the GTDB-Tk taxonomic classification for each sample. @@ -407,26 +439,37 @@ GTDB-Tk is a taxonomic classification tool that uses the Genome Database Taxonom Concatenation of output metrics for all samples. > [!NOTE] -> The Summary-Report excel file is only created when the parameter `--create_excel_outputs` is used. +> The first column for **all** "Summary" files contains the sample name. > -> The Summary-Report excel file has the date and time appended to the filename using the following shorthand notation: year (yyyy), month (MM), day (dd), hour (HH), minute (mm), second (ss). +> Each Summary file is sorted based on sample names for easy cross-comparisons.
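Because every Summary file shares that sample-name first column and is pre-sorted, the tables can be combined column-wise; a sketch using two of the tables named below:

```bash
# Join two Summary tables on the shared first column (sample name).
# File names follow the list below; adjust to whichever tables you need.
join -t $'\t' \
  <(tail -n +2 Summaries/Summary.Assembly_Metrics.tsv | sort -t $'\t' -k1,1) \
  <(tail -n +2 Summaries/Summary.Assembly_Depth.tsv | sort -t $'\t' -k1,1) \
  > Assembly_Metrics_plus_Depth.tsv
```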
Output files - `Summaries/` - - `Summary.16S.[tsv,xlsx]`: Summary of the best BLAST alignment for each sample. - - `Summary.RDP.[tsv,xlsx]`: Summary of RDP Classifier on predicted 16S ribosomal RNA genes. - - `Summary.MLST.[tsv,xlsx]`: Summary of the MLST results for all samples. - - `Summary.Assemblies.[tsv,xlsx]`: Assembly metrics such as N50, cumulative length, longest contig length, and GC composition for each sample. - - `Summary.PhiX_Removal.[tsv,xlsx]`: Number of reads discarded and retained for each sample. - - `Summary.QC_File_Checks.[tsv,xlsx]`: Summary of all QC file checks detailing if a sample passes or fails each process. - - `Summary.GenomeCoverage.[tsv,xlsx]`: Summary of the overall genome coverage for each sample. - - `Summary.CleanedReads-Bases.[tsv,xlsx]`: Summary of the number of cleaned bases for each sample. - - `Summary.Clean_and_Overlapping_Reads.[tsv,xlsx]`: Summary of the merging of overlapping sister reads. - - `Summary.CleanedReads-AlignmentStats.[tsv,xlsx]`: Summary of the genome size and coverages of the paired-end and single-end reads for each sample. - - `Summary-[fastp,trimmomatic].Adapter_and_QC_Trimming.[tsv,xlsx]`: Summary of adapter clipping and quality trimming for each sample. - - `Summary-Report_yyyy-MM-dd_HH-mm-ss.xlsx`: Excel workbook where each file in the Summaries directory is added to a separate worksheet within the workbook. + - `Summary.16S_Genus_RDP.tsv`: RDP Classifier best matches from predicted 16S ribosomal RNA genes (to genus-level) + - `Summary.16S_Species_BLAST.tsv`: Top bitscore BLASTn alignments for each 16S rRNA gene (to species-level) + - `Summary.Adapter_QC_Trim_Reads.Metrics.tsv`: Sequence metrics after adapter clipping and quality trimming + - `Summary.Annotation_Checksums.tsv`: Checksum (hash) values for annotated GenBank output files + - `Summary.Assembly_Checksums.tsv`: Checksum (hash) values for final output assembly FastA output files + - `Summary.Assembly_Depth.tsv`: Assembly depth of coverage mean and standard deviation values (units in "x") + - `Summary.Assembly_Metrics.tsv`: Assembly metrics (e.g., N50, cumulative length, longest contig length, and GC composition) + - `Summary.CheckM2.tsv`: Estimation percentages on completeness and contamination of each genome assembly + - `Summary.Clean_and_Overlapped.tsv`: Counts and percentages from overlapping sister reads + - `Summary.Clean_Reads_Aligned.tsv`: Paired, singleton, and total cleaned reads mapped onto the assembly statistics (e.g., mean and standard deviation depths, basepairs mapped, assembly size) + - `Summary.Clean_Reads_Checksums.tsv`: Checksum (hash) values for final output cleaned reads FastQ output files + - `Summary.Clean_Reads.Metrics.tsv`: Sequence metrics after all read cleaning steps + - `Summary.Downsampled_Reads.Metrics.tsv`: Sequence metrics after subsampling the read set (if performed) + - `Summary-[Fastp,Trimmomatic].Adapter_and_QC_Trim.tsv`: Number of discarded reads and singleton reads that remain after adapter clipping and quality trimming + - `Summary.Input_Checksums.tsv`: Checksum (hash) values for FastQ input files + - `Summary.Input_Reads.Metrics.tsv`: Sequence metrics on the initial user-provided input sequences + - `Summary.Kraken2.tsv`: Counts and proportions of unclassified, top 3 genera, top 3 species with k-mer matches with the cleaned reads + - `Summary.Kraken.tsv`: Counts and proportions of unclassified, top 3 genera, top 3 species with k-mer matches with the cleaned reads + - `Summary.MLST.tsv`: MLST genotyping results + - 
`Summary.PhiX_Removal.tsv`: Number of reads discarded and retained after PhiX k-mer match removal + - `Summary.PhiX_Removed_Reads.Metrics.tsv`: Sequence metrics after PhiX was removed from the reads + - `Summary.QC_File_Checks.tsv`: All QC file checks detailing if a sample passes or fails after each process + - `Summary-Report.xlsx`: Microsoft Excel workbook where each file in the Summaries directory is added to a separate worksheet within the workbook
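A rough way to cross-check the recorded checksums against the files on disk; whether the recorded hashes were computed on the compressed or the decompressed stream is an assumption here, so adjust (for example, pipe through `zcat`) if the plain file hashes do not match:

```bash
# Recompute SHA-512 hashes for the final cleaned reads and compare them by eye
# against the recorded checksum table.
sha512sum Clean_Reads/*.fq.gz
column -t -s $'\t' Summaries/Summary.Clean_Reads_Checksums.tsv | head
```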
@@ -441,7 +484,7 @@ Information about the pipeline execution, output logs, error logs, and QC file c Pipeline information - `pipeline_info/` - - `software_versions.yml`: Summary of the software packages used in each process and their version information. + - `software_versions.yml`: All software packages used and their version information - `nextflow_log.[job_id].txt`: Execution log file produced by Nextflow. - `ASM_[num_of_samples].o[job_id]`: Output log file produced by the job scheduler. - `ASM_[num_of_samples].e[job_id]`: Error log file produced by the job scheduler. @@ -466,25 +509,25 @@ Information about the pipeline execution, output logs, error logs, and QC file c QC file checks - `pipeline_info/qc_file_checks/` - - `[sample].Raw_Initial_FastQ_Files.[tsv,xlsx]`: Details if both reads (R1,R2) meet the minimum file size criteria for the pipeline `[Default: 25M]`. - - `[sample].Summary.Hostile-Removal.[tsv,xlsx]`: Details if both reads (R1,R2) meet the minimum file size criteria for after host removal using Hostile `[Default: 25M]`. - - `[sample].SRA_Human_Scrubber_FastQ_File.[tsv,xlsx]`: Details if both reads (R1,R2) meet the minimum file size criteria for after host removal using SRA-Human-Scrubber `[Default: 25M]`. - - `[sample].BBTools-Repair-removed_FastQ_Files.[tsv,xlsx]`: Details if both reads (R1,R2) meet the minimum file size criteria after repairing broken sister reads from SRA-Human-Scrubber `[Default: 25M]`. - - `[sample].PhiX_Genome.[tsv,xlsx]`: Details if the input PhiX reference genome meets the minimum file size criteria `[Default: 5k]`. - - `[sample].PhiX-removed_FastQ_Files.[tsv,xlsx]`: Details if both reads (R1,R2) meet the minimum file size criteria after PhiX reads have been removed `[Default: 25M]`. - - `[sample].Adapters_FastA.[tsv,xlsx]`: Details if the input adapters reference file meets the minimum file size criteria `[Default: 10k]`. - - `[sample].Adapter-removed_FastQ_Files.[tsv,xlsx]`: Details if both reads (R1,R2) meet the minimum file size criteria after adapter sequences have been removed `[Default: 25M]`. - - `[sample].Non-overlapping_FastQ_Files.[tsv,xlsx]`: Details if both reads (R1,R2) meet the minimum file size criteria after removing overlapping reads `[Default: 20M]`. - - `[sample].Raw_Assembly_File.[tsv,xlsx]`: Details if the genome assembly file produced by an assembler software package meets the minimum file size criteria `[Default: 1M]`. - - `[sample].Filtered_Assembly_File.[tsv,xlsx]`: Details if the genome assembly file meets the minimum file size criteria after low compositional complexity contigs are discarded `[Default: 1M]`. - - `[sample].Binary_PE_Alignment_Map_File.[tsv,xlsx]`: Details if the binary paired-end (PE) alignment file meets the minimum file size criteria after the cleaned paired-end reads are mapped onto the filtered genome assembly `[Default: 25M]`. - - `[sample].Polished_Assembly_File.[tsv,xlsx]`: Details if the genome assembly file meets the minimum file size criteria after SNP and InDel corrections are performed `[Default: 1M]`. - - `[sample].Final_Corrected_Assembly_FastA_File.[tsv,xlsx]`: Details if the final error-corrected genome assembly file meets the minimum file size criteria `[Default: 1M]`. - - `[sample].Binary_SE_Alignment_Map_File.[tsv,xlsx]`: Details if the single-end (SE) alignment file meets the minimum file size criteria after the cleaned singleton reads are mapped onto the final genome assembly file `[Default: 1k]`. 
- - `[sample].Annotated_GenBank_File.[tsv,xlsx]`: Details if the annotated GenBank file meets the minimum file size criteria `[Default: 3M]`. - - `[sample].SSU_Extracted_File.[tsv,xlsx]`: Details if the extracted 16S rRNA gene sequence file meets the minimum file size criteria `[Default: 500b]`. - - `[sample]-[assembler].SSU_Renamed_File.[tsv,xlsx]`: Details if the 16S rRNA gene sequence file meets the minimum file size criteria after sample identifiers are added to each sequence `[Default: 500b]`. - - `[sample].16S_BLASTn_Output_File.[tsv,xlsx]`: Details if the BLASTn output file meets the minimum file size criteria `[Default: 10b]`. - - `[sample].Filtered_16S_BLASTn_File.[tsv,xlsx]`: Details if the best BLASTn alignment sequence meets the minimum file size criteria `[Default: 10b]`. + - `[sample].Raw_Initial_FastQ_Files.tsv`: Details if both reads (R1,R2) meet the minimum file size criteria for the pipeline `[Default: 25M]`. + - `[sample].Summary.Hostile.tsv`: Details if both reads (R1,R2) meet the minimum file size criteria for after host removal using Hostile `[Default: 25M]`. + - `[sample].SRA_Human_Scrubber_FastQ_File.tsv`: Details if both reads (R1,R2) meet the minimum file size criteria for after host removal using SRA-Human-Scrubber `[Default: 25M]`. + - `[sample].BBTools-Repair-removed_FastQ_Files.tsv`: Details if both reads (R1,R2) meet the minimum file size criteria after repairing broken sister reads from SRA-Human-Scrubber `[Default: 25M]`. + - `[sample].PhiX_Genome.tsv`: Details if the input PhiX reference genome meets the minimum file size criteria `[Default: 5k]`. + - `[sample].PhiX-removed_FastQ_Files.tsv`: Details if both reads (R1,R2) meet the minimum file size criteria after PhiX reads have been removed `[Default: 25M]`. + - `[sample].Adapters_FastA.tsv`: Details if the input adapters reference file meets the minimum file size criteria `[Default: 10k]`. + - `[sample].Adapter-removed_FastQ_Files.tsv`: Details if both reads (R1,R2) meet the minimum file size criteria after adapter sequences have been removed `[Default: 25M]`. + - `[sample].Non-overlapping_FastQ_Files.tsv`: Details if both reads (R1,R2) meet the minimum file size criteria after removing overlapping reads `[Default: 20M]`. + - `[sample].Raw_Assembly_File.tsv`: Details if the genome assembly file produced by an assembler software package meets the minimum file size criteria `[Default: 1M]`. + - `[sample].Filtered_Assembly_File.tsv`: Details if the genome assembly file meets the minimum file size criteria after low compositional complexity contigs are discarded `[Default: 1M]`. + - `[sample].Binary_PE_Alignment_Map_File.tsv`: Details if the binary paired-end (PE) alignment file meets the minimum file size criteria after the cleaned paired-end reads are mapped onto the filtered genome assembly `[Default: 6M]`. + - `[sample].Polished_Assembly_File.tsv`: Details if the genome assembly file meets the minimum file size criteria after SNP and InDel corrections are performed `[Default: 1M]`. + - `[sample].Final_Corrected_Assembly_FastA_File.tsv`: Details if the final error-corrected genome assembly file meets the minimum file size criteria `[Default: 1M]`. + - `[sample].Binary_SE_Alignment_Map_File.tsv`: Details if the single-end (SE) alignment file meets the minimum file size criteria after the cleaned singleton reads are mapped onto the final genome assembly file `[Default: 1k]`. + - `[sample].Annotated_GenBank_File.tsv`: Details if the annotated GenBank file meets the minimum file size criteria `[Default: 3M]`. 
+ - `[sample].SSU_Extracted_File.tsv`: Details if the extracted 16S rRNA gene sequence file meets the minimum file size criteria `[Default: 500b]`. + - `[sample]-[assembler].SSU_Renamed_File.tsv`: Details if the 16S rRNA gene sequence file meets the minimum file size criteria after sample identifiers are added to each sequence `[Default: 500b]`. + - `[sample].16S_BLASTn_Output_File.tsv`: Details if the BLASTn output file meets the minimum file size criteria `[Default: 10b]`. + - `[sample].Filtered_16S_BLASTn_File.tsv`: Details if the best BLASTn alignment sequence meets the minimum file size criteria `[Default: 10b]`. diff --git a/docs/usage.md b/docs/usage.md index 2920927b..0dac78dd 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -239,35 +239,35 @@ The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementatio 2. Find the latest version of the Biocontainer available on [Quay.io](https://quay.io/repository/biocontainers/pangolin?tag=latest&tab=tags) 3. Create the custom config accordingly: - - For Docker: - - ```nextflow - process { - withName: PANGOLIN { - container = 'quay.io/biocontainers/pangolin:3.0.5--pyhdfd78af_0' - } - } - ``` - - - For Singularity: - - ```nextflow - process { - withName: PANGOLIN { - container = 'https://depot.galaxyproject.org/singularity/pangolin:3.0.5--pyhdfd78af_0' - } - } - ``` - - - For Conda: - - ```nextflow - process { - withName: PANGOLIN { - conda = 'bioconda::pangolin=3.0.5' - } - } - ``` + - For Docker: + + ```nextflow + process { + withName: PANGOLIN { + container = 'quay.io/biocontainers/pangolin:3.0.5--pyhdfd78af_0' + } + } + ``` + + - For Singularity: + + ```nextflow + process { + withName: PANGOLIN { + container = 'https://depot.galaxyproject.org/singularity/pangolin:3.0.5--pyhdfd78af_0' + } + } + ``` + + - For Conda: + + ```nextflow + process { + withName: PANGOLIN { + conda = 'bioconda::pangolin=3.0.5' + } + } + ``` > [!NOTE] > If you wish to periodically update individual tool-specific results (e.g., Pangolin) generated by the pipeline then you must ensure to keep the `work/` directory otherwise the `-resume` ability of the pipeline will be compromised and it will restart from scratch. 
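For example, a custom config such as the snippets above can be attached together with `-resume` (here `custom.config` is a hypothetical file name):

```bash
# Attach a custom per-process config (e.g., container overrides) and resume an
# earlier run without redoing completed work; keep the work/ directory so that
# -resume remains effective.
nextflow run bacterial-genomics/wf-paired-end-illumina-assembly \
  -profile docker \
  -c custom.config \
  -resume
```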
diff --git a/modules.json b/modules.json index 215b9369..1d7e71f6 100644 --- a/modules.json +++ b/modules.json @@ -12,7 +12,7 @@ }, "gtdbtk/classifywf": { "branch": "master", - "git_sha": "c67eaf89682a12966f60008a8fa30f5dd29239df", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] } } diff --git a/modules/local/align_16S_blast/main.nf b/modules/local/align_16S_blast/main.nf index 16ee728c..28957222 100644 --- a/modules/local/align_16S_blast/main.nf +++ b/modules/local/align_16S_blast/main.nf @@ -30,12 +30,12 @@ process ALIGN_16S_BLAST { -out "!{meta.id}-!{meta.assembler}.blast.tsv" \ -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qcovhsp ssciname" - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}-!{meta.assembler}.16S_BLASTn_Output_File.tsv" - if verify_minimum_file_size "!{meta.id}-!{meta.assembler}.blast.tsv" '16S BLASTn Output File' "!{params.min_filesize_blastn_output}"; then - echo -e "!{meta.id}\t16S BLASTn Output File\tPASS" \ + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}-!{meta.assembler}.16S_BLASTn_Output_File.tsv" + if verify_minimum_file_size "!{meta.id}-!{meta.assembler}.blast.tsv" '16S BLASTn Output TSV File' "!{params.min_filesize_blastn_output}"; then + echo -e "!{meta.id}\t16S BLASTn Output TSV File\tPASS" \ >> "!{meta.id}-!{meta.assembler}.16S_BLASTn_Output_File.tsv" else - echo -e "!{meta.id}-!{meta.assembler}\t16S BLASTn Output File\tFAIL" \ + echo -e "!{meta.id}\t16S BLASTn Output TSV File\tFAIL" \ >> "!{meta.id}-!{meta.assembler}.16S_BLASTn_Output_File.tsv" fi diff --git a/modules/local/annotate_prokka/main.nf b/modules/local/annotate_prokka/main.nf index 4b2c53cf..485801ea 100644 --- a/modules/local/annotate_prokka/main.nf +++ b/modules/local/annotate_prokka/main.nf @@ -2,7 +2,7 @@ process ANNOTATE_PROKKA { label "process_high" tag { "${meta.id}-${meta.assembler}" } - container "snads/prokka@sha256:ef7ee0835819dbb35cf69d1a2c41c5060691e71f9138288dd79d4922fa6d0050" + container "staphb/prokka@sha256:6bb2522e077ef08a8be7a3856fe80372ede3b832becba0728e1ddbe83d89a042" input: tuple val(meta), path(assembly) @@ -11,6 +11,7 @@ process ANNOTATE_PROKKA { tuple val(meta), path("${meta.id}-${meta.assembler}.Annotated_GenBank_File.tsv"), emit: qc_filecheck tuple val(meta), path("${meta.id}-${meta.assembler}.gbk") , emit: prokka_genbank_file path("prokka/${meta.id}-${meta.assembler}.log.gz") + path("${meta.id}.Annotation_GenBank.SHA512-checksums.tsv") , emit: checksums path(".command.{out,err}") path("versions.yml") , emit: versions @@ -24,7 +25,7 @@ process ANNOTATE_PROKKA { sed -i "s/!{meta.id}/${short_base}/g" !{assembly} # Annotate cleaned and corrected assembly - msg "INFO: Annotating assembly using Prokka" + msg "INFO: Annotating assembly using Prokka for !{meta.id}..." # Run Prokka prokka \ @@ -39,6 +40,8 @@ process ANNOTATE_PROKKA { --cpus !{task.cpus} \ !{assembly} + msg "INFO: Completed annotating the assembly using Prokka for !{meta.id}..." 
+ # Regardless of the file extension, unify to GBK extension for GenBank format for ext in gb gbf gbff gbk; do if [ -s "prokka/!{meta.id}-!{meta.assembler}.${ext}" ]; then @@ -49,7 +52,7 @@ process ANNOTATE_PROKKA { done # Verify output file - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Annotated_GenBank_File.tsv" + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Annotated_GenBank_File.tsv" if verify_minimum_file_size "!{meta.id}-!{meta.assembler}.gbk" 'Annotated GenBank File' "!{params.min_filesize_annotated_genbank}"; then echo -e "!{meta.id}-!{meta.assembler}\tAnnotated GenBank File\tPASS" \ >> "!{meta.id}-!{meta.assembler}.Annotated_GenBank_File.tsv" @@ -61,10 +64,25 @@ process ANNOTATE_PROKKA { # Compress the bulky verbose logfile for compact storage gzip -9f "prokka/!{meta.id}-!{meta.assembler}.log" + # Calculate checksum + FILE="!{meta.id}-!{meta.assembler}.gbk" + CHECKSUM=$(awk '/^LOCUS/ {gsub(/[[:space:]]+[0-9]{2}-[A-Z]{3}-[0-9]{4}/, "", $0); print} !/^LOCUS/ {print}' "${FILE}" | sha512sum | awk '{print $1}') + echo "${CHECKSUM}" | awk -v sample_id="!{meta.id}" -v file="${FILE}" ' + BEGIN { + # Print the header once + print "Sample_name\tChecksum_(SHA-512)\tFile" + } + { + # Print the data row once, using the CHECKSUM from input + print sample_id "\t" $1 "\t" file + }' \ + > "!{meta.id}.Annotation_GenBank.SHA512-checksums.tsv" + # Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": prokka: $(prokka --version 2>&1 | awk 'NF>1{print $NF}') + sha512sum: $(sha512sum --version | grep ^sha512sum | sed 's/sha512sum //1') END_VERSIONS ''' } diff --git a/modules/local/assemble_contigs_skesa/main.nf b/modules/local/assemble_contigs_skesa/main.nf index 1ccffee0..2264eb61 100644 --- a/modules/local/assemble_contigs_skesa/main.nf +++ b/modules/local/assemble_contigs_skesa/main.nf @@ -2,7 +2,7 @@ process ASSEMBLE_CONTIGS_SKESA { label "process_high" tag { "${meta.id}" } - container "gregorysprenger/skesa@sha256:4455882b5d0fd968630325428729395422be7340301c31d15874a295904b7f26" + container "staphb/skesa@sha256:b520da51cd3929683c5eb94739bcd6c32045863dab16e777a4e02d2ff3802f20" input: tuple val(meta), path(cleaned_fastq_files) @@ -19,7 +19,7 @@ process ASSEMBLE_CONTIGS_SKESA { ''' source bash_functions.sh - msg "INFO: Assembling contigs using SKESA" + msg "INFO: Assembling !{meta.id} contigs using SKESA ..." if [[ ! 
-f "!{meta.id}-SKESA_contigs.fasta" ]]; then skesa \ @@ -37,14 +37,18 @@ process ASSEMBLE_CONTIGS_SKESA { --vector_percent !{params.skesa_vector_percent} fi - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Raw_Assembly_File.tsv" - if verify_minimum_file_size "!{meta.id}-SKESA_contigs.fasta" 'Raw Assembly File' "!{params.min_filesize_raw_assembly}"; then - echo -e "!{meta.id}-!{meta.assembler}\tRaw Assembly File\tPASS" \ + msg "INFO: Completed genome assembly for !{meta.id} using SKESA" + + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Raw_Assembly_File.tsv" + if verify_minimum_file_size "!{meta.id}-SKESA_contigs.fasta" 'Raw Assembly FastA File' "!{params.min_filesize_raw_assembly}"; then + echo -e "!{meta.id}\tRaw Assembly FastA File\tPASS" \ >> "!{meta.id}-!{meta.assembler}.Raw_Assembly_File.tsv" else - echo -e "!{meta.id}-!{meta.assembler}\tRaw Assembly File\tFAIL" > "!{meta.id}-!{meta.assembler}.Raw_Assembly_File.tsv" + echo -e "!{meta.id}\tRaw Assembly FastA File\tFAIL" > "!{meta.id}-!{meta.assembler}.Raw_Assembly_File.tsv" fi + msg "INFO: Completed QC file checks for !{meta.id} SKESA" + # Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": diff --git a/modules/local/assemble_contigs_spades/main.nf b/modules/local/assemble_contigs_spades/main.nf index a2e5486d..f4862db0 100644 --- a/modules/local/assemble_contigs_spades/main.nf +++ b/modules/local/assemble_contigs_spades/main.nf @@ -2,7 +2,7 @@ process ASSEMBLE_CONTIGS_SPADES { label "process_high" tag { "${meta.id}" } - container "gregorysprenger/spades@sha256:3fe1ebda8f5746ca3e3ff79c74c220d2ca75e3120f20441c3e6ae88eff03b4dc" + container "staphb/spades@sha256:5df39e8404df2678ccc6c6ed9d7aa0e59b79dfa798aef7fd4fc06cc86ba0b4c0" input: tuple val(meta), path(cleaned_fastq_files) @@ -35,12 +35,12 @@ process ASSEMBLE_CONTIGS_SPADES { --threads !{task.cpus} # Verify file output - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Raw_Assembly_File.tsv" - if verify_minimum_file_size "SPAdes/contigs.fasta" 'Raw Assembly File' "!{params.min_filesize_raw_assembly}"; then - echo -e "!{meta.id}-!{meta.assembler}\tRaw Assembly File\tPASS" \ + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Raw_Assembly_File.tsv" + if verify_minimum_file_size "SPAdes/contigs.fasta" 'Raw Assembly FastA File' "!{params.min_filesize_raw_assembly}"; then + echo -e "!{meta.id}\tRaw Assembly FastA File\tPASS" \ >> "!{meta.id}-!{meta.assembler}.Raw_Assembly_File.tsv" else - echo -e "!{meta.id}-!{meta.assembler}\tRaw Assembly File\tFAIL" \ + echo -e "!{meta.id}\tRaw Assembly FastA File\tFAIL" \ >> "!{meta.id}-!{meta.assembler}.Raw_Assembly_File.tsv" fi diff --git a/modules/local/assess_assembly_checkm2/README.md b/modules/local/assess_assembly_checkm2/README.md index 6207e5ae..99352556 100644 --- a/modules/local/assess_assembly_checkm2/README.md +++ b/modules/local/assess_assembly_checkm2/README.md @@ -9,9 +9,10 @@ This process uses [CheckM2](https://github.com/chklovski/CheckM2) published in [ ## How CheckM2 works From [CheckM2's documentation](https://github.com/chklovski/CheckM2): + > CheckM2 uses two distinct machine learning models to predict genome completeness. > > - The 'general' gradient boost model is able to generalize well and is intended to be used on organisms not well represented in GenBank or RefSeq (roughly, when an organism is novel at the level of order, class or phylum). 
> - The 'specific' neural network model is more accurate when predicting completeness of organisms more closely related to the reference training set (roughly, when an organism belongs to a known species, genus or family). -> CheckM2 uses a cosine similarity calculation to automatically determine the appropriate completeness model for each input genome, but you can also force the use of a particular completeness model, or get the prediction outputs for both. -> There is only one contamination model (based on gradient boost) which is applied regardless of taxonomic novelty and works well across all cases. +> CheckM2 uses a cosine similarity calculation to automatically determine the appropriate completeness model for each input genome, but you can also force the use of a particular completeness model, or get the prediction outputs for both. +> There is only one contamination model (based on gradient boost) which is applied regardless of taxonomic novelty and works well across all cases. diff --git a/modules/local/assess_assembly_checkm2/main.nf b/modules/local/assess_assembly_checkm2/main.nf index 2c122c15..fd0e98bc 100644 --- a/modules/local/assess_assembly_checkm2/main.nf +++ b/modules/local/assess_assembly_checkm2/main.nf @@ -1,7 +1,8 @@ process ASSESS_ASSEMBLY_CHECKM2 { - label "process_high" + label "process_high" //NOTE: h_vmem=126.6G+ and h_rss=103114M+ normally required or else exit 140 status code tag { "${meta.id}" } + // container "quay.io/biocontainers/checkm2:1.0.2--pyh7cba7a3_0" container "quay.io/biocontainers/checkm2:1.0.1--pyh7cba7a3_0" input: @@ -9,21 +10,22 @@ process ASSESS_ASSEMBLY_CHECKM2 { path(database) output: - tuple val(meta), path("${meta.id}.CheckM2_Report_File.tsv"), emit: qc_filecheck - tuple val(meta), path("${meta.id}.CheckM2.report.tsv") , emit: summary - path("${meta.id}.CheckM2.log.gz") - path("${meta.id}.CheckM2.alignments.tsv.gz") + tuple val(meta), path("${meta.id}.CheckM2_Report_File.tsv"), emit: qc_filecheck + tuple val(meta), path("${meta.id}-${meta.assembler}.CheckM2.results.tsv"), emit: summary + path("${meta.id}-${meta.assembler}.CheckM2.log.gz") + path("${meta.id}-${meta.assembler}.CheckM2.alignments.tsv.gz") path(".command.{out,err}") - path("versions.yml") , emit: versions + path("versions.yml"), emit: versions shell: ''' source bash_functions.sh # Assess the full FastA assembly with CheckM2 - msg "INFO: Evaluating the assembly contig set for completeness and contamination with CheckM2" + msg "INFO: Evaluating the assembly contig set of !{meta.id} for completeness and contamination with CheckM2" # Run CheckM2 + # NOTE: h_vmem=126.6G+ and h_rss=103114M+ normally required or else exit 140 status code checkm2 \ predict \ --input !{assembly} \ @@ -33,20 +35,30 @@ process ASSESS_ASSEMBLY_CHECKM2 { !{params.checkm2_model} \ --threads !{task.cpus} + msg "INFO: CheckM2 completed for !{meta.id} assembly" + # Move and rename the report, alignments, and log files - mv -f checkm2/quality_report.tsv "!{meta.id}.CheckM2.report.tsv" - mv -f checkm2/diamond_output/DIAMOND_RESULTS.tsv "!{meta.id}.CheckM2.alignments.tsv" - mv -f checkm2/checkm2.log "!{meta.id}.CheckM2.log" + mv -f checkm2/quality_report.tsv "!{meta.id}-!{meta.assembler}.CheckM2.results.tsv" + mv -f checkm2/diamond_output/DIAMOND_RESULTS.tsv "!{meta.id}-!{meta.assembler}.CheckM2.alignments.tsv" + mv -f checkm2/checkm2.log "!{meta.id}-!{meta.assembler}.CheckM2.log" - # Verify output report file - if verify_minimum_file_size "!{meta.id}.CheckM2.report.tsv" 'CheckM2 Report File' 
"!{params.min_filesize_checkm2_report}"; then - echo -e "!{meta.id}\tCheckM2 Report File\tPASS" > !{meta.id}.CheckM2_Report_File.tsv + # Test/verify paired FastQ outfiles sizes are reasonable to continue + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}.CheckM2_Report_File.tsv" + if verify_minimum_file_size "!{meta.id}-!{meta.assembler}.CheckM2.results.tsv" 'CheckM2 Report File' "!{params.min_filesize_checkm2_report}"; then + echo -e "!{meta.id}\tCheckM2 Report File\tPASS" >> "!{meta.id}.CheckM2_Report_File.tsv" else - echo -e "!{meta.id}\tCheckM2 Report File\tFAIL" > !{meta.id}.CheckM2_Report_File.tsv + echo -e "!{meta.id}\tCheckM2 Report File\tFAIL" >> "!{meta.id}.CheckM2_Report_File.tsv" fi + # Replace space characters in header line with underscores + sed -i '1s/ /_/g' "!{meta.id}-!{meta.assembler}.CheckM2.results.tsv" + sed -i '1s/^Name/Sample_name/1' "!{meta.id}-!{meta.assembler}.CheckM2.results.tsv" + + # Replace id-assembler with just id in the data row + sed -i "2s/!{meta.id}-!{meta.assembler}/!{meta.id}/1" "!{meta.id}-!{meta.assembler}.CheckM2.results.tsv" + # Compress the log and alignments files for compact storage - gzip -9f "!{meta.id}.CheckM2.log" "!{meta.id}.CheckM2.alignments.tsv" + gzip -9f "!{meta.id}-!{meta.assembler}.CheckM2.log" "!{meta.id}-!{meta.assembler}.CheckM2.alignments.tsv" # Get process version information cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/best_16S_blastn_bitscore_taxon_python/main.nf b/modules/local/best_16S_blastn_bitscore_taxon_python/main.nf index 30460817..f3ce166a 100644 --- a/modules/local/best_16S_blastn_bitscore_taxon_python/main.nf +++ b/modules/local/best_16S_blastn_bitscore_taxon_python/main.nf @@ -8,7 +8,7 @@ process BEST_16S_BLASTN_BITSCORE_TAXON_PYTHON { output: tuple val(meta), path("${meta.id}-${meta.assembler}.Filtered_16S_BLASTn_File.tsv"), emit: qc_filecheck - tuple val(meta), path("${meta.id}-${meta.assembler}.16S-top-species.tsv") , emit: top_blast_species + tuple val(meta), path("${meta.id}-${meta.assembler}.16S-top-species.tsv") , emit: summary path("${meta.id}-${meta.assembler}.blast.tsv.gz") path(".command.{out,err}") path("versions.yml") , emit: versions @@ -17,27 +17,32 @@ process BEST_16S_BLASTN_BITSCORE_TAXON_PYTHON { ''' source bash_functions.sh + msg "INFO: filtering !{blast_output} for highest bitscore value to report top 16S BLASTn species match" + # Get the top match by bitscore filter.blast.py \ -i "!{blast_output}" \ - -o "!{meta.id}-!{meta.assembler}.blast.tsv" \ + -o "!{meta.id}-!{meta.assembler}.top-blast-bitscore.tsv" \ -c !{params.filter_blast_column} \ -s !{params.filter_blast_bitscore} - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Filtered_16S_BLASTn_File.tsv" + msg "INFO: top 16S rRNA gene match to species by BLAST created: !{meta.id}-!{meta.assembler}.top-blast-bitscore.tsv" + + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Filtered_16S_BLASTn_File.tsv" - if verify_minimum_file_size "!{meta.id}-!{meta.assembler}.blast.tsv" 'Filtered 16S BLASTn File' "!{params.min_filesize_filtered_blastn}"; then + if verify_minimum_file_size "!{meta.id}-!{meta.assembler}.top-blast-bitscore.tsv" 'Filtered 16S BLASTn File' "!{params.min_filesize_filtered_blastn}"; then echo -e "!{meta.id}-!{meta.assembler}\tFiltered 16S BLASTn File\tPASS" \ >> "!{meta.id}-!{meta.assembler}.Filtered_16S_BLASTn_File.tsv" # Report the top alignment match data: %nucl iden, %query cov aln, taxon - awk -F $'\t' 'BEGIN{OFS=FS}; {print 
$1, $3 "% identity", $13 "% alignment", $14}' \ - "!{meta.id}-!{meta.assembler}.blast.tsv" \ - > "!{meta.id}-!{meta.assembler}.16S-top-species.tsv" - - sed -i \ - '1i Sample name\tPercent identity\tPercent alignment\tSpecies match' \ - "!{meta.id}-!{meta.assembler}.16S-top-species.tsv" + # and split up the first column "_"; add header. + # NOTE: a[length(a)] is used to take the last item in cases where + # samplename is Name_S1_L001_1 so it would get the final "1" + awk 'BEGIN { FS=OFS="\t"; print "Sample_name\tUnique_16S_rRNA_extraction_count\tIdentity_(%)\tAlignment_(%)\tSpecies_match" } + { split($1, a, "_"); print a[1], a[length(a)], $3, $13, $14 }' \ + "!{meta.id}-!{meta.assembler}.top-blast-bitscore.tsv" \ + > tmp.tsv \ + && mv -f tmp.tsv "!{meta.id}-!{meta.assembler}.16S-top-species.tsv" else echo -e "!{meta.id}-!{meta.assembler}\tFiltered 16S BLASTn File\tFAIL" \ @@ -49,24 +54,25 @@ process BEST_16S_BLASTN_BITSCORE_TAXON_PYTHON { # Add header to BLAST output SUMMARY_HEADER=( - "Query ID" - "Reference ID" - "Identity (%)" - "Alignment length" - "Number of mismatches" - "Number of gap openings" - "Query start position" - "Query end position" - "Reference start position" - "Reference end position" - "Expect value" - "Bit score" - "Query coverage per HSP" - "Reference scientific name" + "Query_Name" + "Reference_Name" + "Identity_(%)" + "Alignment_length_(bp)" + "Mismatches_(#)" + "Gap_openings_(#)" + "Query_start_position" + "Query_end_position" + "Reference_start_position" + "Reference_end_position" + "Expect_value_(e-value)" + "Bit_score_(bits)" + "Query_coverage_per_HSP_(%)" + "Reference_scientific_name" ) - SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}") + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//') sed -i "1i ${SUMMARY_HEADER}" "!{blast_output}" + # Compress input TSV full alignment file to be saved in outdir gzip -f !{blast_output} # Get process version information diff --git a/modules/local/blast_db_preparation_unix/main.nf b/modules/local/blast_db_preparation_unix/main.nf index 81e8e2ef..9dfb4a33 100644 --- a/modules/local/blast_db_preparation_unix/main.nf +++ b/modules/local/blast_db_preparation_unix/main.nf @@ -19,7 +19,7 @@ process BLAST_DB_PREPARATION_UNIX { # Make sure database contains 16S_ribosomal_RNA files if [[ $(find database/ -type f -name "16S_ribosomal_RNA*" | wc -l) -lt 1 ]]; then - msg "ERROR: Missing 16S ribosomal RNA database files from NCBI BLAST!" + msg "ERROR: Missing 16S ribosomal RNA database files from NCBI BLAST!" >&2 exit 1 fi diff --git a/modules/local/busco_db_preparation_unix/main.nf b/modules/local/busco_db_preparation_unix/main.nf index a4ddfdba..d3f80cd7 100644 --- a/modules/local/busco_db_preparation_unix/main.nf +++ b/modules/local/busco_db_preparation_unix/main.nf @@ -25,7 +25,7 @@ process BUSCO_DB_PREPARATION_UNIX { if [[ !{meta.id} =~ odb10 ]]; then for directory in info hmms; do if [[ ! -d "!{output_dir}/${directory}" ]]; then - msg "ERROR: BUSCO dataset is missing required directory: `${directory}`." + msg "ERROR: BUSCO dataset is missing required directory: `${directory}`." >&2 exit 1 fi done @@ -36,7 +36,7 @@ process BUSCO_DB_PREPARATION_UNIX { num_info_dirs=$(find !{output_dir}/ -maxdepth 3 -type d -name "info" | wc -l) if [[ $num_odb10_dirs != $num_hmms_dirs ]] && [[ $num_odb10_dirs != $num_info_dirs ]]; then - msg "ERROR: BUSCO database does not have the required directories `hmms` and `info` in each lineage dataseet." 
+ msg "ERROR: BUSCO database does not have the required directories `hmms` and `info` in each lineage dataseet." >&2 exit 1 fi fi diff --git a/modules/local/calculate_coverage_unix/main.nf b/modules/local/calculate_coverage_unix/main.nf index 81392c9f..f3201b80 100644 --- a/modules/local/calculate_coverage_unix/main.nf +++ b/modules/local/calculate_coverage_unix/main.nf @@ -4,7 +4,7 @@ process CALCULATE_COVERAGE_UNIX { container "ubuntu:jammy" input: - tuple val(meta), path(summary_assemblies), path(summary_reads), path(summary_stats) + tuple val(meta), path(summary_assembly_metrics), path(summary_cleanedreads), path(summary_aligned_stats) output: path("${meta.id}-${meta.assembler}.GenomeCoverage.tsv"), emit: summary @@ -15,36 +15,56 @@ process CALCULATE_COVERAGE_UNIX { ''' source bash_functions.sh - # Report coverage + # Use assembly report sample names echo -n '' > "!{meta.id}-!{meta.assembler}.GenomeCoverage.tsv" i=0 while IFS=$'\t' read -r -a ln; do - if grep -q -e "skesa_" -e "unicyc_" -e ".uncorrected" <<< "${ln[0]}"; then + msg "INFO: looking for depth of coverage for ${ln[0]}" + + # Skip unusually named assembly filenames that shouldn't be in this workflow + if grep -q -e "unicyc_" -e ".uncorrected" <<< "${ln[0]}"; then + msg "INFO: skipping ${ln[0]} due to containing unicyc_ or .uncorrected" continue fi - basepairs=$(grep ${ln[0]} !{summary_stats} 2> /dev/null \ - | awk 'BEGIN{FS="\t"}; {print $2}' | awk '{print $1}' | sort -u) + # If the mapped/aligned stats file exists, use those depth values + mapped_coverage=$(grep ${ln[0]} !{summary_aligned_stats} 2> /dev/null | awk '{print $9"x_+/-_" $10"x"}') - if [[ "${basepairs}" =~ ^[0-9]+$ ]]; then - msg "INFO: Read alignment data for ${ln[0]} used for coverage" >&2 - else - basepairs=$(grep ${ln[0]} !{summary_reads} | cut -f 2) - msg "INFO: Read alignment data absent for ${ln[0]}, so cleaned bases" >&2 - msg " given to the assembler were used to calculate coverage" >&2 - fi + if [[ "${mapped_coverage}" =~ ^[0-9]+[.][0-9]{1}x_[+][/][-]_[0-9]+[.][0-9]{1}x$ ]]; then + msg "INFO: Cleaned read sequences mapped to assembly Coverage data found for !{meta.id}-!{meta.assembler}: ${mapped_coverage}" + echo -e "${ln[0]}\t${mapped_coverage}" >> "!{meta.id}-!{meta.assembler}.GenomeCoverage.tsv" + ((i=i+1)) - genomelen=${ln[7]} - cov=$(echo | awk -v x=${basepairs} -v y=${genomelen} '{printf ("%0.1f", x/y)}') - msg "INFO: Coverage of !{meta.id}-!{meta.assembler}: $cov" + # If mapped/aligned stats are missing, revert to just extracting the + # cleaned reads basepair count and the assembly cumulative length to + # estimate coverage. + # NOTE: This shouldn't happen, but it's a backup and easily discernible + # from the mapped-based method, because no stdev ("_+/-_") is + # reported with this method. + else + msg "INFO: Read alignment data absent for ${ln[0]}, so cleaned bases" + msg " given to the assembler were used to calculate coverage" + basepairs=$(grep ${ln[0]} !{summary_cleanedreads} | cut -f 2) + assembly_length=$(grep ${ln[0]} !{summary_assembly_metrics} | cut -f 4) + if [[ -z "$basepairs" || -z "$assembly_length" || ! "$basepairs" =~ ^[0-9]+$ || ! 
"$assembly_length" =~ ^[0-9]+$ || "$basepairs" -le 0 || "$assembly_length" -le 0 ]]; then + msg "ERROR: skipping ${ln[0]}: $basepairs bp or $assembly_length bp are unset, empty, not integers, or not greater than zero" >&2 + continue + fi + cleaned_basepairs_per_assembly_length=$(awk -v a="${basepairs}" -v b="${assembly_length}" '{print a / b}') + cleaned_basepairs_per_assembly_length=$(printf "%.1fx" "$cleaned_basepairs_per_assembly_length") - if [[ "${cov}" =~ ^[0-9]+([.][0-9]+)?$ ]]; then - echo -e "${ln[0]}\t${cov}x" >> "!{meta.id}-!{meta.assembler}.GenomeCoverage.tsv" - ((i=i+1)) + if [[ "${cleaned_basepairs_per_assembly_length}" =~ ^[0-9]+[.][0-9]{1}x$ ]]; then + msg "INFO: Cleaned read basepairs per assembly site Coverage for !{meta.id}-!{meta.assembler}: ${cleaned_basepairs_per_assembly_length}" + echo -e "${ln[0]}\t${cleaned_basepairs_per_assembly_length}" >> "!{meta.id}-!{meta.assembler}.GenomeCoverage.tsv" + ((i=i+1)) + fi fi - done < <(grep -v 'Total length' !{summary_assemblies}) + done < <(grep -v -e 'Total_length' -e 'Total length' !{summary_assembly_metrics}) + + msg "INFO: stored coverage values for ${i} sample(s)" - sed -i '1i Sample name\tCoverage' "!{meta.id}-!{meta.assembler}.GenomeCoverage.tsv" + # Add header row to data output + sed -i '1i Sample_name\tCoverage_(mean[x]_+/-_stdev[x])' "!{meta.id}-!{meta.assembler}.GenomeCoverage.tsv" # Get process version information cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/calculate_metrics_fastq_seqkit/main.nf b/modules/local/calculate_metrics_fastq_seqkit/main.nf new file mode 100644 index 00000000..1424bd8b --- /dev/null +++ b/modules/local/calculate_metrics_fastq_seqkit/main.nf @@ -0,0 +1,124 @@ +process CALCULATE_METRICS_FASTQ_SEQKIT { + + tag { "${meta.id}" } + container "staphb/seqkit@sha256:8eb09a52ae932f7c25cfbb8db0df7110567087a187c7e90d46f499962d1c82c9" + + input: + tuple val(meta), path(reads) + val(input_fastq_type) + + output: + path("${meta.id}.${input_fastq_type}.metrics_summary.tsv"), emit: output + path(".command.{out,err}") + path("versions.yml") , emit: versions + + shell: + ''' + source bash_functions.sh + + msg "INFO: Calculating !{meta.id} statistics of !{input_fastq_type} FastQ input with SeqKit..." + + fastq_files=( !{reads} ) + num_reads="${#fastq_files[@]}" + + msg "INFO: Found ${num_reads} of !{input_fastq_type} FastQ files: ${fastq_files[@]}" + msg "INFO: Found !{input_fastq_type} FastQ files: !{reads}" + + # Calculate stats on 1 or more FastQ input files + # SeqKit prints each file stats separately, line-by-line, no total. + # Unlike seqtk, SeqKit autohandles input files with or without compression + # and reports correct numbers regardless (thank you! 
@shenwei356) + seqkit \ + stats \ + --tabular \ + --threads "!{task.cpus}" \ + --out-file "!{meta.id}.!{input_fastq_type}.seqkit_stats.tsv" \ + !{reads} + + msg "INFO: Calculated statistics with SeqKit for !{input_fastq_type} FastQ files: !{reads}" + msg "INFO: Calculated statistics with SeqKit for ${num_reads} !{input_fastq_type} FastQ files: ${fastq_files[@]}" + + awk -v sample_id="!{meta.id}" ' + BEGIN { + # Set the header contents (renamed) + OFS = "\t" + header_translation["num_seqs"] = "Total_Length_[bp]" + header_translation["sum_len"] = "Total_Sequences_[#]" + header_translation["min_len"] = "Minimum_Sequence_Length_[bp]" + header_translation["avg_len"] = "Mean_Sequence_Length_[bp]" + header_translation["max_len"] = "Maximum_Sequence_Length_[bp]" + } + + NR == 1 { + # Change header row item from "file" into "Sample_name" + $1 = "Sample_name" + + # Rename specific header names + for (i = 1; i <= NF; i++) { + if ($i in header_translation) { + $i = header_translation[$i] + } + } + + # Print the modified header, excluding columns 2 and 3 ("format" and "type") + print $1, $4, $5, $6, $7, $8 + } + + NR > 1 { + # Process data rows, excluding columns 2 and 3 ("format" and "type") + # Aggregate results if there are multiple rows (from >1 FastQ input) + num_seqs += $4 + sum_len += $5 + + if (NR == 2) { + # First row: initialize min/max/avg values + min_len = $6 + max_len = $8 + avg_len = $7 + } else { + # Update min and max + if ($6 < min_len) min_len = $6 + if ($8 > max_len) max_len = $8 + } + } + + END { + # Always print the aggregated results (or single line if only one row) + print sample_id, sum_len, num_seqs, min_len, avg_len, max_len + }' \ + "!{meta.id}.!{input_fastq_type}.seqkit_stats.tsv" \ + > "!{meta.id}.!{input_fastq_type}.metrics_summary.tsv" + + filepath="$(readlink -f !{meta.id}.!{input_fastq_type}.metrics_summary.tsv)" + msg "INFO: Summarized SeqKit statistics of !{meta.id} !{input_fastq_type} for ${filepath}" + + # NOTE: DONE! 
+ # # TO-DO: move this unix-only component to separate QA_READS_BASEPAIR_COUNT_UNIX + # # Count nucleotides per read set + # echo -n '' > "!{meta.id}-!{meta.assembler}.Clean_Reads-Bases.tsv" + # for (( i=0; i<3; i+=3 )); do + # R1=$(basename "!{meta.id}_R1.paired.fq.gz" _R1.paired.fq.gz) + # R2=$(basename "!{meta.id}_R2.paired.fq.gz" _R2.paired.fq.gz) + # single=$(basename "!{meta.id}_single.fq.gz" _single.fq.gz) + + # # Verify each set of reads groups properly + # nr_uniq_str=$(echo -e "${R1}\\n${R2}\\n${single}" | sort -u | wc -l) + # if [ "${nr_uniq_str}" -ne 1 ]; then + # msg "ERROR: improperly grouped ${R1} ${R2} ${single}" >&2 + # exit 1 + # fi + # echo -ne "${R1}\t" >> "!{meta.id}-!{meta.assembler}.Clean_Reads-Bases.tsv" + # zcat "!{meta.id}_R1.paired.fq.gz" "!{meta.id}_R2.paired.fq.gz" "!{meta.id}_single.fq.gz" | \ + # awk 'BEGIN{SUM=0} {if(NR%4==2){SUM+=length($0)}} END{OFMT="%f"; print SUM}' \ + # >> "!{meta.id}-!{meta.assembler}.Clean_Reads-Bases.tsv" + + # sed -i '1i Sample_name\tCleaned_bases_(#)' "!{meta.id}-!{meta.assembler}.Clean_Reads-Bases.tsv" + # done + + # Get process version information + cat <<-END_VERSIONS > versions.yml + "!{task.process}": + seqkit: $(seqkit 2>&1 | grep "^Version: " | sed 's/^Version: //1') + END_VERSIONS + ''' +} diff --git a/modules/local/calculate_metrics_fastq_seqtk/main.nf b/modules/local/calculate_metrics_fastq_seqtk/main.nf new file mode 100644 index 00000000..178a82df --- /dev/null +++ b/modules/local/calculate_metrics_fastq_seqtk/main.nf @@ -0,0 +1,85 @@ +process CALCULATE_METRICS_FASTQ_SEQTK { + + tag { "${meta.id}" } + container "staphb/seqtk@sha256:82797114adb664ba939b4f6dfcb822483a4af827def4288e5207be559055f2cc" + + input: + tuple val(meta), path(reads) + val(input_fastq_type) + + output: + path("${meta.id}.${input_fastq_type}.seqtk.metrics_summary.tsv"), emit: output + path(".command.{out,err}") + path("versions.yml") , emit: versions + + shell: + ''' + source bash_functions.sh + + msg "INFO: Calculating !{meta.id} statistics of !{input_fastq_type} FastQ input with Seqtk..." + + fastq_files=( !{reads} ) + num_reads="${#fastq_files[@]}" + + msg "INFO: Found ${num_reads} of !{input_fastq_type} FastQ files: ${fastq_files[@]}" + msg "INFO: Found !{input_fastq_type} FastQ files: !{reads}" + + # Calculate stats on 1 or more FastQ input files + # Seqtk prints only totals regardless of 1, 2, or 3 input FastQ files, and + # gives 0-code exit status for files with GZ input but wrong numbers + # unless you pipe them in from stdin. + if [[ !{reads[0]} == *.gz ]]; then + zcat !{reads} | seqtk size > "!{meta.id}.!{input_fastq_type}.Seqtk_stats.tsv" + else + seqtk size !{reads} > "!{meta.id}.!{input_fastq_type}.Seqtk_stats.tsv" + fi + + msg "INFO: Calculated statistics with Seqtk for !{input_fastq_type} FastQ files: !{reads}" + + awk -v sample_name="!{meta.id}" ' + BEGIN { + # Add a header row of the FastQ sequence count data + print "Sample_name\tTotal_Length_[bp]\tTotal_Sequences_[#]" + } + + { + # Print the data row + print sample_name, $2, $1 + }' OFS="\t" \ + "!{meta.id}.!{input_fastq_type}.Seqtk_stats.tsv" \ + > "!{meta.id}.!{input_fastq_type}.seqtk.metrics_summary.tsv" + + filepath="$(readlink -f !{meta.id}.!{input_fastq_type}.seqtk.metrics_summary.tsv)" + msg "INFO: Summarized Seqtk statistics of !{meta.id} !{input_fastq_type} for ${filepath}" + + # NOTE: DONE! This section can be removed later but keeping for now + # as a reminder of what this is supposed to accomplish. 
+ # # TO-DO: move this unix-only component to separate QA_READS_BASEPAIR_COUNT_UNIX + # # Count nucleotides per read set + # echo -n '' > "!{meta.id}-!{meta.assembler}.Clean_Reads-Bases.tsv" + # for (( i=0; i<3; i+=3 )); do + # R1=$(basename "!{meta.id}_R1.paired.fq.gz" _R1.paired.fq.gz) + # R2=$(basename "!{meta.id}_R2.paired.fq.gz" _R2.paired.fq.gz) + # single=$(basename "!{meta.id}_single.fq.gz" _single.fq.gz) + + # # Verify each set of reads groups properly + # nr_uniq_str=$(echo -e "${R1}\\n${R2}\\n${single}" | sort -u | wc -l) + # if [ "${nr_uniq_str}" -ne 1 ]; then + # msg "ERROR: improperly grouped ${R1} ${R2} ${single}" >&2 + # exit 1 + # fi + # echo -ne "${R1}\t" >> "!{meta.id}-!{meta.assembler}.Clean_Reads-Bases.tsv" + # zcat "!{meta.id}_R1.paired.fq.gz" "!{meta.id}_R2.paired.fq.gz" "!{meta.id}_single.fq.gz" | \ + # awk 'BEGIN{SUM=0} {if(NR%4==2){SUM+=length($0)}} END{OFMT="%f"; print SUM}' \ + # >> "!{meta.id}-!{meta.assembler}.Clean_Reads-Bases.tsv" + + # sed -i '1i Sample_name\tCleaned_bases_(#)' "!{meta.id}-!{meta.assembler}.Clean_Reads-Bases.tsv" + # done + + # Get process version information + cat <<-END_VERSIONS > versions.yml + "!{task.process}": + seqtk: $(seqtk 2>&1 | grep "^Version: " | sed 's/^Version: //1') + END_VERSIONS + ''' +} diff --git a/modules/local/classify_16S_rdp/main.nf b/modules/local/classify_16S_rdp/main.nf index f017afc6..c5a0c085 100644 --- a/modules/local/classify_16S_rdp/main.nf +++ b/modules/local/classify_16S_rdp/main.nf @@ -1,7 +1,8 @@ process CLASSIFY_16S_RDP { + label "process_medium" // single CPU but needs the RAM boost tag { "${meta.id}" } - container "tpaisie/rdp@sha256:ee388dff2e17c567946b7f2bf326765586d30f4ea0a203800616c44f599d53cc" + container "staphb/rdp@sha256:c1e9882d51cdbcf8293fc2a0740679c2f630185bb7604f9c1a375ba3b6643802" input: tuple val(meta), path(barnapp_extracted_rna) @@ -14,36 +15,43 @@ process CLASSIFY_16S_RDP { shell: // WARN: RDP does not report version information. This variable must be updated when container is updated. - VERSION = '2.14' + VERSION='2.14' ''' source bash_functions.sh - msg "INFO: Performing RDP 16S Classification" + msg "INFO: Performing RDP 16S Classification of !{meta.id} ..." 
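The fixrank reshaping just below recovers the sample name and the 16S rRNA extraction counter by splitting the record ID on underscores and keeping the first and last tokens; the process's own note gives Name_S1_L001_1 as the tricky case. A minimal illustration of that split on a hypothetical ID:

    printf 'Name_S1_L001_1\tfixrank-columns-here\n' \
      | awk 'BEGIN { FS = OFS = "\t" } { split($1, a, "_"); print a[1], a[length(a)] }'
    # prints: Name<TAB>1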
classifier \ classify \ --format "!{params.rdp_output_format}" \ --gene "!{params.rdp_phylomarker}" \ - --outputFile "!{meta.id}.RDP.tsv" \ + --outputFile "!{meta.id}.RDP-raw.tsv" \ "!{barnapp_extracted_rna}" - if [[ "!{params.rdp_output_format}" == "fixrank" ]]; then - # Drop unnecessary columns - awk -F '\t' '{print $1,$3,$5,$6,$8,$9,$11,$12,$14,$15,$17,$18,$20}' \ - "!{meta.id}.RDP.tsv" \ - > "!{meta.id}.RDP_tmp.tsv" + msg "INFO: Completed RDP 16S Classification of !{meta.id}" - mv -f "!{meta.id}.RDP_tmp.tsv" "!{meta.id}.RDP.tsv" + if [[ "!{params.rdp_output_format}" == "fixrank" ]]; then + # Split up the first column "_"; add header; + # discard some columns that are now stored as headers + # (e.g., "domain", "phylum", "class", "order", "family", "genus") + # NOTE: a[length(a)] is used to take the last item in cases where + # samplename is Name_S1_L001_1 so it would get the final "1" + awk 'BEGIN { + FS=OFS="\t"; + print "Sample_name\tUnique_16S_rRNA_extraction_count\tDomain_result\tPhylum_result\tClass_result\tOrder_result\tFamily_result\tGenus_result" + } + { + split($1, a, "_"); + print a[1], a[length(a)], $3, $6, $9, $12, $15, $18 + }' "!{meta.id}.RDP-raw.tsv" \ + > "!{meta.id}.RDP.tsv" - # Add header - sed -i \ - '1i Domain\tDomain result\tPhylum\tPhylum result\tClass\tClass result\tOrder\tOrder result\tFamily\tFamily result\tGenus\tGenus result' \ - "!{meta.id}.RDP.tsv" else - # Add RDP format as a header for file collection - sed -i "1i !{params.rdp_output_format}" "!{meta.id}.RDP.tsv" + # Other `--format ` options have varying numbers and names for header, so avoid adding any for now + msg "WARN: RDP Classifier with `--format !{params.rdp_output_format}` unknown header column names might prevent downstream XLSX summary conversion" fi + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}.RDP_Classification_File.tsv" if verify_minimum_file_size "!{meta.id}.RDP.tsv" '16S Classification Output File' "!{params.min_filesize_rdp_output}"; then echo -e "!{meta.id}\t16S RDP Output File\tPASS" >> !{meta.id}.RDP_Classification_File.tsv else diff --git a/modules/local/classify_contigs_cat/main.nf b/modules/local/classify_contigs_cat/main.nf index f974e648..fc5611eb 100644 --- a/modules/local/classify_contigs_cat/main.nf +++ b/modules/local/classify_contigs_cat/main.nf @@ -10,6 +10,7 @@ process CLASSIFY_CONTIGS_CAT { output: tuple val(meta), path("${meta.id}.CAT-Classification*tsv"), emit: summary + path("${meta.id}.Contigs.tsv") , emit: output tuple val(meta), path("${meta.id}.CAT_Output_File.tsv") , emit: qc_filecheck path("${meta.id}.CAT-Classification.log.gz") path(".command.{out,err}") @@ -20,7 +21,7 @@ process CLASSIFY_CONTIGS_CAT { source bash_functions.sh # Classify FastA assembly contigs with CAT - msg "INFO: Classifying each assembly contig independently with CAT" + msg "INFO: Classifying each !{meta.id} assembly contig independently with CAT ..." # Run CAT CAT \ @@ -31,8 +32,13 @@ process CLASSIFY_CONTIGS_CAT { --out_prefix "!{meta.id}.CAT-Classification" \ --nproc !{task.cpus} + msg "INFO: Completed classification of each !{meta.id} contig with CAT" + # Verify output files + msg "INFO: Creating QC summary file of CAT output files for !{meta.id} ..." + CREATE_SUMMARY=true + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}.CAT_Output_File.tsv" for file in !{meta.id}.CAT-Classification.ORF2LCA.txt !{meta.id}.CAT-Classification.contig2classification.txt; do cat_file_type=$(echo "${file}" | cut -d '.' 
-f 3) if verify_minimum_file_size "${file}" 'CAT Output File' "!{params.min_filesize_cat_output}"; then @@ -43,10 +49,13 @@ process CLASSIFY_CONTIGS_CAT { fi done + msg "INFO: Completed QC summary file of CAT output files for !{meta.id}" + # Add taxonomic names to the CAT output if the GTDB database is not used if [[ "${CREATE_SUMMARY}" ]] && \ [[ ! $(grep "__" *.txt) ]]; then - msg "INFO: Adding names to CAT ORF2LCA output file" + msg "INFO: Adding names to CAT ORF2LCA output file for !{meta.id} ..." + CAT \ add_names \ --only_official \ @@ -54,7 +63,8 @@ process CLASSIFY_CONTIGS_CAT { --output_file !{meta.id}.CAT-Classification.ORF2LCA.names.tsv \ --taxonomy_folder tax - msg "INFO: Adding names to CAT contig2classification output file" + msg "INFO: Adding names to CAT contig2classification output file for !{meta.id} ..." + CAT \ add_names \ --only_official \ @@ -62,21 +72,51 @@ process CLASSIFY_CONTIGS_CAT { --output_file !{meta.id}.CAT-Classification.names.tsv \ --taxonomy_folder tax - msg "INFO: Creating CAT summary file" + msg "INFO: Creating CAT summary file for !{meta.id} ..." CAT \ summarise \ --input_file !{meta.id}.CAT-Classification.names.tsv \ --output_file !{meta.id}.CAT-Classification.names.summary.tsv \ --contigs_fasta "!{assembly}" + + # Custom simplified TSV reports + msg "INFO: Creating custom simpler CAT summary file for !{meta.id} ..." + grep '^# rank' !{meta.id}.CAT-Classification.names.summary.tsv \ + | sed "s/^# /Sample_name\t/1;s/ /_/g" \ + > Contigs.header_line.tsv + grep -v -e "no support" -e "^#" !{meta.id}.CAT-Classification.names.summary.tsv \ + | awk -v var=!{meta.id} '{print var "\t" $0}' \ + > Contigs.only-supported-data.tsv + cat Contigs.header_line.tsv \ + Contigs.only-supported-data.tsv \ + > !{meta.id}.Contigs.tsv + + grep '^# ORF' !{meta.id}.CAT-Classification.ORF2LCA.names.tsv \ + | sed "s/^# /Sample_name\t/1;s/ /_/g" \ + | awk -v var=!{meta.id} '{print var "\t" $0}' \ + > ORF.header_line.unique-lineages.tsv + grep -v -e $'no support\tno support' -e "^#" !{meta.id}.CAT-Classification.ORF2LCA.names.tsv \ + | sort -u -k3,3 | sed "s/ /_/g" > ORF.only-supported-data.unique-lineages.tsv + cat ORF.header_line.unique-lineages.tsv \ + ORF.only-supported-data.unique-lineages.tsv \ + > !{meta.id}.unique-lineages.ORF.tsv + + msg "INFO: Completed CAT reports for !{meta.id}" + else + msg "WARN: Skipping addition of names to CAT ORF2LCA output file for !{meta.id} ..." 
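The simplified Contigs/ORF reports built just above rewrite CAT's commented header into an underscore-joined TSV header prefixed with a Sample_name column, and prepend the sample name to every supported data row. A small sketch using a hypothetical "# rank ..." header line and data row (the real column names come from CAT summarise output):

    printf '# rank\tnumber of contigs\n' | sed 's/^# /Sample_name\t/1;s/ /_/g'
    # -> Sample_name<TAB>rank<TAB>number_of_contigs
    printf 'genus\t42\n' | awk -v var="sample1" '{print var "\t" $0}'
    # -> sample1<TAB>genus<TAB>42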
+ # Avoid "no output file found" error if name/summary tsv files are not created cp "!{meta.id}.CAT-Classification.ORF2LCA.txt" "!{meta.id}.CAT-Classification.ORF2LCA.tsv" cp "!{meta.id}.CAT-Classification.contig2classification.txt" "!{meta.id}.CAT-Classification.contig2classification.tsv" + touch !{meta.id}.Contigs.tsv fi # Compress the bulky verbose logfile for compact storage gzip -9f !{meta.id}.CAT-Classification.log + msg "INFO: Completed CAT classification process for !{meta.id}" + # Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": diff --git a/modules/local/count_total_bp_input_reads_seqkit/main.nf b/modules/local/count_total_bp_input_reads_seqkit/main.nf new file mode 100644 index 00000000..9f28a253 --- /dev/null +++ b/modules/local/count_total_bp_input_reads_seqkit/main.nf @@ -0,0 +1,63 @@ +process COUNT_TOTAL_BP_INPUT_READS_SEQKIT { + + tag { "${meta.id}" } + container "staphb/seqkit@sha256:8eb09a52ae932f7c25cfbb8db0df7110567087a187c7e90d46f499962d1c82c9" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("${meta.id}.input_total_bp.txt"), emit: input_total_bp + path(".command.{out,err}") + path("versions.yml") , emit: versions + + shell: + total_bp = 0 + ''' + source bash_functions.sh + + # Calculate total bp input for R1 and R2 FastQ file with Phred 30+ + # NOTE: specified quality to trim param depends on user-supplied read + # trimmer selected (e.g., Fastp or Trimmomatic) and each have + # different variables for minimum quality score to use: + # - Fastp = params.fastp_window_mean_quality + # - Trimmomatic = params.trimmomatic_required_quality + # but we'll just use Phred30 cutoff to be conservative for the + # downsampling step here (`--min-qual 30`). + + msg "INFO: Calculating !{meta.id} basepairs above Phred 30 with SeqKit for subsampling calculations..." + + seqkit seq \ + --min-qual 30 \ + !{reads[0]} !{reads[1]} \ + | \ + seqkit stats \ + --tabular \ + > seqkit-seq-stats.!{meta.id}.stdout.log + + msg "INFO: Calculated !{meta.id} basepairs above Phred 30 with SeqKit" + + # Extract just the total bp count of the R1 input FastQ file, then + # double it to estimate total R1 and R2 input + if [ -s seqkit-seq-stats.!{meta.id}.stdout.log ] ; then + total_bp=$(tail -n 1 seqkit-seq-stats.!{meta.id}.stdout.log | awk '{printf $5}') + if ! 
[[ $total_bp =~ ^[0-9]+$ ]]; then + msg "ERROR: total bp = $total_bp" >&2 + msg "ERROR: total bp size not counted with seqkit" >&2 + exit 1 + else + echo -n "${total_bp}" > "!{meta.id}.input_total_bp.txt" + msg "INFO: found ${total_bp}bp for !{meta.id}" + fi + else + msg "ERROR: nucleotide count output logfile by seqkit is empty" >&2 + exit 1 + fi + + # Get process version information + cat <<-END_VERSIONS > versions.yml + "!{task.process}": + seqkit: $(seqkit 2>&1 | grep "^Version: " | sed 's/^Version: //1') + END_VERSIONS + ''' +} diff --git a/modules/local/count_total_bp_input_reads_seqtk/main.nf b/modules/local/count_total_bp_input_reads_seqtk/main.nf index f088507f..75bc9068 100644 --- a/modules/local/count_total_bp_input_reads_seqtk/main.nf +++ b/modules/local/count_total_bp_input_reads_seqtk/main.nf @@ -1,7 +1,7 @@ process COUNT_TOTAL_BP_INPUT_READS_SEQTK { tag { "${meta.id}" } - container "gregorysprenger/seqtk@sha256:756bff7222c384d358cb22ecbbae443e112b296503cb0e1a6baf9cf80545ae20" + container "staphb/seqtk@sha256:82797114adb664ba939b4f6dfcb822483a4af827def4288e5207be559055f2cc" input: tuple val(meta), path(reads) @@ -17,30 +17,40 @@ process COUNT_TOTAL_BP_INPUT_READS_SEQTK { source bash_functions.sh # Calculate total bp input for R1 FastQ file + # NOTE: specified quality to trim param depends on user-supplied read + # trimmer selected (e.g., Fastp or Trimmomatic) and each have + # different variables for minimum quality score to use: + # - Fastp = params.fastp_window_mean_quality + # - Trimmomatic = params.trimmomatic_required_quality + # but we'll just use Phred30 cutoff to be conservative for the + # downsampling step here (`-q 30`). + + msg "INFO: Calculating !{meta.id} basepairs above Phred 30 with Seqtk for subsampling calculations..." + seqtk fqchk \ - !{reads[0]} \ - 1> seqtk-fqchk.!{meta.id}.stdout.log \ - 2> seqtk-fqchk.!{meta.id}.stderr.log + -q 30 \ + !{reads[0]} !{reads[1]} \ + > seqtk-fqchk.!{meta.id}.stdout.log \ + + msg "INFO: Calculated !{meta.id} basepairs above Phred 30 with Seqtk" # Extract just the total bp count of the R1 input FastQ file, then # double it to estimate total R1 and R2 input if [ -s seqtk-fqchk.!{meta.id}.stdout.log ] ; then - R1_total_bp=$(grep '^ALL' seqtk-fqchk.!{meta.id}.stdout.log | awk '{print $2}') - if ! [[ $R1_total_bp =~ ^[0-9]+$ ]]; then - msg "ERROR: R1 total bp size not counted with seqtk fqchk" >&2 + total_bp=$(grep '^ALL' seqtk-fqchk.!{meta.id}.stdout.log | awk '{printf $2}') + if ! 
[[ $total_bp =~ ^[0-9]+$ ]]; then + msg "ERROR: total bp = $total_bp" >&2 + msg "ERROR: total bp size not counted with seqtk fqchk" >&2 exit 1 else - # Double the R1 and skip R2 count; imperfect but faster - # and close enough for an estimation - R1R2_total_bp=$(( 2 * ${R1_total_bp} )) + echo -n "${total_bp}" > "!{meta.id}.input_total_bp.txt" + msg "INFO: found ${total_bp}bp for !{meta.id}" fi else msg "ERROR: nucleotide count output logfile by seqtk fqchk is empty" >&2 exit 1 fi - echo -n "${R1R2_total_bp}" > "!{meta.id}.input_total_bp.txt" - # Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": diff --git a/modules/local/create_excel_run_summary_python/main.nf b/modules/local/create_excel_run_summary_python/main.nf index faf13979..34b97e04 100644 --- a/modules/local/create_excel_run_summary_python/main.nf +++ b/modules/local/create_excel_run_summary_python/main.nf @@ -4,38 +4,34 @@ process CREATE_EXCEL_RUN_SUMMARY_PYTHON { input: path(list_of_files) + path(tab_colors) + val(wf_version) output: - path("Summary-Report_*.xlsx"), emit: summary + path("Summary-Report.xlsx"), emit: summary path(".command.{out,err}") - path("versions.yml") , emit: versions + path("versions.yml") , emit: versions shell: ''' - python3 <<-END_PYTHON - import glob - import datetime - import pandas as pd + source bash_functions.sh - def create_summary_workbook(output_file, tsv_file): - sheet_name = tsv_file.split(".")[1] - data = pd.read_csv(tsv_file, sep="\t") - data.to_excel(output_file, sheet_name=sheet_name, index=False) + msg "INFO: Converting summary TSV files: !{list_of_files} into single XLSX workbook..." - date = datetime.datetime.now() - date_format = date.strftime("%Y-%b-%d_%H-%M-%S") + # Default outfile is "Summary-Report.xlsx" in python script + tsv_to_excel.py \ + !{list_of_files} \ + --color-dict !{tab_colors} \ + -t "wf-paired-end-illumina-assembly_v!{wf_version}" \ + -s "genome_assembly" \ + -c "bacterial-genomics" - list_of_files = glob.glob("*.tsv") - - with pd.ExcelWriter(f"Summary-Report_{date_format}.xlsx") as output_file: - for file in list_of_files: - create_summary_workbook(output_file, file) - - END_PYTHON + msg "INFO: Converted summary TSV files into single XLSX workbook." 
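tsv_to_excel.py (not shown in this diff) receives the worksheet tab colors through --color-dict, which points at the xlsx_tab_color_key.txt resource added below: one "Tab_name=R, G, B" entry per line, with "#" marking comment lines. A hypothetical shell-only reader for that key format, purely to make the syntax concrete (the real parsing lives in the Python script):

    while IFS='=' read -r tab_name rgb; do
      [[ -z "${tab_name}" || "${tab_name}" == "#"* ]] && continue   # skip blanks and comments
      echo "worksheet '${tab_name}' -> RGB(${rgb// /})"
    done < xlsx_tab_color_key.txt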
# Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": + pandas: $(python3 -c "import pandas as pd; print(pd.__version__)") python: $(python3 --version 2>&1 | awk '{print $2}') ubuntu: $(awk -F ' ' '{print $2, $3}' /etc/issue | tr -d '\\n') END_VERSIONS diff --git a/modules/local/create_excel_run_summary_python/resources/xlsx_tab_color_key.txt b/modules/local/create_excel_run_summary_python/resources/xlsx_tab_color_key.txt new file mode 100644 index 00000000..b019665c --- /dev/null +++ b/modules/local/create_excel_run_summary_python/resources/xlsx_tab_color_key.txt @@ -0,0 +1,74 @@ +# NOTE: Colors should follow workflow order in ROYGBIV then greyscale +# +# Validate Inputs (=255, 0, 0 # Red) +Input_Checksums=255, 0, 0 +Input_Reads.Metrics=255, 0, 0 +# +# Human Removal (=255, 165, 0 # Orange) +SRA_Human_Scrubbed=255, 165, 0 +SRA_Scrubbed_Reads.Metrics=255, 165, 0 +BBTools_Repair_Removal=255, 165, 0 +BBTools_Repaired_Reads.Metrics=255, 165, 0 +Hostile_Human_Removed=255, 165, 0 +Hostile_Removed_Reads.Metrics=255, 165, 0 +# +# Downsample (=165, 42, 42 # Brown) +Downsampled_Reads.Metrics=165, 42, 42 +# +# Clean Reads (=255, 255, 0 # Yellow) +PhiX_Removal=255, 255, 0 +PhiX_Removed_Reads.Metrics=255, 255, 0 +Adapter_and_QC_Trim=255, 255, 0 +Adapter_QC_Trim_Reads.Metrics=255, 255, 0 +Clean_and_Overlapped=255, 255, 0 +Clean_Reads.Metrics=255, 255, 0 +Clean_Reads_Checksums=255, 255, 0 +# +# Genome Assembly (=154, 205, 50 # YellowGreen) -- replaced Chartreuse +Assembly_Checksums=154, 205, 50 +# +# Depth of Coverage (=0, 128, 0 # Dark Green) +Assembly_Depth=0, 128, 0 +Clean_Reads_Aligned=0, 128, 0 +# +# Genotyping (=0, 255, 0 # Green) +MLST=0, 255, 0 +# +# Genome Annotation (=0, 255, 255 # Cyan) +Annotation_Checksums=0, 255, 255 +# +# 16S rRNA Gene Classification (=0, 0, 255 # Blue) +16S_Species_BLAST=0, 0, 255 +16S_Genus_RDP=0, 0, 255 +# +# Taxonomic Classification of Reads (=255, 0, 255 # Magenta) +Kraken=255, 0, 255 +Kraken2=255, 0, 255 +# +# Assembly Assessment (=128, 0, 128 # Purple) +Assembly_Metrics=128, 0, 128 +Assembly_Completeness=128, 0, 128 +BUSCO_Completeness=128, 0, 128 +Contigs_Classified=128, 0, 128 +Assemblies_Classified=128, 0, 128 +# +# Misc (=128, 128, 128 # Gray) +QC_File_Checks=128, 128, 128 +Output_Checksums=128, 128, 128 +# +# +# +# =255, 0, 0 # Red +# =255, 165, 0 # Orange +# =165, 42, 42 # Brown +# =255, 255, 0 # Yellow +# =127, 255, 0 # Chartreuse +# =0, 128, 0 # Dark Green +# =0, 255, 0 # Green +# =0, 255, 255 # Cyan +# =0, 0, 255 # Blue +# =255, 0, 255 # Magenta +# =128, 0, 128 # Purple +# =128, 128, 128 # Gray +# =0, 0, 0 # Black +# =255, 255, 255 # White \ No newline at end of file diff --git a/modules/local/extract_16S_barrnap/main.nf b/modules/local/extract_16S_barrnap/main.nf index 77b2afc3..cb59843b 100644 --- a/modules/local/extract_16S_barrnap/main.nf +++ b/modules/local/extract_16S_barrnap/main.nf @@ -40,13 +40,13 @@ process EXTRACT_16S_BARRNAP { fi fi - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}-!{meta.assembler}.SSU_Extracted_File.tsv" - if verify_minimum_file_size "16S.!{meta.id}-!{meta.assembler}.fa" 'SSU Extracted File' "!{params.min_filesize_extracted_ssu_file}"; then - echo -e "!{meta.id}-!{meta.assembler}\tSSU Extracted File\tPASS" \ - >> "!{meta.id}-!{meta.assembler}.SSU_Extracted_File.tsv" + echo -e "Sample _name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}-!{meta.assembler}.SSU_Extracted_File.tsv" + if verify_minimum_file_size "16S.!{meta.id}-!{meta.assembler}.fa" 'SSU Extracted FastA File' 
"!{params.min_filesize_extracted_ssu_file}"; then + echo -e "!{meta.id}\tSSU Extracted FastA File\tPASS" \ + >> "!{meta.id}.SSU_Extracted_File.tsv" else - echo -e "!{meta.id}-!{meta.assembler}\tSSU Extracted File\tFAIL" \ - >> "!{meta.id}-!{meta.assembler}.SSU_Extracted_File.tsv" + echo -e "!{meta.id}\tSSU Extracted FastA File\tFAIL" \ + >> "!{meta.id}.SSU_Extracted_File.tsv" fi awk -v awk_var="!{meta.id}" \ @@ -58,12 +58,12 @@ process EXTRACT_16S_BARRNAP { mv -f "!{meta.id}-!{meta.assembler}.fa-renamed" \ "16S.!{meta.id}-!{meta.assembler}.fa" - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}-!{meta.assembler}.SSU_Renamed_File.tsv" - if verify_minimum_file_size "16S.!{meta.id}-!{meta.assembler}.fa" 'SSU Renamed File' "!{params.min_filesize_renamed_ssu_file}"; then - echo -e "!{meta.id}-!{meta.assembler}\tSSU Renamed File\tPASS" \ - >> "!{meta.id}-!{meta.assembler}.SSU_Renamed_File.tsv" + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}-!{meta.assembler}.SSU_Renamed_File.tsv" + if verify_minimum_file_size "16S.!{meta.id}-!{meta.assembler}.fa" 'SSU Renamed FastA File' "!{params.min_filesize_renamed_ssu_file}"; then + echo -e "!{meta.id}\tSSU Renamed FastA File\tPASS" \ + >> "!{meta.id}.SSU_Renamed_File.tsv" else - echo -e "!{meta.id}-!{meta.assembler}\tSSU Renamed File\tFAIL" \ + echo -e "!{meta.id}\tSSU Renamed FastA File\tFAIL" \ >> "!{meta.id}-!{meta.assembler}.SSU_Renamed_File.tsv" fi diff --git a/modules/local/extract_read_alignment_depths_bedtools/main.nf b/modules/local/extract_read_alignment_depths_bedtools/main.nf index efa02927..7e3c8e20 100644 --- a/modules/local/extract_read_alignment_depths_bedtools/main.nf +++ b/modules/local/extract_read_alignment_depths_bedtools/main.nf @@ -1,13 +1,13 @@ process EXTRACT_READ_ALIGNMENT_DEPTHS_BEDTOOLS { tag { "${meta.id}-${meta.assembler}" } - container "snads/bedtools@sha256:9b80fb5c5ef1b6f4a4a211d8739fa3fe107da34d1fb6609d6b70ddc7afdce12c" + container "staphb/bedtools@sha256:52d4a9359d3adaa6ac8f8ebbdc5596bae791a738974e9a85f72892486a43336e" input: tuple val(meta), path(bam_files) output: - tuple val(meta), path("${meta.id}-${meta.assembler}.CleanedReads-AlnStats.tsv"), emit: summary + tuple val(meta), path("${meta.id}-${meta.assembler}.Clean_Reads-AlnStats.tsv"), emit: summary path(".command.{out,err}") path("versions.yml") , emit: versions @@ -15,25 +15,70 @@ process EXTRACT_READ_ALIGNMENT_DEPTHS_BEDTOOLS { ''' source bash_functions.sh - # Calculate and report coverage of paired-reads and singleton reads separately - msg "INFO: Extracting read alignment depths using bedtools" + # Calculate and report coverage of paired-reads and singleton reads separately, in addition to combined - single_cov='0 bp TooFewToMap Singleton Reads (0.0x)\t' + SINGLE_STATS=(0 0 0 0) + echo -n '' > single_coverage.tsv if [ -s !{bam_files[1]} ]; then - single_cov=$(bedtools genomecov -d -split -ibam !{bam_files[1]} |\ - awk '{sum+=$3} END{print sum " bp Singleton Reads Mapped (" sum/NR "x)\t"}') + msg "INFO: Extracting singleton read alignment depths using bedtools from !{bam_files[1]}" + bedtools genomecov -d -split -ibam !{bam_files[1]} > single_coverage.tsv + + total_sites=$(wc -l < single_coverage.tsv) + sum_single_coverage=$(awk '{sum+=$3} END{print sum}' single_coverage.tsv) + mean_single_coverage=$(awk -v sum="$sum_single_coverage" -v total="$total_sites" 'BEGIN{printf "%.1f", sum/total}') + stdev_single_coverage=$(awk -v mean="$mean_single_coverage" '{sum+=($3-mean)^2} END{printf "%.1f", sqrt(sum/NR)}' single_coverage.tsv) + 
+ SINGLE_STATS=($total_sites $sum_single_coverage $mean_single_coverage $stdev_single_coverage) fi - cov_info=$(bedtools genomecov -d -split -ibam "!{bam_files[0]}" |\ - awk -v OFS='\t' -v SEcov="${single_cov}" 'BEGIN{sum=0} {sum+=$3} END{ - print sum " bp Paired Reads Mapped (" sum/NR "x)\t" SEcov NR " bp Genome"}') + msg "INFO: Extracting paired-end read alignment depths using bedtools from !{bam_files[0]}" + bedtools genomecov -d -split -ibam !{bam_files[0]} > paired_coverage.tsv + + total_sites=$(wc -l < paired_coverage.tsv) + sum_paired_coverage=$(awk '{sum+=$3} END{print sum}' paired_coverage.tsv) + mean_paired_coverage=$(awk -v sum="$sum_paired_coverage" -v total="$total_sites" 'BEGIN {printf "%.1f", sum/total}') + stdev_paired_coverage=$(awk -v mean="$mean_paired_coverage" '{sum+=($3-mean)^2} END{printf "%.1f", sqrt(sum/NR)}' paired_coverage.tsv) + + PAIRED_STATS=($total_sites $sum_paired_coverage $mean_paired_coverage $stdev_paired_coverage) + + # Combine coverage values to report total depth of coverage statistics + sum_total_coverage=$(awk '{sum+=$3} END{print sum}' {paired,single}_coverage.tsv) + mean_total_coverage=$(awk -v sum="$sum_paired_coverage" -v total="$total_sites" 'BEGIN {printf "%.1f", sum/total}') + stdev_total_coverage=$(awk -v mean="$mean_paired_coverage" '{sum+=($3-mean)^2} END{printf "%.1f", sqrt(sum/NR)}' {paired,single}_coverage.tsv) + + TOTAL_STATS=($total_sites $sum_total_coverage $mean_total_coverage $stdev_total_coverage) + + COVERAGE_DATA=( + "!{meta.id}" + "${PAIRED_STATS[1]}" + "${PAIRED_STATS[2]}" + "${PAIRED_STATS[3]}" + "${SINGLE_STATS[1]}" + "${SINGLE_STATS[2]}" + "${SINGLE_STATS[3]}" + "${TOTAL_STATS[1]}" + "${TOTAL_STATS[2]}" + "${TOTAL_STATS[3]}" + "${TOTAL_STATS[0]}" + ) + COVERAGE_DATA=$(printf "%s\t" "${COVERAGE_DATA[@]}" | sed 's/\t$//') - echo -e "!{meta.id}\t${cov_info}" \ - > "!{meta.id}-!{meta.assembler}.CleanedReads-AlnStats.tsv" + SUMMARY_HEADER=( + "Sample_name" + "Total_mapped_paired_reads_(bp)" + "Mean_coverage_of_paired_reads_(x)" + "Stdev_coverage_of_paired_reads_(x)" + "Total_mapped_singleton_reads_(bp)" + "Mean_coverage_of_singleton_reads_(x)" + "Stdev_coverage_of_singleton_reads_(x)" + "Total_mapped_paired_and_singleton_reads_(bp)" + "Mean_coverage_of_paired_and_singleton_reads_(x)" + "Stdev_coverage_of_paired_and_singleton_reads_(x)" + "Genome_assembly_length_(bp)" + ) + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//') - sed -i \ - '1i Sample name\tCoverage of paired reads\tCoverage of singleton reads\tGenome size' \ - "!{meta.id}-!{meta.assembler}.CleanedReads-AlnStats.tsv" + echo -e "${SUMMARY_HEADER}\n${COVERAGE_DATA}" > "!{meta.id}-!{meta.assembler}.Clean_Reads-AlnStats.tsv" # Get process version information cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/filter_contigs_biopython/main.nf b/modules/local/filter_contigs_biopython/main.nf index a50e9e7a..2a5bcd2d 100644 --- a/modules/local/filter_contigs_biopython/main.nf +++ b/modules/local/filter_contigs_biopython/main.nf @@ -8,34 +8,43 @@ process FILTER_CONTIGS_BIOPYTHON { output: tuple val(meta), path("${meta.id}-${meta.assembler}.uncorrected.fna"), emit: uncorrected_contigs + path("${meta.id}-${meta.assembler}.discarded-contigs.fa.gz") , emit: discarded_contigs + path("${meta.id}-${meta.assembler}.filter-contigs-stats.txt") , emit: filter_stats path(".command.{out,err}") path("versions.yml") , emit: versions shell: - gcskew = params.filter_contigs_gcskew ? "" : "-g" - keep_low_complexity = params.filter_contigs_keep_low_complexity ? 
"" : "-m" + gcskew = params.filter_contigs_gcskew ? "" : "--gcskew" + keep_low_complexity = params.filter_contigs_keep_low_complexity ? "" : "--complex" no_sort = params.filter_contigs_no_sort ? "--no-sort" : "" - if (params.filter_contigs_discard_file) { - discard_file = "-d ${params.filter_contigs_discard_file}" - } else { - discard_file = "" - } ''' source bash_functions.sh + msg "INFO: Filtering contigs from !{contigs} ..." + # Remove junk contigs filter.contigs.py \ - -i !{contigs} \ - -b "!{meta.id}-!{meta.assembler}" \ - -o "!{meta.id}-!{meta.assembler}.uncorrected.fna" \ - -l !{params.filter_contigs_length} \ - -c !{params.filter_contigs_coverage} \ + --infile !{contigs} \ + --baseheader "!{meta.id}-!{meta.assembler}" \ + --outfile "!{meta.id}-!{meta.assembler}.uncorrected.fna" \ + --len !{params.filter_contigs_length} \ + --cov !{params.filter_contigs_coverage} \ --deflines !{params.filter_contigs_deflines} \ + --discarded "!{meta.id}-!{meta.assembler}.discarded-contigs.fa" \ !{no_sort} \ !{gcskew} \ - !{discard_file} \ - !{keep_low_complexity} + !{keep_low_complexity} \ + 2> "!{meta.id}-!{meta.assembler}.filter-contigs-stats.txt" + + msg "INFO: Completed contig filtering for !{meta.id}" + + if [ -s "!{meta.id}-!{meta.assembler}.discarded-contigs.fa" ]; then + gzip -9f "!{meta.id}-!{meta.assembler}.discarded-contigs.fa" + msg "INFO: discarded contigs saved as !{meta.id}-!{meta.assembler}.discarded-contigs.fa.gz" + else + msg "INFO: no contigs were discarded, therefore not storing empty !{meta.id}-!{meta.assembler}.discarded-contigs.fa.gz file" + fi # Get process version information cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/filter_contigs_biopython/params.config b/modules/local/filter_contigs_biopython/params.config index 900d99b7..05976c15 100644 --- a/modules/local/filter_contigs_biopython/params.config +++ b/modules/local/filter_contigs_biopython/params.config @@ -5,6 +5,5 @@ params { filter_contigs_gcskew = false filter_contigs_no_sort = false filter_contigs_keep_low_complexity = false - filter_contigs_discard_file = null filter_contigs_deflines = "rename_retain" } diff --git a/modules/local/infile_handling_unix/main.nf b/modules/local/infile_handling_unix/main.nf index ba774b9c..f86b4640 100644 --- a/modules/local/infile_handling_unix/main.nf +++ b/modules/local/infile_handling_unix/main.nf @@ -7,10 +7,11 @@ process INFILE_HANDLING_UNIX { tuple val(meta), path(reads) output: - tuple val(meta), path("${meta.id}.Raw_Initial_FastQ_File.tsv"), emit: qc_filecheck - tuple val(meta), path(reads) , emit: input + tuple val(meta), path("${meta.id}.Raw_Initial_FastQ_Size_of_File.tsv"), emit: qc_filecheck + tuple val(meta), path(reads) , emit: input + path("${meta.id}.Input_FastQ.SHA512-checksums.tsv") , emit: checksums path(".command.{out,err}") - path("versions.yml") , emit: versions + path("versions.yml") , emit: versions shell: ''' @@ -19,37 +20,42 @@ process INFILE_HANDLING_UNIX { msg "INFO: Read 1: !{reads[0]}" msg "INFO: Read 2: !{reads[1]}" - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}.Raw_Initial_FastQ_File.tsv" + ### Evaluate Filesize of each Input FastQ file ### + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}.Raw_Initial_FastQ_Size_of_File.tsv" i=1 for fastq in !{reads}; do - # Check if input FastQ file is corrupted - if [[ ${fastq} =~ .gz ]]; then - gunzip -t ${fastq} 2>/dev/null || \ - $( - msg "ERROR: Input file ${fastq} is corrupted and assembly cannot be performed!" 
>&2 \ - && exit 1 - ) - elif [[ ${fastq} =~ .fastq ]] || [[ ${fastq} =~ .fq ]]; then - cat ${fastq} > /dev/null 2>&1 || \ - $( - msg "ERROR: Input file ${fastq} is corrupted and assembly cannot be performed!" >&2 \ - && exit 1 - ) - fi - # Check if input FastQ file meets minimum file size requirement if verify_minimum_file_size "${fastq}" 'Raw Initial FastQ Files' "!{params.min_filesize_fastq_input}"; then - echo -e "!{meta.id}\tRaw Initial FastQ (R${i}) File\tPASS" >> "!{meta.id}.Raw_Initial_FastQ_File.tsv" + echo -e "!{meta.id}\tRaw Initial FastQ (R${i}) Filesize\tPASS" >> "!{meta.id}.Raw_Initial_FastQ_Size_of_File.tsv" else - echo -e "!{meta.id}\tRaw Initial FastQ (R${i}) File\tFAIL" >> "!{meta.id}.Raw_Initial_FastQ_File.tsv" + msg "ERROR: R${i} file for !{meta.id}: ${fastq} is not at least !{params.min_filesize_fastq_input} in size" >&2 + echo -e "!{meta.id}\tRaw Initial FastQ (R${i}) Filesize\tFAIL" >> "!{meta.id}.Raw_Initial_FastQ_Size_of_File.tsv" fi ((i++)) done + ### Calculate SHA-512 Checksums of each Input FastQ file ### + SUMMARY_HEADER=( + "Sample_name" + "Checksum_(SHA-512)" + "File" + ) + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//') + + echo "${SUMMARY_HEADER}" > "!{meta.id}.Input_FastQ.SHA512-checksums.tsv" + + find . -type l -regex ".*\\\\(\\.fq\\\\|\\.fq\\\\.gz\\\\|\\.fastq\\\\|\\.fastq\\\\.gz\\)$" | while read f; do + f="$(readlink -f ${f})" + echo -ne "!{meta.id}\t" >> "!{meta.id}.Input_FastQ.SHA512-checksums.tsv" + awk 'NR%2==0' "${f}" | paste - - | sort -k1,1 | sha512sum | awk '{print $1 "\t" "'"$f"'"}' + done >> "!{meta.id}.Input_FastQ.SHA512-checksums.tsv" + # Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": + find: $(find --version | grep ^find | sed 's/find //1') + sha512sum: $(sha512sum --version | grep ^sha512sum | sed 's/sha512sum //1') ubuntu: $(awk -F ' ' '{print $2,$3}' /etc/issue | tr -d '\\n') END_VERSIONS ''' diff --git a/modules/local/kraken1_db_preparation_unix/main.nf b/modules/local/kraken1_db_preparation_unix/main.nf index 6a7c95a2..53b9b266 100644 --- a/modules/local/kraken1_db_preparation_unix/main.nf +++ b/modules/local/kraken1_db_preparation_unix/main.nf @@ -26,10 +26,30 @@ process KRAKEN1_DB_PREPARATION_UNIX { # Verify all 4 files are found if [[ $(find database/ -type f | wc -l) != 4 ]]; then - msg "ERROR: Missing one of the following files: `database.{idx,kdb}, {names,nodes}.dmp`." + msg "ERROR: Missing one of the following files: `database.{idx,kdb}, {names,nodes}.dmp`." >&2 exit 1 fi + # ### Calculate SHA-512 Checksum Kraken inspect.txt file ### + # SUMMARY_HEADER=( + # "Sample_name" + # "Checksum_(SHA-512)" + # "File" + # ) + # SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//') + + # echo "${SUMMARY_HEADER}" > "!{meta.id}.Kraken_Database.SHA512-checksums.tsv" + + # if [ -s "!{database}/inspect.txt" ]; then + # msg "INFO: Found pre-calculated inspect.txt Kraken db information" + # else + # msg "INFO: Creating inspect.txt Kraken db information..." + # kraken2-inspect --db "!{database}" --threads "!{task.cpus}" > "!{database}/inspect.txt" + # msg "INFO: Creating inspect.txt Kraken db information..." 
+ # fi + # CHECKSUM=$(sha512sum !{database}/inspect.txt | awk '{print $1}') + # echo -e "!{meta.id}\t${CHECKSUM}\t!{database}/inspect.txt" >> "!{meta.id}.Kraken_Database.SHA512-checksums.tsv" + # Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": diff --git a/modules/local/kraken2_db_preparation_unix/main.nf b/modules/local/kraken2_db_preparation_unix/main.nf index 9dd1bc61..629ce2fa 100644 --- a/modules/local/kraken2_db_preparation_unix/main.nf +++ b/modules/local/kraken2_db_preparation_unix/main.nf @@ -20,6 +20,26 @@ process KRAKEN2_DB_PREPARATION_UNIX { mkdir database mv `find db_tmp/ -name "*.k2d"` database/ + # ### Calculate SHA-512 Checksum Kraken2 inspect.txt file ### + # SUMMARY_HEADER=( + # "Sample_name" + # "Checksum_(SHA-512)" + # "File" + # ) + # SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//') + + # echo "${SUMMARY_HEADER}" > "!{meta.id}.Kraken2_Database.SHA512-checksums.tsv" + + # if [ -s "!{database}/inspect.txt" ]; then + # msg "INFO: Found pre-calculated inspect.txt Kraken2 db information" + # else + # msg "INFO: Creating inspect.txt Kraken2 db information..." + # kraken2-inspect --db "!{database}" --threads "!{task.cpus}" > "!{database}/inspect.txt" + # msg "INFO: Creating inspect.txt Kraken2 db information..." + # fi + # CHECKSUM=$(sha512sum !{database}/inspect.txt | awk '{print $1}') + # echo -e "!{meta.id}\t${CHECKSUM}\t!{database}/inspect.txt" >> "!{meta.id}.Kraken2_Database.SHA512-checksums.tsv" + # Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": diff --git a/modules/local/map_contigs_bwa/main.nf b/modules/local/map_contigs_bwa/main.nf index e2d45d6a..0e91bea2 100644 --- a/modules/local/map_contigs_bwa/main.nf +++ b/modules/local/map_contigs_bwa/main.nf @@ -11,6 +11,7 @@ process MAP_CONTIGS_BWA { tuple val(meta), path("${meta.id}-${meta.assembler}.{Filtered,Binary,Final}*_File.tsv"), emit: qc_filecheck tuple val(meta), path("${meta.id}-${meta.assembler}.{paired,single}.bam") , emit: bam tuple val(meta), path("${meta.id}-${meta.assembler}.fna") , emit: assembly + path("${meta.id}.Assembly_FastA.SHA512-checksums.tsv") , emit: checksums path(".command.{out,err}") path("versions.yml") , emit: versions @@ -19,17 +20,21 @@ process MAP_CONTIGS_BWA { source bash_functions.sh # Map SKESA contigs with cleaned PE reads - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Filtered_Assembly_File.tsv" - if verify_minimum_file_size "!{uncorrected_contigs}" 'Filtered Assembly File' "!{params.min_filesize_filtered_assembly}"; then - echo -e "!{meta.id}-!{meta.assembler}\tFiltered Assembly File\tPASS" \ + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Filtered_Assembly_File.tsv" + if verify_minimum_file_size "!{uncorrected_contigs}" 'Filtered Assembly FastA File' "!{params.min_filesize_filtered_assembly}"; then + echo -e "!{meta.id}\tFiltered Assembly FastA File\tPASS" \ >> "!{meta.id}-!{meta.assembler}.Filtered_Assembly_File.tsv" else - echo -e "!{meta.id}-!{meta.assembler}\tFiltered Assembly File\tFAIL" \ + echo -e "!{meta.id}\tFiltered Assembly FastA File\tFAIL" \ >> "!{meta.id}-!{meta.assembler}.Filtered_Assembly_File.tsv" fi bwa index !{uncorrected_contigs} + msg "INFO: Completed bwa index of !{uncorrected_contigs} FastA assembly file" + + msg "INFO: Cleaned paired-end read mapping of !{meta.id}..." 
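# NOTE: `-x intractg` selects bwa mem's intra-species contig preset (stricter
#       mismatch/gap penalties than the default), which suits mapping a sample's
#       own cleaned reads back onto its assembly; the alignments are piped into
#       `samtools sort` (the -o/--reference options below belong to the sort step),
#       so the paired BAM arrives coordinate-sorted for the index step that follows.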
+ bwa mem \ -v 2 \ -x intractg \ @@ -43,33 +48,41 @@ process MAP_CONTIGS_BWA { -o "!{meta.id}-!{meta.assembler}.paired.bam" \ --reference !{uncorrected_contigs} - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Binary_PE_Alignment_Map_File.tsv" - if verify_minimum_file_size "!{meta.id}-!{meta.assembler}.paired.bam" 'Binary PE Alignment Map File' "!{params.min_filesize_binary_pe_alignment}"; then - echo -e "!{meta.id}-!{meta.assembler}\tBinary PE Alignment Map File\tPASS" \ + msg "INFO: Completed paired read mapping of !{meta.id}" + + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Binary_PE_Alignment_Map_File.tsv" + if verify_minimum_file_size "!{meta.id}-!{meta.assembler}.paired.bam" 'Binary PE Alignment Map BAM File' "!{params.min_filesize_binary_pe_alignment}"; then + echo -e "!{meta.id}\tBinary PE Alignment Map BAM File\tPASS" \ >> "!{meta.id}-!{meta.assembler}.Binary_PE_Alignment_Map_File.tsv" else - echo -e "!{meta.id}-!{meta.assembler}\tBinary PE Alignment Map File\tFAIL" \ + echo -e "!{meta.id}\tBinary PE Alignment Map BAM File\tFAIL" \ >> "!{meta.id}-!{meta.assembler}.Binary_PE_Alignment_Map_File.tsv" fi samtools index "!{meta.id}-!{meta.assembler}.paired.bam" + msg "INFO: Completed samtools index of paired-end BAM alignment file for !{meta.id}" + cp -L "!{meta.id}-!{meta.assembler}.uncorrected.fna" "!{meta.id}-!{meta.assembler}.fna" - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Final_Corrected_Assembly_FastA_File.tsv" + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Final_Corrected_Assembly_FastA_File.tsv" if verify_minimum_file_size "!{meta.id}-!{meta.assembler}.fna" 'Final Corrected Assembly FastA File' "!{params.min_filesize_final_assembly}"; then - echo -e "!{meta.id}-!{meta.assembler}\tFinal Corrected Assembly FastA File\tPASS" \ + echo -e "!{meta.id}\tFinal Corrected Assembly FastA File\tPASS" \ >> "!{meta.id}-!{meta.assembler}.Final_Corrected_Assembly_FastA_File.tsv" else - echo -e "!{meta.id}-!{meta.assembler}\tFinal Corrected Assembly FastA File\tFAIL" \ + echo -e "!{meta.id}\tFinal Corrected Assembly FastA File\tFAIL" \ >> "!{meta.id}-!{meta.assembler}.Final_Corrected_Assembly_FastA_File.tsv" fi # Single read mapping if available for downstream depth of coverage calculations if [[ !{meta.id}_single.fq.gz ]]; then - msg "INFO: Single read mapping" + + msg "INFO: Single read mapping of !{meta.id}..." 
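# NOTE: the index built above covers only !{uncorrected_contigs}; the final
#       "!{meta.id}-!{meta.assembler}.fna" copy is re-indexed here so single-end
#       reads are mapped against the exact FastA emitted as the assembly output.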
+ bwa index "!{meta.id}-!{meta.assembler}.fna" + msg "INFO: Completed bwa index of !{meta.id}-!{meta.assembler}.fna FastA assembly file" + bwa mem \ -v 2 \ -x intractg \ @@ -83,23 +96,45 @@ process MAP_CONTIGS_BWA { -o "!{meta.id}-!{meta.assembler}.single.bam" \ --reference "!{meta.id}-!{meta.assembler}.fna" - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Binary_SE_Alignment_Map_File.tsv" - if verify_minimum_file_size "!{meta.id}-!{meta.assembler}.single.bam" 'Binary SE Alignment Map File' '!{params.min_filesize_binary_se_alignment}'; then - echo -e "!{meta.id}-!{meta.assembler}\tBinary SE Alignment Map File\tPASS" \ + msg "INFO: Completed single read mapping of !{meta.id}" + + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Binary_SE_Alignment_Map_File.tsv" + if verify_minimum_file_size "!{meta.id}-!{meta.assembler}.single.bam" 'Binary SE Alignment Map BAM File' '!{params.min_filesize_binary_se_alignment}'; then + echo -e "!{meta.id}\tBinary SE Alignment Map BAM File\tPASS" \ >> "!{meta.id}-!{meta.assembler}.Binary_SE_Alignment_Map_File.tsv" else - echo -e "!{meta.id}-!{meta.assembler}\tBinary SE Alignment Map File\tFAIL" \ + echo -e "!{meta.id}\tBinary SE Alignment Map BAM File\tFAIL" \ >> "!{meta.id}-!{meta.assembler}.Binary_SE_Alignment_Map_File.tsv" fi samtools index "!{meta.id}-!{meta.assembler}.single.bam" + + msg "INFO: Completed samtools index of single-end BAM alignment file for !{meta.id}" + fi + # Calculate checksum + FILE="!{meta.id}-!{meta.assembler}.fna" + CHECKSUM=$(awk '/^>/ {print substr($1, 1)} !/^>/ {print}' "${FILE}" | sha512sum | awk '{print $1}') + echo "${CHECKSUM}" | awk -v sample_id="!{meta.id}" -v file="${FILE}" ' + BEGIN { + # Print the header once + print "Sample_name\tChecksum_(SHA-512)\tFile" + } + { + # Print the data row once, using the CHECKSUM from input + print sample_id "\t" $1 "\t" file + }' \ + > "!{meta.id}.Assembly_FastA.SHA512-checksums.tsv" + + msg "INFO: Calculated checksum of FastA assembly file for !{meta.id}" + # Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": bwa: $(bwa 2>&1 | head -n 3 | tail -1 | awk 'NF>1{print $NF}') samtools: $(samtools --version | head -n 1 | awk 'NF>1{print $NF}') + sha512sum: $(sha512sum --version | grep ^sha512sum | sed 's/sha512sum //1') END_VERSIONS ''' } diff --git a/modules/local/map_contigs_bwa/params.config b/modules/local/map_contigs_bwa/params.config index e8fa9ada..6835ed1c 100644 --- a/modules/local/map_contigs_bwa/params.config +++ b/modules/local/map_contigs_bwa/params.config @@ -3,5 +3,5 @@ params { min_filesize_final_assembly = '1M' min_filesize_filtered_assembly = '1M' min_filesize_binary_se_alignment = '1k' - min_filesize_binary_pe_alignment = '25M' + min_filesize_binary_pe_alignment = '6M' } diff --git a/modules/local/mlst_mlst/main.nf b/modules/local/mlst_mlst/main.nf index e3e396d5..8ccef044 100644 --- a/modules/local/mlst_mlst/main.nf +++ b/modules/local/mlst_mlst/main.nf @@ -1,7 +1,7 @@ process MLST_MLST { tag { "${meta.id}-${meta.assembler}" } - container "gregorysprenger/mlst@sha256:69c8c8027474b8f361ef4a579df171702f3ed52f45e3fb388a41ccbf4542706f" + container "staphb/mlst@sha256:17e78a25fc5171706b22c8c3d4b1ca2352593b56fef8f28401dd5da3e2e7abe8" // staphb/mlst:2.23.0-2024-09-01 input: tuple val(meta), path(assembly) @@ -20,8 +20,7 @@ process MLST_MLST { ''' source bash_functions.sh - # MLST for each assembly - msg "INFO: Performing MLST" + msg "INFO: Looking for MLST schemes to exclude 
..." # Check if input scheme is in mlst's database mlst_scheme="!{scheme}" @@ -47,20 +46,38 @@ process MLST_MLST { exclude_list=$(echo ${exclude_list[@]} | tr ' ' ',') fi + msg "INFO: Excluding MLST schemes: ${exclude_list}" + if [[ -s !{assembly} ]]; then + msg "INFO: Performing MLST ..." + mlst \ "!{assembly}" \ !{min_score} \ !{min_identity} \ !{min_coverage} \ + --novel "!{meta.id}-!{meta.assembler}.MLST.novel.fasta" \ --threads !{task.cpus} \ --scheme "${mlst_scheme}" \ --exclude "${exclude_list}" \ - >> "!{meta.id}-!{meta.assembler}.MLST.tsv" + > "!{meta.id}-!{meta.assembler}.MLST.tsv" + + msg "INFO: Completed MLST genotyping" + + # Print header line and add in Sample_name identifier to data row + awk -F $'\t' -v id="!{meta.id}" \ + 'BEGIN{ + OFS=FS + print "Sample_name" OFS "PubMLST_scheme_name" OFS "Sequence_type_(ST-#)" OFS "Allele_numbers" + } + {$1=id; print}' \ + "!{meta.id}-!{meta.assembler}.MLST.tsv" \ + > tmp \ + && \ + mv tmp "!{meta.id}-!{meta.assembler}.MLST.tsv" + + msg "INFO: Appended header to MLST summary output file" - sed -i \ - '1i Filename\tPubMLST scheme name\tSequence type\tAllele IDs' \ - "!{meta.id}-!{meta.assembler}.MLST.tsv" fi # Get process version information diff --git a/modules/local/mlst_mlst/params.config b/modules/local/mlst_mlst/params.config index e29ff964..974f2792 100644 --- a/modules/local/mlst_mlst/params.config +++ b/modules/local/mlst_mlst/params.config @@ -4,5 +4,7 @@ params { mlst_min_score = '50' mlst_min_identity = '95' mlst_min_coverage = '10' - mlst_ignore_scheme = 'ecoli,abaumannii,vcholerae_2' + + // Exclude all MLST databases by default with a *_ such as leptospira_2 and leptospira_3 + mlst_ignore_scheme = 'abaumannii_2,blicheniformis_14,bordetella_3,brachyspira_2,brachyspira_3,brachyspira_4,brachyspira_5,campylobacter_nonjejuni_2,campylobacter_nonjejuni_3,campylobacter_nonjejuni_4,campylobacter_nonjejuni_5,campylobacter_nonjejuni_6,campylobacter_nonjejuni_7,campylobacter_nonjejuni_8,campylobacter_nonjejuni_9,diphtheria_3,ecoli_achtman_4,leptospira_2,leptospira_3,listeria_2,llactis_phage,mbovis_2,mcatarrhalis_achtman_6,mgallisepticum_2,mhominis_3,mycobacteria_2,pacnes_3,pmultocida_2,senterica_achtman_2,vcholerae_2,ypseudotuberculosis_achtman_3' } diff --git a/modules/local/overlap_paired_reads_flash/main.nf b/modules/local/overlap_paired_reads_flash/main.nf index 9a2bdbd9..0757846f 100644 --- a/modules/local/overlap_paired_reads_flash/main.nf +++ b/modules/local/overlap_paired_reads_flash/main.nf @@ -1,8 +1,8 @@ process OVERLAP_PAIRED_READS_FLASH { - label "process_low" + label "process_high" //extra RAM unnecessary but this is a bottleneck for CPU speed in the workflow tag { "${meta.id}" } - container "snads/flash@sha256:363b2f44d040c669191efbc3d3ba99caf5efd3fdef370af8f00f3328932143a6" + container "staphb/flash@sha256:44889120b49d3f8eefdde8f6040b096d5ee122ceb71d936b596757e4fc16a2c0" input: tuple val(meta), path(fastq_pairs) @@ -10,7 +10,8 @@ process OVERLAP_PAIRED_READS_FLASH { output: tuple val(meta), path("${meta.id}.Non-overlapping_FastQ_File.tsv"), emit: qc_filecheck tuple val(meta), path("${meta.id}*{paired,single}.fq.gz") , emit: cleaned_fastq_files - path("${meta.id}.FLASH.tsv") , emit: summary + path("${meta.id}.Overlap.tsv") , emit: summary + path("${meta.id}.Clean_Reads_FastQ.SHA512-checksums.tsv") , emit: checksums path(".command.{out,err}") path("versions.yml") , emit: versions @@ -18,16 +19,19 @@ process OVERLAP_PAIRED_READS_FLASH { ''' source bash_functions.sh + echo !{fastq_pairs} + # Determine read length based on the 
first 100 reads echo "$(cat !{meta.id}_R1.paired.fq | head -n 400)" > read_R1_len.txt READ_LEN=$(awk 'NR%4==2 {if(length > x) {x=length; y=$0}} END{print length(y)}' read_R1_len.txt) + # Require 80% overlap length relative to input read length OVERLAP_LEN=$(echo | awk -v n=${READ_LEN} '{print int(n*0.8)}') - msg "INFO: ${READ_LEN} bp read length detected from raw input" >&2 + msg "INFO: ${READ_LEN} bp read length detected from raw input" # Merge overlapping sister reads into singleton reads if [ ${OVERLAP_LEN} -gt 0 ]; then - msg "INFO: ${OVERLAP_LEN} bp overlap will be required for sister reads to be merged" >&2 + msg "INFO: ${OVERLAP_LEN} bp overlap will be required for sister reads to be merged" msg "INFO: Merging paired end reads using FLASH" flash \ @@ -37,13 +41,14 @@ process OVERLAP_PAIRED_READS_FLASH { -m ${OVERLAP_LEN} \ "!{meta.id}_R1.paired.fq" "!{meta.id}_R2.paired.fq" - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}.Non-overlapping_FastQ_File.tsv" + # Perform filesize checks and QA report + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}.Non-overlapping_FastQ_File.tsv" for suff in notCombined_1.fastq notCombined_2.fastq; do if verify_minimum_file_size "flash.${suff}" 'Non-overlapping FastQ Files' "!{params.min_filesize_non_overlapping_fastq}"; then - echo -e "!{meta.id}\tNon-overlapping FastQ File (${suff})\tPASS" \ + echo -e "!{meta.id}\tNon-overlapping (${suff}) FastQ File\tPASS" \ >> "!{meta.id}.Non-overlapping_FastQ_File.tsv" else - echo -e "!{meta.id}\tNon-overlapping FastQ File (${suff})\tFAIL" \ + echo -e "!{meta.id}\tNon-overlapping (${suff}) FastQ File\tFAIL" \ >> "!{meta.id}.Non-overlapping_FastQ_File.tsv" fi done @@ -60,33 +65,53 @@ process OVERLAP_PAIRED_READS_FLASH { cat flash.extendedFrags.fastq >> "!{meta.id}_single.fq" else - echo "$(cat !{meta.id}_R2.paired.fq | tail -n 4)" >> "!{meta.id}_single.fq" + # Hack to ensure there's a legit singleton read to pass along to the next steps + echo "$(tail -n 4 !{meta.id}_R2.paired.fq)" >> "!{meta.id}_single.fq" fi - msg "INFO: ${CNT_READS_OVERLAPPED:-0} pairs overlapped into singleton reads" >&2 + msg "INFO: ${CNT_READS_OVERLAPPED:-0} pairs overlapped into singleton reads" fi - # Summarize final read set and compress - count_R1=$(echo $(cat !{meta.id}_R1.paired.fq | wc -l)) + # Summarize final read set counts + count_R1=$(wc -l !{meta.id}_R1.paired.fq | awk '{print $1}') CNT_CLEANED_PAIRS=$(echo $((${count_R1}/4))) msg "INFO: Number of reads cleaned: ${CNT_CLEANED_PAIRS}" - count_single=$(echo $(cat "!{meta.id}_single.fq" | wc -l)) + count_single=$(wc -l "!{meta.id}_single.fq" | awk '{print $1}') CNT_CLEANED_SINGLETON=$(echo $((${count_single}/4))) msg "INFO: Number of singletons cleaned: ${CNT_CLEANED_SINGLETON}" - echo -e "Sample name\t# cleaned reads (paired FastQ)\t# cleaned reads (singletons)\t# overlapped reads" \ - > "!{meta.id}.FLASH.tsv" + # Report I/O sequence stats + echo -e "Sample_name\tCleaned_reads_(#_paired)\tCleaned_reads_(#_singletons)\tOverlapped_reads_(#)" \ + > "!{meta.id}.Overlap.tsv" echo -e "!{meta.id}\t${CNT_CLEANED_PAIRS}\t${CNT_CLEANED_SINGLETON}\t${CNT_READS_OVERLAPPED:-0}" \ - >> "!{meta.id}.FLASH.tsv" + >> "!{meta.id}.Overlap.tsv" + # Compress the output FastQ files for outdir storage gzip -9f "!{meta.id}_single.fq" \ "!{meta.id}_R1.paired.fq" \ "!{meta.id}_R2.paired.fq" + ### Calculate SHA-512 Checksums of each Input FastQ file ### + SUMMARY_HEADER=( + "Sample_name" + "Checksum_(SHA-512)" + "File" + ) + SUMMARY_HEADER=$(printf "%s\t" 
"${SUMMARY_HEADER[@]}" | sed 's/\t$//') + + echo "${SUMMARY_HEADER}" > "!{meta.id}.Clean_Reads_FastQ.SHA512-checksums.tsv" + + # Calculate checksums + for f in "!{meta.id}_R1.paired.fq.gz" "!{meta.id}_R2.paired.fq.gz" "!{meta.id}_single.fq.gz"; do + echo -ne "!{meta.id}\t" >> "!{meta.id}.Clean_Reads_FastQ.SHA512-checksums.tsv" + zcat "${f}" | awk 'NR%2==0' | paste - - | sort -k1,1 | sha512sum | awk '{print $1 "\t" "'"${f}"'"}' + done >> "!{meta.id}.Clean_Reads_FastQ.SHA512-checksums.tsv" + # Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": + sha512sum: $(sha512sum --version | grep ^sha512sum | sed 's/sha512sum //1') flash: $(flash --version | head -n 1 | awk 'NF>1{print $NF}') END_VERSIONS ''' diff --git a/modules/local/polish_assembly_bwa_pilon/main.nf b/modules/local/polish_assembly_bwa_pilon/main.nf index da2ce001..a029238f 100644 --- a/modules/local/polish_assembly_bwa_pilon/main.nf +++ b/modules/local/polish_assembly_bwa_pilon/main.nf @@ -12,6 +12,7 @@ process POLISH_ASSEMBLY_BWA_PILON { tuple val(meta), path("${meta.id}-${meta.assembler}.{paired,single}.bam") , emit: bam path("${meta.id}-${meta.assembler}.{SNPs,InDels}-corrected.cnt.tsv") tuple val(meta), path("${meta.id}-${meta.assembler}.fna") , emit: assembly + path("${meta.id}.Assembly_FastA.SHA512-checksums.tsv") , emit: checksums path(".command.{out,err}") path("versions.yml") , emit: versions @@ -20,29 +21,40 @@ process POLISH_ASSEMBLY_BWA_PILON { ''' source bash_functions.sh - # Correct cleaned SPAdes contigs with cleaned PE reads - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Filtered_Assembly_File.tsv" + msg "INFO: evaluating input filesize of !{uncorrected_contigs} ..." + + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Filtered_Assembly_File.tsv" if verify_minimum_file_size "!{uncorrected_contigs}" 'Filtered Assembly File' "!{params.min_filesize_filtered_assembly}"; then - echo -e "!{meta.id}-!{meta.assembler}\tFiltered Assembly File\tPASS" \ + echo -e "!{meta.id}\tFiltered Assembly File\tPASS" \ >> "!{meta.id}-!{meta.assembler}.Filtered_Assembly_File.tsv" else - echo -e "!{meta.id}-!{meta.assembler}\tFiltered Assembly File\tFAIL" \ + echo -e "!{meta.id}\tFiltered Assembly File\tFAIL" \ >> "!{meta.id}-!{meta.assembler}.Filtered_Assembly_File.tsv" fi - # Set up files to retain InDel and SNP correction information - echo -e "Correction round\tNumber of InDels corrected" \ + msg "INFO: assembly filesize meets or exceeds !{params.min_filesize_filtered_assembly}" + + msg "INFO: Polishing !{meta.id} contigs with cleaned paired end reads ..." + + # Set up files to retain InDel and SNP correction counts (each line is a subsequent polishing round) + echo -e "Sample_name\tCorrection_round\tInDels_corrected_[#]" \ > "!{meta.id}-!{meta.assembler}.InDels-corrected.cnt.tsv" - echo -e "Correction round\tNumber of SNPs corrected" \ + echo -e "Sample_name\tCorrection_round\tSNPs_corrected_[#]" \ > "!{meta.id}-!{meta.assembler}.SNPs-corrected.cnt.tsv" - msg "INFO: Polishing contigs with paired end reads.." 
+ # Set up QC File checks for BAM and FastA for each round of polishing + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Binary_PE_Alignment_Map_File.tsv" + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Polished_Assembly_File.tsv" for (( i=1; i<=!{polish_corrections}; i++ )); do - msg "INFO: Performing polishing step ${i} of !{polish_corrections}" + msg "INFO: Performing polishing step ${i} of !{polish_corrections} total rounds ..." bwa index !{uncorrected_contigs} + msg "INFO: Completed bwa index of !{uncorrected_contigs} FastA assembly file" + + msg "INFO: Cleaned paired read mapping (${i} of !{polish_corrections}) of !{meta.id} ..." + bwa mem \ -v 2 \ -x intractg \ @@ -56,17 +68,22 @@ process POLISH_ASSEMBLY_BWA_PILON { -o "!{meta.id}-!{meta.assembler}.paired.bam" \ --reference !{uncorrected_contigs} - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Binary_PE_Alignment_Map_File.tsv" - if verify_minimum_file_size "!{meta.id}-!{meta.assembler}.paired.bam" 'Binary PE Alignment Map File' "!{params.min_filesize_binary_pe_alignment}"; then - echo -e "!{meta.id}-!{meta.assembler}\tBinary PE Alignment Map File (${i} of !{polish_corrections})\tPASS" \ + msg "INFO: Completed paired-end read mapping (${i} of !{polish_corrections}) of !{meta.id} and formed sorted BAM output file" + + if verify_minimum_file_size "!{meta.id}-!{meta.assembler}.paired.bam" 'Binary PE Alignment Map BAM File' "!{params.min_filesize_binary_pe_alignment}"; then + echo -e "!{meta.id}\tBinary PE Alignment Map BAM File (${i} of !{polish_corrections} rounds)\tPASS" \ >> "!{meta.id}-!{meta.assembler}.Binary_PE_Alignment_Map_File.tsv" else - echo -e "!{meta.id}-!{meta.assembler}\tBinary PE Alignment Map File (${i} of !{polish_corrections})\tFAIL" \ + echo -e "!{meta.id}\tBinary PE Alignment Map BAM File (${i} of !{polish_corrections} rounds)\tFAIL" \ >> "!{meta.id}-!{meta.assembler}.Binary_PE_Alignment_Map_File.tsv" fi samtools index "!{meta.id}-!{meta.assembler}.paired.bam" + msg "INFO: Completed samtools index of paired-end BAM alignment file for !{meta.id}" + + msg "INFO: Pilon correcting SNPs and InDels (${i} of !{polish_corrections}) of !{meta.id}..." 
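# NOTE: `--fix snps,indels` limits Pilon to base-level corrections (no gap filling
#       or local reassembly), and per Pilon's documentation a `--mindepth` value
#       below 1 (here 0.50) is interpreted as a fraction of mean coverage rather
#       than an absolute read depth.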
+ pilon \ --genome !{uncorrected_contigs} \ --frags "!{meta.id}-!{meta.assembler}.paired.bam" \ @@ -74,21 +91,22 @@ process POLISH_ASSEMBLY_BWA_PILON { --changes \ --fix snps,indels \ --mindepth 0.50 \ - --threads !{task.cpus} >&2 + --threads !{task.cpus} + + msg "INFO: Completed Pilon correction of SNPs and InDels (${i} of !{polish_corrections}) for !{meta.id}" - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Polished_Assembly_File.tsv" if verify_minimum_file_size "!{uncorrected_contigs}" 'Polished Assembly File' "!{params.min_filesize_polished_assembly}"; then - echo -e "!{meta.id}-!{meta.assembler}\tPolished Assembly File (${i} of !{polish_corrections})\tPASS" \ + echo -e "!{meta.id}\tPolished Assembly File (${i} of !{polish_corrections}) rounds\tPASS" \ >> "!{meta.id}-!{meta.assembler}.Polished_Assembly_File.tsv" else - echo -e "!{meta.id}-!{meta.assembler}\tPolished Assembly File (${i} of !{polish_corrections})\tFAIL" \ + echo -e "!{meta.id}\tPolished Assembly File (${i} of !{polish_corrections} rounds)\tFAIL" \ >> "!{meta.id}-!{meta.assembler}.Polished_Assembly_File.tsv" fi # Place round number and number of InDels/SNPs corrected into respective files - echo -e "${i}\t$(grep -c '-' !{meta.id}-!{meta.assembler}.changes)" \ + echo -e "!{meta.id}\t${i}\t$(grep -c '-' !{meta.id}-!{meta.assembler}.changes)" \ >> "!{meta.id}-!{meta.assembler}.InDels-corrected.cnt.tsv" - echo -e "${i}\t$(grep -vc '-' !{meta.id}-!{meta.assembler}.changes)" \ + echo -e "!{meta.id}\t${i}\t$(grep -vc '-' !{meta.id}-!{meta.assembler}.changes)" \ >> "!{meta.id}-!{meta.assembler}.SNPs-corrected.cnt.tsv" rm -f "!{meta.id}-!{meta.assembler}.{changes,uncorrected.fna}" @@ -99,23 +117,29 @@ process POLISH_ASSEMBLY_BWA_PILON { sed -i 's/_pilon//1' "!{meta.id}-!{meta.assembler}.uncorrected.fna" done + # Final corrected FastA assembly file handling mv -f "!{meta.id}-!{meta.assembler}.uncorrected.fna" "!{meta.id}-!{meta.assembler}.fna" - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Final_Corrected_Assembly_FastA_File.tsv" + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Final_Corrected_Assembly_FastA_File.tsv" if verify_minimum_file_size "!{meta.id}-!{meta.assembler}.fna" 'Final Corrected Assembly FastA File' "!{params.min_filesize_final_assembly}"; then - echo -e "!{meta.id}-!{meta.assembler}\tFinal Corrected Assembly FastA File\tPASS" \ + echo -e "!{meta.id}\tFinal Corrected Assembly FastA File\tPASS" \ >> "!{meta.id}-!{meta.assembler}.Final_Corrected_Assembly_FastA_File.tsv" else - echo -e "!{meta.id}-!{meta.assembler}\tFinal Corrected Assembly FastA File\tFAIL" \ + echo -e "!{meta.id}\tFinal Corrected Assembly FastA File\tFAIL" \ >> "!{meta.id}-!{meta.assembler}.Final_Corrected_Assembly_FastA_File.tsv" fi # Single read mapping if available for downstream depth of coverage # calculations, not for assembly polishing. + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Binary_SE_Alignment_Map_File.tsv" + if [[ !{meta.id}_single.fq.gz ]]; then - msg "INFO: Single read mapping" + msg "INFO: Single read mapping of !{meta.id}..." 
+ bwa index "!{meta.id}-!{meta.assembler}.fna" + msg "INFO: Completed bwa index of !{meta.id}-!{meta.assembler}.fna FastA assembly file for single-end read mapping" + bwa mem \ -v 2 \ -x intractg \ @@ -129,24 +153,45 @@ process POLISH_ASSEMBLY_BWA_PILON { -o "!{meta.id}-!{meta.assembler}.single.bam" \ --reference "!{meta.id}-!{meta.assembler}.fna" - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}-!{meta.assembler}.Binary_SE_Alignment_Map_File.tsv" + msg "INFO: Completed single-end read mapping of !{meta.id} and formed sorted BAM output file" + if verify_minimum_file_size "!{meta.id}-!{meta.assembler}.single.bam" 'Binary SE Alignment Map File' '!{params.min_filesize_binary_se_alignment}'; then - echo -e "!{meta.id}-!{meta.assembler}\tBinary SE Alignment Map File\tPASS" \ + echo -e "!{meta.id}\tBinary SE Alignment Map File\tPASS" \ >> "!{meta.id}-!{meta.assembler}.Binary_SE_Alignment_Map_File.tsv" else - echo -e "!{meta.id}-!{meta.assembler}\tBinary SE Alignment Map File\tFAIL" \ + echo -e "!{meta.id}\tBinary SE Alignment Map File\tFAIL" \ >> "!{meta.id}-!{meta.assembler}.Binary_SE_Alignment_Map_File.tsv" fi samtools index "!{meta.id}-!{meta.assembler}.single.bam" + + msg "INFO: Completed samtools index of single-end BAM alignment file for !{meta.id}" + fi + # Calculate checksum + FILE="!{meta.id}-!{meta.assembler}.fna" + CHECKSUM=$(awk '/^>/ {print substr($1, 1)} !/^>/ {print}' "${FILE}" | sha512sum | awk '{print $1}') + echo "${CHECKSUM}" | awk -v sample_id="!{meta.id}" -v file="${FILE}" ' + BEGIN { + # Print the header once + print "Sample_name\tChecksum_(SHA-512)\tFile" + } + { + # Print the data row once, using the CHECKSUM from input + print sample_id "\t" $1 "\t" file + }' \ + > "!{meta.id}.Assembly_FastA.SHA512-checksums.tsv" + + msg "INFO: Calculated checksum of polished FastA assembly file for !{meta.id}" + # Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": - pilon: $(pilon --version | cut -d ' ' -f 3) bwa: $(bwa 2>&1 | head -n 3 | tail -1 | awk 'NF>1{print $NF}') + pilon: $(pilon --version | cut -d ' ' -f 3) samtools: $(samtools --version | head -n 1 | awk 'NF>1{print $NF}') + sha512sum: $(sha512sum --version | grep ^sha512sum | sed 's/sha512sum //1') END_VERSIONS ''' } diff --git a/modules/local/polish_assembly_bwa_pilon/params.config b/modules/local/polish_assembly_bwa_pilon/params.config index 92f1eb31..66393c48 100644 --- a/modules/local/polish_assembly_bwa_pilon/params.config +++ b/modules/local/polish_assembly_bwa_pilon/params.config @@ -4,7 +4,7 @@ params { min_filesize_filtered_assembly = '1M' min_filesize_polished_assembly = '1M' min_filesize_binary_se_alignment = '1k' - min_filesize_binary_pe_alignment = '25M' + min_filesize_binary_pe_alignment = '6M' // Number of corrections spades_polish_corrections = 3 diff --git a/modules/local/prepare_db_sra_human_scrubber_unix/main.nf b/modules/local/prepare_db_sra_human_scrubber_unix/main.nf index e523eda4..d2189b66 100644 --- a/modules/local/prepare_db_sra_human_scrubber_unix/main.nf +++ b/modules/local/prepare_db_sra_human_scrubber_unix/main.nf @@ -18,7 +18,7 @@ process PREPARE_DB_SRA_HUMAN_SCRUBBER { # Make sure the decompression worked if [[ $(find . 
-type f -name !{database%.gz} | wc -l) -lt 1 ]]; then - msg "ERROR: Missing decompressed SRA Human Scrubber db file" + msg "ERROR: Missing decompressed SRA Human Scrubber db file" >&2 exit 1 fi diff --git a/modules/local/qa_assembly_quast/main.nf b/modules/local/qa_assembly_quast/main.nf index baf2d53a..90f1645d 100644 --- a/modules/local/qa_assembly_quast/main.nf +++ b/modules/local/qa_assembly_quast/main.nf @@ -2,67 +2,51 @@ process QA_ASSEMBLY_QUAST { label "process_low" tag { "${meta.id}-${meta.assembler}" } - container "snads/quast@sha256:c8147a279feafbc88bafeeda3817ff32d43db87d31dd0978df1cd2f8022d324c" + container "staphb/quast@sha256:83ea0fd6c28ca01508fd7a93c0942b19089e6de25c9b8a496a34e138d240e0e8" input: - tuple val(meta), path(cleaned_fastq_files), path(assembly) + tuple val(meta), path(assembly) output: - tuple val(meta), path("${meta.id}-${meta.assembler}.QuastSummary.tsv"), path("${meta.id}-${meta.assembler}.CleanedReads-Bases.tsv"), emit: qa_summaries - path("${meta.id}-${meta.assembler}.QuastSummary.tsv") , emit: summary_assemblies - path("${meta.id}-${meta.assembler}.CleanedReads-Bases.tsv") , emit: summary_reads + tuple val(meta), path("${meta.id}-${meta.assembler}.QuastSummary.tsv"), emit: qa_summaries + path("${meta.id}-${meta.assembler}.QuastSummary.tsv") , emit: summary_assemblies path(".command.{out,err}") - path("versions.yml") , emit: versions + path("versions.yml") , emit: versions shell: ''' source bash_functions.sh # Run Quast - msg "INFO: Evaluating assembly using QUAST" + msg "INFO: Evaluating !{meta.id} assembly using QUAST ..." quast.py \ --silent \ --no-html \ - --strict-NA \ - --gene-finding \ + --no-plots \ --min-contig 100 \ --output-dir quast \ - --gene-thresholds 300 \ - --ambiguity-usage one \ --threads !{task.cpus} \ --contig-thresholds 500,1000 \ - "!{assembly}" >&2 + "!{assembly}" + + msg "INFO: Completed QUAST evaluation of !{meta.id} assembly" mv -f quast/transposed_report.tsv "!{meta.id}-!{meta.assembler}.QuastSummary.tsv" # Quast modifies basename. Need to check and modify if needed. 
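# NOTE: QUAST labels the row in transposed_report.tsv after the input FastA
#       filename, so the first data row's "Assembly" field is compared to the
#       sample ID and rewritten when the two differ (the trailing `1` on the sed
#       substitution restricts it to the first match on the line).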
assemblies_name=$(awk '{print $1}' "!{meta.id}-!{meta.assembler}.QuastSummary.tsv" | awk 'NR!=1 {print}') if [ ${assemblies_name} != !{meta.id} ]; then - sed -i "s|${assemblies_name}|!{meta.id}|g" "!{meta.id}-!{meta.assembler}.QuastSummary.tsv" + sed -i "s|${assemblies_name}|!{meta.id}|1" "!{meta.id}-!{meta.assembler}.QuastSummary.tsv" fi - # TO-DO: move this unix-only component to separate QA_READS_BASEPAIR_COUNT_UNIX - # Count nucleotides per read set - echo -n '' > "!{meta.id}-!{meta.assembler}.CleanedReads-Bases.tsv" - for (( i=0; i<3; i+=3 )); do - R1=$(basename "!{meta.id}_R1.paired.fq.gz" _R1.paired.fq.gz) - R2=$(basename "!{meta.id}_R2.paired.fq.gz" _R2.paired.fq.gz) - single=$(basename "!{meta.id}_single.fq.gz" _single.fq.gz) + # Keep same first column header column name as all others -- "Sample_name" + sed -i '1s/^Assembly/Sample_name/1' "!{meta.id}-!{meta.assembler}.QuastSummary.tsv" - # Verify each set of reads groups properly - nr_uniq_str=$(echo -e "${R1}\\n${R2}\\n${single}" | sort -u | wc -l) - if [ "${nr_uniq_str}" -ne 1 ]; then - msg "ERROR: improperly grouped ${R1} ${R2} ${single}" >&2 - exit 1 - fi - echo -ne "${R1}\t" >> "!{meta.id}-!{meta.assembler}.CleanedReads-Bases.tsv" - zcat "!{meta.id}_R1.paired.fq.gz" "!{meta.id}_R2.paired.fq.gz" "!{meta.id}_single.fq.gz" | \ - awk 'BEGIN{SUM=0} {if(NR%4==2){SUM+=length($0)}} END{OFMT="%f"; print SUM}' \ - >> "!{meta.id}-!{meta.assembler}.CleanedReads-Bases.tsv" + # Replace space characters in header line with underscores + sed -i '1s/ /_/g' "!{meta.id}-!{meta.assembler}.QuastSummary.tsv" - sed -i '1i Sample name\t# cleaned bases' "!{meta.id}-!{meta.assembler}.CleanedReads-Bases.tsv" - done + msg "INFO: Completed QUAST output renaming for !{meta.id}" # Get process version information cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/read_classify_kraken/main.nf b/modules/local/read_classify_kraken/main.nf index 09a4be5f..3e7058b5 100644 --- a/modules/local/read_classify_kraken/main.nf +++ b/modules/local/read_classify_kraken/main.nf @@ -2,7 +2,7 @@ process READ_CLASSIFY_KRAKEN_ONE { label "process_high" tag { "${meta.id}" } - container "gregorysprenger/kraken@sha256:b5ab4b75fb197b16e81d8cc3878e08479bc7d105ac0b2e948e6f6a9985cfc93e" + container "staphb/kraken@sha256:6f426bbe8ba0b49b6285d773392a94aa79f424ddc50bfb7a00bb52552ea77267" input: tuple val(meta), path(cleaned_fastq_files) @@ -21,7 +21,8 @@ process READ_CLASSIFY_KRAKEN_ONE { # Investigate taxonomic identity of cleaned reads if [ ! -s !{meta.id}.kraken_summary.tsv ]; then - msg "INFO: Performing Kraken1 classifications" + msg "INFO: Performing Kraken1 classifications of !{cleaned_fastq_files[0]} !{cleaned_fastq_files[1]} !{cleaned_fastq_files[2]} ..." + kraken \ --fastq-input \ --db !{database} \ @@ -30,17 +31,34 @@ process READ_CLASSIFY_KRAKEN_ONE { !{cleaned_fastq_files[0]} !{cleaned_fastq_files[1]} !{cleaned_fastq_files[2]} \ > "!{meta.id}_kraken.output" - msg "INFO: Creating Kraken Report" + msg "INFO: Created Kraken1 classifications of !{cleaned_fastq_files[0]} !{cleaned_fastq_files[1]} !{cleaned_fastq_files[2]}" + + msg "INFO: Making Kraken1 report from !{meta.id}_kraken.output ..." + + msg "INFO: Creating Kraken1 Report" kraken-report \ --db !{database} \ !{meta.id}_kraken.output \ - > kraken.tsv 2>&1 | tr '^M' '\\n' 1>&2 + > kraken.tsv + + msg "INFO: Created Kraken1 report kraken.tsv" + + msg "INFO: Summarizing Kraken1 report kraken.tsv ..." 
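# NOTE: `summarize_kraken` (from bash_functions.sh) appears to condense kraken.tsv
#       into one row -- unclassified reads, then the top three genera and top three
#       species -- matching the SUMMARY_HEADER columns added below; the
#       `sed 's/%//g'` strips percent signs, presumably so the columns stay numeric
#       for the downstream TSV-to-XLSX conversion.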
+ + echo -ne "!{meta.id}\t" > "!{meta.id}.kraken_summary.tsv" + summarize_kraken 'kraken.tsv' | sed 's/%//g' >> "!{meta.id}.kraken_summary.tsv" - msg "INFO: Summarizing Kraken1" - echo -e "% Reads\t# Reads\tUnclassified\t% Reads\t# Reads\tGenus\t% Reads\t# Reads\tGenus\t% Reads\t# Reads\tSpecies\t% Reads\t# Reads\tSpecies\t% Reads\t# Reads" \ - > "!{meta.id}.kraken_summary.tsv" + # Add header to output + SUMMARY_HEADER=( + "Sample_name" + "Reads_(%)" "Reads_(#)" "Unclassified" + "Reads_(%)" "Reads_(#)" "Genus" "Reads_(%)" "Reads_(#)" "Genus" "Reads_(%)" "Reads_(#)" "Genus" + "Reads_(%)" "Reads_(#)" "Species" "Reads_(%)" "Reads_(#)" "Species" "Reads_(%)" "Reads_(#)" "Species" + ) + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//') + sed -i "1i ${SUMMARY_HEADER}" "!{meta.id}.kraken_summary.tsv" - summarize_kraken 'kraken.tsv' >> "!{meta.id}.kraken_summary.tsv" + msg "INFO: Created Kraken1 report !{meta.id}.kraken_summary.tsv" mv kraken.tsv "!{meta.id}.kraken_output.tsv" gzip "!{meta.id}.kraken_output.tsv" diff --git a/modules/local/read_classify_kraken2/main.nf b/modules/local/read_classify_kraken2/main.nf index e2f8a76b..d5a13560 100644 --- a/modules/local/read_classify_kraken2/main.nf +++ b/modules/local/read_classify_kraken2/main.nf @@ -2,7 +2,7 @@ process READ_CLASSIFY_KRAKEN_TWO { label "process_high" tag { "${meta.id}" } - container "gregorysprenger/kraken2@sha256:213e70b0f465464b2e52f9f128dcb0cc6761705f6e99b7ce48a5a27a6851083a" + container "staphb/kraken2@sha256:53aee35987059ae177301e6bdeceb1524a4bcf7b0eb0ef0842d8578b6bf1a5ee" input: tuple val(meta), path(cleaned_fastq_files) @@ -21,7 +21,8 @@ process READ_CLASSIFY_KRAKEN_TWO { # Investigate taxonomic identity of cleaned reads if [ ! -s !{meta.id}.kraken2_summary.tsv ]; then - msg "INFO: Performing Kraken2 classifications" + msg "INFO: Performing Kraken2 classifications of !{cleaned_fastq_files[0]} !{cleaned_fastq_files[1]} !{cleaned_fastq_files[2]} ..." + kraken2 \ --use-names \ --gzip-compressed \ @@ -31,11 +32,24 @@ process READ_CLASSIFY_KRAKEN_TWO { --threads !{task.cpus} \ !{cleaned_fastq_files[0]} !{cleaned_fastq_files[1]} !{cleaned_fastq_files[2]} - msg "INFO: Summarizing Kraken2" - echo -e "% Reads\t# Reads\tUnclassified\t% Reads\t# Reads\tGenus\t% Reads\t# Reads\tGenus\t% Reads\t# Reads\tSpecies\t% Reads\t# Reads\tSpecies\t% Reads\t# Reads" \ - > "!{meta.id}.kraken2_summary.tsv" + msg "INFO: Completed Kraken2 classifications of !{cleaned_fastq_files[0]} !{cleaned_fastq_files[1]} !{cleaned_fastq_files[2]}" + + msg "INFO: Summarizing Kraken2 report kraken2.tsv ..." 
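# NOTE: the header array and `summarize_kraken` call mirror the Kraken1 module, so
#       the .kraken_summary.tsv and .kraken2_summary.tsv outputs share identical
#       columns and can be compared side by side.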
+ + echo -ne "!{meta.id}\t" > "!{meta.id}.kraken2_summary.tsv" + summarize_kraken 'kraken2.tsv' | sed 's/%//g' >> "!{meta.id}.kraken2_summary.tsv" + + # Add header to output + SUMMARY_HEADER=( + "Sample_name" + "Reads_(%)" "Reads_(#)" "Unclassified" + "Reads_(%)" "Reads_(#)" "Genus" "Reads_(%)" "Reads_(#)" "Genus" "Reads_(%)" "Reads_(#)" "Genus" + "Reads_(%)" "Reads_(#)" "Species" "Reads_(%)" "Reads_(#)" "Species" "Reads_(%)" "Reads_(#)" "Species" + ) + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//') + sed -i "1i ${SUMMARY_HEADER}" "!{meta.id}.kraken2_summary.tsv" - summarize_kraken 'kraken2.tsv' >> "!{meta.id}.kraken2_summary.tsv" + msg "INFO: Created Kraken2 summary !{meta.id}.kraken2_summary.tsv" mv kraken2.tsv !{meta.id}.kraken2_output.tsv gzip !{meta.id}.kraken2_output.tsv diff --git a/modules/local/remove_broken_pairs_bbtools_repair/main.nf b/modules/local/remove_broken_pairs_bbtools_repair/main.nf index 4cf2bd5f..9bbe0222 100644 --- a/modules/local/remove_broken_pairs_bbtools_repair/main.nf +++ b/modules/local/remove_broken_pairs_bbtools_repair/main.nf @@ -2,7 +2,7 @@ process REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR { label "process_high" tag { "${meta.id}" } - container "snads/bbtools@sha256:9f2a9b08563839cec87d856f0fc7607c235f464296fd71e15906ea1d15254695" + container "staphb/bbtools@sha256:f7b98063910e2e3b5be12f62076ec5cfdeaa562a01596758feb9a892ce18a363" input: tuple val(meta), path(reads) @@ -10,7 +10,8 @@ process REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR { output: tuple val(meta), path("${meta.id}.BBTools-Repair-removed_FastQ_File.tsv"), emit: qc_filecheck tuple val(meta), path("${meta.id}_repaired-R{1,2}.fastq.gz") , emit: repaired_reads - path("${meta.id}.BBTools-Repair-Removal.tsv") , emit: summary + path("${meta.id}.BBTools_Repair_Removal.tsv") , emit: summary + path("${meta.id}.BBTools_Repair_Removed_FastQ.SHA512-checksums.tsv") , emit: checksums path(".command.{out,err}") path("versions.yml") , emit: versions @@ -19,7 +20,7 @@ process REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR { source bash_functions.sh # Remove broken sister read sequences - msg "INFO: Removing broken sister reads using BBTools' Repair..." + msg "INFO: Removing broken sister reads for !{meta.id} using BBTools' Repair with !{task.memory} RAM ..." 
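# NOTE: with repair=t, repair.sh re-pairs mates by read name and diverts any
#       orphaned reads to the outs= singletons file, which is not carried forward --
#       only the repaired R1/R2 files are emitted by this process.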
# NOTE: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/repair-guide/ # "Repairing (repair flag) arbitrarily disordered files will take a lot of memory" @@ -27,14 +28,24 @@ process REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR { # NOTE: no CPU flag repair.sh \ overwrite=t \ - in="!{reads[0]}" \ - in2="!{reads[1]}" \ - out=!{meta.id}_repaired-R1.fastq.gz \ - out2=!{meta.id}_repaired-R2.fastq.gz \ + in=!{reads[0]} \ + in2=!{reads[1]} \ + out=!{meta.id}_repaired-R1.fastq \ + out2=!{meta.id}_repaired-R2.fastq \ outs=!{meta.id}_discarded_singletons.fastq \ repair=t - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}.BBTools-Repair-removed_FastQ_File.tsv" + msg "INFO: Completed removal of broken sister reads for !{meta.id} using BBTools' Repair" + + # NOTE: repair.sh handles .gz outfile extension but it can get stuck hanging + # when there's errors like: + # "bgzip: error while loading shared libraries: libcurl-gnutls.so.4: cannot open shared object file: No such file or directory" + # "Caused by: java.io.IOException: Stream closed" + gzip -f !{meta.id}_repaired-R1.fastq !{meta.id}_repaired-R2.fastq + + msg "INFO: Completed FastQ compression of repaired sister reads for !{meta.id}" + + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}.BBTools-Repair-removed_FastQ_File.tsv" for suff in R1.fastq.gz R2.fastq.gz; do if verify_minimum_file_size "!{meta.id}_repaired-${suff}" 'Repaired FastQ Files' "!{params.min_filesize_broken_pairs_bbtools_repair_removed}"; then echo -e "!{meta.id}\tBBTools-repair-removed FastQ ($suff) File\tPASS" \ @@ -42,6 +53,7 @@ process REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR { else echo -e "!{meta.id}\tBBTools-repair-removed FastQ ($suff) File\tFAIL" \ >> "!{meta.id}.BBTools-Repair-removed_FastQ_File.tsv" + exit 1 fi done @@ -62,16 +74,16 @@ process REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR { OUTPUT_BASES_PERCENT=$(grep '^Result: ' .command.err | awk '{print $7}' | sed -e 's/[()]//g' -e 's/%//') # Ensure all values parsed properly from stderr output - for val in $INPUT_READS $INPUT_BASES $REMOVED_READS_COUNT $REMOVED_BASES_COUNT $OUTPUT_READS_COUNT $OUTPUT_BASES_COUNT; do + for val in "${INPUT_READS}" "${INPUT_BASES}" "${REMOVED_READS_COUNT}" "${REMOVED_BASES_COUNT}" "${OUTPUT_READS_COUNT}" "${OUTPUT_BASES_COUNT}"; do if [[ ! "${val}" =~ [0-9] ]]; then msg "ERROR: expected integer parsed from bbtools repair stderr instead of:${val}" >&2 exit 1 fi done - for val in $REMOVED_READS_PERCENT $REMOVED_BASES_PERCENT $OUTPUT_READS_PERCENT $OUTPUT_BASES_PERCENT; do + for val in "${REMOVED_READS_PERCENT}" "${REMOVED_BASES_PERCENT}" "${OUTPUT_READS_PERCENT}" "${OUTPUT_BASES_PERCENT}"; do if [[ ! "${val}" =~ [0-9.] 
]]; then - msg "ERROR: expected percentage parsed from SRA Human Scrubber stderr instead of:${val}" >&2 - exit 1 + msg "ERROR: expected percentage parsed from SRA Human Scrubber stderr instead of:${val}" >&2 + exit 1 fi done @@ -79,46 +91,42 @@ process REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR { msg "INFO: ${REMOVED_READS_COUNT} (${REMOVED_READS_PERCENT}) reads were removed" msg "INFO: Output contains ${OUTPUT_BASES_COUNT} bp and ${OUTPUT_READS_COUNT} reads" - DELIM=$'\t' SUMMARY_HEADER=( - "Sample name" - "# Input reads" - "# Input bases" - "# Output reads" - "% Output reads" - "# Output bases" - "% Output bases" - "# Removed reads" - "% Removed reads" - "# Removed bases" - "% Removed bases" + "Sample_name" + "Input_reads_(#)" + "Removed_reads_(#)" + "Removed_reads_(%)" + "Output_reads_(#)" + "Output_reads_(%)" ) - SUMMARY_HEADER=$(printf "%s${DELIM}" "${SUMMARY_HEADER[@]}") - SUMMARY_HEADER="${SUMMARY_HEADER%${DELIM}}" + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//') SUMMARY_OUTPUT=( "!{meta.id}" "${INPUT_READS}" - "${INPUT_BASES}" - "${OUTPUT_READS_COUNT}" - "${OUTPUT_READS_PERCENT}" - "${OUTPUT_BASES_COUNT}" - "${OUTPUT_BASES_PERCENT}" "${REMOVED_READS_COUNT}" "${REMOVED_READS_PERCENT}" - "${REMOVED_BASES_COUNT}" - "${REMOVED_BASES_PERCENT}" + "${OUTPUT_READS_COUNT}" + "${OUTPUT_READS_PERCENT}" ) - SUMMARY_OUTPUT=$(printf "%s${DELIM}" "${SUMMARY_OUTPUT[@]}") - SUMMARY_OUTPUT="${SUMMARY_OUTPUT%${DELIM}}" + SUMMARY_OUTPUT=$(printf "%s\t" "${SUMMARY_OUTPUT[@]}" | sed 's/\t$//') # Store input/output counts - echo -e "${SUMMARY_HEADER}" > !{meta.id}.BBTools-Repair-Removal.tsv - echo -e "${SUMMARY_OUTPUT}" >> !{meta.id}.BBTools-Repair-Removal.tsv + echo -e "${SUMMARY_HEADER}" > "!{meta.id}.BBTools_Repair_Removal.tsv" + echo -e "${SUMMARY_OUTPUT}" >> "!{meta.id}.BBTools_Repair_Removal.tsv" + + # Calculate checksums + for f in "!{meta.id}_repaired-R1.fastq.gz" "!{meta.id}_repaired-R2.fastq.gz"; do + echo -ne "!{meta.id}\t" >> "!{meta.id}.BBTools_Repair_Removed_FastQ.SHA512-checksums.tsv" + BASE="$(basename ${f})" + HASH=$(zcat "${f}" | awk 'NR%2==0' | paste - - | sort -k1,1 | sha512sum | awk '{print $1}') + echo -e "${HASH}\t${BASE}" + done >> "!{meta.id}.BBTools_Repair_Removed_FastQ.SHA512-checksums.tsv" # Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": + sha512sum: $(sha512sum --version | grep ^sha512sum | sed 's/sha512sum //1') repair.sh: $(repair.sh --version 2>&1 | head -n 2 | tail -1 | awk 'NF>1{print $NF}') END_VERSIONS ''' diff --git a/modules/local/remove_host_hostile/main.nf b/modules/local/remove_host_hostile/main.nf index 5dea9e8c..30c988f7 100644 --- a/modules/local/remove_host_hostile/main.nf +++ b/modules/local/remove_host_hostile/main.nf @@ -10,7 +10,8 @@ process REMOVE_HOST_HOSTILE { output: tuple val(meta), path("${meta.id}.Hostile-removed_FastQ_File.tsv"), emit: qc_filecheck tuple val(meta), path("hostile/${meta.id}*.clean_*") , emit: host_removed_reads - path("${meta.id}.Hostile-Removal.tsv") , emit: summary + path("${meta.id}.Hostile_Removal.tsv") , emit: summary + path("${meta.id}.Hostile_FastQ.SHA512-checksums.tsv") , emit: checksums path(".command.{out,err}") path("versions.yml") , emit: versions @@ -24,6 +25,9 @@ process REMOVE_HOST_HOSTILE { # Use a non-default host to remove only if user-specified HOST_INDEX_ARGUMENT='' if [[ ! -z "!{params.hostile_host_reference_path_prefix}" ]]; then + + msg "INFO: Detected user-specified hostile reference path. Validating its Bowtie2 index files ..." 
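# NOTE: a complete Bowtie2 index comprises the six files checked here (.1-.4.bt2
#       plus .rev.1.bt2/.rev.2.bt2); the '1c' threshold asks verify_minimum_file_size
#       for at least one byte (i.e., present and non-empty), and bowtie2-inspect
#       --summary then confirms the index is actually readable before Hostile runs.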
+ for suff in .1.bt2 .2.bt2 .3.bt2 .4.bt2 .rev.1.bt2 .rev.2.bt2; do if verify_minimum_file_size "!{params.hostile_host_reference_path_prefix}${suff}" 'Bowtie2 Index file for Hostile' '1c'; then continue @@ -32,16 +36,20 @@ process REMOVE_HOST_HOSTILE { exit 1 fi done + bowtie2-inspect \ --summary \ !{params.hostile_host_reference_path_prefix} \ 1> !{meta.id}.bowtie2-inspect.stdout.log \ 2> !{meta.id}.bowtie2-inspect.stderr.log HOST_INDEX_ARGUMENT="--index !{params.hostile_host_reference_path_prefix}" + + msg "INFO: Completed validation of user-specified hostile Bowtie2 index files" + fi # Remove Host Reads - msg "INFO: Removing host reads using Hostile" + msg "INFO: Removing host reads using Hostile for !{meta.id} ..." if [[ ! -z "!{params.hostile_host_reference_path_prefix}" ]]; then hostile \ @@ -49,6 +57,7 @@ process REMOVE_HOST_HOSTILE { --fastq1 "!{reads[0]}" \ --fastq2 "!{reads[1]}" \ --out-dir hostile \ + --force \ "${HOST_INDEX_ARGUMENT}" \ --threads !{task.cpus} else @@ -57,32 +66,43 @@ process REMOVE_HOST_HOSTILE { --fastq1 "!{reads[0]}" \ --fastq2 "!{reads[1]}" \ --out-dir hostile \ + --force \ --threads !{task.cpus} fi + msg "INFO: Completed host removal with Hostile for !{meta.id}" + # JSON format stdout reports input/output filenames and read counts if ! verify_minimum_file_size .command.out 'JSON stdout for Hostile' '300c'; then msg "ERROR: JSON stdout missing or empty for reporting Hostile results" >&2 exit 1 fi + msg "INFO: Parsing host removal output files from Hostile for !{meta.id} ..." + # NOTE: grep used because `jq` absent from package RELATIVE_OUTPATH_R1=$(grep '"fastq1_out_path":' .command.out | awk '{print $2}' | sed 's/[",]//g') RELATIVE_OUTPATH_R2=$(grep '"fastq2_out_path":' .command.out | awk '{print $2}' | sed 's/[",]//g') + msg "INFO: Completed parsing host removal output files from Hostile for !{meta.id}" + # Validate output files are sufficient size to continue - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}.Hostile-removed_FastQ_File.tsv" + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}.Hostile-removed_FastQ_File.tsv" + i=1 for file in ${RELATIVE_OUTPATH_R1} ${RELATIVE_OUTPATH_R2}; do if verify_minimum_file_size "${file}" 'Hostile-removed FastQ Files' "!{min_filesize_output_fastq}"; then - echo -e "!{meta.id}\tHostile-removed FastQ ($file) File\tPASS" \ + echo -e "!{meta.id}\tHostile-removed FastQ (R${i}) File\tPASS" \ >> !{meta.id}.Hostile-removed_FastQ_File.tsv else - echo -e "!{meta.id}\tHostile-removed FastQ ($file) File\tFAIL" \ + echo -e "!{meta.id}\tHostile-removed FastQ (R${i}) File\tFAIL" \ >> !{meta.id}.Hostile-removed_FastQ_File.tsv fi + ((i++)) done + msg "INFO: Parsing host removal metrics from Hostile for !{meta.id} ..." + # NOTE: grep used because `jq` absent from package COUNT_READS_INPUT=$(grep '"reads_in":' .command.out | awk '{print $2}' | sed 's/,//g') COUNT_READS_OUTPUT=$(grep '"reads_out":' .command.out | awk '{print $2}' | sed 's/,//g') @@ -90,14 +110,16 @@ process REMOVE_HOST_HOSTILE { COUNT_READS_REMOVED=$(grep '"reads_removed":' .command.out | awk '{print $2}' | sed 's/,//g') PERCENT_REMOVED=$(grep '"reads_removed_proportion":' .command.out | awk '{$2=$2*100; print $2}') + msg "INFO: Completed parsing host removal metrics from Hostile for !{meta.id}" + # Ensure all values parsed properly from JSON output report - for val in $COUNT_READS_INPUT $COUNT_READS_OUTPUT $COUNT_READS_REMOVED; do + for val in "${COUNT_READS_INPUT}" "${COUNT_READS_OUTPUT}" "${COUNT_READS_REMOVED}"; do if [[ ! 
"${val}" =~ [0-9] ]]; then msg "ERROR: expected integer parsed from Hostile JSON instead of:${val}" >&2 exit 1 fi done - for val in $PERCENT_REMOVED $PERCENT_OUTPUT; do + for val in "${PERCENT_REMOVED}" "${PERCENT_OUTPUT}"; do if [[ ! "${val}" =~ [0-9.] ]]; then msg "ERROR: expected percentage parsed from Hostile JSON instead of:${val}" >&2 exit 1 @@ -109,36 +131,52 @@ process REMOVE_HOST_HOSTILE { msg "INFO: ${PERCENT_REMOVED}% of input reads were removed (${COUNT_READS_REMOVED} reads)" msg "INFO: ${COUNT_READS_OUTPUT} non-host reads (${PERCENT_OUTPUT}%) were retained" - DELIM='\t' SUMMARY_HEADER=( - "Sample name" - "# Input reads" - "# Output reads" - "% Output reads" - "# Removed reads" - "% Removed reads" + "Sample_name" + "Input_reads_(#)" + "Removed_reads_(#)" + "Removed_reads_(%)" + "Output_reads_(#)" + "Output_reads_(%)" ) - SUMMARY_HEADER=$(printf "%s${DELIM}" "${SUMMARY_HEADER[@]}") - SUMMARY_HEADER="${SUMMARY_HEADER%${DELIM}}" + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//') SUMMARY_OUTPUT=( "!{meta.id}" "${COUNT_READS_INPUT}" - "${COUNT_READS_OUTPUT}" - "${PERCENT_OUTPUT}" "${COUNT_READS_REMOVED}" "${PERCENT_REMOVED}" + "${COUNT_READS_OUTPUT}" + "${PERCENT_OUTPUT}" ) - SUMMARY_OUTPUT=$(printf "%s${DELIM}" "${SUMMARY_OUTPUT[@]}") - SUMMARY_OUTPUT="${SUMMARY_OUTPUT%${DELIM}}" + SUMMARY_OUTPUT=$(printf "%s\t" "${SUMMARY_OUTPUT[@]}" | sed 's/\t$//') # Store input/output counts - echo -e "${SUMMARY_HEADER}" > !{meta.id}.Hostile-Removal.tsv - echo -e "${SUMMARY_OUTPUT}" >> !{meta.id}.Hostile-Removal.tsv + echo -e "${SUMMARY_HEADER}" > !{meta.id}.Hostile_Removal.tsv + echo -e "${SUMMARY_OUTPUT}" >> !{meta.id}.Hostile_Removal.tsv + + ### Calculate SHA-512 Checksums of each FastQ file ### + SUMMARY_HEADER=( + "Sample_name" + "Checksum_(SHA-512)" + "File" + ) + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//') + + echo "${SUMMARY_HEADER}" > "!{meta.id}.Hostile_FastQ.SHA512-checksums.tsv" + + # Calculate checksums + for f in "${RELATIVE_OUTPATH_R1}" "${RELATIVE_OUTPATH_R2}"; do + echo -ne "!{meta.id}\t" >> "!{meta.id}.Hostile_FastQ.SHA512-checksums.tsv" + BASE="$(basename ${f})" + HASH=$(zcat "${f}" | awk 'NR%2==0' | paste - - | sort -k1,1 | sha512sum | awk '{print $1}') + echo -e "${HASH}\t${BASE}" + done >> "!{meta.id}.Hostile_FastQ.SHA512-checksums.tsv" # Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": + sha512sum: $(sha512sum --version | grep ^sha512sum | sed 's/sha512sum //1') hostile: $(hostile --version) END_VERSIONS ''' diff --git a/modules/local/remove_host_sra_human_scrubber/main.nf b/modules/local/remove_host_sra_human_scrubber/main.nf index 717c9844..cecd80f6 100644 --- a/modules/local/remove_host_sra_human_scrubber/main.nf +++ b/modules/local/remove_host_sra_human_scrubber/main.nf @@ -11,12 +11,13 @@ process REMOVE_HOST_SRA_HUMAN_SCRUBBER { output: tuple val(meta), path("${meta.id}.SRA_Human_Scrubber_FastQ_File.tsv"), emit: qc_filecheck tuple val(meta), path("${meta.id}_R*_scrubbed.fastq.gz") , emit: host_removed_reads - path("${meta.id}.SRA-Human-Scrubber-Removal.tsv") , emit: summary + path("${meta.id}.SRA_Human_Scrubber_Removal.tsv") , emit: summary + path("${meta.id}.SRA_Human_Scrubber_FastQ.SHA512-checksums.tsv") , emit: checksums path(".command.{out,err}") path("versions.yml") , emit: versions shell: - // TODO: Use container built on Ubuntu + // TODO: Use container built on Ubuntu to avoid filesize conversion (`find` is different here) minFilesize = 
params.min_filesize_fastq_sra_human_scrubber_removed min_filesize_output_fastq = ( ['c','b','k'].contains(minFilesize[-1])) ? "${minFilesize}" : "25000k" ''' @@ -28,33 +29,39 @@ process REMOVE_HOST_SRA_HUMAN_SCRUBBER { # https://github.com/ncbi/sra-human-scrubber/issues/20#issuecomment-1414392052 # Remove Host Reads - msg "INFO: Removing host reads using SRA Human Scrubber" + msg "INFO: Removing host reads from !{meta.id} using SRA Human Scrubber with !{task.memory} RAM ..." # NOTE: input only able to handle FastQ, not compressed # NOTE: output .gz filename doesn't compress, so requires gzip # NOTE: no handling for PE, do paired files 1-by-1 # scrub the R1 FastQ file if [[ "!{reads[0]}" =~ .gz ]]; then + msg "INFO: Removing host reads from GZ compressed R1 file "!{reads[0]}" ..." + zcat "!{reads[0]}" | \ scrub.sh \ - -d !{database} \ - -p !{task.cpus} \ + -d "!{database}" \ + -p "!{task.cpus}" \ -x \ -o "!{meta.id}_R1_scrubbed.fastq" \ 2> scrub_R1.stderr.txt - gzip "!{meta.id}_R1_scrubbed.fastq" + gzip -f "!{meta.id}_R1_scrubbed.fastq" + else + msg "INFO: Removing host reads from uncompressed R1 file "!{reads[0]}" ..." + scrub.sh \ -i "!{reads[0]}" \ - -d !{database} \ - -p !{task.cpus} \ + -d "!{database}" \ + -p "!{task.cpus}" \ -x \ -o "!{meta.id}_R1_scrubbed.fastq" \ 2> scrub_R1.stderr.txt - gzip "!{meta.id}_R1_scrubbed.fastq" + gzip -f "!{meta.id}_R1_scrubbed.fastq" fi + msg "INFO: Completed host reads removal from R1 file of !{meta.id}" # Parse R1 counts input/output/removed R1_COUNT_READS_INPUT=$(grep 'total read count:' scrub_R1.stderr.txt \ @@ -64,26 +71,32 @@ process REMOVE_HOST_SRA_HUMAN_SCRUBBER { # scrub the R2 FastQ file if [[ "!{reads[1]}" =~ .gz ]]; then + msg "INFO: Removing host reads from GZ compressed R2 file "!{reads[1]}" ..." + zcat "!{reads[1]}" | \ scrub.sh \ - -d !{database} \ - -p !{task.cpus} \ + -d "!{database}" \ + -p "!{task.cpus}" \ -x \ -o "!{meta.id}_R2_scrubbed.fastq" \ 2> scrub_R2.stderr.txt - gzip "!{meta.id}_R2_scrubbed.fastq" + gzip -f "!{meta.id}_R2_scrubbed.fastq" + else + msg "INFO: Removing host reads from uncompressed R2 file "!{reads[1]}" ..." + scrub.sh \ -i "!{reads[1]}" \ - -d !{database} \ - -p !{task.cpus} \ + -d "!{database}" \ + -p "!{task.cpus}" \ -x \ -o "!{meta.id}_R2_scrubbed.fastq" \ 2> scrub_R2.stderr.txt - gzip "!{meta.id}_R2_scrubbed.fastq" + gzip -f "!{meta.id}_R2_scrubbed.fastq" fi + msg "INFO: Completed host reads removal from R2 file of !{meta.id}" # Parse R2 counts input/output/removed R2_COUNT_READS_INPUT=$(grep 'total read count:' scrub_R2.stderr.txt \ @@ -102,6 +115,8 @@ process REMOVE_HOST_SRA_HUMAN_SCRUBBER { fi done + msg "INFO: Completed QC filesize checks of !{meta.id} for R1 and R2 after SRA Human Scrubbing" + # Summarize R1 and R2 counts input/output/removed COUNT_READS_INPUT=$(("${R1_COUNT_READS_INPUT}"+"${R2_COUNT_READS_INPUT}")) COUNT_READS_REMOVED=$(("${R1_COUNT_READS_REMOVED}"+"${R2_COUNT_READS_REMOVED}")) @@ -112,13 +127,13 @@ process REMOVE_HOST_SRA_HUMAN_SCRUBBER { | awk '{proportion=$1/$2} END{printf("%.6f", 100-(proportion*100))}') # Ensure all values parsed properly from stderr output - for val in $COUNT_READS_INPUT $COUNT_READS_OUTPUT $COUNT_READS_REMOVED; do + for val in "${COUNT_READS_INPUT}" "${COUNT_READS_OUTPUT}" "${COUNT_READS_REMOVED}"; do if [[ ! "${val}" =~ [0-9] ]]; then msg "ERROR: expected integer parsed from SRA Human Scrubber stderr instead of:${val}" >&2 exit 1 fi done - for val in $PERCENT_REMOVED $PERCENT_OUTPUT; do + for val in "${PERCENT_REMOVED}" "${PERCENT_OUTPUT}"; do if [[ ! 
"${val}" =~ [0-9.] ]]; then msg "ERROR: expected percentage parsed from SRA Human Scrubber stderr instead of:${val}" >&2 exit 1 @@ -130,38 +145,54 @@ process REMOVE_HOST_SRA_HUMAN_SCRUBBER { msg "INFO: ${PERCENT_REMOVED}% of input reads were removed (${COUNT_READS_REMOVED} reads)" msg "INFO: ${COUNT_READS_OUTPUT} non-host reads (${PERCENT_OUTPUT}%) were retained" - DELIM=$'\t' SUMMARY_HEADER=( - "Sample name" - "# Input reads" - "# Output reads" - "% Output reads" - "# Removed reads" - "% Removed reads" + "Sample_name" + "Input_reads_(#)" + "Removed_reads_(#)" + "Removed_reads_(%)" + "Output_reads_(#)" + "Output_reads_(%)" ) - SUMMARY_HEADER=$(printf "%s${DELIM}" "${SUMMARY_HEADER[@]}") - SUMMARY_HEADER="${SUMMARY_HEADER%${DELIM}}" + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//') SUMMARY_OUTPUT=( "!{meta.id}" "${COUNT_READS_INPUT}" - "${COUNT_READS_OUTPUT}" - "${PERCENT_OUTPUT}" "${COUNT_READS_REMOVED}" "${PERCENT_REMOVED}" + "${COUNT_READS_OUTPUT}" + "${PERCENT_OUTPUT}" ) - SUMMARY_OUTPUT=$(printf "%s${DELIM}" "${SUMMARY_OUTPUT[@]}") - SUMMARY_OUTPUT="${SUMMARY_OUTPUT%${DELIM}}" + SUMMARY_OUTPUT=$(printf "%s\t" "${SUMMARY_OUTPUT[@]}" | sed 's/\t$//') # Store input/output counts - echo -e "${SUMMARY_HEADER}" > !{meta.id}.SRA-Human-Scrubber-Removal.tsv - echo -e "${SUMMARY_OUTPUT}" >> !{meta.id}.SRA-Human-Scrubber-Removal.tsv + echo -e "${SUMMARY_HEADER}" > "!{meta.id}.SRA_Human_Scrubber_Removal.tsv" + echo -e "${SUMMARY_OUTPUT}" >> "!{meta.id}.SRA_Human_Scrubber_Removal.tsv" + + ### Calculate SHA-512 Checksums of each FastQ file ### + SUMMARY_HEADER=( + "Sample_name" + "Checksum_(SHA-512)" + "File" + ) + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//') + + echo "${SUMMARY_HEADER}" > "!{meta.id}.SRA_Human_Scrubber_FastQ.SHA512-checksums.tsv" + + # Calculate checksums + for f in "!{meta.id}_R1_scrubbed.fastq.gz" "!{meta.id}_R2_scrubbed.fastq.gz"; do + echo -ne "!{meta.id}\t" >> "!{meta.id}.SRA_Human_Scrubber_FastQ.SHA512-checksums.tsv" + BASE="$(basename ${f})" + HASH=$(zcat "${f}" | awk 'NR%2==0' | paste - - | sort -k1,1 | sha512sum | awk '{print $1}') + echo -e "${HASH}\t${BASE}" + done >> "!{meta.id}.SRA_Human_Scrubber_FastQ.SHA512-checksums.tsv" # Get process version information # NOTE: currently no option to print the software version number, but # track this issue https://github.com/ncbi/sra-human-scrubber/issues/28 cat <<-END_VERSIONS > versions.yml "!{task.process}": + sha512sum: $(sha512sum --version | grep ^sha512sum | sed 's/sha512sum //1') sra-human-scrubber: 2.2.1 END_VERSIONS ''' diff --git a/modules/local/remove_phix_bbduk/main.nf b/modules/local/remove_phix_bbduk/main.nf index 1457f460..aa67286e 100644 --- a/modules/local/remove_phix_bbduk/main.nf +++ b/modules/local/remove_phix_bbduk/main.nf @@ -1,44 +1,43 @@ process REMOVE_PHIX_BBDUK { - label "process_low" + label "process_high" tag { "${meta.id}" } container "snads/bbtools@sha256:9f2a9b08563839cec87d856f0fc7607c235f464296fd71e15906ea1d15254695" + // NOTE: "staphb/bbtools@sha256-f7b98063910e2e3b5be12f62076ec5cfdeaa562a01596758feb9a892ce18a363" + // somtimes gives "bbtools bgzip: error while loading shared libraries: libcurl-gnutls.so.4: cannot open shared object file" + // error with some samples (e.g., SRR14718846). Need to upgrade but find out what we're doing different. + // Dockerfile or cmd difference issue? or both? 
input: tuple val(meta), path(reads) path phix_reference_file output: - tuple val(meta), path("${meta.id}_noPhiX-R{1,2}.fsq"), emit: phix_removed_reads - tuple val(meta), path("${meta.id}.PhiX*_File.tsv") , emit: qc_filecheck - path("${meta.id}.BBDuk.tsv") , emit: summary + tuple val(meta), path("${meta.id}_noPhiX-R{1,2}.fsq") , emit: phix_removed_reads + tuple val(meta), path("${meta.id}.PhiX_Genome_File.tsv"), emit: qc_filecheck + path("${meta.id}.PhiX.tsv") , emit: summary + path("${meta.id}.noPhiX_FastQ.SHA256-checksums.tsv") , emit: checksums path(".command.{out,err}") - path("versions.yml") , emit: versions + path("versions.yml") , emit: versions shell: ''' source bash_functions.sh # Verify PhiX reference file size - echo -e "Sample name\\tQC step\\tOutcome (Pass/Fail)" > "!{meta.id}.PhiX_Genome_File.tsv" + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}.PhiX_Genome_File.tsv" if verify_minimum_file_size !{phix_reference_file} 'PhiX Genome' "!{params.min_filesize_phix_genome}"; then - echo -e "!{meta.id}\\tPhiX Genome\\tPASS" >> "!{meta.id}.PhiX_Genome_File.tsv" + echo -e "!{meta.id}\tPhiX Genome FastA File\tPASS" >> "!{meta.id}.PhiX_Genome_File.tsv" else - echo -e "!{meta.id}\\tPhiX Genome\\tFAIL" >> "!{meta.id}.PhiX_Genome_File.tsv" + echo -e "!{meta.id}\tPhiX Genome FastA File\tFAIL" >> "!{meta.id}.PhiX_Genome_File.tsv" fi - # Auto reformat FastQ files - # msg "INFO: Auto reformatting FastQ files.." - # for read in !{reads}; do - # reformat.sh \ - # in="${read}" \ - # out="reformatted.${read}" \ - # tossbrokenreads=t - # done - # Remove PhiX - msg "INFO: Removing PhiX using BBDuk.." + msg "INFO: Removing PhiX from !{meta.id} using BBDuk..." + # NOTE: With excess sequence reads, it is very normal and possible to see initial error of + # "NOTE: Process `ASSEMBLY:REMOVE_PHIX_BBDUK (name)` terminated with an error exit status (140) -- Execution is retried (1)" + # But an automatic retry in the workflow with increase RAM should process the bulky sample just fine. 
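# For context on the retry mentioned above: the resubmission with more RAM is
# driven by the pipeline's label-based resource configuration, not by this
# script. A minimal sketch of the usual pattern, assuming an nf-core-style base
# config (the actual label names, limits, and exit-status list in this
# pipeline's config may differ):
#
#   process {
#       withLabel: process_high {
#           memory        = { 64.GB * task.attempt }
#           errorStrategy = { task.exitStatus in [104, 134, 137, 139, 140, 143] ? 'retry' : 'finish' }
#           maxRetries    = 2
#       }
#   }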
bbduk.sh \ k=31 \ hdist=1 \ @@ -52,13 +51,15 @@ process REMOVE_PHIX_BBDUK { out2=!{meta.id}_noPhiX-R2.fsq \ ref="!{phix_reference_file}" - echo -e "Sample name\\tQC step\\tOutcome (Pass/Fail)" > "!{meta.id}.PhiX-removed_FastQ_File.tsv" + msg "INFO: PhiX removed from !{meta.id} using BBDuk" + + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}.PhiX-removed_FastQ_File.tsv" for suff in R1.fsq R2.fsq; do if verify_minimum_file_size "!{meta.id}_noPhiX-${suff}" 'PhiX-removed FastQ Files' "!{params.min_filesize_fastq_phix_removed}"; then - echo -e "!{meta.id}\\tPhiX-removed FastQ ($suff) File\\tPASS" \ + echo -e "!{meta.id}\tPhiX-removed (${suff}) FastQ File\tPASS" \ >> "!{meta.id}.PhiX-removed_FastQ_File.tsv" else - echo -e "!{meta.id}\\tPhiX-removed FastQ ($suff) File\\tFAIL" \ + echo -e "!{meta.id}\tPhiX-removed (${suff}) FastQ File\tFAIL" \ >> "!{meta.id}.PhiX-removed_FastQ_File.tsv" fi done @@ -84,44 +85,69 @@ process REMOVE_PHIX_BBDUK { NUM_CLEANED_BASES=$(grep '^Result: ' .command.err | awk '{print $5}') PERCENT_CLEANED_BASES=$(grep '^Result: ' .command.err | awk '{print $7}' | sed 's/[()]//g') - msg "INFO: Input contains ${TOT_BASES} bp and $TOT_READS reads" + msg "INFO: Input contains ${TOT_BASES} bp and ${TOT_READS} reads" msg "INFO: ${PHIX_BASES:-0} bp of PhiX were detected and ${PHIX_READS:-0} reads were removed" SUMMARY_HEADER=( - "Sample name", - "# Cleaned reads", - "% Cleaned reads", - "# Cleaned bp", - "% Cleaned bp", - "# PhiX reads", - "% PhiX reads", - "# PhiX Bp", - "% PhiX bp", - "# Raw reads", - "# Raw bp\n" + "Sample_name" + "Cleaned_reads_(#)" + "Cleaned_reads_(%)" + "Cleaned_basepairs_(#)" + "Cleaned_basepairs_(%)" + "PhiX_reads_(#)" + "PhiX_reads_(%)" + "PhiX_basepairs_(#)" + "PhiX_basepairs_(%)" + "Raw_reads_(#)" + "Raw_basepairs_(#)" ) SUMMARY_OUTPUT=( - "!{meta.id}", - "${NUM_CLEANED_READS}", - "${PERCENT_CLEANED_READS}", - "${NUM_CLEANED_BASES}", - "${PERCENT_CLEANED_BASES}", - "${NUM_PHIX_READS}", - "${PERCENT_PHIX_READS}", - "${NUM_PHIX_BASES}", - "${PERCENT_PHIX_BASES}", - "${TOT_READS}", - "${TOT_BASES}\n" + "!{meta.id}" + "${NUM_CLEANED_READS}" + "${PERCENT_CLEANED_READS}" + "${NUM_CLEANED_BASES}" + "${PERCENT_CLEANED_BASES}" + "${NUM_PHIX_READS}" + "${PERCENT_PHIX_READS}" + "${NUM_PHIX_BASES}" + "${PERCENT_PHIX_BASES}" + "${TOT_READS}" + "${TOT_BASES}" + ) + + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//1') + SUMMARY_OUTPUT=$(printf "%s\t" "${SUMMARY_OUTPUT[@]}" | sed 's/\t$//1') + + echo "${SUMMARY_HEADER}" > "!{meta.id}.PhiX.tsv" + echo "${SUMMARY_OUTPUT}" >> "!{meta.id}.PhiX.tsv" + + ### Calculate SHA-256 Checksums of each FastQ file ### + msg "INFO: Calculating checksums for !{meta.id}_noPhiX-R1.fsq !{meta.id}_noPhiX-R2.fsq ..." 
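# How the content-based FastQ digests below (and in the later cleaning steps) are built:
#   awk 'NR%2==0'  keeps only the even-numbered lines of each 4-line FastQ record,
#                  i.e., the sequence and quality strings; read headers and the
#                  '+' separator lines are excluded.
#   paste - -      joins each read's sequence and quality onto one tab-delimited line.
#   sort -k1,1     orders the records by sequence so the digest does not depend on
#                  read order within the file.
#   sha256sum      (sha512sum in later steps) hashes the sorted stream, so two FastQ
#                  files holding the same reads, even shuffled or with renamed
#                  headers, yield the same checksum.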
+ + SUMMARY_HEADER=( + "Sample_name" + "Checksum_(SHA-256)" + "File" ) + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//1') + + echo "${SUMMARY_HEADER}" > "!{meta.id}.noPhiX_FastQ.SHA256-checksums.tsv" + + # Calculate checksums + for f in "!{meta.id}_noPhiX-R1.fsq" "!{meta.id}_noPhiX-R2.fsq"; do + echo -ne "!{meta.id}\t" >> "!{meta.id}.noPhiX_FastQ.SHA256-checksums.tsv" + awk 'NR%2==0' "${f}" | paste - - | sort -k1,1 | sha256sum | awk '{print $1 "\t" "'"$f"'"}' + done >> "!{meta.id}.noPhiX_FastQ.SHA256-checksums.tsv" - printf "%s" "${SUMMARY_HEADER[@]}" | tr ',' '\\t' > "!{meta.id}.BBDuk.tsv" - printf "%s" "${SUMMARY_OUTPUT[@]}" | tr ',' '\\t' >> "!{meta.id}.BBDuk.tsv" + msg "INFO: Calculated checksums for !{meta.id}_noPhiX-R1.fsq !{meta.id}_noPhiX-R2.fsq" # Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": - bbduk: $(bbduk.sh --version 2>&1 | head -n 2 | tail -1 | awk 'NF>1{print $NF}') + find: $(find --version | grep "^find" | sed 's/find //1') + sha256sum: $(sha256sum --version | grep "^sha256sum" | sed 's/sha256sum //1') + bbduk: $(bbduk.sh --version 2>&1 | grep "^BBMap" | awk 'NF>1{print $NF}') END_VERSIONS ''' } diff --git a/modules/local/split_multifasta_assembly_biopython/main.nf b/modules/local/split_multifasta_assembly_biopython/main.nf index 8f915db8..deb435a7 100644 --- a/modules/local/split_multifasta_assembly_biopython/main.nf +++ b/modules/local/split_multifasta_assembly_biopython/main.nf @@ -19,11 +19,11 @@ process SPLIT_MULTIFASTA_ASSEMBLY_BIOPYTHON { # Split assembly multi-record FastA into individual FastA files for each contig if [[ -s "!{assembly}" ]]; then split.multifasta.py \ - --outdir "bins" \ + --ext "!{params.split_multifasta_extension}" \ --infile "!{assembly}" \ - !{no_gaps} \ + --outdir "bins" \ --suffix '' \ - --ext !{params.split_multifasta_extension} + !{no_gaps} \ else msg "ERROR: ${assembly} absent" >&2 @@ -32,8 +32,8 @@ process SPLIT_MULTIFASTA_ASSEMBLY_BIOPYTHON { # Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": - python: $(python --version 2>&1 | awk '{print $2}') biopython: $(python -c 'import Bio; print(Bio.__version__)' 2>&1) + python: $(python --version 2>&1 | awk '{print $2}') END_VERSIONS ''' } diff --git a/modules/local/subsample_reads_to_depth_seqkit/main.nf b/modules/local/subsample_reads_to_depth_seqkit/main.nf new file mode 100644 index 00000000..c511e9d8 --- /dev/null +++ b/modules/local/subsample_reads_to_depth_seqkit/main.nf @@ -0,0 +1,98 @@ +process SUBSAMPLE_READS_TO_DEPTH_SEQKIT { + + label "process_medium" + tag { "${meta.id}" } + container "staphb/seqkit@sha256:8eb09a52ae932f7c25cfbb8db0df7110567087a187c7e90d46f499962d1c82c9" + + input: + tuple val(meta), path(reads), path(depth), path(fraction_of_reads) + + output: + tuple val(meta), path("*.{fastq,fq}.gz", includeInputs: true), emit: reads + path("${meta.id}.Subsampled_FastQ.SHA512-checksums.tsv") , emit: checksums + path(".command.{out,err}") + path("versions.yml") , emit: versions + + shell: + seqkit_seed = (params.seqkit_seed >= 1)? params.seqkit_seed : 947266746 + + ''' + source bash_functions.sh + + fraction_of_reads_to_use=$(cat !{fraction_of_reads}) + initial_depth=$(cat !{depth}) + + depth="!{params.depth}" + + echo "!{params.seqkit_seed}" > seed-value.txt + + if ! 
[[ ${fraction_of_reads_to_use} =~ ^[0-9.]+$ ]]; then + msg "ERROR: Unable to calculate a fraction to subsample; ${fraction_of_reads_to_use} not a floating point value" >&2 + exit 1 + fi + if [ ${depth%.*} -gt 0 ] && [ ${initial_depth%.*} -gt ${depth%.*} ]; then + msg "INFO: Subsampling !{meta.id} R1 with seqkit using seed:!{params.seqkit_seed} ..." + + seqkit sample \ + !{reads[0]} \ + --threads !{task.cpus} \ + --proportion ${fraction_of_reads_to_use} \ + --rand-seed "!{params.seqkit_seed}" \ + --out-file "!{meta.id}_R1.subsampled.fastq.gz" \ + 2> seqkit.R1.stderr.txt + + msg "INFO: Subsampling !{meta.id} R2 with seqkit using seed:!{params.seqkit_seed} ..." + + seqkit sample \ + !{reads[1]} \ + --threads !{task.cpus} \ + --proportion ${fraction_of_reads_to_use} \ + --rand-seed "!{params.seqkit_seed}" \ + --out-file "!{meta.id}_R2.subsampled.fastq.gz" \ + 2> seqkit.R2.stderr.txt + + msg "INFO: Completed subsampling of R1 and R2 by seqkit" + + # Discard symlink infiles to avoid them being passed as outfiles when + # subsampling occurred. + rm -f !{reads[0]} !{reads[1]} + + number_output_R1_sequences=$(grep 'sequences outputted' seqkit.R1.stderr.txt | awk '{print $2}') + number_output_R2_sequences=$(grep 'sequences outputted' seqkit.R2.stderr.txt | awk '{print $2}') + + msg "INFO: Subsampled reads contain: ${number_output_R1_sequences} and ${number_output_R2_sequences} sequences" + + else + # The input FastQ files that were never subsampled will get passed on + # as outputs here with the 'includeInputs: true' + msg "INFO: Subsampling not requested or required for !{meta.id}" + touch "!{meta.id}.Subsampled_FastQ.SHA512-checksums.tsv" versions.yml + exit 0 + fi + + ### Calculate SHA-512 Checksums of each FastQ file ### + SUMMARY_HEADER=( + "Sample_name" + "Checksum_(SHA-512)" + "File" + ) + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//') + + echo "${SUMMARY_HEADER}" > "!{meta.id}.Subsampled_FastQ.SHA512-checksums.tsv" + + # Calculate checksums + for f in "!{meta.id}_R1.subsampled.fastq.gz" "!{meta.id}_R2.subsampled.fastq.gz"; do + echo -ne "!{meta.id}\t" >> "!{meta.id}.Subsampled_FastQ.SHA512-checksums.tsv" + zcat "${f}" | awk 'NR%2==0' | paste - - | sort -k1,1 | sha512sum | awk '{print $1 "\t" "'"${f}"'"}' + done >> "!{meta.id}.Subsampled_FastQ.SHA512-checksums.tsv" + + msg "INFO: calculated checksums for !{meta.id}_R1.subsampled.fastq.gz !{meta.id}_R2.subsampled.fastq.gz" + + # Get process version information + cat <<-END_VERSIONS > versions.yml + "!{task.process}": + sha512sum: $(sha512sum --version | grep ^sha512sum | sed 's/sha512sum //1') + seqkit: $(seqkit 2>&1 | grep "^Version: " | sed 's/^Version: //1') + END_VERSIONS + ''' +} diff --git a/modules/local/subsample_reads_to_depth_seqkit/params.config b/modules/local/subsample_reads_to_depth_seqkit/params.config new file mode 100644 index 00000000..436488d5 --- /dev/null +++ b/modules/local/subsample_reads_to_depth_seqkit/params.config @@ -0,0 +1,4 @@ +params { + // Set random seed value for reproducibility + seqkit_seed = 947266746 +} diff --git a/modules/local/subsample_reads_to_depth_seqtk/main.nf b/modules/local/subsample_reads_to_depth_seqtk/main.nf index 0b0e8d91..e69fd5af 100644 --- a/modules/local/subsample_reads_to_depth_seqtk/main.nf +++ b/modules/local/subsample_reads_to_depth_seqtk/main.nf @@ -1,5 +1,6 @@ process SUBSAMPLE_READS_TO_DEPTH_SEQTK { + label "process_medium" tag { "${meta.id}" } container "staphb/seqtk@sha256:e3105ea1c7375e6bfe0603f6e031b022068b3d4d529f295c5fa24e0a6709dd2c" @@ -8,10 +9,13 @@ 
process SUBSAMPLE_READS_TO_DEPTH_SEQTK { output: tuple val(meta), path("*.{fastq,fq}.gz", includeInputs: true), emit: reads + path("${meta.id}.Subsampled_FastQ.SHA512-checksums.tsv") , emit: checksums path(".command.{out,err}") path("versions.yml") , emit: versions shell: + seqtk_seed = (params.seqtk_seed >= 1)? params.seqtk_seed : 947266746 + ''' source bash_functions.sh @@ -20,32 +24,72 @@ process SUBSAMPLE_READS_TO_DEPTH_SEQTK { depth="!{params.depth}" + echo "!{params.seqtk_seed}" > seed-value.txt + if ! [[ ${fraction_of_reads_to_use} =~ ^[0-9.]+$ ]]; then msg "ERROR: Unable to calculate a fraction to subsample; ${fraction_of_reads_to_use} not a floating point value" >&2 exit 1 fi if [ ${depth%.*} -gt 0 ] && [ ${initial_depth%.*} -gt ${depth%.*} ]; then - seqtk sample !{reads[0]} ${fraction_of_reads_to_use} > "!{meta.id}_R1.subsampled.fastq" - seqtk sample !{reads[1]} ${fraction_of_reads_to_use} > "!{meta.id}_R2.subsampled.fastq" + msg "INFO: Subsampling !{meta.id} R1 with seqtk using seed:!{params.seqtk_seed} ..." + + seqtk sample \ + -s "!{params.seqtk_seed}" \ + !{reads[0]} \ + ${fraction_of_reads_to_use} \ + > "!{meta.id}_R1.subsampled.fastq" + + msg "INFO: Subsampling !{meta.id} R2 with seqtk using seed:!{params.seqtk_seed} ..." + + seqtk sample \ + -s "!{params.seqtk_seed}" \ + !{reads[1]} \ + ${fraction_of_reads_to_use} \ + > "!{meta.id}_R2.subsampled.fastq" + # Discard symlink infiles to avoid them being passed as outfiles when + # subsampling occurred. rm -f !{reads[0]} !{reads[1]} - gzip -9f "!{meta.id}_R1.subsampled.fastq" \ + gzip -9f \ + "!{meta.id}_R1.subsampled.fastq" \ "!{meta.id}_R2.subsampled.fastq" + msg "INFO: Subsampled !{meta.id} R1 and R2 with seqtk" + else + # The input FastQ files that were never subsampled will get passed on + # as outputs here with the 'includeInputs: true' msg "INFO: Subsampling not requested or required" - ### NOTE: - ### this gonna be tricky?! - ### pass onto phix-remove-bbduk either subsampled reads() or initial input reads() tuple + touch "!{meta.id}.Subsampled_FastQ.SHA512-checksums.tsv" versions.yml + exit 0 fi ### number of contigs and repeats elements with Lander-Waterman statistics ### https://pubmed.ncbi.nlm.nih.gov/7497129/ ???
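# Worked example of the subsampling gate above: with a target depth of 100x
# (params.depth) and an estimated input depth of 250x, the fraction file written
# upstream is expected to hold 100/250 = 0.4, so seqtk keeps roughly 40% of the
# read pairs (reproducibly, because R1 and R2 use the same fixed seed). When the
# estimated depth is already at or below the target, the integer comparison on
# the truncated values (${depth%.*} and ${initial_depth%.*}) fails and the
# original FastQ files are passed through untouched via 'includeInputs: true'.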
+ ### Calculate SHA-512 Checksums of each FastQ file ### + SUMMARY_HEADER=( + "Sample_name" + "Checksum_(SHA-512)" + "File" + ) + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//') + + echo "${SUMMARY_HEADER}" > "!{meta.id}.Subsampled_FastQ.SHA512-checksums.tsv" + + # Calculate checksums + for f in "!{meta.id}_R1.subsampled.fastq.gz" "!{meta.id}_R2.subsampled.fastq.gz"; do + echo -ne "!{meta.id}\t" >> "!{meta.id}.Subsampled_FastQ.SHA512-checksums.tsv" + zcat "${f}" | awk 'NR%2==0' | paste - - | sort -k1,1 | sha512sum | awk '{print $1 "\t" "'"${f}"'"}' + done >> "!{meta.id}.Subsampled_FastQ.SHA512-checksums.tsv" + + msg "INFO: calculated checksums for !{meta.id}_R1.subsampled.fastq.gz !{meta.id}_R2.subsampled.fastq.gz" + # Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": + sha512sum: $(sha512sum --version | grep ^sha512sum | sed 's/sha512sum //1') seqtk: $(seqtk 2>&1 | grep "^Version: " | sed 's/^Version: //1') END_VERSIONS ''' diff --git a/modules/local/subsample_reads_to_depth_seqtk/params.config b/modules/local/subsample_reads_to_depth_seqtk/params.config new file mode 100644 index 00000000..f38f7320 --- /dev/null +++ b/modules/local/subsample_reads_to_depth_seqtk/params.config @@ -0,0 +1,4 @@ +params { + // Set random seed value for reproducibility + seqtk_seed = 947266746 +} diff --git a/modules/local/trim_reads_fastp/main.nf b/modules/local/trim_reads_fastp/main.nf index c439ba7c..528661f5 100755 --- a/modules/local/trim_reads_fastp/main.nf +++ b/modules/local/trim_reads_fastp/main.nf @@ -9,12 +9,13 @@ process TRIM_READS_FASTP { path(adapter_reference_file) output: - tuple val(meta), path("${meta.id}.Adapter*_File.tsv") , emit: qc_filecheck - tuple val(meta), path("${meta.id}*{paired,single}.fq"), emit: fastq_adapters_removed - path("${meta.id}.fastp.tsv") , emit: summary + tuple val(meta), path("${meta.id}.Adapter*_Fast*_File.tsv"), emit: qc_filecheck // regex grabs 2 QC Files here + tuple val(meta), path("${meta.id}*{paired,single}.fq") , emit: fastq_adapters_removed + path("${meta.id}.Fastp.tsv") , emit: summary path("${meta.id}.fastp.*") + path("${meta.id}.Trim_FastQ.SHA512-checksums.tsv") , emit: checksums path(".command.{out,err}") - path("versions.yml") , emit: versions + path("versions.yml") , emit: versions shell: adapter_fasta = adapter_reference_file ? "--adapter_fasta ${adapter_reference_file}" : "" @@ -32,7 +33,7 @@ process TRIM_READS_FASTP { fi # Adapter clip and quality trim - msg "INFO: Performing read trimming with fastp" + msg "INFO: Performing read trimming on !{meta.id} with Fastp ..." # Run fastp fastp \ @@ -52,45 +53,93 @@ process TRIM_READS_FASTP { --html !{meta.id}.fastp.html \ --thread !{task.cpus} - # Calculate number of reads discarded - READ_COUNT_INPUT=$( - grep -A3 '"before_filtering"' !{meta.id}.fastp.json \ + msg "INFO: Completed read trimming on !{meta.id} with Fastp" + + # Parse input, discard, and output counts (I really wish `jq` was in this container!) 
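# Note on the JSON parsing below: fastp nests the run-level totals roughly as
#   {"summary": {"before_filtering": {"total_reads": N, "total_bases": N, ...},
#                "after_filtering":  {"total_reads": N, "total_bases": N, ...}}, ...}
# and anchoring grep on the quoted key '"before_filtering"' keeps it from also
# matching the per-mate "read1_before_filtering"/"read2_before_filtering" blocks.
# If jq were available in the container, the equivalent lookup would be, e.g.:
#   jq '.summary.before_filtering.total_reads' "!{meta.id}.fastp.json"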
+ NUM_INPUT_READS=$( + grep -A 10 '"before_filtering"' "!{meta.id}.fastp.json" \ | grep "total_reads" \ - | sed 's/.*://g;s/,//g' + | sed 's/.*://1;s/,//1' ) - READ_COUNT_OUTPUT=$( - grep -A3 '"after_filtering"' !{meta.id}.fastp.json \ - | grep "total_reads" \ - | sed 's/.*://g;s/,//g' + NUM_INPUT_BASES=$( + grep -A 10 '"before_filtering"' "!{meta.id}.fastp.json" \ + | grep "total_bases" \ + | sed 's/.*://1;s/,//1' ) - READ_COUNT_DISCARDED=$((${READ_COUNT_INPUT} - ${READ_COUNT_OUTPUT})) - - msg "INFO: ${READ_COUNT_DISCARDED} reads are poor quality and were discarded" >&2 - - # Count up the total number of broken sister reads - COUNT_BROKEN_R1=$(awk '{lines++} END{print lines/4}' !{meta.id}_R1.unpaired.fq) - COUNT_BROKEN_R2=$(awk '{lines++} END{print lines/4}' !{meta.id}_R2.unpaired.fq) - COUNT_BROKEN_TOTAL=$((${COUNT_BROKEN_R1} + ${COUNT_BROKEN_R2})) + NUM_OUTPUT_READS=$( + grep -A 10 '"after_filtering"' "!{meta.id}.fastp.json" \ + | grep "total_reads" \ + | sed 's/.*://1;s/,//1' + ) - # Log report the total counts of singletons - msg "INFO: $COUNT_BROKEN_R1 forward reads lacked a high quality R2 sister read" >&2 - msg "INFO: $COUNT_BROKEN_R2 reverse reads lacked a high quality R1 sister read" >&2 - msg "INFO: $COUNT_BROKEN_TOTAL total broken read pairs were saved as singletons" >&2 + NUM_OUTPUT_BASES=$( + grep -A 10 '"after_filtering"' "!{meta.id}.fastp.json" \ + | grep "total_bases" \ + | sed 's/.*://1;s/,//1' + ) - # Create report file of reads removed and broken - echo -e "!{meta.id}\t${READ_COUNT_DISCARDED}\t${COUNT_BROKEN_TOTAL}" > !{meta.id}.fastp.tsv - sed -i '1i Sample name\t# discarded reads\t# singleton reads' !{meta.id}.fastp.tsv + NUM_REMOVED_READS=$((${NUM_INPUT_READS} - ${NUM_OUTPUT_READS})) + + PERCENT_REMOVED_READS=$(echo "${NUM_OUTPUT_READS}" "${NUM_INPUT_READS}" \ + | awk '{proportion=$1/$2} END{printf("%.6f", 100-(proportion*100))}') + + NUM_REMOVED_BASES=$((${NUM_INPUT_BASES} - ${NUM_OUTPUT_BASES})) + + PERCENT_REMOVED_BASES=$(echo "${NUM_OUTPUT_BASES}" "${NUM_INPUT_BASES}" \ + | awk '{proportion=$1/$2} END{printf("%.6f", 100-(proportion*100))}') + + PERCENT_OUTPUT_READS=$(echo "${NUM_REMOVED_READS}" "${NUM_INPUT_READS}" \ + | awk '{proportion=$1/$2} END{printf("%.6f", 100-(proportion*100))}') + + PERCENT_OUTPUT_BASES=$(echo "${NUM_REMOVED_BASES}" "${NUM_INPUT_BASES}" \ + | awk '{proportion=$1/$2} END{printf("%.6f", 100-(proportion*100))}') + + # Form and create a summary file of input, discarded, and output + SUMMARY_HEADER=( + "Sample_name" + "Input_reads_(#)" + "Input_basepairs_(#)" + "Removed_reads_(#)" + "Removed_reads_(%)" + "Removed_basepairs_(#)" + "Removed_basepairs_(%)" + "Output_reads_(#)" + "Output_reads_(%)" + "Output_basepairs_(#)" + "Output_basepairs_(%)" + ) + + SUMMARY_OUTPUT=( + "!{meta.id}" + "${NUM_INPUT_READS}" + "${NUM_INPUT_BASES}" + "${NUM_REMOVED_READS}" + "${PERCENT_REMOVED_READS}" + "${NUM_REMOVED_BASES}" + "${PERCENT_REMOVED_BASES}" + "${NUM_OUTPUT_READS}" + "${PERCENT_OUTPUT_READS}" + "${NUM_OUTPUT_BASES}" + "${PERCENT_OUTPUT_BASES}" + ) + + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//1') + SUMMARY_OUTPUT=$(printf "%s\t" "${SUMMARY_OUTPUT[@]}" | sed 's/\t$//1') + + echo "${SUMMARY_HEADER}" > "!{meta.id}.Fastp.tsv" + echo "${SUMMARY_OUTPUT}" >> "!{meta.id}.Fastp.tsv" # Test/verify paired FastQ outfiles sizes are reasonable to continue + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > !{meta.id}.Adapter_and_QC_Trimmed_FastQ_File.tsv for suff in R1.paired.fq R2.paired.fq; do if verify_minimum_file_size 
"!{meta.id}_${suff}" 'Adapter-removed FastQ Files' "!{params.min_filesize_fastq_adapters_removed}"; then echo -e "!{meta.id}\tAdapter-removed ($suff) FastQ File\tPASS" \ - >> !{meta.id}.Adapter-removed_FastQ_File.tsv + >> !{meta.id}.Adapter_and_QC_Trimmed_FastQ_File.tsv else echo -e "!{meta.id}\tAdapter-removed ($suff) FastQ File\tFAIL" \ - >> !{meta.id}.Adapter-removed_FastQ_File.tsv + >> !{meta.id}.Adapter_and_QC_Trimmed_FastQ_File.tsv fi done @@ -98,9 +147,30 @@ process TRIM_READS_FASTP { cat !{meta.id}_R1.unpaired.fq !{meta.id}_R2.unpaired.fq >> !{meta.id}_single.fq rm -f !{meta.id}_R1.unpaired.fq !{meta.id}_R2.unpaired.fq + ### Calculate SHA-512: Checksums of each FastQ file ### + msg "INFO: Calculating checksums for !{meta.id}_R1.paired.fq and !{meta.id}_R2.paired.fq ..." + + SUMMARY_HEADER=( + "Sample_name" + "Checksum_(SHA-512)" + "File" + ) + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//') + + echo "${SUMMARY_HEADER}" > "!{meta.id}.Trim_FastQ.SHA512-checksums.tsv" + + # Calculate checksums + for f in "!{meta.id}_R1.paired.fq" "!{meta.id}_R2.paired.fq" "!{meta.id}_single.fq"; do + echo -ne "!{meta.id}\t" >> "!{meta.id}.Trim_FastQ.SHA512-checksums.tsv" + awk 'NR%2==0' "${f}" | paste - - | sort -k1,1 | sha512sum | awk '{print $1 "\t" "'"${f}"'"}' + done >> "!{meta.id}.Trim_FastQ.SHA512-checksums.tsv" + + msg "INFO: Calculated checksums for !{meta.id}_R1.paired.fq and !{meta.id}_R2.paired.fq" + # Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": + sha512sum: $(sha512sum --version | grep ^sha512sum | sed 's/sha512sum //1') fastp: $(fastp --version 2>&1 | awk 'NF>1{print $NF}') END_VERSIONS ''' diff --git a/modules/local/trim_reads_trimmomatic/main.nf b/modules/local/trim_reads_trimmomatic/main.nf index 33242fb5..0a0a63d2 100644 --- a/modules/local/trim_reads_trimmomatic/main.nf +++ b/modules/local/trim_reads_trimmomatic/main.nf @@ -2,18 +2,19 @@ process TRIM_READS_TRIMMOMATIC { label "process_high" tag { "${meta.id}" } - container "snads/trimmomatic@sha256:afbb19fdf540e6bd508b657e8dafffb6b411b5b0bf0e302347889220a0b571f1" + container "staphb/trimmomatic@sha256:57b673e66313e355a447e4fa1a78fd3ba1ae3ddd8c8f91358efe99140acb5ddb" input: - tuple val(meta), path(noPhiX) + tuple val(meta), path(reads) path(adapter_reference_file) output: - tuple val(meta), path("${meta.id}.Adapter*_File.tsv") , emit: qc_filecheck - tuple val(meta), path("${meta.id}*{paired,single}.fq"), emit: fastq_adapters_removed - path("${meta.id}.Trimmomatic.tsv") , emit: summary + tuple val(meta), path("${meta.id}.Adapter*_Fast*_File.tsv"), emit: qc_filecheck // regex grabs 2 QC Files here + tuple val(meta), path("${meta.id}*{paired,single}.fq") , emit: fastq_adapters_removed + path("${meta.id}.Trimmomatic.tsv") , emit: summary + path("${meta.id}.Trim_FastQ.SHA512-checksums.tsv") , emit: checksums path(".command.{out,err}") - path("versions.yml") , emit: versions + path("versions.yml") , emit: versions shell: keep_both_reads = params.trimmomatic_keep_both_reads ? 
'TRUE' : 'FALSE' @@ -33,70 +34,137 @@ process TRIM_READS_TRIMMOMATIC { source bash_functions.sh # Verify adapter reference file size - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}.Adapters_FastA_File.tsv" - if verify_minimum_file_size !{adapter_reference_file} 'Adapters FastA' "!{params.min_filesize_adapters}"; then + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}.Adapters_FastA_File.tsv" + if verify_minimum_file_size "!{adapter_reference_file}" 'Adapters FastA' "!{params.min_filesize_adapters}"; then echo -e "!{meta.id}\tAdapters FastA File\tPASS" >> "!{meta.id}.Adapters_FastA_File.tsv" else echo -e "!{meta.id}\tAdapters FastA File\tFAIL" >> "!{meta.id}.Adapters_FastA_File.tsv" fi # Adapter clip and quality trim - msg "INFO: Performing read trimming with Trimmomatic" + msg "INFO: Performing read trimming on !{meta.id} with Trimmomatic ..." + # NOTE: *order* matters on trimming here with Trimmomatic!!! trimmomatic PE \ - !{phred} \ - -threads !{task.cpus} \ - !{noPhiX[0]} !{noPhiX[1]} \ - !{meta.id}_R1.paired.fq !{meta.id}_R1.unpaired.fq \ - !{meta.id}_R2.paired.fq !{meta.id}_R2.unpaired.fq \ - MINLEN:!{min_length} \ - LEADING:!{leading_quality} \ - TRAILING:!{trailing_quality} \ - SLIDINGWINDOW:!{window_size}:!{req_quality} \ - ILLUMINACLIP:!{adapter_reference_file}:!{illumina_clip_params} - - TRIMMO_DISCARD=$(grep '^Input Read Pairs: ' .command.err \ - | grep ' Dropped: ' | awk '{print $20}') - - msg "INFO: ${TRIMMO_DISCARD} reads are poor quality and were discarded" >&2 - - CNT_BROKEN_R1=$(awk '{lines++} END{print lines/4}' !{meta.id}_R1.unpaired.fq) - CNT_BROKEN_R2=$(awk '{lines++} END{print lines/4}' !{meta.id}_R2.unpaired.fq) - - if [[ -z "${TRIMMO_DISCARD}" || -z "${CNT_BROKEN_R1}" || -z "${CNT_BROKEN_R2}" ]]; then - msg 'ERROR: unable to parse discarded read counts from trimmomatic log' >&2 - exit 1 - fi - - CNT_BROKEN=$((${CNT_BROKEN_R1} + ${CNT_BROKEN_R2})) - - msg "INFO: $CNT_BROKEN_R1 forward reads lacked a high quality R2 sister read" >&2 - msg "INFO: $CNT_BROKEN_R2 reverse reads lacked a high quality R1 sister read" >&2 - msg "INFO: $CNT_BROKEN total broken read pairs were saved as singletons" >&2 - - echo -e "!{meta.id}\t${TRIMMO_DISCARD}\t${CNT_BROKEN}" \ - > "!{meta.id}.Trimmomatic.tsv" - - sed -i '1i Sample name\t# discarded reads\t# singleton reads' !{meta.id}.Trimmomatic.tsv + "!{phred}" \ + -threads "!{task.cpus}" \ + "!{reads[0]}" "!{reads[1]}" \ + "!{meta.id}_R1.paired.fq" "!{meta.id}_R1.unpaired.fq" \ + "!{meta.id}_R2.paired.fq" "!{meta.id}_R2.unpaired.fq" \ + ILLUMINACLIP:"!{adapter_reference_file}":"!{illumina_clip_params}" \ + SLIDINGWINDOW:"!{window_size}":"!{req_quality}" \ + LEADING:"!{leading_quality}" \ + TRAILING:"!{trailing_quality}" \ + MINLEN:"!{min_length}" + + msg "INFO: Completed read trimming on !{meta.id} with Trimmomatic" cat !{meta.id}_R1.unpaired.fq !{meta.id}_R2.unpaired.fq > "!{meta.id}_single.fq" rm -f !{meta.id}_R1.unpaired.fq !{meta.id}_R2.unpaired.fq - echo -e "Sample name\tQC step\tOutcome (Pass/Fail)" > "!{meta.id}.Adapter-removed_FastQ_File.tsv" + # Parse input, discard, and output counts + NUM_INPUT_READS=$(grep '^Input Read Pairs: ' .command.err \ + | awk '{print $4}') + + ### NUM_INPUT_BASES= **missing**; skip slow calc; SeqKit does this on previous process output + + NUM_REMOVED_READS=$(grep '^Input Read Pairs: ' .command.err \ + | grep ' Dropped: ' | awk '{print $20}') + + PERCENT_REMOVED_READS=$(grep '^Input Read Pairs: ' .command.err \ + | grep ' Dropped: ' | awk '{print $21}' | tr -d '()%') 
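# The awk field numbers above parse Trimmomatic's single PE summary line on
# stderr, which has the form (counts invented for illustration):
#   Input Read Pairs: 1000 Both Surviving: 900 (90.00%) Forward Only Surviving: 50 (5.00%) Reverse Only Surviving: 20 (2.00%) Dropped: 30 (3.00%)
# so $4 is the input pair count, $20 the 'Dropped' count (pairs in which neither
# mate survived), and $21 the dropped percentage, with the parentheses and '%'
# stripped by tr.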
+ + NUM_OUTPUT_PAIRED_READS=$(wc -l "!{meta.id}_R1.paired.fq" | awk '{print $1/2}') + + NUM_OUTPUT_SINGLE_READS=$(wc -l "!{meta.id}_single.fq" | awk '{print $1/4}') + + NUM_OUTPUT_READS=$((${NUM_OUTPUT_PAIRED_READS} + ${NUM_OUTPUT_SINGLE_READS})) + + ### NUM_REMOVED_BASES= **missing**; skip slow calc + ### PERCENT_REMOVED_BASES= **missing**; skip slow calc + + PERCENT_OUTPUT_READS=$(echo "${NUM_REMOVED_READS}" "${NUM_INPUT_READS}" \ + | awk '{proportion=$1/$2} END{printf("%.6f", 100-(proportion*100))}') + + ### NUM_OUTPUT_BASES= **missing**; skip slow calc; SeqKit does this in next process input + ### PERCENT_OUTPUT_BASES= **missing**; skip slow calc; SeqKit does this in next process input + + msg "INFO: ${NUM_REMOVED_READS} reads (${PERCENT_REMOVED_READS}% of input) were discarded" + + # Form and create a summary file of input, discarded, and output + SUMMARY_HEADER=( + "Sample_name" + "Input_reads_(#)" + "Removed_reads_(#)" + "Removed_reads_(%)" + "Output_reads_(#)" + "Output_reads_(%)" + ) + # Skipped these slow calcs (fastp provides these but not trimmomatic) + # "Input_basepairs_(#)" + # "Removed_basepairs_(#)" + # "Removed_basepairs_(%)" + # "Output_basepairs_(#)" + # "Output_basepairs_(%)" + + SUMMARY_OUTPUT=( + "!{meta.id}" + "${NUM_INPUT_READS}" + "${NUM_REMOVED_READS}" + "${PERCENT_REMOVED_READS}" + "${NUM_OUTPUT_READS}" + "${PERCENT_OUTPUT_READS}" + ) + # Skipped these slow calcs (fastp provides these but not trimmomatic) + # "${NUM_INPUT_BASES}" + # "${NUM_REMOVED_BASES}" + # "${PERCENT_REMOVED_BASES}" + # "${NUM_OUTPUT_BASES}" + # "${PERCENT_OUTPUT_BASES}" + + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//1') + SUMMARY_OUTPUT=$(printf "%s\t" "${SUMMARY_OUTPUT[@]}" | sed 's/\t$//1') + + echo "${SUMMARY_HEADER}" > "!{meta.id}.Trimmomatic.tsv" + echo "${SUMMARY_OUTPUT}" >> "!{meta.id}.Trimmomatic.tsv" + + # Test/verify paired FastQ outfiles sizes are reasonable to continue + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}.Adapter_and_QC_Trimmed_FastQ_File.tsv" for suff in R1.paired.fq R2.paired.fq; do if verify_minimum_file_size "!{meta.id}_${suff}" 'Adapter-removed FastQ Files' "!{params.min_filesize_fastq_adapters_removed}"; then echo -e "!{meta.id}\tAdapter-removed ($suff) FastQ File\tPASS" \ - >> "!{meta.id}.Adapter-removed_FastQ_File.tsv" + >> "!{meta.id}.Adapter_and_QC_Trimmed_FastQ_File.tsv" else echo -e "!{meta.id}\tAdapter-removed ($suff) FastQ File\tFAIL" \ - >> "!{meta.id}.Adapter-removed_FastQ_File.tsv" + >> "!{meta.id}.Adapter_and_QC_Trimmed_FastQ_File.tsv" fi done + ### Calculate SHA-512 Checksums of each FastQ file ### + msg "INFO: Calculating checksums for !{meta.id}_R1.paired.fq and !{meta.id}_R2.paired.fq !{meta.id}_single.fq ..." 
+ + SUMMARY_HEADER=( + "Sample_name" + "Checksum_(SHA-512)" + "File" + ) + SUMMARY_HEADER=$(printf "%s\t" "${SUMMARY_HEADER[@]}" | sed 's/\t$//') + + echo "${SUMMARY_HEADER}" > "!{meta.id}.Trim_FastQ.SHA512-checksums.tsv" + + # Calculate checksums + for f in "!{meta.id}_R1.paired.fq" "!{meta.id}_R2.paired.fq" "!{meta.id}_single.fq"; do + echo -ne "!{meta.id}\t" >> "!{meta.id}.Trim_FastQ.SHA512-checksums.tsv" + awk 'NR%2==0' "${f}" | paste - - | sort -k1,1 | sha512sum | awk '{print $1 "\t" "'"${f}"'"}' + done >> "!{meta.id}.Trim_FastQ.SHA512-checksums.tsv" + + msg "INFO: Calculated checksums for !{meta.id}_R1.paired.fq and !{meta.id}_R2.paired.fq !{meta.id}_single.fq" + # Get process version information cat <<-END_VERSIONS > versions.yml "!{task.process}": + sha512sum: $(sha512sum --version | grep ^sha512sum | sed 's/sha512sum //1') trimmomatic: $(trimmomatic -version) END_VERSIONS ''' diff --git a/modules/local/update_db_sra_human_scrubber/main.nf b/modules/local/update_db_sra_human_scrubber/main.nf index 3270cc16..37591cb3 100644 --- a/modules/local/update_db_sra_human_scrubber/main.nf +++ b/modules/local/update_db_sra_human_scrubber/main.nf @@ -27,8 +27,8 @@ process UPDATE_DB_SRA_HUMAN_SCRUBBER { # which contains a date, then makes a generalized symlink, e.g., # . # └── data - # ├── human_filter.db -> human_filter.db.20231218v2 - # └── human_filter.db.20231218v2 + # ├── human_filter.db -> human_filter.db.20240718v2 + # └── human_filter.db.20240718v2 mkdir -p data init_db.sh diff --git a/modules/local/validate_fastq_seqfu/main.nf b/modules/local/validate_fastq_seqfu/main.nf new file mode 100644 index 00000000..ae07c9ad --- /dev/null +++ b/modules/local/validate_fastq_seqfu/main.nf @@ -0,0 +1,59 @@ +process VALIDATE_FASTQ_SEQFU { + + tag { "${meta.id}" } + container "staphb/seqfu@sha256:20831d2727d0f613f753eb301e19b345f5c9ea82c23762cb78a0c273539a3647" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("${meta.id}.Raw_Initial_FastQ_Format_Validation_File.tsv"), emit: qc_filecheck + tuple val(meta), path(reads) , emit: input + path(".command.{out,err}") + path("versions.yml") , emit: versions + + shell: + ''' + source bash_functions.sh + + msg "INFO: Validating !{meta.id} FastQ input with SeqFu..." + + echo -e "Sample_name\tQC_step\tOutcome_(Pass/Fail)" > "!{meta.id}.Raw_Initial_FastQ_Format_Validation_File.tsv" + + msg "INFO: Checking for FastQ valid format in R1: !{reads[0]} and R2: !{reads[1]}" + + # https://telatin.github.io/seqfu2/tools/check.html#integrity-check + # A single FASTQ file is considered valid if: + # 1 - each record has the same sequence and quality length + # 2 - only A,C,G,T,N characters are present in the sequence + # + # A paired-end set of FASTQ files is considered valid if: + # - each file is individually valid + # 3 - the two files have the same number of sequences + # 4 - the first and last sequence of both files has the same name (the last three characters are ignored if the remaining - sequence name is greater than 4 characters) + # 5 - the first and last sequence of the two files are not identical (R1 != R2) + # Deep check + # If you are parsing NGS files, i.e. FASTQ files, with four lines per record and you expect them to be accepted by any program, use --deep. + seqfu check \ + --deep \ + --verbose \ + !{reads[0]} !{reads[1]} + + # Retain the exit code status by exiting the exit value after error message + retVal=$? 
+ if [ $retVal -ne 0 ]; then + msg "ERROR: FastQ format validation tests with SeqFu failed for: !{meta.id} with exit status code: ${retVal}" >&2 + echo -e "!{meta.id}\tRaw Initial FastQ (R1 and R2) Valid Format\tFAIL" >> "!{meta.id}.Raw_Initial_FastQ_Format_Validation_File.tsv" + exit $retVal + fi + + msg "INFO: SeqFu check on !{reads[0]} !{reads[1]} completed without errors, suggesting the pair is a valid read set." + echo -e "!{meta.id}\tRaw Initial FastQ (R1 and R2) Valid Format\tPASS" >> "!{meta.id}.Raw_Initial_FastQ_Format_Validation_File.tsv" + + # Get process version information + cat <<-END_VERSIONS > versions.yml + "!{task.process}": + seqfu: $(seqfu --version) + END_VERSIONS + ''' +} diff --git a/modules/nf-core/gtdbtk/classifywf/environment.yml b/modules/nf-core/gtdbtk/classifywf/environment.yml index 8801269e..500531ea 100644 --- a/modules/nf-core/gtdbtk/classifywf/environment.yml +++ b/modules/nf-core/gtdbtk/classifywf/environment.yml @@ -1,7 +1,5 @@ -name: gtdbtk_classifywf channels: - conda-forge - bioconda - - defaults dependencies: - - bioconda::gtdbtk=2.3.2 + - bioconda::gtdbtk=2.4.0 diff --git a/modules/nf-core/gtdbtk/classifywf/main.nf b/modules/nf-core/gtdbtk/classifywf/main.nf index e6df7517..db2cddf2 100644 --- a/modules/nf-core/gtdbtk/classifywf/main.nf +++ b/modules/nf-core/gtdbtk/classifywf/main.nf @@ -1,68 +1,71 @@ process GTDBTK_CLASSIFYWF { tag "${prefix}" label 'process_medium' - - // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/gtdbtk:2.3.2--pyhdfd78af_0' : - 'biocontainers/gtdbtk:2.3.2--pyhdfd78af_0' }" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/gtdbtk:2.4.0--pyhdfd78af_1' : 'biocontainers/gtdbtk:2.4.0--pyhdfd78af_1'}" input: - tuple val(meta), path("bins/*") + tuple val(meta) , path("bins/*") tuple val(db_name), path("database/*") - path(mash_db) + val use_pplacer_scratch_dir + path mash_db output: - tuple val(meta), path("gtdbtk.${prefix}.*.summary.tsv") , emit: summary - tuple val(meta), path("gtdbtk.${prefix}.*.classify.tree.gz") , emit: tree, optional: true - tuple val(meta), path("gtdbtk.${prefix}.*.markers_summary.tsv") , emit: markers, optional: true - tuple val(meta), path("gtdbtk.${prefix}.*.msa.fasta.gz") , emit: msa, optional: true - tuple val(meta), path("gtdbtk.${prefix}.*.user_msa.fasta.gz") , emit: user_msa, optional: true - tuple val(meta), path("gtdbtk.${prefix}.*.filtered.tsv") , emit: filtered, optional: true - tuple val(meta), path("gtdbtk.${prefix}.failed_genomes.tsv") , emit: failed, optional: true - tuple val(meta), path("gtdbtk.${prefix}.log") , emit: log - tuple val(meta), path("gtdbtk.${prefix}.warnings.log") , emit: warnings - path("versions.yml") , emit: versions - path(".command.{out,err}") + tuple val(meta), path("gtdbtk.${prefix}.*.summary.tsv") , emit: summary + tuple val(meta), path("gtdbtk.${prefix}.*.classify.tree.gz") , emit: tree , optional: true + tuple val(meta), path("gtdbtk.${prefix}.*.markers_summary.tsv"), emit: markers , optional: true + tuple val(meta), path("gtdbtk.${prefix}.*.msa.fasta.gz") , emit: msa , optional: true + tuple val(meta), path("gtdbtk.${prefix}.*.user_msa.fasta.gz") , emit: user_msa, optional: true + tuple val(meta), path("gtdbtk.${prefix}.*.filtered.tsv") , emit: filtered, optional: true + tuple val(meta), path("gtdbtk.${prefix}.failed_genomes.tsv") , emit: failed , optional: true + tuple val(meta), path("gtdbtk.${prefix}.log") , emit: log + tuple val(meta), path("gtdbtk.${prefix}.warnings.log") , emit: warnings + path ("versions.yml"), emit: versions when: task.ext.when == null || task.ext.when script: def args = task.ext.args ?: '' - def pplacer_scratch = params.gtdbtk_pplacer_scratch ? "--scratch_dir pplacer_tmp" : "" - def mash_mode = mash_db ? "--mash_db ${mash_db}" : "--skip_ani_screen" + def pplacer_scratch = use_pplacer_scratch_dir ? "--scratch_dir pplacer_tmp" : "" + def mash_mode = mash_db ? "--mash_db ${mash_db}" : "--skip_ani_screen" prefix = task.ext.prefix ?: "${meta.id}" """ export GTDBTK_DATA_PATH="\${PWD}/database" - if [ ${pplacer_scratch} != "" ] ; then + if [[ ! -z "${pplacer_scratch}" ]] ; then + echo "MAKING ./pplacer_tmp dir ..." mkdir pplacer_tmp fi gtdbtk classify_wf \\ - $args \\ + ${args} \\ --genome_dir bins \\ --prefix "gtdbtk.${prefix}" \\ --out_dir "\${PWD}" \\ - --cpus $task.cpus \\ - $mash_mode \\ - $pplacer_scratch \\ - --min_perc_aa $params.gtdbtk_min_perc_aa \\ - --min_af $params.gtdbtk_min_af + --cpus ${task.cpus} \\ + ${mash_mode} \\ + ${pplacer_scratch} - mv classify/* . + ## If mash db given, classify/ and identify/ directories won't be created + if [[ -d classify/ && \$(ls -A classify/) ]]; then + mv classify/* . + fi - mv identify/* . + if [[ -d identify/ && \$(ls -A identify/) ]]; then + mv identify/* . + fi - mv align/* .\ + ## If nothing aligns, no output, so only run + if [[ -d align/ && \$(ls -A align/) ]]; then + mv align/* . 
+ fi mv gtdbtk.log "gtdbtk.${prefix}.log" mv gtdbtk.warnings.log "gtdbtk.${prefix}.warnings.log" - find -name gtdbtk.${prefix}.*.classify.tree | xargs -r gzip # do not fail if .tree is missing + find -name "gtdbtk.${prefix}.*.classify.tree" | xargs -r gzip # do not fail if .tree is missing cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -71,14 +74,13 @@ process GTDBTK_CLASSIFYWF { """ stub: - def VERSION = '2.3.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. prefix = task.ext.prefix ?: "${meta.id}" """ touch gtdbtk.${prefix}.stub.summary.tsv - touch gtdbtk.${prefix}.stub.classify.tree.gz + echo "" | gzip > gtdbtk.${prefix}.stub.classify.tree.gz touch gtdbtk.${prefix}.stub.markers_summary.tsv - touch gtdbtk.${prefix}.stub.msa.fasta.gz - touch gtdbtk.${prefix}.stub.user_msa.fasta.gz + echo "" | gzip > gtdbtk.${prefix}.stub.msa.fasta.gz + echo "" | gzip > gtdbtk.${prefix}.stub.user_msa.fasta.gz touch gtdbtk.${prefix}.stub.filtered.tsv touch gtdbtk.${prefix}.log touch gtdbtk.${prefix}.warnings.log @@ -86,7 +88,7 @@ process GTDBTK_CLASSIFYWF { cat <<-END_VERSIONS > versions.yml "${task.process}": - gtdbtk: \$(echo "$VERSION") + gtdbtk: \$(echo \$(gtdbtk --version -v 2>&1) | sed "s/gtdbtk: version //; s/ Copyright.*//") END_VERSIONS """ } diff --git a/modules/nf-core/gtdbtk/classifywf/meta.yml b/modules/nf-core/gtdbtk/classifywf/meta.yml index d85f9966..fc081897 100644 --- a/modules/nf-core/gtdbtk/classifywf/meta.yml +++ b/modules/nf-core/gtdbtk/classifywf/meta.yml @@ -1,5 +1,6 @@ name: gtdbtk_classifywf -description: GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy GTDB. +description: GTDB-Tk is a software toolkit for assigning objective taxonomic classifications + to bacterial and archaeal genomes based on the Genome Database Taxonomy GTDB. keywords: - GTDB taxonomy - taxonomic classification @@ -10,76 +11,135 @@ keywords: - archaea tools: - gtdbtk: - description: GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy GTDB. + description: GTDB-Tk is a software toolkit for assigning objective taxonomic classifications + to bacterial and archaeal genomes based on the Genome Database Taxonomy GTDB. homepage: https://ecogenomics.github.io/GTDBTk/ documentation: https://ecogenomics.github.io/GTDBTk/ tool_dev_url: https://github.com/Ecogenomics/GTDBTk doi: "10.1093/bioinformatics/btz848" licence: ["GNU General Public v3 (GPL v3)"] + identifier: "" input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false, assembler:'spades' ] - - bins: - type: file - description: The binned fasta files from the assembler - pattern: "*.{fasta,fa}" - - database: - type: file - description: The local copy of the taxonomic database used by GTDB-tk (unzipped copy) - pattern: "*" - - mash_db: - type: file - description: The local copy of the Mash sketch database used by GTDB-tk if `ani_screen` mode is used (optional) - pattern: "*.msh" + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false, assembler:'spades' ] + - '"bins/*"': + type: file + description: bins + - - db_name: + type: string + description: The name of the GTDB database to use. 
+ - '"database/*"': + type: file + description: GTDB database + - - use_pplacer_scratch_dir: + type: boolean + description: Set to true to reduce pplacer memory usage by writing to disk (slower) + - - mash_db: + type: file + description: The local copy of the Mash sketch database used by GTDB-tk if `ani_screen` + mode is used (optional) + pattern: "*.msh" output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - summary: - type: file - description: A TSV summary file for the classification - pattern: "*.{summary.tsv}" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - gtdbtk.${prefix}.*.summary.tsv: + type: file + description: A TSV summary file for the classification + pattern: "*.{summary.tsv}" - tree: - type: file - description: NJ or UPGMA tree in Newick format produced from a multiple sequence alignment - pattern: "*.{classify.tree.gz}" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - gtdbtk.${prefix}.*.classify.tree.gz: + type: file + description: NJ or UPGMA tree in Newick format produced from a multiple sequence + alignment + pattern: "*.{classify.tree.gz}" - markers: - type: file - description: A TSV summary file lineage markers used for the classification. - pattern: "*.{markers_summary.tsv}" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - gtdbtk.${prefix}.*.markers_summary.tsv: + type: file + description: A TSV summary file lineage markers used for the classification. + pattern: "*.{markers_summary.tsv}" - msa: - type: file - description: Multiple sequence alignments file. - pattern: "*.{msa.fasta.gz}" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - gtdbtk.${prefix}.*.msa.fasta.gz: + type: file + description: Multiple sequence alignments file. + pattern: "*.{msa.fasta.gz}" - user_msa: - type: file - description: Multiple sequence alignments file for the user-provided files. - pattern: "*.{user_msa.fasta.gz}" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - gtdbtk.${prefix}.*.user_msa.fasta.gz: + type: file + description: Multiple sequence alignments file for the user-provided files. + pattern: "*.{user_msa.fasta.gz}" - filtered: - type: file - description: A list of genomes with an insufficient number of amino acids in MSA.. - pattern: "*.{filtered.tsv}" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - gtdbtk.${prefix}.*.filtered.tsv: + type: file + description: A list of genomes with an insufficient number of amino acids in + MSA.. + pattern: "*.{filtered.tsv}" + - failed: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - gtdbtk.${prefix}.failed_genomes.tsv: + type: file + description: A TSV summary of the genomes which GTDB-tk failed to classify. + pattern: "*.{failed_genomes.tsv}" - log: - type: file - description: GTDB-tk log file - pattern: "*.{log}" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - gtdbtk.${prefix}.log: + type: file + description: GTDB-tk log file + pattern: "*.{log}" - warnings: - type: file - description: GTDB-tk warnings log file - pattern: "*.{warnings.log}" - - failed: - type: file - description: A TSV summary of the genomes which GTDB-tk failed to classify. - pattern: "*.{failed_genomes.tsv}" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - gtdbtk.${prefix}.warnings.log: + type: file + description: GTDB-tk warnings log file + pattern: "*.{warnings.log}" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@skrakau" - "@abhi18av" diff --git a/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test b/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test new file mode 100644 index 00000000..deca962d --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test @@ -0,0 +1,42 @@ +nextflow_process { + + name "Test Process GTDBTK_CLASSIFYWF" + script "../main.nf" + process "GTDBTK_CLASSIFYWF" + + tag "modules" + tag "modules_nfcore" + tag "gtdbtk" + tag "gtdbtk/classifywf" + + // Only stub test is possible due to very large required database (>70GB) + test("sarscov2 - genome fasta - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false, assembler:'SPADES' ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/scaffolds.fasta', checkIfExists: true), + ] + ] + input[1] = [[], []] + input[2] = false + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test.snap b/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test.snap new file mode 100644 index 00000000..eb0ee89a --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test.snap @@ -0,0 +1,199 @@ +{ + "sarscov2 - genome fasta - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.classify.tree.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "4": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.user_msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "5": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.failed_genomes.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "7": [ + [ + { + "id": "test", + "single_end": false, + 
"assembler": "SPADES" + }, + "gtdbtk.test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "8": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.warnings.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "9": [ + "versions.yml:md5,2c94de2b8633b99e11881ab0193835d7" + ], + "failed": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.failed_genomes.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "filtered": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "markers": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "msa": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "summary": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tree": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.classify.tree.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "user_msa": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.user_msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,2c94de2b8633b99e11881ab0193835d7" + ], + "warnings": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.warnings.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-16T11:46:32.337929018" + } +} \ No newline at end of file diff --git a/modules/nf-core/gtdbtk/classifywf/tests/tags.yml b/modules/nf-core/gtdbtk/classifywf/tests/tags.yml new file mode 100644 index 00000000..5d8badac --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/tests/tags.yml @@ -0,0 +1,2 @@ +gtdbtk/classifywf: + - "modules/nf-core/gtdbtk/classifywf/**" diff --git a/nextflow.config b/nextflow.config index 1d7df4bb..9cf7b263 100644 --- a/nextflow.config +++ b/nextflow.config @@ -142,10 +142,10 @@ manifest { name = 'bacterial-genomics/wf-paired-end-illumina-assembly' author = "Christopher A. Gulvik" homePage = 'https://github.com/bacterial-genomics/wf-paired-end-illumina-assembly' - description = "Trim, assemble, and annotate paired end illumina reads." + description = "Clean, assemble, and annotate paired end illumina reads." mainScript = 'main.nf' nextflowVersion = '!>=22.04.3' - version = '2.1.1' + version = '3.0.0' } /* @@ -155,7 +155,7 @@ manifest { */ // Function to get current timestamp -def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') +def trace_timestamp = new java.util.Date().format('yyyy-MMM-dd_EEE_HH-mm-ss') timeline { enabled = true @@ -224,10 +224,10 @@ def check_max(obj, type) { } } else if (type == 'cpus') { try { - return Math.min( obj, params.max_cpus as int ) + return Math.min(obj, params.max_cpus as int) } catch (all) { println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! 
Using default value: $obj" return obj } } -} \ No newline at end of file +} diff --git a/nextflow_schema.json b/nextflow_schema.json index cde503c6..af85c69e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -27,7 +27,8 @@ "type": "string", "default": "25M", "hidden": true, - "description": "Minimum file size of input FastQ files." + "description": "Minimum file size of input FastQ files.", + "pattern": "^\\d+(\\.\\d+)?s?[k|M|G]?$" }, "logpath": { "type": "string", @@ -87,7 +88,19 @@ "title": "Downsampling options", "type": "object", "description": "Downsampling options", + "subsample_tool": { + "type": "string", + "default": "seqkit", + "hidden": false, + "description": "Specify which read subsampling tool to use during downsampling.", + "enum": ["seqtk", "seqkit"] + }, "properties": { + "subsample_tool": { + "type": "string", + "default": "seqkit", + "hidden": true + }, "depth": { "type": "integer", "default": 100, @@ -98,6 +111,18 @@ "type": "string", "hidden": true, "description": "Genome size to subsample FastQ files to." + }, + "seqkit_seed": { + "type": "integer", + "default": 947266746, + "hidden": true, + "description": "Seed value to use in downsampling reads sets with SeqKit" + }, + "seqtk_seed": { + "type": "integer", + "default": 947266746, + "hidden": true, + "description": "Seed value to use in downsampling reads sets with Seqtk" } } }, @@ -132,7 +157,7 @@ }, "min_filesize_binary_pe_alignment": { "type": "string", - "default": "25M", + "default": "6M", "hidden": true, "description": "Minimum file size of binary cleaned paired-end alignment file after paired-end reads are mapped onto the filtered assembly." }, @@ -518,11 +543,6 @@ "hidden": true, "description": "Minimum coverage of each contig, where an integer is an absolute coverage and a float is a minimum percentage, when filtering contigs." }, - "filter_contigs_discard_file": { - "type": "string", - "hidden": true, - "description": "Output FastA file of discarded sequences which includes failed filters in the deflines when filtering contigs." - }, "filter_contigs_gcskew": { "type": "boolean", "hidden": true, @@ -532,7 +552,8 @@ "type": "integer", "default": 1000, "hidden": true, - "description": "Minimum contig length (in bp) when filtering contigs." + "description": "Minimum contig length (in bp) when filtering contigs.", + "minimum": 1 }, "filter_contigs_keep_low_complexity": { "type": "boolean", @@ -584,24 +605,12 @@ "hidden": true, "description": "Reduce pplacer memory usage by writing to disk (slower)." }, - "gtdbtk_min_perc_aa": { - "type": "integer", - "default": 10, - "hidden": true, - "description": "Exclude genomes that do not have at least this percentage of AA in the MSA (inclusive bound)." - }, "gtdbtk_pplacer_cpus": { "type": "integer", "default": 1, "hidden": true, "description": "Number of CPUs to use during pplacer placement." }, - "gtdbtk_min_af": { - "type": "string", - "default": "0.65", - "hidden": true, - "description": "Minimum alignment fraction to assign genome to a species cluster." - }, "busco_db": { "type": "string", "description": "Database for intra-contig gene analysis with BUSCO.", @@ -620,13 +629,13 @@ "split_multifasta_remove_gaps": { "type": "boolean", "hidden": true, - "description": "Remove gaps from contigs when splitting contigs for BUSCO analysis." + "description": "Remove gaps '-' within each contig when splitting the assembly into individual files for BUSCO analysis." 
}, "split_multifasta_extension": { "type": "string", "default": ".fasta", "hidden": true, - "description": "File extension of contig file whens plitting contig file." + "description": "Output file extension of each individual contig file when splitting assembly file for BUSCO analysis." } } }, @@ -690,23 +699,27 @@ }, "mlst_ignore_scheme": { "type": "string", - "default": "ecoli,abaumannii,vcholerae_2", - "description": "Comma separated list of mlst schemes to ignore." + "default": "abaumannii_2,blicheniformis_14,bordetella_3,brachyspira_2,brachyspira_3,brachyspira_4,brachyspira_5,campylobacter_nonjejuni_2,campylobacter_nonjejuni_3,campylobacter_nonjejuni_4,campylobacter_nonjejuni_5,campylobacter_nonjejuni_6,campylobacter_nonjejuni_7,campylobacter_nonjejuni_8,campylobacter_nonjejuni_9,diphtheria_3,ecoli_achtman_4,leptospira_2,leptospira_3,listeria_2,llactis_phage,mbovis_2,mcatarrhalis_achtman_6,mgallisepticum_2,mhominis_3,mycobacteria_2,pacnes_3,pmultocida_2,senterica_achtman_2,vcholerae_2,ypseudotuberculosis_achtman_3", + "description": "Comma separated list of mlst schemes to ignore.", + "hidden": true }, "mlst_min_score": { "type": "string", "default": "50", - "description": "Minumum score out of 100 to match a scheme when using autodection." + "description": "Minumum score out of 100 to match a scheme when using autodection.", + "hidden": true }, "mlst_min_identity": { "type": "string", "default": "95", - "description": "Percent identity of an allele to consider them 'similar'." + "description": "Percent identity of an allele to consider them 'similar'.", + "hidden": true }, "mlst_min_coverage": { "type": "string", "default": "10", - "description": "Percent coverage to report a partial allele." + "description": "Percent coverage to report a partial allele.", + "hidden": true } } }, @@ -974,10 +987,10 @@ "$ref": "#/definitions/reference_file_options" }, { - "$ref": "#/definitions/read_trimming_options" + "$ref": "#/definitions/taxonomic_profiling_options" }, { - "$ref": "#/definitions/taxonomic_profiling_options" + "$ref": "#/definitions/read_trimming_options" }, { "$ref": "#/definitions/ribosomal_rna_analysis_options" @@ -1021,5 +1034,12 @@ { "$ref": "#/definitions/generic_options" } - ] + ], + "properties": { + "schema_ignore_params": { + "type": "string", + "default": 
"filter_blast_bitscore,filter_blast_column,min_filesize_filtered_blastn,min_filesize_blastn_output,min_filesize_blastn_db,min_filesize_extracted_ssu_file,min_filesize_renamed_ssu_file,genbank_search_type,genbank_query_qualifier,genbank_query_feature,genbank_query,min_filesize_annotated_genbank,min_filesize_binary_se_alignment,min_filesize_final_assembly,min_filesize_polished_assembly,min_filesize_binary_pe_alignment,min_filesize_filtered_assembly,filter_contigs_no_sort,filter_contigs_deflines,filter_contigs_keep_low_complexity,filter_contigs_length,filter_contigs_gcskew,filter_contigs_coverage,min_filesize_raw_assembly,min_filesize_non_overlapping_fastq,min_filesize_fastq_adapters_removed,min_filesize_adapters,min_filesize_fastq_phix_removed,min_filesize_phix_genome,min_filesize_fastq_input,workflows,available_workflows,max_retry,bigdata,logpath,qc_filecheck_log_dir,process_log_dir,kraken1_db,kraken2_db,blast_db,polish_corrections,skesa_allow_snps,skesa_min_contig_length,skesa_max_snp_length,skesa_fraction,skesa_steps,skesa_vector_percent,skesa_kmer_length,excel_sheet_name,merge_lanes,sge_high_memory,sge_options,sge_queue_size,sge_queue,sge_penv,singularity_cache,sge_process_time,gtdbtk_pplacer_scratch,gtdbtk_pplacer_cpus,depth,genome_size,busco_config,adapter_reference,phix_reference,spades_mode,spades_kmer_sizes,validationSchemaIgnoreParams,validationShowHiddenParams,validation-schema-ignore-params,validation-show-hidden-params,mash_db,min_filesize_sra_human_scrubber_db_file,trimmomatic_keep_both_reads,trimmomatic_palindrome_clip_threshold,trimmomatic_simple_clip_threshold,trimmomatic_required_quality,trimmomatic_trailing_quality,trimmomatic_leading_quality,trimmomatic_min_length,trimmomatic_min_adapter_length,trimmomatic_seed_mismatches,trimmomatic_window_size,trimmomatic_phred,create_excel_outputs,rdp_phylomarker,rdp_output_format,min_filesize_rdp_output,ASSEMBLY:READ_CLASSIFY_KRAKEN_ONE,ASSEMBLY:ASSEMBLE_CONTIGS:ASSEMBLE_CONTIGS_SPADES,ASSEMBLY:READ_CLASSIFY_KRAKEN_ONE,ASSEMBLY:ASSEMBLE_CONTIGS:ASSEMBLE_CONTIGS_SKESA,min_filesize_checkm2_report,cat_db,min_filesize_cat_output,download_cat_db,trim_reads_tool,subsample_tool,seqtk_seed,seqkit_seed", + "hidden": true + } + } } diff --git a/run_assembly.uge-nextflow b/run_assembly.uge-nextflow index c086fcc7..d95fc5f1 100755 --- a/run_assembly.uge-nextflow +++ b/run_assembly.uge-nextflow @@ -49,7 +49,7 @@ prompt_new_outdir() { read -p "Would you like to use a different output path? (yes|no) " -n 1 -r echo -e "${COLOR_OFF}" - if [[ $REPLY =~ ^[yY] ]]; then + if [[ "${REPLY}" =~ ^[yY] ]]; then echo -e "${RED_BG}" read -p "Enter new Output path: " _new_outdir echo -e "${COLOR_OFF}" @@ -76,15 +76,10 @@ prompt_if_previous_nextflow_run() { read -p "Do you want to continue? (yes|no) " -n 1 -r echo -e "${COLOR_OFF}" - if [[ $REPLY =~ ^[yY] ]]; then + if [[ "${REPLY}" =~ ^[yY] ]]; then # Continue submission if user wants to possibly overwrite data echo -e "${YELLOW_TXT}\nData in ${1} will be overwritten." 
echo -e "Continuing to submit samples...${COLOR_OFF}" - - # Rename .work directory - # if [[ -d "${OUT}/.work" ]]; then - # mv "${OUT}/.work" "${OUT}/.work_$(basename $(readlink -f ${OUT}/.work))" - # fi else # If user doesn't want to continue, ask for a different output path prompt_new_outdir @@ -94,7 +89,7 @@ prompt_if_previous_nextflow_run() { } # Check argument requirements -[[ $1 == "--help" || $1 == "-h" ]] && { usage; exit 0; } +[[ "${1}" == "--help" || "${1}" == "-h" ]] && { usage; exit 0; } if [[ $# -lt 1 || $# -gt 2 ]]; then echo -e "${RED_TXT}ERROR:${COLOR_OFF} improper number ("$#") of arguments provided" >&2 usage @@ -102,36 +97,36 @@ if [[ $# -lt 1 || $# -gt 2 ]]; then fi # Confirm we are on a host that can submit jobs -if [ ${HOSTNAME%%.*} == 'biolinux' ] || \ - [ ${HOSTNAME%%.*} == 'login02' ] || \ - [ ${HOSTNAME%%.*} == 'rosalind01' ] || \ - [ $(echo ${HOSTNAME%%.*} | sed 's/[0-9]//g') == 'node' ]; then +if [ "${HOSTNAME%%.*}" == 'biolinux' ] || \ + [ "${HOSTNAME%%.*}" == 'login02' ] || \ + [ "${HOSTNAME%%.*}" == 'rosalind01' ] || \ + [ "$(echo ${HOSTNAME%%.*} | sed 's/[0-9]//g')" == 'node' ]; then : else - echo -e "${RED_TXT}ERROR:${COLOR_OFF} must be on aspen or biolinux" >&2 + echo -e "${RED_TXT}ERROR:${COLOR_OFF} must be on rosalind, aspen, or biolinux" >&2 exit 1 fi # I/O handling -if [[ $1 =~ .+[[:space:]].+ ]]; then +if [[ "${1}" =~ .+[[:space:]].+ ]]; then echo -e "${RED_TXT}ERROR:${COLOR_OFF} whitespace in $1 path unsupported" >&2 exit 1 fi -if [[ $2 =~ .+[[:space:]].+ ]]; then +if [[ "${2}" =~ .+[[:space:]].+ ]]; then echo -e "${RED_TXT}ERROR:${COLOR_OFF} whitespace in $2 path unsupported" >&2 exit 1 fi -IN=$(readlink -f "$1") -if [[ -z "$2" ]]; then +IN="$(readlink -f ${1})" +if [[ -z "${2}" ]]; then OUT="${PWD}" else - OUT=$(readlink -f "$2") + OUT="$(readlink -f ${2})" fi # Get LAB_HOME or custom tmp/cache variables from user's ~/.bashrc, # while still enabling fancy people to override these from their # current working environment too. -source ${HOME}/.bashrc +source "${HOME}/.bashrc" if [[ -z "${LAB_HOME}" ]]; then echo -e "${RED_TXT}ERROR:${COLOR_OFF} ${LAB_HOME} not set" >&2 exit 1 @@ -140,20 +135,20 @@ fi # Check if Singularity environment variables are set, # and confirm the user has write access. # We expect these 2 paths to already exist. -if [[ -z ${SINGULARITY_CACHEDIR} ]]; then +if [[ -z "${SINGULARITY_CACHEDIR}" ]]; then echo -e "${RED_TXT}ERROR:${COLOR_OFF} \$SINGULARITY_CACHEDIR not set" >&2 exit 1 else - if [[ ! -d ${SINGULARITY_CACHEDIR} ]] || [[ ! -w ${SINGULARITY_CACHEDIR} ]]; then + if [[ ! -d "${SINGULARITY_CACHEDIR}" ]] || [[ ! -w "${SINGULARITY_CACHEDIR}" ]]; then echo -e "${RED_TXT}ERROR:${COLOR_OFF} \$SINGULARITY_CACHEDIR ${SINGULARITY_CACHEDIR} not writeable for ${USER}" >&2 exit 1 fi fi -if [[ -z ${SINGULARITY_TMPDIR} ]]; then +if [[ -z "${SINGULARITY_TMPDIR}" ]]; then echo -e "${RED_TXT}ERROR:${COLOR_OFF} \$SINGULARITY_TMPDIR not set" >&2 exit 1 else - if [[ ! -d ${SINGULARITY_TMPDIR} ]] || [[ ! -w ${SINGULARITY_TMPDIR} ]]; then + if [[ ! -d "${SINGULARITY_TMPDIR}" ]] || [[ ! -w "${SINGULARITY_TMPDIR}" ]]; then echo -e "${RED_TXT}ERROR:${COLOR_OFF} \$SINGULARITY_TMPDIR ${SINGULARITY_TMPDIR} not writeable for ${USER}" >&2 exit 1 fi @@ -162,7 +157,7 @@ fi # If Windows-style mount point (e.g., "Z:\Streptobacillus\Raw_FQs") given as a # path, check both user's home, lab home, and group as possible mount areas # to determine if files exist in 1 but not the others. 
-if [[ ${IN} =~ ^.+[A-Z]\:.+ ]]; then +if [[ "${IN}" =~ ^.+[A-Z]\:.+ ]]; then path_slashes_replaced="$(echo "${IN}" | cut -d ':' -f 2 | sed 's/\\/\//g')" user_home="${HOME}"/"${path_slashes_replaced}" user_group_main="/scicomp${path_slashes_replaced}" # Dropped slash between scicomp and var @@ -183,12 +178,12 @@ if [[ ${IN} =~ ^.+[A-Z]\:.+ ]]; then -regex ".+_(R)?(1|2)(.+)?\.(fq|fastq)($|\.gz$)" \ 2> /dev/null \ | wc -l) - if [[ ${cnt_read_files} -ge 1 ]]; then + if [[ "${cnt_read_files}" -ge 1 ]]; then echo -e "${YELLOW_TXT}\n${cnt_read_files} FastQ files found in: ${path}${COLOR_OFF}" echo -e "${RED_BG}" read -p "Use ${path} as Input Directory? (yes|no) " -n 1 -r echo -e "${COLOR_OFF}" - if [[ $REPLY =~ ^[yY] ]]; then + if [[ "${REPLY}" =~ ^[yY] ]]; then IN="${path}" else exit 0 @@ -199,15 +194,15 @@ if [[ ${IN} =~ ^.+[A-Z]\:.+ ]]; then fi # For now, just output into shared LAB_HOME area -if [[ ${OUT} =~ ^.+[A-Z]\:.+ ]]; then +if [[ "${OUT}" =~ ^.+[A-Z]\:.+ ]]; then echo -e "${YELLOW_TXT}\nWindows hard mount path detected as Output Directory ${COLOR_OFF}" - proposed_outdir="${LAB_HOME}"/"$(date '+%Y-%b-%d_%a_%H:%M:%S')" + proposed_outdir="${LAB_HOME}"/"$(date '+%Y-%b-%d_%a_%H-%M-%S')" echo -e "${RED_BG}" read -p "Use ${proposed_outdir} as Output Directory? (yes|no) " -n 1 -r echo -e "${COLOR_OFF}" - if [[ $REPLY =~ ^[yY] ]]; then + if [[ "${REPLY}" =~ ^[yY] ]]; then OUT="${path}" else echo -e "${RED_TXT}\nOkay, bad autoselection? Re-run with a different specified path ${COLOR_OFF}\n" @@ -218,7 +213,7 @@ fi # If Windows-style path from Core Facility given via email # (e.g., "\\...cdc.gov\groups\OID\...BCFB\by-instrument\NovaSeq\") # is given as path, form the Linux path -if [[ ${IN} =~ ^.+\.cdc\.gov\\.+\\.+ ]]; then +if [[ "${IN}" =~ ^.+\.cdc\.gov\\.+\\.+ ]]; then inpath_stripped_hostname="${IN/*.cdc\.gov/\scicomp}" inpath_dewindowsed="$(echo "${inpath_stripped_hostname}" | sed 's/\\/\//g')" IN="${inpath_dewindowsed}" @@ -242,7 +237,7 @@ if [ -d "${OUT}/.log" ] && \ fi # Add found files to an array -found_files=( $(find -L ${IN} \ +found_files=( $(find -L "${IN}" \ -maxdepth 2 \ -type f \ -readable \ @@ -280,7 +275,7 @@ done # Prompt user if >10 samples to prevent accidentally assembling too many if [[ ${#submitted[@]} -ge 10 ]]; then echo -e "${YELLOW_TXT}\nWARNING: ${#submitted[@]} samples have been identified:\n${COLOR_OFF}" - for f in ${submitted[@]}; do + for f in "${submitted[@]}"; do echo -e "${YELLOW_TXT}${f}${COLOR_OFF}" done | xargs -L3 | column -t @@ -288,7 +283,7 @@ if [[ ${#submitted[@]} -ge 10 ]]; then echo -e "${RED_BG}" read -p "Do you want to continue submitting ${#submitted[@]} samples? 
(yes|no) " -n 1 -r echo -e "${COLOR_OFF}" - if [[ $REPLY =~ ^[yY] ]]; then + if [[ "${REPLY}" =~ ^[yY] ]]; then echo -e "\nContinuing to submit ${#submitted[@]} samples...\n" else echo -e "${RED_TXT}\nSubmission cancelled ${COLOR_OFF}" @@ -297,10 +292,10 @@ if [[ ${#submitted[@]} -ge 10 ]]; then fi # Set up log directory in OUT directory -mkdir -p ${OUT}/pipeline_info +mkdir -p "${OUT}/pipeline_info" # Get node number - <=230 = biolinux, >=231 = rosalind -NODE_NUM=$(echo ${HOSTNAME%%.*} | sed 's/node//1') +NODE_NUM="$(echo ${HOSTNAME%%.*} | sed 's/node//1')" # If FastQ files are able to be submitted, submit and display basenames if [[ ${#submitted[@]} -ge 1 ]] && \ @@ -310,17 +305,17 @@ if [[ ${#submitted[@]} -ge 1 ]] && \ qsub \ -m ba \ -q all.q \ - -v IN=${IN} \ - -v OUT=${OUT} \ - -o ${OUT}/pipeline_info \ - -e ${OUT}/pipeline_info \ - -M ${USER}@cdc.gov \ - -v LAB_HOME=${LAB_HOME} \ - -N ASM_${#submitted[@]} \ - -v SINGULARITY_TMPDIR=${SINGULARITY_TMPDIR} \ - -v SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR} \ - -v NXF_SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR} \ - ${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow + -v IN="${IN}" \ + -v OUT="${OUT}" \ + -o "${OUT}/pipeline_info" \ + -e "${OUT}/pipeline_info" \ + -M "${USER}@cdc.gov" \ + -v LAB_HOME="${LAB_HOME}" \ + -N "ASM_${#submitted[@]}" \ + -v SINGULARITY_TMPDIR="${SINGULARITY_TMPDIR}" \ + -v SINGULARITY_CACHEDIR="${SINGULARITY_CACHEDIR}" \ + -v NXF_SINGULARITY_CACHEDIR="${SINGULARITY_CACHEDIR}" \ + "${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow" elif [[ ${#submitted[@]} -ge 1 ]] && \ [[ ${HOSTNAME%%.*} == 'rosalind01' ]] || \ @@ -329,18 +324,18 @@ elif [[ ${#submitted[@]} -ge 1 ]] && \ qsub \ -m ba \ -q all.q \ - -v IN=${IN} \ - -v OUT=${OUT} \ - -o ${OUT}/pipeline_info \ - -e ${OUT}/pipeline_info \ - -M ${USER}@cdc.gov \ - -v LAB_HOME=${LAB_HOME} \ - -N ASM_${#submitted[@]} \ + -v IN="${IN}" \ + -v OUT="${OUT}" \ + -o "${OUT}/pipeline_info" \ + -e "${OUT}/pipeline_info" \ + -M "${USER}@cdc.gov" \ + -v LAB_HOME="${LAB_HOME}" \ + -N "ASM_${#submitted[@]}" \ -l max_runtime=72:00:00 \ - -v SINGULARITY_TMPDIR=${SINGULARITY_TMPDIR} \ - -v SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR} \ - -v NXF_SINGULARITY_CACHEDIR=${SINGULARITY_CACHEDIR} \ - ${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow + -v SINGULARITY_TMPDIR="${SINGULARITY_TMPDIR}" \ + -v SINGULARITY_CACHEDIR="${SINGULARITY_CACHEDIR}" \ + -v NXF_SINGULARITY_CACHEDIR="${SINGULARITY_CACHEDIR}" \ + "${LAB_HOME}/workflows/wf-paired-end-illumina-assembly/_run_assembly.uge-nextflow" else echo -e "${RED_TXT}Biolinux/Aspen/Rosalind HPC is not detected.\nSubmission cancelled. ${COLOR_OFF}" diff --git a/subworkflows/local/assemble_contigs.nf b/subworkflows/local/assemble_contigs.nf index b0dea7dc..4d40dc6e 100644 --- a/subworkflows/local/assemble_contigs.nf +++ b/subworkflows/local/assemble_contigs.nf @@ -56,13 +56,15 @@ workflow ASSEMBLE_CONTIGS { var_assembler_name // var (str): assembler_name main: - ch_versions = Channel.empty() - ch_qc_filechecks = Channel.empty() + ch_versions = Channel.empty() + ch_checksums_file = Channel.empty() + ch_qc_filechecks = Channel.empty() // Update meta to include meta.assembler if ( var_assembler_name == "SKESA" ) { // SKESA assembler // PROCESS: Run SKESA to assemble contigs with cleaned paired reads and cleaned singletons + // which skips post-assembly mapping for SNP and InDel corrections too for speed. 
ASSEMBLE_CONTIGS_SKESA ( ch_cleaned_reads ) @@ -79,7 +81,7 @@ workflow ASSEMBLE_CONTIGS { ) ch_versions = ch_versions.mix(FILTER_CONTIGS_BIOPYTHON.out.versions) - // PROCESS: Create BAM file + // PROCESS: Create BAM file for depth of coverage calculations MAP_CONTIGS_BWA ( ch_cleaned_reads.join(FILTER_CONTIGS_BIOPYTHON.out.uncorrected_contigs) ) @@ -98,9 +100,11 @@ workflow ASSEMBLE_CONTIGS { ) // Collect QC File Checks - ch_qc_filechecks = ch_qc_filechecks + ch_checksums_file = ch_checksums_file.mix(MAP_CONTIGS_BWA.out.checksums) + ch_qc_filechecks = ch_qc_filechecks .mix(ASSEMBLE_CONTIGS_SKESA.out.qc_filecheck) .mix(MAP_CONTIGS_BWA.out.qc_filecheck) + } else { // Defaulting to SPAdes assembler // PROCESS: Run SPAdes to assemble contigs with cleaned paired reads and cleaned singletons @@ -108,8 +112,6 @@ workflow ASSEMBLE_CONTIGS { ch_cleaned_reads ) ch_versions = ch_versions.mix(ASSEMBLE_CONTIGS_SPADES.out.versions) - - // ch_contigs = ASSEMBLE_CONTIGS_SPADES.out.contigs.map{ meta, file -> [ meta, [file] ] } ch_contigs = qcfilecheck( "ASSEMBLE_CONTIGS_SPADES", ASSEMBLE_CONTIGS_SPADES.out.qc_filecheck, @@ -122,7 +124,10 @@ workflow ASSEMBLE_CONTIGS { ) ch_versions = ch_versions.mix(FILTER_CONTIGS_BIOPYTHON.out.versions) - // PROCESS: Use BWA/Samtools/Pilon to correct contigs with cleaned PE reads + // PROCESS: Use BWA/Samtools/Pilon to SNP and InDel correct contigs with cleaned PE reads + // NOTE: The "path(cleaned_fastq_files)" is already input to this POLISH channel, but + // currently just the meta.id is used for the readset. Should be using the + // path(cleaned_fastq_files) items though. POLISH_ASSEMBLY_BWA_PILON ( ch_cleaned_reads.join(FILTER_CONTIGS_BIOPYTHON.out.uncorrected_contigs) ) @@ -141,7 +146,8 @@ workflow ASSEMBLE_CONTIGS { ) // Collect QC File Checks - ch_qc_filechecks = ch_qc_filechecks + ch_checksums_file = ch_checksums_file.mix(POLISH_ASSEMBLY_BWA_PILON.out.checksums) + ch_qc_filechecks = ch_qc_filechecks .mix(ASSEMBLE_CONTIGS_SPADES.out.qc_filecheck) .mix(POLISH_ASSEMBLY_BWA_PILON.out.qc_filecheck) } @@ -150,5 +156,6 @@ workflow ASSEMBLE_CONTIGS { bam_files = ch_bam_files // channel: [ val(meta), [{paired,single}.bam] ] assembly_file = ch_assembly_file // channel: [ val(meta), [assembly.fna] ] qc_filecheck = ch_qc_filechecks + checksums = ch_checksums_file // channel: [ val(meta), [assembly.fna] ] versions = ch_versions } diff --git a/subworkflows/local/assembly_assessment.nf b/subworkflows/local/assembly_assessment.nf index e54a534a..336994e9 100644 --- a/subworkflows/local/assembly_assessment.nf +++ b/subworkflows/local/assembly_assessment.nf @@ -3,6 +3,9 @@ // Parameters for each of these have to be used independent of each other. 
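With this change, ASSEMBLE_CONTIGS emits a `checksums` channel alongside `qc_filecheck`. The following minimal, self-contained Nextflow DSL2 sketch shows how a caller can fold such an emit into a single summary with `collectFile`, mirroring the pattern this changeset uses for its other `Summaries/` files; the process, columns, and output name are illustrative only, not pipeline code.

```groovy
// Minimal DSL2 sketch (editorial): consume a `checksums` emit and concatenate the
// per-sample files into one summary, following the collectFile pattern used throughout
// this changeset. All names below are illustrative.
nextflow.enable.dsl = 2

process FAKE_CHECKSUMS {
    input:
    val meta

    output:
    tuple val(meta), path("${meta}.checksums.tsv"), emit: checksums

    script:
    """
    printf 'Sample_name\\tFile\\tChecksum\\n%s\\tassembly.fna\\tabc123\\n' "${meta}" \\
        > "${meta}.checksums.tsv"
    """
}

workflow {
    FAKE_CHECKSUMS( Channel.of('sampleA', 'sampleB') )

    FAKE_CHECKSUMS.out.checksums
        .map { meta, file -> file }                      // keep only the file part
        .collectFile(
            name:       'Summary.Assembly_Checksums.tsv',
            keepHeader: true,                            // keep the header from the first file only
            storeDir:   "${params.outdir ?: 'results'}/Summaries"
        )
        .view { summary -> "Wrote ${summary}" }
}
```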
// +import java.nio.file.Files +import java.nio.file.Paths + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT LOCAL MODULES/SUBWORKFLOWS @@ -12,16 +15,17 @@ // // MODULES: Local modules // -include { CAT_DB_PREPARATION_UNIX } from "../../modules/local/cat_db_preparation_unix/main" -include { DOWNLOAD_CAT_DB_UNIX } from "../../modules/local/download_cat_db_unix/main" -include { BUSCO_DB_PREPARATION_UNIX } from "../../modules/local/busco_db_preparation_unix/main" -include { GTDBTK_DB_PREPARATION_UNIX } from "../../modules/local/gtdbtk_db_preparation_unix/main" -include { CHECKM2_DB_PREPARATION_UNIX } from "../../modules/local/checkm2_db_preparation_unix/main" - -include { QA_ASSEMBLY_QUAST } from "../../modules/local/qa_assembly_quast/main" -include { CALCULATE_COVERAGE_UNIX } from "../../modules/local/calculate_coverage_unix/main" -include { CLASSIFY_CONTIGS_CAT } from "../../modules/local/classify_contigs_cat/main" -include { ASSESS_ASSEMBLY_CHECKM2 } from "../../modules/local/assess_assembly_checkm2/main" +include { CAT_DB_PREPARATION_UNIX } from "../../modules/local/cat_db_preparation_unix/main" +include { DOWNLOAD_CAT_DB_UNIX } from "../../modules/local/download_cat_db_unix/main" +include { BUSCO_DB_PREPARATION_UNIX } from "../../modules/local/busco_db_preparation_unix/main" +include { SPLIT_MULTIFASTA_ASSEMBLY_BIOPYTHON } from "../../modules/local/split_multifasta_assembly_biopython/main" +include { GTDBTK_DB_PREPARATION_UNIX } from "../../modules/local/gtdbtk_db_preparation_unix/main" +include { CHECKM2_DB_PREPARATION_UNIX } from "../../modules/local/checkm2_db_preparation_unix/main" + +include { QA_ASSEMBLY_QUAST } from "../../modules/local/qa_assembly_quast/main" +include { CALCULATE_COVERAGE_UNIX } from "../../modules/local/calculate_coverage_unix/main" +include { CLASSIFY_CONTIGS_CAT } from "../../modules/local/classify_contigs_cat/main" +include { ASSESS_ASSEMBLY_CHECKM2 } from "../../modules/local/assess_assembly_checkm2/main" // // MODULES: nf-core modules @@ -62,8 +66,7 @@ workflow ASSEMBLY_ASSESSMENT { take: ch_assembly_file // channel: [ val(meta), [ contigs.fasta ] ] - ch_cleaned_fastq_files // channel: [ val(meta), [ cleaned_fastq_files (R1, R2, single) ] ] - ch_read_alignment_stats // channel: [ val(meta), [ CleanedReads-AlnStats.tsv ] ] + ch_read_alignment_stats // channel: [ val(meta), [ Clean_Reads-AlnStats.tsv ] ] ch_busco_config_file // channel: busco_config.ini ch_busco_db_file // channel: database ch_mash_db_file // channel: database @@ -85,30 +88,21 @@ workflow ASSEMBLY_ASSESSMENT { // PROCESS: Run QUAST on the polished assembly for quality assessment and // report the number of cleaned basepairs used to form the assembly QA_ASSEMBLY_QUAST ( - ch_cleaned_fastq_files.join(ch_assembly_file) + ch_assembly_file ) ch_versions = ch_versions.mix(QA_ASSEMBLY_QUAST.out.versions) // Collect assembly summaries and concatenate into one file ch_assembly_summary = QA_ASSEMBLY_QUAST.out.summary_assemblies - .collectFile( - name: "Summary.Assemblies.tsv", - keepHeader: true, - storeDir: "${params.outdir}/Summaries" - ) + .collectFile( + name: "Summary.Assembly_Metrics.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) ch_output_summary_files = ch_output_summary_files.mix(ch_assembly_summary) - // Collect cleaned read/base summaries and concatenate into one file - ch_cleaned_summary = QA_ASSEMBLY_QUAST.out.summary_reads - .collectFile( - name: "Summary.CleanedReads-Bases.tsv", - 
keepHeader: true, - storeDir: "${params.outdir}/Summaries" - ) - - ch_output_summary_files = ch_output_summary_files.mix(ch_cleaned_summary) - /* ================================================================================ Calculate coverage of assembly @@ -118,16 +112,18 @@ workflow ASSEMBLY_ASSESSMENT { // PROCESS: Calculate genome assembly depth of coverage CALCULATE_COVERAGE_UNIX ( QA_ASSEMBLY_QUAST.out.qa_summaries - .join(ch_read_alignment_stats) + .join(ch_assembly_file) + .join(ch_read_alignment_stats) ) ch_versions = ch_versions.mix(CALCULATE_COVERAGE_UNIX.out.versions) // Collect genome coverage summaries and concatenate into one file ch_genome_cov_summary = CALCULATE_COVERAGE_UNIX.out.summary .collectFile( - name: "Summary.GenomeCoverage.tsv", + name: "Summary.Assembly_Depth.tsv", keepHeader: true, - storeDir: "${params.outdir}/Summaries" + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" ) ch_output_summary_files = ch_output_summary_files.mix(ch_genome_cov_summary) @@ -175,13 +171,26 @@ workflow ASSEMBLY_ASSESSMENT { // PROCESS: Perform GTDB-Tk on assembly FastA file QA_ASSEMBLY_GTDBTK ( - ch_assembly_file, - ch_db_for_gtdbtk, - ch_mash_db_file + ch_assembly_file, // tuple val(meta) , path("bins/*") + ch_db_for_gtdbtk, // tuple val(db_name), path("database/*") + '/scratch', // val use_pplacer_scratch_dir NOTE: current nf-core module doesn't even use this path! + [] // path mash_db ) ch_versions = ch_versions.mix(QA_ASSEMBLY_GTDBTK.out.versions) ch_output_summary_files = ch_output_summary_files.mix(QA_ASSEMBLY_GTDBTK.out.summary.map{ meta, file -> file }) + // Collect GTDB-Tk summaries and concatenate into one file + ch_gtdbtk_summary = QA_ASSEMBLY_GTDBTK.out.summary + .map{ meta, file -> file } // Map to only include the files + .collectFile( + name: "Summary.Assemblies_Classified.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + + ch_output_summary_files = ch_output_summary_files.mix(ch_gtdbtk_summary) + /* ================================================================================ CAT: Contig Annotation Tool @@ -252,6 +261,17 @@ workflow ASSEMBLY_ASSESSMENT { ch_output_summary_files = ch_output_summary_files.mix(ch_cat_output.map{ meta, file -> file }) + // Collect CAT summaries and concatenate into one file + ch_classified_contigs_cat_summary = CLASSIFY_CONTIGS_CAT.out.output + .collectFile( + name: "Summary.Contigs_Classified.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + + ch_output_summary_files = ch_output_summary_files.mix(ch_classified_contigs_cat_summary) + /* ================================================================================ CheckM2: Check for completeness and contamination @@ -301,7 +321,17 @@ workflow ASSEMBLY_ASSESSMENT { ASSESS_ASSEMBLY_CHECKM2.out.summary ) - ch_output_summary_files = ch_output_summary_files.mix(ch_checkm2_output.map{ meta, file -> file }) + // Concatenate CheckM2 summaries + ch_checkm2_output = ASSESS_ASSEMBLY_CHECKM2.out.summary + .map{ meta, file -> file } // Map to only include the files + .collectFile( + name: "Summary.Assembly_Completeness.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + + ch_output_summary_files = ch_output_summary_files.mix(ch_checkm2_output) /* ================================================================================ @@ -345,10 +375,15 @@ workflow ASSEMBLY_ASSESSMENT { } } .collect() + // Debug prints + 
println "DEBUG: Type of ch_db_for_busco: ${ch_db_for_busco.getClass()}" + println "DEBUG: Contents of ch_db_for_busco: ${ch_db_for_busco.inspect()}" } else { error("Unsupported object given to --busco_db, database must be supplied as either a directory or a .tar.gz file!") } + // NOTE: this defaults to "[auto]" not "auto" and gives error, no output + // "ERROR: [auto]_odb10 is not a valid option for 'lineages'" without [0] for channel item in QA_ASSEMBLY_BUSCO ch_lineage_for_busco_db = Channel .of(ch_busco_db_file) .map{ @@ -365,14 +400,83 @@ workflow ASSEMBLY_ASSESSMENT { ch_versions = ch_versions.mix(SPLIT_MULTIFASTA_ASSEMBLY_BIOPYTHON.out.versions) // PROCESS: Perform BUSCO analysis on contigs + // SPLIT_MULTIFASTA_ASSEMBLY_BIOPYTHON.out.split_multifasta_assembly_dir, // tuple val(meta), path('tmp_input/*') + // ch_lineage_for_busco_db, // val lineage ; Required: lineage to check against, "auto" enables --auto-lineage instead QA_ASSEMBLY_BUSCO ( - SPLIT_MULTIFASTA_ASSEMBLY_BIOPYTHON.out.split_multifasta_assembly_dir, - ch_lineage_for_busco_db, - ch_db_for_busco, - ch_busco_config_file + ch_assembly_file, // tuple val(meta), path('tmp_input/*') + 'genome', // val mode ; Required: One of genome, proteins, or transcriptome + 'auto', // val lineage ; Required: lineage to check against, "auto" enables --auto-lineage instead + ch_db_for_busco, // path busco_lineages_path ; Recommended: path to busco lineages - downloads if not set + ch_busco_config_file // path config_file ; Optional: busco configuration file ) ch_versions = ch_versions.mix(QA_ASSEMBLY_BUSCO.out.versions) ch_output_summary_files = ch_output_summary_files.mix(QA_ASSEMBLY_BUSCO.out.batch_summary.map{ meta, file -> file }) + + // Collect BUSCO summaries and concatenate into one file + println "DEBUG: ch_busco_summary QA_ASSEMBLY_BUSCO.out.batch_summary = ${QA_ASSEMBLY_BUSCO.out.batch_summary}" + ch_busco_summary = QA_ASSEMBLY_BUSCO.out.batch_summary + .map { meta, file -> + println "DEBUG: ch_busco_summary meta.id = ${meta.id} (${meta.getClass()})" + println "DEBUG: ch_busco_summary file = ${file} (${file.getClass()})" + return [meta.id, file] // meta.id and file path (or string representing file path) + } + .collectFile( + name: "Summary.BUSCO_Completeness.tsv", + keepHeader: true, + sort: { + // Adding debug to print all incoming elements + println "DEBUG: ch_busco_summary Sorting pair: ${it} (${it.getClass()})" + def filePath = (it[1] instanceof String) ? Paths.get(it[1]) : it[1] + println "DEBUG: ch_busco_summary Resolved filePath = ${filePath}" + + // Check if the file exists before reading + if (Files.exists(filePath)) { + Files.readString(filePath) + } else { + println "ERROR: ch_busco_summary File does not exist at path: ${filePath}" + return "" + } + }, + storeDir: "${params.outdir}/Summaries" + ) { pair -> + def sample_id = pair[0] + def path = pair[1] + + println "DEBUG: ch_busco_summary Processing sample_id = ${sample_id}" + println "DEBUG: ch_busco_summary Processing path = ${path} (${path.getClass()})" + + def filePath = (path instanceof String) ? 
Paths.get(path) : path + + if (!Files.exists(filePath)) { + println "ERROR: ch_busco_summary File does not exist for sample ${sample_id}: ${filePath}" + return "" + } + + def lines = Files.readAllLines(filePath) // Read each line as a list of strings + lines[0] = "Sample_name\t" + lines[0].replaceAll(/\s/, '_') + + def modifiedRows = lines.collect { line -> + def columns = line.split('\t').toList() + columns.remove(1) // Remove the second column + return columns + } + + def idx = 0 + def finalRows = modifiedRows.collect { columns -> + def result + if (idx == 0) { + result = "Sample_name\t" + columns.join('\t') + } else { + result = "$sample_id\t" + columns.join('\t') + } + idx++ + return result + } + + return finalRows.join('\n') + '\n' + } + + // ch_output_summary_files = ch_output_summary_files.mix(ch_busco_summary) } emit: diff --git a/subworkflows/local/downsampling.nf b/subworkflows/local/downsampling.nf index 0b2e5352..1c4a8b0f 100644 --- a/subworkflows/local/downsampling.nf +++ b/subworkflows/local/downsampling.nf @@ -13,8 +13,23 @@ // include { ESTIMATE_GENOME_SIZE_KMC } from "../../modules/local/estimate_genome_size_kmc/main" include { COUNT_TOTAL_BP_INPUT_READS_SEQTK } from "../../modules/local/count_total_bp_input_reads_seqtk/main" +include { COUNT_TOTAL_BP_INPUT_READS_SEQKIT } from "../../modules/local/count_total_bp_input_reads_seqkit/main" include { ESTIMATE_ORIGINAL_INPUT_DEPTH_UNIX } from "../../modules/local/estimate_original_input_depth_unix/main" include { SUBSAMPLE_READS_TO_DEPTH_SEQTK } from "../../modules/local/subsample_reads_to_depth_seqtk/main" +include { SUBSAMPLE_READS_TO_DEPTH_SEQKIT } from "../../modules/local/subsample_reads_to_depth_seqkit/main" +include { CALCULATE_METRICS_FASTQ_SEQTK as CALC_STATS_DOWNSAMPLE_FQ_SEQTK } from "../../modules/local/calculate_metrics_fastq_seqtk/main" +include { CALCULATE_METRICS_FASTQ_SEQKIT as CALC_STATS_DOWNSAMPLE_FQ_SEQKIT } from "../../modules/local/calculate_metrics_fastq_seqkit/main" + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + WORKFLOW FUNCTIONS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// Convert params.assembler to lowercase +def toLower(it) { + it.toString().toLowerCase() +} /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -28,7 +43,8 @@ workflow DOWNSAMPLE { ch_raw_reads // channel: [ val(meta), [reads] ] main: - ch_versions = Channel.empty() + ch_versions = Channel.empty() + ch_output_summary_files = Channel.empty() // Handle too much raw data, subsample the input FastQ files // Only calculate genome size if genome_size is unknown or a depth given to subsample @@ -37,11 +53,12 @@ workflow DOWNSAMPLE { ESTIMATE_GENOME_SIZE_KMC ( ch_raw_reads ) + ch_versions = ch_versions.mix(ESTIMATE_GENOME_SIZE_KMC.out.versions) } else { if (params.depth <= 0) { - log.info("Depth is set to ${params.genome_size}x. No subsampling to perform and therefore no genome size estimation required.") + log.info("Depth is set to <= 0x. 
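The per-sample rewriting of the BUSCO batch summaries above combines `collectFile` with a grouping closure that injects a leading `Sample_name` column. A stripped-down, runnable sketch of that pattern is shown below, with the file I/O and debug printing removed and toy `[id, text]` pairs standing in for the real `batch_summary` channel items; it is editorial only, not the pipeline's implementation.

```groovy
// Stripped-down sketch (editorial) of merging per-sample TSV text into one summary
// while adding a Sample_name column, as done for the BUSCO batch summaries above.
nextflow.enable.dsl = 2

workflow {
    Channel
        .of( ['sampleA', 'Input_file\tComplete_pct\nsampleA.fasta\t97.5'],
             ['sampleB', 'Input_file\tComplete_pct\nsampleB.fasta\t88.1'] )
        .collectFile( name: 'Summary.BUSCO_Completeness.tsv', keepHeader: true ) { item ->
            def (id, text) = item
            def lines  = text.readLines()
            def header = 'Sample_name\t' + lines[0]               // label the new first column
            def rows   = lines.drop(1).collect { "${id}\t${it}" } // prefix data rows with the sample id
            ( [header] + rows ).join('\n') + '\n'
        }
        .view { merged -> merged.text }
}
```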
No subsampling to perform and therefore no genome size estimation required.") } else { log.info("Using the user-input genome size of ${params.genome_size}bp") } @@ -50,30 +67,107 @@ workflow DOWNSAMPLE { // still consider downsampling with specified genome_size input value } + // Only if specified depth is less than wanted depth, subsample infiles if (params.depth > 0) { log.info("Estimating if the input exceeds ${params.depth}x") - // Use the genome size to figure out the expected depth - COUNT_TOTAL_BP_INPUT_READS_SEQTK ( - ch_raw_reads - ) - ch_versions = ch_versions.mix(COUNT_TOTAL_BP_INPUT_READS_SEQTK.out.versions) + // Subsample with seqtk + if ( toLower(params.subsample_tool) == "seqtk" ) { + + // Use the genome size to figure out the expected depth + COUNT_TOTAL_BP_INPUT_READS_SEQTK ( + ch_raw_reads + ) - ESTIMATE_ORIGINAL_INPUT_DEPTH_UNIX ( - COUNT_TOTAL_BP_INPUT_READS_SEQTK.out.input_total_bp - .join(ESTIMATE_GENOME_SIZE_KMC.out.genome_size) - ) - ch_versions = ch_versions.mix(ESTIMATE_ORIGINAL_INPUT_DEPTH_UNIX.out.versions) + ch_versions = ch_versions.mix(COUNT_TOTAL_BP_INPUT_READS_SEQTK.out.versions) - // Only if specified depth is less than wanted depth, subsample infiles - SUBSAMPLE_READS_TO_DEPTH_SEQTK ( - ch_raw_reads.join(ESTIMATE_ORIGINAL_INPUT_DEPTH_UNIX.out.fraction_of_reads_to_use) - ) - ch_versions = ch_versions.mix(SUBSAMPLE_READS_TO_DEPTH_SEQTK.out.versions) + ESTIMATE_ORIGINAL_INPUT_DEPTH_UNIX ( + COUNT_TOTAL_BP_INPUT_READS_SEQTK.out.input_total_bp + .join(ESTIMATE_GENOME_SIZE_KMC.out.genome_size) + ) + + ch_versions = ch_versions.mix(ESTIMATE_ORIGINAL_INPUT_DEPTH_UNIX.out.versions) + + SUBSAMPLE_READS_TO_DEPTH_SEQTK ( + ch_raw_reads.join(ESTIMATE_ORIGINAL_INPUT_DEPTH_UNIX.out.fraction_of_reads_to_use) + ) + + ch_versions = ch_versions.mix(SUBSAMPLE_READS_TO_DEPTH_SEQTK.out.versions) + + // Collect subsampled reads + ch_downsampled_reads = SUBSAMPLE_READS_TO_DEPTH_SEQTK.out.reads - // Collect subsampled reads - ch_downsampled_reads = SUBSAMPLE_READS_TO_DEPTH_SEQTK.out.reads + // PROCESS: Calculate downsampled FastQ metrics for each sample with Seqtk + CALC_STATS_DOWNSAMPLE_FQ_SEQTK ( + SUBSAMPLE_READS_TO_DEPTH_SEQTK.out.reads, + "Downsampled_Reads" + ) + ch_versions = ch_versions.mix(CALC_STATS_DOWNSAMPLE_FQ_SEQTK.out.versions) + + // Collect cleaned read/base summaries and concatenate into one file + ch_downsampled_reads_metrics_summary = CALC_STATS_DOWNSAMPLE_FQ_SEQTK.out.output + .collectFile( + name: "Summary.Downsampled_Reads.Metrics.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + + ch_output_summary_files = ch_output_summary_files.mix(ch_downsampled_reads_metrics_summary) + + // Subsample with SeqKit + } else if ( toLower(params.subsample_tool) == "seqkit" ) { + + // // Use the genome size to figure out the expected depth + // CALC_STATS_INPUT_FQ_SEQKIT ( + // ch_raw_reads, + // "Input_for_Subsampling_Reads" + // ) + // ch_versions = ch_versions.mix(CALC_STATS_INPUT_FQ_SEQKIT.out.versions) + + COUNT_TOTAL_BP_INPUT_READS_SEQKIT ( + ch_raw_reads + ) + + ch_versions = ch_versions.mix(COUNT_TOTAL_BP_INPUT_READS_SEQKIT.out.versions) + + ESTIMATE_ORIGINAL_INPUT_DEPTH_UNIX ( + COUNT_TOTAL_BP_INPUT_READS_SEQKIT.out.input_total_bp + .join(ESTIMATE_GENOME_SIZE_KMC.out.genome_size) + ) + + ch_versions = ch_versions.mix(ESTIMATE_ORIGINAL_INPUT_DEPTH_UNIX.out.versions) + + // Subsample with seqkit + SUBSAMPLE_READS_TO_DEPTH_SEQKIT ( + ch_raw_reads.join(ESTIMATE_ORIGINAL_INPUT_DEPTH_UNIX.out.fraction_of_reads_to_use) 
+ ) + + ch_versions = ch_versions.mix(SUBSAMPLE_READS_TO_DEPTH_SEQKIT.out.versions) + + // Collect subsampled reads + ch_downsampled_reads = SUBSAMPLE_READS_TO_DEPTH_SEQKIT.out.reads + + // PROCESS: Calculate downsampled FastQ metrics for each sample with SeqKit + CALC_STATS_DOWNSAMPLE_FQ_SEQKIT ( + SUBSAMPLE_READS_TO_DEPTH_SEQKIT.out.reads, + "Downsampled_Reads" + ) + + ch_versions = ch_versions.mix(CALC_STATS_DOWNSAMPLE_FQ_SEQKIT.out.versions) + + // Collect cleaned read/base summaries and concatenate into one file + ch_downsampled_reads_metrics_summary = CALC_STATS_DOWNSAMPLE_FQ_SEQKIT.out.output + .collectFile( + name: "Summary.Downsampled_Reads.Metrics.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + + ch_output_summary_files = ch_output_summary_files.mix(ch_downsampled_reads_metrics_summary) + } } else { // Skip subsampling and pass raw reads to PhiX removal // Collect raw reads @@ -81,6 +175,7 @@ workflow DOWNSAMPLE { } emit: - reads = ch_downsampled_reads // channel: [ val(meta), [reads] ] - versions = ch_versions + reads = ch_downsampled_reads // channel: [ val(meta), [reads] ] + versions = ch_versions + output_summary_files = ch_output_summary_files } diff --git a/subworkflows/local/host_removal.nf b/subworkflows/local/host_removal.nf index 5bf69faf..c4f11ff5 100644 --- a/subworkflows/local/host_removal.nf +++ b/subworkflows/local/host_removal.nf @@ -15,6 +15,9 @@ include { REMOVE_HOST_HOSTILE } from "../../modules/local/remove_ include { REMOVE_HOST_SRA_HUMAN_SCRUBBER } from "../../modules/local/remove_host_sra_human_scrubber/main" include { UPDATE_DB_SRA_HUMAN_SCRUBBER } from "../../modules/local/update_db_sra_human_scrubber/main" include { REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR } from "../../modules/local/remove_broken_pairs_bbtools_repair/main" +include { CALCULATE_METRICS_FASTQ_SEQKIT as CALC_STATS_HOSTILE_REMOVED_FQ_SEQKIT } from "../../modules/local/calculate_metrics_fastq_seqkit/main" +include { CALCULATE_METRICS_FASTQ_SEQKIT as CALC_STATS_REPAIR_REMOVED_FQ_SEQKIT } from "../../modules/local/calculate_metrics_fastq_seqkit/main" +include { CALCULATE_METRICS_FASTQ_SEQKIT as CALC_STATS_SCRUB_REMOVED_FQ_SEQKIT } from "../../modules/local/calculate_metrics_fastq_seqkit/main" /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -57,8 +60,9 @@ workflow HOST_REMOVAL { ch_sra_scrubber_db_file // channel: [ sra-human-scrubber database file ] main: - ch_versions = Channel.empty() - ch_qc_filechecks = Channel.empty() + ch_versions = Channel.empty() + ch_qc_filechecks = Channel.empty() + ch_output_summary_files = Channel.empty() // Database handling if ( !ch_sra_scrubber_db_file.isEmpty() ) { @@ -102,10 +106,41 @@ workflow HOST_REMOVAL { REMOVE_HOST_SRA_HUMAN_SCRUBBER.out.host_removed_reads ) + // Collect removal summaries and concatenate into one file + ch_scrub_summary = REMOVE_HOST_SRA_HUMAN_SCRUBBER.out.summary + .collectFile( + name: "Summary.SRA_Human_Scrubbed.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + .view { collectedFiles -> println "DEBUG: From REMOVE_HOST_SRA_HUMAN_SCRUBBER.out.summary, collected files: ${collectedFiles}" } + + ch_output_summary_files = ch_output_summary_files.mix(ch_scrub_summary) + + // PROCESS: Calculate human removed FastQ metrics for each sample with SeqKit + CALC_STATS_SCRUB_REMOVED_FQ_SEQKIT ( + REMOVE_HOST_SRA_HUMAN_SCRUBBER.out.host_removed_reads, + "SRA_Scrubbed_Reads" + ) + + ch_versions = 
ch_versions.mix(CALC_STATS_SCRUB_REMOVED_FQ_SEQKIT.out.versions) + + // Collect scrubbed counts into one file + ch_scrubbed_reads_metrics = CALC_STATS_SCRUB_REMOVED_FQ_SEQKIT.out.output + .collectFile( + name: "Summary.SRA_Scrubbed_Reads.Metrics.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + + ch_output_summary_files = ch_output_summary_files.mix(ch_scrubbed_reads_metrics) + // sra-human-scrubber non-default "-x" removes reads instead of masks // with N, so it's essential to discard broken pairs or else the // assembly polishing step with bwa mem and samtools mapping fails - // with an error for the such as: + // with an error, such as: // [mem_sam_pe] paired reads have different names: "SRR16343585.13690", "SRR16343585.13689" // [mem_sam_pe] paired reads have different names: "SRR16343585.13682", "SRR16343585.13681" // PROCESS: run BBTools' repair.sh to discard broken sister reads @@ -122,6 +157,37 @@ REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR.out.repaired_reads ) + // Collect removal summaries and concatenate into one file + ch_repair_summary = REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR.out.summary + .collectFile( + name: "Summary.BBTools_Repair_Removal.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + .view { collectedFiles -> println "DEBUG: From REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR.out.summary, collected files: ${collectedFiles}" } + + ch_output_summary_files = ch_output_summary_files.mix(ch_repair_summary) + + // PROCESS: Calculate human removed FastQ metrics for each sample with SeqKit + CALC_STATS_REPAIR_REMOVED_FQ_SEQKIT ( + REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR.out.repaired_reads, + "BBTools_Repaired_Reads" + ) + + ch_versions = ch_versions.mix(CALC_STATS_REPAIR_REMOVED_FQ_SEQKIT.out.versions) + + // Collect counts from repair.sh removed broken pairs into one file + ch_repaired_reads_metrics = CALC_STATS_REPAIR_REMOVED_FQ_SEQKIT.out.output + .collectFile( + name: "Summary.BBTools_Repaired_Reads.Metrics.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + + ch_output_summary_files = ch_output_summary_files.mix(ch_repaired_reads_metrics) + // Collect QC File Checks ch_qc_filechecks = ch_qc_filechecks .mix(REMOVE_HOST_SRA_HUMAN_SCRUBBER.out.qc_filecheck) @@ -129,6 +195,7 @@ } else if ( toLower(params.host_remove) == "hostile" ) { // hostile removal tool + // PROCESS: Run hostile to remove background host DNA read sequences REMOVE_HOST_HOSTILE ( ch_infile_handling ) @@ -145,8 +212,40 @@ // Collect QC File Checks ch_qc_filechecks = ch_qc_filechecks.mix(REMOVE_HOST_HOSTILE.out.qc_filecheck) + // Collect removal summaries and concatenate into one file + ch_hostile_summary = REMOVE_HOST_HOSTILE.out.summary + .collectFile( + name: "Summary.Hostile_Human_Removed.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + .view { collectedFiles -> println "DEBUG: From REMOVE_HOST_HOSTILE.out.summary, collected files: ${collectedFiles}" } + + ch_output_summary_files = ch_output_summary_files.mix(ch_hostile_summary) + + // PROCESS: Calculate human removed FastQ metrics for each sample with SeqKit + CALC_STATS_HOSTILE_REMOVED_FQ_SEQKIT ( + REMOVE_HOST_HOSTILE.out.host_removed_reads, + "Hostile_Removed_Reads" + ) + + ch_versions = ch_versions.mix(CALC_STATS_HOSTILE_REMOVED_FQ_SEQKIT.out.versions) + + // Collect hostile removal counts into one
file + ch_hostile_removed_reads_metrics = CALC_STATS_HOSTILE_REMOVED_FQ_SEQKIT.out.output + .collectFile( + name: "Summary.Hostile_Removed_Reads.Metrics.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + + ch_output_summary_files = ch_output_summary_files.mix(ch_hostile_removed_reads_metrics) + } else if ( toLower(params.host_remove) == "both" && ch_db_for_sra_human_scrubber ) { - // sra-human-scrubber removal tool then hostile removal tool + // sra-human-scrubber removal tool (+ repair step), then hostile removal tool + // PROCESS: Run sra-human-scrubber first and then hostile to remove background host DNA read sequences REMOVE_HOST_SRA_HUMAN_SCRUBBER ( ch_infile_handling, @@ -161,6 +260,37 @@ workflow HOST_REMOVAL { REMOVE_HOST_SRA_HUMAN_SCRUBBER.out.host_removed_reads ) + // Collect removal summaries and concatenate into one file + ch_scrub_summary = REMOVE_HOST_SRA_HUMAN_SCRUBBER.out.summary + .collectFile( + name: "Summary.SRA_Human_Scrubbed.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + .view { collectedFiles -> println "DEBUG: From REMOVE_HOST_SRA_HUMAN_SCRUBBER.out.summary, collected files: ${collectedFiles}" } + + ch_output_summary_files = ch_output_summary_files.mix(ch_scrub_summary) + + // PROCESS: Calculate human removed FastQ metrics for each sample with SeqKit + CALC_STATS_SCRUB_REMOVED_FQ_SEQKIT ( + REMOVE_HOST_SRA_HUMAN_SCRUBBER.out.host_removed_reads, + "SRA_Scrubbed_Reads" + ) + + ch_versions = ch_versions.mix(CALC_STATS_SCRUB_REMOVED_FQ_SEQKIT.out.versions) + + // Collect scrubbed counts into one file + ch_scrubbed_reads_metrics = CALC_STATS_SCRUB_REMOVED_FQ_SEQKIT.out.output + .collectFile( + name: "Summary.SRA_Scrubbed_Reads.Metrics.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + + ch_output_summary_files = ch_output_summary_files.mix(ch_scrubbed_reads_metrics) + // sra-human-scrubber non-default "-x" removes reads instead of masks // with N, so it's essential to discard broken pairs or else the // assembly polishing step with bwa mem and samtools mapping fails @@ -181,12 +311,43 @@ workflow HOST_REMOVAL { REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR.out.repaired_reads ) + // Collect removal summaries and concatenate into one file + ch_repair_summary = REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR.out.summary + .collectFile( + name: "Summary.BBTools_Repair_Removal.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + .view { collectedFiles -> println "DEBUG: From REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR.out.summary, collected files: ${collectedFiles}" } + + ch_output_summary_files = ch_output_summary_files.mix(ch_repair_summary) + + // PROCESS: Calculate human removed FastQ metrics for each sample with SeqKit + CALC_STATS_REPAIR_REMOVED_FQ_SEQKIT ( + REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR.out.repaired_reads, + "BBTools_Repaired_Reads" + ) + + ch_versions = ch_versions.mix(CALC_STATS_REPAIR_REMOVED_FQ_SEQKIT.out.versions) + + // Collect counts from repair.sh removed broken pairs into one file + ch_repaired_reads_metrics = CALC_STATS_REPAIR_REMOVED_FQ_SEQKIT.out.output + .collectFile( + name: "Summary.BBTools_Repaired_Reads.Metrics.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + + ch_output_summary_files = ch_output_summary_files.mix(ch_repaired_reads_metrics) + // hostile removal tool // PROCESS: Run hostile to remove background host 
DNA read sequences REMOVE_HOST_HOSTILE ( ch_repair_sra ) - ch_versions = ch_versions.mix(REMOVE_HOST_HOSTILE.out.versions) + ch_versions = ch_versions.mix(REMOVE_HOST_HOSTILE.out.versions) // Collect output files ch_host_removed_reads = qcfilecheck( @@ -201,6 +362,35 @@ workflow HOST_REMOVAL { .mix(REMOVE_BROKEN_PAIRS_BBTOOLS_REPAIR.out.qc_filecheck) .mix(REMOVE_HOST_HOSTILE.out.qc_filecheck) + // Collect removal summaries and concatenate into one file + ch_hostile_summary = REMOVE_HOST_HOSTILE.out.summary + .collectFile( + name: "Summary.Hostile_Human_Removed.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + .view { collectedFiles -> println "DEBUG: From REMOVE_HOST_HOSTILE.out.summary, collected files: ${collectedFiles}" } + + ch_output_summary_files = ch_output_summary_files.mix(ch_hostile_summary) + // PROCESS: Calculate human removed FastQ metrics for each sample with SeqKit + CALC_STATS_HOSTILE_REMOVED_FQ_SEQKIT ( + REMOVE_HOST_HOSTILE.out.host_removed_reads, + "Hostile_Removed_Reads" + ) + + ch_versions = ch_versions.mix(CALC_STATS_HOSTILE_REMOVED_FQ_SEQKIT.out.versions) + + // Collect hostile removal counts into one file + ch_hostile_removed_reads_metrics = CALC_STATS_HOSTILE_REMOVED_FQ_SEQKIT.out.output + .collectFile( + name: "Summary.Hostile_Removed_Reads.Metrics.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + + ch_output_summary_files = ch_output_summary_files.mix(ch_hostile_removed_reads_metrics) } else if ( toLower(params.host_remove) == "skip" ) { // User-specified skip host removal @@ -214,7 +404,8 @@ workflow HOST_REMOVAL { } emit: - host_removed_reads = ch_host_removed_reads // channel: [ val(meta), [host_removed_fastq_files (R1, R2)] ] - qc_filecheck = ch_qc_filechecks - versions = ch_versions + host_removed_reads = ch_host_removed_reads // channel: [ val(meta), [host_removed_fastq_files (R1, R2)] ] + qc_filecheck = ch_qc_filechecks + versions = ch_versions + output_summary_files = ch_output_summary_files } diff --git a/tower.yml b/tower.yml index fafae09c..f5c3f798 100755 --- a/tower.yml +++ b/tower.yml @@ -1,5 +1,5 @@ -reports: - results/multiqc/multiqc_report.html: - display: "MultiQC HTML report" - results/samplesheet.valid.csv: - display: "Auto-created samplesheet with collated metadata and FASTQ paths" \ No newline at end of file +reports: + results/multiqc/multiqc_report.html: + display: "MultiQC HTML report" + results/samplesheet.valid.csv: + display: "Auto-created samplesheet with collated metadata and FASTQ paths" diff --git a/workflows/assembly.nf b/workflows/assembly.nf index 77faf26d..8f29f2c2 100644 --- a/workflows/assembly.nf +++ b/workflows/assembly.nf @@ -43,6 +43,13 @@ if(params.busco_config){ // MODULES: Local modules // include { INFILE_HANDLING_UNIX } from "../modules/local/infile_handling_unix/main" +include { VALIDATE_FASTQ_SEQFU } from "../modules/local/validate_fastq_seqfu/main.nf" + +include { CALCULATE_METRICS_FASTQ_SEQKIT as CALC_STATS_INPUT_FQ_SEQKIT } from "../modules/local/calculate_metrics_fastq_seqkit/main" +include { CALCULATE_METRICS_FASTQ_SEQKIT as CALC_STATS_NOPHIX_FQ_SEQKIT } from "../modules/local/calculate_metrics_fastq_seqkit/main" +include { CALCULATE_METRICS_FASTQ_SEQKIT as CALC_STATS_TRIM_FQ_SEQKIT } from "../modules/local/calculate_metrics_fastq_seqkit/main" +include { CALCULATE_METRICS_FASTQ_SEQKIT as CALC_STATS_CLEANEDREADS_FQ_SEQKIT } from "../modules/local/calculate_metrics_fastq_seqkit/main" +include { 
CALCULATE_METRICS_FASTQ_SEQTK } from "../modules/local/calculate_metrics_fastq_seqtk/main" include { REMOVE_PHIX_BBDUK } from "../modules/local/remove_phix_bbduk/main" include { TRIM_READS_TRIMMOMATIC } from "../modules/local/trim_reads_trimmomatic/main" @@ -254,13 +261,17 @@ workflow ASSEMBLY { INFILE_HANDLING_UNIX ( INPUT_CHECK.out.raw_reads ) + // INFILE_HANDLING_UNIX + // .view { file -> println "DEBUG: From INFILE_HANDLING_UNIX, emitting file: ${file}" } + // INFILE_HANDLING_UNIX + // .view { item -> println "DEBUG: From INFILE_HANDLING_UNIX, channel item: ${item}" } ch_versions = ch_versions.mix(INFILE_HANDLING_UNIX.out.versions) ch_qc_filecheck = ch_qc_filecheck.concat(INFILE_HANDLING_UNIX.out.qc_filecheck) ch_infile_handling = qcfilecheck( "INFILE_HANDLING_UNIX", INFILE_HANDLING_UNIX.out.qc_filecheck, INFILE_HANDLING_UNIX.out.input - ) + ) ch_infile_handling = ch_infile_handling .map{ @@ -269,24 +280,73 @@ workflow ASSEMBLY { [ meta, file] } + ch_infile_checksum = INFILE_HANDLING_UNIX.out.checksums + .collectFile( + name: "Summary.Input_Checksums.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + .view { collectedFiles -> println "DEBUG: From INFILE_HANDLING_UNIX.out.checksums, collected files: ${collectedFiles}" } + ch_output_summary_files = ch_output_summary_files.mix(ch_infile_checksum) + + // PROCESS: Calculate input FastQ metrics for each sample with SeqKit + CALC_STATS_INPUT_FQ_SEQKIT ( + INPUT_CHECK.out.raw_reads, + "Input_Reads" + ) + ch_versions = ch_versions.mix(CALC_STATS_INPUT_FQ_SEQKIT.out.versions) + + // Collect raw input read/base summaries and concatenate into one file + ch_input_reads_metrics_summary = CALC_STATS_INPUT_FQ_SEQKIT.out.output + .collectFile( + name: "Summary.Input_Reads.Metrics.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + ch_output_summary_files = ch_output_summary_files.mix(ch_input_reads_metrics_summary) + + // Check input files are valid FastQ format + VALIDATE_FASTQ_SEQFU ( + INPUT_CHECK.out.raw_reads + ) + ch_versions = ch_versions.mix(VALIDATE_FASTQ_SEQFU.out.versions) + ch_qc_filecheck = ch_qc_filecheck.concat(VALIDATE_FASTQ_SEQFU.out.qc_filecheck) + ch_validate_fastq_format = qcfilecheck( + "VALIDATE_FASTQ_SEQFU", + VALIDATE_FASTQ_SEQFU.out.qc_filecheck, + VALIDATE_FASTQ_SEQFU.out.input + ) + // SUBWORKFLOW: Remove host from FastQ files HOST_REMOVAL ( ch_infile_handling, ch_sra_scrubber_db_file ) - ch_versions = ch_versions.mix(HOST_REMOVAL.out.versions) + // HOST_REMOVAL + // .view { file -> println "DEBUG: From HOST_REMOVAL, emitting file: ${file}" } + // HOST_REMOVAL + // .view { item -> println "DEBUG: From HOST_REMOVAL, channel item: ${item}" } + ch_versions = ch_versions.mix(HOST_REMOVAL.out.versions) + ch_output_summary_files = ch_output_summary_files.mix(HOST_REMOVAL.out.output_summary_files) // SUBWORKFLOW: Downsample FastQ files DOWNSAMPLE ( HOST_REMOVAL.out.host_removed_reads ) - ch_versions = ch_versions.mix(DOWNSAMPLE.out.versions) + ch_versions = ch_versions.mix(DOWNSAMPLE.out.versions) + ch_output_summary_files = ch_output_summary_files.mix(DOWNSAMPLE.out.output_summary_files) // PROCESS: Run bbduk to remove PhiX reads REMOVE_PHIX_BBDUK ( DOWNSAMPLE.out.reads, ch_phix_reference ) + // REMOVE_PHIX_BBDUK + // .view { file -> println "DEBUG: From REMOVE_PHIX_BBDUK, emitting file: ${file}" } + // REMOVE_PHIX_BBDUK + // .view { item -> println "DEBUG: From REMOVE_PHIX_BBDUK, channel item: ${item}" } ch_versions = 
ch_versions.mix(REMOVE_PHIX_BBDUK.out.versions) ch_qc_filecheck = ch_qc_filecheck.concat(REMOVE_PHIX_BBDUK.out.qc_filecheck) ch_removed_phix = qcfilecheck( @@ -300,34 +360,75 @@ workflow ASSEMBLY { .collectFile( name: "Summary.PhiX_Removal.tsv", keepHeader: true, + sort: { file -> file.text }, storeDir: "${params.outdir}/Summaries" ) - + .view { collectedFiles -> println "DEBUG: From REMOVE_PHIX_BBDUK.out.summary, collected files: ${collectedFiles}" } ch_output_summary_files = ch_output_summary_files.mix(ch_phix_removal_summary) + // PROCESS: Calculate PhiX-free FastQ metrics for each sample with SeqKit + CALC_STATS_NOPHIX_FQ_SEQKIT ( + REMOVE_PHIX_BBDUK.out.phix_removed_reads, + "PhiX_Removed_Reads" + ) + ch_versions = ch_versions.mix(CALC_STATS_NOPHIX_FQ_SEQKIT.out.versions) + + // Collect PhiX-free read/base summaries and concatenate into one file + ch_nophix_reads_metrics_summary = CALC_STATS_NOPHIX_FQ_SEQKIT.out.output + .collectFile( + name: "Summary.PhiX_Removed_Reads.Metrics.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + ch_output_summary_files = ch_output_summary_files.mix(ch_nophix_reads_metrics_summary) + if ( toLower(params.trim_reads_tool) == "trimmomatic" ) { // PROCESS: Run trimmomatic to clip adapters and do quality trimming TRIM_READS_TRIMMOMATIC ( ch_removed_phix, ch_adapter_reference ) + // TRIM_READS_TRIMMOMATIC + // .view { file -> println "DEBUG: From TRIM_READS_TRIMMOMATIC, emitting file: ${file}" } + // TRIM_READS_TRIMMOMATIC + // .view { item -> println "DEBUG: From TRIM_READS_TRIMMOMATIC, channel item: ${item}" } ch_versions = ch_versions.mix(TRIM_READS_TRIMMOMATIC.out.versions) ch_qc_filecheck = ch_qc_filecheck.concat(TRIM_READS_TRIMMOMATIC.out.qc_filecheck) ch_trim_reads = qcfilecheck( "TRIM_READS_TRIMMOMATIC", TRIM_READS_TRIMMOMATIC.out.qc_filecheck, TRIM_READS_TRIMMOMATIC.out.fastq_adapters_removed - ) + ) // Collect read trimming summaries and concatenate into one file ch_trimmomatic_summary = TRIM_READS_TRIMMOMATIC.out.summary .collectFile( - name: "Summary-Trimmomatic.Adapter_and_QC_Trimming.tsv", + name: "Summary.Adapter_and_QC_Trim.tsv", keepHeader: true, + sort: { file -> file.text }, storeDir: "${params.outdir}/Summaries" ) - + .view { collectedFiles -> println "DEBUG: From TRIM_READS_TRIMMOMATIC.out.summary, collected files: ${collectedFiles}" } ch_output_summary_files = ch_output_summary_files.mix(ch_trimmomatic_summary) + + // PROCESS: Calculate adapter-and-quality-trimmed FastQ metrics for each sample with SeqKit + CALC_STATS_TRIM_FQ_SEQKIT ( + TRIM_READS_TRIMMOMATIC.out.fastq_adapters_removed, + "Adapter_QC_Trim_Reads" + ) + ch_versions = ch_versions.mix(CALC_STATS_TRIM_FQ_SEQKIT.out.versions) + + // Collect adapter-and-quality-trimmed read/base summaries and concatenate into one file + ch_trim_reads_metrics_summary = CALC_STATS_TRIM_FQ_SEQKIT.out.output + .collectFile( + name: "Summary.Adapter_QC_Trim_Reads.Metrics.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + ch_output_summary_files = ch_output_summary_files.mix(ch_trim_reads_metrics_summary) + } else if ( toLower(params.trim_reads_tool) == "fastp" ) { // Do not use 'adapters_Nextera_NEB_TruSeq_NuGEN_ThruPLEX.fas' for fastp ch_adapter_reference = ch_adapter_reference.map{ it[0].getSimpleName() == "adapters_Nextera_NEB_TruSeq_NuGEN_ThruPLEX.fas" } @@ -338,28 +439,54 @@ workflow ASSEMBLY { ch_removed_phix, ch_adapter_reference ) + // TRIM_READS_FASTP + // .view { file -> println 
"DEBUG: From TRIM_READS_FASTP, emitting file: ${file}" } + // TRIM_READS_FASTP + // .view { item -> println "DEBUG: From TRIM_READS_FASTP, channel item: ${item}" } ch_versions = ch_versions.mix(TRIM_READS_FASTP.out.versions) ch_qc_filecheck = ch_qc_filecheck.concat(TRIM_READS_FASTP.out.qc_filecheck) ch_trim_reads = qcfilecheck( "TRIM_READS_FASTP", TRIM_READS_FASTP.out.qc_filecheck, TRIM_READS_FASTP.out.fastq_adapters_removed - ) + ) ch_fastp_summary = TRIM_READS_FASTP.out.summary .collectFile( - name: "Summary-fastp.Adapter_and_QC_Trimming.tsv", + name: "Summary.Adapter_and_QC_Trim.tsv", keepHeader: true, + sort: { file -> file.text }, storeDir: "${params.outdir}/Summaries" ) - + .view { collectedFiles -> println "DEBUG: From TRIM_READS_FASTP.out.summary, collected files: ${collectedFiles}" } ch_output_summary_files = ch_output_summary_files.mix(ch_fastp_summary) + + // PROCESS: Calculate adapter-and-quality-trimmed FastQ metrics for each sample with SeqKit + CALC_STATS_TRIM_FQ_SEQKIT ( + TRIM_READS_FASTP.out.fastq_adapters_removed, + "Adapter_QC_Trim_Reads" + ) + ch_versions = ch_versions.mix(CALC_STATS_TRIM_FQ_SEQKIT.out.versions) + + // Collect adapter-and-quality-trimmed read/base summaries and concatenate into one file + ch_trim_reads_metrics_summary = CALC_STATS_TRIM_FQ_SEQKIT.out.output + .collectFile( + name: "Summary.Adapter_QC_Trim_Reads.Metrics.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + ch_output_summary_files = ch_output_summary_files.mix(ch_trim_reads_metrics_summary) } // PROCESS: Run flash to merge overlapping sister reads into singleton reads OVERLAP_PAIRED_READS_FLASH ( ch_trim_reads ) + // OVERLAP_PAIRED_READS_FLASH + // .view { file -> println "DEBUG: From OVERLAP_PAIRED_READS_FLASH, emitting file: ${file}" } + // OVERLAP_PAIRED_READS_FLASH + // .view { item -> println "DEBUG: From OVERLAP_PAIRED_READS_FLASH, channel item: ${item}" } ch_versions = ch_versions.mix(OVERLAP_PAIRED_READS_FLASH.out.versions) ch_qc_filecheck = ch_qc_filecheck.concat(OVERLAP_PAIRED_READS_FLASH.out.qc_filecheck) ch_overlap_flash = qcfilecheck( @@ -368,16 +495,70 @@ workflow ASSEMBLY { OVERLAP_PAIRED_READS_FLASH.out.cleaned_fastq_files ) + ch_cleanedreads_checksum = OVERLAP_PAIRED_READS_FLASH.out.checksums + .collectFile( + name: "Summary.Clean_Reads_Checksums.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + .view { collectedFiles -> println "DEBUG: From OVERLAP_PAIRED_READS_FLASH.out.checksums, collected files: ${collectedFiles}" } + ch_output_summary_files = ch_output_summary_files.mix(ch_cleanedreads_checksum) + // Collect singleton read summaries and concatenate into one file ch_overlap_summary = OVERLAP_PAIRED_READS_FLASH.out.summary .collectFile( - name: "Summary.Clean_and_Overlapping_Reads.tsv", + name: "Summary.Clean_and_Overlapped.tsv", keepHeader: true, + sort: { file -> file.text }, storeDir: "${params.outdir}/Summaries" ) - + .view { collectedFiles -> println "DEBUG: From OVERLAP_PAIRED_READS_FLASH.out.summary, collected files: ${collectedFiles}" } ch_output_summary_files = ch_output_summary_files.mix(ch_overlap_summary) + /* + ================================================================================ + Clean_Reads Assessment + ================================================================================ + */ + // PROCESS: Calculate cleaned FastQ metrics for each sample with SeqKit + CALC_STATS_CLEANEDREADS_FQ_SEQKIT ( + 
+        OVERLAP_PAIRED_READS_FLASH.out.cleaned_fastq_files,
+        "Clean_Reads"
+    )
+    ch_versions = ch_versions.mix(CALC_STATS_CLEANEDREADS_FQ_SEQKIT.out.versions)
+
+    // Collect cleaned read/base summaries and concatenate into one file
+    ch_cleaned_reads_metrics_summary = CALC_STATS_CLEANEDREADS_FQ_SEQKIT.out.output
+        .collectFile(
+            name: "Summary.Clean_Reads.Metrics.tsv",
+            keepHeader: true,
+            sort: { file -> file.text },
+            storeDir: "${params.outdir}/Summaries"
+        )
+    ch_output_summary_files = ch_output_summary_files.mix(ch_cleaned_reads_metrics_summary)
+
+    // NOTE: This is a backup working solution; SeqKit alone is used for now because it reports extra metrics
+    // // PROCESS: Calculate cleaned FastQ metrics for each sample with Seqtk
+    // CALCULATE_METRICS_FASTQ_SEQTK (
+    //     OVERLAP_PAIRED_READS_FLASH.out.cleaned_fastq_files
+    // )
+    // ch_versions = ch_versions.mix(CALCULATE_METRICS_FASTQ_SEQTK.out.versions)
+    // // Collect cleaned read/base summaries and concatenate into one file
+    // ch_cleaned_reads_metrics_summary = CALCULATE_METRICS_FASTQ_SEQTK.out.output
+    //     .collectFile(
+    //         name: "Summary.Cleaned_Reads.Metrics.tsv",
+    //         keepHeader: true,
+    //         sort: { file -> file.text },
+    //         storeDir: "${params.outdir}/Summaries"
+    //     )
+    //     .view { collectedFiles -> println "DEBUG: From ch_cleaned_reads_metrics_summary, collected files: ${collectedFiles}" }
+    // ch_output_summary_files = ch_output_summary_files.mix(ch_cleaned_reads_metrics_summary)
+
+    // KAT K-mer plot
+
+    // seqkit stats for TSV
+
     /*
     ================================================================================
                              Taxonomic information
@@ -425,10 +606,12 @@
                     }
                 }
                 .collect()
+
         } else {
             log.error("Unsupported object given to --kraken1_db, database must be supplied as either a directory or a .tar.gz file!")
             ch_db_for_kraken1 = Channel.empty()
         }
+
     } else {
         log.warn("Kraken could not be performed - database not specified using --kraken1_db!")
         ch_db_for_kraken1 = Channel.empty()
@@ -439,6 +622,10 @@
         ch_overlap_flash,
         ch_db_for_kraken1
     )
+    // READ_CLASSIFY_KRAKEN_ONE
+    //     .view { file -> println "DEBUG: From READ_CLASSIFY_KRAKEN_ONE, emitting file: ${file}" }
+    // READ_CLASSIFY_KRAKEN_ONE
+    //     .view { item -> println "DEBUG: From READ_CLASSIFY_KRAKEN_ONE, channel item: ${item}" }
     ch_versions = ch_versions.mix(READ_CLASSIFY_KRAKEN_ONE.out.versions)
 
     // Collect kraken summaries and concatenate into one file
@@ -446,8 +633,11 @@
         .collectFile(
             name: "Summary.Kraken.tsv",
             keepHeader: true,
+            sort: { file -> file.text },
             storeDir: "${params.outdir}/Summaries"
         )
+        .view { collectedFiles -> println "DEBUG: From READ_CLASSIFY_KRAKEN_ONE.out.summary, collected files: ${collectedFiles}" }
+    ch_output_summary_files = ch_output_summary_files.mix(ch_kraken_one_summary)
 
     // Prepare kraken2 database for use
     if ( ch_kraken2_db_file ) {
@@ -480,10 +670,12 @@
                     }
                 }
                 .collect()
+
        } else {
            log.error("Unsupported object given to --kraken2_db, database must be supplied as either a directory or a .tar.gz file!")
            ch_db_for_kraken2 = Channel.empty()
        }
+
     } else {
         log.warn("Kraken2 could not be performed - database not specified using --kraken2_db!")
         ch_db_for_kraken2 = Channel.empty()
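Note: both Kraken database blocks guard against a missing database argument by warning and falling back to an empty channel. A minimal standalone sketch of that guard, written against a hypothetical `params.example_db` parameter rather than the pipeline's real options:

    // Stage the database if one was supplied; otherwise warn and emit an
    // empty channel so the downstream classification process never runs.
    def ch_db_example
    if ( params.example_db ) {                       // hypothetical parameter
        ch_db_example = Channel
                            .fromPath(params.example_db, checkIfExists: true)
                            .collect()
    } else {
        log.warn("Classification skipped - no database given via --example_db!")
        ch_db_example = Channel.empty()
    }
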
${item}" } ch_versions = ch_versions.mix(READ_CLASSIFY_KRAKEN_TWO.out.versions) // Collect kraken2 summaries and concatenate into one file @@ -501,8 +697,12 @@ workflow ASSEMBLY { .collectFile( name: "Summary.Kraken2.tsv", keepHeader: true, + sort: { file -> file.text }, storeDir: "${params.outdir}/Summaries" ) + .view { collectedFiles -> println "DEBUG: From READ_CLASSIFY_KRAKEN_TWO.out.summary, collected files: ${collectedFiles}" } + ch_output_summary_files = ch_output_summary_files.mix(ch_kraken_two_summary) + /* ================================================================================ @@ -514,9 +714,23 @@ workflow ASSEMBLY { ch_overlap_flash, var_assembler_name ) + // ASSEMBLE_CONTIGS + // .view { file -> println "DEBUG: From ASSEMBLE_CONTIGS, emitting file: ${file}" } + // ASSEMBLE_CONTIGS + // .view { item -> println "DEBUG: From ASSEMBLE_CONTIGS, channel item: ${item}" } ch_versions = ch_versions.mix(ASSEMBLE_CONTIGS.out.versions) ch_qc_filecheck = ch_qc_filecheck.concat(ASSEMBLE_CONTIGS.out.qc_filecheck) + ch_assembly_checksum = ASSEMBLE_CONTIGS.out.checksums + .collectFile( + name: "Summary.Assembly_Checksums.tsv", + keepHeader: true, + sort: { file -> file.text }, + storeDir: "${params.outdir}/Summaries" + ) + .view { collectedFiles -> println "DEBUG: From OVERLAP_PAIRED_READS_FLASH.out.checksums, collected files: ${collectedFiles}" } + ch_output_summary_files = ch_output_summary_files.mix(ch_assembly_checksum) + /* ================================================================================ Assembly Information @@ -527,22 +741,32 @@ workflow ASSEMBLY { EXTRACT_READ_ALIGNMENT_DEPTHS_BEDTOOLS ( ASSEMBLE_CONTIGS.out.bam_files ) + // EXTRACT_READ_ALIGNMENT_DEPTHS_BEDTOOLS + // .view { file -> println "DEBUG: From EXTRACT_READ_ALIGNMENT_DEPTHS_BEDTOOLS, emitting file: ${file}" } + // EXTRACT_READ_ALIGNMENT_DEPTHS_BEDTOOLS + // .view { item -> println "DEBUG: From EXTRACT_READ_ALIGNMENT_DEPTHS_BEDTOOLS, channel item: ${item}" } ch_versions = ch_versions.mix(EXTRACT_READ_ALIGNMENT_DEPTHS_BEDTOOLS.out.versions) // Collect alignment summary stats and concatenate into one file ch_alignment_stats_summary = EXTRACT_READ_ALIGNMENT_DEPTHS_BEDTOOLS.out.summary .map{ meta, file -> file } .collectFile( - name: "Summary.CleanedReads-AlignmentStats.tsv", + name: "Summary.Clean_Reads_Aligned.tsv", keepHeader: true, + sort: { file -> file.text }, storeDir: "${params.outdir}/Summaries" ) + .view { collectedFiles -> println "DEBUG: From EXTRACT_READ_ALIGNMENT_DEPTHS_BEDTOOLS.out.summary, collected files: ${collectedFiles}" } ch_output_summary_files = ch_output_summary_files.mix(ch_alignment_stats_summary) // PROCESS: Run MLST to find MLST for each polished assembly MLST_MLST ( ASSEMBLE_CONTIGS.out.assembly_file ) + // MLST_MLST + // .view { file -> println "DEBUG: From MLST_MLST, emitting file: ${file}" } + // MLST_MLST + // .view { item -> println "DEBUG: From MLST_MLST, channel item: ${item}" } ch_versions = ch_versions.mix(MLST_MLST.out.versions) // Collect MLST Summaries and concatenate into one file @@ -550,15 +774,20 @@ workflow ASSEMBLY { .collectFile( name: "Summary.MLST.tsv", keepHeader: true, + sort: { file -> file.text }, storeDir: "${params.outdir}/Summaries" ) - + .view { collectedFiles -> println "DEBUG: From MLST_MLST.out.summary, collected files: ${collectedFiles}" } ch_output_summary_files = ch_output_summary_files.mix(ch_mlst_summary) // PROCESS: Annotate the polished assembly using Prokka ANNOTATE_PROKKA ( ASSEMBLE_CONTIGS.out.assembly_file ) + // ANNOTATE_PROKKA + // 
@@ -550,15 +774,20 @@
         .collectFile(
             name: "Summary.MLST.tsv",
             keepHeader: true,
+            sort: { file -> file.text },
             storeDir: "${params.outdir}/Summaries"
         )
-
+        .view { collectedFiles -> println "DEBUG: From MLST_MLST.out.summary, collected files: ${collectedFiles}" }
     ch_output_summary_files = ch_output_summary_files.mix(ch_mlst_summary)
 
     // PROCESS: Annotate the polished assembly using Prokka
     ANNOTATE_PROKKA (
         ASSEMBLE_CONTIGS.out.assembly_file
     )
+    // ANNOTATE_PROKKA
+    //     .view { file -> println "DEBUG: From ANNOTATE_PROKKA, emitting file: ${file}" }
+    // ANNOTATE_PROKKA
+    //     .view { item -> println "DEBUG: From ANNOTATE_PROKKA, channel item: ${item}" }
     ch_versions = ch_versions.mix(ANNOTATE_PROKKA.out.versions)
     ch_genbank  = qcfilecheck(
                     "ANNOTATE_PROKKA",
@@ -566,6 +795,17 @@
                     ANNOTATE_PROKKA.out.prokka_genbank_file
                   )
 
+    ch_annotation_checksum = ANNOTATE_PROKKA.out.checksums
+        .collectFile(
+            name: "Summary.Annotation_Checksums.tsv",
+            keepHeader: true,
+            sort: { file -> file.text },
+            storeDir: "${params.outdir}/Summaries"
+        )
+        .view { collectedFiles -> println "DEBUG: From ANNOTATE_PROKKA.out.checksums, collected files: ${collectedFiles}" }
+    ch_output_summary_files = ch_output_summary_files.mix(ch_annotation_checksum)
+
+
     /*
     ================================================================================
                                  Evaluate 16S
@@ -576,6 +816,10 @@
     EXTRACT_16S_BIOPYTHON (
         ch_genbank.join(ASSEMBLE_CONTIGS.out.assembly_file)
     )
+    // EXTRACT_16S_BIOPYTHON
+    //     .view { file -> println "DEBUG: From EXTRACT_16S_BIOPYTHON, emitting file: ${file}" }
+    // EXTRACT_16S_BIOPYTHON
+    //     .view { item -> println "DEBUG: From EXTRACT_16S_BIOPYTHON, channel item: ${item}" }
     ch_versions = ch_versions.mix(EXTRACT_16S_BIOPYTHON.out.versions)
 
     // PROCESS: Extract 16S rRNA gene sequences with Barrnap if missing from 16S_EXTRACT_BIOPYTHON
@@ -583,6 +827,10 @@
         ASSEMBLE_CONTIGS.out.assembly_file
             .join(EXTRACT_16S_BIOPYTHON.out.extracted_rna)
     )
+    // EXTRACT_16S_BARRNAP
+    //     .view { file -> println "DEBUG: From EXTRACT_16S_BARRNAP, emitting file: ${file}" }
+    // EXTRACT_16S_BARRNAP
+    //     .view { item -> println "DEBUG: From EXTRACT_16S_BARRNAP, channel item: ${item}" }
     ch_versions      = ch_versions.mix(EXTRACT_16S_BARRNAP.out.versions)
     ch_qc_filecheck  = ch_qc_filecheck.concat(EXTRACT_16S_BARRNAP.out.qc_filecheck)
     ch_extracted_rna = qcfilecheck(
@@ -623,9 +871,11 @@
                     }
                 }
                 .collect()
+
         } else {
             error("Unsupported object given to --blast_db, database must be supplied as either a directory or a .tar.gz file!")
         }
+
     } else {
         error("Missing 16S ribosomal RNA database! Database must be supplied to `--blast_db` as either a directory or a .tar.gz file!")
     }
@@ -635,6 +885,10 @@
         ch_extracted_rna,
         ch_db_for_blast
     )
+    // ALIGN_16S_BLAST
+    //     .view { file -> println "DEBUG: From ALIGN_16S_BLAST, emitting file: ${file}" }
+    // ALIGN_16S_BLAST
+    //     .view { item -> println "DEBUG: From ALIGN_16S_BLAST, channel item: ${item}" }
     ch_versions     = ch_versions.mix(ALIGN_16S_BLAST.out.versions)
     ch_qc_filecheck = ch_qc_filecheck.concat(ALIGN_16S_BLAST.out.qc_filecheck)
     ch_blast_output = qcfilecheck(
@@ -647,6 +901,10 @@
     CLASSIFY_16S_RDP (
         EXTRACT_16S_BARRNAP.out.extracted_rna
     )
+    // CLASSIFY_16S_RDP
+    //     .view { file -> println "DEBUG: From CLASSIFY_16S_RDP, emitting file: ${file}" }
+    // CLASSIFY_16S_RDP
+    //     .view { item -> println "DEBUG: From CLASSIFY_16S_RDP, channel item: ${item}" }
     ch_versions = ch_versions.mix(CLASSIFY_16S_RDP.out.versions)
 
     ch_rdp_summary = qcfilecheck(
@@ -655,44 +913,68 @@
                         CLASSIFY_16S_RDP.out.rdp_tsv
                      )
 
-    // Concatenate RDP summaries
-    ch_rdp_summary
-        .map{meta, file -> file}
-        .collect()
-        .flatten()
-        .collectFile(
-            name: "Summary.RDP.tsv",
-            keepHeader: true,
-            storeDir: "${params.outdir}/Summaries"
-        )
-
-    ch_output_summary_files = ch_output_summary_files.mix(ch_rdp_summary.map{ meta, file -> file })
+    // PROCESS: Concatenate RDP summaries
+    ch_rdp_summary = ch_rdp_summary
+        .map{ meta, file -> // Map to only include the files
+            if (file.exists() && file.size() > 0) {
+                return file
+            } else {
+                error "File does not exist or is empty: ${file}"
+            }
+        }
+        .view { file -> println "DEBUG: From ch_rdp_summary, File to be collected: ${file}" }
+        .collectFile(
+            name: "${var_assembler_name}.16S_top_genus_RDP.tsv",
+            keepHeader: true,
+            sort: { file -> file.text },
+            storeDir: "${params.outdir}/SSU"
+        )
+        .collectFile(
+            name: "Summary.16S_Genus_RDP.tsv",
+            keepHeader: true,
+            sort: { file -> file.text },
+            storeDir: "${params.outdir}/Summaries"
+        )
+        .view { collectedFiles -> println "DEBUG: From ch_rdp_summary, collected files: ${collectedFiles}" }
+    ch_output_summary_files = ch_output_summary_files.mix(ch_rdp_summary)
 
     // PROCESS: Filter Blast output for best alignment, based on bitscore
     BEST_16S_BLASTN_BITSCORE_TAXON_PYTHON (
         ch_blast_output
     )
+    BEST_16S_BLASTN_BITSCORE_TAXON_PYTHON.out.summary
+        .view { file -> println "DEBUG: From BEST_16S_BLASTN_BITSCORE_TAXON_PYTHON.out.summary, emitting file: ${file}" }
+    BEST_16S_BLASTN_BITSCORE_TAXON_PYTHON.out.summary
+        .view { item -> println "DEBUG: From BEST_16S_BLASTN_BITSCORE_TAXON_PYTHON.out.summary, channel item: ${item}" }
     ch_versions = ch_versions.mix(BEST_16S_BLASTN_BITSCORE_TAXON_PYTHON.out.versions)
 
     ch_top_blast = qcfilecheck(
                     "BEST_16S_BLASTN_BITSCORE_TAXON_PYTHON",
                     BEST_16S_BLASTN_BITSCORE_TAXON_PYTHON.out.qc_filecheck,
-                    BEST_16S_BLASTN_BITSCORE_TAXON_PYTHON.out.top_blast_species
+                    BEST_16S_BLASTN_BITSCORE_TAXON_PYTHON.out.summary
                    )
 
     // Collect top BLASTn species and concatenate into one file
     ch_top_blast = ch_top_blast
-        .map{ meta, file -> file }
+        .map{ meta, file ->
+            if (file.exists() && file.size() > 0) {
+                return file
+            } else {
+                error "File does not exist or is empty: ${file}"
+            }
+        }
         .collectFile(
-            name: "${var_assembler_name}.16S-top-species.tsv",
+            name: "${var_assembler_name}.16S_top_species_BLAST.tsv",
             keepHeader: true,
+            sort: { file -> file.text },
             storeDir: "${params.outdir}/SSU"
         )
         .collectFile(
-            name: "Summary.16S.tsv",
+            name: "Summary.16S_Species_BLAST.tsv",
             keepHeader: true,
+            sort: { file -> file.text },
             storeDir: "${params.outdir}/Summaries"
         )
-
+        .view { collectedFiles -> println "DEBUG: From ch_top_blast, collected files: ${collectedFiles}" }
     ch_output_summary_files = ch_output_summary_files.mix(ch_top_blast)
 
     /*
@@ -703,7 +985,6 @@
     ASSEMBLY_ASSESSMENT (
         ASSEMBLE_CONTIGS.out.assembly_file,
-        ch_overlap_flash,
         EXTRACT_READ_ALIGNMENT_DEPTHS_BEDTOOLS.out.summary,
         ch_busco_config_file,
         ch_busco_db_file,
@@ -724,15 +1005,49 @@
 
     // Collect QC file checks and concatenate into one file
     ch_qc_filecheck = ch_qc_filecheck
-        .map{ meta, file -> file }
-        .collect()
-        .flatten()
-        .collectFile(
-            name: "Summary.QC_File_Checks.tsv",
-            keepHeader: true,
-            storeDir: "${params.outdir}/Summaries",
-            sort: 'index'
-        )
+            .map{ meta, file -> file }
+            .collect()
+            .flatten()
+            .collectFile(
+                name: "Summary.QC_File_Checks.tsv",
+                keepHeader: true,
+                sort: { file -> file.text },
+                storeDir: "${params.outdir}/Summaries"
+            )
+
+    // Tally QC results once the collected QC file-check summary is emitted
+    ch_qc_filecheck.subscribe { files ->
+        int failCount = 0
+        int passCount = 0
+        int fileCount = 0
+
+        // Loop through the collected files and count passes/fails
+        new File("${params.outdir}/pipeline_info/qc_file_checks/").eachFileMatch(~'.*\\.tsv') { file ->
+            fileCount++ // Increment the file counter for each file
+            file.eachLine { line ->
+                if (line.contains('FAIL')) {
+                    System.err.println('ERROR: ' + line + ' in file: ' + file.name)
+                    System.out.println('ERROR: ' + line + ' in file: ' + file.name)
+                    failCount++
+                } else if (line.contains('PASS')) {
+                    passCount++
+                }
+            }
+        }
+
+        // Print the total number of files found
+        System.out.println('INFO: found ' + fileCount + ' QC test results')
+
+        // Print the total number of fails and passes
+        System.err.println('INFO: found ' + failCount + ' QC failure steps across all files')
+        System.out.println('INFO: found ' + passCount + ' QC passed steps across ' + fileCount + ' files')
+
+        // Extra warning if there are any failures
+        if (failCount > 0) {
+            System.err.println('ERROR: found ' + failCount + ' QC files that failed!')
+            System.out.println('ERROR: found ' + failCount + ' QC files that failed!')
+        }
+    }
 
     ch_output_summary_files = ch_output_summary_files.mix(ch_qc_filecheck)
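Note: the QC tally above runs inside a `subscribe` handler, which fires once per item emitted by the channel (here, once for the single collected summary file). A minimal standalone sketch of that operator, with a hypothetical file name that is not part of the patch:

    // subscribe {} attaches a side effect to a channel; each emitted item is
    // passed to the closure, and the items themselves are not altered.
    Channel
        .of( file("Summary.QC_File_Checks.tsv") )      // hypothetical collected summary
        .subscribe { summary ->
            def lines = summary.exists() ? summary.readLines() : []
            def fails = lines.count { it.contains("FAIL") }
            println "INFO: ${fails} failed QC checks reported in ${summary.name}"
        }
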
@@ -743,15 +1058,37 @@
     */
 
     if (params.create_excel_outputs) {
-        CREATE_EXCEL_RUN_SUMMARY_PYTHON (
-            ch_output_summary_files.collect()
-        )
-        ch_versions = ch_versions.mix(CREATE_EXCEL_RUN_SUMMARY_PYTHON.out.versions)
+        tab_colors_file = file("${projectDir}/modules/local/create_excel_run_summary_python/resources/xlsx_tab_color_key.txt")
 
-        CONVERT_TSV_TO_EXCEL_PYTHON (
-            CREATE_EXCEL_RUN_SUMMARY_PYTHON.out.summary
-        )
-        ch_versions = ch_versions.mix(CONVERT_TSV_TO_EXCEL_PYTHON.out.versions)
+        // Collect summary files into the list_of_files variable
+        list_of_files = ch_output_summary_files
+            .view { item ->
+                println "DEBUG: From ch_output_summary_files, Received item: ${item.getClass().getName()} - ${item}"
+            }
+            .filter { item ->
+                // Extract the file if it's a tuple or list
+                def file = (item instanceof List) ? item[1] : item // Assuming file is the second element in the tuple
+                def fileName = file.getName() // Convert Path to String
+                if (fileName.startsWith("Summary") && file.size() > 0) {
+                    println "DEBUG: From ch_output_summary_files, Valid summary file found: ${fileName} (Size: ${file.size()} bytes)"
+                    return true
+                } else {
+                    if (file.size() == 0) {
+                        println "DEBUG: From ch_output_summary_files, Skipping empty summary file: ${fileName}"
+                    } else {
+                        println "DEBUG: From ch_output_summary_files, Skipping non-summary file: ${fileName}"
+                    }
+                    return false
+                }
+            }
+            .collect()
+            // Debugging print to show all files collected
+            .view { files -> println "DEBUG: Files passed to CREATE_EXCEL_RUN_SUMMARY_PYTHON: ${files}" }
+
+        // Pass the collected summary files, tab color key, and workflow version to the process
+        def workflow_version = workflow.manifest.version ?: 'dev'
+        CREATE_EXCEL_RUN_SUMMARY_PYTHON(list_of_files, tab_colors_file, workflow_version)
+        ch_versions = ch_versions.mix(CREATE_EXCEL_RUN_SUMMARY_PYTHON.out.versions)
     }
 
     /*