Skip to content

Commit

Permalink
Patch for treesapp abundance (#79)
Browse files Browse the repository at this point in the history
* Fix abundance checkpointing

* Fix test, prep for release

* Fixed bugs with different reporting methods
  • Loading branch information
cmorganl authored Mar 25, 2021
1 parent 87cd14b commit 3f9ce19
Show file tree
Hide file tree
Showing 9 changed files with 168 additions and 15 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
## [0.10.4] - 2021-03-25
### Fixed
- Checkpoint determination in `treesapp abundance`
- '--report append' and 'report update' was not working properly in `treesapp abundance`.
Fixed by deduplicating PQueries prior to appending.

### Changed
- Checks whether all FASTQ file paths exist earlier in `treesapp abundance`

## [0.10.3] - 2021-03-23
### Fixed
- Replaced duplicate SAM file paths for unique ones when multiple fastqs are provided to `treesapp abundance`
Expand Down
2 changes: 1 addition & 1 deletion dev_utils/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FROM ubuntu:bionic

ENV TREESAPP_VERSION="0.10.3"\
ENV TREESAPP_VERSION="0.10.4"\
MAFFT_VERSION="7.475-1"\
BWA_VERSION="0.7.17"\
PRODIGAL_VERSION="2.6.3"\
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
SETUP_METADATA = \
{
"name": "treesapp",
"version": "0.10.3",
"version": "0.10.4",
"description": "TreeSAPP is a functional and taxonomic annotation tool for genomes and metagenomes.",
"long_description": LONG_DESCRIPTION,
"long_description_content_type": "text/markdown",
Expand Down
105 changes: 105 additions & 0 deletions tests/test_abundance.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,39 @@
import os
import shutil
import pytest
import unittest

from .testing_utils import get_test_data


class AbundanceTester(unittest.TestCase):
def setUp(self) -> None:
from treesapp import treesapp_args
from treesapp.classy import Abundance
self.mock_abund = Abundance()
ts_assign_output = get_test_data("test_output_TarA/")
self.mock_abund_dir = os.path.join("tests", "abundance_test_input")
if os.path.isdir(self.mock_abund_dir):
shutil.rmtree(self.mock_abund_dir)
shutil.copytree(ts_assign_output, self.mock_abund_dir)
mock_cli_args = ["--treesapp_output", self.mock_abund_dir,
"--reads", get_test_data("test_TarA.1.fq"),
"--reverse", get_test_data("test_TarA.2.fq"),
"--overwrite"]

self.abundance_parser = treesapp_args.TreeSAPPArgumentParser()
treesapp_args.add_abundance_arguments(self.abundance_parser)

self.mock_args = self.abundance_parser.parse_args(mock_cli_args)
self.mock_abund.furnish_with_arguments(self.mock_args)

return

def tearDown(self) -> None:
if os.path.isdir(self.mock_abund_dir):
shutil.rmtree(self.mock_abund_dir)
return

def test_strip_file_to_sample_name(self):
from treesapp.classy import Abundance
mock_abund = Abundance()
Expand All @@ -18,6 +50,79 @@ def test_strip_file_to_sample_name(self):
self.assertEqual("SRR7188253", mock_abund.sample_prefix)
return

def test_parse_args(self):
from treesapp import treesapp_args
mock_cli_args = ["--treesapp_output", self.mock_abund_dir,
"--reads", get_test_data("test_TarA.1.fq")]

parser = treesapp_args.TreeSAPPArgumentParser()
treesapp_args.add_abundance_arguments(parser)
args = parser.parse_args(mock_cli_args)
self.assertEqual("pe", args.pairing)
self.assertEqual("tpm", args.metric)
return

def test_check_arguments(self):
from treesapp import treesapp_args
# Set up the parser
parser = treesapp_args.TreeSAPPArgumentParser()
treesapp_args.add_abundance_arguments(parser)

# Test fail if path to reads doesn't exist
bad_fwd_args = parser.parse_args(["--treesapp_output", self.mock_abund_dir, "--reads", "test_TarA.1.fq"])
bad_rev_args = parser.parse_args(["--treesapp_output", self.mock_abund_dir,
"--reads", get_test_data("test_TarA.1.fq"),
"--reverse", "path/to/bad/rev.fq"])
with pytest.raises(SystemExit):
self.mock_abund.check_arguments(bad_fwd_args)
self.mock_abund.check_arguments(bad_rev_args)

self.mock_abund.check_arguments(self.mock_args)
self.assertEqual("tests/abundance_test_input/intermediates/orf-call/PitchLake_TarA_Mcr_Dsr_ORFs.fna",
os.path.relpath(start=os.getcwd(), path=self.mock_abund.ref_nuc_seqs))
self.assertEqual("tests/abundance_test_input/final_outputs/" + self.mock_abund.classification_tbl_name,
os.path.relpath(start=os.getcwd(), path=self.mock_abund.classifications))

return

def test_decide_stage(self):
from treesapp.classy import Abundance
mock_abund = Abundance()
mock_abund.furnish_with_arguments(self.mock_args)

# Test with report=='update'
mock_abund.check_arguments(self.mock_args)
mock_abund.decide_stage(self.mock_args)
self.assertTrue(mock_abund.stage_status("align_map"))
self.assertTrue(mock_abund.stage_status("summarise"))
self.assertTrue(mock_abund.stage_status("sam_sum"))
open(os.path.join(self.mock_abund_dir, "intermediates", "align_map", mock_abund.sample_prefix) + ".sam",
'w').close()

# Simulate a re-run when all the FASTQ files in args.reads and args.reverse are the same and SAM exists
rerun_args = self.abundance_parser.parse_args(["--treesapp_output", self.mock_abund_dir,
"--reads", get_test_data("test_TarA.1.fq"),
"--reverse", get_test_data("test_TarA.2.fq"),
"--report", "update"])
mock_abund.check_arguments(rerun_args)
mock_abund.decide_stage(rerun_args)
self.assertFalse(mock_abund.stage_status("align_map"))
self.assertTrue(mock_abund.stage_status("sam_sum"))
self.assertTrue(mock_abund.stage_status("summarise"))

# Test when report=='append'
rerun_args = self.abundance_parser.parse_args(["--treesapp_output", self.mock_abund_dir,
"--reads", get_test_data("SRR3669912_1.fastq"),
"--reverse", get_test_data("SRR3669912_2.fastq"),
"--report", "append"])
mock_abund.check_arguments(rerun_args)
mock_abund.decide_stage(rerun_args)
self.assertTrue(mock_abund.stage_status("align_map"))
self.assertTrue(mock_abund.stage_status("sam_sum"))
self.assertTrue(mock_abund.stage_status("summarise"))

return


if __name__ == '__main__':
unittest.main()
5 changes: 4 additions & 1 deletion tests/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ def test_abundance(self):
"--reads", get_test_data("test_TarA.1.fq"),
"--reverse", get_test_data("test_TarA.2.fq"),
"--pairing", "pe",
"--report", "update",
"--metric", "fpkm",
"--num_procs", str(self.num_procs),
"--overwrite"]
abund_dict = abundance(abundance_command_list)["test_TarA"]
Expand All @@ -61,6 +63,7 @@ def test_abundance(self):
"--reads", get_test_data("test_TarA.fastq.gz"),
"--pairing", "pe",
"--metric", "tpm",
"--report", "append",
"--num_procs", str(self.num_procs),
"--delete", "--overwrite"]
abundance(abundance_command_list)
Expand All @@ -69,7 +72,7 @@ def test_abundance(self):
abundant_pqueries = []
for rp in pqueries:
abundant_pqueries += [pq for pq in pqueries[rp] if pq.abundance > 0.0]
self.assertEqual(33702, round(sum([pq.abundance for pq in abundant_pqueries])))
self.assertEqual(33850, round(sum([pq.abundance for pq in abundant_pqueries])))

# Replace the classification table
copyfile(os.path.join(self.ts_assign_output, "tmp.tsv"), classification_table)
Expand Down
2 changes: 1 addition & 1 deletion treesapp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#! /usr/bin/env python
"""A Python package for gene-centric taxonomic and functional classification using phylogenetic placement."""
name = "treesapp"
__version__ = "0.10.3"
__version__ = "0.10.4"
24 changes: 19 additions & 5 deletions treesapp/abundance.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def abundance(sys_args) -> dict:

if ts_abund.stage_status("align_map"):
wrapper.align_reads_to_nucs(ts_abund.executables["bwa"], ts_abund.ref_nuc_seqs,
ts_abund.stage_output_dir, fwd_reads, args.pairing,
ts_abund.stage_lookup("align_map").dir_path, fwd_reads, args.pairing,
reverse=rev_reads, sam_file=ts_abund.aln_file, num_threads=args.num_threads)
ts_abund.increment_stage_dir()

Expand Down Expand Up @@ -89,21 +89,35 @@ def abundance(sys_args) -> dict:
ts_abund.delete_intermediates(args.delete)

if args.report != "nothing" and os.path.isfile(ts_abund.classifications):
pqueries = file_parsers.load_classified_sequences_from_assign_output(ts_abund.output_dir)
refpkg_pqueries = file_parsers.load_classified_sequences_from_assign_output(ts_abund.output_dir)

# Collect the names of all classified PQueries
classified_seqs = set()
for rp_pqs in pqueries.values():
for rp_pqs in refpkg_pqueries.values():
classified_seqs.update({pq.seq_name for pq in rp_pqs})

# Select a unique set of PQuery instances to use in the updated classification table
for refpkg_name, pqueries in refpkg_pqueries.items():
unique_pqueries = []
seq_name_list = list({pq.place_name for pq in pqueries})
for pquery in pqueries:
i = 0
while i < len(seq_name_list):
if pquery.place_name == seq_name_list[i]:
unique_pqueries.append(pquery)
seq_name_list.pop(i)
i = len(seq_name_list)
i += 1
refpkg_pqueries[refpkg_name] = unique_pqueries

# Fill PQuery.abundance attribute
for sample_name, abundance_map in abundance_dict.items():
# Filter the abundance_map dictionary to just the classified sequences
absentee = set(abundance_map.keys()).difference(classified_seqs)
while absentee:
abundance_map.pop(absentee.pop())
phylo_seq.abundify_tree_saps(pqueries, abundance_map)
file_parsers.write_classification_table(pqueries, sample_name, ts_abund.classifications,
phylo_seq.abundify_tree_saps(refpkg_pqueries, abundance_map)
file_parsers.write_classification_table(refpkg_pqueries, sample_name, ts_abund.classifications,
append=ts_abund.append_abundance)
ts_abund.append_abundance = True

Expand Down
27 changes: 24 additions & 3 deletions treesapp/classy.py
Original file line number Diff line number Diff line change
Expand Up @@ -1474,7 +1474,7 @@ def __init__(self):
self.ref_nuc_seqs = ""
self.classifications = ""
self.aln_file = ""
self.append_abundance = False
self.append_abundance = True
self.fq_suffix_re = re.compile(r"([._-])+(pe|fq|fastq|fwd|R1|1)$")
self.idx_extensions = ["amb", "ann", "bwt", "pac", "sa"]
self.stages = {0: ModuleFunction("align_map", 0),
Expand Down Expand Up @@ -1508,13 +1508,26 @@ def check_arguments(self, args):
self.aln_file = self.stage_lookup("align_map").dir_path + \
'.'.join(os.path.basename(self.ref_nuc_seqs).split('.')[0:-1]) + ".sam"

# Ensure all the FASTQ file paths are valid for the forward reads
for reads_file in args.reads:
if not os.path.isfile(reads_file):
logging.error("Unable to find forward reads file '{}'.\n".format(reads_file))
sys.exit(1)

if len(args.reverse) > 0:
if len(args.reads) != len(args.reverse):
logging.error("Number of fastq files differs between reads ({}) and reverse ({}) arguments!.\n"
"".format(len(args.reads), len(args.reverse)))
sys.exit(3)

if args.report == "update":
# Ensure all the FASTQ file paths are valid for the reverse reads
for reads_file in args.reverse:
if not os.path.isfile(reads_file):
logging.error("Unable to find reverse reads files '{}'.\n".format(reads_file))
sys.exit(5)

if args.report == "append":
self.append_abundance = True
else:
self.append_abundance = False

return
Expand All @@ -1529,6 +1542,14 @@ def decide_stage(self, args):
:return: None
"""
self.validate_continue(args)

# Not necessary for either 'update'
for reads_file in args.reads:
self.strip_file_to_sample_name(reads_file)
if not os.path.isfile(self.stage_lookup("align_map").dir_path + self.sample_prefix + ".sam"):
self.change_stage_status("align_map", True)
break

return

def delete_intermediates(self, clean_up: bool) -> None:
Expand Down
7 changes: 4 additions & 3 deletions treesapp/treesapp_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,14 +112,15 @@ def add_search_params(self):
"for it to be considered. [ DEFAULT = 20 ]")

def add_abundance_params(self):
self.fpkm_opts.add_argument("--metric", required=False, default="fpkm", choices=["fpkm", "tpm"],
help="Selects which normalization metric to use, FPKM or TPM.")
self.fpkm_opts.add_argument("--metric", required=False, default="tpm", choices=["fpkm", "tpm"],
help="Selects which normalization metric to use, FPKM or TPM. [ DEFAULT = tpm ]")
self.fpkm_opts.add_argument("-r", "--reads", required=False, nargs='+',
help="FASTQ file containing to be aligned to predicted genes using BWA MEM")
self.fpkm_opts.add_argument("-2", "--reverse", required=False, nargs='+', default=[],
help="FASTQ file containing to reverse mate-pair reads to be aligned using BWA MEM")
self.fpkm_opts.add_argument("-p", "--pairing", required=False, default='pe', choices=['pe', 'se'],
help="Indicating whether the reads are paired-end (pe) or single-end (se)")
help="Indicating whether the reads are paired-end (pe) or single-end (se). "
"[ DEFAULT = pe ]")

def add_phylogeny_params(self):
self.optopt.add_argument("-b", "--bootstraps",
Expand Down

0 comments on commit 3f9ce19

Please sign in to comment.