Patch for treesapp abundance (#79)

* Fix abundance checkpointing * Fix test, prep for release * Fixed bugs with different reporting methods
hallamlab · Mar 25, 2021 · 3f9ce19 · 3f9ce19
1 parent 87cd14b
commit 3f9ce19
Show file tree

Hide file tree

Showing 9 changed files with 168 additions and 15 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,12 @@
+## [0.10.4] - 2021-03-25
+### Fixed
+- Checkpoint determination in `treesapp abundance`
+- '--report append' and 'report update' was not working properly in `treesapp abundance`.
+  Fixed by deduplicating PQueries prior to appending. 
+
+### Changed
+- Checks whether all FASTQ file paths exist earlier in `treesapp abundance`
+
 ## [0.10.3] - 2021-03-23
 ### Fixed
 - Replaced duplicate SAM file paths for unique ones when multiple fastqs are provided to `treesapp abundance`

diff --git a/dev_utils/Dockerfile b/dev_utils/Dockerfile
@@ -1,6 +1,6 @@
 FROM ubuntu:bionic
 
-ENV TREESAPP_VERSION="0.10.3"\
+ENV TREESAPP_VERSION="0.10.4"\
   MAFFT_VERSION="7.475-1"\
   BWA_VERSION="0.7.17"\
   PRODIGAL_VERSION="2.6.3"\

diff --git a/setup.py b/setup.py
@@ -22,7 +22,7 @@
 SETUP_METADATA = \
     {
         "name": "treesapp",
-        "version": "0.10.3",
+        "version": "0.10.4",
         "description": "TreeSAPP is a functional and taxonomic annotation tool for genomes and metagenomes.",
         "long_description": LONG_DESCRIPTION,
         "long_description_content_type": "text/markdown",

diff --git a/tests/test_abundance.py b/tests/test_abundance.py
@@ -1,7 +1,39 @@
+import os
+import shutil
+import pytest
 import unittest
 
+from .testing_utils import get_test_data
+
 
 class AbundanceTester(unittest.TestCase):
+    def setUp(self) -> None:
+        from treesapp import treesapp_args
+        from treesapp.classy import Abundance
+        self.mock_abund = Abundance()
+        ts_assign_output = get_test_data("test_output_TarA/")
+        self.mock_abund_dir = os.path.join("tests", "abundance_test_input")
+        if os.path.isdir(self.mock_abund_dir):
+            shutil.rmtree(self.mock_abund_dir)
+        shutil.copytree(ts_assign_output, self.mock_abund_dir)
+        mock_cli_args = ["--treesapp_output", self.mock_abund_dir,
+                         "--reads", get_test_data("test_TarA.1.fq"),
+                         "--reverse", get_test_data("test_TarA.2.fq"),
+                         "--overwrite"]
+
+        self.abundance_parser = treesapp_args.TreeSAPPArgumentParser()
+        treesapp_args.add_abundance_arguments(self.abundance_parser)
+
+        self.mock_args = self.abundance_parser.parse_args(mock_cli_args)
+        self.mock_abund.furnish_with_arguments(self.mock_args)
+
+        return
+
+    def tearDown(self) -> None:
+        if os.path.isdir(self.mock_abund_dir):
+            shutil.rmtree(self.mock_abund_dir)
+        return
+
     def test_strip_file_to_sample_name(self):
         from treesapp.classy import Abundance
         mock_abund = Abundance()
@@ -18,6 +50,79 @@ def test_strip_file_to_sample_name(self):
         self.assertEqual("SRR7188253", mock_abund.sample_prefix)
         return
 
+    def test_parse_args(self):
+        from treesapp import treesapp_args
+        mock_cli_args = ["--treesapp_output", self.mock_abund_dir,
+                         "--reads", get_test_data("test_TarA.1.fq")]
+
+        parser = treesapp_args.TreeSAPPArgumentParser()
+        treesapp_args.add_abundance_arguments(parser)
+        args = parser.parse_args(mock_cli_args)
+        self.assertEqual("pe", args.pairing)
+        self.assertEqual("tpm", args.metric)
+        return
+
+    def test_check_arguments(self):
+        from treesapp import treesapp_args
+        # Set up the parser
+        parser = treesapp_args.TreeSAPPArgumentParser()
+        treesapp_args.add_abundance_arguments(parser)
+
+        # Test fail if path to reads doesn't exist
+        bad_fwd_args = parser.parse_args(["--treesapp_output", self.mock_abund_dir, "--reads", "test_TarA.1.fq"])
+        bad_rev_args = parser.parse_args(["--treesapp_output", self.mock_abund_dir,
+                                          "--reads", get_test_data("test_TarA.1.fq"),
+                                          "--reverse", "path/to/bad/rev.fq"])
+        with pytest.raises(SystemExit):
+            self.mock_abund.check_arguments(bad_fwd_args)
+            self.mock_abund.check_arguments(bad_rev_args)
+
+        self.mock_abund.check_arguments(self.mock_args)
+        self.assertEqual("tests/abundance_test_input/intermediates/orf-call/PitchLake_TarA_Mcr_Dsr_ORFs.fna",
+                         os.path.relpath(start=os.getcwd(), path=self.mock_abund.ref_nuc_seqs))
+        self.assertEqual("tests/abundance_test_input/final_outputs/" + self.mock_abund.classification_tbl_name,
+                         os.path.relpath(start=os.getcwd(), path=self.mock_abund.classifications))
+
+        return
+
+    def test_decide_stage(self):
+        from treesapp.classy import Abundance
+        mock_abund = Abundance()
+        mock_abund.furnish_with_arguments(self.mock_args)
+
+        # Test with report=='update'
+        mock_abund.check_arguments(self.mock_args)
+        mock_abund.decide_stage(self.mock_args)
+        self.assertTrue(mock_abund.stage_status("align_map"))
+        self.assertTrue(mock_abund.stage_status("summarise"))
+        self.assertTrue(mock_abund.stage_status("sam_sum"))
+        open(os.path.join(self.mock_abund_dir, "intermediates", "align_map", mock_abund.sample_prefix) + ".sam",
+             'w').close()
+
+        # Simulate a re-run when all the FASTQ files in args.reads and args.reverse are the same and SAM exists
+        rerun_args = self.abundance_parser.parse_args(["--treesapp_output", self.mock_abund_dir,
+                                                       "--reads", get_test_data("test_TarA.1.fq"),
+                                                       "--reverse", get_test_data("test_TarA.2.fq"),
+                                                       "--report", "update"])
+        mock_abund.check_arguments(rerun_args)
+        mock_abund.decide_stage(rerun_args)
+        self.assertFalse(mock_abund.stage_status("align_map"))
+        self.assertTrue(mock_abund.stage_status("sam_sum"))
+        self.assertTrue(mock_abund.stage_status("summarise"))
+
+        # Test when report=='append'
+        rerun_args = self.abundance_parser.parse_args(["--treesapp_output", self.mock_abund_dir,
+                                                       "--reads", get_test_data("SRR3669912_1.fastq"),
+                                                       "--reverse", get_test_data("SRR3669912_2.fastq"),
+                                                       "--report", "append"])
+        mock_abund.check_arguments(rerun_args)
+        mock_abund.decide_stage(rerun_args)
+        self.assertTrue(mock_abund.stage_status("align_map"))
+        self.assertTrue(mock_abund.stage_status("sam_sum"))
+        self.assertTrue(mock_abund.stage_status("summarise"))
+
+        return
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_commands.py b/tests/test_commands.py
@@ -45,6 +45,8 @@ def test_abundance(self):
                                   "--reads", get_test_data("test_TarA.1.fq"),
                                   "--reverse", get_test_data("test_TarA.2.fq"),
                                   "--pairing", "pe",
+                                  "--report", "update",
+                                  "--metric", "fpkm",
                                   "--num_procs", str(self.num_procs),
                                   "--overwrite"]
         abund_dict = abundance(abundance_command_list)["test_TarA"]
@@ -61,6 +63,7 @@ def test_abundance(self):
                                   "--reads", get_test_data("test_TarA.fastq.gz"),
                                   "--pairing", "pe",
                                   "--metric", "tpm",
+                                  "--report", "append",
                                   "--num_procs", str(self.num_procs),
                                   "--delete", "--overwrite"]
         abundance(abundance_command_list)
@@ -69,7 +72,7 @@ def test_abundance(self):
         abundant_pqueries = []
         for rp in pqueries:
             abundant_pqueries += [pq for pq in pqueries[rp] if pq.abundance > 0.0]
-        self.assertEqual(33702, round(sum([pq.abundance for pq in abundant_pqueries])))
+        self.assertEqual(33850, round(sum([pq.abundance for pq in abundant_pqueries])))
 
         # Replace the classification table
         copyfile(os.path.join(self.ts_assign_output, "tmp.tsv"), classification_table)

diff --git a/treesapp/__init__.py b/treesapp/__init__.py
@@ -1,4 +1,4 @@
 #! /usr/bin/env python
 """A Python package for gene-centric taxonomic and functional classification using phylogenetic placement."""
 name = "treesapp"
-__version__ = "0.10.3"
+__version__ = "0.10.4"
diff --git a/treesapp/abundance.py b/treesapp/abundance.py
@@ -58,7 +58,7 @@ def abundance(sys_args) -> dict:
 
         if ts_abund.stage_status("align_map"):
             wrapper.align_reads_to_nucs(ts_abund.executables["bwa"], ts_abund.ref_nuc_seqs,
-                                        ts_abund.stage_output_dir, fwd_reads, args.pairing,
+                                        ts_abund.stage_lookup("align_map").dir_path, fwd_reads, args.pairing,
                                         reverse=rev_reads, sam_file=ts_abund.aln_file, num_threads=args.num_threads)
             ts_abund.increment_stage_dir()
 
@@ -89,21 +89,35 @@ def abundance(sys_args) -> dict:
     ts_abund.delete_intermediates(args.delete)
 
     if args.report != "nothing" and os.path.isfile(ts_abund.classifications):
-        pqueries = file_parsers.load_classified_sequences_from_assign_output(ts_abund.output_dir)
+        refpkg_pqueries = file_parsers.load_classified_sequences_from_assign_output(ts_abund.output_dir)
 
         # Collect the names of all classified PQueries
         classified_seqs = set()
-        for rp_pqs in pqueries.values():
+        for rp_pqs in refpkg_pqueries.values():
             classified_seqs.update({pq.seq_name for pq in rp_pqs})
 
+        # Select a unique set of PQuery instances to use in the updated classification table
+        for refpkg_name, pqueries in refpkg_pqueries.items():
+            unique_pqueries = []
+            seq_name_list = list({pq.place_name for pq in pqueries})
+            for pquery in pqueries:
+                i = 0
+                while i < len(seq_name_list):
+                    if pquery.place_name == seq_name_list[i]:
+                        unique_pqueries.append(pquery)
+                        seq_name_list.pop(i)
+                        i = len(seq_name_list)
+                    i += 1
+            refpkg_pqueries[refpkg_name] = unique_pqueries
+
         # Fill PQuery.abundance attribute
         for sample_name, abundance_map in abundance_dict.items():
             # Filter the abundance_map dictionary to just the classified sequences
             absentee = set(abundance_map.keys()).difference(classified_seqs)
             while absentee:
                 abundance_map.pop(absentee.pop())
-            phylo_seq.abundify_tree_saps(pqueries, abundance_map)
-            file_parsers.write_classification_table(pqueries, sample_name, ts_abund.classifications,
+            phylo_seq.abundify_tree_saps(refpkg_pqueries, abundance_map)
+            file_parsers.write_classification_table(refpkg_pqueries, sample_name, ts_abund.classifications,
                                                     append=ts_abund.append_abundance)
             ts_abund.append_abundance = True
 

diff --git a/treesapp/classy.py b/treesapp/classy.py
@@ -1474,7 +1474,7 @@ def __init__(self):
         self.ref_nuc_seqs = ""
         self.classifications = ""
         self.aln_file = ""
-        self.append_abundance = False
+        self.append_abundance = True
         self.fq_suffix_re = re.compile(r"([._-])+(pe|fq|fastq|fwd|R1|1)$")
         self.idx_extensions = ["amb", "ann", "bwt", "pac", "sa"]
         self.stages = {0: ModuleFunction("align_map", 0),
@@ -1508,13 +1508,26 @@ def check_arguments(self, args):
         self.aln_file = self.stage_lookup("align_map").dir_path + \
                         '.'.join(os.path.basename(self.ref_nuc_seqs).split('.')[0:-1]) + ".sam"
 
+        # Ensure all the FASTQ file paths are valid for the forward reads
+        for reads_file in args.reads:
+            if not os.path.isfile(reads_file):
+                logging.error("Unable to find forward reads file '{}'.\n".format(reads_file))
+                sys.exit(1)
+
         if len(args.reverse) > 0:
             if len(args.reads) != len(args.reverse):
                 logging.error("Number of fastq files differs between reads ({}) and reverse ({}) arguments!.\n"
                               "".format(len(args.reads), len(args.reverse)))
                 sys.exit(3)
-
-        if args.report == "update":
+            # Ensure all the FASTQ file paths are valid for the reverse reads
+            for reads_file in args.reverse:
+                if not os.path.isfile(reads_file):
+                    logging.error("Unable to find reverse reads files '{}'.\n".format(reads_file))
+                    sys.exit(5)
+
+        if args.report == "append":
+            self.append_abundance = True
+        else:
             self.append_abundance = False
 
         return
@@ -1529,6 +1542,14 @@ def decide_stage(self, args):
         :return: None
         """
         self.validate_continue(args)
+
+        # Not necessary for either 'update'
+        for reads_file in args.reads:
+            self.strip_file_to_sample_name(reads_file)
+            if not os.path.isfile(self.stage_lookup("align_map").dir_path + self.sample_prefix + ".sam"):
+                self.change_stage_status("align_map", True)
+                break
+
         return
 
     def delete_intermediates(self, clean_up: bool) -> None:

diff --git a/treesapp/treesapp_args.py b/treesapp/treesapp_args.py
@@ -112,14 +112,15 @@ def add_search_params(self):
                                           "for it to be considered. [ DEFAULT = 20 ]")
 
     def add_abundance_params(self):
-        self.fpkm_opts.add_argument("--metric", required=False, default="fpkm", choices=["fpkm", "tpm"],
-                                    help="Selects which normalization metric to use, FPKM or TPM.")
+        self.fpkm_opts.add_argument("--metric", required=False, default="tpm", choices=["fpkm", "tpm"],
+                                    help="Selects which normalization metric to use, FPKM or TPM. [ DEFAULT = tpm ]")
         self.fpkm_opts.add_argument("-r", "--reads", required=False, nargs='+',
                                     help="FASTQ file containing to be aligned to predicted genes using BWA MEM")
         self.fpkm_opts.add_argument("-2", "--reverse", required=False, nargs='+', default=[],
                                     help="FASTQ file containing to reverse mate-pair reads to be aligned using BWA MEM")
         self.fpkm_opts.add_argument("-p", "--pairing", required=False, default='pe', choices=['pe', 'se'],
-                                    help="Indicating whether the reads are paired-end (pe) or single-end (se)")
+                                    help="Indicating whether the reads are paired-end (pe) or single-end (se). "
+                                         "[ DEFAULT = pe ]")
 
     def add_phylogeny_params(self):
         self.optopt.add_argument("-b", "--bootstraps",