Move fasta concatnate codes to phylotree module

stajichlab · Oct 25, 2023 · e81eed3 · e81eed3
1 parent 5bd9ea4
commit e81eed3
Show file tree

Hide file tree

Showing 3 changed files with 129 additions and 77 deletions.
diff --git a/src/phyling/libphyling.py b/src/phyling/libphyling.py
@@ -2,14 +2,14 @@
 from __future__ import annotations
 
 import csv
+import json
 import logging
 import re
 import shutil
 import subprocess
 import sys
 from collections.abc import Iterator
 from io import BytesIO, StringIO
-from itertools import product
 from multiprocessing.dummy import Pool
 from pathlib import Path
 
@@ -86,7 +86,7 @@ class msa_generator:
 
     def __init__(self, inputs: list[Path]):
         """Initialize the MSA generator object and perform sequence check and preprocessing."""
-        self._inputs = inputs
+        self._inputs = sorted(inputs)
         self._fastastrip_re = re.compile(r"(\.(aa|pep|cds|fna|faa))?\.(fasta|fas|faa|fna|seq|fa)(\.gz)?")
         self._inputs_basename_check()
         # Create a dict to retrieve the sequence later
@@ -157,14 +157,13 @@ def filter_orthologs(self) -> None:
             )
             sys.exit(1)
 
-    def align(self, output: Path, method: str, non_trim: bool, concat: bool, threads: int) -> None:
+    def align(self, output: Path, method: str, non_trim: bool, threads: int) -> None:
         """
         Align a set of identify orthologous proteins.
 
         First the sequences that have been identified as ortholog will be aligned through hmmalign or muscle. Next, do the dna to
         peptide back translation if self._cds_seqs is found (which means the inputs are cds fasta). Next, run the trimming to
-        remove uninformative regions (can be switch off). Finally, do fill_missing_taxon and concatenate all the MSA results if
-        the concat mode is enable. Otherwise output each MSA results in separate files.
+        remove uninformative regions (can be switch off). Finally, output each MSA results in separate files.
         """
         # Parallelize the MSA step
         logging.info(f"Use {method} for peptide MSA")
@@ -201,34 +200,16 @@ def align(self, output: Path, method: str, non_trim: bool, concat: bool, threads
                 alignmentList = pep_msa_List
                 ext = phyling.config.prot_aln_ext
 
-            if concat:
-                concat_alignments = MultipleSeqAlignment([])
-                for input in self._inputs:
-                    # strip the filename extension from name so that this resembles sp name
-                    sample = self._fastastrip_re.sub(r"", input.name)
-                    concat_alignments.append(SeqRecord(Seq(""), id=sample, description=""))
-                concat_alignments.sort()
-                alignmentList = pool.starmap(
-                    self._fill_missing_taxon, product([[seq.id for seq in concat_alignments]], alignmentList)
-                )
-                logging.info("Filling missing taxon done")
-
+        logging.info(f"Output individual fasta to folder {output}")
         for hmm, alignment in zip([hmm for hmm in self.orthologs.keys()], alignmentList):
-            output_aa = output / f"{hmm}.{ext}"
+            output_mfa = output / f"{hmm}.{ext}"
             alignment.sort()
-            if concat:
-                concat_alignments += alignment
-            else:
-                with open(output_aa, "w") as f:
-                    SeqIO.write(alignment, f, format="fasta")
-
-        if concat:
-            output_concat = output / f"concat_alignments.{ext}"
-            with open(output_concat, "w") as f:
-                SeqIO.write(concat_alignments, f, format="fasta")
-            logging.info(f"Output concatenated fasta to {output_concat}")
-        else:
-            logging.info(f"Output individual fasta to folder {output}")
+            with open(output_mfa, "w") as f:
+                SeqIO.write(alignment, f, format="fasta")
+
+        output_metadata = output / ".metadata.json"
+        with open(output_metadata, "w") as f:
+            json.dump(self._generate_metadata(), f)
 
     def _sequence_preprocess(self, input: Path) -> tuple(int, int):
         """
@@ -438,21 +419,17 @@ def _run_muscle(self, hits: set) -> MultipleSeqAlignment:
             seq.seq = Seq(re.sub(r"[ZzBbXx\*\.]", "-", str(seq.seq)))
         return alignment
 
-    def _fill_missing_taxon(self, taxonList: list, alignment: MultipleSeqAlignment) -> MultipleSeqAlignment:
-        """Include empty gap-only strings in alignment for taxa lacking an ortholog."""
-        missing = set(taxonList) - {seq.id for seq in alignment}
-        for sample in missing:
-            alignment.append(
-                SeqRecord(
-                    Seq("-" * alignment.get_alignment_length()),
-                    id=sample,
-                    description="",
-                )
-            )
-        return alignment
+    def _generate_metadata(self) -> dict:
+        """Generate metadata for phylotree module."""
+        metadata = {"sample": {}}
+        for input in self._inputs:
+            sample = self._fastastrip_re.sub(r"", input.name)
+            metadata["sample"].setdefault(sample, sample)
 
+        return metadata
 
-def main(inputs, input_dir, output, markerset, evalue, method, non_trim, concat, threads, **kwargs):
+
+def main(inputs, input_dir, output, markerset, evalue, method, non_trim, threads, **kwargs):
     """
     Perform multiple sequence alignment (MSA) on orthologous sequences that match the hmm markers across samples.
 
@@ -462,12 +439,9 @@ def main(inputs, input_dir, output, markerset, evalue, method, non_trim, concat,
     an evalue of 1e-10 will be used as the default cutoff.
 
     Sequences corresponding to orthologs found in more than 3 samples are extracted from each input. These sequences
-    then undergo MSA with hmmalign or muscle. The resulting alignment is further trimmed using clipkit by default. You
-    can use the -n/--non_trim option to skip the trimming step.
-
-    By default, the alignment results are output separately for each hmm marker. The consensus tree method is applying
-    by default to construct a phylogenetic tree. You can use the -c/--concat option to concatenate the aligned
-    sequences by sample and build a single tree afterward.
+    then undergo MSA with hmmalign or muscle. The resulting alignments are further trimmed using clipkit by default.
+    You can use the -n/--non_trim option to skip the trimming step. Finally, The alignment results are output
+    separately for each hmm marker.
     """
     # If args.input_dir is used to instead of args.inputs
     if input_dir:
@@ -503,4 +477,4 @@ def main(inputs, input_dir, output, markerset, evalue, method, non_trim, concat,
     msa = msa_generator(inputs)
     msa.search(markerset, evalue=evalue, threads=threads)
     msa.filter_orthologs()
-    msa.align(output, non_trim=non_trim, method=method, concat=concat, threads=threads)
+    msa.align(output, non_trim=non_trim, method=method, threads=threads)
diff --git a/src/phyling/phyling.py b/src/phyling/phyling.py
@@ -97,12 +97,6 @@ def parser_submodule(parser, parent_parser) -> None:
         action="store_true",
         help="Report non-trimmed alignment results",
     )
-    p_aln.add_argument(
-        "-c",
-        "--concat",
-        action="store_true",
-        help="Report concatenated alignment results",
-    )
     p_aln.add_argument(
         "-t",
         "--threads",
@@ -141,6 +135,12 @@ def parser_submodule(parser, parent_parser) -> None:
         default="upgma",
         help='Algorithm used for tree building (default="upgma")',
     )
+    p_tree.add_argument(
+        "-c",
+        "--concat",
+        action="store_true",
+        help="Concatenated alignment results",
+    )
     p_tree.add_argument("-f", "--figure", action="store_true", help="Generate a matplotlib tree figure")
     p_tree.add_argument(
         "-t",

diff --git a/src/phyling/phylotree.py b/src/phyling/phylotree.py
@@ -1,17 +1,22 @@
 """Phylogenetic tree construction methods."""
 from __future__ import annotations
 
+import json
 import logging
 import shutil
 import subprocess
 import sys
 from io import StringIO
+from itertools import product
 from multiprocessing.dummy import Pool
 from pathlib import Path
 
 import matplotlib.pyplot as plt
-from Bio import AlignIO, Phylo
+from Bio import AlignIO, Phylo, SeqIO
+from Bio.Align import MultipleSeqAlignment
 from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
 
 import phyling.config
 
@@ -37,6 +42,17 @@ def __init__(self, method: str, threads: int, *file: Path):
             )
             sys.exit(1)
 
+    def get(self) -> Phylo.BaseTree.Tree:
+        """Run the phylogeny analysis and get the tree object list."""
+        if len(self._file) == 1:
+            final_tree = self._build(self._file[0], self._threads)
+        else:
+            logging.debug(f"Run in multiprocesses mode. {self._threads} jobs are run concurrently")
+            with Pool(self._threads) as pool:
+                trees = pool.map(self._build, self._file)
+            final_tree = run_astral(trees)
+        return final_tree
+
     def _with_VeryFastTree(self, file: Path, threads: int) -> Phylo.BaseTree.Tree:
         stream = StringIO()
         with open(file) as f:
@@ -72,16 +88,37 @@ def _build(self, file: Path, threads: int = 1) -> Phylo.BaseTree.Tree:
         logging.debug(f"Tree building on {file.name} is done")
         return tree
 
-    def get(self) -> Phylo.BaseTree.Tree:
-        """Run the phylogeny analysis and get the tree object list."""
-        if len(self._file) == 1:
-            final_tree = self._build(self._file[0], self._threads)
-        else:
-            logging.debug(f"Run in multiprocesses mode. {self._threads} jobs are run concurrently")
-            with Pool(self._threads) as pool:
-                trees = pool.map(self._build, self._file)
-            final_tree = run_astral(trees)
-        return final_tree
+
+def concatenate_fasta(taxonList: list, alignmentList: list[MultipleSeqAlignment], threads: int) -> MultipleSeqAlignment:
+    """Concatenate multiple MSA results into a single fasta."""
+    concat_alignments = MultipleSeqAlignment([])
+    for sample in taxonList:
+        concat_alignments.append(SeqRecord(Seq(""), id=sample, description=""))
+    concat_alignments.sort()
+    with Pool(threads) as pool:
+        alignmentList = pool.starmap(
+            fill_missing_taxon, product([[seq.id for seq in concat_alignments]], alignmentList)
+        )
+    logging.debug("Filling missing taxon done")
+    for alignment in alignmentList:
+        concat_alignments += alignment
+    logging.debug("Concatenate fasta done")
+    return concat_alignments
+
+
+def fill_missing_taxon(taxonList: list, alignment: MultipleSeqAlignment) -> MultipleSeqAlignment:
+    """Include empty gap-only strings in alignment for taxa lacking an ortholog."""
+    missing = set(taxonList) - {seq.id for seq in alignment}
+    for sample in missing:
+        alignment.append(
+            SeqRecord(
+                Seq("-" * alignment.get_alignment_length()),
+                id=sample,
+                description="",
+            )
+        )
+        alignment.sort()
+    return alignment
 
 
 def run_astral(trees: list[Phylo.BaseTree.Tree]) -> Phylo.BaseTree.Tree:
@@ -98,12 +135,12 @@ def run_astral(trees: list[Phylo.BaseTree.Tree]) -> Phylo.BaseTree.Tree:
     return Phylo.read(StringIO(stdout), "newick")
 
 
-def phylotree(inputs, input_dir, output, method, figure, threads, **kwargs):
+def phylotree(inputs, input_dir, output, method, figure, concat, threads, **kwargs):
     """
     Construct a phylogenetic tree based on the results of multiple sequence alignment (MSA).
 
-    If multiple MSA results are given, the consensus tree method will be employed, using a 50% cutoff to represent
-    the majority of all the trees.
+    By default the consensus tree method will be employed which use a 50% cutoff to represent the majority of all the
+    trees. You can use the -c/--concat option to concatenate the MSA and build a single tree instead.
 
     By default, the UPGMA algorithm is used for tree construction. Users can switch to the Neighbor Joining method by
     specifying the -m/--method nj.
@@ -115,17 +152,58 @@ def phylotree(inputs, input_dir, output, method, figure, threads, **kwargs):
     method_dict = {"upgma": "UPGMA", "nj": "Neighbor Joining", "ft": "VeryFastTree"}
     logging.info(f"Algorithm choose for tree building: {method_dict[method]}")
     if input_dir:
-        inputs = list(Path(input_dir).glob(f"*.{phyling.config.aln_ext}"))
+        inputs = [file for file in Path(input_dir).glob(f"*.{phyling.config.aln_ext}")]
     else:
-        inputs = [Path(sample) for sample in inputs]
+        inputs = [Path(file) for file in inputs if file.endswith(phyling.config.aln_ext)]
+        input_dir = {file.parent for file in inputs}
+
+        if len(input_dir) > 1:
+            logging.error("Inputs do not came from the same folder. Cannot obtain the correct metadata.")
+            logging.error(input_dir)
+            sys.exit(1)
+        else:
+            input_dir = input_dir.pop()
+
     logging.info(f"Found {len(inputs)} MSA fasta")
-    if inputs[0].name == f"concat_alignments.{phyling.config.aln_ext}":
-        logging.info("Generate phylogenetic tree the on concatenated fasta")
-    else:
-        logging.info("Generate phylogenetic tree on all MSA fasta and conclude an majority consensus tree")
+
+    # Get the metadata under the inputs folder
+    input_dir = Path(input_dir)
+    metadata = input_dir / ".metadata.json"
+    if not metadata.exists():
+        logging.error("Cannot find the metadata. Aborted.")
+        sys.exit(1)
+    with open(metadata) as f:
+        metadata = json.load(f)
+    if len(metadata["sample"]) < 3:
+        logging.error(
+            "Something wrong with the metadata. It contains less than 3 taxa which is not valid for tree building."
+        )
+        sys.exit(1)
+
     output = Path(output)
     output.mkdir(exist_ok=True)
 
+    if concat and len(inputs) > 1:
+        # Get the concat_alignments
+        logging.info("Generate phylogenetic tree the on concatenated fasta")
+        alignmentList = []
+        for file in inputs:
+            alignmentList.append(AlignIO.read(file, format="fasta"))
+        concat_alignments = concatenate_fasta(list(metadata["sample"].keys()), alignmentList, threads=threads)
+        # Rename the sample
+        for seq in concat_alignments:
+            seq.name = seq.id = metadata["sample"][seq.id]
+            seq.description = ""
+
+        # Output the concatenated MSA to file
+        output_concat = output / f"concat_alignments.{phyling.config.aln_ext}"
+        with open(output_concat, "w") as f:
+            SeqIO.write(concat_alignments, f, format="fasta")
+        logging.info(f"Concatenated fasta is output to {output_concat}")
+        inputs = [output_concat]
+    else:
+        logging.info("Generate phylogenetic tree on all MSA fasta and conclude a majority consensus tree")
+
     tree_generator_obj = tree_generator(method, threads, *inputs)
     final_tree = tree_generator_obj.get()
     Phylo.draw_ascii(final_tree)