Skip to content

Commit

Permalink
Move fasta concatnate codes to phylotree module
Browse files Browse the repository at this point in the history
  • Loading branch information
chtsai0105 committed Oct 25, 2023
1 parent 5bd9ea4 commit e81eed3
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 77 deletions.
76 changes: 25 additions & 51 deletions src/phyling/libphyling.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
from __future__ import annotations

import csv
import json
import logging
import re
import shutil
import subprocess
import sys
from collections.abc import Iterator
from io import BytesIO, StringIO
from itertools import product
from multiprocessing.dummy import Pool
from pathlib import Path

Expand Down Expand Up @@ -86,7 +86,7 @@ class msa_generator:

def __init__(self, inputs: list[Path]):
"""Initialize the MSA generator object and perform sequence check and preprocessing."""
self._inputs = inputs
self._inputs = sorted(inputs)
self._fastastrip_re = re.compile(r"(\.(aa|pep|cds|fna|faa))?\.(fasta|fas|faa|fna|seq|fa)(\.gz)?")
self._inputs_basename_check()
# Create a dict to retrieve the sequence later
Expand Down Expand Up @@ -157,14 +157,13 @@ def filter_orthologs(self) -> None:
)
sys.exit(1)

def align(self, output: Path, method: str, non_trim: bool, concat: bool, threads: int) -> None:
def align(self, output: Path, method: str, non_trim: bool, threads: int) -> None:
"""
Align a set of identify orthologous proteins.
First the sequences that have been identified as ortholog will be aligned through hmmalign or muscle. Next, do the dna to
peptide back translation if self._cds_seqs is found (which means the inputs are cds fasta). Next, run the trimming to
remove uninformative regions (can be switch off). Finally, do fill_missing_taxon and concatenate all the MSA results if
the concat mode is enable. Otherwise output each MSA results in separate files.
remove uninformative regions (can be switch off). Finally, output each MSA results in separate files.
"""
# Parallelize the MSA step
logging.info(f"Use {method} for peptide MSA")
Expand Down Expand Up @@ -201,34 +200,16 @@ def align(self, output: Path, method: str, non_trim: bool, concat: bool, threads
alignmentList = pep_msa_List
ext = phyling.config.prot_aln_ext

if concat:
concat_alignments = MultipleSeqAlignment([])
for input in self._inputs:
# strip the filename extension from name so that this resembles sp name
sample = self._fastastrip_re.sub(r"", input.name)
concat_alignments.append(SeqRecord(Seq(""), id=sample, description=""))
concat_alignments.sort()
alignmentList = pool.starmap(
self._fill_missing_taxon, product([[seq.id for seq in concat_alignments]], alignmentList)
)
logging.info("Filling missing taxon done")

logging.info(f"Output individual fasta to folder {output}")
for hmm, alignment in zip([hmm for hmm in self.orthologs.keys()], alignmentList):
output_aa = output / f"{hmm}.{ext}"
output_mfa = output / f"{hmm}.{ext}"
alignment.sort()
if concat:
concat_alignments += alignment
else:
with open(output_aa, "w") as f:
SeqIO.write(alignment, f, format="fasta")

if concat:
output_concat = output / f"concat_alignments.{ext}"
with open(output_concat, "w") as f:
SeqIO.write(concat_alignments, f, format="fasta")
logging.info(f"Output concatenated fasta to {output_concat}")
else:
logging.info(f"Output individual fasta to folder {output}")
with open(output_mfa, "w") as f:
SeqIO.write(alignment, f, format="fasta")

output_metadata = output / ".metadata.json"
with open(output_metadata, "w") as f:
json.dump(self._generate_metadata(), f)

def _sequence_preprocess(self, input: Path) -> tuple(int, int):
"""
Expand Down Expand Up @@ -438,21 +419,17 @@ def _run_muscle(self, hits: set) -> MultipleSeqAlignment:
seq.seq = Seq(re.sub(r"[ZzBbXx\*\.]", "-", str(seq.seq)))
return alignment

def _fill_missing_taxon(self, taxonList: list, alignment: MultipleSeqAlignment) -> MultipleSeqAlignment:
"""Include empty gap-only strings in alignment for taxa lacking an ortholog."""
missing = set(taxonList) - {seq.id for seq in alignment}
for sample in missing:
alignment.append(
SeqRecord(
Seq("-" * alignment.get_alignment_length()),
id=sample,
description="",
)
)
return alignment
def _generate_metadata(self) -> dict:
"""Generate metadata for phylotree module."""
metadata = {"sample": {}}
for input in self._inputs:
sample = self._fastastrip_re.sub(r"", input.name)
metadata["sample"].setdefault(sample, sample)

return metadata

def main(inputs, input_dir, output, markerset, evalue, method, non_trim, concat, threads, **kwargs):

def main(inputs, input_dir, output, markerset, evalue, method, non_trim, threads, **kwargs):
"""
Perform multiple sequence alignment (MSA) on orthologous sequences that match the hmm markers across samples.
Expand All @@ -462,12 +439,9 @@ def main(inputs, input_dir, output, markerset, evalue, method, non_trim, concat,
an evalue of 1e-10 will be used as the default cutoff.
Sequences corresponding to orthologs found in more than 3 samples are extracted from each input. These sequences
then undergo MSA with hmmalign or muscle. The resulting alignment is further trimmed using clipkit by default. You
can use the -n/--non_trim option to skip the trimming step.
By default, the alignment results are output separately for each hmm marker. The consensus tree method is applying
by default to construct a phylogenetic tree. You can use the -c/--concat option to concatenate the aligned
sequences by sample and build a single tree afterward.
then undergo MSA with hmmalign or muscle. The resulting alignments are further trimmed using clipkit by default.
You can use the -n/--non_trim option to skip the trimming step. Finally, The alignment results are output
separately for each hmm marker.
"""
# If args.input_dir is used to instead of args.inputs
if input_dir:
Expand Down Expand Up @@ -503,4 +477,4 @@ def main(inputs, input_dir, output, markerset, evalue, method, non_trim, concat,
msa = msa_generator(inputs)
msa.search(markerset, evalue=evalue, threads=threads)
msa.filter_orthologs()
msa.align(output, non_trim=non_trim, method=method, concat=concat, threads=threads)
msa.align(output, non_trim=non_trim, method=method, threads=threads)
12 changes: 6 additions & 6 deletions src/phyling/phyling.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,6 @@ def parser_submodule(parser, parent_parser) -> None:
action="store_true",
help="Report non-trimmed alignment results",
)
p_aln.add_argument(
"-c",
"--concat",
action="store_true",
help="Report concatenated alignment results",
)
p_aln.add_argument(
"-t",
"--threads",
Expand Down Expand Up @@ -141,6 +135,12 @@ def parser_submodule(parser, parent_parser) -> None:
default="upgma",
help='Algorithm used for tree building (default="upgma")',
)
p_tree.add_argument(
"-c",
"--concat",
action="store_true",
help="Concatenated alignment results",
)
p_tree.add_argument("-f", "--figure", action="store_true", help="Generate a matplotlib tree figure")
p_tree.add_argument(
"-t",
Expand Down
118 changes: 98 additions & 20 deletions src/phyling/phylotree.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
"""Phylogenetic tree construction methods."""
from __future__ import annotations

import json
import logging
import shutil
import subprocess
import sys
from io import StringIO
from itertools import product
from multiprocessing.dummy import Pool
from pathlib import Path

import matplotlib.pyplot as plt
from Bio import AlignIO, Phylo
from Bio import AlignIO, Phylo, SeqIO
from Bio.Align import MultipleSeqAlignment
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import phyling.config

Expand All @@ -37,6 +42,17 @@ def __init__(self, method: str, threads: int, *file: Path):
)
sys.exit(1)

def get(self) -> Phylo.BaseTree.Tree:
"""Run the phylogeny analysis and get the tree object list."""
if len(self._file) == 1:
final_tree = self._build(self._file[0], self._threads)
else:
logging.debug(f"Run in multiprocesses mode. {self._threads} jobs are run concurrently")
with Pool(self._threads) as pool:
trees = pool.map(self._build, self._file)
final_tree = run_astral(trees)
return final_tree

def _with_VeryFastTree(self, file: Path, threads: int) -> Phylo.BaseTree.Tree:
stream = StringIO()
with open(file) as f:
Expand Down Expand Up @@ -72,16 +88,37 @@ def _build(self, file: Path, threads: int = 1) -> Phylo.BaseTree.Tree:
logging.debug(f"Tree building on {file.name} is done")
return tree

def get(self) -> Phylo.BaseTree.Tree:
"""Run the phylogeny analysis and get the tree object list."""
if len(self._file) == 1:
final_tree = self._build(self._file[0], self._threads)
else:
logging.debug(f"Run in multiprocesses mode. {self._threads} jobs are run concurrently")
with Pool(self._threads) as pool:
trees = pool.map(self._build, self._file)
final_tree = run_astral(trees)
return final_tree

def concatenate_fasta(taxonList: list, alignmentList: list[MultipleSeqAlignment], threads: int) -> MultipleSeqAlignment:
"""Concatenate multiple MSA results into a single fasta."""
concat_alignments = MultipleSeqAlignment([])
for sample in taxonList:
concat_alignments.append(SeqRecord(Seq(""), id=sample, description=""))
concat_alignments.sort()
with Pool(threads) as pool:
alignmentList = pool.starmap(
fill_missing_taxon, product([[seq.id for seq in concat_alignments]], alignmentList)
)
logging.debug("Filling missing taxon done")
for alignment in alignmentList:
concat_alignments += alignment
logging.debug("Concatenate fasta done")
return concat_alignments


def fill_missing_taxon(taxonList: list, alignment: MultipleSeqAlignment) -> MultipleSeqAlignment:
"""Include empty gap-only strings in alignment for taxa lacking an ortholog."""
missing = set(taxonList) - {seq.id for seq in alignment}
for sample in missing:
alignment.append(
SeqRecord(
Seq("-" * alignment.get_alignment_length()),
id=sample,
description="",
)
)
alignment.sort()
return alignment


def run_astral(trees: list[Phylo.BaseTree.Tree]) -> Phylo.BaseTree.Tree:
Expand All @@ -98,12 +135,12 @@ def run_astral(trees: list[Phylo.BaseTree.Tree]) -> Phylo.BaseTree.Tree:
return Phylo.read(StringIO(stdout), "newick")


def phylotree(inputs, input_dir, output, method, figure, threads, **kwargs):
def phylotree(inputs, input_dir, output, method, figure, concat, threads, **kwargs):
"""
Construct a phylogenetic tree based on the results of multiple sequence alignment (MSA).
If multiple MSA results are given, the consensus tree method will be employed, using a 50% cutoff to represent
the majority of all the trees.
By default the consensus tree method will be employed which use a 50% cutoff to represent the majority of all the
trees. You can use the -c/--concat option to concatenate the MSA and build a single tree instead.
By default, the UPGMA algorithm is used for tree construction. Users can switch to the Neighbor Joining method by
specifying the -m/--method nj.
Expand All @@ -115,17 +152,58 @@ def phylotree(inputs, input_dir, output, method, figure, threads, **kwargs):
method_dict = {"upgma": "UPGMA", "nj": "Neighbor Joining", "ft": "VeryFastTree"}
logging.info(f"Algorithm choose for tree building: {method_dict[method]}")
if input_dir:
inputs = list(Path(input_dir).glob(f"*.{phyling.config.aln_ext}"))
inputs = [file for file in Path(input_dir).glob(f"*.{phyling.config.aln_ext}")]
else:
inputs = [Path(sample) for sample in inputs]
inputs = [Path(file) for file in inputs if file.endswith(phyling.config.aln_ext)]
input_dir = {file.parent for file in inputs}

if len(input_dir) > 1:
logging.error("Inputs do not came from the same folder. Cannot obtain the correct metadata.")
logging.error(input_dir)
sys.exit(1)
else:
input_dir = input_dir.pop()

logging.info(f"Found {len(inputs)} MSA fasta")
if inputs[0].name == f"concat_alignments.{phyling.config.aln_ext}":
logging.info("Generate phylogenetic tree the on concatenated fasta")
else:
logging.info("Generate phylogenetic tree on all MSA fasta and conclude an majority consensus tree")

# Get the metadata under the inputs folder
input_dir = Path(input_dir)
metadata = input_dir / ".metadata.json"
if not metadata.exists():
logging.error("Cannot find the metadata. Aborted.")
sys.exit(1)
with open(metadata) as f:
metadata = json.load(f)
if len(metadata["sample"]) < 3:
logging.error(
"Something wrong with the metadata. It contains less than 3 taxa which is not valid for tree building."
)
sys.exit(1)

output = Path(output)
output.mkdir(exist_ok=True)

if concat and len(inputs) > 1:
# Get the concat_alignments
logging.info("Generate phylogenetic tree the on concatenated fasta")
alignmentList = []
for file in inputs:
alignmentList.append(AlignIO.read(file, format="fasta"))
concat_alignments = concatenate_fasta(list(metadata["sample"].keys()), alignmentList, threads=threads)
# Rename the sample
for seq in concat_alignments:
seq.name = seq.id = metadata["sample"][seq.id]
seq.description = ""

# Output the concatenated MSA to file
output_concat = output / f"concat_alignments.{phyling.config.aln_ext}"
with open(output_concat, "w") as f:
SeqIO.write(concat_alignments, f, format="fasta")
logging.info(f"Concatenated fasta is output to {output_concat}")
inputs = [output_concat]
else:
logging.info("Generate phylogenetic tree on all MSA fasta and conclude a majority consensus tree")

tree_generator_obj = tree_generator(method, threads, *inputs)
final_tree = tree_generator_obj.get()
Phylo.draw_ascii(final_tree)
Expand Down

0 comments on commit e81eed3

Please sign in to comment.