Skip to content

Commit

Permalink
Fix bug: peptide sequences show up when using cds sequences as input.
Browse files Browse the repository at this point in the history
  • Loading branch information
chtsai0105 committed Oct 26, 2023
1 parent cbc3013 commit c10fc78
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 36 deletions.
22 changes: 13 additions & 9 deletions src/phyling/libphyling.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,19 @@ def trim_gaps(

clipkit_pep_msa = MSA.from_bio_msa(pep_msa, gap_chars="-")
clipkit_pep_msa.trim(mode=TrimmingMode.gappy, gap_threshold=gaps)
bio_msa = clipkit_pep_msa.to_bio_msa()

if clipkit_pep_msa._site_positions_to_trim.size > 0 and cds_msa:
infoList = [{"id": rec.id, "name": rec.name, "description": rec.description} for rec in cds_msa]
clipkit_cds_msa = MSA.from_bio_msa(cds_msa)
pep_trimList_expanded = np.expand_dims(clipkit_pep_msa._site_positions_to_trim, axis=1)
cds_site_positions_to_trim = (pep_trimList_expanded * np.array([3]) + np.array([0, 1, 2])).flatten()
clipkit_cds_msa.trim(site_positions_to_trim=cds_site_positions_to_trim)
bio_msa = clipkit_cds_msa.to_bio_msa()
pep_msa = clipkit_pep_msa.to_bio_msa()

if cds_msa:
if clipkit_pep_msa._site_positions_to_trim.size > 0:
infoList = [{"id": rec.id, "name": rec.name, "description": rec.description} for rec in cds_msa]
clipkit_cds_msa = MSA.from_bio_msa(cds_msa)
pep_trimList_expanded = np.expand_dims(clipkit_pep_msa._site_positions_to_trim, axis=1)
cds_site_positions_to_trim = (pep_trimList_expanded * np.array([3]) + np.array([0, 1, 2])).flatten()
clipkit_cds_msa.trim(site_positions_to_trim=cds_site_positions_to_trim)
cds_msa = clipkit_cds_msa.to_bio_msa()
bio_msa = cds_msa
else:
bio_msa = pep_msa

for new_rec, rec in zip(bio_msa, infoList):
new_rec.id, new_rec.name, new_rec.description = rec["id"], rec["name"], rec["description"]
Expand Down
95 changes: 68 additions & 27 deletions tests/libphyling_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,42 +108,83 @@ def test_bp_mrtrans_with_cds_msa_id(self):


class Testtrimgaps:
pep_msa1 = create_msa(["-MG--A", "M-GT-C", "MMGTG-"], ["Species_A", "Species_B", "Species_C"])
pep_msa2 = create_msa(["MTG--A", "M-GT-C"], ["Species_A", "Species_B"])

def test_trim_gaps_basic(self):
result_msa = trim_gaps(self.pep_msa1, gaps=0.5)
assert len(result_msa) == 3
assert str(result_msa[0].seq) == "-MG-A"
assert str(result_msa[1].seq) == "M-GTC"
assert str(result_msa[2].seq) == "MMGT-"
pep_msa = create_msa(
[
"MI-T-C",
"M-AT-C",
"MI-TG-",
"M-GT-C",
],
[
"Species_A",
"Species_B",
"Species_C",
"Species_D",
],
)
cds_msa = create_msa(
[
"ATGATC---ACG---TGT",
"ATG---GCTACT---TGT",
"ATGATA---ACAGGA---",
"ATG---GGTACT---TGC",
],
[
"Species_A",
"Species_B",
"Species_C",
"Species_D",
],
)

def test_trim_gaps_with_peptide(self):
result_msa = trim_gaps(self.pep_msa, gaps=0.6)
assert len(result_msa) == 4
assert str(result_msa[0].seq) == "MI-TC"
assert str(result_msa[1].seq) == "M-ATC"
assert str(result_msa[2].seq) == "MI-T-"
assert str(result_msa[3].seq) == "M-GTC"

def test_trim_gaps_with_peptide_no_trim(self):
result_msa = trim_gaps(self.pep_msa, gaps=0.8)
assert len(result_msa) == 4
assert str(result_msa[0].seq) == "MI-T-C"
assert str(result_msa[1].seq) == "M-AT-C"
assert str(result_msa[2].seq) == "MI-TG-"
assert str(result_msa[3].seq) == "M-GT-C"

def test_trim_gaps_with_cds_msa(self):
cds_msa = create_msa(["ATGACTGGA------GCT", "ATG---GCTACT---TGT"], ["Species_A", "Species_B"])
result_msa = trim_gaps(self.pep_msa2, cds_msa, gaps=0.5)
assert len(result_msa) == 2
assert str(result_msa[0].seq) == "ATGGGAGCT"
assert str(result_msa[1].seq) == "ATGGCTTGT"
result_msa = trim_gaps(self.pep_msa, self.cds_msa, gaps=0.6)
assert len(result_msa) == 4
assert str(result_msa[0].seq) == "ATGATC---ACGTGT"
assert str(result_msa[1].seq) == "ATG---GCTACTTGT"
assert str(result_msa[2].seq) == "ATGATA---ACA---"
assert str(result_msa[3].seq) == "ATG---GGTACTTGC"

def test_trim_gaps_with_cds_msa_no_trim(self):
result_msa = trim_gaps(self.pep_msa, self.cds_msa, gaps=0.8)
assert len(result_msa) == 4
assert str(result_msa[0].seq) == "ATGATC---ACG---TGT"
assert str(result_msa[1].seq) == "ATG---GCTACT---TGT"
assert str(result_msa[2].seq) == "ATGATA---ACAGGA---"
assert str(result_msa[3].seq) == "ATG---GGTACT---TGC"

def test_trim_gaps_at_threshold_boundary(self):
result_msa = trim_gaps(self.pep_msa2, gaps=0.5)
assert len(result_msa) == 2
assert str(result_msa[0].seq) == "MGA"
assert str(result_msa[1].seq) == "MGC"

def test_trim_gaps_no_trim(self):
result_msa = trim_gaps(self.pep_msa2, gaps=0.51)
assert len(result_msa) == 2
assert str(result_msa[0].seq) == "MTG-A"
assert str(result_msa[1].seq) == "M-GTC"
result_msa = trim_gaps(self.pep_msa, gaps=0.5)
assert len(result_msa) == 4
assert str(result_msa[0].seq) == "MTC"
assert str(result_msa[1].seq) == "MTC"
assert str(result_msa[2].seq) == "MT-"
assert str(result_msa[3].seq) == "MTC"

def test_trim_gaps_invalid_gaps_value(self):
with pytest.raises(ValueError):
trim_gaps(self.pep_msa2, gaps=1.5)
trim_gaps(self.pep_msa, gaps=1.5)

def test_trim_gaps_msa_metadata(self):
result_msa = trim_gaps(self.pep_msa1, gaps=0.5)
assert len(result_msa) == 3
result_msa = trim_gaps(self.pep_msa, gaps=0.5)
assert len(result_msa) == 4
assert str(result_msa[0].id) == "Species_A"
assert str(result_msa[1].id) == "Species_B"
assert str(result_msa[2].id) == "Species_C"
assert str(result_msa[3].id) == "Species_D"

0 comments on commit c10fc78

Please sign in to comment.