Skip to content

Commit

Permalink
Finish codon-based trimming
Browse files Browse the repository at this point in the history
  • Loading branch information
TJBIII committed Feb 6, 2024
1 parent b2bb0da commit 5a18033
Show file tree
Hide file tree
Showing 6 changed files with 153 additions and 16 deletions.
2 changes: 2 additions & 0 deletions clipkit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def clipkit(

# override some options not currently available through programmatic interface
complement = False
codon = False
use_log = False
quiet = True

Expand All @@ -51,6 +52,7 @@ def clipkit(
gaps,
gap_characters,
complement,
codon,
TrimmingMode(mode),
use_log,
quiet,
Expand Down
8 changes: 7 additions & 1 deletion clipkit/clipkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class TrimRun:
input_file_format: FileFormat
output_file_format: FileFormat
gaps: float
codon: bool
version: str = current_version

@property
Expand All @@ -67,6 +68,7 @@ def run(
gaps: float,
gap_characters: Union[list, None],
complement: bool,
codon: bool,
mode: TrimmingMode,
use_log: bool,
quiet: bool,
Expand Down Expand Up @@ -99,7 +101,7 @@ def run(
gaps = smart_gap_threshold_determination(alignment, gap_characters)

msa = create_msa(alignment, gap_characters)
msa.trim(mode, gap_threshold=gaps)
msa.trim(mode, gap_threshold=gaps, site_positions_to_trim=None, codon=codon)

trim_run = TrimRun(
alignment,
Expand All @@ -109,6 +111,7 @@ def run(
input_file_format,
output_file_format,
gaps,
codon,
)

return trim_run, msa.stats
Expand All @@ -123,6 +126,7 @@ def execute(
gaps: float,
gap_characters: Union[list, None],
complement: bool,
codon: bool,
mode: TrimmingMode,
use_log: bool,
quiet: bool,
Expand Down Expand Up @@ -151,6 +155,7 @@ def execute(
gaps,
gap_characters,
complement,
codon,
mode,
use_log,
quiet,
Expand All @@ -168,6 +173,7 @@ def execute(
trim_run.gap_characters,
mode,
complement,
codon,
use_log,
)

Expand Down
47 changes: 36 additions & 11 deletions clipkit/msa.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import numpy as np
from itertools import chain
from typing import Union

from .modes import TrimmingMode
Expand Down Expand Up @@ -113,7 +114,12 @@ def trim(
site_positions_to_trim = np.array(site_positions_to_trim)
if not isinstance(site_positions_to_trim, np.ndarray):
raise ValueError("site_positions_to_trim must be a list or np array")
self._site_positions_to_trim = site_positions_to_trim

self._site_positions_to_trim = (
self.determine_all_codon_sites_to_trim(site_positions_to_trim)
if codon is True
else site_positions_to_trim
)
else:
self._site_positions_to_trim = self.determine_site_positions_to_trim(
mode,
Expand Down Expand Up @@ -210,10 +216,8 @@ def determine_site_positions_to_trim(self, mode, gap_threshold, codon=False):
Example:
[2, 9] -> [1, 2, 3, 7, 8, 9]
"""
sites_to_trim = map(
self.determine_codon_triplet_positions, sites_to_trim
).flatten()
print(sites_to_trim)
return self.determine_all_codon_sites_to_trim(sites_to_trim)

return sites_to_trim

def generate_debug_log_info(self):
Expand All @@ -234,10 +238,31 @@ def generate_debug_log_info(self):
gappyness,
)

def determine_codon_triplet_positions(alignment_position):
def determine_all_codon_sites_to_trim(self, sites_to_trim):
    """
    Expand every site in sites_to_trim to the full codon triplet containing it.

    Sites to trim -> all codon sites to trim
    [2, 8] -> [0, 1, 2, 6, 7, 8]

    Returns a np.ndarray of unique positions in ascending order. The dtype is
    forced to an integer type so that an empty input yields an empty *integer*
    array (np.array([]) would default to float64, which is not a valid index
    array).
    """
    codon_sites = set(
        chain.from_iterable(
            self.determine_codon_triplet_positions(site_pos)
            for site_pos in sites_to_trim
        )
    )
    # Sets have no guaranteed iteration order; sort so the result is
    # deterministic and matches the documented example.
    return np.array(sorted(codon_sites), dtype=int)

def determine_codon_triplet_positions(self, alignment_position):
    """
    Return the positions of the codon triplet that contains alignment_position.

    Block 0 -> [0, 1, 2], block 1 -> [3, 4, 5]
    Positions at or beyond self._original_length are left out, so a trailing
    partial codon yields fewer than three positions.
    """
    # Floor to the first position of the enclosing codon block.
    codon_start = (alignment_position // 3) * 3
    # Clamp the end of the triplet to the alignment length.
    codon_stop = min(codon_start + 3, self._original_length)
    return list(range(codon_start, codon_stop))
8 changes: 5 additions & 3 deletions clipkit/write.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def write_user_args(
gap_chars: list,
mode: "TrimmingMode",
complement: bool,
codon: bool,
use_log: bool,
) -> None:
if seq_type.value == "nt":
Expand All @@ -57,6 +58,7 @@ def write_user_args(
Gap characters: {gap_chars}
Trimming mode: {mode.value}
Create complementary output: {complement}
Process as codons: {codon}
Create log file: {use_log}
""" # noqa
)
Expand All @@ -77,9 +79,9 @@ def write_output_files_message(
------------------------
| Writing output files |
------------------------
trimmed alignment: {out_file_name}
complement file: {out_file_name + '.complement' if complement else False}
log file: {out_file_name + '.log' if use_log else False}
Trimmed alignment: {out_file_name}
Complement file: {out_file_name + '.complement' if complement else False}
Log file: {out_file_name + '.log' if use_log else False}
"""
)
)
Expand Down
32 changes: 31 additions & 1 deletion tests/integration/test_codon.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def test_simple_codon(self):
sequence_type=None,
complement=False,
codon=True,
gaps=0.2,
gaps=0.8,
mode=TrimmingMode.gappy,
use_log=False,
gap_characters=DEFAULT_NT_GAP_CHARS,
Expand All @@ -42,3 +42,33 @@ def test_simple_codon(self):
output_content = out_file.read()

assert expected_content == output_content

def test_simple_codon_all_trimmed(self):
    """
    Codon-mode gappy trimming of simple.fa with gaps=0.1.

    The expected output contains only the five FASTA header lines, i.e.
    every site has been trimmed.
    """
    trimmed_path = "output/simple.fa_gappy_codon"

    execute(
        input_file=f"{here.parent}/samples/simple.fa",
        output_file=trimmed_path,
        input_file_format="fasta",
        output_file_format="fasta",
        sequence_type=None,
        complement=False,
        codon=True,
        gaps=0.1,
        mode=TrimmingMode.gappy,
        use_log=False,
        gap_characters=DEFAULT_NT_GAP_CHARS,
        quiet=True,
    )

    with open(trimmed_path, "r") as handle:
        written = handle.read()

    assert ">1\n>2\n>3\n>4\n>5\n" == written
72 changes: 72 additions & 0 deletions tests/unit/test_msa.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,75 @@ def test_trim_by_provided_site_positions_list(self):
]
)
np.testing.assert_equal(msa.sites_kept, expected_sites_kept)

@pytest.mark.parametrize(
    "sites_to_trim, expected",
    [
        # Sites 0 and 2 are expected to leave the same three columns.
        (
            [0],
            np.array(
                [
                    ["T", "A", "T"],
                    ["-", "A", "T"],
                    ["-", "T", "A"],
                    ["-", "T", "A"],
                    ["-", "T", "-"],
                ]
            ),
        ),
        (
            [2],
            np.array(
                [
                    ["T", "A", "T"],
                    ["-", "A", "T"],
                    ["-", "T", "A"],
                    ["-", "T", "A"],
                    ["-", "T", "-"],
                ]
            ),
        ),
        # Sites 3 and 5 are expected to leave the same three columns.
        (
            [3],
            np.array(
                [
                    ["A", "-", "G"],
                    ["A", "-", "G"],
                    ["A", "-", "G"],
                    ["A", "G", "A"],
                    ["A", "C", "a"],
                ]
            ),
        ),
        (
            [5],
            np.array(
                [
                    ["A", "-", "G"],
                    ["A", "-", "G"],
                    ["A", "-", "G"],
                    ["A", "G", "A"],
                    ["A", "C", "a"],
                ]
            ),
        ),
        # Trimming sites 0-5 is expected to leave five empty rows.
        (
            [0, 1, 2, 3, 4, 5],
            np.array([[] for _ in range(5)], dtype=object),
        ),
    ],
)
def test_trim_codons(self, sites_to_trim, expected):
    """
    With codon=True, trimming explicit site positions should leave
    msa.trimmed equal to the expected array for each parametrized case.
    """
    alignment = get_biopython_msa("tests/unit/examples/simple.fa")
    codon_msa = MSA.from_bio_msa(alignment)
    codon_msa.trim(site_positions_to_trim=sites_to_trim, codon=True)
    np.testing.assert_equal(codon_msa.trimmed, expected)

0 comments on commit 5a18033

Please sign in to comment.