From bbf133d80a727b0b0203b28aee44b204d9ce386d Mon Sep 17 00:00:00 2001 From: Fengchao Date: Mon, 24 Jul 2017 19:13:47 +0800 Subject: [PATCH] Support more cross-linkers. --- .../java/proteomics/Index/BuildIndex.java | 37 +++++++++++++------ .../java/proteomics/TheoSeq/MassTool.java | 26 ++++++++----- src/main/resources/parameter.def | 2 +- .../java/proteomics/TheoSeq/MassToolTest.java | 4 +- 4 files changed, 45 insertions(+), 24 deletions(-) diff --git a/src/main/java/proteomics/Index/BuildIndex.java b/src/main/java/proteomics/Index/BuildIndex.java index 1799edc..d8a0971 100644 --- a/src/main/java/proteomics/Index/BuildIndex.java +++ b/src/main/java/proteomics/Index/BuildIndex.java @@ -19,6 +19,7 @@ public class BuildIndex { private static final float varModMassResolution = 0.01f; public final float linker_mass; + public final short linker_type; private final MassTool mass_tool_obj; private final Map pro_annotate_map; @@ -71,13 +72,25 @@ public BuildIndex(Map parameter_map) { fix_mod_map.put('n', Float.valueOf(parameter_map.get("n"))); fix_mod_map.put('c', Float.valueOf(parameter_map.get("c"))); - if (Math.abs(fix_mod_map.get('K') - fix_mod_map.get('n')) > 1e-6) { // todo improve - logger.error("K and N-term have different fixed modification. Exit."); + linker_type = Short.valueOf(parameter_map.get("cl_type")); + + // check the cross-linker and the fix modification + if (linker_type == 1) { + if (Math.abs(fix_mod_map.get('K') - fix_mod_map.get('n')) > 1e-6) { + linker_mass = 0; + logger.error("The link sites have different fix modifications."); + System.exit(1); + } else { + linker_mass = Float.valueOf(parameter_map.get("cl_mass")) - fix_mod_map.get('K'); + } + } else if (linker_type == 2) { + linker_mass = Float.valueOf(parameter_map.get("cl_mass")) - fix_mod_map.get('C'); + } else { + linker_mass = 0; + logger.error("The cross-linker type cannot be recognized."); System.exit(1); } - linker_mass = Float.valueOf(parameter_map.get("cl_mass")) - 2 * fix_mod_map.get('K'); // todo improve - // read protein database DbTool db_tool_obj = new DbTool(db_path); Map pro_seq_map = db_tool_obj.getProSeqMap(); @@ -124,7 +137,7 @@ public BuildIndex(Map parameter_map) { boolean proteinCTerm = seq_term_map.get(seq)[1]; // mod free - Set linkSiteSet = getLinkSiteSet(seq, proteinNTerm, proteinCTerm); + Set linkSiteSet = getLinkSiteSet(seq, proteinNTerm, proteinCTerm, linker_type); if (!linkSiteSet.isEmpty()) { float totalMass = (float) (mass_tool_obj.calResidueMass(seq) + MassTool.H2O); if (totalMass < max_precursor_mass - linker_mass) { @@ -226,7 +239,7 @@ private Map> buildSeqProMap(Map pro_seq_map, Set for_check_duplicate = new HashSet<>(); for (String pro_id : pro_seq_map.keySet()) { String pro_seq = pro_seq_map.get(pro_id); - Set seq_set = mass_tool_obj.buildChainSet(pro_seq); + Set seq_set = mass_tool_obj.buildChainSet(pro_seq, linker_type); for (String target_seq : seq_set) { if ((target_seq.length() >= min_chain_length) && (target_seq.length() <= max_chain_length) && !target_seq.contains("B") && !target_seq.contains("J") && !target_seq.contains("X") && !target_seq.contains("Z")) { if (!for_check_duplicate.contains(target_seq.replace("L", "I"))) { @@ -259,7 +272,7 @@ private Map> buildSeqProMap(Map pro_seq_map, for (String pro_id : pro_seq_map.keySet()) { String pro_seq = pro_seq_map.get(pro_id); String decoy_pro_seq = (new StringBuilder(pro_seq)).reverse().toString(); - Set decoy_seq_set = mass_tool_obj.buildChainSet(decoy_pro_seq); + Set decoy_seq_set = mass_tool_obj.buildChainSet(decoy_pro_seq, linker_type); for (String decoy_seq : decoy_seq_set) { if ((decoy_seq.length() >= min_chain_length) && (decoy_seq.length() <= max_chain_length) && !decoy_seq.contains("B") && !decoy_seq.contains("J") && !decoy_seq.contains("X") && !decoy_seq.contains("Z")) { if (!for_check_duplicate.contains(decoy_seq.replace("L", "I"))) { @@ -546,18 +559,20 @@ private Set checkKCTermMod(Set varSeqSet) { // eliminate those s } } - private Set getLinkSiteSet(String seq, boolean n_term, boolean c_term) { + private Set getLinkSiteSet(String seq, boolean n_term, boolean c_term, short linker_type) { AA[] aa_list = MassTool.seqToAAList(seq); Set output = new HashSet<>(5, 1); for (int i = 1; i < aa_list.length - 2; ++i) { - if (aa_list[i].aa == 'K' && (Math.abs(aa_list[i].delta_mass) < varModMassResolution)) { + if (linker_type == 1 && aa_list[i].aa == 'K' && (Math.abs(aa_list[i].delta_mass) < varModMassResolution)) { + output.add((short) i); + } else if (linker_type == 2 && aa_list[i].aa == 'C' && (Math.abs(aa_list[i].delta_mass) < varModMassResolution)) { output.add((short) i); } } - if (n_term && !output.contains((short) 1) && (Math.abs(aa_list[0].delta_mass) < varModMassResolution)) { + if (linker_type == 1 && n_term && !output.contains((short) 1) && (Math.abs(aa_list[0].delta_mass) < varModMassResolution)) { output.add((short) 0); } - if (c_term && aa_list[aa_list.length - 2].aa == 'K' && (Math.abs(aa_list[aa_list.length - 2].delta_mass) < varModMassResolution)) { + if (linker_type == 1 && c_term && aa_list[aa_list.length - 2].aa == 'K' && (Math.abs(aa_list[aa_list.length - 2].delta_mass) < varModMassResolution)) { output.add((short) (aa_list.length - 2)); } return output; diff --git a/src/main/java/proteomics/TheoSeq/MassTool.java b/src/main/java/proteomics/TheoSeq/MassTool.java index 31e5b74..0a1ae36 100644 --- a/src/main/java/proteomics/TheoSeq/MassTool.java +++ b/src/main/java/proteomics/TheoSeq/MassTool.java @@ -71,29 +71,35 @@ public float calResidueMass(String seq) { // n and c are also AA. return (float) total_mass; } - public Set buildChainSet(String pro_seq) { + public Set buildChainSet(String pro_seq, short linker_type) { Map> digest_range_map = digestTrypsin(pro_seq); Set chain_seq_set = new HashSet<>(); for (int i = 0; i <= missed_cleavage; ++i) { for (int[] digest_range_1 : digest_range_map.get(i)) { String sub_string = pro_seq.substring(digest_range_1[0], digest_range_1[1]); - if (sub_string.substring(0, sub_string.length() - 1).contains("K")) { - // If there is a K in middle, this peptide is a chain. + if (linker_type == 1 && sub_string.substring(0, sub_string.length() - 1).contains("K")) { + chain_seq_set.add("n" + sub_string + "c"); + } else if (linker_type == 2 && sub_string.substring(0, sub_string.length() - 1).contains("C")) { chain_seq_set.add("n" + sub_string + "c"); } + if (digest_range_1[1] == pro_seq.length()) { - // This is the end of the protein. No digestion site, so "K" in any position including C-term can be linked. - if (sub_string.contains("K")) { + // This is the end of the protein. No digestion site, so the link-sites in any position including C-term can be linked. + if (linker_type == 1 && sub_string.contains("K")) { + chain_seq_set.add("n" + sub_string + "c"); + } else if (linker_type == 2 && sub_string.contains("C")) { chain_seq_set.add("n" + sub_string + "c"); } } } - // Add N-term peptide - if (digest_range_map.get(i).size() > 0) { - int[] digest_range = digest_range_map.get(i).get(0); - String sub_string = pro_seq.substring(digest_range[0], digest_range[1]); - chain_seq_set.add("n" + sub_string + "c"); + if (linker_type == 1) { + // Add N-term peptide + if (digest_range_map.get(i).size() > 0) { + int[] digest_range = digest_range_map.get(i).get(0); + String sub_string = pro_seq.substring(digest_range[0], digest_range[1]); + chain_seq_set.add("n" + sub_string + "c"); + } } } return chain_seq_set; diff --git a/src/main/resources/parameter.def b/src/main/resources/parameter.def index c9bb146..e78e83b 100644 --- a/src/main/resources/parameter.def +++ b/src/main/resources/parameter.def @@ -25,8 +25,8 @@ mz_bin_size = 0.02 mz_bin_offset = 0 # Cross-linking parameter. -# Cross-linking site can not be modified. cl_mass = 138.0680796 +cl_type = 1 # 1 = Kn-Kn; 2 = C-C # Var modification # format: diff --git a/src/test/java/proteomics/TheoSeq/MassToolTest.java b/src/test/java/proteomics/TheoSeq/MassToolTest.java index 2735974..fc5edd7 100644 --- a/src/test/java/proteomics/TheoSeq/MassToolTest.java +++ b/src/test/java/proteomics/TheoSeq/MassToolTest.java @@ -60,7 +60,7 @@ public void mzToBin() throws Exception { public void buildChainSet() throws Exception { // 1 missed-cleavage, N-term linkable MassTool mass_tool_obj = new MassTool(1, fix_mod_map, "KR", "P", 1.0005f, 0.6f); - Set result = mass_tool_obj.buildChainSet("MRGFASSASRIATAAAASKPSLNASTSVNPKLSKTMDYMRIFSVFVVTLWIIRVDARVFKTY"); + Set result = mass_tool_obj.buildChainSet("MRGFASSASRIATAAAASKPSLNASTSVNPKLSKTMDYMRIFSVFVVTLWIIRVDARVFKTY", (short) 1); Set ground_truth = new HashSet<>(); ground_truth.add("nMRc"); ground_truth.add("nMRGFASSASRc"); @@ -73,7 +73,7 @@ public void buildChainSet() throws Exception { // 2 missed-cleavage, N-term linkable mass_tool_obj = new MassTool(2, fix_mod_map, "KR", "P", 1.0005f, 0.6f); - result = mass_tool_obj.buildChainSet("MRGFASSASRIATAAAASKPSLNASTSVNPKLSKTMDYMRIFSVFVVTLWIIRVDARVFKTY"); + result = mass_tool_obj.buildChainSet("MRGFASSASRIATAAAASKPSLNASTSVNPKLSKTMDYMRIFSVFVVTLWIIRVDARVFKTY", (short) 1); ground_truth = new HashSet<>(); ground_truth.add("nMRc"); ground_truth.add("nIATAAAASKPSLNASTSVNPKc");