diff --git a/src/main/java/proteomics/ECL2.java b/src/main/java/proteomics/ECL2.java index 2ab7e61..b238b60 100644 --- a/src/main/java/proteomics/ECL2.java +++ b/src/main/java/proteomics/ECL2.java @@ -23,7 +23,6 @@ public class ECL2 { public static final boolean cal_evalue = true; public static final double delta_c_t = 0; public static final int score_point_t = 15000; - public static final int random_score_point_t = 2; public static final boolean flankingPeaks = true; private static final Logger logger = LoggerFactory.getLogger(ECL2.class); diff --git a/src/main/java/proteomics/Index/BuildIndex.java b/src/main/java/proteomics/Index/BuildIndex.java index c33e9c2..be06adc 100644 --- a/src/main/java/proteomics/Index/BuildIndex.java +++ b/src/main/java/proteomics/Index/BuildIndex.java @@ -3,7 +3,6 @@ import org.apache.commons.math3.util.CombinatoricsUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import proteomics.ECL2; import proteomics.TheoSeq.DbTool; import proteomics.TheoSeq.MassTool; import proteomics.Types.AA; @@ -31,7 +30,6 @@ public class BuildIndex { private Map bin_candidate_num_map = new HashMap<>(); private Map seq_entry_map = new HashMap<>(); private Map seq_term_map = new HashMap<>(); - private TreeMap> uniprot_decoy_mass_seq_map = new TreeMap<>(); private Set for_check_duplicate = new HashSet<>(); private Map> seqProMap; @@ -170,41 +168,6 @@ public BuildIndex(Map parameter_map) { } bin_candidate_num_map.put(bin_index, candidate_num); } - - // read and index large decoy uniprot database - if (ECL2.cal_evalue) { - db_tool_obj = new DbTool(uniprot_decoy_db); - Map uniprot_decoy_pro_seq_map = db_tool_obj.getProSeqMap(); - for (String pro_seq : uniprot_decoy_pro_seq_map.values()) { - Set seq_set = mass_tool_obj.buildChainSet(pro_seq); - for (String mod_free_seq : seq_set) { - if ((mod_free_seq.length() >= min_chain_length) && (mod_free_seq.length() <= max_chain_length)) { - if (!mod_free_seq.contains("B") && !mod_free_seq.contains("J") && !mod_free_seq.contains("X") && !mod_free_seq.contains("Z")) { - String temp_str = mod_free_seq.replace("L", "!").replace("I", "!").replace("K", "#").replace("Q", "#"); - if (!for_check_duplicate.contains(temp_str)) { - for_check_duplicate.add(temp_str); - int temp_int = mod_free_seq.indexOf('K'); // only use the first K - if ((temp_int != -1) && (temp_int != mod_free_seq.length() - 2)) { // only consider chains whose link site is in the middle. - // consider PTM free - float mass = (float) (mass_tool_obj.calResidueMass(mod_free_seq) + MassTool.H2O); - if (mass < max_precursor_mass - linker_mass) { - if (uniprot_decoy_mass_seq_map.containsKey(mass)) { - if (uniprot_decoy_mass_seq_map.get(mass).size() < ECL2.random_score_point_t) { // limit the size of set for the sake of memory. - uniprot_decoy_mass_seq_map.get(mass).add(mod_free_seq + "-" + temp_int); - } - } else { - Set temp = new HashSet<>(); - temp.add(mod_free_seq + "-" + temp_int); - uniprot_decoy_mass_seq_map.put(mass, temp); - } - } - } - } - } - } - } - } - } } public Map> getSeqProMap() { @@ -243,10 +206,6 @@ public int massToBin(float mass) { return (int) Math.floor(mass / ms1_bin_size); } - public TreeMap> getUniprotDecoyMassSeqMap() { - return uniprot_decoy_mass_seq_map; - } - public Map getBinCandidateNumMap() { return bin_candidate_num_map; } diff --git a/src/main/java/proteomics/Search/CalEValue.java b/src/main/java/proteomics/Search/CalEValue.java index 58a3776..2465b40 100644 --- a/src/main/java/proteomics/Search/CalEValue.java +++ b/src/main/java/proteomics/Search/CalEValue.java @@ -3,7 +3,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import proteomics.ECL2; +import proteomics.Index.BuildIndex; import proteomics.TheoSeq.MassTool; +import proteomics.Types.ChainEntry; import proteomics.Types.ResultEntry; import proteomics.Types.SparseBooleanVector; import proteomics.Types.SparseVector; @@ -11,36 +13,41 @@ import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException; -import java.util.NavigableMap; +import java.util.Map; import java.util.Set; import java.util.TreeMap; public class CalEValue { private static final Logger logger = LoggerFactory.getLogger(CalEValue.class); + private static final float maxTolerance = 20; private ResultEntry result_entry; - private TreeMap> uniprot_decoy_mass_seq_map; + private TreeMap> bin_seq_map; + private Map seq_entry_map; + private BuildIndex buildIndexObj; private float linker_mass; private MassTool mass_tool_obj; private int max_common_ion_charge; private SparseVector pl_map_xcorr; - private int maxBinIdx; - private float e_value_precursor_mass_tol; + private int specMaxBinIdx; - CalEValue(int scan_num, ResultEntry result_entry, SparseVector pl_map_xcorr, int maxBinIdx, TreeMap> uniprot_decoy_mass_seq_map, MassTool mass_tool_obj, float linker_mass, int max_common_ion_charge, float e_value_precursor_mass_tol) { + CalEValue(int scan_num, ResultEntry result_entry, SparseVector pl_map_xcorr, int specMaxBinIdx, BuildIndex buildIndexObj, MassTool mass_tool_obj, float linker_mass, int max_common_ion_charge, float originalTolerance) { this.result_entry = result_entry; - this.uniprot_decoy_mass_seq_map = uniprot_decoy_mass_seq_map; + this.bin_seq_map = buildIndexObj.getMassBinSeqMap(); + this.seq_entry_map = buildIndexObj.getSeqEntryMap(); + this.buildIndexObj = buildIndexObj; this.linker_mass = linker_mass; this.mass_tool_obj = mass_tool_obj; this.max_common_ion_charge = max_common_ion_charge; this.pl_map_xcorr = pl_map_xcorr; - this.maxBinIdx = maxBinIdx; - this.e_value_precursor_mass_tol = e_value_precursor_mass_tol; + this.specMaxBinIdx = specMaxBinIdx; int gap_num = ECL2.score_point_t - result_entry.getScoreCount(); - if (gap_num >= 0) { - generateRandomRandomScores(gap_num); + float tolerance = originalTolerance; + while (gap_num > 0 && tolerance <= maxTolerance) { + gap_num = generateRandomRandomScores(gap_num, tolerance, tolerance + 1); + tolerance += 1; } if (gap_num > 0) { @@ -215,25 +222,40 @@ public class CalEValue { } } - private int generateRandomRandomScores(int gap_num) { - float precursor_mass = result_entry.spectrum_mass; - float max_mass = (precursor_mass - linker_mass) / 2; - for (float mass_1 : uniprot_decoy_mass_seq_map.keySet()) { - if (mass_1 <= max_mass) { - float left_mass = precursor_mass - linker_mass - mass_1 - e_value_precursor_mass_tol; - float right_mass = precursor_mass - linker_mass - mass_1 + e_value_precursor_mass_tol; - NavigableMap> sub_map = uniprot_decoy_mass_seq_map.subMap(left_mass, true, right_mass, true); + private int generateRandomRandomScores(int gap_num, float previousTolerance, float tolerance) { + int maxBinIdx = buildIndexObj.massToBin((result_entry.spectrum_mass - linker_mass) / 2); + for (int binIdx1 : bin_seq_map.keySet()) { + if (binIdx1 < maxBinIdx) { + int leftBinIdx1 = buildIndexObj.massToBin(result_entry.spectrum_mass - linker_mass - previousTolerance - tolerance) - maxBinIdx; + int rightBinIdx1 = buildIndexObj.massToBin(result_entry.spectrum_mass - linker_mass - previousTolerance) - maxBinIdx - 1; + int leftBinIdx2 = buildIndexObj.massToBin(result_entry.spectrum_mass - linker_mass + previousTolerance) - maxBinIdx + 1; + int rightBinIdx2 = buildIndexObj.massToBin(result_entry.spectrum_mass - linker_mass + previousTolerance + tolerance) - maxBinIdx; + TreeMap> sub_map = new TreeMap<>(); + sub_map.putAll(bin_seq_map.subMap(leftBinIdx1, true, rightBinIdx1, false)); + sub_map.putAll(bin_seq_map.subMap(leftBinIdx2, false, rightBinIdx2, true)); if (!sub_map.isEmpty()) { - for (String seq_1_link_site : uniprot_decoy_mass_seq_map.get(mass_1)) { - String[] temp = seq_1_link_site.split("-"); - String seq_1 = temp[0]; - short link_site = Short.valueOf(temp[1]); - SparseBooleanVector theo_mz_1 = mass_tool_obj.buildTheoVector(seq_1, link_site, precursor_mass - mass_1, result_entry.charge, max_common_ion_charge, maxBinIdx); - double score1 = theo_mz_1.dot(pl_map_xcorr) * 0.005; - if (score1 > Search.single_chain_t) { - gap_num = generateMoreScoresSub(sub_map, seq_1, score1, precursor_mass, gap_num); - if (gap_num < 0) { - return gap_num; + for (String seq1 : bin_seq_map.get(binIdx1)) { + ChainEntry chainEntry1 = seq_entry_map.get(seq1); + for (short linkSite1 : chainEntry1.link_site_set) { + SparseBooleanVector theoMz1 = mass_tool_obj.buildTheoVector(seq1, linkSite1, result_entry.spectrum_mass - chainEntry1.chain_mass, result_entry.charge, max_common_ion_charge, specMaxBinIdx); + double score1 = theoMz1.dot(pl_map_xcorr) * 0.005; + if (score1 > Search.single_chain_t) { + for (int binIdx2 : sub_map.keySet()) { + for (String seq2 : sub_map.get(binIdx2)) { + ChainEntry chainEntry2 = seq_entry_map.get(seq2); + for (short linkSite2 : chainEntry2.link_site_set) { + SparseBooleanVector theoMz2 = mass_tool_obj.buildTheoVector(seq2, linkSite2, result_entry.spectrum_mass - chainEntry2.chain_mass, result_entry.charge, max_common_ion_charge, specMaxBinIdx); + double score2 = theoMz2.dot(pl_map_xcorr) * 0.0005; + if (score2 > Search.single_chain_t) { + result_entry.addToScoreHistogram(score1 + score2); + --gap_num; + if (gap_num <= 0) { + return gap_num; + } + } + } + } + } } } } @@ -242,27 +264,4 @@ private int generateRandomRandomScores(int gap_num) { } return gap_num; } - - private int generateMoreScoresSub(NavigableMap> sub_map, String seq_1, double score1, float precursor_mass, int gap_num) { - for (float mass_2 : sub_map.keySet()) { - for (String seq_2_link_site : sub_map.get(mass_2)) { - String[] temp = seq_2_link_site.split("-"); - String seq_2 = temp[0]; - short link_site = Short.valueOf(temp[1]); - if (!seq_1.contentEquals(seq_2)) { - SparseBooleanVector theo_mz_2 = mass_tool_obj.buildTheoVector(seq_2, link_site, precursor_mass - mass_2, result_entry.charge, max_common_ion_charge, maxBinIdx); - double score2 = theo_mz_2.dot(pl_map_xcorr) * 0.005; - if (score2 > Search.single_chain_t) { - double score = score1 + score2; - result_entry.addToScoreHistogram(score); - --gap_num; - if (gap_num < 0) { - return gap_num; - } - } - } - } - } - return gap_num; - } } diff --git a/src/main/java/proteomics/Search/SearchWrap.java b/src/main/java/proteomics/Search/SearchWrap.java index 8d3a41f..7fbb995 100644 --- a/src/main/java/proteomics/Search/SearchWrap.java +++ b/src/main/java/proteomics/Search/SearchWrap.java @@ -75,13 +75,13 @@ public FinalResultEntry call() { } if (1 - (resultEntry.getSecondScore() / resultEntry.getScore()) > ECL2.delta_c_t) { if (ECL2.cal_evalue) { - float e_value_precursor_mass_tol; + float originalTolerance; if (search_obj.ms1_tolerance_unit == 1) { - e_value_precursor_mass_tol = resultEntry.spectrum_mass * search_obj.ms1_tolerance * 1e-6f; + originalTolerance = resultEntry.spectrum_mass * search_obj.ms1_tolerance * 1e-6f; } else { - e_value_precursor_mass_tol = search_obj.ms1_tolerance; + originalTolerance = search_obj.ms1_tolerance; } - new CalEValue(spectrumEntry.scan_num, resultEntry, xcorrPL, maxBinIdx, build_index_obj.getUniprotDecoyMassSeqMap(), mass_tool_obj, build_index_obj.linker_mass, max_common_ion_charge, e_value_precursor_mass_tol); + new CalEValue(spectrumEntry.scan_num, resultEntry, xcorrPL, specMaxBinIdx, build_index_obj, mass_tool_obj, build_index_obj.linker_mass, max_common_ion_charge, originalTolerance); if (resultEntry.getEValue() != 9999) { return search_obj.convertResultEntry(spectrumEntry.scan_num, resultEntry, seqProMap); } else {