From 418d3143b3ec37541039c7f4d25fe9f641b758b0 Mon Sep 17 00:00:00 2001 From: Fengchao Date: Wed, 31 Jan 2018 14:56:35 +0800 Subject: [PATCH] Fix a serious bug in CalEValue and improve CalEValue a lot. --- .../java/proteomics/Search/CalEValue.java | 68 +++++++++++-------- src/main/java/proteomics/Search/Search.java | 25 ++++--- .../java/proteomics/Search/SearchWrap.java | 7 +- .../proteomics/Types/ChainResultEntry.java | 14 ---- .../java/proteomics/Types/ResultEntry.java | 6 +- 5 files changed, 65 insertions(+), 55 deletions(-) diff --git a/src/main/java/proteomics/Search/CalEValue.java b/src/main/java/proteomics/Search/CalEValue.java index dfb0170..d990871 100644 --- a/src/main/java/proteomics/Search/CalEValue.java +++ b/src/main/java/proteomics/Search/CalEValue.java @@ -4,13 +4,13 @@ import org.slf4j.LoggerFactory; import proteomics.ECL2; import proteomics.Index.BuildIndex; +import proteomics.TheoSeq.MassTool; import proteomics.Types.*; import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException; -import java.util.Locale; -import java.util.TreeMap; +import java.util.*; public class CalEValue { @@ -18,19 +18,13 @@ public class CalEValue { private static final float maxTolerance = 20; private static final float toleranceStep = 1; - private ResultEntry result_entry; - private BuildIndex buildIndexObj; - private float linker_mass; - - CalEValue(int scan_num, ResultEntry result_entry, BuildIndex buildIndexObj, float linker_mass, float originalTolerance) throws IOException { - this.result_entry = result_entry; - this.buildIndexObj = buildIndexObj; - this.linker_mass = linker_mass; - + static void calEValue(int scan_num, ResultEntry result_entry, BuildIndex buildIndexObj, TreeMap> binScoresMap, float linker_mass, float originalTolerance, SparseVector xcorrPL, double singleChainT) throws IOException { int gap_num = ECL2.score_point_t - result_entry.getScoreCount(); float tolerance = originalTolerance; + float massWithoutLinker = result_entry.spectrum_mass - linker_mass; + int maxBinIdx = buildIndexObj.massToBin(massWithoutLinker * 0.5f); while (gap_num > 0 && tolerance <= maxTolerance) { - gap_num = generateRandomRandomScores(gap_num, tolerance, toleranceStep, result_entry.getBinChainMap()); + gap_num = generateRandomRandomScores(gap_num, tolerance, toleranceStep, binScoresMap, result_entry.spectrum_mass, massWithoutLinker, result_entry.charge, xcorrPL, buildIndexObj, buildIndexObj.getMassBinSeqMap(), buildIndexObj.getSeqEntryMap(), buildIndexObj.returnMassTool(), result_entry, maxBinIdx, singleChainT); tolerance += toleranceStep; } @@ -102,7 +96,7 @@ public class CalEValue { if (null_end_idx > 3 * inverseHistogramBinSize) { start_idx = Math.max(min_nonzero_idx, (int) (0.75 * null_end_idx)); } else { - start_idx = Math.max(min_nonzero_idx, (int) ( 0.5 * null_end_idx)); + start_idx = Math.max(min_nonzero_idx, (int) (0.5 * null_end_idx)); } // linear regression @@ -195,22 +189,21 @@ public class CalEValue { } } - private int generateRandomRandomScores(int gap_num, float tolerance, float toleranceStep, TreeMap binChainMap) { - int maxBinIdx = buildIndexObj.massToBin((result_entry.spectrum_mass - linker_mass) * 0.5f); - for (int binIdx1 : binChainMap.keySet()) { + private static int generateRandomRandomScores(int gap_num, float tolerance, float toleranceStep, TreeMap> binScoresMap, float precursorMass, float massWithoutLinker, int precursorCharge, SparseVector xcorrPL, BuildIndex buildIndex, TreeMap> binSequencesMap, Map seqEntryMap, MassTool massTool, ResultEntry resultEntry, int maxBinIdx, double singleChainT) { + for (int binIdx1 : binSequencesMap.keySet()) { if (binIdx1 < maxBinIdx) { - int leftBinIdx1 = buildIndexObj.massToBin(result_entry.spectrum_mass - linker_mass - tolerance - toleranceStep) - maxBinIdx; - int rightBinIdx1 = buildIndexObj.massToBin(result_entry.spectrum_mass - linker_mass - tolerance) - maxBinIdx - 1; - int leftBinIdx2 = buildIndexObj.massToBin(result_entry.spectrum_mass - linker_mass + tolerance) - maxBinIdx + 1; - int rightBinIdx2 = buildIndexObj.massToBin(result_entry.spectrum_mass - linker_mass + tolerance + toleranceStep) - maxBinIdx; - TreeMap sub_map = new TreeMap<>(); - sub_map.putAll(binChainMap.subMap(leftBinIdx1, true, rightBinIdx1, false)); - sub_map.putAll(binChainMap.subMap(leftBinIdx2, false, rightBinIdx2, true)); - if (!sub_map.isEmpty()) { - for (double score1 : binChainMap.get(binIdx1).getScoreList()) { - for (int binIdx2 : sub_map.keySet()) { - for (double score2 : binChainMap.get(binIdx2).getScoreList()) { - result_entry.addToScoreHistogram(score1 + score2); + int leftBinIdx1 = buildIndex.massToBin(massWithoutLinker - tolerance - toleranceStep) - binIdx1; + int rightBinIdx1 = buildIndex.massToBin(massWithoutLinker - tolerance) - binIdx1; + int leftBinIdx2 = buildIndex.massToBin(massWithoutLinker + tolerance) - binIdx1; + int rightBinIdx2 = buildIndex.massToBin(massWithoutLinker + tolerance + toleranceStep) - binIdx1; + TreeMap> subMap = new TreeMap<>(); + subMap.putAll(binSequencesMap.subMap(leftBinIdx1, true, rightBinIdx1, false)); + subMap.putAll(binSequencesMap.subMap(leftBinIdx2, false, rightBinIdx2, true)); + if (!subMap.isEmpty()) { + for (double score1 : subFunction(binIdx1, binScoresMap, binSequencesMap, seqEntryMap, massTool, precursorMass, precursorCharge, xcorrPL, singleChainT)) { + for (int binIdx2 : subMap.keySet()) { + for (double score2 : subFunction(binIdx2, binScoresMap, binSequencesMap, seqEntryMap, massTool, precursorMass, precursorCharge, xcorrPL, singleChainT)) { + resultEntry.addToScoreHistogram(score1 + score2); --gap_num; if (gap_num <= 0) { return gap_num; @@ -223,4 +216,23 @@ private int generateRandomRandomScores(int gap_num, float tolerance, float toler } return gap_num; } + + private static List subFunction(int binIdx, TreeMap> binScoresMap, TreeMap> binSequencesMap, Map seqEntryMap, MassTool massTool, float precursorMass, int precursorCharge, SparseVector xcorrPL, double singleChainT) { + List scoreList = new ArrayList<>(); + if (binScoresMap.containsKey(binIdx)) { + scoreList = binScoresMap.get(binIdx); + } else { + for (String seq : binSequencesMap.get(binIdx)) { + ChainEntry chainEntry = seqEntryMap.get(seq); + for (short linkSite : chainEntry.link_site_set) { + double xcorr = massTool.generateTheoFragmentAndCalXCorr(seq, linkSite, precursorMass - chainEntry.chain_mass, precursorCharge, xcorrPL); + if (xcorr > singleChainT) { + scoreList.add(xcorr); + } + } + } + binScoresMap.put(binIdx, scoreList); + } + return scoreList; + } } diff --git a/src/main/java/proteomics/Search/Search.java b/src/main/java/proteomics/Search/Search.java index eae3a27..19a2e05 100644 --- a/src/main/java/proteomics/Search/Search.java +++ b/src/main/java/proteomics/Search/Search.java @@ -20,7 +20,7 @@ public class Search { private final TreeMap> bin_seq_map; private final BuildIndex build_index_obj; private final int[] C13_correction_range; - private final float single_chain_t; + final float single_chain_t; private final boolean cal_evalue; /////////////////////////////////////////public methods//////////////////////////////////////////////////////////// @@ -47,7 +47,7 @@ public Search(BuildIndex build_index_obj, Map parameter_map) { Arrays.sort(C13_correction_range); } - ResultEntry doSearch(SpectrumEntry spectrumEntry, SparseVector xcorrPL) throws IOException { + ResultEntry doSearch(SpectrumEntry spectrumEntry, SparseVector xcorrPL, TreeMap> binScoresMap) throws IOException { int max_chain_bin_idx = Math.min(build_index_obj.massToBin(spectrumEntry.mass_without_linker_mass + C13_correction_range[C13_correction_range.length - 1] * 1.00335483f) + 1 - bin_seq_map.firstKey(), bin_seq_map.lastKey()); int min_chain_bin_idx = bin_seq_map.firstKey(); @@ -98,7 +98,7 @@ ResultEntry doSearch(SpectrumEntry spectrumEntry, SparseVector xcorrPL) throws I // This bin hasn't been visited. Linear scan first. for (String seq : bin_seq_map.get(idx_1)) { ChainEntry chainEntry = chain_entry_map.get(seq); - linearScan(spectrumEntry, xcorrPL, chainEntry, idx_1, binChainMap, debugEntryList, devChainScoreMap); + linearScan(spectrumEntry, xcorrPL, chainEntry, idx_1, binChainMap, binScoresMap, debugEntryList, devChainScoreMap); } checkedBinSet.add(idx_1); } @@ -110,7 +110,7 @@ ResultEntry doSearch(SpectrumEntry spectrumEntry, SparseVector xcorrPL) throws I // this bin hasn't been visited. Linear scan first. for (String seq : bin_seq_map.get(idx_2)) { ChainEntry chainEntry = chain_entry_map.get(seq); - linearScan(spectrumEntry, xcorrPL, chainEntry, idx_2, binChainMap, debugEntryList, devChainScoreMap); + linearScan(spectrumEntry, xcorrPL, chainEntry, idx_2, binChainMap, binScoresMap, debugEntryList, devChainScoreMap); } checkedBinSet.add(idx_2); } @@ -145,8 +145,8 @@ ResultEntry doSearch(SpectrumEntry spectrumEntry, SparseVector xcorrPL) throws I } if (cal_evalue && (resultEntry.getScoreCount() < ECL2.score_point_t)) { - for (double s1 : chain_score_entry_1.getScoreList()) { - for (double s2 : chain_score_entry_2.getScoreList()) { + for (double s1 : binScoresMap.get(idx_1)) { + for (double s2 : binScoresMap.get(idx_2)) { resultEntry.addToScoreHistogram(s1 + s2); } } @@ -273,7 +273,7 @@ public FinalResultEntry convertResultEntry(int scanNum, ResultEntry result_entry return new FinalResultEntry(scanNum, result_entry.spectrum_id, rank, result_entry.charge, result_entry.spectrum_mz, result_entry.spectrum_mass, theo_mass, result_entry.rt, ppm, result_entry.getScore(), delta_c, final_seq_1, result_entry.getLinkSite1(), String.join(";", pro1Set), final_seq_2, result_entry.getLinkSite2(), String.join(";", pro2Set), cl_type, hit_type, C13_Diff_num, result_entry.getEValue(), result_entry.getScoreCount(), result_entry.getRSquare(), result_entry.getSlope(), result_entry.getIntercept(), result_entry.getStartIdx(), result_entry.getEndIdx(), result_entry.getChainScore1(), result_entry.getChainRank1(), result_entry.getChainScore2(), result_entry.getChainRank2(), result_entry.getCandidateNum(), cal_evalue, spectrumEntry.mgfTitle); } - private void linearScan(SpectrumEntry spectrumEntry, SparseVector xcorrPL, ChainEntry chainEntry, int binInx, TreeMap binChainMap, List debugEntryList, Map devChainScoreMap) { + private void linearScan(SpectrumEntry spectrumEntry, SparseVector xcorrPL, ChainEntry chainEntry, int binInx, TreeMap binChainMap, TreeMap> binScoresMap, List debugEntryList, Map devChainScoreMap) { for (short link_site_1 : chainEntry.link_site_set) { // generate theoretical fragment ion bins and calculate XCorr. double dot_product = mass_tool_obj.generateTheoFragmentAndCalXCorr(chainEntry.seq, link_site_1, spectrumEntry.precursor_mass - chainEntry.chain_mass, spectrumEntry.precursor_charge, xcorrPL); @@ -288,9 +288,17 @@ private void linearScan(SpectrumEntry spectrumEntry, SparseVector xcorrPL, Chain // Record result if (dot_product > single_chain_t) { + if (cal_evalue) { + if (binScoresMap.containsKey(binInx)) { + binScoresMap.get(binInx).add(dot_product); + } else { + List tempList = new ArrayList<>(); + tempList.add(dot_product); + binScoresMap.put(binInx, tempList); + } + } if (binChainMap.containsKey(binInx)) { ChainResultEntry chain_result_entry = binChainMap.get(binInx); - chain_result_entry.addToScoreList(dot_product, cal_evalue); if (dot_product > chain_result_entry.getScore()) { chain_result_entry.setSecondScore(chain_result_entry.getScore()); chain_result_entry.setSecondSeq(chain_result_entry.getSeq()); @@ -303,7 +311,6 @@ private void linearScan(SpectrumEntry spectrumEntry, SparseVector xcorrPL, Chain } } else { ChainResultEntry chain_result_entry = new ChainResultEntry(); - chain_result_entry.addToScoreList(dot_product, cal_evalue); chain_result_entry.setSeq(chainEntry.seq); chain_result_entry.setLinkSite(link_site_1); chain_result_entry.setScore(dot_product); diff --git a/src/main/java/proteomics/Search/SearchWrap.java b/src/main/java/proteomics/Search/SearchWrap.java index e2bf58c..f512245 100644 --- a/src/main/java/proteomics/Search/SearchWrap.java +++ b/src/main/java/proteomics/Search/SearchWrap.java @@ -9,8 +9,10 @@ import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException; +import java.util.List; import java.util.Map; import java.util.Set; +import java.util.TreeMap; import java.util.concurrent.Callable; public class SearchWrap implements Callable { @@ -44,7 +46,8 @@ public FinalResultEntry call() throws IOException { } writer.close(); } - ResultEntry resultEntry = search_obj.doSearch(spectrumEntry, xcorrPL); + TreeMap> binScoresMap = new TreeMap<>(); + ResultEntry resultEntry = search_obj.doSearch(spectrumEntry, xcorrPL, binScoresMap); if (resultEntry != null) { if (1 - (resultEntry.getSecondScore() / resultEntry.getScore()) >= delta_c_t) { if (cal_evalue) { @@ -54,7 +57,7 @@ public FinalResultEntry call() throws IOException { } else { originalTolerance = search_obj.ms1_tolerance; } - new CalEValue(spectrumEntry.scan_num, resultEntry, build_index_obj, build_index_obj.linker_mass, originalTolerance); + CalEValue.calEValue(spectrumEntry.scan_num, resultEntry, build_index_obj, binScoresMap, build_index_obj.linker_mass, originalTolerance, xcorrPL, search_obj.single_chain_t); if (resultEntry.getEValue() != 9999) { return search_obj.convertResultEntry(spectrumEntry.scan_num, resultEntry, seqProMap, spectrumEntry); } else { diff --git a/src/main/java/proteomics/Types/ChainResultEntry.java b/src/main/java/proteomics/Types/ChainResultEntry.java index c86af0f..9cce012 100644 --- a/src/main/java/proteomics/Types/ChainResultEntry.java +++ b/src/main/java/proteomics/Types/ChainResultEntry.java @@ -1,8 +1,5 @@ package proteomics.Types; -import java.util.LinkedList; -import java.util.List; - public class ChainResultEntry implements Comparable{ private String seq; @@ -12,7 +9,6 @@ public class ChainResultEntry implements Comparable{ private int link_site; private double score; private double second_score; - private List score_list = new LinkedList<>(); // contains all chain score for generating the random histogram public ChainResultEntry() {} @@ -26,12 +22,6 @@ public void setSecondSeq(String second_seq) { secondPtmFreeSeq = second_seq.replaceAll("[^A-Znc]", ""); } - public void addToScoreList(double score, boolean cal_evalue) { - if (cal_evalue) { - score_list.add(score); - } - } - public void setLinkSite(int link_site) { this.link_site = link_site; } @@ -81,8 +71,4 @@ public int compareTo(ChainResultEntry other) { return 0; } } - - public List getScoreList() { - return score_list; - } } diff --git a/src/main/java/proteomics/Types/ResultEntry.java b/src/main/java/proteomics/Types/ResultEntry.java index 265c594..a3333a0 100644 --- a/src/main/java/proteomics/Types/ResultEntry.java +++ b/src/main/java/proteomics/Types/ResultEntry.java @@ -88,8 +88,10 @@ public void setCandidateNum(long candidate_num) { public void addToScoreHistogram(double score) { try { - ++score_histogram[(int) Math.round(score * inverseHistogramBinSize)]; - ++score_count; + if (score > 0) { + ++score_histogram[(int) Math.round(score * inverseHistogramBinSize)]; + ++score_count; + } } catch (ArrayIndexOutOfBoundsException ex) { logger.warn("Score {} is out of the range [0, {}].", score, max_score); }