From c52797347b4d2f8cca502909d6e7b71e0a954cdc Mon Sep 17 00:00:00 2001 From: Fengchao Date: Fri, 19 May 2017 15:53:55 +0800 Subject: [PATCH] Finding the start and end points in e-value estimation based on Comet's code. --- pom.xml | 2 +- src/main/java/proteomics/ECL2.java | 2 +- .../java/proteomics/Search/CalEValue.java | 63 +++++++++++++++---- .../java/proteomics/Types/ResultEntry.java | 2 +- 4 files changed, 54 insertions(+), 15 deletions(-) diff --git a/pom.xml b/pom.xml index ff17e9f..fa697a2 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ hk.ust.bioinformatics ECL2 - 2.1.3 + 2.1.3-try-comet-evalue jar ECL2 diff --git a/src/main/java/proteomics/ECL2.java b/src/main/java/proteomics/ECL2.java index c5f9ec6..898e044 100644 --- a/src/main/java/proteomics/ECL2.java +++ b/src/main/java/proteomics/ECL2.java @@ -26,7 +26,7 @@ public class ECL2 { public static final int random_score_point_t = 2; private static final Logger logger = LoggerFactory.getLogger(ECL2.class); - private static final String version = "2.1.3"; + private static final String version = "2.1.3-try--comet-evalue"; public static boolean debug; public static boolean dev; diff --git a/src/main/java/proteomics/Search/CalEValue.java b/src/main/java/proteomics/Search/CalEValue.java index a2b1b04..5a6f387 100644 --- a/src/main/java/proteomics/Search/CalEValue.java +++ b/src/main/java/proteomics/Search/CalEValue.java @@ -44,13 +44,43 @@ public class CalEValue { int[] score_histogram = result_entry.getScoreHistogram(); double histogram_bin_size = result_entry.getHistogramBinSize(); - // find the max non-zero idx. It may be an outlier. - int max_nonzero_idx = 0; + int min_nonzero_idx = 0; for (int i = 0; i < score_histogram.length; ++i) { + if (score_histogram[i] > 0) { + min_nonzero_idx = i; + break; + } + } + if (min_nonzero_idx == score_histogram.length - 1) { + logger.debug("Fail to estimate e-value for Scan {} (an empty score histogram).", scan_num); + result_entry.setEValue(9999); + return; + } + + int max_nonzero_idx = 0; + for (int i = score_histogram.length - 1; i >= 0; --i) { if (score_histogram[i] > 0) { max_nonzero_idx = i; + break; } } + if (max_nonzero_idx - min_nonzero_idx < 5) { + logger.debug("Fail to estimate e-value for Scan {} (doesn't have a useful score histogram).", scan_num); + result_entry.setEValue(9999); + if (ECL2.debug) { + try (BufferedWriter writer = new BufferedWriter(new FileWriter(scan_num + ".evalue.csv"))) { + writer.write("histogram\n"); + for (int i = 0; i < max_nonzero_idx; ++i) { + writer.write(String.format("%d\n", score_histogram[i])); + } + } catch (IOException ex) { + logger.error(ex.getMessage()); + ex.printStackTrace(); + System.exit(1); + } + } + return; + } // generate survival array int[] survival_count_array = new int[max_nonzero_idx + 1]; @@ -60,12 +90,13 @@ public class CalEValue { } // find the next max nonzero idx. from this down to the changing point are from null. - int top_count = (int) (survival_count_array[0] * 0.01); // these are from non-null. - int null_end_idx = 0; - for (int i = max_nonzero_idx - 1; i >= 0; --i) { - if (survival_count_array[i] > top_count) { - null_end_idx = i; - break; + int null_end_idx = max_nonzero_idx; + for (int i = min_nonzero_idx; i <= max_nonzero_idx; ++i) { + if (score_histogram[i] == 0) { + if (i == max_nonzero_idx || score_histogram[i + 1] == 0) { + null_end_idx = i - 1; + break; + } } } @@ -75,13 +106,20 @@ public class CalEValue { ln_survival_count_array[i] = Math.log(survival_count_array[i]); } - // try and find the best linear regression result - int start_idx = (int) (null_end_idx * 0.5); + // find the start point + int start_idx = min_nonzero_idx; + if (null_end_idx > 3 / ResultEntry.histogram_bin_size) { + start_idx = Math.max(min_nonzero_idx, (int) (0.75 * null_end_idx)); + } else if (null_end_idx > 1.5 / ResultEntry.histogram_bin_size) { + start_idx = Math.max(min_nonzero_idx, (int) ( 0.5 * null_end_idx)); + } + + // linear regression double max_r_square = 0; double optimal_slope = 1; double optimal_intercept = 0; int optimal_start_idx = 0; - while (start_idx >= 0) { + while (start_idx >= min_nonzero_idx) { double x_sum = 0; double y_sum = 0; double x_mean = 0; @@ -134,11 +172,12 @@ public class CalEValue { } r_square = ssr / yy_mean; - if (r_square > max_r_square) { + if (slope < 0) { optimal_slope = slope; optimal_intercept = intercept; max_r_square = r_square; optimal_start_idx = start_idx; + break; } --start_idx; } diff --git a/src/main/java/proteomics/Types/ResultEntry.java b/src/main/java/proteomics/Types/ResultEntry.java index 3f3097e..7457bf4 100644 --- a/src/main/java/proteomics/Types/ResultEntry.java +++ b/src/main/java/proteomics/Types/ResultEntry.java @@ -7,7 +7,7 @@ public class ResultEntry{ private static final Logger logger = LoggerFactory.getLogger(ResultEntry.class); - private static final double histogram_bin_size = 0.01f; + public static final double histogram_bin_size = 0.05f; private static final double max_score = 20; public final String spectrum_id;