From 0d8310dac945d22f31d8c8859aca6ff93012749e Mon Sep 17 00:00:00 2001 From: Fengchao Date: Thu, 1 Feb 2018 18:47:39 +0800 Subject: [PATCH] Using SQL to reduce the RAM usage. --- pom.xml | 5 + src/main/java/proteomics/ECL2.java | 272 +++++++++--------- .../java/proteomics/Search/CalEValue.java | 16 +- src/main/java/proteomics/Search/Search.java | 138 +-------- .../java/proteomics/Search/SearchWrap.java | 207 +++++++++++-- .../java/proteomics/Spectrum/PreSpectra.java | 45 ++- .../java/proteomics/Spectrum/PreSpectrum.java | 4 +- .../proteomics/Types/FinalResultEntry.java | 135 --------- .../java/proteomics/Types/ResultEntry.java | 10 +- .../java/proteomics/Types/SpectrumEntry.java | 40 --- .../java/proteomics/Validation/CalFDR.java | 126 ++++---- 11 files changed, 439 insertions(+), 559 deletions(-) delete mode 100644 src/main/java/proteomics/Types/FinalResultEntry.java delete mode 100644 src/main/java/proteomics/Types/SpectrumEntry.java diff --git a/pom.xml b/pom.xml index 363fa12..80fc8a3 100644 --- a/pom.xml +++ b/pom.xml @@ -71,6 +71,11 @@ guava 23.6-jre + + org.xerial + sqlite-jdbc + 3.21.0.1 + diff --git a/src/main/java/proteomics/ECL2.java b/src/main/java/proteomics/ECL2.java index 491f4e3..8828f2c 100644 --- a/src/main/java/proteomics/ECL2.java +++ b/src/main/java/proteomics/ECL2.java @@ -5,11 +5,9 @@ import proteomics.Index.BuildIndex; import proteomics.Parameter.Parameter; import proteomics.Search.SearchWrap; -import proteomics.Types.FinalResultEntry; import proteomics.Search.Search; import proteomics.Spectrum.PreSpectra; import proteomics.TheoSeq.MassTool; -import proteomics.Types.SpectrumEntry; import proteomics.Validation.CalFDR; import uk.ac.ebi.pride.tools.jmzreader.JMzReader; import uk.ac.ebi.pride.tools.jmzreader.JMzReaderException; @@ -20,12 +18,16 @@ import java.io.*; import java.net.InetAddress; import java.net.UnknownHostException; +import java.sql.*; +import java.text.SimpleDateFormat; import java.util.*; import java.util.concurrent.*; +import java.util.concurrent.locks.ReentrantLock; public class ECL2 { public static final int score_point_t = 15000; + public static final int[] C13CorrectionRange = new int[]{-2, -1, 0}; private static final Logger logger = LoggerFactory.getLogger(ECL2.class); public static final String version = "2.1.5"; @@ -46,26 +48,30 @@ public static void main(String[] args) { logger.info("ECL2 Version {}.", version); logger.info("Author: Fengchao Yu. Email: fyuab@connect.ust.hk"); + String dbName = null; try { String hostName = InetAddress.getLocalHost().getHostName(); logger.info("Computer: {}.", hostName); - } catch (UnknownHostException ex) { - logger.warn("Cannot get the computer's name."); - } + dbName = String.format(Locale.US, "ECL2.%s.%s.temp.db", hostName, new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss").format(Calendar.getInstance().getTime())); - logger.info("Parameter file: {}.", parameter_path); - logger.info("Spectra file: {}.", spectra_path); + logger.info("Parameter file: {}.", parameter_path); + logger.info("Spectra file: {}.", spectra_path); - try { - new ECL2(parameter_path, spectra_path); - } catch (IOException | MzXMLParsingException | JMzReaderException | ExecutionException | InterruptedException ex) { + new ECL2(parameter_path, spectra_path, dbName); + } catch (UnknownHostException ex) { + logger.warn("Cannot get the computer's name."); + } catch (IOException | MzXMLParsingException | JMzReaderException | ExecutionException | InterruptedException | ClassNotFoundException | IllegalAccessException | InstantiationException | SQLException ex) { ex.printStackTrace(); logger.error(ex.toString()); System.exit(1); + } finally { + if (dbName != null) { + (new File(dbName)).delete(); + } } } - private ECL2(String parameter_path, String spectra_path) throws IOException, MzXMLParsingException, JMzReaderException, ExecutionException, InterruptedException { + private ECL2(String parameter_path, String spectra_path, String dbName) throws IOException, MzXMLParsingException, JMzReaderException, ExecutionException, InterruptedException, ClassNotFoundException, IllegalAccessException, InstantiationException, SQLException { // Get the parameter map Parameter parameter = new Parameter(parameter_path); Map parameter_map = parameter.returnParameterMap(); @@ -107,6 +113,9 @@ private ECL2(String parameter_path, String spectra_path) throws IOException, MzX logger.info("Peptide chains number: {}.", build_index_obj.getSeqEntryMap().size()); + String sqlPath = "jdbc:sqlite:" + dbName; + Class.forName("org.sqlite.JDBC").newInstance(); + logger.info("Reading spectra..."); JMzReader spectra_parser = null; File spectra_file = new File(spectra_path); @@ -124,12 +133,7 @@ private ECL2(String parameter_path, String spectra_path) throws IOException, MzX System.exit(1); } - PreSpectra pre_spectra_obj = new PreSpectra(spectra_parser, build_index_obj, parameter_map, ext); - Map num_spectrum_map = pre_spectra_obj.getNumSpectrumMap(); - Integer[] scanNumArray = num_spectrum_map.keySet().toArray(new Integer[num_spectrum_map.size()]); - Arrays.sort(scanNumArray); - - logger.info("Useful MS/MS spectra number: {}.", num_spectrum_map.size()); + new PreSpectra(spectra_parser, build_index_obj, parameter_map, ext, sqlPath); logger.info("Start searching..."); @@ -142,34 +146,52 @@ private ECL2(String parameter_path, String spectra_path) throws IOException, MzX } ExecutorService thread_pool = Executors.newFixedThreadPool(thread_num); Search search_obj = new Search(build_index_obj, parameter_map); - List> taskList = new LinkedList<>(); - for (int scanNum : scanNumArray) { - taskList.add(thread_pool.submit(new SearchWrap(search_obj, num_spectrum_map.get(scanNum), build_index_obj, mass_tool_obj, build_index_obj.getSeqProMap(), cal_evalue, delta_c_t, flankingPeaks))); + List> taskList = new LinkedList<>(); + Connection sqlConnection = DriverManager.getConnection(sqlPath); + Statement sqlStatement = sqlConnection.createStatement(); + ResultSet sqlResultSet = sqlStatement.executeQuery("SELECT scanId, precursorCharge, massWithoutLinker, precursorMass FROM spectraTable"); + ReentrantLock lock = new ReentrantLock(); + while (sqlResultSet.next()) { + String scanId = sqlResultSet.getString("scanId"); + int precursorCharge = sqlResultSet.getInt("precursorCharge"); + double massWithoutLinker = sqlResultSet.getDouble("massWithoutLinker"); + double precursorMass = sqlResultSet.getDouble("precursorMass"); + taskList.add(thread_pool.submit(new SearchWrap(search_obj, build_index_obj, mass_tool_obj, cal_evalue, delta_c_t, flankingPeaks, spectra_parser, lock, scanId, precursorCharge, massWithoutLinker, precursorMass, sqlConnection))); } + sqlResultSet.close(); + sqlStatement.close(); // check progress every minute, record results,and delete finished tasks. int lastProgress = 0; - Set final_search_results = new HashSet<>(scanNumArray.length + 1, 1); - while (!taskList.isEmpty()) { + int resultCount = 0; + int totalCount = taskList.size(); + int count = 0; + while (count < totalCount) { // record search results and delete finished ones. - List> toBeDeleteTaskList = new LinkedList<>(); - for (Future task : taskList) { + List> toBeDeleteTaskList = new LinkedList<>(); + for (Future task : taskList) { if (task.isDone()) { - if (task.get() != null) { - final_search_results.add(task.get()); + if (task.get()) { + ++resultCount; } toBeDeleteTaskList.add(task); } else if (task.isCancelled()) { toBeDeleteTaskList.add(task); } } + count += toBeDeleteTaskList.size(); taskList.removeAll(toBeDeleteTaskList); - int progress = (scanNumArray.length - taskList.size()) * 20 / scanNumArray.length; + int progress = count * 20 / totalCount; if (progress != lastProgress) { logger.info("Searching {}%...", progress * 5); lastProgress = progress; } + + if (count == totalCount) { + break; + } + if (debug) { Thread.sleep(1000); } else { @@ -185,147 +207,111 @@ private ECL2(String parameter_path, String spectra_path) throws IOException, MzX System.err.println("Pool did not terminate"); } - if (final_search_results.isEmpty()) { - logger.warn("There is no useful PSM."); + sqlConnection.close(); + if (lock.isLocked()) { + lock.unlock(); + } + + if (resultCount == 0) { + logger.warn("There is no useful results."); } else { // save result logger.info("Estimating q-value..."); - List> picked_result = pickResult(final_search_results); - List intra_result = new LinkedList<>(); - List inter_result = new LinkedList<>(); - if (!picked_result.get(0).isEmpty()) { - CalFDR cal_fdr_obj = new CalFDR(picked_result.get(0), cal_evalue); - intra_result = cal_fdr_obj.includeStats(cal_evalue); - intra_result.sort(Collections.reverseOrder()); - } - if (!picked_result.get(1).isEmpty()) { - CalFDR cal_fdr_obj = new CalFDR(picked_result.get(1), cal_evalue); - inter_result = cal_fdr_obj.includeStats(cal_evalue); - inter_result.sort(Collections.reverseOrder()); - } + Map inter_result = CalFDR.calFDR(sqlPath, cal_evalue, "inter_protein"); + Map intra_result = CalFDR.calFDR(sqlPath, cal_evalue, "intra_protein"); logger.info("Saving results..."); - saveTargetResult(intra_result, build_index_obj.getProAnnotateMap(), spectra_path, true, cal_evalue); - saveTargetResult(inter_result, build_index_obj.getProAnnotateMap(), spectra_path, false, cal_evalue); - saveDecoyResult(intra_result, build_index_obj.getProAnnotateMap(), spectra_path, true, cal_evalue); - saveDecoyResult(inter_result, build_index_obj.getProAnnotateMap(), spectra_path, false, cal_evalue); + Map allResult = new HashMap<>(); + allResult.putAll(inter_result); + allResult.putAll(intra_result); + saveTargetResult(allResult, build_index_obj.getProAnnotateMap(), spectra_path, cal_evalue, sqlPath); } logger.info("Done."); } - private static void saveTargetResult(List result, Map pro_annotate_map, String id_file_name, boolean is_intra, boolean cal_evalue) throws IOException { - BufferedWriter writer; - if (is_intra) { - writer = new BufferedWriter(new FileWriter(id_file_name + ".intra.target.csv")); - } else { - writer = new BufferedWriter(new FileWriter(id_file_name + ".inter.target.csv")); - } + private static void saveTargetResult(Map result, Map pro_annotate_map, String id_file_name, boolean cal_evalue, String sqlPath) throws IOException, SQLException, NullPointerException { + BufferedWriter intraTargetWriter = new BufferedWriter(new FileWriter(id_file_name + ".intra.target.csv")); + BufferedWriter intraDecoyWriter = new BufferedWriter(new FileWriter(id_file_name + ".intra.decoy.csv")); + BufferedWriter interTargetWriter = new BufferedWriter(new FileWriter(id_file_name + ".inter.target.csv")); + BufferedWriter interDecoyWriter = new BufferedWriter(new FileWriter(id_file_name + ".inter.decoy.csv")); + + Connection sqlConnection = DriverManager.getConnection(sqlPath); + Statement sqlStatement = sqlConnection.createStatement(); + ResultSet sqlResultSet = null; if (dev) { - writer.write("scan_num,spectrum_id,spectrum_mz,spectrum_mass,peptide_mass,rt,C13_correction,charge,score,delta_C,ppm,peptide,protein,protein_annotation_1,protein_annotation_2,e_value,q_value,mgf_title,,candidate_num,point_num,r_square,slope,intercept,start_idx,end_idx,chain_score_1,chain_rank_1,chain_score_2,chain_rank_2\n"); + intraTargetWriter.write("scan_num,spectrum_id,spectrum_mz,spectrum_mass,peptide_mass,rt,C13_correction,charge,score,delta_C,ppm,peptide,protein,protein_annotation_1,protein_annotation_2,e_value,q_value,mgf_title,,MS1_pearson_correlation_coefficient,candidate_num,point_num,r_square,slope,intercept,start_idx,end_idx,chain_score_1,chain_rank_1,chain_score_2,chain_rank_2\n"); + intraDecoyWriter.write("scan_num,spectrum_id,spectrum_mz,spectrum_mass,peptide_mass,rt,C13_correction,charge,score,delta_C,ppm,peptide,protein,protein_annotation_1,protein_annotation_2,e_value,q_value,mgf_title,,MS1_pearson_correlation_coefficient,candidate_num,point_num,r_square,slope,intercept,start_idx,end_idx,chain_score_1,chain_rank_1,chain_score_2,chain_rank_2\n"); + interTargetWriter.write("scan_num,spectrum_id,spectrum_mz,spectrum_mass,peptide_mass,rt,C13_correction,charge,score,delta_C,ppm,peptide,protein,protein_annotation_1,protein_annotation_2,e_value,q_value,mgf_title,,MS1_pearson_correlation_coefficient,candidate_num,point_num,r_square,slope,intercept,start_idx,end_idx,chain_score_1,chain_rank_1,chain_score_2,chain_rank_2\n"); + interDecoyWriter.write("scan_num,spectrum_id,spectrum_mz,spectrum_mass,peptide_mass,rt,C13_correction,charge,score,delta_C,ppm,peptide,protein,protein_annotation_1,protein_annotation_2,e_value,q_value,mgf_title,,MS1_pearson_correlation_coefficient,candidate_num,point_num,r_square,slope,intercept,start_idx,end_idx,chain_score_1,chain_rank_1,chain_score_2,chain_rank_2\n"); } else { - writer.write("scan_num,spectrum_id,spectrum_mz,spectrum_mass,peptide_mass,rt,C13_correction,charge,score,delta_C,ppm,peptide,protein,protein_annotation_1,protein_annotation_2,e_value,q_value,mgf_title,\n"); + intraTargetWriter.write("scan_num,spectrum_id,spectrum_mz,spectrum_mass,peptide_mass,rt,C13_correction,charge,score,delta_C,ppm,peptide,protein,protein_annotation_1,protein_annotation_2,e_value,q_value,mgf_title,\n"); + intraDecoyWriter.write("scan_num,spectrum_id,spectrum_mz,spectrum_mass,peptide_mass,rt,C13_correction,charge,score,delta_C,ppm,peptide,protein,protein_annotation_1,protein_annotation_2,e_value,q_value,mgf_title,\n"); + interTargetWriter.write("scan_num,spectrum_id,spectrum_mz,spectrum_mass,peptide_mass,rt,C13_correction,charge,score,delta_C,ppm,peptide,protein,protein_annotation_1,protein_annotation_2,e_value,q_value,mgf_title,\n"); + interDecoyWriter.write("scan_num,spectrum_id,spectrum_mz,spectrum_mass,peptide_mass,rt,C13_correction,charge,score,delta_C,ppm,peptide,protein,protein_annotation_1,protein_annotation_2,e_value,q_value,mgf_title,\n"); } - for (FinalResultEntry re : result) { - if (re.hit_type == 0) { - int link_site_1 = re.link_site_1; - int link_site_2 = re.link_site_2; + for (CalFDR.Entry entry : result.values()) { + sqlResultSet = sqlStatement.executeQuery(String.format(Locale.US, "SELECT scanNum, precursorMz, precursorMass, rt, isotopeCorrectionNum, ms1PearsonCorrelationCoefficient, precursorCharge, theoMass, score, deltaC, ppm, seq1, linkSite1, seq2, linkSite2, proId1, proId2, eValue, mgfTitle, candidateNum, pointCount, rSquare, slope, intercept, startIdx, endIdx, chainScore1, chainRank1, chainScore2, chainRank2, hitType, clType FROM spectraTable WHERE scanId='%s'", entry.scanId)); + if (sqlResultSet.next()) { List proAnnotationList1 = new LinkedList<>(); - for (String s : re.pro_id_1.split(";")) { - proAnnotationList1.add(pro_annotate_map.get(s)); + for (String s : sqlResultSet.getString("proId1").split(";")) { + if (s.startsWith("DECOY_")) { + proAnnotationList1.add("DECOY"); + } else { + proAnnotationList1.add(pro_annotate_map.get(s)); + } } List proAnnotationList2 = new LinkedList<>(); - for (String s : re.pro_id_2.split(";")) { - proAnnotationList2.add(pro_annotate_map.get(s)); - } - - if (dev) { - writer.write(re.scan_num + "," + re.spectrum_id + "," + re.spectrum_mz + "," + re.spectrum_mass + "," + re.peptide_mass + "," + re.rt + "," + re.C13_correction + "," + re.charge + "," + String.format(Locale.US, "%.4f", re.score) + "," + re.delta_c + "," + String.format(Locale.US, "%.2f", re.ppm) + "," + re.seq_1 + "-" + link_site_1 + "-" + re.seq_2 + "-" + link_site_2 + "," + re.pro_id_1 + "-" + re.pro_id_2 + ",\"" + String.join(";", proAnnotationList1) + "\",\"" + String.join(";", proAnnotationList2) + "\"," + (cal_evalue ? String.format(Locale.US, "%E", re.e_value) : "-") + "," + String.format(Locale.US, "%.4f", re.qvalue) + ",\"" + re.mgfTitle + "\",," + re.candidate_num + "," + re.point_count + "," + String.format(Locale.US, "%.4f", re.r_square) + "," + String.format(Locale.US, "%.4f", re.slope) + "," + String.format(Locale.US, "%.4f", re.intercept) + "," + re.start_idx + "," + re.end_idx + "," + String.format(Locale.US, "%.4f", re.chain_score_1) + "," + re.chain_rank_1 + "," + String.format(Locale.US, "%.4f", re.chain_score_2) + "," + re.chain_rank_2 + "\n"); - } else { - writer.write(re.scan_num + "," + re.spectrum_id + "," + re.spectrum_mz + "," + re.spectrum_mass + "," + re.peptide_mass + "," + re.rt + "," + re.C13_correction + "," + re.charge + "," + String.format(Locale.US, "%.4f", re.score) + "," + re.delta_c + "," + String.format(Locale.US, "%.2f", re.ppm) + "," + re.seq_1 + "-" + link_site_1 + "-" + re.seq_2 + "-" + link_site_2 + "," + re.pro_id_1 + "-" + re.pro_id_2 + ",\"" + String.join(";", proAnnotationList1) + "\",\"" + String.join(";", proAnnotationList2) + "\"," + (cal_evalue ? String.format(Locale.US, "%E", re.e_value) : "-") + "," + String.format(Locale.US, "%.4f", re.qvalue) + ",\"" + re.mgfTitle + "\"\n"); - } - } - } - writer.close(); - } - - private static void saveDecoyResult(List result, Map pro_annotate_map, String id_file_name, boolean is_intra, boolean cal_evalue) throws IOException { - BufferedWriter writer; - if (is_intra) { - writer = new BufferedWriter(new FileWriter(id_file_name + ".intra.decoy.csv")); - } else { - writer = new BufferedWriter(new FileWriter(id_file_name + ".inter.decoy.csv")); - } - - if (dev) { - writer.write("scan_num,spectrum_id,spectrum_mz,spectrum_mass,peptide_mass,rt,C13_correction,charge,score,delta_C,ppm,peptide,protein,protein_annotation_1,protein_annotation_2,e_value,mgf_title,,candidate_num,point_num,r_square,slope,intercept,start_idx,end_idx,chain_score_1,chain_rank_1,chain_score_2,chain_rank_2\n"); - } else { - writer.write("scan_num,spectrum_id,spectrum_mz,spectrum_mass,peptide_mass,rt,C13_correction,charge,score,delta_C,ppm,peptide,protein,protein_annotation_1,protein_annotation_2,e_value,mgf_title\n"); - } - for (FinalResultEntry re : result) { - if ((re.hit_type == 1) || (re.hit_type == 2)) { - int link_site_1 = re.link_site_1; - int link_site_2 = re.link_site_2; - - String annotate_1 = "DECOY"; - String annotate_2 = "DECOY"; - - if (!re.pro_id_1.startsWith("DECOY")) { - String pro_1 = re.pro_id_1.split(";")[0]; - annotate_1 = pro_annotate_map.get(pro_1).replace(',', ';'); - } - - if (!re.pro_id_2.startsWith("DECOY")) { - String pro_2 = re.pro_id_2.split(";")[0]; - annotate_2 = pro_annotate_map.get(pro_2).replace(',', ';'); + for (String s : sqlResultSet.getString("proId2").split(";")) { + if (s.startsWith("DECOY_")) { + proAnnotationList2.add("DECOY"); + } else { + proAnnotationList2.add(pro_annotate_map.get(s)); + } } if (dev) { - writer.write(re.scan_num + "," + re.spectrum_id + "," + re.spectrum_mz + "," + re.spectrum_mass + "," + re.peptide_mass + "," + re.rt + "," + re.C13_correction + "," + re.charge + "," + String.format(Locale.US, "%.4f", re.score) + "," + re.delta_c + "," + String.format(Locale.US, "%.2f", re.ppm) + "," + re.seq_1 + "-" + link_site_1 + "-" + re.seq_2 + "-" + link_site_2 + "," + re.pro_id_1 + "-" + re.pro_id_2 + ",\"" + annotate_1 + "\",\"" + annotate_2 + "\"," + (cal_evalue ? String.format(Locale.US, "%E", re.e_value) : "-") + ",\"" + re.mgfTitle + "\",," + re.candidate_num + "," + re.point_count + "," + String.format(Locale.US, "%.4f", re.r_square) + "," + String.format(Locale.US, "%.4f", re.slope) + "," + String.format(Locale.US, "%.4f", re.intercept) + "," + re.start_idx + "," + re.end_idx + "," + String.format(Locale.US, "%.4f", re.chain_score_1) + "," + re.chain_rank_1 + "," + String.format(Locale.US, "%.4f", re.chain_score_2) + "," + re.chain_rank_2 + "\n"); + if (sqlResultSet.getString("clType").contentEquals("intra_protein")) { + if (sqlResultSet.getInt("hitType") == 0) { + intraTargetWriter.write(sqlResultSet.getInt("scanNum") + "," + entry.scanId + "," + sqlResultSet.getDouble("precursorMz") + "," + sqlResultSet.getDouble("precursorMass") + "," + sqlResultSet.getDouble("theoMass") + "," + sqlResultSet.getInt("rt") + "," + sqlResultSet.getInt("isotopeCorrectionNum") + "," + sqlResultSet.getInt("precursorCharge") + "," + sqlResultSet.getDouble("score") + "," + sqlResultSet.getDouble("deltaC") + "," + sqlResultSet.getDouble("ppm") + "," + sqlResultSet.getString("seq1") + "-" + sqlResultSet.getInt("linkSite1") + "-" + sqlResultSet.getString("seq2") + "-" + sqlResultSet.getInt("linkSite2") + "," + sqlResultSet.getString("proId1") + "-" + sqlResultSet.getString("proId2") + ",\"" + String.join(";", proAnnotationList1) + "\",\"" + String.join(";", proAnnotationList2) + "\"," + (cal_evalue ? String.format(Locale.US, "%E", sqlResultSet.getDouble("eValue")) : "-") + "," + entry.qValue + ",\"" + sqlResultSet.getString("mgfTitle") + "\",," + sqlResultSet.getDouble("ms1PearsonCorrelationCoefficient") + "," + sqlResultSet.getInt("candidateNum") + "," + sqlResultSet.getInt("pointCount") + "," + sqlResultSet.getDouble("rSquare") + "," + sqlResultSet.getDouble("slope") + "," + sqlResultSet.getDouble("intercept") + "," + sqlResultSet.getInt("startIdx") + "," + sqlResultSet.getInt("endIdx") + "," + sqlResultSet.getDouble("chainScore1") + "," + sqlResultSet.getInt("chainRank1") + "," + sqlResultSet.getDouble("chainScore2") + "," + sqlResultSet.getInt("chainRank2") + "\n"); + } else { + intraDecoyWriter.write(sqlResultSet.getInt("scanNum") + "," + entry.scanId + "," + sqlResultSet.getDouble("precursorMz") + "," + sqlResultSet.getDouble("precursorMass") + "," + sqlResultSet.getDouble("theoMass") + "," + sqlResultSet.getInt("rt") + "," + sqlResultSet.getInt("isotopeCorrectionNum") + "," + sqlResultSet.getInt("precursorCharge") + "," + sqlResultSet.getDouble("score") + "," + sqlResultSet.getDouble("deltaC") + "," + sqlResultSet.getDouble("ppm") + "," + sqlResultSet.getString("seq1") + "-" + sqlResultSet.getInt("linkSite1") + "-" + sqlResultSet.getString("seq2") + "-" + sqlResultSet.getInt("linkSite2") + "," + sqlResultSet.getString("proId1") + "-" + sqlResultSet.getString("proId2") + ",\"" + String.join(";", proAnnotationList1) + "\",\"" + String.join(";", proAnnotationList2) + "\"," + (cal_evalue ? String.format(Locale.US, "%E", sqlResultSet.getDouble("eValue")) : "-") + "," + entry.qValue + ",\"" + sqlResultSet.getString("mgfTitle") + "\",," + sqlResultSet.getDouble("ms1PearsonCorrelationCoefficient") + "," + sqlResultSet.getInt("candidateNum") + "," + sqlResultSet.getInt("pointCount") + "," + sqlResultSet.getDouble("rSquare") + "," + sqlResultSet.getDouble("slope") + "," + sqlResultSet.getDouble("intercept") + "," + sqlResultSet.getInt("startIdx") + "," + sqlResultSet.getInt("endIdx") + "," + sqlResultSet.getDouble("chainScore1") + "," + sqlResultSet.getInt("chainRank1") + "," + sqlResultSet.getDouble("chainScore2") + "," + sqlResultSet.getInt("chainRank2") + "\n"); + } + } else { + if (sqlResultSet.getInt("hitType") == 0) { + interTargetWriter.write(sqlResultSet.getInt("scanNum") + "," + entry.scanId + "," + sqlResultSet.getDouble("precursorMz") + "," + sqlResultSet.getDouble("precursorMass") + "," + sqlResultSet.getDouble("theoMass") + "," + sqlResultSet.getInt("rt") + "," + sqlResultSet.getInt("isotopeCorrectionNum") + "," + sqlResultSet.getInt("precursorCharge") + "," + sqlResultSet.getDouble("score") + "," + sqlResultSet.getDouble("deltaC") + "," + sqlResultSet.getDouble("ppm") + "," + sqlResultSet.getString("seq1") + "-" + sqlResultSet.getInt("linkSite1") + "-" + sqlResultSet.getString("seq2") + "-" + sqlResultSet.getInt("linkSite2") + "," + sqlResultSet.getString("proId1") + "-" + sqlResultSet.getString("proId2") + ",\"" + String.join(";", proAnnotationList1) + "\",\"" + String.join(";", proAnnotationList2) + "\"," + (cal_evalue ? String.format(Locale.US, "%E", sqlResultSet.getDouble("eValue")) : "-") + "," + entry.qValue + ",\"" + sqlResultSet.getString("mgfTitle") + "\",," + sqlResultSet.getDouble("ms1PearsonCorrelationCoefficient") + "," + sqlResultSet.getInt("candidateNum") + "," + sqlResultSet.getInt("pointCount") + "," + sqlResultSet.getDouble("rSquare") + "," + sqlResultSet.getDouble("slope") + "," + sqlResultSet.getDouble("intercept") + "," + sqlResultSet.getInt("startIdx") + "," + sqlResultSet.getInt("endIdx") + "," + sqlResultSet.getDouble("chainScore1") + "," + sqlResultSet.getInt("chainRank1") + "," + sqlResultSet.getDouble("chainScore2") + "," + sqlResultSet.getInt("chainRank2") + "\n"); + } else { + interDecoyWriter.write(sqlResultSet.getInt("scanNum") + "," + entry.scanId + "," + sqlResultSet.getDouble("precursorMz") + "," + sqlResultSet.getDouble("precursorMass") + "," + sqlResultSet.getDouble("theoMass") + "," + sqlResultSet.getInt("rt") + "," + sqlResultSet.getInt("isotopeCorrectionNum") + "," + sqlResultSet.getInt("precursorCharge") + "," + sqlResultSet.getDouble("score") + "," + sqlResultSet.getDouble("deltaC") + "," + sqlResultSet.getDouble("ppm") + "," + sqlResultSet.getString("seq1") + "-" + sqlResultSet.getInt("linkSite1") + "-" + sqlResultSet.getString("seq2") + "-" + sqlResultSet.getInt("linkSite2") + "," + sqlResultSet.getString("proId1") + "-" + sqlResultSet.getString("proId2") + ",\"" + String.join(";", proAnnotationList1) + "\",\"" + String.join(";", proAnnotationList2) + "\"," + (cal_evalue ? String.format(Locale.US, "%E", sqlResultSet.getDouble("eValue")) : "-") + "," + entry.qValue + ",\"" + sqlResultSet.getString("mgfTitle") + "\",," + sqlResultSet.getDouble("ms1PearsonCorrelationCoefficient") + "," + sqlResultSet.getInt("candidateNum") + "," + sqlResultSet.getInt("pointCount") + "," + sqlResultSet.getDouble("rSquare") + "," + sqlResultSet.getDouble("slope") + "," + sqlResultSet.getDouble("intercept") + "," + sqlResultSet.getInt("startIdx") + "," + sqlResultSet.getInt("endIdx") + "," + sqlResultSet.getDouble("chainScore1") + "," + sqlResultSet.getInt("chainRank1") + "," + sqlResultSet.getDouble("chainScore2") + "," + sqlResultSet.getInt("chainRank2") + "\n"); + } + } } else { - writer.write(re.scan_num + "," + re.spectrum_id + "," + re.spectrum_mz + "," + re.spectrum_mass + "," + re.peptide_mass + "," + re.rt + "," + re.C13_correction + "," + re.charge + "," + String.format(Locale.US, "%.4f", re.score) + "," + re.delta_c + "," + String.format(Locale.US, "%.2f", re.ppm) + "," + re.seq_1 + "-" + link_site_1 + "-" + re.seq_2 + "-" + link_site_2 + "," + re.pro_id_1 + "-" + re.pro_id_2 + ",\"" + annotate_1 + "\",\"" + annotate_2 + "\"," + (cal_evalue ? String.format(Locale.US, "%E", re.e_value) : "-") + ",\"" + re.mgfTitle + "\"\n"); + if (sqlResultSet.getString("clType").contentEquals("intra_protein")) { + if (sqlResultSet.getInt("hitType") == 0) { + intraTargetWriter.write(sqlResultSet.getInt("scanNum") + "," + entry.scanId + "," + sqlResultSet.getDouble("precursorMz") + "," + sqlResultSet.getDouble("precursorMass") + "," + sqlResultSet.getDouble("theoMass") + "," + sqlResultSet.getInt("rt") + "," + sqlResultSet.getInt("isotopeCorrectionNum") + "," + sqlResultSet.getInt("precursorCharge") + "," + sqlResultSet.getDouble("score") + "," + sqlResultSet.getDouble("deltaC") + "," + sqlResultSet.getDouble("ppm") + "," + sqlResultSet.getString("seq1") + "-" + sqlResultSet.getInt("linkSite1") + "-" + sqlResultSet.getString("seq2") + "-" + sqlResultSet.getInt("linkSite2") + "," + sqlResultSet.getString("proId1") + "-" + sqlResultSet.getString("proId2") + ",\"" + String.join(";", proAnnotationList1) + "\",\"" + String.join(";", proAnnotationList2) + "\"," + (cal_evalue ? String.format(Locale.US, "%E", sqlResultSet.getDouble("eValue")) : "-") + "," + entry.qValue + ",\"" + sqlResultSet.getString("mgfTitle") + "\"\n"); + } else { + intraDecoyWriter.write(sqlResultSet.getInt("scanNum") + "," + entry.scanId + "," + sqlResultSet.getDouble("precursorMz") + "," + sqlResultSet.getDouble("precursorMass") + "," + sqlResultSet.getDouble("theoMass") + "," + sqlResultSet.getInt("rt") + "," + sqlResultSet.getInt("isotopeCorrectionNum") + "," + sqlResultSet.getInt("precursorCharge") + "," + sqlResultSet.getDouble("score") + "," + sqlResultSet.getDouble("deltaC") + "," + sqlResultSet.getDouble("ppm") + "," + sqlResultSet.getString("seq1") + "-" + sqlResultSet.getInt("linkSite1") + "-" + sqlResultSet.getString("seq2") + "-" + sqlResultSet.getInt("linkSite2") + "," + sqlResultSet.getString("proId1") + "-" + sqlResultSet.getString("proId2") + ",\"" + String.join(";", proAnnotationList1) + "\",\"" + String.join(";", proAnnotationList2) + "\"," + (cal_evalue ? String.format(Locale.US, "%E", sqlResultSet.getDouble("eValue")) : "-") + "," + entry.qValue + ",\"" + sqlResultSet.getString("mgfTitle") + "\",\n"); + } + } else { + if (sqlResultSet.getInt("hitType") == 0) { + interTargetWriter.write(sqlResultSet.getInt("scanNum") + "," + entry.scanId + "," + sqlResultSet.getDouble("precursorMz") + "," + sqlResultSet.getDouble("precursorMass") + "," + sqlResultSet.getDouble("theoMass") + "," + sqlResultSet.getInt("rt") + "," + sqlResultSet.getInt("isotopeCorrectionNum") + "," + sqlResultSet.getInt("precursorCharge") + "," + sqlResultSet.getDouble("score") + "," + sqlResultSet.getDouble("deltaC") + "," + sqlResultSet.getDouble("ppm") + "," + sqlResultSet.getString("seq1") + "-" + sqlResultSet.getInt("linkSite1") + "-" + sqlResultSet.getString("seq2") + "-" + sqlResultSet.getInt("linkSite2") + "," + sqlResultSet.getString("proId1") + "-" + sqlResultSet.getString("proId2") + ",\"" + String.join(";", proAnnotationList1) + "\",\"" + String.join(";", proAnnotationList2) + "\"," + (cal_evalue ? String.format(Locale.US, "%E", sqlResultSet.getDouble("eValue")) : "-") + "," + entry.qValue + ",\"" + sqlResultSet.getString("mgfTitle") + "\",\n"); + } else { + interDecoyWriter.write(sqlResultSet.getInt("scanNum") + "," + entry.scanId + "," + sqlResultSet.getDouble("precursorMz") + "," + sqlResultSet.getDouble("precursorMass") + "," + sqlResultSet.getDouble("theoMass") + "," + sqlResultSet.getInt("rt") + "," + sqlResultSet.getInt("isotopeCorrectionNum") + "," + sqlResultSet.getInt("precursorCharge") + "," + sqlResultSet.getDouble("score") + "," + sqlResultSet.getDouble("deltaC") + "," + sqlResultSet.getDouble("ppm") + "," + sqlResultSet.getString("seq1") + "-" + sqlResultSet.getInt("linkSite1") + "-" + sqlResultSet.getString("seq2") + "-" + sqlResultSet.getInt("linkSite2") + "," + sqlResultSet.getString("proId1") + "-" + sqlResultSet.getString("proId2") + ",\"" + String.join(";", proAnnotationList1) + "\",\"" + String.join(";", proAnnotationList2) + "\"," + (cal_evalue ? String.format(Locale.US, "%E", sqlResultSet.getDouble("eValue")) : "-") + "," + entry.qValue + ",\"" + sqlResultSet.getString("mgfTitle") + "\",\n"); + } + } } - } - } - writer.close(); - } - - private static List> pickResult(Set search_result) { - List> picked_result = new LinkedList<>(); - List inter_protein_result = new LinkedList<>(); - List intra_protein_result = new LinkedList<>(); - - for (FinalResultEntry result_entry : search_result) { - if (result_entry.cl_type.contentEquals("intra_protein")) { - intra_protein_result.add(result_entry); } else { - inter_protein_result.add(result_entry); - } - } - - picked_result.add(intra_protein_result); - picked_result.add(inter_protein_result); - - return picked_result; - } - - private static int finishedFutureNum(List> futureList) { - int count = 0; - for (Future future : futureList) { - if (future.isDone()) { - ++count; + throw new NullPointerException(String.format(Locale.US, "Cannot find the record of scan %s.", entry.scanId)); } } - return count; - } - - private static boolean hasUnfinishedFuture(List> futureList) { - for (Future future : futureList) { - if (!future.isDone()) { - return true; - } + intraTargetWriter.close(); + intraDecoyWriter.close(); + interTargetWriter.close(); + interDecoyWriter.close(); + if (sqlResultSet != null) { + sqlResultSet.close(); } - return false; + sqlStatement.close(); + sqlConnection.close(); } private static void help() { diff --git a/src/main/java/proteomics/Search/CalEValue.java b/src/main/java/proteomics/Search/CalEValue.java index 5263e08..b0d6e29 100644 --- a/src/main/java/proteomics/Search/CalEValue.java +++ b/src/main/java/proteomics/Search/CalEValue.java @@ -18,17 +18,17 @@ public class CalEValue { private static final double maxTolerance = 20; private static final double toleranceStep = 1; - static void calEValue(int scan_num, ResultEntry result_entry, BuildIndex buildIndexObj, TreeMap> binScoresMap, float linker_mass, float originalTolerance, SparseVector xcorrPL, double singleChainT) throws IOException { + static void calEValue(String scanId, ResultEntry result_entry, BuildIndex buildIndexObj, TreeMap> binScoresMap, int precursorCharge, double massWithoutLinker, double precursorMass, double originalTolerance, SparseVector xcorrPL, double singleChainT) throws IOException { int gap_num = ECL2.score_point_t - result_entry.getScoreCount(); double tolerance = originalTolerance; int maxBinIdx = buildIndexObj.massToBin(massWithoutLinker * 0.5); while (gap_num > 0 && tolerance <= maxTolerance) { - gap_num = generateRandomRandomScores(gap_num, tolerance, toleranceStep, binScoresMap, result_entry.spectrum_mass, massWithoutLinker, result_entry.charge, xcorrPL, buildIndexObj, buildIndexObj.getMassBinSeqMap(), buildIndexObj.getSeqEntryMap(), buildIndexObj.returnMassTool(), result_entry, maxBinIdx, singleChainT); + gap_num = generateRandomRandomScores(gap_num, tolerance, toleranceStep, binScoresMap, precursorMass, massWithoutLinker, precursorCharge, xcorrPL, buildIndexObj, buildIndexObj.getMassBinSeqMap(), buildIndexObj.getSeqEntryMap(), buildIndexObj.returnMassTool(), result_entry, maxBinIdx, singleChainT); tolerance += toleranceStep; } if (gap_num > 0) { - logger.debug("Scan {}: Estimated e-value may not be accurate ({} data points).", scan_num, ECL2.score_point_t - gap_num); + logger.debug("Scan {}: Estimated e-value may not be accurate ({} data points).", scanId, ECL2.score_point_t - gap_num); } int[] score_histogram = result_entry.getScoreHistogram(); @@ -42,7 +42,7 @@ static void calEValue(int scan_num, ResultEntry result_entry, BuildIndex buildIn } } if (min_nonzero_idx == score_histogram.length - 1) { - logger.debug("Fail to estimate e-value for Scan {} (an empty score histogram).", scan_num); + logger.debug("Fail to estimate e-value for Scan {} (an empty score histogram).", scanId); result_entry.setEValue(9999); return; } @@ -55,10 +55,10 @@ static void calEValue(int scan_num, ResultEntry result_entry, BuildIndex buildIn } } if (max_nonzero_idx - min_nonzero_idx < 5) { - logger.debug("Fail to estimate e-value for Scan {} (doesn't have a useful score histogram).", scan_num); + logger.debug("Fail to estimate e-value for Scan {} (doesn't have a useful score histogram).", scanId); result_entry.setEValue(9999); if (ECL2.debug) { - BufferedWriter writer = new BufferedWriter(new FileWriter(scan_num + ".evalue.csv")); + BufferedWriter writer = new BufferedWriter(new FileWriter(scanId + ".evalue.csv")); writer.write("histogram\n"); for (int i = 0; i < max_nonzero_idx; ++i) { writer.write(String.format(Locale.US, "%d\n", score_histogram[i])); @@ -168,14 +168,14 @@ static void calEValue(int scan_num, ResultEntry result_entry, BuildIndex buildIn if (optimal_slope >= 0) { result_entry.setEValue(9999); - logger.debug("Estimating E-value failed. Scan: {}, mass: {}, slope: {}, intercept: {}, R square: {}, point num: {}.",scan_num, result_entry.spectrum_mass, optimal_slope, optimal_intercept, max_r_square, result_entry.getScoreCount()); + logger.debug("Estimating E-value failed. Scan: {}, mass: {}, slope: {}, intercept: {}, R square: {}, point num: {}.",scanId, precursorMass, optimal_slope, optimal_intercept, max_r_square, result_entry.getScoreCount()); } else { result_entry.setEValue(Math.exp((optimal_slope * Math.round(result_entry.getScore() * inverseHistogramBinSize) + optimal_intercept) + Math.log((double) result_entry.getCandidateNum() / (double) result_entry.getScoreCount()))); // double point precision limitation. result_entry.setEValueDetails(max_r_square, optimal_slope, optimal_intercept, optimal_start_idx, null_end_idx); } if (ECL2.debug) { - BufferedWriter writer = new BufferedWriter(new FileWriter(scan_num + ".evalue.csv")); + BufferedWriter writer = new BufferedWriter(new FileWriter(scanId + ".evalue.csv")); writer.write(String.format(Locale.US, "histogram,survival,ln(survival),slope=%f,intercept=%f,rsquare=%f,start=%d,end=%d\n", optimal_slope, optimal_intercept, max_r_square, optimal_start_idx, null_end_idx)); for (int i = 0; i <= max_nonzero_idx; ++i) { if (i < ln_survival_count_array.length) { diff --git a/src/main/java/proteomics/Search/Search.java b/src/main/java/proteomics/Search/Search.java index 63cb411..f22dfa7 100644 --- a/src/main/java/proteomics/Search/Search.java +++ b/src/main/java/proteomics/Search/Search.java @@ -15,7 +15,6 @@ public class Search { public final double ms1_tolerance; public final int ms1_tolerance_unit; private final Map chain_entry_map; - private final Map fix_mod_map; private final MassTool mass_tool_obj; private final TreeMap> bin_seq_map; private final BuildIndex build_index_obj; @@ -27,7 +26,6 @@ public class Search { public Search(BuildIndex build_index_obj, Map parameter_map) { this.build_index_obj = build_index_obj; chain_entry_map = build_index_obj.getSeqEntryMap(); - fix_mod_map = build_index_obj.getFixModMap(); mass_tool_obj = build_index_obj.returnMassTool(); ms1_tolerance_unit = Integer.valueOf(parameter_map.get("ms1_tolerance_unit")); ms1_tolerance = Double.valueOf(parameter_map.get("ms1_tolerance")); @@ -47,8 +45,8 @@ public Search(BuildIndex build_index_obj, Map parameter_map) { Arrays.sort(C13_correction_range); } - ResultEntry doSearch(SpectrumEntry spectrumEntry, SparseVector xcorrPL, TreeMap> binScoresMap) throws IOException { - int max_chain_bin_idx = Math.min(build_index_obj.massToBin(spectrumEntry.mass_without_linker_mass + C13_correction_range[C13_correction_range.length - 1] * 1.00335483f) + 1 - bin_seq_map.firstKey(), bin_seq_map.lastKey()); + ResultEntry doSearch(SparseVector xcorrPL, TreeMap> binScoresMap, double massWithoutLinker, int precursorCharge, double precursorMass, String scanId) throws IOException { + int max_chain_bin_idx = Math.min(build_index_obj.massToBin(massWithoutLinker + C13_correction_range[C13_correction_range.length - 1] * 1.00335483) + 1 - bin_seq_map.firstKey(), bin_seq_map.lastKey()); int min_chain_bin_idx = bin_seq_map.firstKey(); if (max_chain_bin_idx < min_chain_bin_idx) { @@ -59,8 +57,8 @@ ResultEntry doSearch(SpectrumEntry spectrumEntry, SparseVector xcorrPL, TreeMap< double leftMs1Tol = ms1_tolerance; double rightMs1Tol = ms1_tolerance; if (ms1_tolerance_unit == 1) { - leftMs1Tol = spectrumEntry.precursor_mass - (spectrumEntry.precursor_mass / (1 + ms1_tolerance * 1e-6f)); - rightMs1Tol = (spectrumEntry.precursor_mass / (1 - ms1_tolerance * 1e-6f)) - spectrumEntry.precursor_mass; + leftMs1Tol = precursorMass - (precursorMass / (1 + ms1_tolerance * 1e-6)); + rightMs1Tol = (precursorMass / (1 - ms1_tolerance * 1e-6)) - precursorMass; } // for debug @@ -71,8 +69,8 @@ ResultEntry doSearch(SpectrumEntry spectrumEntry, SparseVector xcorrPL, TreeMap< TreeMap binChainMap = new TreeMap<>(); Set checkedBinSet = new HashSet<>(bin_seq_map.size() + 1, 1); long candidate_num = 0; - ResultEntry resultEntry = new ResultEntry(spectrumEntry.spectrum_id, spectrumEntry.precursor_mz, spectrumEntry.precursor_mass, spectrumEntry.rt, spectrumEntry.precursor_charge, cal_evalue, binChainMap); - int stopIdx = (int) Math.ceil(build_index_obj.massToBin((spectrumEntry.mass_without_linker_mass + 1.00335483f) * 0.5f + rightMs1Tol)); + ResultEntry resultEntry = new ResultEntry(scanId, cal_evalue, binChainMap); + int stopIdx = (int) Math.ceil(build_index_obj.massToBin((massWithoutLinker + 1.00335483) * 0.5 + rightMs1Tol)); for (int idx_1 : bin_seq_map.keySet()) { if (idx_1 > stopIdx) { break; @@ -86,8 +84,8 @@ ResultEntry doSearch(SpectrumEntry spectrumEntry, SparseVector xcorrPL, TreeMap< // consider C13 correction for (int i : C13_correction_range) { - left_mass_2 = spectrumEntry.mass_without_linker_mass + i * 1.00335483f - build_index_obj.binToRightMass(idx_1) - leftMs1Tol; - right_mass_2 = spectrumEntry.mass_without_linker_mass + i * 1.00335483f - build_index_obj.binToLeftMass(idx_1) + rightMs1Tol; + left_mass_2 = massWithoutLinker + i * 1.00335483 - build_index_obj.binToRightMass(idx_1) - leftMs1Tol; + right_mass_2 = massWithoutLinker + i * 1.00335483 - build_index_obj.binToLeftMass(idx_1) + rightMs1Tol; left_idx_2 = build_index_obj.massToBin(left_mass_2); right_idx_2 = build_index_obj.massToBin(right_mass_2) + 1; sub_map.putAll(bin_seq_map.subMap(left_idx_2, true, right_idx_2, true)); @@ -98,7 +96,7 @@ ResultEntry doSearch(SpectrumEntry spectrumEntry, SparseVector xcorrPL, TreeMap< // This bin hasn't been visited. Linear scan first. for (String seq : bin_seq_map.get(idx_1)) { ChainEntry chainEntry = chain_entry_map.get(seq); - linearScan(spectrumEntry, xcorrPL, chainEntry, idx_1, binChainMap, binScoresMap, debugEntryList, devChainScoreMap); + linearScan(xcorrPL, chainEntry, idx_1, binChainMap, binScoresMap, debugEntryList, devChainScoreMap, precursorCharge, precursorMass); } checkedBinSet.add(idx_1); } @@ -110,7 +108,7 @@ ResultEntry doSearch(SpectrumEntry spectrumEntry, SparseVector xcorrPL, TreeMap< // this bin hasn't been visited. Linear scan first. for (String seq : bin_seq_map.get(idx_2)) { ChainEntry chainEntry = chain_entry_map.get(seq); - linearScan(spectrumEntry, xcorrPL, chainEntry, idx_2, binChainMap, binScoresMap, debugEntryList, devChainScoreMap); + linearScan(xcorrPL, chainEntry, idx_2, binChainMap, binScoresMap, debugEntryList, devChainScoreMap, precursorCharge, precursorMass); } checkedBinSet.add(idx_2); } @@ -173,11 +171,11 @@ ResultEntry doSearch(SpectrumEntry spectrumEntry, SparseVector xcorrPL, TreeMap< // write debug file if (ECL2.debug) { - BufferedWriter writer = new BufferedWriter(new FileWriter(spectrumEntry.spectrum_id + ".csv")); + BufferedWriter writer = new BufferedWriter(new FileWriter(scanId + ".csv")); debugEntryList.sort(Comparator.reverseOrder()); writer.write("chain,link_site,mass,score\n"); for (DebugEntry t : debugEntryList) { - writer.write(String.format(Locale.US, "%s,%d,%f,%f\n", addFixMod(t.chain, t.link_site), t.link_site, t.mass, t.score)); + writer.write(String.format(Locale.US, "%s,%d,%f,%f\n", t.chain, t.link_site, t.mass, t.score)); } writer.close(); } @@ -200,83 +198,10 @@ ResultEntry doSearch(SpectrumEntry spectrumEntry, SparseVector xcorrPL, TreeMap< } } - public FinalResultEntry convertResultEntry(int scanNum, ResultEntry result_entry, Map> seqProMap, SpectrumEntry spectrumEntry) { - int rank = 1; - String chain_seq_1 = result_entry.getChain1(); - String chain_seq_2 = result_entry.getChain2(); - ChainEntry chain_entry_1 = chain_entry_map.get(chain_seq_1); - ChainEntry chain_entry_2 = chain_entry_map.get(chain_seq_2); - - float spectrum_mass = result_entry.spectrum_mass; - float theo_mass = chain_entry_1.chain_mass + chain_entry_2.chain_mass + build_index_obj.linker_mass; - - int C13_Diff_num = getC13Num(spectrum_mass, theo_mass); - spectrum_mass += C13_Diff_num * 1.00335483f; - float ppm = (spectrum_mass - theo_mass) * 1e6f / theo_mass; - - Set pro1Set = new TreeSet<>(); - boolean isDecoy1 = false; - for (String temp : seqProMap.get(chain_entry_1.seq.replaceAll("[^A-Znc]", ""))) { - pro1Set.add(temp); - if (temp.startsWith("DECOY")) { // there is no overlapped peptide between target and decoy. - isDecoy1 = true; - } - } - Set pro2Set = new TreeSet<>(); - boolean isDecoy2 = false; - for (String temp : seqProMap.get(chain_entry_2.seq.replaceAll("[^A-Znc]", ""))) { - pro2Set.add(temp); - if (temp.startsWith("DECOY")) { // there is no overlapped peptide between target and decoy. - isDecoy2 = true; - } - } - - int hit_type; // 0 = T-T; 1 = D-D; 2 = T-D; - if (isDecoy1 && isDecoy2) { - hit_type = 1; - } else if (!isDecoy1 && !isDecoy2) { - hit_type = 0; - } else { - hit_type = 2; - } - - String cl_type = "intra_protein"; - boolean keep = false; - for (String temp_1 : seqProMap.get(chain_entry_1.seq.replaceAll("[^A-Znc]", ""))) { - if (temp_1.startsWith("DECOY_")) { - temp_1 = temp_1.substring(6); - } - for (String temp_2 : seqProMap.get(chain_entry_2.seq.replaceAll("[^A-Znc]", ""))) { - if (temp_2.startsWith("DECOY_")) { - temp_2 = temp_2.substring(6); - } - if (temp_1.contentEquals(temp_2)) { // bias to intra protein cross-linking - keep = true; - break; - } - } - if (keep) { - break; - } - } - - if (!keep) { - cl_type = "inter_protein"; - } - - double delta_c = 1 - (result_entry.getSecondScore() / result_entry.getScore()); - - // add fix modification to the sequences. - String final_seq_1 = addFixMod(chain_seq_1, result_entry.getLinkSite1()); - String final_seq_2 = addFixMod(chain_seq_2, result_entry.getLinkSite2()); - - return new FinalResultEntry(scanNum, result_entry.spectrum_id, rank, result_entry.charge, result_entry.spectrum_mz, result_entry.spectrum_mass, theo_mass, result_entry.rt, ppm, result_entry.getScore(), delta_c, final_seq_1, result_entry.getLinkSite1(), String.join(";", pro1Set), final_seq_2, result_entry.getLinkSite2(), String.join(";", pro2Set), cl_type, hit_type, C13_Diff_num, result_entry.getEValue(), result_entry.getScoreCount(), result_entry.getRSquare(), result_entry.getSlope(), result_entry.getIntercept(), result_entry.getStartIdx(), result_entry.getEndIdx(), result_entry.getChainScore1(), result_entry.getChainRank1(), result_entry.getChainScore2(), result_entry.getChainRank2(), result_entry.getCandidateNum(), cal_evalue, spectrumEntry.mgfTitle); - } - - private void linearScan(SpectrumEntry spectrumEntry, SparseVector xcorrPL, ChainEntry chainEntry, int binInx, TreeMap binChainMap, TreeMap> binScoresMap, List debugEntryList, Map devChainScoreMap) { + private void linearScan(SparseVector xcorrPL, ChainEntry chainEntry, int binInx, TreeMap binChainMap, TreeMap> binScoresMap, List debugEntryList, Map devChainScoreMap, int precursorCharge, double precursorMass) { for (short link_site_1 : chainEntry.link_site_set) { // generate theoretical fragment ion bins and calculate XCorr. - double dot_product = mass_tool_obj.generateTheoFragmentAndCalXCorr(chainEntry.seq, link_site_1, spectrumEntry.precursor_mass - chainEntry.chain_mass, spectrumEntry.precursor_charge, xcorrPL); + double dot_product = mass_tool_obj.generateTheoFragmentAndCalXCorr(chainEntry.seq, link_site_1, precursorMass - chainEntry.chain_mass, precursorCharge, xcorrPL); if (ECL2.debug) { debugEntryList.add(new DebugEntry(chainEntry.seq, link_site_1, chainEntry.chain_mass, dot_product)); @@ -320,39 +245,4 @@ private void linearScan(SpectrumEntry spectrumEntry, SparseVector xcorrPL, Chain } } } - - private int getC13Num(float exp_mass, float theo_mass) { - float min_diff = 10; - int num = 0; - - for (int i : C13_correction_range) { - float temp = Math.abs(exp_mass + i * 1.00335483f - theo_mass); - if (temp < min_diff) { - min_diff = temp; - num = i; - } - } - - return num; - } - - private String addFixMod(String seq, int linkSite) { - AA[] aaList = MassTool.seqToAAList(seq); - StringBuilder sb = new StringBuilder(seq.length() * 3); - for (int i = 0; i < aaList.length; ++i) { - AA aa = aaList[i]; - if (i == linkSite) { // priority order: linkSite > fixMod > varMod - sb.append(aa.aa); - } else if (Math.abs(fix_mod_map.get(aa.aa)) > 1e-6) { - sb.append(String.format(Locale.US, "%c[%.3f]", aa.aa, fix_mod_map.get(aa.aa))); - } else { - if (Math.abs(aa.delta_mass) > 1e-6) { - sb.append(String.format(Locale.US, "%c[%.3f]", aa.aa, aa.delta_mass)); - } else { - sb.append(aa.aa); - } - } - } - return sb.toString(); - } } \ No newline at end of file diff --git a/src/main/java/proteomics/Search/SearchWrap.java b/src/main/java/proteomics/Search/SearchWrap.java index 82d0915..2cce015 100644 --- a/src/main/java/proteomics/Search/SearchWrap.java +++ b/src/main/java/proteomics/Search/SearchWrap.java @@ -5,41 +5,63 @@ import proteomics.Spectrum.PreSpectrum; import proteomics.TheoSeq.MassTool; import proteomics.Types.*; +import uk.ac.ebi.pride.tools.jmzreader.JMzReader; +import uk.ac.ebi.pride.tools.jmzreader.JMzReaderException; import java.io.BufferedWriter; import java.io.FileWriter; import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeMap; +import java.sql.*; +import java.util.*; import java.util.concurrent.Callable; +import java.util.concurrent.locks.ReentrantLock; -public class SearchWrap implements Callable { +import static proteomics.ECL2.C13CorrectionRange; + +public class SearchWrap implements Callable { private final Search search_obj; - private final SpectrumEntry spectrumEntry; private final BuildIndex build_index_obj; private final PreSpectrum preSpectrumObj; - private final Map> seqProMap; private final boolean cal_evalue; private final double delta_c_t; + private final JMzReader spectraParser; + private final ReentrantLock lock; + private final String scanId; + private final int precursorCharge; + private final double massWithoutLinker; + private final double precursorMass; + private final Connection sqlConnection; - public SearchWrap(Search search_obj, SpectrumEntry spectrumEntry, BuildIndex build_index_obj, MassTool mass_tool_obj, Map> seqProMap, boolean cal_evalue, float delta_c_t, boolean flankingPeaks) { + public SearchWrap(Search search_obj, BuildIndex build_index_obj, MassTool mass_tool_obj, boolean cal_evalue, double delta_c_t, boolean flankingPeaks, JMzReader spectraParser, ReentrantLock lock, String scanId, int precursorCharge, double massWithoutLinker, double precursorMass, Connection sqlConnection) { this.search_obj = search_obj; - this.spectrumEntry = spectrumEntry; this.build_index_obj = build_index_obj; preSpectrumObj = new PreSpectrum(mass_tool_obj, flankingPeaks); - this.seqProMap = seqProMap; this.cal_evalue = cal_evalue; this.delta_c_t = delta_c_t; + this.spectraParser = spectraParser; + this.lock = lock; + this.scanId = scanId; + this.precursorCharge = precursorCharge; + this.massWithoutLinker = massWithoutLinker; + this.precursorMass = precursorMass; + this.sqlConnection = sqlConnection; } @Override - public FinalResultEntry call() throws IOException { - SparseVector xcorrPL = preSpectrumObj.preSpectrum(spectrumEntry.originalPlMap, spectrumEntry.precursor_mass, spectrumEntry.scan_num); + public Boolean call() throws IOException, JMzReaderException, SQLException { + Map rawPLMap; + try { + lock.lock(); + // Reading peak list. + rawPLMap = spectraParser.getSpectrumById(scanId).getPeakList(); + } finally { + lock.unlock(); + } + + SparseVector xcorrPL = preSpectrumObj.preSpectrum(rawPLMap, precursorMass, scanId); if (ECL2.debug) { - BufferedWriter writer = new BufferedWriter(new FileWriter(spectrumEntry.scan_num + ".xcorr.spectrum.csv")); + BufferedWriter writer = new BufferedWriter(new FileWriter(scanId + ".xcorr.spectrum.csv")); writer.write("bin_idx,intensity\n"); for (int idx : xcorrPL.getIdxSet()) { writer.write(idx + "," + xcorrPL.get(idx) + "\n"); @@ -47,30 +69,173 @@ public FinalResultEntry call() throws IOException { writer.close(); } TreeMap> binScoresMap = new TreeMap<>(); - ResultEntry resultEntry = search_obj.doSearch(spectrumEntry, xcorrPL, binScoresMap); + ResultEntry resultEntry = search_obj.doSearch(xcorrPL, binScoresMap, massWithoutLinker, precursorCharge, precursorMass, scanId); if (resultEntry != null) { if (1 - (resultEntry.getSecondScore() / resultEntry.getScore()) >= delta_c_t) { if (cal_evalue) { double originalTolerance; if (search_obj.ms1_tolerance_unit == 1) { - originalTolerance = resultEntry.spectrum_mass * search_obj.ms1_tolerance * 1e-6f; + originalTolerance = precursorMass * search_obj.ms1_tolerance * 1e-6; } else { originalTolerance = search_obj.ms1_tolerance; } - CalEValue.calEValue(spectrumEntry.scan_num, resultEntry, build_index_obj, binScoresMap, build_index_obj.linker_mass, originalTolerance, xcorrPL, search_obj.single_chain_t); + CalEValue.calEValue(scanId, resultEntry, build_index_obj, binScoresMap, precursorCharge, massWithoutLinker, precursorMass, originalTolerance, xcorrPL, search_obj.single_chain_t); if (resultEntry.getEValue() != 9999) { - return search_obj.convertResultEntry(spectrumEntry.scan_num, resultEntry, seqProMap, spectrumEntry); + recordResult(scanId, resultEntry, precursorMass); + return true; } else { - return null; + return false; } } else { - return search_obj.convertResultEntry(spectrumEntry.scan_num, resultEntry, seqProMap, spectrumEntry); + recordResult(scanId, resultEntry, precursorMass); + return true; } } else { - return null; + return false; + } + } else { + return false; + } + } + + private void recordResult(String scanId, ResultEntry result_entry, double precursorMass) throws SQLException { + Map chainEntryMap = build_index_obj.getSeqEntryMap(); + Map> seqProMap = build_index_obj.getSeqProMap(); + + int rank = 1; + String chain_seq_1 = result_entry.getChain1(); + String chain_seq_2 = result_entry.getChain2(); + ChainEntry chain_entry_1 = chainEntryMap.get(chain_seq_1); + ChainEntry chain_entry_2 = chainEntryMap.get(chain_seq_2); + + double theo_mass = chain_entry_1.chain_mass + chain_entry_2.chain_mass + build_index_obj.linker_mass; + + int C13_Diff_num = getC13Num(precursorMass, theo_mass); + precursorMass += C13_Diff_num * 1.00335483; + double ppm = (precursorMass - theo_mass) * 1e6 / theo_mass; + + Set pro1Set = new TreeSet<>(); + boolean isDecoy1 = false; + for (String temp : seqProMap.get(chain_entry_1.seq.replaceAll("[^A-Znc]", ""))) { + pro1Set.add(temp); + if (temp.startsWith("DECOY")) { // there is no overlapped peptide between target and decoy. + isDecoy1 = true; } + } + Set pro2Set = new TreeSet<>(); + boolean isDecoy2 = false; + for (String temp : seqProMap.get(chain_entry_2.seq.replaceAll("[^A-Znc]", ""))) { + pro2Set.add(temp); + if (temp.startsWith("DECOY")) { // there is no overlapped peptide between target and decoy. + isDecoy2 = true; + } + } + + int hit_type; // 0 = T-T; 1 = D-D; 2 = T-D; + if (isDecoy1 && isDecoy2) { + hit_type = 1; + } else if (!isDecoy1 && !isDecoy2) { + hit_type = 0; } else { - return null; + hit_type = 2; + } + + String cl_type = "intra_protein"; + boolean keep = false; + for (String temp_1 : seqProMap.get(chain_entry_1.seq.replaceAll("[^A-Znc]", ""))) { + if (temp_1.startsWith("DECOY_")) { + temp_1 = temp_1.substring(6); + } + for (String temp_2 : seqProMap.get(chain_entry_2.seq.replaceAll("[^A-Znc]", ""))) { + if (temp_2.startsWith("DECOY_")) { + temp_2 = temp_2.substring(6); + } + if (temp_1.contentEquals(temp_2)) { // bias to intra protein cross-linking + keep = true; + break; + } + } + if (keep) { + break; + } + } + + if (!keep) { + cl_type = "inter_protein"; + } + + double delta_c = 1 - (result_entry.getSecondScore() / result_entry.getScore()); + + // add fix modification to the sequences. + String final_seq_1 = addFixMod(chain_seq_1, result_entry.getLinkSite1()); + String final_seq_2 = addFixMod(chain_seq_2, result_entry.getLinkSite2()); + + Statement sqlStatement = sqlConnection.createStatement(); + ResultSet sqlResultSet = sqlStatement.executeQuery(String.format(Locale.US, "SELECT scanNum, scanId, precursorCharge, precursorMz, precursorMass, rt, massWithoutLinker, mgfTitle, isotopeCorrectionNum, ms1PearsonCorrelationCoefficient, score, hitType FROM spectraTable WHERE scanId='%s'", scanId)); + if (sqlResultSet.next()) { + boolean needUpdate = false; + int hitTypeOld = sqlResultSet.getInt("hitType"); + if (!sqlResultSet.wasNull()) { + double scoreOld = sqlResultSet.getDouble("score"); + if (result_entry.getScore() > scoreOld || (result_entry.getScore() == scoreOld && hitTypeOld != 0 && hit_type == 1)) { + needUpdate = true; + } + } else { + needUpdate = true; + } + if (needUpdate) { + int scanNum = sqlResultSet.getInt("scanNum"); + int precursorCharge = sqlResultSet.getInt("precursorCharge"); + double precursorMz = sqlResultSet.getDouble("precursorMz"); + int rt = sqlResultSet.getInt("rt"); + double massWithoutLinker = sqlResultSet.getDouble("massWithoutLinker"); + String mgtTitle = sqlResultSet.getString("mgfTitle"); + int isotopeCorrectionNum = sqlResultSet.getInt("isotopeCorrectionNum"); + double ms1PearsonCorrelationCoefficient = sqlResultSet.getDouble("ms1PearsonCorrelationCoefficient"); + sqlStatement.executeUpdate(String.format(Locale.US, "DELETE FROM spectraTable WHERE scanId=%s", scanId)); + sqlStatement.executeUpdate(String.format(Locale.US, "INSERT INTO spectraTable (scanNum, scanId, precursorCharge, precursorMz, precursorMass, rt, massWithoutLinker, mgfTitle, isotopeCorrectionNum, ms1PearsonCorrelationCoefficient, theoMass, score, deltaC, rank, ppm, seq1, linkSite1, proId1, seq2, linkSite2, proId2, clType, hitType, eValue, candidateNum, pointCount, rSquare, slope, intercept, startIdx, endIdx, chainScore1, chainRank1, chainScore2, chainRank2) VALUES (%d, '%s', %d, %f, %f, %d, %f, '%s', %d, %f, %f, %f, %f, %d, %f, '%s', %d, '%s', '%s', %d, '%s', '%s', %d, %f, %d, %d, %f, %f, %f, %d, %d, %f, %d, %f, %d)", scanNum, scanId, precursorCharge, precursorMz, precursorMass, rt, massWithoutLinker, mgtTitle, isotopeCorrectionNum, ms1PearsonCorrelationCoefficient, theo_mass, result_entry.getScore(), delta_c, rank, ppm, final_seq_1, result_entry.getLinkSite1(), String.join(";", pro1Set), final_seq_2, result_entry.getLinkSite2(), String.join(";", pro2Set), cl_type, hit_type, result_entry.getEValue(), result_entry.getCandidateNum(), result_entry.getScoreCount(), result_entry.getRSquare(), result_entry.getSlope(), result_entry.getIntercept(), result_entry.getStartIdx(), result_entry.getEndIdx(), result_entry.getChainScore1(), result_entry.getChainRank1(), result_entry.getChainScore2(), result_entry.getChainRank2())); + } + } else { + throw new NullPointerException(String.format(Locale.US, "There is no record %s in the spectraTable.", scanId)); + } + + sqlResultSet.close(); + sqlStatement.close(); + } + + private int getC13Num(double exp_mass, double theo_mass) { + double min_diff = 10; + int num = 0; + + for (int i : C13CorrectionRange) { + double temp = Math.abs(exp_mass + i * 1.00335483 - theo_mass); + if (temp < min_diff) { + min_diff = temp; + num = i; + } + } + + return num; + } + + private String addFixMod(String seq, int linkSite) { + Map fix_mod_map = build_index_obj.getFixModMap(); + AA[] aaList = MassTool.seqToAAList(seq); + StringBuilder sb = new StringBuilder(seq.length() * 3); + for (int i = 0; i < aaList.length; ++i) { + AA aa = aaList[i]; + if (i == linkSite) { // priority order: linkSite > fixMod > varMod + sb.append(aa.aa); + } else if (Math.abs(fix_mod_map.get(aa.aa)) > 1e-6) { + sb.append(String.format(Locale.US, "%c[%.3f]", aa.aa, fix_mod_map.get(aa.aa))); + } else { + if (Math.abs(aa.delta_mass) > 1e-6) { + sb.append(String.format(Locale.US, "%c[%.3f]", aa.aa, aa.delta_mass)); + } else { + sb.append(aa.aa); + } + } } + return sb.toString(); } } diff --git a/src/main/java/proteomics/Spectrum/PreSpectra.java b/src/main/java/proteomics/Spectrum/PreSpectra.java index 48e321a..1bad9fc 100644 --- a/src/main/java/proteomics/Spectrum/PreSpectra.java +++ b/src/main/java/proteomics/Spectrum/PreSpectra.java @@ -4,7 +4,6 @@ import org.slf4j.LoggerFactory; import proteomics.ECL2; import proteomics.Index.BuildIndex; -import proteomics.Types.SpectrumEntry; import uk.ac.ebi.pride.tools.jmzreader.JMzReader; import uk.ac.ebi.pride.tools.jmzreader.model.*; import uk.ac.ebi.pride.tools.mzxml_parser.MzXMLFile; @@ -13,6 +12,7 @@ import uk.ac.ebi.pride.tools.mgf_parser.model.Ms2Query; import java.io.*; +import java.sql.*; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -25,10 +25,8 @@ public class PreSpectra { private static final Pattern scanNumPattern2 = Pattern.compile("scan=([0-9]+)", Pattern.CASE_INSENSITIVE); private static final Pattern scanNumPattern3 = Pattern.compile("^[^.]+\\.([0-9]+)\\.[0-9]+\\.[0-9]"); - private Map num_spectrum_map = new HashMap<>(); - private Set debug_scan_num_set = new HashSet<>(); - - public PreSpectra(JMzReader spectra_parser, BuildIndex build_index_obj, Map parameter_map, String ext) throws MzXMLParsingException, IOException { + public PreSpectra(JMzReader spectra_parser, BuildIndex build_index_obj, Map parameter_map, String ext, String sqlPath) throws MzXMLParsingException, IOException, SQLException { + Set debug_scan_num_set = new HashSet<>(); // In DEBUG mode, filter out unlisted scan num if (ECL2.debug) { for (String k : parameter_map.keySet()) { @@ -39,6 +37,18 @@ public PreSpectra(JMzReader spectra_parser, BuildIndex build_index_obj, Map spectrumIterator = spectra_parser.getSpectrumIterator(); while (spectrumIterator.hasNext()) { Spectrum spectrum = spectrumIterator.next(); @@ -74,7 +84,7 @@ public PreSpectra(JMzReader spectra_parser, BuildIndex build_index_obj, Map getNumSpectrumMap() { - return num_spectrum_map; + sqlConnection.commit(); + sqlConnection.setAutoCommit(true); + sqlPrepareStatement.close(); + sqlConnection.close(); + logger.info("Useful MS/MS spectra number: {}.", usefulSpectraNum); } } diff --git a/src/main/java/proteomics/Spectrum/PreSpectrum.java b/src/main/java/proteomics/Spectrum/PreSpectrum.java index b8e24df..82e067b 100644 --- a/src/main/java/proteomics/Spectrum/PreSpectrum.java +++ b/src/main/java/proteomics/Spectrum/PreSpectrum.java @@ -21,7 +21,7 @@ public PreSpectrum(MassTool mass_tool_obj, boolean flankingPeaks) { this.flankingPeaks =flankingPeaks; } - public SparseVector preSpectrum (Map peaks_map, float precursor_mass, int scanNum) throws IOException { + public SparseVector preSpectrum (Map peaks_map, double precursor_mass, String scanId) throws IOException { // sqrt the intensity Map sqrt_pl_map = new HashMap<>(peaks_map.size() + 1, 1); for (double mz : peaks_map.keySet()) { @@ -37,7 +37,7 @@ public SparseVector preSpectrum (Map peaks_map, float precursor_ double[] normalizedSpectrum = normalizeSpec(pl_array); if (ECL2.debug) { - BufferedWriter writer = new BufferedWriter(new FileWriter(scanNum + ".normalized.spectrum.csv")); + BufferedWriter writer = new BufferedWriter(new FileWriter(scanId + ".normalized.spectrum.csv")); writer.write("bin_idx,intensity\n"); for (int i = 0; i < normalizedSpectrum.length; ++i) { writer.write(i + "," + normalizedSpectrum[i] + "\n"); diff --git a/src/main/java/proteomics/Types/FinalResultEntry.java b/src/main/java/proteomics/Types/FinalResultEntry.java deleted file mode 100644 index 799bb85..0000000 --- a/src/main/java/proteomics/Types/FinalResultEntry.java +++ /dev/null @@ -1,135 +0,0 @@ -package proteomics.Types; - - -public class FinalResultEntry implements Comparable { - - public final int scan_num; - public final String spectrum_id; - public final int rank; - public final int charge; - public final float spectrum_mz; - public final float spectrum_mass; - public final float peptide_mass; - public final float rt; - public final float ppm; - public final double score; - public final double delta_c; - public final String seq_1; - public final int link_site_1; - public final String pro_id_1; - public final String seq_2; - public final int link_site_2; - public final String pro_id_2; - public final String cl_type; - public final int hit_type; // 0 = T-T; 1 = D-D; 2 = T-D; - public final int C13_correction; - public final double e_value; - public final double negative_log10_evalue; - public float qvalue = -1; - public final String mgfTitle; - - private final int hashCode; - - public final long candidate_num; - - public int point_count; - public float r_square; - public float slope; - public float intercept; - public int start_idx; - public int end_idx; - - public double chain_score_1; - public int chain_rank_1; - public double chain_score_2; - public int chain_rank_2; - - private final boolean cal_evalue; - - public FinalResultEntry(int scan_num, String spectrum_id, int rank, int charge, float spectrum_mz, float spectrum_mass, float peptide_mass, float rt, float ppm, double score, double delta_c, String seq_1, int link_site_1, String pro_id_1, String seq_2, int link_site_2, String pro_id_2, String cl_type, int hit_type, int C13_correction, double e_value, int point_count, float r_square, float slope, float intercept, int start_idx, int end_idx, double chain_score_1, int chain_rank_1, double chain_score_2, int chain_rank_2, long candidate_num, boolean cal_evalue, String mgfTitle) { - this.scan_num = scan_num; - this.spectrum_id = spectrum_id; - this.rank = rank; - this.charge = charge; - this.spectrum_mz = spectrum_mz; - this.spectrum_mass = spectrum_mass; - this.peptide_mass = peptide_mass; - this.rt = rt; - this.ppm = ppm; - this.score = score; - this.delta_c = delta_c; - if (seq_1.compareTo(seq_2) < 0) { - this.seq_1 = seq_1; - this.link_site_1 = link_site_1; - this.pro_id_1 = pro_id_1; - this.seq_2 = seq_2; - this.link_site_2 = link_site_2; - this.pro_id_2 = pro_id_2; - } else { - this.seq_1 = seq_2; - this.link_site_1 = link_site_2; - this.pro_id_1 = pro_id_2; - this.seq_2 = seq_1; - this.link_site_2 = link_site_1; - this.pro_id_2 = pro_id_1; - } - this.cl_type = cl_type; - this.hit_type = hit_type; - this.C13_correction = C13_correction; - this.e_value = e_value; - negative_log10_evalue = -1 * Math.log10(e_value); - - this.point_count = point_count; - this.r_square = r_square; - this.slope = slope; - this.intercept = intercept; - this.start_idx = start_idx; - this.end_idx = end_idx; - - this.chain_score_1 = chain_score_1; - this.chain_rank_1 = chain_rank_1; - this.chain_score_2 = chain_score_2; - this.chain_rank_2 = chain_rank_2; - - this.candidate_num = candidate_num; - - String toString = scan_num + "-" + seq_1 + "-" + link_site_1 + "-" + seq_2 + link_site_2; - hashCode = toString.hashCode(); - - this.cal_evalue = cal_evalue; - - this.mgfTitle = mgfTitle; - } - - public int compareTo(FinalResultEntry other) { - if (cal_evalue) { - if (this.negative_log10_evalue > other.negative_log10_evalue) { - return 1; - } else if (this.negative_log10_evalue < other.negative_log10_evalue) { - return -1; - } else { - return 0; - } - } else { - if (this.score > other.score) { - return 1; - } else if (this.score < other.score) { - return -1; - } else { - return 0; - } - } - } - - public int hashCode() { - return hashCode; - } - - public boolean equals(Object other) { - if (other instanceof FinalResultEntry) { - return ((FinalResultEntry) other).hashCode == hashCode; - } else { - return false; - } - } -} diff --git a/src/main/java/proteomics/Types/ResultEntry.java b/src/main/java/proteomics/Types/ResultEntry.java index 90a88a8..96a5770 100644 --- a/src/main/java/proteomics/Types/ResultEntry.java +++ b/src/main/java/proteomics/Types/ResultEntry.java @@ -13,10 +13,6 @@ public class ResultEntry{ private static final double max_score = 20; public final String spectrum_id; - public final float spectrum_mass; - public final float spectrum_mz; - public final float rt; - public final int charge; private final TreeMap binChainMap; @@ -42,15 +38,11 @@ public class ResultEntry{ private double chain_score_2; private int chain_rank_2; - public ResultEntry(String spectrum_id, float spectrum_mz, float spectrum_mass, float rt, int charge, boolean cal_evalue, TreeMap binChainMap) { + public ResultEntry(String spectrum_id, boolean cal_evalue, TreeMap binChainMap) { if (cal_evalue) { score_histogram = new int[(int) Math.round(max_score * inverseHistogramBinSize) + 1]; // start from zero score. } this.spectrum_id = spectrum_id; - this.spectrum_mz = spectrum_mz; - this.spectrum_mass = spectrum_mass; - this.rt = rt; - this.charge = charge; this.binChainMap = binChainMap; } diff --git a/src/main/java/proteomics/Types/SpectrumEntry.java b/src/main/java/proteomics/Types/SpectrumEntry.java deleted file mode 100644 index ce3730d..0000000 --- a/src/main/java/proteomics/Types/SpectrumEntry.java +++ /dev/null @@ -1,40 +0,0 @@ -package proteomics.Types; - -import java.util.Map; - -public class SpectrumEntry { - public final int scan_num; - public final String spectrum_id; - public final float precursor_mz; - public final float precursor_mass; - public final float rt; - public final float mass_without_linker_mass; - public final int precursor_charge; - public final String mgfTitle; - public Map originalPlMap; - - public SpectrumEntry(int scan_num, String spectrum_id, float precursor_mz, float precursor_mass, int precursor_charge, float rt, Map originalPlMap, float linker_mass, String mgfTitle) { - this.scan_num = scan_num; - this.spectrum_id = spectrum_id; - this.precursor_mz = precursor_mz; - this.precursor_mass = precursor_mass; - mass_without_linker_mass = precursor_mass - linker_mass; - this.precursor_charge = precursor_charge; - this.rt = rt; - this.mgfTitle = mgfTitle; - this.originalPlMap = originalPlMap; - } - - public int hashCode() { - return scan_num; - } - - public boolean equals(Object other) { - if (other instanceof SpectrumEntry) { - SpectrumEntry temp = (SpectrumEntry) other; - return (temp.scan_num == scan_num); - } else { - return false; - } - } -} \ No newline at end of file diff --git a/src/main/java/proteomics/Validation/CalFDR.java b/src/main/java/proteomics/Validation/CalFDR.java index 97ffc84..e836e2a 100644 --- a/src/main/java/proteomics/Validation/CalFDR.java +++ b/src/main/java/proteomics/Validation/CalFDR.java @@ -1,77 +1,68 @@ package proteomics.Validation; -import proteomics.Types.FinalResultEntry; - -import java.util.List; +import java.sql.*; +import java.util.*; public class CalFDR { - private final float inversePrecision; - - private double min_score = 999; - private double max_score = -999; - private float[] qvalue_array = null; - private List results; - - public CalFDR(List results, boolean cal_evalue) { - this.results = results; + public static Map calFDR(String sqlPath, boolean cal_evalue, String clType) throws SQLException { + double min_score = 999; + double max_score = -999; + double inversePrecision; if (cal_evalue) { inversePrecision = 10; } else { inversePrecision = 1000; } - // find the max score - for (FinalResultEntry entry : results) { - if (cal_evalue) { - if (entry.negative_log10_evalue > max_score) { - max_score = entry.negative_log10_evalue; - } - if (entry.negative_log10_evalue < min_score) { - min_score = entry.negative_log10_evalue; + + String scoreType = "score"; + if (cal_evalue) { + scoreType = "eValue"; + } + + Map scanIdEntryMap = new HashMap<>(); + Connection sqlConnection = DriverManager.getConnection(sqlPath); + Statement sqlStatement = sqlConnection.createStatement(); + ResultSet sqlResultSet = sqlStatement.executeQuery(String.format(Locale.US, "SELECT scanId, %s, hitType FROM spectraTable WHERE clType='%s'", scoreType, clType)); + while (sqlResultSet.next()) { + int hitType = sqlResultSet.getInt("hitType"); + if (!sqlResultSet.wasNull()) { + String scanId = sqlResultSet.getString("scanId"); + double score = sqlResultSet.getDouble(scoreType); + if (cal_evalue) { + score = -1 * Math.log10(score); } - } else { - if (entry.score > max_score) { - max_score = entry.score; + scanIdEntryMap.put(scanId, new Entry(scanId, score, hitType)); + if (score > max_score) { + max_score = score; } - if (entry.score < min_score) { - min_score = entry.score; + if (score < min_score) { + min_score = score; } } } + sqlResultSet.close(); + sqlStatement.close(); + sqlConnection.close(); final int array_length = 1 + (int) Math.ceil((max_score - min_score) * inversePrecision); - float[] decoy_count_vector = new float[array_length]; - float[] target_count_vector = new float[array_length]; - float[] fuse_count_vector = new float[array_length]; - float[] fdr_array = new float[array_length]; - qvalue_array = new float[array_length]; - - for (FinalResultEntry re : results) { - if (re.hit_type == 1) { - int idx; - if (cal_evalue) { - idx = (int) Math.floor((re.negative_log10_evalue - min_score) * inversePrecision); - } else { - idx = (int) Math.floor((re.score - min_score) * inversePrecision); - } + double[] decoy_count_vector = new double[array_length]; + double[] target_count_vector = new double[array_length]; + double[] fuse_count_vector = new double[array_length]; + double[] fdr_array = new double[array_length]; + double[] qvalue_array = new double[array_length]; + + for (Entry re : scanIdEntryMap.values()) { + if (re.hitType == 1) { + int idx = (int) Math.round((re.score - min_score) * inversePrecision); ++decoy_count_vector[idx]; - } else if (re.hit_type == 0) { - int idx; - if (cal_evalue) { - idx = (int) Math.floor((re.negative_log10_evalue - min_score) * inversePrecision); - } else { - idx = (int) Math.floor((re.score - min_score) * inversePrecision); - } + } else if (re.hitType == 0) { + int idx = (int) Math.round((re.score - min_score) * inversePrecision); ++target_count_vector[idx]; } else { - int idx; - if (cal_evalue) { - idx = (int) Math.floor((re.negative_log10_evalue - min_score) * inversePrecision); - } else { - idx = (int) Math.floor((re.score - min_score) * inversePrecision); - } + int idx = (int) Math.round((re.score - min_score) * inversePrecision); ++fuse_count_vector[idx]; } } @@ -113,21 +104,26 @@ public CalFDR(List results, boolean cal_evalue) { last_q_value = q_value; } } - } - public List includeStats(boolean cal_evalue) { - for (FinalResultEntry re : results) { - if (re.hit_type == 0) { - if (cal_evalue) { - int idx = (int) Math.floor((re.negative_log10_evalue - min_score) * inversePrecision); - re.qvalue = qvalue_array[idx]; - } else { - int idx = (int) Math.floor((re.score - min_score) * inversePrecision); - re.qvalue = qvalue_array[idx]; - } - } + for (Entry entry : scanIdEntryMap.values()) { + entry.qValue = qvalue_array[(int) Math.round((entry.score - min_score) * inversePrecision)]; } - return results; + return scanIdEntryMap; + } + + public static class Entry { + + public final String scanId; + public final double score; + public final int hitType; + + public double qValue; + + public Entry(String scanId, double score, int hitType) { + this.scanId = scanId; + this.score = score; + this.hitType = hitType; + } } } \ No newline at end of file