From add5476c02126d4faed216ebf498287b3d169bda Mon Sep 17 00:00:00 2001 From: Fengchao Date: Sun, 21 May 2017 12:46:43 +0800 Subject: [PATCH] Improve the preprosessing part abased on Comet. --- .../java/proteomics/Search/SearchWrap.java | 2 +- .../java/proteomics/Spectrum/PreSpectra.java | 21 +--- .../java/proteomics/Spectrum/PreSpectrum.java | 118 ++++++++++++------ .../java/proteomics/Types/SpectrumEntry.java | 6 +- 4 files changed, 82 insertions(+), 65 deletions(-) diff --git a/src/main/java/proteomics/Search/SearchWrap.java b/src/main/java/proteomics/Search/SearchWrap.java index d924aca..0fd1e64 100644 --- a/src/main/java/proteomics/Search/SearchWrap.java +++ b/src/main/java/proteomics/Search/SearchWrap.java @@ -39,7 +39,7 @@ public SearchWrap(Search search_obj, SpectrumEntry spectrumEntry, BuildIndex bui @Override public FinalResultEntry call() { - SparseVector xcorrPL = preSpectrumObj.prepareXcorr(spectrumEntry.originalPlMap, spectrumEntry.precursor_mass); + SparseVector xcorrPL = preSpectrumObj.preSpectrum(spectrumEntry.originalPlMap, spectrumEntry.precursor_mass, spectrumEntry.scan_num); if (ECL2.debug) { try (BufferedWriter writer = new BufferedWriter(new FileWriter(spectrumEntry.scan_num + ".xcorr.spectrum.csv"))) { writer.write("bin_idx,intensity\n"); diff --git a/src/main/java/proteomics/Spectrum/PreSpectra.java b/src/main/java/proteomics/Spectrum/PreSpectra.java index 8fe5f84..fbd8f03 100644 --- a/src/main/java/proteomics/Spectrum/PreSpectra.java +++ b/src/main/java/proteomics/Spectrum/PreSpectra.java @@ -102,31 +102,12 @@ public void write(int b) throws IOException {} continue; } - TreeMap originalPlMap = pre_spectrum_obj.preSpectrum(raw_mz_intensity_map, precursor_mass); - - if (ECL2.debug) { - try (BufferedWriter writer = new BufferedWriter(new FileWriter(Integer.valueOf(spectrum.getId()) + ".normalized.spectrum.csv"))) { - writer.write("mz,intensity\n"); - for (float mz : originalPlMap.keySet()) { - writer.write(mz + "," + originalPlMap.get(mz) + "\n"); - } - } catch (IOException ex) { - ex.printStackTrace(); - logger.error(ex.getMessage()); - System.exit(1); - } - } - - if (originalPlMap.size() <= min_peak_num) { - continue; - } - int scan_num = Integer.valueOf(spectrum.getId()); Scan scan = spectra_parser.getScanByNum((long) scan_num); float rt = scan.getRetentionTime().getSeconds(); - SpectrumEntry spectrum_entry = new SpectrumEntry(scan_num, spectrum.getId(), precursor_mz, precursor_mass, precursor_charge, rt, originalPlMap, build_index_obj.linker_mass); + SpectrumEntry spectrum_entry = new SpectrumEntry(scan_num, spectrum.getId(), precursor_mz, precursor_mass, precursor_charge, rt, raw_mz_intensity_map, build_index_obj.linker_mass); num_spectrum_map.put(scan_num, spectrum_entry); } } catch (MzXMLParsingException ex) { diff --git a/src/main/java/proteomics/Spectrum/PreSpectrum.java b/src/main/java/proteomics/Spectrum/PreSpectrum.java index f825c80..4af5f4a 100644 --- a/src/main/java/proteomics/Spectrum/PreSpectrum.java +++ b/src/main/java/proteomics/Spectrum/PreSpectrum.java @@ -1,13 +1,19 @@ package proteomics.Spectrum; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import proteomics.ECL2; import proteomics.TheoSeq.MassTool; import proteomics.Types.SparseVector; +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.IOException; import java.util.*; public class PreSpectrum { - private static final float DEFAULT_INTENSITY = 1; // DO NOT change. Otherwise, change the whole project accordingly. + private static final Logger logger = LoggerFactory.getLogger(PreSpectrum.class); private static final int XCORR_OFFSET = 75; private final MassTool mass_tool_obj; @@ -16,22 +22,48 @@ public PreSpectrum(MassTool mass_tool_obj) { this.mass_tool_obj = mass_tool_obj; } - TreeMap preSpectrum (Map peaks_map, float precursor_mass) { - return normalizeSpec(peaks_map, precursor_mass); + public SparseVector preSpectrum (Map peaks_map, float precursor_mass, int scanNum) { + // sqrt the intensity + Map sqrt_pl_map = new HashMap<>(); + for (double mz : peaks_map.keySet()) { + if ((peaks_map.get(mz) > 1e-6) && (mz < precursor_mass)) { + sqrt_pl_map.put(mz, Math.sqrt(peaks_map.get(mz))); + } + } + + // digitize the spectrum + double[] pl_array = digitizeSpec(sqrt_pl_map); + + // normalize the spectrum + double[] normalizedSpectrum = normalizeSpec(pl_array); + + if (ECL2.debug) { + try (BufferedWriter writer = new BufferedWriter(new FileWriter(scanNum + ".normalized.spectrum.csv"))) { + writer.write("bin_idx,intensity\n"); + for (int i = 0; i < normalizedSpectrum.length; ++i) { + writer.write(i + "," + normalizedSpectrum[i] + "\n"); + } + } catch (IOException ex) { + ex.printStackTrace(); + logger.error(ex.getMessage()); + System.exit(1); + } + } + + return prepareXcorr(normalizedSpectrum); } // prepare for XCorr - public SparseVector prepareXcorr(TreeMap pl_map, float precursor_mass) { - float[] pl_array = digitizeSpec(pl_map, precursor_mass); - + private SparseVector prepareXcorr(double[] pl_array) { SparseVector xcorr_pl = new SparseVector(); - float my_sum = 0; int offset_range = 2 * XCORR_OFFSET + 1; + double factor = 1 / (double) (offset_range - 1); // caution: 1/150 rather than 1/151 + double my_sum = 0; for (int i = 0; i < XCORR_OFFSET; ++i) { my_sum += pl_array[i]; } - float factor = 1 / (float) (offset_range - 1); // caution: 1/150 rather than 1/151 + double[] tempArray = new double[pl_array.length]; for (int i = XCORR_OFFSET; i < pl_array.length + XCORR_OFFSET; ++i) { if (i < pl_array.length) { my_sum += pl_array[i]; @@ -39,59 +71,63 @@ public SparseVector prepareXcorr(TreeMap pl_map, float precursor_m if (i >= offset_range) { my_sum -= pl_array[i - offset_range]; } - float temp = (pl_array[i - XCORR_OFFSET] - (my_sum - pl_array[i - XCORR_OFFSET]) * factor); + tempArray[i - XCORR_OFFSET] = (my_sum - pl_array[i - XCORR_OFFSET]) * factor; + } + + for (int i = 1; i < pl_array.length; ++i) { + double temp = pl_array[i] - tempArray[i]; if (Math.abs(temp) > 1e-6) { - xcorr_pl.put(i - XCORR_OFFSET, temp); + xcorr_pl.put(i, (float) temp); } } return xcorr_pl; } - TreeMap normalizeSpec(Map pl_map, float precursor_mass) { - // sqrt the intensity and find the highest intensity. - TreeMap sqrt_pl_map = new TreeMap<>(); - for (double mz : pl_map.keySet()) { - if ((pl_map.get(mz) > 1e-6) && (mz < precursor_mass)) { - float sqrt_intensity = (float) Math.sqrt(pl_map.get(mz)); - sqrt_pl_map.put((float) mz, sqrt_intensity); + private double[] normalizeSpec(double[] plArray) { + double maxIntensity = 0; + for (double intensity : plArray) { + if (intensity > maxIntensity) { + maxIntensity = intensity; } } - // divide the spectrum into 10 windows and normalize each windows to DEFAULT_INTENSITY - TreeMap windowed_pl_map = new TreeMap<>(); - float min_mz = sqrt_pl_map.firstKey(); - float max_mz = sqrt_pl_map.lastKey(); - float window_size = (max_mz - min_mz) / 10 + 1; + double[] normalizedPlArray = new double[plArray.length]; + int windowSize = (plArray.length / 10) + 1; for (int i = 0; i < 10; ++i) { // find the max intensity in each window - float left_mz = Math.min(min_mz + i * window_size, max_mz); - float right_mz = Math.min(left_mz + window_size, max_mz); - NavigableMap sub_map; - if (right_mz < max_mz) { - sub_map = sqrt_pl_map.subMap(left_mz, true, right_mz, false); - } else { - sub_map = sqrt_pl_map.subMap(left_mz, true, right_mz, true); + double maxWindowIntensity = 0; + for (int j = 0; j < windowSize; ++j) { + int idx = i * windowSize + j; + if (idx < plArray.length) { + if (plArray[idx] > maxWindowIntensity) { + maxWindowIntensity = plArray[idx]; + } + } } - if (!sub_map.isEmpty()) { - Float[] intensity_array = sub_map.values().toArray(new Float[sub_map.size()]); - Arrays.sort(intensity_array); - float temp_1 = DEFAULT_INTENSITY / intensity_array[intensity_array.length - 1]; - float temp_2 = (float) 0.05 * intensity_array[intensity_array.length - 1]; - for (float mz : sub_map.keySet()) { - if (sub_map.get(mz) > temp_2) { - windowed_pl_map.put(mz, sub_map.get(mz) * temp_1); + + if (maxWindowIntensity > 0) { + double temp1 = 50 / maxWindowIntensity; + double temp2 = 0.05f * maxIntensity; + for (int j = 0; j < windowSize; ++j) { + int idx = i * windowSize + j; + if (idx < plArray.length) { + if (plArray[idx] > temp2) { + normalizedPlArray[idx] = plArray[idx] * temp1; + } } } } } - return windowed_pl_map; + return normalizedPlArray; } - float[] digitizeSpec(TreeMap pl, float precursor_mass) { - float[] digitized_pl = new float[mass_tool_obj.mzToBin(precursor_mass) + 1]; - for (float mz : pl.keySet()) { + private double[] digitizeSpec(Map pl) { + Double[] mzArray = pl.keySet().toArray(new Double[pl.size()]); + Arrays.sort(mzArray); + double[] digitized_pl = new double[mass_tool_obj.mzToBin(mzArray[mzArray.length - 1]) + 1]; + for (double mz : pl.keySet()) { int idx = mass_tool_obj.mzToBin(mz); digitized_pl[idx] = Math.max(pl.get(mz), digitized_pl[idx]); } diff --git a/src/main/java/proteomics/Types/SpectrumEntry.java b/src/main/java/proteomics/Types/SpectrumEntry.java index 98fcf2e..1dfe6a3 100644 --- a/src/main/java/proteomics/Types/SpectrumEntry.java +++ b/src/main/java/proteomics/Types/SpectrumEntry.java @@ -1,6 +1,6 @@ package proteomics.Types; -import java.util.TreeMap; +import java.util.Map; public class SpectrumEntry { public final int scan_num; @@ -10,10 +10,10 @@ public class SpectrumEntry { public final float rt; public final float mass_without_linker_mass; public final int precursor_charge; - public TreeMap originalPlMap; + public Map originalPlMap; private final String to_string; - public SpectrumEntry(int scan_num, String spectrum_id, float precursor_mz, float precursor_mass, int precursor_charge, float rt, TreeMap originalPlMap, float linker_mass) { + public SpectrumEntry(int scan_num, String spectrum_id, float precursor_mz, float precursor_mass, int precursor_charge, float rt, Map originalPlMap, float linker_mass) { this.scan_num = scan_num; this.spectrum_id = spectrum_id; this.precursor_mz = precursor_mz;