Skip to content

Commit

Permalink
Eliminate the uniprot decoy pool.
Browse files Browse the repository at this point in the history
  • Loading branch information
fcyu committed May 22, 2017
1 parent 65676f6 commit 588ccda
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 97 deletions.
1 change: 0 additions & 1 deletion src/main/java/proteomics/ECL2.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ public class ECL2 {
public static final boolean cal_evalue = true;
public static final double delta_c_t = 0;
public static final int score_point_t = 15000;
public static final int random_score_point_t = 2;
public static final boolean flankingPeaks = true;

private static final Logger logger = LoggerFactory.getLogger(ECL2.class);
Expand Down
41 changes: 0 additions & 41 deletions src/main/java/proteomics/Index/BuildIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import org.apache.commons.math3.util.CombinatoricsUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import proteomics.ECL2;
import proteomics.TheoSeq.DbTool;
import proteomics.TheoSeq.MassTool;
import proteomics.Types.AA;
Expand Down Expand Up @@ -31,7 +30,6 @@ public class BuildIndex {
private Map<Integer, Long> bin_candidate_num_map = new HashMap<>();
private Map<String, ChainEntry> seq_entry_map = new HashMap<>();
private Map<String, boolean[]> seq_term_map = new HashMap<>();
private TreeMap<Float, Set<String>> uniprot_decoy_mass_seq_map = new TreeMap<>();
private Set<String> for_check_duplicate = new HashSet<>();
private Map<String, Set<String>> seqProMap;

Expand Down Expand Up @@ -170,41 +168,6 @@ public BuildIndex(Map<String, String> parameter_map) {
}
bin_candidate_num_map.put(bin_index, candidate_num);
}

// read and index large decoy uniprot database
if (ECL2.cal_evalue) {
db_tool_obj = new DbTool(uniprot_decoy_db);
Map<String, String> uniprot_decoy_pro_seq_map = db_tool_obj.getProSeqMap();
for (String pro_seq : uniprot_decoy_pro_seq_map.values()) {
Set<String> seq_set = mass_tool_obj.buildChainSet(pro_seq);
for (String mod_free_seq : seq_set) {
if ((mod_free_seq.length() >= min_chain_length) && (mod_free_seq.length() <= max_chain_length)) {
if (!mod_free_seq.contains("B") && !mod_free_seq.contains("J") && !mod_free_seq.contains("X") && !mod_free_seq.contains("Z")) {
String temp_str = mod_free_seq.replace("L", "!").replace("I", "!").replace("K", "#").replace("Q", "#");
if (!for_check_duplicate.contains(temp_str)) {
for_check_duplicate.add(temp_str);
int temp_int = mod_free_seq.indexOf('K'); // only use the first K
if ((temp_int != -1) && (temp_int != mod_free_seq.length() - 2)) { // only consider chains whose link site is in the middle.
// consider PTM free
float mass = (float) (mass_tool_obj.calResidueMass(mod_free_seq) + MassTool.H2O);
if (mass < max_precursor_mass - linker_mass) {
if (uniprot_decoy_mass_seq_map.containsKey(mass)) {
if (uniprot_decoy_mass_seq_map.get(mass).size() < ECL2.random_score_point_t) { // limit the size of set for the sake of memory.
uniprot_decoy_mass_seq_map.get(mass).add(mod_free_seq + "-" + temp_int);
}
} else {
Set<String> temp = new HashSet<>();
temp.add(mod_free_seq + "-" + temp_int);
uniprot_decoy_mass_seq_map.put(mass, temp);
}
}
}
}
}
}
}
}
}
}

public Map<String, Set<String>> getSeqProMap() {
Expand Down Expand Up @@ -243,10 +206,6 @@ public int massToBin(float mass) {
return (int) Math.floor(mass / ms1_bin_size);
}

public TreeMap<Float, Set<String>> getUniprotDecoyMassSeqMap() {
return uniprot_decoy_mass_seq_map;
}

public Map<Integer, Long> getBinCandidateNumMap() {
return bin_candidate_num_map;
}
Expand Down
101 changes: 50 additions & 51 deletions src/main/java/proteomics/Search/CalEValue.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,44 +3,51 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import proteomics.ECL2;
import proteomics.Index.BuildIndex;
import proteomics.TheoSeq.MassTool;
import proteomics.Types.ChainEntry;
import proteomics.Types.ResultEntry;
import proteomics.Types.SparseBooleanVector;
import proteomics.Types.SparseVector;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.NavigableMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

public class CalEValue {

private static final Logger logger = LoggerFactory.getLogger(CalEValue.class);
private static final float maxTolerance = 20;

private ResultEntry result_entry;
private TreeMap<Float, Set<String>> uniprot_decoy_mass_seq_map;
private TreeMap<Integer, Set<String>> bin_seq_map;
private Map<String, ChainEntry> seq_entry_map;
private BuildIndex buildIndexObj;
private float linker_mass;
private MassTool mass_tool_obj;
private int max_common_ion_charge;
private SparseVector pl_map_xcorr;
private int maxBinIdx;
private float e_value_precursor_mass_tol;
private int specMaxBinIdx;

CalEValue(int scan_num, ResultEntry result_entry, SparseVector pl_map_xcorr, int maxBinIdx, TreeMap<Float, Set<String>> uniprot_decoy_mass_seq_map, MassTool mass_tool_obj, float linker_mass, int max_common_ion_charge, float e_value_precursor_mass_tol) {
CalEValue(int scan_num, ResultEntry result_entry, SparseVector pl_map_xcorr, int specMaxBinIdx, BuildIndex buildIndexObj, MassTool mass_tool_obj, float linker_mass, int max_common_ion_charge, float originalTolerance) {
this.result_entry = result_entry;
this.uniprot_decoy_mass_seq_map = uniprot_decoy_mass_seq_map;
this.bin_seq_map = buildIndexObj.getMassBinSeqMap();
this.seq_entry_map = buildIndexObj.getSeqEntryMap();
this.buildIndexObj = buildIndexObj;
this.linker_mass = linker_mass;
this.mass_tool_obj = mass_tool_obj;
this.max_common_ion_charge = max_common_ion_charge;
this.pl_map_xcorr = pl_map_xcorr;
this.maxBinIdx = maxBinIdx;
this.e_value_precursor_mass_tol = e_value_precursor_mass_tol;
this.specMaxBinIdx = specMaxBinIdx;

int gap_num = ECL2.score_point_t - result_entry.getScoreCount();
if (gap_num >= 0) {
generateRandomRandomScores(gap_num);
float tolerance = originalTolerance;
while (gap_num > 0 && tolerance <= maxTolerance) {
gap_num = generateRandomRandomScores(gap_num, tolerance, tolerance + 1);
tolerance += 1;
}

if (gap_num > 0) {
Expand Down Expand Up @@ -215,25 +222,40 @@ public class CalEValue {
}
}

private int generateRandomRandomScores(int gap_num) {
float precursor_mass = result_entry.spectrum_mass;
float max_mass = (precursor_mass - linker_mass) / 2;
for (float mass_1 : uniprot_decoy_mass_seq_map.keySet()) {
if (mass_1 <= max_mass) {
float left_mass = precursor_mass - linker_mass - mass_1 - e_value_precursor_mass_tol;
float right_mass = precursor_mass - linker_mass - mass_1 + e_value_precursor_mass_tol;
NavigableMap<Float, Set<String>> sub_map = uniprot_decoy_mass_seq_map.subMap(left_mass, true, right_mass, true);
private int generateRandomRandomScores(int gap_num, float previousTolerance, float tolerance) {
int maxBinIdx = buildIndexObj.massToBin((result_entry.spectrum_mass - linker_mass) / 2);
for (int binIdx1 : bin_seq_map.keySet()) {
if (binIdx1 < maxBinIdx) {
int leftBinIdx1 = buildIndexObj.massToBin(result_entry.spectrum_mass - linker_mass - previousTolerance - tolerance) - maxBinIdx;
int rightBinIdx1 = buildIndexObj.massToBin(result_entry.spectrum_mass - linker_mass - previousTolerance) - maxBinIdx - 1;
int leftBinIdx2 = buildIndexObj.massToBin(result_entry.spectrum_mass - linker_mass + previousTolerance) - maxBinIdx + 1;
int rightBinIdx2 = buildIndexObj.massToBin(result_entry.spectrum_mass - linker_mass + previousTolerance + tolerance) - maxBinIdx;
TreeMap<Integer, Set<String>> sub_map = new TreeMap<>();
sub_map.putAll(bin_seq_map.subMap(leftBinIdx1, true, rightBinIdx1, false));
sub_map.putAll(bin_seq_map.subMap(leftBinIdx2, false, rightBinIdx2, true));
if (!sub_map.isEmpty()) {
for (String seq_1_link_site : uniprot_decoy_mass_seq_map.get(mass_1)) {
String[] temp = seq_1_link_site.split("-");
String seq_1 = temp[0];
short link_site = Short.valueOf(temp[1]);
SparseBooleanVector theo_mz_1 = mass_tool_obj.buildTheoVector(seq_1, link_site, precursor_mass - mass_1, result_entry.charge, max_common_ion_charge, maxBinIdx);
double score1 = theo_mz_1.dot(pl_map_xcorr) * 0.005;
if (score1 > Search.single_chain_t) {
gap_num = generateMoreScoresSub(sub_map, seq_1, score1, precursor_mass, gap_num);
if (gap_num < 0) {
return gap_num;
for (String seq1 : bin_seq_map.get(binIdx1)) {
ChainEntry chainEntry1 = seq_entry_map.get(seq1);
for (short linkSite1 : chainEntry1.link_site_set) {
SparseBooleanVector theoMz1 = mass_tool_obj.buildTheoVector(seq1, linkSite1, result_entry.spectrum_mass - chainEntry1.chain_mass, result_entry.charge, max_common_ion_charge, specMaxBinIdx);
double score1 = theoMz1.dot(pl_map_xcorr) * 0.005;
if (score1 > Search.single_chain_t) {
for (int binIdx2 : sub_map.keySet()) {
for (String seq2 : sub_map.get(binIdx2)) {
ChainEntry chainEntry2 = seq_entry_map.get(seq2);
for (short linkSite2 : chainEntry2.link_site_set) {
SparseBooleanVector theoMz2 = mass_tool_obj.buildTheoVector(seq2, linkSite2, result_entry.spectrum_mass - chainEntry2.chain_mass, result_entry.charge, max_common_ion_charge, specMaxBinIdx);
double score2 = theoMz2.dot(pl_map_xcorr) * 0.0005;
if (score2 > Search.single_chain_t) {
result_entry.addToScoreHistogram(score1 + score2);
--gap_num;
if (gap_num <= 0) {
return gap_num;
}
}
}
}
}
}
}
}
Expand All @@ -242,27 +264,4 @@ private int generateRandomRandomScores(int gap_num) {
}
return gap_num;
}

private int generateMoreScoresSub(NavigableMap<Float, Set<String>> sub_map, String seq_1, double score1, float precursor_mass, int gap_num) {
for (float mass_2 : sub_map.keySet()) {
for (String seq_2_link_site : sub_map.get(mass_2)) {
String[] temp = seq_2_link_site.split("-");
String seq_2 = temp[0];
short link_site = Short.valueOf(temp[1]);
if (!seq_1.contentEquals(seq_2)) {
SparseBooleanVector theo_mz_2 = mass_tool_obj.buildTheoVector(seq_2, link_site, precursor_mass - mass_2, result_entry.charge, max_common_ion_charge, maxBinIdx);
double score2 = theo_mz_2.dot(pl_map_xcorr) * 0.005;
if (score2 > Search.single_chain_t) {
double score = score1 + score2;
result_entry.addToScoreHistogram(score);
--gap_num;
if (gap_num < 0) {
return gap_num;
}
}
}
}
}
return gap_num;
}
}
8 changes: 4 additions & 4 deletions src/main/java/proteomics/Search/SearchWrap.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,13 @@ public FinalResultEntry call() {
}
if (1 - (resultEntry.getSecondScore() / resultEntry.getScore()) > ECL2.delta_c_t) {
if (ECL2.cal_evalue) {
float e_value_precursor_mass_tol;
float originalTolerance;
if (search_obj.ms1_tolerance_unit == 1) {
e_value_precursor_mass_tol = resultEntry.spectrum_mass * search_obj.ms1_tolerance * 1e-6f;
originalTolerance = resultEntry.spectrum_mass * search_obj.ms1_tolerance * 1e-6f;
} else {
e_value_precursor_mass_tol = search_obj.ms1_tolerance;
originalTolerance = search_obj.ms1_tolerance;
}
new CalEValue(spectrumEntry.scan_num, resultEntry, xcorrPL, maxBinIdx, build_index_obj.getUniprotDecoyMassSeqMap(), mass_tool_obj, build_index_obj.linker_mass, max_common_ion_charge, e_value_precursor_mass_tol);
new CalEValue(spectrumEntry.scan_num, resultEntry, xcorrPL, specMaxBinIdx, build_index_obj, mass_tool_obj, build_index_obj.linker_mass, max_common_ion_charge, originalTolerance);
if (resultEntry.getEValue() != 9999) {
return search_obj.convertResultEntry(spectrumEntry.scan_num, resultEntry, seqProMap);
} else {
Expand Down

0 comments on commit 588ccda

Please sign in to comment.