diff --git a/pom.xml b/pom.xml index f1b69e7..bb85f78 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ hk.ust.bioinformatics ECL2 - 2.1.4-dev-1c9b46a + 2.1.4-dev-201705262001 jar ECL2 diff --git a/src/main/java/proteomics/ECL2.java b/src/main/java/proteomics/ECL2.java index 1618075..489eb1a 100644 --- a/src/main/java/proteomics/ECL2.java +++ b/src/main/java/proteomics/ECL2.java @@ -25,7 +25,7 @@ public class ECL2 { public static final boolean flankingPeaks = true; private static final Logger logger = LoggerFactory.getLogger(ECL2.class); - public static final String version = "2.1.4-dev-1c9b46a"; + public static final String version = "2.1.4-dev-201705262001"; public static boolean debug; public static boolean dev; diff --git a/src/main/java/proteomics/Index/BuildIndex.java b/src/main/java/proteomics/Index/BuildIndex.java index 8407448..ce5473e 100644 --- a/src/main/java/proteomics/Index/BuildIndex.java +++ b/src/main/java/proteomics/Index/BuildIndex.java @@ -6,6 +6,7 @@ import proteomics.TheoSeq.DbTool; import proteomics.TheoSeq.MassTool; import proteomics.Types.AA; +import proteomics.Types.BinaryModParam; import proteomics.Types.ChainEntry; import proteomics.Types.VarModParam; @@ -16,7 +17,7 @@ public class BuildIndex { private static final Logger logger = LoggerFactory.getLogger(BuildIndex.class); - private static final Pattern varModParamPattern = Pattern.compile("([0-9.-]+)\\s+([A-Znc]+)"); + private static final Pattern varModParamPattern = Pattern.compile("([0-9.-]+)\\s+([A-Znc]+)\\s+([01])"); private static final int globalVarModMaxNum = 5; // Do not change this value. Otherwise, change generateLocalIdxModMassMap accordingly. 
private static final float varModMassResolution = 0.01f; @@ -94,26 +95,27 @@ public BuildIndex(Map parameter_map) { seqProMap = buildSeqProMap(pro_seq_map, min_chain_length, max_chain_length); // read var mods - Set varModParams = new HashSet<>(); + Set varModParamSet = new HashSet<>(); + Set binaryModParamSet = new HashSet<>(); for (String k : parameter_map.keySet()) { if (k.contentEquals("var_mod1")) { - varModParams.addAll(getVarModParams(parameter_map.get(k))); + getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet); } else if (k.contentEquals("var_mod2")) { - varModParams.addAll(getVarModParams(parameter_map.get(k))); + getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet); } else if (k.contentEquals("var_mod3")) { - varModParams.addAll(getVarModParams(parameter_map.get(k))); + getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet); } else if (k.contentEquals("var_mod4")) { - varModParams.addAll(getVarModParams(parameter_map.get(k))); + getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet); } else if (k.contentEquals("var_mod5")) { - varModParams.addAll(getVarModParams(parameter_map.get(k))); + getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet); } else if (k.contentEquals("var_mod6")) { - varModParams.addAll(getVarModParams(parameter_map.get(k))); + getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet); } else if (k.contentEquals("var_mod7")) { - varModParams.addAll(getVarModParams(parameter_map.get(k))); + getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet); } else if (k.contentEquals("var_mod8")) { - varModParams.addAll(getVarModParams(parameter_map.get(k))); + getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet); } else if (k.contentEquals("var_mod9")) { - varModParams.addAll(getVarModParams(parameter_map.get(k))); + getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet); } } @@ -144,7 
+146,7 @@ public BuildIndex(Map parameter_map) { } // mod containing - Set varSeqSet = generateModSeq(seq, linkSiteSet, varModParams, varModMaxNum); + Set varSeqSet = generateModSeq(seq, linkSiteSet, varModParamSet, binaryModParamSet, varModMaxNum); for (String varSeq : varSeqSet) { linkSiteSet = getLinkSiteSet(varSeq, proteinNTerm, proteinCTerm); if (!linkSiteSet.isEmpty()) { @@ -290,38 +292,121 @@ private Map> buildSeqProMap(Map pro_seq_map, return seq_pro_map; } - private Set generateModSeq(String seq, Set modFreeListSites, Set varModParams, int varModMaxNum) { // todo: check - // get all locations' var lists - Map> idxModMassMap = new HashMap<>(); - for (int i = 0; i < seq.length(); ++i) { - char aa = seq.charAt(i); - for (VarModParam varModParam : varModParams) { - if (varModParam.aa == aa) { - if (idxModMassMap.containsKey(i)) { - idxModMassMap.get(i).add(varModParam.modMass); - } else { - List temp = new LinkedList<>(); - temp.add(varModParam.modMass); - idxModMassMap.put(i, temp); + private Set generateModSeq(String seq, Set modFreeListSites, Set varModParamSet, Set binaryModParamSet, int varModMaxNum) { // todo: check + Set varSeqSet = new HashSet<>(); + for (short linkSite : modFreeListSites) { + // has binary mod + for (BinaryModParam binaryModParam : binaryModParamSet) { + // get all locations having binary mod + Map> idxBinaryModMassMap = new HashMap<>(); + for (int i = 0; i < seq.length(); ++i) { + if (i != linkSite) { + String aa = seq.substring(i, i + 1); + if (binaryModParam.aas.contains(aa)) { + List tempList = new LinkedList<>(); + tempList.add(binaryModParam.modMass); + idxBinaryModMassMap.put(i, tempList); + } + } + } + if (!idxBinaryModMassMap.isEmpty()) { + // generate a sequence only containing the binary mod + StringBuilder sb = new StringBuilder(seq.length() * 10); + for (int i = 0; i < seq.length(); ++i) { + sb.append(seq.substring(i, i + 1)); + if (idxBinaryModMassMap.containsKey(i)) { + sb.append(String.format("[%.2f]", 
idxBinaryModMassMap.get(i).get(0))); + } + } + varSeqSet.add(sb.toString()); + + if (idxBinaryModMassMap.size() < varModMaxNum) { + // generate sequences containing the binary mod and additional var mod + // get all locations having var mods + Map> idxVarModMassMap = new HashMap<>(); + for (int i = 0; i < seq.length(); ++i) { + if (i != linkSite) { + if (!idxBinaryModMassMap.containsKey(i)) { + char aa = seq.charAt(i); + for (VarModParam varModParam : varModParamSet) { + if (varModParam.aa == aa) { + if (idxVarModMassMap.containsKey(i)) { + idxVarModMassMap.get(i).add(varModParam.modMass); + } else { + List temp = new LinkedList<>(); + temp.add(varModParam.modMass); + idxVarModMassMap.put(i, temp); + } + } + } + } + } + } + if (!idxVarModMassMap.isEmpty()) { + // generate var containing sequences + Map> idxBinaryVarModMassMap = new HashMap<>(); + idxBinaryVarModMassMap.putAll(idxBinaryModMassMap); + idxBinaryVarModMassMap.putAll(idxVarModMassMap); + Integer[] allIdxArray = idxVarModMassMap.keySet().toArray(new Integer[idxVarModMassMap.size()]); + Arrays.sort(allIdxArray); + for (int i = 1; i <= Math.min(varModMaxNum - idxBinaryModMassMap.size(), idxVarModMassMap.size()); ++i) { + List idxCombinationList = generateIdxCombinations(allIdxArray, i); + Set varSetSubSet = new HashSet<>(); + for (int[] idxCombination : idxCombinationList) { + int[] allIdxCombination = new int[idxCombination.length + idxBinaryModMassMap.size()]; + int j = 0; + for (int idx : idxBinaryModMassMap.keySet()) { + allIdxCombination[j] = idx; + ++j; + } + for (int k = 0; k < idxCombination.length; ++k) { + allIdxCombination[j + k] = idxCombination[k]; + } + Arrays.sort(allIdxCombination); + varSetSubSet.addAll(generateModSeqSub(seq, allIdxCombination, idxBinaryVarModMassMap)); + } + if (!varSetSubSet.isEmpty()) { + varSeqSet.addAll(checkKCTermMod(varSetSubSet)); // eliminate those sequence that the middle amino acids having the same mod mass and the n-term and the first amino acid or the c-term 
and the last amino acid have the same mod mass. + } + } + } } } } - } - // generate var containing sequences - Set varSeqSet = new HashSet<>(); - Integer[] allIdxArray = idxModMassMap.keySet().toArray(new Integer[idxModMassMap.size()]); - Arrays.sort(allIdxArray); - for (int i = 1; i <= Math.min(varModMaxNum, idxModMassMap.size()); ++i) { - List idxCombinationList = generateIdxCombinations(allIdxArray, i); - Set varSetSubSet = new HashSet<>(); - for (int[] idxCombination : idxCombinationList) { - if (stillHasLinkSite(idxCombination, modFreeListSites)) { - varSetSubSet.addAll(generateModSeqSub(seq, idxCombination, idxModMassMap)); + // does not have binary mod + // get all locations' var lists + Map> idxVarModMassMap = new HashMap<>(); + for (int i = 0; i < seq.length(); ++i) { + if (i != linkSite) { + char aa = seq.charAt(i); + for (VarModParam varModParam : varModParamSet) { + if (varModParam.aa == aa) { + if (idxVarModMassMap.containsKey(i)) { + idxVarModMassMap.get(i).add(varModParam.modMass); + } else { + List temp = new LinkedList<>(); + temp.add(varModParam.modMass); + idxVarModMassMap.put(i, temp); + } + } + } } } - if (!varSetSubSet.isEmpty()) { - varSeqSet.addAll(checkKCTermMod(varSetSubSet)); // eliminate those sequence that the middle amino acids having the same mod mass and the n-term and the first amino acid or the c-term and the last amino acid have the same mod mass. 
+ if (!idxVarModMassMap.isEmpty()) { + // generate var containing sequences + Integer[] allIdxArray = idxVarModMassMap.keySet().toArray(new Integer[idxVarModMassMap.size()]); + Arrays.sort(allIdxArray); + for (int i = 1; i <= Math.min(varModMaxNum, idxVarModMassMap.size()); ++i) { + List idxCombinationList = generateIdxCombinations(allIdxArray, i); + Set varSetSubSet = new HashSet<>(); + for (int[] idxCombination : idxCombinationList) { + varSetSubSet.addAll(generateModSeqSub(seq, idxCombination, idxVarModMassMap)); + } + if (!varSetSubSet.isEmpty()) { + varSeqSet.addAll(checkKCTermMod(varSetSubSet)); // eliminate those sequence that the middle amino acids having the same mod mass and the n-term and the first amino acid or the c-term and the last amino acid have the same mod mass. + } + } } } @@ -344,15 +429,6 @@ private List generateIdxCombinations(Integer[] allIdxArray, int num) { return outputList; } - private boolean stillHasLinkSite(int[] idxCombination, Set modFreeLinkSites) { - for (int modFreeLinkSite : modFreeLinkSites) { - if (Arrays.binarySearch(idxCombination, modFreeLinkSite) < 0) { - return true; - } - } - return false; - } - private Set generateModSeqSub(String seq, int[] idxCombination, Map> idxModMassMap) { List> localIdxModMassMaps = generateLocalIdxModMassMap(idxCombination, idxModMassMap); @@ -493,24 +569,24 @@ private Set getLinkSiteSet(String seq, boolean n_term, boolean c_term) { return output; } - private Set getVarModParams(String v) { - Set varModParams = new HashSet<>(); - + private void getVarModParams(String v, Set varModParamSet, Set binaryModParamSet) { Matcher varModMatcher = varModParamPattern.matcher(v); if (varModMatcher.matches()) { float modMass = Float.valueOf(varModMatcher.group(1)); String aas = varModMatcher.group(2); - if (Math.abs(modMass) < varModMassResolution) { - return varModParams; - } - for (int i = 0; i < aas.length(); ++i) { - varModParams.add(new VarModParam(modMass, aas.charAt(i))); + boolean isBinary = 
varModMatcher.group(3).contentEquals("1"); + if (Math.abs(modMass) > varModMassResolution) { + if (isBinary) { + binaryModParamSet.add(new BinaryModParam(modMass, aas)); + } else { + for (int i = 0; i < aas.length(); ++i) { + varModParamSet.add(new VarModParam(modMass, aas.charAt(i))); + } + } } } else { logger.error("Cannot parse variable modification parameter from {}.", v); System.exit(1); } - - return varModParams; } } diff --git a/src/main/java/proteomics/Types/BinaryModParam.java b/src/main/java/proteomics/Types/BinaryModParam.java new file mode 100644 index 0000000..adebfd5 --- /dev/null +++ b/src/main/java/proteomics/Types/BinaryModParam.java @@ -0,0 +1,31 @@ +package proteomics.Types; + + +public class BinaryModParam { + public final float modMass; + public final String aas; + public final String toString; + + public BinaryModParam(float modMass, String aas) { + this.modMass = modMass; + this.aas = aas; + toString = modMass + "@" + aas + "(binary)"; + } + + public String toString() { + return toString; + } + + public int hashCode() { + return toString.hashCode(); + } + + public boolean equals(Object other) { + if (other instanceof BinaryModParam) { + BinaryModParam temp = (BinaryModParam) other; + return (Math.abs(temp.modMass - modMass) <= 0.01) && (temp.aas.contentEquals(aas)); + } else { + return false; + } + } +} diff --git a/src/main/java/proteomics/Types/BinaryVarMod.java b/src/main/java/proteomics/Types/BinaryVarMod.java deleted file mode 100644 index c4fee5d..0000000 --- a/src/main/java/proteomics/Types/BinaryVarMod.java +++ /dev/null @@ -1,31 +0,0 @@ -package proteomics.Types; - -public class BinaryVarMod { - - public final String site; - public final float mod_mass; - private final String to_string; - - public BinaryVarMod(String site, float mod_mass) { - this.site = site; - this.mod_mass = mod_mass; - to_string = String.format("%.2f@%s", mod_mass, site); - } - - public boolean equals(Object other) { - if (other instanceof BinaryVarMod) { - 
BinaryVarMod temp = (BinaryVarMod) other; - return ((temp.site.contentEquals(site)) && (temp.mod_mass == mod_mass)); - } else { - return false; - } - } - - public String toString() { - return to_string; - } - - public int hashCode() { - return to_string.hashCode(); - } -} diff --git a/src/main/resources/parameter.def b/src/main/resources/parameter.def index a7bd11a..5b87040 100644 --- a/src/main/resources/parameter.def +++ b/src/main/resources/parameter.def @@ -1,5 +1,5 @@ -# 2.1.4-dev -# The first line is the parameter file version. Don't change it. +# 2.1.4-dev-201705262001 +# The first line is the parameter file version. Do not change it. thread_num = 0 debug = 0 dev = 0 @@ -29,16 +29,18 @@ mz_bin_offset = 0 cl_mass = 138.0680796 # Var modification -# format: -var_mod1 = 0.0 X -var_mod2 = 0.0 X -var_mod3 = 0.0 X -var_mod4 = 0.0 X -var_mod5 = 0.0 X -var_mod6 = 0.0 X -var_mod7 = 0.0 X -var_mod8 = 0.0 X -var_mod9 = 0.0 X +# format: <mass> <residues> <binary> +# <binary> == 0/1 +# binary modifications are mutually exclusive with each other +var_mod1 = 15.99 M 0 +var_mod2 = 0.0 X 0 +var_mod3 = 0.0 X 0 +var_mod4 = 0.0 X 0 +var_mod5 = 0.0 X 0 +var_mod6 = 0.0 X 0 +var_mod7 = 0.0 X 0 +var_mod8 = 0.0 X 0 +var_mod9 = 0.0 X 0 var_mod_max_num = 5 # max number of modified amino acids in a peptide. The max value is 5 # Fix modification