Skip to content

Commit

Permalink
Add binary var mod.
Browse files Browse the repository at this point in the history
  • Loading branch information
fcyu committed May 26, 2017
1 parent 5b9a1e6 commit a32411c
Show file tree
Hide file tree
Showing 6 changed files with 180 additions and 102 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

<groupId>hk.ust.bioinformatics</groupId>
<artifactId>ECL2</artifactId>
<version>2.1.4-dev-1c9b46a</version>
<version>2.1.4-dev-201705262001</version>
<packaging>jar</packaging>

<name>ECL2</name>
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/proteomics/ECL2.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ public class ECL2 {
public static final boolean flankingPeaks = true;

private static final Logger logger = LoggerFactory.getLogger(ECL2.class);
public static final String version = "2.1.4-dev-1c9b46a";
public static final String version = "2.1.4-dev-201705262001";

public static boolean debug;
public static boolean dev;
Expand Down
190 changes: 133 additions & 57 deletions src/main/java/proteomics/Index/BuildIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import proteomics.TheoSeq.DbTool;
import proteomics.TheoSeq.MassTool;
import proteomics.Types.AA;
import proteomics.Types.BinaryModParam;
import proteomics.Types.ChainEntry;
import proteomics.Types.VarModParam;

Expand All @@ -16,7 +17,7 @@
public class BuildIndex {

private static final Logger logger = LoggerFactory.getLogger(BuildIndex.class);
private static final Pattern varModParamPattern = Pattern.compile("([0-9.-]+)\\s+([A-Znc]+)");
private static final Pattern varModParamPattern = Pattern.compile("([0-9.-]+)\\s+([A-Znc]+)\\s+([01])");
private static final int globalVarModMaxNum = 5; // Do not change this value. Otherwise, change generateLocalIdxModMassMap accordingly.
private static final float varModMassResolution = 0.01f;

Expand Down Expand Up @@ -94,26 +95,27 @@ public BuildIndex(Map<String, String> parameter_map) {
seqProMap = buildSeqProMap(pro_seq_map, min_chain_length, max_chain_length);

// read var mods
Set<VarModParam> varModParams = new HashSet<>();
Set<VarModParam> varModParamSet = new HashSet<>();
Set<BinaryModParam> binaryModParamSet = new HashSet<>();
for (String k : parameter_map.keySet()) {
if (k.contentEquals("var_mod1")) {
varModParams.addAll(getVarModParams(parameter_map.get(k)));
getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet);
} else if (k.contentEquals("var_mod2")) {
varModParams.addAll(getVarModParams(parameter_map.get(k)));
getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet);
} else if (k.contentEquals("var_mod3")) {
varModParams.addAll(getVarModParams(parameter_map.get(k)));
getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet);
} else if (k.contentEquals("var_mod4")) {
varModParams.addAll(getVarModParams(parameter_map.get(k)));
getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet);
} else if (k.contentEquals("var_mod5")) {
varModParams.addAll(getVarModParams(parameter_map.get(k)));
getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet);
} else if (k.contentEquals("var_mod6")) {
varModParams.addAll(getVarModParams(parameter_map.get(k)));
getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet);
} else if (k.contentEquals("var_mod7")) {
varModParams.addAll(getVarModParams(parameter_map.get(k)));
getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet);
} else if (k.contentEquals("var_mod8")) {
varModParams.addAll(getVarModParams(parameter_map.get(k)));
getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet);
} else if (k.contentEquals("var_mod9")) {
varModParams.addAll(getVarModParams(parameter_map.get(k)));
getVarModParams(parameter_map.get(k), varModParamSet, binaryModParamSet);
}
}

Expand Down Expand Up @@ -144,7 +146,7 @@ public BuildIndex(Map<String, String> parameter_map) {
}

// mod containing
Set<String> varSeqSet = generateModSeq(seq, linkSiteSet, varModParams, varModMaxNum);
Set<String> varSeqSet = generateModSeq(seq, linkSiteSet, varModParamSet, binaryModParamSet, varModMaxNum);
for (String varSeq : varSeqSet) {
linkSiteSet = getLinkSiteSet(varSeq, proteinNTerm, proteinCTerm);
if (!linkSiteSet.isEmpty()) {
Expand Down Expand Up @@ -290,38 +292,121 @@ private Map<String, Set<String>> buildSeqProMap(Map<String, String> pro_seq_map,
return seq_pro_map;
}

private Set<String> generateModSeq(String seq, Set<Short> modFreeListSites, Set<VarModParam> varModParams, int varModMaxNum) { // todo: check
// get all locations' var lists
Map<Integer, List<Float>> idxModMassMap = new HashMap<>();
for (int i = 0; i < seq.length(); ++i) {
char aa = seq.charAt(i);
for (VarModParam varModParam : varModParams) {
if (varModParam.aa == aa) {
if (idxModMassMap.containsKey(i)) {
idxModMassMap.get(i).add(varModParam.modMass);
} else {
List<Float> temp = new LinkedList<>();
temp.add(varModParam.modMass);
idxModMassMap.put(i, temp);
private Set<String> generateModSeq(String seq, Set<Short> modFreeListSites, Set<VarModParam> varModParamSet, Set<BinaryModParam> binaryModParamSet, int varModMaxNum) { // todo: check
Set<String> varSeqSet = new HashSet<>();
for (short linkSite : modFreeListSites) {
// has binary mod
for (BinaryModParam binaryModParam : binaryModParamSet) {
// get all locations having binary mod
Map<Integer, List<Float>> idxBinaryModMassMap = new HashMap<>();
for (int i = 0; i < seq.length(); ++i) {
if (i != linkSite) {
String aa = seq.substring(i, i + 1);
if (binaryModParam.aas.contains(aa)) {
List<Float> tempList = new LinkedList<>();
tempList.add(binaryModParam.modMass);
idxBinaryModMassMap.put(i, tempList);
}
}
}
if (!idxBinaryModMassMap.isEmpty()) {
// generate a sequence only containing the binary mod
StringBuilder sb = new StringBuilder(seq.length() * 10);
for (int i = 0; i < seq.length(); ++i) {
sb.append(seq.substring(i, i + 1));
if (idxBinaryModMassMap.containsKey(i)) {
sb.append(String.format("[%.2f]", idxBinaryModMassMap.get(i).get(0)));
}
}
varSeqSet.add(sb.toString());

if (idxBinaryModMassMap.size() < varModMaxNum) {
// generate sequences containing the binary mod and additional var mod
// get all locations having var mods
Map<Integer, List<Float>> idxVarModMassMap = new HashMap<>();
for (int i = 0; i < seq.length(); ++i) {
if (i != linkSite) {
if (!idxBinaryModMassMap.containsKey(i)) {
char aa = seq.charAt(i);
for (VarModParam varModParam : varModParamSet) {
if (varModParam.aa == aa) {
if (idxVarModMassMap.containsKey(i)) {
idxVarModMassMap.get(i).add(varModParam.modMass);
} else {
List<Float> temp = new LinkedList<>();
temp.add(varModParam.modMass);
idxVarModMassMap.put(i, temp);
}
}
}
}
}
}
if (!idxVarModMassMap.isEmpty()) {
// generate var containing sequences
Map<Integer, List<Float>> idxBinaryVarModMassMap = new HashMap<>();
idxBinaryVarModMassMap.putAll(idxBinaryModMassMap);
idxBinaryVarModMassMap.putAll(idxVarModMassMap);
Integer[] allIdxArray = idxVarModMassMap.keySet().toArray(new Integer[idxVarModMassMap.size()]);
Arrays.sort(allIdxArray);
for (int i = 1; i <= Math.min(varModMaxNum - idxBinaryModMassMap.size(), idxVarModMassMap.size()); ++i) {
List<int[]> idxCombinationList = generateIdxCombinations(allIdxArray, i);
Set<String> varSetSubSet = new HashSet<>();
for (int[] idxCombination : idxCombinationList) {
int[] allIdxCombination = new int[idxCombination.length + idxBinaryModMassMap.size()];
int j = 0;
for (int idx : idxBinaryModMassMap.keySet()) {
allIdxCombination[j] = idx;
++j;
}
for (int k = 0; k < idxCombination.length; ++k) {
allIdxCombination[j + k] = idxCombination[k];
}
Arrays.sort(allIdxCombination);
varSetSubSet.addAll(generateModSeqSub(seq, allIdxCombination, idxBinaryVarModMassMap));
}
if (!varSetSubSet.isEmpty()) {
varSeqSet.addAll(checkKCTermMod(varSetSubSet)); // eliminate those sequence that the middle amino acids having the same mod mass and the n-term and the first amino acid or the c-term and the last amino acid have the same mod mass.
}
}
}
}
}
}
}

// generate var containing sequences
Set<String> varSeqSet = new HashSet<>();
Integer[] allIdxArray = idxModMassMap.keySet().toArray(new Integer[idxModMassMap.size()]);
Arrays.sort(allIdxArray);
for (int i = 1; i <= Math.min(varModMaxNum, idxModMassMap.size()); ++i) {
List<int[]> idxCombinationList = generateIdxCombinations(allIdxArray, i);
Set<String> varSetSubSet = new HashSet<>();
for (int[] idxCombination : idxCombinationList) {
if (stillHasLinkSite(idxCombination, modFreeListSites)) {
varSetSubSet.addAll(generateModSeqSub(seq, idxCombination, idxModMassMap));
// does not have binary mod
// get all locations' var lists
Map<Integer, List<Float>> idxVarModMassMap = new HashMap<>();
for (int i = 0; i < seq.length(); ++i) {
if (i != linkSite) {
char aa = seq.charAt(i);
for (VarModParam varModParam : varModParamSet) {
if (varModParam.aa == aa) {
if (idxVarModMassMap.containsKey(i)) {
idxVarModMassMap.get(i).add(varModParam.modMass);
} else {
List<Float> temp = new LinkedList<>();
temp.add(varModParam.modMass);
idxVarModMassMap.put(i, temp);
}
}
}
}
}
if (!varSetSubSet.isEmpty()) {
varSeqSet.addAll(checkKCTermMod(varSetSubSet)); // eliminate those sequence that the middle amino acids having the same mod mass and the n-term and the first amino acid or the c-term and the last amino acid have the same mod mass.
if (!idxVarModMassMap.isEmpty()) {
// generate var containing sequences
Integer[] allIdxArray = idxVarModMassMap.keySet().toArray(new Integer[idxVarModMassMap.size()]);
Arrays.sort(allIdxArray);
for (int i = 1; i <= Math.min(varModMaxNum, idxVarModMassMap.size()); ++i) {
List<int[]> idxCombinationList = generateIdxCombinations(allIdxArray, i);
Set<String> varSetSubSet = new HashSet<>();
for (int[] idxCombination : idxCombinationList) {
varSetSubSet.addAll(generateModSeqSub(seq, idxCombination, idxVarModMassMap));
}
if (!varSetSubSet.isEmpty()) {
varSeqSet.addAll(checkKCTermMod(varSetSubSet)); // eliminate those sequence that the middle amino acids having the same mod mass and the n-term and the first amino acid or the c-term and the last amino acid have the same mod mass.
}
}
}
}

Expand All @@ -344,15 +429,6 @@ private List<int[]> generateIdxCombinations(Integer[] allIdxArray, int num) {
return outputList;
}

private boolean stillHasLinkSite(int[] idxCombination, Set<Short> modFreeLinkSites) {
for (int modFreeLinkSite : modFreeLinkSites) {
if (Arrays.binarySearch(idxCombination, modFreeLinkSite) < 0) {
return true;
}
}
return false;
}

private Set<String> generateModSeqSub(String seq, int[] idxCombination, Map<Integer, List<Float>> idxModMassMap) {
List<Map<Integer, Float>> localIdxModMassMaps = generateLocalIdxModMassMap(idxCombination, idxModMassMap);

Expand Down Expand Up @@ -493,24 +569,24 @@ private Set<Short> getLinkSiteSet(String seq, boolean n_term, boolean c_term) {
return output;
}

private Set<VarModParam> getVarModParams(String v) {
Set<VarModParam> varModParams = new HashSet<>();

private void getVarModParams(String v, Set<VarModParam> varModParamSet, Set<BinaryModParam> binaryModParamSet) {
Matcher varModMatcher = varModParamPattern.matcher(v);
if (varModMatcher.matches()) {
float modMass = Float.valueOf(varModMatcher.group(1));
String aas = varModMatcher.group(2);
if (Math.abs(modMass) < varModMassResolution) {
return varModParams;
}
for (int i = 0; i < aas.length(); ++i) {
varModParams.add(new VarModParam(modMass, aas.charAt(i)));
boolean isBinary = varModMatcher.group(3).contentEquals("1");
if (Math.abs(modMass) > varModMassResolution) {
if (isBinary) {
binaryModParamSet.add(new BinaryModParam(modMass, aas));
} else {
for (int i = 0; i < aas.length(); ++i) {
varModParamSet.add(new VarModParam(modMass, aas.charAt(i)));
}
}
}
} else {
logger.error("Cannot parse variable modification parameter from {}.", v);
System.exit(1);
}

return varModParams;
}
}
31 changes: 31 additions & 0 deletions src/main/java/proteomics/Types/BinaryModParam.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package proteomics.Types;


public class BinaryModParam {
public final float modMass;
public final String aas;
public final String toString;

public BinaryModParam(float modMass, String aas) {
this.modMass = modMass;
this.aas = aas;
toString = modMass + "@" + aas + "(binary)";
}

public String toString() {
return toString;
}

public int hashCode() {
return toString.hashCode();
}

public boolean equals(Object other) {
if (other instanceof BinaryModParam) {
BinaryModParam temp = (BinaryModParam) other;
return (Math.abs(temp.modMass - modMass) <= 0.01) && (temp.aas.contentEquals(aas));
} else {
return false;
}
}
}
31 changes: 0 additions & 31 deletions src/main/java/proteomics/Types/BinaryVarMod.java

This file was deleted.

26 changes: 14 additions & 12 deletions src/main/resources/parameter.def
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# 2.1.4-dev
# The first line is the parameter file version. Don't change it.
# 2.1.4-dev-201705262001
# The first line is the parameter file version. Do not change it.
thread_num = 0
debug = 0
dev = 0
Expand Down Expand Up @@ -29,16 +29,18 @@ mz_bin_offset = 0
cl_mass = 138.0680796

# Var modification
# format: <mass> <residues>
var_mod1 = 0.0 X
var_mod2 = 0.0 X
var_mod3 = 0.0 X
var_mod4 = 0.0 X
var_mod5 = 0.0 X
var_mod6 = 0.0 X
var_mod7 = 0.0 X
var_mod8 = 0.0 X
var_mod9 = 0.0 X
# format: <mass> <residues> <binary>
# <binary> == 0/1
# binary modifications are mutually exclusive with each other
var_mod1 = 15.99 M 0
var_mod2 = 0.0 X 0
var_mod3 = 0.0 X 0
var_mod4 = 0.0 X 0
var_mod5 = 0.0 X 0
var_mod6 = 0.0 X 0
var_mod7 = 0.0 X 0
var_mod8 = 0.0 X 0
var_mod9 = 0.0 X 0
var_mod_max_num = 5 # max number of modified amino acids in a peptide. The max value is 5

# Fix modification
Expand Down

0 comments on commit a32411c

Please sign in to comment.