From db9a769ffc391a480ec7425f97dcfacf409c46c2 Mon Sep 17 00:00:00 2001 From: Fengchao Date: Tue, 28 Nov 2017 13:52:19 +0800 Subject: [PATCH] Add a databaseType for parsing the header. Add two methods in DbTool. --- .../java/proteomics/Index/BuildIndex.java | 6 ++- src/main/java/proteomics/TheoSeq/DbTool.java | 47 ++++++++++++++++++- .../java/proteomics/TheoSeq/DbToolTest.java | 2 +- 3 files changed, 51 insertions(+), 4 deletions(-) diff --git a/src/main/java/proteomics/Index/BuildIndex.java b/src/main/java/proteomics/Index/BuildIndex.java index 58d10be..63d2c57 100644 --- a/src/main/java/proteomics/Index/BuildIndex.java +++ b/src/main/java/proteomics/Index/BuildIndex.java @@ -92,7 +92,11 @@ public BuildIndex(Map parameter_map) { } // read protein database - DbTool db_tool_obj = new DbTool(db_path); + String databaseType = "UniProt"; + if (parameter_map.containsKey("database_type")) { + databaseType = parameter_map.get("database_type"); + } + DbTool db_tool_obj = new DbTool(db_path, databaseType); Map pro_seq_map = db_tool_obj.getProSeqMap(); pro_annotate_map = db_tool_obj.getProAnnotateMap(); diff --git a/src/main/java/proteomics/TheoSeq/DbTool.java b/src/main/java/proteomics/TheoSeq/DbTool.java index 82d8ae2..513386a 100644 --- a/src/main/java/proteomics/TheoSeq/DbTool.java +++ b/src/main/java/proteomics/TheoSeq/DbTool.java @@ -14,14 +14,19 @@ public class DbTool { private Map pro_seq_map = new HashMap<>(); private Map pro_annotate_map = new HashMap<>(); - public DbTool(String db_name) { + public DbTool(String db_name, String databaseType) { String id = ""; String annotate; StringBuilder seq = new StringBuilder(99999); boolean new_pro = true; - Pattern header_pattern = Pattern.compile(">([^\\s]*)(.*)"); + Pattern header_pattern = Pattern.compile("^>([^\\s]+)[\\s|]+(.+)");; + if (databaseType.contentEquals("TAIR")) { + header_pattern = Pattern.compile("^>([^\\s]+)[\\s|]+(.+)$"); + } else if (databaseType.contentEquals("UniProt") || 
databaseType.contentEquals("SwissProt")) { + header_pattern = Pattern.compile("^>[^|]+\\|(.+)\\|(.+)$"); + } try (BufferedReader db_reader = new BufferedReader(new FileReader(db_name))) { String line; @@ -65,4 +70,42 @@ public Map getProSeqMap() { public Map getProAnnotateMap() { return pro_annotate_map; } + + public Set findPeptideLocation(String proteinId, String peptide) throws NullPointerException { + peptide = peptide.trim().replaceAll("[^A-Z]+", ""); + Set output = new HashSet<>(); + int idx = pro_seq_map.get(proteinId).indexOf(peptide); + while (idx >= 0) { + output.add(idx); + idx = pro_seq_map.get(proteinId).indexOf(peptide, idx + 1); + } + if (!output.isEmpty()) { + return output; + } else { + throw new NullPointerException(String.format(Locale.US, "Cannot find the peptide %s from the protein %s.", peptide, proteinId)); + } + } + + public static Set reduceProteinIdSet(Set input) { + if (input.size() == 1) { + return input; + } else { + Map tempMap = new HashMap<>(); + for (String s : input) { + String[] tempArray = s.split("\\."); + if (tempMap.containsKey(tempArray[0])) { + if (tempMap.get(tempArray[0]) > Integer.valueOf(tempArray[1])) { + tempMap.put(tempArray[0], Integer.valueOf(tempArray[1])); + } + } else { + tempMap.put(tempArray[0], Integer.valueOf(tempArray[1])); + } + } + Set output = new HashSet<>(); + for (String s : tempMap.keySet()) { + output.add(s + "." 
+ tempMap.get(s)); + } + return output; + } + } } diff --git a/src/test/java/proteomics/TheoSeq/DbToolTest.java b/src/test/java/proteomics/TheoSeq/DbToolTest.java index 86e8e79..194ca3a 100644 --- a/src/test/java/proteomics/TheoSeq/DbToolTest.java +++ b/src/test/java/proteomics/TheoSeq/DbToolTest.java @@ -13,7 +13,7 @@ public class DbToolTest { @BeforeClass public static void setUp() throws Exception { - db_tool_obj = new DbTool(Thread.currentThread().getContextClassLoader().getResource("test.fasta").getPath()); + db_tool_obj = new DbTool(Thread.currentThread().getContextClassLoader().getResource("test.fasta").getPath(), "UniProt"); } @Test