From 074ae3ec7f05bb69464b9184d4a523cd341f114c Mon Sep 17 00:00:00 2001 From: Stefan Kolb Date: Mon, 29 Aug 2016 16:57:29 +0200 Subject: [PATCH 1/4] Refactor and test a little bit --- .../net/sf/jabref/model/DuplicateCheck.java | 147 ++++++++---------- .../sf/jabref/model/DuplicateCheckTest.java | 47 ++++-- 2 files changed, 94 insertions(+), 100 deletions(-) diff --git a/src/main/java/net/sf/jabref/model/DuplicateCheck.java b/src/main/java/net/sf/jabref/model/DuplicateCheck.java index 056920d2bd5..36436c4c7a0 100644 --- a/src/main/java/net/sf/jabref/model/DuplicateCheck.java +++ b/src/main/java/net/sf/jabref/model/DuplicateCheck.java @@ -3,6 +3,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Optional; import java.util.Set; @@ -16,6 +17,7 @@ import net.sf.jabref.model.entry.FieldProperty; import net.sf.jabref.model.entry.InternalBibtexFields; +import info.debatty.java.stringsimilarity.Levenshtein; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -23,18 +25,18 @@ * This class contains utility method for duplicate checking of entries. */ public class DuplicateCheck { - private static final Log LOGGER = LogFactory.getLog(DuplicateCheck.class); - /* - * Integer values for indicating result of duplicate check (for entries): - * + /** + * Enumeration for indicating the result of a duplicate check */ - private static final int NOT_EQUAL = 0; - private static final int EQUAL = 1; - private static final int EMPTY_IN_ONE = 2; - private static final int EMPTY_IN_TWO = 3; - private static final int EMPTY_IN_BOTH = 4; + private enum CheckResult { + NOT_EQUAL, + EQUAL, + EMPTY_IN_ONE, + EMPTY_IN_TWO, + EMPTY_IN_BOTH + } public static double duplicateThreshold = 0.75; // The overall threshold to signal a duplicate pair // Non-required fields are investigated only if the required fields give a value within @@ -46,7 +48,6 @@ public class DuplicateCheck { // Extra weighting of those fields that are most likely to provide correct duplicate detection: private static final Map FIELD_WEIGHTS = new HashMap<>(); - static { DuplicateCheck.FIELD_WEIGHTS.put(FieldName.AUTHOR, 2.5); DuplicateCheck.FIELD_WEIGHTS.put(FieldName.EDITOR, 2.5); @@ -54,7 +55,6 @@ public class DuplicateCheck { DuplicateCheck.FIELD_WEIGHTS.put(FieldName.JOURNAL, 2.); } - /** * Checks if the two entries represent the same publication. * @@ -63,20 +63,19 @@ public class DuplicateCheck { * @return boolean */ public static boolean isDuplicate(BibEntry one, BibEntry two, BibDatabaseMode bibDatabaseMode) { - - // First check if they are of the same type - a necessary condition: + // same type is mandatory if (!one.getType().equals(two.getType())) { return false; } - EntryType type = EntryTypes.getTypeOrDefault(one.getType(), bibDatabaseMode); - // The check if they have the same required fields: - List var = type.getRequiredFieldsFlat(); + // check for equal required fields + EntryType entryType = EntryTypes.getTypeOrDefault(one.getType(), bibDatabaseMode); + List requiredFields = entryType.getRequiredFieldsFlat(); double[] req; - if (var == null) { + if (requiredFields == null) { req = new double[]{0., 0.}; } else { - req = DuplicateCheck.compareFieldSet(var, one, two); + req = DuplicateCheck.compareFieldSet(requiredFields, one, two); } if (Math.abs(req[0] - DuplicateCheck.duplicateThreshold) > DuplicateCheck.DOUBT_RANGE) { @@ -84,7 +83,8 @@ public static boolean isDuplicate(BibEntry one, BibEntry two, BibDatabaseMode bi return req[0] >= DuplicateCheck.duplicateThreshold; } // Close to the threshold value, so we take a look at the optional fields, if any: - List optionalFields = type.getOptionalFields(); + // check for equal optional fields + List optionalFields = entryType.getOptionalFields(); if (optionalFields != null) { double[] opt = DuplicateCheck.compareFieldSet(optionalFields, one, two); double totValue = ((DuplicateCheck.REQUIRED_WEIGHT * req[0] * req[1]) + (opt[0] * opt[1])) / ((req[1] * DuplicateCheck.REQUIRED_WEIGHT) + opt[1]); @@ -104,10 +104,11 @@ private static double[] compareFieldSet(List fields, BibEntry one, BibEn weight = 1.0; } totWeights += weight; - int result = DuplicateCheck.compareSingleField(field, one, two); - if (result == EQUAL) { + + CheckResult result = DuplicateCheck.compareSingleField(field, one, two); + if (result == CheckResult.EQUAL) { res += weight; - } else if (result == EMPTY_IN_BOTH) { + } else if (result == CheckResult.EMPTY_IN_BOTH) { totWeights -= weight; } } @@ -117,16 +118,16 @@ private static double[] compareFieldSet(List fields, BibEntry one, BibEn return new double[] {0.5, 0.0}; } - private static int compareSingleField(String field, BibEntry one, BibEntry two) { + private static CheckResult compareSingleField(String field, BibEntry one, BibEntry two) { Optional optionalStringOne = one.getField(field); Optional optionalStringTwo = two.getField(field); if (!optionalStringOne.isPresent()) { if (!optionalStringTwo.isPresent()) { - return EMPTY_IN_BOTH; + return CheckResult.EMPTY_IN_BOTH; } - return EMPTY_IN_ONE; + return CheckResult.EMPTY_IN_ONE; } else if (!optionalStringTwo.isPresent()) { - return EMPTY_IN_TWO; + return CheckResult.EMPTY_IN_TWO; } // Both strings present @@ -140,9 +141,9 @@ private static int compareSingleField(String field, BibEntry one, BibEntry two) String authorTwo = AuthorList.fixAuthorLastNameOnlyCommas(stringTwo, false).replace(" and ", " ").toLowerCase(); double similarity = DuplicateCheck.correlateByWords(authorOne, authorTwo); if (similarity > 0.8) { - return EQUAL; + return CheckResult.EQUAL; } - return NOT_EQUAL; + return CheckResult.NOT_EQUAL; } else if (FieldName.PAGES.equals(field)) { // Pages can be given with a variety of delimiters, "-", "--", " - ", " -- ". // We do a replace to harmonize these to a simple "-": @@ -150,9 +151,9 @@ private static int compareSingleField(String field, BibEntry one, BibEntry two) stringOne = stringOne.replaceAll("[- ]+", "-"); stringTwo = stringTwo.replaceAll("[- ]+", "-"); if (stringOne.equals(stringTwo)) { - return EQUAL; + return CheckResult.EQUAL; } - return NOT_EQUAL; + return CheckResult.NOT_EQUAL; } else if (FieldName.JOURNAL.equals(field)) { // We do not attempt to harmonize abbreviation state of the journal names, // but we remove periods from the names in case they are abbreviated with @@ -161,17 +162,17 @@ private static int compareSingleField(String field, BibEntry one, BibEntry two) stringTwo = stringTwo.replace(".", "").toLowerCase(); double similarity = DuplicateCheck.correlateByWords(stringOne, stringTwo); if (similarity > 0.8) { - return EQUAL; + return CheckResult.EQUAL; } - return NOT_EQUAL; + return CheckResult.NOT_EQUAL; } else { stringOne = stringOne.toLowerCase(); stringTwo = stringTwo.toLowerCase(); double similarity = DuplicateCheck.correlateByWords(stringOne, stringTwo); if (similarity > 0.8) { - return EQUAL; + return CheckResult.EQUAL; } - return NOT_EQUAL; + return CheckResult.NOT_EQUAL; } } @@ -217,18 +218,21 @@ public static Optional containsDuplicate(BibDatabase database, BibEntr /** * Compare two strings on the basis of word-by-word correlation analysis. + * TODO: strange algorithm as when there are only words inserted this gives a bad value, e.g., + * a test -> this a test (0.0) + * characterization -> characterization of me (1.0) * - * @param s1 The first string - * @param s2 The second string + * @param s1 The first string + * @param s2 The second string * @return a value in the interval [0, 1] indicating the degree of match. */ public static double correlateByWords(String s1, String s2) { - String[] w1 = s1.split("\\s"); - String[] w2 = s2.split("\\s"); - int n = Math.min(w1.length, w2.length); + String[] words1 = s1.split("\\s"); + String[] words2 = s2.split("\\s"); + int n = Math.min(words1.length, words2.length); int misses = 0; for (int i = 0; i < n; i++) { - double corr = similarity(w1[i], w2[i]); + double corr = similarity(words1[i], words2[i]); if (corr < 0.75) { misses++; } @@ -239,58 +243,31 @@ public static double correlateByWords(String s1, String s2) { /** - * Calculates the similarity (a number within 0 and 1) between two strings. - * http://stackoverflow.com/questions/955110/similarity-string-comparison-in-java + * Calculates the similarity between two strings. + *

+ * The result will be in the interval [0;1]. */ private static double similarity(String s1, String s2) { - String longer = s1; - String shorter = s2; - - if (s1.length() < s2.length()) { // longer should always have greater length - longer = s2; - shorter = s1; + // method is performance optimized + String longerString = s1; + String shorterString = s2; + + // determine longer string + if (s1.length() < s2.length()) { + longerString = s2; + shorterString = s1; } - int longerLength = longer.length(); + + int longerLength = longerString.length(); + // both strings are zero length if (longerLength == 0) { return 1.0; - /* both strings are zero length */ } - double sim = (longerLength - editDistance(longer, shorter)) / (double) longerLength; - LOGGER.debug("Longer string: " + longer + " Shorter string: " + shorter + " Similarity: " + sim); - return sim; + } + return (longerLength - levenshteinDistance(longerString, shorterString)) / longerLength; } - /* - * Levenshtein Edit Distance - * http://stackoverflow.com/questions/955110/similarity-string-comparison-in-java - */ - private static int editDistance(String s1, String s2) { - String s1LowerCase = s1.toLowerCase(); - String s2LowerCase = s2.toLowerCase(); - - int[] costs = new int[s2LowerCase.length() + 1]; - for (int i = 0; i <= s1LowerCase.length(); i++) { - int lastValue = i; - for (int j = 0; j <= s2LowerCase.length(); j++) { - if (i == 0) { - costs[j] = j; - } else if (j > 0) { - int newValue = costs[j - 1]; - if (s1LowerCase.charAt(i - 1) != s2LowerCase.charAt(j - 1)) { - newValue = Math.min(Math.min(newValue, lastValue), costs[j]) + 1; - } - costs[j - 1] = lastValue; - lastValue = newValue; - - } - } - if (i > 0) { - costs[s2LowerCase.length()] = lastValue; - } - } - LOGGER.debug("String 1: " + s1LowerCase + " String 2: " + s2LowerCase + " Distance: " + costs[s2LowerCase.length()]); - return costs[s2LowerCase.length()]; + private static double levenshteinDistance(String s1, String s2) { + return new Levenshtein().distance(s1.toLowerCase(Locale.ENGLISH), s2.toLowerCase(Locale.ENGLISH)); } - - } diff --git a/src/test/java/net/sf/jabref/model/DuplicateCheckTest.java b/src/test/java/net/sf/jabref/model/DuplicateCheckTest.java index d0d05212dfb..e3929050c0c 100644 --- a/src/test/java/net/sf/jabref/model/DuplicateCheckTest.java +++ b/src/test/java/net/sf/jabref/model/DuplicateCheckTest.java @@ -11,14 +11,14 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; -/** - * Created by IntelliJ IDEA. - * User: alver - * Date: Nov 9, 2007 - * Time: 7:04:25 PM - * To change this template use File | Settings | File Templates. - */ public class DuplicateCheckTest { + @Test + public void noDuplicateForDifferentTypes() { + BibEntry e1 = new BibEntry("1", "article"); + BibEntry e2 = new BibEntry("2", "journal"); + assertFalse(DuplicateCheck.isDuplicate(e1, e2, BibDatabaseMode.BIBTEX)); + assertFalse(DuplicateCheck.isDuplicate(e1, e2, BibDatabaseMode.BIBLATEX)); + } @Test public void testDuplicateDetection() { @@ -77,14 +77,31 @@ public void testDuplicateDetection() { } @Test - public void testWordCorrelation() { - String d1 = "Characterization of Calanus finmarchicus habitat in the North Sea"; - String d2 = "Characterization of Calunus finmarchicus habitat in the North Sea"; - String d3 = "Characterization of Calanus glacialissss habitat in the South Sea"; - - assertEquals(1.0, (DuplicateCheck.correlateByWords(d1, d2)), 0.01); - assertEquals(0.78, (DuplicateCheck.correlateByWords(d1, d3)), 0.01); - assertEquals(0.78, (DuplicateCheck.correlateByWords(d2, d3)), 0.01); + public void wordCorrelationIsOneForEmptyStrings() { + assertEquals(1.0, DuplicateCheck.correlateByWords("", ""), 0.01); } + @Test + public void wordCorrelationForSmallerFirstString() { + String d1 = "a test"; + String d2 = "this a test"; + + assertEquals(0.0, DuplicateCheck.correlateByWords(d1, d2), 0.01); + } + + @Test + public void wordCorrelationForBiggerFirstString() { + String d1 = "Characterization of me"; + String d2 = "Characterization"; + + assertEquals(1.0, DuplicateCheck.correlateByWords(d1, d2), 0.01); + } + + @Test + public void wordCorrelationForEqualStrings() { + String d1 = "Characterization"; + String d2 = "Characterization"; + + assertEquals(1.0, DuplicateCheck.correlateByWords(d1, d2), 0.01); + } } From 4645813b96a969627d0c32c67fb2b3365c61149a Mon Sep 17 00:00:00 2001 From: Stefan Kolb Date: Mon, 29 Aug 2016 17:22:04 +0200 Subject: [PATCH 2/4] Improve tests for duplicate detection --- .../net/sf/jabref/model/DuplicateCheck.java | 7 +-- .../sf/jabref/model/DuplicateCheckTest.java | 56 ++++++++++++++++++- 2 files changed, 57 insertions(+), 6 deletions(-) diff --git a/src/main/java/net/sf/jabref/model/DuplicateCheck.java b/src/main/java/net/sf/jabref/model/DuplicateCheck.java index 36436c4c7a0..3e952e927cb 100644 --- a/src/main/java/net/sf/jabref/model/DuplicateCheck.java +++ b/src/main/java/net/sf/jabref/model/DuplicateCheck.java @@ -5,6 +5,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.Set; @@ -185,14 +186,12 @@ public static double compareEntriesStrictly(BibEntry one, BibEntry two) { for (String field : allFields) { Optional stringOne = one.getField(field); Optional stringTwo = two.getField(field); - if (stringOne.equals(stringTwo)) { + if (Objects.equals(stringOne, stringTwo)) { score++; } } if (score == allFields.size()) { - return 1.01; // Just to make sure we can - // use score>1 without - // trouble. + return 1.0; } return (double) score / allFields.size(); } diff --git a/src/test/java/net/sf/jabref/model/DuplicateCheckTest.java b/src/test/java/net/sf/jabref/model/DuplicateCheckTest.java index e3929050c0c..dad1275e7d3 100644 --- a/src/test/java/net/sf/jabref/model/DuplicateCheckTest.java +++ b/src/test/java/net/sf/jabref/model/DuplicateCheckTest.java @@ -20,6 +20,60 @@ public void noDuplicateForDifferentTypes() { assertFalse(DuplicateCheck.isDuplicate(e1, e2, BibDatabaseMode.BIBLATEX)); } + @Test + public void noStrictDuplicateForDifferentTypes() { + BibEntry e1 = new BibEntry("1", "article"); + BibEntry e2 = new BibEntry("2", "journal"); + assertEquals(0, DuplicateCheck.compareEntriesStrictly(e1, e2), 0.01); + } + + @Test + public void strictDuplicateForEqualFields() { + BibEntry e1 = new BibEntry(); + e1.setField("key1", "value1"); + e1.setField("key2", "value2"); + BibEntry e2 = new BibEntry(); + e2.setField("key1", "value1"); + e2.setField("key2", "value2"); + assertEquals(1, DuplicateCheck.compareEntriesStrictly(e1, e2), 0.01); + } + + @Test + public void noStrictDuplicateForDifferentKeys() { + BibEntry e1 = new BibEntry(); + e1.setField("key", "value1"); + BibEntry e2 = new BibEntry(); + e2.setField("key1", "value1");; + assertEquals(0, DuplicateCheck.compareEntriesStrictly(e1, e2), 0.01); + } + + @Test + public void noStrictDuplicateForDifferentValues() { + BibEntry e1 = new BibEntry(); + e1.setField("key1", "value"); + BibEntry e2 = new BibEntry(); + e2.setField("key1", "value1"); + assertEquals(0, DuplicateCheck.compareEntriesStrictly(e1, e2), 0.01); + } + + @Test + public void noStrictDuplicateIsCaseInsensitiveForKey() { + BibEntry e1 = new BibEntry(); + e1.setField("KEY1", "value"); + BibEntry e2 = new BibEntry(); + e2.setField("key1", "value"); + assertEquals(1, DuplicateCheck.compareEntriesStrictly(e1, e2), 0.01); + } + + @Test + public void noStrictDuplicateIsCaseSensitiveForValue() { + BibEntry e1 = new BibEntry(); + e1.setField("key1", "Value"); + BibEntry e2 = new BibEntry(); + e2.setField("key1", "value"); + assertEquals(0, DuplicateCheck.compareEntriesStrictly(e1, e2), 0.01); + } + @Test public void testDuplicateDetection() { BibEntry one = new BibEntry(IdGenerator.next(), BibtexEntryTypes.ARTICLE.getName()); @@ -45,11 +99,9 @@ public void testDuplicateDetection() { one.setField("journal", "A"); two.setField("journal", "A"); assertTrue(DuplicateCheck.isDuplicate(one, two, BibDatabaseMode.BIBTEX)); - assertEquals(1.01, DuplicateCheck.compareEntriesStrictly(one, two), 0.01); two.setField("journal", "B"); assertTrue(DuplicateCheck.isDuplicate(one, two, BibDatabaseMode.BIBTEX)); - assertEquals(0.75, DuplicateCheck.compareEntriesStrictly(one, two), 0.01); two.setField("journal", "A"); one.setField("number", "1"); From 6b83848e791e3ebfd145ba6af11d9f7b01b228f4 Mon Sep 17 00:00:00 2001 From: Stefan Kolb Date: Mon, 29 Aug 2016 17:31:15 +0200 Subject: [PATCH 3/4] Another find --- src/main/java/net/sf/jabref/model/DuplicateCheck.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/net/sf/jabref/model/DuplicateCheck.java b/src/main/java/net/sf/jabref/model/DuplicateCheck.java index 3e952e927cb..1d3c9d7e593 100644 --- a/src/main/java/net/sf/jabref/model/DuplicateCheck.java +++ b/src/main/java/net/sf/jabref/model/DuplicateCheck.java @@ -106,6 +106,7 @@ private static double[] compareFieldSet(List fields, BibEntry one, BibEn } totWeights += weight; + // TODO: EMPTY_IN_ONE, EMPTY_IN_TWO, NOT_EQUAL is not used CheckResult result = DuplicateCheck.compareSingleField(field, one, two); if (result == CheckResult.EQUAL) { res += weight; From 99e7b4cf0e3c5306cd5b2dbd1a301c205507038a Mon Sep 17 00:00:00 2001 From: Stefan Kolb Date: Mon, 29 Aug 2016 17:49:16 +0200 Subject: [PATCH 4/4] Add duplicate database --- .../net/sf/jabref/model/DuplicateCheck.java | 2 ++ .../net/sf/jabref/model/duplicates.bib | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 src/test/resources/net/sf/jabref/model/duplicates.bib diff --git a/src/main/java/net/sf/jabref/model/DuplicateCheck.java b/src/main/java/net/sf/jabref/model/DuplicateCheck.java index 1d3c9d7e593..12aa7147f27 100644 --- a/src/main/java/net/sf/jabref/model/DuplicateCheck.java +++ b/src/main/java/net/sf/jabref/model/DuplicateCheck.java @@ -24,6 +24,8 @@ /** * This class contains utility method for duplicate checking of entries. + * + * TODO: http://hpi.de/naumann/projects/data-quality-and-cleansing/dude-duplicate-detection.html */ public class DuplicateCheck { private static final Log LOGGER = LogFactory.getLog(DuplicateCheck.class); diff --git a/src/test/resources/net/sf/jabref/model/duplicates.bib b/src/test/resources/net/sf/jabref/model/duplicates.bib new file mode 100644 index 00000000000..8ae5fa0103f --- /dev/null +++ b/src/test/resources/net/sf/jabref/model/duplicates.bib @@ -0,0 +1,17 @@ +% Encoding: UTF-8 + +@InProceedings{dupJournalTechreport, + author = {Kolb, Stefan and Wirtz, Guido}, + title = {This was published at a Workshop and a Journal with the same Title}, + booktitle = {Proc. Workshop Cloud}, + year = {2016}, +} + +@Article{dupJournalTechreport, + author = {Kolb, Stefan and Wirtz, Guido}, + title = {This was published at a Workshop and a Journal with the same Title}, + journal = {Transactions on Cloud Computing}, + year = {2016}, +} + +@Comment{jabref-meta: databaseType:bibtex;}