From 074ae3ec7f05bb69464b9184d4a523cd341f114c Mon Sep 17 00:00:00 2001
From: Stefan Kolb <stefan-kolb@web.de>
Date: Mon, 29 Aug 2016 16:57:29 +0200
Subject: [PATCH 1/4] Refactor and test a little bit

---
 .../net/sf/jabref/model/DuplicateCheck.java   | 147 ++++++++----------
 .../sf/jabref/model/DuplicateCheckTest.java   |  47 ++++--
 2 files changed, 94 insertions(+), 100 deletions(-)
diff --git a/src/main/java/net/sf/jabref/model/DuplicateCheck.java b/src/main/java/net/sf/jabref/model/DuplicateCheck.java
index 056920d2bd5..36436c4c7a0 100644
--- a/src/main/java/net/sf/jabref/model/DuplicateCheck.java
+++ b/src/main/java/net/sf/jabref/model/DuplicateCheck.java
@@ -3,6 +3,7 @@
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
@@ -16,6 +17,7 @@
 import net.sf.jabref.model.entry.FieldProperty;
 import net.sf.jabref.model.entry.InternalBibtexFields;
 
+import info.debatty.java.stringsimilarity.Levenshtein;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
@@ -23,18 +25,18 @@
  * This class contains utility method for duplicate checking of entries.
  */
 public class DuplicateCheck {
-
     private static final Log LOGGER = LogFactory.getLog(DuplicateCheck.class);
 
-    /*
-     * Integer values for indicating result of duplicate check (for entries):
-     *
+    /**
+     * Enumeration for indicating the result of a duplicate check
      */
-    private static final int NOT_EQUAL = 0;
-    private static final int EQUAL = 1;
-    private static final int EMPTY_IN_ONE = 2;
-    private static final int EMPTY_IN_TWO = 3;
-    private static final int EMPTY_IN_BOTH = 4;
+    private enum CheckResult {
+        NOT_EQUAL,
+        EQUAL,
+        EMPTY_IN_ONE,
+        EMPTY_IN_TWO,
+        EMPTY_IN_BOTH
+    }
 
     public static double duplicateThreshold = 0.75; // The overall threshold to signal a duplicate pair
     // Non-required fields are investigated only if the required fields give a value within
@@ -46,7 +48,6 @@ public class DuplicateCheck {
     // Extra weighting of those fields that are most likely to provide correct duplicate detection:
     private static final Map<String, Double> FIELD_WEIGHTS = new HashMap<>();
 
-
     static {
         DuplicateCheck.FIELD_WEIGHTS.put(FieldName.AUTHOR, 2.5);
         DuplicateCheck.FIELD_WEIGHTS.put(FieldName.EDITOR, 2.5);
@@ -54,7 +55,6 @@ public class DuplicateCheck {
         DuplicateCheck.FIELD_WEIGHTS.put(FieldName.JOURNAL, 2.);
     }
 
-
     /**
      * Checks if the two entries represent the same publication.
      *
@@ -63,20 +63,19 @@ public class DuplicateCheck {
      * @return boolean
      */
     public static boolean isDuplicate(BibEntry one, BibEntry two, BibDatabaseMode bibDatabaseMode) {
-
-        // First check if they are of the same type - a necessary condition:
+        // same type is mandatory
         if (!one.getType().equals(two.getType())) {
             return false;
         }
-        EntryType type = EntryTypes.getTypeOrDefault(one.getType(), bibDatabaseMode);
 
-        // The check if they have the same required fields:
-        List<String> var = type.getRequiredFieldsFlat();
+        // check for equal required fields
+        EntryType entryType = EntryTypes.getTypeOrDefault(one.getType(), bibDatabaseMode);
+        List<String> requiredFields = entryType.getRequiredFieldsFlat();
         double[] req;
-        if (var == null) {
+        if (requiredFields == null) {
             req = new double[]{0., 0.};
         } else {
-            req = DuplicateCheck.compareFieldSet(var, one, two);
+            req = DuplicateCheck.compareFieldSet(requiredFields, one, two);
         }
 
         if (Math.abs(req[0] - DuplicateCheck.duplicateThreshold) > DuplicateCheck.DOUBT_RANGE) {
@@ -84,7 +83,8 @@ public static boolean isDuplicate(BibEntry one, BibEntry two, BibDatabaseMode bi
             return req[0] >= DuplicateCheck.duplicateThreshold;
         }
         // Close to the threshold value, so we take a look at the optional fields, if any:
-        List<String> optionalFields = type.getOptionalFields();
+        // check for equal optional fields
+        List<String> optionalFields = entryType.getOptionalFields();
         if (optionalFields != null) {
             double[] opt = DuplicateCheck.compareFieldSet(optionalFields, one, two);
             double totValue = ((DuplicateCheck.REQUIRED_WEIGHT * req[0] * req[1]) + (opt[0] * opt[1])) / ((req[1] * DuplicateCheck.REQUIRED_WEIGHT) + opt[1]);
@@ -104,10 +104,11 @@ private static double[] compareFieldSet(List<String> fields, BibEntry one, BibEn
                 weight = 1.0;
             }
             totWeights += weight;
-            int result = DuplicateCheck.compareSingleField(field, one, two);
-            if (result == EQUAL) {
+
+            CheckResult result = DuplicateCheck.compareSingleField(field, one, two);
+            if (result == CheckResult.EQUAL) {
                 res += weight;
-            } else if (result == EMPTY_IN_BOTH) {
+            } else if (result == CheckResult.EMPTY_IN_BOTH) {
                 totWeights -= weight;
             }
         }
@@ -117,16 +118,16 @@ private static double[] compareFieldSet(List<String> fields, BibEntry one, BibEn
         return new double[] {0.5, 0.0};
     }
 
-    private static int compareSingleField(String field, BibEntry one, BibEntry two) {
+    private static CheckResult compareSingleField(String field, BibEntry one, BibEntry two) {
         Optional<String> optionalStringOne = one.getField(field);
         Optional<String> optionalStringTwo = two.getField(field);
         if (!optionalStringOne.isPresent()) {
             if (!optionalStringTwo.isPresent()) {
-                return EMPTY_IN_BOTH;
+                return CheckResult.EMPTY_IN_BOTH;
             }
-            return EMPTY_IN_ONE;
+            return CheckResult.EMPTY_IN_ONE;
         } else if (!optionalStringTwo.isPresent()) {
-            return EMPTY_IN_TWO;
+            return CheckResult.EMPTY_IN_TWO;
         }
 
         // Both strings present
@@ -140,9 +141,9 @@ private static int compareSingleField(String field, BibEntry one, BibEntry two)
             String authorTwo = AuthorList.fixAuthorLastNameOnlyCommas(stringTwo, false).replace(" and ", " ").toLowerCase();
             double similarity = DuplicateCheck.correlateByWords(authorOne, authorTwo);
             if (similarity > 0.8) {
-                return EQUAL;
+                return CheckResult.EQUAL;
             }
-            return NOT_EQUAL;
+            return CheckResult.NOT_EQUAL;
         } else if (FieldName.PAGES.equals(field)) {
             // Pages can be given with a variety of delimiters, "-", "--", " - ", " -- ".
             // We do a replace to harmonize these to a simple "-":
@@ -150,9 +151,9 @@ private static int compareSingleField(String field, BibEntry one, BibEntry two)
             stringOne = stringOne.replaceAll("[- ]+", "-");
             stringTwo = stringTwo.replaceAll("[- ]+", "-");
             if (stringOne.equals(stringTwo)) {
-                return EQUAL;
+                return CheckResult.EQUAL;
             }
-            return NOT_EQUAL;
+            return CheckResult.NOT_EQUAL;
         } else if (FieldName.JOURNAL.equals(field)) {
             // We do not attempt to harmonize abbreviation state of the journal names,
             // but we remove periods from the names in case they are abbreviated with
@@ -161,17 +162,17 @@ private static int compareSingleField(String field, BibEntry one, BibEntry two)
             stringTwo = stringTwo.replace(".", "").toLowerCase();
             double similarity = DuplicateCheck.correlateByWords(stringOne, stringTwo);
             if (similarity > 0.8) {
-                return EQUAL;
+                return CheckResult.EQUAL;
             }
-            return NOT_EQUAL;
+            return CheckResult.NOT_EQUAL;
         } else {
             stringOne = stringOne.toLowerCase();
             stringTwo = stringTwo.toLowerCase();
             double similarity = DuplicateCheck.correlateByWords(stringOne, stringTwo);
             if (similarity > 0.8) {
-                return EQUAL;
+                return CheckResult.EQUAL;
             }
-            return NOT_EQUAL;
+            return CheckResult.NOT_EQUAL;
         }
     }
 
@@ -217,18 +218,21 @@ public static Optional<BibEntry> containsDuplicate(BibDatabase database, BibEntr
 
     /**
      * Compare two strings on the basis of word-by-word correlation analysis.
+     * TODO: strange algorithm: words are only compared position by position, so an inserted word gives a bad value, e.g.,
+     * a test -> this a test (0.0)
+     * characterization -> characterization of me (1.0)
      *
-     * @param s1       The first string
-     * @param s2       The second string
+     * @param s1 The first string
+     * @param s2 The second string
      * @return a value in the interval [0, 1] indicating the degree of match.
      */
     public static double correlateByWords(String s1, String s2) {
-        String[] w1 = s1.split("\\s");
-        String[] w2 = s2.split("\\s");
-        int n = Math.min(w1.length, w2.length);
+        String[] words1 = s1.split("\\s");
+        String[] words2 = s2.split("\\s");
+        int n = Math.min(words1.length, words2.length);
         int misses = 0;
         for (int i = 0; i < n; i++) {
-            double corr = similarity(w1[i], w2[i]);
+            double corr = similarity(words1[i], words2[i]);
             if (corr < 0.75) {
                 misses++;
             }
@@ -239,58 +243,31 @@ public static double correlateByWords(String s1, String s2) {
 
 
     /**
-     * Calculates the similarity (a number within 0 and 1) between two strings.
-     * http://stackoverflow.com/questions/955110/similarity-string-comparison-in-java
+     * Calculates the similarity between two strings.
+     * <p>
+     * The result will be in the interval [0, 1].
      */
     private static double similarity(String s1, String s2) {
-        String longer = s1;
-        String shorter = s2;
-
-        if (s1.length() < s2.length()) { // longer should always have greater length
-            longer = s2;
-            shorter = s1;
+        // method is performance optimized
+        String longerString = s1;
+        String shorterString = s2;
+
+        // determine longer string
+        if (s1.length() < s2.length()) {
+            longerString = s2;
+            shorterString = s1;
         }
-        int longerLength = longer.length();
+
+        int longerLength = longerString.length();
+        // both strings are zero length
         if (longerLength == 0) {
             return 1.0;
-            /* both strings are zero length */ }
-        double sim = (longerLength - editDistance(longer, shorter)) / (double) longerLength;
-        LOGGER.debug("Longer string: " + longer + " Shorter string: " + shorter + " Similarity: " + sim);
-        return sim;
+        }
 
+        return (longerLength - levenshteinDistance(longerString, shorterString)) / longerLength;
     }
 
-    /*
-    * Levenshtein Edit Distance
-    * http://stackoverflow.com/questions/955110/similarity-string-comparison-in-java
-    */
-    private static int editDistance(String s1, String s2) {
-        String s1LowerCase = s1.toLowerCase();
-        String s2LowerCase = s2.toLowerCase();
-
-        int[] costs = new int[s2LowerCase.length() + 1];
-        for (int i = 0; i <= s1LowerCase.length(); i++) {
-            int lastValue = i;
-            for (int j = 0; j <= s2LowerCase.length(); j++) {
-                if (i == 0) {
-                    costs[j] = j;
-                } else if (j > 0) {
-                    int newValue = costs[j - 1];
-                    if (s1LowerCase.charAt(i - 1) != s2LowerCase.charAt(j - 1)) {
-                        newValue = Math.min(Math.min(newValue, lastValue), costs[j]) + 1;
-                    }
-                    costs[j - 1] = lastValue;
-                    lastValue = newValue;
-
-                }
-            }
-            if (i > 0) {
-                costs[s2LowerCase.length()] = lastValue;
-            }
-        }
-        LOGGER.debug("String 1: " + s1LowerCase + " String 2: " + s2LowerCase + " Distance: " + costs[s2LowerCase.length()]);
-        return costs[s2LowerCase.length()];
+    private static double levenshteinDistance(String s1, String s2) {
+        return new Levenshtein().distance(s1.toLowerCase(Locale.ENGLISH), s2.toLowerCase(Locale.ENGLISH));
     }
-
-
 }
diff --git a/src/test/java/net/sf/jabref/model/DuplicateCheckTest.java b/src/test/java/net/sf/jabref/model/DuplicateCheckTest.java
index d0d05212dfb..e3929050c0c 100644
--- a/src/test/java/net/sf/jabref/model/DuplicateCheckTest.java
+++ b/src/test/java/net/sf/jabref/model/DuplicateCheckTest.java
@@ -11,14 +11,14 @@
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
-/**
- * Created by IntelliJ IDEA.
- * User: alver
- * Date: Nov 9, 2007
- * Time: 7:04:25 PM
- * To change this template use File | Settings | File Templates.
- */
 public class DuplicateCheckTest {
+    @Test
+    public void noDuplicateForDifferentTypes() {
+        BibEntry e1 = new BibEntry("1", "article");
+        BibEntry e2 = new BibEntry("2", "journal");
+        assertFalse(DuplicateCheck.isDuplicate(e1, e2, BibDatabaseMode.BIBTEX));
+        assertFalse(DuplicateCheck.isDuplicate(e1, e2, BibDatabaseMode.BIBLATEX));
+    }
 
     @Test
     public void testDuplicateDetection() {
@@ -77,14 +77,31 @@ public void testDuplicateDetection() {
     }
 
     @Test
-    public void testWordCorrelation() {
-        String d1 = "Characterization of Calanus finmarchicus habitat in the North Sea";
-        String d2 = "Characterization of Calunus finmarchicus habitat in the North Sea";
-        String d3 = "Characterization of Calanus glacialissss habitat in the South Sea";
-
-        assertEquals(1.0, (DuplicateCheck.correlateByWords(d1, d2)), 0.01);
-        assertEquals(0.78, (DuplicateCheck.correlateByWords(d1, d3)), 0.01);
-        assertEquals(0.78, (DuplicateCheck.correlateByWords(d2, d3)), 0.01);
+    public void wordCorrelationIsOneForEmptyStrings() {
+        assertEquals(1.0, DuplicateCheck.correlateByWords("", ""), 0.01);
     }
 
+    @Test
+    public void wordCorrelationForSmallerFirstString() {
+        String d1 = "a test";
+        String d2 = "this a test";
+
+        assertEquals(0.0, DuplicateCheck.correlateByWords(d1, d2), 0.01);
+    }
+
+    @Test
+    public void wordCorrelationForBiggerFirstString() {
+        String d1 = "Characterization of me";
+        String d2 = "Characterization";
+
+        assertEquals(1.0, DuplicateCheck.correlateByWords(d1, d2), 0.01);
+    }
+
+    @Test
+    public void wordCorrelationForEqualStrings() {
+        String d1 = "Characterization";
+        String d2 = "Characterization";
+
+        assertEquals(1.0, DuplicateCheck.correlateByWords(d1, d2), 0.01);
+    }
 }

From 4645813b96a969627d0c32c67fb2b3365c61149a Mon Sep 17 00:00:00 2001
From: Stefan Kolb <stefan-kolb@web.de>
Date: Mon, 29 Aug 2016 17:22:04 +0200
Subject: [PATCH 2/4] Improve tests for duplicate detection

---
 .../net/sf/jabref/model/DuplicateCheck.java   |  7 +--
 .../sf/jabref/model/DuplicateCheckTest.java   | 56 ++++++++++++++++++-
 2 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/src/main/java/net/sf/jabref/model/DuplicateCheck.java b/src/main/java/net/sf/jabref/model/DuplicateCheck.java
index 36436c4c7a0..3e952e927cb 100644
--- a/src/main/java/net/sf/jabref/model/DuplicateCheck.java
+++ b/src/main/java/net/sf/jabref/model/DuplicateCheck.java
@@ -5,6 +5,7 @@
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Objects;
 import java.util.Optional;
 import java.util.Set;
 
@@ -185,14 +186,12 @@ public static double compareEntriesStrictly(BibEntry one, BibEntry two) {
         for (String field : allFields) {
             Optional<String> stringOne = one.getField(field);
             Optional<String> stringTwo = two.getField(field);
-            if (stringOne.equals(stringTwo)) {
+            if (Objects.equals(stringOne, stringTwo)) {
                 score++;
             }
         }
         if (score == allFields.size()) {
-            return 1.01; // Just to make sure we can
-            // use score>1 without
-            // trouble.
+            return 1.0;
         }
         return (double) score / allFields.size();
     }
diff --git a/src/test/java/net/sf/jabref/model/DuplicateCheckTest.java b/src/test/java/net/sf/jabref/model/DuplicateCheckTest.java
index e3929050c0c..dad1275e7d3 100644
--- a/src/test/java/net/sf/jabref/model/DuplicateCheckTest.java
+++ b/src/test/java/net/sf/jabref/model/DuplicateCheckTest.java
@@ -20,6 +20,60 @@ public void noDuplicateForDifferentTypes() {
         assertFalse(DuplicateCheck.isDuplicate(e1, e2, BibDatabaseMode.BIBLATEX));
     }
 
+    @Test
+    public void noStrictDuplicateForDifferentTypes() {
+        BibEntry e1 = new BibEntry("1", "article");
+        BibEntry e2 = new BibEntry("2", "journal");
+        assertEquals(0, DuplicateCheck.compareEntriesStrictly(e1, e2), 0.01);
+    }
+
+    @Test
+    public void strictDuplicateForEqualFields() {
+        BibEntry e1 = new BibEntry();
+        e1.setField("key1", "value1");
+        e1.setField("key2", "value2");
+        BibEntry e2 = new BibEntry();
+        e2.setField("key1", "value1");
+        e2.setField("key2", "value2");
+        assertEquals(1, DuplicateCheck.compareEntriesStrictly(e1, e2), 0.01);
+    }
+
+    @Test
+    public void noStrictDuplicateForDifferentKeys() {
+        BibEntry e1 = new BibEntry();
+        e1.setField("key", "value1"); // field name "key"
+        BibEntry e2 = new BibEntry();
+        e2.setField("key1", "value1"); // same value stored under a different field name "key1"
+        assertEquals(0, DuplicateCheck.compareEntriesStrictly(e1, e2), 0.01); // no field matches, so the score is 0
+    }
+
+    @Test
+    public void noStrictDuplicateForDifferentValues() {
+        BibEntry e1 = new BibEntry();
+        e1.setField("key1", "value");
+        BibEntry e2 = new BibEntry();
+        e2.setField("key1", "value1");
+        assertEquals(0, DuplicateCheck.compareEntriesStrictly(e1, e2), 0.01);
+    }
+
+    @Test
+    public void noStrictDuplicateIsCaseInsensitiveForKey() {
+        BibEntry e1 = new BibEntry();
+        e1.setField("KEY1", "value");
+        BibEntry e2 = new BibEntry();
+        e2.setField("key1", "value");
+        assertEquals(1, DuplicateCheck.compareEntriesStrictly(e1, e2), 0.01);
+    }
+
+    @Test
+    public void noStrictDuplicateIsCaseSensitiveForValue() {
+        BibEntry e1 = new BibEntry();
+        e1.setField("key1", "Value");
+        BibEntry e2 = new BibEntry();
+        e2.setField("key1", "value");
+        assertEquals(0, DuplicateCheck.compareEntriesStrictly(e1, e2), 0.01);
+    }
+
     @Test
     public void testDuplicateDetection() {
         BibEntry one = new BibEntry(IdGenerator.next(), BibtexEntryTypes.ARTICLE.getName());
@@ -45,11 +99,9 @@ public void testDuplicateDetection() {
         one.setField("journal", "A");
         two.setField("journal", "A");
         assertTrue(DuplicateCheck.isDuplicate(one, two, BibDatabaseMode.BIBTEX));
-        assertEquals(1.01, DuplicateCheck.compareEntriesStrictly(one, two), 0.01);
 
         two.setField("journal", "B");
         assertTrue(DuplicateCheck.isDuplicate(one, two, BibDatabaseMode.BIBTEX));
-        assertEquals(0.75, DuplicateCheck.compareEntriesStrictly(one, two), 0.01);
 
         two.setField("journal", "A");
         one.setField("number", "1");

From 6b83848e791e3ebfd145ba6af11d9f7b01b228f4 Mon Sep 17 00:00:00 2001
From: Stefan Kolb <stefan-kolb@web.de>
Date: Mon, 29 Aug 2016 17:31:15 +0200
Subject: [PATCH 3/4] Another find

---
 src/main/java/net/sf/jabref/model/DuplicateCheck.java | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/main/java/net/sf/jabref/model/DuplicateCheck.java b/src/main/java/net/sf/jabref/model/DuplicateCheck.java
index 3e952e927cb..1d3c9d7e593 100644
--- a/src/main/java/net/sf/jabref/model/DuplicateCheck.java
+++ b/src/main/java/net/sf/jabref/model/DuplicateCheck.java
@@ -106,6 +106,7 @@ private static double[] compareFieldSet(List<String> fields, BibEntry one, BibEn
             }
             totWeights += weight;
 
+            // TODO: EMPTY_IN_ONE, EMPTY_IN_TWO and NOT_EQUAL are not used (only EQUAL and EMPTY_IN_BOTH affect the score)
             CheckResult result = DuplicateCheck.compareSingleField(field, one, two);
             if (result == CheckResult.EQUAL) {
                 res += weight;

From 99e7b4cf0e3c5306cd5b2dbd1a301c205507038a Mon Sep 17 00:00:00 2001
From: Stefan Kolb <stefan-kolb@web.de>
Date: Mon, 29 Aug 2016 17:49:16 +0200
Subject: [PATCH 4/4] Add duplicate database

---
 .../net/sf/jabref/model/DuplicateCheck.java     |  2 ++
 .../net/sf/jabref/model/duplicates.bib          | 17 +++++++++++++++++
 2 files changed, 19 insertions(+)
 create mode 100644 src/test/resources/net/sf/jabref/model/duplicates.bib

diff --git a/src/main/java/net/sf/jabref/model/DuplicateCheck.java b/src/main/java/net/sf/jabref/model/DuplicateCheck.java
index 1d3c9d7e593..12aa7147f27 100644
--- a/src/main/java/net/sf/jabref/model/DuplicateCheck.java
+++ b/src/main/java/net/sf/jabref/model/DuplicateCheck.java
@@ -24,6 +24,8 @@
 
 /**
  * This class contains utility method for duplicate checking of entries.
+ *
+ * TODO: have a look at http://hpi.de/naumann/projects/data-quality-and-cleansing/dude-duplicate-detection.html
  */
 public class DuplicateCheck {
     private static final Log LOGGER = LogFactory.getLog(DuplicateCheck.class);
diff --git a/src/test/resources/net/sf/jabref/model/duplicates.bib b/src/test/resources/net/sf/jabref/model/duplicates.bib
new file mode 100644
index 00000000000..8ae5fa0103f
--- /dev/null
+++ b/src/test/resources/net/sf/jabref/model/duplicates.bib
@@ -0,0 +1,17 @@
+% Encoding: UTF-8
+
+@InProceedings{dupJournalTechreport,
+  author    = {Kolb, Stefan and Wirtz, Guido},
+  title     = {This was published at a Workshop and a Journal with the same Title},
+  booktitle = {Proc. Workshop Cloud},
+  year      = {2016},
+}
+
+@Article{dupJournalTechreport,
+  author  = {Kolb, Stefan and Wirtz, Guido},
+  title   = {This was published at a Workshop and a Journal with the same Title},
+  journal = {Transactions on Cloud Computing},
+  year    = {2016},
+}
+
+@Comment{jabref-meta: databaseType:bibtex;}