JabRef · koppor · Oct 3, 2020 · Sep 4, 2020 · Sep 8, 2020 · Sep 21, 2020
diff --git a/docs/adr/0017-use-regular-expression-to-split-multiple-sentence-titles.md b/docs/adr/0017-use-regular-expression-to-split-multiple-sentence-titles.md
@@ -0,0 +1,26 @@
+# Use regular expression to split multiple-sentence titles
+
+## Context and Problem Statement
+
+Some entry titles are composed of multiple sentences, for example "Whose Music? A Sociology of Musical Language". To format such titles properly in '[Sentence Case](https://en.wiktionary.org/wiki/sentence_case)' or '[Title Case](https://en.wiktionary.org/wiki/title_case#English)', the title must first be split into sentences, which are then processed individually.
+
+## Considered Options
+
+* [Regular expression](https://docs.oracle.com/javase/tutorial/essential/regex/)
+* [OpenNLP](https://opennlp.apache.org/)
+* [ICU4J](http://site.icu-project.org/home)
+
+## Decision Outcome
+
+Chosen option: "Regular expression", because we can use classes built into Java (`Pattern`, `Matcher`) instead of adding additional dependencies.
+
+### Positive Consequences
+
+* Fewer dependencies on third-party libraries
+* Smaller project size (ICU4J is very large)
+* No need for model data (OpenNLP is a machine learning based toolkit and needs a trained model to work properly)
+
+### Negative Consequences
+
+* Regular expressions can never cover every case; therefore, splitting may not be accurate for every title.
+
diff --git a/src/main/java/org/jabref/logic/formatter/casechanger/SentenceCaseFormatter.java b/src/main/java/org/jabref/logic/formatter/casechanger/SentenceCaseFormatter.java
@@ -1,7 +1,10 @@
 package org.jabref.logic.formatter.casechanger;
 
+import java.util.stream.Collectors;
+
 import org.jabref.logic.cleanup.Formatter;
 import org.jabref.logic.l10n.Localization;
+import org.jabref.model.strings.StringUtil;
 
 public class SentenceCaseFormatter extends Formatter {
 
@@ -20,11 +23,16 @@ public String getKey() {
      */
     @Override
     public String format(String input) {
-        Title title = new Title(new LowerCaseFormatter().format(input));
-
-        title.getWords().stream().findFirst().ifPresent(Word::toUpperFirst);
-
-        return title.toString();
+        return StringUtil.getStringAsSentences(input) // split into sentences so each one is cased on its own
+                .stream()
+                .map(new LowerCaseFormatter()::format) // apply LowerCaseFormatter to every sentence
+                .map(Title::new)
+                .map(title -> {
+                    title.getFirstWord().ifPresent(Word::toUpperFirst); // upper-case the first word of the sentence, if there is one
+                    return title;
+                })
+                .map(Object::toString)
+                .collect(Collectors.joining(" ")); // sentences are re-joined with exactly one space
     }
 
     @Override

diff --git a/src/main/java/org/jabref/logic/formatter/casechanger/TitleCaseFormatter.java b/src/main/java/org/jabref/logic/formatter/casechanger/TitleCaseFormatter.java
@@ -1,7 +1,10 @@
 package org.jabref.logic.formatter.casechanger;
 
+import java.util.stream.Collectors;
+
 import org.jabref.logic.cleanup.Formatter;
 import org.jabref.logic.l10n.Localization;
+import org.jabref.model.strings.StringUtil;
 
 public class TitleCaseFormatter extends Formatter {
 
@@ -22,21 +25,26 @@ public String getKey() {
      */
     @Override
     public String format(String input) {
-        Title title = new Title(input);
-
-        title.getWords().stream().filter(Word::isSmallerWord).forEach(Word::toLowerCase);
-        title.getWords().stream().filter(Word::isLargerWord).forEach(Word::toUpperFirst);
-
-        title.getFirstWord().ifPresent(Word::toUpperFirst);
-        title.getLastWord().ifPresent(Word::toUpperFirst);
-
-        for (int i = 0; i < (title.getWords().size() - 2); i++) {
-            if (title.getWords().get(i).endsWithColon()) {
-                title.getWords().get(i + 1).toUpperFirst();
-            }
-        }
-
-        return title.toString();
+        return StringUtil.getStringAsSentences(input) // split into sentences so each one is title-cased on its own
+                .stream()
+                .map(sentence -> {
+                    Title title = new Title(sentence);
+
+                    title.getWords().stream().filter(Word::isSmallerWord).forEach(Word::toLowerCase); // "smaller" words become lower case
+                    title.getWords().stream().filter(Word::isLargerWord).forEach(Word::toUpperFirst); // "larger" words get an upper-case first letter
+
+                    title.getFirstWord().ifPresent(Word::toUpperFirst); // first and last word of a sentence are capitalized regardless of the filters above
+                    title.getLastWord().ifPresent(Word::toUpperFirst);
+
+                    for (int i = 0; i < (title.getWords().size() - 2); i++) { // NOTE(review): bound skips the second-to-last word; presumably fine because its successor is the last word, capitalized above - confirm
+                        if (title.getWords().get(i).endsWithColon()) {
+                            title.getWords().get(i + 1).toUpperFirst(); // capitalize the word that follows a colon
+                        }
+                    }
+
+                    return title.toString();
+                })
+                .collect(Collectors.joining(" ")); // sentences are re-joined with exactly one space
     }
 
     @Override

diff --git a/src/main/java/org/jabref/model/strings/StringUtil.java b/src/main/java/org/jabref/model/strings/StringUtil.java
@@ -720,6 +720,15 @@ public static List<String> getStringAsWords(String text) {
         return Arrays.asList(text.split("[\\s,;]+"));
     }
 
+    /**
+     * Splits the given text into sentences and returns them in order; the whitespace between two sentences is consumed by the split and is not part of any returned sentence.
+     */
+    public static List<String> getStringAsSentences(String text) {
+        // Split at whitespace preceded by '.', '!', ';' or '?', unless that whitespace directly follows "Mr.", "Ms.", "Mrs.", "Dr.", "st.", "jr.", "co.", "inc." or "ltd." (each letter matched in either case)
+        Pattern splitTextPattern = Pattern.compile("(?<=[\\.!;\\?])(?<![Mm](([Rr]|[Rr][Ss])|[Ss])\\.|[Dd][Rr]\\.|[Ss][Tt]\\.|[Jj][Rr]\\.|[Cc][Oo]\\.|[Ii][Nn][Cc]\\.|[Ll][Tt][Dd]\\.)\\s+"); // NOTE(review): compiled on every call; could be hoisted into a static final constant
+        return Arrays.asList(splitTextPattern.split(text)); // fixed-size list view of the split result
+    }
+
     @ApacheCommonsLang3Allowed("No direct Guava equivalent existing - see https://stackoverflow.com/q/16560635/873282")
     public static boolean containsIgnoreCase(String text, String searchString) {
         return StringUtils.containsIgnoreCase(text, searchString);

diff --git a/src/test/java/org/jabref/logic/formatter/casechanger/SentenceCaseFormatterTest.java b/src/test/java/org/jabref/logic/formatter/casechanger/SentenceCaseFormatterTest.java
@@ -1,7 +1,10 @@
 package org.jabref.logic.formatter.casechanger;
 
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
+import java.util.stream.Stream;
+
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
@@ -10,23 +13,30 @@
  */
 public class SentenceCaseFormatterTest {
 
-    private SentenceCaseFormatter formatter;
-
-    @BeforeEach
-    public void setUp() {
-        formatter = new SentenceCaseFormatter();
-    }
+    private final SentenceCaseFormatter formatter = new SentenceCaseFormatter();
 
-    @Test
-    public void test() {
-        assertEquals("Upper first", formatter.format("upper First"));
-        assertEquals("Upper first", formatter.format("uPPER FIRST"));
-        assertEquals("Upper {NOT} first", formatter.format("upper {NOT} FIRST"));
-        assertEquals("Upper {N}ot first", formatter.format("upper {N}OT FIRST"));
+    private static Stream<Arguments> testData() { // every Arguments entry is (expected output, input)
+        return Stream.of(
+                Arguments.of("Upper first", "upper First"),
+                Arguments.of("Upper first", "uPPER FIRST"),
+                Arguments.of("Upper {NOT} first", "upper {NOT} FIRST"),
+                Arguments.of("Upper {N}ot first", "upper {N}OT FIRST"),
+                Arguments.of("Whose music? A sociology of musical language", // '?' ends a sentence, so "A" is capitalized
+                    "Whose music? a sociology of musical language"),
+                Arguments.of("Bibliographic software. A comparison.", // '.' ends a sentence
+                    "bibliographic software. a comparison."),
+                Arguments.of("England’s monitor; The history of the separation", // ';' ends a sentence as well
+                    "England’s Monitor; the History of the Separation"),
+                Arguments.of("Dr. schultz: a dentist turned bounty hunter.", // "Dr." must not end a sentence, so "schultz" stays lower case
+                    "Dr. schultz: a dentist turned bounty hunter."),
+                Arguments.of("Example case. {EXCLUDED SENTENCE.}", // a second sentence wrapped in braces is expected to stay unchanged
+                    "Example case. {EXCLUDED SENTENCE.}"),
+                Arguments.of("I have {Aa} dream", new SentenceCaseFormatter().getExampleInput()));
     }
 
-    @Test
-    public void formatExample() {
-        assertEquals("I have {Aa} dream", formatter.format(formatter.getExampleInput()));
+    @ParameterizedTest
+    @MethodSource("testData") // runs once per (expected, input) pair returned by testData()
+    public void test(String expected, String input) {
+        assertEquals(expected, formatter.format(input));
     }
 }
diff --git a/src/test/java/org/jabref/logic/formatter/casechanger/TitleCaseFormatterTest.java b/src/test/java/org/jabref/logic/formatter/casechanger/TitleCaseFormatterTest.java
@@ -1,7 +1,10 @@
 package org.jabref.logic.formatter.casechanger;
 
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
+import java.util.stream.Stream;
+
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
@@ -10,8 +13,43 @@
  */
 public class TitleCaseFormatterTest {
 
-    private TitleCaseFormatter formatter;
+    private final TitleCaseFormatter formatter = new TitleCaseFormatter();
+
+    private static Stream<Arguments> testData() { // every Arguments entry is (expected output, input)
+        return Stream.of(
+                Arguments.of("Upper Each First", "upper each first"),
+                Arguments.of("Upper Each First", "upper eACH first"),
+                Arguments.of("An Upper Each First And", "an upper each first and"),
+                Arguments.of("An Upper Each First And", "an upper each first AND"),
+                Arguments.of("An Upper Each of the and First And",
+                             "an upper each of the and first and"),
+                Arguments.of("An Upper Each of the and First And",
+                             "an upper each of the AND first and"),
+                Arguments.of("An Upper Each of: The and First And", // the word after a colon is expected to be capitalized
+                             "an upper each of: the and first and"),
+                Arguments.of("An Upper First with and without {CURLY} {brackets}", // words wrapped in braces are expected to stay unchanged
+                             "AN UPPER FIRST WITH AND WITHOUT {CURLY} {brackets}"),
+                Arguments.of("An Upper First with {A}nd without {C}urly {b}rackets",
+                             "AN UPPER FIRST WITH {A}ND WITHOUT {C}URLY {b}rackets"),
+                Arguments.of("{b}rackets {b}rac{K}ets Brack{E}ts",
+                             "{b}RaCKeTS {b}RaC{K}eTS bRaCK{E}ts"),
+                Arguments.of("Two Experiences Designing for Effective Security",
+                             "Two experiences designing for effective security"),
+                Arguments.of("Bibliographic Software. A Comparison.", // two sentences: "A" starts the second one and is capitalized
+                             "bibliographic software. a comparison."),
+                Arguments.of("Bibliographic Software. {A COMPARISON.}", // a second sentence wrapped in braces is expected to stay unchanged
+                             "bibliographic software. {A COMPARISON.}"),
+                Arguments.of("{BPMN} Conformance in Open Source Engines",
+                             new TitleCaseFormatter().getExampleInput()));
+    }
 
+    @ParameterizedTest
+    @MethodSource("testData") // runs once per (expected, input) pair returned by testData()
+    public void test(String expected, String input) {
+        assertEquals(expected, formatter.format(input));
+    }
+
+    /*
     @BeforeEach
     public void setUp() {
         formatter = new TitleCaseFormatter();
@@ -80,8 +118,21 @@ public void testTwoExperiencesTitle() {
                 formatter.format("Two experiences designing for effective security"));
     }
 
+    @Test
+    public void testSimpleTwoSentenceTitle() {
+        assertEquals("Bibliographic Software. A Comparison.",
+                     formatter.format("bibliographic software. a comparison."));
+    }
+
+    @Test
+    public void secondSentenceInBracketsIsLeftUnchanged() {
+        assertEquals("Bibliographic Software. {A COMPARISON.}",
+                     formatter.format("bibliographic software. {A COMPARISON.}"));
+    }
+
     @Test
     public void formatExample() {
         assertEquals("{BPMN} Conformance in Open Source Engines", formatter.format(formatter.getExampleInput()));
     }
+    */
 }