Updates to institution citation keys (#7210)

* Add test cases * Fix overly broad regexp The regexp is still broad, but unless there are further complaints perhaps it is enough. * Fix case-sensitivity in test case * Fix inline abbreviation for institutes * Drop test for short author * Add test case * Add test case * Fix test case for single word author * Fix un-escaped backslash in test case * Fix unbalanced brackets in test cases * Fix institute author abbreviations * Readability modifications * Add log output for generating university key When generating a key from a university name it should contain at least two parts, "university" and the university's name. If it does not, it is likely that the name contained LaTeX that could not be resolved correctly. * Fix JavaDoc * Update CHANGELOG.md * Add log message on mis-parsed LaTeX * Change fields to final * Fix institute abbreviation with special characters Some characters will be converted into a more BibTeX friendly form during citation key generation. Øresund Science Region should be abbreviated to OSR but instead becomes OeSR. * Drop out-of-scope test case * Codestyle change * Update src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java Co-authored-by: Christoph <siedlerkiller@gmail.com> * Update src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java Co-authored-by: Christoph <siedlerkiller@gmail.com> * Update src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java Co-authored-by: Christoph <siedlerkiller@gmail.com> * Update src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java Co-authored-by: Christoph <siedlerkiller@gmail.com> * Removes unnecessary checked exception * Fix missed NFC normalization * Add test case Co-authored-by: Christoph <siedlerkiller@gmail.com>
JabRef · Dec 28, 2020 · 78b08b5 · 78b08b5
1 parent a6749ed
commit 78b08b5
Show file tree

Hide file tree

Showing 7 changed files with 128 additions and 125 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -90,6 +90,8 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
 - We fixed an issue where the password for a shared SQL database was not remembered [#6869](https://github.com/JabRef/jabref/issues/6869)
 - We fixed an issue where newly added entries were not synced to a shared SQL database [#7176](https://github.com/JabRef/jabref/issues/7176)
 - We fixed an issue where the PDF-Content importer threw an exception when no DOI number is present at the first page of the PDF document [#7203](https://github.com/JabRef/jabref/issues/7203)
+- We fixed an issue where authors that only have last names were incorrectly identified as institutes when generating citation keys [#7199](https://github.com/JabRef/jabref/issues/7199)
+- We fixed an issue where institutes were incorrectly identified as universities when generating citation keys [#6942](https://github.com/JabRef/jabref/issues/6942)
 
 ### Removed
 

diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java
@@ -57,6 +57,7 @@
     requires reactfx;
     requires commons.cli;
     requires com.github.tomtung.latex2unicode;
+    requires fastparse;
     requires jbibtex;
     requires citeproc.java;
     requires antlr.runtime;

diff --git a/src/main/java/org/jabref/logic/citationkeypattern/BracketedPattern.java b/src/main/java/org/jabref/logic/citationkeypattern/BracketedPattern.java
@@ -1,5 +1,6 @@
 package org.jabref.logic.citationkeypattern;
 
+import java.text.Normalizer;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -31,6 +32,7 @@
 import org.jabref.model.entry.field.InternalField;
 import org.jabref.model.entry.field.StandardField;
 import org.jabref.model.strings.LatexToUnicodeAdapter;
+import org.jabref.model.strings.StringUtil;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -59,13 +61,14 @@ public class BracketedPattern {
      */
     private static final Pattern NOT_CAPITAL_CHARACTER = Pattern.compile("[^A-Z]");
     /**
-     * Matches with "({[A-Z]}+)", which should be used to abbreviate the name of an institution
+     * Matches uppercase English letters between "({" and "})", which should be used to abbreviate the name of an institution
      */
-    private static final Pattern ABBREVIATIONS = Pattern.compile(".*\\(\\{[A-Z]+}\\).*");
+    private static final Pattern INLINE_ABBREVIATION = Pattern.compile("(?<=\\(\\{)[A-Z]+(?=}\\))");
     /**
      * Matches with "dep"/"dip", case insensitive
      */
     private static final Pattern DEPARTMENTS = Pattern.compile("^d[ei]p.*", Pattern.CASE_INSENSITIVE);
+    private static final Pattern WHITESPACE = Pattern.compile("\\p{javaWhitespace}");
 
     private enum Institution {
         SCHOOL,
@@ -74,9 +77,9 @@ private enum Institution {
         TECHNOLOGY;
 
         /**
-         * Matches "uni" at the start of a string or after a space, case insensitive
+         * Matches "uni" at the start of a string when it is followed by "v", "b" or the end of the string, case insensitive
          */
-        private static final Pattern UNIVERSITIES = Pattern.compile("^uni.*", Pattern.CASE_INSENSITIVE);
+        private static final Pattern UNIVERSITIES = Pattern.compile("^uni(v|b|$).*", Pattern.CASE_INSENSITIVE);
         /**
          * Matches with "tech", case insensitive
          */
@@ -492,9 +495,9 @@ private static AuthorList createAuthorList(String unparsedAuthors) {
         for (Author author : AuthorList.parse(unparsedAuthors).getAuthors()) {
             // If the author is an institution, use an institution key instead of the full name
             String lastName = author.getLast()
-                                    .map(LatexToUnicodeAdapter::format)
-                                    .map(isInstitution(author) ?
-                                            BracketedPattern::generateInstitutionKey : Function.identity())
+                                    .map(lastPart -> isInstitution(author) ?
+                                            generateInstitutionKey(lastPart) :
+                                            LatexToUnicodeAdapter.format(lastPart))
                                     .orElse(null);
             authorList.addAuthor(
                     author.getFirst().map(LatexToUnicodeAdapter::format).orElse(null),
@@ -508,14 +511,15 @@ private static AuthorList createAuthorList(String unparsedAuthors) {
     }
 
     /**
-     * Checks if an author is an institution by verifying that only the last name is present.
+     * Checks if an author is an institution which can get a citation key from {@link #generateInstitutionKey(String)}.
      *
      * @param author the checked author
-     * @return true if only the last name is present
+     * @return true if only the last name is present and it contains at least one whitespace character.
      */
     private static boolean isInstitution(Author author) {
         return author.getFirst().isEmpty() && author.getFirstAbbr().isEmpty() && author.getJr().isEmpty()
-                && author.getVon().isEmpty() && author.getLast().isPresent();
+                && author.getVon().isEmpty() && author.getLast().isPresent()
+                && WHITESPACE.matcher(author.getLast().get()).find();
     }
 
     /**
@@ -658,52 +662,31 @@ public static String camelizeSignificantWordsInTitle(String title) {
     }
 
     public static String removeSmallWords(String title) {
-        StringJoiner stringJoiner = new StringJoiner(" ");
         String formattedTitle = formatTitle(title);
 
         try (Scanner titleScanner = new Scanner(formattedTitle)) {
-            mainl:
-            while (titleScanner.hasNext()) {
-                String word = titleScanner.next();
-
-                for (String smallWord : Word.SMALLER_WORDS) {
-                    if (word.equalsIgnoreCase(smallWord)) {
-                        continue mainl;
-                    }
-                }
-
-                stringJoiner.add(word);
-            }
+            return titleScanner.tokens()
+                               .filter(Predicate.not(
+                                       Word::isSmallerWord))
+                               .collect(Collectors.joining(" "));
         }
-
-        return stringJoiner.toString();
     }
 
     private static String getTitleWordsWithSpaces(int number, String title) {
-        StringJoiner stringJoiner = new StringJoiner(" ");
         String formattedTitle = formatTitle(title);
-        int words = 0;
 
         try (Scanner titleScanner = new Scanner(formattedTitle)) {
-            while (titleScanner.hasNext() && (words < number)) {
-                String word = titleScanner.next();
-
-                stringJoiner.add(word);
-                words++;
-            }
+            return titleScanner.tokens()
+                               .limit(number)
+                               .collect(Collectors.joining(" "));
         }
-
-        return stringJoiner.toString();
     }
 
     private static String keepLettersAndDigitsOnly(String in) {
-        StringBuilder stringBuilder = new StringBuilder();
-        for (int i = 0; i < in.length(); i++) {
-            if (Character.isLetterOrDigit(in.charAt(i))) {
-                stringBuilder.append(in.charAt(i));
-            }
-        }
-        return stringBuilder.toString();
+        return in.codePoints()
+                 .filter(Character::isLetterOrDigit)
+                 .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
+                 .toString();
     }
 
     /**
@@ -1131,51 +1114,6 @@ protected static List<String> parseFieldAndModifiers(String arg) {
         return parts;
     }
 
-    /**
-     * Will remove diacritics from the content.
-     * <ul>
-     * <li>Replaces umlaut: \"x with xe, e.g. \"o -> oe, \"u -> ue, etc.</li>
-     * <li>Removes all other diacritics: \?x -> x, e.g. \'a -> a, etc.</li>
-     * </ul>
-     *
-     * @param content The content.
-     * @return The content without diacritics.
-     */
-    private static String removeDiacritics(String content) {
-        if (content.isEmpty()) {
-            return content;
-        }
-
-        String result = content;
-        // Replace umlaut with '?e'
-        result = result.replaceAll("\\{\\\\\"([a-zA-Z])\\}", "$1e");
-        result = result.replaceAll("\\\\\"\\{([a-zA-Z])\\}", "$1e");
-        result = result.replaceAll("\\\\\"([a-zA-Z])", "$1e");
-        // Remove diacritics
-        result = result.replaceAll("\\{\\\\.([a-zA-Z])\\}", "$1");
-        result = result.replaceAll("\\\\.\\{([a-zA-Z])\\}", "$1");
-        result = result.replaceAll("\\\\.([a-zA-Z])", "$1");
-        return result;
-    }
-
-    /**
-     * Unifies umlauts.
-     * <ul>
-     * <li>Replaces: $\ddot{\mathrm{X}}$ (an alternative umlaut) with: {\"X}</li>
-     * <li>Replaces: \?{X} and \?X with {\?X}, where ? is a diacritic symbol</li>
-     * </ul>
-     *
-     * @param content The content.
-     * @return The content with unified diacritics.
-     */
-    private static String unifyDiacritics(String content) {
-        return content.replaceAll(
-                "\\$\\\\ddot\\{\\\\mathrm\\{([^\\}])\\}\\}\\$",
-                "{\\\"$1}").replaceAll(
-                "(\\\\[^\\-a-zA-Z])\\{?([a-zA-Z])\\}?",
-                "{$1$2}");
-    }
-
     /**
      * <p>
      * An author or editor may be an institution, not a person. In that case the key generator builds very long keys,
@@ -1248,15 +1186,20 @@ private static String generateInstitutionKey(String content) {
             return "";
         }
 
-        String result = content;
-        result = unifyDiacritics(result);
-        result = result.replaceAll("^\\{", "").replaceAll("}$", "");
-        Matcher matcher = ABBREVIATIONS.matcher(result);
-        if (matcher.matches()) {
-            return matcher.group(1);
+        Matcher matcher = INLINE_ABBREVIATION.matcher(content);
+        if (matcher.find()) {
+            return LatexToUnicodeAdapter.format(matcher.group());
         }
 
-        result = removeDiacritics(result);
+        Optional<String> unicodeFormattedName = LatexToUnicodeAdapter.parse(content);
+        if (unicodeFormattedName.isEmpty()) {
+            LOGGER.warn("{} could not be converted to unicode. This can result in an incorrect or missing institute citation key", content);
+        }
+        String result = unicodeFormattedName.orElse(Normalizer.normalize(content, Normalizer.Form.NFC));
+
+        // Special characters can't be allowed past this point because the citation key generator might replace them with multiple mixed-case characters
+        result = StringUtil.replaceSpecialCharacters(result);
+
         String[] institutionNameTokens = result.split(",");
 
         // Key parts
@@ -1335,7 +1278,6 @@ private static String generateInstitutionKey(String content) {
      * institution keyword and has an uppercase first letter, except univ/tech key word.
      *
      * @param word to check
-     * @return
      */
     private static boolean noOtherInstitutionKeyWord(String word) {
         return !DEPARTMENTS.matcher(word).matches()

diff --git a/src/main/java/org/jabref/logic/formatter/casechanger/Word.java b/src/main/java/org/jabref/logic/formatter/casechanger/Word.java
@@ -1,19 +1,24 @@
 package org.jabref.logic.formatter.casechanger;
 
 import java.util.Arrays;
-import java.util.Collections;
 import java.util.HashSet;
 import java.util.Locale;
 import java.util.Objects;
 import java.util.Set;
+import java.util.stream.Collectors;
 
 /**
  * Represents a word in a title of a bibtex entry.
  * <p>
  * A word can have protected chars (enclosed in '{' '}') and may be a small (a, an, the, ...) word.
  */
 public final class Word {
+    /**
+     * Set containing common lowercase function words
+     */
     public static final Set<String> SMALLER_WORDS;
+    private final char[] chars;
+    private final boolean[] protectedChars;
 
     static {
         Set<String> smallerWords = new HashSet<>();
@@ -26,12 +31,11 @@ public final class Word {
         smallerWords.addAll(Arrays.asList("and", "but", "for", "nor", "or", "so", "yet"));
 
         // unmodifiable for thread safety
-        SMALLER_WORDS = Collections.unmodifiableSet(smallerWords);
+        SMALLER_WORDS = smallerWords.stream()
+                            .map(word -> word.toLowerCase(Locale.ROOT))
+                            .collect(Collectors.toUnmodifiableSet());
     }
 
-    private final char[] chars;
-    private final boolean[] protectedChars;
-
     public Word(char[] chars, boolean[] protectedChars) {
         this.chars = Objects.requireNonNull(chars);
         this.protectedChars = Objects.requireNonNull(protectedChars);
@@ -41,16 +45,21 @@ public Word(char[] chars, boolean[] protectedChars) {
         }
     }
 
+    /**
+     * Case-insensitive check against {@link Word#SMALLER_WORDS}. Checks for common function words.
+     */
+    public static boolean isSmallerWord(String word) {
+        return SMALLER_WORDS.contains(word.toLowerCase(Locale.ROOT));
+    }
+
     /**
      * Only change letters of the word that are unprotected to upper case.
      */
     public void toUpperCase() {
         for (int i = 0; i < chars.length; i++) {
-            if (protectedChars[i]) {
-                continue;
+            if (!protectedChars[i]) {
+                chars[i] = Character.toUpperCase(chars[i]);
             }
-
-            chars[i] = Character.toUpperCase(chars[i]);
         }
     }
 
@@ -59,24 +68,18 @@ public void toUpperCase() {
      */
     public void toLowerCase() {
         for (int i = 0; i < chars.length; i++) {
-            if (protectedChars[i]) {
-                continue;
+            if (!protectedChars[i]) {
+                chars[i] = Character.toLowerCase(chars[i]);
             }
-
-            chars[i] = Character.toLowerCase(chars[i]);
         }
     }
 
     public void toUpperFirst() {
         for (int i = 0; i < chars.length; i++) {
-            if (protectedChars[i]) {
-                continue;
-            }
-
-            if (i == 0) {
-                chars[i] = Character.toUpperCase(chars[i]);
-            } else {
-                chars[i] = Character.toLowerCase(chars[i]);
+            if (!protectedChars[i]) {
+                chars[i] = (i == 0) ?
+                        Character.toUpperCase(chars[i]) :
+                        Character.toLowerCase(chars[i]);
             }
         }
     }

diff --git a/src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java b/src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java
@@ -2,26 +2,49 @@
 
 import java.text.Normalizer;
 import java.util.Objects;
+import java.util.Optional;
 import java.util.regex.Pattern;
 
 import com.github.tomtung.latex2unicode.LaTeX2Unicode;
+import fastparse.core.Parsed;
 
 /**
  * Adapter class for the latex2unicode lib. This is an alternative to our LatexToUnicode class
  */
 public class LatexToUnicodeAdapter {
 
-    private static Pattern underscoreMatcher = Pattern.compile("_(?!\\{)");
+    private static final Pattern UNDERSCORE_MATCHER = Pattern.compile("_(?!\\{)");
 
-    private static String replacementChar = "\uFFFD";
+    private static final String REPLACEMENT_CHAR = "\uFFFD";
 
-    private static Pattern underscorePlaceholderMatcher = Pattern.compile(replacementChar);
+    private static final Pattern UNDERSCORE_PLACEHOLDER_MATCHER = Pattern.compile(REPLACEMENT_CHAR);
 
+    /**
+     * Attempts to resolve all LaTeX in the String.
+     *
+     * @param inField a String containing LaTeX
+     * @return a String with LaTeX resolved into Unicode, or the NFC-normalized original String if the LaTeX could not be parsed
+     */
     public static String format(String inField) {
         Objects.requireNonNull(inField);
+        return parse(inField).orElse(Normalizer.normalize(inField, Normalizer.Form.NFC));
+    }
 
-        String toFormat = underscoreMatcher.matcher(inField).replaceAll(replacementChar);
-        toFormat = Normalizer.normalize(LaTeX2Unicode.convert(toFormat), Normalizer.Form.NFC);
-        return underscorePlaceholderMatcher.matcher(toFormat).replaceAll("_");
+    /**
+     * Attempts to resolve all LaTeX in the String.
+     *
+     * @param inField a String containing LaTeX
+     * @return an {@code Optional<String>} with LaTeX resolved into Unicode or {@code empty} on failure.
+     */
+    public static Optional<String> parse(String inField) {
+        Objects.requireNonNull(inField);
+        String toFormat = UNDERSCORE_MATCHER.matcher(inField).replaceAll(REPLACEMENT_CHAR);
+        var parsingResult = LaTeX2Unicode.parse(toFormat);
+        if (parsingResult instanceof Parsed.Success) {
+            String text = parsingResult.get().value();
+            toFormat = Normalizer.normalize(text, Normalizer.Form.NFC);
+            return Optional.of(UNDERSCORE_PLACEHOLDER_MATCHER.matcher(toFormat).replaceAll("_"));
+        }
+        return Optional.empty();
     }
 }