Skip to content

Commit

Permalink
DOI improvements (#1447)
Browse files Browse the repository at this point in the history
* Fix #1420 Auto downloader should respect file pattern and propose correct filename
* Improve DOI detection #1424
* Add tests
* Resolves #1421 Auto downloader should try to retrieve DOI if not present and fetch afterwards
* Add external library, document test class
* Changelog
  • Loading branch information
stefan-kolb committed May 25, 2016
1 parent 781bbf1 commit 4a6f191
Show file tree
Hide file tree
Showing 9 changed files with 188 additions and 14 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `#
- Fixed [#1412](https://github.com/JabRef/jabref/issues/1412): Save action *protect terms* protects terms within words unnecessarily
- Fixed [#1420](https://github.com/JabRef/jabref/issues/1420): Auto downloader should respect file pattern and propose correct filename
- Fixed [#651](https://github.com/JabRef/jabref/issues/651): Improve parsing of author names containing braces
- Fixed [#1421](https://github.com/JabRef/jabref/issues/1421): Auto downloader should try to retrieve DOI if not present and fetch afterwards

### Removed
- Removed possibility to export entries/databases to an `.sql` file, as the logic cannot easily use the correct escape logic
Expand Down
1 change: 1 addition & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ dependencies {

compile 'org.jsoup:jsoup:1.9.1'
compile 'com.mashape.unirest:unirest-java:1.4.9'
compile 'info.debatty:java-string-similarity:0.13'

compile 'org.apache.logging.log4j:log4j-jcl:2.5'
compile 'org.apache.logging.log4j:log4j-api:2.5'
Expand Down
5 changes: 5 additions & 0 deletions external-libraries.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ Project: Unirest for Java
URL: https://github.com/Mashape/unirest-java
License: MIT

Id: info.debatty:java-string-similarity
Project: Java String Similarity
URL: https://github.com/tdebatty/java-string-similarity
License: MIT

Id: mysql:mysql-connector-java
Project: MySQL Connector/J
URL: http://www.mysql.de/downloads/connector/j/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@
import net.sf.jabref.gui.fieldeditors.FieldEditor;
import net.sf.jabref.gui.mergeentries.MergeEntryDOIDialog;
import net.sf.jabref.gui.undo.UndoableFieldChange;
import net.sf.jabref.importer.fetcher.CrossRef;
import net.sf.jabref.logic.journals.JournalAbbreviationRepository;
import net.sf.jabref.logic.l10n.Localization;
import net.sf.jabref.logic.net.URLUtil;
Expand Down Expand Up @@ -187,12 +186,12 @@ public static Optional<JComponent> getDoiExtraComponent(BasePanel panel, EntryEd
// lookup doi
JButton doiButton = new JButton(Localization.lang("Lookup DOI"));
doiButton.addActionListener(actionEvent -> {
Optional<DOI> doi = CrossRef.findDOI(entryEditor.getEntry());
if (doi.isPresent()) {
entryEditor.getEntry().setField("doi", doi.get().getDOI());
} else {
panel.frame().setStatus(Localization.lang("No DOI found"));
}
Optional<DOI> doi = DOI.fromBibEntry(entryEditor.getEntry());
if (doi.isPresent()) {
entryEditor.getEntry().setField("doi", doi.get().getDOI());
} else {
panel.frame().setStatus(Localization.lang("No DOI found"));
}
});
// fetch bibtex data
JButton fetchButton = new JButton(Localization.lang("Get BibTeX data from DOI"));
Expand Down
52 changes: 47 additions & 5 deletions src/main/java/net/sf/jabref/importer/fetcher/CrossRef.java
Original file line number Diff line number Diff line change
@@ -1,28 +1,35 @@
package net.sf.jabref.importer.fetcher;

import java.util.Locale;
import java.util.Objects;
import java.util.Optional;

import net.sf.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter;
import net.sf.jabref.logic.util.DOI;
import net.sf.jabref.model.entry.BibEntry;

import com.mashape.unirest.http.HttpResponse;
import com.mashape.unirest.http.JsonNode;
import com.mashape.unirest.http.Unirest;
import com.mashape.unirest.http.exceptions.UnirestException;
import info.debatty.java.stringsimilarity.Levenshtein;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

/**
* A class for fetching DOIs from CrossRef
*
* @see https://github.com/CrossRef/rest-api-doc
* See https://github.com/CrossRef/rest-api-doc
*/
public class CrossRef {
private static final Log LOGGER = LogFactory.getLog(CrossRef.class);

private static final String API_URL = "http://api.crossref.org";
private static final Levenshtein METRIC_DISTANCE = new Levenshtein();
private static final int METRIC_THRESHOLD = 4;

public static Optional<DOI> findDOI(BibEntry entry) {
Objects.requireNonNull(entry);
Expand All @@ -44,10 +51,12 @@ public static Optional<DOI> findDOI(BibEntry entry) {
.asJson();

JSONArray items = response.getBody().getObject().getJSONObject("message").getJSONArray("items");
String dataTitle = items.getJSONObject(0).getJSONArray("title").getString(0);
String dataDOI = items.getJSONObject(0).getString("DOI");
LOGGER.info("DOI " + dataDOI + " for " + title + " found.");
return DOI.build(dataDOI);
// quality check
if (checkValidity(entry, items)) {
String dataDOI = items.getJSONObject(0).getString("DOI");
LOGGER.debug("DOI " + dataDOI + " for " + title + " found.");
return DOI.build(dataDOI);
}
} catch (UnirestException e) {
LOGGER.warn("Unable to query CrossRef API: " + e.getMessage(), e);
}
Expand All @@ -70,4 +79,37 @@ private static String enhanceQuery(String query, BibEntry entry) {

return enhancedQuery.toString();
}

/**
 * Quality check for a CrossRef search result: decides whether the first returned item
 * plausibly describes the given entry.
 * <p>
 * The check is currently title-based only. The entry title (after passing it through
 * {@link RemoveBracesFormatter}) is compared, ignoring case, against the first title of the
 * first result item. If that is not close enough and the item carries a non-empty subtitle
 * list, the comparison is repeated against "title subtitle", because the subtitle is
 * sometimes part of the real title and sometimes merely duplicates it.
 *
 * @param entry  the entry whose "title" field is the reference
 * @param result the "items" array of the CrossRef response
 * @return true if the edit distance of either comparison is at most METRIC_THRESHOLD;
 *         false otherwise, including when the expected JSON members are missing
 */
private static boolean checkValidity(BibEntry entry, JSONArray result) {
    final String wantedTitle = new RemoveBracesFormatter().format(entry.getField("title"));

    // Expected shape of an item, e.g.:
    //   title:    [ "How the Mind Hurts and Heals the Body." ]
    //   subtitle: [ "" ]
    try {
        final JSONObject bestMatch = result.getJSONObject(0);
        final String foundTitle = bestMatch.getJSONArray("title").getString(0);

        // First attempt: plain title.
        final boolean titleCloseEnough = editDistanceIgnoreCase(wantedTitle, foundTitle) <= METRIC_THRESHOLD;
        if (titleCloseEnough) {
            return true;
        }

        // Second attempt: title plus subtitle, only possible if a subtitle was delivered.
        final JSONArray subtitles = bestMatch.getJSONArray("subtitle");
        if (subtitles.length() == 0) {
            return false;
        }

        final String foundTitleWithSubtitle = foundTitle + " " + subtitles.getString(0);
        return editDistanceIgnoreCase(wantedTitle, foundTitleWithSubtitle) <= METRIC_THRESHOLD;
    } catch (JSONException ignored) {
        // A result without the expected members cannot be validated, so it is rejected.
        return false;
    }
}

/**
 * Computes the distance between two strings with METRIC_DISTANCE after lower-casing both
 * using {@link Locale#ENGLISH}, so that differences in case do not count.
 *
 * @param a first string
 * @param b second string
 * @return the distance reported by METRIC_DISTANCE for the lower-cased inputs
 */
private static double editDistanceIgnoreCase(String a, String b) {
    final String lowerA = a.toLowerCase(Locale.ENGLISH);
    final String lowerB = b.toLowerCase(Locale.ENGLISH);
    return METRIC_DISTANCE.distance(lowerA, lowerB);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package net.sf.jabref.importer.fetcher;

import java.io.FileReader;
import java.io.IOException;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;

import net.sf.jabref.Globals;
import net.sf.jabref.JabRefPreferences;
import net.sf.jabref.importer.ParserResult;
import net.sf.jabref.importer.fileformat.BibtexParser;
import net.sf.jabref.logic.util.DOI;
import net.sf.jabref.model.database.BibDatabase;
import net.sf.jabref.model.entry.BibEntry;

/**
 * Useful for checking out new algorithm improvements and thresholds. Not used inside the JabRef code itself.
 * <p>
 * Parses the BibTeX file given as first program argument, runs {@link CrossRef#findDOI(BibEntry)}
 * for every entry on a fixed-size thread pool and prints, per entry, whether a DOI was found,
 * differs from the stored one or is new, followed by summary counters.
 */
public class CrossrefFetcherEvaluator {

    /** Number of worker threads used to query CrossRef. */
    private static final int WORKER_THREADS = 5;

    /**
     * Runs the evaluation.
     *
     * @param args {@code args[0]} is the path of the BibTeX file to evaluate
     * @throws IllegalArgumentException if no file path is given
     * @throws IOException              if the file cannot be read or parsed
     * @throws InterruptedException     if the main thread is interrupted while waiting for the workers
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        if (args.length < 1) {
            throw new IllegalArgumentException("Usage: CrossrefFetcherEvaluator <bibtex-file>");
        }

        Globals.prefs = JabRefPreferences.getInstance();

        // Close the reader as soon as parsing is done instead of leaking the file handle.
        ParserResult result;
        try (FileReader reader = new FileReader(args[0])) {
            BibtexParser parser = new BibtexParser(reader);
            result = parser.parse();
        }
        BibDatabase db = result.getDatabase();

        int total = db.getEntryCount();

        AtomicInteger dois = new AtomicInteger();
        AtomicInteger doiFound = new AtomicInteger();
        AtomicInteger doiNew = new AtomicInteger();
        AtomicInteger doiIdentical = new AtomicInteger();

        List<BibEntry> entries = db.getEntries();
        CountDownLatch countDownLatch = new CountDownLatch(entries.size());

        ExecutorService executorService = Executors.newFixedThreadPool(WORKER_THREADS);
        try {
            for (BibEntry entry : entries) {
                executorService.execute(() -> {
                    try {
                        Optional<DOI> origDOI = DOI.build(entry.getField("doi"));
                        Optional<DOI> crossrefDOI = CrossRef.findDOI(entry);

                        if (origDOI.isPresent()) {
                            dois.incrementAndGet();
                            if (crossrefDOI.isPresent()) {
                                doiFound.incrementAndGet();
                                // DOIs are compared ignoring case
                                if (origDOI.get().getDOI().equalsIgnoreCase(crossrefDOI.get().getDOI())) {
                                    doiIdentical.incrementAndGet();
                                } else {
                                    System.out.println("DOI not identical for : " + entry);
                                }
                            } else {
                                System.out.println("DOI not found for: " + entry);
                            }
                        } else if (crossrefDOI.isPresent()) {
                            System.out.println("New DOI found for: " + entry);
                            doiNew.incrementAndGet();
                        }
                    } finally {
                        // Always count down: a failing task must not leave await() blocked forever.
                        countDownLatch.countDown();
                    }
                });
            }
            countDownLatch.await();
        } finally {
            // Always release the pool threads; otherwise they keep the JVM alive after a failure.
            executorService.shutdown();
        }

        System.out.println("---------------------------------");
        System.out.println("Total DB size: " + total);
        System.out.println("Total DOIs: " + dois);
        System.out.println("DOIs found: " + doiFound);
        System.out.println("DOIs identical: " + doiIdentical);
        System.out.println("New DOIs found: " + doiNew);
    }
}
12 changes: 11 additions & 1 deletion src/main/java/net/sf/jabref/logic/fulltext/FindFullText.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
import java.util.List;
import java.util.Optional;

import net.sf.jabref.importer.fetcher.CrossRef;
import net.sf.jabref.logic.io.MimeTypeDetector;
import net.sf.jabref.logic.util.DOI;
import net.sf.jabref.model.entry.BibEntry;

import org.apache.commons.logging.Log;
Expand Down Expand Up @@ -38,9 +40,17 @@ public FindFullText(List<FullTextFinder> fetcher) {
}

public Optional<URL> findFullTextPDF(BibEntry entry) {
// for accuracy, fetch DOI first but do not modify entry
BibEntry clonedEntry = (BibEntry) entry.clone();
String doi = clonedEntry.getField("doi");

if (doi == null || !DOI.build(doi).isPresent()) {
CrossRef.findDOI(clonedEntry).ifPresent(e -> clonedEntry.setField("doi", e.getDOI()));
}

for (FullTextFinder finder : finders) {
try {
Optional<URL> result = finder.findFullText(entry);
Optional<URL> result = finder.findFullText(clonedEntry);

if (result.isPresent() && MimeTypeDetector.isPdfContentType(result.get().toString())) {
return result;
Expand Down
13 changes: 13 additions & 0 deletions src/main/java/net/sf/jabref/logic/util/DOI.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.sf.jabref.importer.fetcher.CrossRef;
import net.sf.jabref.model.entry.BibEntry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

Expand Down Expand Up @@ -120,6 +123,16 @@ public static Optional<DOI> findInText(String text) {
return result;
}

/**
 * Tries to retrieve a DOI for an existing BibEntry by delegating the lookup
 * to {@link CrossRef#findDOI(BibEntry)}.
 *
 * @param entry the BibTeX entry to look up
 * @return an Optional containing the DOI or an empty Optional
 */
public static Optional<DOI> fromBibEntry(BibEntry entry) {
    final Optional<DOI> lookedUp = CrossRef.findDOI(entry);
    return lookedUp;
}

/**
* Return the plain DOI
*
Expand Down
19 changes: 18 additions & 1 deletion src/test/java/net/sf/jabref/importer/fetcher/CrossRefTest.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package net.sf.jabref.importer.fetcher;

import java.util.Locale;
import java.util.Optional;

import net.sf.jabref.model.entry.BibEntry;

Expand Down Expand Up @@ -34,13 +35,29 @@ public void findTitleOnly() {
}

@Test
public void findIncompleteTitle() {
public void notFindIncompleteTitle() {
BibEntry entry = new BibEntry();
entry.setField("title", "Towards Application Portability");
entry.setField("author", "Stefan Kolb and Guido Wirtz");
assertEquals(Optional.empty(), CrossRef.findDOI(entry));
}

/**
 * A title with four extra trailing dashes must still yield the expected DOI.
 */
@Test
public void acceptTitleUnderThreshold() {
    final BibEntry paddedEntry = new BibEntry();
    paddedEntry.setField("title", "Towards Application Portability in Platform as a Service----");
    paddedEntry.setField("author", "Stefan Kolb and Guido Wirtz");

    // DOI case is normalised before comparing
    final String foundDoi = CrossRef.findDOI(paddedEntry).get().getDOI();
    assertEquals("10.1109/sose.2014.26", foundDoi.toLowerCase(Locale.ENGLISH));
}

/**
 * A title with five extra trailing dashes must no longer yield a DOI.
 */
@Test
public void notAcceptTitleOverThreshold() {
    final BibEntry paddedEntry = new BibEntry();
    paddedEntry.setField("title", "Towards Application Portability in Platform as a Service-----");
    paddedEntry.setField("author", "Stefan Kolb and Guido Wirtz");

    final Optional<?> lookup = CrossRef.findDOI(paddedEntry);
    assertEquals(Optional.empty(), lookup);
}

@Test
public void findWrongAuthor() {
BibEntry entry = new BibEntry();
Expand Down

0 comments on commit 4a6f191

Please sign in to comment.