From 2bd9612e03c5885e209b6966b558f17233cdead4 Mon Sep 17 00:00:00 2001 From: Dennis Tschechlov Date: Sun, 26 Jun 2016 15:25:01 +0200 Subject: [PATCH] rewrite MedlineImporter with JAXB parser --- .../importer/fileformat/MedlineImporter.java | 418 ++++++++++++++++-- .../fileformat/MedlineImporterTest.java | 7 - 2 files changed, 374 insertions(+), 51 deletions(-) diff --git a/src/main/java/net/sf/jabref/importer/fileformat/MedlineImporter.java b/src/main/java/net/sf/jabref/importer/fileformat/MedlineImporter.java index 4a27483d3cc1..b149884ca091 100644 --- a/src/main/java/net/sf/jabref/importer/fileformat/MedlineImporter.java +++ b/src/main/java/net/sf/jabref/importer/fileformat/MedlineImporter.java @@ -16,28 +16,63 @@ package net.sf.jabref.importer.fileformat; import java.io.BufferedReader; -import java.io.File; -import java.io.FileOutputStream; import java.io.IOException; +import java.io.Serializable; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Objects; - -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; +import javax.xml.bind.JAXBContext; +import javax.xml.bind.JAXBElement; +import javax.xml.bind.JAXBException; +import javax.xml.bind.Unmarshaller; import net.sf.jabref.importer.ParserResult; +import net.sf.jabref.logic.fileformats.medline.Abstract; +import net.sf.jabref.logic.fileformats.medline.AbstractText; +import net.sf.jabref.logic.fileformats.medline.AffiliationInfo; +import net.sf.jabref.logic.fileformats.medline.ArticleTitle; +import net.sf.jabref.logic.fileformats.medline.Author; +import net.sf.jabref.logic.fileformats.medline.AuthorList; +import net.sf.jabref.logic.fileformats.medline.Chemical; +import net.sf.jabref.logic.fileformats.medline.DateCompleted; +import net.sf.jabref.logic.fileformats.medline.DateCreated; +import net.sf.jabref.logic.fileformats.medline.DateRevised; +import net.sf.jabref.logic.fileformats.medline.ELocationID; +import net.sf.jabref.logic.fileformats.medline.GeneSymbolList; +import net.sf.jabref.logic.fileformats.medline.GeneralNote; +import net.sf.jabref.logic.fileformats.medline.ISSN; +import net.sf.jabref.logic.fileformats.medline.Investigator; +import net.sf.jabref.logic.fileformats.medline.InvestigatorList; +import net.sf.jabref.logic.fileformats.medline.IsoLanguageCodes; +import net.sf.jabref.logic.fileformats.medline.Journal; +import net.sf.jabref.logic.fileformats.medline.JournalIssue; +import net.sf.jabref.logic.fileformats.medline.Keyword; +import net.sf.jabref.logic.fileformats.medline.KeywordList; +import net.sf.jabref.logic.fileformats.medline.MedlineCitation; +import net.sf.jabref.logic.fileformats.medline.MedlineJournalInfo; +import net.sf.jabref.logic.fileformats.medline.MeshHeading; +import net.sf.jabref.logic.fileformats.medline.MeshHeadingList; +import net.sf.jabref.logic.fileformats.medline.OtherID; +import net.sf.jabref.logic.fileformats.medline.Pagination; +import net.sf.jabref.logic.fileformats.medline.PersonalNameSubject; +import net.sf.jabref.logic.fileformats.medline.PersonalNameSubjectList; +import net.sf.jabref.logic.fileformats.medline.PubDate; +import net.sf.jabref.logic.fileformats.medline.PubmedArticle; +import net.sf.jabref.logic.fileformats.medline.PubmedArticleSet; +import net.sf.jabref.logic.fileformats.medline.QualifierName; import net.sf.jabref.model.entry.BibEntry; +import net.sf.jabref.model.entry.IdGenerator; +import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.xml.sax.InputSource; /** - * Importer for the Refer/Endnote format. + * Importer for the Medline/Pubmed format. * * check here for details on the format - * http://www.ecst.csuchico.edu/~jacobsd/bib/formats/endnote.html + * https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html */ public class MedlineImporter extends ImportFormat { @@ -83,50 +118,345 @@ public boolean isRecognizedFormat(BufferedReader reader) throws IOException { public ParserResult importDatabase(BufferedReader reader) throws IOException { Objects.requireNonNull(reader); - // Obtain a factory object for creating SAX parsers - SAXParserFactory parserFactory = SAXParserFactory.newInstance(); - - // Configure the factory object to specify attributes of the parsers it - // creates - parserFactory.setValidating(true); - parserFactory.setNamespaceAware(true); - - // Now create a SAXParser object List bibItems = new ArrayList<>(); try { - SAXParser parser = parserFactory.newSAXParser(); // May throw - // exceptions - MedlineHandler handler = new MedlineHandler(); - // Start the parser. It reads the file and calls methods of the - // handler. - parser.parse(new InputSource(reader), handler); - - // Switch this to true if you want to make a local copy for testing. - if (false) { - reader.reset(); - try (FileOutputStream out = new FileOutputStream(new File("/home/alver/ut.txt"))) { - int c; - while ((c = reader.read()) != -1) { - out.write((char) c); + JAXBContext context = JAXBContext.newInstance("net.sf.jabref.logic.fileformats.medline"); + Unmarshaller unmarshaller = context.createUnmarshaller(); + Object o = unmarshaller.unmarshal(reader); + + if (o instanceof PubmedArticleSet) { + PubmedArticleSet articleSet = (PubmedArticleSet) o; + for (int i = 0; i < articleSet.getPubmedArticleOrPubmedBookArticle().size(); i++) { + PubmedArticle currentArticle = (PubmedArticle) articleSet.getPubmedArticleOrPubmedBookArticle() + .get(i); + parseArticle(currentArticle, bibItems); + } + } else { + PubmedArticle article = (PubmedArticle) o; + parseArticle(article, bibItems); + } + } catch (JAXBException e1) { + LOGGER.warn("Could not parse document"); + } + + return new ParserResult(bibItems); + } + + private void parseArticle(PubmedArticle article, List bibItems) { + HashMap fields = new HashMap<>(); + + if (article.getPubmedData() != null) { + addDateRevised(fields, article); + putIfNotNull(fields, "pubstatus", article.getPubmedData().getPublicationStatus()); + } + if (article.getMedlineCitation() != null) { + MedlineCitation medlineCitation = article.getMedlineCitation(); + + fields.put("status", medlineCitation.getStatus()); + DateCreated dateCreated = medlineCitation.getDateCreated(); + fields.put("created", dateCreated.getDay() + "/" + dateCreated.getMonth() + "/" + dateCreated.getYear()); + fields.put("pubmodel", medlineCitation.getArticle().getPubModel()); + DateCompleted dateCompleted = medlineCitation.getDateCompleted(); + fields.put("completed", + dateCompleted.getDay() + "/" + dateCompleted.getMonth() + "/" + dateCompleted.getYear()); + fields.put("pmid", medlineCitation.getPMID().getContent()); + fields.put("owner", medlineCitation.getOwner()); + + addArticleInformation(fields, medlineCitation.getArticle().getContent()); + + MedlineJournalInfo medlineJournalInfo = medlineCitation.getMedlineJournalInfo(); + putIfNotNull(fields, "country", medlineJournalInfo.getCountry()); + putIfNotNull(fields, "journal-abbreviation", medlineJournalInfo.getMedlineTA()); + putIfNotNull(fields, "nlm-id", medlineJournalInfo.getNlmUniqueID()); + putIfNotNull(fields, "issn-linking", medlineJournalInfo.getISSNLinking()); + if (medlineCitation.getChemicalList().getChemical() != null) { + addChemicals(fields, medlineCitation.getChemicalList().getChemical()); + } + if (medlineCitation.getCitationSubset() != null) { + fields.put("citation-subset", StringUtils.join(medlineCitation.getCitationSubset(), ", ")); + } + if (medlineCitation.getGeneSymbolList() != null) { + addGeneSymbols(fields, medlineCitation.getGeneSymbolList()); + } + if (medlineCitation.getMeshHeadingList() != null) { + addMeashHeading(fields, medlineCitation.getMeshHeadingList()); + } + putIfNotNull(fields, "references", medlineCitation.getNumberOfReferences()); + if (medlineCitation.getPersonalNameSubjectList() != null) { + addPersonalNames(fields, medlineCitation.getPersonalNameSubjectList()); + } + if (medlineCitation.getOtherID() != null) { + addOtherId(fields, medlineCitation.getOtherID()); + } + if (medlineCitation.getKeywordList() != null) { + addKeyWords(fields, medlineCitation.getKeywordList()); + } + if (medlineCitation.getSpaceFlightMission() != null) { + fields.put("space-flight-mission", StringUtils.join(medlineCitation.getSpaceFlightMission(), ", ")); + } + if (medlineCitation.getInvestigatorList() != null) { + addInvestigators(fields, medlineCitation.getInvestigatorList()); + } + if (medlineCitation.getGeneralNote() != null) { + addNotes(fields, medlineCitation.getGeneralNote()); + } + + } + + BibEntry entry = new BibEntry(IdGenerator.next(), "article"); // id assumes an existing database so don't create one here + entry.setField(fields); + + bibItems.add(entry); + } + + private void addNotes(HashMap fields, List generalNote) { + List notes = new ArrayList<>(); + for (GeneralNote note : generalNote) { + if (note != null) { + notes.add(note.getContent()); + } + } + fields.put("note", StringUtils.join(notes, ", ")); + } + + private void addInvestigators(HashMap fields, InvestigatorList investigatorList) { + List investigatorNames = new ArrayList<>(); + List affiliationInfos = new ArrayList<>(); + String name = ""; + // add the investigators like the authors + if (investigatorList.getInvestigator() != null) { + List investigators = investigatorList.getInvestigator(); + for (Investigator investigator : investigators) { + name = investigator.getLastName(); + if (investigator.getForeName() != null) { + name += ", " + investigator.getForeName(); + } + investigatorNames.add(name); + + //now add the affiliation info + if (investigator.getAffiliationInfo() != null) { + for (AffiliationInfo info : investigator.getAffiliationInfo()) { + for (Serializable affiliation : info.getAffiliation().getContent()) { + if (affiliation instanceof String) { + affiliationInfos.add((String) affiliation); + } + } } + fields.put("affiliation", StringUtils.join(affiliationInfos, ", ")); } } + fields.put("investigator", StringUtils.join(investigatorNames, " and ")); + } + } - // When you're done, report the results stored by your handler - // object - bibItems.addAll(handler.getItems()); - } catch (javax.xml.parsers.ParserConfigurationException e) { - LOGGER.error("Error with XML parser configuration", e); - return ParserResult.fromErrorMessage(e.getLocalizedMessage()); - } catch (org.xml.sax.SAXException e) { - LOGGER.error("Error during XML parsing", e); - return ParserResult.fromErrorMessage(e.getLocalizedMessage()); - } catch (IOException e) { - LOGGER.error("Error during file import", e); - return ParserResult.fromErrorMessage(e.getLocalizedMessage()); + private void addKeyWords(HashMap fields, List keywordList) { + List keywordStrings = new ArrayList<>(); + //add keywords to the list + for (KeywordList keywords : keywordList) { + for (Keyword keyword : keywords.getKeyword()) { + for (Serializable content : keyword.getContent()) { + if (content instanceof String) { + keywordStrings.add((String) content); + } + } + } } + //Check whether MeshHeadingList exist or not + if (fields.get("keywords") == null) { + fields.put("keywords", StringUtils.join(keywordStrings, "; ")); + } else { + if (keywordStrings.size() > 0) { + //if it exists, combine the MeshHeading with the keywords + String result = StringUtils.join(keywordStrings, "; "); + result = fields.get("keywords") + "; " + result; + fields.put("keywords", result); + } + } + } - return new ParserResult(bibItems); + private void addOtherId(HashMap fields, List otherID) { + for (OtherID id : otherID) { + if ((id.getSource() != null) && (id.getContent() != null)) { + fields.put(id.getSource(), id.getContent()); + } + } + } + + private void addPersonalNames(HashMap fields, PersonalNameSubjectList personalNameSubjectList) { + //if no authors are named, then add the personal names as authors + if (fields.get("author") == null) { + List personalNames = new ArrayList<>(); + if (personalNameSubjectList.getPersonalNameSubject() != null) { + List personalNameSubject = personalNameSubjectList.getPersonalNameSubject(); + for (PersonalNameSubject personalName : personalNameSubject) { + String name = personalName.getLastName(); + if (personalName.getForeName() != null) { + name += ", " + personalName.getForeName(); + } + personalNames.add(name); + } + fields.put("author", StringUtils.join(personalNames, " and ")); + } + } + } + + private void addMeashHeading(HashMap fields, MeshHeadingList meshHeadingList) { + ArrayList keywords = new ArrayList<>(); + String result = ""; + for (MeshHeading keyword : meshHeadingList.getMeshHeading()) { + result = keyword.getDescriptorName().getContent(); + if (keyword.getQualifierName() != null) { + for (QualifierName qualifier : keyword.getQualifierName()) { + result += ", " + qualifier.getContent(); + } + } + keywords.add(result); + } + fields.put("keywords", StringUtils.join(keywords, "; ")); + } + + private void addGeneSymbols(HashMap fields, GeneSymbolList geneSymbolList) { + List geneSymbols = geneSymbolList.getGeneSymbol(); + fields.put("gene-symbols", StringUtils.join(geneSymbols, ", ")); + } + + private void addChemicals(HashMap fields, List chemicals) { + List chemicalNames = new ArrayList<>(); + for (Chemical chemical : chemicals) { + if (chemical != null) { + chemicalNames.add(chemical.getNameOfSubstance().getContent()); + } + } + fields.put("chemicals", StringUtils.join(chemicalNames, ", ")); + } + + private void addArticleInformation(HashMap fields, List content) { + for (Object o : content) { + if (o instanceof Journal) { + Journal journal = (Journal) o; + putIfNotNull(fields, "journal", journal.getTitle()); + + ISSN issn = journal.getISSN(); + putIfNotNull(fields, "issn", issn.getContent()); + + JournalIssue journalIssue = journal.getJournalIssue(); + putIfNotNull(fields, "volume", journalIssue.getVolume()); + putIfNotNull(fields, "issue", journalIssue.getIssue()); + + PubDate pubDate = journalIssue.getPubDate(); + if (pubDate.getYear() == null) { + fields.put("year", pubDate.getMedlineDate()); + } else { + fields.put("year", pubDate.getYear()); + if (pubDate.getMonth() != null) { + fields.put("month", pubDate.getMonth()); + } else if (pubDate.getSeason() != null) { + fields.put("season", pubDate.getSeason()); + } + } + } else if (o instanceof ArticleTitle) { + ArticleTitle articleTitle = (ArticleTitle) o; + fields.put("title", articleTitle.getContent().toString()); + } else if (o instanceof Pagination) { + Pagination pagination = (Pagination) o; + String startPage = ""; + String endPage = ""; + for (JAXBElement element : pagination.getContent()) { + if (element.getName().getLocalPart().equals("MedlinePgn")) { + putIfNotNull(fields, "pages", fixPageRange(element.getValue())); + } else if (element.getName().getLocalPart().equals("StartPage")) { + startPage = element.getValue() + endPage; + putIfNotNull(fields, "pages", startPage); + } else if (element.getName().getLocalPart().equals("EndPage")) { + if (!"".equals(startPage)) { + fields.put("pages", startPage + "--" + endPage); + } else { + fields.put("pages", endPage); + } + } + } + } else if (o instanceof ELocationID) { + ELocationID eLocationID = (ELocationID) o; + if ("doi".equals(eLocationID.getEIdType())) { + fields.put("doi", eLocationID.getContent()); + } + if ("pii".equals(eLocationID.getEIdType())) { + fields.put("pii", eLocationID.getContent()); + } + } else if (o instanceof Abstract) { + Abstract abs = (Abstract) o; + putIfNotNull(fields, "copyright", abs.getCopyrightInformation()); + List abstractText = new ArrayList<>(); + for (AbstractText text : abs.getAbstractText()) { + for (Serializable textContent : text.getContent()) { + if (textContent instanceof String) { + abstractText.add((String) textContent); + } + } + } + fields.put("abstract", StringUtils.join(abstractText, "")); + } else if (o instanceof AuthorList) { + AuthorList authors = (AuthorList) o; + handleAuthors(fields, authors); + } else if (o instanceof IsoLanguageCodes) { + IsoLanguageCodes language = (IsoLanguageCodes) o; + putIfNotNull(fields, "language", language.value()); + } + } + } + + private void handleAuthors(HashMap fields, AuthorList authors) { + List authorNames = new ArrayList<>(); + for (Author author : authors.getAuthor()) { + if (author.getCollectiveName() != null) { + authorNames.add(author.getCollectiveName().toString()); + } else if (author.getLastName() != null) { + String authorName = author.getLastName(); + if (author.getForeName() != null) { + authorName += ", " + author.getForeName(); + } + + authorNames.add(authorName); + } + } + fields.put("author", StringUtils.join(authorNames, " and ")); + } + + private void addDateRevised(HashMap fields, PubmedArticle article) { + if (article.getMedlineCitation().getDateRevised() != null) { + DateRevised dateRevised = article.getMedlineCitation().getDateRevised(); + if ((dateRevised.getDay() != null) && (dateRevised.getMonth() != null) && (dateRevised.getYear() != null)) { + fields.put("revised", + dateRevised.getDay() + "/" + dateRevised.getMonth() + "/" + dateRevised.getYear()); + } + } + } + + private void putIfNotNull(HashMap fields, String medlineKey, String value) { + if (value != null) { + fields.put(medlineKey, value); + } + } + + // PENDING jeffrey.kuhn@yale.edu 2005-05-27 : added fixPageRange method + // Convert medline page ranges from short form to full form. + // Medline reports page ranges in a shorthand format. + // The last page is reported using only the digits which + // differ from the first page. + // i.e. 12345-51 refers to the actual range 12345-12351 + private String fixPageRange(String pageRange) { + int minusPos = pageRange.indexOf('-'); + if (minusPos < 0) { + return pageRange; + } + String first = pageRange.substring(0, minusPos).trim(); + String last = pageRange.substring(minusPos + 1).trim(); + int llast = last.length(); + int lfirst = first.length(); + if (llast < lfirst) { + last = first.substring(0, lfirst - llast) + last; + } + return first + "--" + last; } } diff --git a/src/test/java/net/sf/jabref/importer/fileformat/MedlineImporterTest.java b/src/test/java/net/sf/jabref/importer/fileformat/MedlineImporterTest.java index 0dcefbbd835c..b1ab78a84ec5 100644 --- a/src/test/java/net/sf/jabref/importer/fileformat/MedlineImporterTest.java +++ b/src/test/java/net/sf/jabref/importer/fileformat/MedlineImporterTest.java @@ -7,7 +7,6 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; -import java.util.Collections; import java.util.List; import java.util.stream.Collectors; @@ -58,12 +57,6 @@ public void setUp() throws Exception { this.medlineImporter = new MedlineImporter(); } - @Test - public void testGetItemsEmpty() { - MedlineHandler handler = new MedlineHandler(); - assertEquals(Collections.emptyList(), handler.getItems()); - } - @Test public void testGetFormatName() { assertEquals("Medline", medlineImporter.getFormatName());