Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for issue 5850: Journal abbreviations in UTF-8 not recognized #7639

Merged
merged 30 commits into from
Apr 23, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
2caa8e0
fix issue #5850 for encoding problem
MrGhabi Apr 17, 2021
26d5100
add a blank line for build.gradle
MrGhabi Apr 17, 2021
aec8447
initial as main branch for build.gradle
MrGhabi Apr 17, 2021
304adc0
initial as main branch for build.gradle
MrGhabi Apr 17, 2021
8a8df28
add the change of fix information of issue 5850
MrGhabi Apr 17, 2021
590940a
Fix check style
MrGhabi Apr 17, 2021
ee1cac7
Update CHANGELOG.md
MrGhabi Apr 17, 2021
c6f0cc2
Add the utf8 check for biblatex and ascii check for bibtex
MrGhabi Apr 17, 2021
cc099d7
Merge remote-tracking branch 'origin/fix-for-issue-5850' into fix-for…
MrGhabi Apr 17, 2021
a18a3af
add the new localization string the l10 files
MrGhabi Apr 17, 2021
fe69305
fix error
MrGhabi Apr 17, 2021
673cc42
add the statement only in en.properties
MrGhabi Apr 18, 2021
7e04a98
Merge remote-tracking branch 'origin/fix-for-issue-5850' into fix-for…
MrGhabi Apr 18, 2021
f3bf4ac
revert changes
MrGhabi Apr 18, 2021
083e3ea
Update JabRef_da.properties
MrGhabi Apr 18, 2021
b1b5999
Update JabRef_ru.properties
MrGhabi Apr 18, 2021
9e94837
Update build.gradle
MrGhabi Apr 18, 2021
e07e530
Update JabRef_fa.properties
MrGhabi Apr 18, 2021
b1a4f58
Update JabRef_no.properties
MrGhabi Apr 18, 2021
85d2198
Update JabRef_pl.properties
MrGhabi Apr 18, 2021
7e44819
Update JabRef_pt.properties
MrGhabi Apr 18, 2021
a81d2ec
Update JabRef_vi.properties
MrGhabi Apr 18, 2021
d980120
Update JabRef_zh_TW.properties
MrGhabi Apr 18, 2021
d7b1917
reset the default charset
MrGhabi Apr 19, 2021
cec382e
Merge remote-tracking branch 'origin/fix-for-issue-5850' into fix-for…
MrGhabi Apr 19, 2021
02cc61e
reset the default charset
MrGhabi Apr 19, 2021
a4aff23
add the javaDoc of UTF8Checker
MrGhabi Apr 19, 2021
e8e02a9
add the javaDoc of UTF8CheckerTest and IntegrityCheckTest
MrGhabi Apr 19, 2021
5092817
Remove the unwieldy Junit tests
MrGhabi Apr 21, 2021
7bfe74a
Merge branch 'main' into fix-for-issue-5850
Siedlerchr Apr 22, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
- We fixed an issue where opening a BibTeX file (by double-click) from a folder whose path contains spaces did not work. [#6487](https://github.com/JabRef/jabref/issues/6487)
- We fixed an issue with saving large `.bib` files [#7265](https://github.com/JabRef/jabref/issues/7265)
- We fixed an issue with very large page numbers [#7590](https://github.com/JabRef/jabref/issues/7590)
- We fixed an issue where journal abbreviations in UTF-8 were not recognized [#5850](https://github.com/JabRef/jabref/issues/5850)
- We fixed an issue where the article title with curly brackets fails to download the arXiv link (pdf file). [#7633](https://github.com/JabRef/jabref/issues/7633)

### Removed
Expand Down
7 changes: 4 additions & 3 deletions src/main/java/org/jabref/logic/integrity/IntegrityCheck.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,11 @@ public IntegrityCheck(BibDatabaseContext bibDatabaseContext,
new CitationKeyDeviationChecker(bibDatabaseContext, citationKeyPatternPreferences),
new CitationKeyDuplicationChecker(bibDatabaseContext.getDatabase())
));

if (bibDatabaseContext.isBiblatexMode()) {
entryCheckers.add(new JournalInAbbreviationListChecker(StandardField.JOURNALTITLE, journalAbbreviationRepository));
entryCheckers.addAll(List.of(
new JournalInAbbreviationListChecker(StandardField.JOURNALTITLE, journalAbbreviationRepository),
new UTF8Checker())
);
} else {
entryCheckers.addAll(List.of(
new JournalInAbbreviationListChecker(StandardField.JOURNAL, journalAbbreviationRepository),
Expand All @@ -59,7 +61,6 @@ List<IntegrityMessage> check() {
for (BibEntry entry : database.getEntries()) {
result.addAll(checkEntry(entry));
}

result.addAll(checkDatabase(database));

return result;
Expand Down
53 changes: 53 additions & 0 deletions src/main/java/org/jabref/logic/integrity/UTF8Checker.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package org.jabref.logic.integrity;

import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.jabref.logic.l10n.Localization;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.Field;

public class UTF8Checker implements EntryChecker {

    /**
     * Reports every field of the given entry whose value does not yield a valid UTF-8 byte sequence.
     * <p>
     * Each field value is encoded with the charset named by the {@code file.encoding} system property
     * (read on every call) and the resulting bytes are validated as UTF-8.
     * <p>
     * Note: {@link String#getBytes(Charset)} silently replaces characters that the charset cannot
     * represent, so this check is a heuristic and does not prove the original file encoding.
     *
     * @param entry the entry whose fields are examined
     * @return one "Non-UTF-8 encoded field found" message per offending field; empty if none was found
     */
    @Override
    public List<IntegrityMessage> check(BibEntry entry) {
        List<IntegrityMessage> results = new ArrayList<>();
        // Resolved once per entry; the same charset is applied to all of its fields.
        Charset charset = resolveSystemCharset();
        for (Map.Entry<Field, String> field : entry.getFieldMap().entrySet()) {
            boolean utfOnly = UTF8EncodingChecker(field.getValue().getBytes(charset));
            if (!utfOnly) {
                results.add(new IntegrityMessage(Localization.lang("Non-UTF-8 encoded field found"), entry,
                        field.getKey()));
            }
        }
        return results;
    }

    /**
     * Resolves the charset named by the {@code file.encoding} system property.
     *
     * @return the named charset, or the JVM default charset when the property is missing,
     *         syntactically illegal, or names a charset this JVM does not support
     */
    private static Charset resolveSystemCharset() {
        try {
            return Charset.forName(System.getProperty("file.encoding"));
        } catch (IllegalArgumentException ignored) {
            // Covers a null name as well as IllegalCharsetNameException and
            // UnsupportedCharsetException (both are subclasses). A misconfigured property
            // must not abort the whole integrity check, so fall back to the JVM default.
            return Charset.defaultCharset();
        }
    }

    /**
     * Checks whether a byte array is a well-formed UTF-8 byte sequence.
     * <p>
     * A fresh UTF-8 decoder is used; a new decoder reports malformed input by throwing
     * {@link CharacterCodingException}, which is translated into a {@code false} result.
     *
     * @param data the bytes to validate
     * @return {@code true} if the bytes decode as UTF-8 without error, {@code false} otherwise
     */
    public static boolean UTF8EncodingChecker(byte[] data) {
        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
        try {
            decoder.decode(ByteBuffer.wrap(data));
        } catch (CharacterCodingException ex) {
            // Expected outcome for non-UTF-8 input; it is the signal, not an error to propagate.
            return false;
        }
        return true;
    }
}
1 change: 1 addition & 0 deletions src/main/resources/l10n/JabRef_en.properties
Original file line number Diff line number Diff line change
Expand Up @@ -1629,6 +1629,7 @@ Style\ file=Style file
Open\ OpenOffice/LibreOffice\ connection=Open OpenOffice/LibreOffice connection
You\ must\ enter\ at\ least\ one\ field\ name=You must enter at least one field name
Non-ASCII\ encoded\ character\ found=Non-ASCII encoded character found
Non-UTF-8\ encoded\ field\ found=Non-UTF-8 encoded field found
Toggle\ web\ search\ interface=Toggle web search interface

Migration\ help\ information=Migration help information
Expand Down
14 changes: 7 additions & 7 deletions src/test/java/org/jabref/logic/integrity/IntegrityCheckTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,9 @@ private static Stream<String> provideCorrectFormat() {

private static Stream<String> provideIncorrectFormat() {
return Stream.of(" Knuth, Donald E. ",
"Knuth, Donald E. and Kurt Cobain and A. Einstein",
", and Kurt Cobain and A. Einstein", "Donald E. Knuth and Kurt Cobain and ,",
"and Kurt Cobain and A. Einstein", "Donald E. Knuth and Kurt Cobain and");
"Knuth, Donald E. and Kurt Cobain and A. Einstein",
", and Kurt Cobain and A. Einstein", "Donald E. Knuth and Kurt Cobain and ,",
"and Kurt Cobain and A. Einstein", "Donald E. Knuth and Kurt Cobain and");
}

@Test
Expand Down Expand Up @@ -190,10 +190,10 @@ private void assertCorrect(BibDatabaseContext context) {

private void assertCorrect(BibDatabaseContext context, boolean allowIntegerEdition) {
List<IntegrityMessage> messages = new IntegrityCheck(context,
mock(FilePreferences.class),
createCitationKeyPatternPreferences(),
JournalAbbreviationLoader.loadBuiltInRepository(),
allowIntegerEdition).check();
mock(FilePreferences.class),
createCitationKeyPatternPreferences(),
JournalAbbreviationLoader.loadBuiltInRepository(),
allowIntegerEdition).check();
assertEquals(Collections.emptyList(), messages);
}

Expand Down
74 changes: 74 additions & 0 deletions src/test/java/org/jabref/logic/integrity/UTF8CheckerTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package org.jabref.logic.integrity;

import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;

import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

public class UTF8CheckerTest {

    private final BibEntry entry = new BibEntry();

    /**
     * A field containing only ASCII characters must not produce any message.
     */
    @Test
    void fieldAcceptsUTF8() {
        UTF8Checker checker = new UTF8Checker();
        entry.setField(StandardField.TITLE, "Only ascii characters!'@12");
        assertEquals(Collections.emptyList(), checker.check(entry));
    }

    /**
     * With {@code file.encoding} set to GBK, a field holding text that was mis-decoded as GBK
     * must be reported as non-UTF-8.
     * <p>
     * The system property is global JVM state, so it is restored in a {@code finally} block;
     * otherwise a failing assertion would leak the GBK setting into subsequent tests.
     *
     * @throws UnsupportedEncodingException if the JVM does not provide the GBK charset
     */
    @Test
    void fieldDoesNotAcceptUmlauts() throws UnsupportedEncodingException {
        String previousEncoding = System.getProperty("file.encoding");
        System.setProperty("file.encoding", "GBK");
        try {
            UTF8Checker checker = new UTF8Checker();
            // The source bytes are pinned to UTF-8 so the fixture does not depend on the platform charset.
            String nonUtf8 = new String("你好,这条语句使用GBK字符集".getBytes(StandardCharsets.UTF_8), "GBK");
            entry.setField(StandardField.MONTH, nonUtf8);
            assertEquals(List.of(new IntegrityMessage("Non-UTF-8 encoded field found", entry, StandardField.MONTH)), checker.check(entry));
        } finally {
            // Properties cannot store null values, so an originally absent property is cleared instead.
            if (previousEncoding == null) {
                System.clearProperty("file.encoding");
            } else {
                System.setProperty("file.encoding", previousEncoding);
            }
        }
    }

    /**
     * {@link UTF8Checker#UTF8EncodingChecker(byte[])} must return {@code false} for the GBK bytes
     * of text that was mis-decoded as GBK.
     *
     * @throws UnsupportedEncodingException if the JVM does not provide the GBK charset
     */
    @Test
    void NonUTF8EncodingCheckerTest() throws UnsupportedEncodingException {
        // The source bytes are pinned to UTF-8 so the fixture does not depend on the platform charset.
        String nonUtf8 = new String("你好,这条语句使用GBK字符集".getBytes(StandardCharsets.UTF_8), "GBK");
        assertFalse(UTF8Checker.UTF8EncodingChecker(nonUtf8.getBytes("GBK")));
    }

    /**
     * {@link UTF8Checker#UTF8EncodingChecker(byte[])} must return {@code true} for the UTF-8 bytes
     * of a string.
     */
    @Test
    void UTF8EncodingCheckerTest() {
        String utf8Demo = new String("你好,这条语句使用GBK字符集".getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8);
        assertTrue(UTF8Checker.UTF8EncodingChecker(utf8Demo.getBytes(StandardCharsets.UTF_8)));
    }
}