From 4c25a1d8eb770f454a96974eb87e34f0c80b94e0 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Wed, 4 Nov 2020 23:49:30 +0100 Subject: [PATCH 01/14] Fix Google Scholar fetcher for downloading a single entry --- docs/advanced-reading/fetchers.md | 11 ++++++ .../logic/importer/fetcher/GoogleScholar.java | 31 +++++++++++++--- .../org/jabref/logic/net/URLDownload.java | 4 +++ .../importer/fetcher/GoogleScholarTest.java | 36 +++++++++---------- 4 files changed, 60 insertions(+), 22 deletions(-) diff --git a/docs/advanced-reading/fetchers.md b/docs/advanced-reading/fetchers.md index e3256585127..9a430bce48f 100644 --- a/docs/advanced-reading/fetchers.md +++ b/docs/advanced-reading/fetchers.md @@ -14,6 +14,17 @@ Fetchers are the implementation of the [search using online services](https://do On Windows, you have to log-off and log-on to let IntelliJ know about the environment variable change. Execute the gradle task "processResources" in the group "others" within IntelliJ to ensure the values have been correctly written. Now, the fetcher tests should run without issues. +## Change the log levels to enable debugging + +1. Open `src/test/resources/log4j2-test.xml` +2. Add following XML snippet + + ```xml + + + + ``` + ## Background on embedding the keys in JabRef The keys are placed into the `build.properties` file. 
diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java index a35e1353373..90025b96738 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java @@ -41,6 +41,7 @@ public class GoogleScholar implements FulltextFetcher, SearchBasedFetcher { private static final Logger LOGGER = LoggerFactory.getLogger(GoogleScholar.class); + private static final Pattern LINK_TO_SUBPAGE_PATTERN = Pattern.compile("data-clk-atid=\"([^\"]*)\""); private static final Pattern LINK_TO_BIB_PATTERN = Pattern.compile("(https:\\/\\/scholar.googleusercontent.com\\/scholar.bib[^\"]*)"); private static final String BASIC_SEARCH_URL = "https://scholar.google.ch/scholar?"; @@ -128,11 +129,11 @@ public Optional getHelpPage() { @Override public List performSearch(String query) throws FetcherException { - LOGGER.debug("Using URL {}", query); + LOGGER.debug("Using query {}", query); obtainAndModifyCookie(); List foundEntries = new ArrayList<>(20); - URIBuilder uriBuilder = null; + URIBuilder uriBuilder; try { uriBuilder = new URIBuilder(BASIC_SEARCH_URL); } catch (URISyntaxException e) { @@ -143,6 +144,7 @@ public List performSearch(String query) throws FetcherException { uriBuilder.addParameter("btnG", "Search"); uriBuilder.addParameter("q", query); String queryURL = uriBuilder.toString(); + LOGGER.debug("Using URL {}", queryURL); try { addHitsFromQuery(foundEntries, queryURL); @@ -150,7 +152,8 @@ public List performSearch(String query) throws FetcherException { // if there are too much requests from the same IP address google is answering with a 503 and redirecting to a captcha challenge // The caught IOException looks for example like this: // java.io.IOException: Server returned HTTP response code: 503 for URL: 
https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0 - if (e.getMessage().contains("Server returned HTTP response code: 503 for URL")) { + if (e.getMessage().contains("Server returned HTTP response code: 403 for URL") || + (e.getMessage().contains("Server returned HTTP response code: 503 for URL"))) { throw new FetcherException("Fetching from Google Scholar at URL " + queryURL + " failed.", Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e); } else { @@ -214,6 +217,7 @@ private String constructComplexQueryString(ComplexSearchQuery complexSearchQuery } private void addHitsFromQuery(List entryList, String queryURL) throws IOException, FetcherException { + LOGGER.debug("Downloading from {}", queryURL); String content = new URLDownload(queryURL).asString(); if (needsCaptcha(content)) { @@ -221,15 +225,34 @@ private void addHitsFromQuery(List entryList, String queryURL) throws Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), null); } - Matcher matcher = LINK_TO_BIB_PATTERN.matcher(content); + Matcher matcher = LINK_TO_SUBPAGE_PATTERN.matcher(content); + if (!matcher.find()) { + LOGGER.debug("No data-clk-atid found in html {}", content); + return; + } + + String infoPageUrl = BASIC_SEARCH_URL + "q=info:" + matcher.group(1) + ":scholar.google.com/&output=cite&scirp=0&hl=en"; + LOGGER.debug("Using infoPageUrl {}", infoPageUrl); + URLDownload infoPageUrlDownload = new URLDownload(infoPageUrl); + LOGGER.debug("Downloading from {}", infoPageUrl); + String infoPageContent = infoPageUrlDownload.asString(); + + matcher = LINK_TO_BIB_PATTERN.matcher(infoPageContent); + boolean found = false; while (matcher.find()) { + found = true; String citationsPageURL = matcher.group().replace("&", "&"); + 
LOGGER.debug("Using citationsPageURL {}", citationsPageURL); BibEntry newEntry = downloadEntry(citationsPageURL); entryList.add(newEntry); } + if (!found) { + LOGGER.debug("Did not found pattern in html {}", infoPageContent); + } } private BibEntry downloadEntry(String link) throws IOException, FetcherException { + LOGGER.debug("Downloading from {}", link); String downloadedContent = new URLDownload(link).asString(); BibtexParser parser = new BibtexParser(importFormatPreferences, new DummyFileUpdateMonitor()); ParserResult result = parser.parse(new StringReader(downloadedContent)); diff --git a/src/main/java/org/jabref/logic/net/URLDownload.java b/src/main/java/org/jabref/logic/net/URLDownload.java index 99887c7ac93..7bd2baecd20 100644 --- a/src/main/java/org/jabref/logic/net/URLDownload.java +++ b/src/main/java/org/jabref/logic/net/URLDownload.java @@ -235,6 +235,10 @@ public String asString() throws IOException { return asString(StandardCharsets.UTF_8); } + /** + * Returns a modifiable list of cookies related to the URL of this URLDownload. 
+ * Any modifications will be used at later calls + */ public List getCookieFromUrl() throws IOException { CookieManager cookieManager = new CookieManager(); CookieHandler.setDefault(cookieManager); diff --git a/src/test/java/org/jabref/logic/importer/fetcher/GoogleScholarTest.java b/src/test/java/org/jabref/logic/importer/fetcher/GoogleScholarTest.java index 9c0ef78b411..8f195940222 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/GoogleScholarTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/GoogleScholarTest.java @@ -26,49 +26,49 @@ @FetcherTest class GoogleScholarTest implements SearchBasedFetcherCapabilityTest { - private GoogleScholar finder; - private BibEntry entry; + private GoogleScholar fetcher; @BeforeEach void setUp() { ImportFormatPreferences importFormatPreferences = mock(ImportFormatPreferences.class); when(importFormatPreferences.getFieldContentFormatterPreferences()).thenReturn( mock(FieldContentFormatterPreferences.class)); - finder = new GoogleScholar(importFormatPreferences); - entry = new BibEntry(); + fetcher = new GoogleScholar(importFormatPreferences); } @Test @DisabledOnCIServer("CI server is blocked by Google") void linkFound() throws IOException, FetcherException { - entry.setField(StandardField.TITLE, "Towards Application Portability in Platform as a Service"); + BibEntry entry = new BibEntry() + .withField(StandardField.TITLE, "Towards Application Portability in Platform as a Service"); assertEquals( Optional.of(new URL("https://www.uni-bamberg.de/fileadmin/uni/fakultaeten/wiai_lehrstuehle/praktische_informatik/Dateien/Publikationen/sose14-towards-application-portability-in-paas.pdf")), - finder.findFullText(entry) + fetcher.findFullText(entry) ); } @Test @DisabledOnCIServer("CI server is blocked by Google") void noLinkFound() throws IOException, FetcherException { - entry.setField(StandardField.TITLE, "Curriculum programme of career-oriented java specialty guided by principles of software engineering"); + 
BibEntry entry = new BibEntry() + .withField(StandardField.TITLE, "Curriculum programme of career-oriented java specialty guided by principles of software engineering"); - assertEquals(Optional.empty(), finder.findFullText(entry)); + assertEquals(Optional.empty(), fetcher.findFullText(entry)); } @Test @DisabledOnCIServer("CI server is blocked by Google") void findSingleEntry() throws FetcherException { - entry.setType(StandardEntryType.InProceedings); - entry.setCitationKey("geiger2013detecting"); - entry.setField(StandardField.TITLE, "Detecting Interoperability and Correctness Issues in BPMN 2.0 Process Models."); - entry.setField(StandardField.AUTHOR, "Geiger, Matthias and Wirtz, Guido"); - entry.setField(StandardField.BOOKTITLE, "ZEUS"); - entry.setField(StandardField.YEAR, "2013"); - entry.setField(StandardField.PAGES, "41--44"); + BibEntry entry = new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("geiger2013detecting") + .withField(StandardField.TITLE, "Detecting Interoperability and Correctness Issues in BPMN 2.0 Process Models.") + .withField(StandardField.AUTHOR, "Geiger, Matthias and Wirtz, Guido") + .withField(StandardField.BOOKTITLE, "ZEUS") + .withField(StandardField.YEAR, "2013") + .withField(StandardField.PAGES, "41--44"); - List foundEntries = finder.performSearch("Detecting Interoperability and Correctness Issues in BPMN 2.0 Process Models"); + List foundEntries = fetcher.performSearch("Detecting Interoperability and Correctness Issues in BPMN 2.0 Process Models"); assertEquals(Collections.singletonList(entry), foundEntries); } @@ -76,14 +76,14 @@ void findSingleEntry() throws FetcherException { @Test @DisabledOnCIServer("CI server is blocked by Google") void findManyEntries() throws FetcherException { - List foundEntries = finder.performSearch("random test string"); + List foundEntries = fetcher.performSearch("random test string"); assertEquals(20, foundEntries.size()); } @Override public SearchBasedFetcher getFetcher() { - return 
finder; + return fetcher; } @Override From 90efdd1e927b8e949eb86b59110865d55a3f36e2 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 13 Dec 2020 12:56:52 +0100 Subject: [PATCH 02/14] Shrink scope of try...catch Co-authored-by: Dominik Voigt --- .../logic/importer/fetcher/GoogleScholar.java | 74 ++++++++++--------- 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java index fab7f300f2d..f88a89337be 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java @@ -207,46 +207,48 @@ private void obtainAndModifyCookie() throws FetcherException { @Override public Page performSearchPaged(ComplexSearchQuery complexSearchQuery, int pageNumber) throws FetcherException { + LOGGER.debug("Using query {}", complexSearchQuery); + obtainAndModifyCookie(); + List foundEntries = new ArrayList<>(10); + + String complexQueryString = constructComplexQueryString(complexSearchQuery); + final URIBuilder uriBuilder; try { - obtainAndModifyCookie(); - List foundEntries = new ArrayList<>(10); + uriBuilder = new URIBuilder(BASIC_SEARCH_URL); + } catch (URISyntaxException e) { + throw new FetcherException("Error while fetching from " + getName(), e); + } + uriBuilder.addParameter("hl", "en"); + uriBuilder.addParameter("btnG", "Search"); + uriBuilder.addParameter("q", complexQueryString); + uriBuilder.addParameter("start", String.valueOf(pageNumber * getPageSize())); + uriBuilder.addParameter("num", String.valueOf(getPageSize())); + complexSearchQuery.getFromYear().ifPresent(year -> uriBuilder.addParameter("as_ylo", year.toString())); + complexSearchQuery.getToYear().ifPresent(year -> uriBuilder.addParameter("as_yhi", year.toString())); + complexSearchQuery.getSingleYear().ifPresent(year -> { + uriBuilder.addParameter("as_ylo", year.toString()); + 
uriBuilder.addParameter("as_yhi", year.toString()); + }); - String complexQueryString = constructComplexQueryString(complexSearchQuery); - URIBuilder uriBuilder = new URIBuilder(BASIC_SEARCH_URL); - uriBuilder.addParameter("hl", "en"); - uriBuilder.addParameter("btnG", "Search"); - uriBuilder.addParameter("q", complexQueryString); - uriBuilder.addParameter("start", String.valueOf(pageNumber * getPageSize())); - uriBuilder.addParameter("num", String.valueOf(getPageSize())); - complexSearchQuery.getFromYear().ifPresent(year -> uriBuilder.addParameter("as_ylo", year.toString())); - complexSearchQuery.getToYear().ifPresent(year -> uriBuilder.addParameter("as_yhi", year.toString())); - complexSearchQuery.getSingleYear().ifPresent(year -> { - uriBuilder.addParameter("as_ylo", year.toString()); - uriBuilder.addParameter("as_yhi", year.toString()); - }); - - try { - addHitsFromQuery(foundEntries, uriBuilder.toString()); + try { + addHitsFromQuery(foundEntries, uriBuilder.toString()); - if (foundEntries.size() == 10) { - uriBuilder.addParameter("start", "10"); - addHitsFromQuery(foundEntries, uriBuilder.toString()); - } - } catch (IOException e) { - LOGGER.info("IOException for URL {}", uriBuilder.toString()); - // if there are too much requests from the same IP adress google is answering with a 503 and redirecting to a captcha challenge - // The caught IOException looks for example like this: - // java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0 - if (e.getMessage().contains("Server returned HTTP response code: 503 for URL")) { - throw new FetcherException("Fetching from Google Scholar failed.", - Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e); - } else { - throw new FetcherException("Error while 
fetching from " + getName(), e); - } + if (foundEntries.size() == 10) { + uriBuilder.addParameter("start", "10"); + addHitsFromQuery(foundEntries, uriBuilder.toString()); + } + } catch (IOException e) { + LOGGER.info("IOException for URL {}", uriBuilder.toString()); + // if there are too much requests from the same IP adress google is answering with a 503 and redirecting to a captcha challenge + // The caught IOException looks for example like this: + // java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0 + if (e.getMessage().contains("Server returned HTTP response code: 503 for URL")) { + throw new FetcherException("Fetching from Google Scholar failed.", + Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e); + } else { + throw new FetcherException("Error while fetching from " + getName(), e); } - return new Page<>(complexQueryString, pageNumber, foundEntries); - } catch (URISyntaxException e) { - throw new FetcherException("Error while fetching from " + getName(), e); } + return new Page<>(complexQueryString, pageNumber, foundEntries); } } From 531fcaddd2945e3e360e4033c7f66bc6b67c7113 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 13 Dec 2020 12:59:55 +0100 Subject: [PATCH 03/14] Remove fetching of 10 more results if exactly 10 results are fetched (and log URL) Co-authored-by: Dominik Voigt --- .../logic/importer/fetcher/GoogleScholar.java | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java index f88a89337be..d5f8a5b2a45 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java +++ 
b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java @@ -230,16 +230,13 @@ public Page performSearchPaged(ComplexSearchQuery complexSearchQuery, uriBuilder.addParameter("as_yhi", year.toString()); }); + String queryURL = uriBuilder.toString(); + LOGGER.debug("Using URL {}", queryURL); try { - addHitsFromQuery(foundEntries, uriBuilder.toString()); - - if (foundEntries.size() == 10) { - uriBuilder.addParameter("start", "10"); - addHitsFromQuery(foundEntries, uriBuilder.toString()); - } + addHitsFromQuery(foundEntries, queryURL); } catch (IOException e) { - LOGGER.info("IOException for URL {}", uriBuilder.toString()); - // if there are too much requests from the same IP adress google is answering with a 503 and redirecting to a captcha challenge + LOGGER.info("IOException for URL {}", queryURL); + // if there are too much requests from the same IP address google is answering with a 503 and redirecting to a captcha challenge // The caught IOException looks for example like this: // java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0 if (e.getMessage().contains("Server returned HTTP response code: 503 for URL")) { From 4fcfb61b491fc1ba0a33dd3d7556a20ed28414a9 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 13 Dec 2020 13:02:30 +0100 Subject: [PATCH 04/14] Add 403 Co-authored-by: Dominik Voigt --- .../org/jabref/logic/importer/fetcher/GoogleScholar.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java index d5f8a5b2a45..cfe92576404 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java @@ -236,10 
+236,11 @@ public Page performSearchPaged(ComplexSearchQuery complexSearchQuery, addHitsFromQuery(foundEntries, queryURL); } catch (IOException e) { LOGGER.info("IOException for URL {}", queryURL); - // if there are too much requests from the same IP address google is answering with a 503 and redirecting to a captcha challenge + // if there are too much requests from the same IP address google is answering with a 403 or 503 and redirecting to a captcha challenge // The caught IOException looks for example like this: // java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0 - if (e.getMessage().contains("Server returned HTTP response code: 503 for URL")) { + if (e.getMessage().contains("Server returned HTTP response code: 403 for URL") || + (e.getMessage().contains("Server returned HTTP response code: 503 for URL"))) { throw new FetcherException("Fetching from Google Scholar failed.", Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e); } else { From 60c74e1f283b7ffc7e88d43f96bf10a3390dfc1d Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 13 Dec 2020 13:02:37 +0100 Subject: [PATCH 05/14] Compile fix Co-authored-by: Dominik Voigt --- .../org/jabref/logic/importer/fetcher/GoogleScholarTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/org/jabref/logic/importer/fetcher/GoogleScholarTest.java b/src/test/java/org/jabref/logic/importer/fetcher/GoogleScholarTest.java index bbc4166e870..7eb4e088edf 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/GoogleScholarTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/GoogleScholarTest.java @@ -89,7 +89,7 @@ public SearchBasedFetcher getFetcher() { @Override public PagedSearchBasedFetcher 
getPagedFetcher() { - return finder; + return fetcher; } @Override From f3488e25e367e1c127f443f9818277936af28c54 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 13 Dec 2020 13:09:53 +0100 Subject: [PATCH 06/14] Have obtainAndModifiycookie patch the URL download (and not create a new one) Co-authored-by: Dominik Voigt --- .../org/jabref/logic/importer/fetcher/GoogleScholar.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java index cfe92576404..53e6ad9a349 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java @@ -140,7 +140,9 @@ private String constructComplexQueryString(ComplexSearchQuery complexSearchQuery private void addHitsFromQuery(List entryList, String queryURL) throws IOException, FetcherException { LOGGER.debug("Downloading from {}", queryURL); - String content = new URLDownload(queryURL).asString(); + URLDownload urlDownload = new URLDownload(queryURL); + obtainAndModifyCookie(urlDownload); + String content = urlDownload.asString(); if (needsCaptcha(content)) { throw new FetcherException("Fetching from Google Scholar failed: Captacha hit at " + queryURL + ".", @@ -192,9 +194,8 @@ private BibEntry downloadEntry(String link) throws IOException, FetcherException } } - private void obtainAndModifyCookie() throws FetcherException { + private void obtainAndModifyCookie(URLDownload downloader) throws FetcherException { try { - URLDownload downloader = new URLDownload("https://scholar.google.com"); List cookies = downloader.getCookieFromUrl(); for (HttpCookie cookie : cookies) { // append "CF=4" which represents "Citation format bibtex" @@ -208,7 +209,6 @@ private void obtainAndModifyCookie() throws FetcherException { @Override public Page performSearchPaged(ComplexSearchQuery complexSearchQuery, int 
pageNumber) throws FetcherException { LOGGER.debug("Using query {}", complexSearchQuery); - obtainAndModifyCookie(); List foundEntries = new ArrayList<>(10); String complexQueryString = constructComplexQueryString(complexSearchQuery); From 62a5100be57dbbacb7b153688101ffcefb0aa722 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 13 Dec 2020 14:46:34 +0100 Subject: [PATCH 07/14] Keep order of terms when transforming the query Co-authored-by: Dominik Voigt --- .../jabref/logic/importer/QueryParser.java | 69 +++++++++++++++---- 1 file changed, 57 insertions(+), 12 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/QueryParser.java b/src/main/java/org/jabref/logic/importer/QueryParser.java index 65359122ff2..5bd1179db48 100644 --- a/src/main/java/org/jabref/logic/importer/QueryParser.java +++ b/src/main/java/org/jabref/logic/importer/QueryParser.java @@ -1,6 +1,7 @@ package org.jabref.logic.importer; import java.util.ArrayList; +import java.util.Arrays; import java.util.Comparator; import java.util.HashSet; import java.util.List; @@ -11,7 +12,10 @@ import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.flexible.core.QueryNodeException; +import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode; +import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode; import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; +import org.apache.lucene.queryparser.flexible.standard.parser.StandardSyntaxParser; import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; @@ -26,24 +30,65 @@ public class QueryParser { * Parses the given query string into a complex query using lucene. * Note: For unique fields, the alphabetically and numerically first instance in the query string is used in the complex query. * - * @param query The given query string + * @param query The given query string. E.g. 
BPMN 2.0 or author:"Kopp" AND title:"BPEL4Chor" * @return A complex query containing all fields of the query string */ public Optional parseQueryStringIntoComplexQuery(String query) { try { - StandardQueryParser parser = new StandardQueryParser(); - Query luceneQuery = parser.parse(query, "default"); - Set terms = new HashSet<>(); - // This implementation collects all terms from the leaves of the query tree independent of the internal boolean structure - // If further capabilities are required in the future the visitor and ComplexSearchQuery has to be adapted accordingly. - QueryVisitor visitor = QueryVisitor.termCollector(terms); - luceneQuery.visit(visitor); - - List sortedTerms = new ArrayList<>(terms); - sortedTerms.sort(Comparator.comparing(Term::text).reversed()); - return Optional.of(ComplexSearchQuery.fromTerms(sortedTerms)); + StandardSyntaxParser parser = new StandardSyntaxParser(); + QueryNode luceneQuery = parser.parse(query, "default"); + QueryToComplexSearchQueryTransformator transformator = new QueryToComplexSearchQueryTransformator(); + return Optional.of(transformator.handle(luceneQuery)); } catch (QueryNodeException | IllegalStateException | IllegalArgumentException ex) { return Optional.empty(); } } + + private static class QueryToComplexSearchQueryTransformator { + + ComplexSearchQuery.ComplexSearchQueryBuilder builder; + + public ComplexSearchQuery handle(QueryNode query) { + builder = ComplexSearchQuery.builder(); + transform(query); + return builder.build(); + } + + public void transform(QueryNode query) { + if (query instanceof FieldQueryNode) { + transform(((FieldQueryNode) query)); + return; + } + query.getChildren().forEach(this::transform); + } + + private void transform(FieldQueryNode query) { + final String fieldValue = query.getTextAsString(); + switch (query.getFieldAsString()) { + case "author" -> { + builder.author(fieldValue); + } + case "journal" -> { + builder.journal(fieldValue); + } + case "title" -> { + 
builder.titlePhrase(fieldValue); + } + case "year" -> { + builder.singleYear(Integer.valueOf(fieldValue)); + } + case "year-range" -> { + String[] years = fieldValue.split("-"); + if (years.length != 2) { + return; + } + builder.fromYearAndToYear(Integer.valueOf(years[0]), Integer.valueOf(years[1])); + } + default -> { + builder.defaultFieldPhrase(fieldValue); + } + } + } + + } } From 07e93f477cf45044f50b8dd6aa166b654cb9162c Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 13 Dec 2020 14:46:56 +0100 Subject: [PATCH 08/14] Improve logger and initial ArrayList size Co-authored-by: Dominik Voigt --- .../java/org/jabref/logic/importer/fetcher/GoogleScholar.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java index 53e6ad9a349..147c56ae014 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java @@ -185,7 +185,7 @@ private BibEntry downloadEntry(String link) throws IOException, FetcherException } else { Collection entries = result.getDatabase().getEntries(); if (entries.size() != 1) { - LOGGER.debug(entries.size() + " entries found! 
(" + link + ")"); + LOGGER.debug("{} entries found ({})", entries.size(), link); throw new FetcherException("Parsing entries from Google Scholar bib file failed."); } else { BibEntry entry = entries.iterator().next(); @@ -209,7 +209,7 @@ private void obtainAndModifyCookie(URLDownload downloader) throws FetcherExcepti @Override public Page performSearchPaged(ComplexSearchQuery complexSearchQuery, int pageNumber) throws FetcherException { LOGGER.debug("Using query {}", complexSearchQuery); - List foundEntries = new ArrayList<>(10); + List foundEntries = new ArrayList<>(getPageSize()); String complexQueryString = constructComplexQueryString(complexSearchQuery); final URIBuilder uriBuilder; From 9cad8306690027eb23fd976beb344e36bf0e17a1 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 13 Dec 2020 14:47:08 +0100 Subject: [PATCH 09/14] Refine caught exceptions Co-authored-by: Dominik Voigt --- .../org/jabref/logic/importer/fetcher/GoogleScholar.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java index 147c56ae014..45153ef7068 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java @@ -236,11 +236,13 @@ public Page performSearchPaged(ComplexSearchQuery complexSearchQuery, addHitsFromQuery(foundEntries, queryURL); } catch (IOException e) { LOGGER.info("IOException for URL {}", queryURL); - // if there are too much requests from the same IP address google is answering with a 403 or 503 and redirecting to a captcha challenge + // If there are too much requests from the same IP address google is answering with a 403, 429, or 503 and redirecting to a captcha challenge + // Example URL: 
https://www.google.com/sorry/index?continue=https://scholar.google.ch/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3D%2522in%2522%2B%2522and%2522%2B%2522Process%2522%2B%2522Models%2522%2B%2522Issues%2522%2B%2522Interoperability%2522%2B%2522Detecting%2522%2B%2522Correctness%2522%2B%2522BPMN%2522%2B%25222.0%2522%2Ballintitle%253A%26start%3D0%26num%3D20&hl=en&q=EgTZGO7HGOuK2P4FIhkA8aeDSwDHMafs3bst5vlLM-Sk4TtpMrOtMgFy // The caught IOException looks for example like this: // java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0 if (e.getMessage().contains("Server returned HTTP response code: 403 for URL") || - (e.getMessage().contains("Server returned HTTP response code: 503 for URL"))) { + e.getMessage().contains("Server returned HTTP response code: 429 for URL") || + e.getMessage().contains("Server returned HTTP response code: 503 for URL")) { throw new FetcherException("Fetching from Google Scholar failed.", Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e); } else { From f504609dbef0daa0d01b8211e694990c0f81876f Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 13 Dec 2020 15:12:13 +0100 Subject: [PATCH 10/14] Fix typo Co-authored-by: Dominik Voigt --- .../java/org/jabref/logic/importer/fetcher/GoogleScholar.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java index 45153ef7068..35f149ae25e 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java @@ -145,7 +145,7 @@ private void addHitsFromQuery(List entryList, String queryURL) throws String content = 
urlDownload.asString(); if (needsCaptcha(content)) { - throw new FetcherException("Fetching from Google Scholar failed: Captacha hit at " + queryURL + ".", + throw new FetcherException("Fetching from Google Scholar failed: Captcha hit at " + queryURL + ".", Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), null); } From 6775728704f892fa821a0d992727745f84115729 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 13 Dec 2020 16:02:15 +0100 Subject: [PATCH 11/14] Fix Google Scholar: Show "allintitle:" only if title is present Co-authored-by: Dominik Voigt --- .../java/org/jabref/logic/importer/fetcher/GoogleScholar.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java index 35f149ae25e..7cc9ca2eba0 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java @@ -132,7 +132,9 @@ private String constructComplexQueryString(ComplexSearchQuery complexSearchQuery List searchTerms = new ArrayList<>(); searchTerms.addAll(complexSearchQuery.getDefaultFieldPhrases()); complexSearchQuery.getAuthors().forEach(author -> searchTerms.add("author:" + author)); - searchTerms.add("allintitle:" + String.join(" ", complexSearchQuery.getTitlePhrases())); + if (!complexSearchQuery.getTitlePhrases().isEmpty()) { + searchTerms.add("allintitle:" + String.join(" ", complexSearchQuery.getTitlePhrases())); + } complexSearchQuery.getJournal().ifPresent(journal -> searchTerms.add("source:" + journal)); // API automatically ANDs the terms return String.join(" ", searchTerms); From b691aedc336952515b0b4ec24e4a125eb441961b Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Mon, 14 Dec 2020 19:36:43 +0100 Subject: [PATCH 12/14] Working on adding the CaptchaSolver Co-authored-by: Dominik 
Voigt --- .../logic/importer/fetcher/CaptchaSolver.java | 11 ++++ .../logic/importer/fetcher/GoogleScholar.java | 55 ++++++++++++++++++- src/test/resources/log4j2-test.xml | 3 + 3 files changed, 67 insertions(+), 2 deletions(-) create mode 100644 src/main/java/org/jabref/logic/importer/fetcher/CaptchaSolver.java diff --git a/src/main/java/org/jabref/logic/importer/fetcher/CaptchaSolver.java b/src/main/java/org/jabref/logic/importer/fetcher/CaptchaSolver.java new file mode 100644 index 00000000000..10416af6714 --- /dev/null +++ b/src/main/java/org/jabref/logic/importer/fetcher/CaptchaSolver.java @@ -0,0 +1,11 @@ +package org.jabref.logic.importer.fetcher; + +public interface CaptchaSolver { + + /** + * Instructes the user to solve the captcha given at + * @param queryURL + * @return + */ + String solve(String queryURL); +} diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java index 7cc9ca2eba0..15ae62af251 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java @@ -13,6 +13,11 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import javafx.application.Platform; +import javafx.scene.control.ButtonType; +import javafx.scene.web.WebView; + +import org.jabref.gui.util.BaseDialog; import org.jabref.logic.help.HelpFile; import org.jabref.logic.importer.FetcherException; import org.jabref.logic.importer.FulltextFetcher; @@ -27,6 +32,7 @@ import org.jabref.model.paging.Page; import org.jabref.model.util.DummyFileUpdateMonitor; +import com.sun.star.sheet.XSolver; import org.apache.http.client.utils.URIBuilder; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -50,10 +56,12 @@ public class GoogleScholar implements FulltextFetcher, PagedSearchBasedFetcher { private static final int NUM_RESULTS = 10; private final ImportFormatPreferences importFormatPreferences; 
+ private CaptchaSolver captchaSolver; - public GoogleScholar(ImportFormatPreferences importFormatPreferences) { + public GoogleScholar(ImportFormatPreferences importFormatPreferences, CaptchaSolver solver) { Objects.requireNonNull(importFormatPreferences); this.importFormatPreferences = importFormatPreferences; + this.captchaSolver = solver; } @Override @@ -144,13 +152,23 @@ private void addHitsFromQuery(List entryList, String queryURL) throws LOGGER.debug("Downloading from {}", queryURL); URLDownload urlDownload = new URLDownload(queryURL); obtainAndModifyCookie(urlDownload); - String content = urlDownload.asString(); + + // We need JSOUP directly to read the content when 429 is returned + + String content; + try { + content = urlDownload.asString(); + } if (needsCaptcha(content)) { throw new FetcherException("Fetching from Google Scholar failed: Captcha hit at " + queryURL + ".", Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), null); } + extractEntriesFromContent(content, entryList); + } + + private void extractEntriesFromContent(String content, List entryList) throws IOException, FetcherException { Matcher matcher = LINK_TO_SUBPAGE_PATTERN.matcher(content); if (!matcher.find()) { LOGGER.debug("No data-clk-atid found in html {}", content); @@ -159,6 +177,7 @@ private void addHitsFromQuery(List entryList, String queryURL) throws String infoPageUrl = BASIC_SEARCH_URL + "q=info:" + matcher.group(1) + ":scholar.google.com/&output=cite&scirp=0&hl=en"; LOGGER.debug("Using infoPageUrl {}", infoPageUrl); + // FIXME: Existing cookies should be reused. 
URLDownload infoPageUrlDownload = new URLDownload(infoPageUrl); LOGGER.debug("Downloading from {}", infoPageUrl); String infoPageContent = infoPageUrlDownload.asString(); @@ -245,6 +264,10 @@ public Page performSearchPaged(ComplexSearchQuery complexSearchQuery, if (e.getMessage().contains("Server returned HTTP response code: 403 for URL") || e.getMessage().contains("Server returned HTTP response code: 429 for URL") || e.getMessage().contains("Server returned HTTP response code: 503 for URL")) { + LOGGER.debug("Captcha found. Calling the CaptchaSolver"); + String content = captchaSolver.solve(queryURL); + extractEntriesFromContent(content, foundEntries); + throw new FetcherException("Fetching from Google Scholar failed.", Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e); } else { @@ -253,4 +276,32 @@ public Page performSearchPaged(ComplexSearchQuery complexSearchQuery, } return new Page<>(complexQueryString, pageNumber, foundEntries); } + + public void displayCaptchaDialog(String link) { + Platform.runLater(() -> new CaptchaDialog(link).showAndWait()); + /* + if (dialog.retry()) { + displayCaptchaDialog(link); + } + */ + } + + private static final class CaptchaDialog extends BaseDialog { + public CaptchaDialog(String content) { + super(); + this.getDialogPane().getButtonTypes().add(ButtonType.CLOSE); + this.getDialogPane().lookupButton(ButtonType.CLOSE).setVisible(true); + WebView webView = new WebView(); + + // webView.getEngine().setJavaScriptEnabled(true); + webView.getEngine().setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0"); + this.getDialogPane().setContent(webView); + webView.getEngine().loadContent(content); + } + + public boolean retry() { + return false; + } + } + } diff --git a/src/test/resources/log4j2-test.xml b/src/test/resources/log4j2-test.xml index 8c6a336420a..14f7339b051 100644 --- a/src/test/resources/log4j2-test.xml +++ 
b/src/test/resources/log4j2-test.xml @@ -6,6 +6,9 @@ + + + From 5051e1bdd327745cf05265b6991415c09efc6cee Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Tue, 22 Dec 2020 20:27:26 +0100 Subject: [PATCH 13/14] Change order Co-authored-by: Dominik Voigt --- .../logic/importer/fetcher/GoogleScholar.java | 119 +++++++++--------- 1 file changed, 59 insertions(+), 60 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java index 15ae62af251..49b7f9d23a4 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java @@ -33,6 +33,7 @@ import org.jabref.model.util.DummyFileUpdateMonitor; import com.sun.star.sheet.XSolver; +import kong.unirest.Unirest; import org.apache.http.client.utils.URIBuilder; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -88,11 +89,6 @@ public Optional findFullText(BibEntry entry) throws IOException, FetcherExc } } - @Override - public TrustLevel getTrustLevel() { - return TrustLevel.META_SEARCH; - } - private Optional search(String url) throws IOException { Optional pdfLink = Optional.empty(); @@ -122,8 +118,9 @@ private Optional search(String url) throws IOException { return pdfLink; } - private boolean needsCaptcha(String body) { - return body.contains("id=\"gs_captcha_ccl\""); + @Override + public TrustLevel getTrustLevel() { + return TrustLevel.META_SEARCH; } @Override @@ -136,9 +133,58 @@ public Optional getHelpPage() { return Optional.of(HelpFile.FETCHER_GOOGLE_SCHOLAR); } + @Override + public Page performSearchPaged(ComplexSearchQuery complexSearchQuery, int pageNumber) throws FetcherException { + LOGGER.debug("Using query {}", complexSearchQuery); + List foundEntries = new ArrayList<>(getPageSize()); + + String complexQueryString = constructComplexQueryString(complexSearchQuery); + final URIBuilder uriBuilder; + try { + uriBuilder = new
URIBuilder(BASIC_SEARCH_URL); + } catch (URISyntaxException e) { + throw new FetcherException("Error while fetching from " + getName(), e); + } + uriBuilder.addParameter("hl", "en"); + uriBuilder.addParameter("btnG", "Search"); + uriBuilder.addParameter("q", complexQueryString); + uriBuilder.addParameter("start", String.valueOf(pageNumber * getPageSize())); + uriBuilder.addParameter("num", String.valueOf(getPageSize())); + complexSearchQuery.getFromYear().ifPresent(year -> uriBuilder.addParameter("as_ylo", year.toString())); + complexSearchQuery.getToYear().ifPresent(year -> uriBuilder.addParameter("as_yhi", year.toString())); + complexSearchQuery.getSingleYear().ifPresent(year -> { + uriBuilder.addParameter("as_ylo", year.toString()); + uriBuilder.addParameter("as_yhi", year.toString()); + }); + + String queryURL = uriBuilder.toString(); + LOGGER.debug("Using URL {}", queryURL); + try { + addHitsFromQuery(foundEntries, queryURL); + } catch (IOException e) { + LOGGER.info("IOException for URL {}", queryURL); + // If there are too much requests from the same IP address google is answering with a 403, 429, or 503 and redirecting to a captcha challenge + // Example URL: https://www.google.com/sorry/index?continue=https://scholar.google.ch/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3D%2522in%2522%2B%2522and%2522%2B%2522Process%2522%2B%2522Models%2522%2B%2522Issues%2522%2B%2522Interoperability%2522%2B%2522Detecting%2522%2B%2522Correctness%2522%2B%2522BPMN%2522%2B%25222.0%2522%2Ballintitle%253A%26start%3D0%26num%3D20&hl=en&q=EgTZGO7HGOuK2P4FIhkA8aeDSwDHMafs3bst5vlLM-Sk4TtpMrOtMgFy + // The caught IOException looks for example like this: + // java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0 + if (e.getMessage().contains("Server returned HTTP response code: 403 for URL") || + 
e.getMessage().contains("Server returned HTTP response code: 429 for URL") || + e.getMessage().contains("Server returned HTTP response code: 503 for URL")) { + LOGGER.debug("Captcha found. Calling the CaptchaSolver"); + String content = captchaSolver.solve(queryURL); + extractEntriesFromContent(content, foundEntries); + + throw new FetcherException("Fetching from Google Scholar failed.", + Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e); + } else { + throw new FetcherException("Error while fetching from " + getName(), e); + } + } + return new Page<>(complexQueryString, pageNumber, foundEntries); + } + private String constructComplexQueryString(ComplexSearchQuery complexSearchQuery) { - List searchTerms = new ArrayList<>(); - searchTerms.addAll(complexSearchQuery.getDefaultFieldPhrases()); + List searchTerms = new ArrayList<>(complexSearchQuery.getDefaultFieldPhrases()); complexSearchQuery.getAuthors().forEach(author -> searchTerms.add("author:" + author)); if (!complexSearchQuery.getTitlePhrases().isEmpty()) { searchTerms.add("allintitle:" + String.join(" ", complexSearchQuery.getTitlePhrases())); @@ -227,56 +273,6 @@ private void obtainAndModifyCookie(URLDownload downloader) throws FetcherExcepti } } - @Override - public Page performSearchPaged(ComplexSearchQuery complexSearchQuery, int pageNumber) throws FetcherException { - LOGGER.debug("Using query {}", complexSearchQuery); - List foundEntries = new ArrayList<>(getPageSize()); - - String complexQueryString = constructComplexQueryString(complexSearchQuery); - final URIBuilder uriBuilder; - try { - uriBuilder = new URIBuilder(BASIC_SEARCH_URL); - } catch (URISyntaxException e) { - throw new FetcherException("Error while fetching from " + getName(), e); - } - uriBuilder.addParameter("hl", "en"); - uriBuilder.addParameter("btnG", "Search"); - uriBuilder.addParameter("q", complexQueryString); - uriBuilder.addParameter("start", 
String.valueOf(pageNumber * getPageSize())); - uriBuilder.addParameter("num", String.valueOf(getPageSize())); - complexSearchQuery.getFromYear().ifPresent(year -> uriBuilder.addParameter("as_ylo", year.toString())); - complexSearchQuery.getToYear().ifPresent(year -> uriBuilder.addParameter("as_yhi", year.toString())); - complexSearchQuery.getSingleYear().ifPresent(year -> { - uriBuilder.addParameter("as_ylo", year.toString()); - uriBuilder.addParameter("as_yhi", year.toString()); - }); - - String queryURL = uriBuilder.toString(); - LOGGER.debug("Using URL {}", queryURL); - try { - addHitsFromQuery(foundEntries, queryURL); - } catch (IOException e) { - LOGGER.info("IOException for URL {}", queryURL); - // If there are too much requests from the same IP address google is answering with a 403, 429, or 503 and redirecting to a captcha challenge - // Example URL: https://www.google.com/sorry/index?continue=https://scholar.google.ch/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3D%2522in%2522%2B%2522and%2522%2B%2522Process%2522%2B%2522Models%2522%2B%2522Issues%2522%2B%2522Interoperability%2522%2B%2522Detecting%2522%2B%2522Correctness%2522%2B%2522BPMN%2522%2B%25222.0%2522%2Ballintitle%253A%26start%3D0%26num%3D20&hl=en&q=EgTZGO7HGOuK2P4FIhkA8aeDSwDHMafs3bst5vlLM-Sk4TtpMrOtMgFy - // The caught IOException looks for example like this: - // java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0 - if (e.getMessage().contains("Server returned HTTP response code: 403 for URL") || - e.getMessage().contains("Server returned HTTP response code: 429 for URL") || - e.getMessage().contains("Server returned HTTP response code: 503 for URL")) { - LOGGER.debug("Captcha found. 
Calling the CaptchaSolver"); - String content = captchaSolver.solve(queryURL); - extractEntriesFromContent(content, foundEntries); - - throw new FetcherException("Fetching from Google Scholar failed.", - Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e); - } else { - throw new FetcherException("Error while fetching from " + getName(), e); - } - } - return new Page<>(complexQueryString, pageNumber, foundEntries); - } - public void displayCaptchaDialog(String link) { Platform.runLater(() -> new CaptchaDialog(link).showAndWait()); /* @@ -286,6 +282,10 @@ public void displayCaptchaDialog(String link) { */ } + private boolean needsCaptcha(String body) { + return body.contains("id=\"gs_captcha_ccl\""); + } + private static final class CaptchaDialog extends BaseDialog { public CaptchaDialog(String content) { super(); @@ -303,5 +303,4 @@ public boolean retry() { return false; } } - } From 625778f01a0d5954c40552f827e4873ec4f7a4e3 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Tue, 22 Dec 2020 21:44:21 +0100 Subject: [PATCH 14/14] CaptchaDialog for the first step Co-authored-by: Dominik Voigt --- .../gui/dialogs/CaptchaSolverDialog.java | 56 +++++++++++++++++++ .../fetcher/WebSearchPaneViewModel.java | 6 ++ .../jabref/logic/importer/WebFetchers.java | 13 ++++- .../logic/importer/fetcher/CaptchaSolver.java | 7 ++- .../logic/importer/fetcher/GoogleScholar.java | 16 +++--- .../importer/fetcher/NoneCaptchaSolver.java | 8 +++ .../CompositeSearchBasedFetcherTest.java | 2 +- .../importer/fetcher/GoogleScholarTest.java | 2 +- 8 files changed, 94 insertions(+), 16 deletions(-) create mode 100644 src/main/java/org/jabref/gui/dialogs/CaptchaSolverDialog.java create mode 100644 src/main/java/org/jabref/logic/importer/fetcher/NoneCaptchaSolver.java diff --git a/src/main/java/org/jabref/gui/dialogs/CaptchaSolverDialog.java b/src/main/java/org/jabref/gui/dialogs/CaptchaSolverDialog.java new file mode 100644 index 
00000000000..4caac65b72a --- /dev/null +++ b/src/main/java/org/jabref/gui/dialogs/CaptchaSolverDialog.java @@ -0,0 +1,56 @@ +package org.jabref.gui.dialogs; + +import java.util.concurrent.CountDownLatch; + +import javafx.application.Platform; +import javafx.scene.control.ButtonType; +import javafx.scene.web.WebView; + +import org.jabref.gui.util.BaseDialog; +import org.jabref.logic.l10n.Localization; +import org.jabref.logic.net.URLDownload; + +import org.jsoup.helper.W3CDom; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; + +public class CaptchaSolverDialog extends BaseDialog implements org.jabref.logic.importer.fetcher.CaptchaSolver { + + public static final Logger LOGGER = LoggerFactory.getLogger(CaptchaSolverDialog.class); + + private WebView webView; + + public CaptchaSolverDialog() { + super(); + this.setTitle(Localization.lang("Captcha Solver")); + getDialogPane().getButtonTypes().add(ButtonType.CLOSE); + getDialogPane().lookupButton(ButtonType.CLOSE).setVisible(true); + + webView = new WebView(); + webView.getEngine().setJavaScriptEnabled(true); + webView.getEngine().setUserAgent(URLDownload.USER_AGENT); + getDialogPane().setContent(webView); + } + + @Override + public String solve(String queryURL) { + // slim implementation of https://news.kynosarges.org/2014/05/01/simulating-platform-runandwait/ + final CountDownLatch doneLatch = new CountDownLatch(1); + Platform.runLater(() -> { + webView.getEngine().load(queryURL); + // For the quick implementation, we ignore the result + // Later, at "webView", we directly extract it from the web view + this.showAndWait(); + doneLatch.countDown(); + }); + try { + doneLatch.await(); + Document document = webView.getEngine().getDocument(); + return W3CDom.asString(document, null); + } catch (InterruptedException e) { + LOGGER.error("Issues with the UI", e); + } + return ""; + } +} diff --git a/src/main/java/org/jabref/gui/importer/fetcher/WebSearchPaneViewModel.java 
b/src/main/java/org/jabref/gui/importer/fetcher/WebSearchPaneViewModel.java index 9d59a0f76cf..29bada428ea 100644 --- a/src/main/java/org/jabref/gui/importer/fetcher/WebSearchPaneViewModel.java +++ b/src/main/java/org/jabref/gui/importer/fetcher/WebSearchPaneViewModel.java @@ -18,11 +18,13 @@ import org.jabref.gui.DialogService; import org.jabref.gui.StateManager; +import org.jabref.gui.dialogs.CaptchaSolverDialog; import org.jabref.gui.importer.ImportEntriesDialog; import org.jabref.gui.util.BackgroundTask; import org.jabref.logic.importer.ParserResult; import org.jabref.logic.importer.SearchBasedFetcher; import org.jabref.logic.importer.WebFetchers; +import org.jabref.logic.importer.fetcher.GoogleScholar; import org.jabref.logic.l10n.Localization; import org.jabref.model.strings.StringUtil; import org.jabref.preferences.PreferencesService; @@ -43,6 +45,7 @@ public WebSearchPaneViewModel(PreferencesService preferencesService, DialogServi this.dialogService = dialogService; this.stateManager = stateManager; + WebFetchers.setCaptchaSolver(new CaptchaSolverDialog()); SortedSet allFetchers = WebFetchers.getSearchBasedFetchers(preferencesService.getImportFormatPreferences()); fetchers.setAll(allFetchers); @@ -107,6 +110,9 @@ public void search() { task = BackgroundTask.wrap(() -> new ParserResult(activeFetcher.performSearch(getQuery().trim()))) .withInitialMessage(Localization.lang("Processing %0", getQuery().trim())); task.onFailure(dialogService::showErrorDialogAndWait); + if (activeFetcher instanceof GoogleScholar) { + task.showToUser(true); + } ImportEntriesDialog dialog = new ImportEntriesDialog(stateManager.getActiveDatabase().get(), task); dialog.setTitle(activeFetcher.getName()); diff --git a/src/main/java/org/jabref/logic/importer/WebFetchers.java b/src/main/java/org/jabref/logic/importer/WebFetchers.java index b73dbf8191b..4705bbf93e5 100644 --- a/src/main/java/org/jabref/logic/importer/WebFetchers.java +++ 
b/src/main/java/org/jabref/logic/importer/WebFetchers.java @@ -11,6 +11,7 @@ import org.jabref.logic.importer.fetcher.ApsFetcher; import org.jabref.logic.importer.fetcher.ArXiv; import org.jabref.logic.importer.fetcher.AstrophysicsDataSystem; +import org.jabref.logic.importer.fetcher.CaptchaSolver; import org.jabref.logic.importer.fetcher.CiteSeer; import org.jabref.logic.importer.fetcher.CollectionOfComputerScienceBibliographiesFetcher; import org.jabref.logic.importer.fetcher.CompositeSearchBasedFetcher; @@ -31,6 +32,7 @@ import org.jabref.logic.importer.fetcher.MathSciNet; import org.jabref.logic.importer.fetcher.MedlineFetcher; import org.jabref.logic.importer.fetcher.Medra; +import org.jabref.logic.importer.fetcher.NoneCaptchaSolver; import org.jabref.logic.importer.fetcher.OpenAccessDoi; import org.jabref.logic.importer.fetcher.RfcFetcher; import org.jabref.logic.importer.fetcher.ScienceDirect; @@ -51,6 +53,13 @@ public class WebFetchers { private WebFetchers() { } + // Default CaptchaSolver is the useless one (which just does not throw an exception) + private static CaptchaSolver captchaSolver = new NoneCaptchaSolver(); + + public static void setCaptchaSolver(CaptchaSolver captchaSolver) { + WebFetchers.captchaSolver = captchaSolver; + } + public static Optional getIdBasedFetcherForField(Field field, ImportFormatPreferences preferences) { IdBasedFetcher fetcher; @@ -96,7 +105,7 @@ public static SortedSet getSearchBasedFetchers(ImportFormatP set.add(new ZbMATH(importFormatPreferences)); // see https://github.com/JabRef/jabref/issues/5804 // set.add(new ACMPortalFetcher(importFormatPreferences)); - set.add(new GoogleScholar(importFormatPreferences)); + set.add(new GoogleScholar(importFormatPreferences, captchaSolver)); set.add(new DBLPFetcher(importFormatPreferences)); set.add(new SpringerFetcher()); set.add(new CrossRef()); @@ -170,7 +179,7 @@ public static Set getFullTextFetchers(ImportFormatPreferences i fetchers.add(new ApsFetcher()); // Meta search
fetchers.add(new JstorFetcher(importFormatPreferences)); - fetchers.add(new GoogleScholar(importFormatPreferences)); + fetchers.add(new GoogleScholar(importFormatPreferences, captchaSolver)); fetchers.add(new OpenAccessDoi()); return fetchers; diff --git a/src/main/java/org/jabref/logic/importer/fetcher/CaptchaSolver.java b/src/main/java/org/jabref/logic/importer/fetcher/CaptchaSolver.java index 10416af6714..003a3c10e01 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/CaptchaSolver.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/CaptchaSolver.java @@ -3,9 +3,10 @@ public interface CaptchaSolver { /** - * Instructes the user to solve the captcha given at - * @param queryURL - * @return + * Instructs the user to solve the captcha given at + * + * @param queryURL the URL to query + * @return html content after solving the captcha */ String solve(String queryURL); } diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java index 49b7f9d23a4..96d44b7608a 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java @@ -172,8 +172,12 @@ public Page performSearchPaged(ComplexSearchQuery complexSearchQuery, e.getMessage().contains("Server returned HTTP response code: 503 for URL")) { LOGGER.debug("Captcha found. 
Calling the CaptchaSolver"); String content = captchaSolver.solve(queryURL); - extractEntriesFromContent(content, foundEntries); - + LOGGER.debug("Returned result {}", content); + try { + extractEntriesFromContent(content, foundEntries); + } catch (IOException ioException) { + LOGGER.error("Still failing at Google Scholar", ioException); + } throw new FetcherException("Fetching from Google Scholar failed.", Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e); } else { @@ -199,13 +203,7 @@ private void addHitsFromQuery(List entryList, String queryURL) throws URLDownload urlDownload = new URLDownload(queryURL); obtainAndModifyCookie(urlDownload); - // We need JSOUP directly to read the content when 429 is returned - - String content; - try { - content = urlDownload.asString(); - } - + String content = urlDownload.asString(); if (needsCaptcha(content)) { throw new FetcherException("Fetching from Google Scholar failed: Captcha hit at " + queryURL + ".", Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), null); diff --git a/src/main/java/org/jabref/logic/importer/fetcher/NoneCaptchaSolver.java b/src/main/java/org/jabref/logic/importer/fetcher/NoneCaptchaSolver.java new file mode 100644 index 00000000000..7c0376674e5 --- /dev/null +++ b/src/main/java/org/jabref/logic/importer/fetcher/NoneCaptchaSolver.java @@ -0,0 +1,8 @@ +package org.jabref.logic.importer.fetcher; + +public class NoneCaptchaSolver implements CaptchaSolver { + @Override + public String solve(String queryURL) { + return ""; + } +} diff --git a/src/test/java/org/jabref/logic/importer/fetcher/CompositeSearchBasedFetcherTest.java b/src/test/java/org/jabref/logic/importer/fetcher/CompositeSearchBasedFetcherTest.java index b639c35cf5b..75603d35314 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/CompositeSearchBasedFetcherTest.java +++ 
b/src/test/java/org/jabref/logic/importer/fetcher/CompositeSearchBasedFetcherTest.java @@ -101,7 +101,7 @@ static Stream performSearchParameters() { list.add(new AstrophysicsDataSystem(importFormatPreferences)); list.add(new MathSciNet(importFormatPreferences)); list.add(new ZbMATH(importFormatPreferences)); - list.add(new GoogleScholar(importFormatPreferences)); + list.add(new GoogleScholar(importFormatPreferences, new NoneCaptchaSolver())); list.add(new DBLPFetcher(importFormatPreferences)); list.add(new SpringerFetcher()); list.add(new CrossRef()); diff --git a/src/test/java/org/jabref/logic/importer/fetcher/GoogleScholarTest.java b/src/test/java/org/jabref/logic/importer/fetcher/GoogleScholarTest.java index 7eb4e088edf..bbeb3f6773a 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/GoogleScholarTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/GoogleScholarTest.java @@ -34,7 +34,7 @@ void setUp() { ImportFormatPreferences importFormatPreferences = mock(ImportFormatPreferences.class); when(importFormatPreferences.getFieldContentFormatterPreferences()).thenReturn( mock(FieldContentFormatterPreferences.class)); - fetcher = new GoogleScholar(importFormatPreferences); + fetcher = new GoogleScholar(importFormatPreferences, new NoneCaptchaSolver()); } @Test