diff --git a/docs/advanced-reading/fetchers.md b/docs/advanced-reading/fetchers.md
index e3256585127..9a430bce48f 100644
--- a/docs/advanced-reading/fetchers.md
+++ b/docs/advanced-reading/fetchers.md
@@ -14,6 +14,17 @@ Fetchers are the implementation of the [search using online services](https://do
On Windows, you have to log-off and log-on to let IntelliJ know about the environment variable change. Execute the gradle task "processResources" in the group "others" within IntelliJ to ensure the values have been correctly written. Now, the fetcher tests should run without issues.
+## Change the log levels to enable debugging
+
+1. Open `src/test/resources/log4j2-test.xml`
+2. Add the following XML snippet:
+
+ ```xml
+
+
+
+ ```
+
## Background on embedding the keys in JabRef
The keys are placed into the `build.properties` file.
diff --git a/src/main/java/org/jabref/gui/dialogs/CaptchaSolverDialog.java b/src/main/java/org/jabref/gui/dialogs/CaptchaSolverDialog.java
new file mode 100644
index 00000000000..4caac65b72a
--- /dev/null
+++ b/src/main/java/org/jabref/gui/dialogs/CaptchaSolverDialog.java
@@ -0,0 +1,56 @@
+package org.jabref.gui.dialogs;
+
+import java.util.concurrent.CountDownLatch;
+
+import javafx.application.Platform;
+import javafx.scene.control.ButtonType;
+import javafx.scene.web.WebView;
+
+import org.jabref.gui.util.BaseDialog;
+import org.jabref.logic.l10n.Localization;
+import org.jabref.logic.net.URLDownload;
+
+import org.jsoup.helper.W3CDom;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+
+/**
+ * Dialog that shows a web page in an embedded browser so that the user can solve a captcha.
+ * <p>
+ * {@link #solve(String)} may be called from any thread. All access to the embedded browser happens on the
+ * JavaFX application thread, because a WebEngine must only be used from that thread.
+ */
+public class CaptchaSolverDialog extends BaseDialog implements org.jabref.logic.importer.fetcher.CaptchaSolver {
+
+    public static final Logger LOGGER = LoggerFactory.getLogger(CaptchaSolverDialog.class);
+
+    private final WebView webView;
+
+    public CaptchaSolverDialog() {
+        super();
+        this.setTitle(Localization.lang("Captcha Solver"));
+        getDialogPane().getButtonTypes().add(ButtonType.CLOSE);
+        getDialogPane().lookupButton(ButtonType.CLOSE).setVisible(true);
+
+        webView = new WebView();
+        webView.getEngine().setJavaScriptEnabled(true);
+        webView.getEngine().setUserAgent(URLDownload.USER_AGENT);
+        getDialogPane().setContent(webView);
+    }
+
+    /**
+     * Loads the given URL into the dialog, waits until the user closes the dialog and returns the HTML that the
+     * embedded browser shows at that moment.
+     *
+     * @param queryURL the URL to load into the embedded browser
+     * @return the HTML of the displayed page; an empty string if no document is available or if the waiting
+     *         thread was interrupted
+     */
+    @Override
+    public String solve(String queryURL) {
+        if (Platform.isFxApplicationThread()) {
+            // Waiting on the latch below would block the very thread that has to run the dialog
+            return showAndExtractHtml(queryURL);
+        }
+
+        // slim implementation of https://news.kynosarges.org/2014/05/01/simulating-platform-runandwait/
+        final CountDownLatch doneLatch = new CountDownLatch(1);
+        // Written on the JavaFX thread before countDown(), read here after await()
+        final String[] html = {""};
+        Platform.runLater(() -> {
+            try {
+                html[0] = showAndExtractHtml(queryURL);
+            } finally {
+                // Release the caller even if the dialog code throws
+                doneLatch.countDown();
+            }
+        });
+        try {
+            doneLatch.await();
+        } catch (InterruptedException e) {
+            // Keep the interrupt visible for the code that owns this thread
+            Thread.currentThread().interrupt();
+            LOGGER.error("Issues with the UI", e);
+            return "";
+        }
+        return html[0];
+    }
+
+    /**
+     * Shows the dialog with the given URL and serializes the displayed document once the dialog has been closed.
+     * Has to be called on the JavaFX application thread.
+     */
+    private String showAndExtractHtml(String queryURL) {
+        webView.getEngine().load(queryURL);
+        // The return value of the dialog is ignored; the result is taken from the web view
+        this.showAndWait();
+        Document document = webView.getEngine().getDocument();
+        if (document == null) {
+            // No page has been loaded (e.g., the dialog was closed right away)
+            return "";
+        }
+        return W3CDom.asString(document, null);
+    }
+}
diff --git a/src/main/java/org/jabref/gui/importer/fetcher/WebSearchPaneViewModel.java b/src/main/java/org/jabref/gui/importer/fetcher/WebSearchPaneViewModel.java
index 9d59a0f76cf..29bada428ea 100644
--- a/src/main/java/org/jabref/gui/importer/fetcher/WebSearchPaneViewModel.java
+++ b/src/main/java/org/jabref/gui/importer/fetcher/WebSearchPaneViewModel.java
@@ -18,11 +18,13 @@
import org.jabref.gui.DialogService;
import org.jabref.gui.StateManager;
+import org.jabref.gui.dialogs.CaptchaSolverDialog;
import org.jabref.gui.importer.ImportEntriesDialog;
import org.jabref.gui.util.BackgroundTask;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.importer.SearchBasedFetcher;
import org.jabref.logic.importer.WebFetchers;
+import org.jabref.logic.importer.fetcher.GoogleScholar;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.strings.StringUtil;
import org.jabref.preferences.PreferencesService;
@@ -43,6 +45,7 @@ public WebSearchPaneViewModel(PreferencesService preferencesService, DialogServi
this.dialogService = dialogService;
this.stateManager = stateManager;
+ WebFetchers.setCaptchaSolver(new CaptchaSolverDialog());
SortedSet allFetchers = WebFetchers.getSearchBasedFetchers(preferencesService.getImportFormatPreferences());
fetchers.setAll(allFetchers);
@@ -107,6 +110,9 @@ public void search() {
task = BackgroundTask.wrap(() -> new ParserResult(activeFetcher.performSearch(getQuery().trim())))
.withInitialMessage(Localization.lang("Processing %0", getQuery().trim()));
task.onFailure(dialogService::showErrorDialogAndWait);
+ if (activeFetcher instanceof GoogleScholar) {
+ task.showToUser(true);
+ }
ImportEntriesDialog dialog = new ImportEntriesDialog(stateManager.getActiveDatabase().get(), task);
dialog.setTitle(activeFetcher.getName());
diff --git a/src/main/java/org/jabref/logic/importer/QueryParser.java b/src/main/java/org/jabref/logic/importer/QueryParser.java
index 65359122ff2..5bd1179db48 100644
--- a/src/main/java/org/jabref/logic/importer/QueryParser.java
+++ b/src/main/java/org/jabref/logic/importer/QueryParser.java
@@ -1,6 +1,7 @@
package org.jabref.logic.importer;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
@@ -11,7 +12,10 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
+import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode;
+import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
+import org.apache.lucene.queryparser.flexible.standard.parser.StandardSyntaxParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
@@ -26,24 +30,65 @@ public class QueryParser {
* Parses the given query string into a complex query using lucene.
* Note: For unique fields, the alphabetically and numerically first instance in the query string is used in the complex query.
*
- * @param query The given query string
+ * @param query The given query string. E.g. <code>BPMN 2.0</code> or <code>author:"Kopp" AND title:"BPEL4Chor"</code>
* @return A complex query containing all fields of the query string
*/
public Optional parseQueryStringIntoComplexQuery(String query) {
try {
- StandardQueryParser parser = new StandardQueryParser();
- Query luceneQuery = parser.parse(query, "default");
- Set terms = new HashSet<>();
- // This implementation collects all terms from the leaves of the query tree independent of the internal boolean structure
- // If further capabilities are required in the future the visitor and ComplexSearchQuery has to be adapted accordingly.
- QueryVisitor visitor = QueryVisitor.termCollector(terms);
- luceneQuery.visit(visitor);
-
- List sortedTerms = new ArrayList<>(terms);
- sortedTerms.sort(Comparator.comparing(Term::text).reversed());
- return Optional.of(ComplexSearchQuery.fromTerms(sortedTerms));
+ // Only the syntax tree is produced here; it is converted by QueryToComplexSearchQueryTransformator
+ StandardSyntaxParser parser = new StandardSyntaxParser();
+ QueryNode luceneQuery = parser.parse(query, "default");
+ QueryToComplexSearchQueryTransformator transformator = new QueryToComplexSearchQueryTransformator();
+ return Optional.of(transformator.handle(luceneQuery));
} catch (QueryNodeException | IllegalStateException | IllegalArgumentException ex) {
+ // Also reached for non-numeric "year"/"year-range" values: Integer.valueOf throws NumberFormatException,
+ // which is a subclass of IllegalArgumentException
return Optional.empty();
}
}
+
+    /**
+     * Collects the field/value leaves of a Lucene syntax tree into a {@link ComplexSearchQuery}.
+     * All nodes are visited regardless of the operators connecting them, i.e., the boolean structure of the
+     * query is not evaluated.
+     */
+    private static class QueryToComplexSearchQueryTransformator {
+
+        ComplexSearchQuery.ComplexSearchQueryBuilder builder;
+
+        /**
+         * Converts the given syntax tree. A fresh builder is used on every call.
+         *
+         * @param query root of the syntax tree
+         * @return the query built from all field/value leaves below {@code query}
+         * @throws NumberFormatException if a "year" or "year-range" value is not numeric
+         */
+        public ComplexSearchQuery handle(QueryNode query) {
+            builder = ComplexSearchQuery.builder();
+            transform(query);
+            return builder.build();
+        }
+
+        /**
+         * Adds the given node to the builder if it is a field/value leaf, otherwise descends into its children.
+         */
+        public void transform(QueryNode query) {
+            if (query == null) {
+                return;
+            }
+            if (query instanceof FieldQueryNode) {
+                transform(((FieldQueryNode) query));
+                return;
+            }
+            // getChildren() yields null instead of an empty list for leaf nodes.
+            // Without this check a leaf that is not a FieldQueryNode ends in a NullPointerException,
+            // which the caller does not catch.
+            List<QueryNode> children = query.getChildren();
+            if (children != null) {
+                children.forEach(this::transform);
+            }
+        }
+
+        /**
+         * Maps one field/value pair to the matching builder call. Unknown fields are treated as default field.
+         */
+        private void transform(FieldQueryNode query) {
+            final String fieldValue = query.getTextAsString();
+            switch (query.getFieldAsString()) {
+                case "author" -> builder.author(fieldValue);
+                case "journal" -> builder.journal(fieldValue);
+                case "title" -> builder.titlePhrase(fieldValue);
+                case "year" -> builder.singleYear(Integer.valueOf(fieldValue));
+                case "year-range" -> {
+                    // Expected form: <from>-<to>; anything else is ignored
+                    String[] years = fieldValue.split("-");
+                    if (years.length == 2) {
+                        builder.fromYearAndToYear(Integer.valueOf(years[0]), Integer.valueOf(years[1]));
+                    }
+                }
+                default -> builder.defaultFieldPhrase(fieldValue);
+            }
+        }
+
+    }
}
diff --git a/src/main/java/org/jabref/logic/importer/WebFetchers.java b/src/main/java/org/jabref/logic/importer/WebFetchers.java
index b73dbf8191b..4705bbf93e5 100644
--- a/src/main/java/org/jabref/logic/importer/WebFetchers.java
+++ b/src/main/java/org/jabref/logic/importer/WebFetchers.java
@@ -11,6 +11,7 @@
import org.jabref.logic.importer.fetcher.ApsFetcher;
import org.jabref.logic.importer.fetcher.ArXiv;
import org.jabref.logic.importer.fetcher.AstrophysicsDataSystem;
+import org.jabref.logic.importer.fetcher.CaptchaSolver;
import org.jabref.logic.importer.fetcher.CiteSeer;
import org.jabref.logic.importer.fetcher.CollectionOfComputerScienceBibliographiesFetcher;
import org.jabref.logic.importer.fetcher.CompositeSearchBasedFetcher;
@@ -31,6 +32,7 @@
import org.jabref.logic.importer.fetcher.MathSciNet;
import org.jabref.logic.importer.fetcher.MedlineFetcher;
import org.jabref.logic.importer.fetcher.Medra;
+import org.jabref.logic.importer.fetcher.NoneCaptchaSolver;
import org.jabref.logic.importer.fetcher.OpenAccessDoi;
import org.jabref.logic.importer.fetcher.RfcFetcher;
import org.jabref.logic.importer.fetcher.ScienceDirect;
@@ -51,6 +53,13 @@ public class WebFetchers {
private WebFetchers() {
}
+ // Default CaptchaSolver is the useless one (which just does not throw an exception)
+ private static CaptchaSolver captchaSolver = new NoneCaptchaSolver();
+
+ /**
+ * Replaces the CaptchaSolver that is handed to the GoogleScholar fetchers created by this class from now on.
+ * The solver is passed to the fetcher's constructor, so fetchers created earlier keep the solver they got.
+ *
+ * @param captchaSolver the solver to pass to newly created fetchers
+ */
+ public static void setCaptchaSolver(CaptchaSolver captchaSolver) {
+ WebFetchers.captchaSolver = captchaSolver;
+ }
+
public static Optional getIdBasedFetcherForField(Field field, ImportFormatPreferences preferences) {
IdBasedFetcher fetcher;
@@ -96,7 +105,7 @@ public static SortedSet getSearchBasedFetchers(ImportFormatP
set.add(new ZbMATH(importFormatPreferences));
// see https://github.com/JabRef/jabref/issues/5804
// set.add(new ACMPortalFetcher(importFormatPreferences));
- set.add(new GoogleScholar(importFormatPreferences));
+ set.add(new GoogleScholar(importFormatPreferences, captchaSolver));
set.add(new DBLPFetcher(importFormatPreferences));
set.add(new SpringerFetcher());
set.add(new CrossRef());
@@ -170,7 +179,7 @@ public static Set getFullTextFetchers(ImportFormatPreferences i
fetchers.add(new ApsFetcher());
// Meta search
fetchers.add(new JstorFetcher(importFormatPreferences));
- fetchers.add(new GoogleScholar(importFormatPreferences));
+ fetchers.add(new GoogleScholar(importFormatPreferences, captchaSolver));
fetchers.add(new OpenAccessDoi());
return fetchers;
diff --git a/src/main/java/org/jabref/logic/importer/fetcher/CaptchaSolver.java b/src/main/java/org/jabref/logic/importer/fetcher/CaptchaSolver.java
new file mode 100644
index 00000000000..003a3c10e01
--- /dev/null
+++ b/src/main/java/org/jabref/logic/importer/fetcher/CaptchaSolver.java
@@ -0,0 +1,12 @@
+package org.jabref.logic.importer.fetcher;
+
+/**
+ * Strategy for getting past a captcha page.
+ * Implementations added alongside this interface: CaptchaSolverDialog (shows the page to the user)
+ * and NoneCaptchaSolver (returns an empty string).
+ */
+public interface CaptchaSolver {
+
+ /**
+ * Instructs the user to solve the captcha given at the passed URL.
+ *
+ * @param queryURL the URL to query
+ * @return html content after solving the captcha
+ */
+ String solve(String queryURL);
+}
diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java
index 58d6c8546b7..96d44b7608a 100644
--- a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java
+++ b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java
@@ -13,6 +13,11 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import javafx.application.Platform;
+import javafx.scene.control.ButtonType;
+import javafx.scene.web.WebView;
+
+import org.jabref.gui.util.BaseDialog;
import org.jabref.logic.help.HelpFile;
import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.FulltextFetcher;
@@ -27,6 +32,8 @@
import org.jabref.model.paging.Page;
import org.jabref.model.util.DummyFileUpdateMonitor;
+import com.sun.star.sheet.XSolver;
+import kong.unirest.Unirest;
import org.apache.http.client.utils.URIBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
@@ -42,6 +49,7 @@
public class GoogleScholar implements FulltextFetcher, PagedSearchBasedFetcher {
private static final Logger LOGGER = LoggerFactory.getLogger(GoogleScholar.class);
+ private static final Pattern LINK_TO_SUBPAGE_PATTERN = Pattern.compile("data-clk-atid=\"([^\"]*)\"");
private static final Pattern LINK_TO_BIB_PATTERN = Pattern.compile("(https:\\/\\/scholar.googleusercontent.com\\/scholar.bib[^\"]*)");
private static final String BASIC_SEARCH_URL = "https://scholar.google.ch/scholar?";
@@ -49,10 +57,12 @@ public class GoogleScholar implements FulltextFetcher, PagedSearchBasedFetcher {
private static final int NUM_RESULTS = 10;
private final ImportFormatPreferences importFormatPreferences;
+ private CaptchaSolver captchaSolver;
-    public GoogleScholar(ImportFormatPreferences importFormatPreferences) {
+    /**
+     * @param importFormatPreferences preferences handed to the BibTeX parser that reads the downloaded entries; must not be null
+     * @param solver                  consulted when a search request is answered with a captcha challenge; must not be null
+     */
+    public GoogleScholar(ImportFormatPreferences importFormatPreferences, CaptchaSolver solver) {
         Objects.requireNonNull(importFormatPreferences);
+        // Fail here instead of with a NullPointerException in the middle of the captcha handling
+        Objects.requireNonNull(solver);
         this.importFormatPreferences = importFormatPreferences;
+        this.captchaSolver = solver;
     }
@Override
@@ -79,11 +89,6 @@ public Optional findFullText(BibEntry entry) throws IOException, FetcherExc
}
}
- @Override
- public TrustLevel getTrustLevel() {
- return TrustLevel.META_SEARCH;
- }
-
private Optional search(String url) throws IOException {
Optional pdfLink = Optional.empty();
@@ -113,8 +118,9 @@ private Optional search(String url) throws IOException {
return pdfLink;
}
- private boolean needsCaptcha(String body) {
- return body.contains("id=\"gs_captcha_ccl\"");
+ @Override
+ public TrustLevel getTrustLevel() {
+ return TrustLevel.META_SEARCH;
}
@Override
@@ -127,33 +133,115 @@ public Optional getHelpPage() {
return Optional.of(HelpFile.FETCHER_GOOGLE_SCHOLAR);
}
+    /**
+     * Fetches one page of results from Google Scholar.
+     * <p>
+     * If Google answers with a status that indicates a captcha challenge, the configured CaptchaSolver is asked
+     * for the page content and the entries are extracted from that content instead.
+     *
+     * @param complexSearchQuery the query to send
+     * @param pageNumber         zero-based number of the page to fetch
+     * @return the entries found for the given page
+     * @throws FetcherException if the request fails, or if a captcha was hit and no entries could be obtained
+     *                          from the content returned by the CaptchaSolver
+     */
+    @Override
+    public Page performSearchPaged(ComplexSearchQuery complexSearchQuery, int pageNumber) throws FetcherException {
+        LOGGER.debug("Using query {}", complexSearchQuery);
+        List<BibEntry> foundEntries = new ArrayList<>(getPageSize());
+
+        String complexQueryString = constructComplexQueryString(complexSearchQuery);
+        final URIBuilder uriBuilder;
+        try {
+            uriBuilder = new URIBuilder(BASIC_SEARCH_URL);
+        } catch (URISyntaxException e) {
+            throw new FetcherException("Error while fetching from " + getName(), e);
+        }
+        uriBuilder.addParameter("hl", "en");
+        uriBuilder.addParameter("btnG", "Search");
+        uriBuilder.addParameter("q", complexQueryString);
+        uriBuilder.addParameter("start", String.valueOf(pageNumber * getPageSize()));
+        uriBuilder.addParameter("num", String.valueOf(getPageSize()));
+        complexSearchQuery.getFromYear().ifPresent(year -> uriBuilder.addParameter("as_ylo", year.toString()));
+        complexSearchQuery.getToYear().ifPresent(year -> uriBuilder.addParameter("as_yhi", year.toString()));
+        complexSearchQuery.getSingleYear().ifPresent(year -> {
+            uriBuilder.addParameter("as_ylo", year.toString());
+            uriBuilder.addParameter("as_yhi", year.toString());
+        });
+
+        String queryURL = uriBuilder.toString();
+        LOGGER.debug("Using URL {}", queryURL);
+        try {
+            addHitsFromQuery(foundEntries, queryURL);
+        } catch (IOException e) {
+            LOGGER.info("IOException for URL {}", queryURL, e);
+            if (!indicatesCaptcha(e)) {
+                throw new FetcherException("Error while fetching from " + getName(), e);
+            }
+
+            LOGGER.debug("Captcha found. Calling the CaptchaSolver");
+            String content = captchaSolver.solve(queryURL);
+            LOGGER.debug("Returned result {}", content);
+            // The solver returns the complete result page, so drop whatever was collected before the failure
+            foundEntries.clear();
+            try {
+                extractEntriesFromContent(content, foundEntries);
+            } catch (IOException ioException) {
+                LOGGER.error("Still failing at Google Scholar", ioException);
+            }
+            // Report the failure only if solving the captcha did not yield any entry.
+            // Trade-off: a solved page that legitimately contains no hits is reported as a failure, too.
+            if (foundEntries.isEmpty()) {
+                throw new FetcherException("Fetching from Google Scholar failed.",
+                        Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e);
+            }
+        }
+        return new Page<>(complexQueryString, pageNumber, foundEntries);
+    }
+
+    /**
+     * Checks whether the given exception reports one of the HTTP status codes Google uses when redirecting to a captcha challenge.
+     * <p>
+     * If there are too many requests from the same IP address, Google answers with a 403, 429, or 503 and redirects to a captcha challenge.
+     * The caught IOException looks for example like this:
+     * java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0
+     */
+    private static boolean indicatesCaptcha(IOException e) {
+        String message = e.getMessage();
+        if (message == null) {
+            // An IOException is not required to carry a message
+            return false;
+        }
+        return message.contains("Server returned HTTP response code: 403 for URL")
+                || message.contains("Server returned HTTP response code: 429 for URL")
+                || message.contains("Server returned HTTP response code: 503 for URL");
+    }
+
+
+ /**
+ * Translates the structured query into the query string sent as parameter "q".
+ * Default field phrases are used as they are, each author is prefixed with "author:", all title phrases are
+ * joined with blanks behind a single "allintitle:" (omitted if there is no title phrase), and the journal is
+ * prefixed with "source:". The resulting terms are joined with blanks.
+ */
private String constructComplexQueryString(ComplexSearchQuery complexSearchQuery) {
- List searchTerms = new ArrayList<>();
- searchTerms.addAll(complexSearchQuery.getDefaultFieldPhrases());
+ List searchTerms = new ArrayList<>(complexSearchQuery.getDefaultFieldPhrases());
complexSearchQuery.getAuthors().forEach(author -> searchTerms.add("author:" + author));
- searchTerms.add("allintitle:" + String.join(" ", complexSearchQuery.getTitlePhrases()));
+ if (!complexSearchQuery.getTitlePhrases().isEmpty()) {
+ searchTerms.add("allintitle:" + String.join(" ", complexSearchQuery.getTitlePhrases()));
+ }
complexSearchQuery.getJournal().ifPresent(journal -> searchTerms.add("source:" + journal));
// API automatically ANDs the terms
return String.join(" ", searchTerms);
}
+ /**
+ * Downloads the result page at the given URL and appends the entries extracted from it to the given list.
+ * Before downloading, the download object is passed to obtainAndModifyCookie.
+ *
+ * @throws FetcherException if the downloaded page contains the captcha marker checked by needsCaptcha
+ */
private void addHitsFromQuery(List entryList, String queryURL) throws IOException, FetcherException {
- String content = new URLDownload(queryURL).asString();
+ LOGGER.debug("Downloading from {}", queryURL);
+ URLDownload urlDownload = new URLDownload(queryURL);
+ obtainAndModifyCookie(urlDownload);
+ String content = urlDownload.asString();
if (needsCaptcha(content)) {
+ // NOTE(review): performSearchPaged only catches IOException, so the CaptchaSolver is not consulted on this path - confirm that this is intended
- throw new FetcherException("Fetching from Google Scholar failed: Captacha hit at " + queryURL + ".",
+ throw new FetcherException("Fetching from Google Scholar failed: Captcha hit at " + queryURL + ".",
Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), null);
}
- Matcher matcher = LINK_TO_BIB_PATTERN.matcher(content);
+ extractEntriesFromContent(content, entryList);
+ }
+
+    /**
+     * Extracts the entries of all hits contained in the given result page and appends them to the given list.
+     * Each hit is identified by the value of a data-clk-atid attribute and is resolved via its own info page.
+     *
+     * @param content   HTML of a result page
+     * @param entryList list the found entries are appended to
+     */
+    private void extractEntriesFromContent(String content, List entryList) throws IOException, FetcherException {
+        // Collect the ids first; a repeated id is kept only once so that no hit is downloaded twice
+        List<String> hitIds = new ArrayList<>();
+        Matcher subPageMatcher = LINK_TO_SUBPAGE_PATTERN.matcher(content);
+        while (subPageMatcher.find()) {
+            String hitId = subPageMatcher.group(1);
+            if (!hitIds.contains(hitId)) {
+                hitIds.add(hitId);
+            }
+        }
+        if (hitIds.isEmpty()) {
+            LOGGER.debug("No data-clk-atid found in html {}", content);
+            return;
+        }
+
+        // Resolve every hit of the page, not only the first one
+        for (String hitId : hitIds) {
+            addEntriesFromInfoPage(hitId, entryList);
+        }
+    }
+
+    /**
+     * Downloads the info page of a single hit and appends the entry of each BibTeX link found there.
+     */
+    private void addEntriesFromInfoPage(String hitId, List<BibEntry> entryList) throws IOException, FetcherException {
+        String infoPageUrl = BASIC_SEARCH_URL + "q=info:" + hitId + ":scholar.google.com/&output=cite&scirp=0&hl=en";
+        LOGGER.debug("Using infoPageUrl {}", infoPageUrl);
+        // FIXME: Existing cookies should be reused.
+        URLDownload infoPageUrlDownload = new URLDownload(infoPageUrl);
+        LOGGER.debug("Downloading from {}", infoPageUrl);
+        String infoPageContent = infoPageUrlDownload.asString();
+
+        Matcher matcher = LINK_TO_BIB_PATTERN.matcher(infoPageContent);
+        boolean found = false;
         while (matcher.find()) {
+            found = true;
             // The link is taken from HTML source, where "&" is written as "&amp;"
             String citationsPageURL = matcher.group().replace("&amp;", "&");
+            LOGGER.debug("Using citationsPageURL {}", citationsPageURL);
             BibEntry newEntry = downloadEntry(citationsPageURL);
             entryList.add(newEntry);
         }
+        if (!found) {
+            LOGGER.debug("Did not found pattern in html {}", infoPageContent);
+        }
     }
private BibEntry downloadEntry(String link) throws IOException, FetcherException {
+ LOGGER.debug("Downloading from {}", link);
String downloadedContent = new URLDownload(link).asString();
BibtexParser parser = new BibtexParser(importFormatPreferences, new DummyFileUpdateMonitor());
ParserResult result = parser.parse(new StringReader(downloadedContent));
@@ -162,7 +250,7 @@ private BibEntry downloadEntry(String link) throws IOException, FetcherException
} else {
Collection entries = result.getDatabase().getEntries();
if (entries.size() != 1) {
- LOGGER.debug(entries.size() + " entries found! (" + link + ")");
+ LOGGER.debug("{} entries found ({})", entries.size(), link);
throw new FetcherException("Parsing entries from Google Scholar bib file failed.");
} else {
BibEntry entry = entries.iterator().next();
@@ -171,9 +259,8 @@ private BibEntry downloadEntry(String link) throws IOException, FetcherException
}
}
- private void obtainAndModifyCookie() throws FetcherException {
+ private void obtainAndModifyCookie(URLDownload downloader) throws FetcherException {
try {
- URLDownload downloader = new URLDownload("https://scholar.google.com");
List cookies = downloader.getCookieFromUrl();
for (HttpCookie cookie : cookies) {
// append "CF=4" which represents "Citation format bibtex"
@@ -184,48 +271,34 @@ private void obtainAndModifyCookie() throws FetcherException {
}
}
- @Override
- public Page performSearchPaged(ComplexSearchQuery complexSearchQuery, int pageNumber) throws FetcherException {
- try {
- obtainAndModifyCookie();
- List foundEntries = new ArrayList<>(10);
+ /**
+ * Schedules a CaptchaDialog for the given argument on the JavaFX application thread.
+ * The dialog is only scheduled via Platform.runLater; this method does not wait for it to be closed.
+ * <p>
+ * Despite its name, the argument is handed to the dialog as content and is rendered via loadContent,
+ * i.e., it is treated as HTML and not as a URL to fetch.
+ * <p>
+ * NOTE(review): no caller is visible in this change and the retry logic below is commented out - presumably
+ * a leftover of an earlier approach; confirm whether this method is still needed.
+ */
+ public void displayCaptchaDialog(String link) {
+ Platform.runLater(() -> new CaptchaDialog(link).showAndWait());
+ /*
+ if (dialog.retry()) {
+ displayCaptchaDialog(link);
+ }
+ */
+ }
- String complexQueryString = constructComplexQueryString(complexSearchQuery);
- URIBuilder uriBuilder = new URIBuilder(BASIC_SEARCH_URL);
- uriBuilder.addParameter("hl", "en");
- uriBuilder.addParameter("btnG", "Search");
- uriBuilder.addParameter("q", complexQueryString);
- uriBuilder.addParameter("start", String.valueOf(pageNumber * getPageSize()));
- uriBuilder.addParameter("num", String.valueOf(getPageSize()));
- complexSearchQuery.getFromYear().ifPresent(year -> uriBuilder.addParameter("as_ylo", year.toString()));
- complexSearchQuery.getToYear().ifPresent(year -> uriBuilder.addParameter("as_yhi", year.toString()));
- complexSearchQuery.getSingleYear().ifPresent(year -> {
- uriBuilder.addParameter("as_ylo", year.toString());
- uriBuilder.addParameter("as_yhi", year.toString());
- });
-
- try {
- addHitsFromQuery(foundEntries, uriBuilder.toString());
-
- if (foundEntries.size() == 10) {
- uriBuilder.addParameter("start", "10");
- addHitsFromQuery(foundEntries, uriBuilder.toString());
- }
- } catch (IOException e) {
- LOGGER.info("IOException for URL {}", uriBuilder.toString());
- // if there are too much requests from the same IP adress google is answering with a 503 and redirecting to a captcha challenge
- // The caught IOException looks for example like this:
- // java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0
- if (e.getMessage().contains("Server returned HTTP response code: 503 for URL")) {
- throw new FetcherException("Fetching from Google Scholar failed.",
- Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e);
- } else {
- throw new FetcherException("Error while fetching from " + getName(), e);
- }
- }
- return new Page<>(complexQueryString, pageNumber, foundEntries);
- } catch (URISyntaxException e) {
- throw new FetcherException("Error while fetching from " + getName(), e);
+ /**
+ * Checks whether the given HTML contains an element with the id "gs_captcha_ccl",
+ * which is treated as the marker of a captcha page.
+ */
+ private boolean needsCaptcha(String body) {
+ return body.contains("id=\"gs_captcha_ccl\"");
+ }
+
+ private static final class CaptchaDialog extends BaseDialog {
+ /**
+ * Creates a dialog with a close button that renders the given HTML in a web view.
+ *
+ * @param content HTML to render; it is passed to loadContent and not fetched from the network
+ */
+ public CaptchaDialog(String content) {
+ super();
+ this.getDialogPane().getButtonTypes().add(ButtonType.CLOSE);
+ this.getDialogPane().lookupButton(ButtonType.CLOSE).setVisible(true);
+ WebView webView = new WebView();
+
+ // webView.getEngine().setJavaScriptEnabled(true);
+ // A fixed Firefox user agent is used here instead of URLDownload.USER_AGENT
+ webView.getEngine().setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0");
+ this.getDialogPane().setContent(webView);
+ webView.getEngine().loadContent(content);
+ }
+
+ /**
+ * Always returns false.
+ * NOTE(review): the only use visible in this change is inside commented-out code - confirm whether it is still needed.
+ */
+ public boolean retry() {
+ return false;
+ }
}
}
diff --git a/src/main/java/org/jabref/logic/importer/fetcher/NoneCaptchaSolver.java b/src/main/java/org/jabref/logic/importer/fetcher/NoneCaptchaSolver.java
new file mode 100644
index 00000000000..7c0376674e5
--- /dev/null
+++ b/src/main/java/org/jabref/logic/importer/fetcher/NoneCaptchaSolver.java
@@ -0,0 +1,8 @@
+package org.jabref.logic.importer.fetcher;
+
+/**
+ * CaptchaSolver that does not try to solve anything: it ignores the URL and always returns an empty string.
+ * It is the default solver in WebFetchers and is used by the tests.
+ */
+public class NoneCaptchaSolver implements CaptchaSolver {
+ @Override
+ public String solve(String queryURL) {
+ return "";
+ }
+}
diff --git a/src/main/java/org/jabref/logic/net/URLDownload.java b/src/main/java/org/jabref/logic/net/URLDownload.java
index 99887c7ac93..7bd2baecd20 100644
--- a/src/main/java/org/jabref/logic/net/URLDownload.java
+++ b/src/main/java/org/jabref/logic/net/URLDownload.java
@@ -235,6 +235,10 @@ public String asString() throws IOException {
return asString(StandardCharsets.UTF_8);
}
+ /**
+ * Returns a modifiable list of cookies related to the URL of this URLDownload.
+ * Any modifications will be used at later calls
+ */
public List getCookieFromUrl() throws IOException {
CookieManager cookieManager = new CookieManager();
CookieHandler.setDefault(cookieManager);
diff --git a/src/test/java/org/jabref/logic/importer/fetcher/CompositeSearchBasedFetcherTest.java b/src/test/java/org/jabref/logic/importer/fetcher/CompositeSearchBasedFetcherTest.java
index b639c35cf5b..75603d35314 100644
--- a/src/test/java/org/jabref/logic/importer/fetcher/CompositeSearchBasedFetcherTest.java
+++ b/src/test/java/org/jabref/logic/importer/fetcher/CompositeSearchBasedFetcherTest.java
@@ -101,7 +101,7 @@ static Stream performSearchParameters() {
list.add(new AstrophysicsDataSystem(importFormatPreferences));
list.add(new MathSciNet(importFormatPreferences));
list.add(new ZbMATH(importFormatPreferences));
- list.add(new GoogleScholar(importFormatPreferences));
+ list.add(new GoogleScholar(importFormatPreferences, new NoneCaptchaSolver()));
list.add(new DBLPFetcher(importFormatPreferences));
list.add(new SpringerFetcher());
list.add(new CrossRef());
diff --git a/src/test/java/org/jabref/logic/importer/fetcher/GoogleScholarTest.java b/src/test/java/org/jabref/logic/importer/fetcher/GoogleScholarTest.java
index 87ff79ac608..bbeb3f6773a 100644
--- a/src/test/java/org/jabref/logic/importer/fetcher/GoogleScholarTest.java
+++ b/src/test/java/org/jabref/logic/importer/fetcher/GoogleScholarTest.java
@@ -27,49 +27,49 @@
@FetcherTest
class GoogleScholarTest implements SearchBasedFetcherCapabilityTest, PagedSearchFetcherTest {
- private GoogleScholar finder;
- private BibEntry entry;
+ private GoogleScholar fetcher;
@BeforeEach
void setUp() {
ImportFormatPreferences importFormatPreferences = mock(ImportFormatPreferences.class);
when(importFormatPreferences.getFieldContentFormatterPreferences()).thenReturn(
mock(FieldContentFormatterPreferences.class));
- finder = new GoogleScholar(importFormatPreferences);
- entry = new BibEntry();
+ fetcher = new GoogleScholar(importFormatPreferences, new NoneCaptchaSolver());
}
@Test
@DisabledOnCIServer("CI server is blocked by Google")
void linkFound() throws IOException, FetcherException {
- entry.setField(StandardField.TITLE, "Towards Application Portability in Platform as a Service");
+ BibEntry entry = new BibEntry()
+ .withField(StandardField.TITLE, "Towards Application Portability in Platform as a Service");
assertEquals(
Optional.of(new URL("https://www.uni-bamberg.de/fileadmin/uni/fakultaeten/wiai_lehrstuehle/praktische_informatik/Dateien/Publikationen/sose14-towards-application-portability-in-paas.pdf")),
- finder.findFullText(entry)
+ fetcher.findFullText(entry)
);
}
@Test
@DisabledOnCIServer("CI server is blocked by Google")
void noLinkFound() throws IOException, FetcherException {
- entry.setField(StandardField.TITLE, "Curriculum programme of career-oriented java specialty guided by principles of software engineering");
+ BibEntry entry = new BibEntry()
+ .withField(StandardField.TITLE, "Curriculum programme of career-oriented java specialty guided by principles of software engineering");
- assertEquals(Optional.empty(), finder.findFullText(entry));
+ assertEquals(Optional.empty(), fetcher.findFullText(entry));
}
@Test
@DisabledOnCIServer("CI server is blocked by Google")
void findSingleEntry() throws FetcherException {
- entry.setType(StandardEntryType.InProceedings);
- entry.setCitationKey("geiger2013detecting");
- entry.setField(StandardField.TITLE, "Detecting Interoperability and Correctness Issues in BPMN 2.0 Process Models.");
- entry.setField(StandardField.AUTHOR, "Geiger, Matthias and Wirtz, Guido");
- entry.setField(StandardField.BOOKTITLE, "ZEUS");
- entry.setField(StandardField.YEAR, "2013");
- entry.setField(StandardField.PAGES, "41--44");
+ BibEntry entry = new BibEntry(StandardEntryType.InProceedings)
+ .withCitationKey("geiger2013detecting")
+ .withField(StandardField.TITLE, "Detecting Interoperability and Correctness Issues in BPMN 2.0 Process Models.")
+ .withField(StandardField.AUTHOR, "Geiger, Matthias and Wirtz, Guido")
+ .withField(StandardField.BOOKTITLE, "ZEUS")
+ .withField(StandardField.YEAR, "2013")
+ .withField(StandardField.PAGES, "41--44");
- List foundEntries = finder.performSearch("Detecting Interoperability and Correctness Issues in BPMN 2.0 Process Models");
+ List foundEntries = fetcher.performSearch("Detecting Interoperability and Correctness Issues in BPMN 2.0 Process Models");
assertEquals(Collections.singletonList(entry), foundEntries);
}
@@ -77,19 +77,19 @@ void findSingleEntry() throws FetcherException {
@Test
@DisabledOnCIServer("CI server is blocked by Google")
void findManyEntries() throws FetcherException {
- List foundEntries = finder.performSearch("random test string");
+ List foundEntries = fetcher.performSearch("random test string");
assertEquals(20, foundEntries.size());
}
@Override
public SearchBasedFetcher getFetcher() {
- return finder;
+ return fetcher;
}
@Override
public PagedSearchBasedFetcher getPagedFetcher() {
- return finder;
+ return fetcher;
}
@Override
diff --git a/src/test/resources/log4j2-test.xml b/src/test/resources/log4j2-test.xml
index 8c6a336420a..14f7339b051 100644
--- a/src/test/resources/log4j2-test.xml
+++ b/src/test/resources/log4j2-test.xml
@@ -6,6 +6,9 @@
+
+
+