-
-
Notifications
You must be signed in to change notification settings - Fork 2.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix Google Scholar fetcher for downloading a single entry #7075
Changes from 1 commit
4c25a1d
3f389c4
90efdd1
531fcad
4fcfb61
60c74e1
f3488e2
62a5100
07e93f4
9cad830
f504609
6775728
b691aed
f34d7f8
5051e1b
625778f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,6 +41,7 @@ | |
public class GoogleScholar implements FulltextFetcher, SearchBasedFetcher { | ||
private static final Logger LOGGER = LoggerFactory.getLogger(GoogleScholar.class); | ||
|
||
private static final Pattern LINK_TO_SUBPAGE_PATTERN = Pattern.compile("data-clk-atid=\"([^\"]*)\""); | ||
private static final Pattern LINK_TO_BIB_PATTERN = Pattern.compile("(https:\\/\\/scholar.googleusercontent.com\\/scholar.bib[^\"]*)"); | ||
|
||
private static final String BASIC_SEARCH_URL = "https://scholar.google.ch/scholar?"; | ||
|
@@ -128,11 +129,11 @@ public Optional<HelpFile> getHelpPage() { | |
|
||
@Override | ||
public List<BibEntry> performSearch(String query) throws FetcherException { | ||
LOGGER.debug("Using URL {}", query); | ||
LOGGER.debug("Using query {}", query); | ||
obtainAndModifyCookie(); | ||
List<BibEntry> foundEntries = new ArrayList<>(20); | ||
|
||
URIBuilder uriBuilder = null; | ||
URIBuilder uriBuilder; | ||
try { | ||
uriBuilder = new URIBuilder(BASIC_SEARCH_URL); | ||
} catch (URISyntaxException e) { | ||
|
@@ -143,14 +144,16 @@ public List<BibEntry> performSearch(String query) throws FetcherException { | |
uriBuilder.addParameter("btnG", "Search"); | ||
uriBuilder.addParameter("q", query); | ||
String queryURL = uriBuilder.toString(); | ||
LOGGER.debug("Using URL {}", queryURL); | ||
|
||
try { | ||
addHitsFromQuery(foundEntries, queryURL); | ||
} catch (IOException e) { | ||
// if there are too much requests from the same IP address google is answering with a 503 and redirecting to a captcha challenge | ||
// The caught IOException looks for example like this: | ||
// java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0 | ||
if (e.getMessage().contains("Server returned HTTP response code: 503 for URL")) { | ||
if (e.getMessage().contains("Server returned HTTP response code: 403 for URL") || | ||
(e.getMessage().contains("Server returned HTTP response code: 503 for URL"))) { | ||
throw new FetcherException("Fetching from Google Scholar at URL " + queryURL + " failed.", | ||
Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e); | ||
} else { | ||
|
@@ -214,22 +217,42 @@ private String constructComplexQueryString(ComplexSearchQuery complexSearchQuery | |
} | ||
|
||
private void addHitsFromQuery(List<BibEntry> entryList, String queryURL) throws IOException, FetcherException { | ||
LOGGER.debug("Downloading from {}", queryURL); | ||
String content = new URLDownload(queryURL).asString(); | ||
|
||
if (needsCaptcha(content)) { | ||
throw new FetcherException("Fetching from Google Scholar failed: Captacha hit at " + queryURL + ".", | ||
Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), null); | ||
} | ||
|
||
Matcher matcher = LINK_TO_BIB_PATTERN.matcher(content); | ||
Matcher matcher = LINK_TO_SUBPAGE_PATTERN.matcher(content); | ||
if (!matcher.find()) { | ||
LOGGER.debug("No data-clk-atid found in html {}", content); | ||
return; | ||
} | ||
|
||
String infoPageUrl = BASIC_SEARCH_URL + "q=info:" + matcher.group(1) + ":scholar.google.com/&output=cite&scirp=0&hl=en"; | ||
LOGGER.debug("Using infoPageUrl {}", infoPageUrl); | ||
URLDownload infoPageUrlDownload = new URLDownload(infoPageUrl); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you want to reuse the connection, you should use Unirest or jsoup. |
||
LOGGER.debug("Downloading from {}", infoPageUrl); | ||
String infoPageContent = infoPageUrlDownload.asString(); | ||
|
||
matcher = LINK_TO_BIB_PATTERN.matcher(infoPageContent); | ||
boolean found = false; | ||
while (matcher.find()) { | ||
found = true; | ||
String citationsPageURL = matcher.group().replace("&amp;", "&"); | ||
LOGGER.debug("Using citationsPageURL {}", citationsPageURL); | ||
BibEntry newEntry = downloadEntry(citationsPageURL); | ||
entryList.add(newEntry); | ||
} | ||
if (!found) { | ||
LOGGER.debug("Did not found pattern in html {}", infoPageContent); | ||
} | ||
} | ||
|
||
private BibEntry downloadEntry(String link) throws IOException, FetcherException { | ||
LOGGER.debug("Downloading from {}", link); | ||
String downloadedContent = new URLDownload(link).asString(); | ||
BibtexParser parser = new BibtexParser(importFormatPreferences, new DummyFileUpdateMonitor()); | ||
ParserResult result = parser.parse(new StringReader(downloadedContent)); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,64 +26,64 @@ | |
@FetcherTest | ||
class GoogleScholarTest implements SearchBasedFetcherCapabilityTest { | ||
|
||
private GoogleScholar finder; | ||
private BibEntry entry; | ||
private GoogleScholar fetcher; | ||
|
||
@BeforeEach | ||
void setUp() { | ||
ImportFormatPreferences importFormatPreferences = mock(ImportFormatPreferences.class); | ||
when(importFormatPreferences.getFieldContentFormatterPreferences()).thenReturn( | ||
mock(FieldContentFormatterPreferences.class)); | ||
finder = new GoogleScholar(importFormatPreferences); | ||
entry = new BibEntry(); | ||
fetcher = new GoogleScholar(importFormatPreferences); | ||
} | ||
|
||
@Test | ||
@DisabledOnCIServer("CI server is blocked by Google") | ||
void linkFound() throws IOException, FetcherException { | ||
entry.setField(StandardField.TITLE, "Towards Application Portability in Platform as a Service"); | ||
BibEntry entry = new BibEntry() | ||
.withField(StandardField.TITLE, "Towards Application Portability in Platform as a Service"); | ||
|
||
assertEquals( | ||
Optional.of(new URL("https://www.uni-bamberg.de/fileadmin/uni/fakultaeten/wiai_lehrstuehle/praktische_informatik/Dateien/Publikationen/sose14-towards-application-portability-in-paas.pdf")), | ||
finder.findFullText(entry) | ||
fetcher.findFullText(entry) | ||
); | ||
} | ||
|
||
@Test | ||
@DisabledOnCIServer("CI server is blocked by Google") | ||
void noLinkFound() throws IOException, FetcherException { | ||
entry.setField(StandardField.TITLE, "Curriculum programme of career-oriented java specialty guided by principles of software engineering"); | ||
BibEntry entry = new BibEntry() | ||
.withField(StandardField.TITLE, "Curriculum programme of career-oriented java specialty guided by principles of software engineering"); | ||
|
||
assertEquals(Optional.empty(), finder.findFullText(entry)); | ||
assertEquals(Optional.empty(), fetcher.findFullText(entry)); | ||
} | ||
|
||
@Test | ||
@DisabledOnCIServer("CI server is blocked by Google") | ||
void findSingleEntry() throws FetcherException { | ||
entry.setType(StandardEntryType.InProceedings); | ||
entry.setCitationKey("geiger2013detecting"); | ||
entry.setField(StandardField.TITLE, "Detecting Interoperability and Correctness Issues in BPMN 2.0 Process Models."); | ||
entry.setField(StandardField.AUTHOR, "Geiger, Matthias and Wirtz, Guido"); | ||
entry.setField(StandardField.BOOKTITLE, "ZEUS"); | ||
entry.setField(StandardField.YEAR, "2013"); | ||
entry.setField(StandardField.PAGES, "41--44"); | ||
BibEntry entry = new BibEntry(StandardEntryType.InProceedings) | ||
.withCitationKey("geiger2013detecting") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There is nothing wrong with the |
||
.withField(StandardField.TITLE, "Detecting Interoperability and Correctness Issues in BPMN 2.0 Process Models.") | ||
.withField(StandardField.AUTHOR, "Geiger, Matthias and Wirtz, Guido") | ||
.withField(StandardField.BOOKTITLE, "ZEUS") | ||
.withField(StandardField.YEAR, "2013") | ||
.withField(StandardField.PAGES, "41--44"); | ||
|
||
List<BibEntry> foundEntries = finder.performSearch("Detecting Interoperability and Correctness Issues in BPMN 2.0 Process Models"); | ||
List<BibEntry> foundEntries = fetcher.performSearch("Detecting Interoperability and Correctness Issues in BPMN 2.0 Process Models"); | ||
|
||
assertEquals(Collections.singletonList(entry), foundEntries); | ||
} | ||
|
||
@Test | ||
@DisabledOnCIServer("CI server is blocked by Google") | ||
void findManyEntries() throws FetcherException { | ||
List<BibEntry> foundEntries = finder.performSearch("random test string"); | ||
List<BibEntry> foundEntries = fetcher.performSearch("random test string"); | ||
|
||
assertEquals(20, foundEntries.size()); | ||
} | ||
|
||
@Override | ||
public SearchBasedFetcher getFetcher() { | ||
return finder; | ||
return fetcher; | ||
} | ||
|
||
@Override | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can also just start JabRef with -debug as a program argument.