Skip to content

Commit

Permalink
Rework fulltext crawlers and first prototype
Browse files Browse the repository at this point in the history
  • Loading branch information
stefan-kolb committed Aug 17, 2015
1 parent 2d630e9 commit 2e93572
Show file tree
Hide file tree
Showing 15 changed files with 222 additions and 298 deletions.
1 change: 1 addition & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ dependencies {
compile 'junit:junit:4.12'

compile 'org.jsoup:jsoup:1.8.3'
compile 'com.mashape.unirest:unirest-java:1.4.6'
}

sourceSets {
Expand Down
5 changes: 1 addition & 4 deletions src/main/java/net/sf/jabref/BasePanel.java
Original file line number Diff line number Diff line change
Expand Up @@ -404,9 +404,6 @@ public void action() {
*/
});

actions.put("test",// new AccessLinksForEntries.SaveWithLinkedFiles(this));
new FindFullTextAction(this));

// The action for saving a database.
actions.put("save", saveAction);

Expand Down Expand Up @@ -1735,7 +1732,7 @@ public void action() throws Throwable {
actions.put("removeFromGroup", new GroupAddRemoveDialog(this, false, false));
actions.put("moveToGroup", new GroupAddRemoveDialog(this, true, true));

//actions.put("downloadFullText", new FindFullTextAction(this));
actions.put("downloadFullText", new FindFullTextAction(this));
}

/**
Expand Down
7 changes: 4 additions & 3 deletions src/main/java/net/sf/jabref/BibtexEntryType.java
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,8 @@ public boolean isVisibleAtNewEntryDialog() {

static {
// Put the standard entry types into the type map.
if (!Globals.prefs.getBoolean(JabRefPreferences.BIBLATEX_MODE)) {
// FIXME: throws NullPointerException when using BibtexEntry without a JabRef instance -> Tests
//if (!Globals.prefs.getBoolean(JabRefPreferences.BIBLATEX_MODE)) {
ALL_TYPES.put("article", BibtexEntryTypes.ARTICLE);
ALL_TYPES.put("inbook", BibtexEntryTypes.INBOOK);
ALL_TYPES.put("book", BibtexEntryTypes.BOOK);
Expand All @@ -127,7 +128,7 @@ public boolean isVisibleAtNewEntryDialog() {
ALL_TYPES.put("misc", BibtexEntryTypes.MISC);
ALL_TYPES.put("other", BibtexEntryTypes.OTHER);
ALL_TYPES.put("ieeetranbstctl", BibtexEntryTypes.IEEETRANBSTCTL);
} else {
/*} else {
ALL_TYPES.put("article", BibLatexEntryTypes.ARTICLE);
ALL_TYPES.put("book", BibLatexEntryTypes.BOOK);
ALL_TYPES.put("inbook", BibLatexEntryTypes.INBOOK);
Expand Down Expand Up @@ -158,7 +159,7 @@ public boolean isVisibleAtNewEntryDialog() {
ALL_TYPES.put("techreport", BibLatexEntryTypes.TECHREPORT);
ALL_TYPES.put("www", BibLatexEntryTypes.WWW);
ALL_TYPES.put("ieeetranbstctl", BibLatexEntryTypes.IEEETRANBSTCTL);
}
}*/
// We need a record of the standard types, in case the user wants
// to remove a customized version. Therefore we clone the map.
STANDARD_TYPES = new TreeMap<String, BibtexEntryType>(ALL_TYPES);
Expand Down
8 changes: 4 additions & 4 deletions src/main/java/net/sf/jabref/JabRefFrame.java
Original file line number Diff line number Diff line change
Expand Up @@ -427,9 +427,9 @@ void addAction(Action a) {
GUIGlobals.getIconUrl("mergeentries"));

private final AbstractAction dbImport = new DbImportAction(this).getAction();
private final AbstractAction//downloadFullText = new GeneralAction("downloadFullText", "Look up full text document",
// Globals.lang("Follow DOI or URL link and try to locate PDF full text document")),
increaseFontSize = new IncreaseTableFontSizeAction();
private final AbstractAction downloadFullText = new GeneralAction("downloadFullText", "Look up full text document",
Localization.lang("Follow DOI or URL link and try to locate PDF full text document"));
private final AbstractAction increaseFontSize = new IncreaseTableFontSizeAction();
private final AbstractAction decreseFontSize = new DecreaseTableFontSizeAction();
private final AbstractAction resolveDuplicateKeys = new GeneralAction("resolveDuplicateKeys", "Resolve duplicate BibTeX keys",
Localization.lang("Find and remove duplicate BibTeX keys"),
Expand Down Expand Up @@ -1457,7 +1457,7 @@ private void fillMenu() {
tools.add(makeKeyAction);
tools.add(Cleanup);
tools.add(mergeEntries);
//tools.add(downloadFullText);
tools.add(downloadFullText);
tools.add(newSubDatabaseAction);
tools.add(writeXmpAction);
OpenOfficePanel otp = OpenOfficePanel.getInstance();
Expand Down
125 changes: 36 additions & 89 deletions src/main/java/net/sf/jabref/external/FindFullText.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

import net.sf.jabref.BibtexEntry;
import net.sf.jabref.logic.crawler.ACS;
import net.sf.jabref.logic.crawler.GoogleScholar;
import net.sf.jabref.logic.crawler.ScienceDirect;
import net.sf.jabref.logic.crawler.SpringerLink;
import net.sf.jabref.util.DOI;
Expand All @@ -39,103 +41,61 @@
*/
public class FindFullText {

private static final int
FOUND_PDF = 0;
private static final int FOUND_PDF = 0;
public static final int WRONG_MIME_TYPE = 1;
public static final int UNKNOWN_DOMAIN = 2;
public static final int LINK_NOT_FOUND = 3;
public static final int IO_EXCEPTION = 4;
public static final int NO_URLS_DEFINED = 5;
public static final int LINK_NOT_FOUND = 2;
public static final int IO_EXCEPTION = 3;

private final List<FullTextFinder> finders = new ArrayList<FullTextFinder>();


/**
 * Creates a full-text lookup with the default set of finders registered.
 * <p>
 * Finders are added to the {@code finders} list in a fixed order: the
 * publisher-specific crawlers first, followed by the meta search.
 */
public FindFullText() {
// Ordering is important, authorities first!
// Publisher
finders.add(new ScienceDirect());
finders.add(new SpringerLink());
finders.add(new ACS());
// Meta search
finders.add(new GoogleScholar());
}

public FindResult findFullText(BibtexEntry entry) {
String urlText = entry.getField("url");
String doiText = entry.getField("doi");
// First try the Doi link, if defined:
if (doiText != null && !doiText.trim().isEmpty()) {
FindResult resDoi = lookForFullTextAtURL(new DOI(doiText).getURL());
if (resDoi.status == FindFullText.FOUND_PDF) {
return resDoi;
} else if (urlText != null && !urlText.trim().isEmpty()) {
FindResult resUrl = lookForFullTextAtURL(urlText);
if (resUrl.status == FindFullText.FOUND_PDF) {
return resUrl;
} else {
return resDoi; // If both URL and Doi fail, we assume that the error code for Doi is
// probably the most relevant.
}
} else {
return resDoi;
}
}
// No Doi? Try URL:
else if (urlText != null && !urlText.trim().isEmpty()) {
return lookForFullTextAtURL(urlText);
}
// No URL either? Return error code.
else {
return new FindResult(FindFullText.NO_URLS_DEFINED, null);
}
}

private FindResult lookForFullTextAtURL(String urlText) {
try {
URL url = new URL(urlText);
url = resolveRedirects(url, 0);
boolean domainKnown = false;
for (FullTextFinder finder : finders) {
if (finder.supportsSite(url)) {
domainKnown = true;
URL result = finder.findFullTextURL(url);
if (result != null) {

// Check the MIME type of this URL to see if it is a PDF. If not,
// it could be because the user doesn't have access:
try {
String mimeType = new URLDownload(result).determineMimeType();
if (mimeType != null && mimeType.toLowerCase().equals("application/pdf")) {
return new FindResult(result, url);
}
else {
new URLDownload(result).downloadToFile(new File("page.html"));
return new FindResult(FindFullText.WRONG_MIME_TYPE, url);
}
} catch (IOException ex) {
ex.printStackTrace();
return new FindResult(FindFullText.IO_EXCEPTION, url);
}
for (FullTextFinder finder : finders) {
try {
Optional<URL> result = finder.findFullText(entry);

if (result.isPresent()) {
// TODO: recheck this!
// Check the MIME type of this URL to see if it is a PDF. If not,
// it could be because the user doesn't have access:
// FIXME: redirection breaks this!
// Property-based software engineering measurement
// http://drum.lib.umd.edu/bitstream/1903/19/2/CS-TR-3368.pdf
// FIXME:
// INFO: Fulltext PDF found @ Google: https://www.uni-bamberg.de/fileadmin/uni/fakultaeten/wiai_lehrstuehle/praktische_informatik/Dateien/Publikationen/sose14-towards-application-portability-in-paas.pdf
// javax.net.ssl.SSLProtocolException: handshake alert: unrecognized_name
// http://stackoverflow.com/questions/7615645/ssl-handshake-alert-unrecognized-name-error-since-upgrade-to-java-1-7-0
String mimeType = new URLDownload(result.get()).determineMimeType();
if (mimeType != null && mimeType.toLowerCase().equals("application/pdf")) {
return new FindResult(result.get(), result.get());
} else {
return new FindResult(WRONG_MIME_TYPE, result.get());
}

}
} catch (IOException ex) {
ex.printStackTrace();
return new FindResult(IO_EXCEPTION, null);
}
if (!domainKnown) {
return new FindResult(FindFullText.UNKNOWN_DOMAIN, url);
} else {
return new FindResult(FindFullText.LINK_NOT_FOUND, url);
}
} catch (MalformedURLException e) {
e.printStackTrace();

} catch (IOException e) {
e.printStackTrace();
}

return null;
return new FindResult(LINK_NOT_FOUND, null);
}

/**
* Follow redirects until the final location is reached. This is necessary to handle Doi links, which
* redirect to publishers' web sites. We need to know the publisher's domain name in order to choose
* which FullTextFinder to use.
* @param url The url to start with.
*
* @param url The url to start with.
* @param redirectCount The number of previous redirects. We will follow a maximum of 5 redirects.
* @return the final URL, or the initial one in case there is no redirect.
* @throws IOException for connection error
Expand Down Expand Up @@ -188,8 +148,7 @@ public static String loadPage(URL url) throws IOException {
sb.append((char) c);
}
return sb.toString();
}
else {
} else {
return null; // TODO: are other types of connection (https?) relevant?
}
} finally {
Expand Down Expand Up @@ -231,16 +190,4 @@ public FindResult(int status, URL originalUrl) {
}
}
}


public static void dumpToFile(String text, File f) {
try {
FileWriter fw = new FileWriter(f);
fw.write(text);
fw.close();
} catch (IOException e) {
e.printStackTrace();

}
}
}
13 changes: 1 addition & 12 deletions src/main/java/net/sf/jabref/external/FindFullTextAction.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,7 @@ public void run() {

@Override
public void update() {
//pdfURL = new URL("http://geog-www.sbs.ohio-state.edu/faculty/bmark/abbott_etal_ppp03.pdf");
if (result.url != null) {
//System.out.println("PDF URL: "+result.url);
String bibtexKey = entry.getCiteKey();
String[] dirs = basePanel.metaData().getFileDirectory(GUIGlobals.FILE_FIELD);
if (dirs.length == 0) {
Expand All @@ -69,7 +67,6 @@ public void update() {

@Override
public void downloadComplete(FileListEntry file) {
System.out.println("finished");
FileListTableModel tm = new FileListTableModel();
String oldValue = entry.getField(GUIGlobals.FILE_FIELD);
tm.setContent(oldValue);
Expand All @@ -90,24 +87,16 @@ public void downloadComplete(FileListEntry file) {
else {
String message = null;
switch (result.status) {
case FindFullText.UNKNOWN_DOMAIN:
message = Localization.lang("Unable to find full text article. No search algorithm "
+ "defined for the '%0' web site.", result.host);
break;
case FindFullText.WRONG_MIME_TYPE:
message = Localization.lang("Found pdf link, but received the wrong MIME type. "
+ "This could indicate that you don't have access to the fulltext article.");
break;
case FindFullText.LINK_NOT_FOUND:
message = Localization.lang("Unable to find full text document in the linked web page.");
message = Localization.lang("Unable to find full text document.");
break;
case FindFullText.IO_EXCEPTION:
message = Localization.lang("Connection error when trying to find full text document.");
break;
case FindFullText.NO_URLS_DEFINED:
message = Localization.lang("This entry provides no URL or DOI links.");
break;

}
basePanel.output(Localization.lang("Full text article download failed"));
JOptionPane.showMessageDialog(basePanel.frame(), message, Localization.lang("Full text article download failed"),
Expand Down
30 changes: 12 additions & 18 deletions src/main/java/net/sf/jabref/external/FullTextFinder.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,31 +15,25 @@
*/
package net.sf.jabref.external;

import net.sf.jabref.BibtexEntry;

import java.net.URL;
import java.io.IOException;
import java.util.Optional;

/**
* This interface is used for classes that try to resolve a full-text PDF url from an article
* web page. Implementing classes should specialize on specific article sites.
* */
* This interface is used for classes that try to resolve a full-text PDF URL for a BibTeX entry.
* Implementing classes should specialize on specific article sites.
* See e.g. <a href="http://libguides.mit.edu/apis">http://libguides.mit.edu/apis</a>.
*/
public interface FullTextFinder {

/**
* Report whether this FullTextFinder works for the site providing the given URL.
*
* @param url The url to check.
* @return true if the site is supported, false otherwise. If the site might be supported,
* it is best to return true.
*/
boolean supportsSite(URL url);

/**
* Take the source HTML for an article page, and try to find the URL to the
* full text for this article.
* Tries to find a fulltext URL for a given BibTex entry.
*
* @param url The URL to the article's web page.
* @return The fulltext PDF URL, if found, or null if not found.
* @param entry The Bibtex entry
* @return An Optional containing the fulltext PDF URL if found, or an empty Optional if not found.
* @throws NullPointerException if no BibTex entry is given
* @throws java.io.IOException
*/
URL findFullTextURL(URL url) throws IOException;
Optional<URL> findFullText(BibtexEntry entry) throws IOException;
}
Loading

0 comments on commit 2e93572

Please sign in to comment.