Rework fulltext crawlers and first prototype

JabRef · Aug 17, 2015 · 2e93572 · 2e93572
1 parent 2d630e9
commit 2e93572
Show file tree

Hide file tree

Showing 15 changed files with 222 additions and 298 deletions.
diff --git a/build.gradle b/build.gradle
@@ -77,6 +77,7 @@ dependencies {
     compile 'junit:junit:4.12'
 
     compile 'org.jsoup:jsoup:1.8.3'
+    compile 'com.mashape.unirest:unirest-java:1.4.6'
 }
 
 sourceSets {

diff --git a/src/main/java/net/sf/jabref/BasePanel.java b/src/main/java/net/sf/jabref/BasePanel.java
@@ -404,9 +404,6 @@ public void action() {
             */
         });
 
-        actions.put("test",// new AccessLinksForEntries.SaveWithLinkedFiles(this));
-                new FindFullTextAction(this));
-
         // The action for saving a database.
         actions.put("save", saveAction);
 
@@ -1735,7 +1732,7 @@ public void action() throws Throwable {
         actions.put("removeFromGroup", new GroupAddRemoveDialog(this, false, false));
         actions.put("moveToGroup", new GroupAddRemoveDialog(this, true, true));
 
-        //actions.put("downloadFullText", new FindFullTextAction(this));
+        actions.put("downloadFullText", new FindFullTextAction(this));
     }
 
     /**

diff --git a/src/main/java/net/sf/jabref/BibtexEntryType.java b/src/main/java/net/sf/jabref/BibtexEntryType.java
@@ -106,7 +106,8 @@ public boolean isVisibleAtNewEntryDialog() {
 
     static {
 			// Put the standard entry types into the type map.
-			if (!Globals.prefs.getBoolean(JabRefPreferences.BIBLATEX_MODE)) {
+            // FIXME: throws NullPointerException when using BibtexEntry without a JabRef instance (e.g. in tests)
+			//if (!Globals.prefs.getBoolean(JabRefPreferences.BIBLATEX_MODE)) {
 				ALL_TYPES.put("article", BibtexEntryTypes.ARTICLE);
 				ALL_TYPES.put("inbook", BibtexEntryTypes.INBOOK);
 				ALL_TYPES.put("book", BibtexEntryTypes.BOOK);
@@ -127,7 +128,7 @@ public boolean isVisibleAtNewEntryDialog() {
 				ALL_TYPES.put("misc", BibtexEntryTypes.MISC);
 				ALL_TYPES.put("other", BibtexEntryTypes.OTHER);
 				ALL_TYPES.put("ieeetranbstctl", BibtexEntryTypes.IEEETRANBSTCTL);
-			} else {
+			/*} else {
 				ALL_TYPES.put("article", BibLatexEntryTypes.ARTICLE);
 				ALL_TYPES.put("book", BibLatexEntryTypes.BOOK);
 				ALL_TYPES.put("inbook", BibLatexEntryTypes.INBOOK);
@@ -158,7 +159,7 @@ public boolean isVisibleAtNewEntryDialog() {
 				ALL_TYPES.put("techreport", BibLatexEntryTypes.TECHREPORT);
 				ALL_TYPES.put("www", BibLatexEntryTypes.WWW);
 				ALL_TYPES.put("ieeetranbstctl", BibLatexEntryTypes.IEEETRANBSTCTL);
-			}
+			}*/
 		// We need a record of the standard types, in case the user wants
 		// to remove a customized version. Therefore we clone the map.
 		STANDARD_TYPES = new TreeMap<String, BibtexEntryType>(ALL_TYPES);	

diff --git a/src/main/java/net/sf/jabref/JabRefFrame.java b/src/main/java/net/sf/jabref/JabRefFrame.java
@@ -427,9 +427,9 @@ void addAction(Action a) {
                     GUIGlobals.getIconUrl("mergeentries"));
 
     private final AbstractAction dbImport = new DbImportAction(this).getAction();
-    private final AbstractAction//downloadFullText = new GeneralAction("downloadFullText", "Look up full text document",
-            //        Globals.lang("Follow DOI or URL link and try to locate PDF full text document")),
-            increaseFontSize = new IncreaseTableFontSizeAction();
+    private final AbstractAction downloadFullText = new GeneralAction("downloadFullText", "Look up full text document",
+            Localization.lang("Follow DOI or URL link and try to locate PDF full text document"));
+            private final AbstractAction increaseFontSize = new IncreaseTableFontSizeAction();
     private final AbstractAction decreseFontSize = new DecreaseTableFontSizeAction();
     private final AbstractAction resolveDuplicateKeys = new GeneralAction("resolveDuplicateKeys", "Resolve duplicate BibTeX keys",
                     Localization.lang("Find and remove duplicate BibTeX keys"),
@@ -1457,7 +1457,7 @@ private void fillMenu() {
         tools.add(makeKeyAction);
         tools.add(Cleanup);
         tools.add(mergeEntries);
-        //tools.add(downloadFullText);
+        tools.add(downloadFullText);
         tools.add(newSubDatabaseAction);
         tools.add(writeXmpAction);
         OpenOfficePanel otp = OpenOfficePanel.getInstance();

diff --git a/src/main/java/net/sf/jabref/external/FindFullText.java b/src/main/java/net/sf/jabref/external/FindFullText.java
@@ -26,9 +26,11 @@
 import java.net.URLConnection;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Optional;
 
 import net.sf.jabref.BibtexEntry;
 import net.sf.jabref.logic.crawler.ACS;
+import net.sf.jabref.logic.crawler.GoogleScholar;
 import net.sf.jabref.logic.crawler.ScienceDirect;
 import net.sf.jabref.logic.crawler.SpringerLink;
 import net.sf.jabref.util.DOI;
@@ -39,103 +41,61 @@
  */
 public class FindFullText {
 
-    private static final int
-            FOUND_PDF = 0;
+    private static final int FOUND_PDF = 0;
     public static final int WRONG_MIME_TYPE = 1;
-    public static final int UNKNOWN_DOMAIN = 2;
-    public static final int LINK_NOT_FOUND = 3;
-    public static final int IO_EXCEPTION = 4;
-    public static final int NO_URLS_DEFINED = 5;
+    public static final int LINK_NOT_FOUND = 2;
+    public static final int IO_EXCEPTION = 3;
 
     private final List<FullTextFinder> finders = new ArrayList<FullTextFinder>();
 
 
     public FindFullText() {
+        // Ordering is important, authorities first!
+        // Publisher
         finders.add(new ScienceDirect());
         finders.add(new SpringerLink());
         finders.add(new ACS());
+        // Meta search
+        finders.add(new GoogleScholar());
     }
 
     public FindResult findFullText(BibtexEntry entry) {
-        String urlText = entry.getField("url");
-        String doiText = entry.getField("doi");
-        // First try the Doi link, if defined:
-        if (doiText != null && !doiText.trim().isEmpty()) {
-            FindResult resDoi = lookForFullTextAtURL(new DOI(doiText).getURL());
-            if (resDoi.status == FindFullText.FOUND_PDF) {
-                return resDoi;
-            } else if (urlText != null && !urlText.trim().isEmpty()) {
-                FindResult resUrl = lookForFullTextAtURL(urlText);
-                if (resUrl.status == FindFullText.FOUND_PDF) {
-                    return resUrl;
-                } else {
-                    return resDoi; // If both URL and Doi fail, we assume that the error code for Doi is
-                                   // probably the most relevant.
-                }
-            } else {
-                return resDoi;
-            }
-        }
-        // No Doi? Try URL:
-        else if (urlText != null && !urlText.trim().isEmpty()) {
-            return lookForFullTextAtURL(urlText);
-        }
-        // No URL either? Return error code.
- else {
-            return new FindResult(FindFullText.NO_URLS_DEFINED, null);
-        }
-    }
-
-    private FindResult lookForFullTextAtURL(String urlText) {
-        try {
-            URL url = new URL(urlText);
-            url = resolveRedirects(url, 0);
-            boolean domainKnown = false;
-            for (FullTextFinder finder : finders) {
-                if (finder.supportsSite(url)) {
-                    domainKnown = true;
-                    URL result = finder.findFullTextURL(url);
-                    if (result != null) {
-
-                        // Check the MIME type of this URL to see if it is a PDF. If not,
-                        // it could be because the user doesn't have access:
-                        try {
-                            String mimeType = new URLDownload(result).determineMimeType();
-                            if (mimeType != null && mimeType.toLowerCase().equals("application/pdf")) {
-                                return new FindResult(result, url);
-                            }
-                            else {
-                                new URLDownload(result).downloadToFile(new File("page.html"));
-                                return new FindResult(FindFullText.WRONG_MIME_TYPE, url);
-                            }
-                        } catch (IOException ex) {
-                            ex.printStackTrace();
-                            return new FindResult(FindFullText.IO_EXCEPTION, url);
-                        }
+        for (FullTextFinder finder : finders) {
+            try {
+                Optional<URL> result = finder.findFullText(entry);
+
+                if (result.isPresent()) {
+                    // TODO: recheck this!
+                    // Check the MIME type of this URL to see if it is a PDF. If not,
+                    // it could be because the user doesn't have access:
+                    // FIXME: redirection breaks this!
+                    // Property-based software engineering measurement
+                    // http://drum.lib.umd.edu/bitstream/1903/19/2/CS-TR-3368.pdf
+                    // FIXME:
+                    // INFO: Fulltext PDF found @ Google: https://www.uni-bamberg.de/fileadmin/uni/fakultaeten/wiai_lehrstuehle/praktische_informatik/Dateien/Publikationen/sose14-towards-application-portability-in-paas.pdf
+                    // javax.net.ssl.SSLProtocolException: handshake alert:  unrecognized_name
+                    // http://stackoverflow.com/questions/7615645/ssl-handshake-alert-unrecognized-name-error-since-upgrade-to-java-1-7-0
+                    String mimeType = new URLDownload(result.get()).determineMimeType();
+                    if (mimeType != null && mimeType.toLowerCase().equals("application/pdf")) {
+                        return new FindResult(result.get(), result.get());
+                    } else {
+                        return new FindResult(WRONG_MIME_TYPE, result.get());
                     }
-
                 }
+            } catch (IOException ex) {
+                ex.printStackTrace();
+                return new FindResult(IO_EXCEPTION, null);
             }
-            if (!domainKnown) {
-                return new FindResult(FindFullText.UNKNOWN_DOMAIN, url);
-            } else {
-                return new FindResult(FindFullText.LINK_NOT_FOUND, url);
-            }
-        } catch (MalformedURLException e) {
-            e.printStackTrace();
-
-        } catch (IOException e) {
-            e.printStackTrace();
         }
-
-        return null;
+        return new FindResult(LINK_NOT_FOUND, null);
     }
 
     /**
      * Follow redirects until the final location is reached. This is necessary to handle Doi links, which
      * redirect to publishers' web sites. We need to know the publisher's domain name in order to choose
      * which FullTextFinder to use.
-     * @param url The url to start with.
+     *
+     * @param url           The url to start with.
      * @param redirectCount The number of previous redirects. We will follow a maximum of 5 redirects.
      * @return the final URL, or the initial one in case there is no redirect.
      * @throws IOException for connection error
@@ -188,8 +148,7 @@ public static String loadPage(URL url) throws IOException {
                     sb.append((char) c);
                 }
                 return sb.toString();
-            }
-            else {
+            } else {
                 return null; // TODO: are other types of connection (https?) relevant?
             }
         } finally {
@@ -231,16 +190,4 @@ public FindResult(int status, URL originalUrl) {
             }
         }
     }
-
-
-    public static void dumpToFile(String text, File f) {
-        try {
-            FileWriter fw = new FileWriter(f);
-            fw.write(text);
-            fw.close();
-        } catch (IOException e) {
-            e.printStackTrace();
-
-        }
-    }
 }
diff --git a/src/main/java/net/sf/jabref/external/FindFullTextAction.java b/src/main/java/net/sf/jabref/external/FindFullTextAction.java
@@ -52,9 +52,7 @@ public void run() {
 
     @Override
     public void update() {
-        //pdfURL = new URL("http://geog-www.sbs.ohio-state.edu/faculty/bmark/abbott_etal_ppp03.pdf");
         if (result.url != null) {
-            //System.out.println("PDF URL: "+result.url);
             String bibtexKey = entry.getCiteKey();
             String[] dirs = basePanel.metaData().getFileDirectory(GUIGlobals.FILE_FIELD);
             if (dirs.length == 0) {
@@ -69,7 +67,6 @@ public void update() {
 
                     @Override
                     public void downloadComplete(FileListEntry file) {
-                        System.out.println("finished");
                         FileListTableModel tm = new FileListTableModel();
                         String oldValue = entry.getField(GUIGlobals.FILE_FIELD);
                         tm.setContent(oldValue);
@@ -90,24 +87,16 @@ public void downloadComplete(FileListEntry file) {
         else {
             String message = null;
             switch (result.status) {
-            case FindFullText.UNKNOWN_DOMAIN:
-                message = Localization.lang("Unable to find full text article. No search algorithm "
-                        + "defined for the '%0' web site.", result.host);
-                break;
             case FindFullText.WRONG_MIME_TYPE:
                 message = Localization.lang("Found pdf link, but received the wrong MIME type. "
                         + "This could indicate that you don't have access to the fulltext article.");
                 break;
             case FindFullText.LINK_NOT_FOUND:
-                message = Localization.lang("Unable to find full text document in the linked web page.");
+                message = Localization.lang("Unable to find full text document.");
                 break;
             case FindFullText.IO_EXCEPTION:
                 message = Localization.lang("Connection error when trying to find full text document.");
                 break;
-            case FindFullText.NO_URLS_DEFINED:
-                message = Localization.lang("This entry provides no URL or DOI links.");
-                break;
-
             }
             basePanel.output(Localization.lang("Full text article download failed"));
             JOptionPane.showMessageDialog(basePanel.frame(), message, Localization.lang("Full text article download failed"),

diff --git a/src/main/java/net/sf/jabref/external/FullTextFinder.java b/src/main/java/net/sf/jabref/external/FullTextFinder.java
@@ -15,31 +15,25 @@
 */
 package net.sf.jabref.external;
 
+import net.sf.jabref.BibtexEntry;
+
 import java.net.URL;
 import java.io.IOException;
+import java.util.Optional;
 
 /**
- * This interface is used for classes that try to resolve a full-text PDF url from an article
- * web page. Implementing classes should specialize on specific article sites.
- *  */
+ * This interface is used for classes that try to resolve a full-text PDF URL for a BibTeX entry.
+ * Implementing classes should specialize on specific article sites.
+ * See e.g. <a href="http://libguides.mit.edu/apis">http://libguides.mit.edu/apis</a>.
+ */
 public interface FullTextFinder {
-
-    /**
-     * Report whether this FullTextFinder works for the site providing the given URL.
-     *
-     * @param url The url to check.
-     * @return true if the site is supported, false otherwise. If the site might be supported,
-     *   it is best to return true.
-     */
-    boolean supportsSite(URL url);
-
     /**
-     * Take the source HTML for an article page, and try to find the URL to the
-     * full text for this article.
+     * Tries to find a fulltext URL for a given BibTex entry.
      *
-     * @param url The URL to the article's web page.
-     * @return The fulltext PDF URL, if found, or null if not found.
+     * @param entry The Bibtex entry
+     * @return An Optional containing the fulltext PDF URL if found, or an empty Optional if not found.
+     * @throws NullPointerException if no BibTex entry is given
      * @throws java.io.IOException
      */
-    URL findFullTextURL(URL url) throws IOException;
+    Optional<URL> findFullText(BibtexEntry entry) throws IOException;
 }