Add Path-accepting Jsoup methods

Also improve file parsing using SeekableByteChannel
jhy · Nov 19, 2023 · 2ce7b46 · 2ce7b46
1 parent 6307b94
commit 2ce7b46
Show file tree

Hide file tree

Showing 3 changed files with 115 additions and 12 deletions.
diff --git a/pom.xml b/pom.xml
@@ -88,6 +88,9 @@
                 <version>2.3.3_r2</version>
               </signature>
               <ignores>
+                <ignore>java.io.File</ignore> <!-- File#toPath() -->
+                <ignore>java.nio.file.*</ignore>
+                <ignore>java.nio.channels.SeekableByteChannel</ignore>
                 <ignore>java.util.function.Consumer</ignore>
                 <ignore>java.util.function.Supplier</ignore>
                 <ignore>java.lang.ThreadLocal</ignore>

diff --git a/src/main/java/org/jsoup/Jsoup.java b/src/main/java/org/jsoup/Jsoup.java
@@ -13,6 +13,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.nio.file.Path;
 
 /**
  The core public access point to the jsoup functionality.
@@ -183,6 +184,72 @@ public static Document parse(File file, @Nullable String charsetName, String bas
         return DataUtil.load(file, charsetName, baseUri, parser);
     }
 
+    /**
+     Parse the contents of a file as HTML.
+
+     @param path          file to load HTML from. Supports gzipped files (ending in .z or .gz).
+     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
+     present, or fall back to {@code UTF-8} (which is often safe to do).
+     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
+     @return sane HTML
+
+     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+     @since 1.17.1
+     */
+    public static Document parse(Path path, @Nullable String charsetName, String baseUri) throws IOException {
+        return DataUtil.load(path, charsetName, baseUri);
+    }
+
+    /**
+     Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
+
+     @param path        file to load HTML from. Supports gzipped files (ending in .z or .gz).
+     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
+     present, or fall back to {@code UTF-8} (which is often safe to do).
+     @return sane HTML
+
+     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+     @see #parse(Path, String, String) parse(path, charset, baseUri)
+     @since 1.17.1
+     */
+    public static Document parse(Path path, @Nullable String charsetName) throws IOException {
+        return DataUtil.load(path, charsetName, path.toAbsolutePath().toString());
+    }
+
+    /**
+     Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
+     The charset used to read the file will be determined by the byte-order-mark (BOM), or a {@code <meta charset>} tag,
+     or if neither is present, will be {@code UTF-8}.
+
+     <p>This is the equivalent of calling {@link #parse(Path, String) parse(path, null)}</p>
+
+     @param path the file to load HTML from. Supports gzipped files (ending in .z or .gz).
+     @return sane HTML
+     @throws IOException if the file could not be found or read.
+     @see #parse(Path, String, String) parse(path, charset, baseUri)
+     @since 1.17.1
+     */
+    public static Document parse(Path path) throws IOException {
+        return DataUtil.load(path, null, path.toAbsolutePath().toString());
+    }
+
+    /**
+     Parse the contents of a file as HTML.
+
+     @param path          file to load HTML from. Supports gzipped files (ending in .z or .gz).
+     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
+     present, or fall back to {@code UTF-8} (which is often safe to do).
+     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
+     @param parser alternate {@link Parser#xmlParser() parser} to use.
+     @return sane HTML
+
+     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+     @since 1.17.1
+     */
+    public static Document parse(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
+        return DataUtil.load(path, charsetName, baseUri, parser);
+    }
+
      /**
      Read an input stream, and parse it to a Document.
 

diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java
@@ -15,7 +15,6 @@
 import java.io.BufferedReader;
 import java.io.CharArrayReader;
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
@@ -24,8 +23,12 @@
 import java.nio.Buffer;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
+import java.nio.channels.Channels;
+import java.nio.channels.SeekableByteChannel;
 import java.nio.charset.Charset;
 import java.nio.charset.IllegalCharsetNameException;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.Locale;
 import java.util.Random;
 import java.util.regex.Matcher;
@@ -61,7 +64,7 @@ private DataUtil() {}
      * @throws IOException on IO error
      */
     public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
-        return load(file, charsetName, baseUri, Parser.htmlParser());
+        return load(file.toPath(), charsetName, baseUri);
     }
 
     /**
@@ -79,18 +82,48 @@ public static Document load(File file, @Nullable String charsetName, String base
      * @since 1.14.2
      */
     public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
-        InputStream stream = new FileInputStream(file);
-        String name = Normalizer.lowerCase(file.getName());
-        if (name.endsWith(".gz") || name.endsWith(".z")) {
-            // unfortunately file input streams don't support marks (why not?), so we will close and reopen after read
-            boolean zipped;
-            try {
-                zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
-            } finally {
-                stream.close();
+        return load(file.toPath(), charsetName, baseUri, parser);
+    }
+
+    /**
+     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
+     * are supported in addition to uncompressed files.
+     *
+     * @param path file to load
+     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
+     *     the file will always override this setting.
+     * @param baseUri base URI of document, to resolve relative links against
+     * @return Document
+     * @throws IOException on IO error
+     */
+    public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException {
+        return load(path, charsetName, baseUri, Parser.htmlParser());
+    }
 
+    /**
+     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
+     * are supported in addition to uncompressed files.
+     *
+     * @param path file to load
+     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
+     *     the file will always override this setting.
+     * @param baseUri base URI of document, to resolve relative links against
+     * @param parser alternate {@link Parser#xmlParser() parser} to use.
+
+     * @return Document
+     * @throws IOException on IO error
+     * @since 1.17.1
+     */
+    public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
+        final SeekableByteChannel byteChannel = Files.newByteChannel(path);
+        InputStream stream = Channels.newInputStream(byteChannel);
+        String name = Normalizer.lowerCase(path.getFileName().toString());
+        if (name.endsWith(".gz") || name.endsWith(".z")) {
+            final boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
+            byteChannel.position(0); // reset to start of file
+            if (zipped) {
+                stream = new GZIPInputStream(stream);
             }
-            stream = zipped ? new GZIPInputStream(new FileInputStream(file)) : new FileInputStream(file);
         }
         return parseInputStream(stream, charsetName, baseUri, parser);
     }