jhy · jhy · Dec 30, 2023 · Jul 29, 2023 · Dec 7, 2023 · Dec 28, 2023
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,5 +1,14 @@
 # jsoup Changelog
 
+## 1.18.1 (Pending)
+
+### Improvements
+
+* Added `Path`-accepting parse methods: `Jsoup.parse(Path)`, `Jsoup.parse(path, charsetName, baseUri, parser)`,
+  etc. [2055](https://github.com/jhy/jsoup/pull/2055)
+
+---
+
 ## 1.17.2 (2023-Dec-29)
 
 ### Improvements

diff --git a/pom.xml b/pom.xml
@@ -88,8 +88,12 @@
                 <version>2.3.3_r2</version>
               </signature>
               <ignores>
+                <ignore>java.io.File</ignore> <!-- File#toPath() -->
+                <ignore>java.nio.file.*</ignore>
+                <ignore>java.nio.channels.SeekableByteChannel</ignore>
                 <ignore>java.util.function.*</ignore>
                 <ignore>java.util.stream.*</ignore>
+                <ignore>java.lang.Throwable</ignore> <!-- Throwable#addSuppressed(Throwable) -->
                 <ignore>java.lang.ThreadLocal</ignore>
                 <ignore>java.io.UncheckedIOException</ignore>
                 <ignore>java.util.List</ignore> <!-- List#stream() -->

diff --git a/src/main/java/org/jsoup/Jsoup.java b/src/main/java/org/jsoup/Jsoup.java
@@ -13,6 +13,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.nio.file.Path;
 
 /**
  The core public access point to the jsoup functionality.
@@ -183,6 +184,72 @@ public static Document parse(File file, @Nullable String charsetName, String bas
         return DataUtil.load(file, charsetName, baseUri, parser);
     }
 
+    /**
+     Parse the contents of a file as HTML.
+
+     @param path        file to load HTML from. Supports gzipped files (ending in .z or .gz).
+     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
+     present, or fall back to {@code UTF-8} (which is often safe to do).
+     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
+     @return sane HTML
+
+     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+     @since 1.18.1
+     */
+    public static Document parse(Path path, @Nullable String charsetName, String baseUri) throws IOException {
+        return DataUtil.load(path, charsetName, baseUri);
+    }
+
+    /**
+     Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
+
+     @param path        file to load HTML from. Supports gzipped files (ending in .z or .gz).
+     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
+     present, or fall back to {@code UTF-8} (which is often safe to do).
+     @return sane HTML
+
+     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+     @see #parse(Path, String, String) parse(path, charset, baseUri)
+     @since 1.18.1
+     */
+    public static Document parse(Path path, @Nullable String charsetName) throws IOException {
+        return DataUtil.load(path, charsetName, path.toAbsolutePath().toString());
+    }
+
+    /**
+     Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
+     The charset used to read the file will be determined by the byte-order-mark (BOM), or a {@code <meta charset>} tag,
+     or if neither is present, will be {@code UTF-8}.
+
+     <p>This is the equivalent of calling {@link #parse(Path, String) parse(path, null)}</p>
+
+     @param path the file to load HTML from. Supports gzipped files (ending in .z or .gz).
+     @return sane HTML
+     @throws IOException if the file could not be found or read.
+     @see #parse(Path, String, String) parse(path, charset, baseUri)
+     @since 1.18.1
+     */
+    public static Document parse(Path path) throws IOException {
+        return DataUtil.load(path, null, path.toAbsolutePath().toString());
+    }
+
+    /**
+     Parse the contents of a file as HTML.
+
+     @param path        file to load HTML from. Supports gzipped files (ending in .z or .gz).
+     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
+     present, or fall back to {@code UTF-8} (which is often safe to do).
+     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
+     @param parser alternate {@link Parser#xmlParser() parser} to use.
+     @return sane HTML
+
+     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+     @since 1.18.1
+     */
+    public static Document parse(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
+        return DataUtil.load(path, charsetName, baseUri, parser);
+    }
+
      /**
      Read an input stream, and parse it to a Document.
 

diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java
@@ -2,7 +2,6 @@
 
 import org.jsoup.internal.ControllableInputStream;
 import org.jsoup.internal.Normalizer;
-import org.jsoup.internal.SharedConstants;
 import org.jsoup.internal.StringUtil;
 import org.jsoup.nodes.Comment;
 import org.jsoup.nodes.Document;
@@ -16,7 +15,6 @@
 import java.io.BufferedReader;
 import java.io.CharArrayReader;
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
@@ -25,8 +23,12 @@
 import java.nio.Buffer;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
+import java.nio.channels.Channels;
+import java.nio.channels.SeekableByteChannel;
 import java.nio.charset.Charset;
 import java.nio.charset.IllegalCharsetNameException;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.Locale;
 import java.util.Random;
 import java.util.regex.Matcher;
@@ -63,7 +65,7 @@ private DataUtil() {}
      * @throws IOException on IO error
      */
     public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
-        return load(file, charsetName, baseUri, Parser.htmlParser());
+        return load(file.toPath(), charsetName, baseUri);
     }
 
     /**
@@ -81,18 +83,48 @@ public static Document load(File file, @Nullable String charsetName, String base
      * @since 1.14.2
      */
     public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
-        InputStream stream = new FileInputStream(file);
-        String name = Normalizer.lowerCase(file.getName());
-        if (name.endsWith(".gz") || name.endsWith(".z")) {
-            // unfortunately file input streams don't support marks (why not?), so we will close and reopen after read
-            boolean zipped;
-            try {
-                zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
-            } finally {
-                stream.close();
+        return load(file.toPath(), charsetName, baseUri, parser);
+    }
+
+    /**
+     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
+     * are supported in addition to uncompressed files.
+     *
+     * @param path file to load
+     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
+     *     the file will always override this setting.
+     * @param baseUri base URI of document, to resolve relative links against
+     * @return Document
+     * @throws IOException on IO error
+     */
+    public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException {
+        return load(path, charsetName, baseUri, Parser.htmlParser());
+    }
 
+    /**
+     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
+     * are supported in addition to uncompressed files.
+     *
+     * @param path file to load
+     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
+     *     the file will always override this setting.
+     * @param baseUri base URI of document, to resolve relative links against
+     * @param parser alternate {@link Parser#xmlParser() parser} to use.
+     *
+     * @return Document
+     * @throws IOException on IO error
+     * @since 1.18.1
+     */
+    public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
+        final SeekableByteChannel byteChannel = Files.newByteChannel(path);
+        InputStream stream = Channels.newInputStream(byteChannel);
+        String name = Normalizer.lowerCase(path.getFileName().toString());
+        if (name.endsWith(".gz") || name.endsWith(".z")) {
+            final boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
+            byteChannel.position(0); // reset to start of file
+            if (zipped) {
+                stream = new GZIPInputStream(stream);
             }
-            stream = zipped ? new GZIPInputStream(new FileInputStream(file)) : new FileInputStream(file);
         }
         return parseInputStream(stream, charsetName, baseUri, parser);
     }
@@ -139,16 +171,15 @@ static void crossStreams(final InputStream in, final OutputStream out) throws IO
     static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException  {
         if (input == null) // empty body
             return new Document(baseUri);
-        input = ControllableInputStream.wrap(input, DefaultBufferSize, 0);
 
         @Nullable Document doc = null;
 
         // read the start of the stream and look for a BOM or meta charset
-        try {
-            input.mark(DefaultBufferSize);
-            ByteBuffer firstBytes = readToByteBuffer(input, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid.
-            boolean fullyRead = (input.read() == -1);
-            input.reset();
+        try (InputStream wrappedInputStream = ControllableInputStream.wrap(input, DefaultBufferSize, 0)) {
+            wrappedInputStream.mark(DefaultBufferSize);
+            ByteBuffer firstBytes = readToByteBuffer(wrappedInputStream, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid.
+            boolean fullyRead = (wrappedInputStream.read() == -1);
+            wrappedInputStream.reset();
 
             // look for BOM - overrides any other header or input
             BomCharset bomCharset = detectCharsetFromBom(firstBytes);
@@ -189,9 +220,8 @@ else if (first instanceof Comment) {
                         if (comment.isXmlDeclaration())
                             decl = comment.asXmlDeclaration();
                     }
-                    if (decl != null) {
-                        if (decl.name().equalsIgnoreCase("xml"))
-                            foundCharset = decl.attr("encoding");
+                    if (decl != null && decl.name().equalsIgnoreCase("xml")) {
+                        foundCharset = decl.attr("encoding");
                     }
                 }
                 foundCharset = validateCharset(foundCharset);
@@ -208,8 +238,7 @@ else if (first instanceof Comment) {
             if (doc == null) {
                 if (charsetName == null)
                     charsetName = defaultCharsetName;
-                BufferedReader reader = new BufferedReader(new InputStreamReader(input, Charset.forName(charsetName)), DefaultBufferSize); // Android level does not allow us try-with-resources
-                try {
+                try (BufferedReader reader = new BufferedReader(new InputStreamReader(wrappedInputStream, Charset.forName(charsetName)), DefaultBufferSize)) {
                     if (bomCharset != null && bomCharset.offset) { // creating the buffered reader ignores the input pos, so must skip here
                         long skipped = reader.skip(1);
                         Validate.isTrue(skipped == 1); // WTF if this fails.
@@ -227,14 +256,8 @@ else if (first instanceof Comment) {
                         doc.charset(UTF_8);
                     }
                 }
-                finally {
-                    reader.close();
-                }
             }
         }
-        finally {
-            input.close();
-        }
         return doc;
     }
 

diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java
@@ -11,8 +11,10 @@
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
+import java.nio.file.Path;
 
 import static org.jsoup.integration.ParseTest.getFile;
+import static org.jsoup.integration.ParseTest.getPath;
 import static org.junit.jupiter.api.Assertions.*;
 
 public class DataUtilTest {
@@ -207,13 +209,21 @@ public void supportsXmlCharsetDeclaration() throws IOException {
 
 
     @Test
-    public void lLoadsGzipFile() throws IOException {
+    public void loadsGzipFile() throws IOException {
         File in = getFile("/htmltests/gzip.html.gz");
         Document doc = Jsoup.parse(in, null);
         assertEquals("Gzip test", doc.title());
         assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text());
     }
 
+    @Test
+    public void loadsGzipPath() throws IOException {
+        Path in = getPath("/htmltests/gzip.html.gz");
+        Document doc = Jsoup.parse(in, null);
+        assertEquals("Gzip test", doc.title());
+        assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text());
+    }
+
     @Test
     public void loadsZGzipFile() throws IOException {
         // compressed on win, with z suffix
@@ -223,6 +233,15 @@ public void loadsZGzipFile() throws IOException {
         assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text());
     }
 
+    @Test
+    public void loadsZGzipPath() throws IOException {
+        // compressed on win, with z suffix
+        Path in = getPath("/htmltests/gzip.html.z");
+        Document doc = Jsoup.parse(in, null);
+        assertEquals("Gzip test", doc.title());
+        assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text());
+    }
+
     @Test
     public void handlesFakeGzipFile() throws IOException {
         File in = getFile("/htmltests/fake-gzip.html.gz");
@@ -231,6 +250,14 @@ public void handlesFakeGzipFile() throws IOException {
         assertEquals("And should still be readable.", doc.selectFirst("p").text());
     }
 
+    @Test
+    public void handlesFakeGzipPath() throws IOException {
+        Path in = getPath("/htmltests/fake-gzip.html.gz");
+        Document doc = Jsoup.parse(in, null);
+        assertEquals("This is not gzipped", doc.title());
+        assertEquals("And should still be readable.", doc.selectFirst("p").text());
+    }
+
     // an input stream to give a range of output sizes, that changes on each read
     static class VaryingReadInputStream extends InputStream {
         final InputStream in;

diff --git a/src/test/java/org/jsoup/integration/ParseTest.java b/src/test/java/org/jsoup/integration/ParseTest.java
@@ -15,6 +15,8 @@
 import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.zip.GZIPInputStream;
 
 import static org.junit.jupiter.api.Assertions.*;
@@ -133,6 +135,15 @@ public static File getFile(String resourceName) {
         }
     }
 
+    public static Path getPath(String resourceName) {
+        try {
+            URL resource = ParseTest.class.getResource(resourceName);
+            return resource != null ? Paths.get(resource.toURI()) : Paths.get("/404");
+        } catch (URISyntaxException e) {
+            throw new IllegalStateException(e);
+        }
+    }
+
     public static InputStream inputStreamFrom(String s) {
         return new ByteArrayInputStream(s.getBytes(StandardCharsets.UTF_8));
     }