diff --git a/CHANGES.md b/CHANGES.md index 7fbcb33472..9e221e908a 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,14 @@ # jsoup Changelog +## 1.18.1 (Pending) + +### Improvements + +* Added `Path` accepting parse methods: `Jsoup.parse(Path)`, `Jsoup.parse(path, charsetName, baseUri, parser)`, + etc. [2055](https://github.com/jhy/jsoup/pull/2055) + +--- + ## 1.17.2 (2023-Dec-29) ### Improvements diff --git a/pom.xml b/pom.xml index c0df9e0a78..b53736ca09 100644 --- a/pom.xml +++ b/pom.xml @@ -88,8 +88,12 @@ 2.3.3_r2 + java.io.File + java.nio.file.* + java.nio.channels.SeekableByteChannel java.util.function.* java.util.stream.* + java.lang.Throwable java.lang.ThreadLocal java.io.UncheckedIOException java.util.List diff --git a/src/main/java/org/jsoup/Jsoup.java b/src/main/java/org/jsoup/Jsoup.java index 29acbafbc4..e20311bcd8 100644 --- a/src/main/java/org/jsoup/Jsoup.java +++ b/src/main/java/org/jsoup/Jsoup.java @@ -13,6 +13,7 @@ import java.io.IOException; import java.io.InputStream; import java.net.URL; +import java.nio.file.Path; /** The core public access point to the jsoup functionality. @@ -183,6 +184,72 @@ public static Document parse(File file, @Nullable String charsetName, String bas return DataUtil.load(file, charsetName, baseUri, parser); } + /** + Parse the contents of a file as HTML. + + @param path file to load HTML from. Supports gzipped files (ending in .z or .gz). + @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if + present, or fall back to {@code UTF-8} (which is often safe to do). + @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. + @return sane HTML + + @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 
+ @since 1.18.1 + */ + public static Document parse(Path path, @Nullable String charsetName, String baseUri) throws IOException { + return DataUtil.load(path, charsetName, baseUri); + } + + /** + Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. + + @param path file to load HTML from. Supports gzipped files (ending in .z or .gz). + @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if + present, or fall back to {@code UTF-8} (which is often safe to do). + @return sane HTML + + @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + @see #parse(File, String, String) parse(file, charset, baseUri) + @since 1.18.1 + */ + public static Document parse(Path path, @Nullable String charsetName) throws IOException { + return DataUtil.load(path, charsetName, path.toAbsolutePath().toString()); + } + + /** + Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. + The charset used to read the file will be determined by the byte-order-mark (BOM), or a {@code } tag, + or if neither is present, will be {@code UTF-8}. + +

This is the equivalent of calling {@link #parse(Path, String) parse(path, null)}

+ + @param path the file to load HTML from. Supports gzipped files (ending in .z or .gz). + @return sane HTML + @throws IOException if the file could not be found or read. + @see #parse(Path, String, String) parse(file, charset, baseUri) + @since 1.18.1 + */ + public static Document parse(Path path) throws IOException { + return DataUtil.load(path, null, path.toAbsolutePath().toString()); + } + + /** + Parse the contents of a file as HTML. + + @param path file to load HTML from. Supports gzipped files (ending in .z or .gz). + @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if + present, or fall back to {@code UTF-8} (which is often safe to do). + @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. + @param parser alternate {@link Parser#xmlParser() parser} to use. + @return sane HTML + + @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + @since 1.18.1 + */ + public static Document parse(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { + return DataUtil.load(path, charsetName, baseUri, parser); + } + /** Read an input stream, and parse it to a Document. 
diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java index c1c791053c..58f44fb7c0 100644 --- a/src/main/java/org/jsoup/helper/DataUtil.java +++ b/src/main/java/org/jsoup/helper/DataUtil.java @@ -2,7 +2,6 @@ import org.jsoup.internal.ControllableInputStream; import org.jsoup.internal.Normalizer; -import org.jsoup.internal.SharedConstants; import org.jsoup.internal.StringUtil; import org.jsoup.nodes.Comment; import org.jsoup.nodes.Document; @@ -16,7 +15,6 @@ import java.io.BufferedReader; import java.io.CharArrayReader; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; @@ -25,8 +23,12 @@ import java.nio.Buffer; import java.nio.ByteBuffer; import java.nio.CharBuffer; +import java.nio.channels.Channels; +import java.nio.channels.SeekableByteChannel; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.Locale; import java.util.Random; import java.util.regex.Matcher; @@ -63,7 +65,7 @@ private DataUtil() {} * @throws IOException on IO error */ public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException { - return load(file, charsetName, baseUri, Parser.htmlParser()); + return load(file.toPath(), charsetName, baseUri); } /** @@ -81,18 +83,48 @@ public static Document load(File file, @Nullable String charsetName, String base * @since 1.14.2 */ public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { - InputStream stream = new FileInputStream(file); - String name = Normalizer.lowerCase(file.getName()); - if (name.endsWith(".gz") || name.endsWith(".z")) { - // unfortunately file input streams don't support marks (why not?), so we will close and reopen after read - boolean zipped; - try { - zipped = (stream.read() == 0x1f 
&& stream.read() == 0x8b); // gzip magic bytes - } finally { - stream.close(); + return load(file.toPath(), charsetName, baseUri, parser); + } + + /** + * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) + * are supported in addition to uncompressed files. + * + * @param path file to load + * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in + * the file will always override this setting. + * @param baseUri base URI of document, to resolve relative links against + * @return Document + * @throws IOException on IO error + */ + public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException { + return load(path, charsetName, baseUri, Parser.htmlParser()); + } + /** + * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) + * are supported in addition to uncompressed files. + * + * @param path file to load + * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in + * the file will always override this setting. + * @param baseUri base URI of document, to resolve relative links against + * @param parser alternate {@link Parser#xmlParser() parser} to use. 
+ + * @return Document + * @throws IOException on IO error + * @since 1.17.2 + */ + public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { + final SeekableByteChannel byteChannel = Files.newByteChannel(path); + InputStream stream = Channels.newInputStream(byteChannel); + String name = Normalizer.lowerCase(path.getFileName().toString()); + if (name.endsWith(".gz") || name.endsWith(".z")) { + final boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes + byteChannel.position(0); // reset to start of file + if (zipped) { + stream = new GZIPInputStream(stream); } - stream = zipped ? new GZIPInputStream(new FileInputStream(file)) : new FileInputStream(file); } return parseInputStream(stream, charsetName, baseUri, parser); } @@ -139,16 +171,15 @@ static void crossStreams(final InputStream in, final OutputStream out) throws IO static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { if (input == null) // empty body return new Document(baseUri); - input = ControllableInputStream.wrap(input, DefaultBufferSize, 0); @Nullable Document doc = null; // read the start of the stream and look for a BOM or meta charset - try { - input.mark(DefaultBufferSize); - ByteBuffer firstBytes = readToByteBuffer(input, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid. - boolean fullyRead = (input.read() == -1); - input.reset(); + try (InputStream wrappedInputStream = ControllableInputStream.wrap(input, DefaultBufferSize, 0)) { + wrappedInputStream.mark(DefaultBufferSize); + ByteBuffer firstBytes = readToByteBuffer(wrappedInputStream, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid. 
+ boolean fullyRead = (wrappedInputStream.read() == -1); + wrappedInputStream.reset(); // look for BOM - overrides any other header or input BomCharset bomCharset = detectCharsetFromBom(firstBytes); @@ -189,9 +220,8 @@ else if (first instanceof Comment) { if (comment.isXmlDeclaration()) decl = comment.asXmlDeclaration(); } - if (decl != null) { - if (decl.name().equalsIgnoreCase("xml")) - foundCharset = decl.attr("encoding"); + if (decl != null && decl.name().equalsIgnoreCase("xml")) { + foundCharset = decl.attr("encoding"); } } foundCharset = validateCharset(foundCharset); @@ -208,8 +238,7 @@ else if (first instanceof Comment) { if (doc == null) { if (charsetName == null) charsetName = defaultCharsetName; - BufferedReader reader = new BufferedReader(new InputStreamReader(input, Charset.forName(charsetName)), DefaultBufferSize); // Android level does not allow us try-with-resources - try { + try (BufferedReader reader = new BufferedReader(new InputStreamReader(wrappedInputStream, Charset.forName(charsetName)), DefaultBufferSize)) { if (bomCharset != null && bomCharset.offset) { // creating the buffered reader ignores the input pos, so must skip here long skipped = reader.skip(1); Validate.isTrue(skipped == 1); // WTF if this fails. 
@@ -227,14 +256,8 @@ else if (first instanceof Comment) { doc.charset(UTF_8); } } - finally { - reader.close(); - } } } - finally { - input.close(); - } return doc; } diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java index 10074d4ca9..61627aac20 100644 --- a/src/test/java/org/jsoup/helper/DataUtilTest.java +++ b/src/test/java/org/jsoup/helper/DataUtilTest.java @@ -11,8 +11,10 @@ import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.nio.file.Files; +import java.nio.file.Path; import static org.jsoup.integration.ParseTest.getFile; +import static org.jsoup.integration.ParseTest.getPath; import static org.junit.jupiter.api.Assertions.*; public class DataUtilTest { @@ -207,13 +209,21 @@ public void supportsXmlCharsetDeclaration() throws IOException { @Test - public void lLoadsGzipFile() throws IOException { + public void loadsGzipFile() throws IOException { File in = getFile("/htmltests/gzip.html.gz"); Document doc = Jsoup.parse(in, null); assertEquals("Gzip test", doc.title()); assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text()); } + @Test + public void loadsGzipPath() throws IOException { + Path in = getPath("/htmltests/gzip.html.gz"); + Document doc = Jsoup.parse(in, null); + assertEquals("Gzip test", doc.title()); + assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text()); + } + @Test public void loadsZGzipFile() throws IOException { // compressed on win, with z suffix @@ -223,6 +233,15 @@ public void loadsZGzipFile() throws IOException { assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text()); } + @Test + public void loadsZGzipPath() throws IOException { + // compressed on win, with z suffix + Path in = getPath("/htmltests/gzip.html.z"); + Document doc = Jsoup.parse(in, null); + assertEquals("Gzip test", doc.title()); + assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text()); + } + @Test public 
void handlesFakeGzipFile() throws IOException { File in = getFile("/htmltests/fake-gzip.html.gz"); @@ -231,6 +250,14 @@ public void handlesFakeGzipFile() throws IOException { assertEquals("And should still be readable.", doc.selectFirst("p").text()); } + @Test + public void handlesFakeGzipPath() throws IOException { + Path in = getPath("/htmltests/fake-gzip.html.gz"); + Document doc = Jsoup.parse(in, null); + assertEquals("This is not gzipped", doc.title()); + assertEquals("And should still be readable.", doc.selectFirst("p").text()); + } + // an input stream to give a range of output sizes, that changes on each read static class VaryingReadInputStream extends InputStream { final InputStream in; diff --git a/src/test/java/org/jsoup/integration/ParseTest.java b/src/test/java/org/jsoup/integration/ParseTest.java index 0c5cb2b15c..d84c103497 100644 --- a/src/test/java/org/jsoup/integration/ParseTest.java +++ b/src/test/java/org/jsoup/integration/ParseTest.java @@ -15,6 +15,8 @@ import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.zip.GZIPInputStream; import static org.junit.jupiter.api.Assertions.*; @@ -133,6 +135,15 @@ public static File getFile(String resourceName) { } } + public static Path getPath(String resourceName) { + try { + URL resource = ParseTest.class.getResource(resourceName); + return resource != null ? Paths.get(resource.toURI()) : Paths.get("/404"); + } catch (URISyntaxException e) { + throw new IllegalStateException(e); + } + } + public static InputStream inputStreamFrom(String s) { return new ByteArrayInputStream(s.getBytes(StandardCharsets.UTF_8)); }