Skip to content

Commit

Permalink
Add Path-accepting Jsoup methods
Browse files Browse the repository at this point in the history
Also improve file parsing using SeekableByteChannel
  • Loading branch information
Isira-Seneviratne committed Nov 19, 2023
1 parent 6307b94 commit 2ce7b46
Show file tree
Hide file tree
Showing 3 changed files with 115 additions and 12 deletions.
3 changes: 3 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@
<version>2.3.3_r2</version>
</signature>
<ignores>
<ignore>java.io.File</ignore> <!-- File#toPath() -->
<ignore>java.nio.file.*</ignore>
<ignore>java.nio.channels.SeekableByteChannel</ignore>
<ignore>java.util.function.Consumer</ignore>
<ignore>java.util.function.Supplier</ignore>
<ignore>java.lang.ThreadLocal</ignore>
Expand Down
67 changes: 67 additions & 0 deletions src/main/java/org/jsoup/Jsoup.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Path;

/**
The core public access point to the jsoup functionality.
Expand Down Expand Up @@ -183,6 +184,72 @@ public static Document parse(File file, @Nullable String charsetName, String bas
return DataUtil.load(file, charsetName, baseUri, parser);
}

/**
 Load a file and parse its contents as HTML.
 @param path the file to read HTML from. Gzipped files (ending in .z or .gz) are supported.
 @param charsetName (optional) character set of the file contents. Pass {@code null} to determine it from an {@code http-equiv} meta tag, if
 present, or fall back to {@code UTF-8} (which is often safe to do).
 @param baseUri the URL where the HTML was retrieved from, used to resolve relative links against.
 @return sane HTML
 @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
 @since 1.17.1
 */
public static Document parse(Path path, @Nullable String charsetName, String baseUri) throws IOException {
    final Document parsed = DataUtil.load(path, charsetName, baseUri);
    return parsed;
}

/**
 Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
 @param path file to load HTML from. Supports gzipped files (ending in .z or .gz).
 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
 present, or fall back to {@code UTF-8} (which is often safe to do).
 @return sane HTML
 @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
 @see #parse(Path, String, String) parse(path, charset, baseUri)
 @since 1.17.1
 */
public static Document parse(Path path, @Nullable String charsetName) throws IOException {
    // the absolute form of the path doubles as the base URI
    return DataUtil.load(path, charsetName, path.toAbsolutePath().toString());
}

/**
 Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
 The charset used to read the file will be determined by the byte-order-mark (BOM), or a {@code <meta charset>} tag,
 or if neither is present, will be {@code UTF-8}.
 <p>This is the equivalent of calling {@link #parse(Path, String) parse(path, null)}</p>
 @param path the file to load HTML from. Supports gzipped files (ending in .z or .gz).
 @return sane HTML
 @throws IOException if the file could not be found or read.
 @see #parse(Path, String, String) parse(path, charset, baseUri)
 @since 1.17.1
 */
public static Document parse(Path path) throws IOException {
    // null charset: leave detection to the loader; the absolute form of the path doubles as the base URI
    return DataUtil.load(path, null, path.toAbsolutePath().toString());
}

/**
 Load a file and parse its contents with the supplied parser.
 @param path the file to read HTML from. Gzipped files (ending in .z or .gz) are supported.
 @param charsetName (optional) character set of the file contents. Pass {@code null} to determine it from an {@code http-equiv} meta tag, if
 present, or fall back to {@code UTF-8} (which is often safe to do).
 @param baseUri the URL where the HTML was retrieved from, used to resolve relative links against.
 @param parser alternate {@link Parser#xmlParser() parser} to use.
 @return sane HTML
 @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
 @since 1.17.1
 */
public static Document parse(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
    final Document parsed = DataUtil.load(path, charsetName, baseUri, parser);
    return parsed;
}

/**
Read an input stream, and parse it to a Document.
Expand Down
57 changes: 45 additions & 12 deletions src/main/java/org/jsoup/helper/DataUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import java.io.BufferedReader;
import java.io.CharArrayReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
Expand All @@ -24,8 +23,12 @@
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.Channels;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;
import java.util.Random;
import java.util.regex.Matcher;
Expand Down Expand Up @@ -61,7 +64,7 @@ private DataUtil() {}
* @throws IOException on IO error
*/
public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
    // File-based loading is routed through the Path overload, which parses with the HTML parser.
    return load(file.toPath(), charsetName, baseUri);
}

/**
Expand All @@ -79,18 +82,48 @@ public static Document load(File file, @Nullable String charsetName, String base
* @since 1.14.2
*/
public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
InputStream stream = new FileInputStream(file);
String name = Normalizer.lowerCase(file.getName());
if (name.endsWith(".gz") || name.endsWith(".z")) {
// unfortunately file input streams don't support marks (why not?), so we will close and reopen after read
boolean zipped;
try {
zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
} finally {
stream.close();
return load(file.toPath(), charsetName, baseUri, parser);
}

/**
 * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
 * are supported in addition to uncompressed files.
 *
 * @param path file to load
 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
 *     the file will always override this setting.
 * @param baseUri base URI of document, to resolve relative links against
 * @return Document
 * @throws IOException on IO error
 * @since 1.17.1
 */
public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException {
    return load(path, charsetName, baseUri, Parser.htmlParser());
}

/**
 * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
 * are supported in addition to uncompressed files.
 * <p>A file with a gzip extension is only decompressed when it actually starts with the gzip magic bytes; otherwise it
 * is read as-is.</p>
 *
 * @param path file to load
 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
 *     the file will always override this setting.
 * @param baseUri base URI of document, to resolve relative links against
 * @param parser alternate {@link Parser#xmlParser() parser} to use.
 * @return Document
 * @throws IOException on IO error
 * @since 1.17.1
 */
public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
    final SeekableByteChannel byteChannel = Files.newByteChannel(path);
    InputStream stream = Channels.newInputStream(byteChannel);
    String name = Normalizer.lowerCase(path.getFileName().toString());
    if (name.endsWith(".gz") || name.endsWith(".z")) {
        try {
            // the extension alone is not trusted: sniff the two gzip magic bytes
            final boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b);
            byteChannel.position(0); // rewind, so the file is consumed from its first byte again
            if (zipped) {
                stream = new GZIPInputStream(stream);
            }
        } catch (IOException e) {
            // sniffing, rewinding, or reading the gzip header failed: close the stream (and with it the
            // underlying channel) before propagating, so the file handle is not leaked
            stream.close();
            throw e;
        }
    }
    return parseInputStream(stream, charsetName, baseUri, parser);
}
Expand Down

0 comments on commit 2ce7b46

Please sign in to comment.