Skip to content

Commit

Permalink
feat: add GzipEncoder and ZlibEncoder, make compress use encoders
Browse files Browse the repository at this point in the history
This provides a much more flexible and powerful API even in no-std
environments, bringing the gzip and Zlib formats to parity with raw
DEFLATE, which already offered this via `DeflateEncoder`. The new encoder
writers are just wrappers around `DeflateEncoder` that write header and
trailer data before and after the DEFLATE stream, respectively.

While we are at it, the `deflate` function is removed in favor of factory
methods that return a `DeflateEncoder` wrapped in a `BufWriter`.
  • Loading branch information
AlexTMjugador committed Sep 27, 2023
1 parent d5fcfcc commit 90cfdd4
Show file tree
Hide file tree
Showing 7 changed files with 246 additions and 169 deletions.
8 changes: 4 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,16 @@ proptest-derive = "0.4.0"
miniz_oxide = "0.7.1"

[features]
default = ["gzip", "zlib"]
gzip = ["std", "dep:crc32fast"]
zlib = ["std", "dep:simd-adler32"]
default = ["std", "gzip", "zlib"]
gzip = ["dep:crc32fast"]
zlib = ["dep:simd-adler32"]

std = ["crc32fast?/std", "simd-adler32?/std"]
nightly = ["crc32fast?/nightly", "simd-adler32?/nightly"]

[[bin]]
name = "zopfli"
required-features = ["gzip", "zlib"]
required-features = ["std", "gzip", "zlib"]

[profile.release]
debug = true
Expand Down
51 changes: 17 additions & 34 deletions src/deflate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@ use crate::{
/// backreference window. As a consequence, frequent short writes may cause more
/// DEFLATE blocks to be emitted with less optimal Huffman trees, which can hurt
/// compression and runtime. If they are a concern, short writes can be conveniently
/// dealt with by wrapping this encoder with a [`BufWriter`](std::io::BufWriter). An
/// adequate write size would be >32 KiB, which allows the second complete chunk to
/// leverage a full-sized backreference window.
/// dealt with by wrapping this encoder with a [`BufWriter`](std::io::BufWriter), as done
/// by the [`new_buffered`](DeflateEncoder::new_buffered) method. An adequate write size
/// would be >32 KiB, which allows the second complete chunk to leverage a full-sized
/// backreference window.
pub struct DeflateEncoder<W: Write> {
options: Options,
btype: BlockType,
Expand All @@ -54,6 +55,18 @@ impl<W: Write> DeflateEncoder<W> {
}
}

/// Creates a new Zopfli DEFLATE encoder that operates according to the
/// specified options and is wrapped with a buffer to guarantee that
/// data is compressed in large chunks, which is necessary for decent
/// performance and good compression ratio.
#[cfg(feature = "std")]
pub fn new_buffered(options: Options, btype: BlockType, sink: W) -> std::io::BufWriter<Self> {
std::io::BufWriter::with_capacity(
crate::util::ZOPFLI_MASTER_BLOCK_SIZE,
Self::new(options, btype, sink),
)
}

/// Encodes any pending chunks of data and writes them to the sink,
/// consuming the encoder and returning the wrapped sink. The sink
/// will have received a complete DEFLATE stream when this method
Expand All @@ -63,7 +76,7 @@ impl<W: Write> DeflateEncoder<W> {
/// dropped, but explicitly finishing it with this method allows
/// handling I/O errors.
pub fn finish(mut self) -> Result<W, Error> {
self._finish().map(|writer| writer.unwrap())
self._finish().map(|sink| sink.unwrap())
}

/// Compresses the chunk stored at `window_and_chunk`. This includes
Expand Down Expand Up @@ -148,36 +161,6 @@ impl<W: Write> Drop for DeflateEncoder<W> {
}
}

/// Convenience function to efficiently compress data in DEFLATE format
/// from an arbitrary source to an arbitrary destination.
#[cfg(feature = "std")]
pub fn deflate<R: std::io::Read, W: Write>(
    options: Options,
    btype: BlockType,
    mut in_data: R,
    out: W,
) -> Result<(), Error> {
    /// Size of the huge, non-smart blocks the input is divided into, which
    /// allows operating on huge files (such as the 1 GB wiki9 corpus) without
    /// exceeding memory. The whole compression algorithm, including the
    /// smarter block splitting, is executed independently on each huge block.
    /// Dividing into huge blocks hurts compression, but not much relative to
    /// the size. This must be equal or greater than `ZOPFLI_WINDOW_SIZE`.
    const ZOPFLI_MASTER_BLOCK_SIZE: usize = 1_000_000;

    // Buffer the encoder so the data reaches it in big chunks, which is
    // necessary for decent performance and a good compression ratio.
    let encoder = DeflateEncoder::new(options, btype, out);
    let mut buffered_encoder =
        std::io::BufWriter::with_capacity(ZOPFLI_MASTER_BLOCK_SIZE, encoder);

    std::io::copy(&mut in_data, &mut buffered_encoder)?;

    // Unwrap the buffer and explicitly finish the DEFLATE stream so that
    // any I/O error is reported instead of being swallowed on drop.
    let inner_encoder = buffered_encoder.into_inner()?;
    inner_encoder.finish()?;

    Ok(())
}

/// Deflate a part, to allow for chunked, streaming compression with [`DeflateEncoder`].
/// It is possible to call this function multiple times in a row, shifting
/// instart and inend to next bytes of the data. If instart is larger than 0, then
Expand Down
138 changes: 102 additions & 36 deletions src/gzip.rs
Original file line number Diff line number Diff line change
@@ -1,37 +1,103 @@
use std::io::Read;

use crate::{
deflate::{deflate, BlockType},
util::HashingAndCountingRead,
Error, Options, Write,
};

// Fixed 10-byte gzip member header: no optional fields, MTIME unset.
static HEADER: &[u8] = &[
    31, // ID1
    139, // ID2
    8, // CM: DEFLATE
    0, // FLG: no optional fields present
    0, // MTIME, first of four bytes (timestamp not available)
    0, 0, 0, 2, // remaining MTIME bytes, then XFL; 2 indicates best compression.
    3, // OS follows Unix conventions.
];

/// Compresses the data according to the gzip specification, RFC 1952.
pub fn gzip_compress<R: Read, W: Write>(
options: Options,
in_data: R,
mut out: W,
) -> Result<(), Error> {
let mut crc_hasher = crc32fast::Hasher::new();
let mut insize = 0;

let in_data = HashingAndCountingRead::new(in_data, &mut crc_hasher, Some(&mut insize));

out.write_all(HEADER)?;

deflate(options, BlockType::Dynamic, in_data, &mut out)?;

out.write_all(&crc_hasher.finalize().to_le_bytes())?;

out.write_all(&insize.to_le_bytes())
use crate::{BlockType, DeflateEncoder, Error, Options, Write};

/// A Gzip encoder powered by the Zopfli algorithm, that compresses data using
/// a [`DeflateEncoder`]. Most users will find using [`compress`](crate::compress)
/// easier and more performant.
///
/// The caveats about short writes in [`DeflateEncoder`]s carry over to `GzipEncoder`s:
/// for best performance and compression, it is best to avoid them. One way to ensure
/// this is to use the [`new_buffered`](GzipEncoder::new_buffered) method.
pub struct GzipEncoder<W: Write> {
    // Inner DEFLATE encoder. Becomes `None` once the stream has been
    // finished, which lets `Drop` tell whether the trailer was already written.
    deflate_encoder: Option<DeflateEncoder<W>>,
    // Running CRC-32 of every input byte accepted so far; its final value is
    // written to the sink when the encoder is finished.
    crc32_hasher: crc32fast::Hasher,
    // Count of input bytes modulo 2^32 (updated with wrapping arithmetic),
    // appended to the stream after the CRC when finishing.
    input_size: u32,
}

impl<W: Write> GzipEncoder<W> {
    /// Creates a new Gzip encoder that will operate according to the
    /// specified options.
    ///
    /// The fixed gzip header is written to `sink` here, which is why
    /// construction can fail with an I/O error.
    pub fn new(options: Options, btype: BlockType, mut sink: W) -> Result<Self, Error> {
        // Fixed 10-byte gzip member header: no optional fields, MTIME unset.
        static HEADER: &[u8] = &[
            31, // ID1
            139, // ID2
            8, // CM: DEFLATE
            0, // FLG: no optional fields present
            0, // MTIME, first of four bytes (timestamp not available)
            0, 0, 0, 2, // remaining MTIME bytes, then XFL; 2 indicates best compression.
            3, // OS follows Unix conventions.
        ];

        sink.write_all(HEADER)?;

        Ok(Self {
            deflate_encoder: Some(DeflateEncoder::new(options, btype, sink)),
            crc32_hasher: crc32fast::Hasher::new(),
            input_size: 0,
        })
    }

    /// Creates a new Gzip encoder that operates according to the specified
    /// options and is wrapped with a buffer to guarantee that data is
    /// compressed in large chunks, which is necessary for decent performance
    /// and good compression ratio.
    #[cfg(feature = "std")]
    pub fn new_buffered(
        options: Options,
        btype: BlockType,
        sink: W,
    ) -> Result<std::io::BufWriter<Self>, Error> {
        Ok(std::io::BufWriter::with_capacity(
            crate::util::ZOPFLI_MASTER_BLOCK_SIZE,
            Self::new(options, btype, sink)?,
        ))
    }

    /// Encodes any pending chunks of data and writes them to the sink,
    /// consuming the encoder and returning the wrapped sink. The sink
    /// will have received a complete Gzip stream when this method
    /// returns.
    ///
    /// The encoder is automatically [`finish`](Self::finish)ed when
    /// dropped, but explicitly finishing it with this method allows
    /// handling I/O errors.
    pub fn finish(mut self) -> Result<W, Error> {
        // `_finish` always yields `Some(sink)` here: `finish` consumes
        // `self`, so the encoder cannot have been finished before.
        self._finish().map(|sink| sink.unwrap())
    }

    // Shared finishing logic for `finish` and `Drop`: finishes the DEFLATE
    // stream, then appends the gzip trailer (CRC-32 and input size, both
    // little-endian). Returns `Ok(None)` if already finished.
    fn _finish(&mut self) -> Result<Option<W>, Error> {
        if self.deflate_encoder.is_none() {
            // Already finished earlier (e.g. `finish` ran before drop).
            return Ok(None);
        }

        let mut sink = self.deflate_encoder.take().unwrap().finish()?;

        // NOTE(review): the clone suggests `finalize` takes the hasher by
        // value, and only `&mut self` is available since this also runs from
        // `Drop` — confirm against the crc32fast API.
        sink.write_all(&self.crc32_hasher.clone().finalize().to_le_bytes())?;
        sink.write_all(&self.input_size.to_le_bytes())?;

        Ok(Some(sink))
    }
}

impl<W: Write> Write for GzipEncoder<W> {
    /// Forwards `buf` to the inner DEFLATE encoder, keeping the CRC-32 and
    /// the input byte count in sync with the bytes it actually accepted.
    fn write(&mut self, buf: &[u8]) -> Result<usize, Error> {
        // Hand the data to the underlying DEFLATE stream first; only the
        // bytes it actually accepted are folded into the trailer fields.
        let accepted = self.deflate_encoder.as_mut().unwrap().write(buf)?;

        self.crc32_hasher.update(&buf[..accepted]);
        self.input_size = self.input_size.wrapping_add(accepted as u32);

        Ok(accepted)
    }

    /// Flushes the wrapped DEFLATE encoder.
    fn flush(&mut self) -> Result<(), Error> {
        self.deflate_encoder.as_mut().unwrap().flush()
    }
}

impl<W: Write> Drop for GzipEncoder<W> {
    fn drop(&mut self) {
        // Best-effort finish: `Drop` has no way to report an I/O error, so
        // it is discarded here. Call `finish` explicitly to handle errors.
        let _ = self._finish();
    }
}
31 changes: 24 additions & 7 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
//! This crate exposes the following features. You can enable or disable them in your `Cargo.toml`
//! as needed.
//!
//! - `gzip` (enabled by default): enables support for compression in the gzip format. Implies `std`.
//! - `zlib` (enabled by default): enables support for compression in the Zlib format. Implies `std`.
//! - `gzip` (enabled by default): enables support for compression in the gzip format.
//! - `zlib` (enabled by default): enables support for compression in the Zlib format.
//! - `std` (enabled by default): enables linking against the Rust standard library. When not enabled,
//! the crate is built with the `#![no_std]` attribute and can be used
//! in any environment where [`alloc`](https://doc.rust-lang.org/alloc/)
Expand All @@ -37,8 +37,12 @@
extern crate alloc;

pub use deflate::{BlockType, DeflateEncoder};
#[cfg(feature = "gzip")]
pub use gzip::GzipEncoder;
#[cfg(all(test, feature = "std"))]
use proptest::prelude::*;
#[cfg(feature = "zlib")]
pub use zlib::ZlibEncoder;

mod blocksplitter;
mod cache;
Expand Down Expand Up @@ -138,17 +142,30 @@ pub enum Format {
/// options, and writes the result to a sink in the defined output format.
#[cfg(feature = "std")]
pub fn compress<R: std::io::Read, W: Write>(
options: &Options,
options: Options,
output_format: Format,
in_data: R,
mut in_data: R,
out: W,
) -> Result<(), Error> {
match output_format {
#[cfg(feature = "gzip")]
Format::Gzip => gzip::gzip_compress(*options, in_data, out),
Format::Gzip => {
let mut gzip_encoder = GzipEncoder::new_buffered(options, BlockType::Dynamic, out)?;
std::io::copy(&mut in_data, &mut gzip_encoder)?;
gzip_encoder.into_inner()?.finish().map(|_| ())
}
#[cfg(feature = "zlib")]
Format::Zlib => zlib::zlib_compress(*options, in_data, out),
Format::Deflate => deflate::deflate(*options, BlockType::Dynamic, in_data, out),
Format::Zlib => {
let mut zlib_encoder = ZlibEncoder::new_buffered(options, BlockType::Dynamic, out)?;
std::io::copy(&mut in_data, &mut zlib_encoder)?;
zlib_encoder.into_inner()?.finish().map(|_| ())
}
Format::Deflate => {
let mut deflate_encoder =
DeflateEncoder::new_buffered(options, BlockType::Dynamic, out);
std::io::copy(&mut in_data, &mut deflate_encoder)?;
deflate_encoder.into_inner()?.finish().map(|_| ())
}
}
}

Expand Down
6 changes: 3 additions & 3 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::{
env,
fs::File,
io::{self, prelude::*, BufWriter},
io::{self, prelude::*},
};

use log::info;
Expand Down Expand Up @@ -29,9 +29,9 @@ fn main() {
// Attempt to create the output file, panic if the output file could not be opened
let out_file = File::create(&out_filename)
.unwrap_or_else(|why| panic!("couldn't create output file {}: {}", out_filename, why));
let mut out_file = WriteStatistics::new(BufWriter::new(out_file));
let mut out_file = WriteStatistics::new(out_file);

zopfli::compress(&options, output_type, &file, &mut out_file).unwrap_or_else(|why| {
zopfli::compress(options, output_type, &file, &mut out_file).unwrap_or_else(|why| {
panic!("couldn't write to output file {}: {}", out_filename, why)
});

Expand Down
Loading

0 comments on commit 90cfdd4

Please sign in to comment.