Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

batched f16 conversion #191

Merged
merged 33 commits into from
Jul 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
baf90df
prototype unoptimized batched f16 conversion + fix round up division …
johannesvollmer Jan 7, 2023
c13b24e
Merge branch 'master' into f16_batch_conversion
johannesvollmer Jan 7, 2023
7b41d0d
rename some stuff and improve error message
johannesvollmer Jan 7, 2023
284da36
use batch size of 16
johannesvollmer Jan 7, 2023
b4b9518
Merge branch 'master' into f16_batch_conversion
johannesvollmer Jan 8, 2023
9de05cc
improve comments, update to new `half` version
johannesvollmer Jan 20, 2023
27aca4d
Merge remote-tracking branch 'origin/f16_batch_conversion' into f16_b…
johannesvollmer Jan 20, 2023
5d96f18
Merge branch 'master' into f16_batch_conversion
johannesvollmer Jan 20, 2023
376ae08
revert an incomplete refactoring
johannesvollmer Jan 20, 2023
6475c87
refactor batch conversion function to reduce code duplication
johannesvollmer Jan 20, 2023
aaf06fe
Merge branch 'master' into f16_batch_conversion
johannesvollmer Feb 28, 2023
278c517
Merge branch 'master' into f16_batch_conversion
johannesvollmer Jun 27, 2023
a6c125b
add simple unit test
johannesvollmer Jun 27, 2023
da4690e
fix two compiler warnings
johannesvollmer Jun 27, 2023
633ab61
force use newest version of `half`
johannesvollmer Jun 27, 2023
3d3c6bd
Update src/block/samples.rs
johannesvollmer Jun 27, 2023
ff2a2ed
Update src/block/samples.rs
johannesvollmer Jun 27, 2023
66123ed
Merge remote-tracking branch 'origin/f16_batch_conversion' into f16_b…
johannesvollmer Jul 2, 2023
441e813
add more benchmarks
johannesvollmer Jul 2, 2023
8cc1e38
add more benchmarks
johannesvollmer Jul 2, 2023
3ece29d
add more benchmarks
johannesvollmer Jul 2, 2023
4236bf3
Merge branch 'master' into f16_batch_conversion
johannesvollmer Jul 3, 2023
a572e94
inline-hint closures
johannesvollmer Jul 3, 2023
f559647
undo use inline attribute (experimental, not supported yet)
johannesvollmer Jul 3, 2023
73a62b9
use inline syntax for `run` commands
johannesvollmer Jul 3, 2023
f0052ad
Merge branch 'master' into f16_batch_conversion
johannesvollmer Jul 3, 2023
1d5fd6b
attempt ci without cache
johannesvollmer Jul 6, 2023
980369b
attempt fix ci
johannesvollmer Jul 6, 2023
e9bff52
refactor
johannesvollmer Jul 6, 2023
79940c5
bump rust version to take advantage of f16 intrinsics
johannesvollmer Jul 6, 2023
3ce3c05
Merge branch 'master' into f16_batch_conversion
johannesvollmer Jul 7, 2023
c3cb590
retain backwards compatibility for this pr, only break the release af…
johannesvollmer Jul 7, 2023
85a311d
Merge branch 'master' into f16_batch_conversion
johannesvollmer Jul 8, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 8 additions & 9 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,13 @@ jobs:
steps:
- uses: actions/checkout@v2

- name: Install or use cached foresterre/cargo-msrv
uses: baptiste0928/cargo-install@v1
with:
crate: cargo-msrv
- name: Install foresterre/cargo-msrv without cache (takes longer, but caching produces unexpected behaviour)
run: cargo install cargo-msrv

- name: Verify the Rustc version declared in `cargo.toml`
- name: Verify the Rustc version declared in `cargo.toml` without cache (takes longer, but caching produces unexpected behaviour)
run: |
rm -f Cargo.lock
cargo update
cargo-msrv verify

# github actions does not support big endian systems directly, but it does support QEMU.
Expand Down Expand Up @@ -82,13 +82,11 @@ jobs:
run: sudo systemctl start docker

- name: Cross-Compile project to mips-unknown-linux-gnu
run: |
cross build --target=mips-unknown-linux-gnu --verbose
run: cross build --target=mips-unknown-linux-gnu --verbose

# https://github.com/cross-rs/cross#supported-targets
- name: Cross-Run Tests in mips-unknown-linux-gnu using Qemu
run: |
cross test --target mips-unknown-linux-gnu --verbose
run: cross test --target mips-unknown-linux-gnu --verbose

wasm32:
runs-on: ubuntu-latest
Expand All @@ -109,3 +107,4 @@ jobs:

- name: Run tests without default features
run: cargo test --verbose --no-default-features

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ proc-macro = false

[dependencies]
lebe = "^0.5.2" # generic binary serialization
half = ">=2.1.0, <2.3" # 16 bit float pixel data type
half = ">=2.1.0, <2.3" # 16 bit float pixel data type
bit_field = "^0.10.1" # exr file version bit flags
miniz_oxide = "^0.7.1" # zip compression for pxr24
smallvec = "^1.7.0" # make cache-friendly allocations TODO profile if smallvec is really an improvement!
Expand Down
126 changes: 88 additions & 38 deletions benches/pixel_format_conversion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,62 +8,112 @@ use bencher::Bencher;
use std::fs;
use std::io::Cursor;
use exr::image::pixel_vec::PixelVec;
use exr::io::Data;
use exr::block::samples::FromNativeSample;

const F32_ZIPS_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_zips.exr";
const F32_UNCOMPRESSED_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_uncompressed.exr";
const F16_UNCOMPRESSED_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_uncompressed_half.exr";
const F16_ZIP_PATH: &'static str = "tests/images/valid/custom/crowskull/crow_zip_half.exr";

/// Read an image from an in-memory buffer into its native f32 format
fn read_image_rgba_f32_to_f32(bench: &mut Bencher) {
let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap();
bencher::black_box(&mut file);
/// Baseline: read native f32 samples as f32 (no conversion) from an
/// uncompressed file, single-threaded.
fn read_f32_as_f32_uncompressed_1thread(bench: &mut Bencher) {
bench_read_image_rgba_as::<f32>(bench, F32_UNCOMPRESSED_PATH, false);
}

bench.iter(||{
let image = exr::prelude::read()
.no_deep_data().largest_resolution_level()
.rgba_channels(PixelVec::<(f32,f32,f32,f32)>::constructor, PixelVec::set_pixel)
.all_layers().all_attributes()
.non_parallel()
.from_buffered(Cursor::new(file.as_slice())).unwrap();
/// Read an uncompressed image and convert the samples to u32
/// (from native f32), single-threaded.
fn read_f32_as_u32_uncompressed_1thread(bench: &mut Bencher) {
bench_read_image_rgba_as::<u32>(bench, F32_UNCOMPRESSED_PATH, false);
}

bencher::black_box(image);
})
/// Read an uncompressed native-f32 image, converting to f16, single-threaded.
/// f16 is not natively supported by CPUs, which introduces unique performance pitfalls
fn read_f32_as_f16_uncompressed_1thread(bench: &mut Bencher) {
bench_read_image_rgba_as::<f16>(bench, F32_UNCOMPRESSED_PATH, false);
}

/// Read image and convert the samples to u32 (from native f32)
fn read_image_rgba_f32_to_u32(bench: &mut Bencher) {
let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap();
bencher::black_box(&mut file);
/// Baseline: read native f16 samples as f16 (no conversion) from an
/// uncompressed file, single-threaded.
fn read_f16_as_f16_uncompressed_1thread(bench: &mut Bencher) {
bench_read_image_rgba_as::<f16>(bench, F16_UNCOMPRESSED_PATH, false);
}

bench.iter(||{
let image = exr::prelude::read()
.no_deep_data().largest_resolution_level()
.rgba_channels(PixelVec::<(u32,u32,u32,u32)>::constructor, PixelVec::set_pixel)
.all_layers().all_attributes()
.non_parallel()
.from_buffered(Cursor::new(file.as_slice())).unwrap();
/// Read an uncompressed native-f16 image, converting to f32, single-threaded.
fn read_f16_as_f32_uncompressed_1thread(bench: &mut Bencher) {
bench_read_image_rgba_as::<f32>(bench, F16_UNCOMPRESSED_PATH, false);
}

bencher::black_box(image);
})
/// Read an uncompressed native-f16 image, converting to u32, single-threaded.
fn read_f16_as_u32_uncompressed_1thread(bench: &mut Bencher) {
bench_read_image_rgba_as::<u32>(bench, F16_UNCOMPRESSED_PATH, false);
}

/// f16 is not natively supported by CPUs, which introduces unique performance pitfalls
fn read_image_rgba_f32_to_f16(bench: &mut Bencher) {
let mut file = fs::read("tests/images/valid/custom/crowskull/crow_uncompressed.exr").unwrap();

/// Read a zips-compressed native-f32 image, converting to f16, single-threaded.
fn read_f32_as_f16_zips_1thread(bench: &mut Bencher) {
bench_read_image_rgba_as::<f16>(bench, F32_ZIPS_PATH, false);
}

/// Read a zip-compressed native-f16 image, converting to f32, single-threaded.
fn read_f16_as_f32_zip_1thread(bench: &mut Bencher) {
bench_read_image_rgba_as::<f32>(bench, F16_ZIP_PATH, false);
}

/// Read a zips-compressed native-f32 image, converting to f16, multi-threaded.
fn read_f32_as_f16_zips_nthreads(bench: &mut Bencher) {
bench_read_image_rgba_as::<f16>(bench, F32_ZIPS_PATH, true);
}

/// Read a zip-compressed native-f16 image, converting to f32, multi-threaded.
fn read_f16_as_f32_zip_nthreads(bench: &mut Bencher) {
bench_read_image_rgba_as::<f32>(bench, F16_ZIP_PATH, true);
}

/// Read a zips-compressed native-f32 image as f32 (no conversion), multi-threaded.
fn read_f32_as_f32_zips_nthreads(bench: &mut Bencher) {
bench_read_image_rgba_as::<f32>(bench, F32_ZIPS_PATH, true);
}

/// Read a zip-compressed native-f16 image as f16 (no conversion), multi-threaded.
fn read_f16_as_f16_zip_nthreads(bench: &mut Bencher) {
bench_read_image_rgba_as::<f16>(bench, F16_ZIP_PATH, true);
}

/// Read a zips-compressed native-f32 image as f32 (no conversion), single-threaded.
fn read_f32_as_f32_zips_1thread(bench: &mut Bencher) {
bench_read_image_rgba_as::<f32>(bench, F32_ZIPS_PATH, false);
}

/// Read a zip-compressed native-f16 image as f16 (no conversion), single-threaded.
fn read_f16_as_f16_zip_1thread(bench: &mut Bencher) {
bench_read_image_rgba_as::<f16>(bench, F16_ZIP_PATH, false);
}

fn bench_read_image_rgba_as<T>(bench: &mut Bencher, path: &str, parallel: bool) {
let mut file = fs::read(path).unwrap();
bencher::black_box(&mut file);

bench.iter(||{
let image = exr::prelude::read()
.no_deep_data().largest_resolution_level()
.rgba_channels(PixelVec::<(f16,f16,f16,f16)>::constructor, PixelVec::set_pixel)
.all_layers().all_attributes()
.non_parallel()
.from_buffered(Cursor::new(file.as_slice())).unwrap();

let image = read_file_from_memory_as::<f16>(file.as_slice(), parallel);
bencher::black_box(image);
})
}

/// Decode an in-memory EXR buffer into an RGBA `PixelVec` whose four
/// channels are converted to the sample type `T`.
/// When `parallel` is false, the reader is forced onto a single thread.
fn read_file_from_memory_as<T>(file: &[u8], parallel: bool) -> RgbaImage<PixelVec<(T, T, T, T)>>
    where T: FromNativeSample
{
    let reader = exr::prelude::read()
        .no_deep_data().largest_resolution_level()
        .rgba_channels(PixelVec::<(T, T, T, T)>::constructor, PixelVec::set_pixel)
        .first_valid_layer().all_attributes();

    if parallel {
        reader.from_buffered(Cursor::new(file)).unwrap()
    }
    else {
        reader.non_parallel().from_buffered(Cursor::new(file)).unwrap()
    }
}

// register all conversion benchmarks; the old `read_image_rgba_*` names
// were removed in this change and must not be listed here anymore
benchmark_group!(pixel_format_conversion,
    read_f32_as_f32_uncompressed_1thread,
    read_f32_as_u32_uncompressed_1thread,
    read_f32_as_f16_uncompressed_1thread,
    read_f32_as_f16_zips_1thread,
    read_f32_as_f16_zips_nthreads,
    read_f32_as_f32_zips_nthreads,
    read_f32_as_f32_zips_1thread,

    read_f16_as_f16_uncompressed_1thread,
    read_f16_as_u32_uncompressed_1thread,
    read_f16_as_f32_uncompressed_1thread,
    read_f16_as_f32_zip_1thread,
    read_f16_as_f32_zip_nthreads,
    read_f16_as_f16_zip_nthreads,
    read_f16_as_f16_zip_1thread,
);

benchmark_main!(pixel_format_conversion);
80 changes: 68 additions & 12 deletions src/block/samples.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
//! Extract pixel samples from a block of pixel bytes.

use crate::prelude::*;
use half::prelude::HalfFloatSliceExt;


/// A single red, green, blue, or alpha value.
Expand Down Expand Up @@ -112,6 +113,7 @@ impl From<Sample> for u32 { #[inline] fn from(s: Sample) -> Self { s.to_u32() }

/// Create an arbitrary sample type from one of the defined sample types.
/// Should be compiled to a no-op where the file contains the predicted sample type.
/// The slice functions should be optimized into a `memcpy` where there is no conversion needed.
pub trait FromNativeSample: Sized + Copy + Default + 'static {

/// Create this sample from a f16, trying to represent the same numerical value
Expand All @@ -122,31 +124,85 @@ pub trait FromNativeSample: Sized + Copy + Default + 'static {

/// Create this sample from a u32, trying to represent the same numerical value
fn from_u32(value: u32) -> Self;

/// Convert all values from the slice into this type.
/// This function exists to allow the compiler to perform a vectorization optimization.
/// Note that this default implementation will **not** be vectorized by the compiler automatically.
johannesvollmer marked this conversation as resolved.
Show resolved Hide resolved
/// For maximum performance you will need to override this function and implement it via
/// an explicit batched conversion such as [`convert_to_f32_slice`](https://docs.rs/half/2.3.1/half/slice/trait.HalfFloatSliceExt.html#tymethod.convert_to_f32_slice)
#[inline]
fn from_f16s(from: &[f16], to: &mut [Self]) {
assert_eq!(from.len(), to.len(), "slices must have the same length");
for (from, to) in from.iter().zip(to.iter_mut()) {
*to = Self::from_f16(*from);
}
}

/// Convert all values from the slice into this type.
/// This function exists to allow the compiler to perform a vectorization optimization.
/// Note that this default implementation will be vectorized by the compiler automatically.
#[inline]
fn from_f32s(from: &[f32], to: &mut [Self]) {
johannesvollmer marked this conversation as resolved.
Show resolved Hide resolved
assert_eq!(from.len(), to.len(), "slices must have the same length");
for (from, to) in from.iter().zip(to.iter_mut()) {
*to = Self::from_f32(*from);
}
}

/// Convert all values from the slice into this type.
/// This function exists to allow the compiler to perform a vectorization optimization.
/// Note that this default implementation will be vectorized by the compiler automatically,
/// provided that the CPU supports the necessary conversion instructions.
/// For example, x86_64 lacks the instructions to convert `u32` to floats,
/// so this will inevitably be slow on x86_64.
#[inline]
fn from_u32s(from: &[u32], to: &mut [Self]) {
assert_eq!(from.len(), to.len(), "slices must have the same length");
for (from, to) in from.iter().zip(to.iter_mut()) {
*to = Self::from_u32(*from);
}
}
}

// TODO haven't i implemented this exact behaviour already somewhere else in this library...??
impl FromNativeSample for f32 {
    #[inline] fn from_f16(value: f16) -> Self { value.to_f32() }
    // identity: when the file already contains f32 data, no per-sample work is done
    #[inline] fn from_f32(value: f32) -> Self { value }
    #[inline] fn from_u32(value: u32) -> Self { value as f32 }

    // f16 is a custom type,
    // so the compiler can not automatically vectorize the conversion.
    // that's why we need to specialize this function
    #[inline]
    fn from_f16s(from: &[f16], to: &mut [Self]) {
        from.convert_to_f32_slice(to);
    }
}

impl FromNativeSample for u32 {
    // goes through f32 because f16 exposes no direct integer conversion
    #[inline] fn from_f16(value: f16) -> Self { value.to_f32() as u32 }
    #[inline] fn from_f32(value: f32) -> Self { value as u32 }
    // identity: no per-sample work when the file already contains u32 data
    #[inline] fn from_u32(value: u32) -> Self { value }
}

impl FromNativeSample for f16 {
    // identity: no per-sample work when the file already contains f16 data
    #[inline] fn from_f16(value: f16) -> Self { value }
    #[inline] fn from_f32(value: f32) -> Self { f16::from_f32(value) }
    #[inline] fn from_u32(value: u32) -> Self { f16::from_f32(value as f32) }

    // f16 is a custom type,
    // so the compiler can not automatically vectorize the conversion.
    // that's why we need to specialize this function
    #[inline]
    fn from_f32s(from: &[f32], to: &mut [Self]) {
        to.convert_from_f32_slice(from)
    }
}

impl FromNativeSample for Sample {
    // `Sample` is an enum wrapper; delegate to its existing `From` impls
    #[inline] fn from_f16(value: f16) -> Self { Self::from(value) }
    #[inline] fn from_f32(value: f32) -> Self { Self::from(value) }
    #[inline] fn from_u32(value: u32) -> Self { Self::from(value) }
}


Expand Down
Loading
Loading