argon2: WIP: optimize with AVX2 SIMD
dyc3 committed Jul 5, 2023
1 parent dfefd6f commit 7c780fb
Showing 4 changed files with 318 additions and 4 deletions.
5 changes: 3 additions & 2 deletions Cargo.lock

1 change: 1 addition & 0 deletions argon2/Cargo.toml
@@ -18,6 +18,7 @@ rust-version = "1.65"
[dependencies]
base64ct = "1"
blake2 = { version = "0.10.6", default-features = false }
cpufeatures = "0.2.9"

# optional dependencies
password-hash = { version = "0.5", optional = true }
314 changes: 313 additions & 1 deletion argon2/src/block.rs
@@ -44,6 +44,174 @@ macro_rules! permute {
};
}

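// Same 2-bit-per-lane selector encoding as the classic `_MM_SHUFFLE` macro, but as a
// `const fn` so it can be used in the shuffle/permute immediates below.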
#[allow(non_snake_case)]
const fn _MM_SHUFFLE2(z: i32, y: i32, x: i32, w: i32) -> i32 {
(z << 6) | (y << 4) | (x << 2) | w
}

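// Rotate each 64-bit lane right by 32 bits by swapping its two 32-bit halves.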
macro_rules! rotr32 {
($x:expr) => {
_mm256_shuffle_epi32($x, _MM_SHUFFLE2(2, 3, 0, 1))
};
}

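// Rotate each 64-bit lane right by 24 bits via a byte shuffle within each 128-bit lane.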
macro_rules! rotr24 {
($x:expr) => {
_mm256_shuffle_epi8(
$x,
_mm256_setr_epi8(
3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11,
12, 13, 14, 15, 8, 9, 10,
),
)
};
}

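// Rotate each 64-bit lane right by 16 bits via a byte shuffle within each 128-bit lane.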
macro_rules! rotr16 {
($x:expr) => {
_mm256_shuffle_epi8(
$x,
_mm256_setr_epi8(
2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10,
11, 12, 13, 14, 15, 8, 9,
),
)
};
}

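// Rotate each 64-bit lane right by 63 bits, i.e. left by 1: (x >> 63) ^ (x << 1), with
// the left shift expressed as `x + x`.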
macro_rules! rotr63 {
($x:expr) => {
_mm256_xor_si256(_mm256_srli_epi64($x, 63), _mm256_add_epi64($x, $x))
};
}

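// First half of the BlaMka G function applied to four lanes at once:
// a = a + b + 2 * lo32(a) * lo32(b), followed by the 32- and 24-bit rotations.
// `_mm256_mul_epu32` multiplies the low 32 bits of each 64-bit lane.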
macro_rules! G1_AVX2 {
($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{
let ml = _mm256_mul_epu32($A0, $B0);
let ml = _mm256_add_epi64(ml, ml);
$A0 = _mm256_add_epi64($A0, _mm256_add_epi64($B0, ml));
$D0 = _mm256_xor_si256($D0, $A0);
$D0 = rotr32!($D0);
let ml = _mm256_mul_epu32($C0, $D0);
let ml = _mm256_add_epi64(ml, ml);
$C0 = _mm256_add_epi64($C0, _mm256_add_epi64($D0, ml));
$B0 = _mm256_xor_si256($B0, $C0);
$B0 = rotr24!($B0);
let ml = _mm256_mul_epu32($A1, $B1);
let ml = _mm256_add_epi64(ml, ml);
$A1 = _mm256_add_epi64($A1, _mm256_add_epi64($B1, ml));
$D1 = _mm256_xor_si256($D1, $A1);
$D1 = rotr32!($D1);
let ml = _mm256_mul_epu32($C1, $D1);
let ml = _mm256_add_epi64(ml, ml);
$C1 = _mm256_add_epi64($C1, _mm256_add_epi64($D1, ml));
$B1 = _mm256_xor_si256($B1, $C1);
$B1 = rotr24!($B1);
}};
}

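// Second half of the BlaMka G function: the same multiply-add mixing, followed by the
// 16- and 63-bit rotations.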
macro_rules! G2_AVX2 {
($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{
let ml = _mm256_mul_epu32($A0, $B0);
let ml = _mm256_add_epi64(ml, ml);
$A0 = _mm256_add_epi64($A0, _mm256_add_epi64($B0, ml));
$D0 = _mm256_xor_si256($D0, $A0);
$D0 = rotr16!($D0);
let ml = _mm256_mul_epu32($C0, $D0);
let ml = _mm256_add_epi64(ml, ml);
$C0 = _mm256_add_epi64($C0, _mm256_add_epi64($D0, ml));
$B0 = _mm256_xor_si256($B0, $C0);
$B0 = rotr63!($B0);
let ml = _mm256_mul_epu32($A1, $B1);
let ml = _mm256_add_epi64(ml, ml);
$A1 = _mm256_add_epi64($A1, _mm256_add_epi64($B1, ml));
$D1 = _mm256_xor_si256($D1, $A1);
$D1 = rotr16!($D1);
let ml = _mm256_mul_epu32($C1, $D1);
let ml = _mm256_add_epi64(ml, ml);
$C1 = _mm256_add_epi64($C1, _mm256_add_epi64($D1, ml));
$B1 = _mm256_xor_si256($B1, $C1);
$B1 = rotr63!($B1);
}};
}

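// Rotate the B, C and D registers by one, two and three 64-bit lanes so the next
// G1/G2 pass mixes diagonals instead of columns.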
macro_rules! DIAGONALIZE_1 {
($A0:expr, $B0:expr, $C0:expr, $D0:expr, $A1:expr, $B1:expr, $C1:expr, $D1:expr) => {{
$B0 = _mm256_permute4x64_epi64($B0, _MM_SHUFFLE2(0, 3, 2, 1));
$C0 = _mm256_permute4x64_epi64($C0, _MM_SHUFFLE2(1, 0, 3, 2));
$D0 = _mm256_permute4x64_epi64($D0, _MM_SHUFFLE2(2, 1, 0, 3));
$B1 = _mm256_permute4x64_epi64($B1, _MM_SHUFFLE2(0, 3, 2, 1));
$C1 = _mm256_permute4x64_epi64($C1, _MM_SHUFFLE2(1, 0, 3, 2));
$D1 = _mm256_permute4x64_epi64($D1, _MM_SHUFFLE2(2, 1, 0, 3));
}};
}

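// Diagonalization variant used by BLAKE2_ROUND_2, implemented with 32-bit blends plus
// 64-bit lane permutes across each register pair.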
macro_rules! DIAGONALIZE_2 {
($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{
let tmp1 = _mm256_blend_epi32($B0, $B1, 0xCC);
let tmp2 = _mm256_blend_epi32($B0, $B1, 0x33);
$B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE2(2, 3, 0, 1));
$B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE2(2, 3, 0, 1));
let tmp1 = $C0;
$C0 = $C1;
$C1 = tmp1;
let tmp1 = _mm256_blend_epi32($D0, $D1, 0xCC);
let tmp2 = _mm256_blend_epi32($D0, $D1, 0x33);
$D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE2(2, 3, 0, 1));
$D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE2(2, 3, 0, 1));
}};
}

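// Inverse of DIAGONALIZE_1: rotate the lanes back into place.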
macro_rules! UNDIAGONALIZE_1 {
($A0:expr, $B0:expr, $C0:expr, $D0:expr, $A1:expr, $B1:expr, $C1:expr, $D1:expr) => {{
$B0 = _mm256_permute4x64_epi64($B0, _MM_SHUFFLE2(2, 1, 0, 3));
$C0 = _mm256_permute4x64_epi64($C0, _MM_SHUFFLE2(1, 0, 3, 2));
$D0 = _mm256_permute4x64_epi64($D0, _MM_SHUFFLE2(0, 3, 2, 1));
$B1 = _mm256_permute4x64_epi64($B1, _MM_SHUFFLE2(2, 1, 0, 3));
$C1 = _mm256_permute4x64_epi64($C1, _MM_SHUFFLE2(1, 0, 3, 2));
$D1 = _mm256_permute4x64_epi64($D1, _MM_SHUFFLE2(0, 3, 2, 1));
}};
}

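// Inverse of DIAGONALIZE_2.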
macro_rules! UNDIAGONALIZE_2 {
($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{
let tmp1 = _mm256_blend_epi32($B0, $B1, 0xCC);
let tmp2 = _mm256_blend_epi32($B0, $B1, 0x33);
$B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE2(2, 3, 0, 1));
$B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE2(2, 3, 0, 1));
let tmp1 = $C0;
$C0 = $C1;
$C1 = tmp1;
let tmp1 = _mm256_blend_epi32($D0, $D1, 0x33);
let tmp2 = _mm256_blend_epi32($D0, $D1, 0xCC);
$D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE2(2, 3, 0, 1));
$D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE2(2, 3, 0, 1));
}};
}

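// One full BLAKE2b-style round: a column step (G1 + G2), diagonalize, a diagonal step,
// then undo the diagonalization.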
macro_rules! BLAKE2_ROUND_1 {
($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{
G1_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1);
G2_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1);
DIAGONALIZE_1!($A0, $B0, $C0, $D0, $A1, $B1, $C1, $D1);
G1_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1);
G2_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1);
UNDIAGONALIZE_1!($A0, $B0, $C0, $D0, $A1, $B1, $C1, $D1);
}};
}

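// Same round structure using the blend-based (un)diagonalization for the second
// register layout.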
macro_rules! BLAKE2_ROUND_2 {
($A0:expr, $A1:expr, $B0:expr, $B1:expr, $C0:expr, $C1:expr, $D0:expr, $D1:expr) => {{
G1_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1);
G2_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1);
DIAGONALIZE_2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1);
G1_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1);
G2_AVX2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1);
UNDIAGONALIZE_2!($A0, $A1, $B0, $B1, $C0, $C1, $D0, $D1);
}};
}

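// Runtime AVX2 detection: `cpufeatures::new!` generates an `avx2_cpuid` module whose
// `init_get()` returns a cached token plus a bool indicating CPU support.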
#[cfg(target_arch = "x86_64")]
cpufeatures::new!(avx2_cpuid, "avx2");

/// Structure for the (1 KiB) memory block implemented as 128 64-bit words.
#[derive(Copy, Clone, Debug)]
#[repr(align(64))]
@@ -67,6 +235,17 @@ impl Block {
}

pub(crate) fn compress(rhs: &Self, lhs: &Self) -> Self {
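        // Use the AVX2 implementation when the CPU supports it; otherwise fall back to
        // the portable software implementation.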
        #[cfg(target_arch = "x86_64")]
{
let (_, avx2) = avx2_cpuid::init_get();
if avx2 {
return unsafe { Self::compress_avx2(rhs, lhs) };
}
}
Self::compress_safe(rhs, lhs)
}

fn compress_safe(rhs: &Self, lhs: &Self) -> Self {
let r = *rhs ^ lhs;

// Apply permutations rowwise
@@ -101,6 +280,108 @@ impl Block {
q ^= &r;
q
}

    #[cfg(target_arch = "x86_64")]
unsafe fn compress_avx2(rhs: &Self, lhs: &Self) -> Self {
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

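        // WIP: the vectorized BLAKE2 rounds below are still commented out, so for now
        // this path only performs the block XOR with SIMD and then falls back to the
        // scalar permutation code.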
        // Load both blocks as 32 × 256-bit registers (4 u64 lanes each, covering the
        // whole 1 KiB block) and XOR them together.
        let mut state = [_mm256_setzero_si256(); 32];
        for (i, reg) in state.iter_mut().enumerate() {
            let a = _mm256_loadu_si256(rhs.0.as_ptr().add(i * 4) as *const __m256i);
            let b = _mm256_loadu_si256(lhs.0.as_ptr().add(i * 4) as *const __m256i);
            *reg = _mm256_xor_si256(a, b);
        }

// for i in 0..4 {
// #[rustfmt::skip]
// BLAKE2_ROUND_1!(
// state[(i + 0) % 8], state[(i + 4) % 8],
// state[(i + 1) % 8], state[(i + 5) % 8],
// state[(i + 2) % 8], state[(i + 6) % 8],
// state[(i + 3) % 8], state[(i + 7) % 8]
// );
// }

// for i in 0..4 {
// BLAKE2_ROUND_2!(
// state[0 + i],
// state[1 + i],
// state[2 + i],
// state[3 + i],
// state[(4 + i) % 8],
// state[(5 + i) % 8],
// state[(6 + i) % 8],
// state[(7 + i) % 8]
// );
// }

        // Store the mixed state back into a fresh block.
        let mut r = Self::new();
        for (i, reg) in state.iter().enumerate() {
            _mm256_storeu_si256(r.0.as_mut_ptr().add(i * 4) as *mut __m256i, *reg);
        }

// let r = *rhs ^ lhs;

// Apply permutations rowwise
let mut q = r;
for chunk in q.0.chunks_exact_mut(16) {
#[rustfmt::skip]
permute!(
chunk[0], chunk[1], chunk[2], chunk[3],
chunk[4], chunk[5], chunk[6], chunk[7],
chunk[8], chunk[9], chunk[10], chunk[11],
chunk[12], chunk[13], chunk[14], chunk[15],
);
}

// Apply permutations columnwise
for i in 0..8 {
let b = i * 2;

#[rustfmt::skip]
permute!(
q.0[b], q.0[b + 1],
q.0[b + 16], q.0[b + 17],
q.0[b + 32], q.0[b + 33],
q.0[b + 48], q.0[b + 49],
q.0[b + 64], q.0[b + 65],
q.0[b + 80], q.0[b + 81],
q.0[b + 96], q.0[b + 97],
q.0[b + 112], q.0[b + 113],
);
}

q ^= &r;
q
}
}

impl Default for Block {
@@ -132,12 +413,43 @@ impl BitXor<&Block> for Block {

impl BitXorAssign<&Block> for Block {
fn bitxor_assign(&mut self, rhs: &Block) {
-        for (dst, src) in self.0.iter_mut().zip(rhs.0.iter().copied()) {
+        for (dst, src) in self.0.iter_mut().zip(rhs.0.iter()) {
*dst ^= src;
}
}
}

/// WIP: experimental AVX2 helper that mixes one 16-word chunk of a block
/// (not called from `compress` yet).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn avx2_permute_chunk(chunk: &mut [u64; 16]) {
use core::arch::x86_64::*;

let a = _mm256_loadu_si256(chunk.as_ptr() as *const __m256i);
let b = _mm256_loadu_si256((chunk.as_ptr() as *const __m256i).offset(1));

let a_lo = _mm256_unpacklo_epi64(a, b);
let a_hi = _mm256_unpackhi_epi64(a, b);

let prod1 = _mm256_mul_epu32(a_lo, a_hi);
let prod2 = _mm256_mul_epu32(
_mm256_shuffle_epi32(a_lo, 0xB1),
_mm256_shuffle_epi32(a_hi, 0xB1),
);

let sum1 = _mm256_add_epi64(a_lo, a_hi);
let sum2 = _mm256_add_epi64(
_mm256_and_si256(prod1, _mm256_set1_epi64x(TRUNC as i64)),
_mm256_and_si256(prod2, _mm256_set1_epi64x(TRUNC as i64)),
);

let res1 = _mm256_add_epi64(sum1, sum2);
let res2 = _mm256_xor_si256(
_mm256_loadu_si256(chunk.as_ptr() as *const __m256i),
_mm256_srli_epi64(res1, 32),
);

_mm256_storeu_si256(chunk.as_mut_ptr() as *mut __m256i, res2);
}

#[cfg(feature = "zeroize")]
impl Zeroize for Block {
fn zeroize(&mut self) {
2 changes: 1 addition & 1 deletion benches/src/argon2.rs
@@ -82,7 +82,7 @@ fn bench_vary_p(c: &mut Criterion) {

criterion_group!(
name = benches;
-    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    config = Criterion::default().with_profiler(PProfProfiler::new(300, Output::Flamegraph(None)));
targets =
bench_default_params,
bench_vary_m,
