Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge BoringSSL 13c9d5c69d04485a7a8840c12185c832026c8315 #1643

Merged
merged 2 commits into from
Sep 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -952,7 +952,6 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
"aes_nohw_set_encrypt_key",
"aesni_gcm_decrypt",
"aesni_gcm_encrypt",
"bn_from_montgomery",
"bn_from_montgomery_in_place",
"bn_gather5",
"bn_mul_mont",
Expand Down
197 changes: 0 additions & 197 deletions crypto/fipsmodule/bn/asm/x86_64-mont5.pl
Original file line number Diff line number Diff line change
Expand Up @@ -2088,194 +2088,6 @@
.size __bn_post4x_internal,.-__bn_post4x_internal
___
}
{
$code.=<<___;
.globl bn_from_montgomery
.type bn_from_montgomery,\@abi-omnipotent
.align 32
bn_from_montgomery:
.cfi_startproc
testl \$7,`($win64?"48(%rsp)":"%r9d")`
jz bn_from_mont8x
xor %eax,%eax
ret
.cfi_endproc
.size bn_from_montgomery,.-bn_from_montgomery

.type bn_from_mont8x,\@function,6
.align 32
bn_from_mont8x:
.cfi_startproc
.byte 0x67
mov %rsp,%rax
.cfi_def_cfa_register %rax
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
.Lfrom_prologue:

shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num
mov ($n0),$n0 # *n0

##############################################################
# Ensure that stack frame doesn't alias with $rptr+3*$num
# modulo 4096, which covers ret[num], am[num] and n[num]
# (see bn_exp.c). The stack is allocated to align with
# bn_power5's frame, and as bn_from_montgomery happens to be
# the last operation, we use the opportunity to cleanse it.
#
lea -320(%rsp,$num,2),%r11
mov %rsp,%rbp
sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lfrom_sp_alt
sub %r11,%rbp # align with $aptr
lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lfrom_sp_done

.align 32
.Lfrom_sp_alt:
lea 4096-320(,$num,2),%r10
lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
sub %r11,%rbp
.Lfrom_sp_done:
and \$-64,%rbp
mov %rsp,%r11
sub %rbp,%r11
and \$-4096,%r11
lea (%rbp,%r11),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lfrom_page_walk
jmp .Lfrom_page_walk_done

.Lfrom_page_walk:
lea -4096(%rsp),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lfrom_page_walk
.Lfrom_page_walk_done:

mov $num,%r10
neg $num

##############################################################
# Stack layout
#
# +0 saved $num, used in reduction section
# +8 &t[2*$num], used in reduction section
# +32 saved *n0
# +40 saved %rsp
# +48 t[2*$num]
#
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
.cfi_cfa_expression %rsp+40,deref,+8
.Lfrom_body:
mov $num,%r11
lea 48(%rsp),%rax
pxor %xmm0,%xmm0
jmp .Lmul_by_1

.align 32
.Lmul_by_1:
movdqu ($aptr),%xmm1
movdqu 16($aptr),%xmm2
movdqu 32($aptr),%xmm3
movdqa %xmm0,(%rax,$num)
movdqu 48($aptr),%xmm4
movdqa %xmm0,16(%rax,$num)
.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr
movdqa %xmm1,(%rax)
movdqa %xmm0,32(%rax,$num)
movdqa %xmm2,16(%rax)
movdqa %xmm0,48(%rax,$num)
movdqa %xmm3,32(%rax)
movdqa %xmm4,48(%rax)
lea 64(%rax),%rax
sub \$64,%r11
jnz .Lmul_by_1

movq $rptr,%xmm1
movq $nptr,%xmm2
.byte 0x67
mov $nptr,%rbp
movq %r10, %xmm3 # -num
___
$code.=<<___ if ($addx);
leaq OPENSSL_ia32cap_P(%rip),%r11
mov 8(%r11),%r11d
and \$0x80108,%r11d
cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
jne .Lfrom_mont_nox

lea (%rax,$num),$rptr
call __bn_sqrx8x_reduction
call __bn_postx4x_internal

pxor %xmm0,%xmm0
lea 48(%rsp),%rax
jmp .Lfrom_mont_zero

.align 32
.Lfrom_mont_nox:
___
$code.=<<___;
call __bn_sqr8x_reduction
call __bn_post4x_internal

pxor %xmm0,%xmm0
lea 48(%rsp),%rax
jmp .Lfrom_mont_zero

.align 32
.Lfrom_mont_zero:
mov 40(%rsp),%rsi # restore %rsp
.cfi_def_cfa %rsi,8
movdqa %xmm0,16*0(%rax)
movdqa %xmm0,16*1(%rax)
movdqa %xmm0,16*2(%rax)
movdqa %xmm0,16*3(%rax)
lea 16*4(%rax),%rax
sub \$32,$num
jnz .Lfrom_mont_zero

mov \$1,%rax
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
.cfi_restore %r14
mov -32(%rsi),%r13
.cfi_restore %r13
mov -24(%rsi),%r12
.cfi_restore %r12
mov -16(%rsi),%rbp
.cfi_restore %rbp
mov -8(%rsi),%rbx
.cfi_restore %rbx
lea (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lfrom_epilogue:
ret
.cfi_endproc
.size bn_from_mont8x,.-bn_from_mont8x
___
}
}}}

if ($addx) {{{
Expand Down Expand Up @@ -3864,10 +3676,6 @@
.rva .LSEH_begin_bn_power5
.rva .LSEH_end_bn_power5
.rva .LSEH_info_bn_power5

.rva .LSEH_begin_bn_from_mont8x
.rva .LSEH_end_bn_from_mont8x
.rva .LSEH_info_bn_from_mont8x
___
$code.=<<___ if ($addx);
.rva .LSEH_begin_bn_mulx4x_mont_gather5
Expand Down Expand Up @@ -3899,11 +3707,6 @@
.byte 9,0,0,0
.rva mul_handler
.rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_from_mont8x:
.byte 9,0,0,0
.rva mul_handler
.rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[]
___
$code.=<<___ if ($addx);
.align 8
Expand Down
77 changes: 30 additions & 47 deletions src/arithmetic/bigint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,29 +146,28 @@ impl<M, E> Elem<M, E> {
}
}

impl<M, E: ReductionEncoding> Elem<M, E> {
fn decode_once(self, m: &Modulus<M>) -> Elem<M, <E as ReductionEncoding>::Output> {
// A multiplication isn't required since we're multiplying by the
// unencoded value one (1); only a Montgomery reduction is needed.
// However the only non-multiplication Montgomery reduction function we
// have requires the input to be large, so we avoid using it here.
let mut limbs = self.limbs;
let num_limbs = m.width().num_limbs;
let mut one = [0; MODULUS_MAX_LIMBS];
one[0] = 1;
let one = &one[..num_limbs]; // assert!(num_limbs <= MODULUS_MAX_LIMBS);
limbs_mont_mul(&mut limbs, one, m.limbs(), m.n0(), m.cpu_features());
Elem {
limbs,
encoding: PhantomData,
}
/// Does a Montgomery reduction on `limbs` assuming they are Montgomery-encoded ('R') and assuming
/// they are the same size as `m`, but perhaps not reduced mod `m`. The result will be
/// fully reduced mod `m`.
fn from_montgomery_amm<M>(limbs: BoxedLimbs<M>, m: &Modulus<M>) -> Elem<M, Unencoded> {
debug_assert_eq!(limbs.len(), m.limbs().len());

let mut limbs = limbs;
let num_limbs = m.width().num_limbs;
let mut one = [0; MODULUS_MAX_LIMBS];
one[0] = 1;
let one = &one[..num_limbs]; // assert!(num_limbs <= MODULUS_MAX_LIMBS);
limbs_mont_mul(&mut limbs, one, m.limbs(), m.n0(), m.cpu_features());
Elem {
limbs,
encoding: PhantomData,
}
}

impl<M> Elem<M, R> {
#[inline]
pub fn into_unencoded(self, m: &Modulus<M>) -> Elem<M, Unencoded> {
self.decode_once(m)
from_montgomery_amm(self.limbs, m)
}
}

Expand Down Expand Up @@ -623,7 +622,13 @@ pub fn elem_exp_consttime<M>(
limbs_mont_square(acc, m, n0, cpu_features);
}

fn gather_mul_base(table: &[Limb], state: &mut [Limb], n0: &N0, i: Window, num_limbs: usize) {
fn gather_mul_base_amm(
table: &[Limb],
state: &mut [Limb],
n0: &N0,
i: Window,
num_limbs: usize,
) {
prefixed_extern! {
fn bn_mul_mont_gather5(
rp: *mut Limb,
Expand All @@ -648,7 +653,7 @@ pub fn elem_exp_consttime<M>(
}
}

fn power(table: &[Limb], state: &mut [Limb], n0: &N0, i: Window, num_limbs: usize) {
fn power_amm(table: &[Limb], state: &mut [Limb], n0: &N0, i: Window, num_limbs: usize) {
prefixed_extern! {
fn bn_power5(
r: *mut Limb,
Expand Down Expand Up @@ -690,7 +695,7 @@ pub fn elem_exp_consttime<M>(
// TODO: Optimize this to avoid gathering
gather_square(table, state, m.n0(), i / 2, num_limbs, cpu_features);
} else {
gather_mul_base(table, state, m.n0(), i - 1, num_limbs)
gather_mul_base_amm(table, state, m.n0(), i - 1, num_limbs)
};
scatter(table, state, i, num_limbs);
}
Expand All @@ -702,37 +707,15 @@ pub fn elem_exp_consttime<M>(
state
},
|state, window| {
power(table, state, m.n0(), window, num_limbs);
power_amm(table, state, m.n0(), window, num_limbs);
state
},
);

prefixed_extern! {
fn bn_from_montgomery(
r: *mut Limb,
a: *const Limb,
not_used: *const Limb,
n: *const Limb,
n0: &N0,
num: c::size_t,
) -> bssl::Result;
}
Result::from(unsafe {
bn_from_montgomery(
entry_mut(state, ACC, num_limbs).as_mut_ptr(),
entry(state, ACC, num_limbs).as_ptr(),
core::ptr::null(),
entry(state, M, num_limbs).as_ptr(),
m.n0(),
num_limbs,
)
})?;
let mut r = Elem {
limbs: base.limbs,
encoding: PhantomData,
};
r.limbs.copy_from_slice(entry(state, ACC, num_limbs));
Ok(r)
let mut r_amm = base.limbs;
r_amm.copy_from_slice(entry(state, ACC, num_limbs));

Ok(from_montgomery_amm(r_amm, m))
}

/// Verifies that a == b**-1 (mod m), i.e. a**-1 == b (mod m).
Expand Down
Loading
Loading