briansmith · briansmith · Sep 16, 2023 · Jun 1, 2022 · Sep 14, 2023
diff --git a/build.rs b/build.rs
@@ -952,7 +952,6 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "aes_nohw_set_encrypt_key",
         "aesni_gcm_decrypt",
         "aesni_gcm_encrypt",
-        "bn_from_montgomery",
         "bn_from_montgomery_in_place",
         "bn_gather5",
         "bn_mul_mont",

diff --git a/crypto/fipsmodule/bn/asm/x86_64-mont5.pl b/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
@@ -2088,194 +2088,6 @@
 .size	__bn_post4x_internal,.-__bn_post4x_internal
 ___
 }
-{
-$code.=<<___;
-.globl	bn_from_montgomery
-.type	bn_from_montgomery,\@abi-omnipotent
-.align	32
-bn_from_montgomery:
-.cfi_startproc
-	testl	\$7,`($win64?"48(%rsp)":"%r9d")`
-	jz	bn_from_mont8x
-	xor	%eax,%eax
-	ret
-.cfi_endproc
-.size	bn_from_montgomery,.-bn_from_montgomery
-
-.type	bn_from_mont8x,\@function,6
-.align	32
-bn_from_mont8x:
-.cfi_startproc
-	.byte	0x67
-	mov	%rsp,%rax
-.cfi_def_cfa_register	%rax
-	push	%rbx
-.cfi_push	%rbx
-	push	%rbp
-.cfi_push	%rbp
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-.Lfrom_prologue:
-
-	shl	\$3,${num}d		# convert $num to bytes
-	lea	($num,$num,2),%r10	# 3*$num in bytes
-	neg	$num
-	mov	($n0),$n0		# *n0
-
-	##############################################################
-	# Ensure that stack frame doesn't alias with $rptr+3*$num
-	# modulo 4096, which covers ret[num], am[num] and n[num]
-	# (see bn_exp.c). The stack is allocated to aligned with
-	# bn_power5's frame, and as bn_from_montgomery happens to be
-	# last operation, we use the opportunity to cleanse it.
-	#
-	lea	-320(%rsp,$num,2),%r11
-	mov	%rsp,%rbp
-	sub	$rptr,%r11
-	and	\$4095,%r11
-	cmp	%r11,%r10
-	jb	.Lfrom_sp_alt
-	sub	%r11,%rbp		# align with $aptr
-	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
-	jmp	.Lfrom_sp_done
-
-.align	32
-.Lfrom_sp_alt:
-	lea	4096-320(,$num,2),%r10
-	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
-	sub	%r10,%r11
-	mov	\$0,%r10
-	cmovc	%r10,%r11
-	sub	%r11,%rbp
-.Lfrom_sp_done:
-	and	\$-64,%rbp
-	mov	%rsp,%r11
-	sub	%rbp,%r11
-	and	\$-4096,%r11
-	lea	(%rbp,%r11),%rsp
-	mov	(%rsp),%r10
-	cmp	%rbp,%rsp
-	ja	.Lfrom_page_walk
-	jmp	.Lfrom_page_walk_done
-
-.Lfrom_page_walk:
-	lea	-4096(%rsp),%rsp
-	mov	(%rsp),%r10
-	cmp	%rbp,%rsp
-	ja	.Lfrom_page_walk
-.Lfrom_page_walk_done:
-
-	mov	$num,%r10
-	neg	$num
-
-	##############################################################
-	# Stack layout
-	#
-	# +0	saved $num, used in reduction section
-	# +8	&t[2*$num], used in reduction section
-	# +32	saved *n0
-	# +40	saved %rsp
-	# +48	t[2*$num]
-	#
-	mov	$n0,  32(%rsp)
-	mov	%rax, 40(%rsp)		# save original %rsp
-.cfi_cfa_expression	%rsp+40,deref,+8
-.Lfrom_body:
-	mov	$num,%r11
-	lea	48(%rsp),%rax
-	pxor	%xmm0,%xmm0
-	jmp	.Lmul_by_1
-
-.align	32
-.Lmul_by_1:
-	movdqu	($aptr),%xmm1
-	movdqu	16($aptr),%xmm2
-	movdqu	32($aptr),%xmm3
-	movdqa	%xmm0,(%rax,$num)
-	movdqu	48($aptr),%xmm4
-	movdqa	%xmm0,16(%rax,$num)
-	.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# lea	64($aptr),$aptr
-	movdqa	%xmm1,(%rax)
-	movdqa	%xmm0,32(%rax,$num)
-	movdqa	%xmm2,16(%rax)
-	movdqa	%xmm0,48(%rax,$num)
-	movdqa	%xmm3,32(%rax)
-	movdqa	%xmm4,48(%rax)
-	lea	64(%rax),%rax
-	sub	\$64,%r11
-	jnz	.Lmul_by_1
-
-	movq	$rptr,%xmm1
-	movq	$nptr,%xmm2
-	.byte	0x67
-	mov	$nptr,%rbp
-	movq	%r10, %xmm3		# -num
-___
-$code.=<<___ if ($addx);
-	leaq	OPENSSL_ia32cap_P(%rip),%r11
-	mov	8(%r11),%r11d
-	and	\$0x80108,%r11d
-	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
-	jne	.Lfrom_mont_nox
-
-	lea	(%rax,$num),$rptr
-	call	__bn_sqrx8x_reduction
-	call	__bn_postx4x_internal
-
-	pxor	%xmm0,%xmm0
-	lea	48(%rsp),%rax
-	jmp	.Lfrom_mont_zero
-
-.align	32
-.Lfrom_mont_nox:
-___
-$code.=<<___;
-	call	__bn_sqr8x_reduction
-	call	__bn_post4x_internal
-
-	pxor	%xmm0,%xmm0
-	lea	48(%rsp),%rax
-	jmp	.Lfrom_mont_zero
-
-.align	32
-.Lfrom_mont_zero:
-	mov	40(%rsp),%rsi		# restore %rsp
-.cfi_def_cfa	%rsi,8
-	movdqa	%xmm0,16*0(%rax)
-	movdqa	%xmm0,16*1(%rax)
-	movdqa	%xmm0,16*2(%rax)
-	movdqa	%xmm0,16*3(%rax)
-	lea	16*4(%rax),%rax
-	sub	\$32,$num
-	jnz	.Lfrom_mont_zero
-
-	mov	\$1,%rax
-	mov	-48(%rsi),%r15
-.cfi_restore	%r15
-	mov	-40(%rsi),%r14
-.cfi_restore	%r14
-	mov	-32(%rsi),%r13
-.cfi_restore	%r13
-	mov	-24(%rsi),%r12
-.cfi_restore	%r12
-	mov	-16(%rsi),%rbp
-.cfi_restore	%rbp
-	mov	-8(%rsi),%rbx
-.cfi_restore	%rbx
-	lea	(%rsi),%rsp
-.cfi_def_cfa_register	%rsp
-.Lfrom_epilogue:
-	ret
-.cfi_endproc
-.size	bn_from_mont8x,.-bn_from_mont8x
-___
-}
 }}}
 
 if ($addx) {{{
@@ -3864,10 +3676,6 @@
 	.rva	.LSEH_begin_bn_power5
 	.rva	.LSEH_end_bn_power5
 	.rva	.LSEH_info_bn_power5
-
-	.rva	.LSEH_begin_bn_from_mont8x
-	.rva	.LSEH_end_bn_from_mont8x
-	.rva	.LSEH_info_bn_from_mont8x
 ___
 $code.=<<___ if ($addx);
 	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
@@ -3899,11 +3707,6 @@
 	.byte	9,0,0,0
 	.rva	mul_handler
 	.rva	.Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue	# HandlerData[]
-.align	8
-.LSEH_info_bn_from_mont8x:
-	.byte	9,0,0,0
-	.rva	mul_handler
-	.rva	.Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
 ___
 $code.=<<___ if ($addx);
 .align	8

diff --git a/src/arithmetic/bigint.rs b/src/arithmetic/bigint.rs
@@ -146,29 +146,28 @@ impl<M, E> Elem<M, E> {
     }
 }
 
-impl<M, E: ReductionEncoding> Elem<M, E> {
-    fn decode_once(self, m: &Modulus<M>) -> Elem<M, <E as ReductionEncoding>::Output> {
-        // A multiplication isn't required since we're multiplying by the
-        // unencoded value one (1); only a Montgomery reduction is needed.
-        // However the only non-multiplication Montgomery reduction function we
-        // have requires the input to be large, so we avoid using it here.
-        let mut limbs = self.limbs;
-        let num_limbs = m.width().num_limbs;
-        let mut one = [0; MODULUS_MAX_LIMBS];
-        one[0] = 1;
-        let one = &one[..num_limbs]; // assert!(num_limbs <= MODULUS_MAX_LIMBS);
-        limbs_mont_mul(&mut limbs, one, m.limbs(), m.n0(), m.cpu_features());
-        Elem {
-            limbs,
-            encoding: PhantomData,
-        }
+/// Montgomery-reduces `limbs` by Montgomery-multiplying them by the unencoded value one (1). The
+/// input is assumed to be Montgomery-encoded ('R') and to have as many limbs as `m` (checked in
+/// debug builds only), but it need not be reduced mod `m`. The result is fully reduced mod `m`.
+fn from_montgomery_amm<M>(limbs: BoxedLimbs<M>, m: &Modulus<M>) -> Elem<M, Unencoded> {
+    debug_assert_eq!(limbs.len(), m.limbs().len());
+
+    let mut limbs = limbs;
+    let num_limbs = m.width().num_limbs;
+    let mut one = [0; MODULUS_MAX_LIMBS];
+    one[0] = 1;
+    let one = &one[..num_limbs]; // assert!(num_limbs <= MODULUS_MAX_LIMBS);
+    limbs_mont_mul(&mut limbs, one, m.limbs(), m.n0(), m.cpu_features());
+    Elem {
+        limbs,
+        encoding: PhantomData,
     }
 }
 
 impl<M> Elem<M, R> {
     #[inline]
     pub fn into_unencoded(self, m: &Modulus<M>) -> Elem<M, Unencoded> {
-        self.decode_once(m)
+        from_montgomery_amm(self.limbs, m)
     }
 }
 
@@ -623,7 +622,13 @@ pub fn elem_exp_consttime<M>(
         limbs_mont_square(acc, m, n0, cpu_features);
     }
 
-    fn gather_mul_base(table: &[Limb], state: &mut [Limb], n0: &N0, i: Window, num_limbs: usize) {
+    fn gather_mul_base_amm(
+        table: &[Limb],
+        state: &mut [Limb],
+        n0: &N0,
+        i: Window,
+        num_limbs: usize,
+    ) {
         prefixed_extern! {
             fn bn_mul_mont_gather5(
                 rp: *mut Limb,
@@ -648,7 +653,7 @@ pub fn elem_exp_consttime<M>(
         }
     }
 
-    fn power(table: &[Limb], state: &mut [Limb], n0: &N0, i: Window, num_limbs: usize) {
+    fn power_amm(table: &[Limb], state: &mut [Limb], n0: &N0, i: Window, num_limbs: usize) {
         prefixed_extern! {
             fn bn_power5(
                 r: *mut Limb,
@@ -690,7 +695,7 @@ pub fn elem_exp_consttime<M>(
             // TODO: Optimize this to avoid gathering
             gather_square(table, state, m.n0(), i / 2, num_limbs, cpu_features);
         } else {
-            gather_mul_base(table, state, m.n0(), i - 1, num_limbs)
+            gather_mul_base_amm(table, state, m.n0(), i - 1, num_limbs)
         };
         scatter(table, state, i, num_limbs);
     }
@@ -702,37 +707,15 @@ pub fn elem_exp_consttime<M>(
             state
         },
         |state, window| {
-            power(table, state, m.n0(), window, num_limbs);
+            power_amm(table, state, m.n0(), window, num_limbs);
             state
         },
     );
 
-    prefixed_extern! {
-        fn bn_from_montgomery(
-            r: *mut Limb,
-            a: *const Limb,
-            not_used: *const Limb,
-            n: *const Limb,
-            n0: &N0,
-            num: c::size_t,
-        ) -> bssl::Result;
-    }
-    Result::from(unsafe {
-        bn_from_montgomery(
-            entry_mut(state, ACC, num_limbs).as_mut_ptr(),
-            entry(state, ACC, num_limbs).as_ptr(),
-            core::ptr::null(),
-            entry(state, M, num_limbs).as_ptr(),
-            m.n0(),
-            num_limbs,
-        )
-    })?;
-    let mut r = Elem {
-        limbs: base.limbs,
-        encoding: PhantomData,
-    };
-    r.limbs.copy_from_slice(entry(state, ACC, num_limbs));
-    Ok(r)
+    let mut r_amm = base.limbs;
+    r_amm.copy_from_slice(entry(state, ACC, num_limbs));
+
+    Ok(from_montgomery_amm(r_amm, m))
 }
 
 /// Verified a == b**-1 (mod m), i.e. a**-1 == b (mod m).