From 2aafa50867a95099d7b71c31300ce5a9d9f4bc5a Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Wed, 20 Sep 2023 11:52:51 +0200 Subject: [PATCH 1/4] bindings/rust/build.rs: allow cross-compilations in MSVC environment. --- bindings/rust/build.rs | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/bindings/rust/build.rs b/bindings/rust/build.rs index 27e4f862..b2a7f21d 100644 --- a/bindings/rust/build.rs +++ b/bindings/rust/build.rs @@ -5,9 +5,14 @@ extern crate cc; use std::env; use std::path::{Path, PathBuf}; -fn assembly(file_vec: &mut Vec, base_dir: &Path, _arch: &str) { +fn assembly( + file_vec: &mut Vec, + base_dir: &Path, + _arch: &str, + _is_msvc: bool, +) { #[cfg(target_env = "msvc")] - if env::var("CARGO_CFG_TARGET_ENV").unwrap().eq("msvc") { + if _is_msvc { let sfx = match _arch { "x86_64" => "x86_64", "aarch64" => "armv8", @@ -113,8 +118,8 @@ fn main() { } } - #[cfg(target_env = "msvc")] - if env::var("CARGO_CFG_TARGET_POINTER_WIDTH").unwrap().eq("32") + if target_env.eq("msvc") + && env::var("CARGO_CFG_TARGET_POINTER_WIDTH").unwrap().eq("32") && !env::var("CC").is_ok() { match std::process::Command::new("clang-cl") @@ -142,7 +147,12 @@ fn main() { if target_arch.eq("x86_64") || target_arch.eq("aarch64") { let asm_dir = blst_base_dir.join("build"); println!("cargo:rerun-if-changed={}", asm_dir.display()); - assembly(&mut file_vec, &asm_dir, &target_arch); + assembly( + &mut file_vec, + &asm_dir, + &target_arch, + cc.get_compiler().is_like_msvc(), + ); } else { cc.define("__BLST_NO_ASM__", None); } @@ -199,7 +209,7 @@ fn main() { "Cannot compile with both `portable` and `force-adx` features" ), } - if target_env.eq("msvc") { + if target_env.eq("msvc") && cc.get_compiler().is_like_msvc() { cc.flag("-Zl"); } cc.flag_if_supported("-mno-avx") // avoid costly transitions From 1e8b6f8cccd4c05aef0ba76b503d8fb05541b589 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 21 Sep 2023 22:34:20 +0200 Subject: [PATCH 2/4] 
asm/add_mod_384-armv8.pl: make it compile with ARM assembler. --- src/asm/add_mod_384-armv8.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/asm/add_mod_384-armv8.pl b/src/asm/add_mod_384-armv8.pl index 6accdbb1..1ff06f1c 100755 --- a/src/asm/add_mod_384-armv8.pl +++ b/src/asm/add_mod_384-armv8.pl @@ -888,7 +888,7 @@ sub vec_select { .Loop_is_zero_done: dup v1.2d, v0.2d[1] orr v0.16b, v0.16b, v1.16b - mov x1, v0.2d[0] + umov x1, v0.2d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq @@ -923,7 +923,7 @@ sub vec_select { .Loop_is_equal_done: dup v1.2d, v0.2d[1] orr v0.16b, v0.16b, v1.16b - mov x1, v0.2d[0] + umov x1, v0.2d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq From 9c6912e01bd07c28f2dd9c13d7cd437673ac16f8 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 21 Sep 2023 22:36:42 +0200 Subject: [PATCH 3/4] asm/arm-xlate.pl: transliterate pre-processor directives for ARM assembler. --- src/asm/arm-xlate.pl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/asm/arm-xlate.pl b/src/asm/arm-xlate.pl index f8525b49..60264eb7 100755 --- a/src/asm/arm-xlate.pl +++ b/src/asm/arm-xlate.pl @@ -326,6 +326,17 @@ sub expand_line { while(my $line=<>) { + if ($flavour =~ /win/) { + if ($line =~ m/^#\s*(ifdef|ifndef|else|endif)\b(.*)/) { + my ($op, $arg) = ($1, $2); + $op = "if :def:" if ($op eq "ifdef"); + $op = "if :lnot::def:" if ($op eq "ifndef"); + print " ".$op.$arg."\n"; + next; + } + $line =~ s|//.*||; + } + # fix up assembler-specific commentary delimiter $line =~ s/@(?=[\s@])/\;/g if ($flavour =~ /win|coff/); From 393232e16e20401951bffd60d4328621bf555c7c Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 21 Sep 2023 22:43:08 +0200 Subject: [PATCH 4/4] Execute build/refresh.sh. 
--- build/coff/add_mod_384-armv8.S | 4 +- build/elf/add_mod_384-armv8.S | 4 +- build/mach-o/add_mod_384-armv8.S | 4 +- build/win64/add_mod_256-armv8.asm | 20 +- build/win64/add_mod_384-armv8.asm | 12 +- build/win64/ct_inverse_mod_256-armv8.asm | 472 ++++++++++----------- build/win64/ct_inverse_mod_384-armv8.asm | 366 ++++++++-------- build/win64/ct_is_square_mod_384-armv8.asm | 96 ++--- build/win64/div3w-armv8.asm | 56 +-- build/win64/mul_mont_256-armv8.asm | 100 ++--- build/win64/mul_mont_384-armv8.asm | 176 ++++---- build/win64/sha256-armv8.asm | 172 ++++---- 12 files changed, 741 insertions(+), 741 deletions(-) diff --git a/build/coff/add_mod_384-armv8.S b/build/coff/add_mod_384-armv8.S index 2eff0677..d30d4cf3 100644 --- a/build/coff/add_mod_384-armv8.S +++ b/build/coff/add_mod_384-armv8.S @@ -1017,7 +1017,7 @@ vec_is_zero_16x: .Loop_is_zero_done: dup v1.2d, v0.d[1] orr v0.16b, v0.16b, v1.16b - mov x1, v0.d[0] + umov x1, v0.d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq @@ -1048,7 +1048,7 @@ vec_is_equal_16x: .Loop_is_equal_done: dup v1.2d, v0.d[1] orr v0.16b, v0.16b, v1.16b - mov x1, v0.d[0] + umov x1, v0.d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq diff --git a/build/elf/add_mod_384-armv8.S b/build/elf/add_mod_384-armv8.S index 5c18d7fe..88ae99e2 100644 --- a/build/elf/add_mod_384-armv8.S +++ b/build/elf/add_mod_384-armv8.S @@ -963,7 +963,7 @@ vec_is_zero_16x: .Loop_is_zero_done: dup v1.2d, v0.d[1] orr v0.16b, v0.16b, v1.16b - mov x1, v0.d[0] + umov x1, v0.d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq @@ -992,7 +992,7 @@ vec_is_equal_16x: .Loop_is_equal_done: dup v1.2d, v0.d[1] orr v0.16b, v0.16b, v1.16b - mov x1, v0.d[0] + umov x1, v0.d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq diff --git a/build/mach-o/add_mod_384-armv8.S b/build/mach-o/add_mod_384-armv8.S index a62995f2..e5812538 100644 --- a/build/mach-o/add_mod_384-armv8.S +++ b/build/mach-o/add_mod_384-armv8.S @@ -963,7 +963,7 @@ Loop_is_zero: Loop_is_zero_done: dup v1.2d, v0.d[1] orr v0.16b, 
v0.16b, v1.16b - mov x1, v0.d[0] + umov x1, v0.d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq @@ -992,7 +992,7 @@ Loop_is_equal: Loop_is_equal_done: dup v1.2d, v0.d[1] orr v0.16b, v0.16b, v1.16b - mov x1, v0.d[0] + umov x1, v0.d[0] mov x0, #1 cmp x1, #0 csel x0, x0, xzr, eq diff --git a/build/win64/add_mod_256-armv8.asm b/build/win64/add_mod_256-armv8.asm index 8d697518..37b8e4a6 100644 --- a/build/win64/add_mod_256-armv8.asm +++ b/build/win64/add_mod_256-armv8.asm @@ -234,12 +234,12 @@ ldp x4,x5,[x1] ldp x6,x7,[x1,#16] -#ifdef __AARCH64EB__ + if :def: __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 -#endif + endif subs xzr,x8,x4 sbcs xzr,x9,x5 @@ -268,7 +268,7 @@ ldp x10,x11,[x1,#16] ldp x14,x15,[x2,#16] -#ifdef __AARCH64EB__ + if :def: __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 @@ -277,7 +277,7 @@ rev x14,x14 rev x11,x11 rev x15,x15 -#endif + endif adds x8,x8,x12 ldp x4,x5,[x3] @@ -302,12 +302,12 @@ orr x17, x10, x11 orr x16, x16, x17 -#ifdef __AARCH64EB__ + if :def: __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 -#endif + endif stp x8,x9,[x0] stp x10,x11,[x0,#16] @@ -329,7 +329,7 @@ ldp x10,x11,[x1,#16] ldp x14,x15,[x2,#16] -#ifdef __AARCH64EB__ + if :def: __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 @@ -338,7 +338,7 @@ rev x14,x14 rev x11,x11 rev x15,x15 -#endif + endif subs x8,x8,x12 sbcs x9,x9,x13 @@ -361,12 +361,12 @@ orr x17, x10, x11 orr x16, x16, x17 -#ifdef __AARCH64EB__ + if :def: __AARCH64EB__ rev x8,x8 rev x9,x9 rev x10,x10 rev x11,x11 -#endif + endif stp x8,x9,[x0] stp x10,x11,[x0,#16] diff --git a/build/win64/add_mod_384-armv8.asm b/build/win64/add_mod_384-armv8.asm index 4bf703a6..f797e9d9 100644 --- a/build/win64/add_mod_384-armv8.asm +++ b/build/win64/add_mod_384-armv8.asm @@ -154,7 +154,7 @@ adcs x13,x13,x20 and x22,x22,x9 adcs x14,x14,x21 - extr x10,x11,x10,#1 // a[0:5] >>= 1 + extr x10,x11,x10,#1 adcs x15,x15,x22 extr x11,x12,x11,#1 adc x22,xzr,xzr @@ -604,7 +604,7 @@ ldp x8,x9,[x2,#32] add x2,x1,#48 - bl 
__sub_mod_384 // a->re - a->im + bl __sub_mod_384 ldp x16,x17,[x1] ldp x19,x20,[x1,#16] @@ -616,7 +616,7 @@ stp x14,x15,[x0,#32] ldp x14,x15,[x1,#80] - bl __add_mod_384_ab_are_loaded // a->re + a->im + bl __add_mod_384_ab_are_loaded ldr x30,[sp,#8] stp x10,x11,[x0,#48] @@ -744,7 +744,7 @@ and x3,x3,#1 and x1,x1,#2 - orr x0,x1,x3 // pack sign and parity + orr x0,x1,x3 ret ENDP @@ -963,7 +963,7 @@ |$Loop_is_zero_done| dup v1.2d, v0.d[1] orr v0.16b, v0.16b, v1.16b - mov x1, v0.d[0] + umov x1, v0.d[0] mov x0, #1 cmp x1, #0 cseleq x0,x0,xzr @@ -992,7 +992,7 @@ |$Loop_is_equal_done| dup v1.2d, v0.d[1] orr v0.16b, v0.16b, v1.16b - mov x1, v0.d[0] + umov x1, v0.d[0] mov x0, #1 cmp x1, #0 cseleq x0,x0,xzr diff --git a/build/win64/ct_inverse_mod_256-armv8.asm b/build/win64/ct_inverse_mod_256-armv8.asm index a4467904..b5df8b4a 100644 --- a/build/win64/ct_inverse_mod_256-armv8.asm +++ b/build/win64/ct_inverse_mod_256-armv8.asm @@ -17,395 +17,395 @@ ldp x4, x5, [x1,#8*0] ldp x6, x7, [x1,#8*2] - add x1, sp, #16+511 // find closest 512-byte-aligned spot - and x1, x1, #-512 // in the frame... 
+ add x1, sp, #16+511 + and x1, x1, #-512 str x0, [sp] ldp x8, x9, [x2,#8*0] ldp x10, x11, [x2,#8*2] - stp x4, x5, [x1,#8*0] // copy input to |a| + stp x4, x5, [x1,#8*0] stp x6, x7, [x1,#8*2] - stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x8, x9, [x1,#8*4] stp x10, x11, [x1,#8*6] - ////////////////////////////////////////// first iteration + bl |$Lab_approximation_31_256_loaded| - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_256_n_shift_by_31 - str x12,[x0,#8*8] // initialize |u| with |f0| + str x12,[x0,#8*8] - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to dst |b| + mov x12, x14 + mov x13, x15 + add x0, x0, #8*4 bl __smul_256_n_shift_by_31 - str x12, [x0,#8*9] // initialize |v| with |f1| + str x12, [x0,#8*9] + - ////////////////////////////////////////// second iteration - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 bl __ab_approximation_31_256 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| + mov x16, x12 + mov x17, x13 - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| + mov x12, x14 + mov x13, x15 + add x0, x0, #8*4 bl __smul_256_n_shift_by_31 - ldr x8, [x1,#8*8] // |u| - ldr x9, [x1,#8*13] // |v| - madd x4, x16, x8, xzr // |u|*|f0| - madd x4, x17, x9, x4 // |v|*|g0| + ldr x8, [x1,#8*8] + ldr x9, [x1,#8*13] + madd x4, x16, x8, xzr + madd x4, x17, x9, x4 str x4, [x0,#8*4] - asr x5, x4, #63 // sign extension + asr x5, x4, #63 stp x5, x5, [x0,#8*5] stp x5, x5, [x0,#8*7] - madd x4, x12, x8, xzr // |u|*|f1| - madd x4, x13, x9, x4 // |v|*|g1| + madd x4, x12, x8, xzr + madd x4, x13, x9, x4 str x4, [x0,#8*9] - asr x5, x4, #63 // sign extension + asr x5, x4, #63 stp x5, x5, [x0,#8*10] stp x5, x5, [x0,#8*12] - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 bl __ab_approximation_31_256 - eor x0, x1, #256 // pointer to 
dst |a|b|u|v| + eor x0, x1, #256 bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| + mov x16, x12 + mov x17, x13 - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| + mov x12, x14 + mov x13, x15 + add x0, x0, #8*4 bl __smul_256_n_shift_by_31 - add x0, x0, #8*4 // pointer to destination |u| + add x0, x0, #8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| + mov x16, x12 + mov x17, x13 + add x0, x0, #8*5 bl __smul_256x63 adc x22, x22, x23 stp x22, x22, [x0,#8*4] stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 bl __ab_approximation_31_256 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| + mov x16, x12 + mov x17, x13 - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| + mov x12, x14 + mov x13, x15 + add x0, x0, #8*4 bl __smul_256_n_shift_by_31 - add x0, x0, #8*4 // pointer to destination |u| + add x0, x0, #8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| + mov x16, x12 + mov x17, x13 + add x0, x0, #8*5 bl __smul_256x63 adc x22, x22, x23 stp x22, x22, [x0,#8*4] stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 bl __ab_approximation_31_256 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| + mov x16, x12 + mov x17, x13 - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| + mov x12, x14 + mov x13, x15 + add x0, x0, #8*4 bl __smul_256_n_shift_by_31 - add x0, x0, #8*4 
// pointer to destination |u| + add x0, x0, #8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| + mov x16, x12 + mov x17, x13 + add x0, x0, #8*5 bl __smul_256x63 adc x22, x22, x23 stp x22, x22, [x0,#8*4] stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 bl __ab_approximation_31_256 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| + mov x16, x12 + mov x17, x13 - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| + mov x12, x14 + mov x13, x15 + add x0, x0, #8*4 bl __smul_256_n_shift_by_31 - add x0, x0, #8*4 // pointer to destination |u| + add x0, x0, #8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| + mov x16, x12 + mov x17, x13 + add x0, x0, #8*5 bl __smul_256x63 adc x22, x22, x23 stp x22, x22, [x0,#8*4] stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 bl __ab_approximation_31_256 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| + mov x16, x12 + mov x17, x13 - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| + mov x12, x14 + mov x13, x15 + add x0, x0, #8*4 bl __smul_256_n_shift_by_31 - add x0, x0, #8*4 // pointer to destination |u| + add x0, x0, #8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| + mov x16, x12 + mov x17, x13 + add x0, x0, #8*5 bl __smul_256x63 adc x22, x22, x23 stp x22, x22, [x0,#8*4] stp x22, x22, [x0,#8*6] 
- eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 bl __ab_approximation_31_256 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| + mov x16, x12 + mov x17, x13 - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| + mov x12, x14 + mov x13, x15 + add x0, x0, #8*4 bl __smul_256_n_shift_by_31 - add x0, x0, #8*4 // pointer to destination |u| + add x0, x0, #8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| + mov x16, x12 + mov x17, x13 + add x0, x0, #8*5 bl __smul_256x63 adc x22, x22, x23 stp x22, x22, [x0,#8*4] stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 bl __ab_approximation_31_256 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| + mov x16, x12 + mov x17, x13 - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| + mov x12, x14 + mov x13, x15 + add x0, x0, #8*4 bl __smul_256_n_shift_by_31 - add x0, x0, #8*4 // pointer to destination |u| + add x0, x0, #8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| + mov x16, x12 + mov x17, x13 + add x0, x0, #8*5 bl __smul_256x63 bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 bl __ab_approximation_31_256 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| + mov x16, x12 + mov x17, x13 - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| + mov x12, x14 
+ mov x13, x15 + add x0, x0, #8*4 bl __smul_256_n_shift_by_31 - add x0, x0, #8*4 // pointer to destination |u| + add x0, x0, #8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| + mov x16, x12 + mov x17, x13 + add x0, x0, #8*5 bl __smul_256x63 bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 bl __ab_approximation_31_256 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| + mov x16, x12 + mov x17, x13 - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| + mov x12, x14 + mov x13, x15 + add x0, x0, #8*4 bl __smul_256_n_shift_by_31 - add x0, x0, #8*4 // pointer to destination |u| + add x0, x0, #8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| + mov x16, x12 + mov x17, x13 + add x0, x0, #8*5 bl __smul_256x63 bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 bl __ab_approximation_31_256 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| + mov x16, x12 + mov x17, x13 - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| + mov x12, x14 + mov x13, x15 + add x0, x0, #8*4 bl __smul_256_n_shift_by_31 - add x0, x0, #8*4 // pointer to destination |u| + add x0, x0, #8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| + mov x16, x12 + mov x17, x13 + add x0, x0, #8*5 bl __smul_256x63 bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| + 
eor x1, x1, #256 bl __ab_approximation_31_256 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| + mov x16, x12 + mov x17, x13 - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| + mov x12, x14 + mov x13, x15 + add x0, x0, #8*4 bl __smul_256_n_shift_by_31 - add x0, x0, #8*4 // pointer to destination |u| + add x0, x0, #8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| + mov x16, x12 + mov x17, x13 + add x0, x0, #8*5 bl __smul_256x63 bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 bl __ab_approximation_31_256 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| + mov x16, x12 + mov x17, x13 - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| + mov x12, x14 + mov x13, x15 + add x0, x0, #8*4 bl __smul_256_n_shift_by_31 - add x0, x0, #8*4 // pointer to destination |u| + add x0, x0, #8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| + mov x16, x12 + mov x17, x13 + add x0, x0, #8*5 bl __smul_256x63 bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 bl __ab_approximation_31_256 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| + mov x16, x12 + mov x17, x13 - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| + mov x12, x14 + mov x13, x15 + add x0, x0, #8*4 bl __smul_256_n_shift_by_31 - add x0, x0, #8*4 // pointer 
to destination |u| + add x0, x0, #8*4 bl __smul_256x63 adc x22, x22, x23 str x22, [x0,#8*4] - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| + mov x16, x12 + mov x17, x13 + add x0, x0, #8*5 bl __smul_256x63 bl __smul_512x63_tail - ////////////////////////////////////////// two[!] last iterations - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #47 // 31 + 512 % 31 - //bl __ab_approximation_62_256 // |a| and |b| are exact, - ldr x7, [x1,#8*0] // just load + + eor x1, x1, #256 + mov x2, #47 + + ldr x7, [x1,#8*0] ldr x11, [x1,#8*4] bl __inner_loop_62_256 mov x16, x14 mov x17, x15 - ldr x0, [sp] // original out_ptr + ldr x0, [sp] bl __smul_256x63 bl __smul_512x63_tail ldr x30, [x29,#8] - smulh x20, x7, x17 // figure out top-most limb + smulh x20, x7, x17 ldp x8, x9, [x3,#8*0] adc x23, x23, x25 ldp x10, x11, [x3,#8*2] - add x20, x20, x23 // x20 is 1, 0 or -1 - asr x19, x20, #63 // sign as mask + add x20, x20, x23 + asr x19, x20, #63 - and x23, x8, x19 // add mod<<256 conditionally + and x23, x8, x19 and x24, x9, x19 adds x4, x4, x23 and x25, x10, x19 @@ -413,18 +413,18 @@ and x26, x11, x19 adcs x6, x6, x25 adcs x7, x22, x26 - adc x20, x20, xzr // x20 is 1, 0 or -1 + adc x20, x20, xzr neg x19, x20 - orr x20, x20, x19 // excess bit or sign as mask - asr x19, x19, #63 // excess bit as mask + orr x20, x20, x19 + asr x19, x19, #63 - and x8, x8, x20 // mask |mod| + and x8, x8, x20 and x9, x9, x20 and x10, x10, x20 and x11, x11, x20 - eor x8, x8, x19 // conditionally negate |mod| + eor x8, x8, x19 eor x9, x9, x19 adds x8, x8, x19, lsr#63 eor x10, x10, x19 @@ -433,7 +433,7 @@ adcs x10, x10, xzr adc x11, x11, xzr - adds x4, x4, x8 // final adjustment for |mod|<<256 + adds x4, x4, x8 adcs x5, x5, x9 adcs x6, x6, x10 stp x4, x5, [x0,#8*4] @@ -450,17 +450,17 @@ ret ENDP -//////////////////////////////////////////////////////////////////////// + ALIGN 32 |__smul_256x63| PROC - ldp x4, x5, [x1,#8*0+64] // load 
|u| (or |v|) - asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x4, x5, [x1,#8*0+64] + asr x14, x16, #63 ldp x6, x7, [x1,#8*2+64] - eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + eor x16, x16, x14 ldr x22, [x1,#8*4+64] - eor x4, x4, x14 // conditionally negate |u| (or |v|) + eor x4, x4, x14 sub x16, x16, x14 eor x5, x5, x14 adds x4, x4, x14, lsr#63 @@ -484,13 +484,13 @@ adcs x6, x6, x20 adcs x24, x24, x21 adc x26, xzr, xzr - ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) - asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x8, x9, [x1,#8*0+104] + asr x14, x17, #63 ldp x10, x11, [x1,#8*2+104] - eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + eor x17, x17, x14 ldr x23, [x1,#8*4+104] - eor x8, x8, x14 // conditionally negate |u| (or |v|) + eor x8, x8, x14 sub x17, x17, x14 eor x9, x9, x14 adds x8, x8, x14, lsr#63 @@ -504,7 +504,7 @@ umulh x20, x9, x17 adcs x23, x23, xzr umulh x21, x10, x17 - adc x15, xzr, xzr // used in __smul_512x63_tail + adc x15, xzr, xzr mul x8, x8, x17 cmp x17, #0 mul x9, x9, x17 @@ -530,17 +530,17 @@ ALIGN 32 |__smul_512x63_tail| PROC umulh x24, x7, x16 - ldp x5, x6, [x1,#8*18] // load rest of |v| + ldp x5, x6, [x1,#8*18] adc x26, x26, xzr ldr x7, [x1,#8*20] and x22, x22, x16 - umulh x11, x11, x17 // resume |v|*|g1| chain + umulh x11, x11, x17 - sub x24, x24, x22 // tie up |u|*|f1| chain + sub x24, x24, x22 asr x25, x24, #63 - eor x5, x5, x14 // conditionally negate rest of |v| + eor x5, x5, x14 eor x6, x6, x14 adds x5, x5, x15 eor x7, x7, x14 @@ -559,13 +559,13 @@ mul x22, x7, x17 adcs x6, x6, x20 adcs x22, x22, x21 - adc x23, xzr, xzr // used in the final step + adc x23, xzr, xzr adds x4, x4, x24 adcs x5, x5, x25 adcs x6, x6, x25 stp x4, x5, [x0,#8*4] - adcs x22, x22, x25 // carry is used in the final step + adcs x22, x22, x25 stp x6, x22, [x0,#8*6] ret @@ -574,12 +574,12 @@ ALIGN 32 |__smul_256_n_shift_by_31| PROC - ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) - asr x24, x12, #63 // |f0|'s sign as mask 
(or |g0|'s) + ldp x4, x5, [x1,#8*0+0] + asr x24, x12, #63 ldp x6, x7, [x1,#8*2+0] - eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + eor x25, x12, x24 - eor x4, x4, x24 // conditionally negate |a| (or |b|) + eor x4, x4, x24 sub x25, x25, x24 eor x5, x5, x24 adds x4, x4, x24, lsr#63 @@ -603,12 +603,12 @@ adcs x6, x6, x20 adcs x7, x7, x21 adc x22, x22, x24 - ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) - asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x8, x9, [x1,#8*0+32] + asr x24, x13, #63 ldp x10, x11, [x1,#8*2+32] - eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + eor x25, x13, x24 - eor x8, x8, x24 // conditionally negate |a| (or |b|) + eor x8, x8, x24 sub x25, x25, x24 eor x9, x9, x24 adds x8, x8, x24, lsr#63 @@ -641,10 +641,10 @@ extr x4, x5, x4, #31 extr x5, x6, x5, #31 extr x6, x7, x6, #31 - asr x23, x8, #63 // result's sign as mask + asr x23, x8, #63 extr x7, x8, x7, #31 - eor x4, x4, x23 // ensure the result is positive + eor x4, x4, x23 eor x5, x5, x23 adds x4, x4, x23, lsr#63 eor x6, x6, x23 @@ -655,7 +655,7 @@ adc x7, x7, xzr stp x6, x7, [x0,#8*2] - eor x12, x12, x23 // adjust |f/g| accordingly + eor x12, x12, x23 eor x13, x13, x23 sub x12, x12, x23 sub x13, x13, x23 @@ -671,19 +671,19 @@ ldp x8, x9, [x1,#8*4] |$Lab_approximation_31_256_loaded| - orr x19, x7, x11 // check top-most limbs, ... + orr x19, x7, x11 cmp x19, #0 cselne x7,x7,x6 cselne x11,x11,x10 cselne x6,x6,x5 - orr x19, x7, x11 // and ones before top-most, ... + orr x19, x7, x11 cselne x10,x10,x9 cmp x19, #0 cselne x7,x7,x6 cselne x11,x11,x10 cselne x6,x6,x4 - orr x19, x7, x11 // and one more, ... 
+ orr x19, x7, x11 cselne x10,x10,x8 clz x19, x19 @@ -693,7 +693,7 @@ cselne x11,x11,x10 neg x20, x19 - lslv x7, x7, x19 // align high limbs to the left + lslv x7, x7, x19 lslv x11, x11, x19 lsrv x6, x6, x20 lsrv x10, x10, x20 @@ -713,16 +713,16 @@ ALIGN 16 |__inner_loop_31_256| PROC mov x2, #31 - mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 - mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x13, #0x7FFFFFFF80000000 + mov x15, #0x800000007FFFFFFF mov x23,#0x7FFFFFFF7FFFFFFF |$Loop_31_256| - sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sbfx x22, x7, #0, #1 sub x2, x2, #1 and x19, x11, x22 - sub x20, x11, x7 // |b_|-|a_| - subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + sub x20, x11, x7 + subs x21, x7, x19 mov x19, x15 cselhs x11,x11,x7 cselhs x7,x21,x20 @@ -731,8 +731,8 @@ lsr x7, x7, #1 and x19, x15, x22 and x20, x23, x22 - sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) - add x15, x15, x15 // |f1|<<=1 + sub x13, x13, x19 + add x15, x15, x15 add x13, x13, x20 sub x15, x15, x23 cbnz x2, |$Loop_31_256| @@ -742,7 +742,7 @@ ubfx x13, x13, #32, #32 ubfx x14, x15, #0, #32 ubfx x15, x15, #32, #32 - sub x12, x12, x23 // remove bias + sub x12, x12, x23 sub x13, x13, x23 sub x14, x14, x23 sub x15, x15, x23 @@ -753,17 +753,17 @@ ALIGN 16 |__inner_loop_62_256| PROC - mov x12, #1 // |f0|=1 - mov x13, #0 // |g0|=0 - mov x14, #0 // |f1|=0 - mov x15, #1 // |g1|=1 + mov x12, #1 + mov x13, #0 + mov x14, #0 + mov x15, #1 |$Loop_62_256| - sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sbfx x22, x7, #0, #1 sub x2, x2, #1 and x19, x11, x22 - sub x20, x11, x7 // |b_|-|a_| - subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + sub x20, x11, x7 + subs x21, x7, x19 mov x19, x12 cselhs x11,x11,x7 cselhs x7,x21,x20 @@ -775,10 +775,10 @@ lsr x7, x7, #1 and x19, x14, x22 and x20, x15, x22 - add x14, x14, x14 // |f1|<<=1 - add x15, x15, x15 // |g1|<<=1 - sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was 
even) - sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) + add x14, x14, x14 + add x15, x15, x15 + sub x12, x12, x19 + sub x13, x13, x20 cbnz x2, |$Loop_62_256| ret diff --git a/build/win64/ct_inverse_mod_384-armv8.asm b/build/win64/ct_inverse_mod_384-armv8.asm index 311ce763..27dace27 100644 --- a/build/win64/ct_inverse_mod_384-armv8.asm +++ b/build/win64/ct_inverse_mod_384-armv8.asm @@ -19,301 +19,301 @@ ldp x5, x6, [x1,#8*2] ldp x7, x8, [x1,#8*4] - add x1, sp, #16+511 // find closest 512-byte-aligned spot - and x1, x1, #-512 // in the frame... + add x1, sp, #16+511 + and x1, x1, #-512 stp x0, x3, [sp] ldp x9, x10, [x2,#8*0] ldp x11, x12, [x2,#8*2] ldp x13, x14, [x2,#8*4] - stp x22, x4, [x1,#8*0] // copy input to |a| + stp x22, x4, [x1,#8*0] stp x5, x6, [x1,#8*2] stp x7, x8, [x1,#8*4] - stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x9, x10, [x1,#8*6] stp x11, x12, [x1,#8*8] stp x13, x14, [x1,#8*10] - ////////////////////////////////////////// first iteration + mov x2, #62 bl |$Lab_approximation_62_loaded| - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_383_n_shift_by_62 - str x15,[x0,#8*12] // initialize |u| with |f0| + str x15,[x0,#8*12] - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to dst |b| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*6 bl __smul_383_n_shift_by_62 - str x15, [x0,#8*12] // initialize |v| with |f1| + str x15, [x0,#8*12] - ////////////////////////////////////////// second iteration - eor x1, x1, #256 // flip-flop src |a|b|u|v| + + eor x1, x1, #256 mov x2, #62 bl __ab_approximation_62 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| + mov x20, x15 + mov x21, x16 - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*6 bl __smul_383_n_shift_by_62 - ldr x7, [x1,#8*12] // |u| - ldr x8, 
[x1,#8*18] // |v| - mul x3, x20, x7 // |u|*|f0| + ldr x7, [x1,#8*12] + ldr x8, [x1,#8*18] + mul x3, x20, x7 smulh x4, x20, x7 - mul x5, x21, x8 // |v|*|g0| + mul x5, x21, x8 smulh x6, x21, x8 adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*6] - asr x5, x4, #63 // sign extension + asr x5, x4, #63 stp x5, x5, [x0,#8*8] stp x5, x5, [x0,#8*10] - mul x3, x15, x7 // |u|*|f1| + mul x3, x15, x7 smulh x4, x15, x7 - mul x5, x16, x8 // |v|*|g1| + mul x5, x16, x8 smulh x6, x16, x8 adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*12] - asr x5, x4, #63 // sign extension + asr x5, x4, #63 stp x5, x5, [x0,#8*14] stp x5, x5, [x0,#8*16] - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 mov x2, #62 bl __ab_approximation_62 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| + mov x20, x15 + mov x21, x16 - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*6 bl __smul_383_n_shift_by_62 - add x0, x0, #8*6 // pointer to destination |u| + add x0, x0, #8*6 bl __smul_383x63 - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| + mov x20, x15 + mov x21, x16 + add x0, x0, #8*6 bl __smul_383x63 - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 mov x2, #62 bl __ab_approximation_62 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| + mov x20, x15 + mov x21, x16 - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*6 bl __smul_383_n_shift_by_62 - add x0, x0, #8*6 // pointer to destination |u| + add x0, x0, #8*6 bl __smul_383x63 - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // 
pointer to destination |v| + mov x20, x15 + mov x21, x16 + add x0, x0, #8*6 bl __smul_383x63 - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 mov x2, #62 bl __ab_approximation_62 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| + mov x20, x15 + mov x21, x16 - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*6 bl __smul_383_n_shift_by_62 - add x0, x0, #8*6 // pointer to destination |u| + add x0, x0, #8*6 bl __smul_383x63 - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| + mov x20, x15 + mov x21, x16 + add x0, x0, #8*6 bl __smul_383x63 - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 mov x2, #62 bl __ab_approximation_62 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| + mov x20, x15 + mov x21, x16 - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*6 bl __smul_383_n_shift_by_62 - add x0, x0, #8*6 // pointer to destination |u| + add x0, x0, #8*6 bl __smul_383x63 - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| + mov x20, x15 + mov x21, x16 + add x0, x0, #8*6 bl __smul_383x63 - asr x27, x27, #63 // sign extension + asr x27, x27, #63 stp x27, x27, [x0,#8*6] stp x27, x27, [x0,#8*8] stp x27, x27, [x0,#8*10] - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 mov x2, #62 bl __ab_approximation_62 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| + mov x20, x15 + mov x21, x16 - mov x15, x17 // |f1| 
- mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*6 bl __smul_383_n_shift_by_62 - add x0, x0, #8*6 // pointer to destination |u| + add x0, x0, #8*6 bl __smul_383x63 - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| + mov x20, x15 + mov x21, x16 + add x0, x0, #8*6 bl __smul_383x63 bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 mov x2, #62 bl __ab_approximation_62 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| + mov x20, x15 + mov x21, x16 - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*6 bl __smul_383_n_shift_by_62 - add x0, x0, #8*6 // pointer to destination |u| + add x0, x0, #8*6 bl __smul_383x63 - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| + mov x20, x15 + mov x21, x16 + add x0, x0, #8*6 bl __smul_383x63 bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 mov x2, #62 bl __ab_approximation_62 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| + mov x20, x15 + mov x21, x16 - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*6 bl __smul_383_n_shift_by_62 - add x0, x0, #8*6 // pointer to destination |u| + add x0, x0, #8*6 bl __smul_383x63 - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| + mov x20, x15 + mov x21, x16 + add x0, x0, #8*6 bl __smul_383x63 bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, 
x1, #256 mov x2, #62 bl __ab_approximation_62 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| + mov x20, x15 + mov x21, x16 - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*6 bl __smul_383_n_shift_by_62 - add x0, x0, #8*6 // pointer to destination |u| + add x0, x0, #8*6 bl __smul_383x63 - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| + mov x20, x15 + mov x21, x16 + add x0, x0, #8*6 bl __smul_383x63 bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| + eor x1, x1, #256 mov x2, #62 bl __ab_approximation_62 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| + mov x20, x15 + mov x21, x16 - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*6 bl __smul_383_n_shift_by_62 - add x0, x0, #8*6 // pointer to destination |u| + add x0, x0, #8*6 bl __smul_383x63 - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| + mov x20, x15 + mov x21, x16 + add x0, x0, #8*6 bl __smul_383x63 bl __smul_767x63_tail - ////////////////////////////////////////// iteration before last - eor x1, x1, #256 // flip-flop src |a|b|u|v| + + eor x1, x1, #256 mov x2, #62 - //bl __ab_approximation_62 // |a| and |b| are exact, - ldp x3, x8, [x1,#8*0] // just load + + ldp x3, x8, [x1,#8*0] ldp x9, x14, [x1,#8*6] bl __inner_loop_62 - eor x0, x1, #256 // pointer to dst |a|b|u|v| + eor x0, x1, #256 str x3, [x0,#8*0] str x9, [x0,#8*6] - mov x20, x15 // exact |f0| - mov x21, x16 // exact |g0| + mov x20, x15 + mov x21, x16 mov x15, x17 mov x16, x19 - add x0, x0, #8*12 // 
pointer to dst |u| + add x0, x0, #8*12 bl __smul_383x63 - mov x20, x15 // exact |f1| - mov x21, x16 // exact |g1| - add x0, x0, #8*6 // pointer to dst |v| + mov x20, x15 + mov x21, x16 + add x0, x0, #8*6 bl __smul_383x63 bl __smul_767x63_tail - ////////////////////////////////////////// last iteration - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #22 // 766 % 62 - //bl __ab_approximation_62 // |a| and |b| are exact, - ldr x3, [x1,#8*0] // just load + + eor x1, x1, #256 + mov x2, #22 + + ldr x3, [x1,#8*0] eor x8, x8, x8 ldr x9, [x1,#8*6] eor x14, x14, x14 @@ -321,17 +321,17 @@ mov x20, x17 mov x21, x19 - ldp x0, x15, [sp] // original out_ptr and n_ptr + ldp x0, x15, [sp] bl __smul_383x63 bl __smul_767x63_tail ldr x30, [x29,#8] - asr x22, x8, #63 // sign as mask + asr x22, x8, #63 ldp x9, x10, [x15,#8*0] ldp x11, x12, [x15,#8*2] ldp x13, x14, [x15,#8*4] - and x9, x9, x22 // add mod<<384 conditionally + and x9, x9, x22 and x10, x10, x22 adds x3, x3, x9 and x11, x11, x22 @@ -358,18 +358,18 @@ ret ENDP -//////////////////////////////////////////////////////////////////////// -// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
+ + ALIGN 32 |__smul_383x63| PROC - ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) - asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x3, x4, [x1,#8*0+96] + asr x17, x20, #63 ldp x5, x6, [x1,#8*2+96] - eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + eor x20, x20, x17 ldp x7, x8, [x1,#8*4+96] - eor x3, x3, x17 // conditionally negate |u| (or |v|) + eor x3, x3, x17 sub x20, x20, x17 eor x4, x4, x17 adds x3, x3, x17, lsr#63 @@ -399,13 +399,13 @@ adcs x7, x7, x25 adcs x27,x27,x26 adc x2, xzr, xzr - ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) - asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x9, x10, [x1,#8*0+144] + asr x17, x21, #63 ldp x11, x12, [x1,#8*2+144] - eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + eor x21, x21, x17 ldp x13, x14, [x1,#8*4+144] - eor x9, x9, x17 // conditionally negate |u| (or |v|) + eor x9, x9, x17 sub x21, x21, x17 eor x10, x10, x17 adds x9, x9, x17, lsr#63 @@ -422,7 +422,7 @@ umulh x24, x11, x21 adcs x14, x14, xzr umulh x25, x12, x21 - adc x19, xzr, xzr // used in __smul_767x63_tail + adc x19, xzr, xzr umulh x26, x13, x21 mul x9, x9, x21 mul x10, x10, x21 @@ -446,7 +446,7 @@ stp x5, x6, [x0,#8*2] adcs x27, x27, x28 stp x7, x27, [x0,#8*4] - adc x28, x2, xzr // used in __smul_767x63_tail + adc x28, x2, xzr ret ENDP @@ -455,12 +455,12 @@ ALIGN 32 |__smul_767x63_tail| PROC smulh x27, x8, x20 - ldp x3, x4, [x1,#8*24] // load rest of |v| + ldp x3, x4, [x1,#8*24] umulh x14,x14, x21 ldp x5, x6, [x1,#8*26] ldp x7, x8, [x1,#8*28] - eor x3, x3, x17 // conditionally negate rest of |v| + eor x3, x3, x17 eor x4, x4, x17 eor x5, x5, x17 adds x3, x3, x19 @@ -509,13 +509,13 @@ ALIGN 32 |__smul_383_n_shift_by_62| PROC - ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) - asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x3, x4, [x1,#8*0+0] + asr x28, x15, #63 ldp x5, x6, [x1,#8*2+0] - eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + eor x2, x15, x28 ldp x7, x8, [x1,#8*4+0] - eor x3, x3, x28 // 
conditionally negate |a| (or |b|) + eor x3, x3, x28 sub x2, x2, x28 eor x4, x4, x28 adds x3, x3, x28, lsr#63 @@ -547,13 +547,13 @@ adcs x7, x7, x25 adcs x8, x8 ,x26 adc x27, x27, xzr - ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) - asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x9, x10, [x1,#8*0+48] + asr x28, x16, #63 ldp x11, x12, [x1,#8*2+48] - eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + eor x2, x16, x28 ldp x13, x14, [x1,#8*4+48] - eor x9, x9, x28 // conditionally negate |a| (or |b|) + eor x9, x9, x28 sub x2, x2, x28 eor x10, x10, x28 adds x9, x9, x28, lsr#63 @@ -633,12 +633,12 @@ ldp x11, x12, [x1,#8*8] |$Lab_approximation_62_loaded| - orr x22, x8, x14 // check top-most limbs, ... + orr x22, x8, x14 cmp x22, #0 cselne x8,x8,x7 cselne x14,x14,x13 cselne x7,x7,x6 - orr x22, x8, x14 // ... ones before top-most, ... + orr x22, x8, x14 cselne x13,x13,x12 ldp x3, x4, [x1,#8*0] @@ -648,7 +648,7 @@ cselne x8,x8,x7 cselne x14,x14,x13 cselne x7,x7,x5 - orr x22, x8, x14 // ... and ones before that ... 
+ orr x22, x8, x14 cselne x13,x13,x11 cmp x22, #0 @@ -665,7 +665,7 @@ cselne x14,x14,x13 neg x23, x22 - lslv x8, x8, x22 // align high limbs to the left + lslv x8, x8, x22 lslv x14, x14, x22 lsrv x7, x7, x23 lsrv x13, x13, x23 @@ -680,19 +680,19 @@ ALIGN 16 |__inner_loop_62| PROC - mov x15, #1 // |f0|=1 - mov x16, #0 // |g0|=0 - mov x17, #0 // |f1|=0 - mov x19, #1 // |g1|=1 + mov x15, #1 + mov x16, #0 + mov x17, #0 + mov x19, #1 |$Loop_62| - sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sbfx x28, x3, #0, #1 sub x2, x2, #1 - subs x24, x9, x3 // |b_|-|a_| + subs x24, x9, x3 and x22, x9, x28 sbc x25, x14, x8 and x23, x14, x28 - subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + subs x26, x3, x22 mov x22, x15 sbcs x27, x8, x23 mov x23, x16 @@ -708,10 +708,10 @@ lsr x8, x8, #1 and x22, x17, x28 and x23, x19, x28 - add x17, x17, x17 // |f1|<<=1 - add x19, x19, x19 // |g1|<<=1 - sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + add x17, x17, x17 + add x19, x19, x19 + sub x15, x15, x22 + sub x16, x16, x23 cbnz x2, |$Loop_62| ret diff --git a/build/win64/ct_is_square_mod_384-armv8.asm b/build/win64/ct_is_square_mod_384-armv8.asm index e2454897..90bf90a0 100644 --- a/build/win64/ct_is_square_mod_384-armv8.asm +++ b/build/win64/ct_is_square_mod_384-armv8.asm @@ -15,26 +15,26 @@ stp x27, x28, [sp,#80] sub sp, sp, #512 - ldp x3, x4, [x0,#8*0] // load input + ldp x3, x4, [x0,#8*0] ldp x5, x6, [x0,#8*2] ldp x7, x8, [x0,#8*4] - add x0, sp, #255 // find closest 256-byte-aligned spot - and x0, x0, #-256 // in the frame... 
+ add x0, sp, #255 + and x0, x0, #-256 - ldp x9, x10, [x1,#8*0] // load modulus + ldp x9, x10, [x1,#8*0] ldp x11, x12, [x1,#8*2] ldp x13, x14, [x1,#8*4] - stp x3, x4, [x0,#8*6] // copy input to |a| + stp x3, x4, [x0,#8*6] stp x5, x6, [x0,#8*8] stp x7, x8, [x0,#8*10] - stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x9, x10, [x0,#8*0] stp x11, x12, [x0,#8*2] stp x13, x14, [x0,#8*4] - eor x2, x2, x2 // init the |$Legendre| symbol - mov x15, #24 // 24 is 768/30-1 + eor x2, x2, x2 + mov x15, #24 b |$Loop_is_square| ALIGN 16 @@ -42,26 +42,26 @@ bl __ab_approximation_30 sub x15, x15, #1 - eor x1, x0, #128 // pointer to dst |b| + eor x1, x0, #128 bl __smul_384_n_shift_by_30 - mov x19, x16 // |f0| - mov x20, x17 // |g0| - add x1, x1, #8*6 // pointer to dst |a| + mov x19, x16 + mov x20, x17 + add x1, x1, #8*6 bl __smul_384_n_shift_by_30 ldp x9, x10, [x1,#-8*6] - eor x0, x0, #128 // flip-flop src |a|b| - and x27, x27, x9 // if |a| was negative, - add x2, x2, x27, lsr#1 // adjust |L| + eor x0, x0, #128 + and x27, x27, x9 + add x2, x2, x27, lsr#1 cbnz x15, |$Loop_is_square| - ////////////////////////////////////////// last iteration - //bl __ab_approximation_30 // |a| and |b| are exact, - //ldr x8, [x0,#8*6] // and loaded - //ldr x14, [x0,#8*0] - mov x15, #48 // 48 is 768%30 + 30 + + + + + mov x15, #48 bl __inner_loop_48 ldr x30, [x29,#8] @@ -82,13 +82,13 @@ ALIGN 32 |__smul_384_n_shift_by_30| PROC - ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) - asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x3, x4, [x0,#8*0+0] + asr x27, x20, #63 ldp x5, x6, [x0,#8*2+0] - eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + eor x20, x20, x27 ldp x7, x8, [x0,#8*4+0] - eor x3, x3, x27 // conditionally negate |b| (or |a|) + eor x3, x3, x27 sub x20, x20, x27 eor x4, x4, x27 adds x3, x3, x27, lsr#63 @@ -122,13 +122,13 @@ adcs x7, x7, x24 adcs x8, x8 ,x25 adc x26, x26, x28 - ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) - asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) 
+ ldp x9, x10, [x0,#8*0+48] + asr x27, x19, #63 ldp x11, x12, [x0,#8*2+48] - eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + eor x19, x19, x27 ldp x13, x14, [x0,#8*4+48] - eor x9, x9, x27 // conditionally negate |b| (or |a|) + eor x9, x9, x27 sub x19, x19, x27 eor x10, x10, x27 adds x9, x9, x27, lsr#63 @@ -199,29 +199,29 @@ ALIGN 16 |__ab_approximation_30| PROC - ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x13, x14, [x0,#8*4] ldp x11, x12, [x0,#8*2] - orr x21, x8, x14 // check top-most limbs, ... + orr x21, x8, x14 cmp x21, #0 cselne x8,x8,x7 cselne x14,x14,x13 cselne x7,x7,x6 - orr x21, x8, x14 // ... ones before top-most, ... + orr x21, x8, x14 cselne x13,x13,x12 cmp x21, #0 cselne x8,x8,x7 cselne x14,x14,x13 cselne x7,x7,x5 - orr x21, x8, x14 // ... and ones before that ... + orr x21, x8, x14 cselne x13,x13,x11 cmp x21, #0 cselne x8,x8,x7 cselne x14,x14,x13 cselne x7,x7,x4 - orr x21, x8, x14 // and one more, ... + orr x21, x8, x14 cselne x13,x13,x10 cmp x21, #0 @@ -238,7 +238,7 @@ cselne x14,x14,x13 neg x22, x21 - lslv x8, x8, x21 // align high limbs to the left + lslv x8, x8, x21 lslv x14, x14, x21 lsrv x7, x7, x22 lsrv x13, x13, x22 @@ -258,19 +258,19 @@ ALIGN 16 |__inner_loop_30| PROC mov x28, #30 - mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 - mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x17, #0x7FFFFFFF80000000 + mov x20, #0x800000007FFFFFFF mov x27,#0x7FFFFFFF7FFFFFFF |$Loop_30| - sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + sbfx x24, x8, #0, #1 and x25, x8, x14 sub x28, x28, #1 and x21, x14, x24 - sub x22, x14, x8 // |b_|-|a_| - subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) - add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + sub x22, x14, x8 + subs x23, x8, x21 + add x25, x2, x25, lsr#1 mov x21, x20 cselhs x14,x14,x8 cselhs x8,x23,x22 @@ -281,9 +281,9 @@ and x21, x20, x24 and x22, x27, x24 add x23, x14, #2 - sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) - add x20, x20, 
x20 // |f1|<<=1 - add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + sub x17, x17, x21 + add x20, x20, x20 + add x2, x2, x23, lsr#2 add x17, x17, x22 sub x20, x20, x27 @@ -294,7 +294,7 @@ ubfx x17, x17, #32, #32 ubfx x19, x20, #0, #32 ubfx x20, x20, #32, #32 - sub x16, x16, x27 // remove the bias + sub x16, x16, x27 sub x17, x17, x27 sub x19, x19, x27 sub x20, x20, x27 @@ -305,19 +305,19 @@ ALIGN 16 |__inner_loop_48| PROC |$Loop_48| - sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sbfx x24, x3, #0, #1 and x25, x3, x9 sub x15, x15, #1 and x21, x9, x24 - sub x22, x9, x3 // |b_|-|a_| - subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + sub x22, x9, x3 + subs x23, x3, x21 add x25, x2, x25, lsr#1 cselhs x9,x9,x3 cselhs x3,x23,x22 cselhs x2,x2,x25 add x23, x9, #2 lsr x3, x3, #1 - add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x2, x2, x23, lsr#2 cbnz x15, |$Loop_48| diff --git a/build/win64/div3w-armv8.asm b/build/win64/div3w-armv8.asm index aec90679..c8fe251a 100644 --- a/build/win64/div3w-armv8.asm +++ b/build/win64/div3w-armv8.asm @@ -4,32 +4,32 @@ EXPORT |div_3_limbs|[FUNC] ALIGN 32 |div_3_limbs| PROC - ldp x4,x5,[x0] // load R - eor x0,x0,x0 // Q = 0 - mov x3,#64 // loop counter + ldp x4,x5,[x0] + eor x0,x0,x0 + mov x3,#64 nop |$Loop| - subs x6,x4,x1 // R - D - add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 + add x0,x0,x0 sbcs x7,x5,x2 - add x0,x0,#1 // Q + speculative bit + add x0,x0,#1 csello x4,x4,x6 - extr x1,x2,x1,#1 // D >>= 1 + extr x1,x2,x1,#1 csello x5,x5,x7 lsr x2,x2,#1 - sbc x0,x0,xzr // subtract speculative bit + sbc x0,x0,xzr sub x3,x3,#1 cbnz x3,|$Loop| - asr x3,x0,#63 // top bit -> mask - add x0,x0,x0 // Q <<= 1 - subs x6,x4,x1 // R - D - add x0,x0,#1 // Q + speculative bit + asr x3,x0,#63 + add x0,x0,x0 + subs x6,x4,x1 + add x0,x0,#1 sbcs x7,x5,x2 - sbc x0,x0,xzr // subtract speculative bit + sbc x0,x0,xzr - orr x0,x0,x3 // all ones if overflow + orr x0,x0,x3 ret ENDP @@ -39,32 +39,32 @@ 
|quot_rem_128| PROC ldp x3,x4,[x1] - mul x5,x3,x2 // divisor[0:1} * quotient + mul x5,x3,x2 umulh x6,x3,x2 mul x11, x4,x2 umulh x7,x4,x2 - ldp x8,x9,[x0] // load 3 limbs of the dividend + ldp x8,x9,[x0] ldr x10,[x0,#16] adds x6,x6,x11 adc x7,x7,xzr - subs x8,x8,x5 // dividend - divisor * quotient + subs x8,x8,x5 sbcs x9,x9,x6 sbcs x10,x10,x7 - sbc x5,xzr,xzr // borrow -> mask + sbc x5,xzr,xzr - add x2,x2,x5 // if borrowed, adjust the quotient ... + add x2,x2,x5 and x3,x3,x5 and x4,x4,x5 - adds x8,x8,x3 // ... and add divisor + adds x8,x8,x3 adc x9,x9,x4 - stp x8,x9,[x0] // save 2 limbs of the remainder - str x2,[x0,#16] // and one limb of the quotient + stp x8,x9,[x0] + str x2,[x0,#16] - mov x0,x2 // return adjusted quotient + mov x0,x2 ret ENDP @@ -74,15 +74,15 @@ ALIGN 32 |quot_rem_64| PROC ldr x3,[x1] - ldr x8,[x0] // load 1 limb of the dividend + ldr x8,[x0] - mul x5,x3,x2 // divisor * quotient + mul x5,x3,x2 - sub x8,x8,x5 // dividend - divisor * quotient + sub x8,x8,x5 - stp x8,x2,[x0] // save remainder and quotient + stp x8,x2,[x0] - mov x0,x2 // return quotient + mov x0,x2 ret ENDP diff --git a/build/win64/mul_mont_256-armv8.asm b/build/win64/mul_mont_256-armv8.asm index bb2dfe04..285001fc 100644 --- a/build/win64/mul_mont_256-armv8.asm +++ b/build/win64/mul_mont_256-armv8.asm @@ -28,7 +28,7 @@ umulh x16,x12,x9 umulh x17,x13,x9 adds x20,x20,x14 - //mul x14,x5,x3 + adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 @@ -36,7 +36,7 @@ adc x23,xzr, x17 mul x17,x8,x3 ldr x9,[x2,8*1] - subs xzr,x19,#1 //adds x19,x19,x14 + subs xzr,x19,#1 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 @@ -68,7 +68,7 @@ adc x23,x23,xzr adds x20,x20,x14 - //mul x14,x5,x3 + adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 @@ -76,7 +76,7 @@ adc x23,x23,x17 mul x17,x8,x3 ldr x9,[x2,8*2] - subs xzr,x19,#1 //adds x19,x19,x14 + subs xzr,x19,#1 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 @@ -108,7 +108,7 @@ adc x23,x23,xzr adds x20,x20,x14 - //mul x14,x5,x3 + adcs x21,x21,x15 mul 
x15,x6,x3 adcs x22,x22,x16 @@ -116,7 +116,7 @@ adc x23,x23,x17 mul x17,x8,x3 ldr x9,[x2,8*3] - subs xzr,x19,#1 //adds x19,x19,x14 + subs xzr,x19,#1 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 @@ -148,14 +148,14 @@ adc x23,x23,xzr adds x20,x20,x14 - //mul x14,x5,x3 + adcs x21,x21,x15 mul x15,x6,x3 adcs x22,x22,x16 mul x16,x7,x3 adc x23,x23,x17 mul x17,x8,x3 - subs xzr,x19,#1 //adds x19,x19,x14 + subs xzr,x19,#1 umulh x14,x5,x3 adcs x20,x20,x15 umulh x15,x6,x3 @@ -206,64 +206,64 @@ ldp x7,x8,[x1,#16] mov x4,x3 - //////////////////////////////////////////////////////////////// - // | | | | | |a1*a0| | - // | | | | |a2*a0| | | - // | |a3*a2|a3*a0| | | | - // | | | |a2*a1| | | | - // | | |a3*a1| | | | | - // *| | | | | | | | 2| - // +|a3*a3|a2*a2|a1*a1|a0*a0| - // |--+--+--+--+--+--+--+--| - // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 - // - // "can't overflow" below mark carrying into high part of - // multiplication result, which can't overflow, because it - // can never be all ones. 
- - mul x11,x6,x5 // a[1]*a[0] + + + + + + + + + + + + + + + + mul x11,x6,x5 umulh x15,x6,x5 - mul x12,x7,x5 // a[2]*a[0] + mul x12,x7,x5 umulh x16,x7,x5 - mul x13,x8,x5 // a[3]*a[0] + mul x13,x8,x5 umulh x19,x8,x5 - adds x12,x12,x15 // accumulate high parts of multiplication - mul x14,x7,x6 // a[2]*a[1] + adds x12,x12,x15 + mul x14,x7,x6 umulh x15,x7,x6 adcs x13,x13,x16 - mul x16,x8,x6 // a[3]*a[1] + mul x16,x8,x6 umulh x17,x8,x6 - adc x19,x19,xzr // can't overflow + adc x19,x19,xzr - mul x20,x8,x7 // a[3]*a[2] + mul x20,x8,x7 umulh x21,x8,x7 - adds x15,x15,x16 // accumulate high parts of multiplication - mul x10,x5,x5 // a[0]*a[0] - adc x16,x17,xzr // can't overflow + adds x15,x15,x16 + mul x10,x5,x5 + adc x16,x17,xzr - adds x13,x13,x14 // accumulate low parts of multiplication + adds x13,x13,x14 umulh x5,x5,x5 adcs x19,x19,x15 - mul x15,x6,x6 // a[1]*a[1] + mul x15,x6,x6 adcs x20,x20,x16 umulh x6,x6,x6 - adc x21,x21,xzr // can't overflow + adc x21,x21,xzr - adds x11,x11,x11 // acc[1-6]*=2 - mul x16,x7,x7 // a[2]*a[2] + adds x11,x11,x11 + mul x16,x7,x7 adcs x12,x12,x12 umulh x7,x7,x7 adcs x13,x13,x13 - mul x17,x8,x8 // a[3]*a[3] + mul x17,x8,x8 adcs x19,x19,x19 umulh x8,x8,x8 adcs x20,x20,x20 adcs x21,x21,x21 adc x22,xzr,xzr - adds x11,x11,x5 // +a[i]*a[i] + adds x11,x11,x5 adcs x12,x12,x15 adcs x13,x13,x6 adcs x19,x19,x16 @@ -274,7 +274,7 @@ bl __mul_by_1_mont_256 ldr x30,[x29,#8] - adds x10,x10,x19 // accumulate upper half + adds x10,x10,x19 adcs x11,x11,x20 adcs x12,x12,x21 adcs x13,x13,x22 @@ -384,11 +384,11 @@ mul x3,x4,x10 ldp x5,x6,[x2] ldp x7,x8,[x2,#16] - //mul x14,x5,x3 + mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 - subs xzr,x10,#1 //adds x10,x10,x14 + subs xzr,x10,#1 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 @@ -403,11 +403,11 @@ adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 - //mul x14,x5,x3 + mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 - subs xzr,x10,#1 //adds x10,x10,x14 + subs xzr,x10,#1 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 @@ 
-422,11 +422,11 @@ adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 - //mul x14,x5,x3 + mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 - subs xzr,x10,#1 //adds x10,x10,x14 + subs xzr,x10,#1 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 @@ -441,11 +441,11 @@ adcs x12,x13,x16 mul x3,x4,x10 adc x13,x9,x17 - //mul x14,x5,x3 + mul x15,x6,x3 mul x16,x7,x3 mul x17,x8,x3 - subs xzr,x10,#1 //adds x10,x10,x14 + subs xzr,x10,#1 umulh x14,x5,x3 adcs x11,x11,x15 umulh x15,x6,x3 diff --git a/build/win64/mul_mont_384-armv8.asm b/build/win64/mul_mont_384-armv8.asm index a309dfa4..28f35797 100644 --- a/build/win64/mul_mont_384-armv8.asm +++ b/build/win64/mul_mont_384-armv8.asm @@ -247,16 +247,16 @@ stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] - sub sp,sp,#288 // space for 3 768-bit vectors + sub sp,sp,#288 - mov x26,x0 // save r_ptr - mov x27,x1 // save b_ptr - mov x28,x2 // save b_ptr + mov x26,x0 + mov x27,x1 + mov x28,x2 - sub x0,sp,#0 // mul_384(t0, a->re, b->re) + sub x0,sp,#0 bl __mul_384 - add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x1,x1,#48 add x2,x2,#48 add x0,sp,#96 bl __mul_384 @@ -271,12 +271,12 @@ add x1,x28,#0 add x2,x28,#48 - add x0,sp,#192 // t2 + add x0,sp,#192 bl __add_mod_384 add x1,x0,#0 add x2,x0,#48 - bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + bl __mul_384 ldp x5,x6,[x3] ldp x7,x8,[x3,#16] @@ -287,19 +287,19 @@ bl __sub_mod_384x384 add x2,sp,#96 - bl __sub_mod_384x384 // t2 = t2-t0-t1 + bl __sub_mod_384x384 add x1,sp,#0 add x2,sp,#96 add x0,sp,#0 - bl __sub_mod_384x384 // t0 = t0-t1 + bl __sub_mod_384x384 - add x1,sp,#0 // ret->re = redc(t0) + add x1,sp,#0 add x0,x26,#0 bl __mul_by_1_mont_384 bl __redc_tail_mont_384 - add x1,sp,#192 // ret->im = redc(t2) + add x1,sp,#192 add x0,x0,#48 bl __mul_by_1_mont_384 bl __redc_tail_mont_384 @@ -329,9 +329,9 @@ stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] - stp x3,x0,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#96 // space for 2 384-bit vectors - mov x4,x3 // 
adjust for missing b_ptr + stp x3,x0,[sp,#96] + sub sp,sp,#96 + mov x4,x3 ldp x5,x6,[x2] ldp x7,x8,[x2,#16] @@ -339,19 +339,19 @@ add x2,x1,#48 add x0,sp,#0 - bl __add_mod_384 // t0 = a->re + a->im + bl __add_mod_384 add x0,sp,#48 - bl __sub_mod_384 // t1 = a->re - a->im + bl __sub_mod_384 ldp x11,x12,[x1] ldr x17, [x2] ldp x13,x14,[x1,#16] ldp x15,x16,[x1,#32] - bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + bl __mul_mont_384 - adds x11,x11,x11 // add with itself + adds x11,x11,x11 adcs x12,x12,x12 adcs x13,x13,x13 adcs x14,x14,x14 @@ -383,7 +383,7 @@ stp x23,x24,[x2,#80] add x2,sp,#48 - bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + bl __mul_mont_384 ldr x30,[x29,#8] stp x11,x12,[x2] @@ -414,7 +414,7 @@ stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] - stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + stp x4,x0,[sp,#96] ldp x11,x12,[x1] ldr x17, [x2] @@ -461,7 +461,7 @@ umulh x3,x16,x17 adds x20,x20,x26 - // mul x26,x5,x4 + adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 @@ -473,7 +473,7 @@ adc x25,xzr, x3 mul x3,x10,x4 mov x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 + subs xzr,x19,#1 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 @@ -521,7 +521,7 @@ adc x17,xzr,xzr adds x20,x20,x26 - // mul x26,x5,x4 + adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 @@ -533,7 +533,7 @@ adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 + subs xzr,x19,#1 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 @@ -581,7 +581,7 @@ adc x17,xzr,xzr adds x20,x20,x26 - // mul x26,x5,x4 + adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 @@ -593,7 +593,7 @@ adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 + subs xzr,x19,#1 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 @@ -641,7 +641,7 @@ adc x17,xzr,xzr adds x20,x20,x26 - // mul x26,x5,x4 + adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 @@ -653,7 +653,7 @@ adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr - subs xzr,x19,#1 // adds 
x19,x19,x26 + subs xzr,x19,#1 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 @@ -701,7 +701,7 @@ adc x17,xzr,xzr adds x20,x20,x26 - // mul x26,x5,x4 + adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 @@ -713,7 +713,7 @@ adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 + subs xzr,x19,#1 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 @@ -761,7 +761,7 @@ adc x17,xzr,xzr adds x20,x20,x26 - // mul x26,x5,x4 + adcs x21,x21,x27 mul x27,x6,x4 adcs x22,x22,x28 @@ -773,7 +773,7 @@ adcs x25,x25,x3 mul x3,x10,x4 adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 + subs xzr,x19,#1 umulh x26,x5,x4 adcs x20,x20,x27 umulh x27,x6,x4 @@ -786,7 +786,7 @@ adcs x24,x24,x3 umulh x3,x10,x4 adcs x25,x25,xzr - ldp x4,x2,[x29,#96] // pull r_ptr + ldp x4,x2,[x29,#96] adc x17,x17,xzr adds x19,x20,x26 @@ -827,10 +827,10 @@ stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] - sub sp,sp,#96 // space for 768-bit vector - mov x4,x3 // adjust for missing b_ptr + sub sp,sp,#96 + mov x4,x3 - mov x3,x0 // save r_ptr + mov x3,x0 mov x0,sp ldp x11,x12,[x1] @@ -844,7 +844,7 @@ ldp x9,x10,[x2,#32] mov x1,sp - mov x0,x3 // restore r_ptr + mov x0,x3 bl __mul_by_1_mont_384 bl __redc_tail_mont_384 ldr x30,[x29,#8] @@ -873,9 +873,9 @@ stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] - stp x4,x0,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#96 // space for 768-bit vector - mov x17,x5 // save b_ptr + stp x4,x0,[sp,#96] + sub sp,sp,#96 + mov x17,x5 ldp x11,x12,[x1] ldp x13,x14,[x1,#16] @@ -883,7 +883,7 @@ mov x0,sp |$Loop_sqr_383| bl __sqr_384 - sub x2,x2,#1 // counter + sub x2,x2,#1 ldp x5,x6,[x3] ldp x7,x8,[x3,#16] @@ -896,7 +896,7 @@ ldp x21,x22,[x1,#64] ldp x23,x24,[x1,#80] - adds x11,x11,x19 // just accumulate upper half + adds x11,x11,x19 adcs x12,x12,x20 adcs x13,x13,x21 adcs x14,x14,x22 @@ -1081,7 +1081,7 @@ stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] - mov x4,x3 // adjust for missing b_ptr + mov x4,x3 ldp 
x5,x6,[x2] ldp x7,x8,[x2,#16] @@ -1114,7 +1114,7 @@ stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] - mov x4,x3 // adjust for missing b_ptr + mov x4,x3 ldp x5,x6,[x2] ldp x7,x8,[x2,#16] @@ -1159,13 +1159,13 @@ mul x26,x4,x11 ldp x15,x16,[x1,#32] - // mul x19,x5,x26 + mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 + subs xzr,x11,#1 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 @@ -1186,13 +1186,13 @@ adcs x15,x15,x24 adc x16,x16,x25 - // mul x19,x5,x26 + mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 + subs xzr,x11,#1 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 @@ -1213,13 +1213,13 @@ adcs x15,x15,x24 adc x16,x16,x25 - // mul x19,x5,x26 + mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 + subs xzr,x11,#1 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 @@ -1240,13 +1240,13 @@ adcs x15,x15,x24 adc x16,x16,x25 - // mul x19,x5,x26 + mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 + subs xzr,x11,#1 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 @@ -1267,13 +1267,13 @@ adcs x15,x15,x24 adc x16,x16,x25 - // mul x19,x5,x26 + mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 + subs xzr,x11,#1 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 @@ -1294,13 +1294,13 @@ adcs x15,x15,x24 adc x16,x16,x25 - // mul x19,x5,x26 + mul x20,x6,x26 mul x21,x7,x26 mul x22,x8,x26 mul x23,x9,x26 mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 + subs xzr,x11,#1 umulh x11,x5,x26 adcs x20,x20,x12 umulh x12,x6,x26 @@ -1330,7 +1330,7 @@ ldp x21,x22,[x1,#64] ldp x23,x24,[x1,#80] - adds x11,x11,x19 // accumulate upper half + adds x11,x11,x19 adcs x12,x12,x20 adcs x13,x13,x21 adcs x14,x14,x22 @@ -1577,17 +1577,17 @@ stp x23,x24,[sp,#48] 
stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] - sub sp,sp,#96 // space for two 384-bit vectors + sub sp,sp,#96 ldp x11,x12,[x1] - mov x26,x0 // save r_ptr + mov x26,x0 ldp x19,x20,[x1,#48] - mov x27,x1 // save a_ptr + mov x27,x1 ldp x13,x14,[x1,#16] - mov x28,x2 // save b_ptr + mov x28,x2 ldp x21,x22,[x1,#64] ldp x15,x16,[x1,#32] - adds x5,x11,x19 // t0 = a->re + a->im + adds x5,x11,x19 ldp x23,x24,[x1,#80] adcs x6,x12,x20 ldp x11,x12,[x2] @@ -1601,7 +1601,7 @@ ldp x15,x16,[x2,#32] stp x5,x6,[sp] - adds x5,x11,x19 // t1 = b->re + b->im + adds x5,x11,x19 ldp x23,x24,[x2,#80] adcs x6,x12,x20 stp x7,x8,[sp,#16] @@ -1614,14 +1614,14 @@ stp x7,x8,[sp,#64] stp x9,x10,[sp,#80] - bl __mul_384 // mul_384(ret->re, a->re, b->re) + bl __mul_384 - add x1,sp,#0 // mul_384(ret->im, t0, t1) + add x1,sp,#0 add x2,sp,#48 add x0,x26,#96 bl __mul_384 - add x1,x27,#48 // mul_384(tx, a->im, b->im) + add x1,x27,#48 add x2,x28,#48 add x0,sp,#0 bl __mul_384 @@ -1630,15 +1630,15 @@ ldp x7,x8,[x3,#16] ldp x9,x10,[x3,#32] - add x1,x26,#96 // ret->im -= tx + add x1,x26,#96 add x2,sp,#0 add x0,x26,#96 bl __sub_mod_384x384 - add x2,x26,#0 // ret->im -= ret->re + add x2,x26,#0 bl __sub_mod_384x384 - add x1,x26,#0 // ret->re -= tx + add x1,x26,#0 add x2,sp,#0 add x0,x26,#0 bl __sub_mod_384x384 @@ -1672,7 +1672,7 @@ ldp x11,x12,[x1] ldp x19,x20,[x1,#48] ldp x13,x14,[x1,#16] - adds x5,x11,x19 // t0 = a->re + a->im + adds x5,x11,x19 ldp x21,x22,[x1,#64] adcs x6,x12,x20 ldp x15,x16,[x1,#32] @@ -1685,7 +1685,7 @@ adc x10,x16,x24 stp x7,x8,[x0,#16] - subs x11,x11,x19 // t1 = a->re - a->im + subs x11,x11,x19 ldp x7,x8,[x2,#16] sbcs x12,x12,x20 stp x9,x10,[x0,#32] @@ -1712,12 +1712,12 @@ stp x13,x14,[x0,#64] stp x15,x16,[x0,#80] - mov x4,x1 // save a_ptr - add x1,x0,#0 // mul_384(ret->re, t0, t1) + mov x4,x1 + add x1,x0,#0 add x2,x0,#48 bl __mul_384 - add x1,x4,#0 // mul_384(ret->im, a->re, a->im) + add x1,x4,#0 add x2,x4,#48 add x0,x0,#96 bl __mul_384 @@ -1725,7 +1725,7 @@ ldp x11,x12,[x0] ldp 
x13,x14,[x0,#16] - adds x11,x11,x11 // add with itself + adds x11,x11,x11 ldp x15,x16,[x0,#32] adcs x12,x12,x12 adcs x13,x13,x13 @@ -1768,9 +1768,9 @@ stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] - stp x3,x0,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#112 // space for two 384-bit vectors + word - mov x4,x3 // adjust for missing b_ptr + stp x3,x0,[sp,#96] + sub sp,sp,#112 + mov x4,x3 ldp x11,x12,[x1] ldp x13,x14,[x1,#16] @@ -1780,20 +1780,20 @@ ldp x21,x22,[x1,#64] ldp x23,x24,[x1,#80] - adds x5,x11,x17 // t0 = a->re + a->im + adds x5,x11,x17 adcs x6,x12,x20 adcs x7,x13,x21 adcs x8,x14,x22 adcs x9,x15,x23 adc x10,x16,x24 - subs x19,x11,x17 // t1 = a->re - a->im + subs x19,x11,x17 sbcs x20,x12,x20 sbcs x21,x13,x21 sbcs x22,x14,x22 sbcs x23,x15,x23 sbcs x24,x16,x24 - sbc x25,xzr,xzr // borrow flag as mask + sbc x25,xzr,xzr stp x5,x6,[sp] stp x7,x8,[sp,#16] @@ -1808,9 +1808,9 @@ ldp x9,x10,[x2,#32] add x2,x1,#48 - bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + bl __mul_mont_383_nonred - adds x19,x11,x11 // add with itself + adds x19,x11,x11 adcs x20,x12,x12 adcs x21,x13,x13 adcs x22,x14,x14 @@ -1827,10 +1827,10 @@ ldp x15,x16,[sp,#32] add x2,sp,#48 - bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + bl __mul_mont_383_nonred ldr x30,[x29,#8] - ldr x25,[sp,#96] // account for sign from a->re - a->im + ldr x25,[sp,#96] ldp x19,x20,[sp] ldp x21,x22,[sp,#16] ldp x23,x24,[sp,#32] @@ -2207,7 +2207,7 @@ adcs x24,x24,x3 umulh x3,x10,x4 adc x25,x25,xzr - ldp x4,x2,[x29,#96] // pull r_ptr + ldp x4,x2,[x29,#96] adds x11,x20,x26 adcs x12,x21,x27 @@ -2359,7 +2359,7 @@ and x3,x3,#1 and x1,x1,#2 - orr x0,x1,x3 // pack sign and parity + orr x0,x1,x3 ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] diff --git a/build/win64/sha256-armv8.asm b/build/win64/sha256-armv8.asm index 31e74219..e3a6ca83 100644 --- a/build/win64/sha256-armv8.asm +++ b/build/win64/sha256-armv8.asm @@ -1,19 +1,19 @@ -// -// Copyright Supranational LLC -// 
Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -// -// ==================================================================== -// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL -// project. -// ==================================================================== -// -// sha256_block procedure for ARMv8. -// -// This module is stripped of scalar code paths, with rationale that all -// known processors are NEON-capable. -// -// See original module at CRYPTOGAMS for further details. + + + + + + + + + + + + + + + + COMMON |__blst_platform_cap|,4 AREA |.text|,CODE,ALIGN=8,ARM64 @@ -37,7 +37,7 @@ DCDU 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 DCDU 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 DCDU 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - DCDU 0 //terminator + DCDU 0 DCB "SHA256 block transform for ARMv8, CRYPTOGAMS by @dot-asm",0 ALIGN 4 @@ -61,115 +61,115 @@ rev32 v5.16b,v5.16b rev32 v6.16b,v6.16b rev32 v7.16b,v7.16b - orr v18.16b,v0.16b,v0.16b // offload + orr v18.16b,v0.16b,v0.16b orr v19.16b,v1.16b,v1.16b ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s - DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b + DCDU 0x5e2828a4 orr v2.16b,v0.16b,v0.16b - DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + DCDU 0x5e104020 + DCDU 0x5e105041 + DCDU 0x5e0760c4 ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s - DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b + DCDU 0x5e2828c5 orr v2.16b,v0.16b,v0.16b - DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + DCDU 0x5e114020 + DCDU 0x5e115041 + DCDU 0x5e0460e5 ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s - DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b + DCDU 0x5e2828e6 orr v2.16b,v0.16b,v0.16b - DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - DCDU 0x5e105041 //sha256h2 
v1.16b,v2.16b,v16.4s - DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + DCDU 0x5e104020 + DCDU 0x5e105041 + DCDU 0x5e056086 ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s - DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b + DCDU 0x5e282887 orr v2.16b,v0.16b,v0.16b - DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - DCDU 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + DCDU 0x5e114020 + DCDU 0x5e115041 + DCDU 0x5e0660a7 ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s - DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b + DCDU 0x5e2828a4 orr v2.16b,v0.16b,v0.16b - DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + DCDU 0x5e104020 + DCDU 0x5e105041 + DCDU 0x5e0760c4 ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s - DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b + DCDU 0x5e2828c5 orr v2.16b,v0.16b,v0.16b - DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + DCDU 0x5e114020 + DCDU 0x5e115041 + DCDU 0x5e0460e5 ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s - DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b + DCDU 0x5e2828e6 orr v2.16b,v0.16b,v0.16b - DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + DCDU 0x5e104020 + DCDU 0x5e105041 + DCDU 0x5e056086 ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s - DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b + DCDU 0x5e282887 orr v2.16b,v0.16b,v0.16b - DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - DCDU 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + DCDU 0x5e114020 + DCDU 0x5e115041 + DCDU 0x5e0660a7 ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s - DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b + DCDU 0x5e2828a4 orr v2.16b,v0.16b,v0.16b - DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - DCDU 
0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + DCDU 0x5e104020 + DCDU 0x5e105041 + DCDU 0x5e0760c4 ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s - DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b + DCDU 0x5e2828c5 orr v2.16b,v0.16b,v0.16b - DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + DCDU 0x5e114020 + DCDU 0x5e115041 + DCDU 0x5e0460e5 ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v6.4s - DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b + DCDU 0x5e2828e6 orr v2.16b,v0.16b,v0.16b - DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + DCDU 0x5e104020 + DCDU 0x5e105041 + DCDU 0x5e056086 ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v7.4s - DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b + DCDU 0x5e282887 orr v2.16b,v0.16b,v0.16b - DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - DCDU 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + DCDU 0x5e114020 + DCDU 0x5e115041 + DCDU 0x5e0660a7 ld1 {v17.4s},[x3],#16 add v16.4s,v16.4s,v4.4s orr v2.16b,v0.16b,v0.16b - DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e104020 + DCDU 0x5e105041 ld1 {v16.4s},[x3],#16 add v17.4s,v17.4s,v5.4s orr v2.16b,v0.16b,v0.16b - DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e114020 + DCDU 0x5e115041 ld1 {v17.4s},[x3] add v16.4s,v16.4s,v6.4s - sub x3,x3,#64*4-16 // rewind + sub x3,x3,#64*4-16 orr v2.16b,v0.16b,v0.16b - DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e104020 + DCDU 0x5e105041 add v17.4s,v17.4s,v7.4s orr v2.16b,v0.16b,v0.16b - DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e114020 + DCDU 
0x5e115041 add v0.4s,v0.4s,v18.4s add v1.4s,v1.4s,v19.4s @@ -195,7 +195,7 @@ sub sp,sp,#16*4 adr x16,|$LK256| - add x2,x1,x2,lsl#6 // len to point at the end of inp + add x2,x1,x2,lsl#6 ld1 {v0.16b},[x1], #16 ld1 {v1.16b},[x1], #16 @@ -205,8 +205,8 @@ ld1 {v5.4s},[x16], #16 ld1 {v6.4s},[x16], #16 ld1 {v7.4s},[x16], #16 - rev32 v0.16b,v0.16b // yes, even on - rev32 v1.16b,v1.16b // big-endian + rev32 v0.16b,v0.16b + rev32 v1.16b,v1.16b rev32 v2.16b,v2.16b rev32 v3.16b,v3.16b mov x17,sp @@ -666,16 +666,16 @@ add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 - cmp w12,#0 // check for K256 terminator + cmp w12,#0 ldr w12,[sp,#0] sub x17,x17,#64 bne |$L_00_48| - sub x16,x16,#256 // rewind x16 + sub x16,x16,#256 cmp x1,x2 mov x17, #64 cseleq x17,x17,xzr - sub x1,x1,x17 // avoid SEGV + sub x1,x1,x17 mov x17,sp add w10,w10,w12 add w3,w3,w15 @@ -1000,11 +1000,11 @@ add w7,w7,w3 eor w13,w13,w5 st1 {v4.4s},[x17], #16 - add w3,w3,w15 // h+=Sigma0(a) from the past + add w3,w3,w15 ldp w11,w12,[x0,#0] - add w3,w3,w13 // h+=Maj(a,b,c) from the past + add w3,w3,w13 ldp w13,w14,[x0,#8] - add w3,w3,w11 // accumulate + add w3,w3,w11 add w4,w4,w12 ldp w11,w12,[x0,#16] add w5,w5,w13 @@ -1036,12 +1036,12 @@ |blst_sha256_emit| PROC ldp x4,x5,[x1] ldp x6,x7,[x1,#16] -#ifndef __AARCH64EB__ + if :lnot::def: __AARCH64EB__ rev x4,x4 rev x5,x5 rev x6,x6 rev x7,x7 -#endif + endif str w4,[x0,#4] lsr x4,x4,#32 str w5,[x0,#12]