Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use BigMul for 32x32=64 in decimal #93345

Merged
merged 6 commits into from
Nov 6, 2023
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 50 additions & 68 deletions src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
Original file line number Diff line number Diff line change
Expand Up @@ -177,27 +177,9 @@ private static unsafe uint GetExponent(double d)
return (uint)(BitConverter.DoubleToUInt64Bits(d) >> 52) & 0x7FFu;
}

// Multiplies two 32-bit unsigned values and returns the product as a 64-bit value.
// Both operands are widened to ulong before the multiply, so the result is not truncated to 32 bits.
private static ulong UInt32x32To64(uint a, uint b)
{
return (ulong)a * (ulong)b;
}

private static void UInt64x64To128(ulong a, ulong b, ref DecCalc result)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be replaced, technically speaking, by Math.BigMul(ulong, ulong, out ulong) as well.

Then it can also be deleted.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, sure. Note that the overflow check and the insertion into the DecCalc result are still in the body below. Do we prefer to move those to where this is now called, or should I keep this method?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably better to keep the extra checks centralized here and just defer the algorithm to Math.BigMul

{
ulong low = UInt32x32To64((uint)a, (uint)b); // lo partial prod
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you delete UInt32x32To64 as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could yes, but:

  • Unfortunately, there is no BigMul method for two uint32 values.
  • It is used in a lot of places where narrowing casts from ulong to uint are done, and the code is pretty convoluted already. This shows the intent quite nicely.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you measure the performance of this change? Last time I checked, a similar change caused a significant performance regression because the codegen for Bmi2.MultiplyNoFlags is suboptimal.

ulong mid = UInt32x32To64((uint)a, (uint)(b >> 32)); // mid 1 partial prod
ulong high = UInt32x32To64((uint)(a >> 32), (uint)(b >> 32));
high += mid >> 32;
low += mid <<= 32;
if (low < mid) // test for carry
high++;

mid = UInt32x32To64((uint)(a >> 32), (uint)b);
high += mid >> 32;
low += mid <<= 32;
if (low < mid) // test for carry
high++;

ulong high = Math.BigMul(a, b, out ulong low);
if (high > uint.MaxValue)
Number.ThrowOverflowException(SR.Overflow_Decimal);
result.Low64 = low;
Expand Down Expand Up @@ -394,7 +376,7 @@ private static uint Div96By64(ref Buf12 bufNum, ulong den)

// Compute full remainder, rem = dividend - (quo * divisor).
//
ulong prod = UInt32x32To64(quo, (uint)den); // quo * lo divisor
ulong prod = quo * (den & uint.MaxValue); // quo * lo divisor
Copy link
Member

@tannergooding tannergooding Oct 12, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This probably needs a comment on what it's doing.

But, notably, this may regress 32-bit platforms as it will now do a more expensive 64x64=64 multiplication, rather than doing the cheaper 32x32=64.

In general an internal Math.BigMul(uint a, uint b, out uint low) could be defined that uses Bmi2.MultiplyNoFlags, ArmBase.MultiplyHigh, and otherwise falls back to the naive algorithm of (ulong)a * b

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved all of this into an internal ulong Math.BigMul(uint, uint) (since we had a long Math.BigMul(int, int)). I tried using the intrinsic for x86 on 32-bit. There doesn't seem to be an ArmBase.MultiplyHigh yet.

num -= prod;

if (num > ~prod)
Expand Down Expand Up @@ -440,8 +422,8 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen)

// Compute full remainder, rem = dividend - (quo * divisor).
//
ulong prod1 = UInt32x32To64(quo, bufDen.U0); // quo * lo divisor
ulong prod2 = UInt32x32To64(quo, bufDen.U1); // quo * mid divisor
ulong prod1 = (ulong)quo * bufDen.U0; // quo * lo divisor
ulong prod2 = (ulong)quo * bufDen.U1; // quo * mid divisor
prod2 += prod1 >> 32;
prod1 = (uint)prod1 | (prod2 << 32);
prod2 >>= 32;
Expand Down Expand Up @@ -500,23 +482,23 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen)
/// <returns>Returns highest 32 bits of product</returns>
// Multiplies the value held in bufNum.U0/U1/U2 by power, in place, one 32-bit word at a time,
// and returns the bits of the running product that remain above U2.
// NOTE(review): in this diff listing each removed UInt32x32To64(...) line is immediately
// followed by the line that replaces it; only one of each pair exists in either version.
private static uint IncreaseScale(ref Buf12 bufNum, uint power)
{
// Lowest word: 64-bit product of U0 and power; the low 32 bits go back into U0.
ulong tmp = UInt32x32To64(bufNum.U0, power);
ulong tmp = (ulong)bufNum.U0 * power;
bufNum.U0 = (uint)tmp;
// Keep the high 32 bits as the carry into the next word's product.
tmp >>= 32;
tmp += UInt32x32To64(bufNum.U1, power);
tmp += (ulong)bufNum.U1 * power;
bufNum.U1 = (uint)tmp;
// Same carry step for the last word.
tmp >>= 32;
tmp += UInt32x32To64(bufNum.U2, power);
tmp += (ulong)bufNum.U2 * power;
bufNum.U2 = (uint)tmp;
// Whatever is left above bit 31 did not fit in U2 and is handed back to the caller.
return (uint)(tmp >> 32);
}

// Multiplies the value held in bufNum.U0/U1 by power, in place. The low 32 bits of the result
// are written to U0 and the remaining 64-bit sum is written to bufNum.High64.
// NOTE(review): in this diff listing each removed UInt32x32To64(...) line is immediately
// followed by the line that replaces it; only one of each pair exists in either version.
private static void IncreaseScale64(ref Buf12 bufNum, uint power)
{
// Lowest word: 64-bit product of U0 and power; the low 32 bits go back into U0.
ulong tmp = UInt32x32To64(bufNum.U0, power);
ulong tmp = (ulong)bufNum.U0 * power;
bufNum.U0 = (uint)tmp;
// Carry the high 32 bits into the product of U1 and power.
tmp >>= 32;
tmp += UInt32x32To64(bufNum.U1, power);
tmp += (ulong)bufNum.U1 * power;
// The full 64-bit sum is stored; nothing is returned, so no overflow is reported from here.
bufNum.High64 = tmp;
}

Expand Down Expand Up @@ -934,11 +916,11 @@ internal static unsafe void DecAddSub(ref DecCalc d1, ref DecCalc d2, bool sign)
{
if (scale <= MaxInt32Scale)
{
low64 = UInt32x32To64((uint)low64, UInt32Powers10[scale]);
low64 = (low64 & uint.MaxValue) * UInt32Powers10[scale];
goto AlignedAdd;
}
scale -= MaxInt32Scale;
low64 = UInt32x32To64((uint)low64, TenToPowerNine);
low64 = (low64 & uint.MaxValue) * TenToPowerNine;
} while (low64 <= uint.MaxValue);
}

Expand All @@ -947,8 +929,8 @@ internal static unsafe void DecAddSub(ref DecCalc d1, ref DecCalc d2, bool sign)
power = TenToPowerNine;
if (scale < MaxInt32Scale)
power = UInt32Powers10[scale];
tmpLow = UInt32x32To64((uint)low64, power);
tmp64 = UInt32x32To64((uint)(low64 >> 32), power) + (tmpLow >> 32);
tmpLow = (low64 & uint.MaxValue) * power;
tmp64 = ((low64 >> 32) * power) + (tmpLow >> 32);
low64 = (uint)tmpLow + (tmp64 << 32);
high = (uint)(tmp64 >> 32);
if ((scale -= MaxInt32Scale) <= 0)
Expand All @@ -963,11 +945,11 @@ internal static unsafe void DecAddSub(ref DecCalc d1, ref DecCalc d2, bool sign)
power = TenToPowerNine;
if (scale < MaxInt32Scale)
power = UInt32Powers10[scale];
tmpLow = UInt32x32To64((uint)low64, power);
tmp64 = UInt32x32To64((uint)(low64 >> 32), power) + (tmpLow >> 32);
tmpLow = (low64 & uint.MaxValue) * power;
tmp64 = ((low64 >> 32) * power) + (tmpLow >> 32);
low64 = (uint)tmpLow + (tmp64 << 32);
tmp64 >>= 32;
tmp64 += UInt32x32To64(high, power);
tmp64 += (ulong)high * power;

scale -= MaxInt32Scale;
if (tmp64 > uint.MaxValue)
Expand Down Expand Up @@ -999,7 +981,7 @@ internal static unsafe void DecAddSub(ref DecCalc d1, ref DecCalc d2, bool sign)
for (uint cur = 0; ;)
{
Debug.Assert(cur < Buf24.Length);
tmp64 += UInt32x32To64(rgulNum[cur], power);
tmp64 += (ulong)rgulNum[cur] * power;
rgulNum[cur] = (uint)tmp64;
cur++;
tmp64 >>= 32;
Expand Down Expand Up @@ -1202,10 +1184,10 @@ internal static long VarCyFromDec(ref DecCalc pdecIn)
if (pdecIn.High != 0)
goto ThrowOverflow;
uint pwr = UInt32Powers10[-scale];
ulong high = UInt32x32To64(pwr, pdecIn.Mid);
ulong high = (ulong)pwr * pdecIn.Mid;
if (high > uint.MaxValue)
goto ThrowOverflow;
ulong low = UInt32x32To64(pwr, pdecIn.Low);
ulong low = (ulong)pwr * pdecIn.Low;
low += high <<= 32;
if (low < high)
goto ThrowOverflow;
Expand Down Expand Up @@ -1290,11 +1272,11 @@ private static int VarDecCmpSub(in decimal d1, in decimal d2)
do
{
uint power = scale >= MaxInt32Scale ? TenToPowerNine : UInt32Powers10[scale];
ulong tmpLow = UInt32x32To64((uint)low64, power);
ulong tmp = UInt32x32To64((uint)(low64 >> 32), power) + (tmpLow >> 32);
ulong tmpLow = (low64 & uint.MaxValue) * power;
ulong tmp = ((low64 >> 32) * power) + (tmpLow >> 32);
low64 = (uint)tmpLow + (tmp << 32);
tmp >>= 32;
tmp += UInt32x32To64(high, power);
tmp += (ulong)high * power;
// If the scaled value has more than 96 significant bits then it's greater than d2
if (tmp > uint.MaxValue)
return sign;
Expand Down Expand Up @@ -1337,7 +1319,7 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2)
{
// Upper 64 bits are zero.
//
ulong low64 = UInt32x32To64(d1.Low, d2.Low);
ulong low64 = (ulong)d1.Low * d2.Low;
if (scale > DEC_SCALE_MAX)
{
// Result scale is too big. Divide result by power of 10 to reduce it.
Expand Down Expand Up @@ -1371,16 +1353,16 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2)
else
{
// Left value is 32-bit, result fits in 4 uints
tmp = UInt32x32To64(d1.Low, d2.Low);
tmp = (ulong)d1.Low * d2.Low;
bufProd.U0 = (uint)tmp;

tmp = UInt32x32To64(d1.Low, d2.Mid) + (tmp >> 32);
tmp = ((ulong)d1.Low * d2.Mid) + (tmp >> 32);
bufProd.U1 = (uint)tmp;
tmp >>= 32;

if (d2.High != 0)
{
tmp += UInt32x32To64(d1.Low, d2.High);
tmp += (ulong)d1.Low * d2.High;
if (tmp > uint.MaxValue)
{
bufProd.Mid64 = tmp;
Expand All @@ -1395,16 +1377,16 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2)
else if ((d2.High | d2.Mid) == 0)
{
// Right value is 32-bit, result fits in 4 uints
tmp = UInt32x32To64(d2.Low, d1.Low);
tmp = (ulong)d2.Low * d1.Low;
bufProd.U0 = (uint)tmp;

tmp = UInt32x32To64(d2.Low, d1.Mid) + (tmp >> 32);
tmp = ((ulong)d2.Low * d1.Mid) + (tmp >> 32);
bufProd.U1 = (uint)tmp;
tmp >>= 32;

if (d1.High != 0)
{
tmp += UInt32x32To64(d2.Low, d1.High);
tmp += (ulong)d2.Low * d1.High;
if (tmp > uint.MaxValue)
{
bufProd.Mid64 = tmp;
Expand Down Expand Up @@ -1439,52 +1421,52 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2)
// [p-5][p-4][p-3][p-2][p-1][p-0] prod[] array
//

tmp = UInt32x32To64(d1.Low, d2.Low);
tmp = (ulong)d1.Low * d2.Low;
bufProd.U0 = (uint)tmp;

ulong tmp2 = UInt32x32To64(d1.Low, d2.Mid) + (tmp >> 32);
ulong tmp2 = ((ulong)d1.Low * d2.Mid) + (tmp >> 32);

tmp = UInt32x32To64(d1.Mid, d2.Low);
tmp = (ulong)d1.Mid * d2.Low;
tmp += tmp2; // this could generate carry
bufProd.U1 = (uint)tmp;
if (tmp < tmp2) // detect carry
tmp2 = (tmp >> 32) | (1UL << 32);
else
tmp2 = tmp >> 32;

tmp = UInt32x32To64(d1.Mid, d2.Mid) + tmp2;
tmp = ((ulong)d1.Mid * d2.Mid) + tmp2;

if ((d1.High | d2.High) > 0)
{
// Highest 32 bits is non-zero. Calculate 5 more partial products.
//
tmp2 = UInt32x32To64(d1.Low, d2.High);
tmp2 = (ulong)d1.Low * d2.High;
tmp += tmp2; // this could generate carry
uint tmp3 = 0;
if (tmp < tmp2) // detect carry
tmp3 = 1;

tmp2 = UInt32x32To64(d1.High, d2.Low);
tmp2 = (ulong)d1.High * d2.Low;
tmp += tmp2; // this could generate carry
bufProd.U2 = (uint)tmp;
if (tmp < tmp2) // detect carry
tmp3++;
tmp2 = ((ulong)tmp3 << 32) | (tmp >> 32);

tmp = UInt32x32To64(d1.Mid, d2.High);
tmp = (ulong)d1.Mid * d2.High;
tmp += tmp2; // this could generate carry
tmp3 = 0;
if (tmp < tmp2) // detect carry
tmp3 = 1;

tmp2 = UInt32x32To64(d1.High, d2.Mid);
tmp2 = (ulong)d1.High * d2.Mid;
tmp += tmp2; // this could generate carry
bufProd.U3 = (uint)tmp;
if (tmp < tmp2) // detect carry
tmp3++;
tmp = ((ulong)tmp3 << 32) | (tmp >> 32);

bufProd.High64 = UInt32x32To64(d1.High, d2.High) + tmp;
bufProd.High64 = ((ulong)d1.High * d2.High) + tmp;

hiProd = 5;
}
Expand Down Expand Up @@ -1607,22 +1589,22 @@ internal static void VarDecFromR4(float input, out DecCalc result)
power = -power;
if (power < 10)
{
result.Low64 = UInt32x32To64(mant, UInt32Powers10[power]);
result.Low64 = (ulong)mant * UInt32Powers10[power];
}
else
{
// Have a big power of 10.
//
if (power > 18)
{
ulong low64 = UInt32x32To64(mant, UInt32Powers10[power - 18]);
ulong low64 = (ulong)mant * UInt32Powers10[power - 18];
UInt64x64To128(low64, TenToPowerEighteen, ref result);
}
else
{
ulong low64 = UInt32x32To64(mant, UInt32Powers10[power - 9]);
ulong hi64 = UInt32x32To64(TenToPowerNine, (uint)(low64 >> 32));
low64 = UInt32x32To64(TenToPowerNine, (uint)low64);
ulong low64 = (ulong)mant * UInt32Powers10[power - 9];
ulong hi64 = TenToPowerNine * (low64 >> 32);
low64 = TenToPowerNine * (low64 & uint.MaxValue);
result.Low = (uint)low64;
hi64 += low64 >> 32;
result.Mid = (uint)hi64;
Expand Down Expand Up @@ -1775,8 +1757,8 @@ internal static void VarDecFromR8(double input, out DecCalc result)
if (power < 10)
{
uint pow10 = UInt32Powers10[power];
ulong low64 = UInt32x32To64((uint)mant, pow10);
ulong hi64 = UInt32x32To64((uint)(mant >> 32), pow10);
ulong low64 = (mant & uint.MaxValue) * pow10;
ulong hi64 = (mant >> 32) * pow10;
result.Low = (uint)low64;
hi64 += low64 >> 32;
result.Mid = (uint)hi64;
Expand Down Expand Up @@ -1980,7 +1962,7 @@ internal static unsafe void VarDecDiv(ref DecCalc d1, ref DecCalc d2)
if (IncreaseScale(ref bufQuo, power) != 0)
goto ThrowOverflow;

ulong num = UInt32x32To64(remainder, power);
ulong num = (ulong)remainder * power;
// TODO: https://github.com/dotnet/runtime/issues/5213
uint div = (uint)(num / den);
remainder = (uint)num - div * den;
Expand Down Expand Up @@ -2213,7 +2195,7 @@ internal static void VarDecMod(ref DecCalc d1, ref DecCalc d2)
do
{
uint power = scale >= MaxInt32Scale ? TenToPowerNine : UInt32Powers10[scale];
ulong tmp = UInt32x32To64(d2.Low, power);
ulong tmp = (ulong)d2.Low * power;
d2.Low = (uint)tmp;
tmp >>= 32;
tmp += (d2.Mid + ((ulong)d2.High << 32)) * power;
Expand All @@ -2240,7 +2222,7 @@ internal static void VarDecMod(ref DecCalc d1, ref DecCalc d2)
break;
uint power = iCurScale >= MaxInt32Scale ? TenToPowerNine : UInt32Powers10[iCurScale];
scale += iCurScale;
ulong tmp = UInt32x32To64(bufQuo.U0, power);
ulong tmp = (ulong)bufQuo.U0 * power;
bufQuo.U0 = (uint)tmp;
tmp >>= 32;
bufQuo.High64 = tmp + bufQuo.High64 * power;
Expand Down Expand Up @@ -2301,12 +2283,12 @@ private static unsafe void VarDecModFull(ref DecCalc d1, ref DecCalc d2, int sca
{
uint power = scale <= -MaxInt32Scale ? TenToPowerNine : UInt32Powers10[-scale];
uint* buf = (uint*)&b;
ulong tmp64 = UInt32x32To64(b.Buf24.U0, power);
ulong tmp64 = (ulong)b.Buf24.U0 * power;
b.Buf24.U0 = (uint)tmp64;
for (int i = 1; i <= high; i++)
{
tmp64 >>= 32;
tmp64 += UInt32x32To64(buf[i], power);
tmp64 += (ulong)buf[i] * power;
buf[i] = (uint)tmp64;
}
// The high bit of the dividend must not be set.
Expand Down
Loading