Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add basic support for folding SIMD intrinsics #81547

Merged
merged 8 commits into from
Feb 11, 2023
240 changes: 240 additions & 0 deletions src/coreclr/jit/gentree.h
Original file line number Diff line number Diff line change
Expand Up @@ -9613,6 +9613,246 @@ enum varRefKinds
VR_GLB_VAR = 0x04, // a global (clsVar)
};

// Negates a single lane value of an integral type.
//
// Computed as `0 - arg0`, which is what the original code did for every type.
template <typename TBase>
inline TBase EvaluateNegateScalar(TBase arg0)
{
    return static_cast<TBase>(0) - arg0;
}

// Negates a single-precision lane value.
//
// Unary minus is required here: `0.0f - (+0.0f)` evaluates to `+0.0f`, whereas
// a negation has to flip the sign and produce `-0.0f`.
inline float EvaluateNegateScalar(float arg0)
{
    return -arg0;
}

// Negates a double-precision lane value; see the `float` overload for why this
// cannot be written as `0.0 - arg0`.
inline double EvaluateNegateScalar(double arg0)
{
    return -arg0;
}

// Evaluates the unary operator `oper` on a single value of type `TBase`.
//
// Arguments:
//    oper - the operator to apply; only GT_NEG is handled
//    arg0 - the operand
//
// Return Value:
//    The result of applying `oper` to `arg0`. Any other operator hits `unreached()`.
template <typename TBase>
TBase EvaluateUnaryScalar(genTreeOps oper, TBase arg0)
{
    switch (oper)
    {
        case GT_NEG:
        {
            // Dispatches to the float/double overloads for floating-point lanes so
            // that signed zero is negated correctly.
            return EvaluateNegateScalar(arg0);
        }

        default:
        {
            unreached();
        }
    }
}

// Applies the unary operator `oper` to the `TBase`-sized lanes of `arg0` and
// stores the outcome in `*result`.
//
// Arguments:
//    oper   - the operator to apply to each lane (forwarded to EvaluateUnaryScalar)
//    scalar - when true only lane 0 is computed
//    result - [out] receives the computed vector
//    arg0   - the input vector, accessed through its `u8` byte view
//
// Notes:
//    When `scalar` is set, the lanes above lane 0 are always given a defined value:
//    a copy of `arg0` on xarch, zero otherwise. Without the zeroing path the upper
//    lanes of `*result` would hold whatever the caller left there.
template <typename TSimd, typename TBase>
void EvaluateUnarySimd(genTreeOps oper, bool scalar, TSimd* result, TSimd arg0)
{
    // Number of TBase-sized lanes that fit in TSimd.
    uint32_t count = sizeof(TSimd) / sizeof(TBase);

    if (scalar)
    {
        count = 1;

#if defined(TARGET_XARCH)
        // scalar operations on xarch copy the upper bits from arg0
        *result = arg0;
#else
        // scalar operations on Arm64 zero the upper bits; zero every byte so the
        // result is deterministic regardless of what `*result` held on entry
        memset(result, 0, sizeof(TSimd));
#endif // TARGET_XARCH
    }

    for (uint32_t i = 0; i < count; i++)
    {
        // Safely execute `result[i] = oper(arg0[i])`
        // (memcpy is used for the lane load/store instead of a typed access)

        TBase input0;
        memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase));

        TBase output = EvaluateUnaryScalar<TBase>(oper, input0);
        memcpy(&result->u8[i * sizeof(TBase)], &output, sizeof(TBase));
    }
}

// Runtime-to-compile-time dispatcher for unary SIMD evaluation.
//
// Maps `baseType` onto the matching C++ element type and forwards to the
// `EvaluateUnarySimd<TSimd, TBase>` overload. Base types that are not listed
// hit `unreached()`.
//
// Arguments:
//    oper     - the operator to apply to each lane
//    scalar   - when true only lane 0 is computed
//    baseType - the element type of the vector
//    result   - [out] receives the computed vector
//    arg0     - the input vector
template <typename TSimd>
void EvaluateUnarySimd(genTreeOps oper, bool scalar, var_types baseType, TSimd* result, TSimd arg0)
{
    switch (baseType)
    {
        // Floating-point element types
        case TYP_FLOAT:
            return EvaluateUnarySimd<TSimd, float>(oper, scalar, result, arg0);

        case TYP_DOUBLE:
            return EvaluateUnarySimd<TSimd, double>(oper, scalar, result, arg0);

        // Signed integral element types
        case TYP_BYTE:
            return EvaluateUnarySimd<TSimd, int8_t>(oper, scalar, result, arg0);

        case TYP_SHORT:
            return EvaluateUnarySimd<TSimd, int16_t>(oper, scalar, result, arg0);

        case TYP_INT:
            return EvaluateUnarySimd<TSimd, int32_t>(oper, scalar, result, arg0);

        case TYP_LONG:
            return EvaluateUnarySimd<TSimd, int64_t>(oper, scalar, result, arg0);

        // Unsigned integral element types
        case TYP_UBYTE:
            return EvaluateUnarySimd<TSimd, uint8_t>(oper, scalar, result, arg0);

        case TYP_USHORT:
            return EvaluateUnarySimd<TSimd, uint16_t>(oper, scalar, result, arg0);

        case TYP_UINT:
            return EvaluateUnarySimd<TSimd, uint32_t>(oper, scalar, result, arg0);

        case TYP_ULONG:
            return EvaluateUnarySimd<TSimd, uint64_t>(oper, scalar, result, arg0);

        default:
            unreached();
    }
}

// Evaluates the binary operator `oper` on two values of type `TBase`.
//
// Arguments:
//    oper - the operator to apply; GT_ADD and GT_SUB are handled
//    arg0 - the left-hand operand
//    arg1 - the right-hand operand
//
// Return Value:
//    `arg0 + arg1` for GT_ADD, `arg0 - arg1` for GT_SUB. Any other operator
//    hits `unreached()`.
template <typename TBase>
TBase EvaluateBinaryScalar(genTreeOps oper, TBase arg0, TBase arg1)
{
    if (oper == GT_ADD)
    {
        return arg0 + arg1;
    }

    if (oper == GT_SUB)
    {
        return arg0 - arg1;
    }

    // No other operator is supported by this evaluator.
    unreached();
}

// Applies the binary operator `oper` to corresponding `TBase`-sized lanes of
// `arg0` and `arg1` and stores the outcome in `*result`.
//
// Arguments:
//    oper   - the operator to apply to each lane pair (forwarded to EvaluateBinaryScalar)
//    scalar - when true only lane 0 is computed
//    result - [out] receives the computed vector
//    arg0   - the left-hand input vector, accessed through its `u8` byte view
//    arg1   - the right-hand input vector, accessed through its `u8` byte view
//
// NOTE(review): when `scalar` is set and TARGET_XARCH is not defined, nothing in
// this version writes the lanes of `*result` above lane 0. The review thread
// embedded in this block mentions that an explicit zeroing path for Arm64 was
// added -- confirm it is present in the final code.
template <typename TSimd, typename TBase>
void EvaluateBinarySimd(genTreeOps oper, bool scalar, TSimd* result, TSimd arg0, TSimd arg1)
{
// Number of TBase-sized lanes that fit in TSimd.
uint32_t count = sizeof(TSimd) / sizeof(TBase);

if (scalar)
{
// A scalar operation only computes lane 0.
count = 1;

#if defined(TARGET_XARCH)
// scalar operations on xarch copy the upper bits from arg0
*result = arg0;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you explain me please this part on an example? (the difference between xarch and arm)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

xarch has the behavior where scalar operations "copy the upper bits", that is x + y is equivalent to:

Vector128<T> result = x;
return result.WithElement(0, x.GetElement(0) + y.GetElement(0));

arm on the other hand zeros the upper bits, that is x + y is equivalent to:

Vector128<T> result = Vector128<T>.Zero;
return result.WithElement(0, x.GetElement(0) + y.GetElement(0));

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a path that explicitly zeros for Arm64 to help clarify the logic

#endif // TARGET_XARCH
}

for (uint32_t i = 0; i < count; i++)
{
// Safely execute `result[i] = oper(arg0[i], arg1[i])`
// (memcpy is used for the lane loads/store instead of a typed access)

TBase input0;
memcpy(&input0, &arg0.u8[i * sizeof(TBase)], sizeof(TBase));

TBase input1;
memcpy(&input1, &arg1.u8[i * sizeof(TBase)], sizeof(TBase));

TBase output = EvaluateBinaryScalar<TBase>(oper, input0, input1);
memcpy(&result->u8[i * sizeof(TBase)], &output, sizeof(TBase));
}
}

// Runtime-to-compile-time dispatcher for binary SIMD evaluation.
//
// Maps `baseType` onto the matching C++ element type and forwards to the
// `EvaluateBinarySimd<TSimd, TBase>` overload. Base types that are not listed
// hit `unreached()`.
//
// Arguments:
//    oper     - the operator to apply to each lane pair
//    scalar   - when true only lane 0 is computed
//    baseType - the element type of the vectors
//    result   - [out] receives the computed vector
//    arg0     - the left-hand input vector
//    arg1     - the right-hand input vector
template <typename TSimd>
void EvaluateBinarySimd(genTreeOps oper, bool scalar, var_types baseType, TSimd* result, TSimd arg0, TSimd arg1)
{
    switch (baseType)
    {
        // Floating-point element types
        case TYP_FLOAT:
            return EvaluateBinarySimd<TSimd, float>(oper, scalar, result, arg0, arg1);

        case TYP_DOUBLE:
            return EvaluateBinarySimd<TSimd, double>(oper, scalar, result, arg0, arg1);

        // Signed integral element types
        case TYP_BYTE:
            return EvaluateBinarySimd<TSimd, int8_t>(oper, scalar, result, arg0, arg1);

        case TYP_SHORT:
            return EvaluateBinarySimd<TSimd, int16_t>(oper, scalar, result, arg0, arg1);

        case TYP_INT:
            return EvaluateBinarySimd<TSimd, int32_t>(oper, scalar, result, arg0, arg1);

        case TYP_LONG:
            return EvaluateBinarySimd<TSimd, int64_t>(oper, scalar, result, arg0, arg1);

        // Unsigned integral element types
        case TYP_UBYTE:
            return EvaluateBinarySimd<TSimd, uint8_t>(oper, scalar, result, arg0, arg1);

        case TYP_USHORT:
            return EvaluateBinarySimd<TSimd, uint16_t>(oper, scalar, result, arg0, arg1);

        case TYP_UINT:
            return EvaluateBinarySimd<TSimd, uint32_t>(oper, scalar, result, arg0, arg1);

        case TYP_ULONG:
            return EvaluateBinarySimd<TSimd, uint64_t>(oper, scalar, result, arg0, arg1);

        default:
            unreached();
    }
}

/*****************************************************************************/
#endif // !GENTREE_H
/*****************************************************************************/
7 changes: 7 additions & 0 deletions src/coreclr/jit/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,13 @@ struct simd12_t
uint8_t u8[12];
uint16_t u16[6];
uint32_t u32[3];

// These three exist to simplify templatized code
// they won't actually be accessed for real scenarios

double f64[1];
int64_t i64[1];
uint64_t u64[1];
};

bool operator==(const simd12_t& other) const
Expand Down
Loading